Mirror of https://github.com/huggingface/accelerate.git (synced 2025-11-16 15:24:34 +08:00)

Compare commits: v0.30.0 ... make-versi (112 commits)
| SHA1 |
|---|
| 9a04b8b58e |
| 589fddd317 |
| 99c69aaf73 |
| 00785cd9fc |
| a452327e8e |
| 851cf34351 |
| cd5698bb32 |
| 90d5023901 |
| 3bde615607 |
| dc3b5ad82e |
| 12a5befdd6 |
| 79ca85c27d |
| 13d93c4f50 |
| d982751aec |
| 95edc68cb3 |
| 288accc0ec |
| 83b0610155 |
| 386f7d2825 |
| 308a8e9689 |
| f35cbd1f02 |
| a14260c9da |
| 32f368ec3f |
| 415eddf1be |
| 230857691a |
| a5a3e57125 |
| 0af1d8b8de |
| d16d7371a1 |
| 7a5c231b9e |
| 4f02bb764a |
| 709fd1e42b |
| f4f1260a0e |
| c6da9f8693 |
| 3ebbe573ad |
| 24bf5ec546 |
| e1247de01e |
| 12a007d559 |
| 5bdcd7e169 |
| 2471eacdd6 |
| 167cb5eb20 |
| 947f64ee62 |
| 8330b375d4 |
| 92404fbf5f |
| 3a02754915 |
| fec1170e35 |
| eac206f063 |
| 6882ff2bea |
| 57a4c7465e |
| 404510a5ec |
| 3086e26db9 |
| 5d5d07abfc |
| 5a0b7dc597 |
| c799c198e9 |
| 1f7a79b428 |
| 4cc3530b64 |
| 5d4a3beb01 |
| 0284f9a9f6 |
| 573d22d48f |
| 13ca7dccb6 |
| 3b5a00e048 |
| 3c4eaedd46 |
| c0faec766c |
| 91a2599f93 |
| 5f9235a731 |
| 7a36a75c7c |
| f62854a281 |
| a9869ea0dc |
| 6d59614603 |
| 2d74c0c077 |
| 40007b4e97 |
| 7141881b1f |
| f0049b2cfb |
| 83bad87559 |
| 24d8b63fc3 |
| 4a83ee5382 |
| 05d240af95 |
| bad2ce42ed |
| 30cb7ece76 |
| b7fa2fa956 |
| d5d378d64e |
| 065e74d11a |
| 86b6deaea1 |
| b24a0ef5db |
| e061edc6e7 |
| c3f422699a |
| 0553483638 |
| 415789d0e4 |
| ae472bac48 |
| 4f2c2ba45c |
| e26065a265 |
| 1cb6fdcf7b |
| 4ba436eccc |
| 91e8a3ced4 |
| 4ad4d28c49 |
| befd87f043 |
| abce3604f0 |
| 27a607ea90 |
| aa21174de9 |
| 6cf1cc0a39 |
| bb465a9cf0 |
| 67308ca6ef |
| 63772f6ac2 |
| 8798cf06ab |
| 47bb2dd53e |
| 724824abbe |
| afc2c99e6a |
| 0fb95a2d3b |
| 7ac153f404 |
| 0f1b91bb74 |
| d1eb44c856 |
| 11a363287a |
| 5cfe409443 |
| 5b3a7f3892 |
.github/PULL_REQUEST_TEMPLATE.md (vendored, 8 changes)

@@ -37,11 +37,11 @@ members/contributors who may be interested in your PR.
If you know how to use git blame, that is the easiest way, otherwise, here is a rough guide of **who to tag**.

- Big modeling: @SunMarc
- Fully-Sharded Data Parallism: @pacman100
- DeepSpeed: @pacman100
- Fully-Sharded Data Parallism: @muellerzr
- DeepSpeed: @muellerzr
- Command Line Interface: @muellerzr
- Documentation: @muellerzr
- Core parts of the library: @muellerzr @BenjaminBossan
- Maintained examples: @muellerzr or @pacman100
- Core parts of the library: @muellerzr @BenjaminBossan @SunMarc
- Maintained examples: @muellerzr or @SunMarc

-->
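As the template says, `git blame` is the quickest way to find who to tag; a minimal sketch, where the file path and line range are only illustrative:

```bash
# Show who last touched lines 1-40 of a file (path and range are examples only)
git blame -L 1,40 -- src/accelerate/accelerator.py
```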
@@ -21,7 +21,8 @@ jobs:

  version-cpu:
    name: "Latest Accelerate CPU [version]"
    runs-on: [self-hosted, intel-cpu, 8-cpu, ci]
    runs-on:
      group: aws-general-8-plus
    needs: get-version
    steps:
      - name: Set up Docker Buildx
@@ -41,7 +42,8 @@ jobs:

  version-cuda:
    name: "Latest Accelerate GPU [version]"
    runs-on: [self-hosted, single-gpu, nvidia-gpu, t4, ci]
    runs-on:
      group: aws-g6-4xlarge-plus
    needs: get-version
    steps:
      - name: Set up Docker Buildx
@@ -61,7 +63,8 @@ jobs:

  version-cuda-deepspeed:
    name: "Latest Accelerate GPU DeepSpeed [version]"
    runs-on: [self-hosted, single-gpu, nvidia-gpu, t4, ci]
    runs-on:
      group: aws-g6-4xlarge-plus
    needs: get-version
    steps:
      - name: Set up Docker Buildx
.github/workflows/build_docker_images.yml (vendored, 15 changes)

@@ -13,7 +13,8 @@ concurrency:
jobs:
  latest-cpu:
    name: "Latest Accelerate CPU [dev]"
    runs-on: [self-hosted, intel-cpu, 8-cpu, ci]
    runs-on:
      group: aws-general-8-plus
    steps:
      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v2
@@ -29,7 +30,7 @@ jobs:
      - name: Build and Push CPU
        uses: docker/build-push-action@v4
        with:
          file: docker/accelerate-cpu/Dockerfile
          file: docker/accelerate-cpu/Dockerfile
          push: true
          tags: |
            huggingface/accelerate:cpu-nightly
@@ -37,7 +38,8 @@ jobs:

  latest-cuda:
    name: "Latest Accelerate GPU [dev]"
    runs-on: [self-hosted, nvidia-gpu, t4, ci]
    runs-on:
      group: aws-g6-4xlarge-plus
    steps:
      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v2
@@ -53,7 +55,7 @@ jobs:
      - name: Build and Push GPU
        uses: docker/build-push-action@v4
        with:
          file: docker/accelerate-gpu/Dockerfile
          file: docker/accelerate-gpu/Dockerfile
          push: true
          tags: |
            huggingface/accelerate:gpu-nightly
@@ -61,7 +63,8 @@ jobs:

  latest-cuda-deepspeed:
    name: "Latest Accelerate GPU DeepSpeed [dev]"
    runs-on: [self-hosted, nvidia-gpu, t4, ci]
    runs-on:
      group: aws-g6-4xlarge-plus
    steps:
      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v2
@@ -77,7 +80,7 @@ jobs:
      - name: Build and Push GPU
        uses: docker/build-push-action@v4
        with:
          file: docker/accelerate-gpu-deepspeed/Dockerfile
          file: docker/accelerate-gpu-deepspeed/Dockerfile
          push: true
          tags: |
            huggingface/accelerate:gpu-deepspeed-nightly
.github/workflows/integration_tests.yml (vendored, 2 changes)

@@ -31,6 +31,8 @@ jobs:
        uses: actions/setup-python@v3
        with:
          python-version: 3.8
          cache: 'pip'
          cache-dependency-path: 'setup.py'

      - name: Install Accelerate from source
        run: |
.github/workflows/nightly.yml (vendored, 22 changes)

@@ -13,7 +13,8 @@ env:

jobs:
  run_core_tests_single_gpu:
    runs-on: [self-hosted, single-gpu, nvidia-gpu, t4, ci]
    runs-on:
      group: aws-g6-4xlarge-plus
    env:
      CUDA_VISIBLE_DEVICES: "0"
      TEST_TYPE: "single_gpu"
@@ -43,7 +44,7 @@ jobs:
        run: |
          source activate accelerate
          make test

      - name: Run examples on GPUs
        working-directory: accelerate
        if: always()
@@ -51,7 +52,7 @@ jobs:
          source activate accelerate
          pip uninstall comet_ml -y
          make test_examples

      - name: Generate Report
        working-directory: accelerate
        if: always()
@@ -60,7 +61,8 @@ jobs:
          python utils/log_reports.py >> $GITHUB_STEP_SUMMARY

  run_deepspeed_tests_single_gpu:
    runs-on: [self-hosted, single-gpu, nvidia-gpu, t4, ci]
    runs-on:
      group: aws-g6-4xlarge-plus
    env:
      CUDA_VISIBLE_DEVICES: "0"
      TEST_TYPE: "single_gpu_deepspeed"
@@ -105,7 +107,7 @@ jobs:
          source activate accelerate
          pip uninstall comet_ml -y
          make test_examples

      - name: Generate Report
        working-directory: accelerate
        if: always()
@@ -114,7 +116,8 @@ jobs:
          python utils/log_reports.py >> $GITHUB_STEP_SUMMARY

  run_core_tests_multi_gpu:
    runs-on: [self-hosted, multi-gpu, nvidia-gpu, t4, ci]
    runs-on:
      group: aws-g6-12xlarge-plus
    env:
      CUDA_VISIBLE_DEVICES: "0,1"
      TEST_TYPE: "multi_gpu"
@@ -170,7 +173,8 @@ jobs:
          python utils/log_reports.py >> $GITHUB_STEP_SUMMARY

  run_deepspeed_tests_multi_gpu:
    runs-on: [self-hosted, multi-gpu, nvidia-gpu, t4, ci]
    runs-on:
      group: aws-g6-12xlarge-plus
    env:
      CUDA_VISIBLE_DEVICES: "0,1"
      TEST_TYPE: "multi_gpu_deepspeed"
@@ -223,7 +227,7 @@ jobs:
          pip install slack_sdk tabulate
          python utils/log_reports.py >> $GITHUB_STEP_SUMMARY


  run-integration-tests:
    if: always()
    uses: ./.github/workflows/self_hosted_integration_tests.yml
    uses: ./.github/workflows/self_hosted_integration_tests.yml
.github/workflows/quality.yml (vendored, 2 changes)

@@ -11,6 +11,8 @@ jobs:
        uses: actions/setup-python@v3
        with:
          python-version: 3.8
          cache: 'pip'
          cache-dependency-path: 'setup.py'
      - name: Install Python dependencies
        run: pip install -e .[quality]
      - name: Run Quality check
.github/workflows/run_merge_tests.yml (vendored, 18 changes)

@@ -10,7 +10,8 @@ env:

jobs:
  run_core_tests_single_gpu:
    runs-on: [self-hosted, single-gpu, nvidia-gpu, t4, ci]
    runs-on:
      group: aws-g6-4xlarge-plus
    env:
      CUDA_VISIBLE_DEVICES: "0"
    container:
@@ -39,7 +40,7 @@ jobs:
        run: |
          source activate accelerate;
          make test_cli

      - name: Run test on GPUs
        working-directory: accelerate
        if: always()
@@ -62,7 +63,8 @@ jobs:
          python utils/log_reports.py >> $GITHUB_STEP_SUMMARY

  run_deepspeed_tests_single_gpu:
    runs-on: [self-hosted, single-gpu, nvidia-gpu, t4, ci]
    runs-on:
      group: aws-g6-4xlarge-plus
    env:
      CUDA_VISIBLE_DEVICES: "0"
    container:
@@ -85,7 +87,7 @@ jobs:
        run: |
          source activate accelerate;
          pip freeze

      - name: Run test on GPUs
        working-directory: accelerate
        if: always()
@@ -101,7 +103,8 @@ jobs:
          python utils/log_reports.py >> $GITHUB_STEP_SUMMARY

  run_core_tests_multi_gpu:
    runs-on: [self-hosted, multi-gpu, nvidia-gpu, t4, ci]
    runs-on:
      group: aws-g6-12xlarge-plus
    env:
      CUDA_VISIBLE_DEVICES: 0,1
    container:
@@ -147,7 +150,8 @@ jobs:
          python utils/log_reports.py >> $GITHUB_STEP_SUMMARY

  run_deepspeed_tests_multi_gpu:
    runs-on: [self-hosted, multi-gpu, nvidia-gpu, t4, ci]
    runs-on:
      group: aws-g6-12xlarge-plus
    container:
      image: huggingface/accelerate:gpu-deepspeed-nightly
      options: --gpus all --shm-size "16gb"
@@ -181,4 +185,4 @@ jobs:
        if: always()
        run: |
          pip install tabulate;
          python utils/log_reports.py >> $GITHUB_STEP_SUMMARY
          python utils/log_reports.py >> $GITHUB_STEP_SUMMARY
@@ -1,7 +1,7 @@
# CI for specifically ensuring integrations work fine (`transformers` mainly) on GPUs
# Useful tips:
# - `working-directory` should be set to the root of the repo, which is cloned on the actual CI runner.
#   It follows the directory structure of `actions-runner/_work/{repo_name}/{repo_name}/{cloned_repo} on
#   It follows the directory structure of `actions-runner/_work/{repo_name}/{repo_name}/{cloned_repo} on
#   prem, but in Actions setting `working-directory` looks just in the `{repo_name}` level.
# - New integrations to test should have its own job, and follow a strategy method where we check both
#   the pypi and github versions.
@@ -25,12 +25,13 @@ jobs:
    container:
      image: huggingface/accelerate:gpu-deepspeed-nightly
      options: --gpus all --shm-size "16gb"
    runs-on: [self-hosted, multi-gpu, nvidia-gpu, t4, ci]
    runs-on:
      group: aws-g6-12xlarge-plus
    strategy:
      fail-fast: false
      matrix:
        cuda_visible_devices: [
          "0",
          "0",
          "0,1"
        ]
    steps:
@@ -51,7 +52,7 @@ jobs:
          pip install -e .[testing];
          pip uninstall comet_ml wandb dvclive -y
          cd ..;

      - name: Show installed libraries
        run: |
          source activate accelerate;
@@ -90,12 +91,13 @@ jobs:
    container:
      image: huggingface/accelerate:gpu-nightly
      options: --gpus all --shm-size "16gb"
    runs-on: [self-hosted, multi-gpu, nvidia-gpu, t4, ci]
    runs-on:
      group: aws-g6-12xlarge-plus
    strategy:
      fail-fast: false
    steps:
      - name: Install accelerate
        run:
        run:
          source activate accelerate;
          git clone https://github.com/huggingface/accelerate;
          cd accelerate;
@@ -122,4 +124,4 @@ jobs:
        working-directory: skorch/
        run: |
          source activate accelerate;
          pytest -sv -k TestAccelerate
          pytest -sv -k TestAccelerate
.github/workflows/stale.yml (vendored, 4 changes)

@@ -19,10 +19,12 @@ jobs:
        uses: actions/setup-python@v3
        with:
          python-version: 3.8
          cache: 'pip'
          cache-dependency-path: 'setup.py'

      - name: Install requirements
        run: |
          pip install PyGithub
      - name: Close stale issues
        run: |
          python utils/stale.py
          python utils/stale.py
.github/workflows/test.yml (vendored, 6 changes)

@@ -43,13 +43,15 @@ jobs:
        uses: actions/setup-python@v3
        with:
          python-version: 3.8
          cache: 'pip'
          cache-dependency-path: 'setup.py'

      - name: Install the library
        run: |
          if [[ ${{ matrix.test-kind }} = test_prod ]]; then pip install -e .[test_prod]; fi
          if [[ ${{ matrix.test-kind }} != test_prod ]]; then pip install -e .[testing,test_trackers]; fi
          if [[ ${{ matrix.test-kind }} = test_rest ]]; then pip uninstall comet_ml -y; fi
          if [[ ${{ matrix.test-kind }} = minimum ]]; then pip install torch==1.10.0; fi
          if [[ ${{ matrix.pytorch-version }} = minimum ]]; then pip install torchvision==0.18.1 torch==2.3.1; fi
          pip install pytest-reportlog tabulate setuptools

      - name: Show installed libraries
@@ -65,4 +67,4 @@ jobs:
      - name: Generate Report
        if: always()
        run: |
          python utils/log_reports.py >> $GITHUB_STEP_SUMMARY
          python utils/log_reports.py >> $GITHUB_STEP_SUMMARY
.github/workflows/test_imports.yml (vendored, new file, 55 lines)

@@ -0,0 +1,55 @@
name: Run Import Tests

on:
  pull_request:
    paths:
      - "src/**"
      - "tests/**"
      - ".github/**"
      - "examples/**"
      - "setup.py"
    types: [opened, synchronize, reopened]

env:
  HF_HOME: ~/hf_cache
  TESTING_MOCKED_DATALOADERS: "1"
  IS_GITHUB_CI: "1"

jobs:
  run-tests:
    runs-on: ubuntu-latest
    strategy:
      fail-fast: false
      matrix:
        pytorch-version: [
          latest,
          minimum,
        ]
    steps:
      - uses: actions/checkout@v3.1.0
      - name: Set up python 3.8
        uses: actions/setup-python@v3
        with:
          python-version: 3.8
          cache: 'pip'
          cache-dependency-path: 'setup.py'

      - name: Install the library
        run: |
          pip install -e .
          pip install pytest-reportlog tabulate setuptools git+https://github.com/muellerzr/import-timer

      - name: Show installed libraries
        run: |
          pip freeze

      - name: Run Import Tests
        env:
          PYTORCH_VERSION: ${{ matrix.pytorch-version }}
        run: |
          pytest -sv tests/test_imports.py

      - name: Generate Report
        if: always()
        run: |
          python utils/log_reports.py >> $GITHUB_STEP_SUMMARY
.github/workflows/trufflehog.yml (vendored, new file, 15 lines)

@@ -0,0 +1,15 @@
on:
  push:

name: Secret Leaks

jobs:
  trufflehog:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout code
        uses: actions/checkout@v4
        with:
          fetch-depth: 0
      - name: Secret Scanning
        uses: trufflesecurity/trufflehog@main
README.md (20 changes)

@@ -22,22 +22,12 @@ limitations under the License.

<p align="center">
    <!-- Uncomment when CircleCI is set up
    <a href="https://circleci.com/gh/huggingface/accelerate">
        <img alt="Build" src="https://img.shields.io/circleci/build/github/huggingface/transformers/master">
    </a>
    <a href="https://circleci.com/gh/huggingface/accelerate"><img alt="Build" src="https://img.shields.io/circleci/build/github/huggingface/transformers/master"></a>
    -->
    <a href="https://github.com/huggingface/accelerate/blob/main/LICENSE">
        <img alt="License" src="https://img.shields.io/github/license/huggingface/accelerate.svg?color=blue">
    </a>
    <a href="https://huggingface.co/docs/accelerate/index.html">
        <img alt="Documentation" src="https://img.shields.io/website/http/huggingface.co/docs/accelerate/index.html.svg?down_color=red&down_message=offline&up_message=online">
    </a>
    <a href="https://github.com/huggingface/accelerate/releases">
        <img alt="GitHub release" src="https://img.shields.io/github/release/huggingface/accelerate.svg">
    </a>
    <a href="https://github.com/huggingface/accelerate/blob/main/CODE_OF_CONDUCT.md">
        <img alt="Contributor Covenant" src="https://img.shields.io/badge/Contributor%20Covenant-v2.0%20adopted-ff69b4.svg">
    </a>
    <a href="https://github.com/huggingface/accelerate/blob/main/LICENSE"><img alt="License" src="https://img.shields.io/github/license/huggingface/accelerate.svg?color=blue"></a>
    <a href="https://huggingface.co/docs/accelerate/index.html"><img alt="Documentation" src="https://img.shields.io/website/http/huggingface.co/docs/accelerate/index.html.svg?down_color=red&down_message=offline&up_message=online"></a>
    <a href="https://github.com/huggingface/accelerate/releases"><img alt="GitHub release" src="https://img.shields.io/github/release/huggingface/accelerate.svg"></a>
    <a href="https://github.com/huggingface/accelerate/blob/main/CODE_OF_CONDUCT.md"><img alt="Contributor Covenant" src="https://img.shields.io/badge/Contributor%20Covenant-v2.0%20adopted-ff69b4.svg"></a>
</p>

<h3 align="center">
@@ -1,46 +1,5 @@
# Big model inference benchmarks
# Benchmarks

Running inference with Accelerate on big models.
The folders below contain suites to test various functionalities in Accelerate.

## Setup

These benchmarks use the `transformers` library:

```bash
pip install transformers
```

To reproduce or test a new setup, run

```py
python inference_acc.py model_name
```

This script supports `gpt-j-6b`, `gpt-neox`, `opt` (30B version) and `T0pp` out of the box, but you can specify any valid checkpoint for `model_name`.

To force a different `torch_dtype` than the one in the config: `--torch_dtype xxx`.

If you get an error linked to disk offload, you need to add the option `--disk-offload`

## Results

On a setup with two Titan RTXs (24GB of RAM) and 32GB of RAM, we get the following benchmarks (T0pp does not run in float16, which is why it's not included).

| Model | Model load time | Generation time | dtype | GPU 0 use | GPU 1 use | CPU use | Disk offload |
|:-----:|:---------------:|:---------------:|:-----:|:---------:|:---------:|:-------:|:------------:|
| GPT-J-6B | 8.7s | 0.05s per token | float16 | 11.7GB | 0GB | 0GB | no |
| GPT-J-6B | 12.4s | 0.06s per token | float32 | 21.9GB | 1.5GB | 0GB | no |
| GPT-Neo-X-20B | 30.9s | 0.08s per token | float16 | 21.5GB | 18GB | 0GB | no |
| GPT-Neo-X-20B | 78.2s | 10.72s per token | float32 | 20.3GB | 22.7 GB | 24.4GB | yes |
| T0pp (11B) | 29.4s | 0.05s per token | float32 | 21.1GB | 21.3GB | 0GB | no |
| OPT-30B | 34.5s | 2.37s per token | float16 | 20.7GB | 22.3GB | 14.1GB | no |
| OPT-30B | 112.3s | 33.9s per token | float32 | 20.2GB | 21.2GB | 23.5GB | yes |

Note on the results:
- using two GPUs instead of one does not slow down generation
- using CPU offload slows down a bit (see OPT-30b)
- using disk offload slows down a lot (need to implement prefetching)

You will also note that Accelerate does not use anymore GPU and CPU RAM than necessary:
- peak GPU memory is exactly the size of the model put on a given GPU
- peak CPU memory is either the size of the biggest checkpoint shard or the part of the model offloaded on CPU, whichever is bigger.
See their relevant README.md's for more information.
benchmarks/big_model_inference/README.md (new file, 46 lines)

@@ -0,0 +1,46 @@
# Big model inference benchmarks

Running inference with Accelerate on big models.

## Setup

These benchmarks use the `transformers` library:

```bash
pip install transformers
```

To reproduce or test a new setup, run

```py
python inference_acc.py model_name
```

This script supports `gpt-j-6b`, `gpt-neox`, `opt` (30B version) and `T0pp` out of the box, but you can specify any valid checkpoint for `model_name`.

To force a different `torch_dtype` than the one in the config: `--torch_dtype xxx`.

If you get an error linked to disk offload, you need to add the option `--disk-offload`
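For instance, the two flags above can be combined in a single run; a minimal sketch, where the checkpoint name and dtype value are only examples:

```bash
# Illustrative invocation: force float16 weights and allow disk offload if memory runs out
python inference_acc.py facebook/opt-30b --torch_dtype float16 --disk-offload
```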
## Results

On a setup with two Titan RTXs (24GB of RAM) and 32GB of RAM, we get the following benchmarks (T0pp does not run in float16, which is why it's not included).

| Model | Model load time | Generation time | dtype | GPU 0 use | GPU 1 use | CPU use | Disk offload |
|:-----:|:---------------:|:---------------:|:-----:|:---------:|:---------:|:-------:|:------------:|
| GPT-J-6B | 8.7s | 0.05s per token | float16 | 11.7GB | 0GB | 0GB | no |
| GPT-J-6B | 12.4s | 0.06s per token | float32 | 21.9GB | 1.5GB | 0GB | no |
| GPT-Neo-X-20B | 30.9s | 0.08s per token | float16 | 21.5GB | 18GB | 0GB | no |
| GPT-Neo-X-20B | 78.2s | 10.72s per token | float32 | 20.3GB | 22.7 GB | 24.4GB | yes |
| T0pp (11B) | 29.4s | 0.05s per token | float32 | 21.1GB | 21.3GB | 0GB | no |
| OPT-30B | 34.5s | 2.37s per token | float16 | 20.7GB | 22.3GB | 14.1GB | no |
| OPT-30B | 112.3s | 33.9s per token | float32 | 20.2GB | 21.2GB | 23.5GB | yes |

Note on the results:
- using two GPUs instead of one does not slow down generation
- using CPU offload slows down a bit (see OPT-30b)
- using disk offload slows down a lot (need to implement prefetching)

You will also note that Accelerate does not use anymore GPU and CPU RAM than necessary:
- peak GPU memory is exactly the size of the model put on a given GPU
- peak CPU memory is either the size of the biggest checkpoint shard or the part of the model offloaded on CPU, whichever is bigger.
benchmarks/fp8/Dockerfile (new file, 12 lines)

@@ -0,0 +1,12 @@
FROM nvcr.io/nvidia/pytorch:24.07-py3

RUN pip install transformers evaluate datasets
RUN git clone https://github.com/huggingface/accelerate.git

RUN cd accelerate && \
    pip install -e . && \
    cd benchmarks/fp8

RUN /bin/bash
benchmarks/fp8/README.md (new file, 30 lines)

@@ -0,0 +1,30 @@
# FP8 Benchmarks

Comparing and running [TransformerEngine](https://github.com/NVIDIA/TransformerEngine) FP8 with accelerate

## Overview

This repo provides scripts which compare native TransformerEngine model training against `accelerate`'s own integration. Each modeling type is segmented out via a script, supporting the following:

* Single GPU training (`non_distributed.py`)
* Multi-GPU training via DistributedDataParallelism (`ddp.py`)
* Fully Sharded Data Parallelism (`fsdp.py`)
* DeepSpeed ZeRO 1-3 (`deepspeed.py`)

To run them, it's recommended to use a docker image (see the attached `Dockerfile`) and not install `TransformerEngine` manually.
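A minimal sketch of that docker-based workflow; the image tag and the in-container path are assumptions, not part of the benchmark suite:

```bash
# Build the benchmark image from the attached Dockerfile (tag is an example)
docker build -t accelerate-fp8-bench .
# Run with GPU access and launch one of the scripts from inside the container
docker run --gpus all --rm -it accelerate-fp8-bench \
    bash -c "cd accelerate/benchmarks/fp8 && accelerate launch ddp.py"
```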
## Running:

You can run all scripts using the core `accelerate launch` command without any `accelerate config` being needed.

For single GPU, run it via `python`:

```bash
python non_distributed.py
```

For the rest, run it via `accelerate launch`:

```bash
accelerate launch ddp.py # or distrib_deepspeed.py, ddp.py
```
benchmarks/fp8/ddp.py (new file, 143 lines)

@@ -0,0 +1,143 @@
# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
This script tests to ensure that `accelerate` performs at the same level as raw `TransformersEngine`.

This particular script verifies this for DDP training.
"""
import evaluate
import torch
import transformer_engine.common.recipe as te_recipe
import transformer_engine.pytorch as te
from fp8_utils import evaluate_model, get_named_parameters, get_training_utilities
from torch.nn.parallel import DistributedDataParallel as DDP
from transformer_engine.common.recipe import DelayedScaling

from accelerate import Accelerator
from accelerate.state import AcceleratorState
from accelerate.utils import FP8RecipeKwargs, set_seed
from accelerate.utils.transformer_engine import convert_model


MODEL_NAME = "bert-base-cased"
METRIC = evaluate.load("glue", "mrpc")


def train_baseline():
    set_seed(42)
    model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = get_training_utilities(MODEL_NAME)
    accelerator = Accelerator()
    device = accelerator.device
    model.to(device)

    # Convert the model to TE
    old_named_params = get_named_parameters(model)

    with torch.no_grad():
        convert_model(model)

    FP8_RECIPE_KWARGS = {"fp8_format": te_recipe.Format.HYBRID, "amax_history_len": 32, "amax_compute_algo": "max"}
    fp8_recipe = DelayedScaling(**FP8_RECIPE_KWARGS)

    new_named_params = get_named_parameters(model)

    # Convert the model to DDP
    device_ids, output_device = [accelerator.local_process_index], accelerator.local_process_index
    model = DDP(model, device_ids=device_ids, output_device=output_device)

    mapping = {p: new_named_params[n] for n, p in old_named_params.items()}
    for param_group in optimizer.param_groups:
        param_group["params"] = [mapping[p] for p in param_group["params"]]

    base_model_results = evaluate_model(model, eval_dataloader, METRIC, accelerator=accelerator)
    model.train()

    for _ in range(2):
        for batch in train_dataloader:
            with te.fp8_autocast(enabled=True, fp8_recipe=fp8_recipe):
                with torch.autocast(device_type="cuda", dtype=torch.bfloat16):
                    batch = batch.to(device)
                    outputs = model(**batch)
                    loss = outputs.loss
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            lr_scheduler.step()

    trained_model_results = evaluate_model(model, eval_dataloader, METRIC, accelerator=accelerator)

    assert (
        trained_model_results["accuracy"] > base_model_results["accuracy"]
    ), f'Accuracy should be higher for the trained model: {trained_model_results["accuracy"]} > {base_model_results["accuracy"]}'
    assert (
        trained_model_results["f1"] > base_model_results["f1"]
    ), f'F1 score should be higher for the trained model: {trained_model_results["f1"]} > {base_model_results["f1"]}'

    return base_model_results, trained_model_results


def train_integration():
    FP8_RECIPE_KWARGS = {"fp8_format": "HYBRID", "amax_history_len": 32, "amax_compute_algo": "max"}
    kwargs_handlers = [FP8RecipeKwargs(backend="TE", **FP8_RECIPE_KWARGS)]
    AcceleratorState()._reset_state(True)
    accelerator = Accelerator(mixed_precision="fp8", kwargs_handlers=kwargs_handlers)
    set_seed(42)
    model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = get_training_utilities(
        MODEL_NAME, accelerator=accelerator
    )

    model, optimizer = accelerator.prepare(model, optimizer)
    base_model_results = evaluate_model(model, eval_dataloader, METRIC, accelerator=accelerator)
    model.train()

    for _ in range(2):
        for batch in train_dataloader:
            outputs = model(**batch)
            loss = outputs.loss
            accelerator.backward(loss)
            optimizer.step()
            optimizer.zero_grad()
            lr_scheduler.step()

    trained_model_results = evaluate_model(model, eval_dataloader, METRIC, accelerator=accelerator)

    assert (
        trained_model_results["accuracy"] > base_model_results["accuracy"]
    ), f'Accuracy should be higher for the trained model: {trained_model_results["accuracy"]} > {base_model_results["accuracy"]}'
    assert (
        trained_model_results["f1"] > base_model_results["f1"]
    ), f'F1 score should be higher for the trained model: {trained_model_results["f1"]} > {base_model_results["f1"]}'

    return base_model_results, trained_model_results


if __name__ == "__main__":
    baseline_not_trained, baseline_trained = train_baseline()
    accelerator_not_trained, accelerator_trained = train_integration()

    assert (
        baseline_not_trained["accuracy"] == accelerator_not_trained["accuracy"]
    ), f'Accuracy should be the same for the baseline and accelerator: {baseline_not_trained["accuracy"]} == {accelerator_not_trained["accuracy"]}'
    assert (
        baseline_not_trained["f1"] == accelerator_not_trained["f1"]
    ), f'F1 score should be the same for the baseline and accelerator: {baseline_not_trained["f1"]} == {accelerator_not_trained["f1"]}'
    assert (
        baseline_trained["accuracy"] == accelerator_trained["accuracy"]
    ), f'Accuracy should be the same for the baseline and accelerator: {baseline_trained["accuracy"]} == {accelerator_trained["accuracy"]}'
    assert (
        baseline_trained["f1"] == accelerator_trained["f1"]
    ), f'F1 score should be the same for the baseline and accelerator: {baseline_trained["f1"]} == {accelerator_trained["f1"]}'

    torch.distributed.destroy_process_group()
benchmarks/fp8/distrib_deepspeed.py (new file, 189 lines)

@@ -0,0 +1,189 @@
# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
This script tests to ensure that `accelerate` performs at the same level as raw `TransformersEngine`.

This particular script verifies this for DDP training.
"""
from unittest.mock import patch

import deepspeed
import evaluate
import torch
import transformer_engine.common.recipe as te_recipe
import transformer_engine.pytorch as te
from fp8_utils import evaluate_model, get_named_parameters, get_training_utilities
from transformer_engine.common.recipe import DelayedScaling

from accelerate import Accelerator, DeepSpeedPlugin
from accelerate.state import AcceleratorState
from accelerate.utils import FP8RecipeKwargs, set_seed
from accelerate.utils.transformer_engine import convert_model


MODEL_NAME = "bert-base-cased"
METRIC = evaluate.load("glue", "mrpc")


def train_baseline(zero_stage: int = 1):
    # This forces transformers to think Zero-3 Init should be used
    with patch("transformers.integrations.deepspeed.is_deepspeed_zero3_enabled") as mock:
        mock.return_value = zero_stage == 3
        set_seed(42)

        accelerator = Accelerator()
        model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = get_training_utilities(
            MODEL_NAME, accelerator=accelerator
        )

    # Convert the model to TE
    old_named_params = get_named_parameters(model)

    with torch.no_grad():
        convert_model(model)
    new_named_params = get_named_parameters(model)

    mapping = {p: new_named_params[n] for n, p in old_named_params.items()}
    for param_group in optimizer.param_groups:
        param_group["params"] = [mapping[p] for p in param_group["params"]]

    FP8_RECIPE_KWARGS = {"fp8_format": te_recipe.Format.HYBRID, "amax_history_len": 32, "amax_compute_algo": "max"}
    fp8_recipe = DelayedScaling(**FP8_RECIPE_KWARGS)

    import numpy as np

    config = {
        "train_batch_size": 32,
        "train_micro_batch_size_per_gpu": 16,
        "gradient_accumulation_steps": 1,
        "zero_optimization": {
            "stage": zero_stage,
            "offload_optimizer": {"device": "none", "nvme_path": None},
            "offload_param": {"device": "none", "nvme_path": None},
            "stage3_gather_16bit_weights_on_model_save": False,
        },
        "gradient_clipping": 1.0,
        "steps_per_print": np.inf,
        "bf16": {"enabled": True},
        "fp16": {"enabled": False},
        "zero_allow_untested_optimizer": True,
    }

    (
        model,
        optimizer,
        _,
        _,
    ) = deepspeed.initialize(
        model=model,
        optimizer=optimizer,
        config_params=config,
    )

    base_model_results = evaluate_model(model, eval_dataloader, METRIC, accelerator=accelerator)
    model.train()

    model_outputs = []
    data = []

    for _ in range(2):
        for batch in train_dataloader:
            with te.fp8_autocast(enabled=True, fp8_recipe=fp8_recipe):
                outputs = model(**batch)
            data.append(batch.to("cpu"))
            model_outputs.append(outputs.logits.to("cpu"))
            loss = outputs.loss
            model.backward(loss)
            model.step()
            for _ in range(accelerator.num_processes):
                lr_scheduler.step()

    trained_model_results = evaluate_model(model, eval_dataloader, METRIC, accelerator=accelerator)
    model.destroy()
    assert (
        trained_model_results["accuracy"] > base_model_results["accuracy"]
    ), f'Accuracy should be higher for the trained model: {trained_model_results["accuracy"]} > {base_model_results["accuracy"]}'
    assert (
        trained_model_results["f1"] > base_model_results["f1"]
    ), f'F1 score should be higher for the trained model: {trained_model_results["f1"]} > {base_model_results["f1"]}'

    return base_model_results, trained_model_results, model_outputs, data


def train_integration(zero_stage: int = 1):
    set_seed(42)
    FP8_RECIPE_KWARGS = {"fp8_format": "HYBRID", "amax_history_len": 32, "amax_compute_algo": "max"}
    kwargs_handlers = [FP8RecipeKwargs(backend="TE", **FP8_RECIPE_KWARGS)]
    AcceleratorState()._reset_state(True)
    deepspeed_plugin = DeepSpeedPlugin(
        zero_stage=zero_stage,
        zero3_init_flag=zero_stage == 3,
    )
    accelerator = Accelerator(
        mixed_precision="fp8", kwargs_handlers=kwargs_handlers, deepspeed_plugin=deepspeed_plugin
    )
    accelerator.state.deepspeed_plugin.deepspeed_config["train_micro_batch_size_per_gpu"] = 16

    model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = get_training_utilities(
        MODEL_NAME, accelerator=accelerator
    )

    model, optimizer, lr_scheduler = accelerator.prepare(model, optimizer, lr_scheduler)
    base_model_results = evaluate_model(model, eval_dataloader, METRIC, accelerator=accelerator)
    model.train()
    model_outputs = []
    data = []
    for _ in range(2):
        for batch in train_dataloader:
            outputs = model(**batch)
            data.append(batch.to("cpu"))
            model_outputs.append(outputs.logits.to("cpu"))
            loss = outputs.loss
            accelerator.backward(loss)
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()

    trained_model_results = evaluate_model(model, eval_dataloader, METRIC, accelerator=accelerator)
    model.destroy()
    assert (
        trained_model_results["accuracy"] > base_model_results["accuracy"]
    ), f'Accuracy should be higher for the trained model: {trained_model_results["accuracy"]} > {base_model_results["accuracy"]}'
    assert (
        trained_model_results["f1"] > base_model_results["f1"]
    ), f'F1 score should be higher for the trained model: {trained_model_results["f1"]} > {base_model_results["f1"]}'

    return base_model_results, trained_model_results, model_outputs, data


if __name__ == "__main__":
    # for zero_stage in [1, 2, 3]:
    zero_stage = 1
    baseline_not_trained, baseline_trained, baseline_outputs, baseline_data = train_baseline(zero_stage)
    accelerator_not_trained, accelerator_trained, accelerator_outputs, accelerator_data = train_integration(zero_stage)
    assert (
        baseline_not_trained["accuracy"] == accelerator_not_trained["accuracy"]
    ), f'ZERO stage {zero_stage}: Accuracy should be the same for the baseline and accelerator: {baseline_not_trained["accuracy"]} == {accelerator_not_trained["accuracy"]}'
    assert (
        baseline_not_trained["f1"] == accelerator_not_trained["f1"]
    ), f'ZERO stage {zero_stage}: F1 score should be the same for the baseline and accelerator: {baseline_not_trained["f1"]} == {accelerator_not_trained["f1"]}'
    assert (
        baseline_trained["accuracy"] == accelerator_trained["accuracy"]
    ), f'ZERO stage {zero_stage}: Accuracy should be the same for the baseline and accelerator: {baseline_trained["accuracy"]} == {accelerator_trained["accuracy"]}'
    assert (
        baseline_trained["f1"] == accelerator_trained["f1"]
    ), f'ZERO stage {zero_stage}: F1 score should be the same for the baseline and accelerator: {baseline_trained["f1"]} == {accelerator_trained["f1"]}'

    torch.distributed.destroy_process_group()
benchmarks/fp8/fp8_utils.py (new file, 115 lines)

@@ -0,0 +1,115 @@
# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch


def get_dataloaders(model_name: str, batch_size: int = 16):
    from datasets import load_dataset
    from torch.utils.data import DataLoader
    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    datasets = load_dataset("glue", "mrpc")

    def tokenize_function(examples):
        # max_length=None => use the model max length (it's actually the default)
        outputs = tokenizer(examples["sentence1"], examples["sentence2"], truncation=True, max_length=None)
        return outputs

    # Apply the method we just defined to all the examples in all the splits of the dataset
    # starting with the main process first:
    tokenized_datasets = datasets.map(
        tokenize_function,
        batched=True,
        remove_columns=["idx", "sentence1", "sentence2"],
    )

    # We also rename the 'label' column to 'labels' which is the expected name for labels by the models of the
    # transformers library
    tokenized_datasets = tokenized_datasets.rename_column("label", "labels")

    def collate_fn(examples):
        return tokenizer.pad(
            examples,
            padding="longest",
            pad_to_multiple_of=16,  # Specific for FP8
            return_tensors="pt",
        )

    # Instantiate dataloaders.
    train_dataloader = DataLoader(
        tokenized_datasets["train"], shuffle=True, collate_fn=collate_fn, batch_size=batch_size, drop_last=True
    )
    eval_dataloader = DataLoader(
        tokenized_datasets["validation"],
        shuffle=False,
        collate_fn=collate_fn,
        batch_size=16,
        drop_last=True,
    )

    return train_dataloader, eval_dataloader


def get_training_utilities(model_name: str, batch_size: int = 16, accelerator=None):
    """
    Returns a tuple of:
    - Model
    - Optimizer
    - Train dataloader (prepared)
    - Eval dataloader (prepared)
    - LR Scheduler
    Suitable for training on the MRPC dataset
    """
    from torch.optim import AdamW
    from transformers import AutoModelForSequenceClassification, get_linear_schedule_with_warmup

    from accelerate import Accelerator

    if accelerator is None:
        accelerator = Accelerator()
    model = AutoModelForSequenceClassification.from_pretrained(model_name)
    train_dataloader, eval_dataloader = get_dataloaders(model_name, batch_size)
    optimizer = AdamW(model.parameters(), lr=0.0001)
    lr_scheduler = get_linear_schedule_with_warmup(
        optimizer=optimizer,
        num_warmup_steps=100,
        num_training_steps=len(train_dataloader) * 2,
    )
    train_dataloader, eval_dataloader = accelerator.prepare(train_dataloader, eval_dataloader)
    return model, optimizer, train_dataloader, eval_dataloader, lr_scheduler


def get_named_parameters(model):
    """
    Same thing as `Accelerator.get_named_parameters` Returns a list of the named parameters of the model (extracted
    from parallel)
    """
    from accelerate.utils import extract_model_from_parallel

    model = extract_model_from_parallel(model)
    return {n: p for n, p in model.named_parameters()}


def evaluate_model(model, dataloader, metric, accelerator=None):
    "Turns model to .eval(), runs dataloader, calculates metric, then turns eval back on"
    model.eval()
    for step, batch in enumerate(dataloader):
        with torch.no_grad():
            outputs = model(**batch)
        predictions = outputs.logits.argmax(dim=-1)
        references = batch["labels"]
        if accelerator is not None and accelerator.num_processes > 1:
            predictions, references = accelerator.gather_for_metrics((predictions, batch["labels"]))
        metric.add_batch(predictions=predictions, references=references)
    return metric.compute()
benchmarks/fp8/fsdp.py (new file, 160 lines)

@@ -0,0 +1,160 @@
# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
This script tests to ensure that `accelerate` performs at the same level as raw `TransformersEngine`.

This particular script verifies this for FSDP training.
"""
from functools import partial

import evaluate
import torch
import transformer_engine.common.recipe as te_recipe
import transformer_engine.pytorch as te
from fp8_utils import evaluate_model, get_named_parameters, get_training_utilities
from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
from torch.distributed.fsdp import MixedPrecision
from torch.distributed.fsdp.wrap import transformer_auto_wrap_policy
from transformer_engine.common.recipe import DelayedScaling
from transformers.models.bert import BertLayer

from accelerate import Accelerator
from accelerate import FullyShardedDataParallelPlugin as FSDPPlugin
from accelerate.state import AcceleratorState
from accelerate.utils import FP8RecipeKwargs, set_seed
from accelerate.utils.transformer_engine import convert_model


MODEL_NAME = "bert-base-cased"
METRIC = evaluate.load("glue", "mrpc")

FSDP_WRAP_POLICY = partial(transformer_auto_wrap_policy, transformer_layer_cls={BertLayer})


def train_baseline():
    set_seed(42)
    model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = get_training_utilities(MODEL_NAME)
    accelerator = Accelerator()
    device = accelerator.device
    model.to(device)

    # Convert the model to TE
    old_named_params = get_named_parameters(model)

    with torch.no_grad():
        convert_model(model)

    FP8_RECIPE_KWARGS = {"fp8_format": te_recipe.Format.HYBRID, "amax_history_len": 32, "amax_compute_algo": "max"}
    fp8_recipe = DelayedScaling(**FP8_RECIPE_KWARGS)

    new_named_params = get_named_parameters(model)

    # Convert the model to FSDP
    model = FSDP(
        model,
        use_orig_params=True,
        mixed_precision=MixedPrecision(param_dtype=torch.bfloat16, reduce_dtype=torch.float32),
        auto_wrap_policy=FSDP_WRAP_POLICY,
    )

    mapping = {p: new_named_params[n] for n, p in old_named_params.items()}
    for param_group in optimizer.param_groups:
        param_group["params"] = [mapping[p] for p in param_group["params"]]

    base_model_results = evaluate_model(model, eval_dataloader, METRIC, accelerator=accelerator)
    model.train()

    for _ in range(2):
        for batch in train_dataloader:
            with te.fp8_autocast(enabled=True, fp8_recipe=fp8_recipe):
                with torch.autocast(device_type="cuda", dtype=torch.bfloat16):
                    batch = batch.to(device)
                    outputs = model(**batch)
                    loss = outputs.loss
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            lr_scheduler.step()

    trained_model_results = evaluate_model(model, eval_dataloader, METRIC, accelerator=accelerator)

    assert (
        trained_model_results["accuracy"] > base_model_results["accuracy"]
    ), f'Accuracy should be higher for the trained model: {trained_model_results["accuracy"]} > {base_model_results["accuracy"]}'
    assert (
        trained_model_results["f1"] > base_model_results["f1"]
    ), f'F1 score should be higher for the trained model: {trained_model_results["f1"]} > {base_model_results["f1"]}'

    return base_model_results, trained_model_results


def train_integration():
    FP8_RECIPE_KWARGS = {"fp8_format": "HYBRID", "amax_history_len": 32, "amax_compute_algo": "max"}
    kwargs_handlers = [FP8RecipeKwargs(backend="TE", **FP8_RECIPE_KWARGS)]
    AcceleratorState()._reset_state(True)
    fsdp_plugin = FSDPPlugin(
        auto_wrap_policy=FSDP_WRAP_POLICY,
        use_orig_params=True,
        mixed_precision_policy=MixedPrecision(param_dtype=torch.bfloat16, reduce_dtype=torch.float32),
    )
    accelerator = Accelerator(mixed_precision="fp8", fsdp_plugin=fsdp_plugin, kwargs_handlers=kwargs_handlers)
    set_seed(42)
    model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = get_training_utilities(
        MODEL_NAME, accelerator=accelerator
    )

    model, optimizer = accelerator.prepare(model, optimizer)
    base_model_results = evaluate_model(model, eval_dataloader, METRIC, accelerator=accelerator)
    model.train()

    for _ in range(2):
        for batch in train_dataloader:
            outputs = model(**batch)
            loss = outputs.loss
            accelerator.backward(loss)
            optimizer.step()
            optimizer.zero_grad()
            lr_scheduler.step()

    trained_model_results = evaluate_model(model, eval_dataloader, METRIC, accelerator=accelerator)

    assert (
        trained_model_results["accuracy"] > base_model_results["accuracy"]
    ), f'Accuracy should be higher for the trained model: {trained_model_results["accuracy"]} > {base_model_results["accuracy"]}'
    assert (
        trained_model_results["f1"] > base_model_results["f1"]
    ), f'F1 score should be higher for the trained model: {trained_model_results["f1"]} > {base_model_results["f1"]}'

    return base_model_results, trained_model_results


if __name__ == "__main__":
    baseline_not_trained, baseline_trained = train_baseline()
    accelerator_not_trained, accelerator_trained = train_integration()

    assert (
        baseline_not_trained["accuracy"] == accelerator_not_trained["accuracy"]
    ), f'Accuracy should be the same for the baseline and accelerator: {baseline_not_trained["accuracy"]} == {accelerator_not_trained["accuracy"]}'
    assert (
        baseline_not_trained["f1"] == accelerator_not_trained["f1"]
    ), f'F1 score should be the same for the baseline and accelerator: {baseline_not_trained["f1"]} == {accelerator_not_trained["f1"]}'
    assert (
        baseline_trained["accuracy"] == accelerator_trained["accuracy"]
    ), f'Accuracy should be the same for the baseline and accelerator: {baseline_trained["accuracy"]} == {accelerator_trained["accuracy"]}'
    assert (
        baseline_trained["f1"] == accelerator_trained["f1"]
    ), f'F1 score should be the same for the baseline and accelerator: {baseline_trained["f1"]} == {accelerator_trained["f1"]}'

    torch.distributed.destroy_process_group()
benchmarks/fp8/non_distributed.py (new file, 131 lines)

@@ -0,0 +1,131 @@
# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
This script tests to ensure that `accelerate` performs at the same level as raw `TransformersEngine`.

This particular script verifies this for single GPU training.
"""
import evaluate
import torch
import transformer_engine.common.recipe as te_recipe
import transformer_engine.pytorch as te
from fp8_utils import evaluate_model, get_named_parameters, get_training_utilities
from transformer_engine.common.recipe import DelayedScaling

from accelerate import Accelerator
from accelerate.state import AcceleratorState
from accelerate.utils import FP8RecipeKwargs, set_seed
from accelerate.utils.transformer_engine import convert_model


MODEL_NAME = "bert-base-cased"
METRIC = evaluate.load("glue", "mrpc")


def train_baseline():
    set_seed(42)
    model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = get_training_utilities(MODEL_NAME)

    # Convert the model to TE
    old_named_params = get_named_parameters(model)

    with torch.no_grad():
        convert_model(model)

    new_named_params = get_named_parameters(model)
    mapping = {p: new_named_params[n] for n, p in old_named_params.items()}
    for param_group in optimizer.param_groups:
        param_group["params"] = [mapping[p] for p in param_group["params"]]

    FP8_RECIPE_KWARGS = {"fp8_format": te_recipe.Format.HYBRID, "amax_history_len": 32, "amax_compute_algo": "max"}
    fp8_recipe = DelayedScaling(**FP8_RECIPE_KWARGS)

    model.to("cuda")
    base_model_results = evaluate_model(model, eval_dataloader, METRIC)
    model.train()

    for batch in train_dataloader:
        with te.fp8_autocast(enabled=True, fp8_recipe=fp8_recipe):
            with torch.autocast(device_type="cuda", dtype=torch.bfloat16):
                batch = batch.to("cuda")
                outputs = model(**batch)
                loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        lr_scheduler.step()

    trained_model_results = evaluate_model(model, eval_dataloader, METRIC)

    assert (
        trained_model_results["accuracy"] > base_model_results["accuracy"]
    ), f'Accuracy should be higher for the trained model: {trained_model_results["accuracy"]} > {base_model_results["accuracy"]}'
    assert (
        trained_model_results["f1"] > base_model_results["f1"]
    ), f'F1 score should be higher for the trained model: {trained_model_results["f1"]} > {base_model_results["f1"]}'

    return base_model_results, trained_model_results


def train_integration():
    FP8_RECIPE_KWARGS = {"fp8_format": "HYBRID", "amax_history_len": 32, "amax_compute_algo": "max"}
    kwargs_handlers = [FP8RecipeKwargs(backend="TE", **FP8_RECIPE_KWARGS)]
    AcceleratorState()._reset_state(True)
    accelerator = Accelerator(mixed_precision="fp8", kwargs_handlers=kwargs_handlers)
    set_seed(42)
    model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = get_training_utilities(
        MODEL_NAME, accelerator=accelerator
    )

    model, optimizer, lr_scheduler = accelerator.prepare(model, optimizer, lr_scheduler)
    base_model_results = evaluate_model(model, eval_dataloader, METRIC)
    model.train()

    for batch in train_dataloader:
        outputs = model(**batch)
        loss = outputs.loss
        accelerator.backward(loss)
        optimizer.step()
        optimizer.zero_grad()
        lr_scheduler.step()

    trained_model_results = evaluate_model(model, eval_dataloader, METRIC)

    assert (
        trained_model_results["accuracy"] > base_model_results["accuracy"]
    ), f'Accuracy should be higher for the trained model: {trained_model_results["accuracy"]} > {base_model_results["accuracy"]}'
    assert (
        trained_model_results["f1"] > base_model_results["f1"]
    ), f'F1 score should be higher for the trained model: {trained_model_results["f1"]} > {base_model_results["f1"]}'

    return base_model_results, trained_model_results


if __name__ == "__main__":
    baseline_not_trained, baseline_trained = train_baseline()
    accelerator_not_trained, accelerator_trained = train_integration()

    assert (
        baseline_not_trained["accuracy"] == accelerator_not_trained["accuracy"]
    ), f'Accuracy should be the same for the baseline and accelerator: {baseline_not_trained["accuracy"]} == {accelerator_not_trained["accuracy"]}'
    assert (
        baseline_not_trained["f1"] == accelerator_not_trained["f1"]
    ), f'F1 score should be the same for the baseline and accelerator: {baseline_not_trained["f1"]} == {accelerator_not_trained["f1"]}'
    assert (
        baseline_trained["accuracy"] == accelerator_trained["accuracy"]
    ), f'Accuracy should be the same for the baseline and accelerator: {baseline_trained["accuracy"]} == {accelerator_trained["accuracy"]}'
    assert (
        baseline_trained["f1"] == accelerator_trained["f1"]
    ), f'F1 score should be the same for the baseline and accelerator: {baseline_trained["f1"]} == {accelerator_trained["f1"]}'
@ -31,6 +31,8 @@
    title: Model quantization
  - local: usage_guides/tracking
    title: Experiment trackers
  - local: usage_guides/profiler
    title: Profiler
  - local: usage_guides/checkpoint
    title: Save and load training states
  - local: basic_tutorials/troubleshooting
@ -48,6 +50,8 @@
    title: Low precision (FP8) training
  - local: usage_guides/deepspeed
    title: DeepSpeed
  - local: usage_guides/ddp_comm_hook
    title: DDP Communication Hooks
  - local: usage_guides/fsdp
    title: Fully Sharded Data Parallelism
  - local: usage_guides/megatron_lm
@ -108,6 +112,8 @@
    title: Distributed inference with big models
  - local: package_reference/kwargs
    title: Kwargs handlers
  - local: package_reference/fp8
    title: FP8 Functionality
  - local: package_reference/utilities
    title: Utility functions and classes
  - local: package_reference/megatron_lm

@ -430,6 +430,17 @@ args = (model, "fp16", 42, 64)
|
||||
notebook_launcher(training_loop, args, num_processes=8)
|
||||
```
|
||||
|
||||
To launch the training process with elasticity, enabling fault tolerance, you can use the `elastic_launch` feature provided by PyTorch. This requires setting additional parameters such as `rdzv_backend` and `max_restarts`. Here is an example of how to use `notebook_launcher` with elastic capabilities:
|
||||
|
||||
```python
|
||||
notebook_launcher(
|
||||
training_loop,
|
||||
args,
|
||||
num_processes=2,
|
||||
max_restarts=3
|
||||
)
|
||||
```
|
||||
|
||||
As it's running, it will print the progress as well as state how many devices you ran on. This tutorial was run with two GPUs:
|
||||
|
||||
```python out
|
||||
|
||||
@ -15,7 +15,7 @@ rendered properly in your Markdown viewer.
|
||||
|
||||
# Moving between FSDP And DeepSpeed
|
||||
|
||||
🤗 Accelerate offers flexibility of training frameworks, by integrating two extremely powerful tools for distributed training, namely [Pytorch FSDP](../usage_guides/fsdp.md) and [Microsoft DeepSpeed](../usage_guides/deepspeed.md). The aim of this tutorial is to draw parallels, as well as to outline potential differences, to empower the user to switch seamlessly between these two frameworks.
|
||||
🤗 Accelerate offers flexibility of training frameworks, by integrating two extremely powerful tools for distributed training, namely [Pytorch FSDP](../usage_guides/fsdp) and [Microsoft DeepSpeed](../usage_guides/deepspeed). The aim of this tutorial is to draw parallels, as well as to outline potential differences, to empower the user to switch seamlessly between these two frameworks.
|
||||
|
||||
<Tip>
|
||||
|
||||
@ -189,4 +189,4 @@ Framework | Model Loading (`torch_dtype`) | Mixed Precision | Preparation (Local
|
||||
--|--|--|--|--|--
|
||||
FSDP | bf16 | default (none) | bf16 | bf16 | bf16
|
||||
FSDP | bf16 | bf16 | fp32 | bf16 | fp32
|
||||
DeepSpeed | bf16 | bf16 | fp32 | bf16 | fp32
|
||||
DeepSpeed | bf16 | bf16 | fp32 | bf16 | fp32
|
||||
|
||||
@ -18,7 +18,7 @@ rendered properly in your Markdown viewer.
|
||||
The release of new kinds of hardware led to the emergence of new training paradigms that better utilize them. Currently, this is in the form of training
|
||||
in 8-bit precision using packages such as [TransformersEngine](https://github.com/NVIDIA/TransformerEngine) (TE) or [MS-AMP](https://github.com/Azure/MS-AMP/tree/main).
|
||||
|
||||
For an introduction to the topics discussed today, we recommend reviewing the [low-precision usage guide](../usage_guides/low_precision_training.md) as this documentation will reference it regularly.
|
||||
For an introduction to the topics discussed today, we recommend reviewing the [low-precision usage guide](../usage_guides/low_precision_training) as this documentation will reference it regularly.
|
||||
|
||||
## A Quick Chart
|
||||
|
||||
|
||||
BIN docs/source/imgs/profile_export.png Normal file (binary file not shown, 105 KiB)
@ -145,10 +145,11 @@ values. They can also be passed in manually.
|
||||
|
||||
The following arguments are useful for fine-tuning how available hardware should be used
|
||||
|
||||
* `--mixed_precision {no,fp16,bf16}` (`str`) -- Whether or not to use mixed precision training. Choose between FP16 and BF16 (bfloat16) training. BF16 training is only supported on Nvidia Ampere GPUs and PyTorch 1.10 or later.
|
||||
* `--mixed_precision {no,fp16,bf16,fp8}` (`str`) -- Whether or not to use mixed precision training. Choose between FP16 and BF16 (bfloat16) training. BF16 training is only supported on Nvidia Ampere GPUs and PyTorch 1.10 or later.
|
||||
* `--num_processes NUM_PROCESSES` (`int`) -- The total number of processes to be launched in parallel.
|
||||
* `--num_machines NUM_MACHINES` (`int`) -- The total number of machines used in this training.
|
||||
* `--num_cpu_threads_per_process NUM_CPU_THREADS_PER_PROCESS` (`int`) -- The number of CPU threads per process. Can be tuned for optimal performance.
|
||||
* `--enable_cpu_affinity` (`bool`) -- Whether or not CPU affinity and balancing should be enabled. Currently only supported on NVIDIA hardware.
|
||||
|
||||
**Training Paradigm Arguments**:
|
||||
|
||||
@ -165,19 +166,26 @@ The following arguments are only useful when `multi_gpu` is passed or multi-gpu
|
||||
|
||||
* `--gpu_ids` (`str`) -- What GPUs (by id) should be used for training on this machine as a comma-separated list
|
||||
* `--same_network` (`bool`) -- Whether all machines used for multinode training exist on the same local network.
|
||||
* `--machine_rank MACHINE_RANK` (`int`) -- The rank of the machine on which this script is launched.
|
||||
* `--main_process_ip MAIN_PROCESS_IP` (`str`) -- The IP address of the machine of rank 0.
|
||||
* `--main_process_port MAIN_PROCESS_PORT` (`int`) -- The port to use to communicate with the machine of rank 0.
|
||||
* `--rdzv_backend` (`str`) -- The rendezvous method to use, such as "static" or "c10d"
|
||||
* `--machine_rank` (`int`) -- The rank of the machine on which this script is launched.
|
||||
* `--main_process_ip` (`str`) -- The IP address of the machine of rank 0.
|
||||
* `--main_process_port` (`int`) -- The port to use to communicate with the machine of rank 0.
|
||||
* `-t`, `--tee` (`str`) -- Tee std streams into a log file and also to console.
|
||||
* `--log_dir` (`str`) -- Base directory to use for log files when using torchrun/torch.distributed.run as launcher. Use with `--tee` to redirect std streams into log files.
|
||||
* `--role` (`str`) -- User-defined role for the workers.
|
||||
* `--rdzv_backend` (`str`) -- The rendezvous method to use, such as 'static' (the default) or 'c10d'
|
||||
* `--rdzv_conf` (`str`) -- Additional rendezvous configuration (<key1>=<value1>,<key2>=<value2>,...).
|
||||
* `--max_restarts` (`int`) -- Maximum number of worker group restarts before failing.
|
||||
* `--monitor_interval` (`float`) -- Interval, in seconds, to monitor the state of workers.
|
||||
* `--monitor_interval` (`int`) -- Interval, in seconds, to monitor the state of workers.
|
||||
|
||||
**TPU Arguments**:
|
||||
|
||||
The following arguments are only useful when `tpu` is passed or TPU training is configured through `accelerate config`:
|
||||
|
||||
* `--main_training_function MAIN_TRAINING_FUNCTION` (`str`) -- The name of the main function to be executed in your script.
|
||||
* `--tpu_cluster` (`bool`) -- Whether to use a GCP TPU pod for training.
|
||||
* `--tpu_use_sudo` (`bool`) -- Whether to use `sudo` when running the TPU training script in each pod.
|
||||
* `--vm` (`str`) -- List of single Compute VM instance names. If not provided we assume usage of instance groups. For TPU pods.
|
||||
* `--env` (`str`) -- List of environment variables to set on the Compute VM instances. For TPU pods.
|
||||
* `--main_training_function` (`str`) -- The name of the main function to be executed in your script (only for TPU training).
|
||||
* `--downcast_bf16` (`bool`) -- Whether when using bf16 precision on TPUs if both float and double tensors are cast to bfloat16 or if double tensors remain as float32.
|
||||
|
||||
**DeepSpeed Arguments**:
|
||||
@ -188,6 +196,7 @@ The following arguments are only useful when `use_deepspeed` is passed or `deeps
|
||||
* `--zero_stage` (`int`) -- DeepSpeed's ZeRO optimization stage.
|
||||
* `--offload_optimizer_device` (`str`) -- Decides where (none|cpu|nvme) to offload optimizer states.
|
||||
* `--offload_param_device` (`str`) -- Decides where (none|cpu|nvme) to offload parameters.
|
||||
* `--offload_optimizer_nvme_path` (`str`) -- Decides Nvme Path to offload optimizer states.
|
||||
* `--gradient_accumulation_steps` (`int`) -- Number of gradient accumulation steps used in your training script.
|
||||
* `--gradient_clipping` (`float`) -- Gradient clipping value used in your training script.
|
||||
* `--zero3_init_flag` (`str`) -- Decides Whether (true|false) to enable `deepspeed.zero.Init` for constructing massive models. Only applicable with DeepSpeed ZeRO Stage-3.
|
||||
@ -196,6 +205,7 @@ The following arguments are only useful when `use_deepspeed` is passed or `deeps
|
||||
* `--deepspeed_exclusion_filter` (`str`) -- DeepSpeed exclusion filter string when using a multi-node setup.
|
||||
* `--deepspeed_inclusion_filter` (`str`) -- DeepSpeed inclusion filter string when using a multi-node setup.
|
||||
* `--deepspeed_multinode_launcher` (`str`) -- DeepSpeed multi-node launcher to use.
|
||||
* `--deepspeed_moe_layer_cls_names` (`str`) -- comma-separated list of transformer MoE layer class names (case-sensitive) to wrap, e.g., `MixtralSparseMoeBlock`, `Qwen2MoeSparseMoeBlock`, `JetMoEAttention,JetMoEBlock`
|
||||
|
||||
**Fully Sharded Data Parallelism Arguments**:
|
||||
|
||||
@ -210,8 +220,9 @@ The following arguments are only useful when `use_fsdp` is passed or Fully Shard
|
||||
* `--fsdp_state_dict_type` (`str`) -- FSDP's state dict type.
|
||||
* `--fsdp_forward_prefetch` (`str`) -- FSDP forward prefetch.
|
||||
* `--fsdp_use_orig_params` (`str`) -- If True, allows non-uniform `requires_grad` mixed in a FSDP unit.
|
||||
* `--fsdp_cpu_ram_efficient_loading` (`str`) - If true, only the first process loads the pretrained model checkpoint while all other processes have empty weights. When using this, `--fsdp_sync_module_states` needs to be True.
|
||||
* `--fsdp_sync_module_states` (`str`) - If true, each individually wrapped FSDP unit will broadcast module parameters from rank 0.
|
||||
* `--fsdp_cpu_ram_efficient_loading` (`str`) -- If true, only the first process loads the pretrained model checkpoint while all other processes have empty weights. When using this, `--fsdp_sync_module_states` needs to be True.
|
||||
* `--fsdp_sync_module_states` (`str`) -- If true, each individually wrapped FSDP unit will broadcast module parameters from rank 0.
|
||||
* `--fsdp_activation_checkpointing` (`bool`) -- Decides Whether intermediate activations are freed during the forward pass, and a checkpoint is left as a placeholder
|
||||
|
||||
**Megatron-LM Arguments**:
|
||||
|
||||
@ -225,6 +236,18 @@ The following arguments are only useful when `use_megatron_lm` is passed or Mega
|
||||
* `--megatron_lm_use_distributed_optimizer` (``) -- Decides Whether (true|false) to use distributed optimizer which shards optimizer state and gradients across Data Parallel (DP) ranks.
|
||||
* `--megatron_lm_gradient_clipping` (``) -- Megatron-LM's gradient clipping value based on global L2 Norm (0 to disable).
|
||||
|
||||
**FP8 Arguments**:
|
||||
|
||||
* `--fp8_backend` (`str`) -- Choose a backend to train with FP8 (`te` or `msamp`)
|
||||
* `--fp8_use_autocast_during_eval` (`bool`) -- Whether to use FP8 autocast during eval mode (useful only when `--fp8_backend=te` is passed). Generally better metrics are found when this is not passed.
|
||||
* `--fp8_margin` (`int`) -- The margin to use for the gradient scaling (useful only when `--fp8_backend=te` is passed).
|
||||
* `--fp8_interval` (`int`) -- The interval to use for how often the scaling factor is recomputed (useful only when `--fp8_backend=te` is passed).
|
||||
* `--fp8_format` (`str`) -- The format to use for the FP8 recipe (useful only when `--fp8_backend=te` is passed).
|
||||
* `--fp8_amax_history_len` (`int`) -- The length of the history to use for the scaling factor computation (useful only when `--fp8_backend=te` is passed).
|
||||
* `--fp8_amax_compute_algo` (`str`) -- The algorithm to use for the scaling factor computation. (useful only when `--fp8_backend=te` is passed).
|
||||
* `--fp8_override_linear_precision` (`Tuple[bool, bool, bool]`) -- Whether or not to execute `fprop`, `dgrad`, and `wgrad` GEMMS in higher precision.
|
||||
* `--fp8_opt_level` (`str`) -- What level of 8-bit collective communication should be used with MS-AMP (useful only when `--fp8_backend=msamp` is passed)
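
For orientation, the TE flags above map onto the fields of [`utils.FP8RecipeKwargs`]. Below is a minimal sketch of the programmatic equivalent; the values shown are illustrative only, not a recommended recipe:

```python
# Sketch only: these kwargs mirror the `--fp8_*` flags listed above.
from accelerate import Accelerator
from accelerate.utils import FP8RecipeKwargs

kwargs = [
    FP8RecipeKwargs(
        backend="TE",
        margin=0,
        interval=1,
        fp8_format="HYBRID",
        amax_history_len=32,
        amax_compute_algo="max",
    )
]
accelerator = Accelerator(mixed_precision="fp8", kwargs_handlers=kwargs)
```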
|
||||
|
||||
**AWS SageMaker Arguments**:
|
||||
|
||||
The following arguments are only useful when training in SageMaker
|
||||
|
||||
28 docs/source/package_reference/fp8.md Normal file
@ -0,0 +1,28 @@
|
||||
<!--Copyright 2021 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
|
||||
⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
|
||||
rendered properly in your Markdown viewer.
|
||||
-->
|
||||
|
||||
# FP8 Functionality
|
||||
|
||||
Below are functions and classes related to the underlying FP8 implementation.
|
||||
|
||||
[[autodoc]] utils.FP8RecipeKwargs
|
||||
|
||||
[[autodoc]] utils.convert_model
|
||||
|
||||
[[autodoc]] utils.has_transformer_engine_layers
|
||||
|
||||
[[autodoc]] utils.contextual_fp8_autocast
|
||||
|
||||
[[autodoc]] utils.apply_fp8_autowrap
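
As a quick orientation, here is a minimal sketch (not part of the reference itself) of how these utilities fit together; it assumes TransformerEngine is installed and a supported GPU is available:

```python
# Sketch: swap supported layers of a plain PyTorch model for their
# TransformerEngine equivalents, then verify the conversion happened.
import torch

from accelerate.utils import convert_model, has_transformer_engine_layers

model = torch.nn.Sequential(torch.nn.Linear(64, 64), torch.nn.GELU(), torch.nn.Linear(64, 8))
with torch.no_grad():
    convert_model(model)
assert has_transformer_engine_layers(model)
```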
|
||||
@ -15,4 +15,10 @@ rendered properly in your Markdown viewer.
|
||||
|
||||
# Utilities for Fully Sharded Data Parallelism
|
||||
|
||||
[[autodoc]] utils.FullyShardedDataParallelPlugin
|
||||
[[autodoc]] utils.enable_fsdp_ram_efficient_loading
|
||||
|
||||
[[autodoc]] utils.disable_fsdp_ram_efficient_loading
|
||||
|
||||
[[autodoc]] utils.merge_fsdp_weights
|
||||
|
||||
[[autodoc]] utils.FullyShardedDataParallelPlugin
|
||||
|
||||
@ -30,6 +30,10 @@ related to distributed training or mixed precision are created.
|
||||
|
||||
[[autodoc]] utils.FP8RecipeKwargs
|
||||
|
||||
## ProfileKwargs
|
||||
|
||||
[[autodoc]] utils.ProfileKwargs
|
||||
|
||||
## GradScalerKwargs
|
||||
|
||||
[[autodoc]] GradScalerKwargs
|
||||
|
||||
325 docs/source/usage_guides/ddp_comm_hook.md Normal file
@ -0,0 +1,325 @@
|
||||
<!--
|
||||
Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
|
||||
⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
|
||||
rendered properly in your Markdown viewer.
|
||||
-->
|
||||
|
||||
# DDP Communication Hooks
|
||||
|
||||
Distributed Data Parallel (DDP) communication hooks provide a generic interface to control how gradients are communicated across workers by overriding the vanilla allreduce in `DistributedDataParallel`. A few built-in communication hooks are provided, and users can easily apply any of these hooks to optimize communication.
|
||||
|
||||
|
||||
- **FP16 Compression Hook**: Compresses gradients by casting them to half-precision floating-point format (`torch.float16`), reducing communication overhead.
|
||||
- **BF16 Compression Hook**: Similar to FP16, but uses the Brain Floating Point format (`torch.bfloat16`), which can be more efficient on certain hardware.
|
||||
- **PowerSGD Hook**: An advanced gradient compression algorithm that provides high compression rates and can accelerate bandwidth-bound distributed training.
|
||||
|
||||
In this tutorial, you will see how to quickly set up DDP communication hooks and perform training with the utilities provided in 🤗 Accelerate, which can be as simple as adding just one new line of code! This demonstrates how to use DDP communication hooks to optimize gradient communication in distributed training with the 🤗 Accelerate library.
|
||||
|
||||
## FP16 Compression Hook
|
||||
|
||||
<hfoptions id="fp16">
|
||||
<hfoption id="PyTorch">
|
||||
|
||||
```python
|
||||
import torch
|
||||
from torch.nn.parallel import DistributedDataParallel as DDP
|
||||
from torch.distributed.algorithms.ddp_comm_hooks import default_hooks
|
||||
|
||||
class MyModel(torch.nn.Module):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.layer = torch.nn.Linear(10, 10)
|
||||
|
||||
def forward(self, x):
|
||||
return self.layer(x)
|
||||
|
||||
model = MyModel()
|
||||
model = DDP(model, device_ids=[torch.cuda.current_device()])
|
||||
model.register_comm_hook(state=None, hook=default_hooks.fp16_compress_hook)
|
||||
|
||||
# Training loop
|
||||
for data, targets in data_loader:
|
||||
outputs = model(data)
|
||||
loss = criterion(outputs, targets)
|
||||
loss.backward()
|
||||
optimizer.step()
|
||||
optimizer.zero_grad()
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
<hfoption id="Accelerate">
|
||||
|
||||
```python
|
||||
from accelerate import Accelerator, DDPCommunicationHookType, DistributedDataParallelKwargs
|
||||
import torch
|
||||
|
||||
class MyModel(torch.nn.Module):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.layer = torch.nn.Linear(10, 10)
|
||||
|
||||
def forward(self, x):
|
||||
return self.layer(x)
|
||||
|
||||
# DDP Communication Hook setup
|
||||
ddp_kwargs = DistributedDataParallelKwargs(comm_hook=DDPCommunicationHookType.FP16)
|
||||
accelerator = Accelerator(kwargs_handlers=[ddp_kwargs])
|
||||
|
||||
model = MyModel()
|
||||
optimizer = torch.optim.Adam(model.parameters())
|
||||
data_loader = DataLoader(dataset, batch_size=16)
|
||||
|
||||
model, optimizer, data_loader = accelerator.prepare(model, optimizer, data_loader)
|
||||
|
||||
# Training loop
|
||||
for data, targets in data_loader:
|
||||
outputs = model(data)
|
||||
loss = criterion(outputs, targets)
|
||||
accelerator.backward(loss)
|
||||
optimizer.step()
|
||||
optimizer.zero_grad()
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
</hfoptions>
|
||||
|
||||
### BF16 Compression Hook
|
||||
|
||||
<Tip warning={true}>
|
||||
|
||||
BF16 Compression Hook API is experimental, and it requires NCCL version later than 2.9.6.
|
||||
|
||||
</Tip>
|
||||
|
||||
<hfoptions id="bf16">
|
||||
<hfoption id="PyTorch">
|
||||
|
||||
```python
|
||||
import torch
|
||||
from torch.nn.parallel import DistributedDataParallel as DDP
|
||||
from torch.distributed.algorithms.ddp_comm_hooks import default_hooks
|
||||
|
||||
class MyModel(torch.nn.Module):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.layer = torch.nn.Linear(10, 10)
|
||||
|
||||
def forward(self, x):
|
||||
return self.layer(x)
|
||||
|
||||
model = MyModel()
|
||||
model = DDP(model, device_ids=[torch.cuda.current_device()])
|
||||
model.register_comm_hook(state=None, hook=default_hooks.bf16_compress_hook)
|
||||
|
||||
# Training loop
|
||||
for data, targets in data_loader:
|
||||
outputs = model(data)
|
||||
loss = criterion(outputs, targets)
|
||||
loss.backward()
|
||||
optimizer.step()
|
||||
optimizer.zero_grad()
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
<hfoption id="Accelerate">
|
||||
|
||||
```python
|
||||
from accelerate import Accelerator, DDPCommunicationHookType, DistributedDataParallelKwargs
|
||||
import torch
|
||||
|
||||
class MyModel(torch.nn.Module):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.layer = torch.nn.Linear(10, 10)
|
||||
|
||||
def forward(self, x):
|
||||
return self.layer(x)
|
||||
|
||||
# DDP Communication Hook setup
|
||||
ddp_kwargs = DistributedDataParallelKwargs(comm_hook=DDPCommunicationHookType.BF16)
|
||||
accelerator = Accelerator(kwargs_handlers=[ddp_kwargs])
|
||||
|
||||
model = MyModel()
|
||||
optimizer = torch.optim.Adam(model.parameters())
|
||||
data_loader = DataLoader(dataset, batch_size=16)
|
||||
|
||||
model, optimizer, data_loader = accelerator.prepare(model, optimizer, data_loader)
|
||||
|
||||
# Training loop
|
||||
for data, targets in data_loader:
|
||||
outputs = model(data)
|
||||
loss = criterion(outputs, targets)
|
||||
accelerator.backward(loss)
|
||||
optimizer.step()
|
||||
optimizer.zero_grad()
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
</hfoptions>
|
||||
|
||||
### PowerSGD Hook
|
||||
|
||||
<Tip warning={true}>
|
||||
|
||||
PowerSGD typically requires extra memory of the same size as the model’s gradients to enable error feedback, which can compensate for biased compressed communication and improve accuracy.
|
||||
|
||||
</Tip>
|
||||
|
||||
<hfoptions id="powerSGD">
|
||||
<hfoption id="PyTorch">
|
||||
|
||||
```python
|
||||
import torch
|
||||
from torch.nn.parallel import DistributedDataParallel as DDP
|
||||
from torch.distributed.algorithms.ddp_comm_hooks import powerSGD_hook
|
||||
|
||||
class MyModel(torch.nn.Module):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.layer = torch.nn.Linear(10, 10)
|
||||
|
||||
def forward(self, x):
|
||||
return self.layer(x)
|
||||
|
||||
model = MyModel()
|
||||
model = DDP(model, device_ids=[torch.cuda.current_device()])
|
||||
state = powerSGD_hook.PowerSGDState(process_group=None)
|
||||
model.register_comm_hook(state=state, hook=powerSGD_hook.powerSGD_hook)
|
||||
|
||||
# Training loop
|
||||
for data, targets in data_loader:
|
||||
outputs = model(data)
|
||||
loss = criterion(outputs, targets)
|
||||
loss.backward()
|
||||
optimizer.step()
|
||||
optimizer.zero_grad()
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
<hfoption id="Accelerate">
|
||||
|
||||
```python
|
||||
from accelerate import Accelerator, DDPCommunicationHookType, DistributedDataParallelKwargs
|
||||
import torch
|
||||
|
||||
class MyModel(torch.nn.Module):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.layer = torch.nn.Linear(10, 10)
|
||||
|
||||
def forward(self, x):
|
||||
return self.layer(x)
|
||||
|
||||
# DDP Communication Hook setup
|
||||
ddp_kwargs = DistributedDataParallelKwargs(comm_hook=DDPCommunicationHookType.POWER_SGD)
|
||||
accelerator = Accelerator(kwargs_handlers=[ddp_kwargs])
|
||||
|
||||
model = MyModel()
|
||||
optimizer = torch.optim.Adam(model.parameters())
|
||||
data_loader = DataLoader(dataset, batch_size=16)
|
||||
|
||||
model, optimizer, data_loader = accelerator.prepare(model, optimizer, data_loader)
|
||||
|
||||
# Training loop
|
||||
for data, targets in data_loader:
|
||||
outputs = model(data)
|
||||
loss = criterion(outputs, targets)
|
||||
accelerator.backward(loss)
|
||||
optimizer.step()
|
||||
optimizer.zero_grad()
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
</hfoptions>
|
||||
|
||||
## DDP Communication Hooks utilities
|
||||
|
||||
There are two additional utilities for supporting optional functionalities with the communication hooks.
|
||||
|
||||
### comm_wrapper
|
||||
|
||||
`comm_wrapper` is an option to wrap a communication hook with additional functionality. For example, it can be used to combine FP16 compression with other communication strategies. Currently supported wrappers are `no`, `fp16`, and `bf16`.
|
||||
|
||||
```python
|
||||
from accelerate import Accelerator, DDPCommunicationHookType, DistributedDataParallelKwargs
|
||||
import torch
|
||||
|
||||
class MyModel(torch.nn.Module):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.layer = torch.nn.Linear(10, 10)
|
||||
|
||||
def forward(self, x):
|
||||
return self.layer(x)
|
||||
|
||||
# DDP Communication Hook setup
|
||||
ddp_kwargs = DistributedDataParallelKwargs(
|
||||
comm_hook=DDPCommunicationHookType.POWER_SGD,
|
||||
comm_wrapper=DDPCommunicationHookType.FP16
|
||||
)
|
||||
accelerator = Accelerator(kwargs_handlers=[ddp_kwargs])
|
||||
|
||||
model = MyModel()
|
||||
optimizer = torch.optim.Adam(model.parameters())
|
||||
data_loader = DataLoader(dataset, batch_size=16)
|
||||
|
||||
model, optimizer, data_loader = accelerator.prepare(model, optimizer, data_loader)
|
||||
|
||||
# Training loop
|
||||
for data, targets in data_loader:
|
||||
outputs = model(data)
|
||||
loss = criterion(outputs, targets)
|
||||
accelerator.backward(loss)
|
||||
optimizer.step()
|
||||
optimizer.zero_grad()
|
||||
```
|
||||
|
||||
### comm_state_option
|
||||
|
||||
`comm_state_option` allows you to pass additional state information required by certain communication hooks. This is particularly useful for stateful hooks like `PowerSGD`, which require maintaining hyperparameters and internal states across training steps. Below is an example showcasing the use of `comm_state_option` with the `PowerSGD` hook.
|
||||
|
||||
```python
|
||||
from accelerate import Accelerator, DDPCommunicationHookType, DistributedDataParallelKwargs
|
||||
import torch
|
||||
|
||||
class MyModel(torch.nn.Module):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.layer = torch.nn.Linear(10, 10)
|
||||
|
||||
def forward(self, x):
|
||||
return self.layer(x)
|
||||
|
||||
# DDP Communication Hook setup
|
||||
ddp_kwargs = DistributedDataParallelKwargs(
|
||||
comm_hook=DDPCommunicationHookType.POWER_SGD,
|
||||
comm_state_option={"matrix_approximation_rank": 2}
|
||||
)
|
||||
accelerator = Accelerator(kwargs_handlers=[ddp_kwargs])
|
||||
|
||||
model = MyModel()
|
||||
optimizer = torch.optim.Adam(model.parameters())
|
||||
data_loader = DataLoader(dataset, batch_size=16)
|
||||
|
||||
model, optimizer, data_loader = accelerator.prepare(model, optimizer, data_loader)
|
||||
|
||||
# Training loop
|
||||
for data, targets in data_loader:
|
||||
outputs = model(data)
|
||||
loss = criterion(outputs, targets)
|
||||
accelerator.backward(loss)
|
||||
optimizer.step()
|
||||
optimizer.zero_grad()
|
||||
```
|
||||
|
||||
For more advanced usage and additional hooks, refer to the [PyTorch DDP Communication Hooks documentation](https://pytorch.org/docs/stable/ddp_comm_hooks.html).
|
||||
@ -433,7 +433,7 @@ Only the `auto` fields specified in above examples are handled by `prepare` meth
|
||||
The `auto` values are calculated as:
|
||||
|
||||
- `reduce_bucket_size`: `hidden_size * hidden_size`
|
||||
- `stage3_prefetch_bucket_size`: `0.9 * hidden_size * hidden_size`
|
||||
- `stage3_prefetch_bucket_size`: `int(0.9 * hidden_size * hidden_size)`
|
||||
- `stage3_param_persistence_threshold`: `10 * hidden_size`
|
||||
|
||||
For the `auto` feature to work for these 3 config entries - Accelerate will use `model.config.hidden_size` or `max(model.config.hidden_sizes)` as `hidden_size`. If neither of these is available, the launching will fail and you will have to set these 3 config entries manually. Remember the first 2 config entries are the communication buffers - the larger they are the more efficient the comms will be, and the larger they are the more GPU memory they will consume, so it's a tunable performance trade-off.
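
As a concrete illustration (a sketch of the arithmetic only; Accelerate performs this for you), the `auto` values can be derived from the model config like so:

```python
# Sketch of how the three `auto` values above are derived from `hidden_size`.
from transformers import AutoConfig

config = AutoConfig.from_pretrained("bert-base-cased")
hidden_size = getattr(config, "hidden_size", None) or max(config.hidden_sizes)

auto_values = {
    "reduce_bucket_size": hidden_size * hidden_size,
    "stage3_prefetch_bucket_size": int(0.9 * hidden_size * hidden_size),
    "stage3_param_persistence_threshold": 10 * hidden_size,
}
print(auto_values)  # 589824, 530841, and 7680 for hidden_size=768
```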
|
||||
@ -733,6 +733,6 @@ have any problems or questions with regards to DeepSpeed usage, please, file an
|
||||
|
||||
<Tip>
|
||||
|
||||
For those interested in the similarities and differences between FSDP and DeepSpeed, please check out the [concept guide here](../concept_guides/fsdp_and_deepspeed.md)!
|
||||
For those interested in the similarities and differences between FSDP and DeepSpeed, please check out the [concept guide here](../concept_guides/fsdp_and_deepspeed)!
|
||||
|
||||
</Tip>
|
||||
@ -161,6 +161,22 @@ When using transformers `save_pretrained`, pass `state_dict=accelerator.get_stat
|
||||
|
||||
You can then pass `state` into the `save_pretrained` method. There are several modes for `StateDictType` and `FullStateDictConfig` that you can use to control the behavior of `state_dict`. For more information, see the [PyTorch documentation](https://pytorch.org/docs/stable/fsdp.html).
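
For example, a minimal sketch of saving an FSDP-prepared 🤗 Transformers model this way (assuming `model` was prepared with the `Accelerator`) is:

```py
# Sketch: gather the full state dict on FSDP, then hand it to `save_pretrained`.
accelerator.wait_for_everyone()
state_dict = accelerator.get_state_dict(model)
unwrapped_model = accelerator.unwrap_model(model)
unwrapped_model.save_pretrained(
    "output_dir",
    is_main_process=accelerator.is_main_process,
    save_function=accelerator.save,
    state_dict=state_dict,
)
```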
|
||||
|
||||
If you choose to use `StateDictType.SHARDED_STATE_DICT`, the weights of the model during `Accelerator.save_state` will be split into `n` files for each sub-split on the model. To merge them back into
|
||||
a single dictionary to load back into the model later after training, you can use the `merge_fsdp_weights` utility:
|
||||
|
||||
```py
|
||||
from accelerate.utils import merge_fsdp_weights
|
||||
|
||||
# Our weights are saved usually in a `pytorch_model_fsdp_{model_number}` folder
|
||||
merge_fsdp_weights("pytorch_model_fsdp_0", "output_path", safe_serialization=True)
|
||||
```
|
||||
The final output will then either be saved to `model.safetensors` or `pytorch_model.bin` (if `safe_serialization=False` is passed).
|
||||
|
||||
This can also be called using the CLI:
|
||||
```bash
|
||||
accelerate merge-weights pytorch_model_fsdp_0/ output_path
|
||||
```
|
||||
|
||||
|
||||
## Mapping between FSDP sharding strategies and DeepSpeed ZeRO Stages
|
||||
* `FULL_SHARD` maps to the DeepSpeed `ZeRO Stage-3`. Shards optimizer states, gradients and parameters.
|
||||
@ -179,6 +195,6 @@ For more information on these options, please refer to the PyTorch [FullySharded
|
||||
|
||||
<Tip>
|
||||
|
||||
For those interested in the similarities and differences between FSDP and DeepSpeed, please check out the [concept guide here](../concept_guides/fsdp_and_deepspeed.md)!
|
||||
For those interested in the similarities and differences between FSDP and DeepSpeed, please check out the [concept guide here](../concept_guides/fsdp_and_deepspeed)!
|
||||
|
||||
</Tip>
|
||||
@ -19,7 +19,7 @@ rendered properly in your Markdown viewer.
|
||||
|
||||
## What training on FP8 means
|
||||
|
||||
To explore more of the nitty-gritty in training in FP8 with PyTorch and 🤗 Accelerate, check out the [concept_guide](../concept_guides/low_precision_training.md) on why this can be difficult. But essentially rather than training in BF16, some (or all) aspects of training a model can be performed using 8 bits instead of 16. The challenge is doing so without degrading final performance.
|
||||
To explore more of the nitty-gritty in training in FP8 with PyTorch and 🤗 Accelerate, check out the [concept_guide](../concept_guides/low_precision_training) on why this can be difficult. But essentially rather than training in BF16, some (or all) aspects of training a model can be performed using 8 bits instead of 16. The challenge is doing so without degrading final performance.
|
||||
|
||||
This is only enabled on specific NVIDIA hardware, namely:
|
||||
|
||||
@ -39,7 +39,7 @@ from accelerate import Accelerator
|
||||
accelerator = Accelerator(mixed_precision="fp8")
|
||||
```
|
||||
|
||||
By default, if `MS-AMP` is available in your environment, 🤗 Accelerate will automatically utilize it as a backend. To specify it yourself (and customize other parts of the FP8 mixed precision setup), you can utilize the [`utils.FP8RecipeKwargs`]:
|
||||
By default, if `MS-AMP` is available in your environment, 🤗 Accelerate will automatically utilize it as a backend. To specify it yourself (and customize other parts of the FP8 mixed precision setup), you can utilize the [`utils.FP8RecipeKwargs`] or specify it in your config `yaml`/during `accelerate launch`:
|
||||
|
||||
```{python}
|
||||
from accelerate import Accelerator
|
||||
@ -50,6 +50,19 @@ kwargs = [FP8RecipeKwargs(backend="msamp")]
|
||||
accelerator = Accelerator(mixed_precision="fp8", kwargs_handlers=kwargs)
|
||||
```
|
||||
|
||||
```{yaml}
|
||||
mixed_precision: fp8
|
||||
fp8_config:
|
||||
amax_compute_algorithm: max
|
||||
amax_history_length: 1024
|
||||
backend: TE
|
||||
fp8_format: E4M3
|
||||
interval: 1
|
||||
margin: 0
|
||||
override_linear_precision: false
|
||||
use_autocast_during_eval: false
|
||||
```
|
||||
|
||||
## Configuring MS-AMP
|
||||
|
||||
Of the two, `MS-AMP` is traditionally the easier one to configure as there is only a single argument: the optimization level.
|
||||
@ -68,6 +81,17 @@ kwargs = [FP8RecipeKwargs(backend="msamp", optimization_level="O2")]
|
||||
accelerator = Accelerator(mixed_precision="fp8", kwargs_handlers=kwargs)
|
||||
```
|
||||
|
||||
Or during `accelerate launch` via `--fp8_backend=msamp --fp8_opt_level=O2`
|
||||
|
||||
Similarly this can be set in your `config.yaml`:
|
||||
|
||||
```{yaml}
|
||||
mixed_precision: fp8
|
||||
fp8_config:
|
||||
backend: MSAMP
|
||||
opt_level: O2
|
||||
```
|
||||
|
||||
## Configuring TransformersEngine
|
||||
|
||||
TransformersEngine has much more available for customizing how and what FP8 calculations are performed. A full list of supported arguments and what they mean are available in [NVIDIA's documentation](https://docs.nvidia.com/deeplearning/transformer-engine/user-guide/api/common.html), however they are restated as part of [`FP8KwargsHandler`]'s docstring for your convenience.
|
||||
@ -83,10 +107,39 @@ kwargs = [FP8RecipeKwargs(backend="te", ...)]
|
||||
accelerator = Accelerator(mixed_precision="fp8", kwargs_handlers=kwargs)
|
||||
```
|
||||
|
||||
Or during `accelerate launch` via `--fp8_backend=te ...`. Use `accelerate launch --fp8_backend=te -h` to see relevant arguments.
|
||||
|
||||
Similarly this can be set in your `config.yaml`:
|
||||
|
||||
```{yaml}
|
||||
mixed_precision: fp8
|
||||
fp8_config:
|
||||
amax_compute_algorithm: max
|
||||
amax_history_length: 1024
|
||||
backend: TE
|
||||
fp8_format: E4M3
|
||||
interval: 1
|
||||
margin: 0
|
||||
override_linear_precision: false
|
||||
use_autocast_during_eval: false
|
||||
```
|
||||
|
||||
## Example Zoo
|
||||
|
||||
We have examples showcasing training with FP8 both with accelerate and its underlying implementation available in the accelerate repo.
|
||||
Currently we support scripts showcasing:
|
||||
|
||||
* Single GPU
|
||||
* Distributed Data Parallelism (Multi-GPU)
|
||||
* Fully Sharded Data Parallelism
|
||||
* DeepSpeed ZeRO 1 through 3
|
||||
|
||||
Find out more [here](https://github.com/huggingface/accelerate/tree/main/benchmarks/fp8)
|
||||
|
||||
## Further Reading
|
||||
|
||||
To learn more about training in FP8 please check out the following resources:
|
||||
|
||||
* [Our concept guide](../concept_guides/low_precision_training.md) going into more detail about both TransformersEngine and MS-AMP
|
||||
* [Our concept guide](../concept_guides/low_precision_training) going into more detail about both TransformersEngine and MS-AMP
|
||||
* [The `transformers-engine` documentation](https://docs.nvidia.com/deeplearning/transformer-engine/user-guide/api/common.html)
|
||||
* [The `MS-AMP` documentation](https://azure.github.io/MS-AMP/docs/)
|
||||
|
||||
@ -107,7 +107,10 @@ cd ..
|
||||
4. Installing Megatron-LM
|
||||
|
||||
```
|
||||
pip install git+https://github.com/huggingface/Megatron-LM.git
|
||||
git clone https://github.com/NVIDIA/Megatron-LM.git
|
||||
cd Megatron-LM
|
||||
git checkout core_r0.5.0
|
||||
pip install --no-use-pep517 -e .
|
||||
```
|
||||
|
||||
## Accelerate Megatron-LM Plugin
|
||||
|
||||
334 docs/source/usage_guides/profiler.md Normal file
@ -0,0 +1,334 @@
|
||||
<!--
|
||||
Copyright 2024 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
|
||||
⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
|
||||
rendered properly in your Markdown viewer.
|
||||
-->
|
||||
|
||||
# Profiler
|
||||
|
||||
Profiler is a tool that allows the collection of performance metrics during training and inference. Profiler’s context manager API can be used to better understand what model operators are the most expensive, examine their input shapes and stack traces, study device kernel activity, and visualize the execution trace. It provides insights into the performance of your model, allowing you to optimize and improve it.
|
||||
|
||||
This guide explains how to use PyTorch Profiler to measure the time and memory consumption of the model’s operators and how to integrate this with 🤗 Accelerate. We will cover various use cases and provide examples for each.
|
||||
|
||||
## Using profiler to analyze execution time
|
||||
|
||||
Profiler allows one to check which operators were called during the execution of a code range wrapped with a profiler context manager.
|
||||
|
||||
Let’s see how we can use profiler to analyze the execution time:
|
||||
|
||||
<hfoptions id="cpu execution time">
|
||||
<hfoption id="PyTorch">
|
||||
|
||||
```python
|
||||
import torch
|
||||
import torchvision.models as models
|
||||
from torch.profiler import profile, record_function, ProfilerActivity
|
||||
|
||||
model = models.resnet18()
|
||||
inputs = torch.randn(5, 3, 224, 224)
|
||||
|
||||
with profile(activities=[ProfilerActivity.CPU], record_shapes=True) as prof:
|
||||
model(inputs)
|
||||
|
||||
print(prof.key_averages().table(sort_by="cpu_time_total", row_limit=10))
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
<hfoption id="Accelerate">
|
||||
|
||||
```python
|
||||
from accelerate import Accelerator, ProfileKwargs
|
||||
import torch
|
||||
import torchvision.models as models
|
||||
|
||||
model = models.resnet18()
|
||||
inputs = torch.randn(5, 3, 224, 224)
|
||||
|
||||
profile_kwargs = ProfileKwargs(
|
||||
activities=["cpu"],
|
||||
record_shapes=True
|
||||
)
|
||||
|
||||
accelerator = Accelerator(cpu=True, kwargs_handlers=[profile_kwargs])
|
||||
model = accelerator.prepare(model)
|
||||
|
||||
with accelerator.profile() as prof:
|
||||
with torch.no_grad():
|
||||
model(inputs)
|
||||
|
||||
print(prof.key_averages().table(sort_by="cpu_time_total", row_limit=10))
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
</hfoptions>
|
||||
|
||||
The resulting table output (omitting some columns):
|
||||
|
||||
```
|
||||
--------------------------------- ------------ ------------ ------------ ------------
|
||||
Name Self CPU CPU total CPU time avg # of Calls
|
||||
--------------------------------- ------------ ------------ ------------ ------------
|
||||
aten::conv2d 171.000us 52.260ms 2.613ms 20
|
||||
aten::convolution 227.000us 52.089ms 2.604ms 20
|
||||
aten::_convolution 270.000us 51.862ms 2.593ms 20
|
||||
aten::mkldnn_convolution 51.273ms 51.592ms 2.580ms 20
|
||||
aten::batch_norm 118.000us 7.059ms 352.950us 20
|
||||
aten::_batch_norm_impl_index 315.000us 6.941ms 347.050us 20
|
||||
aten::native_batch_norm 6.305ms 6.599ms 329.950us 20
|
||||
aten::max_pool2d 40.000us 4.008ms 4.008ms 1
|
||||
aten::max_pool2d_with_indices 3.968ms 3.968ms 3.968ms 1
|
||||
aten::add_ 780.000us 780.000us 27.857us 28
|
||||
--------------------------------- ------------ ------------ ------------ ------------
|
||||
Self CPU time total: 67.016ms
|
||||
```
|
||||
|
||||
To get a finer granularity of results and include operator input shapes, pass `group_by_input_shape=True` (note: this requires running the profiler with `record_shapes=True`):
|
||||
|
||||
```python
|
||||
print(prof.key_averages(group_by_input_shape=True).table(sort_by="cpu_time_total", row_limit=10))
|
||||
```
|
||||
|
||||
## Using profiler to analyze memory consumption
|
||||
|
||||
Profiler can also show the amount of memory (used by the model’s tensors) that was allocated (or released) during the execution of the model’s operators. To enable memory profiling functionality pass `profile_memory=True`.
|
||||
|
||||
<hfoptions id="memory consumption">
|
||||
<hfoption id="PyTorch">
|
||||
|
||||
```python
|
||||
model = models.resnet18()
|
||||
inputs = torch.randn(5, 3, 224, 224)
|
||||
|
||||
with profile(activities=[ProfilerActivity.CPU],
|
||||
profile_memory=True, record_shapes=True) as prof:
|
||||
model(inputs)
|
||||
|
||||
print(prof.key_averages().table(sort_by="self_cpu_memory_usage", row_limit=10))
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
<hfoption id="Accelerate">
|
||||
|
||||
```python
|
||||
model = models.resnet18()
|
||||
inputs = torch.randn(5, 3, 224, 224)
|
||||
|
||||
profile_kwargs = ProfileKwargs(
|
||||
activities=["cpu"],
|
||||
profile_memory=True,
|
||||
record_shapes=True
|
||||
)
|
||||
|
||||
accelerator = Accelerator(cpu=True, kwargs_handlers=[profile_kwargs])
|
||||
model = accelerator.prepare(model)
|
||||
|
||||
with accelerator.profile() as prof:
|
||||
model(inputs)
|
||||
|
||||
print(prof.key_averages().table(sort_by="self_cpu_memory_usage", row_limit=10))
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
</hfoptions>
|
||||
|
||||
The resulting table output (omitting some columns):
|
||||
|
||||
```
|
||||
--------------------------------- ------------ ------------ ------------
|
||||
Name CPU Mem Self CPU Mem # of Calls
|
||||
--------------------------------- ------------ ------------ ------------
|
||||
aten::empty 94.85 Mb 94.85 Mb 205
|
||||
aten::max_pool2d_with_indices 11.48 Mb 11.48 Mb 1
|
||||
aten::addmm 19.53 Kb 19.53 Kb 1
|
||||
aten::mean 10.00 Kb 10.00 Kb 1
|
||||
aten::empty_strided 492 b 492 b 5
|
||||
aten::cat 240 b 240 b 6
|
||||
aten::abs 480 b 240 b 4
|
||||
aten::masked_select 120 b 112 b 1
|
||||
aten::ne 61 b 53 b 3
|
||||
aten::eq 30 b 30 b 1
|
||||
--------------------------------- ------------ ------------ ------------
|
||||
Self CPU time total: 69.332ms
|
||||
```
|
||||
|
||||
|
||||
## Exporting chrome trace
|
||||
|
||||
You can examine the sequence of profiled operators and CUDA kernels in Chrome trace viewer (`chrome://tracing`):
|
||||
|
||||

|
||||
|
||||
<hfoptions id="exporting chrome trace">
|
||||
<hfoption id="PyTorch">
|
||||
|
||||
```python
|
||||
model = models.resnet18().cuda()
|
||||
inputs = torch.randn(5, 3, 224, 224).cuda()
|
||||
|
||||
with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof:
|
||||
model(inputs)
|
||||
|
||||
prof.export_chrome_trace("trace.json")
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
<hfoption id="Accelerate">
|
||||
|
||||
```python
|
||||
profile_kwargs = ProfileKwargs(
|
||||
activities=["cpu", "cuda"],
|
||||
output_trace_dir="trace"
|
||||
)
|
||||
|
||||
accelerator = Accelerator(kwargs_handlers=[profile_kwargs])
|
||||
model = accelerator.prepare(model)
|
||||
|
||||
with accelerator.profile() as prof:
|
||||
model(inputs)
|
||||
|
||||
# The trace will be saved to the specified directory
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
</hfoptions>
|
||||
|
||||
## Using Profiler to Analyze Long-Running Jobs
|
||||
|
||||
Profiler offers an additional API to handle long-running jobs (such as training loops). Tracing all of the execution can be slow and result in very large trace files. To avoid this, use optional arguments:
|
||||
|
||||
- `schedule_option`: Scheduling options allow you to control when profiling is active. This is useful for long-running jobs to avoid collecting too much data. Available keys are `wait`, `warmup`, `active`, `repeat` and `skip_first`. The profiler will skip the first `skip_first` steps, then wait for `wait` steps, then do the warmup for the next `warmup` steps, then do the active recording for the next `active` steps, and then repeat the cycle starting with `wait` steps. The optional number of cycles is specified with the `repeat` parameter; a value of zero means the cycles will continue until profiling is finished.
|
||||
- `on_trace_ready`: specifies a function that takes a reference to the profiler as an input and is called by the profiler each time the new trace is ready.
|
||||
|
||||
To illustrate how the API works, consider the following example:
|
||||
|
||||
<hfoptions id="custom handler">
|
||||
<hfoption id="PyTorch">
|
||||
|
||||
```python
|
||||
from torch.profiler import schedule
|
||||
|
||||
my_schedule = schedule(
|
||||
skip_first=10,
|
||||
wait=5,
|
||||
warmup=1,
|
||||
active=3,
|
||||
repeat=2
|
||||
)
|
||||
|
||||
def trace_handler(p):
|
||||
output = p.key_averages().table(sort_by="self_cuda_time_total", row_limit=10)
|
||||
print(output)
|
||||
p.export_chrome_trace("/tmp/trace_" + str(p.step_num) + ".json")
|
||||
|
||||
with profile(
|
||||
activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
|
||||
schedule=my_schedule,
|
||||
on_trace_ready=trace_handler
|
||||
) as p:
|
||||
for idx in range(8):
|
||||
model(inputs)
|
||||
p.step()
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
<hfoption id="Accelerate">
|
||||
|
||||
```python
|
||||
def trace_handler(p):
|
||||
output = p.key_averages().table(sort_by="self_cuda_time_total", row_limit=10)
|
||||
print(output)
|
||||
p.export_chrome_trace("/tmp/trace_" + str(p.step_num) + ".json")
|
||||
|
||||
profile_kwargs = ProfileKwargs(
|
||||
activities=["cpu", "cuda"],
|
||||
schedule_option={"wait": 5, "warmup": 1, "active": 3, "repeat": 2, "skip_first": 10},
|
||||
on_trace_ready=trace_handler
|
||||
)
|
||||
|
||||
accelerator = Accelerator(kwargs_handlers=[profile_kwargs])
|
||||
model = accelerator.prepare(model)
|
||||
|
||||
with accelerator.profile() as prof:
|
||||
for idx in range(8):
|
||||
model(inputs)
|
||||
prof.step()
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
</hfoptions>
|
||||
|
||||
## FLOPS
|
||||
|
||||
The profiler uses a formula to estimate the FLOPs (floating point operations) of specific operators (matrix multiplication and 2D convolution).
|
||||
|
||||
To measure floating-point operations (FLOPS):
|
||||
|
||||
<hfoptions id="FLOPS">
|
||||
<hfoption id="PyTorch">
|
||||
|
||||
```python
|
||||
with profile(
|
||||
activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
|
||||
with_flops=True
|
||||
) as prof:
|
||||
model(inputs)
|
||||
|
||||
print(prof.key_averages().table(sort_by="flops", row_limit=10))
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
<hfoption id="Accelerate">
|
||||
|
||||
```python
|
||||
profile_kwargs = ProfileKwargs(
|
||||
with_flops=True
|
||||
)
|
||||
accelerator = Accelerator(kwargs_handlers=[profile_kwargs])
|
||||
|
||||
with accelerator.profile() as prof:
|
||||
model(inputs)
|
||||
|
||||
print(prof.key_averages().table(sort_by="flops", row_limit=10))
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
</hfoptions>
|
||||
|
||||
The resulting table output (omitting some columns):
|
||||
|
||||
```
|
||||
------------------------------------------------------- ------------ ------------ ------------
|
||||
Name Self CPU Self CUDA Total FLOPs
|
||||
------------------------------------------------------- ------------ ------------ ------------
|
||||
aten::conv2d 197.000us 0.000us 18135613440.000
|
||||
aten::addmm 103.000us 17.000us 5120000.000
|
||||
aten::mul 29.000us 2.000us 30.000
|
||||
aten::convolution 409.000us 0.000us --
|
||||
aten::_convolution 253.000us 0.000us --
|
||||
aten::cudnn_convolution 5.465ms 2.970ms --
|
||||
cudaEventRecord 138.000us 0.000us --
|
||||
cudaStreamIsCapturing 43.000us 0.000us --
|
||||
cudaStreamGetPriority 40.000us 0.000us --
|
||||
cudaDeviceGetStreamPriorityRange 10.000us 0.000us --
|
||||
------------------------------------------------------- ------------ ------------ ------------
|
||||
Self CPU time total: 21.938ms
|
||||
Self CUDA time total: 4.165ms
|
||||
```
|
||||
|
||||
|
||||
|
||||
## Conclusion and Further Information
|
||||
|
||||
PyTorch Profiler is a powerful tool for analyzing the performance of your models. By integrating it with 🤗 Accelerate, you can easily profile your models and gain insights into their performance, helping you to optimize and improve them.
|
||||
|
||||
For more detailed information, refer to the [PyTorch Profiler documentation](https://pytorch.org/docs/stable/profiler.html).
|
||||
@ -233,6 +233,8 @@ In [/slurm/submit_multigpu.sh](./slurm/submit_multigpu.sh) the only parameter in
|
||||
|
||||
In [/slurm/submit_multinode.sh](./slurm/submit_multinode.sh) we must specify the number of nodes that will be part of the training (`--num_machines`), how many GPUs we will use in total (`--num_processes`), the [`backend`](https://pytorch.org/docs/stable/elastic/run.html#note-on-rendezvous-backend), `--main_process_ip` which will be the address of the master node, and the `--main_process_port`.
|
||||
|
||||
In [/slurm/submit_multicpu.sh](./slurm/submit_multicpu.sh) we must specify the number of nodes that will be part of the training (`--num_machines`), how many CPU processes we will use in total (`--num_processes`), the [`backend`](https://pytorch.org/docs/stable/elastic/run.html#note-on-rendezvous-backend), `--main_process_ip` which will be the address of the master node, and the `--main_process_port`. `mpirun_hostfile` specifies that the job should be run with MPIRun.
|
||||
|
||||
In both scripts, we run `activateEnviroment.sh` at the beginning. This script should contain the necessary instructions to initialize the environment for execution. Below, we show an example that loads the necessary libraries ([Environment modules](https://github.com/cea-hpc/modules)), activates the Python environment, and sets up various environment variables, most of them to run the scripts in offline mode in case we don't have internet connection from the cluster.
|
||||
|
||||
```bash
|
||||
|
||||
@ -88,4 +88,34 @@ These arguments should be added at the end of any method for starting the python
|
||||
accelerate launch ./local_sgd.py --local_sgd_steps 4
|
||||
```
|
||||
|
||||
### DDP Communication Hook (`ddp_comm_hook.py`)
|
||||
|
||||
- Shows how to use DDP Communication Hooks to control and optimize gradient communication across workers in a DistributedDataParallel setup.
|
||||
- Arguments available:
|
||||
- `ddp_comm_hook`, the type of DDP communication hook to use. Choose between `no`, `fp16`, `bf16`, `power_sgd`, and `batched_power_sgd`.
|
||||
|
||||
These arguments should be added at the end of any method for starting the python script (such as `accelerate launch`, `python -m torch.distributed.run`), such as:
|
||||
|
||||
```bash
|
||||
accelerate launch ./ddp_comm_hook.py --mixed_precision fp16 --ddp_comm_hook power_sgd
|
||||
```
|
||||
|
||||
### Profiler (`profiler.py`)
|
||||
|
||||
- Shows how to use the profiling capabilities of `Accelerate` to profile PyTorch models during training.
|
||||
- Uses the `ProfileKwargs` handler to customize profiling options, including activities, scheduling, and additional profiling options.
|
||||
- Can generate and save profiling traces in JSON format for visualization in Chrome's tracing tool.
|
||||
|
||||
Arguments available:
|
||||
- `--record_shapes`: If passed, records shapes for profiling.
|
||||
- `--profile_memory`: If passed, profiles memory usage.
|
||||
- `--with_stack`: If passed, profiles stack traces.
|
||||
- `--with_flops`: If passed, profiles floating point operations (FLOPS).
|
||||
- `--output_trace_dir`: If specified, saves the profiling trace to the given dir in JSON format.
|
||||
- `--cpu`: If passed, trains on the CPU instead of GPU.
|
||||
|
||||
These arguments should be added at the end of any method for starting the Python script (such as `python`, `accelerate launch`, `torchrun`), such as:
|
||||
|
||||
```bash
|
||||
accelerate launch ./profiler.py --record_shapes --profile_memory --with_flops --output_trace_dir "profiler"
|
||||
```
|
||||
|
||||
231 examples/by_feature/ddp_comm_hook.py Normal file
@ -0,0 +1,231 @@
|
||||
# Copyright 2021 The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import argparse
|
||||
import os
|
||||
|
||||
import evaluate
|
||||
import torch
|
||||
from datasets import load_dataset
|
||||
from torch.optim import AdamW
|
||||
from torch.utils.data import DataLoader
|
||||
from transformers import AutoModelForSequenceClassification, AutoTokenizer, get_linear_schedule_with_warmup, set_seed
|
||||
|
||||
from accelerate import Accelerator, DistributedType
|
||||
from accelerate.utils import DDPCommunicationHookType, DistributedDataParallelKwargs
|
||||
|
||||
|
||||
########################################################################
|
||||
# This is a fully working simple example to use Accelerate
|
||||
# and perform ddp communication hook
|
||||
#
|
||||
# This example trains a Bert base model on GLUE MRPC
|
||||
# in any of the following settings (with the same script):
|
||||
# - single CPU or single GPU
|
||||
# - multi GPUS (using PyTorch distributed mode)
|
||||
# - (multi) TPUs
|
||||
# - fp16 (mixed-precision) or fp32 (normal precision)
|
||||
#
|
||||
# To run it in each of these various modes, follow the instructions
|
||||
# in the readme for examples:
|
||||
# https://github.com/huggingface/accelerate/tree/main/examples
|
||||
#
|
||||
########################################################################
|
||||
|
||||
|
||||
MAX_GPU_BATCH_SIZE = 16
|
||||
EVAL_BATCH_SIZE = 32
|
||||
|
||||
|
||||
def get_dataloaders(accelerator: Accelerator, batch_size: int = 16):
|
||||
"""
|
||||
Creates a set of `DataLoader`s for the `glue` dataset,
|
||||
using "bert-base-cased" as the tokenizer.
|
||||
|
||||
Args:
|
||||
accelerator (`Accelerator`):
|
||||
An `Accelerator` object
|
||||
batch_size (`int`, *optional*):
|
||||
The batch size for the train and validation DataLoaders.
|
||||
"""
|
||||
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
|
||||
datasets = load_dataset("glue", "mrpc")
|
||||
|
||||
def tokenize_function(examples):
|
||||
# max_length=None => use the model max length (it's actually the default)
|
||||
outputs = tokenizer(examples["sentence1"], examples["sentence2"], truncation=True, max_length=None)
|
||||
return outputs
|
||||
|
||||
# Apply the method we just defined to all the examples in all the splits of the dataset
|
||||
# starting with the main process first:
|
||||
with accelerator.main_process_first():
|
||||
tokenized_datasets = datasets.map(
|
||||
tokenize_function,
|
||||
batched=True,
|
||||
remove_columns=["idx", "sentence1", "sentence2"],
|
||||
)
|
||||
|
||||
# We also rename the 'label' column to 'labels' which is the expected name for labels by the models of the
|
||||
# transformers library
|
||||
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
|
||||
|
||||
def collate_fn(examples):
|
||||
# On TPU it's best to pad everything to the same length or training will be very slow.
|
||||
max_length = 128 if accelerator.distributed_type == DistributedType.XLA else None
|
||||
# When using mixed precision we want round multiples of 8/16
|
||||
if accelerator.mixed_precision == "fp8":
|
||||
pad_to_multiple_of = 16
|
||||
elif accelerator.mixed_precision != "no":
|
||||
pad_to_multiple_of = 8
|
||||
else:
|
||||
pad_to_multiple_of = None
|
||||
|
||||
return tokenizer.pad(
|
||||
examples,
|
||||
padding="longest",
|
||||
max_length=max_length,
|
||||
pad_to_multiple_of=pad_to_multiple_of,
|
||||
return_tensors="pt",
|
||||
)
|
||||
|
||||
# Instantiate dataloaders.
|
||||
train_dataloader = DataLoader(
|
||||
tokenized_datasets["train"], shuffle=True, collate_fn=collate_fn, batch_size=batch_size
|
||||
)
|
||||
eval_dataloader = DataLoader(
|
||||
tokenized_datasets["validation"], shuffle=False, collate_fn=collate_fn, batch_size=EVAL_BATCH_SIZE
|
||||
)
|
||||
|
||||
return train_dataloader, eval_dataloader
|
||||
|
||||
|
||||
# For testing only
|
||||
if os.environ.get("TESTING_MOCKED_DATALOADERS", None) == "1":
|
||||
from accelerate.test_utils.training import mocked_dataloaders
|
||||
|
||||
get_dataloaders = mocked_dataloaders # noqa: F811
|
||||
|
||||
|
||||
def training_function(config, args):
|
||||
# For testing only
|
||||
if os.environ.get("TESTING_MOCKED_DATALOADERS", None) == "1":
|
||||
config["num_epochs"] = 2
|
||||
# New Code #
|
||||
ddp_comm_hook_type = DDPCommunicationHookType(args.ddp_comm_hook)
|
||||
ddp_comm_wrapper = DDPCommunicationHookType(args.ddp_comm_wrapper)
|
||||
ddp_kwargs = DistributedDataParallelKwargs(comm_hook=ddp_comm_hook_type, comm_wrapper=ddp_comm_wrapper)
|
||||
# Initialize accelerator
|
||||
accelerator = Accelerator(cpu=args.cpu, mixed_precision=args.mixed_precision, kwargs_handlers=[ddp_kwargs])
|
||||
# Sample hyper-parameters for learning rate, batch size, seed and a few other HPs
|
||||
lr = config["lr"]
|
||||
num_epochs = int(config["num_epochs"])
|
||||
seed = int(config["seed"])
|
||||
batch_size = int(config["batch_size"])
|
||||
|
||||
metric = evaluate.load("glue", "mrpc")
|
||||
|
||||
set_seed(seed)
|
||||
train_dataloader, eval_dataloader = get_dataloaders(accelerator, batch_size)
|
||||
# Instantiate the model (we build the model here so that the seed also controls new weights initialization)
|
||||
model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", return_dict=True)
|
||||
|
||||
# We could avoid this line since the accelerator is set with `device_placement=True` (default value).
|
||||
# Note that if you are placing tensors on devices manually, this line absolutely needs to be before the optimizer
|
||||
# creation otherwise training will not work on TPU (`accelerate` will kindly throw an error to make us aware of that).
|
||||
model = model.to(accelerator.device)
|
||||
|
||||
# Instantiate optimizer
|
||||
optimizer = AdamW(params=model.parameters(), lr=lr)
|
||||
|
||||
# Instantiate scheduler
|
||||
lr_scheduler = get_linear_schedule_with_warmup(
|
||||
optimizer=optimizer,
|
||||
num_warmup_steps=100,
|
||||
num_training_steps=(len(train_dataloader) * num_epochs),
|
||||
)
|
||||
|
||||
# Prepare everything
|
||||
# There is no specific order to remember, we just need to unpack the objects in the same order we gave them to the
|
||||
# prepare method.
|
||||
model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = accelerator.prepare(
|
||||
model, optimizer, train_dataloader, eval_dataloader, lr_scheduler
|
||||
)
|
||||
|
||||
# Now we train the model
|
||||
for epoch in range(num_epochs):
|
||||
model.train()
|
||||
for step, batch in enumerate(train_dataloader):
|
||||
# We could avoid this line since we set the accelerator with `device_placement=True`.
|
||||
batch.to(accelerator.device)
|
||||
# We use the new `accumulate` context manager to perform gradient accumulation
|
||||
with accelerator.accumulate(model):
|
||||
output = model(**batch)
|
||||
loss = output.loss
|
||||
accelerator.backward(loss)
|
||||
optimizer.step()
|
||||
lr_scheduler.step()
|
||||
optimizer.zero_grad()
|
||||
|
||||
model.eval()
|
||||
for step, batch in enumerate(eval_dataloader):
|
||||
# We could avoid this line since we set the accelerator with `device_placement=True`.
|
||||
batch.to(accelerator.device)
|
||||
with torch.no_grad():
|
||||
outputs = model(**batch)
|
||||
predictions = outputs.logits.argmax(dim=-1)
|
||||
predictions, references = accelerator.gather_for_metrics((predictions, batch["labels"]))
|
||||
metric.add_batch(
|
||||
predictions=predictions,
|
||||
references=references,
|
||||
)
|
||||
|
||||
eval_metric = metric.compute()
|
||||
# Use accelerator.print to print only on the main process.
|
||||
accelerator.print(f"epoch {epoch}:", eval_metric)
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(description="Simple example of training script.")
|
||||
parser.add_argument(
|
||||
"--mixed_precision",
|
||||
type=str,
|
||||
default=None,
|
||||
choices=["no", "fp16", "bf16", "fp8"],
|
||||
help="Whether to use mixed precision. Choose"
|
||||
"between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >= 1.10."
|
||||
"and an Nvidia Ampere GPU.",
|
||||
)
|
||||
# New Code #
|
||||
parser.add_argument(
|
||||
"--ddp_comm_hook",
|
||||
type=str,
|
||||
default="no",
|
||||
choices=["no", "fp16", "bf16", "power_sgd", "batched_power_sgd"],
|
||||
help="DDP Communication hook to use. Choose between `no`, `fp16`, `bf16`, `power_sgd`, and `batched_power_sgd`.",
|
||||
)
|
||||
# New Code #
|
||||
parser.add_argument(
|
||||
"--ddp_comm_wrapper",
|
||||
type=str,
|
||||
default="no",
|
||||
choices=["no", "fp16", "bf16"],
|
||||
help="DDP Communication wrapper to use. Choose between `no`, `fp16`, and `bf16`.",
|
||||
)
|
||||
parser.add_argument("--cpu", action="store_true", help="If passed, will train on the CPU.")
|
||||
args = parser.parse_args()
|
||||
config = {"lr": 2e-5, "num_epochs": 3, "seed": 42, "batch_size": 16}
|
||||
training_function(config, args)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
254
examples/by_feature/profiler.py
Normal file
@ -0,0 +1,254 @@
|
||||
# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import argparse
|
||||
import os
|
||||
|
||||
import evaluate
|
||||
import torch
|
||||
from datasets import load_dataset
|
||||
from torch.optim import AdamW
|
||||
from torch.utils.data import DataLoader
|
||||
from transformers import AutoModelForSequenceClassification, AutoTokenizer, get_linear_schedule_with_warmup, set_seed
|
||||
|
||||
from accelerate import Accelerator, DistributedType
|
||||
from accelerate.utils import ProfileKwargs
|
||||
|
||||
|
||||
########################################################################
|
||||
# This is a fully working simple example to use Accelerate
|
||||
# and perform profiling
|
||||
#
|
||||
# This example trains a Bert base model on GLUE MRPC
|
||||
# in any of the following settings (with the same script):
|
||||
# - single CPU or single GPU
|
||||
# - multi GPUS (using PyTorch distributed mode)
|
||||
# - (multi) TPUs
|
||||
# - fp16 (mixed-precision) or fp32 (normal precision)
|
||||
#
|
||||
# To run it in each of these various modes, follow the instructions
|
||||
# in the readme for examples:
|
||||
# https://github.com/huggingface/accelerate/tree/main/examples
|
||||
#
|
||||
########################################################################
|
||||
|
||||
|
||||
MAX_GPU_BATCH_SIZE = 16
|
||||
EVAL_BATCH_SIZE = 32
|
||||
|
||||
|
||||
def get_dataloaders(accelerator: Accelerator, batch_size: int = 16):
|
||||
"""
|
||||
Creates a set of `DataLoader`s for the `glue` dataset,
|
||||
using "bert-base-cased" as the tokenizer.
|
||||
|
||||
Args:
|
||||
accelerator (`Accelerator`):
|
||||
An `Accelerator` object
|
||||
batch_size (`int`, *optional*):
|
||||
The batch size for the train and validation DataLoaders.
|
||||
"""
|
||||
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
|
||||
datasets = load_dataset("glue", "mrpc")
|
||||
|
||||
def tokenize_function(examples):
|
||||
# max_length=None => use the model max length (it's actually the default)
|
||||
outputs = tokenizer(examples["sentence1"], examples["sentence2"], truncation=True, max_length=None)
|
||||
return outputs
|
||||
|
||||
# Apply the method we just defined to all the examples in all the splits of the dataset
|
||||
# starting with the main process first:
|
||||
with accelerator.main_process_first():
|
||||
tokenized_datasets = datasets.map(
|
||||
tokenize_function,
|
||||
batched=True,
|
||||
remove_columns=["idx", "sentence1", "sentence2"],
|
||||
)
|
||||
|
||||
# We also rename the 'label' column to 'labels' which is the expected name for labels by the models of the
|
||||
# transformers library
|
||||
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
|
||||
|
||||
def collate_fn(examples):
|
||||
# On TPU it's best to pad everything to the same length or training will be very slow.
|
||||
max_length = 128 if accelerator.distributed_type == DistributedType.XLA else None
|
||||
# When using mixed precision we want round multiples of 8/16
|
||||
if accelerator.mixed_precision == "fp8":
|
||||
pad_to_multiple_of = 16
|
||||
elif accelerator.mixed_precision != "no":
|
||||
pad_to_multiple_of = 8
|
||||
else:
|
||||
pad_to_multiple_of = None
|
||||
|
||||
return tokenizer.pad(
|
||||
examples,
|
||||
padding="longest",
|
||||
max_length=max_length,
|
||||
pad_to_multiple_of=pad_to_multiple_of,
|
||||
return_tensors="pt",
|
||||
)
|
||||
|
||||
# Instantiate dataloaders.
|
||||
train_dataloader = DataLoader(
|
||||
tokenized_datasets["train"], shuffle=True, collate_fn=collate_fn, batch_size=batch_size
|
||||
)
|
||||
eval_dataloader = DataLoader(
|
||||
tokenized_datasets["validation"], shuffle=False, collate_fn=collate_fn, batch_size=EVAL_BATCH_SIZE
|
||||
)
|
||||
|
||||
return train_dataloader, eval_dataloader
|
||||
|
||||
|
||||
# For testing only
|
||||
if os.environ.get("TESTING_MOCKED_DATALOADERS", None) == "1":
|
||||
from accelerate.test_utils.training import mocked_dataloaders
|
||||
|
||||
get_dataloaders = mocked_dataloaders # noqa: F811
|
||||
|
||||
|
||||
def training_function(config, args):
|
||||
# For testing only
|
||||
if os.environ.get("TESTING_MOCKED_DATALOADERS", None) == "1":
|
||||
config["num_epochs"] = 2
|
||||
# New Code #
|
||||
profile_kwargs = ProfileKwargs(
|
||||
record_shapes=args.record_shapes,
|
||||
profile_memory=args.profile_memory,
|
||||
with_flops=args.with_flops,
|
||||
output_trace_dir=args.output_trace_dir,
|
||||
)
|
||||
# Initialize accelerator
|
||||
accelerator = Accelerator(cpu=args.cpu, mixed_precision=args.mixed_precision, kwargs_handlers=[profile_kwargs])
|
||||
# Sample hyper-parameters for learning rate, batch size, seed and a few other HPs
|
||||
lr = config["lr"]
|
||||
num_epochs = int(config["num_epochs"])
|
||||
seed = int(config["seed"])
|
||||
batch_size = int(config["batch_size"])
|
||||
|
||||
metric = evaluate.load("glue", "mrpc")
|
||||
|
||||
set_seed(seed)
|
||||
train_dataloader, eval_dataloader = get_dataloaders(accelerator, batch_size)
|
||||
# Instantiate the model (we build the model here so that the seed also controls new weights initialization)
|
||||
model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", return_dict=True)
|
||||
|
||||
# We could avoid this line since the accelerator is set with `device_placement=True` (default value).
|
||||
# Note that if you are placing tensors on devices manually, this line absolutely needs to be before the optimizer
|
||||
# creation otherwise training will not work on TPU (`accelerate` will kindly throw an error to make us aware of that).
|
||||
model = model.to(accelerator.device)
|
||||
|
||||
# Instantiate optimizer
|
||||
optimizer = AdamW(params=model.parameters(), lr=lr)
|
||||
|
||||
# Instantiate scheduler
|
||||
lr_scheduler = get_linear_schedule_with_warmup(
|
||||
optimizer=optimizer,
|
||||
num_warmup_steps=100,
|
||||
num_training_steps=(len(train_dataloader) * num_epochs),
|
||||
)
|
||||
|
||||
# Prepare everything
|
||||
# There is no specific order to remember, we just need to unpack the objects in the same order we gave them to the
|
||||
# prepare method.
|
||||
model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = accelerator.prepare(
|
||||
model, optimizer, train_dataloader, eval_dataloader, lr_scheduler
|
||||
)
|
||||
|
||||
# Now we train the model
|
||||
for epoch in range(num_epochs):
|
||||
model.train()
|
||||
# New Code #
|
||||
with accelerator.profile() as prof:
|
||||
for step, batch in enumerate(train_dataloader):
|
||||
# We could avoid this line since we set the accelerator with `device_placement=True`.
|
||||
batch.to(accelerator.device)
|
||||
# We use the new `accumulate` context manager to perform gradient accumulation
|
||||
with accelerator.accumulate(model):
|
||||
output = model(**batch)
|
||||
loss = output.loss
|
||||
accelerator.backward(loss)
|
||||
optimizer.step()
|
||||
lr_scheduler.step()
|
||||
optimizer.zero_grad()
|
||||
# New Code #
|
||||
accelerator.print(
|
||||
prof.key_averages().table(
|
||||
sort_by="self_cpu_time_total" if args.cpu else "self_cuda_time_total", row_limit=-1
|
||||
)
|
||||
)
|
||||
|
||||
model.eval()
|
||||
for step, batch in enumerate(eval_dataloader):
|
||||
# We could avoid this line since we set the accelerator with `device_placement=True`.
|
||||
batch.to(accelerator.device)
|
||||
with torch.no_grad():
|
||||
outputs = model(**batch)
|
||||
predictions = outputs.logits.argmax(dim=-1)
|
||||
predictions, references = accelerator.gather_for_metrics((predictions, batch["labels"]))
|
||||
metric.add_batch(
|
||||
predictions=predictions,
|
||||
references=references,
|
||||
)
|
||||
|
||||
eval_metric = metric.compute()
|
||||
# Use accelerator.print to print only on the main process.
|
||||
accelerator.print(f"epoch {epoch}:", eval_metric)
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(description="Simple example of training script.")
|
||||
parser.add_argument(
|
||||
"--mixed_precision",
|
||||
type=str,
|
||||
default=None,
|
||||
choices=["no", "fp16", "bf16", "fp8"],
|
||||
help="Whether to use mixed precision. Choose"
|
||||
"between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >= 1.10."
|
||||
"and an Nvidia Ampere GPU.",
|
||||
)
|
||||
# New Code #
|
||||
parser.add_argument(
|
||||
"--record_shapes",
|
||||
action="store_true",
|
||||
default=False,
|
||||
help="If passed, will record shapes for profiling.",
|
||||
)
|
||||
# New Code #
|
||||
parser.add_argument(
|
||||
"--profile_memory",
|
||||
action="store_true",
|
||||
default=False,
|
||||
help="If passed, will profile memory.",
|
||||
)
|
||||
# New Code #
|
||||
parser.add_argument(
|
||||
"--with_flops",
|
||||
action="store_true",
|
||||
default=False,
|
||||
help="If passed, will profile flops.",
|
||||
)
|
||||
# New Code #
|
||||
parser.add_argument(
|
||||
"--output_trace_dir",
|
||||
type=str,
|
||||
default=None,
|
||||
help="If passed, will save a json trace to the specified path.",
|
||||
)
|
||||
parser.add_argument("--cpu", action="store_true", help="If passed, will train on the CPU.")
|
||||
args = parser.parse_args()
|
||||
config = {"lr": 2e-5, "num_epochs": 3, "seed": 42, "batch_size": 16}
|
||||
training_function(config, args)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
65
examples/slurm/submit_multicpu.sh
Normal file
@ -0,0 +1,65 @@
|
||||
#!/bin/bash -l
|
||||
|
||||
#SBATCH --job-name=multicpu
|
||||
#SBATCH --nodes=2 # number of Nodes
|
||||
#SBATCH --ntasks-per-node=1 # number of MP tasks
|
||||
#SBATCH --exclusive
|
||||
#SBATCH --output=O-%x.%j
|
||||
#SBATCH --error=E-%x.%j
|
||||
|
||||
######################
|
||||
### Set environment ###
|
||||
######################
|
||||
source activateEnvironment.sh
|
||||
|
||||
######################
|
||||
#### Set network #####
|
||||
######################
|
||||
head_node_ip=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1)
|
||||
######################
|
||||
|
||||
# Setup env variables for distributed jobs
|
||||
export MASTER_PORT="${MASTER_PORT:-29555}"
|
||||
echo "head_node_ip=${head_node_ip}"
|
||||
echo "MASTER_PORT=${MASTER_PORT}"
|
||||
|
||||
INSTANCES_PER_NODE="${INSTANCES_PER_NODE:-1}"
|
||||
|
||||
if [[ $SLURM_NNODES == 1 ]] && [[ $INSTANCES_PER_NODE == 1 ]]; then
|
||||
export CCL_WORKER_COUNT=0
|
||||
LAUNCHER=""
|
||||
else
|
||||
# Setup env variables for distributed jobs
|
||||
export CCL_WORKER_COUNT="${CCL_WORKER_COUNT:-2}"
|
||||
echo "CCL_WORKER_COUNT=${CCL_WORKER_COUNT}"
|
||||
|
||||
# Write hostfile
|
||||
HOSTFILE_PATH=hostfile
|
||||
scontrol show hostname $SLURM_JOB_NODELIST | perl -ne 'chomp; print "$_\n" x 1' > ${HOSTFILE_PATH}
|
||||
|
||||
export LAUNCHER="accelerate launch \
|
||||
--num_processes $((SLURM_NNODES * ${INSTANCES_PER_NODE})) \
|
||||
--num_machines $SLURM_NNODES \
|
||||
--rdzv_backend c10d \
|
||||
--main_process_ip $head_node_ip \
|
||||
--main_process_port $MASTER_PORT \
|
||||
--mpirun_hostfile $HOSTFILE_PATH \
|
||||
--mpirun_ccl $CCL_WORKER_COUNT"
|
||||
fi
|
||||
|
||||
# This step is necessary because accelerate launch does not handle multiline arguments properly
|
||||
export ACCELERATE_DIR="${ACCELERATE_DIR:-/accelerate}"
|
||||
export SCRIPT="${ACCELERATE_DIR}/examples/complete_nlp_example.py"
|
||||
export SCRIPT_ARGS=" \
|
||||
--cpu \
|
||||
--output_dir ${ACCELERATE_DIR}/examples/output \
|
||||
"
|
||||
|
||||
# This step is necessary because accelerate launch does not handle multiline arguments properly
|
||||
export CMD="$LAUNCHER $SCRIPT $SCRIPT_ARGS"
|
||||
# Print the command
|
||||
echo $CMD
|
||||
echo ""
|
||||
|
||||
# Run the command
|
||||
eval $CMD
|
||||
@ -13,14 +13,15 @@
|
||||
######################
|
||||
### Set enviroment ###
|
||||
######################
|
||||
source activateEnviroment.sh
|
||||
source activateEnvironment.sh
|
||||
export GPUS_PER_NODE=4
|
||||
######################
|
||||
|
||||
export SCRIPT=/accelerate/examples/complete_nlp_example.py
|
||||
export ACCELERATE_DIR="${ACCELERATE_DIR:-/accelerate}"
|
||||
export SCRIPT="${ACCELERATE_DIR}/examples/complete_nlp_example.py"
|
||||
export SCRIPT_ARGS=" \
|
||||
--mixed_precision fp16 \
|
||||
--output_dir /accelerate/examples/output \
|
||||
--output_dir ${ACCELERATE_DIR}/examples/output \
|
||||
--with_tracking \
|
||||
"
|
||||
|
||||
|
||||
@ -13,7 +13,7 @@
|
||||
######################
|
||||
### Set enviroment ###
|
||||
######################
|
||||
source activateEnviroment.sh
|
||||
source activateEnvironment.sh
|
||||
export GPUS_PER_NODE=4
|
||||
######################
|
||||
|
||||
@ -30,10 +30,11 @@ export LAUNCHER="accelerate launch \
|
||||
--main_process_ip $head_node_ip \
|
||||
--main_process_port 29500 \
|
||||
"
|
||||
export SCRIPT="/accelerate/examples/complete_nlp_example.py"
|
||||
export ACCELERATE_DIR="${ACCELERATE_DIR:-/accelerate}"
|
||||
export SCRIPT="${ACCELERATE_DIR}/examples/complete_nlp_example.py"
|
||||
export SCRIPT_ARGS=" \
|
||||
--mixed_precision fp16 \
|
||||
--output_dir /accelerate/examples/output \
|
||||
--output_dir ${ACCELERATE_DIR}/examples/output \
|
||||
"
|
||||
|
||||
# This step is necessary because accelerate launch does not handle multiline arguments properly
|
||||
|
||||
11
setup.py
@ -36,7 +36,7 @@ extras["test_dev"] = [
|
||||
"timm",
|
||||
]
|
||||
extras["testing"] = extras["test_prod"] + extras["test_dev"]
|
||||
extras["deepspeed"] = ["deepspeed<=0.14.0"]
|
||||
extras["deepspeed"] = ["deepspeed"]
|
||||
extras["rich"] = ["rich"]
|
||||
|
||||
extras["test_trackers"] = ["wandb", "comet-ml", "tensorboard", "dvclive"]
|
||||
@ -48,7 +48,7 @@ extras["sagemaker"] = [
|
||||
|
||||
setup(
|
||||
name="accelerate",
|
||||
version="0.30.0.dev",
|
||||
version="0.34.0.dev0",
|
||||
description="Accelerate",
|
||||
long_description=open("README.md", encoding="utf-8").read(),
|
||||
long_description_content_type="text/markdown",
|
||||
@ -65,17 +65,18 @@ setup(
|
||||
"accelerate-config=accelerate.commands.config:main",
|
||||
"accelerate-estimate-memory=accelerate.commands.estimate:main",
|
||||
"accelerate-launch=accelerate.commands.launch:main",
|
||||
"accelerate-merge-weights=accelerate.commands.merge:main",
|
||||
]
|
||||
},
|
||||
python_requires=">=3.8.0",
|
||||
install_requires=[
|
||||
"numpy>=1.17",
|
||||
"numpy>=1.17,<2.0.0",
|
||||
"packaging>=20.0",
|
||||
"psutil",
|
||||
"pyyaml",
|
||||
"torch>=1.10.0",
|
||||
"huggingface_hub",
|
||||
"safetensors>=0.3.1",
|
||||
"huggingface_hub>=0.21.0",
|
||||
"safetensors>=0.4.3",
|
||||
],
|
||||
extras_require=extras,
|
||||
classifiers=[
|
||||
|
||||
@ -11,7 +11,7 @@
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
__version__ = "0.30.0.dev0"
|
||||
__version__ = "0.34.0.dev0"
|
||||
|
||||
from .accelerator import Accelerator
|
||||
from .big_modeling import (
|
||||
@ -30,12 +30,14 @@ from .state import PartialState
|
||||
from .utils import (
|
||||
AutocastKwargs,
|
||||
DataLoaderConfiguration,
|
||||
DDPCommunicationHookType,
|
||||
DeepSpeedPlugin,
|
||||
DistributedDataParallelKwargs,
|
||||
DistributedType,
|
||||
FullyShardedDataParallelPlugin,
|
||||
GradScalerKwargs,
|
||||
InitProcessGroupKwargs,
|
||||
ProfileKwargs,
|
||||
find_executable_batch_size,
|
||||
infer_auto_device_map,
|
||||
is_rich_available,
|
||||
|
||||
@ -31,6 +31,7 @@ from typing import Any, Callable, Union
|
||||
|
||||
import torch
|
||||
import torch.utils.hooks as hooks
|
||||
from huggingface_hub import split_torch_state_dict_into_shards
|
||||
|
||||
from .checkpointing import load_accelerator_state, load_custom_state, save_accelerator_state, save_custom_state
|
||||
from .data_loader import DataLoaderDispatcher, prepare_data_loader, skip_first_batches
|
||||
@ -44,8 +45,10 @@ from .utils import (
|
||||
MODEL_NAME,
|
||||
SAFE_WEIGHTS_INDEX_NAME,
|
||||
SAFE_WEIGHTS_NAME,
|
||||
SAFE_WEIGHTS_PATTERN_NAME,
|
||||
WEIGHTS_INDEX_NAME,
|
||||
WEIGHTS_NAME,
|
||||
WEIGHTS_PATTERN_NAME,
|
||||
AutocastKwargs,
|
||||
DataLoaderConfiguration,
|
||||
DeepSpeedPlugin,
|
||||
@ -61,9 +64,11 @@ from .utils import (
|
||||
LoggerType,
|
||||
MegatronLMPlugin,
|
||||
PrecisionType,
|
||||
ProfileKwargs,
|
||||
ProjectConfiguration,
|
||||
RNGType,
|
||||
TorchDynamoPlugin,
|
||||
apply_fp8_autowrap,
|
||||
check_os_kernel,
|
||||
clean_state_dict_for_safetensors,
|
||||
compare_versions,
|
||||
@ -74,18 +79,18 @@ from .utils import (
|
||||
gather_object,
|
||||
get_mixed_precision_context_manager,
|
||||
get_pretty_name,
|
||||
has_transformer_engine_layers,
|
||||
is_bf16_available,
|
||||
is_deepspeed_available,
|
||||
is_fp8_available,
|
||||
is_ipex_available,
|
||||
is_lomo_available,
|
||||
is_megatron_lm_available,
|
||||
is_mlu_available,
|
||||
is_msamp_available,
|
||||
is_musa_available,
|
||||
is_npu_available,
|
||||
is_torch_version,
|
||||
is_torch_xla_available,
|
||||
is_transformer_engine_available,
|
||||
is_xpu_available,
|
||||
load_fsdp_model,
|
||||
load_fsdp_optimizer,
|
||||
@ -97,10 +102,9 @@ from .utils import (
|
||||
save,
|
||||
save_fsdp_model,
|
||||
save_fsdp_optimizer,
|
||||
shard_checkpoint,
|
||||
wait_for_everyone,
|
||||
)
|
||||
from .utils.constants import FSDP_PYTORCH_VERSION
|
||||
from .utils.constants import FSDP_PYTORCH_VERSION, PROFILE_PATTERN_NAME
|
||||
from .utils.modeling import get_state_dict_offloaded_model
|
||||
from .utils.other import is_compiled_module
|
||||
|
||||
@ -114,11 +118,6 @@ if is_deepspeed_available():
|
||||
DummyScheduler,
|
||||
)
|
||||
|
||||
if is_fp8_available():
|
||||
import transformer_engine.common.recipe as te_recipe
|
||||
from transformer_engine.pytorch import fp8_autocast
|
||||
|
||||
|
||||
if is_megatron_lm_available():
|
||||
from .utils import (
|
||||
MegatronEngine,
|
||||
@ -128,9 +127,7 @@ if is_megatron_lm_available():
|
||||
MegatronLMSchedulerWrapper,
|
||||
megatron_lm_initialize,
|
||||
megatron_lm_prepare_data_loader,
|
||||
megatron_lm_prepare_model,
|
||||
megatron_lm_prepare_optimizer,
|
||||
megatron_lm_prepare_scheduler,
|
||||
megatron_lm_prepare_model_optimizer_scheduler,
|
||||
)
|
||||
|
||||
from torch.distributed.algorithms.join import Join
|
||||
@ -216,12 +213,12 @@ class Accelerator:
|
||||
project_dir (`str`, `os.PathLike`, *optional*):
|
||||
A path to a directory for storing data such as logs of locally-compatible loggers and potentially saved
|
||||
checkpoints.
|
||||
step_scheduler_with_optimizer (`bool`, *optional`, defaults to `True`):
|
||||
step_scheduler_with_optimizer (`bool`, *optional*, defaults to `True`):
|
||||
Set `True` if the learning rate scheduler is stepped at the same time as the optimizer, `False` if only
|
||||
done under certain circumstances (at the end of each epoch, for instance).
|
||||
kwargs_handlers (list of [`~utils.KwargsHandler`], *optional*)
|
||||
A list of [`~utils.KwargsHandler`] to customize how the objects related to distributed training or mixed
|
||||
precision are created. See [kwargs](kwargs) for more information.
|
||||
A list of [`~utils.KwargsHandler`] to customize how the objects related to distributed training, profiling
|
||||
or mixed precision are created. See [kwargs](kwargs) for more information.
|
||||
dynamo_backend (`str` or [`~utils.DynamoBackend`], *optional*, defaults to `"no"`):
|
||||
Set to one of the possible dynamo backends to optimize your training with torch dynamo.
|
||||
gradient_accumulation_plugin ([`~utils.GradientAccumulationPlugin`], *optional*):
|
||||
@ -298,6 +295,9 @@ class Accelerator:
|
||||
if is_mlu_available():
|
||||
if compare_versions("deepspeed-mlu", "<", "0.10.1"):
|
||||
raise ImportError("DeepSpeed MLU version must be >= 0.10.1. Please update DeepSpeed MLU.")
|
||||
elif is_musa_available():
|
||||
if compare_versions("deepspeed", ">", "0.14.3"):
|
||||
raise ImportError("DeepSpeed MUSA version must be <= 0.14.3. Please downgrade DeepSpeed.")
|
||||
elif compare_versions("deepspeed", "<", "0.9.3"):
|
||||
raise ImportError("DeepSpeed version must be >= 0.9.3. Please update DeepSpeed.")
|
||||
|
||||
@ -341,6 +341,7 @@ class Accelerator:
|
||||
self.init_handler = None
|
||||
self.fp8_recipe_handler = None
|
||||
self.autocast_handler = None
|
||||
self.profile_handler = None
|
||||
self.has_lomo_optimizer = False
|
||||
|
||||
if kwargs_handlers is not None:
|
||||
@ -373,6 +374,11 @@ class Accelerator:
|
||||
raise ValueError("You can only pass one `AutocastKwargs` in `kwargs_handler`.")
|
||||
else:
|
||||
self.autocast_handler = handler
|
||||
elif isinstance(handler, ProfileKwargs):
|
||||
if self.profile_handler is not None:
|
||||
raise ValueError("You can only pass one `ProfileKwargs` in `kwargs_handler`.")
|
||||
else:
|
||||
self.profile_handler = handler
|
||||
|
||||
kwargs = self.init_handler.to_kwargs() if self.init_handler is not None else {}
|
||||
self.state = AcceleratorState(
|
||||
@ -386,10 +392,15 @@ class Accelerator:
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
if self.state.mixed_precision == "fp8" and self.fp8_recipe_handler is None:
|
||||
self.fp8_recipe_handler = FP8RecipeKwargs()
|
||||
|
||||
self.delayed_fp8_autocast = False
|
||||
if self.fp8_recipe_handler is not None:
|
||||
# We already check if FP8 is available during `self.state`
|
||||
if self.state.mixed_precision != "fp8":
|
||||
if self.state.mixed_precision != "fp8" and (
|
||||
self.distributed_type not in (DistributedType.FSDP, DistributedType.DEEPSPEED)
|
||||
):
|
||||
raise ValueError("Passing in a `FP8RecipeKwargs` object requires setting `mixed_precision='fp8'`.")
|
||||
self.delayed_fp8_autocast = self.fp8_recipe_handler.backend == "TE" and self.distributed_type in (
|
||||
DistributedType.MULTI_GPU,
|
||||
@ -460,7 +471,7 @@ class Accelerator:
|
||||
and self.distributed_type not in (DistributedType.DEEPSPEED, DistributedType.MEGATRON_LM)
|
||||
):
|
||||
self.native_amp = True
|
||||
if self.device.type not in ("xpu", "cuda", "npu", "xla", "mlu") or is_torch_xla_available(
|
||||
if self.device.type not in ("xpu", "cuda", "npu", "xla", "mlu", "musa") or is_torch_xla_available(
|
||||
check_is_tpu=True
|
||||
):
|
||||
raise ValueError(f"fp16 mixed precision requires a GPU (not {self.device.type!r}).")
|
||||
@ -473,8 +484,12 @@ class Accelerator:
|
||||
self.scaler = xamp.GradScaler(**kwargs)
|
||||
elif is_mlu_available():
|
||||
self.scaler = torch.mlu.amp.GradScaler(**kwargs)
|
||||
elif is_musa_available():
|
||||
self.scaler = torch.musa.amp.GradScaler(**kwargs)
|
||||
elif is_npu_available():
|
||||
self.scaler = torch.npu.amp.GradScaler(**kwargs)
|
||||
elif is_xpu_available():
|
||||
self.scaler = torch.amp.GradScaler("xpu", **kwargs)
|
||||
else:
|
||||
self.scaler = torch.cuda.amp.GradScaler(**kwargs)
|
||||
|
||||
@ -996,14 +1011,14 @@ class Accelerator:
|
||||
model.require_backward_grad_sync = old_require_backward_grad_sync
|
||||
model.require_forward_param_sync = old_require_forward_param_sync
|
||||
|
||||
def _do_sync(self, force: bool = False):
|
||||
def _do_sync(self):
|
||||
"Sets the right `sync_gradients` context and either resets or increases `self.step`"
|
||||
if self.gradient_state.sync_with_dataloader and self.gradient_state.end_of_dataloader:
|
||||
self.step = 0
|
||||
self.gradient_state._set_sync_gradients(True)
|
||||
else:
|
||||
self.step += 1
|
||||
self.gradient_state._set_sync_gradients(force or ((self.step % self.gradient_state.num_steps) == 0))
|
||||
self.gradient_state._set_sync_gradients((self.step % self.gradient_state.num_steps) == 0)
|
||||
|
||||
@property
|
||||
def sync_gradients(self):
|
||||
@ -1049,12 +1064,21 @@ class Accelerator:
|
||||
... optimizer.zero_grad()
|
||||
```
|
||||
"""
|
||||
# sync_each_batch=True will guarantee below that self.sync_gradients=True, therefore
|
||||
# resulting in the nullcontext always being selected.
|
||||
self._do_sync(force=self.gradient_state.plugin_kwargs.get("sync_each_batch", False))
|
||||
self._do_sync()
|
||||
|
||||
allow_gradient_sync = (
|
||||
self.sync_gradients # must sync if sync gradients need to complete an optimizer step
|
||||
or (
|
||||
# the no_sync context stops the gradients from reducing during distributed training
|
||||
# bringing speedup (potentially at some costs). Here, no_sync can be prevented
|
||||
# by setting sync_each_batch = True.
|
||||
self.use_distributed # only relevant in distributed settings
|
||||
and self.gradient_state.plugin_kwargs.get("sync_each_batch", False)
|
||||
)
|
||||
)
|
||||
with contextlib.ExitStack() as cm_stack:
|
||||
for m in models:
|
||||
cm_stack.enter_context(contextlib.nullcontext() if self.sync_gradients else self.no_sync(m))
|
||||
cm_stack.enter_context(contextlib.nullcontext() if allow_gradient_sync else self.no_sync(m))
|
||||
yield
|
||||
|
||||
@contextmanager
|
||||
@ -1106,6 +1130,7 @@ class Accelerator:
|
||||
DistributedType.MULTI_GPU,
|
||||
DistributedType.MULTI_NPU,
|
||||
DistributedType.MULTI_MLU,
|
||||
DistributedType.MULTI_MUSA,
|
||||
DistributedType.MULTI_XPU,
|
||||
):
|
||||
dl_even_batches_values = []
|
||||
@ -1271,15 +1296,18 @@ class Accelerator:
|
||||
|
||||
# If we're dealing with device placement, this deals with that by...
|
||||
tpu_should_fix_optimizer = self.device_placement and self.distributed_type == DistributedType.XLA
|
||||
if tpu_should_fix_optimizer or (self.mixed_precision == "fp8" and self.fp8_recipe_handler.backend == "TE"):
|
||||
|
||||
if tpu_should_fix_optimizer:
|
||||
# 1. grabbing old model parameters
|
||||
old_named_params = self._get_named_parameters(*args)
|
||||
|
||||
if self.distributed_type in [DistributedType.MULTI_CPU, DistributedType.MULTI_XPU, DistributedType.NO]:
|
||||
if self.device.type == "cpu" and self.state.use_ipex:
|
||||
args = self._prepare_ipex(*args)
|
||||
args = self._prepare_ipex_or_xpu(*args)
|
||||
elif self.device.type == "xpu" and is_xpu_available():
|
||||
args = self._prepare_ipex(*args)
|
||||
args = self._prepare_ipex_or_xpu(*args)
|
||||
if self.fp8_recipe_handler is not None and self.fp8_recipe_handler.backend == "TE":
|
||||
args = self._prepare_te(*args)
|
||||
if self.distributed_type == DistributedType.DEEPSPEED:
|
||||
result = self._prepare_deepspeed(*args)
|
||||
elif self.distributed_type == DistributedType.MEGATRON_LM:
|
||||
@ -1293,8 +1321,7 @@ class Accelerator:
|
||||
self._prepare_one(obj, first_pass=True, device_placement=d) for obj, d in zip(args, device_placement)
|
||||
)
|
||||
result = tuple(self._prepare_one(obj, device_placement=d) for obj, d in zip(result, device_placement))
|
||||
|
||||
if tpu_should_fix_optimizer or (self.mixed_precision == "fp8" and self.fp8_recipe_handler.backend == "TE"):
|
||||
if tpu_should_fix_optimizer:
|
||||
# 2. grabbing new model parameters
|
||||
new_named_params = self._get_named_parameters(*result)
|
||||
# 3. building a map from the first to the second
|
||||
@ -1365,20 +1392,8 @@ class Accelerator:
|
||||
model.forward = convert_outputs_to_fp32(new_forward)
|
||||
|
||||
# We prepare fp8 after, allowing for bf16 autocast to happen first
|
||||
if getattr(self.fp8_recipe_handler, "backend", None) == "TE":
|
||||
if not has_transformer_engine_layers(model):
|
||||
with torch.no_grad():
|
||||
convert_model(model)
|
||||
model._converted_to_transformer_engine = True
|
||||
|
||||
kwargs = self.fp8_recipe_handler.to_kwargs() if self.fp8_recipe_handler is not None else {}
|
||||
if "fp8_format" in kwargs:
|
||||
kwargs["fp8_format"] = getattr(te_recipe.Format, kwargs["fp8_format"])
|
||||
fp8_recipe = te_recipe.DelayedScaling(**kwargs)
|
||||
# If we are in DDP or FSDP, we delay `autocast` until after FSDP/DDP has been initialized
|
||||
# to make use of the process group
|
||||
if not self.delayed_fp8_autocast:
|
||||
model.forward = fp8_autocast(enabled=True, fp8_recipe=fp8_recipe)(model.forward)
|
||||
if getattr(self.fp8_recipe_handler, "backend", None) == "TE" and not self.delayed_fp8_autocast:
|
||||
model = apply_fp8_autowrap(model, self.fp8_recipe_handler)
|
||||
|
||||
if (getattr(model, "is_loaded_in_8bit", False) or getattr(model, "is_loaded_in_4bit", False)) and getattr(
|
||||
model, "hf_device_map", False
|
||||
@ -1401,7 +1416,7 @@ class Accelerator:
|
||||
if (self.device.index is not None) or (current_device_index != 0):
|
||||
raise ValueError(
|
||||
"You can't train a model that has been loaded in 8-bit precision on a different device than the one "
|
||||
"you're training on. Make sure you loaded the model on the correct device using for example `device_map={'':torch.cuda.current_device() or device_map={'':torch.xpu.current_device()}"
|
||||
"you're training on. Make sure you loaded the model on the correct device using for example `device_map={'':torch.cuda.current_device()}` or `device_map={'':torch.xpu.current_device()}`"
|
||||
)
|
||||
|
||||
if "cpu" in model_devices or "disk" in model_devices:
|
||||
@ -1414,6 +1429,7 @@ class Accelerator:
|
||||
if self.distributed_type in (
|
||||
DistributedType.MULTI_GPU,
|
||||
DistributedType.MULTI_MLU,
|
||||
DistributedType.MULTI_MUSA,
|
||||
DistributedType.MULTI_NPU,
|
||||
DistributedType.MULTI_XPU,
|
||||
):
|
||||
@ -1428,7 +1444,10 @@ class Accelerator:
|
||||
model = torch.nn.parallel.DistributedDataParallel(
|
||||
model, device_ids=device_ids, output_device=output_device, **kwargs
|
||||
)
|
||||
if self.ddp_handler is not None:
|
||||
self.ddp_handler.register_comm_hook(model)
|
||||
elif self.distributed_type == DistributedType.FSDP:
|
||||
# We need to fix the optimizer *before* sharding the model
|
||||
from torch.distributed.fsdp.fully_sharded_data_parallel import FullyShardedDataParallel as FSDP
|
||||
|
||||
# Check if the model is already a FSDP model due to `Manual Wrapping` and if so,
|
||||
@ -1546,13 +1565,13 @@ class Accelerator:
|
||||
elif self.distributed_type == DistributedType.MULTI_CPU:
|
||||
kwargs = self.ddp_handler.to_kwargs() if self.ddp_handler is not None else {}
|
||||
model = torch.nn.parallel.DistributedDataParallel(model, **kwargs)
|
||||
if self.ddp_handler is not None:
|
||||
self.ddp_handler.register_comm_hook(model)
|
||||
elif self.distributed_type == DistributedType.XLA and self.state.fork_launched:
|
||||
model = xmp.MpModelWrapper(model).to(self.device)
|
||||
# Now we can apply the FP8 autocast
|
||||
if self.delayed_fp8_autocast:
|
||||
model.forward = fp8_autocast(enabled=True, fp8_recipe=fp8_recipe, fp8_group=model.process_group)(
|
||||
model.forward
|
||||
)
|
||||
model = apply_fp8_autowrap(model, self.fp8_recipe_handler)
|
||||
# torch.compile should be called last and only if the model isn't already compiled.
|
||||
if self.state.dynamo_plugin.backend != DynamoBackend.NO and not is_compiled_module(model):
|
||||
if not is_torch_version(">=", "2.0"):
|
||||
@ -1560,6 +1579,42 @@ class Accelerator:
|
||||
model = torch.compile(model, **self.state.dynamo_plugin.to_kwargs())
|
||||
return model
|
||||
|
||||
def _prepare_te(self, *args):
|
||||
if not is_transformer_engine_available():
|
||||
raise ImportError(
|
||||
"`transformer_engine` was not found on your system. Please ensure that `transformer_engine` is installed"
|
||||
)
|
||||
model, optimizer = None, None
|
||||
num_models, num_optimizers = 0, 0
|
||||
result = [obj for obj in args]
|
||||
for obj in result:
|
||||
if isinstance(obj, torch.nn.Module):
|
||||
model = obj
|
||||
num_models += 1
|
||||
elif isinstance(obj, (torch.optim.Optimizer)):
|
||||
optimizer = obj
|
||||
num_optimizers += 1
|
||||
if optimizer is None and model is None:
|
||||
return result
|
||||
elif optimizer is None or model is None:
|
||||
raise ValueError(
|
||||
"You must pass a model and an optimizer together to `accelerate.prepare()` when using TransformerEngine."
|
||||
)
|
||||
elif num_models > 1 or num_optimizers > 1:
|
||||
raise ValueError(
|
||||
f"You can't use multiple models ({num_models}) or optimizers {num_optimizers} with TransformerEngine."
|
||||
)
|
||||
old_named_params = self._get_named_parameters(model)
|
||||
with torch.no_grad():
|
||||
convert_model(model)
|
||||
new_named_params = self._get_named_parameters(model)
|
||||
mapping = {p: new_named_params[n] for n, p in old_named_params.items()}
|
||||
# We need to switch the optimizer params to the new params *after* the model is wrapped in FSDP
|
||||
for param_group in optimizer.param_groups:
|
||||
param_group["params"] = [mapping[p] for p in param_group["params"]]
|
||||
|
||||
return result
|
||||
|
||||
def _prepare_deepspeed(self, *args):
|
||||
import deepspeed
|
||||
|
||||
@ -1668,6 +1723,9 @@ class Accelerator:
|
||||
)
|
||||
|
||||
if model is not None:
|
||||
# If we are using FP8, we need to apply the autowrap now
|
||||
if getattr(self.fp8_recipe_handler, "backend", None) == "TE":
|
||||
model = apply_fp8_autowrap(model, self.fp8_recipe_handler)
|
||||
# if the model is an MOE, set the appropriate MOE layers as leaf Z3 modules
|
||||
deepspeed_plugin.set_moe_leaf_modules(model)
|
||||
# deal with config keys that use `auto` value and rely on model's hidden_size
|
||||
@ -1699,7 +1757,7 @@ class Accelerator:
|
||||
config_kwargs.update(
|
||||
{
|
||||
"zero_optimization.reduce_bucket_size": hidden_size * hidden_size,
|
||||
"zero_optimization.stage3_prefetch_bucket_size": 0.9 * hidden_size * hidden_size,
|
||||
"zero_optimization.stage3_prefetch_bucket_size": int(0.9 * hidden_size * hidden_size),
|
||||
"zero_optimization.stage3_param_persistence_threshold": 10 * hidden_size,
|
||||
}
|
||||
)
|
||||
@ -1786,6 +1844,7 @@ class Accelerator:
|
||||
|
||||
def _prepare_megatron_lm(self, *args):
|
||||
megatron_lm_plugin = self.state.megatron_lm_plugin
|
||||
micro_batch_size = None
|
||||
if not megatron_lm_plugin.megatron_dataset_flag:
|
||||
batch_sizes = [obj.batch_size for obj in args if hasattr(obj, "batch_size")]
|
||||
if len(batch_sizes) == 0:
|
||||
@ -1804,19 +1863,22 @@ class Accelerator:
|
||||
if isinstance(obj, MegatronLMDummyDataLoader):
|
||||
micro_batch_size = obj.dataset_args["micro_batch_size"]
|
||||
break
|
||||
|
||||
dp_degree = self.num_processes // (megatron_lm_plugin.tp_degree * megatron_lm_plugin.pp_degree)
|
||||
megatron_lm_plugin.set_training_args(micro_batch_size, dp_degree)
|
||||
|
||||
if micro_batch_size is not None:
|
||||
dp_degree = self.num_processes // (megatron_lm_plugin.tp_degree * megatron_lm_plugin.pp_degree)
|
||||
megatron_lm_plugin.set_training_args(micro_batch_size, dp_degree)
|
||||
else:
|
||||
raise ValueError(
|
||||
"When you do not pass the dataloader parameter, the `data_parallel_size`, "
|
||||
"`micro_batch_size`, and `global_batch_size` megatron parameters will not be updated."
|
||||
)
|
||||
model = None
|
||||
optimizer = None
|
||||
scheduler = None
|
||||
is_dummy_scheduler = False
|
||||
batch_data = None
|
||||
for obj in args:
|
||||
if isinstance(obj, torch.utils.data.DataLoader) and batch_data is None:
|
||||
batch_data = next(iter(obj))
|
||||
if isinstance(obj, torch.nn.Module):
|
||||
elif isinstance(obj, torch.nn.Module):
|
||||
model = obj
|
||||
elif isinstance(obj, (torch.optim.Optimizer)):
|
||||
optimizer = obj
|
||||
@ -1828,8 +1890,7 @@ class Accelerator:
|
||||
if optimizer is not None:
|
||||
megatron_lm_plugin.set_optimizer_type(optimizer)
|
||||
if scheduler is not None:
|
||||
is_dummy_scheduler = isinstance(scheduler, MegatronLMDummyScheduler)
|
||||
if not is_dummy_scheduler:
|
||||
if not isinstance(scheduler, MegatronLMDummyScheduler):
|
||||
raise ValueError(
|
||||
"You can't use a custom scheduler with Megatron-LM. Please use the `accelerate.utils.MegatronLMDummyScheduler` instead."
|
||||
)
|
||||
@ -1837,6 +1898,10 @@ class Accelerator:
|
||||
|
||||
# initialize megatron-lm
|
||||
megatron_lm_initialize(self, args_defaults=megatron_lm_plugin.megatron_lm_default_args)
|
||||
|
||||
(model, optimizer, scheduler) = megatron_lm_prepare_model_optimizer_scheduler(self)
|
||||
self.wait_for_everyone()
|
||||
|
||||
counter = 0
|
||||
result = []
|
||||
for obj in args:
|
||||
@ -1852,13 +1917,6 @@ class Accelerator:
|
||||
else:
|
||||
result.append(obj)
|
||||
|
||||
if model is not None:
|
||||
model = megatron_lm_prepare_model(self)
|
||||
if optimizer is not None:
|
||||
optimizer = megatron_lm_prepare_optimizer(self, model)
|
||||
if scheduler is not None:
|
||||
scheduler = megatron_lm_prepare_scheduler(self, optimizer, scheduler)
|
||||
|
||||
if model is not None:
|
||||
model = MegatronEngine(self, model, optimizer, scheduler)
|
||||
if optimizer is not None:
|
||||
@ -1873,26 +1931,32 @@ class Accelerator:
|
||||
result[i] = optimizer
|
||||
elif isinstance(result[i], MegatronLMDummyScheduler):
|
||||
result[i] = scheduler
|
||||
|
||||
if model is not None:
|
||||
self._models.append(model)
|
||||
if len(self._models) > 1:
|
||||
raise AssertionError(
|
||||
"You can't use same `Accelerator()` instance with multiple models when using Megatron-LM"
|
||||
)
|
||||
if optimizer is not None:
|
||||
self._optimizers.append(optimizer)
|
||||
if scheduler is not None:
|
||||
self._schedulers.append(scheduler)
|
||||
if len(self._models) > 1:
|
||||
raise AssertionError(
|
||||
"You can't use same `Accelerator()` instance with multiple models when using Megatron-LM"
|
||||
)
|
||||
|
||||
return tuple(result)
|
||||
|
||||
def _prepare_ipex(self, *args):
|
||||
if not is_ipex_available():
|
||||
raise ImportError(
|
||||
"IPEX is not installed or IPEX's version does not match current PyTorch version. Please refer"
|
||||
" to https://github.com/intel/intel-extension-for-pytorch."
|
||||
)
|
||||
else:
|
||||
import intel_extension_for_pytorch as ipex
|
||||
def _prepare_ipex_or_xpu(self, *args):
|
||||
"""
|
||||
Prepares the model and optimizer for training with IPEX or XPU acceleration. This covers 3 cases: IPEX compiled
with CPU-only support, IPEX compiled with XPU support, and training with the XPU PyTorch backend available in
stock PyTorch starting from version 2.4.
|
||||
"""
|
||||
if self.state.use_ipex:
|
||||
if not is_ipex_available():
|
||||
raise ImportError(
|
||||
"IPEX is not installed or IPEX's version does not match current PyTorch version. Please refer"
|
||||
" to https://github.com/intel/intel-extension-for-pytorch."
|
||||
)
|
||||
|
||||
model = None
|
||||
optimizer = None
|
||||
@ -1905,12 +1969,12 @@ class Accelerator:
|
||||
optimizer = obj
|
||||
if optimizer is not None and model is not None:
|
||||
dtype = torch.bfloat16 if self.state.mixed_precision == "bf16" else None
|
||||
if self.device.type == "xpu" and is_xpu_available():
|
||||
if self.device.type == "xpu":
|
||||
model = model.to(self.device)
|
||||
model, optimizer = torch.xpu.optimize(
|
||||
model, optimizer=optimizer, dtype=dtype, inplace=True, level="O1"
|
||||
)
|
||||
else:
|
||||
# ipex.optimize() is available only for IPEX, both IPEX-CPU and IPEX-XPU
|
||||
if is_ipex_available():
|
||||
import intel_extension_for_pytorch as ipex
|
||||
|
||||
model, optimizer = ipex.optimize(model, optimizer=optimizer, dtype=dtype, inplace=True, level="O1")
|
||||
for i in range(len(result)):
|
||||
if isinstance(result[i], torch.nn.Module):
|
||||
@ -2035,7 +2099,8 @@ class Accelerator:
|
||||
# transformers & accelerate
|
||||
from lomo_optim import AdaLomo, Lomo
|
||||
|
||||
self.has_lomo_optimizer = isinstance(optimizer, (Lomo, AdaLomo))
|
||||
# Support multiple optimizers: https://github.com/huggingface/accelerate/pull/2695#discussion_r1589164607
|
||||
self.has_lomo_optimizer |= isinstance(optimizer, (Lomo, AdaLomo))
|
||||
|
||||
# Ensure we can't double wrap an optimizer due to `find_batch_size`
|
||||
if getattr(optimizer, "_is_accelerate_prepared", False):
|
||||
@ -2265,6 +2330,12 @@ class Accelerator:
|
||||
xm.all_reduce("sum", gradients, scale=1.0 / self.num_processes)
|
||||
# Set is_xla_gradients_synced to True to avoid all-reduce twice in the AcceleratedOptimizer step.
|
||||
acc_opt.gradient_state.is_xla_gradients_synced = True
|
||||
if os.environ.get("ACCELERATE_USE_FSDP", "false") == "true":
|
||||
self.unscale_gradients()
|
||||
parameters = [p for p in parameters]
|
||||
for model in self._models:
|
||||
if parameters == [p for p in model.parameters()]:
|
||||
return model.clip_grad_norm_(max_norm, norm_type)
|
||||
self.unscale_gradients()
|
||||
return torch.nn.utils.clip_grad_norm_(parameters, max_norm, norm_type=norm_type)
|
||||
|
||||
@ -2637,11 +2708,10 @@ class Accelerator:
|
||||
for tracker in self.trackers:
|
||||
tracker.log(values, step=step, **log_kwargs.get(tracker.name, {}))
|
||||
|
||||
@on_main_process
|
||||
def end_training(self):
|
||||
"""
|
||||
Runs any special end training behaviors, such as stopping trackers on the main process only. Should always be
|
||||
called at the end of your script if using experiment tracking.
|
||||
Runs any special end training behaviors, such as stopping trackers on the main process only or destroying the
process group. Should always be called at the end of your script if using experiment tracking.
|
||||
|
||||
Example:
|
||||
|
||||
@ -2657,6 +2727,10 @@ class Accelerator:
|
||||
for tracker in self.trackers:
|
||||
tracker.finish()
|
||||
|
||||
if torch.distributed.is_initialized():
|
||||
# needed when using torch.distributed.init_process_group
|
||||
torch.distributed.destroy_process_group()
|
||||
|
||||
def save(self, obj, f, safe_serialization=False):
|
||||
"""
|
||||
Save the object passed to disk once per machine. Use in place of `torch.save`.
|
||||
@ -2750,9 +2824,11 @@ class Accelerator:
|
||||
if safe_serialization:
|
||||
state_dict = clean_state_dict_for_safetensors(state_dict)
|
||||
weights_name = SAFE_WEIGHTS_NAME if safe_serialization else WEIGHTS_NAME
|
||||
filename_pattern = SAFE_WEIGHTS_PATTERN_NAME if safe_serialization else WEIGHTS_PATTERN_NAME
|
||||
|
||||
# Shard the model if it is too big.
|
||||
shards, index = shard_checkpoint(state_dict, max_shard_size=max_shard_size, weights_name=weights_name)
|
||||
state_dict_split = split_torch_state_dict_into_shards(
|
||||
state_dict, filename_pattern=filename_pattern, max_shard_size=max_shard_size
|
||||
)
|
||||
|
||||
# Clean the folder from a previous save
|
||||
for filename in os.listdir(save_directory):
|
||||
@ -2768,31 +2844,36 @@ class Accelerator:
|
||||
if (
|
||||
filename.startswith(weights_no_suffix)
|
||||
and os.path.isfile(full_filename)
|
||||
and filename not in shards.keys()
|
||||
and filename not in state_dict_split.filename_to_tensors.keys()
|
||||
and reg.fullmatch(filename_no_suffix) is not None
|
||||
and PartialState().is_main_process
|
||||
):
|
||||
os.remove(full_filename)
|
||||
|
||||
# Save the model
|
||||
for shard_file, shard in shards.items():
|
||||
self.save(shard, os.path.join(save_directory, shard_file), safe_serialization=safe_serialization)
|
||||
for filename, tensors in state_dict_split.filename_to_tensors.items():
|
||||
shard = {tensor: state_dict[tensor] for tensor in tensors}
|
||||
self.save(shard, os.path.join(save_directory, filename), safe_serialization=safe_serialization)
|
||||
|
||||
if index is None:
|
||||
path_to_weights = os.path.join(save_directory, WEIGHTS_NAME)
|
||||
logger.info(f"Model weights saved in {path_to_weights}")
|
||||
else:
|
||||
# Save index if sharded
|
||||
if state_dict_split.is_sharded:
|
||||
index = {
|
||||
"metadata": state_dict_split.metadata,
|
||||
"weight_map": state_dict_split.tensor_to_filename,
|
||||
}
|
||||
save_index_file = SAFE_WEIGHTS_INDEX_NAME if safe_serialization else WEIGHTS_INDEX_NAME
|
||||
save_index_file = os.path.join(save_directory, save_index_file)
|
||||
# Save the index as well
|
||||
with open(save_index_file, "w", encoding="utf-8") as f:
|
||||
content = json.dumps(index, indent=2, sort_keys=True) + "\n"
|
||||
f.write(content)
|
||||
logger.info(
|
||||
f"The model is bigger than the maximum size per checkpoint ({max_shard_size}) and is going to be "
|
||||
f"split in {len(shards)} checkpoint shards. You can find where each parameters has been saved in the "
|
||||
f"split in {len(state_dict_split.filename_to_tensors)} checkpoint shards. You can find where each parameters has been saved in the "
|
||||
f"index located at {save_index_file}."
|
||||
)
|
||||
else:
|
||||
path_to_weights = os.path.join(save_directory, WEIGHTS_NAME)
|
||||
logger.info(f"Model weights saved in {path_to_weights}")
|
||||
|
||||
def register_save_state_pre_hook(self, hook: Callable[..., None]) -> hooks.RemovableHandle:
|
||||
"""
|
||||
@ -2951,6 +3032,7 @@ class Accelerator:
|
||||
schedulers,
|
||||
dataloaders,
|
||||
self.state.process_index,
|
||||
self.step,
|
||||
self.scaler,
|
||||
save_on_each_node=self.project_configuration.save_on_each_node,
|
||||
safe_serialization=safe_serialization,
|
||||
@ -3092,13 +3174,14 @@ class Accelerator:
|
||||
if self.num_processes > 1 and self.distributed_type in (
|
||||
DistributedType.MULTI_GPU,
|
||||
DistributedType.MULTI_MLU,
|
||||
DistributedType.MULTI_MUSA,
|
||||
DistributedType.MULTI_NPU,
|
||||
):
|
||||
map_location = "on_device"
|
||||
else:
|
||||
map_location = "cpu"
|
||||
|
||||
load_accelerator_state(
|
||||
override_attributes = load_accelerator_state(
|
||||
input_dir,
|
||||
models,
|
||||
optimizers,
|
||||
@ -3109,11 +3192,15 @@ class Accelerator:
|
||||
map_location,
|
||||
**load_model_func_kwargs,
|
||||
)
|
||||
if "step" in override_attributes:
|
||||
self.step = override_attributes["step"]
|
||||
custom_checkpoints = [
|
||||
f for f in os.listdir(input_dir) if re.search(r"^custom_checkpoint_\d+\.pkl$", f) is not None
|
||||
]
|
||||
if len(custom_checkpoints) != len(self._custom_objects):
|
||||
err = "Number of custom checkpoints in folder {input_dir} does not match the number of registered objects:"
|
||||
err = (
|
||||
f"Number of custom checkpoints in folder {input_dir} does not match the number of registered objects:"
|
||||
)
|
||||
err += f"\n\tFound checkpoints: {len(custom_checkpoints)}"
|
||||
err += f"\n\tRegistered objects: {len(self._custom_objects)}\n"
|
||||
err += "Please make sure to only load checkpoints from folders that were created with the same set of registered objects,"
|
||||
@ -3242,6 +3329,8 @@ class Accelerator:
|
||||
from torch.distributed.fsdp import FullStateDictConfig, StateDictType
|
||||
from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
|
||||
|
||||
if unwrap:
|
||||
model = self.unwrap_model(model)
|
||||
full_state_dict_config = FullStateDictConfig(offload_to_cpu=True, rank0_only=True)
|
||||
with FSDP.state_dict_type(model, StateDictType.FULL_STATE_DICT, full_state_dict_config):
|
||||
state_dict = model.state_dict()
|
||||
@ -3325,6 +3414,66 @@ class Accelerator:
|
||||
yield
|
||||
autocast_context.__exit__(*sys.exc_info())
|
||||
|
||||
    @contextmanager
    def profile(self, profile_handler: ProfileKwargs | None = None):
        """
        Will profile the code inside the context manager. The profile will be saved to a Chrome Trace file if
        `profile_handler.output_trace_dir` is set.

        A different `profile_handler` can be passed in to override the one set in the `Accelerator` object.

        Args:
            profile_handler (`ProfileKwargs`, *optional*):
                The profile handler to use for this context manager. If not passed, will use the one set in the
                `Accelerator` object.

        Example:

        ```python
        # Profile with default settings
        from accelerate import Accelerator
        from accelerate.utils import ProfileKwargs

        accelerator = Accelerator()
        with accelerator.profile() as prof:
            train()
        accelerator.print(prof.key_averages().table())


        # Profile with the custom handler
        def custom_handler(prof):
            print(prof.key_averages().table(sort_by="self_cpu_time_total", row_limit=10))


        kwargs = ProfileKwargs(schedule_option=dict(wait=1, warmup=1, active=1), on_trace_ready=custom_handler)
        accelerator = Accelerator(kwargs_handlers=[kwargs])
        with accelerator.profile() as prof:
            for _ in range(10):
                train_iteration()
                prof.step()


        # Profile and export to Chrome Trace
        kwargs = ProfileKwargs(output_trace_dir="output_trace")
        accelerator = Accelerator(kwargs_handlers=[kwargs])
        with accelerator.profile():
            train()
        ```
        """
        profile_handler = profile_handler or self.profile_handler or ProfileKwargs()

        with profile_handler.build() as profiler:
            yield profiler

        if profile_handler.output_trace_dir is None:
            return

        os.makedirs(profile_handler.output_trace_dir, exist_ok=True)
        profiler.export_chrome_trace(
            os.path.join(profile_handler.output_trace_dir, PROFILE_PATTERN_NAME.format(suffix=self.process_index))
        )
        self.wait_for_everyone()

    @property
    def optimizer_step_was_skipped(self):
        """

@ -38,6 +38,7 @@ from .utils import (
|
||||
get_balanced_memory,
|
||||
infer_auto_device_map,
|
||||
is_mlu_available,
|
||||
is_musa_available,
|
||||
is_npu_available,
|
||||
is_torch_version,
|
||||
is_xpu_available,
|
||||
@ -432,8 +433,8 @@ def dispatch_model(
|
||||
[device for device in set(device_map.values()) if device in ("cpu", "disk")]
|
||||
)
|
||||
if len(offloaded_devices_str) > 0:
|
||||
logging.warning(
|
||||
f"Some parameters are on the meta device device because they were offloaded to the {offloaded_devices_str}."
|
||||
logger.warning(
|
||||
f"Some parameters are on the meta device because they were offloaded to the {offloaded_devices_str}."
|
||||
)
|
||||
|
||||
# Attaching the hook may break tied weights, so we retie them
|
||||
@ -457,11 +458,14 @@ def dispatch_model(
|
||||
|
||||
return wrapper
|
||||
|
||||
# Make sure to update _accelerate_added_attributes in hooks.py if you add any hook
|
||||
model.to = add_warning(model.to, model)
|
||||
if is_npu_available():
|
||||
model.npu = add_warning(model.npu, model)
|
||||
elif is_mlu_available():
|
||||
model.mlu = add_warning(model.mlu, model)
|
||||
elif is_musa_available():
|
||||
model.musa = add_warning(model.musa, model)
|
||||
elif is_xpu_available():
|
||||
model.xpu = add_warning(model.xpu, model)
|
||||
else:
|
||||
@ -482,6 +486,8 @@ def dispatch_model(
|
||||
device = f"npu:{device}"
|
||||
elif is_mlu_available() and isinstance(device, int):
|
||||
device = f"mlu:{device}"
|
||||
elif is_musa_available() and isinstance(device, int):
|
||||
device = f"musa:{device}"
|
||||
elif is_xpu_available() and isinstance(device, int):
|
||||
device = f"xpu:{device}"
|
||||
if device != "disk":
|
||||
|
||||
@ -18,7 +18,7 @@ from typing import List
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
from safetensors.torch import load_file
|
||||
from safetensors.torch import load_model
|
||||
from torch.cuda.amp import GradScaler
|
||||
|
||||
from .utils import (
|
||||
@ -32,6 +32,7 @@ from .utils import (
|
||||
SCHEDULER_NAME,
|
||||
WEIGHTS_NAME,
|
||||
get_pretty_name,
|
||||
is_mlu_available,
|
||||
is_torch_xla_available,
|
||||
is_xpu_available,
|
||||
save,
|
||||
@ -55,6 +56,7 @@ def save_accelerator_state(
|
||||
schedulers: list,
|
||||
dataloaders: list,
|
||||
process_index: int,
|
||||
step: int,
|
||||
scaler: GradScaler = None,
|
||||
save_on_each_node: bool = False,
|
||||
safe_serialization: bool = True,
|
||||
@ -82,6 +84,8 @@ def save_accelerator_state(
|
||||
A list of dataloader instances to save their sampler states
|
||||
process_index (`int`):
|
||||
The current process index in the Accelerator state
|
||||
step (`int`):
|
||||
The current step in the internal step tracker
|
||||
scaler (`torch.cuda.amp.GradScaler`, *optional*):
|
||||
An optional gradient scaler instance to save
|
||||
save_on_each_node (`bool`, *optional*):
|
||||
@ -134,11 +138,14 @@ def save_accelerator_state(
|
||||
# Random number generator states
|
||||
states = {}
|
||||
states_name = f"{RNG_STATE_NAME}_{process_index}.pkl"
|
||||
states["step"] = step
|
||||
states["random_state"] = random.getstate()
|
||||
states["numpy_random_seed"] = np.random.get_state()
|
||||
states["torch_manual_seed"] = torch.get_rng_state()
|
||||
if is_xpu_available():
|
||||
states["torch_xpu_manual_seed"] = torch.xpu.get_rng_state_all()
|
||||
if is_mlu_available():
|
||||
states["torch_mlu_manual_seed"] = torch.mlu.get_rng_state_all()
|
||||
else:
|
||||
states["torch_cuda_manual_seed"] = torch.cuda.get_rng_state_all()
|
||||
if is_torch_xla_available():
|
||||
@ -180,7 +187,12 @@ def load_accelerator_state(
|
||||
What device to load the optimizer state onto. Should be one of either "cpu" or "on_device".
|
||||
load_model_func_kwargs (`dict`, *optional*):
|
||||
Additional arguments that can be passed to the model's `load_state_dict` method.
|
||||
|
||||
Returns:
|
||||
`dict`: Contains the `Accelerator` attributes to override while loading the state.
|
||||
"""
|
||||
# stores the `Accelerator` attributes to override
|
||||
override_attributes = dict()
|
||||
if map_location not in [None, "cpu", "on_device"]:
|
||||
raise TypeError(
|
||||
"Unsupported optimizer map location passed, please choose one of `None`, `'cpu'`, or `'on_device'`"
|
||||
@ -196,12 +208,12 @@ def load_accelerator_state(
|
||||
ending = f"_{i}" if i > 0 else ""
|
||||
input_model_file = input_dir.joinpath(f"{SAFE_MODEL_NAME}{ending}.safetensors")
|
||||
if input_model_file.exists():
|
||||
state_dict = load_file(input_model_file, device=str(map_location))
|
||||
load_model(model, input_model_file, device=str(map_location), **load_model_func_kwargs)
|
||||
else:
|
||||
# Load with torch
|
||||
input_model_file = input_dir.joinpath(f"{MODEL_NAME}{ending}.bin")
|
||||
state_dict = torch.load(input_model_file, map_location=map_location)
|
||||
models[i].load_state_dict(state_dict, **load_model_func_kwargs)
|
||||
model.load_state_dict(state_dict, **load_model_func_kwargs)
|
||||
logger.info("All model weights loaded successfully")
|
||||
|
||||
# Optimizer states
|
||||
@ -240,11 +252,15 @@ def load_accelerator_state(
|
||||
# Random states
|
||||
try:
|
||||
states = torch.load(input_dir.joinpath(f"{RNG_STATE_NAME}_{process_index}.pkl"))
|
||||
if "step" in states:
|
||||
override_attributes["step"] = states["step"]
|
||||
random.setstate(states["random_state"])
|
||||
np.random.set_state(states["numpy_random_seed"])
|
||||
torch.set_rng_state(states["torch_manual_seed"])
|
||||
if is_xpu_available():
|
||||
torch.xpu.set_rng_state_all(states["torch_xpu_manual_seed"])
|
||||
if is_mlu_available():
|
||||
torch.mlu.set_rng_state_all(states["torch_mlu_manual_seed"])
|
||||
else:
|
||||
torch.cuda.set_rng_state_all(states["torch_cuda_manual_seed"])
|
||||
if is_torch_xla_available():
|
||||
@ -253,6 +269,8 @@ def load_accelerator_state(
|
||||
except Exception:
|
||||
logger.info("Could not load random states")
|
||||
|
||||
return override_attributes
|
||||
|
||||
|
||||
def save_custom_state(obj, path, index: int = 0, save_on_each_node: bool = False):
|
||||
"""
|
||||
|
||||
@ -18,6 +18,7 @@ from accelerate.commands.config import get_config_parser
|
||||
from accelerate.commands.env import env_command_parser
|
||||
from accelerate.commands.estimate import estimate_command_parser
|
||||
from accelerate.commands.launch import launch_command_parser
|
||||
from accelerate.commands.merge import merge_command_parser
|
||||
from accelerate.commands.test import test_command_parser
|
||||
from accelerate.commands.tpu import tpu_command_parser
|
||||
from accelerate.commands.utils import CustomArgumentParser
|
||||
@ -32,6 +33,7 @@ def main():
|
||||
estimate_command_parser(subparsers=subparsers)
|
||||
env_command_parser(subparsers=subparsers)
|
||||
launch_command_parser(subparsers=subparsers)
|
||||
merge_command_parser(subparsers=subparsers)
|
||||
tpu_command_parser(subparsers=subparsers)
|
||||
test_command_parser(subparsers=subparsers)
|
||||
|
||||
|
||||
@ -20,9 +20,13 @@ from ...utils import (
|
||||
ComputeEnvironment,
|
||||
DistributedType,
|
||||
is_deepspeed_available,
|
||||
is_fp8_available,
|
||||
is_mlu_available,
|
||||
is_mps_available,
|
||||
is_msamp_available,
|
||||
is_musa_available,
|
||||
is_npu_available,
|
||||
is_transformer_engine_available,
|
||||
is_transformers_available,
|
||||
is_xpu_available,
|
||||
)
|
||||
@ -41,6 +45,7 @@ from .config_utils import (
|
||||
_ask_options,
|
||||
_convert_distributed_mode,
|
||||
_convert_dynamo_backend,
|
||||
_convert_fp8_backend,
|
||||
_convert_mixed_precision,
|
||||
_convert_yes_no_to_bool,
|
||||
)
|
||||
@ -49,7 +54,16 @@ from .config_utils import (
|
||||
def get_cluster_input():
|
||||
distributed_type = _ask_options(
|
||||
"Which type of machine are you using?",
|
||||
["No distributed training", "multi-CPU", "multi-XPU", "multi-GPU", "multi-NPU", "multi-MLU", "TPU"],
|
||||
[
|
||||
"No distributed training",
|
||||
"multi-CPU",
|
||||
"multi-XPU",
|
||||
"multi-GPU",
|
||||
"multi-NPU",
|
||||
"multi-MLU",
|
||||
"multi-MUSA",
|
||||
"TPU",
|
||||
],
|
||||
_convert_distributed_mode,
|
||||
)
|
||||
|
||||
@ -66,6 +80,7 @@ def get_cluster_input():
|
||||
if distributed_type in [
|
||||
DistributedType.MULTI_GPU,
|
||||
DistributedType.MULTI_MLU,
|
||||
DistributedType.MULTI_MUSA,
|
||||
DistributedType.MULTI_NPU,
|
||||
DistributedType.MULTI_XPU,
|
||||
DistributedType.MULTI_CPU,
|
||||
@ -145,7 +160,13 @@ def get_cluster_input():
|
||||
not use_cpu
|
||||
and is_xpu_available()
|
||||
and distributed_type
|
||||
not in [DistributedType.MULTI_GPU, DistributedType.MULTI_NPU, DistributedType.MULTI_MLU, DistributedType.XLA]
|
||||
not in [
|
||||
DistributedType.MULTI_GPU,
|
||||
DistributedType.MULTI_NPU,
|
||||
DistributedType.MULTI_MLU,
|
||||
DistributedType.XLA,
|
||||
DistributedType.MULTI_MUSA,
|
||||
]
|
||||
):
|
||||
ipex_config["use_xpu"] = _ask_field(
|
||||
"Do you want to use XPU plugin to speed up training on XPU? [yes/NO]:",
|
||||
@ -205,6 +226,7 @@ def get_cluster_input():
|
||||
DistributedType.MULTI_XPU,
|
||||
DistributedType.MULTI_NPU,
|
||||
DistributedType.MULTI_MLU,
|
||||
DistributedType.MULTI_MUSA,
|
||||
DistributedType.NO,
|
||||
]
|
||||
and not use_mps
|
||||
@ -358,6 +380,7 @@ def get_cluster_input():
|
||||
DistributedType.MULTI_GPU,
|
||||
DistributedType.MULTI_NPU,
|
||||
DistributedType.MULTI_MLU,
|
||||
DistributedType.MULTI_MUSA,
|
||||
DistributedType.MULTI_XPU,
|
||||
]:
|
||||
use_fsdp = _ask_field(
|
||||
@ -446,6 +469,12 @@ def get_cluster_input():
|
||||
default=True,
|
||||
error_message="Please enter yes or no.",
|
||||
)
|
||||
fsdp_config["fsdp_activation_checkpointing"] = _ask_field(
|
||||
"Do you want to enable FSDP activation checkpointing? [yes/NO]: ",
|
||||
_convert_yes_no_to_bool,
|
||||
default=False,
|
||||
error_message="Please enter yes or no.",
|
||||
)
|
||||
|
||||
megatron_lm_config = {}
|
||||
if distributed_type in [DistributedType.MULTI_GPU]:
|
||||
@ -523,6 +552,7 @@ def get_cluster_input():
|
||||
DistributedType.MULTI_XPU,
|
||||
DistributedType.MULTI_GPU,
|
||||
DistributedType.MULTI_MLU,
|
||||
DistributedType.MULTI_MUSA,
|
||||
DistributedType.MULTI_NPU,
|
||||
DistributedType.XLA,
|
||||
]:
|
||||
@ -559,6 +589,7 @@ def get_cluster_input():
|
||||
in [
|
||||
DistributedType.MULTI_GPU,
|
||||
DistributedType.MULTI_MLU,
|
||||
DistributedType.MULTI_MUSA,
|
||||
DistributedType.MULTI_NPU,
|
||||
DistributedType.MULTI_XPU,
|
||||
DistributedType.NO,
|
||||
@ -570,6 +601,8 @@ def get_cluster_input():
|
||||
machine_type = "NPU(s)"
|
||||
elif is_mlu_available():
|
||||
machine_type = "MLU(s)"
|
||||
elif is_musa_available():
|
||||
machine_type = "MUSA(s)"
|
||||
else:
|
||||
machine_type = "GPU(s)"
|
||||
gpu_ids = _ask_field(
|
||||
@ -579,7 +612,7 @@ def get_cluster_input():
|
||||
|
||||
# CPU affinity is only supported on NVIDIA hardware for now
|
||||
enable_cpu_affinity = False
|
||||
if distributed_type == (DistributedType.NO, DistributedType.MULTI_GPU) and not use_cpu and not use_mps:
|
||||
if distributed_type in (DistributedType.NO, DistributedType.MULTI_GPU) and not use_cpu and not use_mps:
|
||||
enable_cpu_affinity = _ask_field(
|
||||
"Would you like to enable numa efficiency? (Currently only supported on NVIDIA hardware). [yes/NO]: ",
|
||||
_convert_yes_no_to_bool,
|
||||
@ -587,6 +620,7 @@ def get_cluster_input():
|
||||
error_message="Please enter yes or no.",
|
||||
)
|
||||
|
||||
fp8_config = None
|
||||
if distributed_type == DistributedType.XLA:
|
||||
mixed_precision = "no"
|
||||
main_training_function = _ask_field(
|
||||
@ -668,10 +702,86 @@ def get_cluster_input():
|
||||
mixed_precision = None
|
||||
else:
|
||||
mixed_precision = _ask_options(
|
||||
"Do you wish to use FP16 or BF16 (mixed precision)?",
|
||||
"Do you wish to use mixed precision?",
|
||||
["no", "fp16", "bf16", "fp8"],
|
||||
_convert_mixed_precision,
|
||||
)
|
||||
if mixed_precision == "fp8":
|
||||
if not is_fp8_available():
|
||||
raise ValueError("FP8 (either Transformer Engine or MSAMP) is not installed on this machine.")
|
||||
fp8_config = {}
|
||||
fp8_config["backend"] = _ask_options(
|
||||
"Which FP8 backend do you want to use?",
|
||||
["te", "msamp"],
|
||||
_convert_fp8_backend,
|
||||
)
|
||||
if fp8_config["backend"] == "TE":
|
||||
if not is_transformer_engine_available():
|
||||
raise ValueError("TransformersEngine was selected, but it is not installed on this machine.")
|
||||
fp8_config["use_autocast_during_eval"] = _ask_field(
|
||||
"Do you want to use FP8 autocast during eval mode? Generally better metrics are found when this is disabled [yes/NO]: ",
|
||||
_convert_yes_no_to_bool,
|
||||
default=False,
|
||||
)
|
||||
fp8_config["margin"] = _ask_field(
|
||||
"What margin should be used for gradient scaling? [0]: ",
|
||||
int,
|
||||
default=0,
|
||||
)
|
||||
fp8_config["interval"] = _ask_field(
|
||||
"What interval should be used for for how often the scaling factor is recomputed? [1]: ",
|
||||
int,
|
||||
default=1,
|
||||
)
|
||||
fp8_config["fp8_format"] = _ask_options(
|
||||
"Which weight format should be used?",
|
||||
["E4M3", "HYBRID"],
|
||||
lambda x: "E4M3" if x == 0 else "HYBRID",
|
||||
default=0,
|
||||
)
|
||||
fp8_config["amax_history_length"] = _ask_field(
|
||||
"What length of history should be used for the amax scaling factor computation? [1024]: ",
|
||||
int,
|
||||
default=1024,
|
||||
)
|
||||
fp8_config["amax_compute_algorithm"] = _ask_options(
|
||||
"Which algorithm should be used for the amax scaling factor computation?",
|
||||
["max", "most_recent"],
|
||||
lambda x: "max" if x == 0 else "most_recent",
|
||||
default=0,
|
||||
)
|
||||
fp8_config["override_linear_precision"] = _ask_field(
|
||||
"Do you want to to execute `fprop`, `dgrad`, and `wgrad` GEMMS in higher precision? [yes/NO]: ",
|
||||
_convert_yes_no_to_bool,
|
||||
default=False,
|
||||
)
|
||||
if fp8_config["override_linear_precision"]:
|
||||
fprop = _ask_field(
|
||||
"Should `fprop` be executed in higher precision? [yes/NO]: ",
|
||||
_convert_yes_no_to_bool,
|
||||
default=False,
|
||||
)
|
||||
dgrad = _ask_field(
|
||||
"Should `dgrad` be executed in higher precision? [yes/NO]: ",
|
||||
_convert_yes_no_to_bool,
|
||||
default=False,
|
||||
)
|
||||
wgrad = _ask_field(
|
||||
"Should `wgrad` be executed in higher precision? [yes/NO]: ",
|
||||
_convert_yes_no_to_bool,
|
||||
default=False,
|
||||
)
|
||||
fp8_config["override_linear_precision"] = (fprop, dgrad, wgrad)
|
||||
|
||||
elif fp8_config["backend"] == "MSAMP":
|
||||
if not is_msamp_available():
|
||||
raise ValueError("MSAMP was selected, but it is not installed on this machine.")
|
||||
fp8_config["optimization_level"] = _ask_options(
|
||||
"Which optimization level should be used?",
|
||||
["O1", "O2"],
|
||||
lambda x: "O1" if x == 0 else "O2",
|
||||
default=1,
|
||||
)
|
||||
|
||||
if use_dynamo and mixed_precision == "no" and not use_cpu:
|
||||
print(
|
||||
@ -695,6 +805,7 @@ def get_cluster_input():
|
||||
main_process_ip=main_process_ip,
|
||||
main_process_port=main_process_port,
|
||||
main_training_function=main_training_function,
|
||||
fp8_config=fp8_config,
|
||||
deepspeed_config=deepspeed_config,
|
||||
fsdp_config=fsdp_config,
|
||||
megatron_lm_config=megatron_lm_config,
|
||||
|
||||
@ -83,11 +83,19 @@ class BaseConfig:
|
||||
def to_dict(self):
|
||||
result = self.__dict__
|
||||
# For serialization, it's best to convert Enums to strings (or their underlying value type).
|
||||
for key, value in result.items():
|
||||
|
||||
def _convert_enums(value):
|
||||
if isinstance(value, Enum):
|
||||
result[key] = value.value
|
||||
if isinstance(value, dict) and not bool(value):
|
||||
result[key] = None
|
||||
return value.value
|
||||
if isinstance(value, dict):
|
||||
if not bool(value):
|
||||
return None
|
||||
for key1, value1 in value.items():
|
||||
value[key1] = _convert_enums(value1)
|
||||
return value
|
||||
|
||||
for key, value in result.items():
|
||||
result[key] = _convert_enums(value)
|
||||
result = {k: v for k, v in result.items() if v is not None}
|
||||
return result
|
||||
|
||||
@ -184,6 +192,8 @@ class ClusterConfig(BaseConfig):
|
||||
main_training_function: str = "main"
|
||||
enable_cpu_affinity: bool = False
|
||||
|
||||
# args for FP8 training
|
||||
fp8_config: dict = None
|
||||
# args for deepspeed_plugin
|
||||
deepspeed_config: dict = None
|
||||
# args for fsdp
|
||||
@ -221,6 +231,8 @@ class ClusterConfig(BaseConfig):
|
||||
self.ipex_config = {}
|
||||
if self.mpirun_config is None:
|
||||
self.mpirun_config = {}
|
||||
if self.fp8_config is None:
|
||||
self.fp8_config = {}
|
||||
return super().__post_init__()
|
||||
|
||||
|
||||
@ -241,3 +253,4 @@ class SageMakerConfig(BaseConfig):
|
||||
sagemaker_metrics_file: str = None
|
||||
additional_args: dict = None
|
||||
dynamo_config: dict = None
|
||||
enable_cpu_affinity: bool = False
|
||||
|
||||
@ -20,6 +20,7 @@ from ...utils.dataclasses import (
|
||||
ComputeEnvironment,
|
||||
DistributedType,
|
||||
DynamoBackend,
|
||||
FP8BackendType,
|
||||
PrecisionType,
|
||||
SageMakerDistributedType,
|
||||
)
|
||||
@ -37,6 +38,8 @@ DYNAMO_BACKENDS = [
|
||||
"FX2TRT",
|
||||
"ONNXRT",
|
||||
"TENSORRT",
|
||||
"AOT_TORCHXLA_TRACE_ONCE",
|
||||
"TORHCHXLA_TRACE_ONCE",
|
||||
"IPEX",
|
||||
"TVM",
|
||||
]
|
||||
@ -68,7 +71,9 @@ def _convert_compute_environment(value):
|
||||
|
||||
def _convert_distributed_mode(value):
|
||||
value = int(value)
|
||||
return DistributedType(["NO", "MULTI_CPU", "MULTI_XPU", "MULTI_GPU", "MULTI_NPU", "MULTI_MLU", "XLA"][value])
|
||||
return DistributedType(
|
||||
["NO", "MULTI_CPU", "MULTI_XPU", "MULTI_GPU", "MULTI_NPU", "MULTI_MLU", "MULTI_MUSA", "XLA"][value]
|
||||
)
|
||||
|
||||
|
||||
def _convert_dynamo_backend(value):
|
||||
@ -86,6 +91,11 @@ def _convert_sagemaker_distributed_mode(value):
|
||||
return SageMakerDistributedType(["NO", "DATA_PARALLEL", "MODEL_PARALLEL"][value])
|
||||
|
||||
|
||||
def _convert_fp8_backend(value):
|
||||
value = int(value)
|
||||
return FP8BackendType(["TE", "MSAMP"][value])
|
||||
|
||||
|
||||
def _convert_yes_no_to_bool(value):
|
||||
return {"yes": True, "no": False}[value.lower()]
|
||||
|
||||
|
||||
@ -18,7 +18,7 @@ from pathlib import Path
|
||||
|
||||
import torch
|
||||
|
||||
from ...utils import is_mlu_available, is_npu_available, is_xpu_available
|
||||
from ...utils import is_mlu_available, is_musa_available, is_npu_available, is_xpu_available
|
||||
from .config_args import ClusterConfig, default_json_config_file
|
||||
from .config_utils import SubcommandHelpFormatter
|
||||
|
||||
@ -65,6 +65,14 @@ def write_basic_config(mixed_precision="no", save_location: str = default_json_c
|
||||
config["distributed_type"] = "MULTI_MLU"
|
||||
else:
|
||||
config["distributed_type"] = "NO"
|
||||
elif is_musa_available():
|
||||
num_musas = torch.musa.device_count()
|
||||
config["num_processes"] = num_musas
|
||||
config["use_cpu"] = False
|
||||
if num_musas > 1:
|
||||
config["distributed_type"] = "MULTI_MUSA"
|
||||
else:
|
||||
config["distributed_type"] = "NO"
|
||||
elif torch.cuda.is_available():
|
||||
num_gpus = torch.cuda.device_count()
|
||||
config["num_processes"] = num_gpus
|
||||
@ -95,6 +103,7 @@ def write_basic_config(mixed_precision="no", save_location: str = default_json_c
|
||||
config["num_processes"] = 1
|
||||
config["distributed_type"] = "NO"
|
||||
config["debug"] = False
|
||||
config["enable_cpu_affinity"] = False
|
||||
config = ClusterConfig(**config)
|
||||
config.to_json_file(path)
|
||||
return path
|
||||
|
||||
@ -26,7 +26,7 @@ import torch
|
||||
from accelerate import __version__ as version
|
||||
from accelerate.commands.config import default_config_file, load_config_from_file
|
||||
|
||||
from ..utils import is_mlu_available, is_npu_available, is_xpu_available
|
||||
from ..utils import is_mlu_available, is_musa_available, is_npu_available, is_xpu_available
|
||||
|
||||
|
||||
def env_command_parser(subparsers=None):
|
||||
@ -49,6 +49,7 @@ def env_command(args):
|
||||
pt_cuda_available = torch.cuda.is_available()
|
||||
pt_xpu_available = is_xpu_available()
|
||||
pt_mlu_available = is_mlu_available()
|
||||
pt_musa_available = is_musa_available()
|
||||
pt_npu_available = is_npu_available()
|
||||
|
||||
accelerate_config = "Not found"
|
||||
@ -75,10 +76,13 @@ def env_command(args):
|
||||
"PyTorch XPU available": str(pt_xpu_available),
|
||||
"PyTorch NPU available": str(pt_npu_available),
|
||||
"PyTorch MLU available": str(pt_mlu_available),
|
||||
"PyTorch MUSA available": str(pt_musa_available),
|
||||
"System RAM": f"{psutil.virtual_memory().total / 1024 ** 3:.2f} GB",
|
||||
}
|
||||
if pt_cuda_available:
|
||||
info["GPU type"] = torch.cuda.get_device_name()
|
||||
if pt_mlu_available:
|
||||
info["MLU type"] = torch.mlu.get_device_name()
|
||||
if pt_npu_available:
|
||||
info["CANN version"] = torch.version.cann
|
||||
|
||||
|
||||
@ -38,7 +38,7 @@ def verify_on_hub(repo: str, token: str = None):
|
||||
"Verifies that the model is on the hub and returns the model info."
|
||||
try:
|
||||
return model_info(repo, token=token)
|
||||
except GatedRepoError:
|
||||
except (OSError, GatedRepoError):
|
||||
return "gated"
|
||||
except RepositoryNotFoundError:
|
||||
return "repo"
|
||||
|
||||
@ -40,6 +40,7 @@ from accelerate.utils import (
|
||||
is_bf16_available,
|
||||
is_deepspeed_available,
|
||||
is_mlu_available,
|
||||
is_musa_available,
|
||||
is_npu_available,
|
||||
is_rich_available,
|
||||
is_sagemaker_available,
|
||||
@ -52,6 +53,7 @@ from accelerate.utils import (
|
||||
prepare_sagemager_args_inputs,
|
||||
prepare_simple_launcher_cmd_env,
|
||||
prepare_tpu,
|
||||
str_to_bool,
|
||||
)
|
||||
from accelerate.utils.constants import DEEPSPEED_MULTINODE_LAUNCHERS, TORCH_DYNAMO_MODES
|
||||
|
||||
@ -73,11 +75,14 @@ options_to_group = {
|
||||
"use_deepspeed": "DeepSpeed Arguments",
|
||||
"use_fsdp": "FSDP Arguments",
|
||||
"use_megatron_lm": "Megatron-LM Arguments",
|
||||
"fp8_backend": "FP8 Arguments",
|
||||
}
|
||||
|
||||
|
||||
def clean_option(option):
|
||||
"Finds all cases of - after the first two characters and changes them to _"
|
||||
if "fp8_backend" in option:
|
||||
option = "--fp8_backend"
|
||||
if option.startswith("--"):
|
||||
return option[2:].replace("-", "_")
|
||||
|
||||
@ -213,7 +218,6 @@ def launch_command_parser(subparsers=None):
|
||||
action="store_true",
|
||||
help="Whether or not CPU affinity and balancing should be enabled. Currently only supported on NVIDIA hardware.",
|
||||
)
|
||||
|
||||
# Dynamo arguments
|
||||
resource_args.add_argument(
|
||||
"--dynamo_backend",
|
||||
@ -303,6 +307,15 @@ def launch_command_parser(subparsers=None):
|
||||
type=str,
|
||||
help="Tee std streams into a log file and also to console.",
|
||||
)
|
||||
distributed_args.add_argument(
|
||||
"--log_dir",
|
||||
type=str,
|
||||
default=None,
|
||||
help=(
|
||||
"Base directory to use for log files when using torchrun/torch.distributed.run as launcher. "
|
||||
"Use with --tee to redirect std streams info log files."
|
||||
),
|
||||
)
|
||||
distributed_args.add_argument(
|
||||
"--role",
|
||||
type=str,
|
||||
@ -331,7 +344,7 @@ def launch_command_parser(subparsers=None):
|
||||
distributed_args.add_argument(
|
||||
"--monitor_interval",
|
||||
type=float,
|
||||
default=5,
|
||||
default=0.1,
|
||||
help="Interval, in seconds, to monitor the state of workers.",
|
||||
)
|
||||
parser.add_argument(
|
||||
@ -575,6 +588,12 @@ def launch_command_parser(subparsers=None):
|
||||
help="If True, each individually wrapped FSDP unit will broadcast module parameters from rank 0."
|
||||
" (useful only when `use_fsdp` flag is passed).",
|
||||
)
|
||||
fsdp_args.add_argument(
|
||||
"--fsdp_activation_checkpointing",
|
||||
default="false",
|
||||
type=str,
|
||||
help="Decides Whether (true|false) intermediate activations are freed during the forward pass, and a checkpoint is left as a placeholder. (useful only when `use_fsdp` flag is passed).",
|
||||
)
|
||||
|
||||
# megatron_lm args
|
||||
megatron_lm_args = parser.add_argument_group("Megatron-LM Arguments", "Arguments related to Megatron-LM.")
|
||||
@ -626,6 +645,68 @@ def launch_command_parser(subparsers=None):
|
||||
"(useful only when `use_megatron_lm` flag is passed).",
|
||||
)
|
||||
|
||||
# FP8 arguments
|
||||
fp8_args = parser.add_argument_group(
|
||||
"FP8 Arguments", "Arguments related to FP8 training (requires `--mixed_precision=fp8`)"
|
||||
)
|
||||
fp8_args.add_argument(
|
||||
"--fp8_backend",
|
||||
type=str,
|
||||
choices=["te", "msamp"],
|
||||
help="Choose a backend to train with FP8 (te: TransformerEngine, msamp: MS-AMP)",
|
||||
)
|
||||
fp8_args.add_argument(
|
||||
"--fp8_use_autocast_during_eval",
|
||||
default=False,
|
||||
action="store_true",
|
||||
help="Whether to use FP8 autocast during eval mode (useful only when `--fp8_backend=te` is passed). Generally better metrics are found when this is not passed.",
|
||||
)
|
||||
fp8_args.add_argument(
|
||||
"--fp8_margin",
|
||||
type=int,
|
||||
default=0,
|
||||
help="The margin to use for the gradient scaling (useful only when `--fp8_backend=te` is passed).",
|
||||
)
|
||||
fp8_args.add_argument(
|
||||
"--fp8_interval",
|
||||
type=int,
|
||||
default=1,
|
||||
help="The interval to use for how often the scaling factor is recomputed (useful only when `--fp8_backend=te` is passed).",
|
||||
)
|
||||
fp8_args.add_argument(
|
||||
"--fp8_format",
|
||||
type=str,
|
||||
default="E4M3",
|
||||
choices=["E4M3", "HYBRID"],
|
||||
help="The format to use for the FP8 recipe (useful only when `--fp8_backend=te` is passed).",
|
||||
)
|
||||
fp8_args.add_argument(
|
||||
"--fp8_amax_history_len",
|
||||
type=int,
|
||||
default=1024,
|
||||
help="The length of the history to use for the scaling factor computation (useful only when `--fp8_backend=te` is passed).",
|
||||
)
|
||||
fp8_args.add_argument(
|
||||
"--fp8_amax_compute_algo",
|
||||
type=str,
|
||||
default="most_recent",
|
||||
choices=["max", "most_recent"],
|
||||
help="The algorithm to use for the scaling factor computation. (useful only when `--fp8_backend=te` is passed).",
|
||||
)
|
||||
fp8_args.add_argument(
|
||||
"--fp8_override_linear_precision",
|
||||
type=lambda x: tuple(map(str_to_bool, x.split(","))),
|
||||
default=(False, False, False),
|
||||
help="Whether or not to execute `fprop`, `dgrad`, and `wgrad` GEMMS in higher precision. Should be passed in a comma-seperated string of booleans (useful only when `--fp8_backend=te` is passed).",
|
||||
)
|
||||
fp8_args.add_argument(
|
||||
"--fp8_opt_level",
|
||||
type=str,
|
||||
default="O2",
|
||||
choices=["O1", "O2"],
|
||||
help="What level of 8-bit collective communication should be used with MS-AMP (useful only when `--fp8_backend=msamp` is passed).",
|
||||
)
|
||||
|
||||
# AWS arguments
|
||||
aws_args = parser.add_argument_group("AWS Arguments", "Arguments related to AWS.")
|
||||
aws_args.add_argument(
|
||||
@ -919,6 +1000,7 @@ def _validate_launch_command(args):
|
||||
DistributedType.MULTI_GPU,
|
||||
DistributedType.MULTI_NPU,
|
||||
DistributedType.MULTI_MLU,
|
||||
DistributedType.MULTI_MUSA,
|
||||
DistributedType.MULTI_XPU,
|
||||
)
|
||||
else False
|
||||
@ -998,6 +1080,8 @@ def _validate_launch_command(args):
|
||||
args.num_processes = torch.xpu.device_count()
|
||||
elif is_mlu_available():
|
||||
args.num_processes = torch.mlu.device_count()
|
||||
elif is_musa_available():
|
||||
args.num_processes = torch.musa.device_count()
|
||||
elif is_npu_available():
|
||||
args.num_processes = torch.npu.device_count()
|
||||
else:
|
||||
@ -1005,11 +1089,16 @@ def _validate_launch_command(args):
|
||||
warned.append(f"\t`--num_processes` was set to a value of `{args.num_processes}`")
|
||||
if args.debug is None:
|
||||
args.debug = False
|
||||
if not args.multi_gpu and (
|
||||
(args.use_xpu and is_xpu_available() and torch.xpu.device_count() > 1)
|
||||
or (is_mlu_available() and torch.mlu.device_count() > 1)
|
||||
or (is_npu_available() and torch.npu.device_count() > 1)
|
||||
or (torch.cuda.device_count() > 1)
|
||||
if (
|
||||
not args.multi_gpu
|
||||
and args.num_processes > 1
|
||||
and (
|
||||
(args.use_xpu and is_xpu_available() and torch.xpu.device_count() > 1)
|
||||
or (is_mlu_available() and torch.mlu.device_count() > 1)
|
||||
or (is_musa_available() and torch.musa.device_count() > 1)
|
||||
or (is_npu_available() and torch.npu.device_count() > 1)
|
||||
or (torch.cuda.device_count() > 1)
|
||||
)
|
||||
):
|
||||
warned.append(
|
||||
"\t\tMore than one GPU was found, enabling multi-GPU training.\n"
|
||||
@ -1034,10 +1123,11 @@ def _validate_launch_command(args):
|
||||
defaults is not None and defaults.compute_environment != ComputeEnvironment.AMAZON_SAGEMAKER
|
||||
)
|
||||
if is_aws_env_disabled and args.num_cpu_threads_per_process is None:
|
||||
args.num_cpu_threads_per_process = 1
|
||||
if args.use_cpu and args.num_processes >= 1:
|
||||
args.num_cpu_threads_per_process = get_int_from_env(["OMP_NUM_THREADS"], 1)
|
||||
if args.use_cpu and args.num_processes >= 1 and get_int_from_env(["OMP_NUM_THREADS"], 0) == 0:
|
||||
local_size = get_int_from_env(
|
||||
["MPI_LOCALNRANKS", "OMPI_COMM_WORLD_LOCAL_SIZE", "MV2_COMM_WORLD_LOCAL_SIZE"], 1
|
||||
["MPI_LOCALNRANKS", "OMPI_COMM_WORLD_LOCAL_SIZE", "MV2_COMM_WORLD_LOCAL_SIZE"],
|
||||
max(int(args.num_processes / args.num_machines), 1),
|
||||
)
|
||||
threads_per_process = int(psutil.cpu_count(logical=False) / local_size)
|
||||
if threads_per_process > 1:
|
||||
|
||||
src/accelerate/commands/merge.py (new file, 69 lines)
@@ -0,0 +1,69 @@
#!/usr/bin/env python

# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from accelerate.commands.utils import CustomArgumentParser
from accelerate.utils import merge_fsdp_weights


description = """Utility to merge the weights from multiple FSDP checkpoints into a single combined checkpoint. Should be used if
`SHARDED_STATE_DICT` was used for the model. Weights will be saved to `{output_path}`.

This is a CPU-bound process and requires enough RAM to load the entire model state dict."""


def merge_command(args):
    merge_fsdp_weights(
        args.checkpoint_directory, args.output_path, not args.unsafe_serialization, args.remove_checkpoint_dir
    )


def merge_command_parser(subparsers=None):
    if subparsers is not None:
        parser = subparsers.add_parser("merge-weights", description=description)
    else:
        parser = CustomArgumentParser(description=description)

    parser.add_argument("checkpoint_directory", type=str, help="A directory containing sharded weights saved by FSDP.")
    parser.add_argument(
        "output_path",
        type=str,
        help="The path to save the merged weights. Defaults to the current directory. ",
    )
    parser.add_argument(
        "--unsafe_serialization",
        action="store_false",
        default=False,
        help="Whether to save the merged weights as `.bin` rather than `.safetensors` (not recommended).",
    )
    parser.add_argument(
        "--remove_checkpoint_dir",
        action="store_true",
        help="Whether to remove the checkpoint directory after merging.",
        default=False,
    )

    if subparsers is not None:
        parser.set_defaults(func=merge_command)
    return parser


def main():
    parser = merge_command_parser()
    args = parser.parse_args()
    merge_command(args)


if __name__ == "__main__":
    main()
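The same merge can be driven from Python through `accelerate.utils.merge_fsdp_weights`, which the command above wraps; a small sketch with placeholder paths, passing the arguments positionally exactly as `merge_command` does:

```python
# Sketch of calling the merge utility directly instead of via `accelerate merge-weights`.
# Both directory names are placeholders for a real SHARDED_STATE_DICT checkpoint.
from accelerate.utils import merge_fsdp_weights

merge_fsdp_weights(
    "outputs/checkpoint_0/pytorch_model_fsdp_0",  # directory of sharded FSDP weights
    "outputs/merged_model",                       # where the combined checkpoint is written
    True,                                         # safe serialization -> .safetensors
    False,                                        # keep the original checkpoint directory
)
```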
@ -20,7 +20,7 @@ import torch
|
||||
from torch.utils.data import BatchSampler, DataLoader, IterableDataset, RandomSampler
|
||||
|
||||
from .logging import get_logger
|
||||
from .state import AcceleratorState, DistributedType, GradientState, is_torch_xla_available
|
||||
from .state import AcceleratorState, DistributedType, GradientState, PartialState, is_torch_xla_available
|
||||
from .utils import (
|
||||
RNGType,
|
||||
broadcast,
|
||||
@ -539,6 +539,7 @@ if is_torch_xla_available():
|
||||
super().__init__(dataloader, device)
|
||||
self._rng_types = self._loader.rng_types
|
||||
self._loader.rng_types = None
|
||||
self.device = device
|
||||
|
||||
def __iter__(self):
|
||||
if self._rng_types is not None:
|
||||
@ -558,6 +559,10 @@ if is_torch_xla_available():
|
||||
def batch_sampler(self):
|
||||
return self._loader.batch_sampler
|
||||
|
||||
@property
|
||||
def dataloader(self):
|
||||
return self._loader
|
||||
|
||||
|
||||
class DataLoaderDispatcher(DataLoader, DataLoaderStateMixin):
|
||||
"""
|
||||
@ -1083,6 +1088,11 @@ def skip_first_batches(dataloader, num_batches=0):
|
||||
"""
|
||||
Creates a `torch.utils.data.DataLoader` that will efficiently skip the first `num_batches`.
|
||||
"""
|
||||
state = PartialState()
|
||||
if state.distributed_type == DistributedType.XLA:
|
||||
device = dataloader.device
|
||||
dataloader = dataloader.dataloader
|
||||
|
||||
dataset = dataloader.dataset
|
||||
sampler_is_batch_sampler = False
|
||||
if isinstance(dataset, IterableDataset):
|
||||
@ -1146,4 +1156,7 @@ def skip_first_batches(dataloader, num_batches=0):
|
||||
else:
|
||||
dataloader = DataLoader(dataset, batch_sampler=new_batch_sampler, **kwargs)
|
||||
|
||||
if state.distributed_type == DistributedType.XLA:
|
||||
dataloader = MpDeviceLoaderWrapper(dataloader, device)
|
||||
|
||||
return dataloader
|
||||
|
||||
@ -26,10 +26,14 @@ from .utils import (
|
||||
send_to_device,
|
||||
set_module_tensor_to_device,
|
||||
)
|
||||
from .utils.memory import clear_device_cache
|
||||
from .utils.modeling import get_non_persistent_buffers
|
||||
from .utils.other import recursive_getattr
|
||||
|
||||
|
||||
_accelerate_added_attributes = ["to", "cuda", "npu", "xpu", "mlu", "musa"]
|
||||
|
||||
|
||||
class ModelHook:
|
||||
"""
|
||||
A hook that contains callbacks to be executed just before and after the forward method of a model. The difference
|
||||
@ -202,6 +206,10 @@ def remove_hook_from_module(module: nn.Module, recurse=False):
|
||||
module.forward = module._old_forward
|
||||
delattr(module, "_old_forward")
|
||||
|
||||
# Remove accelerate added warning hooks from dispatch_model
|
||||
for attr in _accelerate_added_attributes:
|
||||
module.__dict__.pop(attr, None)
|
||||
|
||||
if recurse:
|
||||
for child in module.children():
|
||||
remove_hook_from_module(child, recurse)
|
||||
@ -688,6 +696,7 @@ class CpuOffload(ModelHook):
|
||||
def pre_forward(self, module, *args, **kwargs):
|
||||
if self.prev_module_hook is not None:
|
||||
self.prev_module_hook.offload()
|
||||
clear_device_cache()
|
||||
module.to(self.execution_device)
|
||||
return send_to_device(args, self.execution_device), send_to_device(kwargs, self.execution_device)
|
||||
|
||||
|
||||
@ -28,11 +28,6 @@ from .utils import (
|
||||
)
|
||||
|
||||
|
||||
if is_pippy_available():
|
||||
from pippy.IR import Pipe, PipeSplitWrapper, annotate_split_points
|
||||
from pippy.PipelineStage import PipelineStage
|
||||
|
||||
|
||||
def generate_device_map(model, num_processes: int = 1, no_split_module_classes=None, max_memory: dict = None):
|
||||
"""
|
||||
Calculates the device map for `model` with an offset for PiPPy
|
||||
@ -83,6 +78,10 @@ def build_pipeline(model, split_points, args, kwargs, num_chunks):
|
||||
Users can pass in custom `num_chunks` as an optional hyper-parameter. By default will use
|
||||
`AcceleratorState.num_processes`
|
||||
"""
|
||||
# Note: We import here to reduce import time from general modules, and isolate outside dependencies
|
||||
from pippy.IR import Pipe, PipeSplitWrapper, annotate_split_points
|
||||
from pippy.PipelineStage import PipelineStage
|
||||
|
||||
# We need to annotate the split points in the model for PiPPy
|
||||
state = PartialState()
|
||||
annotate_split_points(model, {split_point: PipeSplitWrapper.SplitPoint.BEGINNING for split_point in split_points})
|
||||
|
||||
@ -26,8 +26,10 @@ from .utils import (
|
||||
check_cuda_p2p_ib_support,
|
||||
get_gpu_info,
|
||||
is_mps_available,
|
||||
is_torch_version,
|
||||
patch_environment,
|
||||
)
|
||||
from .utils.constants import ELASTIC_LOG_LINE_PREFIX_TEMPLATE_PYTORCH_VERSION
|
||||
|
||||
|
||||
def test_launch():
|
||||
@ -44,6 +46,13 @@ def notebook_launcher(
|
||||
master_addr="127.0.0.1",
|
||||
node_rank=0,
|
||||
num_nodes=1,
|
||||
rdzv_backend="static",
|
||||
rdzv_endpoint="",
|
||||
rdzv_conf=None,
|
||||
rdzv_id="none",
|
||||
max_restarts=0,
|
||||
monitor_interval=0.1,
|
||||
log_line_prefix_template=None,
|
||||
):
|
||||
"""
|
||||
Launches a training function, using several processes or multiple nodes if it's possible in the current environment
|
||||
@ -78,6 +87,20 @@ def notebook_launcher(
|
||||
The rank of the current node.
|
||||
num_nodes (`int`, *optional*, defaults to 1):
|
||||
The number of nodes to use for training.
|
||||
rdzv_backend (`str`, *optional*, defaults to `"static"`):
|
||||
The rendezvous method to use, such as 'static' (the default) or 'c10d'
|
||||
rdzv_endpoint (`str`, *optional*, defaults to `""`):
|
||||
The endpoint of the rdzv sync. storage.
|
||||
rdzv_conf (`Dict`, *optional*, defaults to `None`):
|
||||
Additional rendezvous configuration.
|
||||
rdzv_id (`str`, *optional*, defaults to `"none"`):
|
||||
The unique run id of the job.
|
||||
max_restarts (`int`, *optional*, defaults to 0):
|
||||
The maximum amount of restarts that elastic agent will conduct on workers before failure.
|
||||
monitor_interval (`float`, *optional*, defaults to 0.1):
|
||||
The interval in seconds that is used by the elastic_agent as a period of monitoring workers.
|
||||
log_line_prefix_template (`str`, *optional*, defaults to `None`):
|
||||
The prefix template for elastic launch logging. Available from PyTorch 2.2.0.
|
||||
|
||||
Example:
|
||||
|
||||
@ -141,6 +164,7 @@ def notebook_launcher(
|
||||
raise ValueError("The node_rank must be less than the number of nodes.")
|
||||
if num_processes > 1:
|
||||
# Multi-GPU launch
|
||||
from torch.distributed.launcher.api import LaunchConfig, elastic_launch
|
||||
from torch.multiprocessing import start_processes
|
||||
from torch.multiprocessing.spawn import ProcessRaisedException
|
||||
|
||||
@ -198,7 +222,27 @@ def notebook_launcher(
|
||||
launcher = PrepareForLaunch(function, distributed_type="MULTI_GPU")
|
||||
print(f"Launching training on {num_processes} GPUs.")
|
||||
try:
|
||||
start_processes(launcher, args=args, nprocs=num_processes, start_method="fork")
|
||||
if rdzv_conf is None:
|
||||
rdzv_conf = {}
|
||||
if rdzv_backend == "static":
|
||||
rdzv_conf["rank"] = node_rank
|
||||
if not rdzv_endpoint:
|
||||
rdzv_endpoint = f"{master_addr}:{use_port}"
|
||||
launch_config_kwargs = dict(
|
||||
min_nodes=num_nodes,
|
||||
max_nodes=num_nodes,
|
||||
nproc_per_node=num_processes,
|
||||
run_id=rdzv_id,
|
||||
rdzv_endpoint=rdzv_endpoint,
|
||||
rdzv_backend=rdzv_backend,
|
||||
rdzv_configs=rdzv_conf,
|
||||
max_restarts=max_restarts,
|
||||
monitor_interval=monitor_interval,
|
||||
start_method="fork",
|
||||
)
|
||||
if is_torch_version(">=", ELASTIC_LOG_LINE_PREFIX_TEMPLATE_PYTORCH_VERSION):
|
||||
launch_config_kwargs["log_line_prefix_template"] = log_line_prefix_template
|
||||
elastic_launch(config=LaunchConfig(**launch_config_kwargs), entrypoint=function)(*args)
|
||||
except ProcessRaisedException as e:
|
||||
if "Cannot re-initialize CUDA in forked subprocess" in e.args[0]:
|
||||
raise RuntimeError(
|
||||
|
||||
@ -70,6 +70,7 @@ class LocalSGD:
|
||||
DistributedType.MULTI_CPU,
|
||||
DistributedType.MULTI_GPU,
|
||||
DistributedType.MULTI_MLU,
|
||||
DistributedType.MULTI_MUSA,
|
||||
DistributedType.MULTI_NPU,
|
||||
]:
|
||||
raise NotImplementedError("LocalSGD is supported only for CPUs and GPUs (no DeepSpeed or MegatronLM)")
|
||||
|
||||
@ -54,6 +54,8 @@ class MultiProcessAdapter(logging.LoggerAdapter):
|
||||
)
|
||||
main_process_only = kwargs.pop("main_process_only", True)
|
||||
in_order = kwargs.pop("in_order", False)
|
||||
# set `stacklevel` to exclude ourself in `Logger.findCaller()` while respecting user's choice
|
||||
kwargs.setdefault("stacklevel", 2)
|
||||
|
||||
if self.isEnabledFor(level):
|
||||
if self._should_log(main_process_only):
|
||||
|
||||
@ -15,7 +15,6 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import math
|
||||
import os
|
||||
import threading
|
||||
import warnings
|
||||
@ -41,6 +40,7 @@ from .utils import (
|
||||
is_ipex_available,
|
||||
is_mlu_available,
|
||||
is_mps_available,
|
||||
is_musa_available,
|
||||
is_npu_available,
|
||||
is_torch_xla_available,
|
||||
is_xpu_available,
|
||||
@ -57,6 +57,9 @@ if is_torch_xla_available():
|
||||
if is_mlu_available(check_device=False):
|
||||
import torch_mlu # noqa: F401
|
||||
|
||||
if is_musa_available(check_device=False):
|
||||
import torch_musa # noqa: F401
|
||||
|
||||
if is_npu_available(check_device=False):
|
||||
import torch_npu # noqa: F401
|
||||
|
||||
@ -182,7 +185,7 @@ class PartialState:
|
||||
original_backend = kwargs.pop("backend", None)
|
||||
backend, distributed_type = self._prepare_backend(cpu, use_sagemaker_dp, original_backend)
|
||||
if original_backend is not None and backend != original_backend:
|
||||
raise ValueError("Your assigned backend {original_backend} is not avaliable, please use {backend}")
|
||||
raise ValueError(f"Your assigned backend {original_backend} is not avaliable, please use {backend}")
|
||||
self.backend = backend
|
||||
self.distributed_type = distributed_type
|
||||
use_deepspeed = False
|
||||
@ -196,11 +199,6 @@ class PartialState:
|
||||
)
|
||||
from deepspeed import comm as dist
|
||||
|
||||
if is_xpu_available() and is_ccl_available():
|
||||
os.environ["CCL_PROCESS_LAUNCHER"] = "none"
|
||||
os.environ["CCL_LOCAL_SIZE"] = os.environ.get("LOCAL_WORLD_SIZE", "1")
|
||||
os.environ["CCL_LOCAL_RANK"] = os.environ.get("LOCAL_RANK", "0")
|
||||
|
||||
if not dist.is_initialized():
|
||||
dist.init_distributed(dist_backend=self.backend, auto_mpi_discovery=False, **kwargs)
|
||||
# We need to flag to `use_deepspeed` to be True to override `distributed_type` later
|
||||
@ -218,10 +216,6 @@ class PartialState:
|
||||
os.environ["WORLD_SIZE"] = str(dist_information.world_size)
|
||||
os.environ["LOCAL_RANK"] = str(dist_information.local_rank)
|
||||
os.environ["LOCAL_WORLD_SIZE"] = str(dist_information.local_world_size)
|
||||
if self.backend == "ccl" and self.distributed_type == DistributedType.MULTI_XPU:
|
||||
os.environ["CCL_PROCESS_LAUNCHER"] = "none"
|
||||
os.environ["CCL_LOCAL_SIZE"] = os.environ["LOCAL_WORLD_SIZE"]
|
||||
os.environ["CCL_LOCAL_RANK"] = os.environ["LOCAL_RANK"]
|
||||
if not os.environ.get("MASTER_PORT", None):
|
||||
os.environ["MASTER_PORT"] = "29500"
|
||||
if (
|
||||
@ -238,7 +232,7 @@ class PartialState:
|
||||
|
||||
if (
|
||||
self.distributed_type == DistributedType.MULTI_CPU
|
||||
and get_int_from_env(["OMP_NUM_THREADS", "OMP_NUM_THREADS"], 0) > 0
|
||||
and get_int_from_env(["OMP_NUM_THREADS"], 0) == 0
|
||||
):
|
||||
import psutil
|
||||
|
||||
@ -370,6 +364,7 @@ class PartialState:
|
||||
if self.distributed_type in (
|
||||
DistributedType.MULTI_GPU,
|
||||
DistributedType.MULTI_MLU,
|
||||
DistributedType.MULTI_MUSA,
|
||||
DistributedType.MULTI_NPU,
|
||||
DistributedType.MULTI_XPU,
|
||||
DistributedType.MULTI_CPU,
|
||||
@ -437,11 +432,9 @@ class PartialState:
|
||||
length = len(inputs[list(inputs.keys())[0]])
|
||||
if not all(len(v) == length for v in inputs.values()):
|
||||
raise ValueError("All values in the dictionary must have the same length")
|
||||
num_samples_per_process = math.ceil(length / self.num_processes)
|
||||
start_index = self.process_index * num_samples_per_process
|
||||
end_index = start_index + num_samples_per_process
|
||||
if (len(inputs) % self.num_processes != 0) and (self.process_index == self.num_processes - 1):
|
||||
end_index = length
|
||||
num_samples_per_process, num_extras = divmod(length, self.num_processes)
|
||||
start_index = self.process_index * num_samples_per_process + min(self.process_index, num_extras)
|
||||
end_index = start_index + num_samples_per_process + (1 if self.process_index < num_extras else 0)
|
||||
|
||||
def _split_values(inputs, start_index, end_index):
|
||||
if isinstance(inputs, (list, tuple, torch.Tensor)):
|
||||
@ -457,7 +450,7 @@ class PartialState:
|
||||
tensorized_result = send_to_device(result, self.device)
|
||||
result = pad_across_processes(tensorized_result, pad_index=inputs[-1])
|
||||
else:
|
||||
result += [result[-1]] * (num_samples_per_process - len(result))
|
||||
result += [result[-1]] * (num_samples_per_process + 1 - len(result))
|
||||
return result
|
||||
elif isinstance(inputs, dict):
|
||||
for key in inputs.keys():
|
||||
@ -474,7 +467,7 @@ class PartialState:
|
||||
end_index = len(inputs)
|
||||
result_idcs = list(range(start_index, end_index))
|
||||
if apply_padding:
|
||||
result_idcs += [end_index - 1] * (num_samples_per_process - len(result_idcs))
|
||||
result_idcs += [end_index - 1] * (num_samples_per_process + 1 - len(result_idcs))
|
||||
return inputs.select(result_idcs)
|
||||
return inputs
|
||||
|
||||
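The replacement of the `math.ceil` split with `divmod` spreads any remainder over the first processes instead of leaving the last process with a short (or empty) slice. A standalone sketch of the index arithmetic, separate from the library code:

```python
# Standalone sketch of the even-split index math used above (not the library function itself).
def split_indices(length: int, num_processes: int, process_index: int) -> range:
    per_process, extras = divmod(length, num_processes)
    # The first `extras` processes each take one extra item.
    start = process_index * per_process + min(process_index, extras)
    end = start + per_process + (1 if process_index < extras else 0)
    return range(start, end)

# Splitting 10 items over 4 processes gives slices of sizes 3, 3, 2, 2.
assert [len(split_indices(10, 4, rank)) for rank in range(4)] == [3, 3, 2, 2]
```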
@ -691,6 +684,7 @@ class PartialState:
|
||||
- MPS if `torch.backends.mps.is_available()` and `torch.backends.mps.is_built()` both return True.
|
||||
- CUDA if `torch.cuda.is_available()`
|
||||
- MLU if `is_mlu_available()`
|
||||
- MUSA if `is_musa_available()`
|
||||
- NPU if `is_npu_available()`
|
||||
- CPU otherwise
|
||||
"""
|
||||
@ -699,6 +693,8 @@ class PartialState:
|
||||
return torch.device("mps")
|
||||
elif is_mlu_available():
|
||||
return torch.device("mlu")
|
||||
elif is_musa_available():
|
||||
return torch.device("musa")
|
||||
elif torch.cuda.is_available():
|
||||
return torch.device("cuda")
|
||||
elif is_xpu_available():
|
||||
@ -725,6 +721,9 @@ class PartialState:
|
||||
if is_mlu_available():
|
||||
backend = "cncl"
|
||||
distributed_type = DistributedType.MULTI_MLU
|
||||
elif is_musa_available():
|
||||
backend = "mccl"
|
||||
distributed_type = DistributedType.MULTI_MUSA
|
||||
elif torch.cuda.is_available():
|
||||
if backend is None:
|
||||
backend = "nccl"
|
||||
@ -772,7 +771,7 @@ class PartialState:
|
||||
self.device = torch.device("cpu") if self._cpu else self.default_device
|
||||
return
|
||||
device = str(self.distributed_type).split(".")[-1].replace("MULTI_", "").lower()
|
||||
if device not in ("cpu", "gpu", "mlu", "npu", "xpu", "xla"):
|
||||
if device not in ("cpu", "gpu", "mlu", "musa", "npu", "xpu", "xla"):
|
||||
raise ValueError(
|
||||
f"Can't set device for {self.distributed_type} ({device}), verify we should be calling `_set_device()` for it!"
|
||||
)
|
||||
@ -781,16 +780,10 @@ class PartialState:
|
||||
else:
|
||||
if device == "gpu":
|
||||
device = "cuda"
|
||||
self.device = torch.device(device, self.local_process_index)
|
||||
if self.device is not None:
|
||||
if device == "xpu":
|
||||
torch.xpu.set_device(self.device)
|
||||
elif device == "mlu":
|
||||
torch.mlu.set_device(self.device)
|
||||
elif device == "npu":
|
||||
torch.npu.set_device(self.device)
|
||||
elif device == "cuda":
|
||||
torch.cuda.set_device(self.device)
|
||||
device_module = getattr(torch, device)
|
||||
device_index = self.local_process_index % device_module.device_count()
|
||||
self.device = torch.device(device, device_index)
|
||||
device_module.set_device(self.device)
|
||||
|
||||
def __getattr__(self, name: str):
|
||||
# By this point we know that no attributes of `self` contain `name`,
|
||||
@ -897,10 +890,11 @@ class AcceleratorState:
|
||||
elif self.distributed_type in [
|
||||
DistributedType.MULTI_GPU,
|
||||
DistributedType.MULTI_MLU,
|
||||
DistributedType.MULTI_MUSA,
|
||||
DistributedType.MULTI_NPU,
|
||||
DistributedType.MULTI_XPU,
|
||||
]:
|
||||
if os.environ.get("ACCELERATE_USE_FSDP", "false") == "true":
|
||||
if os.environ.get("ACCELERATE_USE_FSDP", "false") == "true" or fsdp_plugin is not None:
|
||||
self.distributed_type = DistributedType.FSDP
|
||||
if self._mixed_precision != "no":
|
||||
fsdp_plugin.set_mixed_precision(self._mixed_precision)
|
||||
@ -923,6 +917,12 @@ class AcceleratorState:
|
||||
and self.device.type == "cuda"
|
||||
):
|
||||
torch.backends.cuda.matmul.allow_tf32 = True
|
||||
if (
|
||||
self.dynamo_plugin.backend != DynamoBackend.NO
|
||||
and self._mixed_precision == "no"
|
||||
and self.device.type == "musa"
|
||||
):
|
||||
torch.backends.musa.matmul.allow_tf32 = True
|
||||
PartialState._shared_state["distributed_type"] = self.distributed_type
|
||||
|
||||
@property
|
||||
|
||||
@ -29,6 +29,7 @@ from .testing import (
|
||||
require_multi_device,
|
||||
require_multi_gpu,
|
||||
require_multi_xpu,
|
||||
require_musa,
|
||||
require_non_cpu,
|
||||
require_non_torch_xla,
|
||||
require_non_xpu,
|
||||
@ -40,6 +41,7 @@ from .testing import (
|
||||
require_torch_min_version,
|
||||
require_torchvision,
|
||||
require_tpu,
|
||||
require_transformer_engine,
|
||||
require_xpu,
|
||||
skip,
|
||||
slow,
|
||||
|
||||
@ -23,7 +23,7 @@ from torch.utils.data import DataLoader
|
||||
from transformers import AutoModelForSequenceClassification, AutoTokenizer, get_linear_schedule_with_warmup, set_seed
|
||||
|
||||
from accelerate import Accelerator, DistributedType
|
||||
from accelerate.utils import is_mlu_available, is_npu_available, is_xpu_available
|
||||
from accelerate.utils import is_mlu_available, is_musa_available, is_npu_available, is_xpu_available
|
||||
from accelerate.utils.deepspeed import DummyOptim, DummyScheduler
|
||||
|
||||
|
||||
@ -48,6 +48,10 @@ class TorchTracemalloc:
|
||||
torch.mlu.empty_cache()
|
||||
torch.mlu.reset_max_memory_allocated() # reset the peak gauge to zero
|
||||
self.begin = torch.mlu.memory_allocated()
|
||||
elif is_musa_available():
|
||||
torch.musa.empty_cache()
|
||||
torch.musa.reset_max_memory_allocated() # reset the peak gauge to zero
|
||||
self.begin = torch.musa.memory_allocated()
|
||||
elif is_npu_available():
|
||||
torch.npu.empty_cache()
|
||||
torch.npu.reset_max_memory_allocated() # reset the peak gauge to zero
|
||||
@ -68,6 +72,10 @@ class TorchTracemalloc:
|
||||
torch.mlu.empty_cache()
|
||||
torch.mlu.memory_allocated() # reset the peak gauge to zero
|
||||
self.begin = torch.mlu.max_memory_allocated()
|
||||
elif is_musa_available():
|
||||
torch.musa.empty_cache()
|
||||
torch.musa.memory_allocated() # reset the peak gauge to zero
|
||||
self.begin = torch.musa.max_memory_allocated()
|
||||
elif is_npu_available():
|
||||
torch.npu.empty_cache()
|
||||
self.end = torch.npu.memory_allocated()
|
||||
|
||||
@ -14,7 +14,7 @@
|
||||
|
||||
import torch.distributed
|
||||
|
||||
from accelerate.test_utils import require_huggingface_suite
|
||||
from accelerate.test_utils import require_huggingface_suite, torch_device
|
||||
from accelerate.utils import is_transformers_available
|
||||
|
||||
|
||||
@ -27,7 +27,8 @@ GPT2_TINY = "sshleifer/tiny-gpt2"
|
||||
|
||||
@require_huggingface_suite
|
||||
def init_torch_dist_then_launch_deepspeed():
|
||||
torch.distributed.init_process_group(backend="nccl")
|
||||
backend = "ccl" if torch_device == "xpu" else "nccl"
|
||||
torch.distributed.init_process_group(backend=backend)
|
||||
deepspeed_config = {
|
||||
"zero_optimization": {
|
||||
"stage": 3,
|
||||
|
||||
src/accelerate/test_utils/scripts/test_ddp_comm_hook.py (new file, 77 lines)
@@ -0,0 +1,77 @@
# Copyright 2022 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch

from accelerate import Accelerator, DDPCommunicationHookType, DistributedDataParallelKwargs


class MockModel(torch.nn.Module):
    def __init__(self):
        super().__init__()
        torch.manual_seed(0)
        self.p = torch.nn.Parameter(torch.randn(40, 20))

    def forward(self, x, rank):
        return self.p * (x ** (1 + rank))


def _run_and_get_grads(model, rank):
    torch.manual_seed(2024)
    input = torch.randn(40, 20)
    output = model(input, rank)
    output.mean().backward()
    param = next(model.parameters())
    return param.grad


def test_ddp_comm_hook(comm_hook, comm_wrapper, comm_state_option):
    ddp_kwargs = DistributedDataParallelKwargs(
        comm_hook=comm_hook,
        comm_wrapper=comm_wrapper,
        comm_state_option=comm_state_option,
    )
    accelerator = Accelerator(kwargs_handlers=[ddp_kwargs])

    model = accelerator.prepare(MockModel())
    hook_grads = _run_and_get_grads(model, accelerator.local_process_index)

    reference_model = torch.nn.parallel.DistributedDataParallel(
        MockModel().to(accelerator.device),
        device_ids=[accelerator.local_process_index],
        output_device=accelerator.local_process_index,
    )
    reference_grads = _run_and_get_grads(reference_model, accelerator.local_process_index)

    torch.testing.assert_close(hook_grads, reference_grads, rtol=1e-2, atol=1e-2)


def main():
    for comm_hook, comm_wrapper, comm_state_option in [
        (DDPCommunicationHookType.NO, DDPCommunicationHookType.NO, {}),
        (DDPCommunicationHookType.FP16, DDPCommunicationHookType.NO, {}),
        (DDPCommunicationHookType.BF16, DDPCommunicationHookType.NO, {}),
        (DDPCommunicationHookType.POWER_SGD, DDPCommunicationHookType.NO, {}),
        (DDPCommunicationHookType.POWER_SGD, DDPCommunicationHookType.FP16, {}),
        (DDPCommunicationHookType.POWER_SGD, DDPCommunicationHookType.BF16, {}),
        (DDPCommunicationHookType.POWER_SGD, DDPCommunicationHookType.NO, {"matrix_approximation_rank": 2}),
        (DDPCommunicationHookType.BATCHED_POWER_SGD, DDPCommunicationHookType.NO, {}),
        (DDPCommunicationHookType.BATCHED_POWER_SGD, DDPCommunicationHookType.FP16, {}),
        (DDPCommunicationHookType.BATCHED_POWER_SGD, DDPCommunicationHookType.BF16, {}),
    ]:
        print(f"Test DDP comm hook: {comm_hook}, comm wrapper: {comm_wrapper}")
        test_ddp_comm_hook(comm_hook, comm_wrapper, comm_state_option)


if __name__ == "__main__":
    main()
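For user code, the same communication hooks are enabled by passing a `DistributedDataParallelKwargs` handler to `Accelerator`, exactly as the test does; a short sketch (the PowerSGD rank is only an illustrative value):

```python
# Minimal sketch: enabling a DDP communication hook through Accelerate's kwargs handler.
# The PowerSGD matrix_approximation_rank here is only an illustrative value.
import torch
from accelerate import Accelerator, DDPCommunicationHookType, DistributedDataParallelKwargs

ddp_kwargs = DistributedDataParallelKwargs(
    comm_hook=DDPCommunicationHookType.POWER_SGD,
    comm_wrapper=DDPCommunicationHookType.FP16,
    comm_state_option={"matrix_approximation_rank": 2},
)
accelerator = Accelerator(kwargs_handlers=[ddp_kwargs])
model = accelerator.prepare(torch.nn.Linear(20, 20))  # the hook is attached when DDP wraps the model
```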
160
src/accelerate/test_utils/scripts/test_merge_weights.py
Normal file
160
src/accelerate/test_utils/scripts/test_merge_weights.py
Normal file
@ -0,0 +1,160 @@
|
||||
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import gc
import logging
import shutil
from pathlib import Path

import torch
from safetensors.torch import load_file
from torch.distributed.fsdp.fully_sharded_data_parallel import ShardingStrategy, StateDictType
from torch.utils.data import DataLoader

from accelerate import Accelerator, FullyShardedDataParallelPlugin
from accelerate.commands.merge import merge_command, merge_command_parser
from accelerate.state import AcceleratorState
from accelerate.test_utils.training import RegressionDataset
from accelerate.utils import merge_fsdp_weights, patch_environment, save_fsdp_model


logging.basicConfig(level=logging.INFO)

parser = merge_command_parser()


class TinyModel(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.linear1 = torch.nn.Linear(16, 16)
        self.activation = torch.nn.ReLU()
        self.linear2 = torch.nn.Linear(16, 16)
        self.softmax = torch.nn.Softmax()

    def forward(self, x):
        return self.linear2(self.activation(self.linear1(x)))


def setup():
    if AcceleratorState._shared_state != {}:
        AcceleratorState()._reset_state()
    plugin = FullyShardedDataParallelPlugin(
        sharding_strategy=ShardingStrategy.FULL_SHARD, state_dict_type=StateDictType.SHARDED_STATE_DICT
    )
    model = TinyModel()
    with patch_environment(fsdp_auto_wrap_policy="SIZE_BASED_WRAP"):
        plugin.set_auto_wrap_policy(model)
    accelerator = Accelerator(fsdp_plugin=plugin)
    model = accelerator.prepare(model)
    return model, plugin, accelerator


def mock_training(accelerator, model):
    train_set = RegressionDataset(length=128, seed=42)
    train_dl = DataLoader(train_set, batch_size=16, shuffle=False)
    optimizer = torch.optim.SGD(model.parameters(), lr=0.1)

    train_dl, model, optimizer = accelerator.prepare(train_dl, model, optimizer)
    for _ in range(3):
        for batch in train_dl:
            model.zero_grad()
            output = model(batch["x"])
            loss = torch.nn.functional.mse_loss(output, batch["y"])
            accelerator.backward(loss)
            optimizer.step()
    return model


def check_weights(operation, state_1, state_2):
    for weight_1, weight_2 in zip(state_1.values(), state_2.values()):
        if str(weight_1.device) != "cuda":
            weight_1 = weight_1.to("cuda")
        if str(weight_2.device) != "cuda":
            weight_2 = weight_2.to("cuda")
        if operation == "same":
            assert torch.allclose(weight_1, weight_2)
        else:
            assert not torch.allclose(weight_1, weight_2)


def check_safetensors_weights(path, model):
    safe_state_dict = load_file(path / "model.safetensors")
    safe_loaded_model = TinyModel()
    check_weights("diff", model.state_dict(), safe_loaded_model.state_dict())
    safe_loaded_model.load_state_dict(safe_state_dict)
    check_weights("same", model.state_dict(), safe_loaded_model.state_dict())


def check_pytorch_weights(path, model):
    nonsafe_state_dict = torch.load(path / "pytorch_model.bin")
    nonsafe_loaded_model = TinyModel()
    check_weights("diff", model.state_dict(), nonsafe_loaded_model.state_dict())
    nonsafe_loaded_model.load_state_dict(nonsafe_state_dict)
    check_weights("same", model.state_dict(), nonsafe_loaded_model.state_dict())


def test_merge_weights_safetensors(model, path):
    # Should now be saved at `path/merged.safetensors`
    merge_fsdp_weights(path / "pytorch_model_fsdp_0", path, safe_serialization=True)
    check_safetensors_weights(path, model)


def test_merge_weights_command_safetensors(model, path):
    args = parser.parse_args([str(path / "pytorch_model_fsdp_0"), str(path)])
    merge_command(args)
    check_safetensors_weights(path, model)


def test_merge_weights_pytorch(model, path):
    # Should now be saved at `path/merged.bin`
    merge_fsdp_weights(path / "pytorch_model_fsdp_0", path, safe_serialization=False)
    check_pytorch_weights(path, model)


def test_merge_weights_command_pytorch(model, path):
    args = parser.parse_args([str(path / "pytorch_model_fsdp_0"), str(path), "--unsafe_serialization"])
    merge_command(args)
    check_pytorch_weights(path, model)


if __name__ == "__main__":
    # Note this test requires at least two accelerators!
    model, plugin, accelerator = setup()
    if accelerator.num_processes > 1:
        try:
            # Initial setup for things
            out_path = Path("test_merge_weights_fsdp_weights")
            if not out_path.exists():
                out_path.mkdir(parents=True, exist_ok=True)

            # Train briefly once weights aren't the baseline
            model = mock_training(accelerator, model)
            accelerator.wait_for_everyone()

            gc.collect()  # Needed for some lingering refs after training
            save_fsdp_model(plugin, accelerator, model, out_path)
            accelerator.wait_for_everyone()

            # Finally we can test
            test_merge_weights_safetensors(model, out_path)
            test_merge_weights_command_safetensors(model, out_path)
            test_merge_weights_pytorch(model, out_path)
            test_merge_weights_command_pytorch(model, out_path)
        except Exception:
            raise
        finally:
            # Cleanup in case of any failures
            if accelerator.is_main_process:
                shutil.rmtree(out_path)
            accelerator.wait_for_everyone()
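Outside of this test, the same merging step can be run directly; a minimal sketch (the checkpoint paths below are hypothetical and assume the model was saved with `SHARDED_STATE_DICT`, which produces a `pytorch_model_fsdp_0/` shard directory):

from accelerate.utils import merge_fsdp_weights

# Consolidate the shards into ckpt/merged/model.safetensors; pass
# safe_serialization=False to write pytorch_model.bin instead.
merge_fsdp_weights("ckpt/pytorch_model_fsdp_0", "ckpt/merged", safe_serialization=True)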
@@ -16,8 +16,11 @@ Test file to ensure that in general certain situational setups for notebooks work
"""

import os
import time
from multiprocessing import Queue

from pytest import raises
from pytest import mark, raises
from torch.distributed.elastic.multiprocessing.errors import ChildFailedError

from accelerate import PartialState, notebook_launcher
from accelerate.test_utils import require_bnb
@@ -29,6 +32,25 @@ def basic_function():
    print(f"PartialState:\n{PartialState()}")


def tough_nut_function(queue: Queue):
    if queue.empty():
        return
    trial = queue.get()
    if trial > 0:
        queue.put(trial - 1)
        raise RuntimeError("The nut hasn't cracked yet! Try again.")

    print(f"PartialState:\n{PartialState()}")


def bipolar_sleep_function(sleep_sec: int):
    state = PartialState()
    if state.process_index % 2 == 0:
        raise RuntimeError("I'm an even process. I don't like to sleep.")
    else:
        time.sleep(sleep_sec)


NUM_PROCESSES = int(os.environ.get("ACCELERATE_NUM_PROCESSES", 1))


@@ -36,6 +58,36 @@ def test_can_initialize():
    notebook_launcher(basic_function, (), num_processes=NUM_PROCESSES)


@mark.skipif(NUM_PROCESSES < 2, reason="Need at least 2 processes to test static rendezvous backends")
def test_static_rdzv_backend():
    notebook_launcher(basic_function, (), num_processes=NUM_PROCESSES, rdzv_backend="static")


@mark.skipif(NUM_PROCESSES < 2, reason="Need at least 2 processes to test c10d rendezvous backends")
def test_c10d_rdzv_backend():
    notebook_launcher(basic_function, (), num_processes=NUM_PROCESSES, rdzv_backend="c10d")


@mark.skipif(NUM_PROCESSES < 2, reason="Need at least 2 processes to test fault tolerance")
def test_fault_tolerant(max_restarts: int = 3):
    queue = Queue()
    queue.put(max_restarts)
    notebook_launcher(tough_nut_function, (queue,), num_processes=NUM_PROCESSES, max_restarts=max_restarts)


@mark.skipif(NUM_PROCESSES < 2, reason="Need at least 2 processes to test monitoring")
def test_monitoring(monitor_interval: float = 0.01, sleep_sec: int = 100):
    start_time = time.time()
    with raises(ChildFailedError, match="I'm an even process. I don't like to sleep."):
        notebook_launcher(
            bipolar_sleep_function,
            (sleep_sec,),
            num_processes=NUM_PROCESSES,
            monitor_interval=monitor_interval,
        )
    assert time.time() - start_time < sleep_sec, "Monitoring did not stop the process in time."


@require_bnb
def test_problematic_imports():
    with raises(RuntimeError, match="Please keep these imports"):
@@ -47,6 +99,14 @@ def test_problematic_imports():
def main():
    print("Test basic notebook can be ran")
    test_can_initialize()
    print("Test static rendezvous backend")
    test_static_rdzv_backend()
    print("Test c10d rendezvous backend")
    test_c10d_rdzv_backend()
    print("Test fault tolerant")
    test_fault_tolerant()
    print("Test monitoring")
    test_monitoring()
    if is_bnb_available():
        print("Test problematic imports (bnb)")
        test_problematic_imports()

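For reference, the rendezvous and fault-tolerance options exercised above map onto `notebook_launcher` arguments that can be passed directly from a notebook cell; a minimal sketch (`num_processes=2` is an assumption for a two-GPU machine):

from accelerate import PartialState, notebook_launcher

def train():
    print(f"Hello from process {PartialState().process_index}")

notebook_launcher(
    train,
    (),
    num_processes=2,
    rdzv_backend="c10d",   # or "static"
    max_restarts=3,        # retry failed workers up to 3 times
    monitor_interval=0.1,  # poll worker health every 0.1s
)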
@ -37,7 +37,9 @@ from accelerate.utils import (
|
||||
is_datasets_available,
|
||||
is_ipex_available,
|
||||
is_mlu_available,
|
||||
is_musa_available,
|
||||
is_npu_available,
|
||||
is_pytest_available,
|
||||
is_xpu_available,
|
||||
set_seed,
|
||||
synchronize_rng_states,
|
||||
@ -473,7 +475,7 @@ def training_check(use_seedable_sampler=False):
|
||||
|
||||
accelerator.print("Training yielded the same results on one CPU or distributes setup with batch split.")
|
||||
|
||||
if torch.cuda.is_available() or is_npu_available() or is_mlu_available():
|
||||
if torch.cuda.is_available() or is_npu_available() or is_mlu_available() or is_musa_available():
|
||||
# Mostly a test that FP16 doesn't crash as the operation inside the model is not converted to FP16
|
||||
print("FP16 training check.")
|
||||
AcceleratorState._reset_state()
|
||||
@ -692,6 +694,24 @@ def test_split_between_processes_tensor():
|
||||
state.wait_for_everyone()
|
||||
|
||||
|
||||
def test_split_between_processes_evenly():
|
||||
state = AcceleratorState()
|
||||
if state.num_processes in (1, 2, 4, 8):
|
||||
data = list(range(17))
|
||||
num_samples_per_process = len(data) // state.num_processes
|
||||
num_extras = len(data) % state.num_processes
|
||||
with state.split_between_processes(data) as results:
|
||||
if state.process_index < num_extras:
|
||||
assert (
|
||||
len(results) == num_samples_per_process + 1
|
||||
), f"Each Process should have even elements. Expected: {num_samples_per_process + 1}, Actual: {len(results)}"
|
||||
else:
|
||||
assert (
|
||||
len(results) == num_samples_per_process
|
||||
), f"Each Process should have even elements. Expected: {num_samples_per_process}, Actual: {len(results)}"
|
||||
state.wait_for_everyone()
|
||||
|
||||
|
||||
def test_trigger():
|
||||
accelerator = Accelerator()
|
||||
# should start with being false
|
||||
@ -756,6 +776,10 @@ def main():
|
||||
print("\n**Test split between processes as a tensor**")
|
||||
test_split_between_processes_tensor()
|
||||
|
||||
if state.process_index == 0:
|
||||
print("\n**Test split between processes evenly**")
|
||||
test_split_between_processes_evenly()
|
||||
|
||||
if state.process_index == 0:
|
||||
print("\n**Test split between processes as a datasets.Dataset**")
|
||||
if is_datasets_available():
|
||||
@ -793,9 +817,10 @@ def main():
|
||||
print("\n**Breakpoint trigger test**")
|
||||
test_trigger()
|
||||
|
||||
if state.local_process_index == 0:
|
||||
print("\n**Test reinstantiated state**")
|
||||
test_reinstantiated_state()
|
||||
if is_pytest_available():
|
||||
if state.local_process_index == 0:
|
||||
print("\n**Test reinstantiated state**")
|
||||
test_reinstantiated_state()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
@ -267,7 +267,7 @@ def test_gradient_accumulation_with_opt_and_scheduler(
|
||||
step_model(model, input, target, accelerator, False)
|
||||
opt.step()
|
||||
|
||||
if ((iteration + 1) % 2 == 0) or ((iteration + 1) == len(dataloader)) or sync_each_batch:
|
||||
if ((iteration + 1) % 2 == 0) or ((iteration + 1) == len(dataloader)):
|
||||
if split_batches:
|
||||
sched.step()
|
||||
else:
|
||||
@ -284,18 +284,18 @@ def test_gradient_accumulation_with_opt_and_scheduler(
|
||||
assert (
|
||||
opt.param_groups[0]["lr"] == ddp_opt.param_groups[0]["lr"]
|
||||
), f'Learning rates found in each optimizer did not align\nopt: {opt.param_groups[0]["lr"]}\nDDP opt: {ddp_opt.param_groups[0]["lr"]}\n'
|
||||
did_step = (((iteration + 1) % 2) == 0) or ((iteration + 1) == len(dataloader)) or sync_each_batch
|
||||
did_step = (((iteration + 1) % 2) == 0) or ((iteration + 1) == len(dataloader))
|
||||
if accelerator.num_processes > 1:
|
||||
check_model_parameters(
|
||||
model,
|
||||
ddp_model,
|
||||
did_step,
|
||||
did_step or sync_each_batch, # syncs at each grad_accum interval of if sync_each_batch==True
|
||||
iteration,
|
||||
rtol=1e-3, # somehow needs a relative tolerance
|
||||
rtol=1e-3, # needs a relative tolerance due to roundoff errors
|
||||
)
|
||||
|
||||
if ((iteration + 1) % 2 == 0) or ((iteration + 1) == len(dataloader)) or sync_each_batch:
|
||||
opt.zero_grad() # needs to be guarded by logic as to when we should zero grads
|
||||
if did_step:
|
||||
opt.zero_grad() # flush gradients every accum step
|
||||
ddp_opt.zero_grad()
|
||||
|
||||
# Shuffle ddp_input on each iteration
|
||||
@ -343,6 +343,7 @@ def main():
|
||||
DistributedType.MULTI_GPU,
|
||||
DistributedType.MULTI_NPU,
|
||||
DistributedType.MULTI_MLU,
|
||||
DistributedType.MULTI_MUSA,
|
||||
DistributedType.MULTI_CPU,
|
||||
):
|
||||
if state.local_process_index == 0:
|
||||
@ -351,7 +352,12 @@ def main():
|
||||
if state.local_process_index == 0:
|
||||
print("**Test Distributed `no_sync` context manager with multiple forwards**")
|
||||
test_distributed_sync_multiple_fwd(accelerator)
|
||||
if state.distributed_type in (DistributedType.MULTI_GPU, DistributedType.MULTI_NPU, DistributedType.MULTI_MLU):
|
||||
if state.distributed_type in (
|
||||
DistributedType.MULTI_GPU,
|
||||
DistributedType.MULTI_NPU,
|
||||
DistributedType.MULTI_MLU,
|
||||
DistributedType.MULTI_MUSA,
|
||||
):
|
||||
for split_batch in [True, False]:
|
||||
for dispatch_batches in [True, False]:
|
||||
for sync_each_batch in [True, False]:
|
||||
@ -369,7 +375,12 @@ def main():
|
||||
"`split_batches=False`, `dispatch_batches=False`, `sync_each_batch=False`**",
|
||||
)
|
||||
test_gradient_accumulation_with_opt_and_scheduler()
|
||||
if state.distributed_type in (DistributedType.MULTI_GPU, DistributedType.MULTI_NPU, DistributedType.MULTI_MLU):
|
||||
if state.distributed_type in (
|
||||
DistributedType.MULTI_GPU,
|
||||
DistributedType.MULTI_NPU,
|
||||
DistributedType.MULTI_MLU,
|
||||
DistributedType.MULTI_MUSA,
|
||||
):
|
||||
for split_batch in [True, False]:
|
||||
for dispatch_batches in [True, False]:
|
||||
for sync_each_batch in [True, False]:
|
||||
|
||||
@ -40,8 +40,10 @@ from ..utils import (
|
||||
is_datasets_available,
|
||||
is_deepspeed_available,
|
||||
is_dvclive_available,
|
||||
is_import_timer_available,
|
||||
is_mlu_available,
|
||||
is_mps_available,
|
||||
is_musa_available,
|
||||
is_npu_available,
|
||||
is_pandas_available,
|
||||
is_pippy_available,
|
||||
@ -51,7 +53,9 @@ from ..utils import (
|
||||
is_torch_version,
|
||||
is_torch_xla_available,
|
||||
is_torchvision_available,
|
||||
is_transformer_engine_available,
|
||||
is_transformers_available,
|
||||
is_triton_available,
|
||||
is_wandb_available,
|
||||
is_xpu_available,
|
||||
str_to_bool,
|
||||
@ -63,16 +67,20 @@ def get_backend():
|
||||
return "xla", torch.cuda.device_count(), torch.cuda.memory_allocated
|
||||
elif is_cuda_available():
|
||||
return "cuda", torch.cuda.device_count(), torch.cuda.memory_allocated
|
||||
elif is_mps_available(min_version="2.0"):
|
||||
return "mps", 1, torch.mps.current_allocated_memory
|
||||
elif is_mps_available():
|
||||
return "mps", 1, torch.mps.current_allocated_memory()
|
||||
return "mps", 1, lambda: 0
|
||||
elif is_mlu_available():
|
||||
return "mlu", torch.mlu.device_count(), torch.mlu.memory_allocated
|
||||
elif is_musa_available():
|
||||
return "musa", torch.musa.device_count(), torch.musa.memory_allocated
|
||||
elif is_npu_available():
|
||||
return "npu", torch.npu.device_count(), torch.npu.memory_allocated
|
||||
elif is_xpu_available():
|
||||
return "xpu", torch.xpu.device_count(), torch.xpu.memory_allocated
|
||||
else:
|
||||
return "cpu", 1, 0
|
||||
return "cpu", 1, lambda: 0
|
||||
|
||||
|
||||
torch_device, device_count, memory_allocated_func = get_backend()
|
||||
@ -97,7 +105,7 @@ def get_launch_command(**kwargs) -> list:
|
||||
return command
|
||||
|
||||
|
||||
DEFAULT_LAUNCH_COMMAND = get_launch_command(num_processes=device_count)
|
||||
DEFAULT_LAUNCH_COMMAND = get_launch_command(num_processes=device_count, monitor_interval=0.1)
|
||||
|
||||
|
||||
def parse_flag_from_env(key, default=False):
|
||||
@ -176,6 +184,13 @@ def require_mlu(test_case):
|
||||
return unittest.skipUnless(is_mlu_available(), "test require a MLU")(test_case)
|
||||
|
||||
|
||||
def require_musa(test_case):
|
||||
"""
|
||||
Decorator marking a test that requires MUSA. These tests are skipped when there are no MUSA available.
|
||||
"""
|
||||
return unittest.skipUnless(is_musa_available(), "test require a MUSA")(test_case)
|
||||
|
||||
|
||||
def require_npu(test_case):
|
||||
"""
|
||||
Decorator marking a test that requires NPU. These tests are skipped when there are no NPU available.
|
||||
@ -210,7 +225,7 @@ def require_transformers(test_case):
|
||||
|
||||
def require_timm(test_case):
|
||||
"""
|
||||
Decorator marking a test that requires transformers. These tests are skipped when they are not.
|
||||
Decorator marking a test that requires timm. These tests are skipped when they are not.
|
||||
"""
|
||||
return unittest.skipUnless(is_timm_available(), "test requires the timm library")(test_case)
|
||||
|
||||
@ -222,6 +237,13 @@ def require_torchvision(test_case):
|
||||
return unittest.skipUnless(is_torchvision_available(), "test requires the torchvision library")(test_case)
|
||||
|
||||
|
||||
def require_triton(test_case):
|
||||
"""
|
||||
Decorator marking a test that requires triton. These tests are skipped when they are not.
|
||||
"""
|
||||
return unittest.skipUnless(is_triton_available(), "test requires the triton library")(test_case)
|
||||
|
||||
|
||||
def require_schedulefree(test_case):
|
||||
"""
|
||||
Decorator marking a test that requires schedulefree. These tests are skipped when they are not.
|
||||
@ -375,6 +397,22 @@ def require_pippy(test_case):
|
||||
return unittest.skipUnless(is_pippy_available(), "test requires pippy")(test_case)
|
||||
|
||||
|
||||
def require_import_timer(test_case):
|
||||
"""
|
||||
Decorator marking a test that requires tuna interpreter installed. These tests are skipped when tuna isn't
|
||||
installed
|
||||
"""
|
||||
return unittest.skipUnless(is_import_timer_available(), "test requires tuna interpreter")(test_case)
|
||||
|
||||
|
||||
def require_transformer_engine(test_case):
|
||||
"""
|
||||
Decorator marking a test that requires transformers engine installed. These tests are skipped when transformers
|
||||
engine isn't installed
|
||||
"""
|
||||
return unittest.skipUnless(is_transformer_engine_available(), "test requires transformers engine")(test_case)
|
||||
|
||||
|
||||
_atleast_one_tracker_available = (
|
||||
any([is_wandb_available(), is_tensorboard_available()]) and not is_comet_ml_available()
|
||||
)
|
||||
|
||||
@ -14,10 +14,12 @@
|
||||
from .constants import (
|
||||
MODEL_NAME,
|
||||
OPTIMIZER_NAME,
|
||||
PROFILE_PATTERN_NAME,
|
||||
RNG_STATE_NAME,
|
||||
SAFE_MODEL_NAME,
|
||||
SAFE_WEIGHTS_INDEX_NAME,
|
||||
SAFE_WEIGHTS_NAME,
|
||||
SAFE_WEIGHTS_PATTERN_NAME,
|
||||
SAMPLER_NAME,
|
||||
SCALER_NAME,
|
||||
SCHEDULER_NAME,
|
||||
@ -25,6 +27,7 @@ from .constants import (
|
||||
TORCH_LAUNCH_PARAMS,
|
||||
WEIGHTS_INDEX_NAME,
|
||||
WEIGHTS_NAME,
|
||||
WEIGHTS_PATTERN_NAME,
|
||||
)
|
||||
from .dataclasses import (
|
||||
AutocastKwargs,
|
||||
@ -32,6 +35,7 @@ from .dataclasses import (
|
||||
ComputeEnvironment,
|
||||
CustomDtype,
|
||||
DataLoaderConfiguration,
|
||||
DDPCommunicationHookType,
|
||||
DeepSpeedPlugin,
|
||||
DistributedDataParallelKwargs,
|
||||
DistributedType,
|
||||
@ -45,11 +49,13 @@ from .dataclasses import (
|
||||
LoggerType,
|
||||
MegatronLMPlugin,
|
||||
PrecisionType,
|
||||
ProfileKwargs,
|
||||
ProjectConfiguration,
|
||||
RNGType,
|
||||
SageMakerDistributedType,
|
||||
TensorInformation,
|
||||
TorchDynamoPlugin,
|
||||
add_model_config_to_megatron_parser,
|
||||
)
|
||||
from .environment import (
|
||||
are_libraries_initialized,
|
||||
@ -80,6 +86,7 @@ from .imports import (
|
||||
is_deepspeed_available,
|
||||
is_dvclive_available,
|
||||
is_fp8_available,
|
||||
is_import_timer_available,
|
||||
is_ipex_available,
|
||||
is_lomo_available,
|
||||
is_megatron_lm_available,
|
||||
@ -87,11 +94,13 @@ from .imports import (
|
||||
is_mlu_available,
|
||||
is_mps_available,
|
||||
is_msamp_available,
|
||||
is_musa_available,
|
||||
is_npu_available,
|
||||
is_pandas_available,
|
||||
is_peft_available,
|
||||
is_pippy_available,
|
||||
is_pynvml_available,
|
||||
is_pytest_available,
|
||||
is_rich_available,
|
||||
is_sagemaker_available,
|
||||
is_schedulefree_available,
|
||||
@ -101,6 +110,7 @@ from .imports import (
|
||||
is_torchvision_available,
|
||||
is_transformer_engine_available,
|
||||
is_transformers_available,
|
||||
is_triton_available,
|
||||
is_wandb_available,
|
||||
is_xpu_available,
|
||||
)
|
||||
@ -139,6 +149,7 @@ from .offload import (
|
||||
)
|
||||
from .operations import (
|
||||
CannotPadNestedTensorWarning,
|
||||
GatheredParameters,
|
||||
broadcast,
|
||||
broadcast_object_list,
|
||||
concatenate,
|
||||
@ -164,7 +175,7 @@ from .operations import (
|
||||
send_to_device,
|
||||
slice_tensors,
|
||||
)
|
||||
from .versions import compare_versions, is_torch_version
|
||||
from .versions import compare_versions, is_torch_version, parse
|
||||
|
||||
|
||||
if is_deepspeed_available():
|
||||
@ -178,7 +189,15 @@ if is_deepspeed_available():
|
||||
)
|
||||
|
||||
from .bnb import has_4bit_bnb_layers, load_and_quantize_model
|
||||
from .fsdp_utils import load_fsdp_model, load_fsdp_optimizer, save_fsdp_model, save_fsdp_optimizer
|
||||
from .fsdp_utils import (
|
||||
disable_fsdp_ram_efficient_loading,
|
||||
enable_fsdp_ram_efficient_loading,
|
||||
load_fsdp_model,
|
||||
load_fsdp_optimizer,
|
||||
merge_fsdp_weights,
|
||||
save_fsdp_model,
|
||||
save_fsdp_optimizer,
|
||||
)
|
||||
from .launch import (
|
||||
PrepareForLaunch,
|
||||
_filter_args,
|
||||
@ -188,24 +207,31 @@ from .launch import (
|
||||
prepare_simple_launcher_cmd_env,
|
||||
prepare_tpu,
|
||||
)
|
||||
|
||||
# For docs
|
||||
from .megatron_lm import (
|
||||
AbstractTrainStep,
|
||||
BertTrainStep,
|
||||
GPTTrainStep,
|
||||
MegatronEngine,
|
||||
MegatronLMDummyDataLoader,
|
||||
MegatronLMDummyScheduler,
|
||||
MegatronLMOptimizerWrapper,
|
||||
MegatronLMSchedulerWrapper,
|
||||
T5TrainStep,
|
||||
avg_losses_across_data_parallel_group,
|
||||
gather_across_data_parallel_groups,
|
||||
)
|
||||
from .megatron_lm import initialize as megatron_lm_initialize
|
||||
from .megatron_lm import prepare_data_loader as megatron_lm_prepare_data_loader
|
||||
from .megatron_lm import prepare_model as megatron_lm_prepare_model
|
||||
from .megatron_lm import prepare_optimizer as megatron_lm_prepare_optimizer
|
||||
from .megatron_lm import prepare_scheduler as megatron_lm_prepare_scheduler
|
||||
|
||||
|
||||
if is_megatron_lm_available():
|
||||
from .megatron_lm import (
|
||||
MegatronEngine,
|
||||
MegatronLMOptimizerWrapper,
|
||||
MegatronLMSchedulerWrapper,
|
||||
gather_across_data_parallel_groups,
|
||||
)
|
||||
from .megatron_lm import initialize as megatron_lm_initialize
|
||||
from .megatron_lm import prepare_data_loader as megatron_lm_prepare_data_loader
|
||||
from .megatron_lm import prepare_model_optimizer_scheduler as megatron_lm_prepare_model_optimizer_scheduler
|
||||
from .megatron_lm import prepare_optimizer as megatron_lm_prepare_optimizer
|
||||
from .megatron_lm import prepare_scheduler as megatron_lm_prepare_scheduler
|
||||
from .memory import find_executable_batch_size, release_memory
|
||||
from .other import (
|
||||
check_os_kernel,
|
||||
@ -225,4 +251,9 @@ from .other import (
|
||||
from .random import set_seed, synchronize_rng_state, synchronize_rng_states
|
||||
from .torch_xla import install_xla
|
||||
from .tqdm import tqdm
|
||||
from .transformer_engine import convert_model, has_transformer_engine_layers
|
||||
from .transformer_engine import (
|
||||
apply_fp8_autowrap,
|
||||
contextual_fp8_autocast,
|
||||
convert_model,
|
||||
has_transformer_engine_layers,
|
||||
)
|
||||
|
||||
@ -22,9 +22,12 @@ RNG_STATE_NAME = "random_states"
|
||||
OPTIMIZER_NAME = "optimizer"
|
||||
SCHEDULER_NAME = "scheduler"
|
||||
SAMPLER_NAME = "sampler"
|
||||
PROFILE_PATTERN_NAME = "profile_{suffix}.json"
|
||||
WEIGHTS_NAME = f"{MODEL_NAME}.bin"
|
||||
WEIGHTS_PATTERN_NAME = "pytorch_model{suffix}.bin"
|
||||
WEIGHTS_INDEX_NAME = f"{WEIGHTS_NAME}.index.json"
|
||||
SAFE_WEIGHTS_NAME = f"{SAFE_MODEL_NAME}.safetensors"
|
||||
SAFE_WEIGHTS_PATTERN_NAME = "model{suffix}.safetensors"
|
||||
SAFE_WEIGHTS_INDEX_NAME = f"{SAFE_WEIGHTS_NAME}.index.json"
|
||||
SAGEMAKER_PYTORCH_VERSION = "1.10.2"
|
||||
SAGEMAKER_PYTHON_VERSION = "py38"
|
||||
@ -38,6 +41,7 @@ FSDP_PYTORCH_VERSION = "2.1.0"
|
||||
FSDP_MODEL_NAME = "pytorch_model_fsdp"
|
||||
DEEPSPEED_MULTINODE_LAUNCHERS = ["pdsh", "standard", "openmpi", "mvapich", "mpich"]
|
||||
TORCH_DYNAMO_MODES = ["default", "reduce-overhead", "max-autotune"]
|
||||
ELASTIC_LOG_LINE_PREFIX_TEMPLATE_PYTORCH_VERSION = "2.2.0"
|
||||
|
||||
STR_OPERATION_TO_FUNC = {">": op.gt, ">=": op.ge, "==": op.eq, "!=": op.ne, "<=": op.le, "<": op.lt}
|
||||
|
||||
@ -69,4 +73,10 @@ TORCH_LAUNCH_PARAMS = [
|
||||
]
|
||||
|
||||
CUDA_DISTRIBUTED_TYPES = ["DEEPSPEED", "MULTI_GPU", "FSDP", "MEGATRON_LM"]
|
||||
TORCH_DISTRIBUTED_OPERATION_TYPES = CUDA_DISTRIBUTED_TYPES + ["MULTI_NPU", "MULTI_MLU", "MULTI_XPU", "MULTI_CPU"]
|
||||
TORCH_DISTRIBUTED_OPERATION_TYPES = CUDA_DISTRIBUTED_TYPES + [
|
||||
"MULTI_NPU",
|
||||
"MULTI_MLU",
|
||||
"MULTI_MUSA",
|
||||
"MULTI_XPU",
|
||||
"MULTI_CPU",
|
||||
]
|
||||
|
||||
File diff suppressed because it is too large
@ -12,27 +12,38 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import os
|
||||
import shutil
|
||||
from pathlib import Path
|
||||
|
||||
import torch
|
||||
|
||||
from ..logging import get_logger
|
||||
from .constants import FSDP_MODEL_NAME, FSDP_PYTORCH_VERSION, OPTIMIZER_NAME
|
||||
from .imports import is_torch_distributed_available
|
||||
from .constants import FSDP_MODEL_NAME, OPTIMIZER_NAME, SAFE_WEIGHTS_NAME, WEIGHTS_NAME
|
||||
from .modeling import is_peft_model
|
||||
from .other import save
|
||||
from .versions import is_torch_version
|
||||
|
||||
|
||||
if is_torch_version(">=", FSDP_PYTORCH_VERSION) and is_torch_distributed_available():
|
||||
import torch.distributed.checkpoint as dist_cp
|
||||
from torch.distributed.checkpoint.default_planner import DefaultLoadPlanner, DefaultSavePlanner
|
||||
from torch.distributed.checkpoint.optimizer import load_sharded_optimizer_state_dict
|
||||
from torch.distributed.fsdp.fully_sharded_data_parallel import FullyShardedDataParallel as FSDP
|
||||
from torch.distributed.fsdp.fully_sharded_data_parallel import StateDictType
|
||||
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
|
||||
def enable_fsdp_ram_efficient_loading():
|
||||
"""
|
||||
Enables RAM efficient loading of Hugging Face models for FSDP in the environment.
|
||||
"""
|
||||
# Sets values for `transformers.modeling_utils.is_fsdp_enabled`
|
||||
if "ACCELERATE_USE_FSDP" not in os.environ:
|
||||
os.environ["ACCELERATE_USE_FSDP"] = "True"
|
||||
os.environ["FSDP_CPU_RAM_EFFICIENT_LOADING"] = "True"
|
||||
|
||||
|
||||
def disable_fsdp_ram_efficient_loading():
|
||||
"""
|
||||
Disables RAM efficient loading of Hugging Face models for FSDP in the environment.
|
||||
"""
|
||||
os.environ["FSDP_CPU_RAM_EFFICIENT_LOADING"] = "False"
|
||||
|
||||
|
||||
def _get_model_state_dict(model, adapter_only=False):
|
||||
if adapter_only and is_peft_model(model):
|
||||
from peft import get_peft_model_state_dict
|
||||
@ -52,8 +63,13 @@ def _set_model_state_dict(model, state_dict, adapter_only=False):
|
||||
|
||||
|
||||
def save_fsdp_model(fsdp_plugin, accelerator, model, output_dir, model_index=0, adapter_only=False):
|
||||
os.makedirs(output_dir, exist_ok=True)
|
||||
# Note: We import here to reduce import time from general modules, and isolate outside dependencies
|
||||
import torch.distributed.checkpoint as dist_cp
|
||||
from torch.distributed.checkpoint.default_planner import DefaultSavePlanner
|
||||
from torch.distributed.fsdp.fully_sharded_data_parallel import FullyShardedDataParallel as FSDP
|
||||
from torch.distributed.fsdp.fully_sharded_data_parallel import StateDictType
|
||||
|
||||
os.makedirs(output_dir, exist_ok=True)
|
||||
if fsdp_plugin.state_dict_type == StateDictType.FULL_STATE_DICT:
|
||||
# FSDP raises error when single GPU is used with `offload_to_cpu=True` for FULL_STATE_DICT
|
||||
# so, only enable it when num_processes>1
|
||||
@ -97,6 +113,12 @@ def save_fsdp_model(fsdp_plugin, accelerator, model, output_dir, model_index=0,
|
||||
|
||||
|
||||
def load_fsdp_model(fsdp_plugin, accelerator, model, input_dir, model_index=0, adapter_only=False):
|
||||
# Note: We import here to reduce import time from general modules, and isolate outside dependencies
|
||||
import torch.distributed.checkpoint as dist_cp
|
||||
from torch.distributed.checkpoint.default_planner import DefaultLoadPlanner
|
||||
from torch.distributed.fsdp.fully_sharded_data_parallel import FullyShardedDataParallel as FSDP
|
||||
from torch.distributed.fsdp.fully_sharded_data_parallel import StateDictType
|
||||
|
||||
accelerator.wait_for_everyone()
|
||||
if fsdp_plugin.state_dict_type == StateDictType.FULL_STATE_DICT:
|
||||
# FSDP raises error when single GPU is used with `offload_to_cpu=True` for FULL_STATE_DICT
|
||||
@ -150,6 +172,12 @@ def load_fsdp_model(fsdp_plugin, accelerator, model, input_dir, model_index=0, a
|
||||
|
||||
|
||||
def save_fsdp_optimizer(fsdp_plugin, accelerator, optimizer, model, output_dir, optimizer_index=0):
|
||||
# Note: We import here to reduce import time from general modules, and isolate outside dependencies
|
||||
import torch.distributed.checkpoint as dist_cp
|
||||
from torch.distributed.checkpoint.default_planner import DefaultSavePlanner
|
||||
from torch.distributed.fsdp.fully_sharded_data_parallel import FullyShardedDataParallel as FSDP
|
||||
from torch.distributed.fsdp.fully_sharded_data_parallel import StateDictType
|
||||
|
||||
os.makedirs(output_dir, exist_ok=True)
|
||||
with FSDP.state_dict_type(
|
||||
model, fsdp_plugin.state_dict_type, fsdp_plugin.state_dict_config, fsdp_plugin.optim_state_dict_config
|
||||
@ -177,6 +205,12 @@ def save_fsdp_optimizer(fsdp_plugin, accelerator, optimizer, model, output_dir,
|
||||
|
||||
|
||||
def load_fsdp_optimizer(fsdp_plugin, accelerator, optimizer, model, input_dir, optimizer_index=0, adapter_only=False):
|
||||
# Note: We import here to reduce import time from general modules, and isolate outside dependencies
|
||||
import torch.distributed.checkpoint as dist_cp
|
||||
from torch.distributed.checkpoint.optimizer import load_sharded_optimizer_state_dict
|
||||
from torch.distributed.fsdp.fully_sharded_data_parallel import FullyShardedDataParallel as FSDP
|
||||
from torch.distributed.fsdp.fully_sharded_data_parallel import StateDictType
|
||||
|
||||
accelerator.wait_for_everyone()
|
||||
with FSDP.state_dict_type(
|
||||
model, fsdp_plugin.state_dict_type, fsdp_plugin.state_dict_config, fsdp_plugin.optim_state_dict_config
|
||||
@ -207,3 +241,86 @@ def load_fsdp_optimizer(fsdp_plugin, accelerator, optimizer, model, input_dir, o
|
||||
logger.info(f"Optimizer loaded from {ckpt_dir}")
|
||||
flattened_osd = FSDP.optim_state_dict_to_load(model=model, optim=optimizer, optim_state_dict=optim_state)
|
||||
optimizer.load_state_dict(flattened_osd)
|
||||
|
||||
|
||||
def _distributed_checkpoint_to_merged_weights(checkpoint_dir: str, save_path: str, safe_serialization: bool = True):
    """
    Passthrough to `torch.distributed.checkpoint.format_utils.dcp_to_torch_save`

    Will save under `save_path` as either `model.safetensors` or `pytorch_model.bin`.
    """
    # Note: We import here to reduce import time from general modules, and isolate outside dependencies
    import torch.distributed.checkpoint as dist_cp
    import torch.distributed.checkpoint.format_utils as dist_cp_format_utils

    state_dict = {}
    save_path = Path(save_path)
    save_path.mkdir(exist_ok=True)
    dist_cp_format_utils._load_state_dict(
        state_dict,
        storage_reader=dist_cp.FileSystemReader(checkpoint_dir),
        planner=dist_cp_format_utils._EmptyStateDictLoadPlanner(),
        no_dist=True,
    )
    save_path = save_path / SAFE_WEIGHTS_NAME if safe_serialization else save_path / WEIGHTS_NAME

    # To handle if state is a dict like {model: {...}}
    if len(state_dict.keys()) == 1:
        state_dict = state_dict[list(state_dict)[0]]
    save(state_dict, save_path, safe_serialization=safe_serialization)
    return save_path


def merge_fsdp_weights(
    checkpoint_dir: str, output_path: str, safe_serialization: bool = True, remove_checkpoint_dir: bool = False
):
    """
    Merge the weights from sharded FSDP model checkpoints into a single combined checkpoint. Should be used if
    `SHARDED_STATE_DICT` was used for the model. Weights will be saved to `{output_path}/model.safetensors` if
    `safe_serialization` else `pytorch_model.bin`.

    Note: this is a CPU-bound process.

    Args:
        checkpoint_dir (`str`):
            The directory containing the FSDP checkpoints (can be either the model or optimizer).
        output_path (`str`):
            The path to save the merged checkpoint.
        safe_serialization (`bool`, *optional*, defaults to `True`):
            Whether to save the merged weights with safetensors (recommended).
        remove_checkpoint_dir (`bool`, *optional*, defaults to `False`):
            Whether to remove the checkpoint directory after merging.
    """
    checkpoint_dir = Path(checkpoint_dir)
    from accelerate.state import PartialState

    if not is_torch_version(">=", "2.3.0"):
        raise ValueError("`merge_fsdp_weights` requires PyTorch >= 2.3.0`")

    # Verify that the checkpoint directory exists
    if not checkpoint_dir.exists():
        model_path_exists = (checkpoint_dir / "pytorch_model_fsdp_0").exists()
        optimizer_path_exists = (checkpoint_dir / "optimizer_0").exists()
        err = f"Tried to load from {checkpoint_dir} but couldn't find a valid metadata file."
        if model_path_exists and optimizer_path_exists:
            err += " However, potential model and optimizer checkpoint directories exist."
            err += f"Please pass in either {checkpoint_dir}/pytorch_model_fsdp_0 or {checkpoint_dir}/optimizer_0"
            err += "instead."
        elif model_path_exists:
            err += " However, a potential model checkpoint directory exists."
            err += f"Please try passing in {checkpoint_dir}/pytorch_model_fsdp_0 instead."
        elif optimizer_path_exists:
            err += " However, a potential optimizer checkpoint directory exists."
            err += f"Please try passing in {checkpoint_dir}/optimizer_0 instead."
        raise ValueError(err)

    # To setup `save` to work
    state = PartialState()
    if state.is_main_process:
        logger.info(f"Merging FSDP weights from {checkpoint_dir}")
        save_path = _distributed_checkpoint_to_merged_weights(checkpoint_dir, output_path, safe_serialization)
        logger.info(f"Successfully merged FSDP weights and saved to {save_path}")
        if remove_checkpoint_dir:
            logger.info(f"Removing old checkpoint directory {checkpoint_dir}")
            shutil.rmtree(checkpoint_dir)
    state.wait_for_everyone()
@ -19,11 +19,9 @@ import warnings
|
||||
from functools import lru_cache
|
||||
|
||||
import torch
|
||||
from packaging import version
|
||||
from packaging.version import parse
|
||||
|
||||
from .environment import parse_flag_from_env, str_to_bool
|
||||
from .versions import compare_versions, is_torch_version
|
||||
from .versions import compare_versions, is_torch_version, parse
|
||||
|
||||
|
||||
# Try to run Torch native job in an environment with TorchXLA installed by setting this value to 0.
|
||||
@ -81,8 +79,16 @@ def get_ccl_version():
|
||||
return importlib.metadata.version("oneccl_bind_pt")
|
||||
|
||||
|
||||
def is_import_timer_available():
|
||||
return _is_package_available("import_timer")
|
||||
|
||||
|
||||
def is_pynvml_available():
|
||||
return _is_package_available("pynvml")
|
||||
return _is_package_available("pynvml") or _is_package_available("pynvml", "nvidia-ml-py")
|
||||
|
||||
|
||||
def is_pytest_available():
|
||||
return _is_package_available("pytest")
|
||||
|
||||
|
||||
def is_msamp_available():
|
||||
@ -94,7 +100,7 @@ def is_schedulefree_available():
|
||||
|
||||
|
||||
def is_transformer_engine_available():
|
||||
return _is_package_available("transformer_engine")
|
||||
return _is_package_available("transformer_engine", "transformer-engine")
|
||||
|
||||
|
||||
def is_lomo_available():
|
||||
@ -172,7 +178,7 @@ def is_deepspeed_available():
|
||||
def is_pippy_available():
|
||||
package_exists = _is_package_available("pippy", "torchpippy")
|
||||
if package_exists:
|
||||
pippy_version = version.parse(importlib.metadata.version("torchpippy"))
|
||||
pippy_version = parse(importlib.metadata.version("torchpippy"))
|
||||
return compare_versions(pippy_version, ">", "0.1.1")
|
||||
return False
|
||||
|
||||
@ -191,7 +197,7 @@ def is_bf16_available(ignore_tpu=False):
|
||||
def is_4bit_bnb_available():
|
||||
package_exists = _is_package_available("bitsandbytes")
|
||||
if package_exists:
|
||||
bnb_version = version.parse(importlib.metadata.version("bitsandbytes"))
|
||||
bnb_version = parse(importlib.metadata.version("bitsandbytes"))
|
||||
return compare_versions(bnb_version, ">=", "0.39.0")
|
||||
return False
|
||||
|
||||
@ -199,7 +205,7 @@ def is_4bit_bnb_available():
|
||||
def is_8bit_bnb_available():
|
||||
package_exists = _is_package_available("bitsandbytes")
|
||||
if package_exists:
|
||||
bnb_version = version.parse(importlib.metadata.version("bitsandbytes"))
|
||||
bnb_version = parse(importlib.metadata.version("bitsandbytes"))
|
||||
return compare_versions(bnb_version, ">=", "0.37.2")
|
||||
return False
|
||||
|
||||
@ -214,11 +220,11 @@ def is_torchvision_available():
|
||||
|
||||
def is_megatron_lm_available():
|
||||
if str_to_bool(os.environ.get("ACCELERATE_USE_MEGATRON_LM", "False")) == 1:
|
||||
package_exists = importlib.util.find_spec("megatron") is not None
|
||||
if package_exists:
|
||||
if importlib.util.find_spec("megatron") is not None:
|
||||
try:
|
||||
megatron_version = parse(importlib.metadata.version("megatron-lm"))
|
||||
return compare_versions(megatron_version, ">=", "2.2.0")
|
||||
megatron_version = parse(importlib.metadata.version("megatron-core"))
|
||||
if compare_versions(megatron_version, "==", "0.5.0"):
|
||||
return importlib.util.find_spec(".data", "megatron")
|
||||
except Exception as e:
|
||||
warnings.warn(f"Parse Megatron version failed. Exception:{e}")
|
||||
return False
|
||||
@ -240,10 +246,14 @@ def is_timm_available():
|
||||
return _is_package_available("timm")
|
||||
|
||||
|
||||
def is_triton_available():
|
||||
return _is_package_available("triton")
|
||||
|
||||
|
||||
def is_aim_available():
|
||||
package_exists = _is_package_available("aim")
|
||||
if package_exists:
|
||||
aim_version = version.parse(importlib.metadata.version("aim"))
|
||||
aim_version = parse(importlib.metadata.version("aim"))
|
||||
return compare_versions(aim_version, "<", "4.0.0")
|
||||
return False
|
||||
|
||||
@ -304,13 +314,15 @@ def is_mlflow_available():
|
||||
return False
|
||||
|
||||
|
||||
def is_mps_available():
|
||||
return is_torch_version(">=", "1.12") and torch.backends.mps.is_available() and torch.backends.mps.is_built()
|
||||
def is_mps_available(min_version="1.12"):
|
||||
# With torch 1.12, you can use torch.backends.mps
|
||||
# With torch 2.0.0, you can use torch.mps
|
||||
return is_torch_version(">=", min_version) and torch.backends.mps.is_available() and torch.backends.mps.is_built()
|
||||
|
||||
|
||||
def is_ipex_available():
|
||||
def get_major_and_minor_from_version(full_version):
|
||||
return str(version.parse(full_version).major) + "." + str(version.parse(full_version).minor)
|
||||
return str(parse(full_version).major) + "." + str(parse(full_version).minor)
|
||||
|
||||
_torch_version = importlib.metadata.version("torch")
|
||||
if importlib.util.find_spec("intel_extension_for_pytorch") is None:
|
||||
@ -337,7 +349,6 @@ def is_mlu_available(check_device=False):
|
||||
if importlib.util.find_spec("torch_mlu") is None:
|
||||
return False
|
||||
|
||||
import torch
|
||||
import torch_mlu # noqa: F401
|
||||
|
||||
if check_device:
|
||||
@ -351,12 +362,29 @@ def is_mlu_available(check_device=False):
|
||||
|
||||
|
||||
@lru_cache
|
||||
def is_npu_available(check_device=False):
|
||||
"Checks if `torch_npu` is installed and potentially if a NPU is in the environment"
|
||||
if importlib.util.find_spec("torch") is None or importlib.util.find_spec("torch_npu") is None:
|
||||
def is_musa_available(check_device=False):
|
||||
"Checks if `torch_musa` is installed and potentially if a MUSA is in the environment"
|
||||
if importlib.util.find_spec("torch_musa") is None:
|
||||
return False
|
||||
|
||||
import torch_musa # noqa: F401
|
||||
|
||||
if check_device:
|
||||
try:
|
||||
# Will raise a RuntimeError if no MUSA is found
|
||||
_ = torch.musa.device_count()
|
||||
return torch.musa.is_available()
|
||||
except RuntimeError:
|
||||
return False
|
||||
return hasattr(torch, "musa") and torch.musa.is_available()
|
||||
|
||||
|
||||
@lru_cache
|
||||
def is_npu_available(check_device=False):
|
||||
"Checks if `torch_npu` is installed and potentially if a NPU is in the environment"
|
||||
if importlib.util.find_spec("torch_npu") is None:
|
||||
return False
|
||||
|
||||
import torch
|
||||
import torch_npu # noqa: F401
|
||||
|
||||
if check_device:
|
||||
@ -371,19 +399,23 @@ def is_npu_available(check_device=False):
|
||||
|
||||
@lru_cache
|
||||
def is_xpu_available(check_device=False):
|
||||
"""
|
||||
Checks if XPU acceleration is available either via `intel_extension_for_pytorch` or via stock PyTorch (>=2.4) and
|
||||
potentially if a XPU is in the environment
|
||||
"""
|
||||
|
||||
"check if user disables it explicitly"
|
||||
if not parse_flag_from_env("ACCELERATE_USE_XPU", default=True):
|
||||
return False
|
||||
"Checks if `intel_extension_for_pytorch` is installed and potentially if a XPU is in the environment"
|
||||
if is_ipex_available():
|
||||
import torch
|
||||
|
||||
if is_ipex_available():
|
||||
if is_torch_version("<=", "1.12"):
|
||||
return False
|
||||
else:
|
||||
return False
|
||||
|
||||
import intel_extension_for_pytorch # noqa: F401
|
||||
import intel_extension_for_pytorch # noqa: F401
|
||||
else:
|
||||
if is_torch_version("<=", "2.3"):
|
||||
return False
|
||||
|
||||
if check_device:
|
||||
try:
|
||||
|
||||
@ -27,8 +27,10 @@ from ..commands.config.config_args import SageMakerConfig
|
||||
from ..utils import (
|
||||
DynamoBackend,
|
||||
PrecisionType,
|
||||
is_fp8_available,
|
||||
is_ipex_available,
|
||||
is_mlu_available,
|
||||
is_musa_available,
|
||||
is_npu_available,
|
||||
is_torch_xla_available,
|
||||
is_xpu_available,
|
||||
@ -67,10 +69,23 @@ def _get_mpirun_args():
|
||||
mpirun_version = subprocess.check_output([mpi_app, "--version"])
|
||||
|
||||
if b"Open MPI" in mpirun_version:
|
||||
return mpi_app, "--hostfile", "-n", "--npernode"
|
||||
return mpi_app, "--hostfile", "-n", "--npernode", "--bind-to"
|
||||
else:
|
||||
# Intel MPI and MVAPICH both use the same arg names
|
||||
return mpi_app, "-f", "-n", "-ppn"
|
||||
return mpi_app, "-f", "-n", "-ppn", ""
|
||||
|
||||
|
||||
def setup_fp8_env(args: argparse.Namespace, current_env: Dict[str, str]):
|
||||
"""
|
||||
Setup the FP8 environment variables.
|
||||
"""
|
||||
prefix = "ACCELERATE_"
|
||||
for arg in vars(args):
|
||||
if arg.startswith("fp8_"):
|
||||
value = getattr(args, arg)
|
||||
if value is not None:
|
||||
current_env[f"{prefix}{arg.upper()}"] = str(getattr(args, arg))
|
||||
return current_env
|
||||
|
||||
|
||||
def prepare_simple_launcher_cmd_env(args: argparse.Namespace) -> Tuple[List[str], Dict[str, str]]:
|
||||
@ -82,14 +97,23 @@ def prepare_simple_launcher_cmd_env(args: argparse.Namespace) -> Tuple[List[str]
|
||||
raise ValueError("--module and --no_python cannot be used together")
|
||||
|
||||
if args.mpirun_hostfile is not None:
|
||||
mpi_app_name, hostfile_arg, num_proc_arg, proc_per_node_arg = _get_mpirun_args()
|
||||
mpi_app_name, hostfile_arg, num_proc_arg, proc_per_node_arg, bind_to_arg = _get_mpirun_args()
|
||||
mpirun_ccl = getattr(args, "mpirun_ccl", None)
|
||||
bind_to = getattr(args, "bind-to", "socket")
|
||||
num_machines = args.num_machines
|
||||
num_processes = getattr(args, "num_processes", None)
|
||||
nproc_per_node = str(num_processes // num_machines) if num_processes and num_machines else "1"
|
||||
cmd += [mpi_app_name, hostfile_arg, args.mpirun_hostfile, proc_per_node_arg, nproc_per_node]
|
||||
cmd += [
|
||||
mpi_app_name,
|
||||
hostfile_arg,
|
||||
args.mpirun_hostfile,
|
||||
proc_per_node_arg,
|
||||
nproc_per_node,
|
||||
]
|
||||
if num_processes:
|
||||
cmd += [num_proc_arg, str(num_processes)]
|
||||
if bind_to_arg:
|
||||
cmd += [bind_to_arg, bind_to]
|
||||
if not args.no_python:
|
||||
cmd.append(sys.executable)
|
||||
if args.module:
|
||||
@ -106,6 +130,8 @@ def prepare_simple_launcher_cmd_env(args: argparse.Namespace) -> Tuple[List[str]
|
||||
current_env["ZE_AFFINITY_MASK"] = args.gpu_ids
|
||||
elif is_mlu_available():
|
||||
current_env["MLU_VISIBLE_DEVICES"] = args.gpu_ids
|
||||
elif is_musa_available():
|
||||
current_env["MUSA_VISIBLE_DEVICES"] = args.gpu_ids
|
||||
elif is_npu_available():
|
||||
current_env["ASCEND_RT_VISIBLE_DEVICES"] = args.gpu_ids
|
||||
else:
|
||||
@ -115,7 +141,7 @@ def prepare_simple_launcher_cmd_env(args: argparse.Namespace) -> Tuple[List[str]
|
||||
current_env["MASTER_PORT"] = str(args.main_process_port)
|
||||
|
||||
if args.mpirun_hostfile is not None:
|
||||
current_env["CCL_WORKER_COUNT"] = mpirun_ccl
|
||||
current_env["CCL_WORKER_COUNT"] = str(mpirun_ccl)
|
||||
elif args.num_processes > 1:
|
||||
current_env["MASTER_ADDR"] = args.main_process_ip if args.main_process_ip is not None else "127.0.0.1"
|
||||
current_env["MASTER_PORT"] = str(args.main_process_port) if args.main_process_port is not None else "29500"
|
||||
@ -128,6 +154,12 @@ def prepare_simple_launcher_cmd_env(args: argparse.Namespace) -> Tuple[List[str]
|
||||
)
|
||||
|
||||
current_env["ACCELERATE_MIXED_PRECISION"] = str(mixed_precision)
|
||||
if args.mixed_precision.lower() == "fp8":
|
||||
if not is_fp8_available():
|
||||
raise RuntimeError(
|
||||
"FP8 is not available on this machine. Please ensure that either Transformer Engine or MSAMP is installed."
|
||||
)
|
||||
current_env = setup_fp8_env(args, current_env)
|
||||
|
||||
try:
|
||||
dynamo_backend = DynamoBackend(args.dynamo_backend.upper())
|
||||
@ -200,6 +232,8 @@ def prepare_multi_gpu_env(args: argparse.Namespace) -> Dict[str, str]:
|
||||
current_env["ZE_AFFINITY_MASK"] = gpu_ids
|
||||
elif is_mlu_available():
|
||||
current_env["MLU_VISIBLE_DEVICES"] = gpu_ids
|
||||
elif is_musa_available():
|
||||
current_env["MUSA_VISIBLE_DEVICES"] = gpu_ids
|
||||
elif is_npu_available():
|
||||
current_env["ASCEND_RT_VISIBLE_DEVICES"] = gpu_ids
|
||||
else:
|
||||
@ -211,6 +245,12 @@ def prepare_multi_gpu_env(args: argparse.Namespace) -> Dict[str, str]:
|
||||
raise ValueError(f"Unknown mixed_precision mode: {mixed_precision}. Choose between {PrecisionType.list()}.")
|
||||
|
||||
current_env["ACCELERATE_MIXED_PRECISION"] = str(mixed_precision)
|
||||
if args.mixed_precision.lower() == "fp8":
|
||||
if not is_fp8_available():
|
||||
raise RuntimeError(
|
||||
"FP8 is not available on this machine. Please ensure that either Transformer Engine or MSAMP is installed."
|
||||
)
|
||||
current_env = setup_fp8_env(args, current_env)
|
||||
|
||||
try:
|
||||
dynamo_backend = DynamoBackend(args.dynamo_backend.upper())
|
||||
@ -250,6 +290,7 @@ def prepare_multi_gpu_env(args: argparse.Namespace) -> Dict[str, str]:
|
||||
current_env["FSDP_USE_ORIG_PARAMS"] = str(args.fsdp_use_orig_params).lower()
|
||||
current_env["FSDP_CPU_RAM_EFFICIENT_LOADING"] = str(args.fsdp_cpu_ram_efficient_loading).lower()
|
||||
current_env["FSDP_SYNC_MODULE_STATES"] = str(args.fsdp_sync_module_states).lower()
|
||||
current_env["FSDP_ACTIVATION_CHECKPOINTING"] = str(args.fsdp_activation_checkpointing).lower()
|
||||
|
||||
if args.use_megatron_lm:
|
||||
prefix = "MEGATRON_LM_"
|
||||
@ -360,6 +401,8 @@ def prepare_deepspeed_cmd_env(args: argparse.Namespace) -> Tuple[List[str], Dict
|
||||
current_env["ZE_AFFINITY_MASK"] = gpu_ids
|
||||
elif is_mlu_available():
|
||||
current_env["MLU_VISIBLE_DEVICES"] = gpu_ids
|
||||
elif is_musa_available():
|
||||
current_env["MUSA_VISIBLE_DEVICES"] = gpu_ids
|
||||
elif is_npu_available():
|
||||
current_env["ASCEND_RT_VISIBLE_DEVICES"] = gpu_ids
|
||||
else:
|
||||
@ -373,6 +416,12 @@ def prepare_deepspeed_cmd_env(args: argparse.Namespace) -> Tuple[List[str], Dict
|
||||
|
||||
current_env["PYTHONPATH"] = env_var_path_add("PYTHONPATH", os.path.abspath("."))
|
||||
current_env["ACCELERATE_MIXED_PRECISION"] = str(mixed_precision)
|
||||
if args.mixed_precision.lower() == "fp8":
|
||||
if not is_fp8_available():
|
||||
raise RuntimeError(
|
||||
"FP8 is not available on this machine. Please ensure that either Transformer Engine or MSAMP is installed."
|
||||
)
|
||||
current_env = setup_fp8_env(args, current_env)
|
||||
current_env["ACCELERATE_CONFIG_DS_FIELDS"] = str(args.deepspeed_fields_from_accelerate_config).lower()
|
||||
current_env["ACCELERATE_USE_DEEPSPEED"] = "true"
|
||||
if args.zero_stage is not None:
|
||||
@ -511,6 +560,12 @@ def prepare_sagemager_args_inputs(
|
||||
"ACCELERATE_DYNAMO_USE_DYNAMIC": str(args.dynamo_use_dynamic),
|
||||
"ACCELERATE_SAGEMAKER_DISTRIBUTED_TYPE": sagemaker_config.distributed_type.value,
|
||||
}
|
||||
if args.mixed_precision.lower() == "fp8":
|
||||
if not is_fp8_available():
|
||||
raise RuntimeError(
|
||||
"FP8 is not available on this machine. Please ensure that either Transformer Engine or MSAMP is installed."
|
||||
)
|
||||
environment = setup_fp8_env(args, environment)
|
||||
# configure distribution set up
|
||||
distribution = None
|
||||
if sagemaker_config.distributed_type == SageMakerDistributedType.DATA_PARALLEL:
|
||||
@ -612,6 +667,7 @@ class PrepareForLaunch:
|
||||
elif self.distributed_type in (
|
||||
DistributedType.MULTI_GPU,
|
||||
DistributedType.MULTI_MLU,
|
||||
DistributedType.MULTI_MUSA,
|
||||
DistributedType.MULTI_NPU,
|
||||
DistributedType.MULTI_XPU,
|
||||
DistributedType.MULTI_CPU,
|
||||
|
||||
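To make the `setup_fp8_env` behaviour above concrete, here is a small illustration of the environment-variable mapping it performs (the `fp8_backend`/`fp8_format` argument names are hypothetical stand-ins for whatever `fp8_*` options the launcher parses):

import argparse

args = argparse.Namespace(fp8_backend="te", fp8_format="HYBRID", num_processes=2)
env = {}
for arg in vars(args):
    # Every non-None `fp8_*` launch argument is exported with an `ACCELERATE_` prefix.
    if arg.startswith("fp8_") and getattr(args, arg) is not None:
        env[f"ACCELERATE_{arg.upper()}"] = str(getattr(args, arg))
print(env)  # {'ACCELERATE_FP8_BACKEND': 'te', 'ACCELERATE_FP8_FORMAT': 'HYBRID'}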
File diff suppressed because it is too large
@ -23,7 +23,36 @@ import inspect
|
||||
|
||||
import torch
|
||||
|
||||
from .imports import is_mlu_available, is_mps_available, is_npu_available, is_xpu_available
|
||||
from .imports import (
|
||||
is_cuda_available,
|
||||
is_mlu_available,
|
||||
is_mps_available,
|
||||
is_musa_available,
|
||||
is_npu_available,
|
||||
is_xpu_available,
|
||||
)
|
||||
|
||||
|
||||
def clear_device_cache(garbage_collection=False):
    """
    Clears the device cache by calling `torch.{backend}.empty_cache`. Can also run `gc.collect()`, but do note that
    this is a *considerable* slowdown and should be used sparingly.
    """
    if garbage_collection:
        gc.collect()

    if is_xpu_available():
        torch.xpu.empty_cache()
    elif is_mlu_available():
        torch.mlu.empty_cache()
    elif is_musa_available():
        torch.musa.empty_cache()
    elif is_npu_available():
        torch.npu.empty_cache()
    elif is_mps_available(min_version="2.0"):
        torch.mps.empty_cache()
    elif is_cuda_available():
        torch.cuda.empty_cache()
|
||||
|
||||
|
||||
def release_memory(*objects):
|
||||
@ -52,17 +81,7 @@ def release_memory(*objects):
|
||||
objects = list(objects)
|
||||
for i in range(len(objects)):
|
||||
objects[i] = None
|
||||
gc.collect()
|
||||
if is_xpu_available():
|
||||
torch.xpu.empty_cache()
|
||||
elif is_mlu_available():
|
||||
torch.mlu.empty_cache()
|
||||
elif is_npu_available():
|
||||
torch.npu.empty_cache()
|
||||
elif is_mps_available():
|
||||
torch.mps.empty_cache()
|
||||
else:
|
||||
torch.cuda.empty_cache()
|
||||
clear_device_cache(garbage_collection=True)
|
||||
return objects
|
||||
|
||||
|
||||
@ -118,15 +137,7 @@ def find_executable_batch_size(function: callable = None, starting_batch_size: i
|
||||
|
||||
def decorator(*args, **kwargs):
|
||||
nonlocal batch_size
|
||||
gc.collect()
|
||||
if is_xpu_available():
|
||||
torch.xpu.empty_cache()
|
||||
elif is_mlu_available():
|
||||
torch.mlu.empty_cache()
|
||||
elif is_npu_available():
|
||||
torch.npu.empty_cache()
|
||||
else:
|
||||
torch.cuda.empty_cache()
|
||||
clear_device_cache(garbage_collection=True)
|
||||
params = list(inspect.signature(function).parameters.keys())
|
||||
# Guard against user error
|
||||
if len(params) < (len(args) + 1):
|
||||
@ -142,15 +153,7 @@ def find_executable_batch_size(function: callable = None, starting_batch_size: i
|
||||
return function(batch_size, *args, **kwargs)
|
||||
except Exception as e:
|
||||
if should_reduce_batch_size(e):
|
||||
gc.collect()
|
||||
if is_xpu_available():
|
||||
torch.xpu.empty_cache()
|
||||
elif is_mlu_available():
|
||||
torch.mlu.empty_cache()
|
||||
elif is_npu_available():
|
||||
torch.npu.empty_cache()
|
||||
else:
|
||||
torch.cuda.empty_cache()
|
||||
clear_device_cache(garbage_collection=True)
|
||||
batch_size //= 2
|
||||
else:
|
||||
raise
|
||||
|
||||
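The decorator whose cache-clearing calls are consolidated above is typically applied like this; a minimal sketch in which the real training body is left to the caller:

from accelerate.utils import find_executable_batch_size

@find_executable_batch_size(starting_batch_size=128)
def train(batch_size):
    # `batch_size` is injected as the first argument and halved after every
    # out-of-memory failure until the wrapped call succeeds.
    print(f"Trying batch size {batch_size}")

train()  # called without arguments; the decorator supplies batch_size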
@ -14,7 +14,6 @@
|
||||
|
||||
import contextlib
|
||||
import gc
|
||||
import importlib
|
||||
import inspect
|
||||
import json
|
||||
import logging
|
||||
@ -24,9 +23,8 @@ import shutil
|
||||
import tempfile
|
||||
import warnings
|
||||
from collections import OrderedDict, defaultdict
|
||||
from typing import Dict, List, Optional, Tuple, Union
|
||||
from typing import Dict, List, Optional, Set, Tuple, Union
|
||||
|
||||
import packaging
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
|
||||
@ -36,14 +34,16 @@ from .dataclasses import AutocastKwargs, CustomDtype, DistributedType
|
||||
from .imports import (
|
||||
is_mlu_available,
|
||||
is_mps_available,
|
||||
is_musa_available,
|
||||
is_npu_available,
|
||||
is_peft_available,
|
||||
is_torch_xla_available,
|
||||
is_xpu_available,
|
||||
)
|
||||
from .memory import clear_device_cache
|
||||
from .offload import load_offloaded_weight, offload_weight, save_offload_index
|
||||
from .tqdm import is_tqdm_available, tqdm
|
||||
from .versions import compare_versions
|
||||
from .versions import compare_versions, is_torch_version
|
||||
|
||||
|
||||
if is_npu_available(check_device=False):
|
||||
@ -52,6 +52,9 @@ if is_npu_available(check_device=False):
|
||||
if is_mlu_available(check_device=False):
|
||||
import torch_mlu # noqa: F401
|
||||
|
||||
if is_musa_available(check_device=False):
|
||||
import torch_musa # noqa: F401
|
||||
|
||||
from safetensors import safe_open
|
||||
from safetensors.torch import load_file as safe_load_file
|
||||
|
||||
@ -160,6 +163,8 @@ def dtype_byte_size(dtype: torch.dtype):
|
||||
return 1 / 2
|
||||
elif dtype == CustomDtype.FP8:
|
||||
return 1
|
||||
elif is_torch_version(">=", "2.1.0") and dtype == torch.float8_e4m3fn:
|
||||
return 1
|
||||
bit_search = re.search(r"[^\d](\d+)$", str(dtype))
|
||||
if bit_search is None:
|
||||
raise ValueError(f"`dtype` is not a valid dtype: {dtype}.")
|
||||
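A small sketch of what the new FP8 branch above means in practice. Note that `dtype_byte_size` is an internal helper; the `accelerate.utils.modeling` path is taken from this diff and may not be a stable public import:

```python
import torch

from accelerate.utils.modeling import dtype_byte_size

print(dtype_byte_size(torch.float16))        # 2.0 bytes per element
print(dtype_byte_size(torch.int8))           # 1.0
print(dtype_byte_size(torch.float8_e4m3fn))  # 1, now handled explicitly (torch >= 2.1)
```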
@ -230,6 +235,11 @@ def shard_checkpoint(
|
||||
weights_name (`str`, *optional*, defaults to `"pytorch_model.bin"`):
|
||||
The name of the model save file.
|
||||
"""
|
||||
logger.warning(
|
||||
"Note that `shard_checkpoint` is deprecated and will be removed in 0.33.0. We recommend you using "
|
||||
"split_torch_state_dict_into_shards from huggingface_hub library"
|
||||
)
max_shard_size = convert_file_size_to_int(max_shard_size)
|
||||
|
||||
sharded_state_dicts = [{}]
|
||||
@ -353,10 +363,15 @@ def set_module_tensor_to_device(
|
||||
if old_value.device == torch.device("meta") and device not in ["meta", torch.device("meta")] and value is None:
|
||||
raise ValueError(f"{tensor_name} is on the meta device, we need a `value` to put in on {device}.")
|
||||
|
||||
param = module._parameters[tensor_name] if tensor_name in module._parameters else None
|
||||
param_cls = type(param)
|
||||
|
||||
if value is not None:
|
||||
if old_value.shape != value.shape:
|
||||
# We can expect mismatches when using bnb 4bit since Params4bit will reshape and pack the weights.
|
||||
# In other cases, we want to make sure we're not loading checkpoints that do not match the config.
|
||||
if old_value.shape != value.shape and param_cls.__name__ != "Params4bit":
|
||||
raise ValueError(
|
||||
f'Trying to set a tensor of shape {value.shape} in "{tensor_name}" (which has shape {old_value.shape}), this look incorrect.'
|
||||
f'Trying to set a tensor of shape {value.shape} in "{tensor_name}" (which has shape {old_value.shape}), this looks incorrect.'
|
||||
)
|
||||
|
||||
if dtype is None:
|
||||
@ -365,9 +380,6 @@ def set_module_tensor_to_device(
|
||||
elif not str(value.dtype).startswith(("torch.uint", "torch.int", "torch.bool")):
|
||||
value = value.to(dtype)
|
||||
|
||||
param = module._parameters[tensor_name] if tensor_name in module._parameters else None
|
||||
param_cls = type(param)
|
||||
|
||||
device_quantization = None
|
||||
with torch.no_grad():
|
||||
# leave it on cpu first before moving them to cuda
|
||||
@ -386,8 +398,12 @@ def set_module_tensor_to_device(
|
||||
device = f"npu:{device}"
|
||||
elif is_mlu_available():
|
||||
device = f"mlu:{device}"
|
||||
elif is_musa_available():
|
||||
device = f"musa:{device}"
|
||||
elif is_xpu_available():
|
||||
device = f"xpu:{device}"
|
||||
if "xpu" in str(device) and not is_xpu_available():
|
||||
raise ValueError(f'{device} is not available, you should use device="cpu" instead')
|
||||
if value is None:
|
||||
new_value = old_value.to(device)
|
||||
if dtype is not None and device in ["meta", torch.device("meta")]:
|
||||
@ -407,7 +423,7 @@ def set_module_tensor_to_device(
|
||||
elif value is not None or not check_device_same(torch.device(device), module._parameters[tensor_name].device):
|
||||
param_cls = type(module._parameters[tensor_name])
|
||||
kwargs = module._parameters[tensor_name].__dict__
|
||||
if param_cls.__name__ in ["Int8Params", "FP4Params"]:
|
||||
if param_cls.__name__ in ["Int8Params", "FP4Params", "Params4bit"]:
|
||||
if param_cls.__name__ == "Int8Params" and new_value.dtype == torch.float32:
|
||||
# downcast to fp16 if any - needed for 8bit serialization
|
||||
new_value = new_value.to(torch.float16)
|
||||
@ -442,21 +458,18 @@ def set_module_tensor_to_device(
|
||||
elif module.bias is None:
|
||||
# if no bias exists, we can quantize right away
|
||||
module = module.cuda(device_index)
|
||||
elif module.__class__.__name__ == "Linear4bit" and getattr(module.weight, "quant_state", None) is None:
|
||||
elif (
|
||||
module.__class__.__name__ == "Linear4bit"
|
||||
and getattr(module.weight, "quant_state", None) is None
|
||||
and str(module.weight.device) != "meta"
|
||||
):
|
||||
# quantize only if necessary
|
||||
device_index = torch.device(device).index if torch.device(device).type == "cuda" else None
|
||||
if not getattr(module.weight, "quant_state", None) and device_index is not None:
|
||||
module.weight = module.weight.cuda(device_index)
|
||||
# clean pre and post forward hooks
|
||||
if device != "cpu":
|
||||
if is_npu_available():
|
||||
torch.npu.empty_cache()
|
||||
elif is_mlu_available():
|
||||
torch.mlu.empty_cache()
|
||||
elif is_xpu_available():
|
||||
torch.xpu.empty_cache()
|
||||
else:
|
||||
torch.cuda.empty_cache()
|
||||
clear_device_cache()
|
||||
|
||||
# When handling tied weights, we update tied_params_map to keep track of the tied weights that have already been allocated on the device in
|
||||
# order to avoid duplicating memory, see above.
|
||||
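A minimal sketch of the public entry point these changes touch, assuming a tiny meta-initialized layer (torch >= 1.13 for the `device="meta"` constructor argument):

```python
import torch

from accelerate.utils import set_module_tensor_to_device

layer = torch.nn.Linear(4, 4, device="meta")
# A meta parameter needs an explicit `value` when moved to a real device (see the check above).
set_module_tensor_to_device(layer, "weight", "cpu", value=torch.zeros(4, 4))
set_module_tensor_to_device(layer, "bias", "cpu", value=torch.zeros(4))
print(layer.weight.device)  # cpu
```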
@ -603,7 +616,65 @@ def check_tied_parameters_on_same_device(tied_params, device_map):
|
||||
)
|
||||
|
||||
|
||||
def find_tied_parameters(model: nn.Module, **kwargs):
|
||||
def _get_named_modules(
|
||||
module: torch.nn.Module,
|
||||
memo: Optional[Set[torch.nn.Module]] = None,
|
||||
prefix: str = "",
|
||||
remove_duplicate: bool = True,
|
||||
):
|
||||
"""
|
||||
Return an iterator over all modules in the network, yielding both the name of the module as well as the module
|
||||
itself. Copied from PyTorch `torch.nn.Module.named_modules` for compatibility with torch < 2.0 versions, with the
|
||||
`remove_duplicate` option added.
|
||||
|
||||
Args:
|
||||
memo (set of `torch.nn.Module`, *optional*):
|
||||
A memo to store the set of modules already added to the result
|
||||
prefix (`str`, *optional*):
|
||||
A prefix that will be added to the name of the module
|
||||
remove_duplicate (`bool`, *optional*):
|
||||
Whether to remove the duplicated module instances in the result or not
|
||||
|
||||
Yields:
|
||||
(str, Module): Tuple of name and module
|
||||
|
||||
Note:
|
||||
Duplicate modules are returned only once. In the following example, ``l`` will be returned only once.
|
||||
"""
|
||||
if memo is None:
|
||||
memo = set()
|
||||
if module not in memo:
|
||||
if remove_duplicate:
|
||||
memo.add(module)
|
||||
yield prefix, module
|
||||
for name, sub_module in module._modules.items():
|
||||
if sub_module is None:
|
||||
continue
|
||||
submodule_prefix = prefix + ("." if prefix else "") + name
|
||||
yield from _get_named_modules(sub_module, memo, submodule_prefix, remove_duplicate)
|
||||
|
||||
|
||||
def _get_named_parameters(module: torch.nn.Module, prefix="", recurse=True, remove_duplicate: bool = True):
|
||||
"""
|
||||
Help yield various names + members of modules. Copied from PyTorch `torch.nn.Module.named_parameters` for
compatibility with torch < 2.0 versions, with the `remove_duplicate` option added.
|
||||
"""
|
||||
memo = set()
|
||||
modules = (
|
||||
_get_named_modules(module, prefix=prefix, remove_duplicate=remove_duplicate) if recurse else [(prefix, module)]
|
||||
)
|
||||
for module_prefix, module in modules:
|
||||
members = module._parameters.items()
|
||||
for k, v in members:
|
||||
if v is None or v in memo:
|
||||
continue
|
||||
if remove_duplicate:
|
||||
memo.add(v)
|
||||
name = module_prefix + ("." if module_prefix else "") + k
|
||||
yield name, v
|
||||
|
||||
|
||||
def find_tied_parameters(model: torch.nn.Module, **kwargs):
|
||||
"""
|
||||
Find the tied parameters in a given model.
|
||||
|
||||
@ -632,33 +703,32 @@ def find_tied_parameters(model: nn.Module, **kwargs):
|
||||
[['linear1.weight', 'linear2.weight']]
|
||||
```
|
||||
"""
|
||||
# Initialize result and named_parameters before recursing.
|
||||
named_parameters = kwargs.get("named_parameters", None)
|
||||
prefix = kwargs.get("prefix", "")
|
||||
result = kwargs.get("result", {})
|
||||
|
||||
if named_parameters is None:
|
||||
named_parameters = {n: p for n, p in model.named_parameters()}
|
||||
else:
|
||||
# A tied parameter will not be in the full `named_parameters` seen above but will be in the `named_parameters`
|
||||
# of the submodule it belongs to. So while recursing we track the names that are not in the initial
|
||||
# `named_parameters`.
|
||||
for name, parameter in model.named_parameters():
|
||||
full_name = name if prefix == "" else f"{prefix}.{name}"
|
||||
if full_name not in named_parameters:
|
||||
# When we find one, it has to be one of the existing parameters.
|
||||
for new_name, new_param in named_parameters.items():
|
||||
if new_param is parameter:
|
||||
if new_name not in result:
|
||||
result[new_name] = []
|
||||
result[new_name].append(full_name)
|
||||
# get ALL model parameters and their names
|
||||
all_named_parameters = {name: param for name, param in _get_named_parameters(model, remove_duplicate=False)}
|
||||
|
||||
# Once we have treated direct parameters, we move to the child modules.
|
||||
for name, child in model.named_children():
|
||||
child_name = name if prefix == "" else f"{prefix}.{name}"
|
||||
find_tied_parameters(child, named_parameters=named_parameters, prefix=child_name, result=result)
|
||||
# get ONLY unique named parameters,
|
||||
# if a parameter is tied and has multiple names, it will be included only once
|
||||
no_duplicate_named_parameters = {
|
||||
name: param for name, param in _get_named_parameters(model, remove_duplicate=True)
|
||||
}
|
||||
|
||||
return FindTiedParametersResult([sorted([weight] + list(set(tied))) for weight, tied in result.items()])
|
||||
# the difference of the two sets will give us the tied parameters
|
||||
tied_param_names = set(all_named_parameters.keys()) - set(no_duplicate_named_parameters.keys())
|
||||
|
||||
# 'tied_param_names' contains the names of parameters that are tied in the model, but we do not know
|
||||
# which names refer to the same parameter. To identify this, we need to group them together.
|
||||
tied_param_groups = {}
|
||||
for tied_param_name in tied_param_names:
|
||||
tied_param = all_named_parameters[tied_param_name]
|
||||
for param_name, param in no_duplicate_named_parameters.items():
|
||||
# compare if parameters are the same; if so, group their names together
|
||||
if param is tied_param:
|
||||
if param_name not in tied_param_groups:
|
||||
tied_param_groups[param_name] = []
|
||||
tied_param_groups[param_name].append(tied_param_name)
|
||||
|
||||
return FindTiedParametersResult([sorted([weight] + list(set(tied))) for weight, tied in tied_param_groups.items()])
|
||||
|
||||
|
||||
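The rewrite above finds tied weights by comparing the parameter names seen with and without duplicate removal. A toy illustration of that idea in plain PyTorch (assuming torch >= 2.0, where `named_parameters` already accepts `remove_duplicate`):

```python
import torch.nn as nn


class TinyTied(nn.Module):
    def __init__(self):
        super().__init__()
        self.linear1 = nn.Linear(4, 4)
        self.linear2 = nn.Linear(4, 4)
        self.linear2.weight = self.linear1.weight  # tie the weights


model = TinyTied()
all_names = {name for name, _ in model.named_parameters(remove_duplicate=False)}
unique_names = {name for name, _ in model.named_parameters()}  # duplicates dropped by default
print(sorted(all_names - unique_names))  # ['linear2.weight'] -> the tied alias
```

`find_tied_parameters` then groups each such alias with the canonical name that owns the shared tensor.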
def retie_parameters(model, tied_params):
|
||||
@ -803,27 +873,48 @@ def get_max_memory(max_memory: Optional[Dict[Union[int, str], Union[int, str]]]
|
||||
import psutil
|
||||
|
||||
if max_memory is None:
|
||||
if not (torch.cuda.is_available() or is_npu_available() or is_mlu_available() or is_xpu_available()):
|
||||
max_memory = {}
|
||||
|
||||
else:
|
||||
# Make sure CUDA is initialized on each GPU to have the right memory info.
|
||||
if is_npu_available():
|
||||
for i in range(torch.npu.device_count()):
|
||||
max_memory = {}
|
||||
# Make sure CUDA is initialized on each GPU to have the right memory info.
|
||||
if is_npu_available():
|
||||
for i in range(torch.npu.device_count()):
|
||||
try:
|
||||
_ = torch.tensor(0, device=torch.device("npu", i))
|
||||
max_memory = {i: torch.npu.mem_get_info(i)[0] for i in range(torch.npu.device_count())}
|
||||
elif is_mlu_available():
|
||||
for i in range(torch.mlu.device_count()):
|
||||
max_memory[i] = torch.npu.mem_get_info(i)[0]
|
||||
except Exception:
|
||||
logger.info(f"Device {i} seems unavailable, Proceeding to check subsequent devices.")
|
||||
continue
|
||||
elif is_mlu_available():
|
||||
for i in range(torch.mlu.device_count()):
|
||||
try:
|
||||
_ = torch.tensor(0, device=torch.device("mlu", i))
|
||||
max_memory = {i: torch.mlu.mem_get_info(i)[0] for i in range(torch.mlu.device_count())}
|
||||
elif is_xpu_available():
|
||||
for i in range(torch.xpu.device_count()):
|
||||
max_memory[i] = torch.mlu.mem_get_info(i)[0]
|
||||
except Exception:
|
||||
logger.info(f"Device {i} seems unavailable, Proceeding to check subsequent devices.")
|
||||
continue
|
||||
elif is_musa_available():
|
||||
for i in range(torch.musa.device_count()):
|
||||
try:
|
||||
_ = torch.tensor(0, device=torch.device("musa", i))
|
||||
max_memory[i] = torch.musa.mem_get_info(i)[0]
|
||||
except Exception:
|
||||
logger.info(f"Device {i} seems unavailable, Proceeding to check subsequent devices.")
|
||||
continue
|
||||
elif is_xpu_available():
|
||||
for i in range(torch.xpu.device_count()):
|
||||
try:
|
||||
_ = torch.tensor(0, device=torch.device("xpu", i))
|
||||
max_memory = {i: torch.xpu.max_memory_allocated(i) for i in range(torch.xpu.device_count())}
|
||||
else:
|
||||
for i in range(torch.cuda.device_count()):
|
||||
max_memory[i] = torch.xpu.max_memory_allocated(i)
|
||||
except Exception:
|
||||
logger.info(f"Device {i} seems unavailable, Proceeding to check subsequent devices.")
|
||||
continue
|
||||
else:
|
||||
for i in range(torch.cuda.device_count()):
|
||||
try:
|
||||
_ = torch.tensor([0], device=i)
|
||||
max_memory = {i: torch.cuda.mem_get_info(i)[0] for i in range(torch.cuda.device_count())}
|
||||
max_memory[i] = torch.cuda.mem_get_info(i)[0]
|
||||
except Exception:
|
||||
logger.info(f"Device {i} seems unavailable, Proceeding to check subsequent devices.")
|
||||
continue
|
||||
# allocate everything in the mps device as the RAM is shared
|
||||
if is_mps_available():
|
||||
max_memory["mps"] = psutil.virtual_memory().available
|
||||
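In user terms, the reworked helper now probes each accelerator index individually and skips devices that error out instead of failing outright. A short sketch:

```python
from accelerate.utils import get_max_memory

# Returns per-device budgets in bytes, e.g. {0: <free bytes>, 1: ..., "cpu": <available RAM>}
# (plus an "mps" entry on Apple silicon). Unreachable devices are simply omitted.
max_memory = get_max_memory()
print(max_memory)

# The same dict shape is what get_balanced_memory / infer_auto_device_map consume;
# string budgets are also accepted, e.g. get_max_memory({0: "10GiB", "cpu": "30GiB"}).
```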
@ -844,6 +935,8 @@ def get_max_memory(max_memory: Optional[Dict[Union[int, str], Union[int, str]]]
|
||||
num_devices = torch.npu.device_count()
|
||||
elif is_mlu_available():
|
||||
num_devices = torch.mlu.device_count()
|
||||
elif is_musa_available():
|
||||
num_devices = torch.musa.device_count()
|
||||
elif is_xpu_available():
|
||||
num_devices = torch.xpu.device_count()
|
||||
else:
|
||||
@ -916,6 +1009,17 @@ def load_offloaded_weights(model, index, offload_folder):
|
||||
set_module_tensor_to_device(model, param_name, "cpu", value=weight, fp16_statistics=fp16_statistics)
|
||||
|
||||
|
||||
def get_module_leaves(module_sizes):
    module_children = {}
    for module in module_sizes:
        if module == "" or "." not in module:
            continue
        parent = module.rsplit(".", 1)[0]
        module_children[parent] = module_children.get(parent, 0) + 1
    leaves = [module for module in module_sizes if module_children.get(module, 0) == 0 and module != ""]
    return leaves
|
||||
|
||||
|
||||
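A toy run of the new helper, using a hypothetical module-size dict keyed by dotted module names:

```python
module_sizes = {
    "": 200,
    "encoder": 120,
    "encoder.layer0": 60,
    "encoder.layer1": 60,
    "head": 80,
}
# Entries that never appear as the parent of another entry are the leaves.
print(get_module_leaves(module_sizes))  # ['encoder.layer0', 'encoder.layer1', 'head']
```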
def get_balanced_memory(
|
||||
model: nn.Module,
|
||||
max_memory: Optional[Dict[Union[int, str], Union[int, str]]] = None,
|
||||
@ -957,23 +1061,16 @@ def get_balanced_memory(
|
||||
max_memory = get_max_memory(max_memory)
|
||||
|
||||
if is_npu_available():
|
||||
num_devices = len([d for d in max_memory if torch.device(d).type == "npu" and max_memory[d] > 0])
|
||||
expected_device_type = "npu"
|
||||
elif is_mlu_available():
|
||||
num_devices = len([d for d in max_memory if torch.device(d).type == "mlu" and max_memory[d] > 0])
|
||||
expected_device_type = "mlu"
|
||||
elif is_musa_available():
|
||||
expected_device_type = "musa"
|
||||
elif is_xpu_available():
|
||||
num_devices = len(
|
||||
[
|
||||
d
|
||||
for d in max_memory
|
||||
if (
|
||||
d != "cpu"
|
||||
and (torch.device(d).type == "xpu" or torch.xpu.get_device_properties(d).dev_type == "gpu")
|
||||
)
|
||||
and max_memory[d] > 0
|
||||
]
|
||||
)
|
||||
expected_device_type = "xpu"
|
||||
else:
|
||||
num_devices = len([d for d in max_memory if torch.device(d).type == "cuda" and max_memory[d] > 0])
|
||||
expected_device_type = "cuda"
|
||||
num_devices = len([d for d in max_memory if torch.device(d).type == expected_device_type and max_memory[d] > 0])
|
||||
|
||||
if num_devices == 0:
|
||||
return max_memory
|
||||
@ -1025,10 +1122,10 @@ def get_balanced_memory(
|
||||
buffer = 0
|
||||
|
||||
# Compute mean of final modules. In the first dict of module sizes, leaves are the parameters
|
||||
leaves = [n for n in module_sizes if len([p for p in module_sizes if n == "" or p.startswith(n + ".")]) == 0]
|
||||
leaves = get_module_leaves(module_sizes)
|
||||
module_sizes = {n: v for n, v in module_sizes.items() if n not in leaves}
|
||||
# Once removed, leaves are the final modules.
|
||||
leaves = [n for n in module_sizes if len([p for p in module_sizes if n == "" or p.startswith(n + ".")]) == 0]
|
||||
leaves = get_module_leaves(module_sizes)
|
||||
mean_leaves = int(sum([module_sizes[n] for n in leaves]) / max(len(leaves), 1))
|
||||
buffer = int(1.25 * max(buffer, mean_leaves))
|
||||
per_gpu += buffer
|
||||
@ -1432,7 +1529,15 @@ def load_state_dict(checkpoint_file, device_map=None):
|
||||
else:
|
||||
# if we only have one device we can load everything directly
|
||||
if len(set(device_map.values())) == 1:
|
||||
return safe_load_file(checkpoint_file, device=list(device_map.values())[0])
|
||||
device = list(device_map.values())[0]
|
||||
target_device = device
|
||||
if is_xpu_available():
|
||||
if compare_versions("safetensors", "<", "0.4.2"):
|
||||
raise ImportError("Safetensors version must be >= 0.4.2 for XPU. Please upgrade safetensors.")
|
||||
if isinstance(device, int):
|
||||
target_device = f"xpu:{device}"
|
||||
|
||||
return safe_load_file(checkpoint_file, device=target_device)
|
||||
|
||||
devices = list(set(device_map.values()) - {"disk"})
|
||||
# cpu device should always exist as fallback option
|
||||
@ -1462,15 +1567,9 @@ def load_state_dict(checkpoint_file, device_map=None):
|
||||
progress_bar = None
|
||||
for device in devices:
|
||||
target_device = device
|
||||
|
||||
if is_xpu_available():
|
||||
current_safetensors_version = packaging.version.parse(importlib.metadata.version("safetensors"))
|
||||
|
||||
if compare_versions(current_safetensors_version, "<", "0.4.2"):
|
||||
raise ModuleNotFoundError(
|
||||
f"You need at least safetensors 0.4.2 for Intel GPU, while you have {current_safetensors_version}"
|
||||
)
|
||||
|
||||
if compare_versions("safetensors", "<", "0.4.2"):
|
||||
raise ImportError("Safetensors version must be >= 0.4.2 for XPU. Please upgrade safetensors.")
|
||||
if isinstance(device, int):
|
||||
target_device = f"xpu:{device}"
|
||||
|
||||
@ -1538,6 +1637,49 @@ def get_state_dict_offloaded_model(model: nn.Module):
|
||||
return state_dict
def get_state_dict_from_offload(
|
||||
module: nn.Module,
|
||||
module_name: str,
|
||||
state_dict: Dict[str, Union[str, torch.tensor]],
|
||||
device_to_put_offload: Union[int, str, torch.device] = "cpu",
|
||||
):
|
||||
"""
|
||||
Retrieve the state dictionary (with parameters) from an offloaded module and load into a specified device (defaults
|
||||
to cpu).
|
||||
|
||||
Args:
|
||||
module: (`torch.nn.Module`):
|
||||
The module we want to retrieve a state dictionary from
|
||||
module_name: (`str`):
|
||||
The name of the module of interest
|
||||
state_dict (`Dict[str, Union[int, str, torch.device]]`):
|
||||
Dictionary of {module names: parameters}
|
||||
device_to_put_offload (`Union[int, str, torch.device]`):
|
||||
Device to load offloaded parameters into, defaults to the cpu.
|
||||
"""
|
||||
from ..hooks import AlignDevicesHook
|
||||
|
||||
root = module_name[: module_name.rfind(".")] # module name without .weight or .bias
|
||||
preforward = False
|
||||
if hasattr(module, "_hf_hook") and isinstance(module._hf_hook, AlignDevicesHook) and module._hf_hook.offload:
|
||||
# assign the device to which the offloaded parameters will be sent
|
||||
original_device = module._hf_hook.execution_device
|
||||
module._hf_hook.execution_device = device_to_put_offload
|
||||
module._hf_hook.pre_forward(module)
|
||||
preforward = True
|
||||
|
||||
for m_key in module.state_dict():
|
||||
params = module.state_dict()[m_key]
|
||||
if (root + f".{m_key}") in state_dict:
|
||||
state_dict[root + f".{m_key}"] = params
|
||||
|
||||
if preforward:
|
||||
module._hf_hook.post_forward(module, torch.tensor([]))
|
||||
module._hf_hook.execution_device = original_device
|
||||
|
||||
return state_dict
|
||||
|
||||
|
||||
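A hedged sketch of the new helper, mirroring the test added later in this diff. It assumes `model.linear2` was dispatched with disk or CPU offload, and uses the internal `accelerate.utils.modeling` path from this file:

```python
from accelerate.utils.modeling import get_state_dict_from_offload

state_dict = {"linear2.weight": ""}  # placeholder entry to be filled by the helper
state_dict = get_state_dict_from_offload(
    model.linear2, "linear2.weight", state_dict, device_to_put_offload="cpu"
)
print(state_dict["linear2.weight"].device)  # cpu
```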
def load_checkpoint_in_model(
|
||||
model: nn.Module,
|
||||
checkpoint: Union[str, os.PathLike],
|
||||
@ -1790,6 +1932,7 @@ def get_mixed_precision_context_manager(native_amp: bool = False, autocast_kwarg
|
||||
DistributedType.MULTI_CPU,
|
||||
DistributedType.MULTI_GPU,
|
||||
DistributedType.MULTI_MLU,
|
||||
DistributedType.MULTI_MUSA,
|
||||
DistributedType.MULTI_NPU,
|
||||
DistributedType.MULTI_XPU,
|
||||
DistributedType.FSDP,
|
||||
|
||||
@ -17,12 +17,13 @@ A set of basic tensor ops compatible with tpu, gpu, and multigpu
|
||||
|
||||
import pickle
|
||||
import warnings
|
||||
from contextlib import contextmanager, nullcontext
|
||||
from functools import update_wrapper, wraps
|
||||
from typing import Any, Mapping
|
||||
|
||||
import torch
|
||||
|
||||
from ..state import PartialState
|
||||
from ..state import AcceleratorState, PartialState
|
||||
from .constants import TORCH_DISTRIBUTED_OPERATION_TYPES
|
||||
from .dataclasses import DistributedType, TensorInformation
|
||||
from .imports import (
|
||||
@ -151,9 +152,6 @@ def send_to_device(tensor, device, non_blocking=False, skip_keys=None):
|
||||
device = "npu:0"
|
||||
if device == "xpu":
|
||||
device = "xpu:0"
|
||||
# TODO: torch_mlu LongTensor.to(<int num>) has bugs, we will fix this later.
|
||||
if is_torch_tensor(tensor) and tensor.device.type in ["mlu"] and tensor.dtype in [torch.int64]:
|
||||
tensor = tensor.cpu()
|
||||
try:
|
||||
return tensor.to(device, non_blocking=non_blocking)
|
||||
except TypeError: # .to() doesn't accept non_blocking as kwarg
|
||||
@ -846,3 +844,25 @@ def find_device(data):
|
||||
return device
|
||||
elif isinstance(data, torch.Tensor):
|
||||
return data.device
|
||||
|
||||
|
||||
@contextmanager
|
||||
def GatheredParameters(params, modifier_rank=None, fwd_module=None, enabled=True):
|
||||
"""
|
||||
Wrapper around `deepspeed.runtime.zero.GatheredParameters`, but if Zero-3 is not enabled, will be a no-op context
|
||||
manager.
|
||||
"""
|
||||
# We need to use the `AcceleratorState` here since it has access to the deepspeed plugin
|
||||
if AcceleratorState().distributed_type != DistributedType.DEEPSPEED or (
|
||||
AcceleratorState().deepspeed_plugin is not None
|
||||
and not AcceleratorState().deepspeed_plugin.is_zero3_init_enabled()
|
||||
):
|
||||
gather_param_context = nullcontext()
|
||||
else:
|
||||
import deepspeed
|
||||
|
||||
gather_param_context = deepspeed.zero.GatheredParameters(
|
||||
params, modifier_rank=modifier_rank, fwd_module=fwd_module, enabled=enabled
|
||||
)
|
||||
with gather_param_context:
|
||||
yield
|
||||
|
||||
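A hedged usage sketch of the wrapper above. It assumes a script launched through Accelerate with a prepared `model` that has a `linear` submodule; under anything other than DeepSpeed ZeRO-3 the context is a no-op:

```python
from accelerate.utils.operations import GatheredParameters  # location per this diff

with GatheredParameters([model.linear.weight], modifier_rank=0):
    # Under ZeRO-3 the sharded parameter is materialized here, rank 0 can modify it in
    # place, and the update is re-partitioned on exit. Otherwise the block runs as-is.
    model.linear.weight.data.fill_(0.0)
```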
@ -21,7 +21,7 @@ import torch
|
||||
from ..state import AcceleratorState
|
||||
from .constants import CUDA_DISTRIBUTED_TYPES
|
||||
from .dataclasses import DistributedType, RNGType
|
||||
from .imports import is_mlu_available, is_npu_available, is_torch_xla_available, is_xpu_available
|
||||
from .imports import is_mlu_available, is_musa_available, is_npu_available, is_torch_xla_available, is_xpu_available
|
||||
|
||||
|
||||
if is_torch_xla_available():
|
||||
@ -51,6 +51,8 @@ def set_seed(seed: int, device_specific: bool = False, deterministic: bool = Fal
|
||||
torch.npu.manual_seed_all(seed)
|
||||
elif is_mlu_available():
|
||||
torch.mlu.manual_seed_all(seed)
|
||||
elif is_musa_available():
|
||||
torch.musa.manual_seed_all(seed)
|
||||
else:
|
||||
torch.cuda.manual_seed_all(seed)
|
||||
# ^^ safe to call this function even if cuda is not available
|
||||
@ -76,6 +78,9 @@ def synchronize_rng_state(rng_type: Optional[RNGType] = None, generator: Optiona
|
||||
elif rng_type == RNGType.MLU:
|
||||
assert is_mlu_available(), "Can't synchronize MLU seeds on an environment without MLUs."
|
||||
rng_state = torch.mlu.get_rng_state()
|
||||
elif rng_type == RNGType.MUSA:
|
||||
assert is_musa_available(), "Can't synchronize MUSA seeds on an environment without MUSAs."
|
||||
rng_state = torch.musa.get_rng_state()
|
||||
elif rng_type == RNGType.XPU:
|
||||
assert is_xpu_available(), "Can't synchronize XPU seeds on an environment without XPUs."
|
||||
rng_state = torch.xpu.get_rng_state()
|
||||
@ -93,6 +98,7 @@ def synchronize_rng_state(rng_type: Optional[RNGType] = None, generator: Optiona
|
||||
elif (
|
||||
state.distributed_type in CUDA_DISTRIBUTED_TYPES
|
||||
or state.distributed_type == DistributedType.MULTI_MLU
|
||||
or state.distributed_type == DistributedType.MULTI_MUSA
|
||||
or state.distributed_type == DistributedType.MULTI_NPU
|
||||
or state.distributed_type == DistributedType.MULTI_XPU
|
||||
):
|
||||
@ -111,6 +117,8 @@ def synchronize_rng_state(rng_type: Optional[RNGType] = None, generator: Optiona
|
||||
torch.npu.set_rng_state(rng_state)
|
||||
elif rng_type == RNGType.MLU:
|
||||
torch.mlu.set_rng_state(rng_state)
|
||||
elif rng_type == RNGType.MUSA:
|
||||
torch.musa.set_rng_state(rng_state)
|
||||
elif rng_type == RNGType.XPU:
|
||||
torch.xpu.set_rng_state(rng_state)
|
||||
elif rng_type == RNGType.XLA:
|
||||
|
||||
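For completeness, the user-facing call these MUSA branches feed into:

```python
from accelerate.utils import set_seed

# Seeds Python, NumPy and torch; per the diff above this now also covers torch.musa,
# alongside the existing CUDA/NPU/MLU/XPU/XLA paths.
set_seed(42)
```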
@ -12,9 +12,12 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from types import MethodType
|
||||
|
||||
import torch.nn as nn
|
||||
|
||||
from .imports import is_fp8_available
|
||||
from .operations import GatheredParameters
|
||||
|
||||
|
||||
if is_fp8_available():
|
||||
@ -29,22 +32,28 @@ def convert_model(model, to_transformer_engine=True, _convert_linear=True, _conv
|
||||
raise ImportError("Using `convert_model` requires transformer_engine to be installed.")
|
||||
for name, module in model.named_children():
|
||||
if isinstance(module, nn.Linear) and to_transformer_engine and _convert_linear:
|
||||
# Return early if the linear layer weights are not multiples of 16
|
||||
if any(p % 16 != 0 for p in module.weight.shape):
|
||||
return
|
||||
has_bias = module.bias is not None
|
||||
te_module = te.Linear(
|
||||
module.in_features, module.out_features, bias=has_bias, params_dtype=module.weight.dtype
|
||||
)
|
||||
te_module.weight.copy_(module.weight)
|
||||
params_to_gather = [module.weight]
|
||||
if has_bias:
|
||||
te_module.bias.copy_(module.bias)
|
||||
params_to_gather.append(module.bias)
|
||||
|
||||
setattr(model, name, te_module)
|
||||
with GatheredParameters(params_to_gather, modifier_rank=0):
|
||||
if any(p % 16 != 0 for p in module.weight.shape):
|
||||
return
|
||||
te_module = te.Linear(
|
||||
module.in_features, module.out_features, bias=has_bias, params_dtype=module.weight.dtype
|
||||
)
|
||||
te_module.weight.copy_(module.weight)
|
||||
if has_bias:
|
||||
te_module.bias.copy_(module.bias)
|
||||
|
||||
setattr(model, name, te_module)
|
||||
# Note: @xrsrke (Phuc) found that te.LayerNorm doesn't have any real memory savings or speedups over nn.LayerNorm
|
||||
elif isinstance(module, nn.LayerNorm) and to_transformer_engine and _convert_ln:
|
||||
te_module = te.LayerNorm(module.normalized_shape[0], eps=module.eps, params_dtype=module.weight.dtype)
|
||||
te_module.weight.copy_(module.weight)
|
||||
te_module.bias.copy_(module.bias)
|
||||
with GatheredParameters([module.weight, module.bias], modifier_rank=0):
|
||||
te_module = te.LayerNorm(module.normalized_shape[0], eps=module.eps, params_dtype=module.weight.dtype)
|
||||
te_module.weight.copy_(module.weight)
|
||||
te_module.bias.copy_(module.bias)
|
||||
|
||||
setattr(model, name, te_module)
|
||||
elif isinstance(module, te.Linear) and not to_transformer_engine and _convert_linear:
|
||||
@ -82,3 +91,43 @@ def has_transformer_engine_layers(model):
|
||||
if isinstance(m, (te.LayerNorm, te.Linear, te.TransformerLayer)):
|
||||
return True
|
||||
return False
|
||||
|
||||
|
||||
def contextual_fp8_autocast(model_forward, fp8_recipe, use_during_eval=False):
|
||||
"""
|
||||
Wrapper for a model's forward method to apply FP8 autocast. Is context aware, meaning that by default it will
|
||||
disable FP8 autocast during eval mode, which is generally better for more accurate metrics.
|
||||
"""
|
||||
from transformer_engine.pytorch import fp8_autocast
|
||||
|
||||
def forward(self, *args, **kwargs):
|
||||
enabled = use_during_eval or self.training
|
||||
with fp8_autocast(enabled=enabled, fp8_recipe=fp8_recipe):
|
||||
return model_forward(*args, **kwargs)
|
||||
|
||||
# To act like a decorator so that it can be popped when doing `extract_model_from_parallel`
|
||||
forward.__wrapped__ = model_forward
|
||||
|
||||
return forward
|
||||
|
||||
|
||||
def apply_fp8_autowrap(model, fp8_recipe_handler):
|
||||
"""
|
||||
Applies FP8 context manager to the model's forward method
|
||||
"""
|
||||
# Import here to keep base imports fast
|
||||
import transformer_engine.common.recipe as te_recipe
|
||||
|
||||
kwargs = fp8_recipe_handler.to_kwargs() if fp8_recipe_handler is not None else {}
|
||||
if "fp8_format" in kwargs:
|
||||
kwargs["fp8_format"] = getattr(te_recipe.Format, kwargs["fp8_format"])
|
||||
use_during_eval = kwargs.pop("use_autocast_during_eval", False)
|
||||
fp8_recipe = te_recipe.DelayedScaling(**kwargs)
|
||||
new_forward = contextual_fp8_autocast(model.forward, fp8_recipe, use_during_eval)
|
||||
|
||||
if hasattr(model.forward, "__func__"):
|
||||
model.forward = MethodType(new_forward, model)
|
||||
else:
|
||||
model.forward = new_forward
|
||||
|
||||
return model
|
||||
|
||||
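A hedged sketch of how the helpers above fit together. It assumes transformer_engine is installed and uses `FP8RecipeKwargs`, Accelerate's public kwargs handler for FP8 recipes; everything else follows the code in this diff:

```python
import torch

from accelerate.utils import FP8RecipeKwargs

model = torch.nn.Linear(16, 16)  # dimensions are multiples of 16, so conversion is allowed
recipe_handler = FP8RecipeKwargs(backend="TE")

with torch.no_grad():
    # Swap nn.Linear / nn.LayerNorm for transformer_engine modules; the GatheredParameters
    # context added above makes this safe under DeepSpeed ZeRO-3 as well.
    convert_model(model)

# Wrap forward in fp8_autocast; by default the autocast is disabled in eval mode
# (the "use_autocast_during_eval" flag popped above).
model = apply_fp8_autowrap(model, recipe_handler)
```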
@ -15,11 +15,20 @@
|
||||
import importlib.metadata
|
||||
from typing import Union
|
||||
|
||||
from packaging.version import Version, parse
|
||||
from packaging.version import Version
|
||||
from packaging.version import parse as _parse
|
||||
|
||||
from .constants import STR_OPERATION_TO_FUNC
|
||||
|
||||
|
||||
def parse(version: str):
|
||||
"""
|
||||
Same as `packaging.version.parse`, but grabs strictly the base version.
|
||||
"""
|
||||
version = _parse(version)
|
||||
return _parse(version.base_version)
|
||||
|
||||
|
||||
torch_version = parse(importlib.metadata.version("torch"))
|
||||
|
||||
|
||||
|
||||
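In short, the new `parse` wrapper drops pre-release, dev, and local suffixes so that version gates compare base versions only. A self-contained illustration that mirrors the function above:

```python
from packaging.version import parse as _parse


def parse(version: str):
    return _parse(_parse(version).base_version)


print(parse("2.4.0.dev20240101"))                     # 2.4.0
print(parse("2.3.1+cu121"))                           # 2.3.1
print(parse("2.4.0.dev20240101") >= _parse("2.4.0"))  # True, whereas a plain parse() would say False
```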
@ -638,7 +638,7 @@ class DeepSpeedConfigIntegration(AccelerateTestCase):
|
||||
|
||||
assert config["gradient_clipping"] == 1.0
|
||||
assert config["zero_optimization"]["reduce_bucket_size"] == (hidden_size * hidden_size)
|
||||
assert config["zero_optimization"]["stage3_prefetch_bucket_size"] == ((0.9 * hidden_size) * hidden_size)
|
||||
assert config["zero_optimization"]["stage3_prefetch_bucket_size"] == int((0.9 * hidden_size) * hidden_size)
|
||||
assert config["zero_optimization"]["stage3_param_persistence_threshold"] == (10 * hidden_size)
|
||||
assert not config["zero_optimization"]["stage3_gather_16bit_weights_on_model_save"]
|
||||
|
||||
@ -695,7 +695,7 @@ class DeepSpeedConfigIntegration(AccelerateTestCase):
|
||||
)
|
||||
zero_opt = accelerator.deepspeed_config["zero_optimization"]
|
||||
assert zero_opt["reduce_bucket_size"] == (hidden_size * hidden_size)
|
||||
assert zero_opt["stage3_prefetch_bucket_size"] == (0.9 * hidden_size) * hidden_size
|
||||
assert zero_opt["stage3_prefetch_bucket_size"] == int((0.9 * hidden_size) * hidden_size)
|
||||
assert zero_opt["stage3_param_persistence_threshold"] == (10 * hidden_size)
@parameterized.expand([FP16, BF16], name_func=parameterized_custom_name_func)
|
||||
@ -992,6 +992,11 @@ class DeepSpeedIntegrationTest(TempDirTestCase):
|
||||
execute_subprocess_async(cmd_stage)
|
||||
|
||||
def test_peak_memory_usage(self):
|
||||
if compare_versions("deepspeed", ">", "0.12.6"):
|
||||
self.skipTest(
|
||||
"The test fails when deepspeed>0.12.6. This is something that needs to be fixed on deepspeed library"
|
||||
)
|
||||
|
||||
self.test_file_path = self.test_scripts_folder / "test_peak_memory_usage.py"
|
||||
cmd = [
|
||||
"accelerate",
|
||||
|
||||
@ -13,6 +13,7 @@
|
||||
# limitations under the License.
|
||||
|
||||
|
||||
import functools
|
||||
import os
|
||||
|
||||
import torch
|
||||
@ -41,6 +42,7 @@ from accelerate.utils.constants import (
|
||||
FSDP_STATE_DICT_TYPE,
|
||||
)
|
||||
from accelerate.utils.dataclasses import FullyShardedDataParallelPlugin
|
||||
from accelerate.utils.fsdp_utils import disable_fsdp_ram_efficient_loading, enable_fsdp_ram_efficient_loading
|
||||
from accelerate.utils.other import patch_environment
|
||||
|
||||
|
||||
@ -60,7 +62,6 @@ class FSDPPluginIntegration(AccelerateTestCase):
|
||||
super().setUp()
|
||||
|
||||
self.dist_env = dict(
|
||||
ACCELERATE_USE_FSDP="true",
|
||||
MASTER_ADDR="localhost",
|
||||
MASTER_PORT="10999",
|
||||
RANK="0",
|
||||
@ -68,43 +69,58 @@ class FSDPPluginIntegration(AccelerateTestCase):
|
||||
WORLD_SIZE="1",
|
||||
)
|
||||
|
||||
self.fsdp_env = dict(ACCELERATE_USE_FSDP="true", **self.dist_env)
|
||||
|
||||
def test_sharding_strategy(self):
|
||||
from torch.distributed.fsdp.fully_sharded_data_parallel import ShardingStrategy
|
||||
|
||||
# check that giving enums works fine
|
||||
for i, strategy in enumerate(FSDP_SHARDING_STRATEGY):
|
||||
env = self.dist_env.copy()
|
||||
env = self.fsdp_env.copy()
|
||||
env["FSDP_SHARDING_STRATEGY"] = f"{i + 1}"
|
||||
with mockenv_context(**env):
|
||||
fsdp_plugin = FullyShardedDataParallelPlugin()
|
||||
assert fsdp_plugin.sharding_strategy == ShardingStrategy(i + 1)
|
||||
fsdp_plugin = FullyShardedDataParallelPlugin(sharding_strategy=ShardingStrategy(i + 1))
|
||||
assert fsdp_plugin.sharding_strategy == ShardingStrategy(i + 1)
|
||||
|
||||
# check that giving names works fine
|
||||
for i, strategy in enumerate(FSDP_SHARDING_STRATEGY):
|
||||
env = self.dist_env.copy()
|
||||
env = self.fsdp_env.copy()
|
||||
env["FSDP_SHARDING_STRATEGY"] = strategy
|
||||
with mockenv_context(**env):
|
||||
fsdp_plugin = FullyShardedDataParallelPlugin()
|
||||
assert fsdp_plugin.sharding_strategy == ShardingStrategy(i + 1)
|
||||
fsdp_plugin = FullyShardedDataParallelPlugin(sharding_strategy=strategy)
|
||||
assert fsdp_plugin.sharding_strategy == ShardingStrategy(i + 1)
|
||||
|
||||
def test_backward_prefetch(self):
|
||||
from torch.distributed.fsdp.fully_sharded_data_parallel import BackwardPrefetch
|
||||
|
||||
for i, prefetch_policy in enumerate(FSDP_BACKWARD_PREFETCH):
|
||||
env = self.dist_env.copy()
|
||||
expected_value = None if prefetch_policy == "NO_PREFETCH" else BackwardPrefetch(i + 1)
|
||||
env = self.fsdp_env.copy()
|
||||
env["FSDP_BACKWARD_PREFETCH"] = prefetch_policy
|
||||
with mockenv_context(**env):
|
||||
fsdp_plugin = FullyShardedDataParallelPlugin()
|
||||
if prefetch_policy == "NO_PREFETCH":
|
||||
assert fsdp_plugin.backward_prefetch is None
|
||||
else:
|
||||
assert fsdp_plugin.backward_prefetch == BackwardPrefetch(i + 1)
|
||||
assert (
|
||||
fsdp_plugin.backward_prefetch == expected_value
|
||||
), f"Actual: {fsdp_plugin.backward_prefetch} != Expected: {expected_value}"
|
||||
|
||||
# Check if torch enum works
|
||||
if prefetch_policy != "NO_PREFETCH":
|
||||
fsdp_plugin = FullyShardedDataParallelPlugin(backward_prefetch=BackwardPrefetch(i + 1))
|
||||
assert fsdp_plugin.backward_prefetch == expected_value
|
||||
|
||||
# Check if name works
|
||||
fsdp_plugin = FullyShardedDataParallelPlugin(backward_prefetch=prefetch_policy)
|
||||
assert fsdp_plugin.backward_prefetch == expected_value
|
||||
|
||||
def test_state_dict_type(self):
|
||||
from torch.distributed.fsdp.fully_sharded_data_parallel import StateDictType
|
||||
|
||||
for i, state_dict_type in enumerate(FSDP_STATE_DICT_TYPE):
|
||||
env = self.dist_env.copy()
|
||||
env = self.fsdp_env.copy()
|
||||
env["FSDP_STATE_DICT_TYPE"] = state_dict_type
|
||||
with mockenv_context(**env):
|
||||
fsdp_plugin = FullyShardedDataParallelPlugin()
|
||||
@ -113,33 +129,64 @@ class FSDPPluginIntegration(AccelerateTestCase):
|
||||
assert fsdp_plugin.state_dict_config.offload_to_cpu
|
||||
assert fsdp_plugin.state_dict_config.rank0_only
|
||||
|
||||
fsdp_plugin = FullyShardedDataParallelPlugin(state_dict_type=StateDictType(i + 1))
|
||||
assert fsdp_plugin.state_dict_type == StateDictType(i + 1)
|
||||
if state_dict_type == "FULL_STATE_DICT":
|
||||
assert fsdp_plugin.state_dict_config.offload_to_cpu
|
||||
assert fsdp_plugin.state_dict_config.rank0_only
|
||||
|
||||
def test_auto_wrap_policy(self):
|
||||
model = AutoModel.from_pretrained(BERT_BASE_CASED)
|
||||
for policy in FSDP_AUTO_WRAP_POLICY:
|
||||
env = self.dist_env.copy()
|
||||
env = self.fsdp_env.copy()
|
||||
env["FSDP_AUTO_WRAP_POLICY"] = policy
|
||||
transformer_cls_to_wrap = None
|
||||
min_num_params = None
|
||||
if policy == "TRANSFORMER_BASED_WRAP":
|
||||
env["FSDP_TRANSFORMER_CLS_TO_WRAP"] = "BertLayer"
|
||||
transformer_cls_to_wrap = "BertLayer"
|
||||
elif policy == "SIZE_BASED_WRAP":
|
||||
env["FSDP_MIN_NUM_PARAMS"] = "2000"
|
||||
min_num_params = 2000
|
||||
# First test via env
|
||||
with mockenv_context(**env):
|
||||
fsdp_plugin = FullyShardedDataParallelPlugin()
|
||||
fsdp_plugin.set_auto_wrap_policy(model)
|
||||
if policy == "NO_WRAP":
|
||||
assert fsdp_plugin.auto_wrap_policy is None
|
||||
else:
|
||||
assert fsdp_plugin.auto_wrap_policy is not None
|
||||
if policy == "NO_WRAP":
|
||||
assert fsdp_plugin.auto_wrap_policy is None
|
||||
else:
|
||||
assert isinstance(fsdp_plugin.auto_wrap_policy, functools.partial)
|
||||
|
||||
env = self.dist_env.copy()
|
||||
# Then manually set the policy
|
||||
fsdp_plugin = FullyShardedDataParallelPlugin(
|
||||
auto_wrap_policy=policy,
|
||||
transformer_cls_names_to_wrap=transformer_cls_to_wrap,
|
||||
min_num_params=min_num_params,
|
||||
)
|
||||
fsdp_plugin.set_auto_wrap_policy(model)
|
||||
if policy == "NO_WRAP":
|
||||
assert fsdp_plugin.auto_wrap_policy is None
|
||||
else:
|
||||
assert isinstance(fsdp_plugin.auto_wrap_policy, functools.partial)
|
||||
|
||||
env = self.fsdp_env.copy()
|
||||
env["FSDP_AUTO_WRAP_POLICY"] = "TRANSFORMER_BASED_WRAP"
|
||||
env["FSDP_TRANSFORMER_CLS_TO_WRAP"] = "T5Layer"
|
||||
with mockenv_context(**env):
|
||||
fsdp_plugin = FullyShardedDataParallelPlugin()
|
||||
with self.assertRaises(Exception) as cm:
|
||||
fsdp_plugin.set_auto_wrap_policy(model)
|
||||
assert "Could not find the transformer layer class to wrap in the model." in str(cm.exception)
|
||||
assert "Could not find the transformer layer class T5Layer in the model." in str(cm.exception)
|
||||
|
||||
env = self.dist_env.copy()
|
||||
fsdp_plugin = FullyShardedDataParallelPlugin(
|
||||
auto_wrap_policy="TRANSFORMER_BASED_WRAP",
|
||||
transformer_cls_names_to_wrap="T5Layer",
|
||||
)
|
||||
with self.assertRaises(Exception) as cm:
|
||||
fsdp_plugin.set_auto_wrap_policy(model)
|
||||
assert "Could not find the transformer layer class T5Layer in the model." in str(cm.exception)
|
||||
|
||||
env = self.fsdp_env.copy()
|
||||
env["FSDP_AUTO_WRAP_POLICY"] = "SIZE_BASED_WRAP"
|
||||
env["FSDP_MIN_NUM_PARAMS"] = "0"
|
||||
with mockenv_context(**env):
|
||||
@ -147,12 +194,19 @@ class FSDPPluginIntegration(AccelerateTestCase):
|
||||
fsdp_plugin.set_auto_wrap_policy(model)
|
||||
assert fsdp_plugin.auto_wrap_policy is None
|
||||
|
||||
fsdp_plugin = FullyShardedDataParallelPlugin(
|
||||
auto_wrap_policy="SIZE_BASED_WRAP",
|
||||
min_num_params=0,
|
||||
)
|
||||
fsdp_plugin.set_auto_wrap_policy(model)
|
||||
assert fsdp_plugin.auto_wrap_policy is None
|
||||
|
||||
def test_mixed_precision(self):
|
||||
from torch.distributed.fsdp.fully_sharded_data_parallel import MixedPrecision
|
||||
from torch.distributed.fsdp.sharded_grad_scaler import ShardedGradScaler
|
||||
|
||||
for mp_dtype in dtypes:
|
||||
env = self.dist_env.copy()
|
||||
env = self.fsdp_env.copy()
|
||||
env["ACCELERATE_MIXED_PRECISION"] = mp_dtype
|
||||
with mockenv_context(**env):
|
||||
accelerator = Accelerator()
|
||||
@ -167,21 +221,30 @@ class FSDPPluginIntegration(AccelerateTestCase):
|
||||
elif mp_dtype == BF16:
|
||||
assert accelerator.scaler is None
|
||||
AcceleratorState._reset_state(True)
|
||||
plugin = FullyShardedDataParallelPlugin(
|
||||
mixed_precision_policy={"param_dtype": dtype, "reduce_dtype": dtype, "buffer_dtype": dtype}
|
||||
)
|
||||
assert plugin.mixed_precision_policy == mp_policy
|
||||
with mockenv_context(**self.dist_env):
|
||||
accelerator = Accelerator(fsdp_plugin=plugin)
|
||||
assert accelerator.state.fsdp_plugin.mixed_precision_policy == mp_policy
|
||||
AcceleratorState._reset_state(True)
|
||||
|
||||
def test_mixed_precision_buffer_autocast_override(self):
|
||||
from torch.distributed.fsdp.fully_sharded_data_parallel import MixedPrecision
|
||||
from torch.distributed.fsdp.sharded_grad_scaler import ShardedGradScaler
|
||||
|
||||
for mp_dtype in dtypes:
|
||||
env = self.dist_env.copy()
|
||||
if mp_dtype == "fp16":
|
||||
dtype = torch.float16
|
||||
elif mp_dtype == "bf16":
|
||||
dtype = torch.bfloat16
|
||||
mp_policy = MixedPrecision(param_dtype=dtype, reduce_dtype=dtype, buffer_dtype=torch.float32)
|
||||
|
||||
env = self.fsdp_env.copy()
|
||||
env["ACCELERATE_MIXED_PRECISION"] = mp_dtype
|
||||
with mockenv_context(**env):
|
||||
accelerator = Accelerator()
|
||||
if mp_dtype == "fp16":
|
||||
dtype = torch.float16
|
||||
elif mp_dtype == "bf16":
|
||||
dtype = torch.bfloat16
|
||||
mp_policy = MixedPrecision(param_dtype=dtype, reduce_dtype=dtype, buffer_dtype=torch.float32)
|
||||
accelerator.state.fsdp_plugin.set_mixed_precision(dtype, buffer_autocast=True, override=True)
|
||||
assert accelerator.state.fsdp_plugin.mixed_precision_policy == mp_policy
|
||||
if mp_dtype == FP16:
|
||||
@ -194,12 +257,25 @@ class FSDPPluginIntegration(AccelerateTestCase):
|
||||
from torch.distributed.fsdp.fully_sharded_data_parallel import CPUOffload
|
||||
|
||||
for flag in [True, False]:
|
||||
env = self.dist_env.copy()
|
||||
env = self.fsdp_env.copy()
|
||||
env["FSDP_OFFLOAD_PARAMS"] = str(flag).lower()
|
||||
with mockenv_context(**env):
|
||||
fsdp_plugin = FullyShardedDataParallelPlugin()
|
||||
assert fsdp_plugin.cpu_offload == CPUOffload(offload_params=flag)
|
||||
|
||||
fsdp_plugin = FullyShardedDataParallelPlugin(cpu_offload=flag)
|
||||
assert fsdp_plugin.cpu_offload == CPUOffload(offload_params=flag)
|
||||
|
||||
def test_cpu_ram_efficient_loading(self):
|
||||
enable_fsdp_ram_efficient_loading()
|
||||
fsdp_plugin = FullyShardedDataParallelPlugin()
|
||||
assert fsdp_plugin.cpu_ram_efficient_loading is True
|
||||
assert os.environ.get("FSDP_CPU_RAM_EFFICIENT_LOADING") == "True"
|
||||
disable_fsdp_ram_efficient_loading()
|
||||
fsdp_plugin = FullyShardedDataParallelPlugin()
|
||||
assert fsdp_plugin.cpu_ram_efficient_loading is False
|
||||
assert os.environ.get("FSDP_CPU_RAM_EFFICIENT_LOADING") == "False"
|
||||
|
||||
|
||||
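What these test changes exercise, in user terms: FSDP options can now be passed directly to the plugin instead of only through `FSDP_*` environment variables. A hedged sketch; the wrapped layer class name is an assumption about the model being trained:

```python
from accelerate import Accelerator
from accelerate.utils import FullyShardedDataParallelPlugin

fsdp_plugin = FullyShardedDataParallelPlugin(
    sharding_strategy="FULL_SHARD",             # a name, int, or torch enum, per the tests above
    backward_prefetch="BACKWARD_PRE",
    auto_wrap_policy="TRANSFORMER_BASED_WRAP",
    transformer_cls_names_to_wrap="BertLayer",  # assumption: the model is built from BertLayer blocks
    cpu_offload=False,
)
accelerator = Accelerator(fsdp_plugin=fsdp_plugin)
```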
# Skip this test when TorchXLA is available because accelerate.launch does not support TorchXLA FSDP.
|
||||
@require_non_torch_xla
|
||||
|
||||
@ -15,6 +15,7 @@ import json
|
||||
import os
|
||||
import pickle
|
||||
import tempfile
|
||||
import time
|
||||
from unittest.mock import patch
|
||||
|
||||
import psutil
|
||||
@ -26,14 +27,33 @@ from torch.utils.data import DataLoader, TensorDataset
|
||||
from accelerate import DistributedType, infer_auto_device_map, init_empty_weights, load_checkpoint_and_dispatch
|
||||
from accelerate.accelerator import Accelerator
|
||||
from accelerate.state import GradientState, PartialState
|
||||
from accelerate.test_utils import require_bnb, require_multi_device, require_non_cpu, slow, torch_device
|
||||
from accelerate.test_utils import (
|
||||
require_bnb,
|
||||
require_multi_gpu,
|
||||
require_non_cpu,
|
||||
require_transformer_engine,
|
||||
slow,
|
||||
torch_device,
|
||||
)
|
||||
from accelerate.test_utils.testing import AccelerateTestCase, require_cuda, require_non_torch_xla
|
||||
from accelerate.utils import patch_environment
|
||||
from accelerate.utils.modeling import load_checkpoint_in_model
|
||||
from accelerate.utils import FP8RecipeKwargs, patch_environment
|
||||
from accelerate.utils.modeling import get_state_dict_from_offload, load_checkpoint_in_model
|
||||
|
||||
|
||||
def create_components():
|
||||
model = torch.nn.Linear(2, 4)
|
||||
class ModelWithTiedWeights(torch.nn.Module):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.linear1 = torch.nn.Linear(2, 4)
|
||||
self.linear2 = torch.nn.Linear(4, 2)
|
||||
self.linear2.weight = self.linear1.weight
|
||||
self.linear2.bias = self.linear1.bias
|
||||
|
||||
def forward(self, x):
|
||||
return self.linear2(self.linear1(x))
|
||||
|
||||
|
||||
def create_components(tied_weights=False):
|
||||
model = ModelWithTiedWeights() if tied_weights else torch.nn.Linear(2, 4)
|
||||
optimizer = torch.optim.AdamW(model.parameters(), lr=1.0)
|
||||
scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer, max_lr=0.01, steps_per_epoch=2, epochs=1)
|
||||
train_dl = DataLoader(TensorDataset(torch.tensor([1, 2, 3])))
|
||||
@ -54,11 +74,14 @@ class ModelForTest(torch.nn.Module):
|
||||
|
||||
|
||||
def get_signature(model):
|
||||
return (model.weight.abs().sum() + model.bias.abs().sum()).item()
|
||||
return sum(param.abs().sum().item() for param in model.parameters())
|
||||
|
||||
|
||||
def load_random_weights(model):
|
||||
state = torch.nn.Linear(*tuple(model.weight.T.shape)).state_dict()
|
||||
if isinstance(model, torch.nn.Linear):
|
||||
state = torch.nn.Linear(*tuple(model.weight.T.shape)).state_dict()
|
||||
elif isinstance(model, ModelWithTiedWeights):
|
||||
state = ModelWithTiedWeights().state_dict()
|
||||
model.load_state_dict(state)
|
||||
|
||||
|
||||
@ -66,6 +89,7 @@ def parameterized_custom_name_func(func, param_num, param):
|
||||
# customize the test name generator function as we want both params to appear in the sub-test
|
||||
# name, as by default it shows only the first param
|
||||
param_based_name = "use_safetensors" if param.args[0] is True else "use_pytorch"
|
||||
param_based_name += "_tied_weights" if (len(param.args) == 2 and param.args[1] is True) else ""
|
||||
return f"{func.__name__}_{param_based_name}"
|
||||
|
||||
|
||||
@ -204,6 +228,10 @@ class AcceleratorTester(AccelerateTestCase):
|
||||
model, optimizer, scheduler, train_dl, valid_dl = accelerator.prepare(
|
||||
model, optimizer, scheduler, train_dl, valid_dl
|
||||
)
|
||||
|
||||
# Short sleep here makes this test more reliable
|
||||
time.sleep(1e-3)
|
||||
|
||||
model, optimizer, scheduler, train_dl, valid_dl = accelerator.free_memory(
|
||||
model, optimizer, scheduler, train_dl, valid_dl
|
||||
)
|
||||
@ -230,10 +258,10 @@ class AcceleratorTester(AccelerateTestCase):
|
||||
accelerator = Accelerator()
|
||||
assert str(accelerator.state.device) == "cuda:64"
|
||||
|
||||
@parameterized.expand((True, False), name_func=parameterized_custom_name_func)
|
||||
def test_save_load_model(self, use_safetensors):
|
||||
@parameterized.expand([(True, True), (True, False), (False, False)], name_func=parameterized_custom_name_func)
|
||||
def test_save_load_model(self, use_safetensors, tied_weights):
|
||||
accelerator = Accelerator()
|
||||
model, optimizer, scheduler, train_dl, valid_dl = create_components()
|
||||
model, optimizer, scheduler, train_dl, valid_dl = create_components(tied_weights)
|
||||
accelerator.prepare(model, optimizer, scheduler, train_dl, valid_dl)
|
||||
|
||||
model_signature = get_signature(model)
|
||||
@ -261,6 +289,22 @@ class AcceleratorTester(AccelerateTestCase):
|
||||
load_checkpoint_in_model(model, tmpdirname)
|
||||
assert abs(model_signature - get_signature(model)) < 1e-3
|
||||
|
||||
@parameterized.expand([True, False], name_func=parameterized_custom_name_func)
|
||||
def test_save_sharded_model(self, use_safetensors):
|
||||
accelerator = Accelerator()
|
||||
inputs = torch.randn(3, 3)
|
||||
model = ModelForTest()
|
||||
expected = model(inputs)
|
||||
|
||||
with tempfile.TemporaryDirectory() as tmpdirname:
|
||||
# By setting it to 100, we will split the model int 3 shards
|
||||
accelerator.save_model(model, tmpdirname, safe_serialization=use_safetensors, max_shard_size=100)
|
||||
# make sure loaded weights match
|
||||
load_checkpoint_in_model(model, tmpdirname)
|
||||
output = model(inputs)
|
||||
|
||||
assert torch.allclose(expected, output, atol=1e-5)
|
||||
|
||||
@parameterized.expand([True, False], name_func=parameterized_custom_name_func)
|
||||
def test_save_model_offload(self, use_safetensors):
|
||||
accelerator = Accelerator()
|
||||
@ -281,6 +325,34 @@ class AcceleratorTester(AccelerateTestCase):
|
||||
output = model(inputs)
|
||||
assert torch.allclose(expected, output, atol=1e-5)
|
||||
|
||||
@parameterized.expand([True, False], name_func=parameterized_custom_name_func)
|
||||
@require_non_cpu
|
||||
def test_get_state_dict_from_offload(self, use_safetensors):
|
||||
accelerator = Accelerator()
|
||||
|
||||
device_map = {"linear1": "cpu", "batchnorm": "disk", "linear2": "disk"}
|
||||
model = ModelForTest()
|
||||
offloaded_layer_weight = model.linear2.weight
|
||||
with tempfile.TemporaryDirectory() as tmp_dir:
|
||||
accelerator.save_model(model, tmp_dir, safe_serialization=use_safetensors)
|
||||
# load model with offloaded layers
|
||||
load_checkpoint_and_dispatch(model, tmp_dir, device_map=device_map, offload_folder=tmp_dir)
|
||||
cpu_onloaded_layer = get_state_dict_from_offload(
|
||||
model.linear2, "linear2.weight", {"linear2.weight": ""}, device_to_put_offload="cpu"
|
||||
)
|
||||
device_onloaded_layer = get_state_dict_from_offload(
|
||||
model.linear2, "linear2.weight", {"linear2.weight": ""}, device_to_put_offload=0
|
||||
)
|
||||
cpu_onloaded_layer_weight = cpu_onloaded_layer["linear2.weight"]
|
||||
device_onloaded_layer_weight = device_onloaded_layer["linear2.weight"]
|
||||
|
||||
assert torch.allclose(offloaded_layer_weight, cpu_onloaded_layer_weight)
|
||||
assert torch.allclose(
|
||||
offloaded_layer_weight, device_onloaded_layer_weight.to("cpu")
|
||||
) # must be on the same device for torch.allclose()
|
||||
assert cpu_onloaded_layer_weight.device.type == "cpu"
|
||||
assert device_onloaded_layer_weight.device.type == torch_device
|
||||
|
||||
@parameterized.expand([True, False], name_func=parameterized_custom_name_func)
|
||||
def test_save_load_model_with_hooks(self, use_safetensors):
|
||||
accelerator = Accelerator()
|
||||
@ -385,7 +457,7 @@ class AcceleratorTester(AccelerateTestCase):
|
||||
getattr(valid_dl, "_is_accelerate_prepared", False) is True
|
||||
), "Valid Dataloader is missing `_is_accelerator_prepared` or is set to `False`"
|
||||
|
||||
@require_non_torch_xla
|
||||
@require_cuda
|
||||
@slow
|
||||
@require_bnb
|
||||
def test_accelerator_bnb(self):
|
||||
@ -402,7 +474,7 @@ class AcceleratorTester(AccelerateTestCase):
|
||||
# This should work
|
||||
model = accelerator.prepare(model)
|
||||
|
||||
@require_non_torch_xla
|
||||
@require_cuda
|
||||
@slow
|
||||
@require_bnb
|
||||
def test_accelerator_bnb_cpu_error(self):
|
||||
@ -431,7 +503,7 @@ class AcceleratorTester(AccelerateTestCase):
|
||||
@require_non_torch_xla
|
||||
@slow
|
||||
@require_bnb
|
||||
@require_multi_device
|
||||
@require_multi_gpu
|
||||
def test_accelerator_bnb_multi_device(self):
|
||||
"""Tests that the accelerator can be used with the BNB library."""
|
||||
from transformers import AutoModelForCausalLM
|
||||
@ -467,7 +539,7 @@ class AcceleratorTester(AccelerateTestCase):
|
||||
@require_non_torch_xla
|
||||
@slow
|
||||
@require_bnb
|
||||
@require_multi_device
|
||||
@require_multi_gpu
|
||||
def test_accelerator_bnb_multi_device_no_distributed(self):
|
||||
"""Tests that the accelerator can be used with the BNB library."""
|
||||
from transformers import AutoModelForCausalLM
|
||||
@ -496,6 +568,22 @@ class AcceleratorTester(AccelerateTestCase):
|
||||
accelerator = Accelerator(cpu=True)
|
||||
_ = accelerator.prepare(sgd)
|
||||
|
||||
@require_transformer_engine
|
||||
def test_can_unwrap_model_te(self):
|
||||
model, optimizer, *_ = create_components()
|
||||
fp8_recipe = FP8RecipeKwargs(backend="TE")
|
||||
accelerator = Accelerator(mixed_precision="fp8", kwargs_handlers=[fp8_recipe])
|
||||
inputs = torch.randn(10, 2).to(torch_device)
|
||||
model, optimizer = accelerator.prepare(model, optimizer)
|
||||
model(inputs) # sanity check that this works
|
||||
|
||||
model = accelerator.unwrap_model(model, keep_fp32_wrapper=False)
|
||||
model(inputs) # check that this still works
|
||||
|
||||
# check that pickle roundtrip works
|
||||
model_loaded = pickle.loads(pickle.dumps(model))
|
||||
model_loaded(inputs)
|
||||
|
||||
@require_non_cpu
|
||||
def test_can_unwrap_model_fp16(self):
|
||||
# test for a regression introduced in #872
|
||||
|
||||
@ -13,6 +13,7 @@
|
||||
# limitations under the License.
|
||||
import copy
|
||||
import gc
|
||||
import logging
|
||||
import os
|
||||
import unittest
|
||||
from collections import OrderedDict
|
||||
@ -45,6 +46,7 @@ from accelerate.test_utils import (
|
||||
from accelerate.utils import is_torch_version, offload_state_dict
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
torch_device = f"{torch_device}:0" if torch_device != "cpu" else "cpu"
|
||||
|
||||
|
||||
@ -306,6 +308,32 @@ class BigModelingTester(unittest.TestCase):
|
||||
== "Hello world! My name is Kiyoshi, and I'm a student at the University of Tokyo"
|
||||
)
|
||||
|
||||
@require_non_cpu
|
||||
def test_dispatch_model_and_remove_hook(self):
|
||||
model = ModelForTest()
|
||||
device_map = {"linear1": "cpu", "batchnorm": "cpu", "linear2": 0}
|
||||
x = torch.randn(2, 3)
|
||||
expected = model(x)
|
||||
|
||||
with TemporaryDirectory() as tmp_dir:
|
||||
dispatch_model(model, device_map, offload_dir=tmp_dir)
|
||||
output = model(x)
|
||||
remove_hook_from_submodules(model)
|
||||
# need to check if we get any warning
|
||||
with self.assertLogs(level="WARNING") as cm:
|
||||
# We want to assert there are no warnings, but the 'assertLogs' method does not support that.
|
||||
# Therefore, we are adding a dummy warning, and then we will assert it is the only warning.
|
||||
model.to(torch_device)
|
||||
logger.warning("Dummy warning")
|
||||
self.assertEqual(len(cm.records), 1)
|
||||
self.assertIn(
|
||||
"Dummy warning",
|
||||
cm.records[0].message,
|
||||
)
|
||||
output_bis = model(x.to(torch_device))
|
||||
assert torch.allclose(expected, output.cpu(), atol=1e-5)
|
||||
assert torch.allclose(expected, output_bis.cpu(), atol=1e-5)
|
||||
|
||||
@require_non_cpu
|
||||
def test_dispatch_model(self):
|
||||
model = ModelForTest()
|
||||
@ -484,7 +512,12 @@ class BigModelingTester(unittest.TestCase):
|
||||
del model
|
||||
gc.collect()
|
||||
|
||||
# This test fails because sometimes data_ptr() of compute2.weight is the same as compute1.weight.
|
||||
# I checked that the values are not the same but it gives the same address. This does not happen on my local machine.
|
||||
@require_cuda
|
||||
@unittest.skip(
|
||||
"Flaky test, we should have enough coverage with test_dispatch_model_tied_weights_memory_with_nested_offload_cpu test"
|
||||
)
|
||||
def test_dispatch_model_tied_weights_memory_with_nested_offload_disk(self):
|
||||
# Test that we do not duplicate tied weights at any point during dispatch_model call.
|
||||
|
||||
@ -622,7 +655,7 @@ class BigModelingTester(unittest.TestCase):
|
||||
with self.assertRaises(RuntimeError):
|
||||
model.to(0)
|
||||
|
||||
@require_multi_gpu
|
||||
@require_multi_device
|
||||
def test_dispatch_model_move_model_warning(self):
|
||||
model = ModelForTest()
|
||||
device_map = {"linear1": 0, "batchnorm": 0, "linear2": 1}
|
||||
@ -631,7 +664,7 @@ class BigModelingTester(unittest.TestCase):
|
||||
with self.assertLogs("accelerate.big_modeling", level="WARNING"):
|
||||
model.to("cpu")
|
||||
with self.assertLogs("accelerate.big_modeling", level="WARNING"):
|
||||
model.cuda(0)
|
||||
model.to(torch_device)
|
||||
with self.assertRaises(RuntimeError):
|
||||
x = torch.randn(2, 3)
|
||||
model(x)
|
||||
@ -746,7 +779,7 @@ class BigModelingTester(unittest.TestCase):
|
||||
|
||||
# CPU-offloaded weights are on the meta device while waiting for the forward pass.
|
||||
assert new_model.linear1.weight.device == torch.device("meta")
|
||||
assert new_model.linear2.weight.device == torch.device(0)
|
||||
assert new_model.linear2.weight.device == torch.device(torch_device)
|
||||
|
||||
output = new_model(x)
|
||||
assert torch.allclose(expected, output.cpu(), atol=1e-5)
|
||||
@@ -769,8 +802,8 @@ class BigModelingTester(unittest.TestCase):
        # CPU-offloaded weights are on the meta device while waiting for the forward pass.
        assert new_model.linear1.weight.device == torch.device("meta")
        assert new_model.linear2.weight.device == torch.device("meta")
        assert new_model.linear3.weight.device == torch.device(0)
        assert new_model.linear4.weight.device == torch.device(1)
        assert new_model.linear3.weight.device == torch.device(torch_device)
        assert new_model.linear4.weight.device == torch.device(torch_device.replace(":0", ":1"))

        output = new_model(x)
        assert torch.allclose(expected, output.cpu(), atol=1e-5)
@@ -795,8 +828,8 @@ class BigModelingTester(unittest.TestCase):
        # CPU-offloaded weights are on the meta device while waiting for the forward pass.
        assert new_model.linear1.linear.weight.device == torch.device("meta")
        assert new_model.linear2.linear.weight.device == torch.device("meta")
        assert new_model.linear3.linear.weight.device == torch.device(0)
        assert new_model.linear4.linear.weight.device == torch.device(0)
        assert new_model.linear3.linear.weight.device == torch.device(torch_device)
        assert new_model.linear4.linear.weight.device == torch.device(torch_device)

        output = new_model(x)
        assert torch.allclose(expected, output.cpu(), atol=1e-5)
@@ -821,8 +854,8 @@ class BigModelingTester(unittest.TestCase):
        # CPU-offloaded weights are on the meta device while waiting for the forward pass.
        assert new_model.linear1.linear.weight.device == torch.device("meta")
        assert new_model.linear2.linear.weight.device == torch.device("meta")
        assert new_model.linear3.linear.weight.device == torch.device(0)
        assert new_model.linear4.linear.weight.device == torch.device(1)
        assert new_model.linear3.linear.weight.device == torch.device(torch_device)
        assert new_model.linear4.linear.weight.device == torch.device(torch_device.replace(":0", ":1"))

        output = new_model(x)
        assert torch.allclose(expected, output.cpu(), atol=1e-5)
@@ -835,8 +868,8 @@ class BigModelingTester(unittest.TestCase):

        inputs = torch.randn(3, 4)
        outputs = model1(inputs)
        assert outputs.device == torch.device(0)
        assert model1.weight.device == torch.device(0)
        assert outputs.device == torch.device(torch_device)
        assert model1.weight.device == torch.device(torch_device)

        hook1.offload()
        assert model1.weight.device == torch.device("cpu")
@@ -846,13 +879,13 @@ class BigModelingTester(unittest.TestCase):
        assert model2.weight.device == torch.device("cpu")

        outputs = model1(inputs)
        assert outputs.device == torch.device(0)
        assert model1.weight.device == torch.device(0)
        assert outputs.device == torch.device(torch_device)
        assert model1.weight.device == torch.device(torch_device)

        outputs = model2(outputs)
        assert outputs.device == torch.device(0)
        assert outputs.device == torch.device(torch_device)
        assert model1.weight.device == torch.device("cpu")
        assert model2.weight.device == torch.device(0)
        assert model2.weight.device == torch.device(torch_device)

        hook2.offload()
        assert model2.weight.device == torch.device("cpu")
@@ -889,7 +922,7 @@ class BigModelingTester(unittest.TestCase):
        assert model.h[(-1)].self_attention.query_key_value.weight.dtype == torch.int8
        assert model.h[(-1)].self_attention.query_key_value.weight.device.index == 1

    @require_non_torch_xla
    @require_cuda
    @slow
    @require_bnb
    def test_dispatch_model_int8_simple(self):
@@ -952,7 +985,7 @@ class BigModelingTester(unittest.TestCase):
        assert model.h[0].self_attention.query_key_value.weight.dtype == torch.int8
        assert model.h[0].self_attention.query_key_value.weight.device.index == 0

    @require_non_torch_xla
    @require_cuda
    @slow
    @require_bnb
    def test_dipatch_model_fp4_simple(self):
@@ -20,7 +20,7 @@ from unittest.mock import patch
import torch
from huggingface_hub.utils import GatedRepoError, RepositoryNotFoundError

from accelerate.commands.config.config_args import BaseConfig, ClusterConfig, SageMakerConfig
from accelerate.commands.config.config_args import BaseConfig, ClusterConfig, SageMakerConfig, load_config_from_file
from accelerate.commands.estimate import estimate_command, estimate_command_parser, gather_data
from accelerate.commands.launch import _validate_launch_command, launch_command_parser
from accelerate.test_utils import execute_subprocess_async
@@ -73,8 +73,9 @@ class AccelerateLauncherTester(unittest.TestCase):
        execute_subprocess_async(cmd, env=os.environ.copy())

    def test_config_compatibility(self):
        invalid_configs = ["fp8", "invalid", "mpi", "sagemaker"]
        for config in sorted(self.test_config_path.glob("**/*.yaml")):
            if "invalid" in str(config) or "mpi" in str(config):
            if any(invalid_config in str(config) for invalid_config in invalid_configs):
                continue
            with self.subTest(config_file=config):
                cmd = get_launch_command(config_file=config) + [self.test_file_path]
@@ -196,6 +197,8 @@ class ClusterConfigTester(unittest.TestCase):
    Test case for verifying the config dataclasses work
    """

    test_config_path = Path("tests/test_configs")

    def test_base_config(self):
        # Tests that all the dataclasses can be initialized
        config = BaseConfig(
@@ -257,6 +260,8 @@ class ClusterConfigTester(unittest.TestCase):
        assert config.ec2_instance_type == "MY_TYPE"
        assert config.iam_role_name == "MY_ROLE"

        config = load_config_from_file(str(self.test_config_path / "0_30_0_sagemaker.yaml"))


class TpuConfigTester(unittest.TestCase):
    """
@@ -430,7 +435,10 @@ class ModelEstimatorTester(unittest.TestCase):
        estimate_command(args)

    def test_gated(self):
        with self.assertRaises(GatedRepoError, msg="Repo for model `meta-llama/Llama-2-7b-hf` is gated"):
        with self.assertRaises(
            (GatedRepoError, EnvironmentError),
            msg="Repo for model `meta-llama/Llama-2-7b-hf` is gated or environment error occurred",
        ):
            args = self.parser.parse_args(["meta-llama/Llama-2-7b-hf"])
            with patch_environment(hf_hub_disable_implicit_token="1"):
                estimate_command(args)
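The loosened test relies on assertRaises accepting a tuple of exception classes, so either a GatedRepoError or a plain EnvironmentError satisfies it. A minimal standalone sketch of that unittest feature (generic exceptions, not the Hub errors):

    import unittest


    class TupleRaisesExample(unittest.TestCase):
        def test_either_exception_passes(self):
            with self.assertRaises((KeyError, ValueError)):
                raise ValueError("any member of the tuple satisfies the assertion")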
@@ -451,7 +459,7 @@ class ModelEstimatorTester(unittest.TestCase):
        args = self.parser.parse_args(["bert-base-cased", "--dtypes", "float32", "float16"])
        output = gather_data(args)
        # The largest layer and total size of the model in bytes
        largest_layer, total_size = 89075712, 433249280
        largest_layer, total_size = 90669056, 433249280
        # Check that full precision -> int4 is calculating correctly
        assert len(output) == 2, f"Output was missing a precision, expected 2 but received {len(output)}"

@@ -479,7 +487,7 @@ class ModelEstimatorTester(unittest.TestCase):
        args = self.parser.parse_args(["bert-base-cased", "--dtypes", "float32"])
        output = gather_data(args)
        # The largest layer and total size of the model in bytes
        largest_layer, total_size = 89075712, 433249280
        largest_layer, total_size = 90669056, 433249280
        assert (
            largest_layer == output[0][1]
        ), f"Calculation for largest layer size in `fp32` is incorrect, expected {largest_layer} but received {output[0][1]}"
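The expected values are raw byte counts: the total stays the same while the largest-layer expectation moves from 89075712 to 90669056. As a back-of-the-envelope check (my own arithmetic, not from the diff), dividing the fp32 numbers by four bytes per parameter recovers plausible parameter counts for bert-base-cased:

    BYTES_PER_FP32_PARAM = 4
    total_size = 433_249_280
    largest_layer = 90_669_056

    print(total_size / BYTES_PER_FP32_PARAM)     # ~108.3M parameters for the whole model
    print(largest_layer / BYTES_PER_FP32_PARAM)  # ~22.7M parameters in the largest single module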
8 tests/test_configs/0_30_0_sagemaker.yaml Normal file
@@ -0,0 +1,8 @@
compute_environment: AMAZON_SAGEMAKER
debug: false
distributed_type: NO
mixed_precision: fp16
debug: false
use_cpu: false
ec2_instance_type: MY_TYPE
iam_role_name: MY_ROLE
26 tests/test_configs/0_34_0_fp8.yaml Normal file
@@ -0,0 +1,26 @@
compute_environment: LOCAL_MACHINE
debug: false
distributed_type: MULTI_GPU
downcast_bf16: 'no'
enable_cpu_affinity: false
fp8_config:
  amax_compute_algorithm: max
  amax_history_length: 1024
  backend: TE
  fp8_format: E4M3
  interval: 1
  margin: 0
  override_linear_precision: false
  use_autocast_during_eval: false
gpu_ids: all
machine_rank: 0
main_training_function: main
mixed_precision: fp8
num_machines: 1
num_processes: 2
rdzv_backend: static
same_network: true
tpu_env: []
tpu_use_cluster: false
tpu_use_sudo: false
use_cpu: false
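This new 0_34_0_fp8.yaml exercises the FP8 (TransformerEngine backend) branch of the config loader; it is also matched by the "fp8" entry added to the launcher test's skip list above, since actually running it needs FP8-capable hardware. Outside the test suite a config like this would normally be consumed through the launcher, roughly as follows (train.py is a placeholder script name):

    accelerate launch --config_file tests/test_configs/0_34_0_fp8.yaml train.py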
@@ -28,6 +28,7 @@ from accelerate.test_utils.testing import (
    TempDirTestCase,
    get_launch_command,
    require_huggingface_suite,
    require_multi_device,
    require_multi_gpu,
    require_pippy,
    require_schedulefree,
@@ -54,6 +55,8 @@ EXCLUDE_EXAMPLES = [
    "deepspeed_with_config_support.py",
    "megatron_lm_gpt_pretraining.py",
    "early_stopping.py",
    "ddp_comm_hook.py",
    "profiler.py",
]


@@ -247,12 +250,21 @@ class FeatureExamplesTests(TempDirTestCase):
        testargs = ["examples/by_feature/early_stopping.py"]
        run_command(self.launch_args + testargs)

    @require_multi_gpu
    def test_profiler(self):
        testargs = ["examples/by_feature/profiler.py"]
        run_command(self.launch_args + testargs)

    @require_multi_device
    def test_ddp_comm_hook(self):
        testargs = ["examples/by_feature/ddp_comm_hook.py", "--ddp_comm_hook", "fp16"]
        run_command(self.launch_args + testargs)

    @require_multi_device
    def test_distributed_inference_examples_stable_diffusion(self):
        testargs = ["examples/inference/distributed/stable_diffusion.py"]
        run_command(self.launch_args + testargs)

    @require_multi_gpu
    @require_multi_device
    def test_distributed_inference_examples_phi2(self):
        testargs = ["examples/inference/distributed/phi2.py"]
        run_command(self.launch_args + testargs)
@@ -166,21 +166,21 @@ class HooksModelTester(unittest.TestCase):
        add_hook_to_module(model.batchnorm, AlignDevicesHook(execution_device=0))
        add_hook_to_module(model.linear2, AlignDevicesHook(execution_device=1))

        assert model.linear1.weight.device == torch.device(0)
        assert model.batchnorm.weight.device == torch.device(0)
        assert model.batchnorm.running_mean.device == torch.device(0)
        assert model.linear2.weight.device == torch.device(1)
        assert model.linear1.weight.device == torch.device(torch_device)
        assert model.batchnorm.weight.device == torch.device(torch_device)
        assert model.batchnorm.running_mean.device == torch.device(torch_device)
        assert model.linear2.weight.device == torch.device(torch_device.replace(":0", ":1"))

        # We can still make a forward pass. The input does not need to be on any particular device
        x = torch.randn(2, 3)
        output = model(x)
        assert output.device == torch.device(1)
        assert output.device == torch.device(torch_device.replace(":0", ":1"))

        # We can add a general hook to put back output on same device as input.
        add_hook_to_module(model, AlignDevicesHook(io_same_device=True))
        x = torch.randn(2, 3).to(torch_device)
        output = model(x)
        assert output.device == torch.device(0)
        assert output.device == torch.device(torch_device)

    def test_align_devices_as_cpu_offload(self):
        model = ModelForTest()
83 tests/test_imports.py Normal file
@@ -0,0 +1,83 @@
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import subprocess

from accelerate.test_utils.testing import TempDirTestCase, require_import_timer
from accelerate.utils import is_import_timer_available


if is_import_timer_available():
    from import_timer import calculate_total_time, read_import_profile
    from import_timer.core import get_paths_above_threshold, sort_nodes_by_total_time


def convert_list_to_string(data):
    end_result = ""
    arrow_right = "->"
    for path in data:
        end_result += f"{arrow_right.join(path[0])} {path[1]:.3f}s\n"
    return end_result


def run_import_time(command: str):
    output = subprocess.run(["python3", "-X", "importtime", "-c", command], capture_output=True, text=True)
    return output.stderr


@require_import_timer
class ImportSpeedTester(TempDirTestCase):
    """
    Test suite which checks if imports have seen slowdowns
    based on a particular baseline.

    If the error messages are not clear enough to get a
    full view of what is slowing things down (or to
    figure out how deep the initial depth should be),
    please view the profile with the `tuna` framework:
    `tuna import.log`.
    """

    clear_on_setup = False

    @classmethod
    def setUpClass(cls):
        super().setUpClass()
        output = run_import_time("import torch")
        data = read_import_profile(output)
        total_time = calculate_total_time(data)
        cls.pytorch_time = total_time

    def test_base_import(self):
        output = run_import_time("import accelerate")
        data = read_import_profile(output)
        total_time = calculate_total_time(data)
        pct_more = (total_time - self.pytorch_time) / self.pytorch_time * 100
        # Base import should never be more than 20% slower than raw torch import
        err_msg = f"Base import is more than 20% slower than raw torch import ({pct_more:.2f}%), please check the attached `tuna` profile:\n"
        sorted_data = sort_nodes_by_total_time(data)
        paths_above_threshold = get_paths_above_threshold(sorted_data, 0.05, max_depth=7)
        err_msg += f"\n{convert_list_to_string(paths_above_threshold)}"
        self.assertLess(pct_more, 20, err_msg)

    def test_cli_import(self):
        output = run_import_time("from accelerate.commands.launch import launch_command_parser")
        data = read_import_profile(output)
        total_time = calculate_total_time(data)
        pct_more = (total_time - self.pytorch_time) / self.pytorch_time * 100
        # Base import should never be more than 20% slower than raw torch import
        err_msg = f"Base import is more than 20% slower than raw torch import ({pct_more:.2f}%), please check the attached `tuna` profile:\n"
        sorted_data = sort_nodes_by_total_time(data)
        paths_above_threshold = get_paths_above_threshold(sorted_data, 0.05, max_depth=7)
        err_msg += f"\n{convert_list_to_string(paths_above_threshold)}"
        self.assertLess(pct_more, 20, err_msg)
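run_import_time simply shells out to CPython's built-in import profiler. The same profile can be captured by hand and inspected with tuna, as the class docstring suggests; a small sketch (import.log is just an example file name):

    import subprocess

    # Capture CPython's import-time profile for accelerate (-X importtime writes to stderr).
    result = subprocess.run(
        ["python3", "-X", "importtime", "-c", "import accelerate"],
        capture_output=True,
        text=True,
    )
    with open("import.log", "w") as f:
        f.write(result.stderr)
    # Visualize afterwards with the `tuna` package: tuna import.log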
@@ -24,11 +24,13 @@ from accelerate.state import AcceleratorState
from accelerate.test_utils import (
    DEFAULT_LAUNCH_COMMAND,
    execute_subprocess_async,
    path_in_accelerate_package,
    require_multi_device,
    require_non_cpu,
    require_non_xpu,
)
from accelerate.utils import AutocastKwargs, KwargsHandler, TorchDynamoPlugin, clear_environment
from accelerate.test_utils.testing import slow
from accelerate.utils import AutocastKwargs, KwargsHandler, ProfileKwargs, TorchDynamoPlugin, clear_environment
from accelerate.utils.dataclasses import DistributedType


@@ -95,6 +97,52 @@ class KwargsHandlerTester(unittest.TestCase):
        # We should be back in fp16
        assert g_float16.dtype == torch.float16

    @slow
    def test_profile_kwargs(self):
        # Arrange
        schedule_options = [
            dict(wait=1, warmup=1, active=2, repeat=1),
            dict(wait=2, warmup=2, active=2, repeat=2),
            dict(wait=0, warmup=1, active=3, repeat=3, skip_first=1),
            dict(wait=3, warmup=2, active=1, repeat=1, skip_first=2),
            dict(wait=1, warmup=0, active=1, repeat=5),
        ]

        total_steps = 100

        for option in schedule_options:
            count = 0
            table_outputs = []
            steps_per_cycle = option["wait"] + option["warmup"] + option["active"]
            effective_steps = max(0, total_steps - option.get("skip_first", 0))
            cycles = effective_steps // steps_per_cycle
            if option["repeat"] > 0:
                expected_count = min(cycles, option["repeat"])
            else:
                expected_count = cycles

            def on_trace_ready(prof):
                nonlocal count
                nonlocal table_outputs

                count += 1
                table_outputs.append(prof.key_averages().table(sort_by="cpu_time_total", row_limit=-1))

            kwargs = ProfileKwargs(activities=["cpu"], on_trace_ready=on_trace_ready, schedule_option=option)
            accelerator = Accelerator(kwargs_handlers=[kwargs])

            # Act
            with accelerator.profile() as prof:
                for _ in range(total_steps):
                    prof.step()
                    torch.tensor([1, 2, 3, 4, 5], device=accelerator.device)

            # Assert
            assert isinstance(prof, torch.profiler.profile)
            assert count == expected_count, f"Option: {option}, Expected count: {expected_count}, but got {count}"
            for output in table_outputs:
                self.assertIn("CPU time total:", output)

    def test_torch_dynamo_plugin(self):
        with clear_environment():
            prefix = "ACCELERATE_DYNAMO_"
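The expected_count computation mirrors torch.profiler.schedule semantics: each trace cycle spans wait + warmup + active steps, skip_first steps are discarded up front, and repeat caps how many traces fire. Worked through for the first option (my own arithmetic, not from the diff):

    option = dict(wait=1, warmup=1, active=2, repeat=1)
    total_steps = 100

    steps_per_cycle = option["wait"] + option["warmup"] + option["active"]   # 4
    effective_steps = max(0, total_steps - option.get("skip_first", 0))      # 100
    cycles = effective_steps // steps_per_cycle                              # 25 full cycles
    expected_count = min(cycles, option["repeat"]) if option["repeat"] > 0 else cycles
    print(expected_count)  # 1 -> on_trace_ready fires once for this schedule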
@@ -107,6 +155,11 @@ class KwargsHandlerTester(unittest.TestCase):
            assert dynamo_plugin_kwargs == {"backend": "aot_ts_nvfuser", "mode": "reduce-overhead"}
            assert os.environ.get(prefix + "BACKEND") != "aot_ts_nvfuser"

    @require_multi_device
    def test_ddp_comm_hook(self):
        cmd = DEFAULT_LAUNCH_COMMAND + [path_in_accelerate_package("test_utils", "scripts", "test_ddp_comm_hook.py")]
        execute_subprocess_async(cmd)


def main():
    ddp_scaler = DistributedDataParallelKwargs(bucket_cap_mb=15, find_unused_parameters=True)
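The trailing snippet comes from the new DDP communication-hook test script: DistributedDataParallelKwargs is handed to the Accelerator through kwargs_handlers, the same pattern used for ProfileKwargs above. A minimal sketch of that wiring (the comm-hook-specific arguments are left out because their exact names are not shown in this hunk):

    from accelerate import Accelerator, DistributedDataParallelKwargs

    ddp_kwargs = DistributedDataParallelKwargs(bucket_cap_mb=15, find_unused_parameters=True)
    accelerator = Accelerator(kwargs_handlers=[ddp_kwargs])
    # model, optimizer and dataloaders would then go through accelerator.prepare(...) as usual.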
Some files were not shown because too many files have changed in this diff.