Compare commits

...

66 Commits

Author SHA1 Message Date
72e214f561 Update accelerator 2024-05-14 12:00:35 -04:00
ab14a5e6a1 Use partial state for fsdp saving/loading 2024-05-14 11:59:07 -04:00
27a607ea90 Fix small edge case in get_module_leaves (#2774)
* fix edge case

* fix
2024-05-14 11:52:51 +02:00
aa21174de9 fix minor typo (#2767) 2024-05-13 08:24:01 -04:00
6cf1cc0a39 optimize get_module_leaves speed (#2756)
* optimize get_module_leaves

* fix format

* Update modeling.py
2024-05-13 08:23:38 -04:00
bb465a9cf0 Sets default to PyTorch defaults based on backend (#2758)
* Amd

* Add timeout defaults to match pytorch

* forward contrib credits from discussions

* oop

---------

Co-authored-by: Julian Buchel <jubueche@users.noreply.github.com>
2024-05-13 05:41:15 -04:00
67308ca6ef Enable sharded cpu resume (#2762) 2024-05-10 11:39:37 -04:00
63772f6ac2 Revert "Simplify CLI args validation and ensure CLI args take precedence over config file." (#2763)
This reverts commit 724824abbe0aed8606661bbce5e057c0d2447794.
2024-05-10 11:22:56 -04:00
8798cf06ab fix cpu omp num threads set (#2755)
* fix cpu omp num threads set

* fix OMP_NUM_THREADS

* consider no-cpu usage

* fix style
2024-05-10 11:16:06 -04:00
47bb2dd53e Fix sagemaker config (#2753)
* Fix sagemaker

* Default to False

* Include fixes

* Nit

* Ignore launching
2024-05-10 09:09:36 -04:00
724824abbe Simplify CLI args validation and ensure CLI args take precedence over config file. (#2757)
* Remove unnecessary args.debug statement

* Add expected test failure for config sub-sections

* Remove redundancy in config file args parsing

* Make config file --cpu logic more explicit
2024-05-09 09:30:13 -04:00
YH
afc2c99e6a Fix duplicate environment variable check in multi-cpu condition (#2752)
* Del duplicted key

* Apply format
2024-05-07 14:27:29 -04:00
0fb95a2d3b Fix max_memory assignment (#2751) 2024-05-07 11:53:25 +02:00
7ac153f404 LOMO / FIX: Support multiple optimizers (#2745) 2024-05-06 08:28:14 -04:00
0f1b91bb74 Fix stacklevel in logging to log the actual user call site (instead of the call site inside the logger wrapper) of log functions (#2730)
* fix stacklevel in logging to log info about the actual user callsite

* Add two tests for stacklevel in logging

---------

Co-authored-by: luowyang <luowyang@github.com>
2024-05-06 08:21:19 -04:00
d1eb44c856 Fixed the problem of incorrect conditional judgment statement when configuring enable_cpu_affinity (#2748) 2024-05-06 08:20:22 -04:00
11a363287a Update modeling.py by adding try-catch section to skip the unavailable devices (#2681)
* Update modeling.py to ignore the unavailable devices

* Update src/accelerate/utils/modeling.py

Co-authored-by: Marc Sun <57196510+SunMarc@users.noreply.github.com>

Update src/accelerate/utils/modeling.py

Co-authored-by: Marc Sun <57196510+SunMarc@users.noreply.github.com>

Update src/accelerate/utils/modeling.py

Co-authored-by: Marc Sun <57196510+SunMarc@users.noreply.github.com>

Update src/accelerate/utils/modeling.py

Co-authored-by: Marc Sun <57196510+SunMarc@users.noreply.github.com>

---------

Co-authored-by: Marc Sun <57196510+SunMarc@users.noreply.github.com>
2024-05-06 12:44:35 +02:00
LFu
5cfe409443 Add feature to allow redirecting std streams into log files when using torchrun as the launcher. (#2740)
* Add --log-dir/--log_dir to `distributed_args` to allow redirecting std
streams into log files when using torchrun as the launcher. Used with
--tee this will acheive similar effect as running with `torchrun --tee X
--log-dir=logs`.

* Deleted the unecessary "--log-dir" argument following suggestion from
@muellerzr, since it will be automatically generated from "--log_dir".
2024-05-04 15:03:05 -04:00
5b3a7f3892 Update setup.py + test falures found during release 2024-05-03 10:40:25 -04:00
060361fca3 Fix tests on main (#2739)
* Start

* Fixings
2024-05-03 10:18:20 -04:00
6ac27e2383 FEAT: Add LOMO optimizer (#2695)
* add v1 lomo

* final fixes

* fix

* Update src/accelerate/accelerator.py

Co-authored-by: Zach Mueller <muellerzr@gmail.com>

* add comment

* more comments

* fix

---------

Co-authored-by: Zach Mueller <muellerzr@gmail.com>
2024-05-03 10:55:44 +02:00
YH
ba5f49219f Fix offload device type (#2717) 2024-05-02 17:07:24 +05:30
2c767338f2 Fix Documentation in FSDP and DeepSpeed Concept Guide (#2725)
* address part of stats comments

* automatically set sync_module_states if low_cpu_mem is set

* Apply suggestions from @stas00

Co-authored-by: Stas Bekman <stas00@users.noreply.github.com>

* add links from fsdp and deepspeed docs. fix deepspeed imports

* replace raise in accelerate.launch

---------

Co-authored-by: Stas Bekman <stas00@users.noreply.github.com>
2024-05-01 09:25:18 -04:00
234a85506d Docs: Fix build main documentation (#2729) 2024-05-01 08:18:52 -04:00
232ebd159a Fix sampler (#2728) 2024-05-01 12:20:26 +02:00
4d3d4bc88f fix sampler serialization (#2723)
* fix sampler serialization

* add getter and setter for sampler

* more maintenable
2024-04-30 11:19:05 +02:00
2b1e7bd462 Fixup free_memory to deal with garbage collection (#2716)
* Fixup cleanup

* Return

* Fixup test

* Fix test

* DeepSpeed

* More careful guard

* bring back as none

* passing

* bring forward
2024-04-30 03:28:57 -04:00
c7e5e41b8c Segment out a deepspeed docker image (#2707)
* Segment out a deepspeed docker image

* Update readme

* Keep pinned ds
2024-04-29 11:25:22 -04:00
9557598c45 Add Upcasting for FSDP in Mixed Precision. Add Concept Guide for FSPD and DeepSpeed. (#2674)
* draft fsdp vs ds

* reframe to migration doc

* updated functionality section

* cast to float32

* improvements to float32 casting

* some cleanup

* addressed @pacman100's comments

* Apply some of @muellerz suggestions

Co-authored-by: Zach Mueller <muellerzr@gmail.com>

* change to subsections

* changed the manner upcasting warnings are surfaced

* update document to discuss fsdp and ds plugins. minor fixes.

* @muellerzr's new suggestions

Co-authored-by: Zach Mueller <muellerzr@gmail.com>

* explain all-or-nothing

* add @pacman100's comments

Co-authored-by: Sourab Mangrulkar <13534540+pacman100@users.noreply.github.com>

* minor fix

---------

Co-authored-by: Yu Chin Fabian Lim <flim@sg.ibm.com>
Co-authored-by: Zach Mueller <muellerzr@gmail.com>
Co-authored-by: Sourab Mangrulkar <13534540+pacman100@users.noreply.github.com>
2024-04-29 11:19:03 -04:00
156331aecd allow gather_for_metrics to be more flexible (#2710)
* allow gather_for_metrics to be more flexible

* style

* udapte doc

* fix

* style

* typo

* typo

* Update src/accelerate/accelerator.py

Co-authored-by: Zach Mueller <muellerzr@gmail.com>

* remove distributed

* clean

---------

Co-authored-by: Zach Mueller <muellerzr@gmail.com>
2024-04-29 12:14:22 +02:00
cd7df4117d fix bnb multi gpu training (#2714)
* fix bnb multi gpu training

* style

* elif instead

* fix

* style

* fix
2024-04-26 15:52:15 +02:00
6af157ea93 Add diffusers to req (#2711) 2024-04-25 08:31:54 -04:00
83317b3081 add distributed examples (#2672)
* add distributed examples

* typo

* uncomment

* require multigpu

* add stable diffusion example

* style

* add copyright

* style

* remove tqdm

* Apply suggestions from code review

Co-authored-by: Zach Mueller <muellerzr@gmail.com>

* add comments

* remove print

* More comments

---------

Co-authored-by: Zach Mueller <muellerzr@gmail.com>
2024-04-25 11:13:56 +02:00
e831bcb3b1 Change dataloader send_to_device calls to non-blocking (#2685)
* Change dataloader send_to_device calls to non-blocking

* add non_blocking to dataloader dataclass

* add dataloader non blocking option from dataclass

* add handling for non blocking to accelerator

* add notes on non-blocking transfers to quicktour

* link to dataloaderconfiguration in docs

* linting

* "requires" -> "recommended" on non-blocking setting

Co-authored-by: Zach Mueller <muellerzr@gmail.com>

---------

Co-authored-by: drhead <a@a.a>
Co-authored-by: Zach Mueller <muellerzr@gmail.com>
2024-04-24 15:45:57 -04:00
092c3af0c4 Add version checks for the import of DeepSpeed moe utils (#2705)
* fix import for moe utils

* Apply suggestions from code review

Co-authored-by: Zach Mueller <muellerzr@gmail.com>

---------

Co-authored-by: Zach Mueller <muellerzr@gmail.com>
2024-04-25 00:38:56 +05:30
3e944c5583 add cann version info to command accelerate env (#2689) 2024-04-24 09:17:09 -04:00
f67737363c Do a pip freeze during workflows (#2704)
* Do a pip freeze

* No need to do source activate on non-conda workflow
2024-04-24 08:46:13 -04:00
f7daaaa305 fix support (#2699) 2024-04-23 15:32:43 +02:00
3dc131cd8d Add source code for DataLoader Animation (#2696)
* dl animation

* oops

* Export
2024-04-23 04:28:28 -04:00
ef0f62c12a Simplify test logic (#2697)
* simplify test logic 😅

* 😅
2024-04-23 02:49:55 +05:30
baafaf4a6e Fix the rng states of sampler's generator to be synchronized for correct sharding of dataset across GPUs (#2694)
* Fix the rng states of sampler's generator to be synchronized for correct sharding of dataset across GPUs

* add tests
2024-04-22 13:50:04 -04:00
abc86c0e35 Enable BF16 autocast to everything during FP8 + some tweaks to enable FSDP (#2655)
* Basic autocasting stuff

* Delay fp8 autocast until after DDP wrapping

* More fixes

* Bookmark: without dtype change

* Bookmark: with dtype changes

* Different alternative, better results

* Didn't matter what order, same result

* Revert + maintain

* Fin

* Refactor based on feedback

* native_amp bool

* Final nits
2024-04-18 10:14:35 -04:00
4450cb3132 Deprecate tqdm args + slight logic tweaks (#2673)
* Deprecate + slight logic fix

* Maybe fix test?
2024-04-17 06:26:55 -04:00
fd0dcd1c45 fix backend check (#2670)
* fix backend check

* reformat backend check

* Update src/accelerate/state.py

Co-authored-by: Zach Mueller <muellerzr@gmail.com>

* Update src/accelerate/state.py

Co-authored-by: Zach Mueller <muellerzr@gmail.com>

* raise value error if backend mismatch

* Update src/accelerate/state.py

Co-authored-by: Zach Mueller <muellerzr@gmail.com>

---------

Co-authored-by: Zach Mueller <muellerzr@gmail.com>
2024-04-16 21:22:27 -04:00
f478201c28 Pin DS...again.. (#2679) 2024-04-16 12:07:59 -04:00
c7046845e7 Fix deepspeed moe test with version check (#2677) 2024-04-16 10:22:41 -04:00
701e24c539 Handle MoE models with DeepSpeed (#2662)
* Handle MoE models with DeepSpeed

* Update launch.py

* Update test_deepspeed.py

* Update test_deepspeed.py

* Update src/accelerate/utils/dataclasses.py

Co-authored-by: Benjamin Bossan <BenjaminBossan@users.noreply.github.com>

* address comments

* Update deepspeed.md

---------

Co-authored-by: Benjamin Bossan <BenjaminBossan@users.noreply.github.com>
2024-04-16 16:11:49 +05:30
37da848e6c tqdm: *args should come ahead of main_process_only (#2654)
* Update tqdm.py

* add unit test

* add test to test_utils

* ruff changes
2024-04-15 12:30:28 -04:00
c470a1336a Revert "fix backend check (#2652)" (#2669)
This reverts commit 2fc48c7eeea67e747a39be2dec822b07a27bae71.
2024-04-15 04:30:33 -04:00
581a390e2f Megatron plugin can support NPU (#2667) 2024-04-15 03:02:13 -04:00
2fc48c7eee fix backend check (#2652)
* fix backend check

* fix ccl check
2024-04-15 02:59:29 -04:00
1024231133 Add MLU rng state setter (#2664) 2024-04-15 02:59:10 -04:00
5ca095a34f Fix test_from_pretrained_low_cpu_mem_usage_measured failure (#2644)
This test is to test the change in the memory size occupied by model loading when low_cpu_mem_usage is used.
Therefore, the default device used is cpu. However, when judging whether other devices are available,
new packages will be introduced, causing memory changes and interfering with the test results.

Signed-off-by: yuanwu <yuan.wu@intel.com>
2024-04-12 18:23:28 +02:00
b77c65398c Don't use deprecated Repository anymore (#2658)
* Don't use deprecated Repository anymore

* oops

* Update requirements.txt
2024-04-12 09:05:54 -04:00
YH
a91691463b Fix deepspeed plugin attr type (#2646) 2024-04-12 15:29:16 +05:30
5056d327f8 Allow "auto" for gradient clipping in YAML (#2649)
* Allow "auto" for gradient clipping in YAML

* Update src/accelerate/utils/dataclasses.py

Co-authored-by: Sourab Mangrulkar <13534540+pacman100@users.noreply.github.com>

* Make style

---------

Co-authored-by: Sourab Mangrulkar <13534540+pacman100@users.noreply.github.com>
2024-04-12 13:44:39 +05:30
c0a37015e3 Typo fix in tracking.md (#2650) 2024-04-10 17:16:11 -04:00
e9b9c7d022 device agnostic testing for hooks&utils&big_modeling (#2602)
* device agnostic testing for hooks&utils&big_modeling

* fix failed test cased on cpu

* make style
2024-04-10 13:56:50 -04:00
6c09584f73 add strict arg to load_checkpoint_and_dispatch (#2641) 2024-04-10 11:20:07 +02:00
b8c8583953 add third-party device prefix to execution_device (#2612)
* add xpu device_map

* fix
2024-04-09 13:47:41 +02:00
df485ae1e3 Parenthesis on xpu_available (#2639) 2024-04-09 06:33:38 -04:00
6386f70103 Fix up state with xla + performance regression (#2634)
* Fix up state with xla

* use backend

* Change last time

* Cmoment

* Slight tweak to use dtype
2024-04-09 06:06:28 -04:00
6d92198ef4 Schedule free optimizer support (#2631)
* Schedule free optimizer supporT

* Fin

* Doc

* Add in eval

* Add to exclude

* Fix module issue
2024-04-08 11:28:27 -04:00
16488be9a4 Update version 2024-04-05 13:11:05 -04:00
685bd3a439 CLean 2024-04-05 13:05:05 -04:00
2e69948c1a Patchfix 2024-04-05 13:04:44 -04:00
78 changed files with 2766 additions and 373 deletions

View File

@ -58,3 +58,24 @@ jobs:
file: docker/accelerate-gpu/Dockerfile
push: true
tags: huggingface/accelerate:gpu-release-${{needs.get-version.outputs.version}}
version-cuda-deepspeed:
name: "Latest Accelerate GPU DeepSpeed [version]"
runs-on: [self-hosted, single-gpu, nvidia-gpu, t4, ci]
needs: get-version
steps:
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v2
- name: Login to DockerHub
uses: docker/login-action@v2
with:
username: ${{ secrets.DOCKERHUB_USERNAME }}
password: ${{ secrets.DOCKERHUB_PASSWORD }}
- name: Build and Push GPU
uses: docker/build-push-action@v4
with:
file: docker/accelerate-gpu-deepspeed/Dockerfile
push: true
tags: huggingface/accelerate:gpu-deepspeed-release-${{needs.get-version.outputs.version}}

View File

@ -57,4 +57,29 @@ jobs:
push: true
tags: |
huggingface/accelerate:gpu-nightly
huggingface/accelerate:gpu-nightly-${{ env.date }}
huggingface/accelerate:gpu-nightly-${{ env.date }}
latest-cuda-deepspeed:
name: "Latest Accelerate GPU DeepSpeed [dev]"
runs-on: [self-hosted, nvidia-gpu, t4, ci]
steps:
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v2
- name: Login to DockerHub
uses: docker/login-action@v2
with:
username: ${{ secrets.DOCKERHUB_USERNAME }}
password: ${{ secrets.DOCKERHUB_PASSWORD }}
- name: Get current date
id: date
run: |
echo "date=$(date '+%Y-%m-%d')" >> $GITHUB_ENV
- name: Build and Push GPU
uses: docker/build-push-action@v4
with:
file: docker/accelerate-gpu-deepspeed/Dockerfile
push: true
tags: |
huggingface/accelerate:gpu-deepspeed-nightly
huggingface/accelerate:gpu-deepspeed-nightly-${{ env.date }}

View File

@ -13,5 +13,6 @@ jobs:
with:
commit_sha: ${{ github.sha }}
package: accelerate
custom_container: huggingface/transformers-doc-builder
secrets:
hf_token: ${{ secrets.HF_DOC_BUILD_PUSH }}

View File

@ -14,3 +14,4 @@ jobs:
commit_sha: ${{ github.event.pull_request.head.sha }}
pr_number: ${{ github.event.number }}
package: accelerate
custom_container: huggingface/transformers-doc-builder

View File

@ -12,13 +12,13 @@ env:
jobs:
run_all_tests_single_gpu:
run_core_tests_single_gpu:
runs-on: [self-hosted, single-gpu, nvidia-gpu, t4, ci]
env:
CUDA_VISIBLE_DEVICES: "0"
TEST_TYPE: "single_gpu"
container:
image: huggingface/accelerate-gpu:latest
image: huggingface/accelerate:gpu-nightly
options: --gpus all --shm-size "16gb"
defaults:
run:
@ -33,6 +33,11 @@ jobs:
pip install -e . --no-deps
pip install pytest-reportlog tabulate
- name: Show installed libraries
run: |
source activate accelerate;
pip freeze
- name: Run test on GPUs
working-directory: accelerate
run: |
@ -54,13 +59,67 @@ jobs:
pip install slack_sdk tabulate
python utils/log_reports.py >> $GITHUB_STEP_SUMMARY
run_all_tests_multi_gpu:
run_deepspeed_tests_single_gpu:
runs-on: [self-hosted, single-gpu, nvidia-gpu, t4, ci]
env:
CUDA_VISIBLE_DEVICES: "0"
TEST_TYPE: "single_gpu_deepspeed"
container:
image: huggingface/accelerate:gpu-deepspeed-nightly
options: --gpus all --shm-size "16gb"
defaults:
run:
shell: bash
steps:
- name: Update clone & pip install
run: |
source activate accelerate
git clone https://github.com/huggingface/accelerate;
cd accelerate;
git checkout ${{ github.sha }};
pip install -e . --no-deps
pip install pytest-reportlog tabulate
- name: Show installed libraries
run: |
source activate accelerate;
pip freeze
- name: Run test on GPUs
working-directory: accelerate
run: |
source activate accelerate
make test_deepspeed
- name: Run Integration tests on GPUs
working-directory: accelerate
if: always()
run: |
source activate accelerate
make test_integrations
- name: Run examples on GPUs
working-directory: accelerate
if: always()
run: |
source activate accelerate
pip uninstall comet_ml -y
make test_examples
- name: Generate Report
working-directory: accelerate
if: always()
run: |
pip install slack_sdk tabulate
python utils/log_reports.py >> $GITHUB_STEP_SUMMARY
run_core_tests_multi_gpu:
runs-on: [self-hosted, multi-gpu, nvidia-gpu, t4, ci]
env:
CUDA_VISIBLE_DEVICES: "0,1"
TEST_TYPE: "multi_gpu"
container:
image: huggingface/accelerate-gpu:latest
image: huggingface/accelerate:gpu-nightly
options: --gpus all --shm-size "16gb"
defaults:
run:
@ -75,6 +134,11 @@ jobs:
pip install -e . --no-deps
pip install pytest-reportlog tabulate
- name: Show installed libraries
run: |
source activate accelerate;
pip freeze
- name: Run core and big modeling tests on GPUs
working-directory: accelerate
run: |
@ -105,6 +169,60 @@ jobs:
pip install slack_sdk tabulate
python utils/log_reports.py >> $GITHUB_STEP_SUMMARY
run_deepspeed_tests_multi_gpu:
runs-on: [self-hosted, multi-gpu, nvidia-gpu, t4, ci]
env:
CUDA_VISIBLE_DEVICES: "0,1"
TEST_TYPE: "multi_gpu_deepspeed"
container:
image: huggingface/accelerate:gpu-deepspeed-nightly
options: --gpus all --shm-size "16gb"
defaults:
run:
shell: bash
steps:
- name: Update clone
run: |
source activate accelerate
git clone https://github.com/huggingface/accelerate;
cd accelerate;
git checkout ${{ github.sha }};
pip install -e . --no-deps
pip install pytest-reportlog tabulate
- name: Show installed libraries
run: |
source activate accelerate;
pip freeze
- name: Run DeepSpeed tests
working-directory: accelerate
run: |
source activate accelerate
make test_deepspeed
- name: Run Integration tests on GPUs
working-directory: accelerate
if: always()
run: |
source activate accelerate
make test_integrations
- name: Run examples on GPUs
working-directory: accelerate
if: always()
run: |
source activate accelerate
pip uninstall comet_ml -y
make test_examples
- name: Generate Report
working-directory: accelerate
if: always()
run: |
pip install slack_sdk tabulate
python utils/log_reports.py >> $GITHUB_STEP_SUMMARY
run-integration-tests:
if: always()

View File

@ -9,7 +9,7 @@ env:
IS_GITHUB_CI: "1"
jobs:
run_all_tests_single_gpu:
run_core_tests_single_gpu:
runs-on: [self-hosted, single-gpu, nvidia-gpu, t4, ci]
env:
CUDA_VISIBLE_DEVICES: "0"
@ -29,6 +29,11 @@ jobs:
pip install -e .[testing,test_trackers] -U;
pip install pytest-reportlog tabulate ;
- name: Show installed libraries
run: |
source activate accelerate;
pip freeze
- name: Run CLI tests (use make cli)
working-directory: accelerate
run: |
@ -56,7 +61,46 @@ jobs:
pip install tabulate;
python utils/log_reports.py >> $GITHUB_STEP_SUMMARY
run_all_tests_multi_gpu:
run_deepspeed_tests_single_gpu:
runs-on: [self-hosted, single-gpu, nvidia-gpu, t4, ci]
env:
CUDA_VISIBLE_DEVICES: "0"
container:
image: huggingface/accelerate:gpu-deepspeed-nightly
options: --gpus all --shm-size "16gb"
defaults:
run:
shell: bash
steps:
- name: Install accelerate
run: |
source activate accelerate;
git clone https://github.com/huggingface/accelerate;
cd accelerate;
git checkout ${{ github.sha }};
pip install -e .[testing,test_trackers] -U;
pip install pytest-reportlog tabulate ;
- name: Show installed libraries
run: |
source activate accelerate;
pip freeze
- name: Run test on GPUs
working-directory: accelerate
if: always()
run: |
source activate accelerate;
make test_deepspeed
- name: Generate Report
working-directory: accelerate
if: always()
run: |
pip install tabulate;
python utils/log_reports.py >> $GITHUB_STEP_SUMMARY
run_core_tests_multi_gpu:
runs-on: [self-hosted, multi-gpu, nvidia-gpu, t4, ci]
env:
CUDA_VISIBLE_DEVICES: 0,1
@ -76,6 +120,11 @@ jobs:
pip install -e .[testing,test_trackers] -U;
pip install pytest-reportlog tabulate
- name: Show installed libraries
run: |
source activate accelerate;
pip freeze
- name: Run test on GPUs
working-directory: accelerate
run: |
@ -96,3 +145,40 @@ jobs:
run: |
source activate accelerate;
python utils/log_reports.py >> $GITHUB_STEP_SUMMARY
run_deepspeed_tests_multi_gpu:
runs-on: [self-hosted, multi-gpu, nvidia-gpu, t4, ci]
container:
image: huggingface/accelerate:gpu-deepspeed-nightly
options: --gpus all --shm-size "16gb"
defaults:
run:
shell: bash
steps:
- name: Install accelerate
run: |
source activate accelerate;
git clone https://github.com/huggingface/accelerate;
cd accelerate;
git checkout ${{ github.sha }};
pip install -e .[testing,test_trackers] -U;
pip install pytest-reportlog tabulate ;
- name: Show installed libraries
run: |
source activate accelerate;
pip freeze
- name: Run test on GPUs
working-directory: accelerate
if: always()
run: |
source activate accelerate;
make test_deepspeed
- name: Generate Report
working-directory: accelerate
if: always()
run: |
pip install tabulate;
python utils/log_reports.py >> $GITHUB_STEP_SUMMARY

View File

@ -23,7 +23,7 @@ defaults:
jobs:
run-trainer-tests:
container:
image: huggingface/accelerate:gpu-nightly
image: huggingface/accelerate:gpu-deepspeed-nightly
options: --gpus all --shm-size "16gb"
runs-on: [self-hosted, multi-gpu, nvidia-gpu, t4, ci]
strategy:

View File

@ -51,6 +51,10 @@ jobs:
if [[ ${{ matrix.test-kind }} = test_rest ]]; then pip uninstall comet_ml -y; fi
if [[ ${{ matrix.test-kind }} = minimum ]]; then pip install torch==1.10.0; fi
pip install pytest-reportlog tabulate setuptools
- name: Show installed libraries
run: |
pip freeze
- name: Run Tests
env:

View File

@ -29,9 +29,10 @@ huggingface/accelerate:{accelerator}-{nightly,release}
```
`accelerator` in this instance is one of many applicable pre-configured backend supports:
* `gpu`: Comes compiled off of the `nvidia/cuda` image and includes everything such as `deepspeed`, `bitsandbytes`, etc.
* `cpu`: Comes compiled off of `python:3.8-slim` and is designed for non-CUDA based workloads.
* `gpu`: Comes compiled off of the `nvidia/cuda` image and includes core parts like `bitsandbytes`. Runs off python 3.9.
* `cpu`: Comes compiled off of `python:3.9-slim` and is designed for non-CUDA based workloads.
* More to come soon
* `gpu-deepspeed`: Comes compiled off of the `nvidia/cuda` image and includes core parts like `bitsandbytes` as well as the latest `deepspeed` version. Runs off python 3.10.
## Nightlies vs Releases

View File

@ -0,0 +1,46 @@
# Builds GPU docker image of PyTorch specifically
# Uses multi-staged approach to reduce size
# Stage 1
# Use base conda image to reduce time
FROM continuumio/miniconda3:latest AS compile-image
# Specify py version
# Note: DeepSpeed beyond v0.12.6 requires py 3.10
ENV PYTHON_VERSION=3.10
# Install apt libs
RUN apt-get update && \
apt-get install -y curl git wget && \
apt-get clean && \
rm -rf /var/lib/apt/lists*
# Create our conda env
RUN conda create --name accelerate python=${PYTHON_VERSION} ipython jupyter pip
# We don't install pytorch here yet since CUDA isn't available
# instead we use the direct torch wheel
ENV PATH /opt/conda/envs/accelerate/bin:$PATH
# Activate our bash shell
RUN chsh -s /bin/bash
SHELL ["/bin/bash", "-c"]
# Activate the conda env, install mpi4py, and install torch + accelerate
RUN source activate accelerate && conda install -c conda-forge mpi4py
RUN source activate accelerate && \
python3 -m pip install --no-cache-dir \
git+https://github.com/huggingface/accelerate#egg=accelerate[testing,test_trackers,deepspeed] \
--extra-index-url https://download.pytorch.org/whl/cu117
RUN python3 -m pip install --no-cache-dir bitsandbytes
# Stage 2
FROM nvidia/cuda:12.1.0-cudnn8-devel-ubuntu20.04 AS build-image
COPY --from=compile-image /opt/conda /opt/conda
ENV PATH /opt/conda/bin:$PATH
# Install apt libs
RUN apt-get update && \
apt-get install -y curl git wget && \
apt-get clean && \
rm -rf /var/lib/apt/lists*
RUN echo "source activate accelerate" >> ~/.profile
# Activate the virtualenv
CMD ["/bin/bash"]

View File

@ -78,6 +78,8 @@
title: Executing and deferring jobs
- local: concept_guides/gradient_synchronization
title: Gradient synchronization
- local: concept_guides/fsdp_and_deepspeed
title: FSDP vs DeepSpeed
- local: concept_guides/low_precision_training
title: How training in low-precision environments is possible (FP8)
- local: concept_guides/training_tpu

View File

@ -0,0 +1,192 @@
<!--Copyright 2024 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.
-->
# Moving between FSDP And DeepSpeed
🤗 Accelerate offers flexibility of training frameworks, by integrating two extremely powerful tools for distributed training, namely [PyTorch FSDP](../usage_guides/fsdp.md) and [Microsoft DeepSpeed](../usage_guides/deepspeed.md). The aim of this tutorial is to draw parallels, as well as to outline potential differences, to empower the user to switch seamlessly between these two frameworks.
<Tip>
To switch between the frameworks, we recommend launching code with 🤗 `accelerate launch`, passing in the correct config file with `--config_file`, or passing in the respective arguments directly for [FSDP and DeepSpeed](../package_reference/cli#accelerate-launch).
Example 🤗 Accelerate configurations can be found here for [DeepSpeed](../usage_guides/deepspeed#accelerate-deepspeed-plugin) and [FSDP](../usage_guides/fsdp#how-it-works-out-of-the-box), or in the [example zoo under "Launch Configurations"](../usage_guides/explore)
</Tip>
<Tip warning={true}>
This tutorial is for single-node, multi-GPU, scenarios only.
</Tip>
## Configuring Functionalities
Model tensors are split into different GPUs in an attempt to scale up model sizes; this is termed *sharding* in FSDP, and *partitioning* in DeepSpeed. FSDP sharding and DeepSpeed ZeRO (partitioning) stages are configured by `--fsdp_sharding_strategy`, and `--zero_stage`, respectively. In particular, FSDP `FULL_SHARD` maps to DeepSpeed ZeRO stage `3`; see this [comprehensive mapping between FSDP sharding and DeepSpeed ZeRO settings](../usage_guides/fsdp#mapping-between-fsdp-sharding-strategies-and-deepspeed-zero-stages). The below table summarizes and groups similar settings:
Group | Framework | Configuration | Example | Restrictions (if any)
--|--|--|--|--
sharding / partitioning | FSDP<br>DeepSpeed | `--fsdp_sharding_strategy`<br>`--zero_stage` | `1` (`FULL_SHARD`) <br>`3` |
offload | FSDP<br>DeepSpeed | `--fsdp_offload_params`<br>`--offload_param_device`<br>`--offload_optimizer_device` | `true`<br>`cpu`<br>`cpu` | all or nothing <br><br>
model loading | FSDP<br>DeepSpeed | <span style="white-space:nowrap;">`--fsdp_cpu_ram_efficient_loading`</span><br>`--zero3_init_flag` | `true`<br>`true` | <br>only ZeRO 3
efficient checkpointing | FSDP<br>DeepSpeed | `--fsdp_state_dict_type`<br>`--zero3_save_16bit_model` | `SHARDED_STATE_DICT`<br>`true` | <br>only ZeRO 3
weights prefetching | FSDP<br><br>DeepSpeed | `--fsdp_forward_prefetch`<br>`--fsdp_backward_prefetch`<br>None | `true`<br>`BACKWARD_PRE` | <br><br>
model | FSDP<br><br>DeepSpeed | `--fsdp_auto_wrap_policy`<br><span style="white-space:nowrap;">`--fsdp_transformer_layer_cls_to_wrap`</span><br>None | `TRANSFORMER_BASED_WRAP`<br><Layer Class> |<br>Usually not needed <br>Transparent to user.
parameters summoning | FSDP<br>DeepSpeed | `--fsdp_use_orig_params`<br>None | `true` | required for `torch.compile`<br>Transparent to user
parameters syncing | FSDP<br>DeepSpeed | `--fsdp_sync_module_states`<br>None | `true` |
training | FSDP<br>DeepSpeed | None<br>`--gradient_accumulation_steps`<br>`--gradient_clipping` | <br>`auto`<br>`auto` | Transparent to user
For detailed descriptions of the above, refer to [🤗 `Accelerate` launch documentation](../package_reference/cli#accelerate-launch).
<Tip>
To access other DeepSpeed configurations, such as mixed precision settings,
you need to pass in a `--deepspeed_config_file`, see the [documentation](../usage_guides/deepspeed#deepspeed-config-file).
DeepSpeed can also be configured via [`DeepSpeedPlugin`], e.g., `DeepSpeedPlugin.zero_stage` is the equivalent of `--zero_stage`, and `DeepSpeedPlugin.hf_ds_config` can be used to pass `--deepspeed_config_file`.
</Tip>
<Tip>
FSDP can also be configured via [`FullyShardedDataParallelPlugin`], e.g., `FullyShardedDataParallelPlugin.sharding_strategy` is the equivalent of `--fsdp_sharding_strategy`.
</Tip>
### Checkpointing
Do note that FSDP can be configured via `--fsdp_state_dict_type` to save either full or sharded checkpoints.
<Tip>
For DeepSpeed Zero3, one could pass a `--zero3_save_16bit_model true`, which conveniently consolidates the model to a single rank and saves; this is the FSDP equivalent of `fsdp_state_dict_type: FULL_STATE_DICT`.
</Tip>
<Tip warning={true}>
For large models, consolidating the model to a single rank can be very slow.
</Tip>
<Tip>
For quicker checkpointing, for FSDP use `fsdp_state_dict_type: SHARDED_STATE_DICT`, and for DeepSpeed Zero3 [use the `zero_to_fp32.py` script to post-convert sharded checkpoints](https://www.deepspeed.ai/tutorials/zero/#extracting-weights).
</Tip>
### Offloading
FSDP only allows *all-or-nothing* offload (i.e., either offload parameters, gradients, and optimizer, or keep them all in GPU), but DeepSpeed can offload parameters and optimizer differently. Furthermore, DeepSpeed also supports [offloading to NVME](https://www.deepspeed.ai/docs/config-json/#parameter-offloading).
### Prefetching
FSDP allows two prefetching configurations `--fsdp_forward_prefetch` and `--fsdp_backward_prefetch` to improve overlap of comms / computation at a cost of extra memory, see [FSDP documentation](https://pytorch.org/docs/stable/fsdp.html).
For DeepSpeed, the prefetching will be turned on when needed, and it turns on depending on certain hyper-params like `stage3_param_persistence_threshold`, `stage3_max_reuse_distance`, etc, [that can be configured for Zero3](https://www.deepspeed.ai/docs/config-json/#parameter-offloading); 🤗 `accelerate` may set these hyper-params automatically if you don't set those explicitly in the deepspeed config file.
<Tip>
For FSDP set `fsdp_backward_prefetch: BACKWARD_PRE` for improved throughputs if memory allows.
</Tip>
### Model Loading
While FSDP requires an explicit `--fsdp_cpu_ram_efficient_loading true` to activate efficient model loading, 🤗 `transformers` will activate a similar feature whenever DeepSpeed Zero3 is used.
<Tip>
For FSDP, whenever setting `--fsdp_cpu_ram_efficient_loading true`, 🤗 `accelerate` will automatically set `sync_module_states` to true.
For RAM-efficient loading, the weights will be loaded only on a single rank, which requires `sync_module_states` to broadcast the weights to the other ranks.
</Tip>
### Model
FSDP requires an explicit `--fsdp_auto_wrap_policy` for the algorithm to decide how to schedule the all-gather and reduce-scatter operations. But for DeepSpeed this is transparent to the user.
<Tip>
For FSDP, simply set `fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP`. With the latest [`transformers`] versions, we try our best to figure out the suitable `fsdp_transformer_layer_cls_to_wrap` for HF transformers models. However, if you get an error regarding it, please specify this.
</Tip>
### Parameters Summoning
FSDP requires an explicit `--fsdp_use_orig_params` flag if using `torch.compile`, see [the pytorch documentation](https://pytorch.org/docs/stable/fsdp.html#module-torch.distributed.fsdp). For DeepSpeed this is transparent to the user.
<Tip>
For FSDP, when using `torch.compile` please set `fsdp_use_orig_params: True`.
</Tip>
## Training
Deepspeed requires explicit `--gradient_accumulation_steps` and `--gradient_clipping` flags. For FSDP this is transparent to the user.
<Tip>
When using DeepSpeed, set `gradient_accumulation_steps: "auto"` and `gradient_clipping: "auto"` to automatically pick up values set in the [`Accelerator`] or [`TrainingArguments`] (if using `transformers`).
</Tip>
## On Differences in Data Precision Handling
To discuss how data precision is handled in both FSDP and DeepSpeed, it is instructive to first give an overview of how model parameters are handled in these frameworks. Before the model / optimizer parameters are distributed across GPUs, parameter preparation is involved to first "flatten" them to one-dimensional [`torch.Tensor`](https://pytorch.org/docs/stable/tensors.html#torch-tensor). The implementation of FSDP / DeepSpeed varies in the respect of the `dtype` in which these "flattened" parameters are stored, and there are ramifications with regard to how [`torch.Optimizer`](https://pytorch.org/docs/stable/optim.html#module-torch.optim) allocates its `dtype`s. The table below outlines the processes for both frameworks; the "Local" column indicates the process occurring at a per-gpu level, therefore any memory overheads by upcasting should be understood to be amortized by the number of gpus used.
<Tip>
As a rule of thumb, for stable training with automatic mixed precision, all the trainable parameters have to be in `torch.float32`.
</Tip>
Process | Local | Framework | Details
--|--|--|--
Loading, i.e., [`AutoModel.from_pretrained(..., torch_dtype=torch_dtype)`] |
Preparation, i.e., creation of "flat params" | ✅ | FSDP<br>DeepSpeed | created in `torch_dtype`.<br> disregards `torch_dtype`, created in `float32`.
Optimizer initialization | ✅ | FSDP<br>DeepSpeed | creates parameters in `torch_dtype`<br> creates parameters in `float32`
Training Step, i.e, forward, backward, reduction | | FSDP<br>DeepSpeed | follows [`MixedPrecision`](https://pytorch.org/docs/stable/fsdp.html#torch.distributed.fsdp.MixedPrecision)<br> follows `deepspeed_config_file` mixed precision settings.
Optimizer (Pre-Step) | ✅ | FSDP<br>DeepSpeed | upcasting (if any) to `torch_dtype`<br>upcasted to `float32`
Optimizer (Actual Step) | ✅ | FSDP<br>DeepSpeed | occurs in `torch_dtype` <br> occurs in `float32`.
<Tip warning={true}>
Therefore, when using DeepSpeed with a small number of GPUs, be aware of potentially significant memory overheads due to the upcasting during preparation.
</Tip>
<Tip>
With FSDP, in the absence of mixed precision, it is possible to operate the [`torch.Optimizer`](https://pytorch.org/docs/stable/optim.html#module-torch.optim) in low precision `torch_dtype`, which may be helpful when using a small number of GPUs.
</Tip>
<Tip warning={true}>
With mixed precision, FSDP and DeepSpeed will upcast in the model preparation step (c.f. table above). But do note that FSDP will then save checkpoints in the upcasted precision; Deepspeed may still save low precision checkpoints if `--zero3_save_16bit_model` is specified.
</Tip>
To clarify the above table, consider the concrete examples below; the optimizer pre-step and actual step are combined for brevity. With FSDP it is possible to operate in the two modes shown below, but DeepSpeed can only operate in one.
Framework | Model Loading (`torch_dtype`) | Mixed Precision | Preparation (Local) | Training | Optimizer (Local)
--|--|--|--|--|--
FSDP | bf16 | default (none) | bf16 | bf16 | bf16
FSDP | bf16 | bf16 | fp32 | bf16 | fp32
DeepSpeed | bf16 | bf16 | fp32 | bf16 | fp32

View File

@ -208,6 +208,10 @@ The following arguments are only useful when `use_fsdp` is passed or Fully Shard
* `--fsdp_transformer_layer_cls_to_wrap` (`str`) -- Transformer layer class name (case-sensitive) to wrap, e.g, `BertLayer`, `GPTJBlock`, `T5Block` ...
* `--fsdp_backward_prefetch_policy` (`str`) -- FSDP's backward prefetch policy.
* `--fsdp_state_dict_type` (`str`) -- FSDP's state dict type.
* `--fsdp_forward_prefetch` (`str`) -- FSDP forward prefetch.
* `--fsdp_use_orig_params` (`str`) -- If True, allows non-uniform `requires_grad` mixed in a FSDP unit.
* `--fsdp_cpu_ram_efficient_loading` (`str`) -- If true, only the first process loads the pretrained model checkpoint while all other processes have empty weights. When using this, `--fsdp_sync_module_states` needs to be True.
* `--fsdp_sync_module_states` (`str`) -- If true, each individually wrapped FSDP unit will broadcast module parameters from rank 0.
**Megatron-LM Arguments**:

View File

@ -17,12 +17,12 @@ rendered properly in your Markdown viewer.
[[autodoc]] utils.DeepSpeedPlugin
[[autodoc]] utils.DummyOptim
[[autodoc]] utils.deepspeed.DummyOptim
[[autodoc]] utils.DummyScheduler
[[autodoc]] utils.deepspeed.DummyScheduler
[[autodoc]] utils.DeepSpeedEngineWrapper
[[autodoc]] utils.deepspeed.DeepSpeedEngineWrapper
[[autodoc]] utils.DeepSpeedOptimizerWrapper
[[autodoc]] utils.deepspeed.DeepSpeedOptimizerWrapper
[[autodoc]] utils.DeepSpeedSchedulerWrapper
[[autodoc]] utils.deepspeed.DeepSpeedSchedulerWrapper

View File

@ -93,6 +93,9 @@ accelerator = Accelerator()
> [!WARNING]
> This step is *optional* but it is considered best practice to allow Accelerate to handle device placement. You could also deactivate automatic device placement by passing `device_placement=False` when initializing the [`Accelerator`]. If you want to explicitly place objects on a device with `.to(device)`, make sure you use `accelerator.device` instead. For example, if you create an optimizer before placing a model on `accelerator.device`, training fails on a TPU.
> [!WARNING]
> Accelerate does not use non-blocking transfers by default for its automatic device placement, which can result in potentially unwanted CUDA synchronizations. You can enable non-blocking transfers by passing a [`~utils.dataclasses.DataLoaderConfiguration`] with `non_blocking=True` set as the `dataloader_config` when initializing the [`Accelerator`]. As usual, non-blocking transfers will only work if the dataloader also has `pin_memory=True` set. Be wary that using non-blocking transfers from GPU to CPU may cause incorrect results if it results in CPU operations being performed on non-ready tensors.
```py
device = accelerator.device
```
@ -121,7 +124,7 @@ To perform distributed evaluation, pass your validation dataloader to the [`~Acc
validation_dataloader = accelerator.prepare(validation_dataloader)
```
Each device in your distributed setup only receives a part of the evaluation data, which means you should group your predictions together with the [`~Accelerator.gather_for_metrics`] method. This method requires all tensors to be the same size on each process, so if your tensors have different sizes on each process (for instance when dynamically padding to the maximum length in a batch), you should use the [`~Accelerator.pad_across_processes`] method to pad you tensor to the largest size across processes.
Each device in your distributed setup only receives a part of the evaluation data, which means you should group your predictions together with the [`~Accelerator.gather_for_metrics`] method. This method requires all tensors to be the same size on each process, so if your tensors have different sizes on each process (for instance when dynamically padding to the maximum length in a batch), you should use the [`~Accelerator.pad_across_processes`] method to pad your tensor to the largest size across processes. Note that the tensors need to be 1D and that we concatenate the tensors along the first dimension.
```python
for inputs, targets in validation_dataloader:
@ -132,6 +135,8 @@ for inputs, targets in validation_dataloader:
metric.add_batch(all_predictions, all_targets)
```
For more complex cases (e.g. 2D tensors, don't want to concatenate tensors, dict of 3D tensors), you can pass `use_gather_object=True` in `gather_for_metrics`. This will return the list of objects after gathering. Note that using it with GPU tensors is not well supported and inefficient.
> [!TIP]
> Data at the end of a dataset may be duplicated so the batch can be equally divided among all workers. The [`~Accelerator.gather_for_metrics`] method automatically removes the duplicated data to calculate a more accurate metric.

View File

@ -157,10 +157,18 @@ Currently, `Accelerate` supports following config through the CLI:
`gradient_accumulation_steps`: Number of training steps to accumulate gradients before averaging and applying them.
`gradient_clipping`: Enable gradient clipping with value.
`offload_optimizer_device`: [none] Disable optimizer offloading, [cpu] offload optimizer to CPU, [nvme] offload optimizer to NVMe SSD. Only applicable with ZeRO >= Stage-2.
`offload_optimizer_nvme_path`: Decides Nvme Path to offload optimizer states. If unspecified, will default to 'none'.
`offload_param_device`: [none] Disable parameter offloading, [cpu] offload parameters to CPU, [nvme] offload parameters to NVMe SSD. Only applicable with ZeRO Stage-3.
`offload_param_nvme_path`: Decides Nvme Path to offload parameters. If unspecified, will default to 'none'.
`zero3_init_flag`: Decides whether to enable `deepspeed.zero.Init` for constructing massive models. Only applicable with ZeRO Stage-3.
`zero3_save_16bit_model`: Decides whether to save 16-bit model weights when using ZeRO Stage-3.
`mixed_precision`: `no` for FP32 training, `fp16` for FP16 mixed-precision training and `bf16` for BF16 mixed-precision training.
`deepspeed_moe_layer_cls_names`: Comma-separated list of transformer Mixture-of-Experts (MoE) layer class names (case-sensitive) to wrap, e.g., `MixtralSparseMoeBlock`, `Qwen2MoeSparseMoeBlock`, `JetMoEAttention,JetMoEBlock` ...
`deepspeed_hostfile`: DeepSpeed hostfile for configuring multi-node compute resources.
`deepspeed_exclusion_filter`: DeepSpeed exclusion filter string when using multi-node setup.
`deepspeed_inclusion_filter`: DeepSpeed inclusion filter string when using multi-node setup.
`deepspeed_multinode_launcher`: DeepSpeed multi-node launcher to use. If unspecified, will default to `pdsh`.
`deepspeed_config_file`: path to the DeepSpeed config file in `json` format. See the next section for more details on this.
```
To be able to tweak more options, you will need to use a DeepSpeed config file.
@ -721,3 +729,10 @@ Papers:
Finally, please, remember that 🤗 `Accelerate` only integrates DeepSpeed, therefore if you
have any problems or questions with regards to DeepSpeed usage, please, file an issue with [DeepSpeed GitHub](https://github.com/microsoft/DeepSpeed/issues).
<Tip>
For those interested in the similarities and differences between FSDP and DeepSpeed, please check out the [concept guide here](../concept_guides/fsdp_and_deepspeed.md)!
</Tip>

View File

@ -140,6 +140,8 @@ with distributed_state.split_between_processes(["a dog", "a cat", "a chicken"],
On the first GPU, the prompts will be `["a dog", "a cat"]`, and on the second GPU it will be `["a chicken", "a chicken"]`.
Make sure to drop the final sample, as it will be a duplicate of the previous one.
You can find more complex examples [here](https://github.com/huggingface/accelerate/tree/main/examples/inference/distributed) such as how to use it with LLMs.
## Memory-efficient pipeline parallelism (experimental)
This next part will discuss using *pipeline parallelism*. This is an **experimental** API utilizing the [PiPPy library by PyTorch](https://github.com/pytorch/PiPPy/) as a native solution.
@ -232,4 +234,4 @@ if PartialState().is_last_process:
</Tip>
And that's it! To explore more, please check out the inference examples in the [Accelerate repo](https://github.com/huggingface/accelerate/tree/main/examples/inference) and our [documentation](../package_reference/inference) as we work to improving this integration.
And that's it! To explore more, please check out the inference examples in the [Accelerate repo](https://github.com/huggingface/accelerate/tree/main/examples/inference/pippy) and our [documentation](../package_reference/inference) as we work on improving this integration.

View File

@ -175,3 +175,10 @@ You can then pass `state` into the `save_pretrained` method. There are several
For more control, users can leverage the `FullyShardedDataParallelPlugin`. After creating an instance of this class, users can pass it to the Accelerator class instantiation.
For more information on these options, please refer to the PyTorch [FullyShardedDataParallel](https://github.com/pytorch/pytorch/blob/0df2e863fbd5993a7b9e652910792bd21a516ff3/torch/distributed/fsdp/fully_sharded_data_parallel.py#L236) code.
<Tip>
For those interested in the similarities and differences between FSDP and DeepSpeed, please check out the [concept guide here](../concept_guides/fsdp_and_deepspeed.md)!
</Tip>

View File

@ -198,7 +198,7 @@ achieve the same outcome with:
```python
wandb_tracker = accelerator.get_tracker("wandb", unwrap=True)
with accelerator.on_main_process:
if accelerator.is_main_process:
wandb_tracker.log_artifact(some_artifact_to_log)
```

View File

@ -248,7 +248,7 @@ def training_function(config, args):
# Use accelerator.print to print only on the main process.
test_predictions.append(torch.cat(fold_predictions, dim=0))
# We now need to release all our memory and get rid of the current model, optimizer, etc
accelerator.free_memory()
model, optimizer = accelerator.free_memory(model, optimizer)
# New Code #
# Finally we check the accuracy of our folded results:
test_references = torch.cat(test_references, dim=0)

View File

@ -34,7 +34,7 @@ import datasets
import torch
import transformers
from datasets import load_dataset
from huggingface_hub import Repository
from huggingface_hub import HfApi
from torch.utils.data import DataLoader
from tqdm.auto import tqdm
from transformers import (
@ -47,7 +47,6 @@ from transformers import (
default_data_collator,
get_scheduler,
)
from transformers.utils import get_full_repo_name
from transformers.utils.versions import require_version
from accelerate import Accelerator, DistributedType
@ -303,11 +302,13 @@ def main():
# Handle the repository creation
if accelerator.is_main_process:
if args.push_to_hub:
if args.hub_model_id is None:
repo_name = get_full_repo_name(Path(args.output_dir).name, token=args.hub_token)
else:
repo_name = args.hub_model_id
repo = Repository(args.output_dir, clone_from=repo_name)
api = HfApi(token=args.hub_token)
# Create repo (repo_name from args or inferred)
repo_name = args.hub_model_id
if repo_name is None:
repo_name = Path(args.output_dir).absolute().name
repo_id = api.create_repo(repo_name, exist_ok=True).repo_id
with open(os.path.join(args.output_dir, ".gitignore"), "w+") as gitignore:
if "step_*" not in gitignore:
@ -707,7 +708,11 @@ def main():
if accelerator.is_main_process:
tokenizer.save_pretrained(args.output_dir)
if args.push_to_hub:
repo.push_to_hub(commit_message="End of training", auto_lfs_prune=True)
api.upload_folder(
repo_id=repo_id,
folder_path=args.output_dir,
commit_message="End of training",
)
with open(os.path.join(args.output_dir, "all_results.json"), "w") as f:
json.dump({"perplexity": perplexity, "eval_loss": eval_loss.item()}, f)

View File

@ -34,7 +34,7 @@ import datasets
import torch
import transformers
from datasets import load_dataset
from huggingface_hub import Repository
from huggingface_hub import HfApi
from torch.utils.data import DataLoader
from tqdm.auto import tqdm
from transformers import (
@ -47,7 +47,7 @@ from transformers import (
default_data_collator,
get_scheduler,
)
from transformers.utils import check_min_version, get_full_repo_name, send_example_telemetry
from transformers.utils import check_min_version, send_example_telemetry
from transformers.utils.versions import require_version
from accelerate import Accelerator, DistributedType
@ -277,11 +277,13 @@ def main():
# Handle the repository creation
if accelerator.is_main_process:
if args.push_to_hub:
if args.hub_model_id is None:
repo_name = get_full_repo_name(Path(args.output_dir).name, token=args.hub_token)
else:
repo_name = args.hub_model_id
repo = Repository(args.output_dir, clone_from=repo_name)
api = HfApi(token=args.hub_token)
# Create repo (repo_name from args or inferred)
repo_name = args.hub_model_id
if repo_name is None:
repo_name = Path(args.output_dir).absolute().name
repo_id = api.create_repo(repo_name, exist_ok=True).repo_id
with open(os.path.join(args.output_dir, ".gitignore"), "w+") as gitignore:
if "step_*" not in gitignore:
@ -661,8 +663,11 @@ def main():
)
if accelerator.is_main_process:
tokenizer.save_pretrained(args.output_dir)
repo.push_to_hub(
commit_message=f"Training in progress epoch {epoch}", blocking=False, auto_lfs_prune=True
api.upload_folder(
repo_id=repo_id,
folder_path=args.output_dir,
commit_message=f"Training in progress epoch {epoch}",
run_as_future=True,
)
if args.checkpointing_steps == "epoch":
@ -690,7 +695,11 @@ def main():
if accelerator.is_main_process:
tokenizer.save_pretrained(args.output_dir)
if args.push_to_hub:
repo.push_to_hub(commit_message="End of training", auto_lfs_prune=True)
api.upload_folder(
repo_id=repo_id,
folder_path=args.output_dir,
commit_message="End of training",
)
with open(os.path.join(args.output_dir, "all_results.json"), "w") as f:
json.dump({"perplexity": perplexity}, f)

View File

@ -0,0 +1,225 @@
# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import os
import evaluate
import torch
from datasets import load_dataset
from torch.utils.data import DataLoader
from transformers import AutoModelForSequenceClassification, AutoTokenizer, set_seed
from accelerate import Accelerator, DistributedType
from accelerate.utils import is_schedulefree_available
if is_schedulefree_available():
import schedulefree
else:
raise ImportError(
"This example requires the `schedulefree` library. Please install it with `pip install schedulefree`"
)
########################################################################
# This is a fully working simple example to use Accelerate and Facebook's
# scheduler-free optimizer: https://github.com/facebookresearch/schedule_free/
#
# This example trains a Bert base model on GLUE MRPC
# in any of the following settings (with the same script):
# - single CPU or single GPU
# - multi GPUS (using PyTorch distributed mode)
# - (multi) TPUs
# - fp16 (mixed-precision) or fp32 (normal precision)
#
# To run it in each of these various modes, follow the instructions
# in the readme for examples:
# https://github.com/huggingface/accelerate/tree/main/examples
#
########################################################################
MAX_GPU_BATCH_SIZE = 16
EVAL_BATCH_SIZE = 32
def get_dataloaders(accelerator: Accelerator, batch_size: int = 16):
    """
    Build the train and validation `DataLoader`s for GLUE MRPC, tokenized with "bert-base-cased".

    Args:
        accelerator (`Accelerator`):
            The `Accelerator` whose `main_process_first`, `distributed_type` and `mixed_precision` drive
            dataset preprocessing and padding.
        batch_size (`int`, *optional*):
            The batch size of the training DataLoader. The validation DataLoader always uses
            `EVAL_BATCH_SIZE`.

    Returns:
        A `(train_dataloader, eval_dataloader)` tuple.
    """
    tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
    raw_datasets = load_dataset("glue", "mrpc")

    def tokenize_function(examples):
        # max_length=None falls back to the model max length (which is the default anyway)
        return tokenizer(examples["sentence1"], examples["sentence2"], truncation=True, max_length=None)

    # Tokenize every split, letting the main process go first.
    with accelerator.main_process_first():
        tokenized_datasets = raw_datasets.map(
            tokenize_function,
            batched=True,
            remove_columns=["idx", "sentence1", "sentence2"],
        )

    # The transformers models expect the target column to be called 'labels', not 'label'.
    tokenized_datasets = tokenized_datasets.rename_column("label", "labels")

    def collate_fn(examples):
        # On XLA, pad everything to one fixed length, otherwise training will be very slow.
        if accelerator.distributed_type == DistributedType.XLA:
            max_length = 128
        else:
            max_length = None

        # Mixed precision wants round multiples: 16 for fp8, 8 for any other enabled mode.
        precision = accelerator.mixed_precision
        if precision == "no":
            pad_to_multiple_of = None
        else:
            pad_to_multiple_of = 16 if precision == "fp8" else 8

        return tokenizer.pad(
            examples,
            padding="longest",
            max_length=max_length,
            pad_to_multiple_of=pad_to_multiple_of,
            return_tensors="pt",
        )

    # Shuffled training loader that drops the last incomplete batch.
    train_dataloader = DataLoader(
        tokenized_datasets["train"],
        shuffle=True,
        collate_fn=collate_fn,
        batch_size=batch_size,
        drop_last=True,
    )
    # Ordered evaluation loader; the last incomplete batch is only dropped under fp8.
    drop_incomplete_eval_batch = accelerator.mixed_precision == "fp8"
    eval_dataloader = DataLoader(
        tokenized_datasets["validation"],
        shuffle=False,
        collate_fn=collate_fn,
        batch_size=EVAL_BATCH_SIZE,
        drop_last=drop_incomplete_eval_batch,
    )

    return train_dataloader, eval_dataloader
# For testing only: when the TESTING_MOCKED_DATALOADERS environment variable is "1",
# rebind `get_dataloaders` to the mocked factory from `accelerate.test_utils.training`.
if os.environ.get("TESTING_MOCKED_DATALOADERS", None) == "1":
    from accelerate.test_utils.training import mocked_dataloaders

    # Deliberate redefinition of the function above, hence the F811 suppression.
    get_dataloaders = mocked_dataloaders  # noqa: F811
def training_function(config, args):
    """
    Train "bert-base-cased" on GLUE MRPC with the schedule-free AdamW optimizer and evaluate after every epoch.

    Args:
        config (`dict`):
            Hyper-parameters; the keys "lr", "num_epochs", "seed" and "batch_size" are read.
        args:
            Parsed command-line arguments; `args.cpu` and `args.mixed_precision` are forwarded to the
            `Accelerator`.
    """
    # Initialize accelerator
    accelerator = Accelerator(cpu=args.cpu, mixed_precision=args.mixed_precision)
    # Sample hyper-parameters for learning rate, batch size, seed and a few other HPs
    lr = config["lr"]
    num_epochs = int(config["num_epochs"])
    seed = int(config["seed"])
    batch_size = int(config["batch_size"])

    metric = evaluate.load("glue", "mrpc")

    # If the batch size is too big we use gradient accumulation
    # (not applied on XLA: there the requested batch size is kept as is).
    gradient_accumulation_steps = 1
    if batch_size > MAX_GPU_BATCH_SIZE and accelerator.distributed_type != DistributedType.XLA:
        gradient_accumulation_steps = batch_size // MAX_GPU_BATCH_SIZE
        batch_size = MAX_GPU_BATCH_SIZE

    set_seed(seed)
    train_dataloader, eval_dataloader = get_dataloaders(accelerator, batch_size)
    # Instantiate the model (we build the model here so that the seed also control new weights initialization)
    model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", return_dict=True)

    # We could avoid this line since the accelerator is set with `device_placement=True` (default value).
    # Note that if you are placing tensors on devices manually, this line absolutely needs to be before the optimizer
    # creation otherwise training will not work on TPU (`accelerate` will kindly throw an error to make us aware of that).
    model = model.to(accelerator.device)

    # Instantiate optimizer with warmup steps
    optimizer = schedulefree.AdamWScheduleFree(
        model.parameters(),
        lr=lr,
        warmup_steps=100,
    )

    # Prepare everything
    # There is no specific order to remember, we just need to unpack the objects in the same order we gave them to the
    # prepare method.
    model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
        model, optimizer, train_dataloader, eval_dataloader
    )

    # Now we train the model
    for epoch in range(num_epochs):
        # Both the model and the optimizer are switched to train mode before the training pass.
        model.train()
        optimizer.train()
        for step, batch in enumerate(train_dataloader):
            # We could avoid this line since we set the accelerator with `device_placement=True`.
            batch.to(accelerator.device)
            outputs = model(**batch)
            loss = outputs.loss
            # Scale the loss so the accumulated gradients average over the accumulation window.
            loss = loss / gradient_accumulation_steps
            accelerator.backward(loss)
            if step % gradient_accumulation_steps == 0:
                optimizer.step()
                optimizer.zero_grad()

        # Both the model and the optimizer are switched to eval mode before the evaluation pass.
        model.eval()
        optimizer.eval()
        for step, batch in enumerate(eval_dataloader):
            # We could avoid this line since we set the accelerator with `device_placement=True`.
            batch.to(accelerator.device)
            with torch.no_grad():
                outputs = model(**batch)
            predictions = outputs.logits.argmax(dim=-1)
            # Collect predictions and labels from all processes before feeding the metric.
            predictions, references = accelerator.gather_for_metrics((predictions, batch["labels"]))
            metric.add_batch(
                predictions=predictions,
                references=references,
            )

        eval_metric = metric.compute()
        # Use accelerator.print to print only on the main process.
        accelerator.print(f"epoch {epoch}:", eval_metric)
def main():
    """
    Parse the command-line flags and launch training with the example's fixed hyper-parameters.

    Recognized flags:
        --mixed_precision: one of "no", "fp16", "bf16" or "fp8" (defaults to `None`).
        --cpu: if passed, training runs on the CPU.
    """
    parser = argparse.ArgumentParser(description="Simple example of training script.")
    parser.add_argument(
        "--mixed_precision",
        type=str,
        default=None,
        choices=["no", "fp16", "bf16", "fp8"],
        # Adjacent string literals are concatenated verbatim, so every fragment must carry its own
        # separating space; otherwise `--help` prints "Choosebetween" and "1.10.and".
        help="Whether to use mixed precision. Choose "
        "between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >= 1.10 "
        "and an Nvidia Ampere GPU.",
    )
    parser.add_argument("--cpu", action="store_true", help="If passed, will train on the CPU.")
    args = parser.parse_args()
    # Fixed hyper-parameters for this example run.
    config = {"lr": 2e-5, "num_epochs": 3, "seed": 42, "batch_size": 16}
    training_function(config, args)
if __name__ == "__main__":
main()

View File

@ -0,0 +1,25 @@
# Distributed inference examples
This folder contains a variety of tutorials for running distributed inference with the following strategy:
Load an entire model onto each GPU and send chunks of a batch through each GPU's model copy at a time.
## Installation
```bash
pip install accelerate torch
```
## Running code
You can either use `torchrun` or the recommended way of `accelerate launch` (without needing to run `accelerate config`) on each script:
```bash
accelerate launch --num_processes {NUM_GPUS} phi2.py
```
Or:
```bash
torchrun --nproc-per-node {NUM_GPUS} phi2.py
```

View File

@ -0,0 +1,86 @@
# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from accelerate import PartialState
from accelerate.utils import gather_object
# Start up the distributed environment without needing the Accelerator.
distributed_state = PartialState()

# You can change the model to any LLM such as mistralai/Mistral-7B-v0.1 or meta-llama/Llama-2-7b-chat-hf
model_name = "microsoft/phi-2"
# Each process loads its own fp16 copy of the model onto its own device.
model = AutoModelForCausalLM.from_pretrained(
    model_name, device_map=distributed_state.device, torch_dtype=torch.float16
)

tokenizer = AutoTokenizer.from_pretrained(model_name)
# Need to set the padding token to the eos token for generation
tokenizer.pad_token = tokenizer.eos_token

prompts = [
    "I would like to",
    "hello how are you",
    "what is going on",
    "roses are red and",
    "welcome to the hotel",
]

# You can change the batch size depending on your GPU RAM
batch_size = 2
# We set it to 8 since it is better for some hardware. More information here https://github.com/huggingface/tokenizers/issues/991
pad_to_multiple_of = 8

# Split into batches
# We will get the following results:
# [ ["I would like to", "hello how are you"], [ "what is going on", "roses are red and"], [ "welcome to the hotel"] ]
formatted_prompts = [prompts[i : i + batch_size] for i in range(0, len(prompts), batch_size)]

# Apply padding on the left since we are doing generation
padding_side_default = tokenizer.padding_side
tokenizer.padding_side = "left"
# Tokenize each batch
tokenized_prompts = [
    tokenizer(formatted_prompt, padding=True, pad_to_multiple_of=pad_to_multiple_of, return_tensors="pt")
    for formatted_prompt in formatted_prompts
]
# Put back the original padding behavior
tokenizer.padding_side = padding_side_default

completions_per_process = []
# We automatically split the batched data we passed to it across all the processes. We also set apply_padding=True
# so that the GPUs will have the same number of prompts, and you can then gather the results.
# For example, if we have 2 gpus, the distribution will be:
# GPU 0: ["I would like to", "hello how are you"], ["what is going on", "roses are red and"]
# GPU 1: ["welcome to the hotel"], ["welcome to the hotel"] -> this prompt is duplicated to ensure that all gpus have the same number of prompts
with distributed_state.split_between_processes(tokenized_prompts, apply_padding=True) as batched_prompts:
    for batch in batched_prompts:
        # Move the batch to the device
        batch = batch.to(distributed_state.device)
        # We generate the text, decode it and add it to the list completions_per_process
        outputs = model.generate(**batch, max_new_tokens=20)
        generated_text = tokenizer.batch_decode(outputs, skip_special_tokens=True)
        completions_per_process.extend(generated_text)

# We are gathering strings, so we need to use gather_object.
# If you need to gather tensors, you can use gather from accelerate.utils
completions_gather = gather_object(completions_per_process)

# Drop duplicates produced by apply_padding in split_between_processes
# (only the first len(prompts) gathered completions are kept).
completions = completions_gather[: len(prompts)]

distributed_state.print(completions)

View File

@ -0,0 +1,30 @@
# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch
from diffusers import DiffusionPipeline
from accelerate import PartialState # Can also be Accelerator or AcceleratorState
# Load the Stable Diffusion pipeline in fp16 and move it to this process's device.
pipe = DiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16)
distributed_state = PartialState()
pipe.to(distributed_state.device)

# Assume two processes
# On the first GPU, the prompts will be ["a dog", "a cat"],
# and on the second GPU it will be ["a chicken", "a chicken"].
# Make sure to drop the final sample, as it will be a duplicate of the previous one.
with distributed_state.split_between_processes(["a dog", "a cat", "a chicken"], apply_padding=True) as prompt:
    # Each process runs the pipeline only on its own share of the prompts.
    result = pipe(prompt).images

View File

@ -1,3 +1,5 @@
accelerate # used to be installed in Amazon SageMaker environment
evaluate
datasets==2.3.2
datasets==2.3.2
schedulefree
huggingface_hub>=0.20.0

View File

@ -0,0 +1,32 @@
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from manim import *
class Stage0(Scene):
    """Title card: the mascot image shown next to the headline paragraph."""

    def construct(self):
        """Place the mascot and the headline on the scene; nothing is animated."""
        bookie = ImageMobject("mascot_bookie.png")
        bookie.scale(.35)
        bookie.move_to([-3.75, -1, 0])

        headline = Paragraph(
            "Distributed Training,\nHugging Face Accelerate,\nand PyTorch DataLoaders\n\nHow do they all interact?",
            font_size=36,
            line_spacing=1,
            alignment="center",
            weight=BOLD,
        )
        headline.move_to([1.75, .5, 0])

        # Mascot first, then the text, exactly as the two separate `add` calls did.
        self.add(bookie, headline)

View File

@ -0,0 +1,31 @@
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from manim import *
class Stage01(Scene):
    """Static opening slide: mascot picture on the left, title text on the right."""

    def construct(self):
        """Create both mobjects, position them, and add them to the scene."""
        mascot_position = [-3.75, -1, 0]
        title_position = [1.75, .5, 0]

        mascot = ImageMobject("mascot_bookie.png")
        mascot.scale(.35)
        mascot.move_to(mascot_position)

        title_options = {
            "font_size": 36,
            "line_spacing": 1,
            "alignment": "center",
            "weight": BOLD,
        }
        title = Paragraph(
            "Distributed Training,\nHugging Face Accelerate,\nand PyTorch DataLoaders\n\nHow do they all interact?",
            **title_options,
        )
        title.move_to(title_position)

        for mobject in (mascot, title):
            self.add(mobject)

View File

@ -0,0 +1,176 @@
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from manim import *
class Stage2(Scene):
    """Animate the naive setup: one sampler whose output is copied to every GPU."""

    def construct(self):
        """Build the scene and play the "same data on every GPU" animation.

        Steps, in order:
          1. Draw the dataset grid, a code snippet, the DataLoader/Sampler boxes
             and four GPU boxes.
          2. Highlight the last row of the dataset grid and move a marker for
             each of its items onto the sampler, two markers per line.
          3. Move a set of those markers onto each of the four GPU boxes.
          4. Replace the first caption with the closing caption.
        """
        # The dataset items: 4 rows of 8 squares, rows stacked with `arrange(UP)`
        rows = [
            VGroup(*[Rectangle(height=0.25, width=0.25, color="green") for _ in range(8)]).arrange(RIGHT, buff=0)
            for _ in range(4)
        ]
        dataset_recs = VGroup(*rows).arrange(UP, buff=0)
        dataset_text = Text("Dataset", font_size=24)
        dataset = Group(dataset_recs, dataset_text).arrange(DOWN, buff=0.5, aligned_edge=DOWN)
        dataset.move_to([-2, 0, 0])
        self.add(dataset)

        # The snippet iterates the dataloader directly (a DataLoader is not callable).
        code = Code(
            code="dataloader = DataLoader(...)\nfor batch in dataloader:\n\t...",
            tab_width=4,
            background="window",
            language="Python",
            font="Monospace",
            font_size=14,
            corner_radius=.2,
            insert_line_no=False,
            line_spacing=.75,
            style=Code.styles_list[1],
        )
        code.move_to([-3.5, 2.5, 0])
        self.add(code)

        # The dataloader itself
        dataloader = Group(
            Rectangle(color="red", height=2, width=2),
            Text("DataLoader", font_size=24),
        ).arrange(DOWN, buff=.5, aligned_edge=DOWN)
        sampler = Group(
            Rectangle(color="blue", height=1, width=1),
            Text("Sampler", font_size=12),
        ).arrange(DOWN, buff=.25, aligned_edge=DOWN)
        dataloader.move_to([1, 0, 0])
        sampler.move_to([.75, .25, 0])
        self.add(dataloader)
        self.add(sampler)

        # Four labelled GPU boxes in a column at x=4
        gpu_groups = [
            Group(
                Rectangle(color="white", height=1, width=1),
                Text(f"GPU {number}", font_size=12),
            ).arrange(DOWN, buff=.25, aligned_edge=DOWN).move_to([4, y, 0])
            for number, y in enumerate((2, .5, -1, -2.5), start=1)
        ]
        gpus = [group[0] for group in gpu_groups]  # only the rectangles are animated/targeted
        self.add(*gpu_groups)

        # Animate their existence
        self.play(
            *(Create(box, run_time=0.5) for box in gpus),
            Create(dataset_recs, run_time=1),
            Create(sampler[0], run_time=1),
            Create(dataloader[0], run_time=1),
        )

        step_1 = MarkupText(
            "Without any special care, \nthe same data is sent through each sampler, \nand the same samples are spit out on each GPU",
            font_size=18,
        )
        step_1.move_to([0, -2.5, 0])
        self.play(
            Write(step_1, run_time=4),
        )

        # Every marker uses the same highlight colour in this scene.
        highlight_color = "BLUE_E"
        first_animations = []
        second_animations = []
        new_datasets = []
        old_target = None
        buff = 0
        for i, data in enumerate(dataset_recs[-1]):
            dataset_target = (
                Rectangle(height=0.46 / 2, width=0.46 / 2)
                .set_stroke(width=0.)
                .set_fill(highlight_color, opacity=0.7)
            )
            dataset_target.move_to(data)
            dataset_target.generate_target()
            if i % 2 == 0:
                # Even items start a new line on the sampler; each new line uses
                # a buff that is .25 smaller than the previous one.
                old_target = dataset_target.target
                buff -= .25
                dataset_target.target.next_to(sampler, buff=buff, direction=UP, aligned_edge=LEFT)
            else:
                # Odd items sit immediately to the right of the preceding even item.
                dataset_target.target.next_to(old_target, direction=RIGHT, buff=0.01)
            new_datasets.append(dataset_target)
            first_animations.append(data.animate(run_time=0.5).set_stroke(highlight_color))
            second_animations.append(MoveToTarget(dataset_target, run_time=1.5))
        self.play(*first_animations)
        self.play(*second_animations)
        self.wait()

        move_animation = []
        last_gpu = len(gpus) - 1
        for j, gpu in enumerate(gpus):
            buff = 0
            for i, data in enumerate(new_datasets):
                # The markers themselves travel to the last GPU; every other
                # GPU receives a copy.
                if j != last_gpu:
                    data = data.copy()
                data.generate_target()
                if i % 2 == 0:
                    old_target = data.target
                    buff -= .25
                    data.target.next_to(gpu, buff=buff, direction=UP, aligned_edge=LEFT)
                else:
                    data.target.next_to(old_target, direction=RIGHT, buff=0.01)
                move_animation.append(MoveToTarget(data, run_time=1.5))
        self.play(*move_animation)

        self.remove(step_1)
        step_2 = MarkupText(
            "This behavior is undesirable, because we want\neach GPU to see different data for efficient training.",
            font_size=18,
        )
        step_2.move_to([0, -2.5, 0])
        self.play(
            Write(step_2, run_time=2.5),
        )
        self.wait()

View File

@ -0,0 +1,34 @@
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from manim import *
class Stage3(Scene):
    """Static slide naming the two sampler-wrapping approaches."""

    def construct(self):
        """Add three text blocks, each placed below and left-aligned with the previous one."""
        intro = MarkupText(
            f"To combat this, Accelerate employs one of two different\nSampler wrapper methods depending on the scenario:",
            font_size=24,
        )
        intro.move_to([0, 1.5, 0])
        self.add(intro)

        # Option 1: classes highlighted in RED
        sharding = MarkupText(
            f"1. Sharding the dataset before drawing:\n\t● <span fgcolor='{RED}'>IterableDatasetShard</span>\n\t● <span fgcolor='{RED}'>BatchSamplerShard</span>",
            font_size=24,
        )
        sharding.next_to(intro, direction=DOWN, aligned_edge=LEFT)
        self.add(sharding)

        # Option 2: class highlighted in BLUE
        dispatching = MarkupText(
            f"\n\n2. Splitting the batch after drawing:\n\t● <span fgcolor='{BLUE}'>DataLoaderDispatcher</span>",
            font_size=24,
        )
        dispatching.next_to(sharding, direction=DOWN, aligned_edge=LEFT)
        self.add(dispatching)

View File

@ -0,0 +1,52 @@
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from manim import *
class Stage4(Scene):
    """Slide that writes out the definitions of `batch_size` and `global_batch_size`."""

    def construct(self):
        """Lay out four stacked text blocks, then write them one after another."""
        text_size = 18

        intro = MarkupText(
            f"To understand the next part fully, let's define two terms,\n<span fgcolor='{RED}'>`batch_size`</span> and <span fgcolor='{BLUE}'>`global_batch_size`</span>:",
            font_size=text_size,
        )
        intro.move_to([0, 1.5, 0])

        batch_size_def = MarkupText(
            f"\n\n● <span fgcolor='{RED}'>`batch_size`</span>: \n\tThis will be defined as the batch size seen on a given\n\t*individual* GPU",
            font_size=text_size,
        )
        batch_size_def.next_to(intro, direction=DOWN, aligned_edge=LEFT)

        global_batch_size_def = MarkupText(
            f"\n\n● <span fgcolor='{BLUE}'>`global_batch_size`</span>:\n\tThis will be defined as the *total* number of\n\tdifferent items seen in the dataset, across all GPUs",
            font_size=text_size,
        )
        global_batch_size_def.next_to(batch_size_def, direction=DOWN, aligned_edge=LEFT)

        example = MarkupText(
            f"\n\nSo if we have a dataset of 64 items, 8 GPUs, \nand a `batch_size` of 8, each *step* will go through\nthe entire dataset one time as 8*8=64",
            font_size=text_size,
        )
        example.next_to(global_batch_size_def, direction=DOWN, aligned_edge=LEFT)

        # (text block, seconds spent writing it), played strictly in this order
        schedule = (
            (intro, 4),
            (batch_size_def, 4),
            (global_batch_size_def, 4),
            (example, 6),
        )
        for block, seconds in schedule:
            self.play(Write(block, run_time=seconds))
        self.wait()

View File

@ -0,0 +1,203 @@
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from manim import *
class Stage5(Scene):
    """Animate the default sharded setup: each dataset row goes to its own sampler and GPU."""

    def construct(self):
        """Build the scene and play the sharding animation.

        Steps, in order:
          1. Draw the dataset grid (one colour per row), a code snippet, four
             sampler boxes and four output boxes.
          2. Write the caption, recolour the grid outlines, and move one filled
             marker per dataset item into the sampler box assigned to its row
             (two lines of four markers per box).
          3. Move the same markers from the sampler boxes into the matching
             output boxes, keeping the same layout.
        """
        # The dataset items: 4 rows of 8 squares, one colour per row
        colors = ["BLUE_E", "DARK_BROWN", "GOLD_E", "GRAY_A"]
        rows = [
            VGroup(*[Rectangle(height=0.25, width=0.25, color=colors[j]) for _ in range(8)]).arrange(RIGHT, buff=0)
            for j in range(4)
        ]
        dataset_recs = VGroup(*rows).arrange(UP, buff=0)
        dataset_text = Text("Dataset", font_size=24)
        dataset = Group(dataset_recs, dataset_text).arrange(DOWN, buff=0.5, aligned_edge=DOWN)
        dataset.move_to([-2, 0, 0])
        self.add(dataset)

        code = Code(
            code="# We enable this by default\naccelerator = Accelerator()\ndataloader = DataLoader(...)\ndataloader = accelerator.prepare(dataloader)\nfor batch in dataloader:\n\t...",
            tab_width=4,
            background="window",
            language="Python",
            font="Monospace",
            font_size=14,
            corner_radius=.2,
            insert_line_no=False,
            line_spacing=.75,
            style=Code.styles_list[1],
        )
        code.move_to([-3.5, 2.5, 0])
        self.add(code)

        # One labelled sampler box per GPU, in a column at x=2
        sampler_groups = [
            Group(
                Rectangle(color="blue", height=1, width=1),
                Text(f"Sampler GPU {number}", font_size=12),
            ).arrange(DOWN, buff=.25, aligned_edge=DOWN)
            for number in range(1, 5)
        ]
        for group, y in zip(sampler_groups, (2, .5, -1., -2.5)):
            group.move_to([2, y, 0])
        self.add(*sampler_groups)
        samplers = [group[0] for group in sampler_groups]

        # One labelled output box per GPU, in a column at x=4.5
        gpu_groups = [
            Group(
                Rectangle(color="white", height=1, width=1),
                Text(f"Output GPU {number}", font_size=12),
            ).arrange(DOWN, buff=.25, aligned_edge=DOWN).move_to([4.5, y, 0])
            for number, y in enumerate((2, .5, -1, -2.5), start=1)
        ]
        gpus = [group[0] for group in gpu_groups]
        self.add(*gpu_groups)

        # Animate their existence
        self.play(
            *(Create(box, run_time=1) for box in gpus),
            Create(dataset_recs, run_time=1),
            *(Create(box, run_time=1) for box in samplers),
        )

        first_animations = []
        second_animations = []
        new_datasets = []
        old_target = None
        for i, row_data in enumerate(dataset_recs):
            current_color = colors[i]
            # Row 0 maps to the last sampler box and row 3 to the first one.
            sampler_box = samplers[len(samplers) - 1 - i]
            new_row = []
            for j, indiv_data in enumerate(row_data):
                dataset_target = (
                    Rectangle(height=0.46 / 2, width=0.46 / 2)
                    .set_stroke(width=0.)
                    .set_fill(current_color, opacity=0.7)
                )
                dataset_target.move_to(indiv_data)
                dataset_target.generate_target()
                if j % 4 == 0:
                    # Items 0 and 4 start a line at the box's top-left corner...
                    dataset_target.target.next_to(
                        sampler_box.get_corner(UP + LEFT), buff=.02, direction=RIGHT + DOWN,
                    )
                    if j % 8 != 0:
                        # ...and item 4 is pushed down by .25 to form the second line.
                        dataset_target.target.set_y(dataset_target.target.get_y() - .25)
                else:
                    # Every other item sits right of the item placed just before it.
                    dataset_target.target.next_to(old_target, direction=RIGHT, buff=0.02)
                old_target = dataset_target.target
                new_row.append(dataset_target)
                first_animations.append(indiv_data.animate(run_time=0.5).set_stroke(current_color))
                second_animations.append(MoveToTarget(dataset_target, run_time=1.5))
            new_datasets.append(new_row)

        step_1 = MarkupText(
            "Since we splice the dataset between each GPU,\nthe models weights can be averaged during `backward()`\nActing as though we did one giant epoch\nvery quickly.",
            font_size=18,
        )
        step_1.move_to([-2.5, -2, 0])
        self.play(
            Write(step_1, run_time=3),
        )
        self.play(
            *first_animations,
        )
        self.play(*second_animations)
        self.wait(duration=.5)

        move_animation = []
        for i, row in enumerate(new_datasets):
            # Same reversed row-to-box mapping as for the sampler boxes.
            gpu_box = gpus[len(gpus) - 1 - i]
            for j, indiv_data in enumerate(row):
                indiv_data.generate_target()
                if j % 4 == 0:
                    indiv_data.target.next_to(
                        gpu_box.get_corner(UP + LEFT), buff=.02, direction=RIGHT + DOWN,
                    )
                    if j % 8 != 0:
                        indiv_data.target.set_y(indiv_data.target.get_y() - .25)
                else:
                    indiv_data.target.next_to(old_target, direction=RIGHT, buff=0.02)
                old_target = indiv_data.target
                move_animation.append(MoveToTarget(indiv_data, run_time=1.5))
        self.play(*move_animation)
        self.wait()

View File

@ -0,0 +1,193 @@
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from manim import *
class Stage6(Scene):
    """Animate the sharded setup with shuffling: items reach each output box in random order."""

    def construct(self):
        """Build the scene and play the shuffled sharding animation.

        Steps, in order:
          1. Draw the dataset grid (one colour per row), a code snippet, four
             sampler boxes and four output boxes.
          2. Write the caption, recolour the grid outlines, and move one filled
             marker per dataset item into the sampler box assigned to its row.
          3. Randomly reorder the markers of every row, then move them into the
             matching output box, laid out in that new order.
        """
        # Only needed for the per-row shuffle below.
        import random

        # The dataset items: 4 rows of 8 squares, one colour per row
        colors = ["BLUE_E", "DARK_BROWN", "GOLD_E", "GRAY_A"]
        rows = [
            VGroup(*[Rectangle(height=0.25, width=0.25, color=colors[j]) for _ in range(8)]).arrange(RIGHT, buff=0)
            for j in range(4)
        ]
        dataset_recs = VGroup(*rows).arrange(UP, buff=0)
        dataset_text = Text("Dataset", font_size=24)
        dataset = Group(dataset_recs, dataset_text).arrange(DOWN, buff=0.5, aligned_edge=DOWN)
        dataset.move_to([-2, 0, 0])
        self.add(dataset)

        code = Code(
            code="# We enable this by default\naccelerator = Accelerator()\ndataloader = DataLoader(..., shuffle=True)\ndataloader = accelerator.prepare(dataloader)\nfor batch in dataloader:\n\t...",
            tab_width=4,
            background="window",
            language="Python",
            font="Monospace",
            font_size=14,
            corner_radius=.2,
            insert_line_no=False,
            line_spacing=.75,
            style=Code.styles_list[1],
        )
        code.move_to([-3.5, 2.5, 0])
        self.add(code)

        # One labelled sampler box per GPU, in a column at x=2
        sampler_groups = [
            Group(
                Rectangle(color="blue", height=1, width=1),
                Text(f"Sampler GPU {number}", font_size=12),
            ).arrange(DOWN, buff=.25, aligned_edge=DOWN)
            for number in range(1, 5)
        ]
        for group, y in zip(sampler_groups, (2, .5, -1., -2.5)):
            group.move_to([2, y, 0])
        self.add(*sampler_groups)
        samplers = [group[0] for group in sampler_groups]

        # One labelled output box per GPU, in a column at x=4.5
        gpu_groups = [
            Group(
                Rectangle(color="white", height=1, width=1),
                Text(f"Output GPU {number}", font_size=12),
            ).arrange(DOWN, buff=.25, aligned_edge=DOWN).move_to([4.5, y, 0])
            for number, y in enumerate((2, .5, -1, -2.5), start=1)
        ]
        gpus = [group[0] for group in gpu_groups]
        self.add(*gpu_groups)

        first_animations = []
        second_animations = []
        new_datasets = []
        old_target = None
        for i, row_data in enumerate(dataset_recs):
            current_color = colors[i]
            # Row 0 maps to the last sampler box and row 3 to the first one.
            sampler_box = samplers[len(samplers) - 1 - i]
            new_row = []
            for j, indiv_data in enumerate(row_data):
                dataset_target = (
                    Rectangle(height=0.46 / 2, width=0.46 / 2)
                    .set_stroke(width=0.)
                    .set_fill(current_color, opacity=0.7)
                )
                dataset_target.move_to(indiv_data)
                dataset_target.generate_target()
                if j % 4 == 0:
                    # Items 0 and 4 start a line at the box's top-left corner...
                    dataset_target.target.next_to(
                        sampler_box.get_corner(UP + LEFT), buff=.02, direction=RIGHT + DOWN,
                    )
                    if j % 8 != 0:
                        # ...and item 4 is pushed down by .25 to form the second line.
                        dataset_target.target.set_y(dataset_target.target.get_y() - .25)
                else:
                    # Every other item sits right of the item placed just before it.
                    dataset_target.target.next_to(old_target, direction=RIGHT, buff=0.02)
                old_target = dataset_target.target
                new_row.append(dataset_target)
                first_animations.append(indiv_data.animate(run_time=0.5).set_stroke(current_color))
                second_animations.append(MoveToTarget(dataset_target, run_time=1.5))
            new_datasets.append(new_row)

        step_1 = MarkupText(
            "During shuffling, each mini-batch's\noutput order will be modified",
            font_size=18,
        )
        step_1.move_to([-1.5, -2, 0])
        self.play(
            Write(step_1, run_time=3),
        )
        self.play(
            *first_animations,
        )
        self.play(*second_animations)
        self.wait(duration=.5)

        move_animation = []
        for i, row in enumerate(new_datasets):
            # Visit the row's markers in a random order; the layout below is
            # driven by the position in this shuffled order.
            row = [row[k] for k in random.sample(range(len(row)), len(row))]
            gpu_box = gpus[len(gpus) - 1 - i]
            for j, indiv_data in enumerate(row):
                indiv_data.generate_target()
                if j % 4 == 0:
                    indiv_data.target.next_to(
                        gpu_box.get_corner(UP + LEFT), buff=.02, direction=RIGHT + DOWN,
                    )
                    if j % 8 != 0:
                        indiv_data.target.set_y(indiv_data.target.get_y() - .25)
                else:
                    indiv_data.target.next_to(old_target, direction=RIGHT, buff=0.02)
                old_target = indiv_data.target
                move_animation.append(MoveToTarget(indiv_data, run_time=1.5))
        self.play(*move_animation)
        self.wait()

View File

@ -0,0 +1,182 @@
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from manim import *
class Stage7(Scene):
    """Animate `dispatch_batches=True`: all items pass through one sampler box, then fan out."""

    def construct(self):
        """Build the scene and play the dispatcher animation.

        Steps, in order:
          1. Draw a code snippet, the dataset grid (one colour per row), four
             sampler boxes and four output boxes, then write the caption.
          2. Recolour the grid outlines and move one half-height marker per
             dataset item into the first sampler box, stacking the rows.
          3. Move the markers of each row from that box into the output box
             assigned to the row.
        """
        code = Code(
            code="accelerator = Accelerator(dispatch_batches=True)\ndataloader = DataLoader(...)\ndataloader = accelerator.prepare(dataloader)\nfor batch in dataloader:\n\t...",
            tab_width=4,
            background="window",
            language="Python",
            font="Monospace",
            font_size=14,
            corner_radius=.2,
            insert_line_no=False,
            line_spacing=.75,
            style=Code.styles_list[1],
        )
        code.move_to([-3.5, 2.5, 0])
        self.add(code)

        # The dataset items: 4 rows of 8 squares, one colour per row
        colors = ["BLUE_E", "DARK_BROWN", "GOLD_E", "GRAY_A"]
        rows = [
            VGroup(*[Rectangle(height=0.25, width=0.25, color=colors[j]) for _ in range(8)]).arrange(RIGHT, buff=0)
            for j in range(4)
        ]
        dataset_recs = VGroup(*rows).arrange(UP, buff=0)
        dataset_text = Text("Dataset", font_size=24)
        dataset = Group(dataset_recs, dataset_text).arrange(DOWN, buff=0.5, aligned_edge=DOWN)
        dataset.move_to([-2, 0, 0])
        self.add(dataset)

        # One labelled sampler box per GPU, in a column at x=2
        sampler_groups = [
            Group(
                Rectangle(color="blue", height=1.02, width=1.02),
                Text(f"Sampler GPU {number}", font_size=12),
            ).arrange(DOWN, buff=.25, aligned_edge=DOWN)
            for number in range(1, 5)
        ]
        for group, y in zip(sampler_groups, (2, .5, -1., -2.5)):
            group.move_to([2, y, 0])
        self.add(*sampler_groups)
        samplers = [group[0] for group in sampler_groups]

        # One labelled output box per GPU, in a column at x=4.5
        gpu_groups = [
            Group(
                Rectangle(color="white", height=1.02, width=.98),
                Text(f"Output GPU {number}", font_size=12),
            ).arrange(DOWN, buff=.25, aligned_edge=DOWN).move_to([4.5, y, 0])
            for number, y in enumerate((2, .5, -1, -2.5), start=1)
        ]
        gpus = [group[0] for group in gpu_groups]
        self.add(*gpu_groups)

        step_1 = MarkupText(
            "When using a `DataLoaderDispatcher`, all\nof the samples are collected from GPU 0's dataset,\nthen divided and sent to each GPU.\nAs a result, this will be slower.",
            font_size=18,
        )
        step_1.move_to([-2.5, -2, 0])
        self.play(
            Write(step_1, run_time=3.5),
        )

        first_animations = []
        second_animations = []
        new_datasets = []
        old_target = None
        collector = samplers[0]  # every row is gathered in the first sampler box
        for i, row_data in enumerate(dataset_recs):
            current_color = colors[i]
            new_row = []
            for j, indiv_data in enumerate(row_data):
                dataset_target = (
                    Rectangle(height=0.46 / 4, width=0.46 / 2)
                    .set_stroke(width=0.)
                    .set_fill(current_color, opacity=0.7)
                )
                dataset_target.move_to(indiv_data)
                dataset_target.generate_target()
                if j % 8 == 0:
                    # First line of this row: start at the bottom-left corner,
                    # raised by .25 per dataset row.
                    dataset_target.target.next_to(
                        collector.get_corner(DOWN + LEFT), buff=0.0125, direction=RIGHT + UP,
                    )
                    dataset_target.target.set_y(dataset_target.target.get_y() + (.25 * i))
                elif j % 4 == 0:
                    # Second line of this row: same start, raised a further .125.
                    dataset_target.target.next_to(
                        collector.get_corner(DOWN + LEFT), buff=0.0125, direction=RIGHT + UP,
                    )
                    dataset_target.target.set_y(dataset_target.target.get_y() + .125 + (.25 * i))
                else:
                    # Every other item sits right of the item placed just before it.
                    dataset_target.target.next_to(old_target, direction=RIGHT, buff=0.0125)
                old_target = dataset_target.target
                new_row.append(dataset_target)
                first_animations.append(indiv_data.animate(run_time=0.5).set_stroke(current_color))
                second_animations.append(MoveToTarget(dataset_target, run_time=1.5))
            new_datasets.append(new_row)
        self.play(
            *first_animations,
        )
        self.play(*second_animations)

        move_animation = []
        for i, row in enumerate(new_datasets):
            # Row 0 maps to the last output box and row 3 to the first one.
            gpu_box = gpus[len(gpus) - 1 - i]
            for j, indiv_data in enumerate(row):
                indiv_data.generate_target()
                # NOTE(review): the builder returned by `.animate` is discarded and never
                # played. Presumably manim applies the call to `indiv_data.target`
                # right away, which would restore the marker's full height - confirm
                # before replacing this with an explicit call on `.target`.
                indiv_data.animate.stretch_to_fit_height(0.46 / 2)
                if j % 4 == 0:
                    indiv_data.target.next_to(
                        gpu_box.get_corner(UP + LEFT), buff=.01, direction=RIGHT + DOWN,
                    )
                    if j % 8 == 0:
                        # In this scene the first line is the one pushed down by .25.
                        indiv_data.target.set_y(indiv_data.target.get_y() - .25)
                else:
                    indiv_data.target.next_to(old_target, direction=RIGHT, buff=0.01)
                old_target = indiv_data.target
                move_animation.append(MoveToTarget(indiv_data, run_time=1.5))
        self.play(*move_animation)
        self.wait()

View File

@ -25,17 +25,18 @@ extras["docs"] = []
extras["test_prod"] = ["pytest>=7.2.0,<=8.0.0", "pytest-xdist", "pytest-subtests", "parameterized"]
extras["test_dev"] = [
"datasets",
"diffusers",
"evaluate",
"torchpippy>=0.2.0",
"transformers",
"scipy",
"scikit-learn",
"deepspeed",
"tqdm",
"bitsandbytes",
"timm",
]
extras["testing"] = extras["test_prod"] + extras["test_dev"]
extras["deepspeed"] = ["deepspeed<=0.14.0"]
extras["rich"] = ["rich"]
extras["test_trackers"] = ["wandb", "comet-ml", "tensorboard", "dvclive"]
@ -47,7 +48,7 @@ extras["sagemaker"] = [
setup(
name="accelerate",
version="0.29.0.dev",
version="0.31.0.dev0",
description="Accelerate",
long_description=open("README.md", encoding="utf-8").read(),
long_description_content_type="text/markdown",

View File

@ -11,7 +11,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
__version__ = "0.29.0.dev0"
__version__ = "0.30.1.dev0"
from .accelerator import Accelerator
from .big_modeling import (

View File

@ -79,6 +79,7 @@ from .utils import (
is_deepspeed_available,
is_fp8_available,
is_ipex_available,
is_lomo_available,
is_megatron_lm_available,
is_mlu_available,
is_msamp_available,
@ -215,7 +216,7 @@ class Accelerator:
project_dir (`str`, `os.PathLike`, *optional*):
A path to a directory for storing data such as logs of locally-compatible loggers and potentially saved
checkpoints.
step_scheduler_with_optimizer (`bool`, *optional`, defaults to `True`):
step_scheduler_with_optimizer (`bool`, *optional*, defaults to `True`):
Set `True` if the learning rate scheduler is stepped at the same time as the optimizer, `False` if only
done under certain circumstances (at the end of each epoch, for instance).
kwargs_handlers (list of [`~utils.KwargsHandler`], *optional*)
@ -340,6 +341,8 @@ class Accelerator:
self.init_handler = None
self.fp8_recipe_handler = None
self.autocast_handler = None
self.has_lomo_optimizer = False
if kwargs_handlers is not None:
for handler in kwargs_handlers:
assert isinstance(
@ -383,8 +386,15 @@ class Accelerator:
**kwargs,
)
if self.fp8_recipe_handler is None and self.state.mixed_precision == "fp8":
self.fp8_recipe_handler = FP8RecipeKwargs(backend="MSAMP" if is_msamp_available() else "TE")
self.delayed_fp8_autocast = False
if self.fp8_recipe_handler is not None:
# We already check if FP8 is available during `self.state`
if self.state.mixed_precision != "fp8":
raise ValueError("Passing in a `FP8RecipeKwargs` object requires setting `mixed_precision='fp8'`.")
self.delayed_fp8_autocast = self.fp8_recipe_handler.backend == "TE" and self.distributed_type in (
DistributedType.MULTI_GPU,
DistributedType.FSDP,
)
trackers = filter_trackers(log_with, self.logging_dir)
if len(trackers) < 1 and log_with is not None:
@ -450,7 +460,7 @@ class Accelerator:
and self.distributed_type not in (DistributedType.DEEPSPEED, DistributedType.MEGATRON_LM)
):
self.native_amp = True
if self.device.type not in ("xpu", "cuda", "mps", "npu", "xla", "mlu") or is_torch_xla_available(
if self.device.type not in ("xpu", "cuda", "npu", "xla", "mlu") or is_torch_xla_available(
check_is_tpu=True
):
raise ValueError(f"fp16 mixed precision requires a GPU (not {self.device.type!r}).")
@ -479,6 +489,10 @@ class Accelerator:
if mixed_precision == "bf16" and not self.native_amp and not is_torch_xla_available():
raise ValueError("bf16 mixed precision requires PyTorch >= 1.10 and a supported device.")
elif self.state.mixed_precision == "fp8":
# We always enable `native_amp` for FP8
self.native_amp = True
# Start of internal step tracking
self.step = 0
@ -550,6 +564,10 @@ class Accelerator:
def use_seedable_sampler(self):
return self.dataloader_config.use_seedable_sampler
@property
def non_blocking(self):
return self.dataloader_config.non_blocking
@property
def project_dir(self):
return self.project_configuration.project_dir
@ -1345,18 +1363,22 @@ class Accelerator:
model.forward = MethodType(convert_outputs_to_fp32(model.forward.__func__), model)
else:
model.forward = convert_outputs_to_fp32(new_forward)
elif self.mixed_precision == "fp8" and self.fp8_recipe_handler.backend == "TE":
# We prepare fp8 after, allowing for bf16 autocast to happen first
if getattr(self.fp8_recipe_handler, "backend", None) == "TE":
if not has_transformer_engine_layers(model):
with torch.no_grad():
convert_model(model)
model._converted_to_transformer_engine = True
model._original_forward = model.forward
kwargs = self.fp8_recipe_handler.to_kwargs() if self.fp8_recipe_handler is not None else {}
if "fp8_format" in kwargs:
kwargs["fp8_format"] = getattr(te_recipe.Format, kwargs["fp8_format"])
fp8_recipe = te_recipe.DelayedScaling(**kwargs)
model.forward = fp8_autocast(enabled=True, fp8_recipe=fp8_recipe)(model.forward)
# If we are in DDP or FSDP, we delay `autocast` until after FSDP/DDP has been initialized
# to make use of the process group
if not self.delayed_fp8_autocast:
model.forward = fp8_autocast(enabled=True, fp8_recipe=fp8_recipe)(model.forward)
if (getattr(model, "is_loaded_in_8bit", False) or getattr(model, "is_loaded_in_4bit", False)) and getattr(
model, "hf_device_map", False
@ -1368,16 +1390,19 @@ class Accelerator:
" In order to use 8-bit models that have been loaded across multiple GPUs the solution is to use Naive Pipeline Parallelism."
" Therefore you should not specify that you are under any distributed regime in your accelerate config."
)
current_device = list(model_devices)[0]
current_device_index = current_device.index if isinstance(current_device, torch.device) else current_device
elif len(model_devices) == 1:
current_device = list(model_devices)[0]
current_device_index = (
current_device.index if isinstance(current_device, torch.device) else current_device
)
if torch.device(current_device_index) != self.device:
# if on the first device (GPU 0) we don't care
if (self.device.index is not None) or (current_device_index != 0):
raise ValueError(
"You can't train a model that has been loaded in 8-bit precision on a different device than the one "
"you're training on. Make sure you loaded the model on the correct device using for example `device_map={'':torch.cuda.current_device() or device_map={'':torch.xpu.current_device()}"
)
if torch.device(current_device_index) != self.device:
# if on the first device (GPU 0) we don't care
if (self.device.index is not None) or (current_device_index != 0):
raise ValueError(
"You can't train a model that has been loaded in 8-bit precision on a different device than the one "
"you're training on. Make sure you loaded the model on the correct device using for example `device_map={'':torch.cuda.current_device() or device_map={'':torch.xpu.current_device()}"
)
if "cpu" in model_devices or "disk" in model_devices:
raise ValueError(
@ -1447,6 +1472,73 @@ class Accelerator:
),
auto_wrap_policy=fsdp_plugin.auto_wrap_policy,
)
# In the event the model had been loaded in low precision, but
# mixed precision had also been activated, then we follow DeepSpeed's
# strategy to hold the parameters in full precision.
# - assume that trainer.args.bf16 and trainer.args.fp16 are already checked against
# fsdp_plugin.mixed_precision_policy.
# - NOTE: we do not check the mixed_precision attribute on the FSDP root wrapper.
# * this attribute will always set by init_utils.init_core_state so its always not None.
# * mixed_precision.param_dtype only regards _fwd_bwd_param_dtype
# * if model is loaded in 16bit, and even if mixed_precision.param_dtype is None,
# we sill want to upcast the flat_param.
if self.mixed_precision != "no": # if mixed precision is set
upcasted_log = []
for module in FSDP.fsdp_modules(model):
# Referencing DeepSpeed Zero3
# - in Init, params are converted to 16bit while partitioning.
# - in accelerator.prepare, deepspeed.initalize is called to:
# * creates the DeepSpeeedEngine.
# * since zero_optimization() is True , calls engine._configure_zero_optimizer.
#
# Inside the DeepSpeed Zero3 optimizer configuration, which initalizes
# DeepSpeedZeroOptimizer_Stage3, during which:
# * trainable_param_groups are obtained from the attached optimizer
# (already partitioned in 16bit).
# * then _setup_for_real_optimizer -> _create_fp32_partitions
# which performs the fp32 upcasting.
# To mimick DeepSeepds's casting in FSDP, we look at the (single) FlatParameter held
# within an FSDP wrapper. This FlatParameter will be seen by the optimizer.
# - even though there is a torch.device('meta') guard below, we
# expect _init_utils._init_param_handle_from_module to already
# sync the parameter.
if not module._has_params:
continue # skip if FSDP module not managing parameters
param = module._flat_param
if (
param.dtype != torch.float32
and param.device != torch.device("meta")
and param.requires_grad
):
# keep log of names_params that was upcasted
# NOTE: resorted to this because warnings.simplefilter("once") is somehow not working
name_param_log = (module.module.__class__.__name__, ", ".join(module._flat_param._fqns))
if name_param_log not in upcasted_log:
upcasted_log.append(name_param_log)
# this works because of FSDP's _runtime_utils.lazy_init.
# Have to be careful not to call anything before this that
# triggers lazy_init (e.g., _is_fsdp_root).
param.data = param.data.to(torch.float32) # upcasting
module._handle._orig_param_dtype = torch.float32 # update
# report the warnings
# some messages can be quite repetitive, especially when reporting about layers that have identical architecture.
if self.is_main_process:
for name_log, param_log in upcasted_log:
warnings.warn(
f"Upcasted low precision parameters in {name_log} because mixed precision turned on in FSDP. "
f"Affects: {param_log}."
)
if len(upcasted_log) > 0:
warnings.warn(
"FSDP upcast of low precision parameters may affect the precision of model checkpoints."
)
# if the previous and current models are same, delete the previous one
if len(self._models) > 1 and (self._models[-2] is self._models[-1]):
del self._models[-2]
@ -1456,6 +1548,11 @@ class Accelerator:
model = torch.nn.parallel.DistributedDataParallel(model, **kwargs)
elif self.distributed_type == DistributedType.XLA and self.state.fork_launched:
model = xmp.MpModelWrapper(model).to(self.device)
# Now we can apply the FP8 autocast
if self.delayed_fp8_autocast:
model.forward = fp8_autocast(enabled=True, fp8_recipe=fp8_recipe, fp8_group=model.process_group)(
model.forward
)
# torch.compile should be called last and only if the model isn't already compiled.
if self.state.dynamo_plugin.backend != DynamoBackend.NO and not is_compiled_module(model):
if not is_torch_version(">=", "2.0"):
@ -1571,6 +1668,8 @@ class Accelerator:
)
if model is not None:
# if the model is an MOE, set the appropriate MOE layers as leaf Z3 modules
deepspeed_plugin.set_moe_leaf_modules(model)
# deal with config keys that use `auto` value and rely on model's hidden_size
hidden_size_based_keys = [
"zero_optimization.reduce_bucket_size",
@ -1904,6 +2003,7 @@ class Accelerator:
even_batches=self.even_batches,
slice_fn_for_dispatch=slice_fn_for_dispatch,
use_seedable_sampler=self.use_seedable_sampler,
non_blocking=self.non_blocking,
)
self._dataloaders.append(prepared_data_loader)
return prepared_data_loader
@ -1930,6 +2030,14 @@ class Accelerator:
>>> optimizer = accelerator.prepare_optimizer(optimizer, device_placement=True)
```
"""
if is_lomo_available():
# We need to import locally to avoid circular imports since lomo imports stuff from
# transformers & accelerate
from lomo_optim import AdaLomo, Lomo
# Support multiple optimizers: https://github.com/huggingface/accelerate/pull/2695#discussion_r1589164607
self.has_lomo_optimizer |= isinstance(optimizer, (Lomo, AdaLomo))
# Ensure we can't double wrap an optimizer due to `find_batch_size`
if getattr(optimizer, "_is_accelerate_prepared", False):
if optimizer not in self._optimizers:
@ -2000,6 +2108,8 @@ class Accelerator:
>>> accelerator.backward(loss)
```
"""
learning_rate = kwargs.get("learning_rate")
if self.distributed_type != DistributedType.DEEPSPEED:
# deepspeed handles loss scaling by gradient_accumulation_steps in its `backward`
loss = loss / self.gradient_accumulation_steps
@ -2009,6 +2119,8 @@ class Accelerator:
return
elif self.scaler is not None:
self.scaler.scale(loss).backward(**kwargs)
elif learning_rate is not None and self.has_lomo_optimizer:
self.lomo_backward(loss, learning_rate)
else:
loss.backward(**kwargs)
@ -2216,7 +2328,7 @@ class Accelerator:
"""
return gather(tensor)
def gather_for_metrics(self, input_data):
def gather_for_metrics(self, input_data, use_gather_object=False):
"""
Gathers `input_data` and potentially drops duplicates in the last batch if on a distributed system. Should be
used for gathering the inputs and targets for metric calculation.
@ -2224,6 +2336,11 @@ class Accelerator:
Args:
input (`torch.Tensor`, `object`, a nested tuple/list/dictionary of `torch.Tensor`, or a nested tuple/list/dictionary of `object`):
The tensors or objects for calculating metrics across all processes
use_gather_object(`bool`):
Whether to forcibly use gather_object instead of gather (which is already done if all objects passed do
not contain tensors). This flag can be useful for gathering tensors with different sizes that we don't
want to pad and concatenate along the first dimension. Using it with GPU tensors is not well supported
and inefficient as it incurs GPU -> CPU transfer since tensors would be pickled.
Example:
@ -2248,7 +2365,9 @@ class Accelerator:
except TypeError:
all_tensors = False
if not all_tensors:
use_gather_object = use_gather_object or not all_tensors
if use_gather_object:
data = gather_object(input_data)
else:
data = self.gather(input_data)
@ -2267,7 +2386,11 @@ class Accelerator:
def _adjust_samples(tensor):
return tensor[: self.gradient_state.remainder]
return recursively_apply(_adjust_samples, data)
if use_gather_object:
# gather_object put the objects in a list
return _adjust_samples(data)
else:
return recursively_apply(_adjust_samples, data)
else: # remainder is 0
# no remainder even though at end of dataloader, so nothing to do.
return data
@ -2780,7 +2903,7 @@ class Accelerator:
for i, model in enumerate(self._models):
if self.distributed_type == DistributedType.FSDP:
logger.info("Saving FSDP model")
save_fsdp_model(self.state.fsdp_plugin, self, model, output_dir, i)
save_fsdp_model(self.state.fsdp_plugin, model, output_dir, i)
logger.info(f"FSDP Model saved to output dir {output_dir}")
elif self.distributed_type == DistributedType.DEEPSPEED:
logger.info("Saving DeepSpeed Model and Optimizer")
@ -2799,7 +2922,7 @@ class Accelerator:
if self.distributed_type == DistributedType.FSDP:
for i, opt in enumerate(self._optimizers):
logger.info("Saving FSDP Optimizer")
save_fsdp_optimizer(self.state.fsdp_plugin, self, opt, self._models[i], output_dir, i)
save_fsdp_optimizer(self.state.fsdp_plugin, opt, self._models[i], output_dir, i)
logger.info(f"FSDP Optimizer saved to output dir {output_dir}")
elif self.distributed_type not in [DistributedType.DEEPSPEED, DistributedType.MEGATRON_LM]:
optimizers = self._optimizers
@ -2924,7 +3047,7 @@ class Accelerator:
for i, model in enumerate(self._models):
if self.distributed_type == DistributedType.FSDP:
logger.info("Loading FSDP model")
load_fsdp_model(self.state.fsdp_plugin, self, model, input_dir, i)
load_fsdp_model(self.state.fsdp_plugin, model, input_dir, i)
logger.info(f"FSDP Model loaded from input dir {input_dir}")
elif self.distributed_type == DistributedType.DEEPSPEED:
logger.info("Loading DeepSpeed Model and Optimizer")
@ -2943,7 +3066,7 @@ class Accelerator:
if self.distributed_type == DistributedType.FSDP:
for i, opt in enumerate(self._optimizers):
logger.info("Loading FSDP Optimizer")
load_fsdp_optimizer(self.state.fsdp_plugin, self, opt, self._models[i], input_dir, i)
load_fsdp_optimizer(self.state.fsdp_plugin, opt, self._models[i], input_dir, i)
logger.info(f"FSDP Optimizer loaded from input dir {input_dir}")
elif self.distributed_type not in [DistributedType.DEEPSPEED, DistributedType.MEGATRON_LM]:
optimizers = self._optimizers
@ -3002,7 +3125,7 @@ class Accelerator:
for index, obj in enumerate(self._custom_objects):
load_custom_state(obj, input_dir, index)
def free_memory(self):
def free_memory(self, *objects):
"""
Will release all references to the internal objects stored and call the garbage collector. You should call this
method between two trainings with different models/optimizers. Also will reset `Accelerator.step` to 0.
@ -3015,19 +3138,23 @@ class Accelerator:
>>> accelerator = Accelerator()
>>> model, optimizer, scheduler = ...
>>> model, optimizer, scheduler = accelerator.prepare(model, optimizer, scheduler)
>>> accelerator.free_memory()
>>> del model, optimizer, scheduler
>>> model, optimizer, scheduler = accelerator.free_memory(model, optimizer, scheduler)
```
"""
# Deepspeed needs a bit more prep that should be done first
if hasattr(self, "deepspeed_engine_wrapped"):
if self.deepspeed_engine_wrapped is not None:
self.deepspeed_engine_wrapped.engine.destroy()
self.deepspeed_engine_wrapped = None
objects = release_memory(*objects)
self._schedulers = []
self._optimizers = []
self._models = []
self._dataloaders = []
self.deepspeed_engine_wrapped = None
self.step = 0
release_memory()
return objects
def clear(self):
def clear(self, *objects):
"""
Alias for [`Accelerate.free_memory`], releases all references to the internal objects stored and call the
garbage collector. You should call this method between two trainings with different models/optimizers.
@ -3040,11 +3167,10 @@ class Accelerator:
>>> accelerator = Accelerator()
>>> model, optimizer, scheduler = ...
>>> model, optimizer, scheduler = accelerator.prepare(model, optimizer, scheduler)
>>> accelerator.free_memory()
>>> del model, optimizer, scheduler
>>> model, optimizer, scheduler = accelerator.clear(model, optimizer, scheduler)
```
"""
self.free_memory()
return self.free_memory(*objects)
def _get_named_parameters(self, *args):
named_parameters = {}
@ -3257,3 +3383,27 @@ class Accelerator:
return True
return False
def lomo_backward(self, loss: torch.Tensor, learning_rate: float) -> None:
    """
    Runs backward pass on LOMO optimizers.

    Args:
        loss (`torch.Tensor`):
            The loss passed to `fused_backward` of every prepared LOMO optimizer.
        learning_rate (`float`):
            The learning rate passed to `fused_backward` of every prepared LOMO optimizer.

    Raises:
        `ValueError`: If `learning_rate` is `None`, or if no optimizer stored in `self._optimizers` wraps a
        `Lomo`/`AdaLomo` optimizer (which is necessarily the case when `is_lomo_available()` is `False`).
    """
    # Validate the arguments first so that a bad call fails before any import is attempted
    if learning_rate is None:
        raise ValueError("A learning rate must be passed in order to call backward pass with LOMO optimizers.")

    if is_lomo_available():
        # We need to import locally to avoid circular imports since lomo imports stuff from
        # transformers & accelerate
        from lomo_optim import AdaLomo, Lomo

        lomo_classes = (Lomo, AdaLomo)
    else:
        # Without LOMO no optimizer can be a LOMO optimizer. `isinstance(obj, ())` is always `False`, so we fall
        # through to the explicit `ValueError` below instead of failing on the unbound `Lomo`/`AdaLomo` names.
        lomo_classes = ()

    _backward_called = False

    for optimizer in self._optimizers:
        if isinstance(optimizer.optimizer, lomo_classes):
            optimizer.optimizer.fused_backward(loss, learning_rate)
            _backward_called = True

    if not _backward_called:
        raise ValueError(
            "Backward pass not properly called on LOMO optimizers. Are you sure you passed a LOMO optimizer in accelerator.prepare()?"
        )

View File

@ -508,6 +508,7 @@ def load_checkpoint_and_dispatch(
skip_keys: Optional[Union[str, List[str]]] = None,
preload_module_classes: Optional[List[str]] = None,
force_hooks: bool = False,
strict: bool = False,
):
"""
Loads a (potentially sharded) checkpoint inside a model, potentially sending weights to a given device as they are
@ -554,6 +555,9 @@ def load_checkpoint_and_dispatch(
force_hooks (`bool`, *optional*, defaults to `False`):
Whether or not to force device hooks to be attached to the model even if all layers are dispatched to a
single device.
strict (`bool`, *optional*, defaults to `False`):
Whether to strictly enforce that the keys in the checkpoint state_dict match the keys of the model's
state_dict.
Example:
@ -608,6 +612,7 @@ def load_checkpoint_and_dispatch(
dtype=dtype,
offload_state_dict=offload_state_dict,
offload_buffers=offload_buffers,
strict=strict,
)
if device_map is None:
return model

View File

@ -120,8 +120,7 @@ def save_accelerator_state(
from .data_loader import IterableDatasetShard, SeedableRandomSampler
if isinstance(dataloader.dataset, IterableDatasetShard):
sampler = dataloader.sampler.sampler
sampler = dataloader.get_sampler()
if isinstance(sampler, SeedableRandomSampler):
save(sampler, output_sampler_file, save_on_each_node=save_on_each_node, safe_serialization=False)
logger.info(f"Sampler state for dataloader {i} saved in {output_sampler_file}")
@ -227,10 +226,9 @@ def load_accelerator_state(
from .data_loader import IterableDatasetShard, SeedableRandomSampler
if isinstance(dataloader.dataset, IterableDatasetShard):
sampler = dataloader.sampler.sampler
sampler = dataloader.get_sampler()
if isinstance(sampler, SeedableRandomSampler):
dataloader.sampler.sampler = torch.load(input_sampler_file)
sampler = dataloader.set_sampler(torch.load(input_sampler_file))
logger.info("All dataloader sampler states loaded successfully")
# GradScaler state

View File

@ -298,6 +298,18 @@ def get_cluster_input():
"When `zero3_init_flag` is set, it requires Transformers to be installed. "
"Please run `pip3 install transformers`."
)
use_moe = _ask_field(
"Do you want to enable Mixture-of-Experts training (MoE)? [yes/NO]: ",
_convert_yes_no_to_bool,
default=False,
error_message="Please enter yes or no.",
)
if use_moe:
deepspeed_config["deepspeed_moe_layer_cls_names"] = _ask_field(
"Specify the comma-separated list of transformers MoE layer class names (case-sensitive), e.g : "
" `MixtralSparseMoeBlock`, `Qwen2MoeSparseMoeBlock`, `JetMoEAttention,JetMoEBlock` ... : ",
str,
)
if num_machines > 1:
launcher_query = "Which Type of launcher do you want to use?"
@ -567,7 +579,7 @@ def get_cluster_input():
# CPU affinity is only supported on NVIDIA hardware for now
enable_cpu_affinity = False
if distributed_type == (DistributedType.NO, DistributedType.MULTI_GPU) and not use_cpu and not use_mps:
if distributed_type in (DistributedType.NO, DistributedType.MULTI_GPU) and not use_cpu and not use_mps:
enable_cpu_affinity = _ask_field(
"Would you like to enable numa efficiency? (Currently only supported on NVIDIA hardware). [yes/NO]: ",
_convert_yes_no_to_bool,

View File

@ -241,3 +241,4 @@ class SageMakerConfig(BaseConfig):
sagemaker_metrics_file: str = None
additional_args: dict = None
dynamo_config: dict = None
enable_cpu_affinity: bool = False

View File

@ -95,6 +95,7 @@ def write_basic_config(mixed_precision="no", save_location: str = default_json_c
config["num_processes"] = 1
config["distributed_type"] = "NO"
config["debug"] = False
config["enable_cpu_affinity"] = False
config = ClusterConfig(**config)
config.to_json_file(path)
return path

View File

@ -79,6 +79,8 @@ def env_command(args):
}
if pt_cuda_available:
info["GPU type"] = torch.cuda.get_device_name()
if pt_npu_available:
info["CANN version"] = torch.version.cann
print("\nCopy-and-paste the text below in your GitHub issue\n")
print("\n".join([f"- {prop}: {val}" for prop, val in info.items()]))

View File

@ -303,6 +303,15 @@ def launch_command_parser(subparsers=None):
type=str,
help="Tee std streams into a log file and also to console.",
)
# Directory handed to torchrun for per-process log files; meant to be combined with `--tee`
distributed_args.add_argument(
    "--log_dir",
    type=str,
    default=None,
    help=(
        "Base directory to use for log files when using torchrun/torch.distributed.run as launcher. "
        "Use with --tee to redirect std streams into log files."
    ),
)
distributed_args.add_argument(
"--role",
type=str,
@ -487,6 +496,13 @@ def launch_command_parser(subparsers=None):
type=str,
help="DeepSpeed multi-node launcher to use. If unspecified, will default to `pdsh`.",
)
deepspeed_args.add_argument(
"--deepspeed_moe_layer_cls_names",
default=None,
type=str,
help="comma-separated list of transformer MoE layer class names (case-sensitive) to wrap ,e.g, `MixtralSparseMoeBlock`, `Qwen2MoeSparseMoeBlock`, `JetMoEAttention,JetMoEBlock` ..."
" (useful only when `use_deepspeed` flag is passed).",
)
# fsdp arguments
fsdp_args = parser.add_argument_group("FSDP Arguments", "Arguments related to Fully Shared Data Parallelism.")
@ -1027,8 +1043,8 @@ def _validate_launch_command(args):
defaults is not None and defaults.compute_environment != ComputeEnvironment.AMAZON_SAGEMAKER
)
if is_aws_env_disabled and args.num_cpu_threads_per_process is None:
args.num_cpu_threads_per_process = 1
if args.use_cpu and args.num_processes >= 1:
args.num_cpu_threads_per_process = get_int_from_env(["OMP_NUM_THREADS"], 1)
if args.use_cpu and args.num_processes >= 1 and get_int_from_env(["OMP_NUM_THREADS"], 0) == 0:
local_size = get_int_from_env(
["MPI_LOCALNRANKS", "OMPI_COMM_WORLD_LOCAL_SIZE", "MV2_COMM_WORLD_LOCAL_SIZE"], 1
)

View File

@ -429,6 +429,7 @@ class DataLoaderShard(DataLoader, DataLoaderStateMixin):
synchronized_generator=None,
skip_batches=0,
_drop_last: bool = False,
_non_blocking: bool = False,
**kwargs,
):
super().__init__(dataset, **kwargs)
@ -438,6 +439,7 @@ class DataLoaderShard(DataLoader, DataLoaderStateMixin):
self.skip_batches = skip_batches
self.gradient_state = GradientState()
self._drop_last = _drop_last
self._non_blocking = _non_blocking
self.iteration = 0
def __iter__(self):
@ -458,7 +460,7 @@ class DataLoaderShard(DataLoader, DataLoaderStateMixin):
try:
# But we still move it to the device so it is done before `StopIteration` is reached
if self.device is not None:
current_batch = send_to_device(current_batch, self.device)
current_batch = send_to_device(current_batch, self.device, non_blocking=self._non_blocking)
next_batch = next(dataloader_iter)
if batch_index >= self.skip_batches:
yield current_batch
@ -500,6 +502,18 @@ class DataLoaderShard(DataLoader, DataLoaderStateMixin):
else:
return len(self.dataset)
def get_sampler(self):
    """
    Return the sampler associated with this dataloader, as resolved by the module-level `get_sampler` helper.
    """
    sampler = get_sampler(self)
    return sampler
def set_sampler(self, sampler):
    """
    Replace the sampler held by this dataloader's batch sampler(s) with `sampler`.

    Args:
        sampler:
            The sampler to install in place of the current one.
    """
    # `self.sampler` may itself be a `BatchSampler`; in that case it is the object wrapping the real sampler
    if isinstance(self.sampler, BatchSampler):
        self.sampler.sampler = sampler
        return

    outer_batch_sampler = self.batch_sampler
    outer_batch_sampler.sampler = sampler
    # A wrapping batch sampler exposes the wrapped one as `batch_sampler`; keep both in sync
    if hasattr(outer_batch_sampler, "batch_sampler"):
        outer_batch_sampler.batch_sampler.sampler = sampler
if is_torch_xla_available():
import torch_xla.distributed.parallel_loader as xpl
@ -571,7 +585,14 @@ class DataLoaderDispatcher(DataLoader, DataLoaderStateMixin):
"""
def __init__(
self, dataset, split_batches: bool = False, skip_batches=0, _drop_last: bool = False, slice_fn=None, **kwargs
self,
dataset,
split_batches: bool = False,
skip_batches=0,
_drop_last: bool = False,
_non_blocking: bool = False,
slice_fn=None,
**kwargs,
):
shuffle = False
if is_torch_version(">=", "1.11.0"):
@ -588,6 +609,7 @@ class DataLoaderDispatcher(DataLoader, DataLoaderStateMixin):
self.gradient_state = GradientState()
self.state = AcceleratorState()
self._drop_last = _drop_last
self._non_blocking = _non_blocking
self.skip_batches = skip_batches
self.slice_fn = slice_tensors if slice_fn is None else slice_fn
@ -660,7 +682,7 @@ class DataLoaderDispatcher(DataLoader, DataLoaderStateMixin):
if self.state.process_index != 0:
# Initialize tensors on other processes than process 0.
batch = initialize_tensors(batch_info[0])
batch = send_to_device(batch, self.state.device)
batch = send_to_device(batch, self.state.device, non_blocking=self._non_blocking)
# Broadcast the batch before splitting it.
batch = broadcast(batch, from_process=0)
@ -741,6 +763,36 @@ class DataLoaderDispatcher(DataLoader, DataLoaderStateMixin):
def total_dataset_length(self):
return len(self.dataset)
def get_sampler(self):
    """
    Return the sampler associated with this dataloader, as resolved by the module-level `get_sampler` helper.
    """
    sampler = get_sampler(self)
    return sampler
def set_sampler(self, sampler):
    """
    Replace the sampler held by this dataloader's batch sampler(s) with `sampler`.

    Args:
        sampler:
            The sampler to install in place of the current one.
    """
    # `self.sampler` may itself be a `BatchSampler`; in that case it is the object wrapping the real sampler
    if isinstance(self.sampler, BatchSampler):
        self.sampler.sampler = sampler
        return

    outer_batch_sampler = self.batch_sampler
    outer_batch_sampler.sampler = sampler
    # A wrapping batch sampler exposes the wrapped one as `batch_sampler`; keep both in sync
    if hasattr(outer_batch_sampler, "batch_sampler"):
        outer_batch_sampler.batch_sampler.sampler = sampler
def get_sampler(dataloader):
    """
    Get the sampler associated to the dataloader

    Args:
        dataloader (`torch.utils.data.dataloader.DataLoader`):
            The data loader whose sampler should be looked up.
    Returns:
        `torch.utils.data.Sampler`: The sampler associated to the dataloader, or `None` if the object wrapping it
        has no `sampler` attribute.
    """
    # The sampler lives on `dataloader.sampler` when that is itself a `BatchSampler`, otherwise on
    # `dataloader.batch_sampler`
    holder = dataloader.sampler if isinstance(dataloader.sampler, BatchSampler) else dataloader.batch_sampler
    return getattr(holder, "sampler", None)
def prepare_data_loader(
dataloader: DataLoader,
@ -754,6 +806,7 @@ def prepare_data_loader(
even_batches: bool = True,
slice_fn_for_dispatch: Optional[Callable] = None,
use_seedable_sampler: bool = False,
non_blocking: bool = False,
) -> DataLoader:
"""
Wraps a PyTorch `DataLoader` to generate batches for one of the processes only.
@ -812,6 +865,10 @@ def prepare_data_loader(
reproducability. Comes at a cost of potentially different performances due to different shuffling
algorithms but ensures results will be the *exact* same. Should be paired with `set_seed()` at every
`self.set_epoch`
non_blocking (`bool`, *optional*, defaults to `False`):
If set to `True`, dataloader will utilize non-blocking host-to-device transfers. If the dataloader has
`pin_memory` set to `True`, this will help to increase overlap between data transfer and computations.
Returns:
`torch.utils.data.dataloader.DataLoader`: A new data loader that will yield the portion of the batches
@ -863,13 +920,10 @@ def prepare_data_loader(
new_dataset = dataloader.dataset
# Iterable dataset doesn't like batch_sampler, but data_loader creates a default one for it
new_batch_sampler = dataloader.batch_sampler if not isinstance(new_dataset, IterableDataset) else None
sampler_is_batch_sampler = False
synchronized_generator = None
sampler_is_batch_sampler = isinstance(dataloader.sampler, BatchSampler)
if sampler_is_batch_sampler:
sampler = getattr(dataloader.sampler, "sampler", None)
else:
sampler = getattr(dataloader.batch_sampler, "sampler", None)
synchronized_generator = None
sampler = get_sampler(dataloader)
if isinstance(sampler, RandomSampler) and use_seedable_sampler:
# When iterating through the dataloader during distributed processes
# we want to ensure that on each process we are iterating through the same
@ -901,6 +955,10 @@ def prepare_data_loader(
split_batches=split_batches,
)
else:
if not use_seedable_sampler and hasattr(sampler, "generator"):
if sampler.generator is None:
sampler.generator = torch.Generator()
synchronized_generator = sampler.generator
batch_sampler = dataloader.sampler if sampler_is_batch_sampler else dataloader.batch_sampler
new_batch_sampler = BatchSamplerShard(
batch_sampler,
@ -941,6 +999,7 @@ def prepare_data_loader(
split_batches=split_batches,
batch_sampler=new_batch_sampler,
_drop_last=dataloader.drop_last,
_non_blocking=non_blocking,
slice_fn=slice_fn_for_dispatch,
**kwargs,
)
@ -952,6 +1011,7 @@ def prepare_data_loader(
batch_size=dataloader.batch_size,
rng_types=rng_types,
_drop_last=dataloader.drop_last,
_non_blocking=non_blocking,
synchronized_generator=synchronized_generator,
**kwargs,
)
@ -963,16 +1023,12 @@ def prepare_data_loader(
rng_types=rng_types,
synchronized_generator=synchronized_generator,
_drop_last=dataloader.drop_last,
_non_blocking=non_blocking,
**kwargs,
)
if isinstance(sampler, SeedableRandomSampler) and use_seedable_sampler:
if sampler_is_batch_sampler:
dataloader.sampler.sampler = sampler
else:
dataloader.batch_sampler.sampler = sampler
if hasattr(dataloader.batch_sampler, "batch_sampler"):
dataloader.batch_sampler.batch_sampler.sampler = sampler
dataloader.set_sampler(sampler)
if state.distributed_type == DistributedType.XLA:
return MpDeviceLoaderWrapper(dataloader, device)
return dataloader

View File

@ -54,6 +54,8 @@ class MultiProcessAdapter(logging.LoggerAdapter):
)
main_process_only = kwargs.pop("main_process_only", True)
in_order = kwargs.pop("in_order", False)
# set `stacklevel` to exclude ourself in `Logger.findCaller()` while respecting user's choice
kwargs.setdefault("stacklevel", 2)
if self.isEnabledFor(level):
if self._should_log(main_process_only):

View File

@ -18,7 +18,7 @@ import warnings
import torch
from .state import AcceleratorState, GradientState
from .utils import DistributedType, honor_type, is_torch_xla_available
from .utils import DistributedType, honor_type, is_lomo_available, is_torch_xla_available
if is_torch_xla_available():
@ -121,7 +121,22 @@ class AcceleratedOptimizer(torch.optim.Optimizer):
raise ValueError("`set_to_none` for Optimizer.zero_grad` is not supported by this optimizer.")
self.optimizer.zero_grad()
def train(self):
    """
    Sets the optimizer to "train" mode. Useful for optimizers like `schedule_free`

    Wrapped optimizers that do not expose a callable `train` are left untouched, so this can be called
    regardless of which optimizer was prepared.

    Returns:
        Whatever the wrapped optimizer's `train()` returns, or `None` if it has no such method.
    """
    # Look the method up defensively: calling it unconditionally raises `AttributeError` on optimizers without it
    train_fn = getattr(self.optimizer, "train", None)
    if callable(train_fn):
        return train_fn()
    return None
def eval(self):
    """
    Sets the optimizer to "eval" mode. Useful for optimizers like `schedule_free`

    Wrapped optimizers that do not expose a callable `eval` are left untouched, so this can be called
    regardless of which optimizer was prepared.

    Returns:
        Whatever the wrapped optimizer's `eval()` returns, or `None` if it has no such method.
    """
    # Look the method up defensively: calling it unconditionally raises `AttributeError` on optimizers without it
    eval_fn = getattr(self.optimizer, "eval", None)
    if callable(eval_fn):
        return eval_fn()
    return None
def step(self, closure=None):
if is_lomo_available():
from lomo_optim import AdaLomo, Lomo
if (
not self.gradient_state.is_xla_gradients_synced
and self.accelerator_state.distributed_type == DistributedType.XLA
@ -129,6 +144,12 @@ class AcceleratedOptimizer(torch.optim.Optimizer):
gradients = xm._fetch_gradients(self.optimizer)
xm.all_reduce("sum", gradients, scale=1.0 / xm.xrt_world_size())
self.gradient_state.is_xla_gradients_synced = True
if is_lomo_available():
# `step` should be a no-op for LOMO optimizers.
if isinstance(self.optimizer, (Lomo, AdaLomo)):
return
if self.gradient_state.sync_gradients:
if self.scaler is not None:
self.optimizer.step = self._optimizer_patched_step_method

View File

@ -179,22 +179,14 @@ class PartialState:
)
# Sets up self.backend + imports
backend, distributed_type = self._prepare_backend(cpu, use_sagemaker_dp, kwargs.pop("backend", None))
# Remember the backend the caller explicitly asked for so it can be compared with the resolved one
original_backend = kwargs.pop("backend", None)
backend, distributed_type = self._prepare_backend(cpu, use_sagemaker_dp, original_backend)
if original_backend is not None and backend != original_backend:
    # Must be an f-string, otherwise the placeholders are printed literally instead of the backend names
    raise ValueError(f"Your assigned backend {original_backend} is not available, please use {backend}")
self.backend = backend
self.distributed_type = distributed_type
use_deepspeed = False
if not cpu:
# Deal with XLA
if is_torch_xla_available():
self.device = xm.xla_device()
xm.set_replication(self.device, xm.get_xla_supported_devices())
self.num_processes = xm.xrt_world_size()
self.process_index = xm.get_ordinal()
if is_torch_xla_available(check_is_tpu=True):
self.local_process_index = xm.get_local_ordinal()
else:
self.local_process_index = int(os.environ.get("LOCAL_RANK", -1))
self.distributed_type = DistributedType.XLA
if not cpu and self.backend != "xla":
if int(os.environ.get("LOCAL_RANK", -1)) != -1:
# Deal with spawning deepspeed
if os.environ.get("ACCELERATE_USE_DEEPSPEED", "false") == "true":
@ -204,7 +196,7 @@ class PartialState:
)
from deepspeed import comm as dist
if is_xpu_available and is_ccl_available():
if is_xpu_available() and is_ccl_available():
os.environ["CCL_PROCESS_LAUNCHER"] = "none"
os.environ["CCL_LOCAL_SIZE"] = os.environ.get("LOCAL_WORLD_SIZE", "1")
os.environ["CCL_LOCAL_RANK"] = os.environ.get("LOCAL_RANK", "0")
@ -246,7 +238,7 @@ class PartialState:
if (
self.distributed_type == DistributedType.MULTI_CPU
and get_int_from_env(["OMP_NUM_THREADS", "OMP_NUM_THREADS"], 0) > 0
and get_int_from_env(["OMP_NUM_THREADS"], 0) == 0
):
import psutil
@ -270,6 +262,16 @@ class PartialState:
self.num_processes = 1
self.process_index = 0
self.local_process_index = 0
elif self.backend == "xla":
# XLA needs device setting first for `set_replication`
self.set_device()
xm.set_replication(self.device, xm.get_xla_supported_devices())
self.num_processes = xm.xrt_world_size()
self.process_index = xm.get_ordinal()
if is_torch_xla_available(check_is_tpu=True):
self.local_process_index = xm.get_local_ordinal()
else:
self.local_process_index = int(os.environ.get("LOCAL_RANK", -1))
else:
self.num_processes = torch.distributed.get_world_size()
self.process_index = torch.distributed.get_rank()
@ -284,16 +286,17 @@ class PartialState:
# Set CPU affinity if enabled
if parse_flag_from_env("ACCELERATE_CPU_AFFINITY", False):
set_numa_affinity(self.local_process_index)
self.fork_launched = parse_flag_from_env("FORK_LAUNCHED", 0)
# Check for old RTX 4000's that can't use P2P or IB and are on old drivers
if self.device.type == "cuda" and not check_cuda_p2p_ib_support():
if "NCCL_P2P_DISABLE" not in os.environ or "NCCL_IB_DISABLE" not in os.environ:
raise NotImplementedError(
"Using RTX 4000 series doesn't support faster communication broadband via P2P or IB. "
'Please set `NCCL_P2P_DISABLE="1"` and `NCCL_IB_DISABLE="1" or use `accelerate launch` which '
"will do this automatically."
)
# Check for old RTX 4000's that can't use P2P or IB and are on old drivers
if self.device.type == "cuda" and not check_cuda_p2p_ib_support():
if "NCCL_P2P_DISABLE" not in os.environ or "NCCL_IB_DISABLE" not in os.environ:
raise NotImplementedError(
"Using RTX 4000 series doesn't support faster communication broadband via P2P or IB. "
'Please set `NCCL_P2P_DISABLE="1"` and `NCCL_IB_DISABLE="1" or use `accelerate launch` which '
"will do this automatically."
)
# Important: This should be the *only* code outside of `self.initialized!`
self.fork_launched = parse_flag_from_env("FORK_LAUNCHED", 0)
def __repr__(self) -> str:
return (
@ -715,19 +718,22 @@ class PartialState:
backend = "smddp"
distributed_type = DistributedType.MULTI_GPU
elif int(os.environ.get("LOCAL_RANK", -1)) != -1:
if not cpu:
if is_mlu_available():
backend = "cncl"
distributed_type = DistributedType.MULTI_MLU
elif torch.cuda.is_available():
if backend is None:
backend = "nccl"
distributed_type = DistributedType.MULTI_GPU
elif is_npu_available():
backend = "hccl"
distributed_type = DistributedType.MULTI_NPU
if backend is None and (
elif is_torch_xla_available():
backend = "xla"
distributed_type = DistributedType.XLA
elif int(os.environ.get("LOCAL_RANK", -1)) != -1 and not cpu:
if is_mlu_available():
backend = "cncl"
distributed_type = DistributedType.MULTI_MLU
elif torch.cuda.is_available():
if backend is None:
backend = "nccl"
distributed_type = DistributedType.MULTI_GPU
elif is_npu_available():
backend = "hccl"
distributed_type = DistributedType.MULTI_NPU
if distributed_type is None and (
int(os.environ.get("LOCAL_RANK", -1)) != -1
or get_int_from_env(["PMI_SIZE", "OMPI_COMM_WORLD_SIZE", "MV2_COMM_WORLD_SIZE", "WORLD_SIZE"], 1) > 1
):
@ -735,8 +741,11 @@ class PartialState:
distributed_type = DistributedType.MULTI_XPU
else:
distributed_type = DistributedType.MULTI_CPU
if is_ccl_available() and (
get_int_from_env(["CCL_WORKER_COUNT"], 0) > 0 or distributed_type == DistributedType.MULTI_XPU
if (
backend in (None, "ccl")
and is_ccl_available()
and (get_int_from_env(["CCL_WORKER_COUNT"], 0) > 0 or distributed_type == DistributedType.MULTI_XPU)
):
if get_ccl_version() >= "1.12":
import oneccl_bindings_for_pytorch # noqa: F401
@ -744,12 +753,13 @@ class PartialState:
import torch_ccl # noqa: F401
backend = "ccl"
elif torch.distributed.is_mpi_available():
elif backend in (None, "mpi") and torch.distributed.is_mpi_available():
backend = "mpi"
else:
backend = "gloo"
if distributed_type is None:
distributed_type = DistributedType.NO
return backend, distributed_type
def set_device(self):
@ -758,17 +768,20 @@ class PartialState:
"""
if self.device is not None:
return
if self.num_processes == 1:
if self.distributed_type == DistributedType.NO:
self.device = torch.device("cpu") if self._cpu else self.default_device
return
device = str(self.distributed_type).split(".")[-1].replace("MULTI_", "").lower()
if device not in ("cpu", "gpu", "mlu", "npu", "xpu"):
if device not in ("cpu", "gpu", "mlu", "npu", "xpu", "xla"):
raise ValueError(
f"Can't set device for {self.distributed_type} ({device}), verify we should be calling `_set_device()` for it!"
)
if device == "gpu":
device = "cuda"
self.device = torch.device(device, self.local_process_index)
if device == "xla":
self.device = xm.xla_device()
else:
if device == "gpu":
device = "cuda"
self.device = torch.device(device, self.local_process_index)
if self.device is not None:
if device == "xpu":
torch.xpu.set_device(self.device)
@ -893,7 +906,6 @@ class AcceleratorState:
fsdp_plugin.set_mixed_precision(self._mixed_precision)
self.fsdp_plugin = fsdp_plugin
if os.environ.get("ACCELERATE_USE_MEGATRON_LM", "false") == "true" and self.distributed_type not in [
DistributedType.MULTI_NPU,
DistributedType.MULTI_XPU,
]:
self.distributed_type = DistributedType.MEGATRON_LM

View File

@ -38,6 +38,7 @@ from .testing import (
require_single_gpu,
require_single_xpu,
require_torch_min_version,
require_torchvision,
require_tpu,
require_xpu,
skip,

View File

@ -20,12 +20,48 @@ from typing import List
from unittest.mock import Mock
import torch
from torch.utils.data import DataLoader, IterableDataset, TensorDataset
from torch.utils.data import (
BatchSampler,
DataLoader,
Dataset,
IterableDataset,
RandomSampler,
TensorDataset,
default_collate,
)
from accelerate.accelerator import Accelerator, DataLoaderConfiguration
from accelerate.utils.dataclasses import DistributedType
NUM_ELEMENTS = 22
NUM_WORKERS = 4
BATCH_SIZE = 4


class DummyDataset(Dataset):
    """
    Map-style dataset of `NUM_ELEMENTS` synthetic samples.

    Each sample is a dict with the keys `"index"` (the requested position), `"label"` (`index % 2`) and
    `"random_augmentation"` (a fresh `torch.rand(1).item()` drawn on every access).
    """

    def __len__(self):
        return NUM_ELEMENTS

    def __getitem__(self, index):
        """
        Returns one sample for an `int` index, or a list of samples for a `slice` or any iterable of indices.

        Args:
            index (`int`, `slice` or iterable of `int`): The position(s) to fetch.
        """
        squeeze = False

        if isinstance(index, int):
            index = [index]
            squeeze = True
        elif isinstance(index, slice):
            # Resolve the slice against the dataset length. The class defines no `size` attribute, so
            # `len(self)` is the only valid bound here.
            index = list(range(*index.indices(len(self))))
        else:
            index = list(index)

        batch = [{"index": i, "label": i % 2, "random_augmentation": torch.rand(1).item()} for i in index]

        # A single integer index yields the sample itself rather than a one-element list.
        if squeeze:
            batch = batch[0]

        return batch
class DummyIterableDataset(IterableDataset):
def __init__(self, data):
self.data = data
@ -206,8 +242,27 @@ def test_join_raises_warning_for_iterable_when_overriding_even_batches():
assert "only supported for map-style datasets" in str(w[-1].message)
def test_data_loader(data_loader, accelerator):
    """
    Prepares `data_loader` with `accelerator`, iterates it once and asserts that the gathered `"index"` values
    contain exactly `NUM_ELEMENTS` distinct entries.

    Args:
        data_loader: The dataloader to prepare; its batches must expose `"index"` and `"label"` entries.
        accelerator: Object providing `prepare` and `gather_for_metrics`.
    """
    prepared_loader = accelerator.prepare(data_loader)

    seen_indices = []
    for batch in prepared_loader:
        # Only the gathered indices are needed; the labels are gathered alongside and discarded.
        gathered_index, _ = accelerator.gather_for_metrics((batch["index"], batch["label"]))
        seen_indices += gathered_index.detach().cpu().numpy().tolist()

    # Order the collected indices, then count the distinct ones
    distinct_indices = set(sorted(seen_indices))

    assert (
        len(distinct_indices) == NUM_ELEMENTS
    ), "Not all the dataset elements have been iterated in an epoch due to duplication of samples across processes."
def main():
accelerator = create_accelerator()
torch.manual_seed(accelerator.process_index)
accelerator.print("Test that even_batches variable ensures uniform batches across processes")
test_default_ensures_even_batch_sizes()
@ -233,6 +288,25 @@ def main():
test_join_raises_warning_for_non_ddp_distributed(accelerator)
accelerator.state.distributed_type = original_state
dataset = DummyDataset()
# Conventional Dataloader with shuffle=False
loader = DataLoader(dataset, shuffle=False, batch_size=BATCH_SIZE, num_workers=NUM_WORKERS)
test_data_loader(loader, accelerator)
# Conventional Dataloader with shuffle=True
loader = DataLoader(dataset, shuffle=True, batch_size=BATCH_SIZE, num_workers=NUM_WORKERS)
test_data_loader(loader, accelerator)
# Dataloader with batch_sampler
sampler = BatchSampler(RandomSampler(dataset), batch_size=BATCH_SIZE, drop_last=False)
loader = DataLoader(dataset, batch_sampler=sampler, num_workers=NUM_WORKERS)
test_data_loader(loader, accelerator)
# Dataloader with sampler as an instance of `BatchSampler`
sampler = BatchSampler(RandomSampler(dataset), batch_size=BATCH_SIZE, drop_last=False)
loader = DataLoader(dataset, sampler=sampler, batch_size=None, collate_fn=default_collate, num_workers=NUM_WORKERS)
test_data_loader(loader, accelerator)
if __name__ == "__main__":
main()

View File

@ -22,7 +22,6 @@ from copy import deepcopy
from pathlib import Path
import numpy as np
import pytest
import torch
from torch.utils.data import DataLoader, Dataset
@ -39,6 +38,7 @@ from accelerate.utils import (
is_ipex_available,
is_mlu_available,
is_npu_available,
is_pytest_available,
is_xpu_available,
set_seed,
synchronize_rng_states,
@ -711,6 +711,8 @@ def test_trigger():
def test_reinstantiated_state():
import pytest
AcceleratorState._reset_state()
simple_model = torch.nn.Linear(1, 1)
# First define an accelerator
@ -792,9 +794,10 @@ def main():
print("\n**Breakpoint trigger test**")
test_trigger()
if state.local_process_index == 0:
print("\n**Test reinstantiated state**")
test_reinstantiated_state()
if is_pytest_available():
if state.local_process_index == 0:
print("\n**Test reinstantiated state**")
test_reinstantiated_state()
if __name__ == "__main__":

View File

@ -45,10 +45,12 @@ from ..utils import (
is_npu_available,
is_pandas_available,
is_pippy_available,
is_schedulefree_available,
is_tensorboard_available,
is_timm_available,
is_torch_version,
is_torch_xla_available,
is_torchvision_available,
is_transformers_available,
is_wandb_available,
is_xpu_available,
@ -213,6 +215,20 @@ def require_timm(test_case):
return unittest.skipUnless(is_timm_available(), "test requires the timm library")(test_case)
def require_torchvision(test_case):
    """
    Decorator marking a test that requires torchvision. The test is skipped when `is_torchvision_available()` is
    falsy.
    """
    skip_without_torchvision = unittest.skipUnless(is_torchvision_available(), "test requires the torchvision library")
    return skip_without_torchvision(test_case)
def require_schedulefree(test_case):
    """
    Decorator marking a test that requires schedulefree. The test is skipped when `is_schedulefree_available()` is
    falsy.
    """
    skip_without_schedulefree = unittest.skipUnless(
        is_schedulefree_available(), "test requires the schedulefree library"
    )
    return skip_without_schedulefree(test_case)
def require_bnb(test_case):
"""
Decorator marking a test that requires bitsandbytes. These tests are skipped when they are not.

View File

@ -81,6 +81,7 @@ from .imports import (
is_dvclive_available,
is_fp8_available,
is_ipex_available,
is_lomo_available,
is_megatron_lm_available,
is_mlflow_available,
is_mlu_available,
@ -91,11 +92,14 @@ from .imports import (
is_peft_available,
is_pippy_available,
is_pynvml_available,
is_pytest_available,
is_rich_available,
is_sagemaker_available,
is_schedulefree_available,
is_tensorboard_available,
is_timm_available,
is_torch_xla_available,
is_torchvision_available,
is_transformer_engine_available,
is_transformers_available,
is_wandb_available,

View File

@ -154,6 +154,8 @@ class InitProcessGroupKwargs(KwargsHandler):
[method](https://pytorch.org/docs/stable/distributed.html#torch.distributed.init_process_group) for more
information on each argument.
Note: If `timeout` is set to `None`, the default will be based upon how `backend` is set.
```python
from datetime import timedelta
from accelerate import Accelerator
@ -166,7 +168,12 @@ class InitProcessGroupKwargs(KwargsHandler):
backend: Optional[str] = "nccl"
init_method: Optional[str] = None
timeout: timedelta = timedelta(seconds=1800)
timeout: Optional[timedelta] = None
def __post_init__(self):
if self.timeout is None:
seconds = 1800 if self.backend != "nccl" else 600
self.timeout = timedelta(seconds=seconds)
# Literals
@ -524,6 +531,14 @@ class DataLoaderConfiguration:
"multiple different seeds to compare. Should also be ran with [`~utils.set_seed`] for the best results."
},
)
non_blocking: bool = field(
default=False,
metadata={
"help": "If set to `True`, the dataloader prepared by the Accelerator will utilize non-blocking host-to-device"
" transfers, allowing for better overlap between dataloader communication and computation. Recommended that the"
" prepared dataloader has `pin_memory` set to `True` to work properly."
},
)
@dataclass
@ -682,15 +697,15 @@ class DeepSpeedPlugin:
default=None,
metadata={"help": "Possible options are 0,1,2,3; Default will be taken from environment variable"},
)
is_train_batch_min: str = field(
is_train_batch_min: bool = field(
default=True,
metadata={"help": "If both train & eval dataloaders are specified, this will decide the train_batch_size"},
)
offload_optimizer_device: bool = field(
offload_optimizer_device: str = field(
default=None,
metadata={"help": "Possible options are none|cpu|nvme. Only applicable with ZeRO Stages 2 and 3."},
)
offload_param_device: bool = field(
offload_param_device: str = field(
default=None,
metadata={"help": "Possible options are none|cpu|nvme. Only applicable with ZeRO Stage 3."},
)
@ -713,6 +728,13 @@ class DeepSpeedPlugin:
default=None,
metadata={"help": "Flag to indicate whether to save 16-bit model. Only applicable with ZeRO Stage-3."},
)
transformer_moe_cls_names: str = field(
default=None,
metadata={
"help": "comma-separated list of transformers MoE layer class names (case-sensitive), e.g : "
" `MixtralSparseMoeBlock`, `Qwen2MoeSparseMoeBlock`, `JetMoEAttention,JetMoEBlock` ..."
},
)
def __post_init__(self):
from .deepspeed import HfDeepSpeedConfig
@ -722,9 +744,8 @@ class DeepSpeedPlugin:
self.gradient_accumulation_steps = int(gas) if gas.isdigit() else gas
if self.gradient_clipping is None:
gradient_clipping = os.environ.get("ACCELERATE_GRADIENT_CLIPPING", "none")
if gradient_clipping != "none":
self.gradient_clipping = float(gradient_clipping)
gradient_clipping = os.environ.get("ACCELERATE_GRADIENT_CLIPPING", "auto")
self.gradient_clipping = gradient_clipping if gradient_clipping == "auto" else float(gradient_clipping)
if self.zero_stage is None:
self.zero_stage = int(os.environ.get("ACCELERATE_DEEPSPEED_ZERO_STAGE", 2))
@ -968,6 +989,26 @@ class DeepSpeedPlugin:
"It will only ask for the necessary config variables when using `deepspeed_config_file`."
)
def set_moe_leaf_modules(self, model):
    """
    Passes the configured MoE layer classes of `model` to DeepSpeed's `set_z3_leaf_modules`.

    The comma-separated class names are read from `self.transformer_moe_cls_names`, falling back to the
    `ACCELERATE_DEEPSPEED_MOE_LAYER_CLS_NAMES` environment variable (the resolved value is stored back on `self`).
    Nothing happens when neither is set.

    Args:
        model (`torch.nn.Module`): The model whose submodules are searched for the named classes.

    Raises:
        ImportError: If the installed DeepSpeed is older than 0.14.0.
        Exception: If one of the names does not match any module class found in `model`.
    """
    if self.transformer_moe_cls_names is None:
        self.transformer_moe_cls_names = os.environ.get("ACCELERATE_DEEPSPEED_MOE_LAYER_CLS_NAMES", None)
    if self.transformer_moe_cls_names is None:
        return

    if compare_versions("deepspeed", "<", "0.14.0"):
        raise ImportError("DeepSpeed version must be >= 0.14.0 to use MOE support. Please update DeepSpeed.")
    # Imported only after the version check, so an old DeepSpeed produces the explicit error above.
    from deepspeed.utils import set_z3_leaf_modules

    resolved_classes = []
    for layer_class in self.transformer_moe_cls_names.split(","):
        resolved = get_module_class_from_name(model, layer_class)
        if resolved is None:
            raise Exception(f"Could not find a transformer layer class called '{layer_class}' to wrap in the model.")
        resolved_classes.append(resolved)
    set_z3_leaf_modules(model, resolved_classes)  # z3_leaf
@dataclass
class FullyShardedDataParallelPlugin:
@ -1109,6 +1150,13 @@ class FullyShardedDataParallelPlugin:
self.forward_prefetch = str_to_bool(os.environ.get(prefix + "FORWARD_PREFETCH", "False")) == 1
self.activation_checkpointing = str_to_bool(os.environ.get(prefix + "ACTIVATION_CHECKPOINTING", "False")) == 1
if str_to_bool(os.environ.get("FSDP_CPU_RAM_EFFICIENT_LOADING", "False")) == 1 and not self.sync_module_states:
warnings.warn(
"sync_module_states cannot be False since efficient cpu ram loading enabled. "
"Setting sync_module_states to True."
)
self.sync_module_states = True
if self.sync_module_states:
if is_npu_available():
device = torch.npu.current_device()
@ -1122,26 +1170,6 @@ class FullyShardedDataParallelPlugin:
)
self.param_init_fn = lambda x: x.to_empty(device=device, recurse=False)
@staticmethod
def get_module_class_from_name(module, name):
"""
Gets a class from a module by its name.
Args:
module (`torch.nn.Module`): The module to get the class from.
name (`str`): The name of the class.
"""
modules_children = list(module.children())
if module.__class__.__name__ == name:
return module.__class__
elif len(modules_children) == 0:
return
else:
for child_module in modules_children:
module_class = FullyShardedDataParallelPlugin.get_module_class_from_name(child_module, name)
if module_class is not None:
return module_class
def set_auto_wrap_policy(self, model):
from torch.distributed.fsdp.wrap import size_based_auto_wrap_policy, transformer_auto_wrap_policy
@ -1156,7 +1184,7 @@ class FullyShardedDataParallelPlugin:
).split(",")
transformer_cls_to_wrap = set()
for layer_class in transformer_cls_names_to_wrap:
transformer_cls = FullyShardedDataParallelPlugin.get_module_class_from_name(model, layer_class)
transformer_cls = get_module_class_from_name(model, layer_class)
if transformer_cls is None:
raise Exception("Could not find the transformer layer class to wrap in the model.")
else:
@ -1199,6 +1227,8 @@ class FullyShardedDataParallelPlugin:
from torch.distributed.fsdp.fully_sharded_data_parallel import (
FullOptimStateDictConfig,
FullStateDictConfig,
ShardedOptimStateDictConfig,
ShardedStateDictConfig,
StateDictType,
)
@ -1209,6 +1239,11 @@ class FullyShardedDataParallelPlugin:
self.state_dict_config = FullStateDictConfig(offload_to_cpu=True, rank0_only=True)
if self.optim_state_dict_config is None:
self.optim_state_dict_config = FullOptimStateDictConfig(offload_to_cpu=True, rank0_only=True)
elif self.state_dict_type == StateDictType.SHARDED_STATE_DICT:
if self.state_dict_config is None:
self.state_dict_config = ShardedStateDictConfig(offload_to_cpu=True)
if self.optim_state_dict_config is None:
self.optim_state_dict_config = ShardedOptimStateDictConfig(offload_to_cpu=True)
@dataclass
@ -1715,3 +1750,23 @@ class BnbQuantizationConfig:
if not isinstance(self.torch_dtype, torch.dtype):
raise ValueError("torch_dtype must be a torch.dtype")
def get_module_class_from_name(module, name):
    """
    Looks up a class by name among `module` and its descendants.

    The search is depth-first in pre-order (a module is checked before its children, children in the order
    returned by `children()`), and the class of the first match is returned.

    Args:
        module (`torch.nn.Module`): The module to get the class from.
        name (`str`): The name of the class.

    Returns:
        The matching class, or `None` when no module in the tree has a class with that name.
    """
    pending = [module]
    while pending:
        current = pending.pop()
        sub_modules = list(current.children())
        if current.__class__.__name__ == name:
            return current.__class__
        # Push children reversed so that the first child is visited next (pre-order traversal).
        pending.extend(reversed(sub_modules))
    return None

View File

@ -16,6 +16,7 @@ import os
import torch
from ..logging import get_logger
from ..state import PartialState
from .constants import FSDP_MODEL_NAME, FSDP_PYTORCH_VERSION, OPTIMIZER_NAME
from .imports import is_torch_distributed_available
from .modeling import is_peft_model
@ -51,13 +52,14 @@ def _set_model_state_dict(model, state_dict, adapter_only=False):
return model.load_state_dict(state_dict)
def save_fsdp_model(fsdp_plugin, accelerator, model, output_dir, model_index=0, adapter_only=False):
def save_fsdp_model(fsdp_plugin, model, output_dir, model_index=0, adapter_only=False):
state = PartialState()
os.makedirs(output_dir, exist_ok=True)
if fsdp_plugin.state_dict_type == StateDictType.FULL_STATE_DICT:
# FSDP raises error when single GPU is used with `offload_to_cpu=True` for FULL_STATE_DICT
# so, only enable it when num_processes>1
is_multi_process = accelerator.num_processes > 1
is_multi_process = state.num_processes > 1
fsdp_plugin.state_dict_config.offload_to_cpu = is_multi_process
fsdp_plugin.state_dict_config.rank0_only = is_multi_process
@ -68,15 +70,15 @@ def save_fsdp_model(fsdp_plugin, accelerator, model, output_dir, model_index=0,
if fsdp_plugin.state_dict_type == StateDictType.FULL_STATE_DICT:
weights_name = f"{FSDP_MODEL_NAME}.bin" if model_index == 0 else f"{FSDP_MODEL_NAME}_{model_index}.bin"
output_model_file = os.path.join(output_dir, weights_name)
if accelerator.process_index == 0:
if state.process_index == 0:
logger.info(f"Saving model to {output_model_file}")
torch.save(state_dict, output_model_file)
logger.info(f"Model saved to {output_model_file}")
elif fsdp_plugin.state_dict_type == StateDictType.LOCAL_STATE_DICT:
weights_name = (
f"{FSDP_MODEL_NAME}_rank{accelerator.process_index}.bin"
f"{FSDP_MODEL_NAME}_rank{state.process_index}.bin"
if model_index == 0
else f"{FSDP_MODEL_NAME}_{model_index}_rank{accelerator.process_index}.bin"
else f"{FSDP_MODEL_NAME}_{model_index}_rank{state.process_index}.bin"
)
output_model_file = os.path.join(output_dir, weights_name)
logger.info(f"Saving model to {output_model_file}")
@ -96,19 +98,20 @@ def save_fsdp_model(fsdp_plugin, accelerator, model, output_dir, model_index=0,
logger.info(f"Model saved to {ckpt_dir}")
def load_fsdp_model(fsdp_plugin, accelerator, model, input_dir, model_index=0, adapter_only=False):
accelerator.wait_for_everyone()
def load_fsdp_model(fsdp_plugin, model, input_dir, model_index=0, adapter_only=False):
state = PartialState()
state.wait_for_everyone()
if fsdp_plugin.state_dict_type == StateDictType.FULL_STATE_DICT:
# FSDP raises error when single GPU is used with `offload_to_cpu=True` for FULL_STATE_DICT
# so, only enable it when num_processes>1
is_multi_process = accelerator.num_processes > 1
is_multi_process = state.num_processes > 1
fsdp_plugin.state_dict_config.offload_to_cpu = is_multi_process
fsdp_plugin.state_dict_config.rank0_only = is_multi_process
with FSDP.state_dict_type(
model, fsdp_plugin.state_dict_type, fsdp_plugin.state_dict_config, fsdp_plugin.optim_state_dict_config
):
if fsdp_plugin.state_dict_type == StateDictType.FULL_STATE_DICT:
if type(model) != FSDP and accelerator.process_index != 0:
if type(model) != FSDP and state.process_index != 0:
if not fsdp_plugin.sync_module_states:
raise ValueError(
"Set the `sync_module_states` flag to `True` so that model states are synced across processes when "
@ -122,9 +125,9 @@ def load_fsdp_model(fsdp_plugin, accelerator, model, input_dir, model_index=0, a
logger.info(f"Model loaded from {input_model_file}")
elif fsdp_plugin.state_dict_type == StateDictType.LOCAL_STATE_DICT:
weights_name = (
f"{FSDP_MODEL_NAME}_rank{accelerator.process_index}.bin"
f"{FSDP_MODEL_NAME}_rank{state.process_index}.bin"
if model_index == 0
else f"{FSDP_MODEL_NAME}_{model_index}_rank{accelerator.process_index}.bin"
else f"{FSDP_MODEL_NAME}_{model_index}_rank{state.process_index}.bin"
)
input_model_file = os.path.join(input_dir, weights_name)
logger.info(f"Loading model from {input_model_file}")
@ -149,14 +152,15 @@ def load_fsdp_model(fsdp_plugin, accelerator, model, input_dir, model_index=0, a
return load_result
def save_fsdp_optimizer(fsdp_plugin, accelerator, optimizer, model, output_dir, optimizer_index=0):
def save_fsdp_optimizer(fsdp_plugin, optimizer, model, output_dir, optimizer_index=0):
state = PartialState()
os.makedirs(output_dir, exist_ok=True)
with FSDP.state_dict_type(
model, fsdp_plugin.state_dict_type, fsdp_plugin.state_dict_config, fsdp_plugin.optim_state_dict_config
):
optim_state = FSDP.optim_state_dict(model, optimizer)
if fsdp_plugin.state_dict_type == StateDictType.FULL_STATE_DICT:
if accelerator.process_index == 0:
if state.process_index == 0:
optim_state_name = (
f"{OPTIMIZER_NAME}.bin" if optimizer_index == 0 else f"{OPTIMIZER_NAME}_{optimizer_index}.bin"
)
@ -176,14 +180,15 @@ def save_fsdp_optimizer(fsdp_plugin, accelerator, optimizer, model, output_dir,
logger.info(f"Optimizer state saved in {ckpt_dir}")
def load_fsdp_optimizer(fsdp_plugin, accelerator, optimizer, model, input_dir, optimizer_index=0, adapter_only=False):
accelerator.wait_for_everyone()
def load_fsdp_optimizer(fsdp_plugin, optimizer, model, input_dir, optimizer_index=0, adapter_only=False):
state = PartialState()
state.wait_for_everyone()
with FSDP.state_dict_type(
model, fsdp_plugin.state_dict_type, fsdp_plugin.state_dict_config, fsdp_plugin.optim_state_dict_config
):
if fsdp_plugin.state_dict_type == StateDictType.FULL_STATE_DICT:
optim_state = None
if accelerator.process_index == 0 or not fsdp_plugin.optim_state_dict_config.rank0_only:
if state.process_index == 0 or not fsdp_plugin.optim_state_dict_config.rank0_only:
optimizer_name = (
f"{OPTIMIZER_NAME}.bin" if optimizer_index == 0 else f"{OPTIMIZER_NAME}_{optimizer_index}.bin"
)

View File

@ -85,14 +85,26 @@ def is_pynvml_available():
return _is_package_available("pynvml")
def is_pytest_available():
    """Returns the result of `_is_package_available` for the `pytest` package."""
    return _is_package_available("pytest")
def is_msamp_available():
    """Returns the result of `_is_package_available` for the `msamp` package."""
    # NOTE(review): the second argument ("ms-amp") is presumably the distribution name used for the
    # lookup - confirm against `_is_package_available`.
    return _is_package_available("msamp", "ms-amp")
def is_schedulefree_available():
    """Returns the result of `_is_package_available` for the `schedulefree` package."""
    return _is_package_available("schedulefree")
def is_transformer_engine_available():
    """Returns the result of `_is_package_available` for the `transformer_engine` package."""
    return _is_package_available("transformer_engine")
def is_lomo_available():
    """Returns the result of `_is_package_available` for the `lomo_optim` package (the LOMO optimizers)."""
    return _is_package_available("lomo_optim")
def is_fp8_available():
    """Returns a truthy value when either MS-AMP or Transformer Engine is available."""
    return is_msamp_available() or is_transformer_engine_available()
@ -175,6 +187,8 @@ def is_bf16_available(ignore_tpu=False):
return not ignore_tpu
if is_cuda_available():
return torch.cuda.is_bf16_supported()
if is_mps_available():
return False
return True
@ -198,6 +212,10 @@ def is_bnb_available():
return _is_package_available("bitsandbytes")
def is_torchvision_available():
    """Returns the result of `_is_package_available` for the `torchvision` package."""
    return _is_package_available("torchvision")
def is_megatron_lm_available():
if str_to_bool(os.environ.get("ACCELERATE_USE_MEGATRON_LM", "False")) == 1:
package_exists = importlib.util.find_spec("megatron") is not None

View File

@ -393,6 +393,8 @@ def prepare_deepspeed_cmd_env(args: argparse.Namespace) -> Tuple[List[str], Dict
current_env["ACCELERATE_DEEPSPEED_CONFIG_FILE"] = str(args.deepspeed_config_file)
if args.enable_cpu_affinity:
current_env["ACCELERATE_CPU_AFFINITY"] = "1"
if args.deepspeed_moe_layer_cls_names is not None:
current_env["ACCELERATE_DEEPSPEED_MOE_LAYER_CLS_NAMES"] = str(args.deepspeed_moe_layer_cls_names)
return cmd, current_env

View File

@ -381,12 +381,13 @@ def set_module_tensor_to_device(
device_quantization = device
device = "cpu"
# `torch.Tensor.to(<int num>)` is not supported by `torch_npu` (see this [issue](https://github.com/Ascend/pytorch/issues/16)).
if is_npu_available() and isinstance(device, int):
device = f"npu:{device}"
elif is_mlu_available() and isinstance(device, int):
device = f"mlu:{device}"
if is_xpu_available() and isinstance(device, int):
device = f"xpu:{device}"
if isinstance(device, int):
if is_npu_available():
device = f"npu:{device}"
elif is_mlu_available():
device = f"mlu:{device}"
elif is_xpu_available():
device = f"xpu:{device}"
if value is None:
new_value = old_value.to(device)
if dtype is not None and device in ["meta", torch.device("meta")]:
@ -447,14 +448,15 @@ def set_module_tensor_to_device(
if not getattr(module.weight, "quant_state", None) and device_index is not None:
module.weight = module.weight.cuda(device_index)
# clean pre and post forward hook
if is_npu_available():
torch.npu.empty_cache()
elif is_mlu_available():
torch.mlu.empty_cache()
elif is_xpu_available():
torch.xpu.empty_cache()
else:
torch.cuda.empty_cache()
if device != "cpu":
if is_npu_available():
torch.npu.empty_cache()
elif is_mlu_available():
torch.mlu.empty_cache()
elif is_xpu_available():
torch.xpu.empty_cache()
else:
torch.cuda.empty_cache()
# When handling tied weights, we update tied_params_map to keep track of the tied weights that have already been allocated on the device in
# order to avoid duplicating memory, see above.
@ -801,27 +803,40 @@ def get_max_memory(max_memory: Optional[Dict[Union[int, str], Union[int, str]]]
import psutil
if max_memory is None:
if not (torch.cuda.is_available() or is_npu_available() or is_mlu_available() or is_xpu_available()):
max_memory = {}
else:
# Make sure CUDA is initialized on each GPU to have the right memory info.
if is_npu_available():
for i in range(torch.npu.device_count()):
max_memory = {}
# Make sure CUDA is initialized on each GPU to have the right memory info.
if is_npu_available():
for i in range(torch.npu.device_count()):
try:
_ = torch.tensor(0, device=torch.device("npu", i))
max_memory = {i: torch.npu.mem_get_info(i)[0] for i in range(torch.npu.device_count())}
elif is_mlu_available():
for i in range(torch.mlu.device_count()):
max_memory[i] = torch.npu.mem_get_info(i)[0]
except Exception:
logger.info(f"Device {i} seems unavailable, Proceeding to check subsequent devices.")
continue
elif is_mlu_available():
for i in range(torch.mlu.device_count()):
try:
_ = torch.tensor(0, device=torch.device("mlu", i))
max_memory = {i: torch.mlu.mem_get_info(i)[0] for i in range(torch.mlu.device_count())}
elif is_xpu_available():
for i in range(torch.xpu.device_count()):
max_memory[i] = torch.mlu.mem_get_info(i)[0]
except Exception:
logger.info(f"Device {i} seems unavailable, Proceeding to check subsequent devices.")
continue
elif is_xpu_available():
for i in range(torch.xpu.device_count()):
try:
_ = torch.tensor(0, device=torch.device("xpu", i))
max_memory = {i: torch.xpu.max_memory_allocated(i) for i in range(torch.xpu.device_count())}
else:
for i in range(torch.cuda.device_count()):
max_memory[i] = torch.xpu.max_memory_allocated(i)
except Exception:
logger.info(f"Device {i} seems unavailable, Proceeding to check subsequent devices.")
continue
else:
for i in range(torch.cuda.device_count()):
try:
_ = torch.tensor([0], device=i)
max_memory = {i: torch.cuda.mem_get_info(i)[0] for i in range(torch.cuda.device_count())}
max_memory[i] = torch.cuda.mem_get_info(i)[0]
except Exception:
logger.info(f"Device {i} seems unavailable, Proceeding to check subsequent devices.")
continue
# allocate everything in the mps device as the RAM is shared
if is_mps_available():
max_memory["mps"] = psutil.virtual_memory().available
@ -914,6 +929,17 @@ def load_offloaded_weights(model, index, offload_folder):
set_module_tensor_to_device(model, param_name, "cpu", value=weight, fp16_statistics=fp16_statistics)
def get_module_leaves(module_sizes):
    """
    Returns the names in `module_sizes` that are not the parent of any other name.

    A name is a parent when another entry equals it followed by `.` and one more (dot-free) component. The empty
    name (the root) is never reported as a leaf. The order of `module_sizes` is preserved.

    Args:
        module_sizes: A mapping (or other re-iterable collection) keyed by dotted module names.
    """
    # Every dotted name contributes its direct parent, i.e. everything before the last dot.
    parents = {name.rsplit(".", 1)[0] for name in module_sizes if "." in name}
    return [name for name in module_sizes if name != "" and name not in parents]
def get_balanced_memory(
model: nn.Module,
max_memory: Optional[Dict[Union[int, str], Union[int, str]]] = None,
@ -1023,10 +1049,10 @@ def get_balanced_memory(
buffer = 0
# Compute mean of final modules. In the first dict of module sizes, leaves are the parameters
leaves = [n for n in module_sizes if len([p for p in module_sizes if n == "" or p.startswith(n + ".")]) == 0]
leaves = get_module_leaves(module_sizes)
module_sizes = {n: v for n, v in module_sizes.items() if n not in leaves}
# Once removed, leaves are the final modules.
leaves = [n for n in module_sizes if len([p for p in module_sizes if n == "" or p.startswith(n + ".")]) == 0]
leaves = get_module_leaves(module_sizes)
mean_leaves = int(sum([module_sizes[n] for n in leaves]) / max(len(leaves), 1))
buffer = int(1.25 * max(buffer, mean_leaves))
per_gpu += buffer
@ -1783,7 +1809,7 @@ def get_mixed_precision_context_manager(native_amp: bool = False, autocast_kwarg
)
if state.mixed_precision == "fp16":
return torch.autocast(device_type=device_type, dtype=torch.float16, **autocast_kwargs)
elif state.mixed_precision == "bf16" and state.distributed_type in [
elif state.mixed_precision in ["bf16", "fp8"] and state.distributed_type in [
DistributedType.NO,
DistributedType.MULTI_CPU,
DistributedType.MULTI_GPU,

View File

@ -164,10 +164,7 @@ def send_to_device(tensor, device, non_blocking=False, skip_keys=None):
if is_npu_available():
if isinstance(device, int):
device = f"npu:{device}"
else:
raise error
except Exception as error:
if is_xpu_available():
elif is_xpu_available():
if isinstance(device, int):
device = f"xpu:{device}"
else:

View File

@ -109,6 +109,8 @@ def synchronize_rng_state(rng_type: Optional[RNGType] = None, generator: Optiona
torch.cuda.set_rng_state(rng_state)
elif rng_type == RNGType.NPU:
torch.npu.set_rng_state(rng_state)
elif rng_type == RNGType.MLU:
torch.mlu.set_rng_state(rng_state)
elif rng_type == RNGType.XPU:
torch.xpu.set_rng_state(rng_state)
elif rng_type == RNGType.XLA:

View File

@ -12,6 +12,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import warnings
from .imports import is_tqdm_available
@ -21,7 +23,7 @@ if is_tqdm_available():
from ..state import PartialState
def tqdm(main_process_only: bool = True, *args, **kwargs):
def tqdm(*args, main_process_only: bool = True, **kwargs):
"""
Wrapper around `tqdm.tqdm` that optionally displays only on the main process.
@ -31,7 +33,15 @@ def tqdm(main_process_only: bool = True, *args, **kwargs):
"""
if not is_tqdm_available():
raise ImportError("Accelerate's `tqdm` module requires `tqdm` to be installed. Please run `pip install tqdm`.")
disable = False
if main_process_only:
if len(args) > 0 and isinstance(args[0], bool):
warnings.warn(
f"Passing `{args[0]}` as the first argument to Accelerate's `tqdm` wrapper is deprecated "
"and will be removed in v0.33.0. Please use the `main_process_only` keyword argument instead.",
FutureWarning,
)
main_process_only = args[0]
args = args[1:]
disable = kwargs.pop("disable", False)
if main_process_only and not disable:
disable = PartialState().local_process_index != 0
return _tqdm(*args, **kwargs, disable=disable)

View File

@ -51,12 +51,14 @@ from accelerate.utils.deepspeed import (
DummyScheduler,
)
from accelerate.utils.other import patch_environment
from accelerate.utils.versions import compare_versions
set_seed(42)
GPT2_TINY = "sshleifer/tiny-gpt2"
MOBILEVIT = "apple/mobilevit-xx-small"
QWEN_MOE = "peft-internal-testing/tiny-random-qwen-1.5-MoE"
ZERO2 = "zero2"
ZERO3 = "zero3"
@ -811,6 +813,30 @@ class DeepSpeedConfigIntegration(AccelerateTestCase):
)
assert deepspeed_plugin.zero_stage == int(stage.replace("zero", ""))
def test_prepare_deepspeed_prepare_moe(self):
    """
    Check that preparing a MoE model with ZeRO-3 flags every `Qwen2MoeSparseMoeBlock` as a ZeRO-3 leaf module.

    The plugin is built with `transformer_moe_cls_names="Qwen2MoeSparseMoeBlock"`; after `accelerator.prepare`, each
    module of that class is expected to carry a truthy `_z3_leaf` attribute.
    """
    # The test needs BOTH a recent enough transformers (the `qwen2_moe` modeling module is imported below) and a
    # recent enough DeepSpeed. Bailing out only when both are too old would still run the test with one of the two
    # requirements unmet, so skip as soon as either one is missing. `skipTest` (rather than a bare `return`) makes
    # the run show up as skipped instead of as a pass.
    if compare_versions("transformers", "<", "4.40") or compare_versions("deepspeed", "<", "0.14"):
        self.skipTest("Requires transformers>=4.40 and deepspeed>=0.14")

    deepspeed_plugin = DeepSpeedPlugin(
        zero3_init_flag=True,
        gradient_accumulation_steps=1,
        gradient_clipping=1.0,
        zero_stage=3,
        offload_optimizer_device="none",
        offload_param_device="none",
        zero3_save_16bit_model=True,
        transformer_moe_cls_names="Qwen2MoeSparseMoeBlock",
    )
    with mockenv_context(**self.dist_env):
        accelerator = Accelerator(mixed_precision="fp16", deepspeed_plugin=deepspeed_plugin)
        # Set explicitly because only a model (no dataloader) is handed to `prepare` below.
        accelerator.state.deepspeed_plugin.deepspeed_config["train_micro_batch_size_per_gpu"] = 1
        model = AutoModelForCausalLM.from_pretrained(QWEN_MOE)
        model = accelerator.prepare(model)
        # Imported lazily so that collecting this test file does not depend on the installed transformers version.
        from transformers.models.qwen2_moe.modeling_qwen2_moe import Qwen2MoeSparseMoeBlock

        for module in model.modules():
            if isinstance(module, Qwen2MoeSparseMoeBlock):
                assert hasattr(module, "_z3_leaf") and module._z3_leaf
def test_basic_run(self):
test_file_path = path_in_accelerate_package("test_utils", "scripts", "external_deps", "test_performance.py")
with tempfile.TemporaryDirectory() as dirpath:

View File

@ -17,6 +17,7 @@ import pickle
import tempfile
from unittest.mock import patch
import psutil
import pytest
import torch
from parameterized import parameterized
@ -196,14 +197,25 @@ class AcceleratorTester(AccelerateTestCase):
def test_free_memory_dereferences_prepared_components(self):
accelerator = Accelerator()
model, optimizer, scheduler, train_dl, valid_dl = create_components()
accelerator.prepare(model, optimizer, scheduler, train_dl, valid_dl)
# Free up refs with empty_cache() and gc.collect()
accelerator.free_memory()
model, optimizer, scheduler, train_dl, valid_dl = create_components()
free_cpu_ram_before = psutil.virtual_memory().available // 1024 // 1024
model, optimizer, scheduler, train_dl, valid_dl = accelerator.prepare(
model, optimizer, scheduler, train_dl, valid_dl
)
model, optimizer, scheduler, train_dl, valid_dl = accelerator.free_memory(
model, optimizer, scheduler, train_dl, valid_dl
)
free_cpu_ram_after = psutil.virtual_memory().available // 1024 // 1024
assert len(accelerator._models) == 0
assert len(accelerator._optimizers) == 0
assert len(accelerator._schedulers) == 0
assert len(accelerator._dataloaders) == 0
# The less-than comes *specifically* from CUDA CPU things/won't be present on CPU builds
assert free_cpu_ram_after <= free_cpu_ram_before
@require_non_torch_xla
def test_env_var_device(self):

View File

@ -35,14 +35,19 @@ from accelerate.hooks import remove_hook_from_submodules
from accelerate.test_utils import (
require_bnb,
require_cuda,
require_mps,
require_multi_device,
require_multi_gpu,
require_non_cpu,
require_non_torch_xla,
slow,
torch_device,
)
from accelerate.utils import is_torch_version, offload_state_dict
torch_device = f"{torch_device}:0" if torch_device != "cpu" else "cpu"
class ModelForTest(nn.Module):
def __init__(self):
super().__init__()
@ -175,17 +180,9 @@ class BigModelingTester(unittest.TestCase):
with init_empty_weights():
_ = nn.Sequential(*[nn.Linear(10000, 10000) for _ in range(1000)])
@require_cuda
def test_init_on_device_cuda(self):
device = torch.device("cuda:0")
with init_on_device(device):
model = nn.Linear(10, 10)
assert model.weight.device == device
assert model.weight.device == device
@require_mps
def test_init_on_device_mps(self):
device = torch.device("mps:0")
@require_non_cpu
def test_init_on_device(self):
device = torch.device(torch_device)
with init_on_device(device):
model = nn.Linear(10, 10)
assert model.weight.device == device
@ -196,7 +193,7 @@ class BigModelingTester(unittest.TestCase):
x = torch.randn(2, 3)
expected = model(x)
device = torch.device(0 if torch.cuda.is_available() else "cpu")
device = torch.device(torch_device)
cpu_offload(model, execution_device=device)
output = model(x)
@ -214,7 +211,7 @@ class BigModelingTester(unittest.TestCase):
x = torch.randn(2, 3)
expected = model(x)
device = torch.device(0 if torch.cuda.is_available() else "cpu")
device = torch.device(torch_device)
cpu_offload(model, execution_device=device, preload_module_classes=["ModuleWithUnusedSubModules"])
output = model(x)
@ -233,10 +230,10 @@ class BigModelingTester(unittest.TestCase):
assert torch.allclose(expected, output.cpu(), 1e-4, 1e-5), f"Expected: {expected}, Actual: {output.cpu()}"
@slow
@require_cuda
@require_non_cpu
def test_cpu_offload_gpt2(self):
tokenizer = AutoTokenizer.from_pretrained("gpt2")
inputs = tokenizer("Hello world! My name is", return_tensors="pt").to(0)
inputs = tokenizer("Hello world! My name is", return_tensors="pt").to(torch_device)
gpt2 = AutoModelForCausalLM.from_pretrained("gpt2")
cpu_offload(gpt2, execution_device=0)
@ -251,7 +248,7 @@ class BigModelingTester(unittest.TestCase):
x = torch.randn(2, 3)
expected = model(x)
device = torch.device(0 if torch.cuda.is_available() else "cpu")
device = torch.device(torch_device)
with TemporaryDirectory() as tmp_dir:
disk_offload(model, tmp_dir, execution_device=device)
@ -271,7 +268,7 @@ class BigModelingTester(unittest.TestCase):
x = torch.randn(2, 3)
expected = model(x)
device = torch.device(0 if torch.cuda.is_available() else "cpu")
device = torch.device(torch_device)
with TemporaryDirectory() as tmp_dir:
disk_offload(
@ -295,10 +292,10 @@ class BigModelingTester(unittest.TestCase):
assert torch.allclose(expected, output.cpu(), 1e-4, 1e-5), f"Expected: {expected}, Actual: {output.cpu()}"
@slow
@require_cuda
@require_non_cpu
def test_disk_offload_gpt2(self):
tokenizer = AutoTokenizer.from_pretrained("gpt2")
inputs = tokenizer("Hello world! My name is", return_tensors="pt").to(0)
inputs = tokenizer("Hello world! My name is", return_tensors="pt").to(torch_device)
gpt2 = AutoModelForCausalLM.from_pretrained("gpt2")
with TemporaryDirectory() as tmp_dir:
@ -309,7 +306,7 @@ class BigModelingTester(unittest.TestCase):
== "Hello world! My name is Kiyoshi, and I'm a student at the University of Tokyo"
)
@require_cuda
@require_non_cpu
def test_dispatch_model(self):
model = ModelForTest()
device_map = {"linear1": "disk", "batchnorm": "cpu", "linear2": 0}
@ -322,7 +319,7 @@ class BigModelingTester(unittest.TestCase):
output = model(x)
assert torch.allclose(expected, output.cpu(), atol=1e-5)
@require_cuda
@require_non_cpu
def test_dispatch_model_with_non_persistent_buffers(self):
model = ModelForTestNonPersistentBuffers()
device_map = {"linear1": 0, "batchnorm": "cpu", "linear2": "disk"}
@ -334,20 +331,7 @@ class BigModelingTester(unittest.TestCase):
output = model(x)
assert torch.allclose(expected, output.cpu(), atol=1e-5)
@require_mps
def test_dispatch_model_mps(self):
model = ModelForTest()
device_map = {"linear1": "mps", "batchnorm": "disk", "linear2": "disk"}
x = torch.randn(2, 3)
expected = model(x)
with TemporaryDirectory() as tmp_dir:
dispatch_model(model, device_map, offload_dir=tmp_dir)
output = model(x)
assert torch.allclose(expected, output.cpu(), atol=1e-5)
@require_cuda
@require_non_cpu
def test_dispatch_model_tied_weights(self):
model = ModelForTestTiedWeights()
model.linear1.weight = model.linear2.weight
@ -597,8 +581,8 @@ class BigModelingTester(unittest.TestCase):
assert (free_memory_bytes_after_infer - free_memory_bytes_after_dispatch) * 1e-6 < 130
@require_multi_gpu
def test_dispatch_model_multi_gpu(self):
@require_multi_device
def test_dispatch_model_multi_devices(self):
model = BiggerModelForTest()
device_map = {"linear1": "cpu", "linear2": "disk", "batchnorm": "cpu", "linear3": 0, "linear4": 1}
@ -610,7 +594,7 @@ class BigModelingTester(unittest.TestCase):
output = model(x)
assert torch.allclose(expected, output.cpu(), atol=1e-5)
@require_cuda
@require_non_cpu
def test_dispatch_model_copy(self):
original_model = ModelForTestCopy(id=1)
device_map = {"linear1": 0, "batchnorm": "cpu", "linear2": 0}
@ -629,7 +613,7 @@ class BigModelingTester(unittest.TestCase):
assert copied_model.linear1.forward is not original_model.linear1.forward
assert torch.allclose(expected, output.cpu(), atol=1e-5)
@require_cuda
@require_non_cpu
def test_dispatch_model_move_offloaded_model(self):
model = ModelForTest()
device_map = {"linear1": "disk", "batchnorm": "cpu", "linear2": 0}
@ -653,10 +637,10 @@ class BigModelingTester(unittest.TestCase):
model(x)
@slow
@require_multi_gpu
def test_dispatch_model_gpt2_on_two_gpus(self):
@require_multi_device
def test_dispatch_model_gpt2_on_two_devices(self):
tokenizer = AutoTokenizer.from_pretrained("gpt2")
inputs = tokenizer("Hello world! My name is", return_tensors="pt").to(0)
inputs = tokenizer("Hello world! My name is", return_tensors="pt").to(torch_device)
gpt2 = AutoModelForCausalLM.from_pretrained("gpt2")
# Dispatch on GPUs 0 and 1
@ -703,7 +687,7 @@ class BigModelingTester(unittest.TestCase):
== "Hello world! My name is Kiyoshi, and I'm a student at the University of Tokyo"
)
@require_cuda
@require_non_cpu
def test_dispatch_model_with_unused_submodules(self):
model = ModelWithUnusedSubModulesForTest()
device_map = {"linear1": "cpu", "linear2": "disk", "batchnorm": "cpu", "linear3": 0, "linear4": 0}
@ -718,23 +702,8 @@ class BigModelingTester(unittest.TestCase):
output = model(x)
assert torch.allclose(expected, output.cpu(), atol=1e-5)
@require_mps
def test_dispatch_model_with_unused_submodules_mps(self):
model = ModelWithUnusedSubModulesForTest()
device_map = {"linear1": "mps", "linear2": "mps", "batchnorm": "mps", "linear3": "mps", "linear4": "disk"}
x = torch.randn(2, 3)
expected = model(x)
with TemporaryDirectory() as tmp_dir:
dispatch_model(
model, device_map, offload_dir=tmp_dir, preload_module_classes=["ModuleWithUnusedSubModules"]
)
output = model(x)
assert torch.allclose(expected, output.cpu(), atol=1e-5)
@require_multi_gpu
def test_dispatch_model_with_unused_submodules_multi_gpu(self):
@require_multi_device
def test_dispatch_model_with_unused_submodules_multi_device(self):
model = ModelWithUnusedSubModulesForTest()
device_map = {"linear1": "cpu", "linear2": "disk", "batchnorm": "cpu", "linear3": 0, "linear4": 1}
@ -748,7 +717,7 @@ class BigModelingTester(unittest.TestCase):
output = model(x)
assert torch.allclose(expected, output.cpu(), atol=1e-5)
@require_cuda
@require_non_cpu
def test_dispatch_model_force_hooks(self):
model = ModelForTest()
device_map = {"": 0}
@ -760,7 +729,7 @@ class BigModelingTester(unittest.TestCase):
output = model(x)
assert torch.allclose(expected, output.cpu(), atol=1e-5)
@require_cuda
@require_non_cpu
def test_load_checkpoint_and_dispatch(self):
model = ModelForTest()
device_map = {"linear1": "cpu", "batchnorm": "cpu", "linear2": 0}
@ -782,32 +751,8 @@ class BigModelingTester(unittest.TestCase):
output = new_model(x)
assert torch.allclose(expected, output.cpu(), atol=1e-5)
@require_mps
def test_load_checkpoint_and_dispatch_mps(self):
model = ModelForTest()
device_map = {"linear1": "mps", "batchnorm": "mps", "linear2": "disk"}
x = torch.randn(2, 3)
expected = model(x)
with TemporaryDirectory() as tmp_dir:
checkpoint = os.path.join(tmp_dir, "pt_model.bin")
torch.save(model.state_dict(), checkpoint)
new_model = ModelForTest()
new_model = load_checkpoint_and_dispatch(
new_model, checkpoint, device_map=device_map, offload_folder=tmp_dir
)
# CPU-offloaded weights are on the meta device while waiting for the forward pass.
assert new_model.linear1.weight.device == torch.device("mps:0")
assert new_model.linear2.weight.device == torch.device("meta")
output = new_model(x)
assert torch.allclose(expected, output.cpu(), atol=1e-5)
@require_multi_gpu
def test_load_checkpoint_and_dispatch_multi_gpu(self):
@require_multi_device
def test_load_checkpoint_and_dispatch_multi_device(self):
model = BiggerModelForTest()
device_map = {"linear1": "cpu", "linear2": "cpu", "batchnorm": 0, "linear3": 0, "linear4": 1}
@ -830,7 +775,7 @@ class BigModelingTester(unittest.TestCase):
output = new_model(x)
assert torch.allclose(expected, output.cpu(), atol=1e-5)
@require_cuda
@require_non_cpu
def test_load_checkpoint_and_dispatch_with_unused_submodules(self):
model = ModelWithUnusedSubModulesForTest()
device_map = {"linear1": "cpu", "linear2": "cpu", "batchnorm": 0, "linear3": 0, "linear4": 0}
@ -856,38 +801,8 @@ class BigModelingTester(unittest.TestCase):
output = new_model(x)
assert torch.allclose(expected, output.cpu(), atol=1e-5)
@require_mps
def test_load_checkpoint_and_dispatch_with_unused_submodules_mps(self):
model = ModelWithUnusedSubModulesForTest()
device_map = {"linear1": "mps", "linear2": "mps", "batchnorm": "mps", "linear3": "disk", "linear4": "disk"}
x = torch.randn(2, 3)
expected = model(x)
with TemporaryDirectory() as tmp_dir:
checkpoint = os.path.join(tmp_dir, "pt_model.bin")
torch.save(model.state_dict(), checkpoint)
new_model = ModelWithUnusedSubModulesForTest()
new_model = load_checkpoint_and_dispatch(
new_model,
checkpoint,
device_map=device_map,
preload_module_classes=["ModuleWithUnusedSubModules"],
offload_folder=tmp_dir,
)
# CPU-offloaded weights are on the meta device while waiting for the forward pass.
assert new_model.linear1.linear.weight.device == torch.device("mps:0")
assert new_model.linear2.linear.weight.device == torch.device("mps:0")
assert new_model.linear3.linear.weight.device == torch.device("meta")
assert new_model.linear4.linear.weight.device == torch.device("meta")
output = new_model(x)
assert torch.allclose(expected, output.cpu(), atol=1e-5)
@require_multi_gpu
def test_load_checkpoint_and_dispatch_multi_gpu_with_unused_submodules(self):
@require_multi_device
def test_load_checkpoint_and_dispatch_multi_device_with_unused_submodules(self):
model = ModelWithUnusedSubModulesForTest()
device_map = {"linear1": "cpu", "linear2": "cpu", "batchnorm": 0, "linear3": 0, "linear4": 1}
@ -912,7 +827,7 @@ class BigModelingTester(unittest.TestCase):
output = new_model(x)
assert torch.allclose(expected, output.cpu(), atol=1e-5)
@require_cuda
@require_non_cpu
def test_cpu_offload_with_hook(self):
model1 = torch.nn.Linear(4, 5)
model1, hook1 = cpu_offload_with_hook(model1)

View File

@ -20,7 +20,7 @@ from unittest.mock import patch
import torch
from huggingface_hub.utils import GatedRepoError, RepositoryNotFoundError
from accelerate.commands.config.config_args import BaseConfig, ClusterConfig, SageMakerConfig
from accelerate.commands.config.config_args import BaseConfig, ClusterConfig, SageMakerConfig, load_config_from_file
from accelerate.commands.estimate import estimate_command, estimate_command_parser, gather_data
from accelerate.commands.launch import _validate_launch_command, launch_command_parser
from accelerate.test_utils import execute_subprocess_async
@ -73,8 +73,9 @@ class AccelerateLauncherTester(unittest.TestCase):
execute_subprocess_async(cmd, env=os.environ.copy())
def test_config_compatibility(self):
invalid_configs = ["invalid", "mpi", "sagemaker"]
for config in sorted(self.test_config_path.glob("**/*.yaml")):
if "invalid" in str(config) or "mpi" in str(config):
if any(invalid_config in str(config) for invalid_config in invalid_configs):
continue
with self.subTest(config_file=config):
cmd = get_launch_command(config_file=config) + [self.test_file_path]
@ -196,6 +197,8 @@ class ClusterConfigTester(unittest.TestCase):
Test case for verifying the config dataclasses work
"""
test_config_path = Path("tests/test_configs")
def test_base_config(self):
# Tests that all the dataclasses can be initialized
config = BaseConfig(
@ -257,6 +260,8 @@ class ClusterConfigTester(unittest.TestCase):
assert config.ec2_instance_type == "MY_TYPE"
assert config.iam_role_name == "MY_ROLE"
config = load_config_from_file(str(self.test_config_path / "0_30_0_sagemaker.yaml"))
class TpuConfigTester(unittest.TestCase):
"""

View File

@ -0,0 +1,8 @@
compute_environment: AMAZON_SAGEMAKER
debug: false
distributed_type: NO
mixed_precision: fp16
debug: false
use_cpu: false
ec2_instance_type: MY_TYPE
iam_role_name: MY_ROLE

View File

@ -30,6 +30,7 @@ from accelerate.test_utils.testing import (
require_huggingface_suite,
require_multi_gpu,
require_pippy,
require_schedulefree,
require_trackers,
run_command,
slow,
@ -47,6 +48,7 @@ EXCLUDE_EXAMPLES = [
"local_sgd.py",
"multi_process_metrics.py",
"memory.py",
"schedule_free.py",
"automatic_gradient_accumulation.py",
"fsdp_with_peak_mem_tracking.py",
"deepspeed_with_config_support.py",
@ -216,6 +218,11 @@ class FeatureExamplesTests(TempDirTestCase):
testargs = ["examples/by_feature/multi_process_metrics.py"]
run_command(self.launch_args + testargs)
@require_schedulefree
def test_schedulefree(self):
    """Launch the `schedule_free.py` by-feature example through the configured launch command."""
    example_script = "examples/by_feature/schedule_free.py"
    run_command(self.launch_args + [example_script])
@require_trackers
@mock.patch.dict(os.environ, {"WANDB_MODE": "offline", "DVCLIVE_TEST": "true"})
def test_tracking(self):
@ -240,20 +247,30 @@ class FeatureExamplesTests(TempDirTestCase):
testargs = ["examples/by_feature/early_stopping.py"]
run_command(self.launch_args + testargs)
@require_multi_gpu
def test_distributed_inference_examples_stable_diffusion(self):
    """Launch the distributed-inference Stable Diffusion example through the configured launch command."""
    example_script = "examples/inference/distributed/stable_diffusion.py"
    run_command(self.launch_args + [example_script])
@require_multi_gpu
def test_distributed_inference_examples_phi2(self):
    """Launch the distributed-inference phi2 example through the configured launch command."""
    example_script = "examples/inference/distributed/phi2.py"
    run_command(self.launch_args + [example_script])
@require_pippy
@require_multi_gpu
def test_pippy_examples_bert(self):
testargs = ["examples/inference/bert.py"]
testargs = ["examples/inference/pippy/bert.py"]
run_command(self.launch_args + testargs)
@require_pippy
@require_multi_gpu
def test_pippy_examples_gpt2(self):
testargs = ["examples/inference/gpt2.py"]
testargs = ["examples/inference/pippy/gpt2.py"]
run_command(self.launch_args + testargs)
@require_pippy
@require_multi_gpu
def test_pippy_examples_t5(self):
testargs = ["examples/inference/t5.py"]
testargs = ["examples/inference/pippy/t5.py"]
run_command(self.launch_args + testargs)

View File

@ -28,7 +28,10 @@ from accelerate.hooks import (
remove_hook_from_module,
remove_hook_from_submodules,
)
from accelerate.test_utils import require_multi_gpu
from accelerate.test_utils import require_multi_device, torch_device
torch_device = f"{torch_device}:0" if torch_device != "cpu" else "cpu"
class ModelForTest(nn.Module):
@ -150,7 +153,7 @@ class HooksModelTester(unittest.TestCase):
output1 = test_model(x)
assert not output1.requires_grad
@require_multi_gpu
@require_multi_device
def test_align_devices_as_model_parallelism(self):
model = ModelForTest()
# Everything is on CPU
@ -175,7 +178,7 @@ class HooksModelTester(unittest.TestCase):
# We can add a general hook to put back output on same device as input.
add_hook_to_module(model, AlignDevicesHook(io_same_device=True))
x = torch.randn(2, 3).to(0)
x = torch.randn(2, 3).to(torch_device)
output = model(x)
assert output.device == torch.device(0)
@ -188,7 +191,7 @@ class HooksModelTester(unittest.TestCase):
assert model.linear2.weight.device == torch.device("cpu")
# This will move each submodule on different devices
hook_kwargs = {"execution_device": 0 if torch.cuda.is_available() else "cpu", "offload": True}
hook_kwargs = {"execution_device": torch_device, "offload": True}
add_hook_to_module(model.linear1, AlignDevicesHook(**hook_kwargs))
add_hook_to_module(model.batchnorm, AlignDevicesHook(**hook_kwargs))
@ -216,7 +219,7 @@ class HooksModelTester(unittest.TestCase):
# Now test with buffers included in the offload
hook_kwargs = {
"execution_device": 0 if torch.cuda.is_available() else "cpu",
"execution_device": torch_device,
"offload": True,
"offload_buffers": True,
}
@ -252,7 +255,7 @@ class HooksModelTester(unittest.TestCase):
assert model.linear2.weight.device == torch.device("cpu")
# This will move each submodule on different devices
execution_device = 0 if torch.cuda.is_available() else "cpu"
execution_device = torch_device
attach_align_device_hook(model, execution_device=execution_device, offload=True)
# Parameters have been offloaded, so on the meta device
@ -301,7 +304,7 @@ class HooksModelTester(unittest.TestCase):
assert model.linear2.weight.device == torch.device("cpu")
# This will move each submodule on different devices
execution_device = 0 if torch.cuda.is_available() else "cpu"
execution_device = torch_device
attach_align_device_hook(
model, execution_device=execution_device, offload=True, weights_map=model.state_dict()
)

91
tests/test_logging.py Normal file
View File

@ -0,0 +1,91 @@
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import inspect
import logging
import os
import pytest
from accelerate import Accelerator
from accelerate.logging import get_logger
def current_lineno() -> int:
    """Return the line number of the statement that called this helper."""
    # `f_back` is the caller's frame; its `f_lineno` is the line currently executing there.
    return inspect.currentframe().f_back.f_lineno
class CustomLogger(logging.LoggerAdapter):
    """Stand-in for a user-written logger wrapper that always logs with `stacklevel=3`."""

    def log(self, level, msg, *args, **kwargs):
        # A user wrapper may want to pick its own `stacklevel`, and `accelerate.logging` is expected to honor it.
        # With the value `3`, a call such as `CustomLogger.warning()` should be attributed to the line that made
        # that call, not to the `self.logger.log()` line below.
        forwarded = {**kwargs, "stacklevel": 3}
        self.logger.log(level, msg, *args, **forwarded)
@pytest.fixture(scope="module")
def accelerator():
    """Module-scoped fixture that builds one `Accelerator` for the tests that request it."""
    # NOTE(review): presumably the accelerate logger needs initialized state before it can log - confirm.
    shared_accelerator = Accelerator()
    return shared_accelerator
@pytest.mark.usefixtures("accelerator")
def test_log_stack(caplog):
    """Check that a record logged through `get_logger` is attributed to this test's own file, function and line."""
    log = get_logger(__name__)
    logging.basicConfig(
        format="%(filename)s:%(name)s:%(lineno)s:%(funcName)s - %(message)s",
        datefmt="%m/%d %H:%M:%S",
    )

    text = "Test"
    # Keep this directly above the logging call: the expected callsite is "the next line".
    expected_lineno = current_lineno() + 1
    log.warning(text)

    assert len(caplog.records) == 1
    rec = caplog.records[0]
    observed = (rec.levelname, rec.filename, rec.name, rec.lineno, rec.funcName, rec.message)
    expected = (
        logging.getLevelName(logging.WARNING),
        os.path.basename(__file__),
        __name__,
        expected_lineno,
        test_log_stack.__name__,
        text,
    )
    assert observed == expected
@pytest.mark.usefixtures("accelerator")
def test_custom_stacklevel(caplog):
    """Check that a `stacklevel` chosen by a user wrapper is respected by the logger returned from `get_logger`."""
    inner_logger = get_logger(__name__)
    logging.basicConfig(
        format="%(filename)s:%(name)s:%(lineno)s:%(funcName)s - %(message)s",
        datefmt="%m/%d %H:%M:%S",
    )
    logger = CustomLogger(inner_logger, {})

    text = "Test"
    # Keep this directly above the logging call: the expected callsite is "the next line".
    expected_lineno = current_lineno() + 1
    logger.warning(text)

    # `CustomLogger.log` forces `stacklevel=3`, so the record must point at the `logger.warning` line above
    # rather than at a line inside `CustomLogger` or the wrapped logger.
    assert len(caplog.records) == 1
    rec = caplog.records[0]
    observed = (rec.levelname, rec.filename, rec.name, rec.lineno, rec.funcName, rec.message)
    expected = (
        logging.getLevelName(logging.WARNING),
        os.path.basename(__file__),
        __name__,
        expected_lineno,
        test_custom_stacklevel.__name__,
        text,
    )
    assert observed == expected

View File

@ -31,6 +31,7 @@ from accelerate.test_utils import (
require_multi_gpu,
require_non_torch_xla,
require_pippy,
require_torchvision,
)
from accelerate.utils import patch_environment
@ -76,6 +77,7 @@ class MultiDeviceTester(unittest.TestCase):
@require_multi_gpu
@require_pippy
@require_torchvision
@require_huggingface_suite
def test_pippy(self):
"""

View File

@ -29,9 +29,11 @@ from accelerate.state import PartialState
from accelerate.test_utils.testing import (
require_cuda,
require_huggingface_suite,
require_non_cpu,
require_non_torch_xla,
require_torch_min_version,
require_tpu,
torch_device,
)
from accelerate.test_utils.training import RegressionModel
from accelerate.utils import (
@ -51,6 +53,7 @@ from accelerate.utils import (
recursively_apply,
save,
send_to_device,
tqdm,
)
from accelerate.utils.operations import is_namedtuple
@ -70,7 +73,7 @@ class UtilsTester(unittest.TestCase):
def test_send_to_device(self):
tensor = torch.randn(5, 2)
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
device = torch.device(f"{torch_device}:0")
result1 = send_to_device(tensor, device)
assert torch.equal(result1.cpu(), tensor)
@ -178,11 +181,11 @@ class UtilsTester(unittest.TestCase):
model = extract_model_from_parallel(model, keep_fp32_wrapper=False)
_ = pickle.dumps(model)
@require_cuda
@require_non_cpu
def test_can_undo_fp16_conversion(self):
model = RegressionModel()
model._original_forward = model.forward
model.forward = torch.cuda.amp.autocast(dtype=torch.float16)(model.forward)
model.forward = torch.autocast(device_type=torch_device, dtype=torch.float16)(model.forward)
model.forward = convert_outputs_to_fp32(model.forward)
model = extract_model_from_parallel(model, keep_fp32_wrapper=False)
_ = pickle.dumps(model)
@ -401,3 +404,9 @@ class UtilsTester(unittest.TestCase):
with self.assertLogs("accelerate.utils.environment", level="WARNING"):
valid_env_items = convert_dict_to_env_variables(env)
assert valid_env_items == ["ACCELERATE_DEBUG_MODE=1\n", "OTHER_ENV=2\n"]
def test_tqdm_deprecation(self):
    """Check that a leading positional bool triggers the `FutureWarning`, and that the keyword form is accepted."""
    with pytest.warns(FutureWarning) as caught:
        tqdm(True, range(3), disable=True)
    warning_text = caught.pop().message.args[0]
    assert "Passing `True` as the first argument to" in warning_text
    # Keyword form of the same option.
    tqdm(range(3), main_process_only=True, disable=True)