Mirror of https://github.com/huggingface/accelerate.git, synced 2025-11-15 06:39:30 +08:00

Compare commits: v0.29.3 ... use-partia (66 commits)
| SHA1 | Author | Date | |
|---|---|---|---|
| 72e214f561 | |||
| ab14a5e6a1 | |||
| 27a607ea90 | |||
| aa21174de9 | |||
| 6cf1cc0a39 | |||
| bb465a9cf0 | |||
| 67308ca6ef | |||
| 63772f6ac2 | |||
| 8798cf06ab | |||
| 47bb2dd53e | |||
| 724824abbe | |||
| afc2c99e6a | |||
| 0fb95a2d3b | |||
| 7ac153f404 | |||
| 0f1b91bb74 | |||
| d1eb44c856 | |||
| 11a363287a | |||
| 5cfe409443 | |||
| 5b3a7f3892 | |||
| 060361fca3 | |||
| 6ac27e2383 | |||
| ba5f49219f | |||
| 2c767338f2 | |||
| 234a85506d | |||
| 232ebd159a | |||
| 4d3d4bc88f | |||
| 2b1e7bd462 | |||
| c7e5e41b8c | |||
| 9557598c45 | |||
| 156331aecd | |||
| cd7df4117d | |||
| 6af157ea93 | |||
| 83317b3081 | |||
| e831bcb3b1 | |||
| 092c3af0c4 | |||
| 3e944c5583 | |||
| f67737363c | |||
| f7daaaa305 | |||
| 3dc131cd8d | |||
| ef0f62c12a | |||
| baafaf4a6e | |||
| abc86c0e35 | |||
| 4450cb3132 | |||
| fd0dcd1c45 | |||
| f478201c28 | |||
| c7046845e7 | |||
| 701e24c539 | |||
| 37da848e6c | |||
| c470a1336a | |||
| 581a390e2f | |||
| 2fc48c7eee | |||
| 1024231133 | |||
| 5ca095a34f | |||
| b77c65398c | |||
| a91691463b | |||
| 5056d327f8 | |||
| c0a37015e3 | |||
| e9b9c7d022 | |||
| 6c09584f73 | |||
| b8c8583953 | |||
| df485ae1e3 | |||
| 6386f70103 | |||
| 6d92198ef4 | |||
| 16488be9a4 | |||
| 685bd3a439 | |||
| 2e69948c1a |
@ -58,3 +58,24 @@ jobs:
|
||||
file: docker/accelerate-gpu/Dockerfile
|
||||
push: true
|
||||
tags: huggingface/accelerate:gpu-release-${{needs.get-version.outputs.version}}
|
||||
|
||||
version-cuda-deepspeed:
|
||||
name: "Latest Accelerate GPU DeepSpeed [version]"
|
||||
runs-on: [self-hosted, single-gpu, nvidia-gpu, t4, ci]
|
||||
needs: get-version
|
||||
steps:
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@v2
|
||||
- name: Login to DockerHub
|
||||
uses: docker/login-action@v2
|
||||
with:
|
||||
username: ${{ secrets.DOCKERHUB_USERNAME }}
|
||||
password: ${{ secrets.DOCKERHUB_PASSWORD }}
|
||||
|
||||
- name: Build and Push GPU
|
||||
uses: docker/build-push-action@v4
|
||||
with:
|
||||
file: docker/accelerate-gpu-deepspeed/Dockerfile
|
||||
push: true
|
||||
tags: huggingface/accelerate:gpu-deepspeed-release-${{needs.get-version.outputs.version}}
|
||||
|
||||
|
||||
27
.github/workflows/build_docker_images.yml
vendored
@ -57,4 +57,29 @@ jobs:
|
||||
push: true
|
||||
tags: |
|
||||
huggingface/accelerate:gpu-nightly
|
||||
huggingface/accelerate:gpu-nightly-${{ env.date }}
|
||||
huggingface/accelerate:gpu-nightly-${{ env.date }}
|
||||
|
||||
latest-cuda-deepspeed:
|
||||
name: "Latest Accelerate GPU DeepSpeed [dev]"
|
||||
runs-on: [self-hosted, nvidia-gpu, t4, ci]
|
||||
steps:
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@v2
|
||||
- name: Login to DockerHub
|
||||
uses: docker/login-action@v2
|
||||
with:
|
||||
username: ${{ secrets.DOCKERHUB_USERNAME }}
|
||||
password: ${{ secrets.DOCKERHUB_PASSWORD }}
|
||||
- name: Get current date
|
||||
id: date
|
||||
run: |
|
||||
echo "date=$(date '+%Y-%m-%d')" >> $GITHUB_ENV
|
||||
- name: Build and Push GPU
|
||||
uses: docker/build-push-action@v4
|
||||
with:
|
||||
file: docker/accelerate-gpu-deepspeed/Dockerfile
|
||||
push: true
|
||||
tags: |
|
||||
huggingface/accelerate:gpu-deepspeed-nightly
|
||||
huggingface/accelerate:gpu-deepspeed-nightly-${{ env.date }}
|
||||
|
||||
|
||||
1
.github/workflows/build_documentation.yml
vendored
@ -13,5 +13,6 @@ jobs:
|
||||
with:
|
||||
commit_sha: ${{ github.sha }}
|
||||
package: accelerate
|
||||
custom_container: huggingface/transformers-doc-builder
|
||||
secrets:
|
||||
hf_token: ${{ secrets.HF_DOC_BUILD_PUSH }}
|
||||
|
||||
1
.github/workflows/build_pr_documentation.yml
vendored
@ -14,3 +14,4 @@ jobs:
|
||||
commit_sha: ${{ github.event.pull_request.head.sha }}
|
||||
pr_number: ${{ github.event.number }}
|
||||
package: accelerate
|
||||
custom_container: huggingface/transformers-doc-builder
|
||||
|
||||
126
.github/workflows/nightly.yml
vendored
@ -12,13 +12,13 @@ env:
|
||||
|
||||
|
||||
jobs:
|
||||
run_all_tests_single_gpu:
|
||||
run_core_tests_single_gpu:
|
||||
runs-on: [self-hosted, single-gpu, nvidia-gpu, t4, ci]
|
||||
env:
|
||||
CUDA_VISIBLE_DEVICES: "0"
|
||||
TEST_TYPE: "single_gpu"
|
||||
container:
|
||||
image: huggingface/accelerate-gpu:latest
|
||||
image: huggingface/accelerate:gpu-nightly
|
||||
options: --gpus all --shm-size "16gb"
|
||||
defaults:
|
||||
run:
|
||||
@ -33,6 +33,11 @@ jobs:
|
||||
pip install -e . --no-deps
|
||||
pip install pytest-reportlog tabulate
|
||||
|
||||
- name: Show installed libraries
|
||||
run: |
|
||||
source activate accelerate;
|
||||
pip freeze
|
||||
|
||||
- name: Run test on GPUs
|
||||
working-directory: accelerate
|
||||
run: |
|
||||
@ -54,13 +59,67 @@ jobs:
|
||||
pip install slack_sdk tabulate
|
||||
python utils/log_reports.py >> $GITHUB_STEP_SUMMARY
|
||||
|
||||
run_all_tests_multi_gpu:
|
||||
run_deepspeed_tests_single_gpu:
|
||||
runs-on: [self-hosted, single-gpu, nvidia-gpu, t4, ci]
|
||||
env:
|
||||
CUDA_VISIBLE_DEVICES: "0"
|
||||
TEST_TYPE: "single_gpu_deepspeed"
|
||||
container:
|
||||
image: huggingface/accelerate:gpu-deepspeed-nightly
|
||||
options: --gpus all --shm-size "16gb"
|
||||
defaults:
|
||||
run:
|
||||
shell: bash
|
||||
steps:
|
||||
- name: Update clone & pip install
|
||||
run: |
|
||||
source activate accelerate
|
||||
git clone https://github.com/huggingface/accelerate;
|
||||
cd accelerate;
|
||||
git checkout ${{ github.sha }};
|
||||
pip install -e . --no-deps
|
||||
pip install pytest-reportlog tabulate
|
||||
|
||||
- name: Show installed libraries
|
||||
run: |
|
||||
source activate accelerate;
|
||||
pip freeze
|
||||
|
||||
- name: Run test on GPUs
|
||||
working-directory: accelerate
|
||||
run: |
|
||||
source activate accelerate
|
||||
make test_deepspeed
|
||||
|
||||
- name: Run Integration tests on GPUs
|
||||
working-directory: accelerate
|
||||
if: always()
|
||||
run: |
|
||||
source activate accelerate
|
||||
make test_integrations
|
||||
|
||||
- name: Run examples on GPUs
|
||||
working-directory: accelerate
|
||||
if: always()
|
||||
run: |
|
||||
source activate accelerate
|
||||
pip uninstall comet_ml -y
|
||||
make test_examples
|
||||
|
||||
- name: Generate Report
|
||||
working-directory: accelerate
|
||||
if: always()
|
||||
run: |
|
||||
pip install slack_sdk tabulate
|
||||
python utils/log_reports.py >> $GITHUB_STEP_SUMMARY
|
||||
|
||||
run_core_tests_multi_gpu:
|
||||
runs-on: [self-hosted, multi-gpu, nvidia-gpu, t4, ci]
|
||||
env:
|
||||
CUDA_VISIBLE_DEVICES: "0,1"
|
||||
TEST_TYPE: "multi_gpu"
|
||||
container:
|
||||
image: huggingface/accelerate-gpu:latest
|
||||
image: huggingface/accelerate:gpu-nightly
|
||||
options: --gpus all --shm-size "16gb"
|
||||
defaults:
|
||||
run:
|
||||
@ -75,6 +134,11 @@ jobs:
|
||||
pip install -e . --no-deps
|
||||
pip install pytest-reportlog tabulate
|
||||
|
||||
- name: Show installed libraries
|
||||
run: |
|
||||
source activate accelerate;
|
||||
pip freeze
|
||||
|
||||
- name: Run core and big modeling tests on GPUs
|
||||
working-directory: accelerate
|
||||
run: |
|
||||
@ -105,6 +169,60 @@ jobs:
|
||||
pip install slack_sdk tabulate
|
||||
python utils/log_reports.py >> $GITHUB_STEP_SUMMARY
|
||||
|
||||
run_deepspeed_tests_multi_gpu:
|
||||
runs-on: [self-hosted, multi-gpu, nvidia-gpu, t4, ci]
|
||||
env:
|
||||
CUDA_VISIBLE_DEVICES: "0,1"
|
||||
TEST_TYPE: "multi_gpu_deepspeed"
|
||||
container:
|
||||
image: huggingface/accelerate:gpu-deepspeed-nightly
|
||||
options: --gpus all --shm-size "16gb"
|
||||
defaults:
|
||||
run:
|
||||
shell: bash
|
||||
steps:
|
||||
- name: Update clone
|
||||
run: |
|
||||
source activate accelerate
|
||||
git clone https://github.com/huggingface/accelerate;
|
||||
cd accelerate;
|
||||
git checkout ${{ github.sha }};
|
||||
pip install -e . --no-deps
|
||||
pip install pytest-reportlog tabulate
|
||||
|
||||
- name: Show installed libraries
|
||||
run: |
|
||||
source activate accelerate;
|
||||
pip freeze
|
||||
|
||||
- name: Run DeepSpeed tests
|
||||
working-directory: accelerate
|
||||
run: |
|
||||
source activate accelerate
|
||||
make test_deepspeed
|
||||
|
||||
- name: Run Integration tests on GPUs
|
||||
working-directory: accelerate
|
||||
if: always()
|
||||
run: |
|
||||
source activate accelerate
|
||||
make test_integrations
|
||||
|
||||
- name: Run examples on GPUs
|
||||
working-directory: accelerate
|
||||
if: always()
|
||||
run: |
|
||||
source activate accelerate
|
||||
pip uninstall comet_ml -y
|
||||
make test_examples
|
||||
|
||||
- name: Generate Report
|
||||
working-directory: accelerate
|
||||
if: always()
|
||||
run: |
|
||||
pip install slack_sdk tabulate
|
||||
python utils/log_reports.py >> $GITHUB_STEP_SUMMARY
|
||||
|
||||
|
||||
run-integration-tests:
|
||||
if: always()
|
||||
|
||||
90
.github/workflows/run_merge_tests.yml
vendored
@ -9,7 +9,7 @@ env:
|
||||
IS_GITHUB_CI: "1"
|
||||
|
||||
jobs:
|
||||
run_all_tests_single_gpu:
|
||||
run_core_tests_single_gpu:
|
||||
runs-on: [self-hosted, single-gpu, nvidia-gpu, t4, ci]
|
||||
env:
|
||||
CUDA_VISIBLE_DEVICES: "0"
|
||||
@ -29,6 +29,11 @@ jobs:
|
||||
pip install -e .[testing,test_trackers] -U;
|
||||
pip install pytest-reportlog tabulate ;
|
||||
|
||||
- name: Show installed libraries
|
||||
run: |
|
||||
source activate accelerate;
|
||||
pip freeze
|
||||
|
||||
- name: Run CLI tests (use make cli)
|
||||
working-directory: accelerate
|
||||
run: |
|
||||
@ -56,7 +61,46 @@ jobs:
|
||||
pip install tabulate;
|
||||
python utils/log_reports.py >> $GITHUB_STEP_SUMMARY
|
||||
|
||||
run_all_tests_multi_gpu:
|
||||
run_deepspeed_tests_single_gpu:
|
||||
runs-on: [self-hosted, single-gpu, nvidia-gpu, t4, ci]
|
||||
env:
|
||||
CUDA_VISIBLE_DEVICES: "0"
|
||||
container:
|
||||
image: huggingface/accelerate:gpu-deepspeed-nightly
|
||||
options: --gpus all --shm-size "16gb"
|
||||
defaults:
|
||||
run:
|
||||
shell: bash
|
||||
steps:
|
||||
- name: Install accelerate
|
||||
run: |
|
||||
source activate accelerate;
|
||||
git clone https://github.com/huggingface/accelerate;
|
||||
cd accelerate;
|
||||
git checkout ${{ github.sha }};
|
||||
pip install -e .[testing,test_trackers] -U;
|
||||
pip install pytest-reportlog tabulate ;
|
||||
|
||||
- name: Show installed libraries
|
||||
run: |
|
||||
source activate accelerate;
|
||||
pip freeze
|
||||
|
||||
- name: Run test on GPUs
|
||||
working-directory: accelerate
|
||||
if: always()
|
||||
run: |
|
||||
source activate accelerate;
|
||||
make test_deepspeed
|
||||
|
||||
- name: Generate Report
|
||||
working-directory: accelerate
|
||||
if: always()
|
||||
run: |
|
||||
pip install tabulate;
|
||||
python utils/log_reports.py >> $GITHUB_STEP_SUMMARY
|
||||
|
||||
run_core_tests_multi_gpu:
|
||||
runs-on: [self-hosted, multi-gpu, nvidia-gpu, t4, ci]
|
||||
env:
|
||||
CUDA_VISIBLE_DEVICES: 0,1
|
||||
@ -76,6 +120,11 @@ jobs:
|
||||
pip install -e .[testing,test_trackers] -U;
|
||||
pip install pytest-reportlog tabulate
|
||||
|
||||
- name: Show installed libraries
|
||||
run: |
|
||||
source activate accelerate;
|
||||
pip freeze
|
||||
|
||||
- name: Run test on GPUs
|
||||
working-directory: accelerate
|
||||
run: |
|
||||
@ -96,3 +145,40 @@ jobs:
|
||||
run: |
|
||||
source activate accelerate;
|
||||
python utils/log_reports.py >> $GITHUB_STEP_SUMMARY
|
||||
|
||||
run_deepspeed_tests_multi_gpu:
|
||||
runs-on: [self-hosted, multi-gpu, nvidia-gpu, t4, ci]
|
||||
container:
|
||||
image: huggingface/accelerate:gpu-deepspeed-nightly
|
||||
options: --gpus all --shm-size "16gb"
|
||||
defaults:
|
||||
run:
|
||||
shell: bash
|
||||
steps:
|
||||
- name: Install accelerate
|
||||
run: |
|
||||
source activate accelerate;
|
||||
git clone https://github.com/huggingface/accelerate;
|
||||
cd accelerate;
|
||||
git checkout ${{ github.sha }};
|
||||
pip install -e .[testing,test_trackers] -U;
|
||||
pip install pytest-reportlog tabulate ;
|
||||
|
||||
- name: Show installed libraries
|
||||
run: |
|
||||
source activate accelerate;
|
||||
pip freeze
|
||||
|
||||
- name: Run test on GPUs
|
||||
working-directory: accelerate
|
||||
if: always()
|
||||
run: |
|
||||
source activate accelerate;
|
||||
make test_deepspeed
|
||||
|
||||
- name: Generate Report
|
||||
working-directory: accelerate
|
||||
if: always()
|
||||
run: |
|
||||
pip install tabulate;
|
||||
python utils/log_reports.py >> $GITHUB_STEP_SUMMARY
|
||||
@ -23,7 +23,7 @@ defaults:
|
||||
jobs:
|
||||
run-trainer-tests:
|
||||
container:
|
||||
image: huggingface/accelerate:gpu-nightly
|
||||
image: huggingface/accelerate:gpu-deepspeed-nightly
|
||||
options: --gpus all --shm-size "16gb"
|
||||
runs-on: [self-hosted, multi-gpu, nvidia-gpu, t4, ci]
|
||||
strategy:
|
||||
|
||||
4
.github/workflows/test.yml
vendored
@ -51,6 +51,10 @@ jobs:
|
||||
if [[ ${{ matrix.test-kind }} = test_rest ]]; then pip uninstall comet_ml -y; fi
|
||||
if [[ ${{ matrix.test-kind }} = minimum ]]; then pip install torch==1.10.0; fi
|
||||
pip install pytest-reportlog tabulate setuptools
|
||||
|
||||
- name: Show installed libraries
|
||||
run: |
|
||||
pip freeze
|
||||
|
||||
- name: Run Tests
|
||||
env:
|
||||
|
||||
@ -29,9 +29,10 @@ huggingface/accelerate:{accelerator}-{nightly,release}
|
||||
```
|
||||
|
||||
`accelerator` in this instance is one of many available pre-configured backends:
|
||||
* `gpu`: Comes compiled off of the `nvidia/cuda` image and includes everything such as `deepspeed`, `bitsandbytes`, etc.
|
||||
* `cpu`: Comes compiled off of `python:3.8-slim` and is designed for non-CUDA based workloads.
|
||||
* `gpu`: Comes compiled off of the `nvidia/cuda` image and includes core parts like `bitsandbytes`. Runs off python 3.9.
|
||||
* `cpu`: Comes compiled off of `python:3.9-slim` and is designed for non-CUDA based workloads.
|
||||
* More to come soon
|
||||
* `gpu-deepspeed`: Comes compiled off of the `nvidia/cuda` image and includes core parts like `bitsandbytes` as well as the latest `deepspeed` version. Runs off python 3.10.
|
||||
|
||||
## Nightlies vs Releases
|
||||
|
||||
|
||||
46
docker/accelerate-gpu-deepspeed/Dockerfile
Normal file
@ -0,0 +1,46 @@
|
||||
# Builds GPU docker image of PyTorch specifically
|
||||
# Uses multi-staged approach to reduce size
|
||||
# Stage 1
|
||||
# Use base conda image to reduce time
|
||||
FROM continuumio/miniconda3:latest AS compile-image
|
||||
# Specify py version
|
||||
# Note: DeepSpeed beyond v0.12.6 requires py 3.10
|
||||
ENV PYTHON_VERSION=3.10
|
||||
# Install apt libs
|
||||
RUN apt-get update && \
|
||||
apt-get install -y curl git wget && \
|
||||
apt-get clean && \
|
||||
rm -rf /var/lib/apt/lists*
|
||||
|
||||
# Create our conda env
|
||||
RUN conda create --name accelerate python=${PYTHON_VERSION} ipython jupyter pip
|
||||
# We don't install pytorch here yet since CUDA isn't available
|
||||
# instead we use the direct torch wheel
|
||||
ENV PATH /opt/conda/envs/accelerate/bin:$PATH
|
||||
# Activate our bash shell
|
||||
RUN chsh -s /bin/bash
|
||||
SHELL ["/bin/bash", "-c"]
|
||||
# Activate the conda env, install mpi4py, and install torch + accelerate
|
||||
RUN source activate accelerate && conda install -c conda-forge mpi4py
|
||||
RUN source activate accelerate && \
|
||||
python3 -m pip install --no-cache-dir \
|
||||
git+https://github.com/huggingface/accelerate#egg=accelerate[testing,test_trackers,deepspeed] \
|
||||
--extra-index-url https://download.pytorch.org/whl/cu117
|
||||
|
||||
RUN python3 -m pip install --no-cache-dir bitsandbytes
|
||||
|
||||
# Stage 2
|
||||
FROM nvidia/cuda:12.1.0-cudnn8-devel-ubuntu20.04 AS build-image
|
||||
COPY --from=compile-image /opt/conda /opt/conda
|
||||
ENV PATH /opt/conda/bin:$PATH
|
||||
|
||||
# Install apt libs
|
||||
RUN apt-get update && \
|
||||
apt-get install -y curl git wget && \
|
||||
apt-get clean && \
|
||||
rm -rf /var/lib/apt/lists*
|
||||
|
||||
RUN echo "source activate accelerate" >> ~/.profile
|
||||
|
||||
# Activate the virtualenv
|
||||
CMD ["/bin/bash"]
|
||||
@ -78,6 +78,8 @@
|
||||
title: Executing and deferring jobs
|
||||
- local: concept_guides/gradient_synchronization
|
||||
title: Gradient synchronization
|
||||
- local: concept_guides/fsdp_and_deepspeed
|
||||
title: FSDP vs DeepSpeed
|
||||
- local: concept_guides/low_precision_training
|
||||
title: How training in low-precision environments is possible (FP8)
|
||||
- local: concept_guides/training_tpu
|
||||
|
||||
192
docs/source/concept_guides/fsdp_and_deepspeed.md
Normal file
@ -0,0 +1,192 @@
|
||||
<!--Copyright 2024 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
|
||||
⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
|
||||
rendered properly in your Markdown viewer.
|
||||
-->
|
||||
|
||||
# Moving between FSDP And DeepSpeed
|
||||
|
||||
🤗 Accelerate offers flexibility in training frameworks by integrating two extremely powerful tools for distributed training, namely [PyTorch FSDP](../usage_guides/fsdp.md) and [Microsoft DeepSpeed](../usage_guides/deepspeed.md). The aim of this tutorial is to draw parallels, as well as to outline potential differences, to empower the user to switch seamlessly between these two frameworks.
|
||||
|
||||
<Tip>
|
||||
|
||||
To switch between the frameworks, we recommend launching code with 🤗 `accelerate launch`, passing in the correct config file with `--config_file`, or passing in the respective arguments directly for [FSDP and DeepSpeed](../package_reference/cli#accelerate-launch).

Example 🤗 Accelerate configurations can be found here for [DeepSpeed](../usage_guides/deepspeed#accelerate-deepspeed-plugin) and [FSDP](../usage_guides/fsdp#how-it-works-out-of-the-box), or in the [example zoo under "Launch Configurations"](../usage_guides/explore).
|
||||
|
||||
</Tip>
|
||||
|
||||
<Tip warning={true}>
|
||||
|
||||
This tutorial is for single-node, multi-GPU, scenarios only.
|
||||
|
||||
</Tip>
|
||||
|
||||
## Configuring Functionalities
|
||||
|
||||
Model tensors are split across different GPUs in an attempt to scale up model sizes; this is termed *sharding* in FSDP, and *partitioning* in DeepSpeed. FSDP sharding and DeepSpeed ZeRO (partitioning) stages are configured by `--fsdp_sharding_strategy` and `--zero_stage`, respectively. In particular, FSDP `FULL_SHARD` maps to DeepSpeed ZeRO stage `3`; see this [comprehensive mapping between FSDP sharding and DeepSpeed ZeRO settings](../usage_guides/fsdp#mapping-between-fsdp-sharding-strategies-and-deepspeed-zero-stages). The table below summarizes and groups similar settings:
|
||||
|
||||
Group | Framework | Configuration | Example | Restrictions (if any)
--|--|--|--|--
sharding / partitioning | FSDP<br>DeepSpeed | `--fsdp_sharding_strategy`<br>`--zero_stage` | `1` (`FULL_SHARD`) <br>`3` |
offload | FSDP<br>DeepSpeed | `--fsdp_offload_params`<br>`--offload_param_device`<br>`--offload_optimizer_device` | `true`<br>`cpu`<br>`cpu` | all or nothing <br><br>
model loading | FSDP<br>DeepSpeed | <span style="white-space:nowrap;">`--fsdp_cpu_ram_efficient_loading`</span><br>`--zero3_init_flag` | `true`<br>`true` | <br>only ZeRO 3
efficient checkpointing | FSDP<br>DeepSpeed | `--fsdp_state_dict_type`<br>`--zero3_save_16bit_model` | `SHARDED_STATE_DICT`<br>`true` | <br>only ZeRO 3
weights prefetching | FSDP<br><br>DeepSpeed | `--fsdp_forward_prefetch`<br>`--fsdp_backward_prefetch`<br>None | `true`<br>`BACKWARD_PRE` | <br><br>
model | FSDP<br><br>DeepSpeed | `--fsdp_auto_wrap_policy`<br><span style="white-space:nowrap;">`--fsdp_transformer_layer_cls_to_wrap`</span><br>None | `TRANSFORMER_BASED_WRAP`<br><Layer Class> |<br>Usually not needed <br>Transparent to user.
parameters summoning | FSDP<br>DeepSpeed | `--fsdp_use_orig_params`<br>None | `true` | required for `torch.compile`<br>Transparent to user
parameters syncing | FSDP<br>DeepSpeed | `--fsdp_sync_module_states`<br>None | `true` |
training | FSDP<br>DeepSpeed | None<br>`--gradient_accumulation_steps`<br>`--gradient_clipping` | <br>`auto`<br>`auto` | Transparent to user
|
||||
|
||||
For detailed descriptions of the above, refer to [🤗 `Accelerate` launch documentation](../package_reference/cli#accelerate-launch).
|
||||
|
||||
<Tip>
|
||||
|
||||
To access other DeepSpeed configurations, such as mixed precision settings,
|
||||
you need to pass in a `--deepspeed_config_file`, see the [documentation](../usage_guides/deepspeed#deepspeed-config-file).
|
||||
|
||||
DeepSpeed can also be configured via [`DeepSpeedPlugin`], e.g., `DeepSpeedPlugin.zero_stage` is equivalent to `--zero_stage`, and `DeepSpeedPlugin.hf_ds_config` can be used to pass `--deepspeed_config_file`.
|
||||
|
||||
</Tip>
|
||||
|
||||
<Tip>
|
||||
|
||||
FSDP can also be configured via [`FullyShardedDataParallelPlugin`], e.g., `FullyShardedDataParallelPlugin.sharding_strategy` is equivalent to `--fsdp_sharding_strategy`.
|
||||
|
||||
</Tip>
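For illustration, a minimal sketch of the programmatic route (the values below are only examples, not recommendations):

```python
from accelerate import Accelerator
from accelerate.utils import DeepSpeedPlugin, FullyShardedDataParallelPlugin

# Roughly the counterpart of `--zero_stage 3` on the command line.
deepspeed_plugin = DeepSpeedPlugin(zero_stage=3)
accelerator = Accelerator(deepspeed_plugin=deepspeed_plugin)

# Or, for FSDP (only one of the two plugins is used per run):
# fsdp_plugin = FullyShardedDataParallelPlugin()
# accelerator = Accelerator(fsdp_plugin=fsdp_plugin)
```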
|
||||
|
||||
### Checkpointing
|
||||
|
||||
Do note that FSDP can be configured via `--fsdp_state_dict_type` to save either full or sharded checkpoints.
|
||||
|
||||
<Tip>
|
||||
|
||||
For DeepSpeed Zero3, one could pass a `--zero3_save_16bit_model true`, which conveniently consolidates the model to a single rank and saves; this is the FSDP equivalent of `fsdp_state_dict_type: FULL_STATE_DICT`.
|
||||
|
||||
</Tip>
|
||||
|
||||
<Tip warning={true}>
|
||||
|
||||
For large models, consolidating the model to a single rank can be very slow.
|
||||
|
||||
</Tip>
|
||||
|
||||
<Tip>
|
||||
|
||||
For quicker checkpointing, for FSDP use `fsdp_state_dict_type: SHARDED_STATE_DICT`, and for DeepSpeed Zero3 [use the `zero_to_fp32.py` script to post-convert sharded checkpoints](https://www.deepspeed.ai/tutorials/zero/#extracting-weights).
|
||||
|
||||
|
||||
</Tip>
|
||||
|
||||
### Offloading
|
||||
|
||||
FSDP only allows *all-or-nothing* offload (i.e., either offload parameters, gradients, and optimizer, or keep them all on the GPU), but DeepSpeed can offload parameters and optimizer differently. Furthermore, DeepSpeed also supports [offloading to NVME](https://www.deepspeed.ai/docs/config-json/#parameter-offloading).
|
||||
|
||||
### Prefetching
|
||||
|
||||
FSDP allows two prefetching configurations, `--fsdp_forward_prefetch` and `--fsdp_backward_prefetch`, to improve the overlap of communication and computation at the cost of extra memory; see the [FSDP documentation](https://pytorch.org/docs/stable/fsdp.html).
For DeepSpeed, prefetching is turned on when needed, depending on certain hyper-parameters like `stage3_param_persistence_threshold`, `stage3_max_reuse_distance`, etc., [that can be configured for ZeRO-3](https://www.deepspeed.ai/docs/config-json/#parameter-offloading); 🤗 `accelerate` may set these hyper-parameters automatically if you don't set them explicitly in the DeepSpeed config file.
|
||||
|
||||
<Tip>
|
||||
|
||||
For FSDP, set `fsdp_backward_prefetch: BACKWARD_PRE` for improved throughput if memory allows.
|
||||
|
||||
</Tip>
|
||||
|
||||
### Model Loading
|
||||
|
||||
While FSDP requires an explicit `--fsdp_cpu_ram_efficient_loading true` to activate efficient model loading, 🤗 `transformers` activates a similar feature whenever DeepSpeed ZeRO-3 is used.
|
||||
|
||||
<Tip>
|
||||
|
||||
For FSDP, whenever you set `--fsdp_cpu_ram_efficient_loading true`, 🤗 `accelerate` will automatically set `sync_module_states` to true.
For RAM-efficient loading the weights are loaded only on a single rank, which requires `sync_module_states` to broadcast the weights to the other ranks.
|
||||
|
||||
</Tip>
|
||||
|
||||
### Model
|
||||
|
||||
FSDP requires an explicit `--fsdp_auto_wrap_policy` for the algorithm to decide how to schedule the all-gather and reduce-scatter operations. But for DeepSpeed this is transparent to the user.
|
||||
|
||||
<Tip>
|
||||
|
||||
For FSDP, simply set `fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP`. With the latest [`transformers`] versions, we try our best to figure out the suitable `fsdp_transformer_layer_cls_to_wrap` for HF transformers models. However, if you get an error regarding it, please specify this value explicitly.
|
||||
|
||||
</Tip>
|
||||
|
||||
### Parameters Summoning
|
||||
|
||||
FSDP requires an explicit `--fsdp_use_orig_params` flag if using `torch.compile`, see [the PyTorch documentation](https://pytorch.org/docs/stable/fsdp.html#module-torch.distributed.fsdp). For DeepSpeed this is transparent to the user.
|
||||
|
||||
<Tip>
|
||||
|
||||
For FSDP, when using `torch.compile` please set `fsdp_use_orig_params: True`.
|
||||
|
||||
</Tip>
|
||||
|
||||
|
||||
## Training
|
||||
|
||||
DeepSpeed requires explicit `--gradient_accumulation_steps` and `--gradient_clipping` flags. For FSDP, this is transparent to the user.
|
||||
|
||||
<Tip>
|
||||
|
||||
When using DeepSpeed, set `gradient_accumulation_steps: "auto"` and `gradient_clipping: "auto"` to automatically pick up values set in the [`Accelerator`] or [`TrainingArguments`] (if using `transformers`).
|
||||
|
||||
</Tip>
|
||||
|
||||
|
||||
## On Differences in Data Precision Handling
|
||||
|
||||
To discuss how data precision is handled in both FSDP and DeepSpeed, it is instructive to first give an overview of how model parameters are handled in these frameworks. Before the model / optimizer parameters are distributed across GPUs, parameter preparation is involved to first "flatten" them to one-dimensional [`torch.Tensor`](https://pytorch.org/docs/stable/tensors.html#torch-tensor)s. The implementations of FSDP and DeepSpeed differ in the `dtype` in which these "flattened" parameters are stored, and there are ramifications with regard to how [`torch.Optimizer`](https://pytorch.org/docs/stable/optim.html#module-torch.optim)s allocate their `dtype`s. The table below outlines the processes for both frameworks; the "Local" column indicates the process occurring at a per-GPU level, therefore any memory overheads from upcasting should be understood to be amortized by the number of GPUs used.
|
||||
|
||||
<Tip>
|
||||
|
||||
As a rule of thumb, for stable training with automatic mixed precision, all the trainable parameters have to be in `torch.float32`.
|
||||
|
||||
</Tip>
|
||||
|
||||
Process | Local | Framework | Details
--|--|--|--
Loading, i.e., [`AutoModel.from_pretrained(..., torch_dtype=torch_dtype)`] |
Preparation, i.e., creation of "flat params" | ✅ | FSDP<br>DeepSpeed | created in `torch_dtype`.<br> disregards `torch_dtype`, created in `float32`.
Optimizer initialization | ✅ | FSDP<br>DeepSpeed | creates parameters in `torch_dtype`<br> creates parameters in `float32`
Training Step, i.e., forward, backward, reduction | | FSDP<br>DeepSpeed | follows [`MixedPrecision`](https://pytorch.org/docs/stable/fsdp.html#torch.distributed.fsdp.MixedPrecision)<br> follows `deepspeed_config_file` mixed precision settings.
Optimizer (Pre-Step) | ✅ | FSDP<br>DeepSpeed | upcasting (if any) to `torch_dtype`<br>upcasted to `float32`
Optimizer (Actual Step) | ✅ | FSDP<br>DeepSpeed | occurs in `torch_dtype` <br> occurs in `float32`.
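For reference, the FSDP [`MixedPrecision`](https://pytorch.org/docs/stable/fsdp.html#torch.distributed.fsdp.MixedPrecision) policy mentioned in the "Training Step" row is a small PyTorch dataclass; a sketch with illustrative `bf16` dtypes:

```python
import torch
from torch.distributed.fsdp import MixedPrecision

# Compute, gradient reduction, and buffers run in bf16, while the flat parameters keep their own dtype.
bf16_policy = MixedPrecision(
    param_dtype=torch.bfloat16,
    reduce_dtype=torch.bfloat16,
    buffer_dtype=torch.bfloat16,
)
```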
|
||||
|
||||
<Tip warning={true}>
|
||||
|
||||
Therefore, when using DeepSpeed with a small number of GPUs, be aware of potentially significant memory overheads due to the upcasting during preparation.
|
||||
|
||||
</Tip>
|
||||
|
||||
<Tip>
|
||||
|
||||
With FSDP, in the absence of mixed precision, it is possible to operate the [`torch.Optimizer`](https://pytorch.org/docs/stable/optim.html#module-torch.optim) in low-precision `torch_dtype`, which may be helpful when using a small number of GPUs.
|
||||
|
||||
</Tip>
|
||||
|
||||
<Tip warning={true}>
|
||||
|
||||
With mixed precision, FSDP and DeepSpeed will upcast in the model preparation step (c.f. table above). But do note that FSDP will then save checkpoints in the upcasted precision; DeepSpeed may still save low-precision checkpoints if `--zero3_save_16bit_model` is specified.
|
||||
|
||||
</Tip>
|
||||
|
||||
|
||||
To clarify the above table, consider the concrete examples below; the optimizer pre-step and actual step are combined for brevity. With FSDP it is possible to operate in the two modes shown below, but DeepSpeed can only operate in one.
|
||||
|
||||
Framework | Model Loading (`torch_dtype`) | Mixed Precision | Preparation (Local) | Training | Optimizer (Local)
--|--|--|--|--|--
FSDP | bf16 | default (none) | bf16 | bf16 | bf16
FSDP | bf16 | bf16 | fp32 | bf16 | fp32
DeepSpeed | bf16 | bf16 | fp32 | bf16 | fp32
|
||||
@ -208,6 +208,10 @@ The following arguments are only useful when `use_fsdp` is passed or Fully Shard
|
||||
* `--fsdp_transformer_layer_cls_to_wrap` (`str`) -- Transformer layer class name (case-sensitive) to wrap, e.g, `BertLayer`, `GPTJBlock`, `T5Block` ...
|
||||
* `--fsdp_backward_prefetch_policy` (`str`) -- FSDP's backward prefetch policy.
|
||||
* `--fsdp_state_dict_type` (`str`) -- FSDP's state dict type.
|
||||
* `--fsdp_forward_prefetch` (`str`) -- FSDP forward prefetch.
|
||||
* `--fsdp_use_orig_params` (`str`) -- If True, allows non-uniform `requires_grad` mixed in a FSDP unit.
|
||||
* `--fsdp_cpu_ram_efficient_loading` (`str`) - If true, only the first process loads the pretrained model checkpoint while all other processes have empty weights. When using this, `--fsdp_sync_module_states` needs to be `True`.
|
||||
* `--fsdp_sync_module_states` (`str`) - If true, each individually wrapped FSDP unit will broadcast module parameters from rank 0.
|
||||
|
||||
**Megatron-LM Arguments**:
|
||||
|
||||
|
||||
@ -17,12 +17,12 @@ rendered properly in your Markdown viewer.
|
||||
|
||||
[[autodoc]] utils.DeepSpeedPlugin
|
||||
|
||||
[[autodoc]] utils.DummyOptim
|
||||
[[autodoc]] utils.deepspeed.DummyOptim
|
||||
|
||||
[[autodoc]] utils.DummyScheduler
|
||||
[[autodoc]] utils.deepspeed.DummyScheduler
|
||||
|
||||
[[autodoc]] utils.DeepSpeedEngineWrapper
|
||||
[[autodoc]] utils.deepspeed.DeepSpeedEngineWrapper
|
||||
|
||||
[[autodoc]] utils.DeepSpeedOptimizerWrapper
|
||||
[[autodoc]] utils.deepspeed.DeepSpeedOptimizerWrapper
|
||||
|
||||
[[autodoc]] utils.DeepSpeedSchedulerWrapper
|
||||
[[autodoc]] utils.deepspeed.DeepSpeedSchedulerWrapper
|
||||
|
||||
@ -93,6 +93,9 @@ accelerator = Accelerator()
|
||||
> [!WARNING]
|
||||
> This step is *optional* but it is considered best practice to allow Accelerate to handle device placement. You could also deactivate automatic device placement by passing `device_placement=False` when initializing the [`Accelerator`]. If you want to explicitly place objects on a device with `.to(device)`, make sure you use `accelerator.device` instead. For example, if you create an optimizer before placing a model on `accelerator.device`, training fails on a TPU.
|
||||
|
||||
> [!WARNING]
|
||||
> Accelerate does not use non-blocking transfers by default for its automatic device placement, which can result in potentially unwanted CUDA synchronizations. You can enable non-blocking transfers by passing a [`~utils.dataclasses.DataLoaderConfiguration`] with `non_blocking=True` set as the `dataloader_config` when initializing the [`Accelerator`]. As usual, non-blocking transfers will only work if the dataloader also has `pin_memory=True` set. Be wary that using non-blocking transfers from GPU to CPU may cause incorrect results if it results in CPU operations being performed on non-ready tensors.
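A minimal sketch of what enabling this looks like (assuming your dataloaders were created with `pin_memory=True`):

```python
from accelerate import Accelerator
from accelerate.utils import DataLoaderConfiguration

dataloader_config = DataLoaderConfiguration(non_blocking=True)
accelerator = Accelerator(dataloader_config=dataloader_config)
```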
|
||||
|
||||
```py
|
||||
device = accelerator.device
|
||||
```
|
||||
@ -121,7 +124,7 @@ To perform distributed evaluation, pass your validation dataloader to the [`~Acc
|
||||
validation_dataloader = accelerator.prepare(validation_dataloader)
|
||||
```
|
||||
|
||||
Each device in your distributed setup only receives a part of the evaluation data, which means you should group your predictions together with the [`~Accelerator.gather_for_metrics`] method. This method requires all tensors to be the same size on each process, so if your tensors have different sizes on each process (for instance when dynamically padding to the maximum length in a batch), you should use the [`~Accelerator.pad_across_processes`] method to pad you tensor to the largest size across processes.
|
||||
Each device in your distributed setup only receives a part of the evaluation data, which means you should group your predictions together with the [`~Accelerator.gather_for_metrics`] method. This method requires all tensors to be the same size on each process, so if your tensors have different sizes on each process (for instance when dynamically padding to the maximum length in a batch), you should use the [`~Accelerator.pad_across_processes`] method to pad your tensors to the largest size across processes. Note that the tensors need to be 1D and that we concatenate the tensors along the first dimension.
|
||||
|
||||
```python
|
||||
for inputs, targets in validation_dataloader:
|
||||
@ -132,6 +135,8 @@ for inputs, targets in validation_dataloader:
|
||||
metric.add_batch(all_predictions, all_targets)
|
||||
```
|
||||
|
||||
For more complex cases (e.g. 2D tensors, tensors you don't want to concatenate, dicts of 3D tensors), you can pass `use_gather_object=True` in `gather_for_metrics`. This will return the list of objects after gathering. Note that using it with GPU tensors is not well supported and is inefficient.
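For example, a sketch of gathering arbitrary objects (the variable names are placeholders):

```python
# `per_process_outputs` could be a list of dicts of tensors produced on each process.
gathered_outputs = accelerator.gather_for_metrics(per_process_outputs, use_gather_object=True)
# `gathered_outputs` is now a plain Python list containing the objects from every process.
```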
|
||||
|
||||
> [!TIP]
|
||||
> Data at the end of a dataset may be duplicated so the batch can be equally divided among all workers. The [`~Accelerator.gather_for_metrics`] method automatically removes the duplicated data to calculate a more accurate metric.
|
||||
|
||||
|
||||
@ -157,10 +157,18 @@ Currently, `Accelerate` supports following config through the CLI:
|
||||
`gradient_accumulation_steps`: Number of training steps to accumulate gradients before averaging and applying them.
|
||||
`gradient_clipping`: Enable gradient clipping with value.
|
||||
`offload_optimizer_device`: [none] Disable optimizer offloading, [cpu] offload optimizer to CPU, [nvme] offload optimizer to NVMe SSD. Only applicable with ZeRO >= Stage-2.
|
||||
`offload_optimizer_nvme_path`: Decides Nvme Path to offload optimizer states. If unspecified, will default to 'none'.
|
||||
`offload_param_device`: [none] Disable parameter offloading, [cpu] offload parameters to CPU, [nvme] offload parameters to NVMe SSD. Only applicable with ZeRO Stage-3.
|
||||
`offload_param_nvme_path`: Decides Nvme Path to offload parameters. If unspecified, will default to 'none'.
|
||||
`zero3_init_flag`: Decides whether to enable `deepspeed.zero.Init` for constructing massive models. Only applicable with ZeRO Stage-3.
|
||||
`zero3_save_16bit_model`: Decides whether to save 16-bit model weights when using ZeRO Stage-3.
|
||||
`mixed_precision`: `no` for FP32 training, `fp16` for FP16 mixed-precision training and `bf16` for BF16 mixed-precision training.
|
||||
`deepspeed_moe_layer_cls_names`: Comma-separated list of transformer Mixture-of-Experts (MoE) layer class names (case-sensitive) to wrap, e.g., `MixtralSparseMoeBlock`, `Qwen2MoeSparseMoeBlock`, `JetMoEAttention,JetMoEBlock` ...
|
||||
`deepspeed_hostfile`: DeepSpeed hostfile for configuring multi-node compute resources.
|
||||
`deepspeed_exclusion_filter`: DeepSpeed exclusion filter string when using multi-node setup.
|
||||
`deepspeed_inclusion_filter`: DeepSpeed inclusion filter string when using multi-node setup.
|
||||
`deepspeed_multinode_launcher`: DeepSpeed multi-node launcher to use. If unspecified, will default to `pdsh`.
|
||||
`deepspeed_config_file`: path to the DeepSpeed config file in `json` format. See the next section for more details on this.
|
||||
```
|
||||
To be able to tweak more options, you will need to use a DeepSpeed config file.
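If you prefer to configure this from Python, a sketch of pointing the [`DeepSpeedPlugin`] at such a file (`ds_config.json` is a placeholder for your own config):

```python
from accelerate import Accelerator
from accelerate.utils import DeepSpeedPlugin

deepspeed_plugin = DeepSpeedPlugin(hf_ds_config="ds_config.json")
accelerator = Accelerator(deepspeed_plugin=deepspeed_plugin)
```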
|
||||
|
||||
@ -721,3 +729,10 @@ Papers:
|
||||
|
||||
Finally, please, remember that 🤗 `Accelerate` only integrates DeepSpeed, therefore if you
|
||||
have any problems or questions with regards to DeepSpeed usage, please, file an issue with [DeepSpeed GitHub](https://github.com/microsoft/DeepSpeed/issues).
|
||||
|
||||
|
||||
<Tip>
|
||||
|
||||
For those interested in the similarities and differences between FSDP and DeepSpeed, please check out the [concept guide here](../concept_guides/fsdp_and_deepspeed.md)!
|
||||
|
||||
</Tip>
|
||||
@ -140,6 +140,8 @@ with distributed_state.split_between_processes(["a dog", "a cat", "a chicken"],
|
||||
On the first GPU, the prompts will be `["a dog", "a cat"]`, and on the second GPU it will be `["a chicken", "a chicken"]`.
|
||||
Make sure to drop the final sample, as it will be a duplicate of the previous one.
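One way to do that after gathering the results (a sketch; `prompts` and `gathered` are placeholder names):

```python
# Keep only as many results as there were real prompts; padded duplicates come last.
results = gathered[: len(prompts)]
```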
|
||||
|
||||
You can find more complex examples [here](https://github.com/huggingface/accelerate/tree/main/examples/inference/distributed) such as how to use it with LLMs.
|
||||
|
||||
## Memory-efficient pipeline parallelism (experimental)
|
||||
|
||||
This next part will discuss using *pipeline parallelism*. This is an **experimental** API utilizing the [PiPPy library by PyTorch](https://github.com/pytorch/PiPPy/) as a native solution.
|
||||
@ -232,4 +234,4 @@ if PartialState().is_last_process:
|
||||
|
||||
</Tip>
|
||||
|
||||
And that's it! To explore more, please check out the inference examples in the [Accelerate repo](https://github.com/huggingface/accelerate/tree/main/examples/inference) and our [documentation](../package_reference/inference) as we work to improving this integration.
|
||||
And that's it! To explore more, please check out the inference examples in the [Accelerate repo](https://github.com/huggingface/accelerate/tree/main/examples/inference/pippy) and our [documentation](../package_reference/inference) as we work on improving this integration.
|
||||
|
||||
@ -175,3 +175,10 @@ You can then pass `state` into the `save_pretrained` method. There are several
|
||||
|
||||
For more control, users can leverage the `FullyShardedDataParallelPlugin`. After creating an instance of this class, users can pass it to the Accelerator class instantiation.
|
||||
For more information on these options, please refer to the PyTorch [FullyShardedDataParallel](https://github.com/pytorch/pytorch/blob/0df2e863fbd5993a7b9e652910792bd21a516ff3/torch/distributed/fsdp/fully_sharded_data_parallel.py#L236) code.
|
||||
|
||||
|
||||
<Tip>
|
||||
|
||||
For those interested in the similarities and differences between FSDP and DeepSpeed, please check out the [concept guide here](../concept_guides/fsdp_and_deepspeed.md)!
|
||||
|
||||
</Tip>
|
||||
@ -198,7 +198,7 @@ achieve the same outcome with:
|
||||
|
||||
```python
|
||||
wandb_tracker = accelerator.get_tracker("wandb", unwrap=True)
|
||||
with accelerator.on_main_process:
|
||||
if accelerator.is_main_process:
|
||||
wandb_tracker.log_artifact(some_artifact_to_log)
|
||||
```
|
||||
|
||||
|
||||
@ -248,7 +248,7 @@ def training_function(config, args):
|
||||
# Use accelerator.print to print only on the main process.
|
||||
test_predictions.append(torch.cat(fold_predictions, dim=0))
|
||||
# We now need to release all our memory and get rid of the current model, optimizer, etc
|
||||
accelerator.free_memory()
|
||||
model, optimizer = accelerator.free_memory(model, optimizer)
|
||||
# New Code #
|
||||
# Finally we check the accuracy of our folded results:
|
||||
test_references = torch.cat(test_references, dim=0)
|
||||
|
||||
@ -34,7 +34,7 @@ import datasets
|
||||
import torch
|
||||
import transformers
|
||||
from datasets import load_dataset
|
||||
from huggingface_hub import Repository
|
||||
from huggingface_hub import HfApi
|
||||
from torch.utils.data import DataLoader
|
||||
from tqdm.auto import tqdm
|
||||
from transformers import (
|
||||
@ -47,7 +47,6 @@ from transformers import (
|
||||
default_data_collator,
|
||||
get_scheduler,
|
||||
)
|
||||
from transformers.utils import get_full_repo_name
|
||||
from transformers.utils.versions import require_version
|
||||
|
||||
from accelerate import Accelerator, DistributedType
|
||||
@ -303,11 +302,13 @@ def main():
|
||||
# Handle the repository creation
|
||||
if accelerator.is_main_process:
|
||||
if args.push_to_hub:
|
||||
if args.hub_model_id is None:
|
||||
repo_name = get_full_repo_name(Path(args.output_dir).name, token=args.hub_token)
|
||||
else:
|
||||
repo_name = args.hub_model_id
|
||||
repo = Repository(args.output_dir, clone_from=repo_name)
|
||||
api = HfApi(token=args.hub_token)
|
||||
|
||||
# Create repo (repo_name from args or inferred)
|
||||
repo_name = args.hub_model_id
|
||||
if repo_name is None:
|
||||
repo_name = Path(args.output_dir).absolute().name
|
||||
repo_id = api.create_repo(repo_name, exist_ok=True).repo_id
|
||||
|
||||
with open(os.path.join(args.output_dir, ".gitignore"), "w+") as gitignore:
|
||||
if "step_*" not in gitignore:
|
||||
@ -707,7 +708,11 @@ def main():
|
||||
if accelerator.is_main_process:
|
||||
tokenizer.save_pretrained(args.output_dir)
|
||||
if args.push_to_hub:
|
||||
repo.push_to_hub(commit_message="End of training", auto_lfs_prune=True)
|
||||
api.upload_folder(
|
||||
repo_id=repo_id,
|
||||
folder_path=args.output_dir,
|
||||
commit_message="End of training",
|
||||
)
|
||||
|
||||
with open(os.path.join(args.output_dir, "all_results.json"), "w") as f:
|
||||
json.dump({"perplexity": perplexity, "eval_loss": eval_loss.item()}, f)
|
||||
|
||||
@ -34,7 +34,7 @@ import datasets
|
||||
import torch
|
||||
import transformers
|
||||
from datasets import load_dataset
|
||||
from huggingface_hub import Repository
|
||||
from huggingface_hub import HfApi
|
||||
from torch.utils.data import DataLoader
|
||||
from tqdm.auto import tqdm
|
||||
from transformers import (
|
||||
@ -47,7 +47,7 @@ from transformers import (
|
||||
default_data_collator,
|
||||
get_scheduler,
|
||||
)
|
||||
from transformers.utils import check_min_version, get_full_repo_name, send_example_telemetry
|
||||
from transformers.utils import check_min_version, send_example_telemetry
|
||||
from transformers.utils.versions import require_version
|
||||
|
||||
from accelerate import Accelerator, DistributedType
|
||||
@ -277,11 +277,13 @@ def main():
|
||||
# Handle the repository creation
|
||||
if accelerator.is_main_process:
|
||||
if args.push_to_hub:
|
||||
if args.hub_model_id is None:
|
||||
repo_name = get_full_repo_name(Path(args.output_dir).name, token=args.hub_token)
|
||||
else:
|
||||
repo_name = args.hub_model_id
|
||||
repo = Repository(args.output_dir, clone_from=repo_name)
|
||||
api = HfApi(token=args.hub_token)
|
||||
|
||||
# Create repo (repo_name from args or inferred)
|
||||
repo_name = args.hub_model_id
|
||||
if repo_name is None:
|
||||
repo_name = Path(args.output_dir).absolute().name
|
||||
repo_id = api.create_repo(repo_name, exist_ok=True).repo_id
|
||||
|
||||
with open(os.path.join(args.output_dir, ".gitignore"), "w+") as gitignore:
|
||||
if "step_*" not in gitignore:
|
||||
@ -661,8 +663,11 @@ def main():
|
||||
)
|
||||
if accelerator.is_main_process:
|
||||
tokenizer.save_pretrained(args.output_dir)
|
||||
repo.push_to_hub(
|
||||
commit_message=f"Training in progress epoch {epoch}", blocking=False, auto_lfs_prune=True
|
||||
api.upload_folder(
|
||||
repo_id=repo_id,
|
||||
folder_path=args.output_dir,
|
||||
commit_message=f"Training in progress epoch {epoch}",
|
||||
run_as_future=True,
|
||||
)
|
||||
|
||||
if args.checkpointing_steps == "epoch":
|
||||
@ -690,7 +695,11 @@ def main():
|
||||
if accelerator.is_main_process:
|
||||
tokenizer.save_pretrained(args.output_dir)
|
||||
if args.push_to_hub:
|
||||
repo.push_to_hub(commit_message="End of training", auto_lfs_prune=True)
|
||||
api.upload_folder(
|
||||
repo_id=repo_id,
|
||||
folder_path=args.output_dir,
|
||||
commit_message="End of training",
|
||||
)
|
||||
|
||||
with open(os.path.join(args.output_dir, "all_results.json"), "w") as f:
|
||||
json.dump({"perplexity": perplexity}, f)
|
||||
|
||||
225
examples/by_feature/schedule_free.py
Normal file
@ -0,0 +1,225 @@
|
||||
# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import argparse
|
||||
import os
|
||||
|
||||
import evaluate
|
||||
import torch
|
||||
from datasets import load_dataset
|
||||
from torch.utils.data import DataLoader
|
||||
from transformers import AutoModelForSequenceClassification, AutoTokenizer, set_seed
|
||||
|
||||
from accelerate import Accelerator, DistributedType
|
||||
from accelerate.utils import is_schedulefree_available
|
||||
|
||||
|
||||
if is_schedulefree_available():
|
||||
import schedulefree
|
||||
else:
|
||||
raise ImportError(
|
||||
"This example requires the `schedulefree` library. Please install it with `pip install schedulefree`"
|
||||
)
|
||||
|
||||
|
||||
########################################################################
|
||||
# This is a fully working simple example to use Accelerate and Facebook's
|
||||
# scheduler-free optimizer: https://github.com/facebookresearch/schedule_free/
|
||||
#
|
||||
# This example trains a Bert base model on GLUE MRPC
|
||||
# in any of the following settings (with the same script):
|
||||
# - single CPU or single GPU
|
||||
# - multi GPUS (using PyTorch distributed mode)
|
||||
# - (multi) TPUs
|
||||
# - fp16 (mixed-precision) or fp32 (normal precision)
|
||||
#
|
||||
# To run it in each of these various modes, follow the instructions
|
||||
# in the readme for examples:
|
||||
# https://github.com/huggingface/accelerate/tree/main/examples
|
||||
#
|
||||
########################################################################
|
||||
|
||||
|
||||
MAX_GPU_BATCH_SIZE = 16
|
||||
EVAL_BATCH_SIZE = 32
|
||||
|
||||
|
||||
def get_dataloaders(accelerator: Accelerator, batch_size: int = 16):
|
||||
"""
|
||||
Creates a set of `DataLoader`s for the `glue` dataset,
|
||||
using "bert-base-cased" as the tokenizer.
|
||||
|
||||
Args:
|
||||
accelerator (`Accelerator`):
|
||||
An `Accelerator` object
|
||||
batch_size (`int`, *optional*):
|
||||
The batch size for the train and validation DataLoaders.
|
||||
"""
|
||||
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
|
||||
datasets = load_dataset("glue", "mrpc")
|
||||
|
||||
def tokenize_function(examples):
|
||||
# max_length=None => use the model max length (it's actually the default)
|
||||
outputs = tokenizer(examples["sentence1"], examples["sentence2"], truncation=True, max_length=None)
|
||||
return outputs
|
||||
|
||||
# Apply the method we just defined to all the examples in all the splits of the dataset
|
||||
# starting with the main process first:
|
||||
with accelerator.main_process_first():
|
||||
tokenized_datasets = datasets.map(
|
||||
tokenize_function,
|
||||
batched=True,
|
||||
remove_columns=["idx", "sentence1", "sentence2"],
|
||||
)
|
||||
|
||||
# We also rename the 'label' column to 'labels' which is the expected name for labels by the models of the
|
||||
# transformers library
|
||||
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
|
||||
|
||||
def collate_fn(examples):
|
||||
# For Torchxla, it's best to pad everything to the same length or training will be very slow.
|
||||
max_length = 128 if accelerator.distributed_type == DistributedType.XLA else None
|
||||
# When using mixed precision we want round multiples of 8/16
|
||||
if accelerator.mixed_precision == "fp8":
|
||||
pad_to_multiple_of = 16
|
||||
elif accelerator.mixed_precision != "no":
|
||||
pad_to_multiple_of = 8
|
||||
else:
|
||||
pad_to_multiple_of = None
|
||||
|
||||
return tokenizer.pad(
|
||||
examples,
|
||||
padding="longest",
|
||||
max_length=max_length,
|
||||
pad_to_multiple_of=pad_to_multiple_of,
|
||||
return_tensors="pt",
|
||||
)
|
||||
|
||||
# Instantiate dataloaders.
|
||||
train_dataloader = DataLoader(
|
||||
tokenized_datasets["train"], shuffle=True, collate_fn=collate_fn, batch_size=batch_size, drop_last=True
|
||||
)
|
||||
eval_dataloader = DataLoader(
|
||||
tokenized_datasets["validation"],
|
||||
shuffle=False,
|
||||
collate_fn=collate_fn,
|
||||
batch_size=EVAL_BATCH_SIZE,
|
||||
drop_last=(accelerator.mixed_precision == "fp8"),
|
||||
)
|
||||
|
||||
return train_dataloader, eval_dataloader
|
||||
|
||||
|
||||
# For testing only
|
||||
|
||||
|
||||
if os.environ.get("TESTING_MOCKED_DATALOADERS", None) == "1":
|
||||
from accelerate.test_utils.training import mocked_dataloaders
|
||||
|
||||
get_dataloaders = mocked_dataloaders # noqa: F811
|
||||
|
||||
|
||||
def training_function(config, args):
|
||||
# Initialize accelerator
|
||||
accelerator = Accelerator(cpu=args.cpu, mixed_precision=args.mixed_precision)
|
||||
# Sample hyper-parameters for learning rate, batch size, seed and a few other HPs
|
||||
lr = config["lr"]
|
||||
num_epochs = int(config["num_epochs"])
|
||||
seed = int(config["seed"])
|
||||
batch_size = int(config["batch_size"])
|
||||
|
||||
metric = evaluate.load("glue", "mrpc")
|
||||
|
||||
# If the batch size is too big we use gradient accumulation
|
||||
gradient_accumulation_steps = 1
|
||||
if batch_size > MAX_GPU_BATCH_SIZE and accelerator.distributed_type != DistributedType.XLA:
|
||||
gradient_accumulation_steps = batch_size // MAX_GPU_BATCH_SIZE
|
||||
batch_size = MAX_GPU_BATCH_SIZE
|
||||
|
||||
set_seed(seed)
|
||||
train_dataloader, eval_dataloader = get_dataloaders(accelerator, batch_size)
|
||||
# Instantiate the model (we build the model here so that the seed also control new weights initialization)
|
||||
model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", return_dict=True)
|
||||
|
||||
# We could avoid this line since the accelerator is set with `device_placement=True` (default value).
|
||||
# Note that if you are placing tensors on devices manually, this line absolutely needs to be before the optimizer
|
||||
# creation otherwise training will not work on TPU (`accelerate` will kindly throw an error to make us aware of that).
|
||||
model = model.to(accelerator.device)
|
||||
# Instantiate optimizer with warmup steps
|
||||
optimizer = schedulefree.AdamWScheduleFree(
|
||||
model.parameters(),
|
||||
lr=lr,
|
||||
warmup_steps=100,
|
||||
)
|
||||
|
||||
# Prepare everything
|
||||
# There is no specific order to remember, we just need to unpack the objects in the same order we gave them to the
|
||||
# prepare method.
|
||||
|
||||
model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
|
||||
model, optimizer, train_dataloader, eval_dataloader
|
||||
)
|
||||
|
||||
# Now we train the model
|
||||
for epoch in range(num_epochs):
|
||||
model.train()
|
||||
optimizer.train()
|
||||
for step, batch in enumerate(train_dataloader):
|
||||
# We could avoid this line since we set the accelerator with `device_placement=True`.
|
||||
batch.to(accelerator.device)
|
||||
outputs = model(**batch)
|
||||
loss = outputs.loss
|
||||
loss = loss / gradient_accumulation_steps
|
||||
accelerator.backward(loss)
|
||||
if step % gradient_accumulation_steps == 0:
|
||||
optimizer.step()
|
||||
optimizer.zero_grad()
|
||||
|
||||
model.eval()
|
||||
optimizer.eval()
|
||||
for step, batch in enumerate(eval_dataloader):
|
||||
# We could avoid this line since we set the accelerator with `device_placement=True`.
|
||||
batch.to(accelerator.device)
|
||||
with torch.no_grad():
|
||||
outputs = model(**batch)
|
||||
predictions = outputs.logits.argmax(dim=-1)
|
||||
predictions, references = accelerator.gather_for_metrics((predictions, batch["labels"]))
|
||||
metric.add_batch(
|
||||
predictions=predictions,
|
||||
references=references,
|
||||
)
|
||||
|
||||
eval_metric = metric.compute()
|
||||
# Use accelerator.print to print only on the main process.
|
||||
accelerator.print(f"epoch {epoch}:", eval_metric)
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(description="Simple example of training script.")
|
||||
parser.add_argument(
|
||||
"--mixed_precision",
|
||||
type=str,
|
||||
default=None,
|
||||
choices=["no", "fp16", "bf16", "fp8"],
|
||||
help="Whether to use mixed precision. Choose"
|
||||
"between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >= 1.10."
|
||||
"and an Nvidia Ampere GPU.",
|
||||
)
|
||||
parser.add_argument("--cpu", action="store_true", help="If passed, will train on the CPU.")
|
||||
args = parser.parse_args()
|
||||
config = {"lr": 2e-5, "num_epochs": 3, "seed": 42, "batch_size": 16}
|
||||
training_function(config, args)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
25
examples/inference/distributed/README.md
Normal file
@ -0,0 +1,25 @@
|
||||
# Distributed inference examples
|
||||
|
||||
This folder contains a variety of tutorials for running distributed inference with the following strategy:
|
||||
|
||||
Load an entire model onto each GPU and send chunks of a batch through each GPU’s model copy at a time
|
||||
|
||||
## Installation
|
||||
|
||||
```bash
|
||||
pip install accelerate torch
|
||||
```
|
||||
|
||||
## Running code
|
||||
|
||||
You can use either `torchrun` or the recommended `accelerate launch` (no need to run `accelerate config`) on each script:
|
||||
|
||||
```bash
|
||||
accelerate launch --num_processes {NUM_GPUS} phi2.py
|
||||
```
|
||||
|
||||
Or:
|
||||
|
||||
```bash
|
||||
torchrun --nproc-per-node {NUM_GPUS} phi2.py
|
||||
```
|
||||
86
examples/inference/distributed/phi2.py
Normal file
@ -0,0 +1,86 @@
|
||||
# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

from accelerate import PartialState
from accelerate.utils import gather_object


# Start up the distributed environment without needing the Accelerator.
distributed_state = PartialState()

# You can change the model to any LLM such as mistralai/Mistral-7B-v0.1 or meta-llama/Llama-2-7b-chat-hf
model_name = "microsoft/phi-2"
model = AutoModelForCausalLM.from_pretrained(
    model_name, device_map=distributed_state.device, torch_dtype=torch.float16
)

tokenizer = AutoTokenizer.from_pretrained(model_name)
# Need to set the padding token to the eos token for generation
tokenizer.pad_token = tokenizer.eos_token

prompts = [
    "I would like to",
    "hello how are you",
    "what is going on",
    "roses are red and",
    "welcome to the hotel",
]

# You can change the batch size depending on your GPU RAM
batch_size = 2
# We pad to a multiple of 8 since it is better for some hardware. More information here: https://github.com/huggingface/tokenizers/issues/991
pad_to_multiple_of = 8

# Split into batches
# We will get the following results:
# [["I would like to", "hello how are you"], ["what is going on", "roses are red and"], ["welcome to the hotel"]]
formatted_prompts = [prompts[i : i + batch_size] for i in range(0, len(prompts), batch_size)]

# Apply padding on the left since we are doing generation
padding_side_default = tokenizer.padding_side
tokenizer.padding_side = "left"
# Tokenize each batch
tokenized_prompts = [
    tokenizer(formatted_prompt, padding=True, pad_to_multiple_of=pad_to_multiple_of, return_tensors="pt")
    for formatted_prompt in formatted_prompts
]
# Put back the original padding behavior
tokenizer.padding_side = padding_side_default

completions_per_process = []
# split_between_processes automatically splits the batched data across all the processes. We also set apply_padding=True
# so that every GPU has the same number of prompts, which lets us gather the results afterwards.
# For example, with 2 GPUs the distribution will be:
# GPU 0: [["I would like to", "hello how are you"], ["what is going on", "roses are red and"]]
# GPU 1: [["welcome to the hotel"], ["welcome to the hotel"]] -> this prompt is duplicated to ensure that all GPUs have the same number of prompts
with distributed_state.split_between_processes(tokenized_prompts, apply_padding=True) as batched_prompts:
    for batch in batched_prompts:
        # Move the batch to the device
        batch = batch.to(distributed_state.device)
        # Generate the text, decode it, and add it to completions_per_process
        outputs = model.generate(**batch, max_new_tokens=20)
        generated_text = tokenizer.batch_decode(outputs, skip_special_tokens=True)
        completions_per_process.extend(generated_text)

# We are gathering strings, so we need to use gather_object.
# If you need to gather tensors, you can use gather from accelerate.utils instead.
completions_gather = gather_object(completions_per_process)

# Drop duplicates produced by apply_padding in split_between_processes
completions = completions_gather[: len(prompts)]

distributed_state.print(completions)
examples/inference/distributed/stable_diffusion.py (new file, 30 lines)
@ -0,0 +1,30 @@
# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import torch
from diffusers import DiffusionPipeline

from accelerate import PartialState  # Can also be Accelerator or AcceleratorState


pipe = DiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16)
distributed_state = PartialState()
pipe.to(distributed_state.device)

# Assume two processes.
# On the first GPU the prompts will be ["a dog", "a cat"],
# and on the second GPU they will be ["a chicken", "a chicken"].
# Make sure to drop the final sample, as it will be a duplicate of the previous one
# (see the sketch after this file for one way to do that).
with distributed_state.split_between_processes(["a dog", "a cat", "a chicken"], apply_padding=True) as prompt:
    result = pipe(prompt).images
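The comment above asks for the duplicated final sample to be dropped, but the script stops at generation. Here is a minimal, hedged sketch of one way to finish the job, mirroring the slicing used in `phi2.py`; gathering the images with `gather_object`, saving only on the main process, and the output filenames are illustrative choices, not part of the original example.

```python
# Continuation sketch: gather every rank's images, drop the padded duplicate, save.
import torch
from diffusers import DiffusionPipeline

from accelerate import PartialState
from accelerate.utils import gather_object

distributed_state = PartialState()
pipe = DiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16)
pipe.to(distributed_state.device)

prompts = ["a dog", "a cat", "a chicken"]
images_per_process = []
with distributed_state.split_between_processes(prompts, apply_padding=True) as my_prompts:
    images_per_process.extend(pipe(my_prompts).images)

# PIL images are plain Python objects, so gather_object (not gather) is the right call.
all_images = gather_object(images_per_process)
# apply_padding duplicates the last prompt on the padded rank, and gathering is ordered
# by rank, so the first len(prompts) entries line up one-to-one with the original prompts.
images = all_images[: len(prompts)]

if distributed_state.is_main_process:
    for prompt, image in zip(prompts, images):
        image.save(f"{prompt.replace(' ', '_')}.png")  # illustrative output path
```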
@ -1,3 +1,5 @@
accelerate # used to be installed in Amazon SageMaker environment
evaluate
datasets==2.3.2
schedulefree
huggingface_hub>=0.20.0
manim_animations/dataloaders/stage_0.py (new file, 32 lines)
@ -0,0 +1,32 @@
# Copyright 2024 The HuggingFace Team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from manim import *
|
||||
|
||||
|
||||
class Stage0(Scene):
|
||||
def construct(self):
|
||||
mascot = ImageMobject("mascot_bookie.png")
|
||||
mascot.scale(.35)
|
||||
mascot.move_to([-3.75,-1,0])
|
||||
text = Paragraph(
|
||||
"Distributed Training,\nHugging Face Accelerate,\nand PyTorch DataLoaders\n\nHow do they all interact?",
|
||||
font_size=36,
|
||||
line_spacing=1,
|
||||
alignment="center",
|
||||
weight=BOLD,
|
||||
)
|
||||
text.move_to([1.75,.5,0])
|
||||
self.add(mascot)
|
||||
self.add(text)
|
||||
manim_animations/dataloaders/stage_1.py (new file, 31 lines)
@ -0,0 +1,31 @@
# Copyright 2024 The HuggingFace Team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from manim import *
|
||||
|
||||
class Stage01(Scene):
|
||||
def construct(self):
|
||||
mascot = ImageMobject("mascot_bookie.png")
|
||||
mascot.scale(.35)
|
||||
mascot.move_to([-3.75,-1,0])
|
||||
text = Paragraph(
|
||||
"Distributed Training,\nHugging Face Accelerate,\nand PyTorch DataLoaders\n\nHow do they all interact?",
|
||||
font_size=36,
|
||||
line_spacing=1,
|
||||
alignment="center",
|
||||
weight=BOLD,
|
||||
)
|
||||
text.move_to([1.75,.5,0])
|
||||
self.add(mascot)
|
||||
self.add(text)
|
||||
manim_animations/dataloaders/stage_2.py (new file, 176 lines)
@ -0,0 +1,176 @@
# Copyright 2024 The HuggingFace Team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from manim import *
|
||||
|
||||
|
||||
class Stage2(Scene):
|
||||
def construct(self):
|
||||
# The dataset items
|
||||
fill = Rectangle(height=0.46,width=0.46).set_stroke(width=0)
|
||||
columns = [
|
||||
VGroup(*[Rectangle(height=0.25,width=0.25,color="green") for i in range(8)]).arrange(RIGHT,buff=0)
|
||||
for j in range(4)
|
||||
]
|
||||
dataset_recs = VGroup(*columns).arrange(UP, buff=0)
|
||||
dataset_text = Text("Dataset", font_size=24)
|
||||
dataset = Group(dataset_recs,dataset_text).arrange(DOWN, buff=0.5, aligned_edge=DOWN)
|
||||
dataset.move_to([-2,0,0])
|
||||
self.add(dataset)
|
||||
|
||||
code = Code(
|
||||
code="dataloader = DataLoader(...)\nfor batch in dataloader():\n\t...",
|
||||
tab_width=4,
|
||||
background="window",
|
||||
language="Python",
|
||||
font="Monospace",
|
||||
font_size=14,
|
||||
corner_radius=.2,
|
||||
insert_line_no=False,
|
||||
line_spacing=.75,
|
||||
style=Code.styles_list[1],
|
||||
)
|
||||
code.move_to([-3.5, 2.5, 0])
|
||||
self.add(code)
|
||||
|
||||
# The dataloader itself
|
||||
dataloader = Group(
|
||||
Rectangle(color="red", height=2, width=2),
|
||||
Text("DataLoader", font_size=24)
|
||||
).arrange(DOWN, buff=.5, aligned_edge=DOWN)
|
||||
|
||||
sampler = Group(
|
||||
Rectangle(color="blue", height=1, width=1),
|
||||
Text("Sampler", font_size=12)
|
||||
).arrange(DOWN, buff=.25, aligned_edge=DOWN)
|
||||
dataloader.move_to([1, 0, 0])
|
||||
sampler.move_to([.75,.25,0])
|
||||
self.add(dataloader)
|
||||
self.add(sampler)
|
||||
|
||||
gpu_1 = Group(
|
||||
Rectangle(color="white", height=1, width=1),
|
||||
Text("GPU 1", font_size=12)
|
||||
).arrange(DOWN, buff=.25, aligned_edge=DOWN).move_to([4, 2, 0])
|
||||
gpu_2 = Group(
|
||||
Rectangle(color="white", height=1, width=1),
|
||||
Text("GPU 2", font_size=12)
|
||||
).arrange(DOWN, buff=.25, aligned_edge=DOWN).move_to([4, .5, 0])
|
||||
gpu_3 = Group(
|
||||
Rectangle(color="white", height=1, width=1),
|
||||
Text("GPU 3", font_size=12)
|
||||
).arrange(DOWN, buff=.25, aligned_edge=DOWN).move_to([4, -1, 0])
|
||||
gpu_4 = Group(
|
||||
Rectangle(color="white", height=1, width=1),
|
||||
Text("GPU 4", font_size=12)
|
||||
).arrange(DOWN, buff=.25, aligned_edge=DOWN).move_to([4, -2.5, 0])
|
||||
gpus = [gpu_1[0], gpu_2[0], gpu_3[0], gpu_4[0]]
|
||||
self.add(gpu_1, gpu_2, gpu_3, gpu_4)
|
||||
|
||||
# Animate their existence
|
||||
self.play(
|
||||
Create(gpu_1[0], run_time=0.5),
|
||||
Create(gpu_2[0], run_time=0.5),
|
||||
Create(gpu_3[0], run_time=0.5),
|
||||
Create(gpu_4[0], run_time=0.5),
|
||||
Create(dataset_recs, run_time=1),
|
||||
Create(sampler[0], run_time=1),
|
||||
Create(dataloader[0], run_time=1)
|
||||
)
|
||||
|
||||
step_1 = MarkupText(
|
||||
f"Without any special care, \nthe same data is sent though each sampler, \nand the same samples are spit out on each GPU",
|
||||
font_size=18
|
||||
)
|
||||
step_1.move_to([0, -2.5, 0])
|
||||
self.play(
|
||||
Write(step_1, run_time=4),
|
||||
)
|
||||
|
||||
first_animations = []
|
||||
second_animations = []
|
||||
|
||||
|
||||
colors = ["BLUE_E", "DARK_BROWN", "GOLD_E", "GRAY_A"]
|
||||
current_color = colors[0]
|
||||
buff = 0
|
||||
lr_buff = .25
|
||||
old_target = None
|
||||
new_datasets = []
|
||||
for i,data in enumerate(dataset_recs[-1]):
|
||||
if i % 2 == 0:
|
||||
# current_color = colors[i//2]
|
||||
current_color = "BLUE_E"
|
||||
dataset_target = Rectangle(height=0.46/2,width=0.46/2).set_stroke(width=0.).set_fill(current_color, opacity=0.7)
|
||||
dataset_target.move_to(data)
|
||||
dataset_target.generate_target()
|
||||
aligned_edge = ORIGIN
|
||||
if i % 2 == 0:
|
||||
old_target = dataset_target.target
|
||||
buff -= .25
|
||||
aligned_edge = LEFT
|
||||
dataset_target.target.next_to(
|
||||
sampler, buff=buff, direction=UP,
|
||||
aligned_edge=LEFT
|
||||
)
|
||||
else:
|
||||
dataset_target.target.next_to(
|
||||
old_target, direction=RIGHT, buff=0.01,
|
||||
)
|
||||
new_datasets.append(dataset_target)
|
||||
first_animations.append(data.animate(run_time=0.5).set_stroke(current_color))
|
||||
second_animations.append(MoveToTarget(dataset_target, run_time=1.5))
|
||||
self.play(*first_animations)
|
||||
self.play(*second_animations)
|
||||
self.wait()
|
||||
|
||||
move_animation = []
|
||||
|
||||
for j,gpu in enumerate(gpus):
|
||||
buff = 0
|
||||
for i,data in enumerate(new_datasets):
|
||||
if i % 2 == 0:
|
||||
current_color = colors[i//2]
|
||||
if j != 3:
|
||||
data = data.copy()
|
||||
data.generate_target()
|
||||
aligned_edge = ORIGIN
|
||||
if i % 2 == 0:
|
||||
old_target = data.target
|
||||
buff -= .25
|
||||
aligned_edge = LEFT
|
||||
data.target.next_to(
|
||||
gpu, buff=buff, direction=UP,
|
||||
aligned_edge=LEFT
|
||||
)
|
||||
else:
|
||||
data.target.next_to(
|
||||
old_target, direction=RIGHT, buff=0.01,
|
||||
)
|
||||
move_animation.append(MoveToTarget(data, run_time=1.5))
|
||||
|
||||
|
||||
self.play(*move_animation)
|
||||
|
||||
self.remove(step_1)
|
||||
step_2 = MarkupText(
|
||||
f"This behavior is undesireable, because we want\neach GPU to see different data for efficient training.",
|
||||
font_size=18
|
||||
)
|
||||
step_2.move_to([0, -2.5, 0])
|
||||
|
||||
self.play(
|
||||
Write(step_2, run_time=2.5),
|
||||
)
|
||||
self.wait()
|
||||
manim_animations/dataloaders/stage_3.py (new file, 34 lines)
@ -0,0 +1,34 @@
# Copyright 2024 The HuggingFace Team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from manim import *
|
||||
|
||||
class Stage3(Scene):
|
||||
def construct(self):
|
||||
step_1 = MarkupText(
|
||||
f"To combat this, Accelerate employs one of two different\nSampler wrapper methods depending on the scenario:",
|
||||
font_size=24
|
||||
)
|
||||
step_1.move_to([0, 1.5, 0])
|
||||
self.add(step_1)
|
||||
step_2 = MarkupText(
|
||||
f"1. Sharding the dataset before drawing:\n\t● <span fgcolor='{RED}'>IterableDatasetShard</span>\n\t● <span fgcolor='{RED}'>BatchSamplerShard</span>",
|
||||
font_size=24,
|
||||
).next_to(step_1, direction=DOWN, aligned_edge=LEFT)
|
||||
self.add(step_2)
|
||||
step_3 = MarkupText(
|
||||
f"\n\n2. Splitting the batch after drawing:\n\t● <span fgcolor='{BLUE}'>DataLoaderDispatcher</span>",
|
||||
font_size=24,
|
||||
).next_to(step_2, direction=DOWN, aligned_edge=LEFT)
|
||||
self.add(step_3)
|
||||
manim_animations/dataloaders/stage_4.py (new file, 52 lines)
@ -0,0 +1,52 @@
# Copyright 2024 The HuggingFace Team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from manim import *
|
||||
|
||||
class Stage4(Scene):
|
||||
def construct(self):
|
||||
|
||||
step_1 = MarkupText(
|
||||
f"To understand the next part fully, let's define two terms,\n<span fgcolor='{RED}'>`batch_size`</span> and <span fgcolor='{BLUE}'>`global_batch_size`</span>:",
|
||||
font_size=18
|
||||
)
|
||||
step_1.move_to([0, 1.5, 0])
|
||||
# <span fgcolor='{YELLOW}'>●</span>
|
||||
step_2 = MarkupText(
|
||||
f"\n\n● <span fgcolor='{RED}'>`batch_size`</span>: \n\tThis will be defined as the batch size seen on a given\n\t*individual* GPU",
|
||||
font_size=18,
|
||||
).next_to(step_1, direction=DOWN, aligned_edge=LEFT)
|
||||
|
||||
step_3 = MarkupText(
|
||||
f"\n\n● <span fgcolor='{BLUE}'>`global_batch_size`</span>:\n\tThis will be defined as the *total* number of\n\tdifferent items seen in the dataset, across all GPUs",
|
||||
font_size=18,
|
||||
).next_to(step_2, direction=DOWN, aligned_edge=LEFT)
|
||||
|
||||
step_4 = MarkupText(
|
||||
f"\n\nSo if we have a dataset of 64 items, 8 GPUs, \nand a `batch_size` of 8, each *step* will go through\nthe entire dataset one time as 8*8=64",
|
||||
font_size=18,
|
||||
).next_to(step_3, direction=DOWN, aligned_edge=LEFT)
|
||||
self.play(
|
||||
Write(step_1, run_time=4),
|
||||
)
|
||||
self.play(
|
||||
Write(step_2, run_time=4)
|
||||
)
|
||||
self.play(
|
||||
Write(step_3, run_time=4)
|
||||
)
|
||||
self.play(
|
||||
Write(step_4, run_time=6)
|
||||
)
|
||||
self.wait()
|
||||
manim_animations/dataloaders/stage_5.py (new file, 203 lines)
@ -0,0 +1,203 @@
# Copyright 2024 The HuggingFace Team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from manim import *
|
||||
|
||||
class Stage5(Scene):
|
||||
def construct(self):
|
||||
# The dataset items
|
||||
colors = ["BLUE_E", "DARK_BROWN", "GOLD_E", "GRAY_A"]
|
||||
fill = Rectangle(height=0.46,width=0.46).set_stroke(width=0)
|
||||
columns = [
|
||||
VGroup(*[Rectangle(height=0.25,width=0.25,color=colors[j]) for i in range(8)]).arrange(RIGHT,buff=0)
|
||||
for j in range(4)
|
||||
]
|
||||
dataset_recs = VGroup(*columns).arrange(UP, buff=0)
|
||||
dataset_text = Text("Dataset", font_size=24)
|
||||
dataset = Group(dataset_recs,dataset_text).arrange(DOWN, buff=0.5, aligned_edge=DOWN)
|
||||
dataset.move_to([-2,0,0])
|
||||
self.add(dataset)
|
||||
code = Code(
|
||||
code="# We enable this by default\naccelerator = Accelerator()\ndataloader = DataLoader(...)\ndataloader = accelerator.prepare(dataloader)\nfor batch in dataloader:\n\t...",
|
||||
tab_width=4,
|
||||
background="window",
|
||||
language="Python",
|
||||
font="Monospace",
|
||||
font_size=14,
|
||||
corner_radius=.2,
|
||||
insert_line_no=False,
|
||||
line_spacing=.75,
|
||||
style=Code.styles_list[1],
|
||||
)
|
||||
code.move_to([-3.5, 2.5, 0])
|
||||
self.add(code)
|
||||
|
||||
# The dataloader itself
|
||||
|
||||
sampler_1 = Group(
|
||||
Rectangle(color="blue", height=1, width=1),
|
||||
Text("Sampler GPU 1", font_size=12)
|
||||
).arrange(DOWN, buff=.25, aligned_edge=DOWN)
|
||||
sampler_2 = Group(
|
||||
Rectangle(color="blue", height=1, width=1),
|
||||
Text("Sampler GPU 2", font_size=12)
|
||||
).arrange(DOWN, buff=.25, aligned_edge=DOWN)
|
||||
sampler_3 = Group(
|
||||
Rectangle(color="blue", height=1, width=1),
|
||||
Text("Sampler GPU 3", font_size=12)
|
||||
).arrange(DOWN, buff=.25, aligned_edge=DOWN)
|
||||
sampler_4 = Group(
|
||||
Rectangle(color="blue", height=1, width=1),
|
||||
Text("Sampler GPU 4", font_size=12)
|
||||
).arrange(DOWN, buff=.25, aligned_edge=DOWN)
|
||||
sampler_1.move_to([2,2,0])
|
||||
sampler_2.move_to([2,.5,0])
|
||||
sampler_3.move_to([2,-1.,0])
|
||||
sampler_4.move_to([2,-2.5,0])
|
||||
self.add(sampler_1, sampler_2, sampler_3, sampler_4)
|
||||
samplers = [sampler_1[0], sampler_2[0], sampler_3[0], sampler_4[0]]
|
||||
|
||||
gpu_1 = Group(
|
||||
Rectangle(color="white", height=1, width=1),
|
||||
Text("Output GPU 1", font_size=12)
|
||||
).arrange(DOWN, buff=.25, aligned_edge=DOWN).move_to([4.5, 2, 0])
|
||||
gpu_2 = Group(
|
||||
Rectangle(color="white", height=1, width=1),
|
||||
Text("Output GPU 2", font_size=12)
|
||||
).arrange(DOWN, buff=.25, aligned_edge=DOWN).move_to([4.5, .5, 0])
|
||||
gpu_3 = Group(
|
||||
Rectangle(color="white", height=1, width=1),
|
||||
Text("Output GPU 3", font_size=12)
|
||||
).arrange(DOWN, buff=.25, aligned_edge=DOWN).move_to([4.5, -1, 0])
|
||||
gpu_4 = Group(
|
||||
Rectangle(color="white", height=1, width=1),
|
||||
Text("Output GPU 4", font_size=12)
|
||||
).arrange(DOWN, buff=.25, aligned_edge=DOWN).move_to([4.5, -2.5, 0])
|
||||
gpus = [gpu_1[0], gpu_2[0], gpu_3[0], gpu_4[0]]
|
||||
self.add(gpu_1, gpu_2, gpu_3, gpu_4)
|
||||
|
||||
# Animate their existence
|
||||
self.play(
|
||||
Create(gpu_1[0], run_time=1),
|
||||
Create(gpu_2[0], run_time=1),
|
||||
Create(gpu_3[0], run_time=1),
|
||||
Create(gpu_4[0], run_time=1),
|
||||
Create(dataset_recs, run_time=1),
|
||||
Create(sampler_1[0], run_time=1),
|
||||
Create(sampler_2[0], run_time=1),
|
||||
Create(sampler_3[0], run_time=1),
|
||||
Create(sampler_4[0], run_time=1),
|
||||
)
|
||||
|
||||
first_animations = []
|
||||
second_animations = []
|
||||
|
||||
|
||||
colors = ["BLUE_E", "DARK_BROWN", "GOLD_E", "GRAY_A"]
|
||||
current_color = colors[0]
|
||||
buff = 0
|
||||
lr_buff = .25
|
||||
old_target = None
|
||||
new_datasets = []
|
||||
for i,row_data in enumerate(dataset_recs):
|
||||
new_row = []
|
||||
current_color = colors[i]
|
||||
if i == 0:
|
||||
idx = -3
|
||||
elif i == 1:
|
||||
idx = -2
|
||||
elif i == 2:
|
||||
idx = -1
|
||||
elif i == 3:
|
||||
idx = 0
|
||||
for j,indiv_data in enumerate(row_data):
|
||||
dataset_target = Rectangle(height=0.46/2,width=0.46/2).set_stroke(width=0.).set_fill(current_color, opacity=0.7)
|
||||
dataset_target.move_to(indiv_data)
|
||||
dataset_target.generate_target()
|
||||
aligned_edge = ORIGIN
|
||||
if j % 8 == 0:
|
||||
aligned_edge = LEFT
|
||||
dataset_target.target.next_to(
|
||||
samplers[abs(idx)].get_corner(UP+LEFT), buff=.02, direction=RIGHT+DOWN,
|
||||
)
|
||||
dataset_target.target.set_x(dataset_target.target.get_x())
|
||||
elif j % 4 == 0:
|
||||
old_target = dataset_target.target
|
||||
dataset_target.target.next_to(
|
||||
samplers[abs(idx)].get_corner(UP+LEFT), buff=.02, direction=RIGHT+DOWN,
|
||||
)
|
||||
dataset_target.target.set_x(dataset_target.target.get_x())
|
||||
dataset_target.target.set_y(dataset_target.target.get_y()-.25)
|
||||
else:
|
||||
dataset_target.target.next_to(
|
||||
old_target, direction=RIGHT, buff=0.02,
|
||||
)
|
||||
old_target = dataset_target.target
|
||||
new_row.append(dataset_target)
|
||||
first_animations.append(indiv_data.animate(run_time=0.5).set_stroke(current_color))
|
||||
second_animations.append(MoveToTarget(dataset_target, run_time=1.5))
|
||||
|
||||
new_datasets.append(new_row)
|
||||
step_1 = MarkupText(
|
||||
f"Since we splice the dataset between each GPU,\nthe models weights can be averaged during `backward()`\nActing as though we did one giant epoch\nvery quickly.",
|
||||
font_size=18
|
||||
)
|
||||
step_1.move_to([-2.5, -2, 0])
|
||||
|
||||
self.play(
|
||||
Write(step_1, run_time=3),
|
||||
)
|
||||
self.play(
|
||||
*first_animations,
|
||||
)
|
||||
self.play(*second_animations)
|
||||
self.wait(duration=.5)
|
||||
|
||||
move_animation = []
|
||||
import random
|
||||
for i,row in enumerate(new_datasets):
|
||||
# row = [row[k] for k in random.sample(range(8), 8)]
|
||||
current_color = colors[i]
|
||||
if i == 0:
|
||||
idx = -3
|
||||
elif i == 1:
|
||||
idx = -2
|
||||
elif i == 2:
|
||||
idx = -1
|
||||
elif i == 3:
|
||||
idx = 0
|
||||
for j,indiv_data in enumerate(row):
|
||||
indiv_data.generate_target()
|
||||
aligned_edge = ORIGIN
|
||||
if j % 8 == 0:
|
||||
aligned_edge = LEFT
|
||||
indiv_data.target.next_to(
|
||||
gpus[abs(idx)].get_corner(UP+LEFT), buff=.02, direction=RIGHT+DOWN,
|
||||
)
|
||||
indiv_data.target.set_x(indiv_data.target.get_x())
|
||||
elif j % 4 == 0:
|
||||
indiv_data.target.next_to(
|
||||
gpus[abs(idx)].get_corner(UP+LEFT), buff=.02, direction=RIGHT+DOWN,
|
||||
)
|
||||
indiv_data.target.set_x(indiv_data.target.get_x())
|
||||
indiv_data.target.set_y(indiv_data.target.get_y()-.25)
|
||||
else:
|
||||
indiv_data.target.next_to(
|
||||
old_target, direction=RIGHT, buff=0.02,
|
||||
)
|
||||
old_target = indiv_data.target
|
||||
move_animation.append(MoveToTarget(indiv_data, run_time=1.5))
|
||||
|
||||
self.play(*move_animation)
|
||||
self.wait()
|
||||
manim_animations/dataloaders/stage_6.py (new file, 193 lines)
@ -0,0 +1,193 @@
# Copyright 2024 The HuggingFace Team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from manim import *
|
||||
|
||||
|
||||
class Stage6(Scene):
|
||||
def construct(self):
|
||||
# The dataset items
|
||||
colors = ["BLUE_E", "DARK_BROWN", "GOLD_E", "GRAY_A"]
|
||||
fill = Rectangle(height=0.46,width=0.46).set_stroke(width=0)
|
||||
columns = [
|
||||
VGroup(*[Rectangle(height=0.25,width=0.25,color=colors[j]) for i in range(8)]).arrange(RIGHT,buff=0)
|
||||
for j in range(4)
|
||||
]
|
||||
dataset_recs = VGroup(*columns).arrange(UP, buff=0)
|
||||
dataset_text = Text("Dataset", font_size=24)
|
||||
dataset = Group(dataset_recs,dataset_text).arrange(DOWN, buff=0.5, aligned_edge=DOWN)
|
||||
dataset.move_to([-2,0,0])
|
||||
self.add(dataset)
|
||||
code = Code(
|
||||
code="# We enable this by default\naccelerator = Accelerator()\ndataloader = DataLoader(..., shuffle=True)\ndataloader = accelerator.prepare(dataloader)\nfor batch in dataloader:\n\t...",
|
||||
tab_width=4,
|
||||
background="window",
|
||||
language="Python",
|
||||
font="Monospace",
|
||||
font_size=14,
|
||||
corner_radius=.2,
|
||||
insert_line_no=False,
|
||||
line_spacing=.75,
|
||||
style=Code.styles_list[1],
|
||||
)
|
||||
code.move_to([-3.5, 2.5, 0])
|
||||
self.add(code)
|
||||
|
||||
# The dataloader itself
|
||||
|
||||
sampler_1 = Group(
|
||||
Rectangle(color="blue", height=1, width=1),
|
||||
Text("Sampler GPU 1", font_size=12)
|
||||
).arrange(DOWN, buff=.25, aligned_edge=DOWN)
|
||||
sampler_2 = Group(
|
||||
Rectangle(color="blue", height=1, width=1),
|
||||
Text("Sampler GPU 2", font_size=12)
|
||||
).arrange(DOWN, buff=.25, aligned_edge=DOWN)
|
||||
sampler_3 = Group(
|
||||
Rectangle(color="blue", height=1, width=1),
|
||||
Text("Sampler GPU 3", font_size=12)
|
||||
).arrange(DOWN, buff=.25, aligned_edge=DOWN)
|
||||
sampler_4 = Group(
|
||||
Rectangle(color="blue", height=1, width=1),
|
||||
Text("Sampler GPU 4", font_size=12)
|
||||
).arrange(DOWN, buff=.25, aligned_edge=DOWN)
|
||||
sampler_1.move_to([2,2,0])
|
||||
sampler_2.move_to([2,.5,0])
|
||||
sampler_3.move_to([2,-1.,0])
|
||||
sampler_4.move_to([2,-2.5,0])
|
||||
self.add(sampler_1, sampler_2, sampler_3, sampler_4)
|
||||
samplers = [sampler_1[0], sampler_2[0], sampler_3[0], sampler_4[0]]
|
||||
|
||||
gpu_1 = Group(
|
||||
Rectangle(color="white", height=1, width=1),
|
||||
Text("Output GPU 1", font_size=12)
|
||||
).arrange(DOWN, buff=.25, aligned_edge=DOWN).move_to([4.5, 2, 0])
|
||||
gpu_2 = Group(
|
||||
Rectangle(color="white", height=1, width=1),
|
||||
Text("Output GPU 2", font_size=12)
|
||||
).arrange(DOWN, buff=.25, aligned_edge=DOWN).move_to([4.5, .5, 0])
|
||||
gpu_3 = Group(
|
||||
Rectangle(color="white", height=1, width=1),
|
||||
Text("Output GPU 3", font_size=12)
|
||||
).arrange(DOWN, buff=.25, aligned_edge=DOWN).move_to([4.5, -1, 0])
|
||||
gpu_4 = Group(
|
||||
Rectangle(color="white", height=1, width=1),
|
||||
Text("Output GPU 4", font_size=12)
|
||||
).arrange(DOWN, buff=.25, aligned_edge=DOWN).move_to([4.5, -2.5, 0])
|
||||
gpus = [gpu_1[0], gpu_2[0], gpu_3[0], gpu_4[0]]
|
||||
self.add(gpu_1, gpu_2, gpu_3, gpu_4)
|
||||
|
||||
|
||||
first_animations = []
|
||||
second_animations = []
|
||||
|
||||
|
||||
colors = ["BLUE_E", "DARK_BROWN", "GOLD_E", "GRAY_A"]
|
||||
current_color = colors[0]
|
||||
buff = 0
|
||||
lr_buff = .25
|
||||
old_target = None
|
||||
new_datasets = []
|
||||
for i,row_data in enumerate(dataset_recs):
|
||||
new_row = []
|
||||
current_color = colors[i]
|
||||
if i == 0:
|
||||
idx = -3
|
||||
elif i == 1:
|
||||
idx = -2
|
||||
elif i == 2:
|
||||
idx = -1
|
||||
elif i == 3:
|
||||
idx = 0
|
||||
for j,indiv_data in enumerate(row_data):
|
||||
dataset_target = Rectangle(height=0.46/2,width=0.46/2).set_stroke(width=0.).set_fill(current_color, opacity=0.7)
|
||||
dataset_target.move_to(indiv_data)
|
||||
dataset_target.generate_target()
|
||||
aligned_edge = ORIGIN
|
||||
if j % 8 == 0:
|
||||
aligned_edge = LEFT
|
||||
old_target = dataset_target.target
|
||||
dataset_target.target.next_to(
|
||||
samplers[abs(idx)].get_corner(UP+LEFT), buff=.02, direction=RIGHT+DOWN,
|
||||
)
|
||||
dataset_target.target.set_x(dataset_target.target.get_x())
|
||||
elif j % 4 == 0:
|
||||
old_target = dataset_target.target
|
||||
dataset_target.target.next_to(
|
||||
samplers[abs(idx)].get_corner(UP+LEFT), buff=.02, direction=RIGHT+DOWN,
|
||||
)
|
||||
dataset_target.target.set_x(dataset_target.target.get_x())
|
||||
dataset_target.target.set_y(dataset_target.target.get_y()-.25)
|
||||
else:
|
||||
dataset_target.target.next_to(
|
||||
old_target, direction=RIGHT, buff=0.02,
|
||||
)
|
||||
old_target = dataset_target.target
|
||||
new_row.append(dataset_target)
|
||||
first_animations.append(indiv_data.animate(run_time=0.5).set_stroke(current_color))
|
||||
second_animations.append(MoveToTarget(dataset_target, run_time=1.5))
|
||||
|
||||
new_datasets.append(new_row)
|
||||
step_1 = MarkupText(
|
||||
f"During shuffling, each mini-batch's\noutput order will be modified",
|
||||
font_size=18
|
||||
)
|
||||
step_1.move_to([-1.5, -2, 0])
|
||||
|
||||
self.play(
|
||||
Write(step_1, run_time=3),
|
||||
)
|
||||
self.play(
|
||||
*first_animations,
|
||||
)
|
||||
self.play(*second_animations)
|
||||
self.wait(duration=.5)
|
||||
|
||||
move_animation = []
|
||||
import random
|
||||
for i,row in enumerate(new_datasets):
|
||||
row = [row[k] for k in random.sample(range(8), 8)]
|
||||
current_color = colors[i]
|
||||
if i == 0:
|
||||
idx = -3
|
||||
elif i == 1:
|
||||
idx = -2
|
||||
elif i == 2:
|
||||
idx = -1
|
||||
elif i == 3:
|
||||
idx = 0
|
||||
for j,indiv_data in enumerate(row):
|
||||
indiv_data.generate_target()
|
||||
aligned_edge = ORIGIN
|
||||
if j % 8 == 0:
|
||||
aligned_edge = LEFT
|
||||
indiv_data.target.next_to(
|
||||
gpus[abs(idx)].get_corner(UP+LEFT), buff=.02, direction=RIGHT+DOWN,
|
||||
)
|
||||
indiv_data.target.set_x(indiv_data.target.get_x())
|
||||
elif j % 4 == 0:
|
||||
indiv_data.target.next_to(
|
||||
gpus[abs(idx)].get_corner(UP+LEFT), buff=.02, direction=RIGHT+DOWN,
|
||||
)
|
||||
indiv_data.target.set_x(indiv_data.target.get_x())
|
||||
indiv_data.target.set_y(indiv_data.target.get_y()-.25)
|
||||
else:
|
||||
indiv_data.target.next_to(
|
||||
old_target, direction=RIGHT, buff=0.02,
|
||||
)
|
||||
old_target = indiv_data.target
|
||||
move_animation.append(MoveToTarget(indiv_data, run_time=1.5))
|
||||
|
||||
self.play(*move_animation)
|
||||
self.wait()
|
||||
manim_animations/dataloaders/stage_7.py (new file, 182 lines)
@ -0,0 +1,182 @@
# Copyright 2024 The HuggingFace Team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from manim import *
|
||||
|
||||
class Stage7(Scene):
|
||||
def construct(self):
|
||||
# The dataset items
|
||||
code = Code(
|
||||
code="accelerator = Accelerator(dispatch_batches=True)\ndataloader = DataLoader(...)\ndataloader = accelerator.prepare(dataloader)\nfor batch in dataloader:\n\t...",
|
||||
tab_width=4,
|
||||
background="window",
|
||||
language="Python",
|
||||
font="Monospace",
|
||||
font_size=14,
|
||||
corner_radius=.2,
|
||||
insert_line_no=False,
|
||||
line_spacing=.75,
|
||||
style=Code.styles_list[1],
|
||||
)
|
||||
code.move_to([-3.5, 2.5, 0])
|
||||
self.add(code)
|
||||
colors = ["BLUE_E", "DARK_BROWN", "GOLD_E", "GRAY_A"]
|
||||
fill = Rectangle(height=0.46,width=0.46).set_stroke(width=0)
|
||||
columns = [
|
||||
VGroup(*[Rectangle(height=0.25,width=0.25,color=colors[j]) for i in range(8)]).arrange(RIGHT,buff=0)
|
||||
for j in range(4)
|
||||
]
|
||||
dataset_recs = VGroup(*columns).arrange(UP, buff=0)
|
||||
dataset_text = Text("Dataset", font_size=24)
|
||||
dataset = Group(dataset_recs,dataset_text).arrange(DOWN, buff=0.5, aligned_edge=DOWN)
|
||||
dataset.move_to([-2,0,0])
|
||||
self.add(dataset)
|
||||
|
||||
# The dataloader itself
|
||||
|
||||
sampler_1 = Group(
|
||||
Rectangle(color="blue", height=1.02, width=1.02),
|
||||
Text("Sampler GPU 1", font_size=12)
|
||||
).arrange(DOWN, buff=.25, aligned_edge=DOWN)
|
||||
sampler_2 = Group(
|
||||
Rectangle(color="blue", height=1.02, width=1.02),
|
||||
Text("Sampler GPU 2", font_size=12)
|
||||
).arrange(DOWN, buff=.25, aligned_edge=DOWN)
|
||||
sampler_3 = Group(
|
||||
Rectangle(color="blue", height=1.02, width=1.02),
|
||||
Text("Sampler GPU 3", font_size=12)
|
||||
).arrange(DOWN, buff=.25, aligned_edge=DOWN)
|
||||
sampler_4 = Group(
|
||||
Rectangle(color="blue", height=1.02, width=1.02),
|
||||
Text("Sampler GPU 4", font_size=12)
|
||||
).arrange(DOWN, buff=.25, aligned_edge=DOWN)
|
||||
sampler_1.move_to([2,2,0])
|
||||
sampler_2.move_to([2,.5,0])
|
||||
sampler_3.move_to([2,-1.,0])
|
||||
sampler_4.move_to([2,-2.5,0])
|
||||
self.add(sampler_1, sampler_2, sampler_3, sampler_4)
|
||||
samplers = [sampler_1[0], sampler_2[0], sampler_3[0], sampler_4[0]]
|
||||
|
||||
gpu_1 = Group(
|
||||
Rectangle(color="white", height=1.02, width=.98),
|
||||
Text("Output GPU 1", font_size=12)
|
||||
).arrange(DOWN, buff=.25, aligned_edge=DOWN).move_to([4.5, 2, 0])
|
||||
gpu_2 = Group(
|
||||
Rectangle(color="white", height=1.02, width=.98),
|
||||
Text("Output GPU 2", font_size=12)
|
||||
).arrange(DOWN, buff=.25, aligned_edge=DOWN).move_to([4.5, .5, 0])
|
||||
gpu_3 = Group(
|
||||
Rectangle(color="white", height=1.02, width=.98),
|
||||
Text("Output GPU 3", font_size=12)
|
||||
).arrange(DOWN, buff=.25, aligned_edge=DOWN).move_to([4.5, -1, 0])
|
||||
gpu_4 = Group(
|
||||
Rectangle(color="white", height=1.02, width=.98),
|
||||
Text("Output GPU 4", font_size=12)
|
||||
).arrange(DOWN, buff=.25, aligned_edge=DOWN).move_to([4.5, -2.5, 0])
|
||||
gpus = [gpu_1[0], gpu_2[0], gpu_3[0], gpu_4[0]]
|
||||
self.add(gpu_1, gpu_2, gpu_3, gpu_4)
|
||||
|
||||
step_1 = MarkupText(
|
||||
f"When using a `DataLoaderDispatcher`, all\nof the samples are collected from GPU 0's dataset,\nthen divided and sent to each GPU.\nAs a result, this will be slower.",
|
||||
font_size=18
|
||||
)
|
||||
step_1.move_to([-2.5, -2, 0])
|
||||
|
||||
self.play(
|
||||
Write(step_1, run_time=3.5),
|
||||
)
|
||||
|
||||
first_animations = []
|
||||
second_animations = []
|
||||
|
||||
|
||||
colors = ["BLUE_E", "DARK_BROWN", "GOLD_E", "GRAY_A"]
|
||||
current_color = colors[0]
|
||||
ud_buff = 0.01
|
||||
lr_buff = 0.01
|
||||
old_target = None
|
||||
new_datasets = []
|
||||
for i,row_data in enumerate(dataset_recs):
|
||||
new_row = []
|
||||
current_color = colors[i]
|
||||
|
||||
for j,indiv_data in enumerate(row_data):
|
||||
dataset_target = Rectangle(height=0.46/4,width=0.46/2).set_stroke(width=0.).set_fill(current_color, opacity=0.7)
|
||||
dataset_target.move_to(indiv_data)
|
||||
dataset_target.generate_target()
|
||||
aligned_edge = ORIGIN
|
||||
if j % 8 == 0:
|
||||
aligned_edge = LEFT
|
||||
dataset_target.target.next_to(
|
||||
samplers[0].get_corner(DOWN+LEFT), buff=0.0125, direction=RIGHT+UP,
|
||||
)
|
||||
dataset_target.target.set_x(dataset_target.target.get_x())
|
||||
dataset_target.target.set_y(dataset_target.target.get_y() + (.25 * i))
|
||||
elif j % 4 == 0:
|
||||
old_target = dataset_target.target
|
||||
dataset_target.target.next_to(
|
||||
samplers[0].get_corner(DOWN+LEFT), buff=0.0125, direction=RIGHT+UP,
|
||||
)
|
||||
dataset_target.target.set_x(dataset_target.target.get_x())
|
||||
dataset_target.target.set_y(dataset_target.target.get_y()+.125 + (.25 * i))
|
||||
else:
|
||||
dataset_target.target.next_to(
|
||||
old_target, direction=RIGHT, buff=0.0125,
|
||||
)
|
||||
old_target = dataset_target.target
|
||||
new_row.append(dataset_target)
|
||||
first_animations.append(indiv_data.animate(run_time=0.5).set_stroke(current_color))
|
||||
second_animations.append(MoveToTarget(dataset_target, run_time=1.5))
|
||||
|
||||
new_datasets.append(new_row)
|
||||
self.play(
|
||||
*first_animations,
|
||||
)
|
||||
self.play(*second_animations)
|
||||
move_animation = []
|
||||
for i,row in enumerate(new_datasets):
|
||||
current_color = colors[i]
|
||||
if i == 0:
|
||||
idx = -3
|
||||
elif i == 1:
|
||||
idx = -2
|
||||
elif i == 2:
|
||||
idx = -1
|
||||
elif i == 3:
|
||||
idx = 0
|
||||
for j,indiv_data in enumerate(row):
|
||||
indiv_data.generate_target()
|
||||
indiv_data.animate.stretch_to_fit_height(0.46/2)
|
||||
aligned_edge = ORIGIN
|
||||
if j % 8 == 0:
|
||||
aligned_edge = LEFT
|
||||
indiv_data.target.next_to(
|
||||
gpus[abs(idx)].get_corner(UP+LEFT), buff=.01, direction=RIGHT+DOWN,
|
||||
)
|
||||
indiv_data.target.set_x(indiv_data.target.get_x())
|
||||
indiv_data.target.set_y(indiv_data.target.get_y()-.25)
|
||||
elif j % 4 == 0:
|
||||
indiv_data.target.next_to(
|
||||
gpus[abs(idx)].get_corner(UP+LEFT), buff=.01, direction=RIGHT+DOWN,
|
||||
)
|
||||
indiv_data.target.set_x(indiv_data.target.get_x())
|
||||
else:
|
||||
indiv_data.target.next_to(
|
||||
old_target, direction=RIGHT, buff=0.01,
|
||||
)
|
||||
old_target = indiv_data.target
|
||||
move_animation.append(MoveToTarget(indiv_data, run_time=1.5))
|
||||
|
||||
self.play(*move_animation)
|
||||
self.wait()
|
||||
setup.py (5 changes)
@ -25,17 +25,18 @@ extras["docs"] = []
extras["test_prod"] = ["pytest>=7.2.0,<=8.0.0", "pytest-xdist", "pytest-subtests", "parameterized"]
|
||||
extras["test_dev"] = [
|
||||
"datasets",
|
||||
"diffusers",
|
||||
"evaluate",
|
||||
"torchpippy>=0.2.0",
|
||||
"transformers",
|
||||
"scipy",
|
||||
"scikit-learn",
|
||||
"deepspeed",
|
||||
"tqdm",
|
||||
"bitsandbytes",
|
||||
"timm",
|
||||
]
|
||||
extras["testing"] = extras["test_prod"] + extras["test_dev"]
|
||||
extras["deepspeed"] = ["deepspeed<=0.14.0"]
|
||||
extras["rich"] = ["rich"]
|
||||
|
||||
extras["test_trackers"] = ["wandb", "comet-ml", "tensorboard", "dvclive"]
|
||||
@ -47,7 +48,7 @@ extras["sagemaker"] = [
|
||||
|
||||
setup(
|
||||
name="accelerate",
|
||||
version="0.29.0.dev",
|
||||
version="0.31.0.dev0",
|
||||
description="Accelerate",
|
||||
long_description=open("README.md", encoding="utf-8").read(),
|
||||
long_description_content_type="text/markdown",
|
||||
|
||||
@ -11,7 +11,7 @@
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
__version__ = "0.29.0.dev0"
|
||||
__version__ = "0.30.1.dev0"
|
||||
|
||||
from .accelerator import Accelerator
|
||||
from .big_modeling import (
|
||||
|
||||
@ -79,6 +79,7 @@ from .utils import (
|
||||
is_deepspeed_available,
|
||||
is_fp8_available,
|
||||
is_ipex_available,
|
||||
is_lomo_available,
|
||||
is_megatron_lm_available,
|
||||
is_mlu_available,
|
||||
is_msamp_available,
|
||||
@ -215,7 +216,7 @@ class Accelerator:
|
||||
project_dir (`str`, `os.PathLike`, *optional*):
|
||||
A path to a directory for storing data such as logs of locally-compatible loggers and potentially saved
|
||||
checkpoints.
|
||||
step_scheduler_with_optimizer (`bool`, *optional`, defaults to `True`):
|
||||
step_scheduler_with_optimizer (`bool`, *optional*, defaults to `True`):
|
||||
Set `True` if the learning rate scheduler is stepped at the same time as the optimizer, `False` if only
|
||||
done under certain circumstances (at the end of each epoch, for instance).
|
||||
kwargs_handlers (list of [`~utils.KwargsHandler`], *optional*)
|
||||
@ -340,6 +341,8 @@ class Accelerator:
|
||||
self.init_handler = None
|
||||
self.fp8_recipe_handler = None
|
||||
self.autocast_handler = None
|
||||
self.has_lomo_optimizer = False
|
||||
|
||||
if kwargs_handlers is not None:
|
||||
for handler in kwargs_handlers:
|
||||
assert isinstance(
|
||||
@ -383,8 +386,15 @@ class Accelerator:
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
if self.fp8_recipe_handler is None and self.state.mixed_precision == "fp8":
|
||||
self.fp8_recipe_handler = FP8RecipeKwargs(backend="MSAMP" if is_msamp_available() else "TE")
|
||||
self.delayed_fp8_autocast = False
|
||||
if self.fp8_recipe_handler is not None:
|
||||
# We already check if FP8 is available during `self.state`
|
||||
if self.state.mixed_precision != "fp8":
|
||||
raise ValueError("Passing in a `FP8RecipeKwargs` object requires setting `mixed_precision='fp8'`.")
|
||||
self.delayed_fp8_autocast = self.fp8_recipe_handler.backend == "TE" and self.distributed_type in (
|
||||
DistributedType.MULTI_GPU,
|
||||
DistributedType.FSDP,
|
||||
)
|
||||
|
||||
trackers = filter_trackers(log_with, self.logging_dir)
|
||||
if len(trackers) < 1 and log_with is not None:
|
||||
@ -450,7 +460,7 @@ class Accelerator:
|
||||
and self.distributed_type not in (DistributedType.DEEPSPEED, DistributedType.MEGATRON_LM)
|
||||
):
|
||||
self.native_amp = True
|
||||
if self.device.type not in ("xpu", "cuda", "mps", "npu", "xla", "mlu") or is_torch_xla_available(
|
||||
if self.device.type not in ("xpu", "cuda", "npu", "xla", "mlu") or is_torch_xla_available(
|
||||
check_is_tpu=True
|
||||
):
|
||||
raise ValueError(f"fp16 mixed precision requires a GPU (not {self.device.type!r}).")
|
||||
@ -479,6 +489,10 @@ class Accelerator:
|
||||
if mixed_precision == "bf16" and not self.native_amp and not is_torch_xla_available():
|
||||
raise ValueError("bf16 mixed precision requires PyTorch >= 1.10 and a supported device.")
|
||||
|
||||
elif self.state.mixed_precision == "fp8":
|
||||
# We always enable `native_amp` for FP8
|
||||
self.native_amp = True
|
||||
|
||||
# Start of internal step tracking
|
||||
self.step = 0
|
||||
|
||||
@ -550,6 +564,10 @@ class Accelerator:
|
||||
def use_seedable_sampler(self):
|
||||
return self.dataloader_config.use_seedable_sampler
|
||||
|
||||
@property
|
||||
def non_blocking(self):
|
||||
return self.dataloader_config.non_blocking
|
||||
|
||||
@property
|
||||
def project_dir(self):
|
||||
return self.project_configuration.project_dir
|
||||
@ -1345,18 +1363,22 @@ class Accelerator:
|
||||
model.forward = MethodType(convert_outputs_to_fp32(model.forward.__func__), model)
|
||||
else:
|
||||
model.forward = convert_outputs_to_fp32(new_forward)
|
||||
elif self.mixed_precision == "fp8" and self.fp8_recipe_handler.backend == "TE":
|
||||
|
||||
# We prepare fp8 after, allowing for bf16 autocast to happen first
|
||||
if getattr(self.fp8_recipe_handler, "backend", None) == "TE":
|
||||
if not has_transformer_engine_layers(model):
|
||||
with torch.no_grad():
|
||||
convert_model(model)
|
||||
model._converted_to_transformer_engine = True
|
||||
model._original_forward = model.forward
|
||||
|
||||
kwargs = self.fp8_recipe_handler.to_kwargs() if self.fp8_recipe_handler is not None else {}
|
||||
if "fp8_format" in kwargs:
|
||||
kwargs["fp8_format"] = getattr(te_recipe.Format, kwargs["fp8_format"])
|
||||
fp8_recipe = te_recipe.DelayedScaling(**kwargs)
|
||||
model.forward = fp8_autocast(enabled=True, fp8_recipe=fp8_recipe)(model.forward)
|
||||
# If we are in DDP or FSDP, we delay `autocast` until after FSDP/DDP has been initialized
|
||||
# to make use of the process group
|
||||
if not self.delayed_fp8_autocast:
|
||||
model.forward = fp8_autocast(enabled=True, fp8_recipe=fp8_recipe)(model.forward)
|
||||
|
||||
if (getattr(model, "is_loaded_in_8bit", False) or getattr(model, "is_loaded_in_4bit", False)) and getattr(
|
||||
model, "hf_device_map", False
|
||||
@ -1368,16 +1390,19 @@ class Accelerator:
|
||||
" In order to use 8-bit models that have been loaded across multiple GPUs the solution is to use Naive Pipeline Parallelism."
|
||||
" Therefore you should not specify that you are under any distributed regime in your accelerate config."
|
||||
)
|
||||
current_device = list(model_devices)[0]
|
||||
current_device_index = current_device.index if isinstance(current_device, torch.device) else current_device
|
||||
elif len(model_devices) == 1:
|
||||
current_device = list(model_devices)[0]
|
||||
current_device_index = (
|
||||
current_device.index if isinstance(current_device, torch.device) else current_device
|
||||
)
|
||||
|
||||
if torch.device(current_device_index) != self.device:
|
||||
# if on the first device (GPU 0) we don't care
|
||||
if (self.device.index is not None) or (current_device_index != 0):
|
||||
raise ValueError(
|
||||
"You can't train a model that has been loaded in 8-bit precision on a different device than the one "
|
||||
"you're training on. Make sure you loaded the model on the correct device using for example `device_map={'':torch.cuda.current_device() or device_map={'':torch.xpu.current_device()}"
|
||||
)
|
||||
if torch.device(current_device_index) != self.device:
|
||||
# if on the first device (GPU 0) we don't care
|
||||
if (self.device.index is not None) or (current_device_index != 0):
|
||||
raise ValueError(
|
||||
"You can't train a model that has been loaded in 8-bit precision on a different device than the one "
|
||||
"you're training on. Make sure you loaded the model on the correct device using for example `device_map={'':torch.cuda.current_device() or device_map={'':torch.xpu.current_device()}"
|
||||
)
|
||||
|
||||
if "cpu" in model_devices or "disk" in model_devices:
|
||||
raise ValueError(
|
||||
@ -1447,6 +1472,73 @@ class Accelerator:
|
||||
),
|
||||
auto_wrap_policy=fsdp_plugin.auto_wrap_policy,
|
||||
)
|
||||
|
||||
# In the event the model had been loaded in low precision, but
|
||||
# mixed precision had also been activated, then we follow DeepSpeed's
|
||||
# strategy to hold the parameters in full precision.
|
||||
# - assume that trainer.args.bf16 and trainer.args.fp16 are already checked against
|
||||
# fsdp_plugin.mixed_precision_policy.
|
||||
# - NOTE: we do not check the mixed_precision attribute on the FSDP root wrapper.
|
||||
# * this attribute will always be set by init_utils.init_core_state so it's always not None.
|
||||
# * mixed_precision.param_dtype only regards _fwd_bwd_param_dtype
|
||||
# * if model is loaded in 16bit, and even if mixed_precision.param_dtype is None,
|
||||
# we still want to upcast the flat_param.
|
||||
if self.mixed_precision != "no": # if mixed precision is set
|
||||
upcasted_log = []
|
||||
for module in FSDP.fsdp_modules(model):
|
||||
# Referencing DeepSpeed Zero3
|
||||
# - in Init, params are converted to 16bit while partitioning.
|
||||
# - in accelerator.prepare, deepspeed.initialize is called to:
|
||||
# * creates the DeepSpeedEngine.
|
||||
# * since zero_optimization() is True, calls engine._configure_zero_optimizer.
|
||||
#
|
||||
# Inside the DeepSpeed Zero3 optimizer configuration, which initializes
|
||||
# DeepSpeedZeroOptimizer_Stage3, during which:
|
||||
# * trainable_param_groups are obtained from the attached optimizer
|
||||
# (already partitioned in 16bit).
|
||||
# * then _setup_for_real_optimizer -> _create_fp32_partitions
|
||||
# which performs the fp32 upcasting.
|
||||
|
||||
# To mimic DeepSpeed's casting in FSDP, we look at the (single) FlatParameter held
|
||||
# within an FSDP wrapper. This FlatParameter will be seen by the optimizer.
|
||||
# - even though there is a torch.device('meta') guard below, we
|
||||
# expect _init_utils._init_param_handle_from_module to already
|
||||
# sync the parameter.
|
||||
|
||||
if not module._has_params:
|
||||
continue # skip if FSDP module not managing parameters
|
||||
param = module._flat_param
|
||||
if (
|
||||
param.dtype != torch.float32
|
||||
and param.device != torch.device("meta")
|
||||
and param.requires_grad
|
||||
):
|
||||
# keep log of names_params that was upcasted
|
||||
# NOTE: resorted to this because warnings.simplefilter("once") is somehow not working
|
||||
name_param_log = (module.module.__class__.__name__, ", ".join(module._flat_param._fqns))
|
||||
if name_param_log not in upcasted_log:
|
||||
upcasted_log.append(name_param_log)
|
||||
|
||||
# this works because of FSDP's _runtime_utils.lazy_init.
|
||||
# Have to be careful not to call anything before this that
|
||||
# triggers lazy_init (e.g., _is_fsdp_root).
|
||||
param.data = param.data.to(torch.float32) # upcasting
|
||||
module._handle._orig_param_dtype = torch.float32 # update
|
||||
|
||||
# report the warnings
|
||||
# some messages can be quite repetitive, especially when reporting about layers that have identical architecture.
|
||||
if self.is_main_process:
|
||||
for name_log, param_log in upcasted_log:
|
||||
warnings.warn(
|
||||
f"Upcasted low precision parameters in {name_log} because mixed precision turned on in FSDP. "
|
||||
f"Affects: {param_log}."
|
||||
)
|
||||
|
||||
if len(upcasted_log) > 0:
|
||||
warnings.warn(
|
||||
"FSDP upcast of low precision parameters may affect the precision of model checkpoints."
|
||||
)
|
||||
|
||||
# if the previous and current models are same, delete the previous one
|
||||
if len(self._models) > 1 and (self._models[-2] is self._models[-1]):
|
||||
del self._models[-2]
|
||||
@ -1456,6 +1548,11 @@ class Accelerator:
|
||||
model = torch.nn.parallel.DistributedDataParallel(model, **kwargs)
|
||||
elif self.distributed_type == DistributedType.XLA and self.state.fork_launched:
|
||||
model = xmp.MpModelWrapper(model).to(self.device)
|
||||
# Now we can apply the FP8 autocast
|
||||
if self.delayed_fp8_autocast:
|
||||
model.forward = fp8_autocast(enabled=True, fp8_recipe=fp8_recipe, fp8_group=model.process_group)(
|
||||
model.forward
|
||||
)
|
||||
# torch.compile should be called last and only if the model isn't already compiled.
|
||||
if self.state.dynamo_plugin.backend != DynamoBackend.NO and not is_compiled_module(model):
|
||||
if not is_torch_version(">=", "2.0"):
|
||||
@ -1571,6 +1668,8 @@ class Accelerator:
|
||||
)
|
||||
|
||||
if model is not None:
|
||||
# if the model is an MOE, set the appropriate MOE layers as leaf Z3 modules
|
||||
deepspeed_plugin.set_moe_leaf_modules(model)
|
||||
# deal with config keys that use `auto` value and rely on model's hidden_size
|
||||
hidden_size_based_keys = [
|
||||
"zero_optimization.reduce_bucket_size",
|
||||
@ -1904,6 +2003,7 @@ class Accelerator:
|
||||
even_batches=self.even_batches,
|
||||
slice_fn_for_dispatch=slice_fn_for_dispatch,
|
||||
use_seedable_sampler=self.use_seedable_sampler,
|
||||
non_blocking=self.non_blocking,
|
||||
)
|
||||
self._dataloaders.append(prepared_data_loader)
|
||||
return prepared_data_loader
|
||||
@ -1930,6 +2030,14 @@ class Accelerator:
|
||||
>>> optimizer = accelerator.prepare_optimizer(optimizer, device_placement=True)
|
||||
```
|
||||
"""
|
||||
if is_lomo_available():
|
||||
# We need to import locally to avoid circular imports since lomo imports stuff from
|
||||
# transformers & accelerate
|
||||
from lomo_optim import AdaLomo, Lomo
|
||||
|
||||
# Support multiple optimizers: https://github.com/huggingface/accelerate/pull/2695#discussion_r1589164607
|
||||
self.has_lomo_optimizer |= isinstance(optimizer, (Lomo, AdaLomo))
|
||||
|
||||
# Ensure we can't double wrap an optimizer due to `find_batch_size`
|
||||
if getattr(optimizer, "_is_accelerate_prepared", False):
|
||||
if optimizer not in self._optimizers:
|
||||
@ -2000,6 +2108,8 @@ class Accelerator:
|
||||
>>> accelerator.backward(loss)
|
||||
```
|
||||
"""
|
||||
learning_rate = kwargs.get("learning_rate")
|
||||
|
||||
if self.distributed_type != DistributedType.DEEPSPEED:
|
||||
# deepspeed handles loss scaling by gradient_accumulation_steps in its `backward`
|
||||
loss = loss / self.gradient_accumulation_steps
|
||||
@ -2009,6 +2119,8 @@ class Accelerator:
|
||||
return
|
||||
elif self.scaler is not None:
|
||||
self.scaler.scale(loss).backward(**kwargs)
|
||||
elif learning_rate is not None and self.has_lomo_optimizer:
|
||||
self.lomo_backward(loss, learning_rate)
|
||||
else:
|
||||
loss.backward(**kwargs)
|
||||
|
||||
@ -2216,7 +2328,7 @@ class Accelerator:
|
||||
"""
|
||||
return gather(tensor)
|
||||
|
||||
    def gather_for_metrics(self, input_data):
    def gather_for_metrics(self, input_data, use_gather_object=False):
        """
        Gathers `input_data` and potentially drops duplicates in the last batch if on a distributed system. Should be
        used for gathering the inputs and targets for metric calculation.
@ -2224,6 +2336,11 @@ class Accelerator:
        Args:
            input (`torch.Tensor`, `object`, a nested tuple/list/dictionary of `torch.Tensor`, or a nested tuple/list/dictionary of `object`):
                The tensors or objects for calculating metrics across all processes
            use_gather_object (`bool`):
                Whether to forcibly use gather_object instead of gather (which is already done if all objects passed do
                not contain tensors). This flag can be useful for gathering tensors with different sizes that we don't
                want to pad and concatenate along the first dimension. Using it with GPU tensors is not well supported
                and inefficient, as it incurs a GPU -> CPU transfer since the tensors would be pickled.

        Example:

@ -2248,7 +2365,9 @@ class Accelerator:
|
||||
except TypeError:
|
||||
all_tensors = False
|
||||
|
||||
if not all_tensors:
|
||||
use_gather_object = use_gather_object or not all_tensors
|
||||
|
||||
if use_gather_object:
|
||||
data = gather_object(input_data)
|
||||
else:
|
||||
data = self.gather(input_data)
|
||||
@ -2267,7 +2386,11 @@ class Accelerator:
|
||||
def _adjust_samples(tensor):
|
||||
return tensor[: self.gradient_state.remainder]
|
||||
|
||||
return recursively_apply(_adjust_samples, data)
|
||||
if use_gather_object:
|
||||
# gather_object put the objects in a list
|
||||
return _adjust_samples(data)
|
||||
else:
|
||||
return recursively_apply(_adjust_samples, data)
|
||||
else: # remainder is 0
|
||||
# no remainder even though at end of dataloader, so nothing to do.
|
||||
return data
|
||||
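As an editor's aside, the new `use_gather_object` flag above is easiest to see with a small usage sketch. This is a hedged illustration, not code from the diff: the tensor shapes are made up, and the tensors are kept on CPU because the docstring warns that gathering GPU tensors this way is inefficient.

```python
# Minimal sketch: gather per-process prediction tensors of *different* lengths
# without padding or concatenating them, by forcing the object-gather path.
import torch

from accelerate import Accelerator

accelerator = Accelerator()

# One variable-length tensor per process (the length depends on the rank, purely illustrative).
local_preds = [torch.randn(accelerator.process_index + 1, 4)]

# With use_gather_object=True the inputs are pickled and gathered as a flat Python list,
# so the ragged shapes survive instead of being padded/concatenated along dim 0.
all_preds = accelerator.gather_for_metrics(local_preds, use_gather_object=True)

if accelerator.is_main_process:
    print([tuple(t.shape) for t in all_preds])
```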
@ -2780,7 +2903,7 @@ class Accelerator:
|
||||
for i, model in enumerate(self._models):
|
||||
if self.distributed_type == DistributedType.FSDP:
|
||||
logger.info("Saving FSDP model")
|
||||
save_fsdp_model(self.state.fsdp_plugin, self, model, output_dir, i)
|
||||
save_fsdp_model(self.state.fsdp_plugin, model, output_dir, i)
|
||||
logger.info(f"FSDP Model saved to output dir {output_dir}")
|
||||
elif self.distributed_type == DistributedType.DEEPSPEED:
|
||||
logger.info("Saving DeepSpeed Model and Optimizer")
|
||||
@ -2799,7 +2922,7 @@ class Accelerator:
|
||||
if self.distributed_type == DistributedType.FSDP:
|
||||
for i, opt in enumerate(self._optimizers):
|
||||
logger.info("Saving FSDP Optimizer")
|
||||
save_fsdp_optimizer(self.state.fsdp_plugin, self, opt, self._models[i], output_dir, i)
|
||||
save_fsdp_optimizer(self.state.fsdp_plugin, opt, self._models[i], output_dir, i)
|
||||
logger.info(f"FSDP Optimizer saved to output dir {output_dir}")
|
||||
elif self.distributed_type not in [DistributedType.DEEPSPEED, DistributedType.MEGATRON_LM]:
|
||||
optimizers = self._optimizers
|
||||
@ -2924,7 +3047,7 @@ class Accelerator:
|
||||
for i, model in enumerate(self._models):
|
||||
if self.distributed_type == DistributedType.FSDP:
|
||||
logger.info("Loading FSDP model")
|
||||
load_fsdp_model(self.state.fsdp_plugin, self, model, input_dir, i)
|
||||
load_fsdp_model(self.state.fsdp_plugin, model, input_dir, i)
|
||||
logger.info(f"FSDP Model loaded from input dir {input_dir}")
|
||||
elif self.distributed_type == DistributedType.DEEPSPEED:
|
||||
logger.info("Loading DeepSpeed Model and Optimizer")
|
||||
@ -2943,7 +3066,7 @@ class Accelerator:
|
||||
if self.distributed_type == DistributedType.FSDP:
|
||||
for i, opt in enumerate(self._optimizers):
|
||||
logger.info("Loading FSDP Optimizer")
|
||||
load_fsdp_optimizer(self.state.fsdp_plugin, self, opt, self._models[i], input_dir, i)
|
||||
load_fsdp_optimizer(self.state.fsdp_plugin, opt, self._models[i], input_dir, i)
|
||||
logger.info(f"FSDP Optimizer loaded from input dir {input_dir}")
|
||||
elif self.distributed_type not in [DistributedType.DEEPSPEED, DistributedType.MEGATRON_LM]:
|
||||
optimizers = self._optimizers
|
||||
@ -3002,7 +3125,7 @@ class Accelerator:
|
||||
for index, obj in enumerate(self._custom_objects):
|
||||
load_custom_state(obj, input_dir, index)
|
||||
|
||||
def free_memory(self):
|
||||
def free_memory(self, *objects):
|
||||
"""
|
||||
Will release all references to the internal objects stored and call the garbage collector. You should call this
|
||||
method between two trainings with different models/optimizers. Also will reset `Accelerator.step` to 0.
|
||||
@ -3015,19 +3138,23 @@ class Accelerator:
|
||||
>>> accelerator = Accelerator()
|
||||
>>> model, optimizer, scheduler = ...
|
||||
>>> model, optimizer, scheduler = accelerator.prepare(model, optimizer, scheduler)
|
||||
>>> accelerator.free_memory()
|
||||
>>> del model, optimizer, scheduler
|
||||
>>> model, optimizer, scheduler = accelerator.free_memory(model, optimizer, scheduler)
|
||||
```
|
||||
"""
|
||||
# Deepspeed needs a bit more prep that should be done first
|
||||
if hasattr(self, "deepspeed_engine_wrapped"):
|
||||
if self.deepspeed_engine_wrapped is not None:
|
||||
self.deepspeed_engine_wrapped.engine.destroy()
|
||||
self.deepspeed_engine_wrapped = None
|
||||
objects = release_memory(*objects)
|
||||
self._schedulers = []
|
||||
self._optimizers = []
|
||||
self._models = []
|
||||
self._dataloaders = []
|
||||
self.deepspeed_engine_wrapped = None
|
||||
self.step = 0
|
||||
release_memory()
|
||||
return objects
|
||||
|
||||
def clear(self):
|
||||
def clear(self, *objects):
|
||||
"""
|
||||
Alias for [`Accelerator.free_memory`], releases all references to the internal objects stored and calls the
garbage collector. You should call this method between two trainings with different models/optimizers.
|
||||
@ -3040,11 +3167,10 @@ class Accelerator:
|
||||
>>> accelerator = Accelerator()
|
||||
>>> model, optimizer, scheduler = ...
|
||||
>>> model, optimizer, scheduler = accelerator.prepare(model, optimizer, scheduler)
|
||||
>>> accelerator.free_memory()
|
||||
>>> del model, optimizer, scheduler
|
||||
>>> model, optimizer, scheduler = accelerator.clear(model, optimizer, scheduler)
|
||||
```
|
||||
"""
|
||||
self.free_memory()
|
||||
return self.free_memory(*objects)
|
||||
|
||||
def _get_named_parameters(self, *args):
|
||||
named_parameters = {}
|
||||
@ -3257,3 +3383,27 @@ class Accelerator:
|
||||
return True
|
||||
|
||||
return False
|
||||
|
||||
def lomo_backward(self, loss: torch.Tensor, learning_rate: float) -> None:
|
||||
"""
|
||||
Runs backward pass on LOMO optimizers.
|
||||
"""
|
||||
if is_lomo_available():
|
||||
# We need to import locally to avoid circular imports since lomo imports stuff from
|
||||
# transformers & accelerate
|
||||
from lomo_optim import AdaLomo, Lomo
|
||||
|
||||
if learning_rate is None:
|
||||
raise ValueError("A learning rate must be passed in order to call backward pass with LOMO optimizers.")
|
||||
|
||||
_backward_called = False
|
||||
|
||||
for optimizer in self._optimizers:
|
||||
if isinstance(optimizer.optimizer, (Lomo, AdaLomo)):
|
||||
optimizer.optimizer.fused_backward(loss, learning_rate)
|
||||
_backward_called = True
|
||||
|
||||
if not _backward_called:
|
||||
raise ValueError(
|
||||
"Backward pass not properly called on LOMO optimizers. Are you sure you passed a LOMO optimizer in accelerator.prepare()?"
|
||||
)
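A hedged sketch of the training-loop pattern this change enables: passing `learning_rate` to `accelerator.backward` routes the call to `lomo_backward`, which invokes the optimizer's `fused_backward`, and `optimizer.step()` then becomes a no-op. The `Lomo(model, lr=...)` constructor is an assumption about the `lomo_optim` API (it is not shown in this diff).

```python
# Hedged sketch: driving a LOMO optimizer through `accelerator.backward(loss, learning_rate=...)`.
import torch
from accelerate import Accelerator
from lomo_optim import Lomo  # requires the `lomo_optim` package

accelerator = Accelerator()
model = torch.nn.Linear(4, 1)
optimizer = Lomo(model, lr=1e-3)  # assumed constructor signature, not part of this diff
model, optimizer = accelerator.prepare(model, optimizer)

batch = torch.randn(8, 4, device=accelerator.device)
loss = model(batch).pow(2).mean()
# With a LOMO optimizer prepared, `backward` dispatches to `lomo_backward`, which calls
# `fused_backward(loss, learning_rate)`; `optimizer.step()` is then a no-op.
accelerator.backward(loss, learning_rate=1e-3)
optimizer.step()
```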
@ -508,6 +508,7 @@ def load_checkpoint_and_dispatch(
|
||||
skip_keys: Optional[Union[str, List[str]]] = None,
|
||||
preload_module_classes: Optional[List[str]] = None,
|
||||
force_hooks: bool = False,
|
||||
strict: bool = False,
|
||||
):
|
||||
"""
|
||||
Loads a (potentially sharded) checkpoint inside a model, potentially sending weights to a given device as they are
|
||||
@ -554,6 +555,9 @@ def load_checkpoint_and_dispatch(
|
||||
force_hooks (`bool`, *optional*, defaults to `False`):
|
||||
Whether or not to force device hooks to be attached to the model even if all layers are dispatched to a
|
||||
single device.
|
||||
strict (`bool`, *optional*, defaults to `False`):
|
||||
Whether to strictly enforce that the keys in the checkpoint state_dict match the keys of the model's
|
||||
state_dict.
|
||||
|
||||
Example:
|
||||
|
||||
@ -608,6 +612,7 @@ def load_checkpoint_and_dispatch(
|
||||
dtype=dtype,
|
||||
offload_state_dict=offload_state_dict,
|
||||
offload_buffers=offload_buffers,
|
||||
strict=strict,
|
||||
)
|
||||
if device_map is None:
|
||||
return model
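A small, hedged sketch of the new `strict` flag: with `strict=False` (the default) mismatched state-dict keys are tolerated, while `strict=True` enforces an exact match. The checkpoint file here is created on the spot purely for illustration.

```python
# Hedged sketch of `load_checkpoint_and_dispatch(..., strict=...)`.
import torch
from accelerate import load_checkpoint_and_dispatch

model = torch.nn.Sequential(torch.nn.Linear(16, 16), torch.nn.Linear(16, 2))
torch.save(model.state_dict(), "checkpoint.bin")  # illustrative checkpoint written for this example

model = load_checkpoint_and_dispatch(
    model,
    checkpoint="checkpoint.bin",
    device_map="auto",
    strict=False,  # set True to require an exact key match with the model's state_dict
)
```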
@ -120,8 +120,7 @@ def save_accelerator_state(
|
||||
from .data_loader import IterableDatasetShard, SeedableRandomSampler
|
||||
|
||||
if isinstance(dataloader.dataset, IterableDatasetShard):
|
||||
sampler = dataloader.sampler.sampler
|
||||
|
||||
sampler = dataloader.get_sampler()
|
||||
if isinstance(sampler, SeedableRandomSampler):
|
||||
save(sampler, output_sampler_file, save_on_each_node=save_on_each_node, safe_serialization=False)
|
||||
logger.info(f"Sampler state for dataloader {i} saved in {output_sampler_file}")
|
||||
@ -227,10 +226,9 @@ def load_accelerator_state(
|
||||
from .data_loader import IterableDatasetShard, SeedableRandomSampler
|
||||
|
||||
if isinstance(dataloader.dataset, IterableDatasetShard):
|
||||
sampler = dataloader.sampler.sampler
|
||||
|
||||
sampler = dataloader.get_sampler()
|
||||
if isinstance(sampler, SeedableRandomSampler):
|
||||
dataloader.sampler.sampler = torch.load(input_sampler_file)
|
||||
sampler = dataloader.set_sampler(torch.load(input_sampler_file))
|
||||
logger.info("All dataloader sampler states loaded successfully")
|
||||
|
||||
# GradScaler state
|
||||
|
||||
@ -298,6 +298,18 @@ def get_cluster_input():
|
||||
"When `zero3_init_flag` is set, it requires Transformers to be installed. "
|
||||
"Please run `pip3 install transformers`."
|
||||
)
|
||||
use_moe = _ask_field(
|
||||
"Do you want to enable Mixture-of-Experts training (MoE)? [yes/NO]: ",
|
||||
_convert_yes_no_to_bool,
|
||||
default=False,
|
||||
error_message="Please enter yes or no.",
|
||||
)
|
||||
if use_moe:
|
||||
deepspeed_config["deepspeed_moe_layer_cls_names"] = _ask_field(
|
||||
"Specify the comma-separated list of transformers MoE layer class names (case-sensitive), e.g : "
|
||||
" `MixtralSparseMoeBlock`, `Qwen2MoeSparseMoeBlock`, `JetMoEAttention,JetMoEBlock` ... : ",
|
||||
str,
|
||||
)
|
||||
|
||||
if num_machines > 1:
|
||||
launcher_query = "Which Type of launcher do you want to use?"
|
||||
@ -567,7 +579,7 @@ def get_cluster_input():
|
||||
|
||||
# CPU affinity is only supported on NVIDIA hardware for now
|
||||
enable_cpu_affinity = False
|
||||
if distributed_type == (DistributedType.NO, DistributedType.MULTI_GPU) and not use_cpu and not use_mps:
|
||||
if distributed_type in (DistributedType.NO, DistributedType.MULTI_GPU) and not use_cpu and not use_mps:
|
||||
enable_cpu_affinity = _ask_field(
|
||||
"Would you like to enable numa efficiency? (Currently only supported on NVIDIA hardware). [yes/NO]: ",
|
||||
_convert_yes_no_to_bool,
|
||||
|
||||
@ -241,3 +241,4 @@ class SageMakerConfig(BaseConfig):
|
||||
sagemaker_metrics_file: str = None
|
||||
additional_args: dict = None
|
||||
dynamo_config: dict = None
|
||||
enable_cpu_affinity: bool = False
|
||||
|
||||
@ -95,6 +95,7 @@ def write_basic_config(mixed_precision="no", save_location: str = default_json_c
|
||||
config["num_processes"] = 1
|
||||
config["distributed_type"] = "NO"
|
||||
config["debug"] = False
|
||||
config["enable_cpu_affinity"] = False
|
||||
config = ClusterConfig(**config)
|
||||
config.to_json_file(path)
|
||||
return path
|
||||
|
||||
@ -79,6 +79,8 @@ def env_command(args):
|
||||
}
|
||||
if pt_cuda_available:
|
||||
info["GPU type"] = torch.cuda.get_device_name()
|
||||
if pt_npu_available:
|
||||
info["CANN version"] = torch.version.cann
|
||||
|
||||
print("\nCopy-and-paste the text below in your GitHub issue\n")
|
||||
print("\n".join([f"- {prop}: {val}" for prop, val in info.items()]))
|
||||
|
||||
@ -303,6 +303,15 @@ def launch_command_parser(subparsers=None):
|
||||
type=str,
|
||||
help="Tee std streams into a log file and also to console.",
|
||||
)
|
||||
distributed_args.add_argument(
|
||||
"--log_dir",
|
||||
type=str,
|
||||
default=None,
|
||||
help=(
|
||||
"Base directory to use for log files when using torchrun/torch.distributed.run as launcher. "
|
||||
"Use with --tee to redirect std streams info log files."
|
||||
),
|
||||
)
|
||||
distributed_args.add_argument(
|
||||
"--role",
|
||||
type=str,
|
||||
@ -487,6 +496,13 @@ def launch_command_parser(subparsers=None):
|
||||
type=str,
|
||||
help="DeepSpeed multi-node launcher to use. If unspecified, will default to `pdsh`.",
|
||||
)
|
||||
deepspeed_args.add_argument(
|
||||
"--deepspeed_moe_layer_cls_names",
|
||||
default=None,
|
||||
type=str,
|
||||
help="comma-separated list of transformer MoE layer class names (case-sensitive) to wrap ,e.g, `MixtralSparseMoeBlock`, `Qwen2MoeSparseMoeBlock`, `JetMoEAttention,JetMoEBlock` ..."
|
||||
" (useful only when `use_deepspeed` flag is passed).",
|
||||
)
|
||||
|
||||
# fsdp arguments
|
||||
fsdp_args = parser.add_argument_group("FSDP Arguments", "Arguments related to Fully Sharded Data Parallelism.")
|
||||
@ -1027,8 +1043,8 @@ def _validate_launch_command(args):
|
||||
defaults is not None and defaults.compute_environment != ComputeEnvironment.AMAZON_SAGEMAKER
|
||||
)
|
||||
if is_aws_env_disabled and args.num_cpu_threads_per_process is None:
|
||||
args.num_cpu_threads_per_process = 1
|
||||
if args.use_cpu and args.num_processes >= 1:
|
||||
args.num_cpu_threads_per_process = get_int_from_env(["OMP_NUM_THREADS"], 1)
|
||||
if args.use_cpu and args.num_processes >= 1 and get_int_from_env(["OMP_NUM_THREADS"], 0) == 0:
|
||||
local_size = get_int_from_env(
|
||||
["MPI_LOCALNRANKS", "OMPI_COMM_WORLD_LOCAL_SIZE", "MV2_COMM_WORLD_LOCAL_SIZE"], 1
|
||||
)
|
||||
|
||||
@ -429,6 +429,7 @@ class DataLoaderShard(DataLoader, DataLoaderStateMixin):
|
||||
synchronized_generator=None,
|
||||
skip_batches=0,
|
||||
_drop_last: bool = False,
|
||||
_non_blocking: bool = False,
|
||||
**kwargs,
|
||||
):
|
||||
super().__init__(dataset, **kwargs)
|
||||
@ -438,6 +439,7 @@ class DataLoaderShard(DataLoader, DataLoaderStateMixin):
|
||||
self.skip_batches = skip_batches
|
||||
self.gradient_state = GradientState()
|
||||
self._drop_last = _drop_last
|
||||
self._non_blocking = _non_blocking
|
||||
self.iteration = 0
|
||||
|
||||
def __iter__(self):
|
||||
@ -458,7 +460,7 @@ class DataLoaderShard(DataLoader, DataLoaderStateMixin):
|
||||
try:
|
||||
# But we still move it to the device so it is done before `StopIteration` is reached
|
||||
if self.device is not None:
|
||||
current_batch = send_to_device(current_batch, self.device)
|
||||
current_batch = send_to_device(current_batch, self.device, non_blocking=self._non_blocking)
|
||||
next_batch = next(dataloader_iter)
|
||||
if batch_index >= self.skip_batches:
|
||||
yield current_batch
|
||||
@ -500,6 +502,18 @@ class DataLoaderShard(DataLoader, DataLoaderStateMixin):
|
||||
else:
|
||||
return len(self.dataset)
|
||||
|
||||
def get_sampler(self):
|
||||
return get_sampler(self)
|
||||
|
||||
def set_sampler(self, sampler):
|
||||
sampler_is_batch_sampler = isinstance(self.sampler, BatchSampler)
|
||||
if sampler_is_batch_sampler:
|
||||
self.sampler.sampler = sampler
|
||||
else:
|
||||
self.batch_sampler.sampler = sampler
|
||||
if hasattr(self.batch_sampler, "batch_sampler"):
|
||||
self.batch_sampler.batch_sampler.sampler = sampler
|
||||
|
||||
|
||||
if is_torch_xla_available():
|
||||
import torch_xla.distributed.parallel_loader as xpl
|
||||
@ -571,7 +585,14 @@ class DataLoaderDispatcher(DataLoader, DataLoaderStateMixin):
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self, dataset, split_batches: bool = False, skip_batches=0, _drop_last: bool = False, slice_fn=None, **kwargs
|
||||
self,
|
||||
dataset,
|
||||
split_batches: bool = False,
|
||||
skip_batches=0,
|
||||
_drop_last: bool = False,
|
||||
_non_blocking: bool = False,
|
||||
slice_fn=None,
|
||||
**kwargs,
|
||||
):
|
||||
shuffle = False
|
||||
if is_torch_version(">=", "1.11.0"):
|
||||
@ -588,6 +609,7 @@ class DataLoaderDispatcher(DataLoader, DataLoaderStateMixin):
|
||||
self.gradient_state = GradientState()
|
||||
self.state = AcceleratorState()
|
||||
self._drop_last = _drop_last
|
||||
self._non_blocking = _non_blocking
|
||||
self.skip_batches = skip_batches
|
||||
|
||||
self.slice_fn = slice_tensors if slice_fn is None else slice_fn
|
||||
@ -660,7 +682,7 @@ class DataLoaderDispatcher(DataLoader, DataLoaderStateMixin):
|
||||
if self.state.process_index != 0:
|
||||
# Initialize tensors on other processes than process 0.
|
||||
batch = initialize_tensors(batch_info[0])
|
||||
batch = send_to_device(batch, self.state.device)
|
||||
batch = send_to_device(batch, self.state.device, non_blocking=self._non_blocking)
|
||||
# Broadcast the batch before splitting it.
|
||||
batch = broadcast(batch, from_process=0)
|
||||
|
||||
@ -741,6 +763,36 @@ class DataLoaderDispatcher(DataLoader, DataLoaderStateMixin):
|
||||
def total_dataset_length(self):
|
||||
return len(self.dataset)
|
||||
|
||||
def get_sampler(self):
|
||||
return get_sampler(self)
|
||||
|
||||
def set_sampler(self, sampler):
|
||||
sampler_is_batch_sampler = isinstance(self.sampler, BatchSampler)
|
||||
if sampler_is_batch_sampler:
|
||||
self.sampler.sampler = sampler
|
||||
else:
|
||||
self.batch_sampler.sampler = sampler
|
||||
if hasattr(self.batch_sampler, "batch_sampler"):
|
||||
self.batch_sampler.batch_sampler.sampler = sampler
|
||||
|
||||
|
||||
def get_sampler(dataloader):
|
||||
"""
|
||||
Get the sampler associated to the dataloader
|
||||
|
||||
Args:
|
||||
dataloader (`torch.utils.data.dataloader.DataLoader`):
|
||||
The data loader to split across several devices.
|
||||
Returns:
|
||||
`torch.utils.data.Sampler`: The sampler associated to the dataloader
|
||||
"""
|
||||
sampler_is_batch_sampler = isinstance(dataloader.sampler, BatchSampler)
|
||||
if sampler_is_batch_sampler:
|
||||
sampler = getattr(dataloader.sampler, "sampler", None)
|
||||
else:
|
||||
sampler = getattr(dataloader.batch_sampler, "sampler", None)
|
||||
return sampler
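A hedged sketch of the new `get_sampler`/`set_sampler` helpers exposed on prepared dataloaders: they resolve the underlying sampler whether the `DataLoader` was built from a plain sampler or a `BatchSampler`, which is what the checkpoint save/load code now relies on. Run on a single process this is a straightforward attribute swap; the exact wrapper class returned by `prepare` depends on the configuration.

```python
# Hedged sketch of `get_sampler()` / `set_sampler()` on a prepared dataloader.
import torch
from torch.utils.data import DataLoader, RandomSampler, TensorDataset
from accelerate import Accelerator

accelerator = Accelerator()
dataset = TensorDataset(torch.arange(32).float())
loader = accelerator.prepare(DataLoader(dataset, batch_size=4, shuffle=True))

sampler = loader.get_sampler()  # the underlying RandomSampler
# Swap in a deterministically seeded sampler, e.g. to replay an epoch.
loader.set_sampler(RandomSampler(dataset, generator=torch.Generator().manual_seed(0)))
```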
|
||||
|
||||
|
||||
def prepare_data_loader(
|
||||
dataloader: DataLoader,
|
||||
@ -754,6 +806,7 @@ def prepare_data_loader(
|
||||
even_batches: bool = True,
|
||||
slice_fn_for_dispatch: Optional[Callable] = None,
|
||||
use_seedable_sampler: bool = False,
|
||||
non_blocking: bool = False,
|
||||
) -> DataLoader:
|
||||
"""
|
||||
Wraps a PyTorch `DataLoader` to generate batches for one of the processes only.
|
||||
@ -812,6 +865,10 @@ def prepare_data_loader(
|
||||
reproducibility. Comes at a cost of potentially different performances due to different shuffling
|
||||
algorithms but ensures results will be the *exact* same. Should be paired with `set_seed()` at every
|
||||
`self.set_epoch`
|
||||
non_blocking (`bool`, *optional*, defaults to `False`):
|
||||
If set to `True`, dataloader will utilize non-blocking host-to-device transfers. If the dataloader has
|
||||
`pin_memory` set to `True`, this will help to increase overlap between data transfer and computations.
|
||||
|
||||
|
||||
Returns:
|
||||
`torch.utils.data.dataloader.DataLoader`: A new data loader that will yield the portion of the batches
|
||||
@ -863,13 +920,10 @@ def prepare_data_loader(
|
||||
new_dataset = dataloader.dataset
|
||||
# Iterable dataset doesn't like batch_sampler, but data_loader creates a default one for it
|
||||
new_batch_sampler = dataloader.batch_sampler if not isinstance(new_dataset, IterableDataset) else None
|
||||
sampler_is_batch_sampler = False
|
||||
synchronized_generator = None
|
||||
sampler_is_batch_sampler = isinstance(dataloader.sampler, BatchSampler)
|
||||
if sampler_is_batch_sampler:
|
||||
sampler = getattr(dataloader.sampler, "sampler", None)
|
||||
else:
|
||||
sampler = getattr(dataloader.batch_sampler, "sampler", None)
|
||||
synchronized_generator = None
|
||||
|
||||
sampler = get_sampler(dataloader)
|
||||
if isinstance(sampler, RandomSampler) and use_seedable_sampler:
|
||||
# When iterating through the dataloader during distributed processes
|
||||
# we want to ensure that on each process we are iterating through the same
|
||||
@ -901,6 +955,10 @@ def prepare_data_loader(
|
||||
split_batches=split_batches,
|
||||
)
|
||||
else:
|
||||
if not use_seedable_sampler and hasattr(sampler, "generator"):
|
||||
if sampler.generator is None:
|
||||
sampler.generator = torch.Generator()
|
||||
synchronized_generator = sampler.generator
|
||||
batch_sampler = dataloader.sampler if sampler_is_batch_sampler else dataloader.batch_sampler
|
||||
new_batch_sampler = BatchSamplerShard(
|
||||
batch_sampler,
|
||||
@ -941,6 +999,7 @@ def prepare_data_loader(
|
||||
split_batches=split_batches,
|
||||
batch_sampler=new_batch_sampler,
|
||||
_drop_last=dataloader.drop_last,
|
||||
_non_blocking=non_blocking,
|
||||
slice_fn=slice_fn_for_dispatch,
|
||||
**kwargs,
|
||||
)
|
||||
@ -952,6 +1011,7 @@ def prepare_data_loader(
|
||||
batch_size=dataloader.batch_size,
|
||||
rng_types=rng_types,
|
||||
_drop_last=dataloader.drop_last,
|
||||
_non_blocking=non_blocking,
|
||||
synchronized_generator=synchronized_generator,
|
||||
**kwargs,
|
||||
)
|
||||
@ -963,16 +1023,12 @@ def prepare_data_loader(
|
||||
rng_types=rng_types,
|
||||
synchronized_generator=synchronized_generator,
|
||||
_drop_last=dataloader.drop_last,
|
||||
_non_blocking=non_blocking,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
if isinstance(sampler, SeedableRandomSampler) and use_seedable_sampler:
|
||||
if sampler_is_batch_sampler:
|
||||
dataloader.sampler.sampler = sampler
|
||||
else:
|
||||
dataloader.batch_sampler.sampler = sampler
|
||||
if hasattr(dataloader.batch_sampler, "batch_sampler"):
|
||||
dataloader.batch_sampler.batch_sampler.sampler = sampler
|
||||
dataloader.set_sampler(sampler)
|
||||
if state.distributed_type == DistributedType.XLA:
|
||||
return MpDeviceLoaderWrapper(dataloader, device)
|
||||
return dataloader
|
||||
|
||||
@ -54,6 +54,8 @@ class MultiProcessAdapter(logging.LoggerAdapter):
|
||||
)
|
||||
main_process_only = kwargs.pop("main_process_only", True)
|
||||
in_order = kwargs.pop("in_order", False)
|
||||
# set `stacklevel` to exclude ourself in `Logger.findCaller()` while respecting user's choice
|
||||
kwargs.setdefault("stacklevel", 2)
|
||||
|
||||
if self.isEnabledFor(level):
|
||||
if self._should_log(main_process_only):
|
||||
|
||||
@ -18,7 +18,7 @@ import warnings
|
||||
import torch
|
||||
|
||||
from .state import AcceleratorState, GradientState
|
||||
from .utils import DistributedType, honor_type, is_torch_xla_available
|
||||
from .utils import DistributedType, honor_type, is_lomo_available, is_torch_xla_available
|
||||
|
||||
|
||||
if is_torch_xla_available():
|
||||
@ -121,7 +121,22 @@ class AcceleratedOptimizer(torch.optim.Optimizer):
|
||||
raise ValueError("`set_to_none` for Optimizer.zero_grad` is not supported by this optimizer.")
|
||||
self.optimizer.zero_grad()
|
||||
|
||||
def train(self):
|
||||
"""
|
||||
Sets the optimizer to "train" mode. Useful for optimizers like `schedule_free`
|
||||
"""
|
||||
return self.optimizer.train()
|
||||
|
||||
def eval(self):
|
||||
"""
|
||||
Sets the optimizer to "eval" mode. Useful for optimizers like `schedule_free`
|
||||
"""
|
||||
return self.optimizer.eval()
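A hedged sketch of the new `train()`/`eval()` passthroughs, which forward to the wrapped optimizer. It assumes the `schedulefree` package and its `AdamWScheduleFree` optimizer, neither of which is part of this diff.

```python
# Hedged sketch: toggling a schedule-free optimizer through the AcceleratedOptimizer wrapper.
import torch
import schedulefree  # assumed third-party dependency
from accelerate import Accelerator

accelerator = Accelerator()
model = torch.nn.Linear(4, 2)
optimizer = schedulefree.AdamWScheduleFree(model.parameters(), lr=1e-3)
model, optimizer = accelerator.prepare(model, optimizer)

optimizer.train()  # forwarded to the wrapped schedule-free optimizer before training steps
# ... training loop ...
optimizer.eval()   # and back to eval mode before validation
```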
def step(self, closure=None):
|
||||
if is_lomo_available():
|
||||
from lomo_optim import AdaLomo, Lomo
|
||||
|
||||
if (
|
||||
not self.gradient_state.is_xla_gradients_synced
|
||||
and self.accelerator_state.distributed_type == DistributedType.XLA
|
||||
@ -129,6 +144,12 @@ class AcceleratedOptimizer(torch.optim.Optimizer):
|
||||
gradients = xm._fetch_gradients(self.optimizer)
|
||||
xm.all_reduce("sum", gradients, scale=1.0 / xm.xrt_world_size())
|
||||
self.gradient_state.is_xla_gradients_synced = True
|
||||
|
||||
if is_lomo_available():
|
||||
# `step` should be a no-op for LOMO optimizers.
|
||||
if isinstance(self.optimizer, (Lomo, AdaLomo)):
|
||||
return
|
||||
|
||||
if self.gradient_state.sync_gradients:
|
||||
if self.scaler is not None:
|
||||
self.optimizer.step = self._optimizer_patched_step_method
|
||||
|
||||
@ -179,22 +179,14 @@ class PartialState:
|
||||
)
|
||||
|
||||
# Sets up self.backend + imports
|
||||
backend, distributed_type = self._prepare_backend(cpu, use_sagemaker_dp, kwargs.pop("backend", None))
|
||||
original_backend = kwargs.pop("backend", None)
|
||||
backend, distributed_type = self._prepare_backend(cpu, use_sagemaker_dp, original_backend)
|
||||
if original_backend is not None and backend != original_backend:
|
||||
raise ValueError("Your assigned backend {original_backend} is not avaliable, please use {backend}")
|
||||
self.backend = backend
|
||||
self.distributed_type = distributed_type
|
||||
use_deepspeed = False
|
||||
if not cpu:
|
||||
# Deal with XLA
|
||||
if is_torch_xla_available():
|
||||
self.device = xm.xla_device()
|
||||
xm.set_replication(self.device, xm.get_xla_supported_devices())
|
||||
self.num_processes = xm.xrt_world_size()
|
||||
self.process_index = xm.get_ordinal()
|
||||
if is_torch_xla_available(check_is_tpu=True):
|
||||
self.local_process_index = xm.get_local_ordinal()
|
||||
else:
|
||||
self.local_process_index = int(os.environ.get("LOCAL_RANK", -1))
|
||||
self.distributed_type = DistributedType.XLA
|
||||
if not cpu and self.backend != "xla":
|
||||
if int(os.environ.get("LOCAL_RANK", -1)) != -1:
|
||||
# Deal with spawning deepspeed
|
||||
if os.environ.get("ACCELERATE_USE_DEEPSPEED", "false") == "true":
|
||||
@ -204,7 +196,7 @@ class PartialState:
|
||||
)
|
||||
from deepspeed import comm as dist
|
||||
|
||||
if is_xpu_available and is_ccl_available():
|
||||
if is_xpu_available() and is_ccl_available():
|
||||
os.environ["CCL_PROCESS_LAUNCHER"] = "none"
|
||||
os.environ["CCL_LOCAL_SIZE"] = os.environ.get("LOCAL_WORLD_SIZE", "1")
|
||||
os.environ["CCL_LOCAL_RANK"] = os.environ.get("LOCAL_RANK", "0")
|
||||
@ -246,7 +238,7 @@ class PartialState:
|
||||
|
||||
if (
|
||||
self.distributed_type == DistributedType.MULTI_CPU
|
||||
and get_int_from_env(["OMP_NUM_THREADS", "OMP_NUM_THREADS"], 0) > 0
|
||||
and get_int_from_env(["OMP_NUM_THREADS"], 0) == 0
|
||||
):
|
||||
import psutil
|
||||
|
||||
@ -270,6 +262,16 @@ class PartialState:
|
||||
self.num_processes = 1
|
||||
self.process_index = 0
|
||||
self.local_process_index = 0
|
||||
elif self.backend == "xla":
|
||||
# XLA needs device setting first for `set_replication`
|
||||
self.set_device()
|
||||
xm.set_replication(self.device, xm.get_xla_supported_devices())
|
||||
self.num_processes = xm.xrt_world_size()
|
||||
self.process_index = xm.get_ordinal()
|
||||
if is_torch_xla_available(check_is_tpu=True):
|
||||
self.local_process_index = xm.get_local_ordinal()
|
||||
else:
|
||||
self.local_process_index = int(os.environ.get("LOCAL_RANK", -1))
|
||||
else:
|
||||
self.num_processes = torch.distributed.get_world_size()
|
||||
self.process_index = torch.distributed.get_rank()
|
||||
@ -284,16 +286,17 @@ class PartialState:
|
||||
# Set CPU affinity if enabled
|
||||
if parse_flag_from_env("ACCELERATE_CPU_AFFINITY", False):
|
||||
set_numa_affinity(self.local_process_index)
|
||||
self.fork_launched = parse_flag_from_env("FORK_LAUNCHED", 0)
|
||||
|
||||
# Check for old RTX 4000's that can't use P2P or IB and are on old drivers
|
||||
if self.device.type == "cuda" and not check_cuda_p2p_ib_support():
|
||||
if "NCCL_P2P_DISABLE" not in os.environ or "NCCL_IB_DISABLE" not in os.environ:
|
||||
raise NotImplementedError(
|
||||
"Using RTX 4000 series doesn't support faster communication broadband via P2P or IB. "
|
||||
'Please set `NCCL_P2P_DISABLE="1"` and `NCCL_IB_DISABLE="1" or use `accelerate launch` which '
|
||||
"will do this automatically."
|
||||
)
|
||||
# Check for old RTX 4000's that can't use P2P or IB and are on old drivers
|
||||
if self.device.type == "cuda" and not check_cuda_p2p_ib_support():
|
||||
if "NCCL_P2P_DISABLE" not in os.environ or "NCCL_IB_DISABLE" not in os.environ:
|
||||
raise NotImplementedError(
|
||||
"Using RTX 4000 series doesn't support faster communication broadband via P2P or IB. "
|
||||
'Please set `NCCL_P2P_DISABLE="1"` and `NCCL_IB_DISABLE="1" or use `accelerate launch` which '
|
||||
"will do this automatically."
|
||||
)
|
||||
# Important: This should be the *only* code outside of `self.initialized`!
|
||||
self.fork_launched = parse_flag_from_env("FORK_LAUNCHED", 0)
|
||||
|
||||
def __repr__(self) -> str:
|
||||
return (
|
||||
@ -715,19 +718,22 @@ class PartialState:
|
||||
|
||||
backend = "smddp"
|
||||
distributed_type = DistributedType.MULTI_GPU
|
||||
elif int(os.environ.get("LOCAL_RANK", -1)) != -1:
|
||||
if not cpu:
|
||||
if is_mlu_available():
|
||||
backend = "cncl"
|
||||
distributed_type = DistributedType.MULTI_MLU
|
||||
elif torch.cuda.is_available():
|
||||
if backend is None:
|
||||
backend = "nccl"
|
||||
distributed_type = DistributedType.MULTI_GPU
|
||||
elif is_npu_available():
|
||||
backend = "hccl"
|
||||
distributed_type = DistributedType.MULTI_NPU
|
||||
if backend is None and (
|
||||
elif is_torch_xla_available():
|
||||
backend = "xla"
|
||||
distributed_type = DistributedType.XLA
|
||||
elif int(os.environ.get("LOCAL_RANK", -1)) != -1 and not cpu:
|
||||
if is_mlu_available():
|
||||
backend = "cncl"
|
||||
distributed_type = DistributedType.MULTI_MLU
|
||||
elif torch.cuda.is_available():
|
||||
if backend is None:
|
||||
backend = "nccl"
|
||||
distributed_type = DistributedType.MULTI_GPU
|
||||
elif is_npu_available():
|
||||
backend = "hccl"
|
||||
distributed_type = DistributedType.MULTI_NPU
|
||||
|
||||
if distributed_type is None and (
|
||||
int(os.environ.get("LOCAL_RANK", -1)) != -1
|
||||
or get_int_from_env(["PMI_SIZE", "OMPI_COMM_WORLD_SIZE", "MV2_COMM_WORLD_SIZE", "WORLD_SIZE"], 1) > 1
|
||||
):
|
||||
@ -735,8 +741,11 @@ class PartialState:
|
||||
distributed_type = DistributedType.MULTI_XPU
|
||||
else:
|
||||
distributed_type = DistributedType.MULTI_CPU
|
||||
if is_ccl_available() and (
|
||||
get_int_from_env(["CCL_WORKER_COUNT"], 0) > 0 or distributed_type == DistributedType.MULTI_XPU
|
||||
|
||||
if (
|
||||
backend in (None, "ccl")
|
||||
and is_ccl_available()
|
||||
and (get_int_from_env(["CCL_WORKER_COUNT"], 0) > 0 or distributed_type == DistributedType.MULTI_XPU)
|
||||
):
|
||||
if get_ccl_version() >= "1.12":
|
||||
import oneccl_bindings_for_pytorch # noqa: F401
|
||||
@ -744,12 +753,13 @@ class PartialState:
|
||||
import torch_ccl # noqa: F401
|
||||
|
||||
backend = "ccl"
|
||||
elif torch.distributed.is_mpi_available():
|
||||
elif backend in (None, "mpi") and torch.distributed.is_mpi_available():
|
||||
backend = "mpi"
|
||||
else:
|
||||
backend = "gloo"
|
||||
if distributed_type is None:
|
||||
distributed_type = DistributedType.NO
|
||||
|
||||
return backend, distributed_type
|
||||
|
||||
def set_device(self):
|
||||
@ -758,17 +768,20 @@ class PartialState:
|
||||
"""
|
||||
if self.device is not None:
|
||||
return
|
||||
if self.num_processes == 1:
|
||||
if self.distributed_type == DistributedType.NO:
|
||||
self.device = torch.device("cpu") if self._cpu else self.default_device
|
||||
return
|
||||
device = str(self.distributed_type).split(".")[-1].replace("MULTI_", "").lower()
|
||||
if device not in ("cpu", "gpu", "mlu", "npu", "xpu"):
|
||||
if device not in ("cpu", "gpu", "mlu", "npu", "xpu", "xla"):
|
||||
raise ValueError(
|
||||
f"Can't set device for {self.distributed_type} ({device}), verify we should be calling `_set_device()` for it!"
|
||||
)
|
||||
if device == "gpu":
|
||||
device = "cuda"
|
||||
self.device = torch.device(device, self.local_process_index)
|
||||
if device == "xla":
|
||||
self.device = xm.xla_device()
|
||||
else:
|
||||
if device == "gpu":
|
||||
device = "cuda"
|
||||
self.device = torch.device(device, self.local_process_index)
|
||||
if self.device is not None:
|
||||
if device == "xpu":
|
||||
torch.xpu.set_device(self.device)
|
||||
@ -893,7 +906,6 @@ class AcceleratorState:
|
||||
fsdp_plugin.set_mixed_precision(self._mixed_precision)
|
||||
self.fsdp_plugin = fsdp_plugin
|
||||
if os.environ.get("ACCELERATE_USE_MEGATRON_LM", "false") == "true" and self.distributed_type not in [
|
||||
DistributedType.MULTI_NPU,
|
||||
DistributedType.MULTI_XPU,
|
||||
]:
|
||||
self.distributed_type = DistributedType.MEGATRON_LM
|
||||
|
||||
@ -38,6 +38,7 @@ from .testing import (
|
||||
require_single_gpu,
|
||||
require_single_xpu,
|
||||
require_torch_min_version,
|
||||
require_torchvision,
|
||||
require_tpu,
|
||||
require_xpu,
|
||||
skip,
|
||||
|
||||
@ -20,12 +20,48 @@ from typing import List
|
||||
from unittest.mock import Mock
|
||||
|
||||
import torch
|
||||
from torch.utils.data import DataLoader, IterableDataset, TensorDataset
|
||||
from torch.utils.data import (
|
||||
BatchSampler,
|
||||
DataLoader,
|
||||
Dataset,
|
||||
IterableDataset,
|
||||
RandomSampler,
|
||||
TensorDataset,
|
||||
default_collate,
|
||||
)
|
||||
|
||||
from accelerate.accelerator import Accelerator, DataLoaderConfiguration
|
||||
from accelerate.utils.dataclasses import DistributedType
|
||||
|
||||
|
||||
NUM_ELEMENTS = 22
|
||||
NUM_WORKERS = 4
|
||||
BATCH_SIZE = 4
|
||||
|
||||
|
||||
class DummyDataset(Dataset):
|
||||
def __len__(self):
|
||||
return NUM_ELEMENTS
|
||||
|
||||
def __getitem__(self, index):
|
||||
squeeze = False
|
||||
|
||||
if isinstance(index, int):
|
||||
index = [index]
|
||||
squeeze = True
|
||||
elif isinstance(index, slice):
|
||||
index = list(range(*index.indices(self.size)))
|
||||
else:
|
||||
index = list(index)
|
||||
|
||||
batch = [{"index": i, "label": i % 2, "random_augmentation": torch.rand(1).item()} for i in index]
|
||||
|
||||
if squeeze:
|
||||
batch = batch[0]
|
||||
|
||||
return batch
|
||||
|
||||
|
||||
class DummyIterableDataset(IterableDataset):
|
||||
def __init__(self, data):
|
||||
self.data = data
|
||||
@ -206,8 +242,27 @@ def test_join_raises_warning_for_iterable_when_overriding_even_batches():
|
||||
assert "only supported for map-style datasets" in str(w[-1].message)
|
||||
|
||||
|
||||
def test_data_loader(data_loader, accelerator):
|
||||
# Prepare the DataLoader
|
||||
data_loader = accelerator.prepare(data_loader)
|
||||
|
||||
all_examples = []
|
||||
for i, batch in enumerate(data_loader):
|
||||
index, _ = accelerator.gather_for_metrics((batch["index"], batch["label"]))
|
||||
all_examples.extend(index.detach().cpu().numpy().tolist())
|
||||
|
||||
# Sort the examples
|
||||
sorted_all_examples = sorted(all_examples)
|
||||
|
||||
# Check if all elements are present in the sorted list of iterated samples
|
||||
assert (
|
||||
len(set(sorted_all_examples)) == NUM_ELEMENTS
|
||||
), "Not all the dataset elements have been iterated in an epoch due to duplication of samples across processes."
|
||||
|
||||
|
||||
def main():
|
||||
accelerator = create_accelerator()
|
||||
torch.manual_seed(accelerator.process_index)
|
||||
|
||||
accelerator.print("Test that even_batches variable ensures uniform batches across processes")
|
||||
test_default_ensures_even_batch_sizes()
|
||||
@ -233,6 +288,25 @@ def main():
|
||||
test_join_raises_warning_for_non_ddp_distributed(accelerator)
|
||||
accelerator.state.distributed_type = original_state
|
||||
|
||||
dataset = DummyDataset()
|
||||
# Conventional Dataloader with shuffle=False
|
||||
loader = DataLoader(dataset, shuffle=False, batch_size=BATCH_SIZE, num_workers=NUM_WORKERS)
|
||||
test_data_loader(loader, accelerator)
|
||||
|
||||
# Conventional Dataloader with shuffle=True
|
||||
loader = DataLoader(dataset, shuffle=True, batch_size=BATCH_SIZE, num_workers=NUM_WORKERS)
|
||||
test_data_loader(loader, accelerator)
|
||||
|
||||
# Dataloader with batch_sampler
|
||||
sampler = BatchSampler(RandomSampler(dataset), batch_size=BATCH_SIZE, drop_last=False)
|
||||
loader = DataLoader(dataset, batch_sampler=sampler, num_workers=NUM_WORKERS)
|
||||
test_data_loader(loader, accelerator)
|
||||
|
||||
# Dataloader with sampler as an instance of `BatchSampler`
|
||||
sampler = BatchSampler(RandomSampler(dataset), batch_size=BATCH_SIZE, drop_last=False)
|
||||
loader = DataLoader(dataset, sampler=sampler, batch_size=None, collate_fn=default_collate, num_workers=NUM_WORKERS)
|
||||
test_data_loader(loader, accelerator)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
@ -22,7 +22,6 @@ from copy import deepcopy
|
||||
from pathlib import Path
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
import torch
|
||||
from torch.utils.data import DataLoader, Dataset
|
||||
|
||||
@ -39,6 +38,7 @@ from accelerate.utils import (
|
||||
is_ipex_available,
|
||||
is_mlu_available,
|
||||
is_npu_available,
|
||||
is_pytest_available,
|
||||
is_xpu_available,
|
||||
set_seed,
|
||||
synchronize_rng_states,
|
||||
@ -711,6 +711,8 @@ def test_trigger():
|
||||
|
||||
|
||||
def test_reinstantiated_state():
|
||||
import pytest
|
||||
|
||||
AcceleratorState._reset_state()
|
||||
simple_model = torch.nn.Linear(1, 1)
|
||||
# First define an accelerator
|
||||
@ -792,9 +794,10 @@ def main():
|
||||
print("\n**Breakpoint trigger test**")
|
||||
test_trigger()
|
||||
|
||||
if state.local_process_index == 0:
|
||||
print("\n**Test reinstantiated state**")
|
||||
test_reinstantiated_state()
|
||||
if is_pytest_available():
|
||||
if state.local_process_index == 0:
|
||||
print("\n**Test reinstantiated state**")
|
||||
test_reinstantiated_state()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
@ -45,10 +45,12 @@ from ..utils import (
|
||||
is_npu_available,
|
||||
is_pandas_available,
|
||||
is_pippy_available,
|
||||
is_schedulefree_available,
|
||||
is_tensorboard_available,
|
||||
is_timm_available,
|
||||
is_torch_version,
|
||||
is_torch_xla_available,
|
||||
is_torchvision_available,
|
||||
is_transformers_available,
|
||||
is_wandb_available,
|
||||
is_xpu_available,
|
||||
@ -213,6 +215,20 @@ def require_timm(test_case):
|
||||
return unittest.skipUnless(is_timm_available(), "test requires the timm library")(test_case)
|
||||
|
||||
|
||||
def require_torchvision(test_case):
|
||||
"""
|
||||
Decorator marking a test that requires torchvision. These tests are skipped when they are not.
|
||||
"""
|
||||
return unittest.skipUnless(is_torchvision_available(), "test requires the torchvision library")(test_case)
|
||||
|
||||
|
||||
def require_schedulefree(test_case):
|
||||
"""
|
||||
Decorator marking a test that requires schedulefree. These tests are skipped when they are not.
|
||||
"""
|
||||
return unittest.skipUnless(is_schedulefree_available(), "test requires the schedulefree library")(test_case)
|
||||
|
||||
|
||||
def require_bnb(test_case):
|
||||
"""
|
||||
Decorator marking a test that requires bitsandbytes. These tests are skipped when they are not.
|
||||
|
||||
@ -81,6 +81,7 @@ from .imports import (
|
||||
is_dvclive_available,
|
||||
is_fp8_available,
|
||||
is_ipex_available,
|
||||
is_lomo_available,
|
||||
is_megatron_lm_available,
|
||||
is_mlflow_available,
|
||||
is_mlu_available,
|
||||
@ -91,11 +92,14 @@ from .imports import (
|
||||
is_peft_available,
|
||||
is_pippy_available,
|
||||
is_pynvml_available,
|
||||
is_pytest_available,
|
||||
is_rich_available,
|
||||
is_sagemaker_available,
|
||||
is_schedulefree_available,
|
||||
is_tensorboard_available,
|
||||
is_timm_available,
|
||||
is_torch_xla_available,
|
||||
is_torchvision_available,
|
||||
is_transformer_engine_available,
|
||||
is_transformers_available,
|
||||
is_wandb_available,
|
||||
|
||||
@ -154,6 +154,8 @@ class InitProcessGroupKwargs(KwargsHandler):
|
||||
[method](https://pytorch.org/docs/stable/distributed.html#torch.distributed.init_process_group) for more
|
||||
information on each argument.
|
||||
|
||||
Note: If `timeout` is set to `None`, the default will be based upon how `backend` is set.
|
||||
|
||||
```python
|
||||
from datetime import timedelta
|
||||
from accelerate import Accelerator
|
||||
@ -166,7 +168,12 @@ class InitProcessGroupKwargs(KwargsHandler):
|
||||
|
||||
backend: Optional[str] = "nccl"
|
||||
init_method: Optional[str] = None
|
||||
timeout: timedelta = timedelta(seconds=1800)
|
||||
timeout: Optional[timedelta] = None
|
||||
|
||||
def __post_init__(self):
|
||||
if self.timeout is None:
|
||||
seconds = 1800 if self.backend != "nccl" else 600
|
||||
self.timeout = timedelta(seconds=seconds)
|
||||
|
||||
|
||||
# Literals
|
||||
@ -524,6 +531,14 @@ class DataLoaderConfiguration:
|
||||
"multiple different seeds to compare. Should also be ran with [`~utils.set_seed`] for the best results."
|
||||
},
|
||||
)
|
||||
non_blocking: bool = field(
|
||||
default=False,
|
||||
metadata={
|
||||
"help": "If set to `True`, the dataloader prepared by the Accelerator will utilize non-blocking host-to-device"
|
||||
" transfers, allowing for better overlap between dataloader communication and computation. Recommended that the"
|
||||
" prepared dataloader has `pin_memory` set to `True` to work properly."
|
||||
},
|
||||
)
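A hedged sketch of wiring the new `non_blocking` flag through `DataLoaderConfiguration`. The `dataloader_config` keyword on `Accelerator` is assumed here (it is not shown in this diff); pairing it with `pin_memory=True` on the `DataLoader` is what allows host-to-device copies to overlap with compute.

```python
# Hedged sketch: enabling non-blocking host-to-device transfers for prepared dataloaders.
import torch
from torch.utils.data import DataLoader, TensorDataset
from accelerate.accelerator import Accelerator, DataLoaderConfiguration

accelerator = Accelerator(dataloader_config=DataLoaderConfiguration(non_blocking=True))
dataset = TensorDataset(torch.randn(64, 8))
loader = accelerator.prepare(DataLoader(dataset, batch_size=8, pin_memory=True))
```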
|
||||
|
||||
|
||||
@dataclass
|
||||
@ -682,15 +697,15 @@ class DeepSpeedPlugin:
|
||||
default=None,
|
||||
metadata={"help": "Possible options are 0,1,2,3; Default will be taken from environment variable"},
|
||||
)
|
||||
is_train_batch_min: str = field(
|
||||
is_train_batch_min: bool = field(
|
||||
default=True,
|
||||
metadata={"help": "If both train & eval dataloaders are specified, this will decide the train_batch_size"},
|
||||
)
|
||||
offload_optimizer_device: bool = field(
|
||||
offload_optimizer_device: str = field(
|
||||
default=None,
|
||||
metadata={"help": "Possible options are none|cpu|nvme. Only applicable with ZeRO Stages 2 and 3."},
|
||||
)
|
||||
offload_param_device: bool = field(
|
||||
offload_param_device: str = field(
|
||||
default=None,
|
||||
metadata={"help": "Possible options are none|cpu|nvme. Only applicable with ZeRO Stage 3."},
|
||||
)
|
||||
@ -713,6 +728,13 @@ class DeepSpeedPlugin:
|
||||
default=None,
|
||||
metadata={"help": "Flag to indicate whether to save 16-bit model. Only applicable with ZeRO Stage-3."},
|
||||
)
|
||||
transformer_moe_cls_names: str = field(
|
||||
default=None,
|
||||
metadata={
|
||||
"help": "comma-separated list of transformers MoE layer class names (case-sensitive), e.g : "
|
||||
" `MixtralSparseMoeBlock`, `Qwen2MoeSparseMoeBlock`, `JetMoEAttention,JetMoEBlock` ..."
|
||||
},
|
||||
)
|
||||
|
||||
def __post_init__(self):
|
||||
from .deepspeed import HfDeepSpeedConfig
|
||||
@ -722,9 +744,8 @@ class DeepSpeedPlugin:
|
||||
self.gradient_accumulation_steps = int(gas) if gas.isdigit() else gas
|
||||
|
||||
if self.gradient_clipping is None:
|
||||
gradient_clipping = os.environ.get("ACCELERATE_GRADIENT_CLIPPING", "none")
|
||||
if gradient_clipping != "none":
|
||||
self.gradient_clipping = float(gradient_clipping)
|
||||
gradient_clipping = os.environ.get("ACCELERATE_GRADIENT_CLIPPING", "auto")
|
||||
self.gradient_clipping = gradient_clipping if gradient_clipping == "auto" else float(gradient_clipping)
|
||||
|
||||
if self.zero_stage is None:
|
||||
self.zero_stage = int(os.environ.get("ACCELERATE_DEEPSPEED_ZERO_STAGE", 2))
|
||||
@ -968,6 +989,26 @@ class DeepSpeedPlugin:
|
||||
"It will only ask for the necessary config variables when using `deepspeed_config_file`."
|
||||
)
|
||||
|
||||
def set_moe_leaf_modules(self, model):
|
||||
if self.transformer_moe_cls_names is None:
|
||||
self.transformer_moe_cls_names = os.environ.get("ACCELERATE_DEEPSPEED_MOE_LAYER_CLS_NAMES", None)
|
||||
if self.transformer_moe_cls_names is not None:
|
||||
if compare_versions("deepspeed", "<", "0.14.0"):
|
||||
raise ImportError("DeepSpeed version must be >= 0.14.0 to use MOE support. Please update DeepSpeed.")
|
||||
from deepspeed.utils import set_z3_leaf_modules
|
||||
|
||||
class_names = self.transformer_moe_cls_names.split(",")
|
||||
transformer_moe_cls = []
|
||||
for layer_class in class_names:
|
||||
transformer_cls = get_module_class_from_name(model, layer_class)
|
||||
if transformer_cls is None:
|
||||
raise Exception(
|
||||
f"Could not find a transformer layer class called '{layer_class}' to wrap in the model."
|
||||
)
|
||||
else:
|
||||
transformer_moe_cls.append(transformer_cls)
|
||||
set_z3_leaf_modules(model, transformer_moe_cls) # z3_leaf
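A hedged sketch of marking MoE blocks as ZeRO-3 leaf modules via the new plugin field. The transformers model below is only a placeholder checkpoint; the sketch requires `deepspeed>=0.14.0` and `transformers` to be installed, and only `DeepSpeedPlugin.transformer_moe_cls_names` and `set_moe_leaf_modules` come from this diff.

```python
# Hedged sketch: ZeRO-3 leaf-module marking for MoE layers.
from accelerate.utils import DeepSpeedPlugin
from transformers import AutoModelForCausalLM

plugin = DeepSpeedPlugin(zero_stage=3, transformer_moe_cls_names="MixtralSparseMoeBlock")
model = AutoModelForCausalLM.from_pretrained("mistralai/Mixtral-8x7B-v0.1")  # placeholder checkpoint
# Resolves the class names on the model and calls deepspeed.utils.set_z3_leaf_modules.
plugin.set_moe_leaf_modules(model)
```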
|
||||
|
||||
|
||||
@dataclass
|
||||
class FullyShardedDataParallelPlugin:
|
||||
@ -1109,6 +1150,13 @@ class FullyShardedDataParallelPlugin:
|
||||
self.forward_prefetch = str_to_bool(os.environ.get(prefix + "FORWARD_PREFETCH", "False")) == 1
|
||||
self.activation_checkpointing = str_to_bool(os.environ.get(prefix + "ACTIVATION_CHECKPOINTING", "False")) == 1
|
||||
|
||||
if str_to_bool(os.environ.get("FSDP_CPU_RAM_EFFICIENT_LOADING", "False")) == 1 and not self.sync_module_states:
|
||||
warnings.warn(
|
||||
"sync_module_states cannot be False since efficient cpu ram loading enabled. "
|
||||
"Setting sync_module_states to True."
|
||||
)
|
||||
self.sync_module_states = True
|
||||
|
||||
if self.sync_module_states:
|
||||
if is_npu_available():
|
||||
device = torch.npu.current_device()
|
||||
@ -1122,26 +1170,6 @@ class FullyShardedDataParallelPlugin:
|
||||
)
|
||||
self.param_init_fn = lambda x: x.to_empty(device=device, recurse=False)
|
||||
|
||||
@staticmethod
|
||||
def get_module_class_from_name(module, name):
|
||||
"""
|
||||
Gets a class from a module by its name.
|
||||
|
||||
Args:
|
||||
module (`torch.nn.Module`): The module to get the class from.
|
||||
name (`str`): The name of the class.
|
||||
"""
|
||||
modules_children = list(module.children())
|
||||
if module.__class__.__name__ == name:
|
||||
return module.__class__
|
||||
elif len(modules_children) == 0:
|
||||
return
|
||||
else:
|
||||
for child_module in modules_children:
|
||||
module_class = FullyShardedDataParallelPlugin.get_module_class_from_name(child_module, name)
|
||||
if module_class is not None:
|
||||
return module_class
|
||||
|
||||
def set_auto_wrap_policy(self, model):
|
||||
from torch.distributed.fsdp.wrap import size_based_auto_wrap_policy, transformer_auto_wrap_policy
|
||||
|
||||
@ -1156,7 +1184,7 @@ class FullyShardedDataParallelPlugin:
|
||||
).split(",")
|
||||
transformer_cls_to_wrap = set()
|
||||
for layer_class in transformer_cls_names_to_wrap:
|
||||
transformer_cls = FullyShardedDataParallelPlugin.get_module_class_from_name(model, layer_class)
|
||||
transformer_cls = get_module_class_from_name(model, layer_class)
|
||||
if transformer_cls is None:
|
||||
raise Exception("Could not find the transformer layer class to wrap in the model.")
|
||||
else:
|
||||
@ -1199,6 +1227,8 @@ class FullyShardedDataParallelPlugin:
|
||||
from torch.distributed.fsdp.fully_sharded_data_parallel import (
|
||||
FullOptimStateDictConfig,
|
||||
FullStateDictConfig,
|
||||
ShardedOptimStateDictConfig,
|
||||
ShardedStateDictConfig,
|
||||
StateDictType,
|
||||
)
|
||||
|
||||
@ -1209,6 +1239,11 @@ class FullyShardedDataParallelPlugin:
|
||||
self.state_dict_config = FullStateDictConfig(offload_to_cpu=True, rank0_only=True)
|
||||
if self.optim_state_dict_config is None:
|
||||
self.optim_state_dict_config = FullOptimStateDictConfig(offload_to_cpu=True, rank0_only=True)
|
||||
elif self.state_dict_type == StateDictType.SHARDED_STATE_DICT:
|
||||
if self.state_dict_config is None:
|
||||
self.state_dict_config = ShardedStateDictConfig(offload_to_cpu=True)
|
||||
if self.optim_state_dict_config is None:
|
||||
self.optim_state_dict_config = ShardedOptimStateDictConfig(offload_to_cpu=True)
|
||||
|
||||
|
||||
@dataclass
|
||||
@ -1715,3 +1750,23 @@ class BnbQuantizationConfig:
|
||||
|
||||
if not isinstance(self.torch_dtype, torch.dtype):
|
||||
raise ValueError("torch_dtype must be a torch.dtype")
|
||||
|
||||
|
||||
def get_module_class_from_name(module, name):
|
||||
"""
|
||||
Gets a class from a module by its name.
|
||||
|
||||
Args:
|
||||
module (`torch.nn.Module`): The module to get the class from.
|
||||
name (`str`): The name of the class.
|
||||
"""
|
||||
modules_children = list(module.children())
|
||||
if module.__class__.__name__ == name:
|
||||
return module.__class__
|
||||
elif len(modules_children) == 0:
|
||||
return
|
||||
else:
|
||||
for child_module in modules_children:
|
||||
module_class = get_module_class_from_name(child_module, name)
|
||||
if module_class is not None:
|
||||
return module_class
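A hedged usage sketch for the now module-level `get_module_class_from_name` helper (moved out of `FullyShardedDataParallelPlugin`): it walks a module tree and returns the class whose `__name__` matches. The import path is taken from where this diff defines the function and may also be re-exported from `accelerate.utils`.

```python
# Hedged sketch of get_module_class_from_name; import path assumed per this diff.
import torch
from accelerate.utils.dataclasses import get_module_class_from_name

model = torch.nn.Sequential(
    torch.nn.Linear(4, 4),
    torch.nn.TransformerEncoderLayer(d_model=4, nhead=2),
)
cls = get_module_class_from_name(model, "TransformerEncoderLayer")
assert cls is torch.nn.TransformerEncoderLayer
```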
|
||||
|
||||
@ -16,6 +16,7 @@ import os
|
||||
import torch
|
||||
|
||||
from ..logging import get_logger
|
||||
from ..state import PartialState
|
||||
from .constants import FSDP_MODEL_NAME, FSDP_PYTORCH_VERSION, OPTIMIZER_NAME
|
||||
from .imports import is_torch_distributed_available
|
||||
from .modeling import is_peft_model
|
||||
@ -51,13 +52,14 @@ def _set_model_state_dict(model, state_dict, adapter_only=False):
|
||||
return model.load_state_dict(state_dict)
|
||||
|
||||
|
||||
def save_fsdp_model(fsdp_plugin, accelerator, model, output_dir, model_index=0, adapter_only=False):
|
||||
def save_fsdp_model(fsdp_plugin, model, output_dir, model_index=0, adapter_only=False):
|
||||
state = PartialState()
|
||||
os.makedirs(output_dir, exist_ok=True)
|
||||
|
||||
if fsdp_plugin.state_dict_type == StateDictType.FULL_STATE_DICT:
|
||||
# FSDP raises error when single GPU is used with `offload_to_cpu=True` for FULL_STATE_DICT
|
||||
# so, only enable it when num_processes>1
|
||||
is_multi_process = accelerator.num_processes > 1
|
||||
is_multi_process = state.num_processes > 1
|
||||
fsdp_plugin.state_dict_config.offload_to_cpu = is_multi_process
|
||||
fsdp_plugin.state_dict_config.rank0_only = is_multi_process
|
||||
|
||||
@ -68,15 +70,15 @@ def save_fsdp_model(fsdp_plugin, accelerator, model, output_dir, model_index=0,
|
||||
if fsdp_plugin.state_dict_type == StateDictType.FULL_STATE_DICT:
|
||||
weights_name = f"{FSDP_MODEL_NAME}.bin" if model_index == 0 else f"{FSDP_MODEL_NAME}_{model_index}.bin"
|
||||
output_model_file = os.path.join(output_dir, weights_name)
|
||||
if accelerator.process_index == 0:
|
||||
if state.process_index == 0:
|
||||
logger.info(f"Saving model to {output_model_file}")
|
||||
torch.save(state_dict, output_model_file)
|
||||
logger.info(f"Model saved to {output_model_file}")
|
||||
elif fsdp_plugin.state_dict_type == StateDictType.LOCAL_STATE_DICT:
|
||||
weights_name = (
|
||||
f"{FSDP_MODEL_NAME}_rank{accelerator.process_index}.bin"
|
||||
f"{FSDP_MODEL_NAME}_rank{state.process_index}.bin"
|
||||
if model_index == 0
|
||||
else f"{FSDP_MODEL_NAME}_{model_index}_rank{accelerator.process_index}.bin"
|
||||
else f"{FSDP_MODEL_NAME}_{model_index}_rank{state.process_index}.bin"
|
||||
)
|
||||
output_model_file = os.path.join(output_dir, weights_name)
|
||||
logger.info(f"Saving model to {output_model_file}")
|
||||
@ -96,19 +98,20 @@ def save_fsdp_model(fsdp_plugin, accelerator, model, output_dir, model_index=0,
|
||||
logger.info(f"Model saved to {ckpt_dir}")
|
||||
|
||||
|
||||
def load_fsdp_model(fsdp_plugin, accelerator, model, input_dir, model_index=0, adapter_only=False):
|
||||
accelerator.wait_for_everyone()
|
||||
def load_fsdp_model(fsdp_plugin, model, input_dir, model_index=0, adapter_only=False):
|
||||
state = PartialState()
|
||||
state.wait_for_everyone()
|
||||
if fsdp_plugin.state_dict_type == StateDictType.FULL_STATE_DICT:
|
||||
# FSDP raises error when single GPU is used with `offload_to_cpu=True` for FULL_STATE_DICT
|
||||
# so, only enable it when num_processes>1
|
||||
is_multi_process = accelerator.num_processes > 1
|
||||
is_multi_process = state.num_processes > 1
|
||||
fsdp_plugin.state_dict_config.offload_to_cpu = is_multi_process
|
||||
fsdp_plugin.state_dict_config.rank0_only = is_multi_process
|
||||
with FSDP.state_dict_type(
|
||||
model, fsdp_plugin.state_dict_type, fsdp_plugin.state_dict_config, fsdp_plugin.optim_state_dict_config
|
||||
):
|
||||
if fsdp_plugin.state_dict_type == StateDictType.FULL_STATE_DICT:
|
||||
if type(model) != FSDP and accelerator.process_index != 0:
|
||||
if type(model) != FSDP and state.process_index != 0:
|
||||
if not fsdp_plugin.sync_module_states:
|
||||
raise ValueError(
|
||||
"Set the `sync_module_states` flag to `True` so that model states are synced across processes when "
|
||||
@ -122,9 +125,9 @@ def load_fsdp_model(fsdp_plugin, accelerator, model, input_dir, model_index=0, a
|
||||
logger.info(f"Model loaded from {input_model_file}")
|
||||
elif fsdp_plugin.state_dict_type == StateDictType.LOCAL_STATE_DICT:
|
||||
weights_name = (
|
||||
f"{FSDP_MODEL_NAME}_rank{accelerator.process_index}.bin"
|
||||
f"{FSDP_MODEL_NAME}_rank{state.process_index}.bin"
|
||||
if model_index == 0
|
||||
else f"{FSDP_MODEL_NAME}_{model_index}_rank{accelerator.process_index}.bin"
|
||||
else f"{FSDP_MODEL_NAME}_{model_index}_rank{state.process_index}.bin"
|
||||
)
|
||||
input_model_file = os.path.join(input_dir, weights_name)
|
||||
logger.info(f"Loading model from {input_model_file}")
|
||||
@ -149,14 +152,15 @@ def load_fsdp_model(fsdp_plugin, accelerator, model, input_dir, model_index=0, a
|
||||
return load_result
|
||||
|
||||
|
||||
def save_fsdp_optimizer(fsdp_plugin, accelerator, optimizer, model, output_dir, optimizer_index=0):
|
||||
def save_fsdp_optimizer(fsdp_plugin, optimizer, model, output_dir, optimizer_index=0):
|
||||
state = PartialState()
|
||||
os.makedirs(output_dir, exist_ok=True)
|
||||
with FSDP.state_dict_type(
|
||||
model, fsdp_plugin.state_dict_type, fsdp_plugin.state_dict_config, fsdp_plugin.optim_state_dict_config
|
||||
):
|
||||
optim_state = FSDP.optim_state_dict(model, optimizer)
|
||||
if fsdp_plugin.state_dict_type == StateDictType.FULL_STATE_DICT:
|
||||
if accelerator.process_index == 0:
|
||||
if state.process_index == 0:
|
||||
optim_state_name = (
|
||||
f"{OPTIMIZER_NAME}.bin" if optimizer_index == 0 else f"{OPTIMIZER_NAME}_{optimizer_index}.bin"
|
||||
)
|
||||
@ -176,14 +180,15 @@ def save_fsdp_optimizer(fsdp_plugin, accelerator, optimizer, model, output_dir,
|
||||
logger.info(f"Optimizer state saved in {ckpt_dir}")
|
||||
|
||||
|
||||
def load_fsdp_optimizer(fsdp_plugin, accelerator, optimizer, model, input_dir, optimizer_index=0, adapter_only=False):
|
||||
accelerator.wait_for_everyone()
|
||||
def load_fsdp_optimizer(fsdp_plugin, optimizer, model, input_dir, optimizer_index=0, adapter_only=False):
|
||||
state = PartialState()
|
||||
state.wait_for_everyone()
|
||||
with FSDP.state_dict_type(
|
||||
model, fsdp_plugin.state_dict_type, fsdp_plugin.state_dict_config, fsdp_plugin.optim_state_dict_config
|
||||
):
|
||||
if fsdp_plugin.state_dict_type == StateDictType.FULL_STATE_DICT:
|
||||
optim_state = None
|
||||
if accelerator.process_index == 0 or not fsdp_plugin.optim_state_dict_config.rank0_only:
|
||||
if state.process_index == 0 or not fsdp_plugin.optim_state_dict_config.rank0_only:
|
||||
optimizer_name = (
|
||||
f"{OPTIMIZER_NAME}.bin" if optimizer_index == 0 else f"{OPTIMIZER_NAME}_{optimizer_index}.bin"
|
||||
)
|
||||
|
||||
@ -85,14 +85,26 @@ def is_pynvml_available():
|
||||
return _is_package_available("pynvml")
|
||||
|
||||
|
||||
def is_pytest_available():
|
||||
return _is_package_available("pytest")
|
||||
|
||||
|
||||
def is_msamp_available():
|
||||
return _is_package_available("msamp", "ms-amp")
|
||||
|
||||
|
||||
def is_schedulefree_available():
|
||||
return _is_package_available("schedulefree")
|
||||
|
||||
|
||||
def is_transformer_engine_available():
|
||||
return _is_package_available("transformer_engine")
|
||||
|
||||
|
||||
def is_lomo_available():
|
||||
return _is_package_available("lomo_optim")
|
||||
|
||||
|
||||
def is_fp8_available():
|
||||
return is_msamp_available() or is_transformer_engine_available()
|
||||
|
||||
@ -175,6 +187,8 @@ def is_bf16_available(ignore_tpu=False):
|
||||
return not ignore_tpu
|
||||
if is_cuda_available():
|
||||
return torch.cuda.is_bf16_supported()
|
||||
if is_mps_available():
|
||||
return False
|
||||
return True
|
||||
|
||||
|
||||
@ -198,6 +212,10 @@ def is_bnb_available():
|
||||
return _is_package_available("bitsandbytes")
|
||||
|
||||
|
||||
def is_torchvision_available():
|
||||
return _is_package_available("torchvision")
|
||||
|
||||
|
||||
def is_megatron_lm_available():
|
||||
if str_to_bool(os.environ.get("ACCELERATE_USE_MEGATRON_LM", "False")) == 1:
|
||||
package_exists = importlib.util.find_spec("megatron") is not None
|
||||
|
||||
@ -393,6 +393,8 @@ def prepare_deepspeed_cmd_env(args: argparse.Namespace) -> Tuple[List[str], Dict
|
||||
current_env["ACCELERATE_DEEPSPEED_CONFIG_FILE"] = str(args.deepspeed_config_file)
|
||||
if args.enable_cpu_affinity:
|
||||
current_env["ACCELERATE_CPU_AFFINITY"] = "1"
|
||||
if args.deepspeed_moe_layer_cls_names is not None:
|
||||
current_env["ACCELERATE_DEEPSPEED_MOE_LAYER_CLS_NAMES"] = str(args.deepspeed_moe_layer_cls_names)
|
||||
return cmd, current_env
|
||||
|
||||
|
||||
|
||||
@ -381,12 +381,13 @@ def set_module_tensor_to_device(
|
||||
device_quantization = device
|
||||
device = "cpu"
|
||||
# `torch.Tensor.to(<int num>)` is not supported by `torch_npu` (see this [issue](https://github.com/Ascend/pytorch/issues/16)).
|
||||
if is_npu_available() and isinstance(device, int):
|
||||
device = f"npu:{device}"
|
||||
elif is_mlu_available() and isinstance(device, int):
|
||||
device = f"mlu:{device}"
|
||||
if is_xpu_available() and isinstance(device, int):
|
||||
device = f"xpu:{device}"
|
||||
if isinstance(device, int):
|
||||
if is_npu_available():
|
||||
device = f"npu:{device}"
|
||||
elif is_mlu_available():
|
||||
device = f"mlu:{device}"
|
||||
elif is_xpu_available():
|
||||
device = f"xpu:{device}"
|
||||
if value is None:
|
||||
new_value = old_value.to(device)
|
||||
if dtype is not None and device in ["meta", torch.device("meta")]:
|
||||
@ -447,14 +448,15 @@ def set_module_tensor_to_device(
|
||||
if not getattr(module.weight, "quant_state", None) and device_index is not None:
|
||||
module.weight = module.weight.cuda(device_index)
|
||||
# clean pre and post forward hooks
|
||||
if is_npu_available():
|
||||
torch.npu.empty_cache()
|
||||
elif is_mlu_available():
|
||||
torch.mlu.empty_cache()
|
||||
elif is_xpu_available():
|
||||
torch.xpu.empty_cache()
|
||||
else:
|
||||
torch.cuda.empty_cache()
|
||||
if device != "cpu":
|
||||
if is_npu_available():
|
||||
torch.npu.empty_cache()
|
||||
elif is_mlu_available():
|
||||
torch.mlu.empty_cache()
|
||||
elif is_xpu_available():
|
||||
torch.xpu.empty_cache()
|
||||
else:
|
||||
torch.cuda.empty_cache()
|
||||
|
||||
# When handling tied weights, we update tied_params_map to keep track of the tied weights that have already been allocated on the device in
|
||||
# order to avoid duplicating memory, see above.
|
||||
@ -801,27 +803,40 @@ def get_max_memory(max_memory: Optional[Dict[Union[int, str], Union[int, str]]]
    import psutil

    if max_memory is None:
        if not (torch.cuda.is_available() or is_npu_available() or is_mlu_available() or is_xpu_available()):
            max_memory = {}

        else:
            # Make sure CUDA is initialized on each GPU to have the right memory info.
            if is_npu_available():
                for i in range(torch.npu.device_count()):
        max_memory = {}
        # Make sure CUDA is initialized on each GPU to have the right memory info.
        if is_npu_available():
            for i in range(torch.npu.device_count()):
                try:
                    _ = torch.tensor(0, device=torch.device("npu", i))
                    max_memory = {i: torch.npu.mem_get_info(i)[0] for i in range(torch.npu.device_count())}
            elif is_mlu_available():
                for i in range(torch.mlu.device_count()):
                    max_memory[i] = torch.npu.mem_get_info(i)[0]
                except Exception:
                    logger.info(f"Device {i} seems unavailable, Proceeding to check subsequent devices.")
                    continue
        elif is_mlu_available():
            for i in range(torch.mlu.device_count()):
                try:
                    _ = torch.tensor(0, device=torch.device("mlu", i))
                    max_memory = {i: torch.mlu.mem_get_info(i)[0] for i in range(torch.mlu.device_count())}
            elif is_xpu_available():
                for i in range(torch.xpu.device_count()):
                    max_memory[i] = torch.mlu.mem_get_info(i)[0]
                except Exception:
                    logger.info(f"Device {i} seems unavailable, Proceeding to check subsequent devices.")
                    continue
        elif is_xpu_available():
            for i in range(torch.xpu.device_count()):
                try:
                    _ = torch.tensor(0, device=torch.device("xpu", i))
                    max_memory = {i: torch.xpu.max_memory_allocated(i) for i in range(torch.xpu.device_count())}
            else:
                for i in range(torch.cuda.device_count()):
                    max_memory[i] = torch.xpu.max_memory_allocated(i)
                except Exception:
                    logger.info(f"Device {i} seems unavailable, Proceeding to check subsequent devices.")
                    continue
        else:
            for i in range(torch.cuda.device_count()):
                try:
                    _ = torch.tensor([0], device=i)
                    max_memory = {i: torch.cuda.mem_get_info(i)[0] for i in range(torch.cuda.device_count())}
                    max_memory[i] = torch.cuda.mem_get_info(i)[0]
                except Exception:
                    logger.info(f"Device {i} seems unavailable, Proceeding to check subsequent devices.")
                    continue
        # allocate everything in the mps device as the RAM is shared
        if is_mps_available():
            max_memory["mps"] = psutil.virtual_memory().available
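Illustrative sketch of the new probing behaviour, reduced to the CUDA branch and using nothing beyond stock PyTorch: a device that fails to initialize is logged and skipped rather than aborting the whole max_memory computation.

import logging

import torch

logger = logging.getLogger(__name__)

max_memory = {}
for i in range(torch.cuda.device_count()):
    try:
        _ = torch.tensor([0], device=i)  # force context creation on device i
        max_memory[i] = torch.cuda.mem_get_info(i)[0]
    except Exception:
        logger.info(f"Device {i} seems unavailable, proceeding to check subsequent devices.")
        continue

print(max_memory)  # empty dict on a CPU-only machine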
@ -914,6 +929,17 @@ def load_offloaded_weights(model, index, offload_folder):
        set_module_tensor_to_device(model, param_name, "cpu", value=weight, fp16_statistics=fp16_statistics)


def get_module_leaves(module_sizes):
    module_children = {}
    for module in module_sizes:
        if module == "" or "." not in module:
            continue
        parent = module.rsplit(".", 1)[0]
        module_children[parent] = module_children.get(parent, 0) + 1
    leaves = [module for module in module_sizes if module_children.get(module, 0) == 0 and module != ""]
    return leaves


def get_balanced_memory(
    model: nn.Module,
    max_memory: Optional[Dict[Union[int, str], Union[int, str]]] = None,
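To make the new helper concrete, here is a small worked example with a toy module_sizes dict (the names and values are made up); a leaf is any named entry that no other entry claims as its parent.

module_sizes = {"": 120, "linear1": 40, "linear1.weight": 36, "linear1.bias": 4, "batchnorm": 8}

module_children = {}
for module in module_sizes:
    if module == "" or "." not in module:
        continue
    parent = module.rsplit(".", 1)[0]
    module_children[parent] = module_children.get(parent, 0) + 1

leaves = [m for m in module_sizes if module_children.get(m, 0) == 0 and m != ""]
print(leaves)  # ['linear1.weight', 'linear1.bias', 'batchnorm']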
@ -1023,10 +1049,10 @@ def get_balanced_memory(
    buffer = 0

    # Compute mean of final modules. In the first dict of module sizes, leaves are the parameters
    leaves = [n for n in module_sizes if len([p for p in module_sizes if n == "" or p.startswith(n + ".")]) == 0]
    leaves = get_module_leaves(module_sizes)
    module_sizes = {n: v for n, v in module_sizes.items() if n not in leaves}
    # Once removed, leaves are the final modules.
    leaves = [n for n in module_sizes if len([p for p in module_sizes if n == "" or p.startswith(n + ".")]) == 0]
    leaves = get_module_leaves(module_sizes)
    mean_leaves = int(sum([module_sizes[n] for n in leaves]) / max(len(leaves), 1))
    buffer = int(1.25 * max(buffer, mean_leaves))
    per_gpu += buffer
@ -1783,7 +1809,7 @@ def get_mixed_precision_context_manager(native_amp: bool = False, autocast_kwarg
        )
        if state.mixed_precision == "fp16":
            return torch.autocast(device_type=device_type, dtype=torch.float16, **autocast_kwargs)
        elif state.mixed_precision == "bf16" and state.distributed_type in [
        elif state.mixed_precision in ["bf16", "fp8"] and state.distributed_type in [
            DistributedType.NO,
            DistributedType.MULTI_CPU,
            DistributedType.MULTI_GPU,
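A rough sketch of what this branch hands back for both bf16 and fp8 runs, assuming it continues to build a bf16 autocast context (the kwargs below are placeholders, not values from the diff):

import torch

device_type = "cuda" if torch.cuda.is_available() else "cpu"
autocast_kwargs = {}  # placeholder for user-supplied autocast kwargs

with torch.autocast(device_type=device_type, dtype=torch.bfloat16, **autocast_kwargs):
    x = torch.randn(4, 4, device=device_type)
    y = x @ x  # matmul runs under bf16 autocast where the backend supports it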
@ -164,10 +164,7 @@ def send_to_device(tensor, device, non_blocking=False, skip_keys=None):
            if is_npu_available():
                if isinstance(device, int):
                    device = f"npu:{device}"
            else:
                raise error
        except Exception as error:
            if is_xpu_available():
            elif is_xpu_available():
                if isinstance(device, int):
                    device = f"xpu:{device}"
            else:
@ -109,6 +109,8 @@ def synchronize_rng_state(rng_type: Optional[RNGType] = None, generator: Optiona
        torch.cuda.set_rng_state(rng_state)
    elif rng_type == RNGType.NPU:
        torch.npu.set_rng_state(rng_state)
    elif rng_type == RNGType.MLU:
        torch.mlu.set_rng_state(rng_state)
    elif rng_type == RNGType.XPU:
        torch.xpu.set_rng_state(rng_state)
    elif rng_type == RNGType.XLA:
@ -12,6 +12,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import warnings

from .imports import is_tqdm_available


@ -21,7 +23,7 @@ if is_tqdm_available():
from ..state import PartialState


def tqdm(main_process_only: bool = True, *args, **kwargs):
def tqdm(*args, main_process_only: bool = True, **kwargs):
    """
    Wrapper around `tqdm.tqdm` that optionally displays only on the main process.

@ -31,7 +33,15 @@ def tqdm(main_process_only: bool = True, *args, **kwargs):
    """
    if not is_tqdm_available():
        raise ImportError("Accelerate's `tqdm` module requires `tqdm` to be installed. Please run `pip install tqdm`.")
    disable = False
    if main_process_only:
    if len(args) > 0 and isinstance(args[0], bool):
        warnings.warn(
            f"Passing `{args[0]}` as the first argument to Accelerate's `tqdm` wrapper is deprecated "
            "and will be removed in v0.33.0. Please use the `main_process_only` keyword argument instead.",
            FutureWarning,
        )
        main_process_only = args[0]
        args = args[1:]
    disable = kwargs.pop("disable", False)
    if main_process_only and not disable:
        disable = PartialState().local_process_index != 0
    return _tqdm(*args, **kwargs, disable=disable)
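Usage-wise, the change makes main_process_only keyword-only while keeping the old positional form working behind a FutureWarning. A short before/after sketch:

from accelerate.utils import tqdm

# Deprecated: boolean passed positionally, still accepted until v0.33.0 but warns.
for _ in tqdm(True, range(10), disable=True):
    pass

# Preferred: pass it as a keyword argument.
for _ in tqdm(range(10), main_process_only=True):
    pass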
@ -51,12 +51,14 @@ from accelerate.utils.deepspeed import (
    DummyScheduler,
)
from accelerate.utils.other import patch_environment
from accelerate.utils.versions import compare_versions


set_seed(42)

GPT2_TINY = "sshleifer/tiny-gpt2"
MOBILEVIT = "apple/mobilevit-xx-small"
QWEN_MOE = "peft-internal-testing/tiny-random-qwen-1.5-MoE"

ZERO2 = "zero2"
ZERO3 = "zero3"
@ -811,6 +813,30 @@ class DeepSpeedConfigIntegration(AccelerateTestCase):
            )
            assert deepspeed_plugin.zero_stage == int(stage.replace("zero", ""))

    def test_prepare_deepspeed_prepare_moe(self):
        if compare_versions("transformers", "<", "4.40") and compare_versions("deepspeed", "<", "0.14"):
            return
        deepspeed_plugin = DeepSpeedPlugin(
            zero3_init_flag=True,
            gradient_accumulation_steps=1,
            gradient_clipping=1.0,
            zero_stage=3,
            offload_optimizer_device="none",
            offload_param_device="none",
            zero3_save_16bit_model=True,
            transformer_moe_cls_names="Qwen2MoeSparseMoeBlock",
        )
        with mockenv_context(**self.dist_env):
            accelerator = Accelerator(mixed_precision="fp16", deepspeed_plugin=deepspeed_plugin)
            accelerator.state.deepspeed_plugin.deepspeed_config["train_micro_batch_size_per_gpu"] = 1
            model = AutoModelForCausalLM.from_pretrained(QWEN_MOE)
            model = accelerator.prepare(model)
            from transformers.models.qwen2_moe.modeling_qwen2_moe import Qwen2MoeSparseMoeBlock

            for module in model.modules():
                if isinstance(module, Qwen2MoeSparseMoeBlock):
                    assert hasattr(module, "_z3_leaf") and module._z3_leaf

    def test_basic_run(self):
        test_file_path = path_in_accelerate_package("test_utils", "scripts", "external_deps", "test_performance.py")
        with tempfile.TemporaryDirectory() as dirpath:
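The user-facing knob this test exercises is the new transformer_moe_cls_names argument. A hedged configuration sketch (requires deepspeed to be installed; the argument name is taken from the test above, everything else is illustrative):

from accelerate import Accelerator
from accelerate.utils import DeepSpeedPlugin

plugin = DeepSpeedPlugin(
    zero_stage=3,
    transformer_moe_cls_names="Qwen2MoeSparseMoeBlock",  # comma-separated MoE block class names
)
accelerator = Accelerator(deepspeed_plugin=plugin)
# prepare() is expected to mark the named blocks as ZeRO-3 leaf modules.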
@ -17,6 +17,7 @@ import pickle
import tempfile
from unittest.mock import patch

import psutil
import pytest
import torch
from parameterized import parameterized
@ -196,14 +197,25 @@ class AcceleratorTester(AccelerateTestCase):

    def test_free_memory_dereferences_prepared_components(self):
        accelerator = Accelerator()
        model, optimizer, scheduler, train_dl, valid_dl = create_components()
        accelerator.prepare(model, optimizer, scheduler, train_dl, valid_dl)
        # Free up refs with empty_cache() and gc.collect()
        accelerator.free_memory()
        model, optimizer, scheduler, train_dl, valid_dl = create_components()
        free_cpu_ram_before = psutil.virtual_memory().available // 1024 // 1024
        model, optimizer, scheduler, train_dl, valid_dl = accelerator.prepare(
            model, optimizer, scheduler, train_dl, valid_dl
        )
        model, optimizer, scheduler, train_dl, valid_dl = accelerator.free_memory(
            model, optimizer, scheduler, train_dl, valid_dl
        )

        free_cpu_ram_after = psutil.virtual_memory().available // 1024 // 1024

        assert len(accelerator._models) == 0
        assert len(accelerator._optimizers) == 0
        assert len(accelerator._schedulers) == 0
        assert len(accelerator._dataloaders) == 0
        # The less-than comes *specifically* from CUDA CPU things/won't be present on CPU builds
        assert free_cpu_ram_after <= free_cpu_ram_before

    @require_non_torch_xla
    def test_env_var_device(self):
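For reference, a minimal usage sketch of the updated API outside the test harness: Accelerator.free_memory now also returns the objects passed to it, with Accelerate's internal references released.

import torch

from accelerate import Accelerator

accelerator = Accelerator()
model = torch.nn.Linear(2, 2)
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
model, optimizer = accelerator.prepare(model, optimizer)

# Drop the prepared references (and empty device caches) before rebuilding components.
model, optimizer = accelerator.free_memory(model, optimizer)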
@ -35,14 +35,19 @@ from accelerate.hooks import remove_hook_from_submodules
from accelerate.test_utils import (
    require_bnb,
    require_cuda,
    require_mps,
    require_multi_device,
    require_multi_gpu,
    require_non_cpu,
    require_non_torch_xla,
    slow,
    torch_device,
)
from accelerate.utils import is_torch_version, offload_state_dict


torch_device = f"{torch_device}:0" if torch_device != "cpu" else "cpu"


class ModelForTest(nn.Module):
    def __init__(self):
        super().__init__()
@ -175,17 +180,9 @@ class BigModelingTester(unittest.TestCase):
            with init_empty_weights():
                _ = nn.Sequential(*[nn.Linear(10000, 10000) for _ in range(1000)])

    @require_cuda
    def test_init_on_device_cuda(self):
        device = torch.device("cuda:0")
        with init_on_device(device):
            model = nn.Linear(10, 10)
        assert model.weight.device == device
        assert model.weight.device == device

    @require_mps
    def test_init_on_device_mps(self):
        device = torch.device("mps:0")
    @require_non_cpu
    def test_init_on_device(self):
        device = torch.device(torch_device)
        with init_on_device(device):
            model = nn.Linear(10, 10)
        assert model.weight.device == device
@ -196,7 +193,7 @@ class BigModelingTester(unittest.TestCase):
        x = torch.randn(2, 3)
        expected = model(x)

        device = torch.device(0 if torch.cuda.is_available() else "cpu")
        device = torch.device(torch_device)

        cpu_offload(model, execution_device=device)
        output = model(x)
@ -214,7 +211,7 @@ class BigModelingTester(unittest.TestCase):
        x = torch.randn(2, 3)
        expected = model(x)

        device = torch.device(0 if torch.cuda.is_available() else "cpu")
        device = torch.device(torch_device)

        cpu_offload(model, execution_device=device, preload_module_classes=["ModuleWithUnusedSubModules"])
        output = model(x)
@ -233,10 +230,10 @@ class BigModelingTester(unittest.TestCase):
            assert torch.allclose(expected, output.cpu(), 1e-4, 1e-5), f"Expected: {expected}, Actual: {output.cpu()}"

    @slow
    @require_cuda
    @require_non_cpu
    def test_cpu_offload_gpt2(self):
        tokenizer = AutoTokenizer.from_pretrained("gpt2")
        inputs = tokenizer("Hello world! My name is", return_tensors="pt").to(0)
        inputs = tokenizer("Hello world! My name is", return_tensors="pt").to(torch_device)

        gpt2 = AutoModelForCausalLM.from_pretrained("gpt2")
        cpu_offload(gpt2, execution_device=0)
@ -251,7 +248,7 @@ class BigModelingTester(unittest.TestCase):
        x = torch.randn(2, 3)
        expected = model(x)

        device = torch.device(0 if torch.cuda.is_available() else "cpu")
        device = torch.device(torch_device)

        with TemporaryDirectory() as tmp_dir:
            disk_offload(model, tmp_dir, execution_device=device)
@ -271,7 +268,7 @@ class BigModelingTester(unittest.TestCase):
        x = torch.randn(2, 3)
        expected = model(x)

        device = torch.device(0 if torch.cuda.is_available() else "cpu")
        device = torch.device(torch_device)

        with TemporaryDirectory() as tmp_dir:
            disk_offload(
@ -295,10 +292,10 @@ class BigModelingTester(unittest.TestCase):
            assert torch.allclose(expected, output.cpu(), 1e-4, 1e-5), f"Expected: {expected}, Actual: {output.cpu()}"

    @slow
    @require_cuda
    @require_non_cpu
    def test_disk_offload_gpt2(self):
        tokenizer = AutoTokenizer.from_pretrained("gpt2")
        inputs = tokenizer("Hello world! My name is", return_tensors="pt").to(0)
        inputs = tokenizer("Hello world! My name is", return_tensors="pt").to(torch_device)

        gpt2 = AutoModelForCausalLM.from_pretrained("gpt2")
        with TemporaryDirectory() as tmp_dir:
@ -309,7 +306,7 @@ class BigModelingTester(unittest.TestCase):
                == "Hello world! My name is Kiyoshi, and I'm a student at the University of Tokyo"
            )

    @require_cuda
    @require_non_cpu
    def test_dispatch_model(self):
        model = ModelForTest()
        device_map = {"linear1": "disk", "batchnorm": "cpu", "linear2": 0}
@ -322,7 +319,7 @@ class BigModelingTester(unittest.TestCase):
            output = model(x)
            assert torch.allclose(expected, output.cpu(), atol=1e-5)

    @require_cuda
    @require_non_cpu
    def test_dispatch_model_with_non_persistent_buffers(self):
        model = ModelForTestNonPersistentBuffers()
        device_map = {"linear1": 0, "batchnorm": "cpu", "linear2": "disk"}
@ -334,20 +331,7 @@ class BigModelingTester(unittest.TestCase):
            output = model(x)
            assert torch.allclose(expected, output.cpu(), atol=1e-5)

    @require_mps
    def test_dispatch_model_mps(self):
        model = ModelForTest()
        device_map = {"linear1": "mps", "batchnorm": "disk", "linear2": "disk"}

        x = torch.randn(2, 3)
        expected = model(x)

        with TemporaryDirectory() as tmp_dir:
            dispatch_model(model, device_map, offload_dir=tmp_dir)
            output = model(x)
            assert torch.allclose(expected, output.cpu(), atol=1e-5)

    @require_cuda
    @require_non_cpu
    def test_dispatch_model_tied_weights(self):
        model = ModelForTestTiedWeights()
        model.linear1.weight = model.linear2.weight
@ -597,8 +581,8 @@ class BigModelingTester(unittest.TestCase):

        assert (free_memory_bytes_after_infer - free_memory_bytes_after_dispatch) * 1e-6 < 130

    @require_multi_gpu
    def test_dispatch_model_multi_gpu(self):
    @require_multi_device
    def test_dispatch_model_multi_devices(self):
        model = BiggerModelForTest()
        device_map = {"linear1": "cpu", "linear2": "disk", "batchnorm": "cpu", "linear3": 0, "linear4": 1}

@ -610,7 +594,7 @@ class BigModelingTester(unittest.TestCase):
            output = model(x)
            assert torch.allclose(expected, output.cpu(), atol=1e-5)

    @require_cuda
    @require_non_cpu
    def test_dispatch_model_copy(self):
        original_model = ModelForTestCopy(id=1)
        device_map = {"linear1": 0, "batchnorm": "cpu", "linear2": 0}
@ -629,7 +613,7 @@ class BigModelingTester(unittest.TestCase):
        assert copied_model.linear1.forward is not original_model.linear1.forward
        assert torch.allclose(expected, output.cpu(), atol=1e-5)

    @require_cuda
    @require_non_cpu
    def test_dispatch_model_move_offloaded_model(self):
        model = ModelForTest()
        device_map = {"linear1": "disk", "batchnorm": "cpu", "linear2": 0}
@ -653,10 +637,10 @@ class BigModelingTester(unittest.TestCase):
            model(x)

    @slow
    @require_multi_gpu
    def test_dispatch_model_gpt2_on_two_gpus(self):
    @require_multi_device
    def test_dispatch_model_gpt2_on_two_devices(self):
        tokenizer = AutoTokenizer.from_pretrained("gpt2")
        inputs = tokenizer("Hello world! My name is", return_tensors="pt").to(0)
        inputs = tokenizer("Hello world! My name is", return_tensors="pt").to(torch_device)

        gpt2 = AutoModelForCausalLM.from_pretrained("gpt2")
        # Dispatch on GPUs 0 and 1
@ -703,7 +687,7 @@ class BigModelingTester(unittest.TestCase):
                == "Hello world! My name is Kiyoshi, and I'm a student at the University of Tokyo"
            )

    @require_cuda
    @require_non_cpu
    def test_dispatch_model_with_unused_submodules(self):
        model = ModelWithUnusedSubModulesForTest()
        device_map = {"linear1": "cpu", "linear2": "disk", "batchnorm": "cpu", "linear3": 0, "linear4": 0}
@ -718,23 +702,8 @@ class BigModelingTester(unittest.TestCase):
            output = model(x)
            assert torch.allclose(expected, output.cpu(), atol=1e-5)

    @require_mps
    def test_dispatch_model_with_unused_submodules_mps(self):
        model = ModelWithUnusedSubModulesForTest()
        device_map = {"linear1": "mps", "linear2": "mps", "batchnorm": "mps", "linear3": "mps", "linear4": "disk"}

        x = torch.randn(2, 3)
        expected = model(x)

        with TemporaryDirectory() as tmp_dir:
            dispatch_model(
                model, device_map, offload_dir=tmp_dir, preload_module_classes=["ModuleWithUnusedSubModules"]
            )
            output = model(x)
            assert torch.allclose(expected, output.cpu(), atol=1e-5)

    @require_multi_gpu
    def test_dispatch_model_with_unused_submodules_multi_gpu(self):
    @require_multi_device
    def test_dispatch_model_with_unused_submodules_multi_device(self):
        model = ModelWithUnusedSubModulesForTest()
        device_map = {"linear1": "cpu", "linear2": "disk", "batchnorm": "cpu", "linear3": 0, "linear4": 1}

@ -748,7 +717,7 @@ class BigModelingTester(unittest.TestCase):
            output = model(x)
            assert torch.allclose(expected, output.cpu(), atol=1e-5)

    @require_cuda
    @require_non_cpu
    def test_dispatch_model_force_hooks(self):
        model = ModelForTest()
        device_map = {"": 0}
@ -760,7 +729,7 @@ class BigModelingTester(unittest.TestCase):
        output = model(x)
        assert torch.allclose(expected, output.cpu(), atol=1e-5)

    @require_cuda
    @require_non_cpu
    def test_load_checkpoint_and_dispatch(self):
        model = ModelForTest()
        device_map = {"linear1": "cpu", "batchnorm": "cpu", "linear2": 0}
@ -782,32 +751,8 @@ class BigModelingTester(unittest.TestCase):
        output = new_model(x)
        assert torch.allclose(expected, output.cpu(), atol=1e-5)

    @require_mps
    def test_load_checkpoint_and_dispatch_mps(self):
        model = ModelForTest()
        device_map = {"linear1": "mps", "batchnorm": "mps", "linear2": "disk"}

        x = torch.randn(2, 3)
        expected = model(x)

        with TemporaryDirectory() as tmp_dir:
            checkpoint = os.path.join(tmp_dir, "pt_model.bin")
            torch.save(model.state_dict(), checkpoint)

            new_model = ModelForTest()
            new_model = load_checkpoint_and_dispatch(
                new_model, checkpoint, device_map=device_map, offload_folder=tmp_dir
            )

            # CPU-offloaded weights are on the meta device while waiting for the forward pass.
            assert new_model.linear1.weight.device == torch.device("mps:0")
            assert new_model.linear2.weight.device == torch.device("meta")

            output = new_model(x)
            assert torch.allclose(expected, output.cpu(), atol=1e-5)

    @require_multi_gpu
    def test_load_checkpoint_and_dispatch_multi_gpu(self):
    @require_multi_device
    def test_load_checkpoint_and_dispatch_multi_device(self):
        model = BiggerModelForTest()
        device_map = {"linear1": "cpu", "linear2": "cpu", "batchnorm": 0, "linear3": 0, "linear4": 1}

@ -830,7 +775,7 @@ class BigModelingTester(unittest.TestCase):
        output = new_model(x)
        assert torch.allclose(expected, output.cpu(), atol=1e-5)

    @require_cuda
    @require_non_cpu
    def test_load_checkpoint_and_dispatch_with_unused_submodules(self):
        model = ModelWithUnusedSubModulesForTest()
        device_map = {"linear1": "cpu", "linear2": "cpu", "batchnorm": 0, "linear3": 0, "linear4": 0}
@ -856,38 +801,8 @@ class BigModelingTester(unittest.TestCase):
        output = new_model(x)
        assert torch.allclose(expected, output.cpu(), atol=1e-5)

    @require_mps
    def test_load_checkpoint_and_dispatch_with_unused_submodules_mps(self):
        model = ModelWithUnusedSubModulesForTest()
        device_map = {"linear1": "mps", "linear2": "mps", "batchnorm": "mps", "linear3": "disk", "linear4": "disk"}

        x = torch.randn(2, 3)
        expected = model(x)

        with TemporaryDirectory() as tmp_dir:
            checkpoint = os.path.join(tmp_dir, "pt_model.bin")
            torch.save(model.state_dict(), checkpoint)

            new_model = ModelWithUnusedSubModulesForTest()
            new_model = load_checkpoint_and_dispatch(
                new_model,
                checkpoint,
                device_map=device_map,
                preload_module_classes=["ModuleWithUnusedSubModules"],
                offload_folder=tmp_dir,
            )

            # CPU-offloaded weights are on the meta device while waiting for the forward pass.
            assert new_model.linear1.linear.weight.device == torch.device("mps:0")
            assert new_model.linear2.linear.weight.device == torch.device("mps:0")
            assert new_model.linear3.linear.weight.device == torch.device("meta")
            assert new_model.linear4.linear.weight.device == torch.device("meta")

            output = new_model(x)
            assert torch.allclose(expected, output.cpu(), atol=1e-5)

    @require_multi_gpu
    def test_load_checkpoint_and_dispatch_multi_gpu_with_unused_submodules(self):
    @require_multi_device
    def test_load_checkpoint_and_dispatch_multi_device_with_unused_submodules(self):
        model = ModelWithUnusedSubModulesForTest()
        device_map = {"linear1": "cpu", "linear2": "cpu", "batchnorm": 0, "linear3": 0, "linear4": 1}

@ -912,7 +827,7 @@ class BigModelingTester(unittest.TestCase):
        output = new_model(x)
        assert torch.allclose(expected, output.cpu(), atol=1e-5)

    @require_cuda
    @require_non_cpu
    def test_cpu_offload_with_hook(self):
        model1 = torch.nn.Linear(4, 5)
        model1, hook1 = cpu_offload_with_hook(model1)
@ -20,7 +20,7 @@ from unittest.mock import patch
import torch
from huggingface_hub.utils import GatedRepoError, RepositoryNotFoundError

from accelerate.commands.config.config_args import BaseConfig, ClusterConfig, SageMakerConfig
from accelerate.commands.config.config_args import BaseConfig, ClusterConfig, SageMakerConfig, load_config_from_file
from accelerate.commands.estimate import estimate_command, estimate_command_parser, gather_data
from accelerate.commands.launch import _validate_launch_command, launch_command_parser
from accelerate.test_utils import execute_subprocess_async
@ -73,8 +73,9 @@ class AccelerateLauncherTester(unittest.TestCase):
        execute_subprocess_async(cmd, env=os.environ.copy())

    def test_config_compatibility(self):
        invalid_configs = ["invalid", "mpi", "sagemaker"]
        for config in sorted(self.test_config_path.glob("**/*.yaml")):
            if "invalid" in str(config) or "mpi" in str(config):
            if any(invalid_config in str(config) for invalid_config in invalid_configs):
                continue
            with self.subTest(config_file=config):
                cmd = get_launch_command(config_file=config) + [self.test_file_path]
@ -196,6 +197,8 @@ class ClusterConfigTester(unittest.TestCase):
    Test case for verifying the config dataclasses work
    """

    test_config_path = Path("tests/test_configs")

    def test_base_config(self):
        # Tests that all the dataclasses can be initialized
        config = BaseConfig(
@ -257,6 +260,8 @@ class ClusterConfigTester(unittest.TestCase):
        assert config.ec2_instance_type == "MY_TYPE"
        assert config.iam_role_name == "MY_ROLE"

        config = load_config_from_file(str(self.test_config_path / "0_30_0_sagemaker.yaml"))


class TpuConfigTester(unittest.TestCase):
    """
8
tests/test_configs/0_30_0_sagemaker.yaml
Normal file
@ -0,0 +1,8 @@
compute_environment: AMAZON_SAGEMAKER
debug: false
distributed_type: NO
mixed_precision: fp16
debug: false
use_cpu: false
ec2_instance_type: MY_TYPE
iam_role_name: MY_ROLE
@ -30,6 +30,7 @@ from accelerate.test_utils.testing import (
    require_huggingface_suite,
    require_multi_gpu,
    require_pippy,
    require_schedulefree,
    require_trackers,
    run_command,
    slow,
@ -47,6 +48,7 @@ EXCLUDE_EXAMPLES = [
    "local_sgd.py",
    "multi_process_metrics.py",
    "memory.py",
    "schedule_free.py",
    "automatic_gradient_accumulation.py",
    "fsdp_with_peak_mem_tracking.py",
    "deepspeed_with_config_support.py",
@ -216,6 +218,11 @@ class FeatureExamplesTests(TempDirTestCase):
        testargs = ["examples/by_feature/multi_process_metrics.py"]
        run_command(self.launch_args + testargs)

    @require_schedulefree
    def test_schedulefree(self):
        testargs = ["examples/by_feature/schedule_free.py"]
        run_command(self.launch_args + testargs)

    @require_trackers
    @mock.patch.dict(os.environ, {"WANDB_MODE": "offline", "DVCLIVE_TEST": "true"})
    def test_tracking(self):
@ -240,20 +247,30 @@ class FeatureExamplesTests(TempDirTestCase):
        testargs = ["examples/by_feature/early_stopping.py"]
        run_command(self.launch_args + testargs)

    @require_multi_gpu
    def test_distributed_inference_examples_stable_diffusion(self):
        testargs = ["examples/inference/distributed/stable_diffusion.py"]
        run_command(self.launch_args + testargs)

    @require_multi_gpu
    def test_distributed_inference_examples_phi2(self):
        testargs = ["examples/inference/distributed/phi2.py"]
        run_command(self.launch_args + testargs)

    @require_pippy
    @require_multi_gpu
    def test_pippy_examples_bert(self):
        testargs = ["examples/inference/bert.py"]
        testargs = ["examples/inference/pippy/bert.py"]
        run_command(self.launch_args + testargs)

    @require_pippy
    @require_multi_gpu
    def test_pippy_examples_gpt2(self):
        testargs = ["examples/inference/gpt2.py"]
        testargs = ["examples/inference/pippy/gpt2.py"]
        run_command(self.launch_args + testargs)

    @require_pippy
    @require_multi_gpu
    def test_pippy_examples_t5(self):
        testargs = ["examples/inference/t5.py"]
        testargs = ["examples/inference/pippy/t5.py"]
        run_command(self.launch_args + testargs)
@ -28,7 +28,10 @@ from accelerate.hooks import (
    remove_hook_from_module,
    remove_hook_from_submodules,
)
from accelerate.test_utils import require_multi_gpu
from accelerate.test_utils import require_multi_device, torch_device


torch_device = f"{torch_device}:0" if torch_device != "cpu" else "cpu"


class ModelForTest(nn.Module):
@ -150,7 +153,7 @@ class HooksModelTester(unittest.TestCase):
        output1 = test_model(x)
        assert not output1.requires_grad

    @require_multi_gpu
    @require_multi_device
    def test_align_devices_as_model_parallelism(self):
        model = ModelForTest()
        # Everything is on CPU
@ -175,7 +178,7 @@ class HooksModelTester(unittest.TestCase):

        # We can add a general hook to put back output on same device as input.
        add_hook_to_module(model, AlignDevicesHook(io_same_device=True))
        x = torch.randn(2, 3).to(0)
        x = torch.randn(2, 3).to(torch_device)
        output = model(x)
        assert output.device == torch.device(0)

@ -188,7 +191,7 @@ class HooksModelTester(unittest.TestCase):
        assert model.linear2.weight.device == torch.device("cpu")

        # This will move each submodule on different devices
        hook_kwargs = {"execution_device": 0 if torch.cuda.is_available() else "cpu", "offload": True}
        hook_kwargs = {"execution_device": torch_device, "offload": True}

        add_hook_to_module(model.linear1, AlignDevicesHook(**hook_kwargs))
        add_hook_to_module(model.batchnorm, AlignDevicesHook(**hook_kwargs))
@ -216,7 +219,7 @@ class HooksModelTester(unittest.TestCase):

        # Now test with buffers included in the offload
        hook_kwargs = {
            "execution_device": 0 if torch.cuda.is_available() else "cpu",
            "execution_device": torch_device,
            "offload": True,
            "offload_buffers": True,
        }
@ -252,7 +255,7 @@ class HooksModelTester(unittest.TestCase):
        assert model.linear2.weight.device == torch.device("cpu")

        # This will move each submodule on different devices
        execution_device = 0 if torch.cuda.is_available() else "cpu"
        execution_device = torch_device
        attach_align_device_hook(model, execution_device=execution_device, offload=True)

        # Parameters have been offloaded, so on the meta device
@ -301,7 +304,7 @@ class HooksModelTester(unittest.TestCase):
        assert model.linear2.weight.device == torch.device("cpu")

        # This will move each submodule on different devices
        execution_device = 0 if torch.cuda.is_available() else "cpu"
        execution_device = torch_device
        attach_align_device_hook(
            model, execution_device=execution_device, offload=True, weights_map=model.state_dict()
        )
91
tests/test_logging.py
Normal file
@ -0,0 +1,91 @@
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import inspect
import logging
import os

import pytest

from accelerate import Accelerator
from accelerate.logging import get_logger


def current_lineno() -> int:
    # A simple helper that returns the lineno of its call-site.
    caller_frame = inspect.currentframe().f_back
    caller_info = inspect.getframeinfo(caller_frame)
    return caller_info.lineno


class CustomLogger(logging.LoggerAdapter):
    # Mocks a user-defined custom logger wrapper that sets `stacklevel=3`.
    def log(self, level, msg, *args, **kwargs):
        # E.g. the user wants to modify `stacklevel`, `accelerate.logging`
        # should respect the user's `stacklevel`. For the specific value
        # of `3`, calling `CustomLogger.log()`, etc., should log that callsite,
        # rather than the callsite of the following `self.logger.log()`.
        kwargs["stacklevel"] = 3
        self.logger.log(level, msg, *args, **kwargs)


@pytest.fixture(scope="module")
def accelerator():
    return Accelerator()


@pytest.mark.usefixtures("accelerator")
def test_log_stack(caplog):
    logger = get_logger(__name__)
    logging.basicConfig(
        format="%(filename)s:%(name)s:%(lineno)s:%(funcName)s - %(message)s",
        datefmt="%m/%d %H:%M:%S",
    )

    message = "Test"
    lineno = current_lineno() + 1  # the next line is the actual callsite
    logger.warning(message)

    assert len(caplog.records) == 1
    rec = caplog.records[0]
    assert rec.levelname == logging.getLevelName(logging.WARNING)
    assert rec.filename == os.path.basename(__file__)
    assert rec.name == __name__
    assert rec.lineno == lineno
    assert rec.funcName == test_log_stack.__name__
    assert rec.message == message


@pytest.mark.usefixtures("accelerator")
def test_custom_stacklevel(caplog):
    wrapped_logger = get_logger(__name__)
    logging.basicConfig(
        format="%(filename)s:%(name)s:%(lineno)s:%(funcName)s - %(message)s",
        datefmt="%m/%d %H:%M:%S",
    )
    logger = CustomLogger(wrapped_logger, {})

    message = "Test"
    lineno = current_lineno() + 1  # the next line is the actual callsite
    logger.warning(message)

    # `CustomLogger.log` set custom `stacklevel=3`, so `logger.warning` should
    # log its callsite (rather than those of the `wrapped_logger`).
    assert len(caplog.records) == 1
    rec = caplog.records[0]
    assert rec.levelname == logging.getLevelName(logging.WARNING)
    assert rec.filename == os.path.basename(__file__)
    assert rec.name == __name__
    assert rec.lineno == lineno
    assert rec.funcName == test_custom_stacklevel.__name__
    assert rec.message == message
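Outside the test, the logger under test is used like the sketch below; the main_process_only keyword on the call is an assumption about the adapter's per-call switch rather than something shown in this diff.

from accelerate.logging import get_logger

logger = get_logger(__name__)
# Intended to be logged once, by the main process only.
logger.warning("Weights were partially offloaded", main_process_only=True)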
@ -31,6 +31,7 @@ from accelerate.test_utils import (
    require_multi_gpu,
    require_non_torch_xla,
    require_pippy,
    require_torchvision,
)
from accelerate.utils import patch_environment

@ -76,6 +77,7 @@ class MultiDeviceTester(unittest.TestCase):

    @require_multi_gpu
    @require_pippy
    @require_torchvision
    @require_huggingface_suite
    def test_pippy(self):
        """
@ -29,9 +29,11 @@ from accelerate.state import PartialState
from accelerate.test_utils.testing import (
    require_cuda,
    require_huggingface_suite,
    require_non_cpu,
    require_non_torch_xla,
    require_torch_min_version,
    require_tpu,
    torch_device,
)
from accelerate.test_utils.training import RegressionModel
from accelerate.utils import (
@ -51,6 +53,7 @@ from accelerate.utils import (
    recursively_apply,
    save,
    send_to_device,
    tqdm,
)
from accelerate.utils.operations import is_namedtuple

@ -70,7 +73,7 @@ class UtilsTester(unittest.TestCase):

    def test_send_to_device(self):
        tensor = torch.randn(5, 2)
        device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
        device = torch.device(f"{torch_device}:0")

        result1 = send_to_device(tensor, device)
        assert torch.equal(result1.cpu(), tensor)
@ -178,11 +181,11 @@ class UtilsTester(unittest.TestCase):
        model = extract_model_from_parallel(model, keep_fp32_wrapper=False)
        _ = pickle.dumps(model)

    @require_cuda
    @require_non_cpu
    def test_can_undo_fp16_conversion(self):
        model = RegressionModel()
        model._original_forward = model.forward
        model.forward = torch.cuda.amp.autocast(dtype=torch.float16)(model.forward)
        model.forward = torch.autocast(device_type=torch_device, dtype=torch.float16)(model.forward)
        model.forward = convert_outputs_to_fp32(model.forward)
        model = extract_model_from_parallel(model, keep_fp32_wrapper=False)
        _ = pickle.dumps(model)
@ -401,3 +404,9 @@ class UtilsTester(unittest.TestCase):
        with self.assertLogs("accelerate.utils.environment", level="WARNING"):
            valid_env_items = convert_dict_to_env_variables(env)
        assert valid_env_items == ["ACCELERATE_DEBUG_MODE=1\n", "OTHER_ENV=2\n"]

    def test_tqdm_deprecation(self):
        with pytest.warns(FutureWarning) as cm:
            tqdm(True, range(3), disable=True)
        assert "Passing `True` as the first argument to" in cm.pop().message.args[0]
        tqdm(range(3), main_process_only=True, disable=True)