Update accelerator

Use partial state for fsdp saving/loading
Fix small edge case in get_module_leaves (#2774 )
2025-11-15 06:39:30 +08:00 · 2024-05-14 12:00:35 -04:00 · 2024-05-14 11:59:07 -04:00 · 2024-05-14 11:52:51 +02:00 · 2024-05-13 08:24:01 -04:00 · 2024-05-13 08:23:38 -04:00
78 changed files with 2766 additions and 373 deletions
--- a/.github/workflows/build-docker-images-release.yml
+++ b/.github/workflows/build-docker-images-release.yml
@ -58,3 +58,24 @@ jobs:
          file: docker/accelerate-gpu/Dockerfile
          push: true
          tags: huggingface/accelerate:gpu-release-${{needs.get-version.outputs.version}}
+
+  # Release image: builds the DeepSpeed-enabled GPU Dockerfile and pushes it
+  # under a tag derived from the version output of the `get-version` job.
+  version-cuda-deepspeed:
+    name: 'Latest Accelerate GPU DeepSpeed [version]'
+    needs: get-version
+    runs-on:
+      - self-hosted
+      - single-gpu
+      - nvidia-gpu
+      - t4
+      - ci
+    steps:
+      - uses: docker/setup-buildx-action@v2
+        name: Set up Docker Buildx
+
+      - uses: docker/login-action@v2
+        name: Login to DockerHub
+        with:
+          password: ${{ secrets.DOCKERHUB_PASSWORD }}
+          username: ${{ secrets.DOCKERHUB_USERNAME }}
+
+      - uses: docker/build-push-action@v4
+        name: Build and Push GPU
+        with:
+          tags: huggingface/accelerate:gpu-deepspeed-release-${{ needs.get-version.outputs.version }}
+          push: true
+          file: docker/accelerate-gpu-deepspeed/Dockerfile
+
--- a/.github/workflows/build_docker_images.yml
+++ b/.github/workflows/build_docker_images.yml
@ -57,4 +57,29 @@ jobs:
          push: true
          tags: |
            huggingface/accelerate:gpu-nightly
-            huggingface/accelerate:gpu-nightly-${{ env.date }}
+            huggingface/accelerate:gpu-nightly-${{ env.date }}
+
+  # Nightly image: builds the DeepSpeed-enabled GPU Dockerfile and pushes both
+  # a rolling `gpu-deepspeed-nightly` tag and a date-stamped one.
+  latest-cuda-deepspeed:
+    name: "Latest Accelerate GPU DeepSpeed [dev]"
+    runs-on: [self-hosted, nvidia-gpu, t4, ci]
+    steps:
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v2
+      - name: Login to DockerHub
+        uses: docker/login-action@v2
+        with:
+          username: ${{ secrets.DOCKERHUB_USERNAME }}
+          password: ${{ secrets.DOCKERHUB_PASSWORD }}
+      # Writes `date=YYYY-MM-DD` to the job environment file so the tag list
+      # below can read it as `env.date`.
+      - name: Get current date
+        id: date
+        run: |
+          echo "date=$(date '+%Y-%m-%d')" >> "$GITHUB_ENV"
+      - name: Build and Push GPU
+        uses: docker/build-push-action@v4
+        with:
+          file: docker/accelerate-gpu-deepspeed/Dockerfile
+          push: true
+          tags: |
+            huggingface/accelerate:gpu-deepspeed-nightly
+            huggingface/accelerate:gpu-deepspeed-nightly-${{ env.date }}
+
--- a/.github/workflows/build_documentation.yml
+++ b/.github/workflows/build_documentation.yml
@ -13,5 +13,6 @@ jobs:
    with:
      commit_sha: ${{ github.sha }}
      package: accelerate
+      custom_container: huggingface/transformers-doc-builder
    secrets:
      hf_token: ${{ secrets.HF_DOC_BUILD_PUSH }}
--- a/.github/workflows/build_pr_documentation.yml
+++ b/.github/workflows/build_pr_documentation.yml
@ -14,3 +14,4 @@ jobs:
      commit_sha: ${{ github.event.pull_request.head.sha }}
      pr_number: ${{ github.event.number }}
      package: accelerate
+      custom_container: huggingface/transformers-doc-builder
--- a/.github/workflows/nightly.yml
+++ b/.github/workflows/nightly.yml
@ -12,13 +12,13 @@ env:


 jobs:
-  run_all_tests_single_gpu:
+  run_core_tests_single_gpu:
    runs-on: [self-hosted, single-gpu, nvidia-gpu, t4, ci]
    env:
      CUDA_VISIBLE_DEVICES: "0"
      TEST_TYPE: "single_gpu"
    container:
-      image: huggingface/accelerate-gpu:latest
+      image: huggingface/accelerate:gpu-nightly
      options: --gpus all --shm-size "16gb"
    defaults:
      run:
@ -33,6 +33,11 @@ jobs:
          pip install -e . --no-deps
          pip install pytest-reportlog tabulate

+      - name: Show installed libraries
+        run: |
+          source activate accelerate;
+          pip freeze
+
      - name: Run test on GPUs
        working-directory: accelerate
        run: |
@ -54,13 +59,67 @@ jobs:
          pip install slack_sdk tabulate
          python utils/log_reports.py >> $GITHUB_STEP_SUMMARY

-  run_all_tests_multi_gpu:
+  # Nightly DeepSpeed run on the single-GPU runner, inside the
+  # `gpu-deepspeed-nightly` image. Later steps use `if: always()` so each
+  # suite still runs (and the report is still generated) after a failure.
+  run_deepspeed_tests_single_gpu:
+    runs-on: [self-hosted, single-gpu, nvidia-gpu, t4, ci]
+    env:
+      CUDA_VISIBLE_DEVICES: "0"
+      TEST_TYPE: "single_gpu_deepspeed"
+    container:
+      image: huggingface/accelerate:gpu-deepspeed-nightly
+      options: --gpus all --shm-size "16gb"
+    defaults:
+      run:
+        shell: bash
+    steps:
+      # Check out the exact commit under test and install it without
+      # touching the dependencies already present in the image.
+      - name: Update clone & pip install
+        run: |
+          source activate accelerate
+          git clone https://github.com/huggingface/accelerate;
+          cd accelerate;
+          git checkout ${{ github.sha }};
+          pip install -e . --no-deps
+          pip install pytest-reportlog tabulate
+
+      - name: Show installed libraries
+        run: |
+          source activate accelerate;
+          pip freeze
+
+      - name: Run DeepSpeed tests
+        working-directory: accelerate
+        run: |
+          source activate accelerate
+          make test_deepspeed
+
+      - name: Run Integration tests on GPUs
+        working-directory: accelerate
+        if: always()
+        run: |
+          source activate accelerate
+          make test_integrations
+
+      - name: Run examples on GPUs
+        working-directory: accelerate
+        if: always()
+        run: |
+          source activate accelerate
+          pip uninstall comet_ml -y
+          make test_examples
+
+      - name: Generate Report
+        working-directory: accelerate
+        if: always()
+        run: |
+          pip install slack_sdk tabulate
+          python utils/log_reports.py >> $GITHUB_STEP_SUMMARY
+
+  run_core_tests_multi_gpu:
    runs-on: [self-hosted, multi-gpu, nvidia-gpu, t4, ci]
    env:
      CUDA_VISIBLE_DEVICES: "0,1"
      TEST_TYPE: "multi_gpu"
    container:
-      image: huggingface/accelerate-gpu:latest
+      image: huggingface/accelerate:gpu-nightly
      options: --gpus all --shm-size "16gb"
    defaults:
      run:
@ -75,6 +134,11 @@ jobs:
          pip install -e . --no-deps
          pip install pytest-reportlog tabulate

+      - name: Show installed libraries
+        run: |
+          source activate accelerate;
+          pip freeze
+
      - name: Run core and big modeling tests on GPUs
        working-directory: accelerate
        run: |
@ -105,6 +169,60 @@ jobs:
          pip install slack_sdk tabulate
          python utils/log_reports.py >> $GITHUB_STEP_SUMMARY

+  # Nightly DeepSpeed run on the multi-GPU runner (devices 0 and 1), inside
+  # the `gpu-deepspeed-nightly` image.
+  run_deepspeed_tests_multi_gpu:
+    runs-on:
+      - self-hosted
+      - multi-gpu
+      - nvidia-gpu
+      - t4
+      - ci
+    container:
+      options: --gpus all --shm-size "16gb"
+      image: huggingface/accelerate:gpu-deepspeed-nightly
+    env:
+      TEST_TYPE: 'multi_gpu_deepspeed'
+      CUDA_VISIBLE_DEVICES: '0,1'
+    defaults:
+      run:
+        shell: bash
+    steps:
+      # Install the commit under test on top of the image's existing deps.
+      - name: Update clone
+        run: |
+          source activate accelerate
+          git clone https://github.com/huggingface/accelerate;
+          cd accelerate;
+          git checkout ${{ github.sha }};
+          pip install -e . --no-deps
+          pip install pytest-reportlog tabulate
+
+      # Record the resolved package versions in the job log.
+      - name: Show installed libraries
+        run: |
+          source activate accelerate;
+          pip freeze
+
+      - name: Run DeepSpeed tests
+        run: |
+          source activate accelerate
+          make test_deepspeed
+        working-directory: accelerate
+
+      # The remaining steps run regardless of earlier failures.
+      - name: Run Integration tests on GPUs
+        if: always()
+        run: |
+          source activate accelerate
+          make test_integrations
+        working-directory: accelerate
+
+      - name: Run examples on GPUs
+        if: always()
+        run: |
+          source activate accelerate
+          pip uninstall comet_ml -y
+          make test_examples
+        working-directory: accelerate
+
+      - name: Generate Report
+        if: always()
+        run: |
+          pip install slack_sdk tabulate
+          python utils/log_reports.py >> $GITHUB_STEP_SUMMARY
+        working-directory: accelerate
+
  
  run-integration-tests:
    if: always()
--- a/.github/workflows/run_merge_tests.yml
+++ b/.github/workflows/run_merge_tests.yml
@ -9,7 +9,7 @@ env:
  IS_GITHUB_CI: "1"

 jobs:
-  run_all_tests_single_gpu:
+  run_core_tests_single_gpu:
    runs-on: [self-hosted, single-gpu, nvidia-gpu, t4, ci]
    env:
      CUDA_VISIBLE_DEVICES: "0"
@ -29,6 +29,11 @@ jobs:
          pip install -e .[testing,test_trackers] -U;
          pip install pytest-reportlog tabulate  ;

+      - name: Show installed libraries
+        run: |
+          source activate accelerate;
+          pip freeze
+
      - name: Run CLI tests (use make cli)
        working-directory: accelerate
        run: |
@ -56,7 +61,46 @@ jobs:
          pip install tabulate;
          python utils/log_reports.py >> $GITHUB_STEP_SUMMARY

-  run_all_tests_multi_gpu:
+  # Post-merge DeepSpeed run on the single-GPU runner, inside the
+  # `gpu-deepspeed-nightly` image.
+  run_deepspeed_tests_single_gpu:
+    runs-on: [self-hosted, single-gpu, nvidia-gpu, t4, ci]
+    env:
+      CUDA_VISIBLE_DEVICES: "0"
+    container:
+      image: huggingface/accelerate:gpu-deepspeed-nightly
+      options: --gpus all --shm-size "16gb"
+    defaults:
+      run:
+        shell: bash
+    steps:
+      # Install the merged commit together with its testing extras.
+      - name: Install accelerate
+        run: |
+          source activate accelerate;
+          git clone https://github.com/huggingface/accelerate;
+          cd accelerate;
+          git checkout ${{ github.sha }};
+          pip install -e .[testing,test_trackers] -U;
+          pip install pytest-reportlog tabulate;
+
+      - name: Show installed libraries
+        run: |
+          source activate accelerate;
+          pip freeze
+
+      - name: Run test on GPUs
+        working-directory: accelerate
+        if: always()
+        run: |
+          source activate accelerate;
+          make test_deepspeed
+
+      # Always publish the summary, even when the tests above failed.
+      - name: Generate Report
+        working-directory: accelerate
+        if: always()
+        run: |
+          pip install tabulate;
+          python utils/log_reports.py >> $GITHUB_STEP_SUMMARY
+
+  run_core_tests_multi_gpu:
    runs-on: [self-hosted, multi-gpu, nvidia-gpu, t4, ci]
    env:
      CUDA_VISIBLE_DEVICES: 0,1
@ -76,6 +120,11 @@ jobs:
          pip install -e .[testing,test_trackers] -U;
          pip install pytest-reportlog tabulate

+      - name: Show installed libraries
+        run: |
+          source activate accelerate;
+          pip freeze
+
      - name: Run test on GPUs
        working-directory: accelerate
        run: |
@ -96,3 +145,40 @@ jobs:
        run: |
          source activate accelerate;
          python utils/log_reports.py >> $GITHUB_STEP_SUMMARY
+
+  # Post-merge DeepSpeed run on the multi-GPU runner, inside the
+  # `gpu-deepspeed-nightly` image.
+  run_deepspeed_tests_multi_gpu:
+    runs-on: [self-hosted, multi-gpu, nvidia-gpu, t4, ci]
+    env:
+      # Pin the visible devices like the other GPU jobs in this workflow do.
+      # NOTE(review): confirm the runner exposes devices 0 and 1.
+      CUDA_VISIBLE_DEVICES: "0,1"
+    container:
+      image: huggingface/accelerate:gpu-deepspeed-nightly
+      options: --gpus all --shm-size "16gb"
+    defaults:
+      run:
+        shell: bash
+    steps:
+      # Install the merged commit together with its testing extras.
+      - name: Install accelerate
+        run: |
+          source activate accelerate;
+          git clone https://github.com/huggingface/accelerate;
+          cd accelerate;
+          git checkout ${{ github.sha }};
+          pip install -e .[testing,test_trackers] -U;
+          pip install pytest-reportlog tabulate;
+
+      - name: Show installed libraries
+        run: |
+          source activate accelerate;
+          pip freeze
+
+      - name: Run test on GPUs
+        working-directory: accelerate
+        if: always()
+        run: |
+          source activate accelerate;
+          make test_deepspeed
+
+      # Always publish the summary, even when the tests above failed.
+      - name: Generate Report
+        working-directory: accelerate
+        if: always()
+        run: |
+          pip install tabulate;
+          python utils/log_reports.py >> $GITHUB_STEP_SUMMARY
--- a/.github/workflows/self_hosted_integration_tests.yml
+++ b/.github/workflows/self_hosted_integration_tests.yml
@ -23,7 +23,7 @@ defaults:
 jobs:
  run-trainer-tests:
    container:
-      image: huggingface/accelerate:gpu-nightly
+      image: huggingface/accelerate:gpu-deepspeed-nightly
      options: --gpus all --shm-size "16gb"
    runs-on: [self-hosted, multi-gpu, nvidia-gpu, t4, ci]
    strategy:
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@ -51,6 +51,10 @@ jobs:
        if [[ ${{ matrix.test-kind }} = test_rest ]]; then pip uninstall comet_ml -y; fi
        if [[ ${{ matrix.test-kind }} = minimum ]]; then pip install torch==1.10.0; fi
        pip install pytest-reportlog tabulate setuptools
+
+    - name: Show installed libraries
+      run: |
+        pip freeze
    
    - name: Run Tests
      env: 
--- a/docker/README.md
+++ b/docker/README.md
@ -29,9 +29,10 @@ huggingface/accelerate:{accelerator}-{nightly,release}
 ```

 `accelerator` in this instance is one of many applical pre-configured backend supports:
-* `gpu`: Comes compiled off of the `nvidia/cuda` image and includes everything such as `deepspeed`, `bitsandbytes`, etc. 
-* `cpu`: Comes compiled off of `python:3.8-slim` and is designed for non-CUDA based workloads.
+* `gpu`: Comes compiled off of the `nvidia/cuda` image and includes core parts like `bitsandbytes`. Runs off python 3.9.
+* `cpu`: Comes compiled off of `python:3.9-slim` and is designed for non-CUDA based workloads.
 * More to come soon
+* `gpu-deepspeed`: Comes compiled off of the `nvidia/cuda` image and includes core parts like `bitsandbytes` as well as the latest `deepspeed` version. Runs off python 3.10.

 ## Nightlies vs Releases

--- a/docker/accelerate-gpu-deepspeed/Dockerfile
+++ b/docker/accelerate-gpu-deepspeed/Dockerfile
@ -0,0 +1,46 @@
+# Builds GPU docker image of PyTorch specifically
+# Uses multi-staged approach to reduce size
+# Stage 1
+# Use base conda image to reduce time
+FROM continuumio/miniconda3:latest AS compile-image
+# Specify py version
+# Note: DeepSpeed beyond v0.12.6 requires py 3.10
+ENV PYTHON_VERSION=3.10
+# Install apt libs, then drop the apt package lists to keep the layer small
+RUN apt-get update && \
+    apt-get install -y curl git wget && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/*
+
+# Create our conda env
+RUN conda create --name accelerate python=${PYTHON_VERSION} ipython jupyter pip
+# We don't install pytorch here yet since CUDA isn't available
+# instead we use the direct torch wheel
+# Put the env's bin directory first on PATH so `python3`/`pip` resolve to it
+ENV PATH=/opt/conda/envs/accelerate/bin:$PATH
+# Activate our bash shell
+RUN chsh -s /bin/bash
+SHELL ["/bin/bash", "-c"]
+# Activate the conda env, install mpi4py, and install torch + accelerate
+RUN source activate accelerate && conda install -c conda-forge mpi4py
+# NOTE(review): the cu117 wheel index below does not match the CUDA 12.1 base
+# image used in stage 2 -- confirm which torch build is intended.
+RUN source activate accelerate && \
+    python3 -m pip install --no-cache-dir \
+    git+https://github.com/huggingface/accelerate#egg=accelerate[testing,test_trackers,deepspeed] \
+    --extra-index-url https://download.pytorch.org/whl/cu117
+
+RUN python3 -m pip install --no-cache-dir bitsandbytes
+
+# Stage 2
+# Copy the fully built conda tree onto the CUDA base image
+FROM nvidia/cuda:12.1.0-cudnn8-devel-ubuntu20.04 AS build-image
+COPY --from=compile-image /opt/conda /opt/conda
+ENV PATH=/opt/conda/bin:$PATH
+
+# Install apt libs
+RUN apt-get update && \
+    apt-get install -y curl git wget && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/*
+
+# Register activation of the conda env in the user's profile
+RUN echo "source activate accelerate" >> ~/.profile
+
+# Start a bash shell by default
+CMD ["/bin/bash"]
--- a/docs/source/_toctree.yml
+++ b/docs/source/_toctree.yml
@ -78,6 +78,8 @@
    title: Executing and deferring jobs
  - local: concept_guides/gradient_synchronization
    title: Gradient synchronization
+  - local: concept_guides/fsdp_and_deepspeed
+    title: FSDP vs DeepSpeed
  - local: concept_guides/low_precision_training
    title: How training in low-precision environments is possible (FP8)
  - local: concept_guides/training_tpu
--- a/docs/source/concept_guides/fsdp_and_deepspeed.md
+++ b/docs/source/concept_guides/fsdp_and_deepspeed.md
@ -0,0 +1,192 @@
+<!--Copyright 2024 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+-->
+
+# Moving between FSDP And DeepSpeed
+
+🤗 Accelerate offers flexibility of training frameworks, by integrating two extremely powerful tools for distributed training, namely [PyTorch FSDP](../usage_guides/fsdp.md) and [Microsoft DeepSpeed](../usage_guides/deepspeed.md). The aim of this tutorial is to draw parallels, as well as to outline potential differences, to empower the user to switch seamlessly between these two frameworks.
+
+<Tip>
+
+  To switch between the frameworks, we recommend launching code with 🤗 `accelerate launch`, passing in the correct config file with `--config_file`, or passing in the respective arguments directly for [FSDP and DeepSpeed](../package_reference/cli#accelerate-launch).
+
+  Example 🤗 Accelerate configurations can be found here for [DeepSpeed](../usage_guides/deepspeed#accelerate-deepspeed-plugin) and [FSDP](../usage_guides/fsdp#how-it-works-out-of-the-box), or in the [example zoo under "Launch Configurations"](../usage_guides/explore)
+ 
+</Tip>
+
+<Tip warning={true}>
+
+This tutorial is for single-node, multi-GPU, scenarios only.
+
+</Tip>
+
+## Configuring Functionalities
+
+Model tensors are split into different GPUs in an attempt to scale up model sizes; this is termed *sharding* in FSDP, and *partitioning* in DeepSpeed. FSDP sharding and DeepSpeed ZeRO (partitioning) stages are configured by `--fsdp_sharding_strategy`, and `--zero_stage`, respectively.  In particular, FSDP `FULL_SHARD` maps to DeepSpeed ZeRO stage `3`; see this [comprehensive mapping between FSDP sharding and DeepSpeed ZeRO settings](../usage_guides/fsdp#mapping-between-fsdp-sharding-strategies-and-deepspeed-zero-stages). The below table summarizes and groups similar settings:
+
+Group | Framework | Configuration | Example | Restrictions (if any)
+--|--|--|--|--
+sharding / partitioning | FSDP<br>DeepSpeed | `--fsdp_sharding_strategy`<br>`--zero_stage` | `1` (`FULL_SHARD`) <br>`3` | 
+offload | FSDP<br>DeepSpeed | `--fsdp_offload_params`<br>`--offload_param_device`<br>`--offload_optimizer_device` | `true`<br>`cpu`<br>`cpu` | all or nothing <br><br> 
+model loading | FSDP<br>DeepSpeed | <span style="white-space:nowrap;">`--fsdp_cpu_ram_efficient_loading`</span><br>`--zero3_init_flag` | `true`<br>`true` | <br>only ZeRO 3
+efficient checkpointing | FSDP<br>DeepSpeed | `--fsdp_state_dict_type`<br>`--zero3_save_16bit_model` |  `SHARDED_STATE_DICT`<br>`true` |  <br>only ZeRO 3
+weights prefetching | FSDP<br><br>DeepSpeed | `--fsdp_forward_prefetch`<br>`--fsdp_backward_prefetch`<br>None | `true`<br>`BACKWARD_PRE` | <br><br>
+model | FSDP<br><br>DeepSpeed |  `--fsdp_auto_wrap_policy`<br><span style="white-space:nowrap;">`--fsdp_transformer_layer_cls_to_wrap`</span><br>None | `TRANSFORMER_BASED_WRAP`<br><Layer Class> |<br>Usually not needed <br>Transparent to user.
+parameters summoning | FSDP<br>DeepSpeed | `--fsdp_use_orig_params`<br>None | `true` | required for `torch.compile`<br>Transparent to user
+parameters syncing | FSDP<br>DeepSpeed | `--fsdp_sync_module_states`<br>None | `true` | 
+training | FSDP<br>DeepSpeed | None<br>`--gradient_accumulation_steps`<br>`--gradient_clipping` | <br>`auto`<br>`auto` | Transparent to user
+
+For detailed descriptions of the above, refer to [🤗 `Accelerate` launch documentation](../package_reference/cli#accelerate-launch).
+
+<Tip>
+
+    To access other DeepSpeed configurations, such as mixed precision settings, 
+    you need to pass in a `--deepspeed_config_file`, see the [documentation](../usage_guides/deepspeed#deepspeed-config-file).  
+
+    DeepSpeed can also be configured via [`DeepSpeedPlugin`], e.g., `DeepSpeedPlugin.zero_stage` is the equivalent of `--zero_stage`, and `DeepSpeedPlugin.hf_ds_config` can be used to pass `--deepspeed_config_file`.
+
+</Tip>
+
+<Tip>
+
+    FSDP can also be configured via [`FullyShardedDataParallelPlugin`], e.g., `FullyShardedDataParallelPlugin.sharding_strategy` is the equivalent of `--fsdp_sharding_strategy`.
+    
+</Tip>
+
+### Checkpointing
+
+Do note that FSDP can be configured via `--fsdp_state_dict_type` to save either full or sharded checkpoints.
+
+<Tip>
+
+    For DeepSpeed Zero3, one could pass a `--zero3_save_16bit_model true`, which conveniently consolidates the model to a single rank and saves; this is the FSDP equivalent of `fsdp_state_dict_type: FULL_STATE_DICT`. 
+
+</Tip>
+
+<Tip warning={true}>
+
+    For large models, consolidating the model to a single rank can be very slow.
+
+</Tip>
+
+<Tip>
+
+    For quicker checkpointing, for FSDP use `fsdp_state_dict_type: SHARDED_STATE_DICT`, and for DeepSpeed Zero3 [use the `zero_to_fp32.py` script to post-convert sharded checkpoints](https://www.deepspeed.ai/tutorials/zero/#extracting-weights).
+
+
+</Tip>
+
+### Offloading
+
+FSDP only allows *all-or-nothing* offload (i.e., either offload parameters, gradients, and optimizer, or keep them all in GPU), but DeepSpeed can offload parameters and optimizer differently. Furthermore, DeepSpeed also supports [offloading to NVME](https://www.deepspeed.ai/docs/config-json/#parameter-offloading).
+
+### Prefetching
+
+FSDP allows two prefetching configurations `--fsdp_forward_prefetch` and `--fsdp_backward_prefetch` to improve overlap of comms / computation at a cost of extra memory, see [FSDP documentation](https://pytorch.org/docs/stable/fsdp.html). 
+For DeepSpeed, the prefetching will be turned on when needed, and it turns on depending on certain hyper-params like `stage3_param_persistence_threshold`, `stage3_max_reuse_distance`, etc, [that can be configured for Zero3](https://www.deepspeed.ai/docs/config-json/#parameter-offloading); 🤗 `accelerate` may set these hyper-params automatically if you don't set those explicitly in the deepspeed config file.
+
+<Tip>
+
+    For FSDP set `fsdp_backward_prefetch: BACKWARD_PRE` for improved throughputs if memory allows.
+
+</Tip>
+
+### Model Loading
+
+While FSDP requires an explicit `--fsdp_cpu_ram_efficient_loading true` to activate efficient model loading, 🤗 `transformers` will activate a similar feature whenever DeepSpeed Zero3 is used.
+
+<Tip>
+
+    For FSDP, whenever setting `--fsdp_cpu_ram_efficient_loading true`, 🤗 `accelerate` will automatically set `sync_module_states` to true. 
+    For RAM efficient loading, the weights will be loaded only in a single rank, and thus `sync_module_states` is required to broadcast weights to other ranks.
+
+</Tip>
+
+### Model
+
+FSDP requires an explicit `--fsdp_auto_wrap_policy` for the algorithm to decide how to schedule the all-gather and reduce-scatter operations. But for DeepSpeed this is transparent to the user.
+
+<Tip>
+
+    For FSDP, simply set `fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP`. With the latest [`transformers`] versions, we try our best to figure out the suitable `fsdp_transformer_layer_cls_to_wrap` for HF transformers models. However, if you get an error regarding it, please specify this.
+
+</Tip>
+
+### Parameters Summoning
+
+FSDP requires an explicit `--fsdp_use_orig_params` flag if using `torch.compile`, see [the PyTorch documentation](https://pytorch.org/docs/stable/fsdp.html#module-torch.distributed.fsdp). For DeepSpeed this is transparent to the user.
+
+<Tip>
+
+    For FSDP, when using `torch.compile` please set `fsdp_use_orig_params: True`.
+
+</Tip>
+
+
+## Training
+
+DeepSpeed requires explicit `--gradient_accumulation_steps` and `--gradient_clipping` flags. For FSDP this is transparent to the user.
+
+<Tip>
+
+    When using DeepSpeed, set `gradient_accumulation_steps: "auto"` and `gradient_clipping: "auto"` to automatically pick up values set in the [`Accelerator`] or [`TrainingArguments`] (if using `transformers`).
+
+</Tip>
+
+
+## On Differences in Data Precision Handling
+
+To discuss how data precision is handled in both FSDP and DeepSpeed, it is instructive to first give an overview of how model parameters are handled in these frameworks. Before the model / optimizer parameters are distributed across GPUs, parameter preparation is involved to first "flatten" them to a one-dimensional [`torch.Tensor`](https://pytorch.org/docs/stable/tensors.html#torch-tensor). The implementations of FSDP and DeepSpeed vary with respect to the `dtype` in which these "flattened" parameters are stored, and there are ramifications with regard to how [`torch.Optimizer`](https://pytorch.org/docs/stable/optim.html#module-torch.optim) allocates its `dtype`s. The table below outlines the processes for both frameworks; the "Local" column indicates the process occurring at a per-GPU level, therefore any memory overhead from upcasting should be understood to be amortized over the number of GPUs used.
+
+<Tip>
+
+    As a rule of thumb, for stable training with automatic mixed precision, all the trainable parameters have to be in `torch.float32`.
+
+</Tip>
+
+Process | Local | Framework | Details
+--|--|--|--
+Loading, i.e., [`AutoModel.from_pretrained(..., torch_dtype=torch_dtype)`] |  
+Preparation, i.e., creation of "flat params" | ✅ | FSDP<br>DeepSpeed | created in `torch_dtype`.<br> disregards `torch_dtype`, created in `float32`.
+Optimizer initialization | ✅ | FSDP<br>DeepSpeed  | creates parameters in `torch_dtype`<br> creates parameters in `float32`
+Training Step, i.e, forward, backward, reduction | | FSDP<br>DeepSpeed  | follows [`MixedPrecision`](https://pytorch.org/docs/stable/fsdp.html#torch.distributed.fsdp.MixedPrecision)<br> follows `deepspeed_config_file` mixed precision settings.
+Optimizer (Pre-Step) | ✅ | FSDP<br>DeepSpeed | upcasting (if any) to `torch_dtype`<br>upcasted to `float32`
+Optimizer (Actual Step) | ✅ | FSDP<br>DeepSpeed  | occurs in `torch_dtype` <br> occurs in `float32`.
+
+<Tip warning={true}>
+
+    Therefore, when using DeepSpeed with a small number of GPUs, be aware of potentially significant memory overheads due to the upcasting during preparation.
+
+</Tip>
+
+<Tip>
+
+    With FSDP, in the absence of mixed precision, it is possible to operate the [`torch.Optimizer`](https://pytorch.org/docs/stable/optim.html#module-torch.optim) in low precision `torch_dtype`, which may be helpful when using a small number of GPUs.
+
+</Tip>
+
+<Tip warning={true}>
+
+    With mixed precision, FSDP and DeepSpeed will upcast in the model preparation step (cf. table above). But do note that FSDP will then save checkpoints in the upcasted precision; DeepSpeed may still save low precision checkpoints if `--zero3_save_16bit_model` is specified.
+
+</Tip>
+
+
+To clarify the above table, consider the concrete examples below; the optimizer pre-step and actual step are combined for brevity. With FSDP it is possible to operate in the two modes shown below, but DeepSpeed can only operate in one.
+
+Framework | Model Loading (`torch_dtype`) | Mixed Precision | Preparation (Local) | Training | Optimizer (Local)
+--|--|--|--|--|--
+FSDP | bf16 | default (none) | bf16 | bf16 | bf16
+FSDP | bf16 | bf16 | fp32 | bf16 | fp32
+DeepSpeed   | bf16 | bf16 | fp32 | bf16 | fp32
--- a/docs/source/package_reference/cli.md
+++ b/docs/source/package_reference/cli.md
@ -208,6 +208,10 @@ The following arguments are only useful when `use_fsdp` is passed or Fully Shard
 * `--fsdp_transformer_layer_cls_to_wrap` (`str`) -- Transformer layer class name (case-sensitive) to wrap, e.g, `BertLayer`, `GPTJBlock`, `T5Block` ...
 * `--fsdp_backward_prefetch_policy` (`str`) -- FSDP's backward prefetch policy.
 * `--fsdp_state_dict_type` (`str`) -- FSDP's state dict type.
+* `--fsdp_forward_prefetch` (`str`) -- FSDP forward prefetch.
+* `--fsdp_use_orig_params` (`str`) -- If True, allows non-uniform `requires_grad` mixed in a FSDP unit.
+* `--fsdp_cpu_ram_efficient_loading` (`str`) -- If true, only the first process loads the pretrained model checkpoint while all other processes have empty weights. When using this, `--fsdp_sync_module_states` needs to be True.
+* `--fsdp_sync_module_states` (`str`) -- If true, each individually wrapped FSDP unit will broadcast module parameters from rank 0.

 **Megatron-LM Arguments**:

--- a/docs/source/package_reference/deepspeed.md
+++ b/docs/source/package_reference/deepspeed.md
@ -17,12 +17,12 @@ rendered properly in your Markdown viewer.

 [[autodoc]] utils.DeepSpeedPlugin

-[[autodoc]] utils.DummyOptim
+[[autodoc]] utils.deepspeed.DummyOptim

-[[autodoc]] utils.DummyScheduler
+[[autodoc]] utils.deepspeed.DummyScheduler

-[[autodoc]] utils.DeepSpeedEngineWrapper
+[[autodoc]] utils.deepspeed.DeepSpeedEngineWrapper

-[[autodoc]] utils.DeepSpeedOptimizerWrapper
+[[autodoc]] utils.deepspeed.DeepSpeedOptimizerWrapper

-[[autodoc]] utils.DeepSpeedSchedulerWrapper
+[[autodoc]] utils.deepspeed.DeepSpeedSchedulerWrapper
--- a/docs/source/quicktour.md
+++ b/docs/source/quicktour.md
@ -93,6 +93,9 @@ accelerator = Accelerator()
 > [!WARNING]
 > This step is *optional* but it is considered best practice to allow Accelerate to handle device placement. You could also deactivate automatic device placement by passing `device_placement=False` when initializing the [`Accelerator`]. If you want to explicitly place objects on a device with `.to(device)`, make sure you use `accelerator.device` instead. For example, if you create an optimizer before placing a model on `accelerator.device`, training fails on a TPU.

+> [!WARNING]
+> Accelerate does not use non-blocking transfers by default for its automatic device placement, which can result in potentially unwanted CUDA synchronizations.  You can enable non-blocking transfers by passing a [`~utils.dataclasses.DataLoaderConfiguration`] with `non_blocking=True` set as the `dataloader_config` when initializing the [`Accelerator`].  As usual, non-blocking transfers will only work if the dataloader also has `pin_memory=True` set.  Be wary that using non-blocking transfers from GPU to CPU may cause incorrect results if it results in CPU operations being performed on non-ready tensors.
+
 ```py
 device = accelerator.device
 ```
@ -121,7 +124,7 @@ To perform distributed evaluation, pass your validation dataloader to the [`~Acc
 validation_dataloader = accelerator.prepare(validation_dataloader)
 ```

-Each device in your distributed setup only receives a part of the evaluation data, which means you should group your predictions together with the [`~Accelerator.gather_for_metrics`] method. This method requires all tensors to be the same size on each process, so if your tensors have different sizes on each process (for instance when dynamically padding to the maximum length in a batch), you should use the [`~Accelerator.pad_across_processes`] method to pad you tensor to the largest size across processes.
+Each device in your distributed setup only receives a part of the evaluation data, which means you should group your predictions together with the [`~Accelerator.gather_for_metrics`] method. This method requires all tensors to be the same size on each process, so if your tensors have different sizes on each process (for instance when dynamically padding to the maximum length in a batch), you should use the [`~Accelerator.pad_across_processes`] method to pad your tensor to the largest size across processes. Note that the tensors need to be 1D and that we concatenate the tensors along the first dimension.

 ```python
 for inputs, targets in validation_dataloader:
@ -132,6 +135,8 @@ for inputs, targets in validation_dataloader:
    metric.add_batch(all_predictions, all_targets)
 ```

+For more complex cases (e.g. 2D tensors, tensors you don't want to concatenate, or a dict of 3D tensors), you can pass `use_gather_object=True` to `gather_for_metrics`. This will return the list of objects after gathering. Note that using it with GPU tensors is not well supported and is inefficient.
+
 > [!TIP]
 > Data at the end of a dataset may be duplicated so the batch can be equally divided among all workers. The [`~Accelerator.gather_for_metrics`] method automatically removes the duplicated data to calculate a more accurate metric.

--- a/docs/source/usage_guides/deepspeed.md
+++ b/docs/source/usage_guides/deepspeed.md
@ -157,10 +157,18 @@ Currently, `Accelerate` supports following config through the CLI:
 `gradient_accumulation_steps`: Number of training steps to accumulate gradients before averaging and applying them.
 `gradient_clipping`: Enable gradient clipping with value.
 `offload_optimizer_device`: [none] Disable optimizer offloading, [cpu] offload optimizer to CPU, [nvme] offload optimizer to NVMe SSD. Only applicable with ZeRO >= Stage-2.
+`offload_optimizer_nvme_path`: Decides NVMe path to offload optimizer states. If unspecified, will default to 'none'.
 `offload_param_device`: [none] Disable parameter offloading, [cpu] offload parameters to CPU, [nvme] offload parameters to NVMe SSD. Only applicable with ZeRO Stage-3.
+`offload_param_nvme_path`: Decides NVMe path to offload parameters. If unspecified, will default to 'none'.
 `zero3_init_flag`: Decides whether to enable `deepspeed.zero.Init` for constructing massive models. Only applicable with ZeRO Stage-3.
 `zero3_save_16bit_model`: Decides whether to save 16-bit model weights when using ZeRO Stage-3.
 `mixed_precision`: `no` for FP32 training, `fp16` for FP16 mixed-precision training and `bf16` for BF16 mixed-precision training.
+`deepspeed_moe_layer_cls_names`: Comma-separated list of transformer Mixture-of-Experts (MoE) layer class names (case-sensitive) to wrap, e.g. `MixtralSparseMoeBlock`, `Qwen2MoeSparseMoeBlock`, `JetMoEAttention,JetMoEBlock` ...
+`deepspeed_hostfile`: DeepSpeed hostfile for configuring multi-node compute resources.
+`deepspeed_exclusion_filter`: DeepSpeed exclusion filter string when using multi-node setup.
+`deepspeed_inclusion_filter`: DeepSpeed inclusion filter string when using multi-node setup.
+`deepspeed_multinode_launcher`: DeepSpeed multi-node launcher to use. If unspecified, will default to `pdsh`.
+`deepspeed_config_file`: path to the DeepSpeed config file in `json` format. See the next section for more details on this.
 ```
 To be able to tweak more options, you will need to use a DeepSpeed config file.

@ -721,3 +729,10 @@ Papers:

 Finally, please, remember that 🤗 `Accelerate` only integrates DeepSpeed, therefore if you
 have any problems or questions with regards to DeepSpeed usage, please, file an issue with [DeepSpeed GitHub](https://github.com/microsoft/DeepSpeed/issues).
+
+
+<Tip>
+
+    For those interested in the similarities and differences between FSDP and DeepSpeed, please check out the [concept guide here](../concept_guides/fsdp_and_deepspeed.md)!
+    
+</Tip>
--- a/docs/source/usage_guides/distributed_inference.md
+++ b/docs/source/usage_guides/distributed_inference.md
@ -140,6 +140,8 @@ with distributed_state.split_between_processes(["a dog", "a cat", "a chicken"],
 On the first GPU, the prompts will be `["a dog", "a cat"]`, and on the second GPU it will be `["a chicken", "a chicken"]`.
 Make sure to drop the final sample, as it will be a duplicate of the previous one.

+You can find more complex examples [here](https://github.com/huggingface/accelerate/tree/main/examples/inference/distributed) such as how to use it with LLMs.
+
 ## Memory-efficient pipeline parallelism (experimental)

 This next part will discuss using *pipeline parallelism*. This is an **experimental** API utilizing the [PiPPy library by PyTorch](https://github.com/pytorch/PiPPy/) as a native solution. 
@ -232,4 +234,4 @@ if PartialState().is_last_process:
    
 </Tip>

-And that's it! To explore more, please check out the inference examples in the [Accelerate repo](https://github.com/huggingface/accelerate/tree/main/examples/inference) and our [documentation](../package_reference/inference) as we work to improving this integration. 
+And that's it! To explore more, please check out the inference examples in the [Accelerate repo](https://github.com/huggingface/accelerate/tree/main/examples/inference/pippy) and our [documentation](../package_reference/inference) as we work on improving this integration. 
--- a/docs/source/usage_guides/fsdp.md
+++ b/docs/source/usage_guides/fsdp.md
@ -175,3 +175,10 @@ You can then pass `state` into the `save_pretrained` method.  There are several

 For more control, users can leverage the `FullyShardedDataParallelPlugin`. After creating an instance of this class, users can pass it to the Accelerator class instantiation.
 For more information on these options, please refer to the PyTorch [FullyShardedDataParallel](https://github.com/pytorch/pytorch/blob/0df2e863fbd5993a7b9e652910792bd21a516ff3/torch/distributed/fsdp/fully_sharded_data_parallel.py#L236) code.
+
+
+<Tip>
+
+    For those interested in the similarities and differences between FSDP and DeepSpeed, please check out the [concept guide here](../concept_guides/fsdp_and_deepspeed.md)!
+    
+</Tip>
--- a/docs/source/usage_guides/tracking.md
+++ b/docs/source/usage_guides/tracking.md
@ -198,7 +198,7 @@ achieve the same outcome with:

 ```python
 wandb_tracker = accelerator.get_tracker("wandb", unwrap=True)
-with accelerator.on_main_process:
+if accelerator.is_main_process:
    wandb_tracker.log_artifact(some_artifact_to_log)
 ```

--- a/examples/by_feature/cross_validation.py
+++ b/examples/by_feature/cross_validation.py
@ -248,7 +248,7 @@ def training_function(config, args):
        # Use accelerator.print to print only on the main process.
        test_predictions.append(torch.cat(fold_predictions, dim=0))
        # We now need to release all our memory and get rid of the current model, optimizer, etc
-        accelerator.free_memory()
+        model, optimizer = accelerator.free_memory(model, optimizer)
    # New Code #
    # Finally we check the accuracy of our folded results:
    test_references = torch.cat(test_references, dim=0)
--- a/examples/by_feature/deepspeed_with_config_support.py
+++ b/examples/by_feature/deepspeed_with_config_support.py
@ -34,7 +34,7 @@ import datasets
 import torch
 import transformers
 from datasets import load_dataset
-from huggingface_hub import Repository
+from huggingface_hub import HfApi
 from torch.utils.data import DataLoader
 from tqdm.auto import tqdm
 from transformers import (
@ -47,7 +47,6 @@ from transformers import (
    default_data_collator,
    get_scheduler,
 )
-from transformers.utils import get_full_repo_name
 from transformers.utils.versions import require_version

 from accelerate import Accelerator, DistributedType
@ -303,11 +302,13 @@ def main():
    # Handle the repository creation
    if accelerator.is_main_process:
        if args.push_to_hub:
-            if args.hub_model_id is None:
-                repo_name = get_full_repo_name(Path(args.output_dir).name, token=args.hub_token)
-            else:
-                repo_name = args.hub_model_id
-            repo = Repository(args.output_dir, clone_from=repo_name)
+            api = HfApi(token=args.hub_token)
+
+            # Create repo (repo_name from args or inferred)
+            repo_name = args.hub_model_id
+            if repo_name is None:
+                repo_name = Path(args.output_dir).absolute().name
+            repo_id = api.create_repo(repo_name, exist_ok=True).repo_id

            with open(os.path.join(args.output_dir, ".gitignore"), "w+") as gitignore:
                if "step_*" not in gitignore:
@ -707,7 +708,11 @@ def main():
        if accelerator.is_main_process:
            tokenizer.save_pretrained(args.output_dir)
            if args.push_to_hub:
-                repo.push_to_hub(commit_message="End of training", auto_lfs_prune=True)
+                api.upload_folder(
+                    repo_id=repo_id,
+                    folder_path=args.output_dir,
+                    commit_message="End of training",
+                )

        with open(os.path.join(args.output_dir, "all_results.json"), "w") as f:
            json.dump({"perplexity": perplexity, "eval_loss": eval_loss.item()}, f)
--- a/examples/by_feature/megatron_lm_gpt_pretraining.py
+++ b/examples/by_feature/megatron_lm_gpt_pretraining.py
@ -34,7 +34,7 @@ import datasets
 import torch
 import transformers
 from datasets import load_dataset
-from huggingface_hub import Repository
+from huggingface_hub import HfApi
 from torch.utils.data import DataLoader
 from tqdm.auto import tqdm
 from transformers import (
@ -47,7 +47,7 @@ from transformers import (
    default_data_collator,
    get_scheduler,
 )
-from transformers.utils import check_min_version, get_full_repo_name, send_example_telemetry
+from transformers.utils import check_min_version, send_example_telemetry
 from transformers.utils.versions import require_version

 from accelerate import Accelerator, DistributedType
@ -277,11 +277,13 @@ def main():
    # Handle the repository creation
    if accelerator.is_main_process:
        if args.push_to_hub:
-            if args.hub_model_id is None:
-                repo_name = get_full_repo_name(Path(args.output_dir).name, token=args.hub_token)
-            else:
-                repo_name = args.hub_model_id
-            repo = Repository(args.output_dir, clone_from=repo_name)
+            api = HfApi(token=args.hub_token)
+
+            # Create repo (repo_name from args or inferred)
+            repo_name = args.hub_model_id
+            if repo_name is None:
+                repo_name = Path(args.output_dir).absolute().name
+            repo_id = api.create_repo(repo_name, exist_ok=True).repo_id

            with open(os.path.join(args.output_dir, ".gitignore"), "w+") as gitignore:
                if "step_*" not in gitignore:
@ -661,8 +663,11 @@ def main():
            )
            if accelerator.is_main_process:
                tokenizer.save_pretrained(args.output_dir)
-                repo.push_to_hub(
-                    commit_message=f"Training in progress epoch {epoch}", blocking=False, auto_lfs_prune=True
+                api.upload_folder(
+                    repo_id=repo_id,
+                    folder_path=args.output_dir,
+                    commit_message=f"Training in progress epoch {epoch}",
+                    run_as_future=True,
                )

        if args.checkpointing_steps == "epoch":
@ -690,7 +695,11 @@ def main():
        if accelerator.is_main_process:
            tokenizer.save_pretrained(args.output_dir)
            if args.push_to_hub:
-                repo.push_to_hub(commit_message="End of training", auto_lfs_prune=True)
+                api.upload_folder(
+                    repo_id=repo_id,
+                    folder_path=args.output_dir,
+                    commit_message="End of training",
+                )

        with open(os.path.join(args.output_dir, "all_results.json"), "w") as f:
            json.dump({"perplexity": perplexity}, f)
--- a/examples/by_feature/schedule_free.py
+++ b/examples/by_feature/schedule_free.py
@ -0,0 +1,225 @@
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import argparse
+import os
+
+import evaluate
+import torch
+from datasets import load_dataset
+from torch.utils.data import DataLoader
+from transformers import AutoModelForSequenceClassification, AutoTokenizer, set_seed
+
+from accelerate import Accelerator, DistributedType
+from accelerate.utils import is_schedulefree_available
+
+
+if is_schedulefree_available():
+    import schedulefree
+else:
+    raise ImportError(
+        "This example requires the `schedulefree` library. Please install it with `pip install schedulefree`"
+    )
+
+
+########################################################################
+# This is a fully working simple example to use Accelerate and Facebook's
+# schedule-free optimizer: https://github.com/facebookresearch/schedule_free/
+#
+# This example trains a Bert base model on GLUE MRPC
+# in any of the following settings (with the same script):
+#   - single CPU or single GPU
+#   - multi GPUS (using PyTorch distributed mode)
+#   - (multi) TPUs
+#   - fp16 (mixed-precision) or fp32 (normal precision)
+#
+# To run it in each of these various modes, follow the instructions
+# in the readme for examples:
+# https://github.com/huggingface/accelerate/tree/main/examples
+#
+########################################################################
+
+
+MAX_GPU_BATCH_SIZE = 16
+EVAL_BATCH_SIZE = 32
+
+
+def get_dataloaders(accelerator: Accelerator, batch_size: int = 16):
+    """
+    Creates a set of `DataLoader`s for the `glue` dataset,
+    using "bert-base-cased" as the tokenizer.
+
+    Args:
+        accelerator (`Accelerator`):
+            An `Accelerator` object
+        batch_size (`int`, *optional*):
+            The batch size for the train and validation DataLoaders.
+    """
+    tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
+    datasets = load_dataset("glue", "mrpc")
+
+    def tokenize_function(examples):
+        # max_length=None => use the model max length (it's actually the default)
+        outputs = tokenizer(examples["sentence1"], examples["sentence2"], truncation=True, max_length=None)
+        return outputs
+
+    # Apply the method we just defined to all the examples in all the splits of the dataset
+    # starting with the main process first:
+    with accelerator.main_process_first():
+        tokenized_datasets = datasets.map(
+            tokenize_function,
+            batched=True,
+            remove_columns=["idx", "sentence1", "sentence2"],
+        )
+
+    # We also rename the 'label' column to 'labels' which is the expected name for labels by the models of the
+    # transformers library
+    tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
+
+    def collate_fn(examples):
+        # For Torchxla, it's best to pad everything to the same length or training will be very slow.
+        max_length = 128 if accelerator.distributed_type == DistributedType.XLA else None
+        # When using mixed precision we want round multiples of 8/16
+        if accelerator.mixed_precision == "fp8":
+            pad_to_multiple_of = 16
+        elif accelerator.mixed_precision != "no":
+            pad_to_multiple_of = 8
+        else:
+            pad_to_multiple_of = None
+
+        return tokenizer.pad(
+            examples,
+            padding="longest",
+            max_length=max_length,
+            pad_to_multiple_of=pad_to_multiple_of,
+            return_tensors="pt",
+        )
+
+    # Instantiate dataloaders.
+    train_dataloader = DataLoader(
+        tokenized_datasets["train"], shuffle=True, collate_fn=collate_fn, batch_size=batch_size, drop_last=True
+    )
+    eval_dataloader = DataLoader(
+        tokenized_datasets["validation"],
+        shuffle=False,
+        collate_fn=collate_fn,
+        batch_size=EVAL_BATCH_SIZE,
+        drop_last=(accelerator.mixed_precision == "fp8"),
+    )
+
+    return train_dataloader, eval_dataloader
+
+
+# For testing only
+
+
+if os.environ.get("TESTING_MOCKED_DATALOADERS", None) == "1":
+    from accelerate.test_utils.training import mocked_dataloaders
+
+    get_dataloaders = mocked_dataloaders  # noqa: F811
+
+
+def training_function(config, args):
+    # Initialize accelerator
+    accelerator = Accelerator(cpu=args.cpu, mixed_precision=args.mixed_precision)
+    # Sample hyper-parameters for learning rate, batch size, seed and a few other HPs
+    lr = config["lr"]
+    num_epochs = int(config["num_epochs"])
+    seed = int(config["seed"])
+    batch_size = int(config["batch_size"])
+
+    metric = evaluate.load("glue", "mrpc")
+
+    # If the batch size is too big we use gradient accumulation
+    gradient_accumulation_steps = 1
+    if batch_size > MAX_GPU_BATCH_SIZE and accelerator.distributed_type != DistributedType.XLA:
+        gradient_accumulation_steps = batch_size // MAX_GPU_BATCH_SIZE
+        batch_size = MAX_GPU_BATCH_SIZE
+
+    set_seed(seed)
+    train_dataloader, eval_dataloader = get_dataloaders(accelerator, batch_size)
+    # Instantiate the model (we build the model here so that the seed also controls new weights initialization)
+    model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", return_dict=True)
+
+    # We could avoid this line since the accelerator is set with `device_placement=True` (default value).
+    # Note that if you are placing tensors on devices manually, this line absolutely needs to be before the optimizer
+    # creation otherwise training will not work on TPU (`accelerate` will kindly throw an error to make us aware of that).
+    model = model.to(accelerator.device)
+    # Instantiate optimizer with warmup steps
+    optimizer = schedulefree.AdamWScheduleFree(
+        model.parameters(),
+        lr=lr,
+        warmup_steps=100,
+    )
+
+    # Prepare everything
+    # There is no specific order to remember, we just need to unpack the objects in the same order we gave them to the
+    # prepare method.
+
+    model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
+        model, optimizer, train_dataloader, eval_dataloader
+    )
+
+    # Now we train the model
+    for epoch in range(num_epochs):
+        model.train()
+        optimizer.train()
+        for step, batch in enumerate(train_dataloader):
+            # We could avoid this line since we set the accelerator with `device_placement=True`.
+            batch.to(accelerator.device)
+            outputs = model(**batch)
+            loss = outputs.loss
+            loss = loss / gradient_accumulation_steps
+            accelerator.backward(loss)
+            if step % gradient_accumulation_steps == 0:
+                optimizer.step()
+                optimizer.zero_grad()
+
+        model.eval()
+        optimizer.eval()
+        for step, batch in enumerate(eval_dataloader):
+            # We could avoid this line since we set the accelerator with `device_placement=True`.
+            batch.to(accelerator.device)
+            with torch.no_grad():
+                outputs = model(**batch)
+            predictions = outputs.logits.argmax(dim=-1)
+            predictions, references = accelerator.gather_for_metrics((predictions, batch["labels"]))
+            metric.add_batch(
+                predictions=predictions,
+                references=references,
+            )
+
+        eval_metric = metric.compute()
+        # Use accelerator.print to print only on the main process.
+        accelerator.print(f"epoch {epoch}:", eval_metric)
+
+
+def main():
+    parser = argparse.ArgumentParser(description="Simple example of training script.")
+    parser.add_argument(
+        "--mixed_precision",
+        type=str,
+        default=None,
+        choices=["no", "fp16", "bf16", "fp8"],
+        help="Whether to use mixed precision. Choose "
+        "between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >= 1.10 "
+        "and an Nvidia Ampere GPU.",
+    )
+    parser.add_argument("--cpu", action="store_true", help="If passed, will train on the CPU.")
+    args = parser.parse_args()
+    config = {"lr": 2e-5, "num_epochs": 3, "seed": 42, "batch_size": 16}
+    training_function(config, args)
+
+
+if __name__ == "__main__":
+    main()
--- a/examples/inference/distributed/README.md
+++ b/examples/inference/distributed/README.md
@ -0,0 +1,25 @@
+# Distributed inference examples
+
+This folder contains a variety of tutorials for running distributed inference with the following strategy: 
+
+Loading an entire model onto each GPU and sending chunks of a batch through each GPU’s model copy at a time.
+
+## Installation
+
+```bash
+pip install accelerate torch
+```
+
+## Running code
+
+You can either use `torchrun` or the recommended way of `accelerate launch` (without needing to run `accelerate config`) on each script:
+
+```bash
+accelerate launch --num_processes {NUM_GPUS} phi2.py
+```
+
+Or:
+
+```bash
+torchrun --nproc-per-node {NUM_GPUS} phi2.py
+```
--- a/examples/inference/distributed/phi2.py
+++ b/examples/inference/distributed/phi2.py
@ -0,0 +1,86 @@
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+from accelerate import PartialState
+from accelerate.utils import gather_object
+
+
+# Start up the distributed environment without needing the Accelerator.
+distributed_state = PartialState()
+
+# You can change the model to any LLM such as mistralai/Mistral-7B-v0.1 or meta-llama/Llama-2-7b-chat-hf
+model_name = "microsoft/phi-2"
+model = AutoModelForCausalLM.from_pretrained(
+    model_name, device_map=distributed_state.device, torch_dtype=torch.float16
+)
+
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+# Need to set the padding token to the eos token for generation
+tokenizer.pad_token = tokenizer.eos_token
+
+prompts = [
+    "I would like to",
+    "hello how are you",
+    "what is going on",
+    "roses are red and",
+    "welcome to the hotel",
+]
+
+# You can change the batch size depending on your GPU RAM
+batch_size = 2
+# We set it to 8 since it is better for some hardware. More information here https://github.com/huggingface/tokenizers/issues/991
+pad_to_multiple_of = 8
+
+# Split into batches
+# We will get the following results:
+# [ ["I would like to", "hello how are you"], [ "what is going on", "roses are red and"], [ "welcome to the hotel"] ]
+formatted_prompts = [prompts[i : i + batch_size] for i in range(0, len(prompts), batch_size)]
+
+# Apply padding on the left since we are doing generation
+padding_side_default = tokenizer.padding_side
+tokenizer.padding_side = "left"
+# Tokenize each batch
+tokenized_prompts = [
+    tokenizer(formatted_prompt, padding=True, pad_to_multiple_of=pad_to_multiple_of, return_tensors="pt")
+    for formatted_prompt in formatted_prompts
+]
+# Put back the original padding behavior
+tokenizer.padding_side = padding_side_default
+
+completions_per_process = []
+# We automatically split the batched data we passed to it across all the processes. We also set apply_padding=True
+# so that the GPUs will have the same number of prompts, and you can then gather the results.
+# For example, if we have 2 gpus, the distribution will be:
+# GPU 0: ["I would like to", "hello how are you"], ["what is going on", "roses are red and"]
+# GPU 1: ["welcome to the hotel"], ["welcome to the hotel"] -> this prompt is duplicated to ensure that all gpus have the same number of prompts
+with distributed_state.split_between_processes(tokenized_prompts, apply_padding=True) as batched_prompts:
+    for batch in batched_prompts:
+        # Move the batch to the device
+        batch = batch.to(distributed_state.device)
+        # We generate the text, decode it and add it to the list completions_per_process
+        outputs = model.generate(**batch, max_new_tokens=20)
+        generated_text = tokenizer.batch_decode(outputs, skip_special_tokens=True)
+        completions_per_process.extend(generated_text)
+
+# We are gathering strings, so we need to use gather_object.
+# If you need to gather tensors, you can use gather from accelerate.utils
+completions_gather = gather_object(completions_per_process)
+
+# Drop duplicates produced by apply_padding in split_between_processes
+completions = completions_gather[: len(prompts)]
+
+distributed_state.print(completions)
--- a/examples/inference/distributed/stable_diffusion.py
+++ b/examples/inference/distributed/stable_diffusion.py
@ -0,0 +1,30 @@
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+from diffusers import DiffusionPipeline
+
+from accelerate import PartialState  # Can also be Accelerator or AcceleratorState
+
+
+pipe = DiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16)
+distributed_state = PartialState()
+pipe.to(distributed_state.device)
+
+# Assume two processes
+# On the first GPU, the prompts will be ["a dog", "a cat"],
+# and on the second GPU it will be ["a chicken", "a chicken"].
+# Make sure to drop the final sample, as it will be a duplicate of the previous one.
+with distributed_state.split_between_processes(["a dog", "a cat", "a chicken"], apply_padding=True) as prompt:
+    result = pipe(prompt).images
--- a/examples/inference/pippy/README.md
+++ b/examples/inference/pippy/README.md
--- a/examples/inference/pippy/bert.py
+++ b/examples/inference/pippy/bert.py
--- a/examples/inference/pippy/gpt2.py
+++ b/examples/inference/pippy/gpt2.py
--- a/examples/inference/pippy/llama.py
+++ b/examples/inference/pippy/llama.py
--- a/examples/inference/pippy/requirements.txt
+++ b/examples/inference/pippy/requirements.txt
--- a/examples/inference/pippy/t5.py
+++ b/examples/inference/pippy/t5.py
--- a/examples/requirements.txt
+++ b/examples/requirements.txt
@ -1,3 +1,5 @@
 accelerate # used to be installed in Amazon SageMaker environment
 evaluate
-datasets==2.3.2
+datasets==2.3.2
+schedulefree
+huggingface_hub>=0.20.0
--- a/manim_animations/dataloaders/stage_0.py
+++ b/manim_animations/dataloaders/stage_0.py
@ -0,0 +1,32 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from manim import *
+
+
+class Stage0(Scene):
+    def construct(self):
+        mascot = ImageMobject("mascot_bookie.png")
+        mascot.scale(.35)
+        mascot.move_to([-3.75,-1,0])
+        text = Paragraph(
+            "Distributed Training,\nHugging Face Accelerate,\nand PyTorch DataLoaders\n\nHow do they all interact?", 
+            font_size=36,
+            line_spacing=1,
+            alignment="center",
+            weight=BOLD,
+        )
+        text.move_to([1.75,.5,0])
+        self.add(mascot)
+        self.add(text)
--- a/manim_animations/dataloaders/stage_1.py
+++ b/manim_animations/dataloaders/stage_1.py
@ -0,0 +1,31 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from manim import *
+
+class Stage01(Scene):
+    def construct(self):
+        mascot = ImageMobject("mascot_bookie.png")
+        mascot.scale(.35)
+        mascot.move_to([-3.75,-1,0])
+        text = Paragraph(
+            "Distributed Training,\nHugging Face Accelerate,\nand PyTorch DataLoaders\n\nHow do they all interact?", 
+            font_size=36,
+            line_spacing=1,
+            alignment="center",
+            weight=BOLD,
+        )
+        text.move_to([1.75,.5,0])
+        self.add(mascot)
+        self.add(text)
--- a/manim_animations/dataloaders/stage_2.py
+++ b/manim_animations/dataloaders/stage_2.py
@ -0,0 +1,176 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from manim import *
+
+
+class Stage2(Scene):
+    def construct(self):
+        # The dataset items
+        fill = Rectangle(height=0.46,width=0.46).set_stroke(width=0)
+        columns = [
+            VGroup(*[Rectangle(height=0.25,width=0.25,color="green") for i in range(8)]).arrange(RIGHT,buff=0)
+            for j in range(4)
+        ]
+        dataset_recs = VGroup(*columns).arrange(UP, buff=0)
+        dataset_text = Text("Dataset", font_size=24)
+        dataset = Group(dataset_recs,dataset_text).arrange(DOWN, buff=0.5, aligned_edge=DOWN)
+        dataset.move_to([-2,0,0])
+        self.add(dataset)
+        
+        code = Code(
+            code="dataloader = DataLoader(...)\nfor batch in dataloader():\n\t...",
+            tab_width=4,
+            background="window",
+            language="Python",
+            font="Monospace",
+            font_size=14,
+            corner_radius=.2,
+            insert_line_no=False,
+            line_spacing=.75,
+            style=Code.styles_list[1],
+        )
+        code.move_to([-3.5, 2.5, 0])
+        self.add(code)
+
+        # The dataloader itself
+        dataloader = Group(
+            Rectangle(color="red", height=2, width=2),
+            Text("DataLoader", font_size=24)
+        ).arrange(DOWN, buff=.5, aligned_edge=DOWN)
+
+        sampler = Group(
+            Rectangle(color="blue", height=1, width=1),
+            Text("Sampler", font_size=12)
+        ).arrange(DOWN, buff=.25, aligned_edge=DOWN)
+        dataloader.move_to([1, 0, 0])
+        sampler.move_to([.75,.25,0])
+        self.add(dataloader)
+        self.add(sampler)
+
+        gpu_1 = Group(
+            Rectangle(color="white", height=1, width=1),
+            Text("GPU 1", font_size=12)
+        ).arrange(DOWN, buff=.25, aligned_edge=DOWN).move_to([4, 2, 0])
+        gpu_2 = Group(
+            Rectangle(color="white", height=1, width=1),
+            Text("GPU 2", font_size=12)
+        ).arrange(DOWN, buff=.25, aligned_edge=DOWN).move_to([4, .5, 0])
+        gpu_3 = Group(
+            Rectangle(color="white", height=1, width=1),
+            Text("GPU 3", font_size=12)
+        ).arrange(DOWN, buff=.25, aligned_edge=DOWN).move_to([4, -1, 0])
+        gpu_4 = Group(
+            Rectangle(color="white", height=1, width=1),
+            Text("GPU 4", font_size=12)
+        ).arrange(DOWN, buff=.25, aligned_edge=DOWN).move_to([4, -2.5, 0])
+        gpus = [gpu_1[0], gpu_2[0], gpu_3[0], gpu_4[0]]
+        self.add(gpu_1, gpu_2, gpu_3, gpu_4)
+
+        # Animate their existence
+        self.play(
+            Create(gpu_1[0], run_time=0.5),
+            Create(gpu_2[0], run_time=0.5),
+            Create(gpu_3[0], run_time=0.5),
+            Create(gpu_4[0], run_time=0.5),
+            Create(dataset_recs, run_time=1),
+            Create(sampler[0], run_time=1),
+            Create(dataloader[0], run_time=1)
+        )
+
+        step_1 = MarkupText(
+            f"Without any special care, \nthe same data is sent though each sampler, \nand the same samples are spit out on each GPU",
+            font_size=18
+        )
+        step_1.move_to([0, -2.5, 0])
+        self.play(
+            Write(step_1, run_time=4),
+        )
+
+        first_animations = []
+        second_animations = []
+
+
+        colors = ["BLUE_E", "DARK_BROWN", "GOLD_E", "GRAY_A"]
+        current_color = colors[0]
+        buff = 0
+        lr_buff = .25
+        old_target = None
+        new_datasets = []
+        for i,data in enumerate(dataset_recs[-1]):
+            if i % 2 == 0:
+                # current_color = colors[i//2]
+                current_color = "BLUE_E"
+            dataset_target = Rectangle(height=0.46/2,width=0.46/2).set_stroke(width=0.).set_fill(current_color, opacity=0.7)
+            dataset_target.move_to(data)
+            dataset_target.generate_target()
+            aligned_edge = ORIGIN
+            if i % 2 == 0:
+                old_target = dataset_target.target
+                buff -= .25
+                aligned_edge = LEFT
+                dataset_target.target.next_to(
+                    sampler, buff=buff, direction=UP,
+                    aligned_edge=LEFT
+                )
+            else:
+                dataset_target.target.next_to(
+                    old_target, direction=RIGHT, buff=0.01,
+                )
+            new_datasets.append(dataset_target)
+            first_animations.append(data.animate(run_time=0.5).set_stroke(current_color))
+            second_animations.append(MoveToTarget(dataset_target, run_time=1.5))
+        self.play(*first_animations)
+        self.play(*second_animations)
+        self.wait()
+
+        move_animation = []
+
+        for j,gpu in enumerate(gpus):
+            buff = 0
+            for i,data in enumerate(new_datasets):
+                if i % 2 == 0:
+                    current_color = colors[i//2]
+                if j != 3:
+                    data = data.copy()
+                data.generate_target()
+                aligned_edge = ORIGIN
+                if i % 2 == 0:
+                    old_target = data.target
+                    buff -= .25
+                    aligned_edge = LEFT
+                    data.target.next_to(
+                        gpu, buff=buff, direction=UP,
+                        aligned_edge=LEFT
+                    )
+                else:
+                    data.target.next_to(
+                        old_target, direction=RIGHT, buff=0.01,
+                    )
+                move_animation.append(MoveToTarget(data, run_time=1.5))
+
+
+        self.play(*move_animation)
+
+        self.remove(step_1)
+        step_2 = MarkupText(
+            f"This behavior is undesireable, because we want\neach GPU to see different data for efficient training.",
+            font_size=18
+        )
+        step_2.move_to([0, -2.5, 0])
+
+        self.play(
+            Write(step_2, run_time=2.5),
+        )
+        self.wait()
--- a/manim_animations/dataloaders/stage_3.py
+++ b/manim_animations/dataloaders/stage_3.py
@ -0,0 +1,34 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from manim import *
+
+class Stage3(Scene):
+    def construct(self):
+        step_1 = MarkupText(
+            f"To combat this, Accelerate employs one of two different\nSampler wrapper methods depending on the scenario:",
+            font_size=24
+        )
+        step_1.move_to([0, 1.5, 0])
+        self.add(step_1)
+        step_2 = MarkupText(
+            f"1. Sharding the dataset before drawing:\n\t● <span fgcolor='{RED}'>IterableDatasetShard</span>\n\t● <span fgcolor='{RED}'>BatchSamplerShard</span>",
+            font_size=24,
+        ).next_to(step_1, direction=DOWN, aligned_edge=LEFT)
+        self.add(step_2)
+        step_3 = MarkupText(
+            f"\n\n2. Splitting the batch after drawing:\n\t● <span fgcolor='{BLUE}'>DataLoaderDispatcher</span>",
+            font_size=24,
+        ).next_to(step_2, direction=DOWN, aligned_edge=LEFT)
+        self.add(step_3)
--- a/manim_animations/dataloaders/stage_4.py
+++ b/manim_animations/dataloaders/stage_4.py
@ -0,0 +1,52 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from manim import *
+
+class Stage4(Scene):
+    def construct(self):
+
+        step_1 = MarkupText(
+            f"To understand the next part fully, let's define two terms,\n<span fgcolor='{RED}'>`batch_size`</span> and <span fgcolor='{BLUE}'>`global_batch_size`</span>:",
+            font_size=18
+        )
+        step_1.move_to([0, 1.5, 0])
+        # <span fgcolor='{YELLOW}'>●</span>
+        step_2 = MarkupText(
+            f"\n\n● <span fgcolor='{RED}'>`batch_size`</span>: \n\tThis will be defined as the batch size seen on a given\n\t*individual* GPU",
+            font_size=18,
+        ).next_to(step_1, direction=DOWN, aligned_edge=LEFT)
+
+        step_3 = MarkupText(
+            f"\n\n● <span fgcolor='{BLUE}'>`global_batch_size`</span>:\n\tThis will be defined as the *total* number of\n\tdifferent items seen in the dataset, across all GPUs",
+            font_size=18,
+        ).next_to(step_2, direction=DOWN, aligned_edge=LEFT)
+
+        step_4 = MarkupText(
+            f"\n\nSo if we have a dataset of 64 items, 8 GPUs, \nand a `batch_size` of 8, each *step* will go through\nthe entire dataset one time as 8*8=64",
+            font_size=18,
+        ).next_to(step_3, direction=DOWN, aligned_edge=LEFT)
+        self.play(
+            Write(step_1, run_time=4),
+        )
+        self.play(
+            Write(step_2, run_time=4)
+        )
+        self.play(
+            Write(step_3, run_time=4)
+        )
+        self.play(
+            Write(step_4, run_time=6)
+        )
+        self.wait()
--- a/manim_animations/dataloaders/stage_5.py
+++ b/manim_animations/dataloaders/stage_5.py
@ -0,0 +1,203 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from manim import *
+
+class Stage5(Scene):
+    def construct(self):
+        # The dataset items
+        colors = ["BLUE_E", "DARK_BROWN", "GOLD_E", "GRAY_A"]
+        fill = Rectangle(height=0.46,width=0.46).set_stroke(width=0)
+        columns = [
+            VGroup(*[Rectangle(height=0.25,width=0.25,color=colors[j]) for i in range(8)]).arrange(RIGHT,buff=0)
+            for j in range(4)
+        ]
+        dataset_recs = VGroup(*columns).arrange(UP, buff=0)
+        dataset_text = Text("Dataset", font_size=24)
+        dataset = Group(dataset_recs,dataset_text).arrange(DOWN, buff=0.5, aligned_edge=DOWN)
+        dataset.move_to([-2,0,0])
+        self.add(dataset)
+        code = Code(
+            code="# We enable this by default\naccelerator = Accelerator()\ndataloader = DataLoader(...)\ndataloader = accelerator.prepare(dataloader)\nfor batch in dataloader:\n\t...",
+            tab_width=4,
+            background="window",
+            language="Python",
+            font="Monospace",
+            font_size=14,
+            corner_radius=.2,
+            insert_line_no=False,
+            line_spacing=.75,
+            style=Code.styles_list[1],
+        )
+        code.move_to([-3.5, 2.5, 0])
+        self.add(code)
+
+        # The dataloader itself
+
+        sampler_1 = Group(
+            Rectangle(color="blue", height=1, width=1),
+            Text("Sampler GPU 1", font_size=12)
+        ).arrange(DOWN, buff=.25, aligned_edge=DOWN)
+        sampler_2 = Group(
+            Rectangle(color="blue", height=1, width=1),
+            Text("Sampler GPU 2", font_size=12)
+        ).arrange(DOWN, buff=.25, aligned_edge=DOWN)
+        sampler_3 = Group(
+            Rectangle(color="blue", height=1, width=1),
+            Text("Sampler GPU 3", font_size=12)
+        ).arrange(DOWN, buff=.25, aligned_edge=DOWN)
+        sampler_4 = Group(
+            Rectangle(color="blue", height=1, width=1),
+            Text("Sampler GPU 4", font_size=12)
+        ).arrange(DOWN, buff=.25, aligned_edge=DOWN)
+        sampler_1.move_to([2,2,0])
+        sampler_2.move_to([2,.5,0])
+        sampler_3.move_to([2,-1.,0])
+        sampler_4.move_to([2,-2.5,0])
+        self.add(sampler_1, sampler_2, sampler_3, sampler_4)
+        samplers = [sampler_1[0], sampler_2[0], sampler_3[0], sampler_4[0]]
+
+        gpu_1 = Group(
+            Rectangle(color="white", height=1, width=1),
+            Text("Output GPU 1", font_size=12)
+        ).arrange(DOWN, buff=.25, aligned_edge=DOWN).move_to([4.5, 2, 0])
+        gpu_2 = Group(
+            Rectangle(color="white", height=1, width=1),
+            Text("Output GPU 2", font_size=12)
+        ).arrange(DOWN, buff=.25, aligned_edge=DOWN).move_to([4.5, .5, 0])
+        gpu_3 = Group(
+            Rectangle(color="white", height=1, width=1),
+            Text("Output GPU 3", font_size=12)
+        ).arrange(DOWN, buff=.25, aligned_edge=DOWN).move_to([4.5, -1, 0])
+        gpu_4 = Group(
+            Rectangle(color="white", height=1, width=1),
+            Text("Output GPU 4", font_size=12)
+        ).arrange(DOWN, buff=.25, aligned_edge=DOWN).move_to([4.5, -2.5, 0])
+        gpus = [gpu_1[0], gpu_2[0], gpu_3[0], gpu_4[0]]
+        self.add(gpu_1, gpu_2, gpu_3, gpu_4)
+
+        # Animate their existence
+        self.play(
+            Create(gpu_1[0], run_time=1),
+            Create(gpu_2[0], run_time=1),
+            Create(gpu_3[0], run_time=1),
+            Create(gpu_4[0], run_time=1),
+            Create(dataset_recs, run_time=1),
+            Create(sampler_1[0], run_time=1),
+            Create(sampler_2[0], run_time=1),
+            Create(sampler_3[0], run_time=1),
+            Create(sampler_4[0], run_time=1),
+        )
+
+        first_animations = []
+        second_animations = []
+
+
+        colors = ["BLUE_E", "DARK_BROWN", "GOLD_E", "GRAY_A"]
+        current_color = colors[0]
+        buff = 0
+        lr_buff = .25
+        old_target = None
+        new_datasets = []
+        for i,row_data in enumerate(dataset_recs):
+            new_row = []
+            current_color = colors[i]
+            if i == 0:
+                idx = -3
+            elif i == 1:
+                idx = -2
+            elif i == 2:
+                idx = -1
+            elif i == 3:
+                idx = 0
+            for j,indiv_data in enumerate(row_data):
+                dataset_target = Rectangle(height=0.46/2,width=0.46/2).set_stroke(width=0.).set_fill(current_color, opacity=0.7)
+                dataset_target.move_to(indiv_data)
+                dataset_target.generate_target()
+                aligned_edge = ORIGIN
+                if j % 8 == 0:
+                    aligned_edge = LEFT
+                    dataset_target.target.next_to(
+                        samplers[abs(idx)].get_corner(UP+LEFT), buff=.02, direction=RIGHT+DOWN,
+                    )
+                    dataset_target.target.set_x(dataset_target.target.get_x())
+                elif j % 4 == 0:
+                    old_target = dataset_target.target
+                    dataset_target.target.next_to(
+                        samplers[abs(idx)].get_corner(UP+LEFT), buff=.02, direction=RIGHT+DOWN,
+                    )
+                    dataset_target.target.set_x(dataset_target.target.get_x())
+                    dataset_target.target.set_y(dataset_target.target.get_y()-.25)
+                else:
+                    dataset_target.target.next_to(
+                        old_target, direction=RIGHT, buff=0.02,
+                    )
+                old_target = dataset_target.target
+                new_row.append(dataset_target)
+                first_animations.append(indiv_data.animate(run_time=0.5).set_stroke(current_color))
+                second_animations.append(MoveToTarget(dataset_target, run_time=1.5))
+            
+            new_datasets.append(new_row)
+        step_1 = MarkupText(
+            f"Since we splice the dataset between each GPU,\nthe models weights can be averaged during `backward()`\nActing as though we did one giant epoch\nvery quickly.",
+            font_size=18
+        )
+        step_1.move_to([-2.5, -2, 0])
+
+        self.play(
+            Write(step_1, run_time=3),
+        )
+        self.play(
+            *first_animations,
+        )
+        self.play(*second_animations)
+        self.wait(duration=.5)
+
+        move_animation = []
+        import random
+        for i,row in enumerate(new_datasets):
+            # row = [row[k] for k in random.sample(range(8), 8)]
+            current_color = colors[i]
+            if i == 0:
+                idx = -3
+            elif i == 1:
+                idx = -2
+            elif i == 2:
+                idx = -1
+            elif i == 3:
+                idx = 0
+            for j,indiv_data in enumerate(row):
+                indiv_data.generate_target()
+                aligned_edge = ORIGIN
+                if j % 8 == 0:
+                    aligned_edge = LEFT
+                    indiv_data.target.next_to(
+                        gpus[abs(idx)].get_corner(UP+LEFT), buff=.02, direction=RIGHT+DOWN,
+                    )
+                    indiv_data.target.set_x(indiv_data.target.get_x())
+                elif j % 4 == 0:
+                    indiv_data.target.next_to(
+                        gpus[abs(idx)].get_corner(UP+LEFT), buff=.02, direction=RIGHT+DOWN,
+                    )
+                    indiv_data.target.set_x(indiv_data.target.get_x())
+                    indiv_data.target.set_y(indiv_data.target.get_y()-.25)
+                else:
+                    indiv_data.target.next_to(
+                        old_target, direction=RIGHT, buff=0.02,
+                    )
+                old_target = indiv_data.target
+                move_animation.append(MoveToTarget(indiv_data, run_time=1.5))
+
+        self.play(*move_animation)
+        self.wait()
--- a/manim_animations/dataloaders/stage_6.py
+++ b/manim_animations/dataloaders/stage_6.py
@ -0,0 +1,193 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from manim import *
+
+
+class Stage6(Scene):
+    def construct(self):
+        # The dataset items
+        colors = ["BLUE_E", "DARK_BROWN", "GOLD_E", "GRAY_A"]
+        fill = Rectangle(height=0.46,width=0.46).set_stroke(width=0)
+        columns = [
+            VGroup(*[Rectangle(height=0.25,width=0.25,color=colors[j]) for i in range(8)]).arrange(RIGHT,buff=0)
+            for j in range(4)
+        ]
+        dataset_recs = VGroup(*columns).arrange(UP, buff=0)
+        dataset_text = Text("Dataset", font_size=24)
+        dataset = Group(dataset_recs,dataset_text).arrange(DOWN, buff=0.5, aligned_edge=DOWN)
+        dataset.move_to([-2,0,0])
+        self.add(dataset)
+        code = Code(
+            code="# We enable this by default\naccelerator = Accelerator()\ndataloader = DataLoader(..., shuffle=True)\ndataloader = accelerator.prepare(dataloader)\nfor batch in dataloader:\n\t...",
+            tab_width=4,
+            background="window",
+            language="Python",
+            font="Monospace",
+            font_size=14,
+            corner_radius=.2,
+            insert_line_no=False,
+            line_spacing=.75,
+            style=Code.styles_list[1],
+        )
+        code.move_to([-3.5, 2.5, 0])
+        self.add(code)
+
+        # The dataloader itself
+
+        sampler_1 = Group(
+            Rectangle(color="blue", height=1, width=1),
+            Text("Sampler GPU 1", font_size=12)
+        ).arrange(DOWN, buff=.25, aligned_edge=DOWN)
+        sampler_2 = Group(
+            Rectangle(color="blue", height=1, width=1),
+            Text("Sampler GPU 2", font_size=12)
+        ).arrange(DOWN, buff=.25, aligned_edge=DOWN)
+        sampler_3 = Group(
+            Rectangle(color="blue", height=1, width=1),
+            Text("Sampler GPU 3", font_size=12)
+        ).arrange(DOWN, buff=.25, aligned_edge=DOWN)
+        sampler_4 = Group(
+            Rectangle(color="blue", height=1, width=1),
+            Text("Sampler GPU 4", font_size=12)
+        ).arrange(DOWN, buff=.25, aligned_edge=DOWN)
+        sampler_1.move_to([2,2,0])
+        sampler_2.move_to([2,.5,0])
+        sampler_3.move_to([2,-1.,0])
+        sampler_4.move_to([2,-2.5,0])
+        self.add(sampler_1, sampler_2, sampler_3, sampler_4)
+        samplers = [sampler_1[0], sampler_2[0], sampler_3[0], sampler_4[0]]
+
+        gpu_1 = Group(
+            Rectangle(color="white", height=1, width=1),
+            Text("Output GPU 1", font_size=12)
+        ).arrange(DOWN, buff=.25, aligned_edge=DOWN).move_to([4.5, 2, 0])
+        gpu_2 = Group(
+            Rectangle(color="white", height=1, width=1),
+            Text("Output GPU 2", font_size=12)
+        ).arrange(DOWN, buff=.25, aligned_edge=DOWN).move_to([4.5, .5, 0])
+        gpu_3 = Group(
+            Rectangle(color="white", height=1, width=1),
+            Text("Output GPU 3", font_size=12)
+        ).arrange(DOWN, buff=.25, aligned_edge=DOWN).move_to([4.5, -1, 0])
+        gpu_4 = Group(
+            Rectangle(color="white", height=1, width=1),
+            Text("Output GPU 4", font_size=12)
+        ).arrange(DOWN, buff=.25, aligned_edge=DOWN).move_to([4.5, -2.5, 0])
+        gpus = [gpu_1[0], gpu_2[0], gpu_3[0], gpu_4[0]]
+        self.add(gpu_1, gpu_2, gpu_3, gpu_4)
+
+
+        first_animations = []
+        second_animations = []
+
+
+        colors = ["BLUE_E", "DARK_BROWN", "GOLD_E", "GRAY_A"]
+        current_color = colors[0]
+        buff = 0
+        lr_buff = .25
+        old_target = None
+        new_datasets = []
+        for i,row_data in enumerate(dataset_recs):
+            new_row = []
+            current_color = colors[i]
+            if i == 0:
+                idx = -3
+            elif i == 1:
+                idx = -2
+            elif i == 2:
+                idx = -1
+            elif i == 3:
+                idx = 0
+            for j,indiv_data in enumerate(row_data):
+                dataset_target = Rectangle(height=0.46/2,width=0.46/2).set_stroke(width=0.).set_fill(current_color, opacity=0.7)
+                dataset_target.move_to(indiv_data)
+                dataset_target.generate_target()
+                aligned_edge = ORIGIN
+                if j % 8 == 0:
+                    aligned_edge = LEFT
+                    old_target = dataset_target.target
+                    dataset_target.target.next_to(
+                        samplers[abs(idx)].get_corner(UP+LEFT), buff=.02, direction=RIGHT+DOWN,
+                    )
+                    dataset_target.target.set_x(dataset_target.target.get_x())
+                elif j % 4 == 0:
+                    old_target = dataset_target.target
+                    dataset_target.target.next_to(
+                        samplers[abs(idx)].get_corner(UP+LEFT), buff=.02, direction=RIGHT+DOWN,
+                    )
+                    dataset_target.target.set_x(dataset_target.target.get_x())
+                    dataset_target.target.set_y(dataset_target.target.get_y()-.25)
+                else:
+                    dataset_target.target.next_to(
+                        old_target, direction=RIGHT, buff=0.02,
+                    )
+                old_target = dataset_target.target
+                new_row.append(dataset_target)
+                first_animations.append(indiv_data.animate(run_time=0.5).set_stroke(current_color))
+                second_animations.append(MoveToTarget(dataset_target, run_time=1.5))
+            
+            new_datasets.append(new_row)
+        step_1 = MarkupText(
+            f"During shuffling, each mini-batch's\noutput order will be modified",
+            font_size=18
+        )
+        step_1.move_to([-1.5, -2, 0])
+
+        self.play(
+            Write(step_1, run_time=3),
+        )
+        self.play(
+            *first_animations,
+        )
+        self.play(*second_animations)
+        self.wait(duration=.5)
+
+        move_animation = []
+        import random
+        for i,row in enumerate(new_datasets):
+            row = [row[k] for k in random.sample(range(8), 8)]
+            current_color = colors[i]
+            if i == 0:
+                idx = -3
+            elif i == 1:
+                idx = -2
+            elif i == 2:
+                idx = -1
+            elif i == 3:
+                idx = 0
+            for j,indiv_data in enumerate(row):
+                indiv_data.generate_target()
+                aligned_edge = ORIGIN
+                if j % 8 == 0:
+                    aligned_edge = LEFT
+                    indiv_data.target.next_to(
+                        gpus[abs(idx)].get_corner(UP+LEFT), buff=.02, direction=RIGHT+DOWN,
+                    )
+                    indiv_data.target.set_x(indiv_data.target.get_x())
+                elif j % 4 == 0:
+                    indiv_data.target.next_to(
+                        gpus[abs(idx)].get_corner(UP+LEFT), buff=.02, direction=RIGHT+DOWN,
+                    )
+                    indiv_data.target.set_x(indiv_data.target.get_x())
+                    indiv_data.target.set_y(indiv_data.target.get_y()-.25)
+                else:
+                    indiv_data.target.next_to(
+                        old_target, direction=RIGHT, buff=0.02,
+                    )
+                old_target = indiv_data.target
+                move_animation.append(MoveToTarget(indiv_data, run_time=1.5))
+
+        self.play(*move_animation)
+        self.wait()
--- a/manim_animations/dataloaders/stage_7.py
+++ b/manim_animations/dataloaders/stage_7.py
@ -0,0 +1,182 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from manim import *
+
+class Stage7(Scene):
+    def construct(self):
+        # The dataset items        
+        code = Code(
+            code="accelerator = Accelerator(dispatch_batches=True)\ndataloader = DataLoader(...)\ndataloader = accelerator.prepare(dataloader)\nfor batch in dataloader:\n\t...",
+            tab_width=4,
+            background="window",
+            language="Python",
+            font="Monospace",
+            font_size=14,
+            corner_radius=.2,
+            insert_line_no=False,
+            line_spacing=.75,
+            style=Code.styles_list[1],
+        )
+        code.move_to([-3.5, 2.5, 0])
+        self.add(code)
+        colors = ["BLUE_E", "DARK_BROWN", "GOLD_E", "GRAY_A"]
+        fill = Rectangle(height=0.46,width=0.46).set_stroke(width=0)
+        columns = [
+            VGroup(*[Rectangle(height=0.25,width=0.25,color=colors[j]) for i in range(8)]).arrange(RIGHT,buff=0)
+            for j in range(4)
+        ]
+        dataset_recs = VGroup(*columns).arrange(UP, buff=0)
+        dataset_text = Text("Dataset", font_size=24)
+        dataset = Group(dataset_recs,dataset_text).arrange(DOWN, buff=0.5, aligned_edge=DOWN)
+        dataset.move_to([-2,0,0])
+        self.add(dataset)
+
+        # The dataloader itself
+
+        sampler_1 = Group(
+            Rectangle(color="blue", height=1.02, width=1.02),
+            Text("Sampler GPU 1", font_size=12)
+        ).arrange(DOWN, buff=.25, aligned_edge=DOWN)
+        sampler_2 = Group(
+            Rectangle(color="blue", height=1.02, width=1.02),
+            Text("Sampler GPU 2", font_size=12)
+        ).arrange(DOWN, buff=.25, aligned_edge=DOWN)
+        sampler_3 = Group(
+            Rectangle(color="blue", height=1.02, width=1.02),
+            Text("Sampler GPU 3", font_size=12)
+        ).arrange(DOWN, buff=.25, aligned_edge=DOWN)
+        sampler_4 = Group(
+            Rectangle(color="blue", height=1.02, width=1.02),
+            Text("Sampler GPU 4", font_size=12)
+        ).arrange(DOWN, buff=.25, aligned_edge=DOWN)
+        sampler_1.move_to([2,2,0])
+        sampler_2.move_to([2,.5,0])
+        sampler_3.move_to([2,-1.,0])
+        sampler_4.move_to([2,-2.5,0])
+        self.add(sampler_1, sampler_2, sampler_3, sampler_4)
+        samplers = [sampler_1[0], sampler_2[0], sampler_3[0], sampler_4[0]]
+
+        gpu_1 = Group(
+            Rectangle(color="white", height=1.02, width=.98),
+            Text("Output GPU 1", font_size=12)
+        ).arrange(DOWN, buff=.25, aligned_edge=DOWN).move_to([4.5, 2, 0])
+        gpu_2 = Group(
+            Rectangle(color="white", height=1.02, width=.98),
+            Text("Output GPU 2", font_size=12)
+        ).arrange(DOWN, buff=.25, aligned_edge=DOWN).move_to([4.5, .5, 0])
+        gpu_3 = Group(
+            Rectangle(color="white", height=1.02, width=.98),
+            Text("Output GPU 3", font_size=12)
+        ).arrange(DOWN, buff=.25, aligned_edge=DOWN).move_to([4.5, -1, 0])
+        gpu_4 = Group(
+            Rectangle(color="white", height=1.02, width=.98),
+            Text("Output GPU 4", font_size=12)
+        ).arrange(DOWN, buff=.25, aligned_edge=DOWN).move_to([4.5, -2.5, 0])
+        gpus = [gpu_1[0], gpu_2[0], gpu_3[0], gpu_4[0]]
+        self.add(gpu_1, gpu_2, gpu_3, gpu_4)
+
+        step_1 = MarkupText(
+            f"When using a `DataLoaderDispatcher`, all\nof the samples are collected from GPU 0's dataset,\nthen divided and sent to each GPU.\nAs a result, this will be slower.",
+            font_size=18
+        )
+        step_1.move_to([-2.5, -2, 0])
+
+        self.play(
+            Write(step_1, run_time=3.5),
+        )
+
+        first_animations = []
+        second_animations = []
+
+
+        colors = ["BLUE_E", "DARK_BROWN", "GOLD_E", "GRAY_A"]
+        current_color = colors[0]
+        ud_buff = 0.01
+        lr_buff = 0.01
+        old_target = None
+        new_datasets = []
+        for i,row_data in enumerate(dataset_recs):
+            new_row = []
+            current_color = colors[i]
+                
+            for j,indiv_data in enumerate(row_data):
+                dataset_target = Rectangle(height=0.46/4,width=0.46/2).set_stroke(width=0.).set_fill(current_color, opacity=0.7)
+                dataset_target.move_to(indiv_data)
+                dataset_target.generate_target()
+                aligned_edge = ORIGIN
+                if j % 8 == 0:
+                    aligned_edge = LEFT
+                    dataset_target.target.next_to(
+                        samplers[0].get_corner(DOWN+LEFT), buff=0.0125, direction=RIGHT+UP,
+                    )
+                    dataset_target.target.set_x(dataset_target.target.get_x())
+                    dataset_target.target.set_y(dataset_target.target.get_y() + (.25 * i))
+                elif j % 4 == 0:
+                    old_target = dataset_target.target
+                    dataset_target.target.next_to(
+                        samplers[0].get_corner(DOWN+LEFT), buff=0.0125, direction=RIGHT+UP,
+                    )
+                    dataset_target.target.set_x(dataset_target.target.get_x())
+                    dataset_target.target.set_y(dataset_target.target.get_y()+.125 + (.25 * i))
+                else:
+                    dataset_target.target.next_to(
+                        old_target, direction=RIGHT, buff=0.0125,
+                    )
+                old_target = dataset_target.target
+                new_row.append(dataset_target)
+                first_animations.append(indiv_data.animate(run_time=0.5).set_stroke(current_color))
+                second_animations.append(MoveToTarget(dataset_target, run_time=1.5))
+            
+            new_datasets.append(new_row)
+        self.play(
+            *first_animations,
+        )
+        self.play(*second_animations)
+        move_animation = []
+        for i,row in enumerate(new_datasets):
+            current_color = colors[i]
+            if i == 0:
+                idx = -3
+            elif i == 1:
+                idx = -2
+            elif i == 2:
+                idx = -1
+            elif i == 3:
+                idx = 0
+            for j,indiv_data in enumerate(row):
+                indiv_data.generate_target()
+                indiv_data.animate.stretch_to_fit_height(0.46/2)
+                aligned_edge = ORIGIN
+                if j % 8 == 0:
+                    aligned_edge = LEFT
+                    indiv_data.target.next_to(
+                        gpus[abs(idx)].get_corner(UP+LEFT), buff=.01, direction=RIGHT+DOWN,
+                    )
+                    indiv_data.target.set_x(indiv_data.target.get_x())
+                    indiv_data.target.set_y(indiv_data.target.get_y()-.25)
+                elif j % 4 == 0:
+                    indiv_data.target.next_to(
+                        gpus[abs(idx)].get_corner(UP+LEFT), buff=.01, direction=RIGHT+DOWN,
+                    )
+                    indiv_data.target.set_x(indiv_data.target.get_x())
+                else:
+                    indiv_data.target.next_to(
+                        old_target, direction=RIGHT, buff=0.01,
+                    )
+                old_target = indiv_data.target
+                move_animation.append(MoveToTarget(indiv_data, run_time=1.5))
+
+        self.play(*move_animation)
+        self.wait()
--- a/setup.py
+++ b/setup.py
@ -25,17 +25,18 @@ extras["docs"] = []
 extras["test_prod"] = ["pytest>=7.2.0,<=8.0.0", "pytest-xdist", "pytest-subtests", "parameterized"]
 extras["test_dev"] = [
    "datasets",
+    "diffusers",
    "evaluate",
    "torchpippy>=0.2.0",
    "transformers",
    "scipy",
    "scikit-learn",
-    "deepspeed",
    "tqdm",
    "bitsandbytes",
    "timm",
 ]
 extras["testing"] = extras["test_prod"] + extras["test_dev"]
+extras["deepspeed"] = ["deepspeed<=0.14.0"]
 extras["rich"] = ["rich"]

 extras["test_trackers"] = ["wandb", "comet-ml", "tensorboard", "dvclive"]
@ -47,7 +48,7 @@ extras["sagemaker"] = [

 setup(
    name="accelerate",
-    version="0.29.0.dev",
+    version="0.31.0.dev0",
    description="Accelerate",
    long_description=open("README.md", encoding="utf-8").read(),
    long_description_content_type="text/markdown",
--- a/src/accelerate/__init__.py
+++ b/src/accelerate/__init__.py
@ -11,7 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-__version__ = "0.29.0.dev0"
+__version__ = "0.30.1.dev0"

 from .accelerator import Accelerator
 from .big_modeling import (
--- a/src/accelerate/accelerator.py
+++ b/src/accelerate/accelerator.py
@ -79,6 +79,7 @@ from .utils import (
    is_deepspeed_available,
    is_fp8_available,
    is_ipex_available,
+    is_lomo_available,
    is_megatron_lm_available,
    is_mlu_available,
    is_msamp_available,
@ -215,7 +216,7 @@ class Accelerator:
        project_dir (`str`, `os.PathLike`, *optional*):
            A path to a directory for storing data such as logs of locally-compatible loggers and potentially saved
            checkpoints.
-        step_scheduler_with_optimizer (`bool`, *optional`, defaults to `True`):
+        step_scheduler_with_optimizer (`bool`, *optional*, defaults to `True`):
            Set `True` if the learning rate scheduler is stepped at the same time as the optimizer, `False` if only
            done under certain circumstances (at the end of each epoch, for instance).
        kwargs_handlers (list of [`~utils.KwargsHandler`], *optional*)
@ -340,6 +341,8 @@ class Accelerator:
        self.init_handler = None
        self.fp8_recipe_handler = None
        self.autocast_handler = None
+        self.has_lomo_optimizer = False
+
        if kwargs_handlers is not None:
            for handler in kwargs_handlers:
                assert isinstance(
@ -383,8 +386,15 @@ class Accelerator:
            **kwargs,
        )

-        if self.fp8_recipe_handler is None and self.state.mixed_precision == "fp8":
-            self.fp8_recipe_handler = FP8RecipeKwargs(backend="MSAMP" if is_msamp_available() else "TE")
+        self.delayed_fp8_autocast = False
+        if self.fp8_recipe_handler is not None:
+            # We already check if FP8 is available during `self.state`
+            if self.state.mixed_precision != "fp8":
+                raise ValueError("Passing in a `FP8RecipeKwargs` object requires setting `mixed_precision='fp8'`.")
+            self.delayed_fp8_autocast = self.fp8_recipe_handler.backend == "TE" and self.distributed_type in (
+                DistributedType.MULTI_GPU,
+                DistributedType.FSDP,
+            )

        trackers = filter_trackers(log_with, self.logging_dir)
        if len(trackers) < 1 and log_with is not None:
@ -450,7 +460,7 @@ class Accelerator:
            and self.distributed_type not in (DistributedType.DEEPSPEED, DistributedType.MEGATRON_LM)
        ):
            self.native_amp = True
-            if self.device.type not in ("xpu", "cuda", "mps", "npu", "xla", "mlu") or is_torch_xla_available(
+            if self.device.type not in ("xpu", "cuda", "npu", "xla", "mlu") or is_torch_xla_available(
                check_is_tpu=True
            ):
                raise ValueError(f"fp16 mixed precision requires a GPU (not {self.device.type!r}).")
@ -479,6 +489,10 @@ class Accelerator:
            if mixed_precision == "bf16" and not self.native_amp and not is_torch_xla_available():
                raise ValueError("bf16 mixed precision requires PyTorch >= 1.10 and a supported device.")

+        elif self.state.mixed_precision == "fp8":
+            # We always enable `native_amp` for FP8
+            self.native_amp = True
+
        # Start of internal step tracking
        self.step = 0

@ -550,6 +564,10 @@ class Accelerator:
    def use_seedable_sampler(self):
        return self.dataloader_config.use_seedable_sampler

+    @property
+    def non_blocking(self):
+        return self.dataloader_config.non_blocking  # forwarded unchanged from the dataloader configuration
+
    @property
    def project_dir(self):
        return self.project_configuration.project_dir
@ -1345,18 +1363,22 @@ class Accelerator:
                model.forward = MethodType(convert_outputs_to_fp32(model.forward.__func__), model)
            else:
                model.forward = convert_outputs_to_fp32(new_forward)
-        elif self.mixed_precision == "fp8" and self.fp8_recipe_handler.backend == "TE":
+
+        # We prepare fp8 after, allowing for bf16 autocast to happen first
+        if getattr(self.fp8_recipe_handler, "backend", None) == "TE":
            if not has_transformer_engine_layers(model):
                with torch.no_grad():
                    convert_model(model)
                model._converted_to_transformer_engine = True
-            model._original_forward = model.forward

            kwargs = self.fp8_recipe_handler.to_kwargs() if self.fp8_recipe_handler is not None else {}
            if "fp8_format" in kwargs:
                kwargs["fp8_format"] = getattr(te_recipe.Format, kwargs["fp8_format"])
            fp8_recipe = te_recipe.DelayedScaling(**kwargs)
-            model.forward = fp8_autocast(enabled=True, fp8_recipe=fp8_recipe)(model.forward)
+            # If we are in DDP or FSDP, we delay `autocast` until after FSDP/DDP has been initialized
+            # to make use of the process group
+            if not self.delayed_fp8_autocast:
+                model.forward = fp8_autocast(enabled=True, fp8_recipe=fp8_recipe)(model.forward)

        if (getattr(model, "is_loaded_in_8bit", False) or getattr(model, "is_loaded_in_4bit", False)) and getattr(
            model, "hf_device_map", False
@ -1368,16 +1390,19 @@ class Accelerator:
                    " In order to use 8-bit models that have been loaded across multiple GPUs the solution is to use Naive Pipeline Parallelism."
                    " Therefore you should not specify that you are under any distributed regime in your accelerate config."
                )
-            current_device = list(model_devices)[0]
-            current_device_index = current_device.index if isinstance(current_device, torch.device) else current_device
+            elif len(model_devices) == 1:
+                current_device = list(model_devices)[0]
+                current_device_index = (
+                    current_device.index if isinstance(current_device, torch.device) else current_device
+                )

-            if torch.device(current_device_index) != self.device:
-                # if on the first device (GPU 0) we don't care
-                if (self.device.index is not None) or (current_device_index != 0):
-                    raise ValueError(
-                        "You can't train a model that has been loaded in 8-bit precision on a different device than the one "
-                        "you're training on. Make sure you loaded the model on the correct device using for example `device_map={'':torch.cuda.current_device() or device_map={'':torch.xpu.current_device()}"
-                    )
+                if torch.device(current_device_index) != self.device:
+                    # if on the first device (GPU 0) we don't care
+                    if (self.device.index is not None) or (current_device_index != 0):
+                        raise ValueError(
+                            "You can't train a model that has been loaded in 8-bit precision on a different device than the one "
+                            "you're training on. Make sure you loaded the model on the correct device using for example `device_map={'':torch.cuda.current_device() or device_map={'':torch.xpu.current_device()}"
+                        )

            if "cpu" in model_devices or "disk" in model_devices:
                raise ValueError(
@ -1447,6 +1472,73 @@ class Accelerator:
                            ),
                            auto_wrap_policy=fsdp_plugin.auto_wrap_policy,
                        )
+
+                # In the event the model had been loaded in low precision, but
+                # mixed precision had also been activated, then we follow DeepSpeed's
+                # strategy to hold the parameters in full precision.
+                # - assume that trainer.args.bf16 and trainer.args.fp16 are already checked against
+                #   fsdp_plugin.mixed_precision_policy.
+                # - NOTE: we do not check the mixed_precision attribute on the FSDP root wrapper.
+                #   * this attribute will always be set by init_utils.init_core_state, so it is never None.
+                #   * mixed_precision.param_dtype only regards _fwd_bwd_param_dtype
+                #   * if model is loaded in 16bit, and even if mixed_precision.param_dtype is None,
+                #     we still want to upcast the flat_param.
+                if self.mixed_precision != "no":  # if mixed precision is set
+                    upcasted_log = []
+                    for module in FSDP.fsdp_modules(model):
+                        # Referencing DeepSpeed Zero3
+                        # - in Init, params are converted to 16bit while partitioning.
+                        # - in accelerator.prepare, deepspeed.initialize is called to:
+                        #   * create the DeepSpeedEngine.
+                        #   * since zero_optimization() is True, call engine._configure_zero_optimizer.
+                        #
+                        # Inside the DeepSpeed Zero3 optimizer configuration, which initializes
+                        # DeepSpeedZeroOptimizer_Stage3, during which:
+                        #   * trainable_param_groups are obtained from the attached optimizer
+                        #     (already partitioned in 16bit).
+                        #   * then _setup_for_real_optimizer -> _create_fp32_partitions
+                        #     which performs the fp32 upcasting.
+
+                        # To mimic DeepSpeed's casting in FSDP, we look at the (single) FlatParameter held
+                        # within an FSDP wrapper. This FlatParameter will be seen by the optimizer.
+                        #  - even though there is a torch.device('meta') guard below, we
+                        #    expect _init_utils._init_param_handle_from_module to already
+                        #    sync the parameter.
+
+                        if not module._has_params:
+                            continue  # skip if FSDP module not managing parameters
+                        param = module._flat_param
+                        if (
+                            param.dtype != torch.float32
+                            and param.device != torch.device("meta")
+                            and param.requires_grad
+                        ):
+                            # keep log of names_params that was upcasted
+                            # NOTE: resorted to this because warnings.simplefilter("once") is somehow not working
+                            name_param_log = (module.module.__class__.__name__, ", ".join(module._flat_param._fqns))
+                            if name_param_log not in upcasted_log:
+                                upcasted_log.append(name_param_log)
+
+                            # this works because of FSDP's _runtime_utils.lazy_init.
+                            # Have to be careful not to call anything before this that
+                            # triggers lazy_init (e.g., _is_fsdp_root).
+                            param.data = param.data.to(torch.float32)  # upcasting
+                            module._handle._orig_param_dtype = torch.float32  # update
+
+                    # report the warnings
+                    # some messages can be quite repetitive, especially when reporting about layers that have identical architecture.
+                    if self.is_main_process:
+                        for name_log, param_log in upcasted_log:
+                            warnings.warn(
+                                f"Upcasted low precision parameters in {name_log} because mixed precision turned on in FSDP. "
+                                f"Affects: {param_log}."
+                            )
+
+                        if len(upcasted_log) > 0:
+                            warnings.warn(
+                                "FSDP upcast of low precision parameters may affect the precision of model checkpoints."
+                            )
+
                # if the previous and current models are same, delete the previous one
                if len(self._models) > 1 and (self._models[-2] is self._models[-1]):
                    del self._models[-2]
@ -1456,6 +1548,11 @@ class Accelerator:
                model = torch.nn.parallel.DistributedDataParallel(model, **kwargs)
            elif self.distributed_type == DistributedType.XLA and self.state.fork_launched:
                model = xmp.MpModelWrapper(model).to(self.device)
+        # Now we can apply the FP8 autocast
+        if self.delayed_fp8_autocast:
+            model.forward = fp8_autocast(enabled=True, fp8_recipe=fp8_recipe, fp8_group=model.process_group)(
+                model.forward
+            )
        # torch.compile should be called last and only if the model isn't already compiled.
        if self.state.dynamo_plugin.backend != DynamoBackend.NO and not is_compiled_module(model):
            if not is_torch_version(">=", "2.0"):
@ -1571,6 +1668,8 @@ class Accelerator:
                )

        if model is not None:
+            # if the model is an MOE, set the appropriate MOE layers as leaf Z3 modules
+            deepspeed_plugin.set_moe_leaf_modules(model)
            # deal with config keys that use `auto` value and rely on model's hidden_size
            hidden_size_based_keys = [
                "zero_optimization.reduce_bucket_size",
@ -1904,6 +2003,7 @@ class Accelerator:
            even_batches=self.even_batches,
            slice_fn_for_dispatch=slice_fn_for_dispatch,
            use_seedable_sampler=self.use_seedable_sampler,
+            non_blocking=self.non_blocking,
        )
        self._dataloaders.append(prepared_data_loader)
        return prepared_data_loader
@ -1930,6 +2030,14 @@ class Accelerator:
        >>> optimizer = accelerator.prepare_optimizer(optimizer, device_placement=True)
        ```
        """
+        if is_lomo_available():
+            # We need to import locally to avoid circular imports since lomo imports stuff from
+            # transformers & accelerate
+            from lomo_optim import AdaLomo, Lomo
+
+            # Support multiple optimizers: https://github.com/huggingface/accelerate/pull/2695#discussion_r1589164607
+            self.has_lomo_optimizer |= isinstance(optimizer, (Lomo, AdaLomo))
+
        # Ensure we can't double wrap an optimizer due to `find_batch_size`
        if getattr(optimizer, "_is_accelerate_prepared", False):
            if optimizer not in self._optimizers:
@ -2000,6 +2108,8 @@ class Accelerator:
        >>> accelerator.backward(loss)
        ```
        """
+        learning_rate = kwargs.get("learning_rate")
+
        if self.distributed_type != DistributedType.DEEPSPEED:
            # deepspeed handles loss scaling by gradient_accumulation_steps in its `backward`
            loss = loss / self.gradient_accumulation_steps
@ -2009,6 +2119,8 @@ class Accelerator:
            return
        elif self.scaler is not None:
            self.scaler.scale(loss).backward(**kwargs)
+        elif learning_rate is not None and self.has_lomo_optimizer:
+            self.lomo_backward(loss, learning_rate)
        else:
            loss.backward(**kwargs)

@ -2216,7 +2328,7 @@ class Accelerator:
        """
        return gather(tensor)

-    def gather_for_metrics(self, input_data):
+    def gather_for_metrics(self, input_data, use_gather_object=False):
        """
        Gathers `input_data` and potentially drops duplicates in the last batch if on a distributed system. Should be
        used for gathering the inputs and targets for metric calculation.
@ -2224,6 +2336,11 @@ class Accelerator:
        Args:
            input (`torch.Tensor`, `object`, a nested tuple/list/dictionary of `torch.Tensor`, or a nested tuple/list/dictionary of `object`):
                The tensors or objects for calculating metrics across all processes
+            use_gather_object(`bool`):
+                Whether to forcibly use gather_object instead of gather (which is already done if all objects passed do
+                not contain tensors). This flag can be useful for gathering tensors with different sizes that we don't
+                want to pad and concatenate along the first dimension. Using it with GPU tensors is not well supported
+                and inefficient as it incurs GPU -> CPU transfer since tensors would be pickled.

        Example:

@ -2248,7 +2365,9 @@ class Accelerator:
        except TypeError:
            all_tensors = False

-        if not all_tensors:
+        use_gather_object = use_gather_object or not all_tensors
+
+        if use_gather_object:
            data = gather_object(input_data)
        else:
            data = self.gather(input_data)
@ -2267,7 +2386,11 @@ class Accelerator:
                    def _adjust_samples(tensor):
                        return tensor[: self.gradient_state.remainder]

-                    return recursively_apply(_adjust_samples, data)
+                    if use_gather_object:
+                        # gather_object put the objects in a list
+                        return _adjust_samples(data)
+                    else:
+                        return recursively_apply(_adjust_samples, data)
                else:  # remainder is 0
                    # no remainder even though at end of dataloader, so nothing to do.
                    return data
@ -2780,7 +2903,7 @@ class Accelerator:
        for i, model in enumerate(self._models):
            if self.distributed_type == DistributedType.FSDP:
                logger.info("Saving FSDP model")
-                save_fsdp_model(self.state.fsdp_plugin, self, model, output_dir, i)
+                save_fsdp_model(self.state.fsdp_plugin, model, output_dir, i)
                logger.info(f"FSDP Model saved to output dir {output_dir}")
            elif self.distributed_type == DistributedType.DEEPSPEED:
                logger.info("Saving DeepSpeed Model and Optimizer")
@ -2799,7 +2922,7 @@ class Accelerator:
        if self.distributed_type == DistributedType.FSDP:
            for i, opt in enumerate(self._optimizers):
                logger.info("Saving FSDP Optimizer")
-                save_fsdp_optimizer(self.state.fsdp_plugin, self, opt, self._models[i], output_dir, i)
+                save_fsdp_optimizer(self.state.fsdp_plugin, opt, self._models[i], output_dir, i)
                logger.info(f"FSDP Optimizer saved to output dir {output_dir}")
        elif self.distributed_type not in [DistributedType.DEEPSPEED, DistributedType.MEGATRON_LM]:
            optimizers = self._optimizers
@ -2924,7 +3047,7 @@ class Accelerator:
        for i, model in enumerate(self._models):
            if self.distributed_type == DistributedType.FSDP:
                logger.info("Loading FSDP model")
-                load_fsdp_model(self.state.fsdp_plugin, self, model, input_dir, i)
+                load_fsdp_model(self.state.fsdp_plugin, model, input_dir, i)
                logger.info(f"FSDP Model loaded from input dir {input_dir}")
            elif self.distributed_type == DistributedType.DEEPSPEED:
                logger.info("Loading DeepSpeed Model and Optimizer")
@ -2943,7 +3066,7 @@ class Accelerator:
        if self.distributed_type == DistributedType.FSDP:
            for i, opt in enumerate(self._optimizers):
                logger.info("Loading FSDP Optimizer")
-                load_fsdp_optimizer(self.state.fsdp_plugin, self, opt, self._models[i], input_dir, i)
+                load_fsdp_optimizer(self.state.fsdp_plugin, opt, self._models[i], input_dir, i)
                logger.info(f"FSDP Optimizer loaded from input dir {input_dir}")
        elif self.distributed_type not in [DistributedType.DEEPSPEED, DistributedType.MEGATRON_LM]:
            optimizers = self._optimizers
@ -3002,7 +3125,7 @@ class Accelerator:
            for index, obj in enumerate(self._custom_objects):
                load_custom_state(obj, input_dir, index)

-    def free_memory(self):
+    def free_memory(self, *objects):
        """
        Will release all references to the internal objects stored and call the garbage collector. You should call this
        method between two trainings with different models/optimizers. Also will reset `Accelerator.step` to 0.
@ -3015,19 +3138,23 @@ class Accelerator:
        >>> accelerator = Accelerator()
        >>> model, optimizer, scheduler = ...
        >>> model, optimizer, scheduler = accelerator.prepare(model, optimizer, scheduler)
-        >>> accelerator.free_memory()
-        >>> del model, optimizer, scheduler
+        >>> model, optimizer, scheduler = accelerator.free_memory(model, optimizer, scheduler)
        ```
        """
+        # Deepspeed needs a bit more prep that should be done first
+        if hasattr(self, "deepspeed_engine_wrapped"):
+            if self.deepspeed_engine_wrapped is not None:
+                self.deepspeed_engine_wrapped.engine.destroy()
+            self.deepspeed_engine_wrapped = None
+        objects = release_memory(*objects)
        self._schedulers = []
        self._optimizers = []
        self._models = []
        self._dataloaders = []
-        self.deepspeed_engine_wrapped = None
        self.step = 0
-        release_memory()
+        return objects

-    def clear(self):
+    def clear(self, *objects):
        """
        Alias for [`Accelerate.free_memory`], releases all references to the internal objects stored and call the
        garbage collector. You should call this method between two trainings with different models/optimizers.
@ -3040,11 +3167,10 @@ class Accelerator:
        >>> accelerator = Accelerator()
        >>> model, optimizer, scheduler = ...
        >>> model, optimizer, scheduler = accelerator.prepare(model, optimizer, scheduler)
-        >>> accelerator.free_memory()
-        >>> del model, optimizer, scheduler
+        >>> model, optimizer, scheduler = accelerator.clear(model, optimizer, scheduler)
        ```
        """
-        self.free_memory()
+        return self.free_memory(*objects)

    def _get_named_parameters(self, *args):
        named_parameters = {}
@ -3257,3 +3383,27 @@ class Accelerator:
                return True

        return False
+
+    def lomo_backward(self, loss: torch.Tensor, learning_rate: float) -> None:
+        """
+        Runs the fused backward pass of each prepared LOMO optimizer. Raises `ValueError` if `learning_rate` is
+        `None` or if no prepared optimizer is a `Lomo`/`AdaLomo` instance.
+        """
+        # Stays empty when `lomo_optim` is missing, so the `isinstance` check below never hits unbound names
+        lomo_classes = ()
+        if is_lomo_available():
+            # Imported locally to avoid circular imports: lomo imports from transformers & accelerate
+            from lomo_optim import AdaLomo, Lomo
+
+            lomo_classes = (Lomo, AdaLomo)
+        if learning_rate is None:
+            raise ValueError("A learning rate must be passed in order to call backward pass with LOMO optimizers.")
+        _backward_called = False
+        for optimizer in self._optimizers:
+            if isinstance(optimizer.optimizer, lomo_classes):
+                optimizer.optimizer.fused_backward(loss, learning_rate)
+                _backward_called = True
+        if not _backward_called:
+            raise ValueError(
+                "Backward pass not properly called on LOMO optimizers. Are you sure you passed a LOMO optimizer in accelerator.prepare()?"
+            )
--- a/src/accelerate/big_modeling.py
+++ b/src/accelerate/big_modeling.py
@ -508,6 +508,7 @@ def load_checkpoint_and_dispatch(
    skip_keys: Optional[Union[str, List[str]]] = None,
    preload_module_classes: Optional[List[str]] = None,
    force_hooks: bool = False,
+    strict: bool = False,
 ):
    """
    Loads a (potentially sharded) checkpoint inside a model, potentially sending weights to a given device as they are
@ -554,6 +555,9 @@ def load_checkpoint_and_dispatch(
        force_hooks (`bool`, *optional*, defaults to `False`):
            Whether or not to force device hooks to be attached to the model even if all layers are dispatched to a
            single device.
+        strict (`bool`, *optional*, defaults to `False`):
+            Whether to strictly enforce that the keys in the checkpoint state_dict match the keys of the model's
+            state_dict.

    Example:

@ -608,6 +612,7 @@ def load_checkpoint_and_dispatch(
        dtype=dtype,
        offload_state_dict=offload_state_dict,
        offload_buffers=offload_buffers,
+        strict=strict,
    )
    if device_map is None:
        return model
--- a/src/accelerate/checkpointing.py
+++ b/src/accelerate/checkpointing.py
@ -120,8 +120,7 @@ def save_accelerator_state(
        from .data_loader import IterableDatasetShard, SeedableRandomSampler

        if isinstance(dataloader.dataset, IterableDatasetShard):
-            sampler = dataloader.sampler.sampler
-
+            sampler = dataloader.get_sampler()
            if isinstance(sampler, SeedableRandomSampler):
                save(sampler, output_sampler_file, save_on_each_node=save_on_each_node, safe_serialization=False)
        logger.info(f"Sampler state for dataloader {i} saved in {output_sampler_file}")
@ -227,10 +226,9 @@ def load_accelerator_state(
        from .data_loader import IterableDatasetShard, SeedableRandomSampler

        if isinstance(dataloader.dataset, IterableDatasetShard):
-            sampler = dataloader.sampler.sampler
-
+            sampler = dataloader.get_sampler()
            if isinstance(sampler, SeedableRandomSampler):
-                dataloader.sampler.sampler = torch.load(input_sampler_file)
+                sampler = dataloader.set_sampler(torch.load(input_sampler_file))
    logger.info("All dataloader sampler states loaded successfully")

    # GradScaler state
--- a/src/accelerate/commands/config/cluster.py
+++ b/src/accelerate/commands/config/cluster.py
@ -298,6 +298,18 @@ def get_cluster_input():
                        "When `zero3_init_flag` is set, it requires Transformers to be installed. "
                        "Please run `pip3 install transformers`."
                    )
+            use_moe = _ask_field(
+                "Do you want to enable Mixture-of-Experts training (MoE)? [yes/NO]: ",
+                _convert_yes_no_to_bool,
+                default=False,
+                error_message="Please enter yes or no.",
+            )
+            if use_moe:
+                deepspeed_config["deepspeed_moe_layer_cls_names"] = _ask_field(
+                    "Specify the comma-separated list of transformers MoE layer class names (case-sensitive), e.g : "
+                    " `MixtralSparseMoeBlock`, `Qwen2MoeSparseMoeBlock`, `JetMoEAttention,JetMoEBlock` ... : ",
+                    str,
+                )

            if num_machines > 1:
                launcher_query = "Which Type of launcher do you want to use?"
@ -567,7 +579,7 @@ def get_cluster_input():

    # CPU affinity is only supported on NVIDIA hardware for now
    enable_cpu_affinity = False
-    if distributed_type == (DistributedType.NO, DistributedType.MULTI_GPU) and not use_cpu and not use_mps:
+    if distributed_type in (DistributedType.NO, DistributedType.MULTI_GPU) and not use_cpu and not use_mps:
        enable_cpu_affinity = _ask_field(
            "Would you like to enable numa efficiency? (Currently only supported on NVIDIA hardware). [yes/NO]: ",
            _convert_yes_no_to_bool,
--- a/src/accelerate/commands/config/config_args.py
+++ b/src/accelerate/commands/config/config_args.py
@ -241,3 +241,4 @@ class SageMakerConfig(BaseConfig):
    sagemaker_metrics_file: str = None
    additional_args: dict = None
    dynamo_config: dict = None
+    enable_cpu_affinity: bool = False
--- a/src/accelerate/commands/config/default.py
+++ b/src/accelerate/commands/config/default.py
@ -95,6 +95,7 @@ def write_basic_config(mixed_precision="no", save_location: str = default_json_c
        config["num_processes"] = 1
        config["distributed_type"] = "NO"
    config["debug"] = False
+    config["enable_cpu_affinity"] = False
    config = ClusterConfig(**config)
    config.to_json_file(path)
    return path
--- a/src/accelerate/commands/env.py
+++ b/src/accelerate/commands/env.py
@ -79,6 +79,8 @@ def env_command(args):
    }
    if pt_cuda_available:
        info["GPU type"] = torch.cuda.get_device_name()
+    if pt_npu_available:
+        info["CANN version"] = torch.version.cann

    print("\nCopy-and-paste the text below in your GitHub issue\n")
    print("\n".join([f"- {prop}: {val}" for prop, val in info.items()]))
--- a/src/accelerate/commands/launch.py
+++ b/src/accelerate/commands/launch.py
@ -303,6 +303,15 @@ def launch_command_parser(subparsers=None):
        type=str,
        help="Tee std streams into a log file and also to console.",
    )
+    distributed_args.add_argument(
+        "--log_dir",
+        type=str,
+        default=None,
+        help=(
+            "Base directory to use for log files when using torchrun/torch.distributed.run as launcher. "
+            "Use with --tee to redirect std streams info log files."
+        ),
+    )
    distributed_args.add_argument(
        "--role",
        type=str,
@ -487,6 +496,13 @@ def launch_command_parser(subparsers=None):
        type=str,
        help="DeepSpeed multi-node launcher to use. If unspecified, will default to `pdsh`.",
    )
+    deepspeed_args.add_argument(
+        "--deepspeed_moe_layer_cls_names",
+        default=None,
+        type=str,
+        help="comma-separated list of transformer MoE layer class names (case-sensitive) to wrap ,e.g, `MixtralSparseMoeBlock`, `Qwen2MoeSparseMoeBlock`, `JetMoEAttention,JetMoEBlock` ..."
+        " (useful only when `use_deepspeed` flag is passed).",
+    )

    # fsdp arguments
    fsdp_args = parser.add_argument_group("FSDP Arguments", "Arguments related to Fully Shared Data Parallelism.")
@ -1027,8 +1043,8 @@ def _validate_launch_command(args):
        defaults is not None and defaults.compute_environment != ComputeEnvironment.AMAZON_SAGEMAKER
    )
    if is_aws_env_disabled and args.num_cpu_threads_per_process is None:
-        args.num_cpu_threads_per_process = 1
-        if args.use_cpu and args.num_processes >= 1:
+        args.num_cpu_threads_per_process = get_int_from_env(["OMP_NUM_THREADS"], 1)
+        if args.use_cpu and args.num_processes >= 1 and get_int_from_env(["OMP_NUM_THREADS"], 0) == 0:
            local_size = get_int_from_env(
                ["MPI_LOCALNRANKS", "OMPI_COMM_WORLD_LOCAL_SIZE", "MV2_COMM_WORLD_LOCAL_SIZE"], 1
            )
--- a/src/accelerate/data_loader.py
+++ b/src/accelerate/data_loader.py
@ -429,6 +429,7 @@ class DataLoaderShard(DataLoader, DataLoaderStateMixin):
        synchronized_generator=None,
        skip_batches=0,
        _drop_last: bool = False,
+        _non_blocking: bool = False,
        **kwargs,
    ):
        super().__init__(dataset, **kwargs)
@ -438,6 +439,7 @@ class DataLoaderShard(DataLoader, DataLoaderStateMixin):
        self.skip_batches = skip_batches
        self.gradient_state = GradientState()
        self._drop_last = _drop_last
+        self._non_blocking = _non_blocking
        self.iteration = 0

    def __iter__(self):
@ -458,7 +460,7 @@ class DataLoaderShard(DataLoader, DataLoaderStateMixin):
            try:
                # But we still move it to the device so it is done before `StopIteration` is reached
                if self.device is not None:
-                    current_batch = send_to_device(current_batch, self.device)
+                    current_batch = send_to_device(current_batch, self.device, non_blocking=self._non_blocking)
                next_batch = next(dataloader_iter)
                if batch_index >= self.skip_batches:
                    yield current_batch
@ -500,6 +502,18 @@ class DataLoaderShard(DataLoader, DataLoaderStateMixin):
        else:
            return len(self.dataset)

+    def get_sampler(self):
+        """
+        Returns the sampler of this dataloader by delegating to the module-level `get_sampler` helper.
+        """
+        return get_sampler(self)
+
+    def set_sampler(self, sampler):
+        """
+        Replaces the sampler used by this dataloader.
+
+        Args:
+            sampler: The sampler to install.
+        """
+        sampler_is_batch_sampler = isinstance(self.sampler, BatchSampler)
+        if sampler_is_batch_sampler:
+            # `self.sampler` is itself a `BatchSampler`: swap the sampler it wraps
+            self.sampler.sampler = sampler
+        else:
+            self.batch_sampler.sampler = sampler
+            # A batch sampler exposing a nested `batch_sampler` gets the new sampler there as well
+            if hasattr(self.batch_sampler, "batch_sampler"):
+                self.batch_sampler.batch_sampler.sampler = sampler
+

 if is_torch_xla_available():
    import torch_xla.distributed.parallel_loader as xpl
@ -571,7 +585,14 @@ class DataLoaderDispatcher(DataLoader, DataLoaderStateMixin):
    """

    def __init__(
-        self, dataset, split_batches: bool = False, skip_batches=0, _drop_last: bool = False, slice_fn=None, **kwargs
+        self,
+        dataset,
+        split_batches: bool = False,
+        skip_batches=0,
+        _drop_last: bool = False,
+        _non_blocking: bool = False,
+        slice_fn=None,
+        **kwargs,
    ):
        shuffle = False
        if is_torch_version(">=", "1.11.0"):
@ -588,6 +609,7 @@ class DataLoaderDispatcher(DataLoader, DataLoaderStateMixin):
        self.gradient_state = GradientState()
        self.state = AcceleratorState()
        self._drop_last = _drop_last
+        self._non_blocking = _non_blocking
        self.skip_batches = skip_batches

        self.slice_fn = slice_tensors if slice_fn is None else slice_fn
@ -660,7 +682,7 @@ class DataLoaderDispatcher(DataLoader, DataLoaderStateMixin):
            if self.state.process_index != 0:
                # Initialize tensors on other processes than process 0.
                batch = initialize_tensors(batch_info[0])
-            batch = send_to_device(batch, self.state.device)
+            batch = send_to_device(batch, self.state.device, non_blocking=self._non_blocking)
            # Broadcast the batch before splitting it.
            batch = broadcast(batch, from_process=0)

@ -741,6 +763,36 @@ class DataLoaderDispatcher(DataLoader, DataLoaderStateMixin):
    def total_dataset_length(self):
        return len(self.dataset)

+    def get_sampler(self):
+        """
+        Returns the sampler of this dataloader by delegating to the module-level `get_sampler` helper.
+        """
+        return get_sampler(self)
+
+    def set_sampler(self, sampler):
+        """
+        Replaces the sampler used by this dataloader.
+
+        Args:
+            sampler: The sampler to install.
+        """
+        sampler_is_batch_sampler = isinstance(self.sampler, BatchSampler)
+        if sampler_is_batch_sampler:
+            # `self.sampler` is itself a `BatchSampler`: swap the sampler it wraps
+            self.sampler.sampler = sampler
+        else:
+            self.batch_sampler.sampler = sampler
+            # A batch sampler exposing a nested `batch_sampler` gets the new sampler there as well
+            if hasattr(self.batch_sampler, "batch_sampler"):
+                self.batch_sampler.batch_sampler.sampler = sampler
+
+
+def get_sampler(dataloader):
+    """
+    Get the sampler associated to the dataloader
+
+    Args:
+        dataloader (`torch.utils.data.dataloader.DataLoader`):
+            The data loader to retrieve the sampler from.
+    Returns:
+        `torch.utils.data.Sampler`: The sampler associated to the dataloader, or `None` when the object that is
+        inspected exposes no `sampler` attribute.
+    """
+    sampler_is_batch_sampler = isinstance(dataloader.sampler, BatchSampler)
+    if sampler_is_batch_sampler:
+        # `dataloader.sampler` is itself a `BatchSampler`: the sampler of interest is the one it wraps
+        sampler = getattr(dataloader.sampler, "sampler", None)
+    else:
+        sampler = getattr(dataloader.batch_sampler, "sampler", None)
+    return sampler
+

 def prepare_data_loader(
    dataloader: DataLoader,
@ -754,6 +806,7 @@ def prepare_data_loader(
    even_batches: bool = True,
    slice_fn_for_dispatch: Optional[Callable] = None,
    use_seedable_sampler: bool = False,
+    non_blocking: bool = False,
 ) -> DataLoader:
    """
    Wraps a PyTorch `DataLoader` to generate batches for one of the processes only.
@ -812,6 +865,10 @@ def prepare_data_loader(
            reproducibility. Comes at a cost of potentially different performances due to different shuffling
            algorithms but ensures results will be the *exact* same. Should be paired with `set_seed()` at every
            `self.set_epoch`
+        non_blocking (`bool`, *optional*, defaults to `False`):
+            If set to `True`, dataloader will utilize non-blocking host-to-device transfers. If the dataloader has
+            `pin_memory` set to `True`, this will help to increase overlap between data transfer and computations.
+

    Returns:
        `torch.utils.data.dataloader.DataLoader`: A new data loader that will yield the portion of the batches
@ -863,13 +920,10 @@ def prepare_data_loader(
    new_dataset = dataloader.dataset
    # Iterable dataset doesn't like batch_sampler, but data_loader creates a default one for it
    new_batch_sampler = dataloader.batch_sampler if not isinstance(new_dataset, IterableDataset) else None
-    sampler_is_batch_sampler = False
-    synchronized_generator = None
    sampler_is_batch_sampler = isinstance(dataloader.sampler, BatchSampler)
-    if sampler_is_batch_sampler:
-        sampler = getattr(dataloader.sampler, "sampler", None)
-    else:
-        sampler = getattr(dataloader.batch_sampler, "sampler", None)
+    synchronized_generator = None
+
+    sampler = get_sampler(dataloader)
    if isinstance(sampler, RandomSampler) and use_seedable_sampler:
        # When iterating through the dataloader during distributed processes
        # we want to ensure that on each process we are iterating through the same
@ -901,6 +955,10 @@ def prepare_data_loader(
                split_batches=split_batches,
            )
        else:
+            if not use_seedable_sampler and hasattr(sampler, "generator"):
+                if sampler.generator is None:
+                    sampler.generator = torch.Generator()
+                synchronized_generator = sampler.generator
            batch_sampler = dataloader.sampler if sampler_is_batch_sampler else dataloader.batch_sampler
            new_batch_sampler = BatchSamplerShard(
                batch_sampler,
@ -941,6 +999,7 @@ def prepare_data_loader(
            split_batches=split_batches,
            batch_sampler=new_batch_sampler,
            _drop_last=dataloader.drop_last,
+            _non_blocking=non_blocking,
            slice_fn=slice_fn_for_dispatch,
            **kwargs,
        )
@ -952,6 +1011,7 @@ def prepare_data_loader(
            batch_size=dataloader.batch_size,
            rng_types=rng_types,
            _drop_last=dataloader.drop_last,
+            _non_blocking=non_blocking,
            synchronized_generator=synchronized_generator,
            **kwargs,
        )
@ -963,16 +1023,12 @@ def prepare_data_loader(
            rng_types=rng_types,
            synchronized_generator=synchronized_generator,
            _drop_last=dataloader.drop_last,
+            _non_blocking=non_blocking,
            **kwargs,
        )

    if isinstance(sampler, SeedableRandomSampler) and use_seedable_sampler:
-        if sampler_is_batch_sampler:
-            dataloader.sampler.sampler = sampler
-        else:
-            dataloader.batch_sampler.sampler = sampler
-            if hasattr(dataloader.batch_sampler, "batch_sampler"):
-                dataloader.batch_sampler.batch_sampler.sampler = sampler
+        dataloader.set_sampler(sampler)
    if state.distributed_type == DistributedType.XLA:
        return MpDeviceLoaderWrapper(dataloader, device)
    return dataloader
--- a/src/accelerate/logging.py
+++ b/src/accelerate/logging.py
@ -54,6 +54,8 @@ class MultiProcessAdapter(logging.LoggerAdapter):
            )
        main_process_only = kwargs.pop("main_process_only", True)
        in_order = kwargs.pop("in_order", False)
+        # set `stacklevel` to exclude ourself in `Logger.findCaller()` while respecting user's choice
+        kwargs.setdefault("stacklevel", 2)

        if self.isEnabledFor(level):
            if self._should_log(main_process_only):
--- a/src/accelerate/optimizer.py
+++ b/src/accelerate/optimizer.py
@ -18,7 +18,7 @@ import warnings
 import torch

 from .state import AcceleratorState, GradientState
-from .utils import DistributedType, honor_type, is_torch_xla_available
+from .utils import DistributedType, honor_type, is_lomo_available, is_torch_xla_available


 if is_torch_xla_available():
@ -121,7 +121,22 @@ class AcceleratedOptimizer(torch.optim.Optimizer):
                    raise ValueError("`set_to_none` for Optimizer.zero_grad` is not supported by this optimizer.")
                self.optimizer.zero_grad()

+    def train(self):
+        """
+        Sets the optimizer to "train" mode. Useful for optimizers like `schedule_free`
+
+        Wrapped optimizers that do not define a callable `train` method are left untouched, so this can be called
+        regardless of which optimizer is wrapped.
+
+        Returns:
+            Whatever the wrapped optimizer's `train()` returns, or `None` when it has no such method.
+        """
+        train_fn = getattr(self.optimizer, "train", None)
+        if callable(train_fn):
+            return train_fn()
+        return None
+
+    def eval(self):
+        """
+        Sets the optimizer to "eval" mode. Useful for optimizers like `schedule_free`
+
+        Wrapped optimizers that do not define a callable `eval` method are left untouched, so this can be called
+        regardless of which optimizer is wrapped.
+
+        Returns:
+            Whatever the wrapped optimizer's `eval()` returns, or `None` when it has no such method.
+        """
+        eval_fn = getattr(self.optimizer, "eval", None)
+        if callable(eval_fn):
+            return eval_fn()
+        return None
+
    def step(self, closure=None):
+        if is_lomo_available():
+            from lomo_optim import AdaLomo, Lomo
+
        if (
            not self.gradient_state.is_xla_gradients_synced
            and self.accelerator_state.distributed_type == DistributedType.XLA
@ -129,6 +144,12 @@ class AcceleratedOptimizer(torch.optim.Optimizer):
            gradients = xm._fetch_gradients(self.optimizer)
            xm.all_reduce("sum", gradients, scale=1.0 / xm.xrt_world_size())
            self.gradient_state.is_xla_gradients_synced = True
+
+        if is_lomo_available():
+            #  `step` should be a no-op for LOMO optimizers.
+            if isinstance(self.optimizer, (Lomo, AdaLomo)):
+                return
+
        if self.gradient_state.sync_gradients:
            if self.scaler is not None:
                self.optimizer.step = self._optimizer_patched_step_method
--- a/src/accelerate/state.py
+++ b/src/accelerate/state.py
@ -179,22 +179,14 @@ class PartialState:
                )

            # Sets up self.backend + imports
-            backend, distributed_type = self._prepare_backend(cpu, use_sagemaker_dp, kwargs.pop("backend", None))
+            original_backend = kwargs.pop("backend", None)
+            backend, distributed_type = self._prepare_backend(cpu, use_sagemaker_dp, original_backend)
+            if original_backend is not None and backend != original_backend:
+                # f-string so the requested and the resolved backend names are interpolated into the message
+                raise ValueError(f"Your assigned backend {original_backend} is not avaliable, please use {backend}")
            self.backend = backend
            self.distributed_type = distributed_type
            use_deepspeed = False
-            if not cpu:
-                # Deal with XLA
-                if is_torch_xla_available():
-                    self.device = xm.xla_device()
-                    xm.set_replication(self.device, xm.get_xla_supported_devices())
-                    self.num_processes = xm.xrt_world_size()
-                    self.process_index = xm.get_ordinal()
-                    if is_torch_xla_available(check_is_tpu=True):
-                        self.local_process_index = xm.get_local_ordinal()
-                    else:
-                        self.local_process_index = int(os.environ.get("LOCAL_RANK", -1))
-                    self.distributed_type = DistributedType.XLA
+            if not cpu and self.backend != "xla":
                if int(os.environ.get("LOCAL_RANK", -1)) != -1:
                    # Deal with spawning deepspeed
                    if os.environ.get("ACCELERATE_USE_DEEPSPEED", "false") == "true":
@ -204,7 +196,7 @@ class PartialState:
                            )
                        from deepspeed import comm as dist

-                        if is_xpu_available and is_ccl_available():
+                        if is_xpu_available() and is_ccl_available():
                            os.environ["CCL_PROCESS_LAUNCHER"] = "none"
                            os.environ["CCL_LOCAL_SIZE"] = os.environ.get("LOCAL_WORLD_SIZE", "1")
                            os.environ["CCL_LOCAL_RANK"] = os.environ.get("LOCAL_RANK", "0")
@ -246,7 +238,7 @@ class PartialState:

                if (
                    self.distributed_type == DistributedType.MULTI_CPU
-                    and get_int_from_env(["OMP_NUM_THREADS", "OMP_NUM_THREADS"], 0) > 0
+                    and get_int_from_env(["OMP_NUM_THREADS"], 0) == 0
                ):
                    import psutil

@ -270,6 +262,16 @@ class PartialState:
                self.num_processes = 1
                self.process_index = 0
                self.local_process_index = 0
+            elif self.backend == "xla":
+                # XLA needs device setting first for `set_replication`
+                self.set_device()
+                xm.set_replication(self.device, xm.get_xla_supported_devices())
+                self.num_processes = xm.xrt_world_size()
+                self.process_index = xm.get_ordinal()
+                if is_torch_xla_available(check_is_tpu=True):
+                    self.local_process_index = xm.get_local_ordinal()
+                else:
+                    self.local_process_index = int(os.environ.get("LOCAL_RANK", -1))
            else:
                self.num_processes = torch.distributed.get_world_size()
                self.process_index = torch.distributed.get_rank()
@ -284,16 +286,17 @@ class PartialState:
            # Set CPU affinity if enabled
            if parse_flag_from_env("ACCELERATE_CPU_AFFINITY", False):
                set_numa_affinity(self.local_process_index)
-        self.fork_launched = parse_flag_from_env("FORK_LAUNCHED", 0)

-        # Check for old RTX 4000's that can't use P2P or IB and are on old drivers
-        if self.device.type == "cuda" and not check_cuda_p2p_ib_support():
-            if "NCCL_P2P_DISABLE" not in os.environ or "NCCL_IB_DISABLE" not in os.environ:
-                raise NotImplementedError(
-                    "Using RTX 4000 series doesn't support faster communication broadband via P2P or IB. "
-                    'Please set `NCCL_P2P_DISABLE="1"` and `NCCL_IB_DISABLE="1" or use `accelerate launch` which '
-                    "will do this automatically."
-                )
+            # Check for old RTX 4000's that can't use P2P or IB and are on old drivers
+            if self.device.type == "cuda" and not check_cuda_p2p_ib_support():
+                if "NCCL_P2P_DISABLE" not in os.environ or "NCCL_IB_DISABLE" not in os.environ:
+                    raise NotImplementedError(
+                        "Using RTX 4000 series doesn't support faster communication broadband via P2P or IB. "
+                        'Please set `NCCL_P2P_DISABLE="1"` and `NCCL_IB_DISABLE="1" or use `accelerate launch` which '
+                        "will do this automatically."
+                    )
+        # Important: This should be the *only* code outside of `self.initialized!`
+        self.fork_launched = parse_flag_from_env("FORK_LAUNCHED", 0)

    def __repr__(self) -> str:
        return (
@ -715,19 +718,22 @@ class PartialState:

            backend = "smddp"
            distributed_type = DistributedType.MULTI_GPU
-        elif int(os.environ.get("LOCAL_RANK", -1)) != -1:
-            if not cpu:
-                if is_mlu_available():
-                    backend = "cncl"
-                    distributed_type = DistributedType.MULTI_MLU
-                elif torch.cuda.is_available():
-                    if backend is None:
-                        backend = "nccl"
-                    distributed_type = DistributedType.MULTI_GPU
-                elif is_npu_available():
-                    backend = "hccl"
-                    distributed_type = DistributedType.MULTI_NPU
-        if backend is None and (
+        elif is_torch_xla_available():
+            backend = "xla"
+            distributed_type = DistributedType.XLA
+        elif int(os.environ.get("LOCAL_RANK", -1)) != -1 and not cpu:
+            if is_mlu_available():
+                backend = "cncl"
+                distributed_type = DistributedType.MULTI_MLU
+            elif torch.cuda.is_available():
+                if backend is None:
+                    backend = "nccl"
+                distributed_type = DistributedType.MULTI_GPU
+            elif is_npu_available():
+                backend = "hccl"
+                distributed_type = DistributedType.MULTI_NPU
+
+        if distributed_type is None and (
            int(os.environ.get("LOCAL_RANK", -1)) != -1
            or get_int_from_env(["PMI_SIZE", "OMPI_COMM_WORLD_SIZE", "MV2_COMM_WORLD_SIZE", "WORLD_SIZE"], 1) > 1
        ):
@ -735,8 +741,11 @@ class PartialState:
                distributed_type = DistributedType.MULTI_XPU
            else:
                distributed_type = DistributedType.MULTI_CPU
-            if is_ccl_available() and (
-                get_int_from_env(["CCL_WORKER_COUNT"], 0) > 0 or distributed_type == DistributedType.MULTI_XPU
+
+            if (
+                backend in (None, "ccl")
+                and is_ccl_available()
+                and (get_int_from_env(["CCL_WORKER_COUNT"], 0) > 0 or distributed_type == DistributedType.MULTI_XPU)
            ):
                if get_ccl_version() >= "1.12":
                    import oneccl_bindings_for_pytorch  # noqa: F401
@ -744,12 +753,13 @@ class PartialState:
                    import torch_ccl  # noqa: F401

                backend = "ccl"
-            elif torch.distributed.is_mpi_available():
+            elif backend in (None, "mpi") and torch.distributed.is_mpi_available():
                backend = "mpi"
            else:
                backend = "gloo"
        if distributed_type is None:
            distributed_type = DistributedType.NO
+
        return backend, distributed_type

    def set_device(self):
@ -758,17 +768,20 @@ class PartialState:
        """
        if self.device is not None:
            return
-        if self.num_processes == 1:
+        if self.distributed_type == DistributedType.NO:
            self.device = torch.device("cpu") if self._cpu else self.default_device
            return
        device = str(self.distributed_type).split(".")[-1].replace("MULTI_", "").lower()
-        if device not in ("cpu", "gpu", "mlu", "npu", "xpu"):
+        if device not in ("cpu", "gpu", "mlu", "npu", "xpu", "xla"):
            raise ValueError(
                f"Can't set device for {self.distributed_type} ({device}), verify we should be calling `_set_device()` for it!"
            )
-        if device == "gpu":
-            device = "cuda"
-        self.device = torch.device(device, self.local_process_index)
+        if device == "xla":
+            self.device = xm.xla_device()
+        else:
+            if device == "gpu":
+                device = "cuda"
+            self.device = torch.device(device, self.local_process_index)
        if self.device is not None:
            if device == "xpu":
                torch.xpu.set_device(self.device)
@ -893,7 +906,6 @@ class AcceleratorState:
                        fsdp_plugin.set_mixed_precision(self._mixed_precision)
                    self.fsdp_plugin = fsdp_plugin
                if os.environ.get("ACCELERATE_USE_MEGATRON_LM", "false") == "true" and self.distributed_type not in [
-                    DistributedType.MULTI_NPU,
                    DistributedType.MULTI_XPU,
                ]:
                    self.distributed_type = DistributedType.MEGATRON_LM
--- a/src/accelerate/test_utils/init.py
+++ b/src/accelerate/test_utils/init.py
@ -38,6 +38,7 @@ from .testing import (
    require_single_gpu,
    require_single_xpu,
    require_torch_min_version,
+    require_torchvision,
    require_tpu,
    require_xpu,
    skip,
--- a/src/accelerate/test_utils/scripts/test_distributed_data_loop.py
+++ b/src/accelerate/test_utils/scripts/test_distributed_data_loop.py
@ -20,12 +20,48 @@ from typing import List
 from unittest.mock import Mock

 import torch
-from torch.utils.data import DataLoader, IterableDataset, TensorDataset
+from torch.utils.data import (
+    BatchSampler,
+    DataLoader,
+    Dataset,
+    IterableDataset,
+    RandomSampler,
+    TensorDataset,
+    default_collate,
+)

 from accelerate.accelerator import Accelerator, DataLoaderConfiguration
 from accelerate.utils.dataclasses import DistributedType


+NUM_ELEMENTS = 22
+NUM_WORKERS = 4
+BATCH_SIZE = 4
+
+
+class DummyDataset(Dataset):
+    """
+    Map-style dataset made of `NUM_ELEMENTS` synthetic samples.
+
+    Each sample is a dict with its `index`, a `label` (`index % 2`) and a `random_augmentation` value drawn with
+    `torch.rand` every time the sample is accessed.
+    """
+
+    def __len__(self):
+        return NUM_ELEMENTS
+
+    def __getitem__(self, index):
+        """
+        Returns a single sample dict for an `int` index, or a list of sample dicts for a slice or any other
+        iterable of indices.
+        """
+        squeeze = False
+
+        if isinstance(index, int):
+            index = [index]
+            squeeze = True
+        elif isinstance(index, slice):
+            # Resolve the slice against the dataset length (the class defines no `size` attribute)
+            index = list(range(*index.indices(len(self))))
+        else:
+            index = list(index)
+
+        batch = [{"index": i, "label": i % 2, "random_augmentation": torch.rand(1).item()} for i in index]
+
+        # An `int` index yields the sample itself rather than a one-element list
+        if squeeze:
+            batch = batch[0]
+
+        return batch
+
+
 class DummyIterableDataset(IterableDataset):
    def __init__(self, data):
        self.data = data
@ -206,8 +242,27 @@ def test_join_raises_warning_for_iterable_when_overriding_even_batches():
        assert "only supported for map-style datasets" in str(w[-1].message)


+def test_data_loader(data_loader, accelerator):
+    """
+    Prepares `data_loader` with `accelerator`, iterates over it once while gathering the `index` field of every
+    batch with `gather_for_metrics`, and asserts that the number of distinct gathered indices equals
+    `NUM_ELEMENTS`.
+    """
+    # Prepare the DataLoader
+    data_loader = accelerator.prepare(data_loader)
+
+    all_examples = []
+    for i, batch in enumerate(data_loader):
+        # Only the gathered indices are used; the gathered labels are discarded
+        index, _ = accelerator.gather_for_metrics((batch["index"], batch["label"]))
+        all_examples.extend(index.detach().cpu().numpy().tolist())
+
+    # Sort the examples
+    sorted_all_examples = sorted(all_examples)
+
+    # Check if all elements are present in the sorted list of iterated samples
+    assert (
+        len(set(sorted_all_examples)) == NUM_ELEMENTS
+    ), "Not all the dataset elements have been iterated in an epoch due to duplication of samples across processes."
+
+
 def main():
    accelerator = create_accelerator()
+    torch.manual_seed(accelerator.process_index)

    accelerator.print("Test that even_batches variable ensures uniform batches across processes")
    test_default_ensures_even_batch_sizes()
@ -233,6 +288,25 @@ def main():
    test_join_raises_warning_for_non_ddp_distributed(accelerator)
    accelerator.state.distributed_type = original_state

+    dataset = DummyDataset()
+    # Conventional Dataloader with shuffle=False
+    loader = DataLoader(dataset, shuffle=False, batch_size=BATCH_SIZE, num_workers=NUM_WORKERS)
+    test_data_loader(loader, accelerator)
+
+    # Conventional Dataloader with shuffle=True
+    loader = DataLoader(dataset, shuffle=True, batch_size=BATCH_SIZE, num_workers=NUM_WORKERS)
+    test_data_loader(loader, accelerator)
+
+    # Dataloader with batch_sampler
+    sampler = BatchSampler(RandomSampler(dataset), batch_size=BATCH_SIZE, drop_last=False)
+    loader = DataLoader(dataset, batch_sampler=sampler, num_workers=NUM_WORKERS)
+    test_data_loader(loader, accelerator)
+
+    # Dataloader with sampler as an instance of `BatchSampler`
+    sampler = BatchSampler(RandomSampler(dataset), batch_size=BATCH_SIZE, drop_last=False)
+    loader = DataLoader(dataset, sampler=sampler, batch_size=None, collate_fn=default_collate, num_workers=NUM_WORKERS)
+    test_data_loader(loader, accelerator)
+

 if __name__ == "__main__":
    main()
--- a/src/accelerate/test_utils/scripts/test_script.py
+++ b/src/accelerate/test_utils/scripts/test_script.py
@ -22,7 +22,6 @@ from copy import deepcopy
 from pathlib import Path

 import numpy as np
-import pytest
 import torch
 from torch.utils.data import DataLoader, Dataset

@ -39,6 +38,7 @@ from accelerate.utils import (
    is_ipex_available,
    is_mlu_available,
    is_npu_available,
+    is_pytest_available,
    is_xpu_available,
    set_seed,
    synchronize_rng_states,
@ -711,6 +711,8 @@ def test_trigger():


 def test_reinstantiated_state():
+    import pytest
+
    AcceleratorState._reset_state()
    simple_model = torch.nn.Linear(1, 1)
    # First define an accelerator
@ -792,9 +794,10 @@ def main():
        print("\n**Breakpoint trigger test**")
    test_trigger()

-    if state.local_process_index == 0:
-        print("\n**Test reinstantiated state**")
-    test_reinstantiated_state()
+    if is_pytest_available():
+        if state.local_process_index == 0:
+            print("\n**Test reinstantiated state**")
+        test_reinstantiated_state()


 if __name__ == "__main__":
--- a/src/accelerate/test_utils/testing.py
+++ b/src/accelerate/test_utils/testing.py
@ -45,10 +45,12 @@ from ..utils import (
    is_npu_available,
    is_pandas_available,
    is_pippy_available,
+    is_schedulefree_available,
    is_tensorboard_available,
    is_timm_available,
    is_torch_version,
    is_torch_xla_available,
+    is_torchvision_available,
    is_transformers_available,
    is_wandb_available,
    is_xpu_available,
@ -213,6 +215,20 @@ def require_timm(test_case):
    return unittest.skipUnless(is_timm_available(), "test requires the timm library")(test_case)


+def require_torchvision(test_case):
+    """
+    Decorator marking a test that requires torchvision. These tests are skipped when torchvision isn't available.
+    """
+    return unittest.skipUnless(is_torchvision_available(), "test requires the torchvision library")(test_case)
+
+
+def require_schedulefree(test_case):
+    """
+    Decorator marking a test that requires schedulefree. These tests are skipped when schedulefree isn't available.
+    """
+    return unittest.skipUnless(is_schedulefree_available(), "test requires the schedulefree library")(test_case)
+
+
 def require_bnb(test_case):
    """
    Decorator marking a test that requires bitsandbytes. These tests are skipped when they are not.
--- a/src/accelerate/utils/init.py
+++ b/src/accelerate/utils/init.py
@ -81,6 +81,7 @@ from .imports import (
    is_dvclive_available,
    is_fp8_available,
    is_ipex_available,
+    is_lomo_available,
    is_megatron_lm_available,
    is_mlflow_available,
    is_mlu_available,
@ -91,11 +92,14 @@ from .imports import (
    is_peft_available,
    is_pippy_available,
    is_pynvml_available,
+    is_pytest_available,
    is_rich_available,
    is_sagemaker_available,
+    is_schedulefree_available,
    is_tensorboard_available,
    is_timm_available,
    is_torch_xla_available,
+    is_torchvision_available,
    is_transformer_engine_available,
    is_transformers_available,
    is_wandb_available,
--- a/src/accelerate/utils/dataclasses.py
+++ b/src/accelerate/utils/dataclasses.py
@ -154,6 +154,8 @@ class InitProcessGroupKwargs(KwargsHandler):
    [method](https://pytorch.org/docs/stable/distributed.html#torch.distributed.init_process_group) for more
    information on each argument.

+    Note: If `timeout` is set to `None`, the default will be based upon how `backend` is set.
+
    ```python
    from datetime import timedelta
    from accelerate import Accelerator
@ -166,7 +168,12 @@ class InitProcessGroupKwargs(KwargsHandler):

    backend: Optional[str] = "nccl"
    init_method: Optional[str] = None
-    timeout: timedelta = timedelta(seconds=1800)
+    timeout: Optional[timedelta] = None
+
+    def __post_init__(self):
+        # Resolve the backend-dependent default when no timeout was given: 600 seconds for `nccl`,
+        # 1800 seconds for any other backend.
+        if self.timeout is None:
+            seconds = 1800 if self.backend != "nccl" else 600
+            self.timeout = timedelta(seconds=seconds)


 # Literals
@ -524,6 +531,14 @@ class DataLoaderConfiguration:
            "multiple different seeds to compare. Should also be ran with [`~utils.set_seed`] for the best results."
        },
    )
+    non_blocking: bool = field(
+        default=False,
+        metadata={
+            "help": "If set to `True`, the dataloader prepared by the Accelerator will utilize non-blocking host-to-device"
+            " transfers, allowing for better overlap between dataloader communication and computation.  Recommended that the"
+            " prepared dataloader has `pin_memory` set to `True` to work properly."
+        },
+    )


@dataclass
@ -682,15 +697,15 @@ class DeepSpeedPlugin:
        default=None,
        metadata={"help": "Possible options are 0,1,2,3; Default will be taken from environment variable"},
    )
-    is_train_batch_min: str = field(
+    is_train_batch_min: bool = field(
        default=True,
        metadata={"help": "If both train & eval dataloaders are specified, this will decide the train_batch_size"},
    )
-    offload_optimizer_device: bool = field(
+    offload_optimizer_device: str = field(
        default=None,
        metadata={"help": "Possible options are none|cpu|nvme. Only applicable with ZeRO Stages 2 and 3."},
    )
-    offload_param_device: bool = field(
+    offload_param_device: str = field(
        default=None,
        metadata={"help": "Possible options are none|cpu|nvme. Only applicable with ZeRO Stage 3."},
    )
@ -713,6 +728,13 @@ class DeepSpeedPlugin:
        default=None,
        metadata={"help": "Flag to indicate whether to save 16-bit model. Only applicable with ZeRO Stage-3."},
    )
+    transformer_moe_cls_names: str = field(
+        default=None,
+        metadata={
+            "help": "comma-separated list of transformers MoE layer class names (case-sensitive), e.g : "
+            " `MixtralSparseMoeBlock`, `Qwen2MoeSparseMoeBlock`, `JetMoEAttention,JetMoEBlock` ..."
+        },
+    )

    def __post_init__(self):
        from .deepspeed import HfDeepSpeedConfig
@ -722,9 +744,8 @@ class DeepSpeedPlugin:
            self.gradient_accumulation_steps = int(gas) if gas.isdigit() else gas

        if self.gradient_clipping is None:
-            gradient_clipping = os.environ.get("ACCELERATE_GRADIENT_CLIPPING", "none")
-            if gradient_clipping != "none":
-                self.gradient_clipping = float(gradient_clipping)
+            gradient_clipping = os.environ.get("ACCELERATE_GRADIENT_CLIPPING", "auto")
+            self.gradient_clipping = gradient_clipping if gradient_clipping == "auto" else float(gradient_clipping)

        if self.zero_stage is None:
            self.zero_stage = int(os.environ.get("ACCELERATE_DEEPSPEED_ZERO_STAGE", 2))
@ -968,6 +989,26 @@ class DeepSpeedPlugin:
                "It will only ask for the necessary config variables when using `deepspeed_config_file`."
            )

+    def set_moe_leaf_modules(self, model):
+        """
+        Registers the configured MoE layer classes of `model` through DeepSpeed's `set_z3_leaf_modules`.
+
+        The class names are read from `transformer_moe_cls_names`, falling back to the
+        `ACCELERATE_DEEPSPEED_MOE_LAYER_CLS_NAMES` environment variable. Nothing is done when neither is set.
+
+        Args:
+            model: The model that is searched for the named layer classes.
+
+        Raises:
+            ImportError: If the installed DeepSpeed version is lower than 0.14.0.
+            Exception: If one of the class names cannot be found in `model`.
+        """
+        if self.transformer_moe_cls_names is None:
+            self.transformer_moe_cls_names = os.environ.get("ACCELERATE_DEEPSPEED_MOE_LAYER_CLS_NAMES", None)
+        if self.transformer_moe_cls_names is not None:
+            if compare_versions("deepspeed", "<", "0.14.0"):
+                raise ImportError("DeepSpeed version must be >= 0.14.0 to use MOE support. Please update DeepSpeed.")
+            from deepspeed.utils import set_z3_leaf_modules
+
+            # Strip whitespace around each name so that a list written as "A, B" resolves both classes
+            class_names = [name.strip() for name in self.transformer_moe_cls_names.split(",")]
+            transformer_moe_cls = []
+            for layer_class in class_names:
+                transformer_cls = get_module_class_from_name(model, layer_class)
+                if transformer_cls is None:
+                    raise Exception(
+                        f"Could not find a transformer layer class called '{layer_class}' to wrap in the model."
+                    )
+                else:
+                    transformer_moe_cls.append(transformer_cls)
+            set_z3_leaf_modules(model, transformer_moe_cls)  # z3_leaf
+

@dataclass
 class FullyShardedDataParallelPlugin:
@ -1109,6 +1150,13 @@ class FullyShardedDataParallelPlugin:
        self.forward_prefetch = str_to_bool(os.environ.get(prefix + "FORWARD_PREFETCH", "False")) == 1
        self.activation_checkpointing = str_to_bool(os.environ.get(prefix + "ACTIVATION_CHECKPOINTING", "False")) == 1

+        if str_to_bool(os.environ.get("FSDP_CPU_RAM_EFFICIENT_LOADING", "False")) == 1 and not self.sync_module_states:
+            warnings.warn(
+                "sync_module_states cannot be False since efficient cpu ram loading enabled. "
+                "Setting sync_module_states to True."
+            )
+            self.sync_module_states = True
+
        if self.sync_module_states:
            if is_npu_available():
                device = torch.npu.current_device()
@ -1122,26 +1170,6 @@ class FullyShardedDataParallelPlugin:
                )
            self.param_init_fn = lambda x: x.to_empty(device=device, recurse=False)

-    @staticmethod
-    def get_module_class_from_name(module, name):
-        """
-        Gets a class from a module by its name.
-
-        Args:
-            module (`torch.nn.Module`): The module to get the class from.
-            name (`str`): The name of the class.
-        """
-        modules_children = list(module.children())
-        if module.__class__.__name__ == name:
-            return module.__class__
-        elif len(modules_children) == 0:
-            return
-        else:
-            for child_module in modules_children:
-                module_class = FullyShardedDataParallelPlugin.get_module_class_from_name(child_module, name)
-                if module_class is not None:
-                    return module_class
-
    def set_auto_wrap_policy(self, model):
        from torch.distributed.fsdp.wrap import size_based_auto_wrap_policy, transformer_auto_wrap_policy

@ -1156,7 +1184,7 @@ class FullyShardedDataParallelPlugin:
                ).split(",")
                transformer_cls_to_wrap = set()
                for layer_class in transformer_cls_names_to_wrap:
-                    transformer_cls = FullyShardedDataParallelPlugin.get_module_class_from_name(model, layer_class)
+                    transformer_cls = get_module_class_from_name(model, layer_class)
                    if transformer_cls is None:
                        raise Exception("Could not find the transformer layer class to wrap in the model.")
                    else:
@ -1199,6 +1227,8 @@ class FullyShardedDataParallelPlugin:
        from torch.distributed.fsdp.fully_sharded_data_parallel import (
            FullOptimStateDictConfig,
            FullStateDictConfig,
+            ShardedOptimStateDictConfig,
+            ShardedStateDictConfig,
            StateDictType,
        )

@ -1209,6 +1239,11 @@ class FullyShardedDataParallelPlugin:
                self.state_dict_config = FullStateDictConfig(offload_to_cpu=True, rank0_only=True)
            if self.optim_state_dict_config is None:
                self.optim_state_dict_config = FullOptimStateDictConfig(offload_to_cpu=True, rank0_only=True)
+        elif self.state_dict_type == StateDictType.SHARDED_STATE_DICT:
+            if self.state_dict_config is None:
+                self.state_dict_config = ShardedStateDictConfig(offload_to_cpu=True)
+            if self.optim_state_dict_config is None:
+                self.optim_state_dict_config = ShardedOptimStateDictConfig(offload_to_cpu=True)


@dataclass
@ -1715,3 +1750,23 @@ class BnbQuantizationConfig:

        if not isinstance(self.torch_dtype, torch.dtype):
            raise ValueError("torch_dtype must be a torch.dtype")
+
+
+def get_module_class_from_name(module, name):
+    """
+    Searches a module tree depth-first for a class whose `__name__` equals `name`.
+
+    Args:
+        module (`torch.nn.Module`): Root of the tree to search; the root itself is checked first.
+        name (`str`): The class name to look for.
+
+    Returns:
+        The class of the first matching module, or `None` when no module in the tree matches.
+    """
+    children = list(module.children())
+    own_class = module.__class__
+    if own_class.__name__ == name:
+        return own_class
+    # Descend child by child and stop at the first subtree that yields a match.
+    for child in children:
+        found = get_module_class_from_name(child, name)
+        if found is not None:
+            return found
+    return None
--- a/src/accelerate/utils/fsdp_utils.py
+++ b/src/accelerate/utils/fsdp_utils.py
@ -16,6 +16,7 @@ import os
 import torch

 from ..logging import get_logger
+from ..state import PartialState
 from .constants import FSDP_MODEL_NAME, FSDP_PYTORCH_VERSION, OPTIMIZER_NAME
 from .imports import is_torch_distributed_available
 from .modeling import is_peft_model
@ -51,13 +52,14 @@ def _set_model_state_dict(model, state_dict, adapter_only=False):
        return model.load_state_dict(state_dict)


-def save_fsdp_model(fsdp_plugin, accelerator, model, output_dir, model_index=0, adapter_only=False):
+def save_fsdp_model(fsdp_plugin, model, output_dir, model_index=0, adapter_only=False):
+    state = PartialState()
    os.makedirs(output_dir, exist_ok=True)

    if fsdp_plugin.state_dict_type == StateDictType.FULL_STATE_DICT:
        # FSDP raises error when single GPU is used with `offload_to_cpu=True` for FULL_STATE_DICT
        # so, only enable it when num_processes>1
-        is_multi_process = accelerator.num_processes > 1
+        is_multi_process = state.num_processes > 1
        fsdp_plugin.state_dict_config.offload_to_cpu = is_multi_process
        fsdp_plugin.state_dict_config.rank0_only = is_multi_process

@ -68,15 +70,15 @@ def save_fsdp_model(fsdp_plugin, accelerator, model, output_dir, model_index=0,
        if fsdp_plugin.state_dict_type == StateDictType.FULL_STATE_DICT:
            weights_name = f"{FSDP_MODEL_NAME}.bin" if model_index == 0 else f"{FSDP_MODEL_NAME}_{model_index}.bin"
            output_model_file = os.path.join(output_dir, weights_name)
-            if accelerator.process_index == 0:
+            if state.process_index == 0:
                logger.info(f"Saving model to {output_model_file}")
                torch.save(state_dict, output_model_file)
                logger.info(f"Model saved to {output_model_file}")
        elif fsdp_plugin.state_dict_type == StateDictType.LOCAL_STATE_DICT:
            weights_name = (
-                f"{FSDP_MODEL_NAME}_rank{accelerator.process_index}.bin"
+                f"{FSDP_MODEL_NAME}_rank{state.process_index}.bin"
                if model_index == 0
-                else f"{FSDP_MODEL_NAME}_{model_index}_rank{accelerator.process_index}.bin"
+                else f"{FSDP_MODEL_NAME}_{model_index}_rank{state.process_index}.bin"
            )
            output_model_file = os.path.join(output_dir, weights_name)
            logger.info(f"Saving model to {output_model_file}")
@ -96,19 +98,20 @@ def save_fsdp_model(fsdp_plugin, accelerator, model, output_dir, model_index=0,
            logger.info(f"Model saved to {ckpt_dir}")


-def load_fsdp_model(fsdp_plugin, accelerator, model, input_dir, model_index=0, adapter_only=False):
-    accelerator.wait_for_everyone()
+def load_fsdp_model(fsdp_plugin, model, input_dir, model_index=0, adapter_only=False):
+    state = PartialState()
+    state.wait_for_everyone()
    if fsdp_plugin.state_dict_type == StateDictType.FULL_STATE_DICT:
        # FSDP raises error when single GPU is used with `offload_to_cpu=True` for FULL_STATE_DICT
        # so, only enable it when num_processes>1
-        is_multi_process = accelerator.num_processes > 1
+        is_multi_process = state.num_processes > 1
        fsdp_plugin.state_dict_config.offload_to_cpu = is_multi_process
        fsdp_plugin.state_dict_config.rank0_only = is_multi_process
    with FSDP.state_dict_type(
        model, fsdp_plugin.state_dict_type, fsdp_plugin.state_dict_config, fsdp_plugin.optim_state_dict_config
    ):
        if fsdp_plugin.state_dict_type == StateDictType.FULL_STATE_DICT:
-            if type(model) != FSDP and accelerator.process_index != 0:
+            if type(model) != FSDP and state.process_index != 0:
                if not fsdp_plugin.sync_module_states:
                    raise ValueError(
                        "Set the `sync_module_states` flag to `True` so that model states are synced across processes when "
@ -122,9 +125,9 @@ def load_fsdp_model(fsdp_plugin, accelerator, model, input_dir, model_index=0, a
            logger.info(f"Model loaded from {input_model_file}")
        elif fsdp_plugin.state_dict_type == StateDictType.LOCAL_STATE_DICT:
            weights_name = (
-                f"{FSDP_MODEL_NAME}_rank{accelerator.process_index}.bin"
+                f"{FSDP_MODEL_NAME}_rank{state.process_index}.bin"
                if model_index == 0
-                else f"{FSDP_MODEL_NAME}_{model_index}_rank{accelerator.process_index}.bin"
+                else f"{FSDP_MODEL_NAME}_{model_index}_rank{state.process_index}.bin"
            )
            input_model_file = os.path.join(input_dir, weights_name)
            logger.info(f"Loading model from {input_model_file}")
@ -149,14 +152,15 @@ def load_fsdp_model(fsdp_plugin, accelerator, model, input_dir, model_index=0, a
    return load_result


-def save_fsdp_optimizer(fsdp_plugin, accelerator, optimizer, model, output_dir, optimizer_index=0):
+def save_fsdp_optimizer(fsdp_plugin, optimizer, model, output_dir, optimizer_index=0):
+    state = PartialState()
    os.makedirs(output_dir, exist_ok=True)
    with FSDP.state_dict_type(
        model, fsdp_plugin.state_dict_type, fsdp_plugin.state_dict_config, fsdp_plugin.optim_state_dict_config
    ):
        optim_state = FSDP.optim_state_dict(model, optimizer)
        if fsdp_plugin.state_dict_type == StateDictType.FULL_STATE_DICT:
-            if accelerator.process_index == 0:
+            if state.process_index == 0:
                optim_state_name = (
                    f"{OPTIMIZER_NAME}.bin" if optimizer_index == 0 else f"{OPTIMIZER_NAME}_{optimizer_index}.bin"
                )
@ -176,14 +180,15 @@ def save_fsdp_optimizer(fsdp_plugin, accelerator, optimizer, model, output_dir,
            logger.info(f"Optimizer state saved in {ckpt_dir}")


-def load_fsdp_optimizer(fsdp_plugin, accelerator, optimizer, model, input_dir, optimizer_index=0, adapter_only=False):
-    accelerator.wait_for_everyone()
+def load_fsdp_optimizer(fsdp_plugin, optimizer, model, input_dir, optimizer_index=0, adapter_only=False):
+    state = PartialState()
+    state.wait_for_everyone()
    with FSDP.state_dict_type(
        model, fsdp_plugin.state_dict_type, fsdp_plugin.state_dict_config, fsdp_plugin.optim_state_dict_config
    ):
        if fsdp_plugin.state_dict_type == StateDictType.FULL_STATE_DICT:
            optim_state = None
-            if accelerator.process_index == 0 or not fsdp_plugin.optim_state_dict_config.rank0_only:
+            if state.process_index == 0 or not fsdp_plugin.optim_state_dict_config.rank0_only:
                optimizer_name = (
                    f"{OPTIMIZER_NAME}.bin" if optimizer_index == 0 else f"{OPTIMIZER_NAME}_{optimizer_index}.bin"
                )
--- a/src/accelerate/utils/imports.py
+++ b/src/accelerate/utils/imports.py
@ -85,14 +85,26 @@ def is_pynvml_available():
    return _is_package_available("pynvml")


+def is_pytest_available():
+    """Reports the result of `_is_package_available` for the `pytest` package."""
+    return _is_package_available("pytest")
+
+
 def is_msamp_available():
    return _is_package_available("msamp", "ms-amp")


+def is_schedulefree_available():
+    """Reports the result of `_is_package_available` for the `schedulefree` package."""
+    return _is_package_available("schedulefree")
+
+
 def is_transformer_engine_available():
    return _is_package_available("transformer_engine")


+def is_lomo_available():
+    """Reports the result of `_is_package_available` for the `lomo_optim` package."""
+    return _is_package_available("lomo_optim")
+
+
 def is_fp8_available():
    return is_msamp_available() or is_transformer_engine_available()

@ -175,6 +187,8 @@ def is_bf16_available(ignore_tpu=False):
        return not ignore_tpu
    if is_cuda_available():
        return torch.cuda.is_bf16_supported()
+    if is_mps_available():
+        return False
    return True


@ -198,6 +212,10 @@ def is_bnb_available():
    return _is_package_available("bitsandbytes")


+def is_torchvision_available():
+    """Reports the result of `_is_package_available` for the `torchvision` package."""
+    return _is_package_available("torchvision")
+
+
 def is_megatron_lm_available():
    if str_to_bool(os.environ.get("ACCELERATE_USE_MEGATRON_LM", "False")) == 1:
        package_exists = importlib.util.find_spec("megatron") is not None
--- a/src/accelerate/utils/launch.py
+++ b/src/accelerate/utils/launch.py
@ -393,6 +393,8 @@ def prepare_deepspeed_cmd_env(args: argparse.Namespace) -> Tuple[List[str], Dict
        current_env["ACCELERATE_DEEPSPEED_CONFIG_FILE"] = str(args.deepspeed_config_file)
    if args.enable_cpu_affinity:
        current_env["ACCELERATE_CPU_AFFINITY"] = "1"
+    if args.deepspeed_moe_layer_cls_names is not None:
+        current_env["ACCELERATE_DEEPSPEED_MOE_LAYER_CLS_NAMES"] = str(args.deepspeed_moe_layer_cls_names)
    return cmd, current_env


--- a/src/accelerate/utils/modeling.py
+++ b/src/accelerate/utils/modeling.py
@ -381,12 +381,13 @@ def set_module_tensor_to_device(
            device_quantization = device
            device = "cpu"
        # `torch.Tensor.to(<int num>)` is not supported by `torch_npu` (see this [issue](https://github.com/Ascend/pytorch/issues/16)).
-        if is_npu_available() and isinstance(device, int):
-            device = f"npu:{device}"
-        elif is_mlu_available() and isinstance(device, int):
-            device = f"mlu:{device}"
-        if is_xpu_available() and isinstance(device, int):
-            device = f"xpu:{device}"
+        if isinstance(device, int):
+            if is_npu_available():
+                device = f"npu:{device}"
+            elif is_mlu_available():
+                device = f"mlu:{device}"
+            elif is_xpu_available():
+                device = f"xpu:{device}"
        if value is None:
            new_value = old_value.to(device)
            if dtype is not None and device in ["meta", torch.device("meta")]:
@ -447,14 +448,15 @@ def set_module_tensor_to_device(
                if not getattr(module.weight, "quant_state", None) and device_index is not None:
                    module.weight = module.weight.cuda(device_index)
    # clean pre and post foward hook
-    if is_npu_available():
-        torch.npu.empty_cache()
-    elif is_mlu_available():
-        torch.mlu.empty_cache()
-    elif is_xpu_available():
-        torch.xpu.empty_cache()
-    else:
-        torch.cuda.empty_cache()
+    if device != "cpu":
+        if is_npu_available():
+            torch.npu.empty_cache()
+        elif is_mlu_available():
+            torch.mlu.empty_cache()
+        elif is_xpu_available():
+            torch.xpu.empty_cache()
+        else:
+            torch.cuda.empty_cache()

    # When handling tied weights, we update tied_params_map to keep track of the tied weights that have already been allocated on the device in
    # order to avoid duplicating memory, see above.
@ -801,27 +803,40 @@ def get_max_memory(max_memory: Optional[Dict[Union[int, str], Union[int, str]]]
    import psutil

    if max_memory is None:
-        if not (torch.cuda.is_available() or is_npu_available() or is_mlu_available() or is_xpu_available()):
-            max_memory = {}
-
-        else:
-            # Make sure CUDA is initialized on each GPU to have the right memory info.
-            if is_npu_available():
-                for i in range(torch.npu.device_count()):
+        max_memory = {}
+        # Make sure CUDA is initialized on each GPU to have the right memory info.
+        if is_npu_available():
+            for i in range(torch.npu.device_count()):
+                try:
                    _ = torch.tensor(0, device=torch.device("npu", i))
-                max_memory = {i: torch.npu.mem_get_info(i)[0] for i in range(torch.npu.device_count())}
-            elif is_mlu_available():
-                for i in range(torch.mlu.device_count()):
+                    max_memory[i] = torch.npu.mem_get_info(i)[0]
+                except Exception:
+                    logger.info(f"Device {i} seems unavailable, Proceeding to check subsequent devices.")
+                    continue
+        elif is_mlu_available():
+            for i in range(torch.mlu.device_count()):
+                try:
                    _ = torch.tensor(0, device=torch.device("mlu", i))
-                max_memory = {i: torch.mlu.mem_get_info(i)[0] for i in range(torch.mlu.device_count())}
-            elif is_xpu_available():
-                for i in range(torch.xpu.device_count()):
+                    max_memory[i] = torch.mlu.mem_get_info(i)[0]
+                except Exception:
+                    logger.info(f"Device {i} seems unavailable, Proceeding to check subsequent devices.")
+                    continue
+        elif is_xpu_available():
+            for i in range(torch.xpu.device_count()):
+                try:
                    _ = torch.tensor(0, device=torch.device("xpu", i))
-                max_memory = {i: torch.xpu.max_memory_allocated(i) for i in range(torch.xpu.device_count())}
-            else:
-                for i in range(torch.cuda.device_count()):
+                    max_memory[i] = torch.xpu.max_memory_allocated(i)
+                except Exception:
+                    logger.info(f"Device {i} seems unavailable, Proceeding to check subsequent devices.")
+                    continue
+        else:
+            for i in range(torch.cuda.device_count()):
+                try:
                    _ = torch.tensor([0], device=i)
-                max_memory = {i: torch.cuda.mem_get_info(i)[0] for i in range(torch.cuda.device_count())}
+                    max_memory[i] = torch.cuda.mem_get_info(i)[0]
+                except Exception:
+                    logger.info(f"Device {i} seems unavailable, Proceeding to check subsequent devices.")
+                    continue
        # allocate everything in the mps device as the RAM is shared
        if is_mps_available():
            max_memory["mps"] = psutil.virtual_memory().available
@ -914,6 +929,17 @@ def load_offloaded_weights(model, index, offload_folder):
        set_module_tensor_to_device(model, param_name, "cpu", value=weight, fp16_statistics=fp16_statistics)


+def get_module_leaves(module_sizes):
+    """
+    Lists the names in `module_sizes` that are not the immediate parent of any other name.
+
+    Args:
+        module_sizes: Iterable of dotted names (only the keys are inspected when a dict is passed).
+
+    Returns:
+        `list`: The non-empty names, in iteration order, that no other name has as its immediate parent.
+    """
+    # The immediate parent of a dotted name is everything before its last dot.
+    parents = {name.rsplit(".", 1)[0] for name in module_sizes if "." in name}
+    return [name for name in module_sizes if name != "" and name not in parents]
+
+
 def get_balanced_memory(
    model: nn.Module,
    max_memory: Optional[Dict[Union[int, str], Union[int, str]]] = None,
@ -1023,10 +1049,10 @@ def get_balanced_memory(
        buffer = 0

    # Compute mean of final modules. In the first dict of module sizes, leaves are the parameters
-    leaves = [n for n in module_sizes if len([p for p in module_sizes if n == "" or p.startswith(n + ".")]) == 0]
+    leaves = get_module_leaves(module_sizes)
    module_sizes = {n: v for n, v in module_sizes.items() if n not in leaves}
    # Once removed, leaves are the final modules.
-    leaves = [n for n in module_sizes if len([p for p in module_sizes if n == "" or p.startswith(n + ".")]) == 0]
+    leaves = get_module_leaves(module_sizes)
    mean_leaves = int(sum([module_sizes[n] for n in leaves]) / max(len(leaves), 1))
    buffer = int(1.25 * max(buffer, mean_leaves))
    per_gpu += buffer
@ -1783,7 +1809,7 @@ def get_mixed_precision_context_manager(native_amp: bool = False, autocast_kwarg
        )
        if state.mixed_precision == "fp16":
            return torch.autocast(device_type=device_type, dtype=torch.float16, **autocast_kwargs)
-        elif state.mixed_precision == "bf16" and state.distributed_type in [
+        elif state.mixed_precision in ["bf16", "fp8"] and state.distributed_type in [
            DistributedType.NO,
            DistributedType.MULTI_CPU,
            DistributedType.MULTI_GPU,
--- a/src/accelerate/utils/operations.py
+++ b/src/accelerate/utils/operations.py
@ -164,10 +164,7 @@ def send_to_device(tensor, device, non_blocking=False, skip_keys=None):
            if is_npu_available():
                if isinstance(device, int):
                    device = f"npu:{device}"
-            else:
-                raise error
-        except Exception as error:
-            if is_xpu_available():
+            elif is_xpu_available():
                if isinstance(device, int):
                    device = f"xpu:{device}"
            else:
--- a/src/accelerate/utils/random.py
+++ b/src/accelerate/utils/random.py
@ -109,6 +109,8 @@ def synchronize_rng_state(rng_type: Optional[RNGType] = None, generator: Optiona
        torch.cuda.set_rng_state(rng_state)
    elif rng_type == RNGType.NPU:
        torch.npu.set_rng_state(rng_state)
+    elif rng_type == RNGType.MLU:
+        torch.mlu.set_rng_state(rng_state)
    elif rng_type == RNGType.XPU:
        torch.xpu.set_rng_state(rng_state)
    elif rng_type == RNGType.XLA:
--- a/src/accelerate/utils/tqdm.py
+++ b/src/accelerate/utils/tqdm.py
@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+import warnings
+
 from .imports import is_tqdm_available


@ -21,7 +23,7 @@ if is_tqdm_available():
 from ..state import PartialState


-def tqdm(main_process_only: bool = True, *args, **kwargs):
+def tqdm(*args, main_process_only: bool = True, **kwargs):
    """
    Wrapper around `tqdm.tqdm` that optionally displays only on the main process.

@ -31,7 +33,15 @@ def tqdm(main_process_only: bool = True, *args, **kwargs):
    """
    if not is_tqdm_available():
        raise ImportError("Accelerate's `tqdm` module requires `tqdm` to be installed. Please run `pip install tqdm`.")
-    disable = False
-    if main_process_only:
+    if len(args) > 0 and isinstance(args[0], bool):
+        warnings.warn(
+            f"Passing `{args[0]}` as the first argument to Accelerate's `tqdm` wrapper is deprecated "
+            "and will be removed in v0.33.0. Please use the `main_process_only` keyword argument instead.",
+            FutureWarning,
+        )
+        main_process_only = args[0]
+        args = args[1:]
+    disable = kwargs.pop("disable", False)
+    if main_process_only and not disable:
        disable = PartialState().local_process_index != 0
    return _tqdm(*args, **kwargs, disable=disable)
--- a/tests/deepspeed/test_deepspeed.py
+++ b/tests/deepspeed/test_deepspeed.py
@ -51,12 +51,14 @@ from accelerate.utils.deepspeed import (
    DummyScheduler,
 )
 from accelerate.utils.other import patch_environment
+from accelerate.utils.versions import compare_versions


 set_seed(42)

 GPT2_TINY = "sshleifer/tiny-gpt2"
 MOBILEVIT = "apple/mobilevit-xx-small"
+QWEN_MOE = "peft-internal-testing/tiny-random-qwen-1.5-MoE"

 ZERO2 = "zero2"
 ZERO3 = "zero3"
@ -811,6 +813,30 @@ class DeepSpeedConfigIntegration(AccelerateTestCase):
        )
        assert deepspeed_plugin.zero_stage == int(stage.replace("zero", ""))

+    def test_prepare_deepspeed_prepare_moe(self):
+        """
+        Checks that preparing the tiny Qwen MoE model with `transformer_moe_cls_names` set leaves every
+        `Qwen2MoeSparseMoeBlock` with a truthy `_z3_leaf` attribute.
+        """
+        # The test needs *both* minimum versions (transformers 4.40 and deepspeed 0.14), so bail out as soon as
+        # either one is too old. With `and`, a single outdated package would still run the test and fail it.
+        if compare_versions("transformers", "<", "4.40") or compare_versions("deepspeed", "<", "0.14"):
+            return
+        deepspeed_plugin = DeepSpeedPlugin(
+            zero3_init_flag=True,
+            gradient_accumulation_steps=1,
+            gradient_clipping=1.0,
+            zero_stage=3,
+            offload_optimizer_device="none",
+            offload_param_device="none",
+            zero3_save_16bit_model=True,
+            transformer_moe_cls_names="Qwen2MoeSparseMoeBlock",
+        )
+        with mockenv_context(**self.dist_env):
+            accelerator = Accelerator(mixed_precision="fp16", deepspeed_plugin=deepspeed_plugin)
+            accelerator.state.deepspeed_plugin.deepspeed_config["train_micro_batch_size_per_gpu"] = 1
+            model = AutoModelForCausalLM.from_pretrained(QWEN_MOE)
+            model = accelerator.prepare(model)
+            # Imported here, after the version gate, rather than at module import time.
+            from transformers.models.qwen2_moe.modeling_qwen2_moe import Qwen2MoeSparseMoeBlock
+
+            for module in model.modules():
+                if isinstance(module, Qwen2MoeSparseMoeBlock):
+                    assert hasattr(module, "_z3_leaf") and module._z3_leaf
+
    def test_basic_run(self):
        test_file_path = path_in_accelerate_package("test_utils", "scripts", "external_deps", "test_performance.py")
        with tempfile.TemporaryDirectory() as dirpath:
--- a/tests/test_accelerator.py
+++ b/tests/test_accelerator.py
@ -17,6 +17,7 @@ import pickle
 import tempfile
 from unittest.mock import patch

+import psutil
 import pytest
 import torch
 from parameterized import parameterized
@ -196,14 +197,25 @@ class AcceleratorTester(AccelerateTestCase):

    def test_free_memory_dereferences_prepared_components(self):
        accelerator = Accelerator()
-        model, optimizer, scheduler, train_dl, valid_dl = create_components()
-        accelerator.prepare(model, optimizer, scheduler, train_dl, valid_dl)
+        # Free up refs with empty_cache() and gc.collect()
        accelerator.free_memory()
+        model, optimizer, scheduler, train_dl, valid_dl = create_components()
+        free_cpu_ram_before = psutil.virtual_memory().available // 1024 // 1024
+        model, optimizer, scheduler, train_dl, valid_dl = accelerator.prepare(
+            model, optimizer, scheduler, train_dl, valid_dl
+        )
+        model, optimizer, scheduler, train_dl, valid_dl = accelerator.free_memory(
+            model, optimizer, scheduler, train_dl, valid_dl
+        )
+
+        free_cpu_ram_after = psutil.virtual_memory().available // 1024 // 1024

        assert len(accelerator._models) == 0
        assert len(accelerator._optimizers) == 0
        assert len(accelerator._schedulers) == 0
        assert len(accelerator._dataloaders) == 0
+        # The less-than comes *specifically* from CUDA CPU things/won't be present on CPU builds
+        assert free_cpu_ram_after <= free_cpu_ram_before

    @require_non_torch_xla
    def test_env_var_device(self):
--- a/tests/test_big_modeling.py
+++ b/tests/test_big_modeling.py
@ -35,14 +35,19 @@ from accelerate.hooks import remove_hook_from_submodules
 from accelerate.test_utils import (
    require_bnb,
    require_cuda,
-    require_mps,
+    require_multi_device,
    require_multi_gpu,
+    require_non_cpu,
    require_non_torch_xla,
    slow,
+    torch_device,
 )
 from accelerate.utils import is_torch_version, offload_state_dict


+torch_device = f"{torch_device}:0" if torch_device != "cpu" else "cpu"
+
+
 class ModelForTest(nn.Module):
    def __init__(self):
        super().__init__()
@ -175,17 +180,9 @@ class BigModelingTester(unittest.TestCase):
        with init_empty_weights():
            _ = nn.Sequential(*[nn.Linear(10000, 10000) for _ in range(1000)])

-    @require_cuda
-    def test_init_on_device_cuda(self):
-        device = torch.device("cuda:0")
-        with init_on_device(device):
-            model = nn.Linear(10, 10)
-        assert model.weight.device == device
-        assert model.weight.device == device
-
-    @require_mps
-    def test_init_on_device_mps(self):
-        device = torch.device("mps:0")
+    @require_non_cpu
+    def test_init_on_device(self):
+        device = torch.device(torch_device)
        with init_on_device(device):
            model = nn.Linear(10, 10)
        assert model.weight.device == device
@ -196,7 +193,7 @@ class BigModelingTester(unittest.TestCase):
        x = torch.randn(2, 3)
        expected = model(x)

-        device = torch.device(0 if torch.cuda.is_available() else "cpu")
+        device = torch.device(torch_device)

        cpu_offload(model, execution_device=device)
        output = model(x)
@ -214,7 +211,7 @@ class BigModelingTester(unittest.TestCase):
        x = torch.randn(2, 3)
        expected = model(x)

-        device = torch.device(0 if torch.cuda.is_available() else "cpu")
+        device = torch.device(torch_device)

        cpu_offload(model, execution_device=device, preload_module_classes=["ModuleWithUnusedSubModules"])
        output = model(x)
@ -233,10 +230,10 @@ class BigModelingTester(unittest.TestCase):
        assert torch.allclose(expected, output.cpu(), 1e-4, 1e-5), f"Expected: {expected}, Actual: {output.cpu()}"

    @slow
-    @require_cuda
+    @require_non_cpu
    def test_cpu_offload_gpt2(self):
        tokenizer = AutoTokenizer.from_pretrained("gpt2")
-        inputs = tokenizer("Hello world! My name is", return_tensors="pt").to(0)
+        inputs = tokenizer("Hello world! My name is", return_tensors="pt").to(torch_device)

        gpt2 = AutoModelForCausalLM.from_pretrained("gpt2")
        cpu_offload(gpt2, execution_device=0)
@ -251,7 +248,7 @@ class BigModelingTester(unittest.TestCase):
        x = torch.randn(2, 3)
        expected = model(x)

-        device = torch.device(0 if torch.cuda.is_available() else "cpu")
+        device = torch.device(torch_device)

        with TemporaryDirectory() as tmp_dir:
            disk_offload(model, tmp_dir, execution_device=device)
@ -271,7 +268,7 @@ class BigModelingTester(unittest.TestCase):
        x = torch.randn(2, 3)
        expected = model(x)

-        device = torch.device(0 if torch.cuda.is_available() else "cpu")
+        device = torch.device(torch_device)

        with TemporaryDirectory() as tmp_dir:
            disk_offload(
@ -295,10 +292,10 @@ class BigModelingTester(unittest.TestCase):
            assert torch.allclose(expected, output.cpu(), 1e-4, 1e-5), f"Expected: {expected}, Actual: {output.cpu()}"

    @slow
-    @require_cuda
+    @require_non_cpu
    def test_disk_offload_gpt2(self):
        tokenizer = AutoTokenizer.from_pretrained("gpt2")
-        inputs = tokenizer("Hello world! My name is", return_tensors="pt").to(0)
+        inputs = tokenizer("Hello world! My name is", return_tensors="pt").to(torch_device)

        gpt2 = AutoModelForCausalLM.from_pretrained("gpt2")
        with TemporaryDirectory() as tmp_dir:
@ -309,7 +306,7 @@ class BigModelingTester(unittest.TestCase):
                == "Hello world! My name is Kiyoshi, and I'm a student at the University of Tokyo"
            )

-    @require_cuda
+    @require_non_cpu
    def test_dispatch_model(self):
        model = ModelForTest()
        device_map = {"linear1": "disk", "batchnorm": "cpu", "linear2": 0}
@ -322,7 +319,7 @@ class BigModelingTester(unittest.TestCase):
            output = model(x)
            assert torch.allclose(expected, output.cpu(), atol=1e-5)

-    @require_cuda
+    @require_non_cpu
    def test_dispatch_model_with_non_persistent_buffers(self):
        model = ModelForTestNonPersistentBuffers()
        device_map = {"linear1": 0, "batchnorm": "cpu", "linear2": "disk"}
@ -334,20 +331,7 @@ class BigModelingTester(unittest.TestCase):
            output = model(x)
            assert torch.allclose(expected, output.cpu(), atol=1e-5)

-    @require_mps
-    def test_dispatch_model_mps(self):
-        model = ModelForTest()
-        device_map = {"linear1": "mps", "batchnorm": "disk", "linear2": "disk"}
-
-        x = torch.randn(2, 3)
-        expected = model(x)
-
-        with TemporaryDirectory() as tmp_dir:
-            dispatch_model(model, device_map, offload_dir=tmp_dir)
-            output = model(x)
-            assert torch.allclose(expected, output.cpu(), atol=1e-5)
-
-    @require_cuda
+    @require_non_cpu
    def test_dispatch_model_tied_weights(self):
        model = ModelForTestTiedWeights()
        model.linear1.weight = model.linear2.weight
@ -597,8 +581,8 @@ class BigModelingTester(unittest.TestCase):

            assert (free_memory_bytes_after_infer - free_memory_bytes_after_dispatch) * 1e-6 < 130

-    @require_multi_gpu
-    def test_dispatch_model_multi_gpu(self):
+    @require_multi_device
+    def test_dispatch_model_multi_devices(self):
        model = BiggerModelForTest()
        device_map = {"linear1": "cpu", "linear2": "disk", "batchnorm": "cpu", "linear3": 0, "linear4": 1}

@ -610,7 +594,7 @@ class BigModelingTester(unittest.TestCase):
            output = model(x)
            assert torch.allclose(expected, output.cpu(), atol=1e-5)

-    @require_cuda
+    @require_non_cpu
    def test_dispatch_model_copy(self):
        original_model = ModelForTestCopy(id=1)
        device_map = {"linear1": 0, "batchnorm": "cpu", "linear2": 0}
@ -629,7 +613,7 @@ class BigModelingTester(unittest.TestCase):
        assert copied_model.linear1.forward is not original_model.linear1.forward
        assert torch.allclose(expected, output.cpu(), atol=1e-5)

-    @require_cuda
+    @require_non_cpu
    def test_dispatch_model_move_offloaded_model(self):
        model = ModelForTest()
        device_map = {"linear1": "disk", "batchnorm": "cpu", "linear2": 0}
@ -653,10 +637,10 @@ class BigModelingTester(unittest.TestCase):
                model(x)

    @slow
-    @require_multi_gpu
-    def test_dispatch_model_gpt2_on_two_gpus(self):
+    @require_multi_device
+    def test_dispatch_model_gpt2_on_two_devices(self):
        tokenizer = AutoTokenizer.from_pretrained("gpt2")
-        inputs = tokenizer("Hello world! My name is", return_tensors="pt").to(0)
+        inputs = tokenizer("Hello world! My name is", return_tensors="pt").to(torch_device)

        gpt2 = AutoModelForCausalLM.from_pretrained("gpt2")
        # Dispatch on GPUs 0 and 1
@ -703,7 +687,7 @@ class BigModelingTester(unittest.TestCase):
                == "Hello world! My name is Kiyoshi, and I'm a student at the University of Tokyo"
            )

-    @require_cuda
+    @require_non_cpu
    def test_dispatch_model_with_unused_submodules(self):
        model = ModelWithUnusedSubModulesForTest()
        device_map = {"linear1": "cpu", "linear2": "disk", "batchnorm": "cpu", "linear3": 0, "linear4": 0}
@ -718,23 +702,8 @@ class BigModelingTester(unittest.TestCase):
            output = model(x)
            assert torch.allclose(expected, output.cpu(), atol=1e-5)

-    @require_mps
-    def test_dispatch_model_with_unused_submodules_mps(self):
-        model = ModelWithUnusedSubModulesForTest()
-        device_map = {"linear1": "mps", "linear2": "mps", "batchnorm": "mps", "linear3": "mps", "linear4": "disk"}
-
-        x = torch.randn(2, 3)
-        expected = model(x)
-
-        with TemporaryDirectory() as tmp_dir:
-            dispatch_model(
-                model, device_map, offload_dir=tmp_dir, preload_module_classes=["ModuleWithUnusedSubModules"]
-            )
-            output = model(x)
-            assert torch.allclose(expected, output.cpu(), atol=1e-5)
-
-    @require_multi_gpu
-    def test_dispatch_model_with_unused_submodules_multi_gpu(self):
+    @require_multi_device
+    def test_dispatch_model_with_unused_submodules_multi_device(self):
        model = ModelWithUnusedSubModulesForTest()
        device_map = {"linear1": "cpu", "linear2": "disk", "batchnorm": "cpu", "linear3": 0, "linear4": 1}

@ -748,7 +717,7 @@ class BigModelingTester(unittest.TestCase):
            output = model(x)
            assert torch.allclose(expected, output.cpu(), atol=1e-5)

-    @require_cuda
+    @require_non_cpu
    def test_dispatch_model_force_hooks(self):
        model = ModelForTest()
        device_map = {"": 0}
@ -760,7 +729,7 @@ class BigModelingTester(unittest.TestCase):
        output = model(x)
        assert torch.allclose(expected, output.cpu(), atol=1e-5)

-    @require_cuda
+    @require_non_cpu
    def test_load_checkpoint_and_dispatch(self):
        model = ModelForTest()
        device_map = {"linear1": "cpu", "batchnorm": "cpu", "linear2": 0}
@ -782,32 +751,8 @@ class BigModelingTester(unittest.TestCase):
        output = new_model(x)
        assert torch.allclose(expected, output.cpu(), atol=1e-5)

-    @require_mps
-    def test_load_checkpoint_and_dispatch_mps(self):
-        model = ModelForTest()
-        device_map = {"linear1": "mps", "batchnorm": "mps", "linear2": "disk"}
-
-        x = torch.randn(2, 3)
-        expected = model(x)
-
-        with TemporaryDirectory() as tmp_dir:
-            checkpoint = os.path.join(tmp_dir, "pt_model.bin")
-            torch.save(model.state_dict(), checkpoint)
-
-            new_model = ModelForTest()
-            new_model = load_checkpoint_and_dispatch(
-                new_model, checkpoint, device_map=device_map, offload_folder=tmp_dir
-            )
-
-            # CPU-offloaded weights are on the meta device while waiting for the forward pass.
-            assert new_model.linear1.weight.device == torch.device("mps:0")
-            assert new_model.linear2.weight.device == torch.device("meta")
-
-            output = new_model(x)
-            assert torch.allclose(expected, output.cpu(), atol=1e-5)
-
-    @require_multi_gpu
-    def test_load_checkpoint_and_dispatch_multi_gpu(self):
+    @require_multi_device
+    def test_load_checkpoint_and_dispatch_multi_device(self):
        model = BiggerModelForTest()
        device_map = {"linear1": "cpu", "linear2": "cpu", "batchnorm": 0, "linear3": 0, "linear4": 1}

@ -830,7 +775,7 @@ class BigModelingTester(unittest.TestCase):
        output = new_model(x)
        assert torch.allclose(expected, output.cpu(), atol=1e-5)

-    @require_cuda
+    @require_non_cpu
    def test_load_checkpoint_and_dispatch_with_unused_submodules(self):
        model = ModelWithUnusedSubModulesForTest()
        device_map = {"linear1": "cpu", "linear2": "cpu", "batchnorm": 0, "linear3": 0, "linear4": 0}
@ -856,38 +801,8 @@ class BigModelingTester(unittest.TestCase):
        output = new_model(x)
        assert torch.allclose(expected, output.cpu(), atol=1e-5)

-    @require_mps
-    def test_load_checkpoint_and_dispatch_with_unused_submodules_mps(self):
-        model = ModelWithUnusedSubModulesForTest()
-        device_map = {"linear1": "mps", "linear2": "mps", "batchnorm": "mps", "linear3": "disk", "linear4": "disk"}
-
-        x = torch.randn(2, 3)
-        expected = model(x)
-
-        with TemporaryDirectory() as tmp_dir:
-            checkpoint = os.path.join(tmp_dir, "pt_model.bin")
-            torch.save(model.state_dict(), checkpoint)
-
-            new_model = ModelWithUnusedSubModulesForTest()
-            new_model = load_checkpoint_and_dispatch(
-                new_model,
-                checkpoint,
-                device_map=device_map,
-                preload_module_classes=["ModuleWithUnusedSubModules"],
-                offload_folder=tmp_dir,
-            )
-
-            # CPU-offloaded weights are on the meta device while waiting for the forward pass.
-            assert new_model.linear1.linear.weight.device == torch.device("mps:0")
-            assert new_model.linear2.linear.weight.device == torch.device("mps:0")
-            assert new_model.linear3.linear.weight.device == torch.device("meta")
-            assert new_model.linear4.linear.weight.device == torch.device("meta")
-
-            output = new_model(x)
-            assert torch.allclose(expected, output.cpu(), atol=1e-5)
-
-    @require_multi_gpu
-    def test_load_checkpoint_and_dispatch_multi_gpu_with_unused_submodules(self):
+    @require_multi_device
+    def test_load_checkpoint_and_dispatch_multi_device_with_unused_submodules(self):
        model = ModelWithUnusedSubModulesForTest()
        device_map = {"linear1": "cpu", "linear2": "cpu", "batchnorm": 0, "linear3": 0, "linear4": 1}

@ -912,7 +827,7 @@ class BigModelingTester(unittest.TestCase):
        output = new_model(x)
        assert torch.allclose(expected, output.cpu(), atol=1e-5)

-    @require_cuda
+    @require_non_cpu
    def test_cpu_offload_with_hook(self):
        model1 = torch.nn.Linear(4, 5)
        model1, hook1 = cpu_offload_with_hook(model1)
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@ -20,7 +20,7 @@ from unittest.mock import patch
 import torch
 from huggingface_hub.utils import GatedRepoError, RepositoryNotFoundError

-from accelerate.commands.config.config_args import BaseConfig, ClusterConfig, SageMakerConfig
+from accelerate.commands.config.config_args import BaseConfig, ClusterConfig, SageMakerConfig, load_config_from_file
 from accelerate.commands.estimate import estimate_command, estimate_command_parser, gather_data
 from accelerate.commands.launch import _validate_launch_command, launch_command_parser
 from accelerate.test_utils import execute_subprocess_async
@ -73,8 +73,9 @@ class AccelerateLauncherTester(unittest.TestCase):
        execute_subprocess_async(cmd, env=os.environ.copy())

    def test_config_compatibility(self):
+        invalid_configs = ["invalid", "mpi", "sagemaker"]
        for config in sorted(self.test_config_path.glob("**/*.yaml")):
-            if "invalid" in str(config) or "mpi" in str(config):
+            if any(invalid_config in str(config) for invalid_config in invalid_configs):
                continue
            with self.subTest(config_file=config):
                cmd = get_launch_command(config_file=config) + [self.test_file_path]
@ -196,6 +197,8 @@ class ClusterConfigTester(unittest.TestCase):
    Test case for verifying the config dataclasses work
    """

+    test_config_path = Path("tests/test_configs")
+
    def test_base_config(self):
        # Tests that all the dataclasses can be initialized
        config = BaseConfig(
@ -257,6 +260,8 @@ class ClusterConfigTester(unittest.TestCase):
        assert config.ec2_instance_type == "MY_TYPE"
        assert config.iam_role_name == "MY_ROLE"

+        config = load_config_from_file(str(self.test_config_path / "0_30_0_sagemaker.yaml"))
+

 class TpuConfigTester(unittest.TestCase):
    """
--- a/tests/test_configs/0_30_0_sagemaker.yaml
+++ b/tests/test_configs/0_30_0_sagemaker.yaml
@ -0,0 +1,8 @@
+compute_environment: AMAZON_SAGEMAKER
+debug: false
+distributed_type: 'NO'
+mixed_precision: fp16
+debug: false
+use_cpu: false
+ec2_instance_type: MY_TYPE
+iam_role_name: MY_ROLE
--- a/tests/test_examples.py
+++ b/tests/test_examples.py
@ -30,6 +30,7 @@ from accelerate.test_utils.testing import (
    require_huggingface_suite,
    require_multi_gpu,
    require_pippy,
+    require_schedulefree,
    require_trackers,
    run_command,
    slow,
@ -47,6 +48,7 @@ EXCLUDE_EXAMPLES = [
    "local_sgd.py",
    "multi_process_metrics.py",
    "memory.py",
+    "schedule_free.py",
    "automatic_gradient_accumulation.py",
    "fsdp_with_peak_mem_tracking.py",
    "deepspeed_with_config_support.py",
@ -216,6 +218,11 @@ class FeatureExamplesTests(TempDirTestCase):
        testargs = ["examples/by_feature/multi_process_metrics.py"]
        run_command(self.launch_args + testargs)

+    @require_schedulefree
+    def test_schedulefree(self):
+        testargs = ["examples/by_feature/schedule_free.py"]
+        run_command(self.launch_args + testargs)
+
    @require_trackers
    @mock.patch.dict(os.environ, {"WANDB_MODE": "offline", "DVCLIVE_TEST": "true"})
    def test_tracking(self):
@ -240,20 +247,30 @@ class FeatureExamplesTests(TempDirTestCase):
        testargs = ["examples/by_feature/early_stopping.py"]
        run_command(self.launch_args + testargs)

+    @require_multi_gpu
+    def test_distributed_inference_examples_stable_diffusion(self):
+        testargs = ["examples/inference/distributed/stable_diffusion.py"]
+        run_command(self.launch_args + testargs)
+
+    @require_multi_gpu
+    def test_distributed_inference_examples_phi2(self):
+        testargs = ["examples/inference/distributed/phi2.py"]
+        run_command(self.launch_args + testargs)
+
    @require_pippy
    @require_multi_gpu
    def test_pippy_examples_bert(self):
-        testargs = ["examples/inference/bert.py"]
+        testargs = ["examples/inference/pippy/bert.py"]
        run_command(self.launch_args + testargs)

    @require_pippy
    @require_multi_gpu
    def test_pippy_examples_gpt2(self):
-        testargs = ["examples/inference/gpt2.py"]
+        testargs = ["examples/inference/pippy/gpt2.py"]
        run_command(self.launch_args + testargs)

    @require_pippy
    @require_multi_gpu
    def test_pippy_examples_t5(self):
-        testargs = ["examples/inference/t5.py"]
+        testargs = ["examples/inference/pippy/t5.py"]
        run_command(self.launch_args + testargs)
--- a/tests/test_hooks.py
+++ b/tests/test_hooks.py
@ -28,7 +28,10 @@ from accelerate.hooks import (
    remove_hook_from_module,
    remove_hook_from_submodules,
 )
-from accelerate.test_utils import require_multi_gpu
+from accelerate.test_utils import require_multi_device, torch_device
+
+
+torch_device = f"{torch_device}:0" if torch_device != "cpu" else "cpu"


 class ModelForTest(nn.Module):
@ -150,7 +153,7 @@ class HooksModelTester(unittest.TestCase):
        output1 = test_model(x)
        assert not output1.requires_grad

-    @require_multi_gpu
+    @require_multi_device
    def test_align_devices_as_model_parallelism(self):
        model = ModelForTest()
        # Everything is on CPU
@ -175,7 +178,7 @@ class HooksModelTester(unittest.TestCase):

        # We can add a general hook to put back output on same device as input.
        add_hook_to_module(model, AlignDevicesHook(io_same_device=True))
-        x = torch.randn(2, 3).to(0)
+        x = torch.randn(2, 3).to(torch_device)
        output = model(x)
        assert output.device == torch.device(0)

@ -188,7 +191,7 @@ class HooksModelTester(unittest.TestCase):
        assert model.linear2.weight.device == torch.device("cpu")

        # This will move each submodule on different devices
-        hook_kwargs = {"execution_device": 0 if torch.cuda.is_available() else "cpu", "offload": True}
+        hook_kwargs = {"execution_device": torch_device, "offload": True}

        add_hook_to_module(model.linear1, AlignDevicesHook(**hook_kwargs))
        add_hook_to_module(model.batchnorm, AlignDevicesHook(**hook_kwargs))
@ -216,7 +219,7 @@ class HooksModelTester(unittest.TestCase):

        # Now test with buffers included in the offload
        hook_kwargs = {
-            "execution_device": 0 if torch.cuda.is_available() else "cpu",
+            "execution_device": torch_device,
            "offload": True,
            "offload_buffers": True,
        }
@ -252,7 +255,7 @@ class HooksModelTester(unittest.TestCase):
        assert model.linear2.weight.device == torch.device("cpu")

        # This will move each submodule on different devices
-        execution_device = 0 if torch.cuda.is_available() else "cpu"
+        execution_device = torch_device
        attach_align_device_hook(model, execution_device=execution_device, offload=True)

        # Parameters have been offloaded, so on the meta device
@ -301,7 +304,7 @@ class HooksModelTester(unittest.TestCase):
        assert model.linear2.weight.device == torch.device("cpu")

        # This will move each submodule on different devices
-        execution_device = 0 if torch.cuda.is_available() else "cpu"
+        execution_device = torch_device
        attach_align_device_hook(
            model, execution_device=execution_device, offload=True, weights_map=model.state_dict()
        )
--- a/tests/test_logging.py
+++ b/tests/test_logging.py
@ -0,0 +1,91 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import inspect
+import logging
+import os
+
+import pytest
+
+from accelerate import Accelerator
+from accelerate.logging import get_logger
+
+
+def current_lineno() -> int:
+    # A simple helper that returns the lineno of its call-site.
+    caller_frame = inspect.currentframe().f_back
+    caller_info = inspect.getframeinfo(caller_frame)
+    return caller_info.lineno
+
+
+class CustomLogger(logging.LoggerAdapter):
+    # Mocks a user-defined custom logger wrapper that sets `stacklevel=3`.
+    def log(self, level, msg, *args, **kwargs):
+        # E.g. the user wants to modify `stacklevel`; in that case
+        # `accelerate.logging` should respect the user's value. For the specific
+        # value of `3`, calling `CustomLogger.log()`, etc., should log that callsite,
+        # rather than the callsite of the following `self.logger.log()`.
+        kwargs["stacklevel"] = 3
+        self.logger.log(level, msg, *args, **kwargs)
+
+
+@pytest.fixture(scope="module")
+def accelerator():
+    return Accelerator()
+
+
+@pytest.mark.usefixtures("accelerator")
+def test_log_stack(caplog):
+    logger = get_logger(__name__)
+    logging.basicConfig(
+        format="%(filename)s:%(name)s:%(lineno)s:%(funcName)s - %(message)s",
+        datefmt="%m/%d %H:%M:%S",
+    )
+
+    message = "Test"
+    lineno = current_lineno() + 1  # the next line is the actual callsite
+    logger.warning(message)
+
+    assert len(caplog.records) == 1
+    rec = caplog.records[0]
+    assert rec.levelname == logging.getLevelName(logging.WARNING)
+    assert rec.filename == os.path.basename(__file__)
+    assert rec.name == __name__
+    assert rec.lineno == lineno
+    assert rec.funcName == test_log_stack.__name__
+    assert rec.message == message
+
+
+@pytest.mark.usefixtures("accelerator")
+def test_custom_stacklevel(caplog):
+    wrapped_logger = get_logger(__name__)
+    logging.basicConfig(
+        format="%(filename)s:%(name)s:%(lineno)s:%(funcName)s - %(message)s",
+        datefmt="%m/%d %H:%M:%S",
+    )
+    logger = CustomLogger(wrapped_logger, {})
+
+    message = "Test"
+    lineno = current_lineno() + 1  # the next line is the actual callsite
+    logger.warning(message)
+
+    # `CustomLogger.log` sets a custom `stacklevel=3`, so `logger.warning` should
+    # log its own callsite (rather than that of the `wrapped_logger`).
+    assert len(caplog.records) == 1
+    rec = caplog.records[0]
+    assert rec.levelname == logging.getLevelName(logging.WARNING)
+    assert rec.filename == os.path.basename(__file__)
+    assert rec.name == __name__
+    assert rec.lineno == lineno
+    assert rec.funcName == test_custom_stacklevel.__name__
+    assert rec.message == message
--- a/tests/test_multigpu.py
+++ b/tests/test_multigpu.py
@ -31,6 +31,7 @@ from accelerate.test_utils import (
    require_multi_gpu,
    require_non_torch_xla,
    require_pippy,
+    require_torchvision,
 )
 from accelerate.utils import patch_environment

@ -76,6 +77,7 @@ class MultiDeviceTester(unittest.TestCase):

    @require_multi_gpu
    @require_pippy
+    @require_torchvision
    @require_huggingface_suite
    def test_pippy(self):
        """
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@ -29,9 +29,11 @@ from accelerate.state import PartialState
 from accelerate.test_utils.testing import (
    require_cuda,
    require_huggingface_suite,
+    require_non_cpu,
    require_non_torch_xla,
    require_torch_min_version,
    require_tpu,
+    torch_device,
 )
 from accelerate.test_utils.training import RegressionModel
 from accelerate.utils import (
@ -51,6 +53,7 @@ from accelerate.utils import (
    recursively_apply,
    save,
    send_to_device,
+    tqdm,
 )
 from accelerate.utils.operations import is_namedtuple

@ -70,7 +73,7 @@ class UtilsTester(unittest.TestCase):

    def test_send_to_device(self):
        tensor = torch.randn(5, 2)
-        device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
+        device = torch.device(f"{torch_device}:0")

        result1 = send_to_device(tensor, device)
        assert torch.equal(result1.cpu(), tensor)
@ -178,11 +181,11 @@ class UtilsTester(unittest.TestCase):
        model = extract_model_from_parallel(model, keep_fp32_wrapper=False)
        _ = pickle.dumps(model)

-    @require_cuda
+    @require_non_cpu
    def test_can_undo_fp16_conversion(self):
        model = RegressionModel()
        model._original_forward = model.forward
-        model.forward = torch.cuda.amp.autocast(dtype=torch.float16)(model.forward)
+        model.forward = torch.autocast(device_type=torch_device, dtype=torch.float16)(model.forward)
        model.forward = convert_outputs_to_fp32(model.forward)
        model = extract_model_from_parallel(model, keep_fp32_wrapper=False)
        _ = pickle.dumps(model)
@ -401,3 +404,9 @@ class UtilsTester(unittest.TestCase):
        with self.assertLogs("accelerate.utils.environment", level="WARNING"):
            valid_env_items = convert_dict_to_env_variables(env)
        assert valid_env_items == ["ACCELERATE_DEBUG_MODE=1\n", "OTHER_ENV=2\n"]
+
+    def test_tqdm_deprecation(self):
+        with pytest.warns(FutureWarning) as cm:
+            tqdm(True, range(3), disable=True)
+        assert "Passing `True` as the first argument to" in cm.pop().message.args[0]
+        tqdm(range(3), main_process_only=True, disable=True)
Author	SHA1	Message	Date
[[ -z $EMAIL ]] && read -e -p "Enter your email (for git configuration): " EMAIL	72e214f561	Update accelerator	2024-05-14 12:00:35 -04:00
[[ -z $EMAIL ]] && read -e -p "Enter your email (for git configuration): " EMAIL	ab14a5e6a1	Use partial state for fsdp saving/loading	2024-05-14 11:59:07 -04:00
Marc Sun	27a607ea90	Fix small edge case in get_module_leaves (#2774 ) * fix edge case * fix	2024-05-14 11:52:51 +02:00
Tom Mery	aa21174de9	fix minor typo (#2767 )	2024-05-13 08:24:01 -04:00
Xiaoyu Zhang	6cf1cc0a39	optimize get_module_leaves speed (#2756 ) * optimize get_module_leaves * fix format * Update modeling.py	2024-05-13 08:23:38 -04:00
Zach Mueller	bb465a9cf0	Sets default to PyTorch defaults based on backend (#2758 ) * Amd * Add timeout defaults to match pytorch * forward contrib credits from discussions * oop --------- Co-authored-by: Julian Buchel <jubueche@users.noreply.github.com>	2024-05-13 05:41:15 -04:00
Zach Mueller	67308ca6ef	Enable sharded cpu resume (#2762 )	2024-05-10 11:39:37 -04:00
Zach Mueller	63772f6ac2	Revert "Simplify CLI args validation and ensure CLI args take precedence over config file." (#2763 ) This reverts commit 724824abbe0aed8606661bbce5e057c0d2447794.	2024-05-10 11:22:56 -04:00
jiqing-feng	8798cf06ab	fix cpu omp num threads set (#2755 ) * fix cpu omp num threads set * fix OMP_NUM_THREADS * consider no-cpu usage * fix style	2024-05-10 11:16:06 -04:00
Zach Mueller	47bb2dd53e	Fix sagemaker config (#2753 ) * Fix sagemaker * Default to False * Include fixes * Nit * Ignore launching	2024-05-10 09:09:36 -04:00
Iain Stenson	724824abbe	Simplify CLI args validation and ensure CLI args take precedence over config file. (#2757 ) * Remove unnecessary args.debug statement * Add expected test failure for config sub-sections * Remove redundancy in config file args parsing * Make config file --cpu logic more explicit	2024-05-09 09:30:13 -04:00
YH	afc2c99e6a	Fix duplicate environment variable check in multi-cpu condition (#2752 ) * Del duplicted key * Apply format	2024-05-07 14:27:29 -04:00
Marc Sun	0fb95a2d3b	Fix max_memory assignment (#2751 )	2024-05-07 11:53:25 +02:00
Younes Belkada	7ac153f404	LOMO / FIX: Support multiple optimizers (#2745 )	2024-05-06 08:28:14 -04:00
Luo Wenyang	0f1b91bb74	Fix stacklevel in `logging` to log the actual user call site (instead of the call site inside the logger wrapper) of log functions (#2730 ) * fix stacklevel in logging to log info about the actual user callsite * Add two tests for stacklevel in logging --------- Co-authored-by: luowyang <luowyang@github.com>	2024-05-06 08:21:19 -04:00
Huazhong Ji	d1eb44c856	Fixed the problem of incorrect conditional judgment statement when configuring enable_cpu_affinity (#2748 )	2024-05-06 08:20:22 -04:00
若只如初见	11a363287a	Update modeling.py by adding try-catch section to skip the unavailable devices (#2681 ) * Update modeling.py to ignore the unavailable devices * Update src/accelerate/utils/modeling.py Co-authored-by: Marc Sun <57196510+SunMarc@users.noreply.github.com> Update src/accelerate/utils/modeling.py Co-authored-by: Marc Sun <57196510+SunMarc@users.noreply.github.com> Update src/accelerate/utils/modeling.py Co-authored-by: Marc Sun <57196510+SunMarc@users.noreply.github.com> Update src/accelerate/utils/modeling.py Co-authored-by: Marc Sun <57196510+SunMarc@users.noreply.github.com> --------- Co-authored-by: Marc Sun <57196510+SunMarc@users.noreply.github.com>	2024-05-06 12:44:35 +02:00
LFu	5cfe409443	Add feature to allow redirecting std streams into log files when using torchrun as the launcher. (#2740 ) * Add --log-dir/--log_dir to `distributed_args` to allow redirecting std streams into log files when using torchrun as the launcher. Used with --tee this will acheive similar effect as running with `torchrun --tee X --log-dir=logs`. * Deleted the unecessary "--log-dir" argument following suggestion from @muellerzr, since it will be automatically generated from "--log_dir".	2024-05-04 15:03:05 -04:00
Zach Mueller	5b3a7f3892	Update setup.py + test falures found during release	2024-05-03 10:40:25 -04:00
Zach Mueller	060361fca3	Fix tests on main (#2739 ) * Start * Fixings	2024-05-03 10:18:20 -04:00
Younes Belkada	6ac27e2383	FEAT: Add LOMO optimizer (#2695 ) * add v1 lomo * final fixes * fix * Update src/accelerate/accelerator.py Co-authored-by: Zach Mueller <muellerzr@gmail.com> * add comment * more comments * fix --------- Co-authored-by: Zach Mueller <muellerzr@gmail.com>	2024-05-03 10:55:44 +02:00
YH	ba5f49219f	Fix offload device type (#2717 )	2024-05-02 17:07:24 +05:30
Yu Chin Fabian Lim	2c767338f2	Fix Documentation in FSDP and DeepSpeed Concept Guide (#2725 ) * address part of stats comments * automatically set sync_module_states if low_cpu_mem is set * Apply suggestions from @stas00 Co-authored-by: Stas Bekman <stas00@users.noreply.github.com> * add links from fsdp and deepspeed docs. fix deepspeed imports * replace raise in accelerate.launch --------- Co-authored-by: Stas Bekman <stas00@users.noreply.github.com>	2024-05-01 09:25:18 -04:00
Marc Sun	234a85506d	Docs: Fix build main documentation (#2729 )	2024-05-01 08:18:52 -04:00
Marc Sun	232ebd159a	Fix sampler (#2728 )	2024-05-01 12:20:26 +02:00
Marc Sun	4d3d4bc88f	fix sampler serialization (#2723 ) * fix sampler serialization * add getter and setter for sampler * more maintenable	2024-04-30 11:19:05 +02:00
Zach Mueller	2b1e7bd462	Fixup `free_memory` to deal with garbage collection (#2716 ) * Fixup cleanup * Return * Fixup test * Fix test * DeepSpeed * More careful guard * bring back as none * passing * bring forward	2024-04-30 03:28:57 -04:00
Zach Mueller	c7e5e41b8c	Segment out a deepspeed docker image (#2707 ) * Segment out a deepspeed docker image * Update readme * Keep pinned ds	2024-04-29 11:25:22 -04:00
Yu Chin Fabian Lim	9557598c45	Add Upcasting for FSDP in Mixed Precision. Add Concept Guide for FSPD and DeepSpeed. (#2674 ) * draft fsdp vs ds * reframe to migration doc * updated functionality section * cast to float32 * improvements to float32 casting * some cleanup * addressed @pacman100's comments * Apply some of @muellerz suggestions Co-authored-by: Zach Mueller <muellerzr@gmail.com> * change to subsections * changed the manner upcasting warnings are surfaced * update document to discuss fsdp and ds plugins. minor fixes. * @muellerzr's new suggestions Co-authored-by: Zach Mueller <muellerzr@gmail.com> * explain all-or-nothing * add @pacman100's comments Co-authored-by: Sourab Mangrulkar <13534540+pacman100@users.noreply.github.com> * minor fix --------- Co-authored-by: Yu Chin Fabian Lim <flim@sg.ibm.com> Co-authored-by: Zach Mueller <muellerzr@gmail.com> Co-authored-by: Sourab Mangrulkar <13534540+pacman100@users.noreply.github.com>	2024-04-29 11:19:03 -04:00
Marc Sun	156331aecd	allow gather_for_metrics to be more flexible (#2710 ) * allow gather_for_metrics to be more flexible * style * udapte doc * fix * style * typo * typo * Update src/accelerate/accelerator.py Co-authored-by: Zach Mueller <muellerzr@gmail.com> * remove distributed * clean --------- Co-authored-by: Zach Mueller <muellerzr@gmail.com>	2024-04-29 12:14:22 +02:00
Marc Sun	cd7df4117d	fix bnb multi gpu training (#2714 ) * fix bnb multi gpu training * style * elif instead * fix * style * fix	2024-04-26 15:52:15 +02:00
Zach Mueller	6af157ea93	Add diffusers to req (#2711 )	2024-04-25 08:31:54 -04:00
Marc Sun	83317b3081	add distributed examples (#2672 ) * add distributed examples * typo * uncomment * require multigpu * add stable diffusion example * style * add copyright * style * remove tqdm * Apply suggestions from code review Co-authored-by: Zach Mueller <muellerzr@gmail.com> * add comments * remove print * More comments --------- Co-authored-by: Zach Mueller <muellerzr@gmail.com>	2024-04-25 11:13:56 +02:00
drhead	e831bcb3b1	Change dataloader send_to_device calls to non-blocking (#2685 ) * Change dataloader send_to_device calls to non-blocking * add non_blocking to dataloader dataclass * add dataloader non blocking option from dataclass * add handling for non blocking to accelerator * add notes on non-blocking transfers to quicktour * link to dataloaderconfiguration in docs * linting * "requires" -> "recommended" on non-blocking setting Co-authored-by: Zach Mueller <muellerzr@gmail.com> --------- Co-authored-by: drhead <a@a.a> Co-authored-by: Zach Mueller <muellerzr@gmail.com>	2024-04-24 15:45:57 -04:00
Sourab Mangrulkar	092c3af0c4	Add version checks for the import of DeepSpeed moe utils (#2705 ) * fix import for moe utils * Apply suggestions from code review Co-authored-by: Zach Mueller <muellerzr@gmail.com> --------- Co-authored-by: Zach Mueller <muellerzr@gmail.com>	2024-04-25 00:38:56 +05:30
Huazhong Ji	3e944c5583	add cann version info to command accelerate env (#2689 )	2024-04-24 09:17:09 -04:00
Zach Mueller	f67737363c	Do a pip freeze during workflows (#2704 ) * Do a pip freeze * No need to do source activate on non-conda workflow	2024-04-24 08:46:13 -04:00
Marc Sun	f7daaaa305	fix support (#2699 )	2024-04-23 15:32:43 +02:00
Zach Mueller	3dc131cd8d	Add source code for DataLoader Animation (#2696 ) * dl animation * oops * Export	2024-04-23 04:28:28 -04:00
Sourab Mangrulkar	ef0f62c12a	Simplify test logic (#2697 ) * simplify test logic 😅 * 😅	2024-04-23 02:49:55 +05:30
Sourab Mangrulkar	baafaf4a6e	Fix the rng states of sampler's generator to be synchronized for correct sharding of dataset across GPUs (#2694 ) * Fix the rng states of sampler's generator to be synchronized for correct sharding of dataset across GPUs * add tests	2024-04-22 13:50:04 -04:00
Zach Mueller	abc86c0e35	Enable BF16 autocast to everything during FP8 + some tweaks to enable FSDP (#2655 ) * Basic autocasting stuff * Delay fp8 autocast until after DDP wrapping * More fixes * Bookmark: without dtype change * Bookmark: with dtype changes * Different alternative, better results * Didn't matter what order, same result * Revert + maintain * Fin * Refactor based on feedback * native_amp bool * Final nits	2024-04-18 10:14:35 -04:00
Zach Mueller	4450cb3132	Deprecate tqdm args + slight logic tweaks (#2673 ) * Deprecate + slight logic fix * Maybe fix test?	2024-04-17 06:26:55 -04:00
jiqing-feng	fd0dcd1c45	fix backend check (#2670 ) * fix backend check * reformat backend check * Update src/accelerate/state.py Co-authored-by: Zach Mueller <muellerzr@gmail.com> * Update src/accelerate/state.py Co-authored-by: Zach Mueller <muellerzr@gmail.com> * raise value error if backend mismatch * Update src/accelerate/state.py Co-authored-by: Zach Mueller <muellerzr@gmail.com> --------- Co-authored-by: Zach Mueller <muellerzr@gmail.com>	2024-04-16 21:22:27 -04:00
Zach Mueller	f478201c28	Pin DS...again.. (#2679 )	2024-04-16 12:07:59 -04:00
Sourab Mangrulkar	c7046845e7	Fix deepspeed moe test with version check (#2677 )	2024-04-16 10:22:41 -04:00
Sourab Mangrulkar	701e24c539	Handle MoE models with DeepSpeed (#2662 ) * Handle MoE models with DeepSpeed * Update launch.py * Update test_deepspeed.py * Update test_deepspeed.py * Update src/accelerate/utils/dataclasses.py Co-authored-by: Benjamin Bossan <BenjaminBossan@users.noreply.github.com> * address comments * Update deepspeed.md --------- Co-authored-by: Benjamin Bossan <BenjaminBossan@users.noreply.github.com>	2024-04-16 16:11:49 +05:30
Richard Brown	37da848e6c	`tqdm`: `args` should come ahead of `main_process_only` (#2654 ) Update tqdm.py * add unit test * add test to test_utils * ruff changes	2024-04-15 12:30:28 -04:00
Zach Mueller	c470a1336a	Revert "fix backend check (#2652 )" (#2669 ) This reverts commit 2fc48c7eeea67e747a39be2dec822b07a27bae71.	2024-04-15 04:30:33 -04:00
zhangshengdong29	581a390e2f	Megatron plugin can support NPU (#2667 )	2024-04-15 03:02:13 -04:00
jiqing-feng	2fc48c7eee	fix backend check (#2652 ) * fix backend check * fix ccl check	2024-04-15 02:59:29 -04:00
Yuanhang Yang	1024231133	Add MLU rng state setter (#2664 )	2024-04-15 02:59:10 -04:00
yuanwu2017	5ca095a34f	Fix test_from_pretrained_low_cpu_mem_usage_measured failure (#2644 ) This test is to test the change in the memory size occupied by model loading when low_cpu_mem_usage is used. Therefore, the default device used is cpu. However, when judging whether other devices are available, new packages will be introduced, causing memory changes and interfering with the test results. Signed-off-by: yuanwu <yuan.wu@intel.com>	2024-04-12 18:23:28 +02:00
Lucain	b77c65398c	Don't use deprecated `Repository` anymore (#2658 ) * Don't use deprecated Repository anymore * oops * Update requirements.txt	2024-04-12 09:05:54 -04:00
YH	a91691463b	Fix deepspeed plugin attr type (#2646 )	2024-04-12 15:29:16 +05:30
regisss	5056d327f8	Allow "auto" for gradient clipping in YAML (#2649 ) * Allow "auto" for gradient clipping in YAML * Update src/accelerate/utils/dataclasses.py Co-authored-by: Sourab Mangrulkar <13534540+pacman100@users.noreply.github.com> * Make style --------- Co-authored-by: Sourab Mangrulkar <13534540+pacman100@users.noreply.github.com>	2024-04-12 13:44:39 +05:30
Brent Yi	c0a37015e3	Typo fix in tracking.md (#2650 )	2024-04-10 17:16:11 -04:00
Huazhong Ji	e9b9c7d022	device agnostic testing for hooks&utils&big_modeling (#2602 ) * device agnostic testing for hooks&utils&big_modeling * fix failed test cased on cpu * make style	2024-04-10 13:56:50 -04:00
Marc Sun	6c09584f73	add strict arg to load_checkpoint_and_dispatch (#2641 )	2024-04-10 11:20:07 +02:00
Fanli Lin	b8c8583953	add third-party device prefix to `execution_device` (#2612 ) * add xpu device_map * fix	2024-04-09 13:47:41 +02:00
Zach Mueller	df485ae1e3	Parenthesis on xpu_available (#2639 )	2024-04-09 06:33:38 -04:00
Zach Mueller	6386f70103	Fix up state with xla + performance regression (#2634 ) * Fix up state with xla * use backend * Change last time * Cmoment * Slight tweak to use dtype	2024-04-09 06:06:28 -04:00
Zach Mueller	6d92198ef4	Schedule free optimizer support (#2631 ) * Schedule free optimizer supporT * Fin * Doc * Add in eval * Add to exclude * Fix module issue	2024-04-08 11:28:27 -04:00
Zach Mueller	16488be9a4	Update version	2024-04-05 13:11:05 -04:00
Zach Mueller	685bd3a439	CLean	2024-04-05 13:05:05 -04:00
Zach Mueller	2e69948c1a	Patchfix	2024-04-05 13:04:44 -04:00