Compare commits


1 Commits

Author SHA1 Message Date
7c8eb789b5 Release: v0.12.0 2022-08-04 09:00:35 -04:00
71 changed files with 348 additions and 2315 deletions

View File

@ -1,5 +1,6 @@
name: "\U0001F41B Bug Report"
description: Submit a bug report to help us improve Accelerate
labels: [ "bug" ]
body:
- type: textarea
id: system-info

View File

@ -1,64 +0,0 @@
name: Build Docker images (releases)
on:
workflow_dispatch:
release:
types: [published]
concurrency:
group: docker-image-builds
cancel-in-progress: false
jobs:
get-version:
runs-on: ubuntu-latest
outputs:
version: ${{ steps.step1.outputs.version }}
steps:
- uses: actions/checkout@v3
- id: step1
run: echo "::set-output name=version::$(python setup.py --version)"
version-cpu:
name: "Latest Accelerate CPU [version]"
runs-on: ubuntu-latest
needs: get-version
steps:
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v1
- name: Check out code
uses: actions/checkout@v2
- name: Login to DockerHub
uses: docker/login-action@v1
with:
username: ${{ secrets.DOCKERHUB_USERNAME }}
password: ${{ secrets.DOCKERHUB_PASSWORD }}
- name: Build and Push CPU
uses: docker/build-push-action@v2
with:
context: ./docker/accelerate-cpu
push: true
tags: huggingface/accelerate-cpu:${{needs.get-version.outputs.version}}
version-cuda:
name: "Latest Accelerate GPU [version]"
runs-on: ubuntu-latest
needs: get-version
steps:
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v1
- name: Check out code
uses: actions/checkout@v2
- name: Login to DockerHub
uses: docker/login-action@v1
with:
username: ${{ secrets.DOCKERHUB_USERNAME }}
password: ${{ secrets.DOCKERHUB_PASSWORD }}
- name: Build and Push GPU
uses: docker/build-push-action@v2
with:
context: ./docker/accelerate-gpu
push: true
tags: huggingface/accelerate-gpu:${{needs.get-version.outputs.version}}

View File

@ -10,7 +10,7 @@ env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
jobs:
check-for-source:
check-for-setup:
runs-on: ubuntu-latest
name: Check if setup was changed
outputs:
@ -28,18 +28,18 @@ jobs:
id: was_changed
run: |
for file in ${{ steps.changed-files.outputs.all_changed_files }}; do
if [ `basename "${file}"` == "setup.py" ]; then
if [ `basename "${file}"` = "setup.py" ]; then
echo ::set-output name=changed::"1"
fi
done
build-docker-containers:
needs: check-for-source
if: (github.event_name == 'push') && (needs.check-for-source.outputs.changed == '1')
uses: ./.github/workflows/build_docker_images.yml
needs: check-for-setup
if: (github.event_name == 'push') && (needs.check-for-setup.outputs.changed == '1')
uses: ./.github/workflows/build-docker-images.yml
secrets: inherit
run-merge-tests:
run-tests:
needs: build-docker-containers
if: always()
uses: ./.github/workflows/run_merge_tests.yml
uses: ./.github/workflows/on-merge.yml

View File

@ -7,7 +7,6 @@ on:
env:
RUN_SLOW: "yes"
IS_GITHUB_CI: "1"
jobs:
run_all_tests_single_gpu:
@ -28,7 +27,6 @@ jobs:
git config --global --add safe.directory '*'
git fetch && git checkout ${{ github.sha }}
pip install -e . --no-deps
pip install pytest-reportlog
- name: Run test on GPUs
run: |
@ -39,11 +37,6 @@ jobs:
source activate accelerate
pip uninstall comet_ml -y
make test_examples
- name: Generate Report
if: always()
run: |
python utils/log_reports.py >> $GITHUB_STEP_SUMMARY
run_all_tests_multi_gpu:
runs-on: [self-hosted, docker-gpu, multi-gpu]
@ -63,26 +56,14 @@ jobs:
git config --global --add safe.directory '*'
git fetch && git checkout ${{ github.sha }}
pip install -e . --no-deps
pip install pytest-reportlog
- name: Run core and big modeling tests on GPUs
- name: Run test on GPUs
run: |
source activate accelerate
make test_big_modeling
make test_core
- name: Run Integration tests on GPUs
run: |
source activate accelerate
make test_integrations
make test
- name: Run examples on GPUs
run: |
source activate accelerate
pip uninstall comet_ml -y
make test_examples
- name: Generate Report
if: always()
run: |
python utils/log_reports.py >> $GITHUB_STEP_SUMMARY
make test_examples

View File

@ -6,7 +6,6 @@ on:
env:
TESTING_MOCKED_DATALOADERS: "1"
IS_GITHUB_CI: "1"
jobs:
run_all_tests_single_gpu:
@ -27,7 +26,6 @@ jobs:
git config --global --add safe.directory '*'
git fetch && git checkout ${{ github.sha }}
pip install -e .[testing,test_trackers]
pip install pytest-reportlog
- name: Run test on GPUs
run: |
@ -39,11 +37,6 @@ jobs:
pip uninstall comet_ml -y
make test_examples
- name: Generate Report
if: always()
run: |
python utils/log_reports.py >> $GITHUB_STEP_SUMMARY
run_all_tests_multi_gpu:
runs-on: [self-hosted, docker-gpu, multi-gpu]
container:
@ -60,7 +53,6 @@ jobs:
git config --global --add safe.directory '*'
git fetch && git checkout ${{ github.sha }}
pip install -e .[testing,test_trackers]
pip install pytest-reportlog
- name: Run test on GPUs
run: |
@ -71,9 +63,4 @@ jobs:
run: |
source activate accelerate
pip uninstall comet_ml -y
make test_examples
- name: Generate Report
if: always()
run: |
python utils/log_reports.py >> $GITHUB_STEP_SUMMARY
make test_examples

View File

@ -1,30 +1,16 @@
name: Run Tests
on:
pull_request:
paths:
- "src/**"
- "tests/**"
- ".github/**"
- "examples/**"
- "setup.py"
types: [opened, synchronize, reopened]
on: [pull_request]
env:
HF_HOME: ~/hf_cache
TESTING_MOCKED_DATALOADERS: "1"
IS_GITHUB_CI: "1"
jobs:
run-tests:
runs-on: ubuntu-latest
strategy:
fail-fast: false
matrix:
pytorch-version: [
latest,
minimum
]
test-kind: [
test_prod,
test_core,
@ -57,14 +43,7 @@ jobs:
if [[ ${{ matrix.test-kind }} = test_prod ]]; then pip install -e .[test_prod]; fi
if [[ ${{ matrix.test-kind }} != test_prod ]]; then pip install -e .[testing,test_trackers]; fi
if [[ ${{ matrix.test-kind }} = test_rest ]]; then pip uninstall comet_ml -y; fi
if [[ ${{ matrix.pytorch-version }} = minimum ]]; then pip install torch==1.6.0; fi
pip install pytest-reportlog
- name: Run Tests
run: |
make ${{ matrix.test-kind }}
- name: Generate Report
if: always()
run: |
python utils/log_reports.py >> $GITHUB_STEP_SUMMARY
make ${{ matrix.test-kind }}

View File

@ -25,40 +25,37 @@ style:
# Run tests for the library
test:
python -m pytest -s -v ./tests/ --ignore=./tests/test_examples.py $(if $(IS_GITHUB_CI),--report-log 'all.log',)
python -m pytest -s -v ./tests/ --ignore=./tests/test_examples.py
test_big_modeling:
python -m pytest -s -v ./tests/test_big_modeling.py $(if $(IS_GITHUB_CI),--report-log 'big_modeling.log',)
python -m pytest -s -v ./tests/test_big_modeling.py
test_core:
python -m pytest -s -v ./tests/ --ignore=./tests/test_examples.py --ignore=./tests/deepspeed --ignore=./tests/test_big_modeling.py \
--ignore=./tests/fsdp $(if $(IS_GITHUB_CI),--report-log 'core.log',)
--ignore=./tests/fsdp
test_deepspeed:
python -m pytest -s -v ./tests/deepspeed $(if $(IS_GITHUB_CI),--report-log 'deepspeed.log',)
python -m pytest -s -v ./tests/deepspeed
test_fsdp:
python -m pytest -s -v ./tests/fsdp $(if $(IS_GITHUB_CI),--report-log 'fsdp.log',)
python -m pytest -s -v ./tests/fsdp
test_examples:
python -m pytest -s -v ./tests/test_examples.py $(if $(IS_GITHUB_CI),--report-log 'examples.log',)
python -m pytest -s -v ./tests/test_examples.py
# Broken down example tests for the CI runners
test_integrations:
python -m pytest -s -v ./tests/deepspeed ./tests/fsdp $(if $(IS_GITHUB_CI),--report-log 'integrations.log',)
test_example_differences:
python -m pytest -s -v ./tests/test_examples.py::ExampleDifferenceTests $(if $(IS_GITHUB_CI),--report-log 'example_diff.log',)
python -m pytest -s -v ./tests/test_examples.py::ExampleDifferenceTests
test_checkpoint_epoch:
python -m pytest -s -v ./tests/test_examples.py::FeatureExamplesTests -k "by_epoch" $(if $(IS_GITHUB_CI),--report-log 'checkpoint_epoch.log',)
python -m pytest -s -v ./tests/test_examples.py::FeatureExamplesTests -k "by_epoch"
test_checkpoint_step:
python -m pytest -s -v ./tests/test_examples.py::FeatureExamplesTests -k "by_step" $(if $(IS_GITHUB_CI),--report-log 'checkpoint_step.log',)
python -m pytest -s -v ./tests/test_examples.py::FeatureExamplesTests -k "by_step"
# Same as test but used to install only the base dependencies
test_prod:
$(MAKE) test_core
test_rest:
python -m pytest -s -v ./tests/test_examples.py::FeatureExamplesTests -k "not by_step and not by_epoch" $(if $(IS_GITHUB_CI),--report-log 'rest.log',)
python -m pytest -s -v ./tests/test_examples.py::FeatureExamplesTests -k "not by_step and not by_epoch"

View File

@ -196,7 +196,7 @@ from accelerate import notebook_launcher
notebook_launcher(training_function)
```
An example can be found in [this notebook](https://github.com/huggingface/notebooks/blob/main/examples/accelerate_examples/simple_nlp_example.ipynb). [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/accelerate_examples/simple_nlp_example.ipynb)
An example can be found in [this notebook](https://github.com/huggingface/notebooks/blob/master/examples/accelerate/simple_nlp_example.ipynb). [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/master/examples/accelerate/simple_nlp_example.ipynb)
## Why should I use 🤗 Accelerate?
@ -243,16 +243,3 @@ pip install accelerate
- FP16 with native AMP (apex on the roadmap)
- DeepSpeed support (Experimental)
- PyTorch Fully Sharded Data Parallel (FSDP) support (Experimental)
## Citing 🤗 Accelerate
If you use 🤗 Accelerate in your publication, please cite it by using the following BibTeX entry.
```bibtex
@Misc{accelerate,
title = {Accelerate: Training and inference at scale made simple, efficient and adaptable.},
author = {Sylvain Gugger, Lysandre Debut, Thomas Wolf, Philipp Schmid, Zachary Mueller, Sourab Mangrulkar},
howpublished = {\url{https://github.com/huggingface/accelerate}},
year = {2022}
}
```

View File

@ -32,15 +32,11 @@
- local: usage_guides/memory
title: How to avoid CUDA Out-of-Memory
- local: usage_guides/sagemaker
title: Using 🤗 Accelerate on SageMaker
title: Using Accelerate on SageMaker
- local: usage_guides/mps
title: How to use Apple Silicon M1 GPUs
- local: usage_guides/training_zoo
title: 🤗 Accelerate Example Zoo
title: How-To Guides
- sections:
- local: concept_guides/performance
title: Comparing performance across distributed setups
- local: concept_guides/gradient_synchronization
title: Gradient synchronization
- local: concept_guides/deferring_execution

View File

@ -1,15 +1,3 @@
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
-->
# Deferring Executions
When you run your usual script, instructions are executed in order. Using 🤗 Accelerate to deploy your script on several

View File

@ -114,6 +114,4 @@ for batch in dataloader:
outputs = model(inputs)
loss = loss_function(outputs, targets)
accelerator.backward(loss)
```
As a result, you should either use *`accelerator.accumulate` or `accelerator.no_sync`* when it comes to API choice.
```
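To make that API choice concrete, here is a minimal sketch (not part of the diff above) of the `accelerator.no_sync` pattern, assuming `accelerator`, `model`, `optimizer`, `dataloader`, and `loss_function` have already been prepared as in the guide, with an illustrative accumulation window of 2 steps:
```python
import contextlib

accumulation_steps = 2  # illustrative value

for index, batch in enumerate(dataloader):
    inputs, targets = batch
    # Only synchronize gradients on the last step of each accumulation window.
    is_sync_step = (index + 1) % accumulation_steps == 0
    context = contextlib.nullcontext() if is_sync_step else accelerator.no_sync(model)
    with context:
        outputs = model(inputs)
        loss = loss_function(outputs, targets)
        accelerator.backward(loss)
    if is_sync_step:
        optimizer.step()
        optimizer.zero_grad()
```
In practice, `accelerator.accumulate(model)` handles this bookkeeping internally, while `no_sync` gives you the lower-level control shown here.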

View File

@ -1,91 +0,0 @@
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
-->
# Comparing performance between different device setups
Evaluating and comparing the performance from different setups can be quite tricky if you don't know what to look for.
For example, you cannot run the same script with the same batch size across TPU, multi-GPU, and single-GPU with Accelerate
and expect your results to line up.
But why?
There are three reasons for this, which this tutorial will cover:
1. **Setting the right seeds**
2. **Observed Batch Sizes**
3. **Learning Rates**
## Setting the Seed
While this issue has not come up as much, make sure to use [`utils.set_seed`] to fully set the seed in all distributed cases so training will be reproducible:
```python
from accelerate import set_seed
set_seed(42)
```
Why is this important? Under the hood this will set **5** different seed settings:
```python
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
# ^^ safe to call this function even if cuda is not available
if is_tpu_available():
xm.set_rng_state(seed)
```
These are the `random` module's state, NumPy's state, torch's state, torch's CUDA state, and, if TPUs are available, torch_xla's CUDA state.
## Observed Batch Sizes
When training with Accelerate, the batch size passed to the dataloader is the **batch size per GPU**. This means that
a batch size of 64 on two GPUs is effectively a batch size of 128. As a result, this needs to be accounted for when testing
on a single GPU, and similarly for TPUs.
The below table can be used as a quick reference to try out different batch sizes:
<Tip>
In this example there are two GPUs for "Multi-GPU" and a TPU pod with 8 workers
</Tip>
| Single GPU Batch Size | Multi-GPU Equivalent Batch Size | TPU Equivalent Batch Size |
|-----------------------|---------------------------------|---------------------------|
| 256 | 128 | 32 |
| 128 | 64 | 16 |
| 64 | 32 | 8 |
| 32 | 16 | 4 |
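As a rough sanity check, the table above simply divides the single-GPU batch size by the number of devices. A small illustrative calculation (assuming the two GPUs and the 8-worker TPU pod from the tip):
```python
# Illustrative only: reproduce the table by dividing the observed (single-GPU)
# batch size by the number of devices in each setup.
num_gpus = 2          # "Multi-GPU" column
num_tpu_workers = 8   # TPU pod size assumed in the tip

for observed_batch_size in (256, 128, 64, 32):
    per_gpu = observed_batch_size // num_gpus
    per_tpu_worker = observed_batch_size // num_tpu_workers
    print(f"{observed_batch_size=} -> {per_gpu=}, {per_tpu_worker=}")
```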
## Learning Rates
As noted in multiple sources[[1](https://aws.amazon.com/blogs/machine-learning/scalable-multi-node-deep-learning-training-using-gpus-in-the-aws-cloud/)][[2](https://docs.nvidia.com/clara/tlt-mi_archive/clara-train-sdk-v2.0/nvmidl/appendix/training_with_multiple_gpus.html)], the learning rate should be scaled *linearly* based on the number of devices present. The below
snippet shows doing so with Accelerate:
<Tip>
Since users can have their own learning rate schedulers defined, we leave this up to the user to decide if they wish to scale their
learning rate or not.
</Tip>
```python
learning_rate = 1e-3
accelerator = Accelerator()
learning_rate *= accelerator.num_processes
optimizer = AdamW(params=model.parameters(), lr=learning_rate)
```

View File

@ -55,7 +55,7 @@ accelerate launch {my_script.py}
><div class="w-full text-center bg-gradient-to-br from-blue-400 to-blue-500 rounded-lg py-1.5 font-semibold mb-5 text-white text-lg leading-relaxed">Tutorials</div>
<p class="text-gray-700">Learn the basics and become familiar with using 🤗 Accelerate. Start here if you are using 🤗 Accelerate for the first time!</p>
</a>
<a class="!no-underline border dark:border-gray-700 p-5 rounded-lg shadow hover:shadow-lg" href="/docs/accelerate/usage_guides/gradient_accumulation"
<a class="!no-underline border dark:border-gray-700 p-5 rounded-lg shadow hover:shadow-lg" href="/docs/accelerate/utility_guides/gradient_accumulation"
><div class="w-full text-center bg-gradient-to-br from-indigo-400 to-indigo-500 rounded-lg py-1.5 font-semibold mb-5 text-white text-lg leading-relaxed">How-to guides</div>
<p class="text-gray-700">Practical guides to help you achieve a specific goal. Take a look at these guides to learn how to use 🤗 Accelerate to solve real-world problems.</p>
</a>
@ -68,4 +68,4 @@ accelerate launch {my_script.py}
<p class="text-gray-700">Technical descriptions of how 🤗 Accelerate classes and methods work.</p>
</a>
</div>
</div>
</div>

View File

@ -94,7 +94,6 @@ The following arguments are useful for customization of worker machines
* `--machine_rank MACHINE_RANK` (`int`) -- The rank of the machine on which this script is launched.
* `--num_machines NUM_MACHINES` (`int`) -- The total number of machines used in this training.
* `--num_processes NUM_PROCESSES` (`int`) -- The total number of processes to be launched in parallel.
* `--gpu_ids` (`str`) -- What GPUs (by id) should be used for training on this machine as a comma-separated list
* `--main_process_ip MAIN_PROCESS_IP` (`str`) -- The IP address of the machine of rank 0.
* `--main_process_port MAIN_PROCESS_PORT` (`int`) -- The port to use to communicate with the machine of rank 0.
* `--num_cpu_threads_per_process NUM_CPU_THREADS_PER_PROCESS` (`int`) -- The number of CPU threads per process. Can be tuned for optimal performance.

View File

@ -35,13 +35,7 @@ While this works very well for regularly sized models, this workflow has some cl
</Tip>
## How the Process Works: A Quick Overview
<Youtube id="MWCSGj9jEAo" />
## How the Process Works: Working with Code
### Instantiating an empty model
## Instantiating an empty model
The first tool 🤗 Accelerate introduces to help with big models is a context manager [`init_empty_weights`] that helps you initialize a model without using any RAM, so that step 1 can be done on models of any size. Here is how it works:
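The code example that "Here is how it works:" introduces falls outside the hunks shown here; a minimal sketch of the context manager, with layer sizes chosen to match the "bit more than 100B parameters" figure quoted in the next hunk, would look like:
```python
import torch.nn as nn

from accelerate import init_empty_weights

# Inside the context manager, parameters are created on the "meta" device,
# so no real memory is allocated for the weights.
with init_empty_weights():
    # 1000 linear layers of 10,000 x 10,000 weights is a bit more than
    # 100B parameters (the sizes are illustrative).
    model = nn.Sequential(*[nn.Linear(10000, 10000) for _ in range(1000)])
```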
@ -67,7 +61,7 @@ initializes an empty model with a bit more than 100B parameters. Behind the scen
</Tip>
### Sharded checkpoints
## Sharded checkpoints
It's possible your model is so big that even a single copy won't fit in RAM. That doesn't mean it can't be loaded: if you have one or several GPUs, that is more memory available to store your model. In this case, it's better if your checkpoint is split into several smaller files that we call checkpoint shards.
@ -92,7 +86,7 @@ with index.json being the following file:
and `first_state_dict.bin` containing the weights for `"linear1.weight"` and `"linear1.bias"`, `second_state_dict.bin` the ones for `"linear2.weight"` and `"linear2.bias"`
### Loading weights
## Loading weights
The second tool 🤗 Accelerate introduces is a function [`load_checkpoint_and_dispatch`], that will allow you to load a checkpoint inside your empty model. This supports full checkpoints (a single file containing the whole state dict) as well as sharded checkpoints. It will also automatically dispatch those weights across the devices you have available (GPUs, CPU RAM), so if you are loading a sharded checkpoint, the maximum RAM usage will be the size of the biggest shard.
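A hedged usage sketch (the toy model mirrors the `linear1`/`linear2` shard example above; the checkpoint folder name is a placeholder):
```python
import torch.nn as nn

from accelerate import init_empty_weights, load_checkpoint_and_dispatch

# A toy two-layer model standing in for the sharded-checkpoint example above.
class ToyModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.linear1 = nn.Linear(1000, 1000)
        self.linear2 = nn.Linear(1000, 1000)

with init_empty_weights():
    model = ToyModel()

# Load a full or sharded checkpoint into the empty model and spread the weights
# across the available GPUs, CPU RAM and, if needed, disk.
model = load_checkpoint_and_dispatch(
    model,
    checkpoint="sharded-checkpoint-folder",  # placeholder: folder with index.json + shards
    device_map="auto",
)
```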
@ -182,7 +176,7 @@ You can also design your `device_map` yourself, if you prefer to explicitly deci
model = load_checkpoint_and_dispatch(model, "sharded-gpt-j-6B", device_map=my_device_map)
```
### Run the model
## Run the model
Now that we have done this, our model lies across several devices, and maybe the hard drive. But it can still be used as a regular PyTorch model:
@ -209,7 +203,7 @@ This way, your model can run for inference even if it doesn't fit on one of the G
</Tip>
### Designing a device map
## Designing a device map
You can let 🤗 Accelerate handle the device map computation by setting `device_map` to one of the supported options (`"auto"`, `"balanced"`, `"balanced_low_0"`, `"sequential"`) or create one yourself, if you want more control over where each layer should go.
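For instance, for the two-layer model whose shards were described earlier (`linear1` and `linear2`), a hand-written map could look like this sketch (the device assignments are illustrative):
```python
from accelerate import load_checkpoint_and_dispatch

# Keys are module names; values are a GPU index, "cpu", or "disk".
my_device_map = {
    "linear1": 0,       # keep the first layer on GPU 0
    "linear2": "cpu",   # offload the second layer to CPU RAM
}

# `model` is the empty-initialized model and the checkpoint folder is a
# placeholder, as in the loading sketch above.
model = load_checkpoint_and_dispatch(
    model, checkpoint="sharded-checkpoint-folder", device_map=my_device_map
)
```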

View File

@ -31,6 +31,7 @@ model.to(device)
gradient_accumulation_steps = 2
for index, batch in enumerate(training_dataloader):
optimizer.zero_grad()
inputs, targets = batch
inputs = inputs.to(device)
targets = targets.to(device)
@ -41,7 +42,6 @@ for index, batch in enumerate(training_dataloader):
if (index + 1) % gradient_accumulation_steps == 0:
optimizer.step()
scheduler.step()
optimizer.zero_grad()
```
## Converting it to 🤗 Accelerate
@ -57,6 +57,7 @@ First the code shown earlier will be converted to utilize 🤗 Accelerate withou
+ )
for index, batch in enumerate(training_dataloader):
optimizer.zero_grad()
inputs, targets = batch
- inputs = inputs.to(device)
- targets = targets.to(device)
@ -67,7 +68,6 @@ First the code shown earlier will be converted to utilize 🤗 Accelerate withou
if (index+1) % gradient_accumulation_steps == 0:
optimizer.step()
scheduler.step()
optimizer.zero_grad()
```
<Tip warning={true}>
@ -94,6 +94,7 @@ You just wrap it around the entire training part of our code:
- for index, batch in enumerate(training_dataloader):
+ for batch in training_dataloader:
+ with accelerator.accumulate(model):
optimizer.zero_grad()
inputs, targets = batch
outputs = model(inputs)
```
@ -106,7 +107,6 @@ You can remove all the special checks for the step number and the loss adjustmen
- if (index+1) % gradient_accumulation_steps == 0:
optimizer.step()
scheduler.step()
optimizer.zero_grad()
```
As you can see the [`Accelerator`] is able to keep track of the batch number you are on and it will automatically know whether to step through the prepared optimizer and how to adjust the loss.
@ -118,13 +118,11 @@ Below is the finished implementation for performing gradient accumulation with
```python
for batch in training_dataloader:
with accelerator.accumulate(model):
optimizer.zero_grad()
inputs, targets = batch
outputs = model(inputs)
loss = loss_function(outputs, targets)
accelerator.backward(loss)
optimizer.step()
scheduler.step()
optimizer.zero_grad()
```
To learn more about what magic this wraps around, read the [Gradient Synchronization concept guide](/concept_guides/gradient_synchronization)

View File

@ -25,20 +25,16 @@ training script. To use it, restructure your training function to include an inn
and build your dataloaders inside it. At a minimum, this could look like 4 new lines of code.
> Note: The inner function *must* take in the batch size as the first parameter, but we do not pass one to it when called. The wrapper handles this for us
It should also be noted that anything which will consume CUDA memory and is passed to the `accelerator` **must** be declared inside the inner function,
such as models and optimizers.
```diff
def training_function(args):
accelerator = Accelerator()
model = get_model()
model.to(accelerator.device)
optimizer = get_optimizer()
+ @find_executable_batch_size(starting_batch_size=args.batch_size)
+ def inner_training_loop(batch_size):
+ nonlocal accelerator # Ensure they can be used in our context
+ accelerator.free_memory() # Free all lingering references
model = get_model()
model.to(accelerator.device)
optimizer = get_optimizer()
+ nonlocal model, optimizer # Ensure they can be used in our context
train_dataloader, eval_dataloader = get_dataloaders(accelerator, batch_size)
lr_scheduler = get_scheduler(
optimizer,
@ -52,4 +48,4 @@ def training_function(args):
+ inner_training_loop()
```
To find out more, check the documentation [here](../package_reference/utilities#accelerate.find_executable_batch_size).
To find out more, check the documentation [here](package_reference/utilities#accelerate.find_executable_batch_size)
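Since the diff only shows fragments of the restructured training function, here is a compact sketch of the decorator pattern it describes (the `get_model`, `get_optimizer`, and `get_dataloaders` helpers are the guide's stand-ins, not real functions):
```python
from accelerate import Accelerator
from accelerate.utils import find_executable_batch_size

def training_function(args):
    accelerator = Accelerator()

    # The inner function must take the batch size as its first parameter;
    # the decorator supplies it and halves it after each CUDA OOM.
    @find_executable_batch_size(starting_batch_size=args.batch_size)
    def inner_training_loop(batch_size):
        nonlocal accelerator          # reuse the outer Accelerator
        accelerator.free_memory()     # drop references left over from a failed attempt

        # Everything that consumes CUDA memory is declared inside the inner
        # function, as the note above requires.
        model = get_model()
        model.to(accelerator.device)
        optimizer = get_optimizer()
        train_dataloader, eval_dataloader = get_dataloaders(accelerator, batch_size)
        model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
            model, optimizer, train_dataloader, eval_dataloader
        )
        # ... the usual training loop goes here ...

    # Called with no arguments: the decorator passes the batch size in.
    inner_training_loop()
```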

View File

@ -19,7 +19,7 @@ This will map computational graphs and primitives on the MPS Graph framework and
For more information, please refer to the official documents [Introducing Accelerated PyTorch Training on Mac](https://pytorch.org/blog/introducing-accelerated-pytorch-training-on-mac/)
and [MPS BACKEND](https://pytorch.org/docs/stable/notes/mps.html).
### Benefits of Training and Inference using Apple Silicon Chips
### Benefits of Training and Inference using Apple M1 Chips
1. Enables users to train larger networks or batch sizes locally
2. Reduces data retrieval latency and provides the GPU with direct access to the full memory store due to unified memory architecture.
@ -72,9 +72,8 @@ accelerate launch /examples/cv_example.py --data_dir images
## A few caveats to be aware of
1. We strongly recommend installing PyTorch >= 1.13 (a nightly version at the time of writing) on your macOS machine.
It has major fixes related to model correctness and performance improvements for transformer based models.
Please refer to https://github.com/pytorch/pytorch/issues/82707 for more details.
1. For `nlp_example.py` the metrics are much worse than with CPU-only training.
This means certain operations in the BERT model are going wrong on the `mps` device and need to be fixed by PyTorch.
2. Distributed setups `gloo` and `nccl` are not working with `mps` device.
This means that currently only single GPU of `mps` device type can be used.
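Independently of Accelerate, a quick sanity check that the `mps` backend is actually usable (assuming PyTorch >= 1.12) might look like:
```python
import torch

# Check that MPS is both compiled into this PyTorch build and usable on this
# machine before relying on it for training.
if torch.backends.mps.is_available():
    device = torch.device("mps")
    x = torch.ones(2, 2, device=device)
    print("MPS is available:", x.device)
elif torch.backends.mps.is_built():
    print("MPS is built, but no MPS-capable device/OS version was found.")
else:
    print("This PyTorch build was compiled without MPS support.")
```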

View File

@ -129,26 +129,7 @@ You can find your model data at: s3://your-bucket/accelerate-sagemaker-1-2021-04
### Distributed Training: Data Parallelism
Set up the Accelerate config by running `accelerate config` and answering the SageMaker questions.
To use SageMaker DDP, select it when asked
`What is the distributed mode? ([0] No distributed training, [1] data parallelism):`.
Example config below:
```yaml
base_job_name: accelerate-sagemaker-1
compute_environment: AMAZON_SAGEMAKER
distributed_type: DATA_PARALLEL
ec2_instance_type: ml.p3.16xlarge
iam_role_name: xxxxx
image_uri: null
mixed_precision: fp16
num_machines: 1
profile: xxxxx
py_version: py38
pytorch_version: 1.10.2
region: us-east-1
transformers_version: 4.17.0
use_cpu: false
```
*currently in development, will be supported soon.*
### Distributed Training: Model Parallelism

View File

@ -1,107 +0,0 @@
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
-->
# Example Zoo
Below is a non-exhaustive list of tutorials and scripts showcasing Accelerate
## Official Accelerate Examples:
### Basic Examples
These examples showcase the base features of Accelerate and are a great starting point
- [Barebones NLP example](https://github.com/huggingface/accelerate/blob/main/examples/nlp_example.py)
- [Barebones computer vision example](https://github.com/huggingface/accelerate/blob/main/examples/cv_example.py)
### Feature Specific Examples
These examples showcase specific features that the Accelerate framework offers
- [Automatic memory-aware gradient accumulation](https://github.com/huggingface/accelerate/blob/main/examples/by_feature/automatic_gradient_accumulation.py)
- [Checkpointing states](https://github.com/huggingface/accelerate/blob/main/examples/by_feature/checkpointing.py)
- [Cross validation](https://github.com/huggingface/accelerate/blob/main/examples/by_feature/cross_validation.py)
- [DeepSpeed](https://github.com/huggingface/accelerate/blob/main/examples/by_feature/deepspeed_with_config_support.py)
- [Fully Sharded Data Parallelism](https://github.com/huggingface/accelerate/blob/main/examples/by_feature/fsdp_with_peak_mem_tracking.py)
- [Gradient accumulation](https://github.com/huggingface/accelerate/blob/main/examples/by_feature/gradient_accumulation.py)
- [Memory-aware batch size finder](https://github.com/huggingface/accelerate/blob/main/examples/by_feature/memory.py)
- [Metric Computation](https://github.com/huggingface/accelerate/blob/main/examples/by_feature/multi_process_metrics.py)
- [Using Trackers](https://github.com/huggingface/accelerate/blob/main/examples/by_feature/tracking.py)
### Full Examples
These examples showcase, all at once, every feature in Accelerate that was shown in "Feature Specific Examples"
- [Complete NLP example](https://github.com/huggingface/accelerate/blob/main/examples/complete_nlp_example.py)
- [Complete computer vision example](https://github.com/huggingface/accelerate/blob/main/examples/complete_cv_example.py)
- [Causal language model fine-tuning example](https://github.com/huggingface/transformers/blob/main/examples/pytorch/language-modeling/run_clm_no_trainer.py)
- [Masked language model fine-tuning example](https://github.com/huggingface/transformers/blob/main/examples/pytorch/language-modeling/run_mlm_no_trainer.py)
- [Speech pretraining example](https://github.com/huggingface/transformers/blob/main/examples/pytorch/speech-pretraining/run_wav2vec2_pretraining_no_trainer.py)
- [Translation fine-tuning example](https://github.com/huggingface/transformers/blob/main/examples/pytorch/translation/run_translation_no_trainer.py)
- [Text classification fine-tuning example](https://github.com/huggingface/transformers/blob/main/examples/pytorch/text-classification/run_glue_no_trainer.py)
- [Semantic segmentation fine-tuning example](https://github.com/huggingface/transformers/blob/main/examples/pytorch/semantic-segmentation/run_semantic_segmentation_no_trainer.py)
- [Question answering fine-tuning example](https://github.com/huggingface/transformers/blob/main/examples/pytorch/question-answering/run_qa_no_trainer.py)
- [Beam search question answering fine-tuning example](https://github.com/huggingface/transformers/blob/main/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py)
- [Multiple choice question answering fine-tuning example](https://github.com/huggingface/transformers/blob/main/examples/pytorch/multiple-choice/run_swag_no_trainer.py)
- [Named entity recognition fine-tuning example](https://github.com/huggingface/transformers/blob/main/examples/pytorch/token-classification/run_ner_no_trainer.py)
- [Image classification fine-tuning example](https://github.com/huggingface/transformers/blob/main/examples/pytorch/image-classification/run_image_classification_no_trainer.py)
- [Summarization fine-tuning example](https://github.com/huggingface/transformers/blob/main/examples/pytorch/summarization/run_summarization_no_trainer.py)
## Integration Examples
These are tutorials from libraries that integrate with 🤗 Accelerate:
### Catalyst
- [Distributed training tutorial with Catalyst](https://catalyst-team.github.io/catalyst/tutorials/ddp.html)
### DALLE2-pytorch
- [Fine-tuning DALLE2](https://github.com/lucidrains/DALLE2-pytorch#usage)
### 🤗 diffusers
- [Performing textual inversion with diffusers](https://github.com/huggingface/diffusers/tree/main/examples/textual_inversion)
- [Training DreamBooth with diffusers](https://github.com/huggingface/diffusers/tree/main/examples/dreambooth)
### fastai
- [Distributed training from Jupyter Notebooks with fastai](https://docs.fast.ai/tutorial.distributed.html)
- [Basic distributed training examples with fastai](https://docs.fast.ai/examples/distributed_app_examples.html)
### GradsFlow
- [Auto Image Classification with GradsFlow](https://docs.gradsflow.com/en/latest/examples/nbs/01-ImageClassification/)
### imagen-pytorch
- [Fine-tuning Imagen](https://github.com/lucidrains/imagen-pytorch#usage)
### Kornia
- [Fine-tuning vision models with Kornia's Trainer](https://kornia.readthedocs.io/en/latest/get-started/training.html)
### PyTorch Accelerated
- [Quickstart distributed training tutorial with PyTorch Accelerated](https://pytorch-accelerated.readthedocs.io/en/latest/quickstart.html)
### PyTorch3D
- [Perform Deep Learning with 3D data](https://pytorch3d.org/tutorials/)
### Tez
- [Leaf disease detection with Tez and Accelerate](https://www.kaggle.com/code/abhishek/tez-faster-and-easier-training-for-leaf-detection/notebook)
### trlx
- [How to implement a sentiment learning task with trlx](https://github.com/CarperAI/trlx#example-how-to-add-a-task)

View File

@ -136,7 +136,7 @@ To run it in each of these various modes, use the following commands:
```
- single GPU:
```bash
python ./cv_example.py # from a server with a GPU
python ./nlp_example.py # from a server with a GPU
```
- with fp16 (mixed-precision)
* from any server by passing `fp16=True` to the `Accelerator`.
@ -184,13 +184,6 @@ To run it in each of these various modes, use the following commands:
* In PyTorch:
Add an `xmp.spawn` line in your script as you usually do.
### Simple vision example (GANs)
- [huggan project](https://github.com/huggingface/community-events/tree/main/huggan)
### Using AWS SageMaker integration
- [Examples showcasing AWS SageMaker integration of 🤗 Accelerate.](https://github.com/pacman100/accelerate-aws-sagemaker)
## Finer Examples
While the first two scripts are extremely barebones when it comes to what you can do with accelerate, more advanced features are documented in two other locations.

View File

@ -1,232 +0,0 @@
# Copyright 2022 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import os
import torch
from torch.optim import AdamW
from torch.utils.data import DataLoader
# New Code #
import evaluate
from accelerate import Accelerator, DistributedType
from accelerate.utils import find_executable_batch_size
from datasets import load_dataset
from transformers import AutoModelForSequenceClassification, AutoTokenizer, get_linear_schedule_with_warmup, set_seed
########################################################################
# This is a fully working simple example to use Accelerate,
# specifically showcasing how to combine both the gradient accumulation
# and automatic batch size finder utilities of Accelerate to perform
# automatic gradient accumulation
#
# This example trains a Bert base model on GLUE MRPC
# in any of the following settings (with the same script):
# - single CPU or single GPU
# - multi GPUS (using PyTorch distributed mode)
# - (multi) TPUs
# - fp16 (mixed-precision) or fp32 (normal precision)
#
# New additions from the base script can be found quickly by
# looking for the # New Code # tags
#
# To run it in each of these various modes, follow the instructions
# in the readme for examples:
# https://github.com/huggingface/accelerate/tree/main/examples
#
########################################################################
EVAL_BATCH_SIZE = 32
def get_dataloaders(accelerator: Accelerator, batch_size: int = 16):
"""
Creates a set of `DataLoader`s for the `glue` dataset,
using "bert-base-cased" as the tokenizer.
Args:
accelerator (`Accelerator`):
An `Accelerator` object
batch_size (`int`, *optional*):
The batch size for the train and validation DataLoaders.
"""
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
datasets = load_dataset("glue", "mrpc")
def tokenize_function(examples):
# max_length=None => use the model max length (it's actually the default)
outputs = tokenizer(examples["sentence1"], examples["sentence2"], truncation=True, max_length=None)
return outputs
# Apply the method we just defined to all the examples in all the splits of the dataset
# starting with the main process first:
with accelerator.main_process_first():
tokenized_datasets = datasets.map(
tokenize_function,
batched=True,
remove_columns=["idx", "sentence1", "sentence2"],
)
# We also rename the 'label' column to 'labels' which is the expected name for labels by the models of the
# transformers library
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
def collate_fn(examples):
# On TPU it's best to pad everything to the same length or training will be very slow.
if accelerator.distributed_type == DistributedType.TPU:
return tokenizer.pad(examples, padding="max_length", max_length=128, return_tensors="pt")
return tokenizer.pad(examples, padding="longest", return_tensors="pt")
# Instantiate dataloaders.
train_dataloader = DataLoader(
tokenized_datasets["train"], shuffle=True, collate_fn=collate_fn, batch_size=batch_size
)
eval_dataloader = DataLoader(
tokenized_datasets["validation"], shuffle=False, collate_fn=collate_fn, batch_size=EVAL_BATCH_SIZE
)
return train_dataloader, eval_dataloader
# For testing only
if os.environ.get("TESTING_MOCKED_DATALOADERS", None) == "1":
from accelerate.test_utils.training import mocked_dataloaders
get_dataloaders = mocked_dataloaders # noqa: F811
def training_function(config, args):
# For testing only
if os.environ.get("TESTING_MOCKED_DATALOADERS", None) == "1":
config["num_epochs"] = 2
# Initialize accelerator
accelerator = Accelerator(cpu=args.cpu, mixed_precision=args.mixed_precision)
# Sample hyper-parameters for learning rate, batch size, seed and a few other HPs
lr = config["lr"]
num_epochs = int(config["num_epochs"])
seed = int(config["seed"])
observed_batch_size = int(config["batch_size"])
metric = evaluate.load("glue", "mrpc")
# New Code #
# We use the `find_executable_batch_size` decorator, passing in the desired observed batch size
# to train on. If a CUDA OOM error occurs, it will retry this loop cutting the batch size in
# half each time. From this, we can calculate the number of gradient accumulation steps needed
# and modify the Accelerator object as a result
@find_executable_batch_size(starting_batch_size=int(observed_batch_size))
def inner_training_loop(batch_size):
# Since we need to modify the outside accelerator object, we need to bring it
# to the local scope
nonlocal accelerator
# We can calculate the number of gradient accumulation steps based on the current
# batch size vs the starting batch size
num_gradient_accumulation_steps = observed_batch_size // batch_size
# And then set it in the Accelerator directly:
accelerator.gradient_accumulation_steps = num_gradient_accumulation_steps
# Next we need to free all of the stored model references in the Accelerator each time
accelerator.free_memory()
# And set the seed so our results are reproducible each reset
set_seed(seed)
# Instantiate the model (we build the model here so that the seed also control new weights initialization)
model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", return_dict=True)
# We could avoid this line since the accelerator is set with `device_placement=True` (default value).
# Note that if you are placing tensors on devices manually, this line absolutely needs to be before the optimizer
# creation otherwise training will not work on TPU (`accelerate` will kindly throw an error to make us aware of that).
model = model.to(accelerator.device)
# Instantiate optimizer
optimizer = AdamW(params=model.parameters(), lr=lr)
train_dataloader, eval_dataloader = get_dataloaders(accelerator, batch_size)
# Instantiate scheduler
lr_scheduler = get_linear_schedule_with_warmup(
optimizer=optimizer,
num_warmup_steps=100,
num_training_steps=(len(train_dataloader) * num_epochs),
)
# Prepare everything
# There is no specific order to remember, we just need to unpack the objects in the same order we gave them to the
# prepare method.
model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = accelerator.prepare(
model, optimizer, train_dataloader, eval_dataloader, lr_scheduler
)
# Now we train the model
for epoch in range(num_epochs):
model.train()
for step, batch in enumerate(train_dataloader):
# And perform gradient accumulation
with accelerator.accumulate(model):
# We could avoid this line since we set the accelerator with `device_placement=True`.
batch.to(accelerator.device)
outputs = model(**batch)
loss = outputs.loss
accelerator.backward(loss)
optimizer.step()
lr_scheduler.step()
optimizer.zero_grad()
model.eval()
for step, batch in enumerate(eval_dataloader):
# We could avoid this line since we set the accelerator with `device_placement=True`.
batch.to(accelerator.device)
with torch.no_grad():
outputs = model(**batch)
predictions = outputs.logits.argmax(dim=-1)
predictions, references = accelerator.gather_for_metrics((predictions, batch["labels"]))
metric.add_batch(
predictions=predictions,
references=references,
)
eval_metric = metric.compute()
# Use accelerator.print to print only on the main process.
accelerator.print(f"epoch {epoch}:", eval_metric)
# New Code #
# And call it at the end with no arguments
# Note: You could also refactor this outside of your training loop function
inner_training_loop()
def main():
parser = argparse.ArgumentParser(description="Simple example of training script.")
parser.add_argument(
"--mixed_precision",
type=str,
default="no",
choices=["no", "fp16", "bf16"],
help="Whether to use mixed precision. Choose"
"between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >= 1.10."
"and an Nvidia Ampere GPU.",
)
parser.add_argument("--cpu", action="store_true", help="If passed, will train on the CPU.")
args = parser.parse_args()
# New Code #
# We modify the starting batch size to be an observed batch size of 256, to guarantee an initial CUDA OOM
config = {"lr": 2e-5, "num_epochs": 3, "seed": 42, "batch_size": 256}
training_function(config, args)
if __name__ == "__main__":
main()

View File

@ -58,7 +58,7 @@ MAX_GPU_BATCH_SIZE = 16
EVAL_BATCH_SIZE = 32
# New Code #
# We need a different `get_dataloaders` function that will build dataloaders by index
# We need a different `get_dataloaders` function that will build dataloaders by indexs
def get_fold_dataloaders(
@ -71,9 +71,9 @@ def get_fold_dataloaders(
accelerator (`Accelerator`):
The main `Accelerator` object
train_idxs (list of `int`):
The split indices for the training dataset
The split indicies for the training dataset
valid_idxs (list of `int`):
The split indices for the validation dataset
The split indicies for the validation dataset
batch_size (`int`):
The size of the minibatch. Default is 16
"""

View File

@ -525,7 +525,7 @@ def main():
},
]
# New Code #
# Creates Dummy Optimizer if `optimizer` was specified in the config file else creates Adam Optimizer
# Creates Dummy Optimizer if `optimizer` was spcified in the config file else creates Adam Optimizer
optimizer_cls = (
torch.optim.AdamW
if accelerator.state.deepspeed_plugin is None
@ -554,7 +554,7 @@ def main():
args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
# New Code #
# Creates Dummy Scheduler if `scheduler` was specified in the config file else creates `args.lr_scheduler_type` Scheduler
# Creates Dummy Scheduler if `scheduler` was spcified in the config file else creates `args.lr_scheduler_type` Scheduler
if (
accelerator.state.deepspeed_plugin is None
or "scheduler" not in accelerator.state.deepspeed_plugin.deepspeed_config
@ -588,12 +588,14 @@ def main():
checkpointing_steps = None
# We need to initialize the trackers we use, and also store our configuration.
# The trackers initializes automatically on the main process.
# We initialize the trackers only on main process because `accelerator.log`
# only logs on main process and we don't want empty logs/runs on other processes.
if args.with_tracking:
experiment_config = vars(args)
# TensorBoard cannot log Enums, need the raw value
experiment_config["lr_scheduler_type"] = experiment_config["lr_scheduler_type"].value
accelerator.init_trackers("clm_no_trainer", experiment_config)
if accelerator.is_main_process:
experiment_config = vars(args)
# TensorBoard cannot log Enums, need the raw value
experiment_config["lr_scheduler_type"] = experiment_config["lr_scheduler_type"].value
accelerator.init_trackers("clm_no_trainer", experiment_config)
# Train!
total_batch_size = args.per_device_train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps

View File

@ -113,7 +113,7 @@ def training_function(config, args):
batch_size = int(config["batch_size"])
# We need to initialize the trackers we use, and also store our configuration
if args.with_tracking:
if args.with_tracking and accelerator.is_main_process:
experiment_config = vars(args)
accelerator.init_trackers("fsdp_glue_no_trainer", experiment_config)

View File

@ -29,7 +29,7 @@ from transformers import AutoModelForSequenceClassification, AutoTokenizer, get_
########################################################################
# This is a fully working simple example to use Accelerate,
# specifically showcasing how to ensure out-of-memory errors never
# interrupt training, and builds off the `nlp_example.py` script.
# iterrupt training, and builds off the `nlp_example.py` script.
#
# This example trains a Bert base model on GLUE MRPC
# in any of the following settings (with the same script):
@ -122,6 +122,24 @@ def training_function(config, args):
metric = evaluate.load("glue", "mrpc")
# If the batch size is too big we use gradient accumulation
gradient_accumulation_steps = 1
if batch_size > MAX_GPU_BATCH_SIZE and accelerator.distributed_type != DistributedType.TPU:
gradient_accumulation_steps = batch_size // MAX_GPU_BATCH_SIZE
batch_size = MAX_GPU_BATCH_SIZE
set_seed(seed)
# Instantiate the model (we build the model here so that the seed also control new weights initialization)
model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", return_dict=True)
# We could avoid this line since the accelerator is set with `device_placement=True` (default value).
# Note that if you are placing tensors on devices manually, this line absolutely needs to be before the optimizer
# creation otherwise training will not work on TPU (`accelerate` will kindly throw an error to make us aware of that).
model = model.to(accelerator.device)
# Instantiate optimizer
optimizer = AdamW(params=model.parameters(), lr=lr)
# New Code #
# We now can define an inner training loop function. It should take a batch size as the only parameter,
# and build the dataloaders in there.
@ -129,31 +147,16 @@ def training_function(config, args):
@find_executable_batch_size(starting_batch_size=batch_size)
def inner_training_loop(batch_size):
# And now just move everything below under this function
# We need to bring in the Accelerator object from earlier
nonlocal accelerator
# And reset all of its attributes that could hold onto any memory:
accelerator.free_memory()
# Then we can declare the model, optimizer, and everything else:
set_seed(seed)
# Instantiate the model (we build the model here so that the seed also control new weights initialization)
model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", return_dict=True)
# We could avoid this line since the accelerator is set with `device_placement=True` (default value).
# Note that if you are placing tensors on devices manually, this line absolutely needs to be before the optimizer
# creation otherwise training will not work on TPU (`accelerate` will kindly throw an error to make us aware of that).
model = model.to(accelerator.device)
# Instantiate optimizer
optimizer = AdamW(params=model.parameters(), lr=lr)
# Ensure that anything declared outside this function is set as `nonlocal`
# so it is in scope
nonlocal model, optimizer
train_dataloader, eval_dataloader = get_dataloaders(accelerator, batch_size)
# Instantiate scheduler
lr_scheduler = get_linear_schedule_with_warmup(
optimizer=optimizer,
num_warmup_steps=100,
num_training_steps=(len(train_dataloader) * num_epochs),
num_training_steps=(len(train_dataloader) * num_epochs) // gradient_accumulation_steps,
)
# Prepare everything
@ -171,10 +174,12 @@ def training_function(config, args):
batch.to(accelerator.device)
outputs = model(**batch)
loss = outputs.loss
loss = loss / gradient_accumulation_steps
accelerator.backward(loss)
optimizer.step()
lr_scheduler.step()
optimizer.zero_grad()
if step % gradient_accumulation_steps == 0:
optimizer.step()
lr_scheduler.step()
optimizer.zero_grad()
model.eval()
for step, batch in enumerate(eval_dataloader):

View File

@ -166,8 +166,8 @@ def training_function(config, args):
)
# New Code #
# We need to initialize the trackers we use. Overall configurations can also be stored
if args.with_tracking:
# We need to initalize the trackers we use. Overall configurations can also be stored
if args.with_tracking and accelerator.is_main_process:
run = os.path.split(__file__)[-1].split(".")[0]
accelerator.init_trackers(run, config)

View File

@ -103,8 +103,10 @@ def training_function(config, args):
checkpointing_steps = None
# We need to initialize the trackers we use, and also store our configuration
if args.with_tracking:
if args.with_tracking and accelerator.is_main_process:
run = os.path.split(__file__)[-1].split(".")[0]
if args.logging_dir:
run = os.path.join(args.logging_dir, run)
accelerator.init_trackers(run, config)
# Grab all the image filenames

View File

@ -75,8 +75,10 @@ def training_function(config, args):
batch_size = int(config["batch_size"])
# We need to initialize the trackers we use, and also store our configuration
if args.with_tracking:
if args.with_tracking and accelerator.is_main_process:
run = os.path.split(__file__)[-1].split(".")[0]
if args.logging_dir:
run = os.path.join(args.logging_dir, run)
accelerator.init_trackers(run, config)
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

View File

@ -1,108 +0,0 @@
# Copyright 2022 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from manim import *
class Stage1(Scene):
def construct(self):
mem = Rectangle(height=0.5,width=0.5)
fill = Rectangle(height=0.46,width=0.46).set_stroke(width=0)
cpu_left_col_base = [mem.copy() for i in range(6)]
cpu_right_col_base = [mem.copy() for i in range(6)]
cpu_left_col = VGroup(*cpu_left_col_base).arrange(UP, buff=0)
cpu_right_col = VGroup(*cpu_right_col_base).arrange(UP, buff=0)
cpu_rects = VGroup(cpu_left_col,cpu_right_col).arrange(RIGHT, buff=0)
cpu_text = Text("CPU", font_size=24)
cpu = Group(cpu_rects,cpu_text).arrange(DOWN, buff=0.5, aligned_edge=DOWN)
cpu.move_to([-2.5,-.5,0])
self.add(cpu)
gpu_base = [mem.copy() for i in range(1)]
gpu_rect = VGroup(*gpu_base).arrange(UP,buff=0)
gpu_text = Text("GPU", font_size=24)
gpu = Group(gpu_rect,gpu_text).arrange(DOWN, buff=0.5, aligned_edge=DOWN)
gpu.align_to(cpu, DOWN)
gpu.set_x(gpu.get_x() - 1)
self.add(gpu)
model_base = [mem.copy() for i in range(6)]
model_rect = VGroup(*model_base).arrange(RIGHT,buff=0)
model_text = Text("Model", font_size=24)
model = Group(model_rect,model_text).arrange(DOWN, buff=0.5, aligned_edge=DOWN)
model.move_to([3, -1., 0])
self.play(
Create(cpu_left_col, run_time=1),
Create(cpu_right_col, run_time=1),
Create(gpu_rect, run_time=1),
)
step_1 = MarkupText(
f"First, an empty model skeleton is loaded\ninto <span fgcolor='{YELLOW}'>memory</span> without using much RAM.",
font_size=24
)
key = Square(side_length=2.2)
key.move_to([-5, 2, 0])
key_text = MarkupText(
f"<b>Key:</b>\n\n<span fgcolor='{YELLOW}'>●</span> Empty Model",
font_size=18,
)
key_text.move_to([-5, 2.4, 0])
step_1.move_to([2, 2, 0])
self.play(
Write(step_1, run_time=2.5),
Write(key_text),
Write(key)
)
self.add(model)
cpu_targs = []
first_animations = []
second_animations = []
for i,rect in enumerate(model_base):
cpu_target = Rectangle(height=0.46,width=0.46).set_stroke(width=0.).set_fill(YELLOW, opacity=0.7)
cpu_target.move_to(rect)
cpu_target.generate_target()
cpu_target.target.height = 0.46/4
cpu_target.target.width = 0.46/3
if i == 0:
cpu_target.target.next_to(cpu_left_col_base[0].get_corner(DOWN+LEFT), buff=0.02, direction=UP)
cpu_target.target.set_x(cpu_target.target.get_x()+0.1)
elif i == 3:
cpu_target.target.next_to(cpu_targs[0].target, direction=UP, buff=0.)
else:
cpu_target.target.next_to(cpu_targs[i-1].target, direction=RIGHT, buff=0.)
cpu_targs.append(cpu_target)
first_animations.append(rect.animate(run_time=0.5).set_stroke(YELLOW))
second_animations.append(MoveToTarget(cpu_target, run_time=1.5))
self.play(*first_animations)
self.play(*second_animations)
self.wait()

View File

@ -1,126 +0,0 @@
# Copyright 2022 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from manim import *
class Stage2(Scene):
def construct(self):
mem = Rectangle(height=0.5,width=0.5)
fill = Rectangle(height=0.46,width=0.46).set_stroke(width=0)
cpu_left_col_base = [mem.copy() for i in range(6)]
cpu_right_col_base = [mem.copy() for i in range(6)]
cpu_left_col = VGroup(*cpu_left_col_base).arrange(UP, buff=0)
cpu_right_col = VGroup(*cpu_right_col_base).arrange(UP, buff=0)
cpu_rects = VGroup(cpu_left_col,cpu_right_col).arrange(RIGHT, buff=0)
cpu_text = Text("CPU", font_size=24)
cpu = Group(cpu_rects,cpu_text).arrange(DOWN, buff=0.5, aligned_edge=DOWN)
cpu.move_to([-2.5,-.5,0])
self.add(cpu)
gpu_base = [mem.copy() for i in range(4)]
gpu_rect = VGroup(*gpu_base).arrange(UP,buff=0)
gpu_text = Text("GPU", font_size=24)
gpu = Group(gpu_rect,gpu_text).arrange(DOWN, buff=0.5, aligned_edge=DOWN)
gpu.move_to([-1,-1,0])
self.add(gpu)
model_base = [mem.copy() for i in range(6)]
model_rect = VGroup(*model_base).arrange(RIGHT,buff=0)
model_text = Text("Model", font_size=24)
model = Group(model_rect,model_text).arrange(DOWN, buff=0.5, aligned_edge=DOWN)
model.move_to([3, -1., 0])
self.add(model)
cpu_targs = []
for i,rect in enumerate(model_base):
rect.set_stroke(YELLOW)
# target = fill.copy().set_fill(YELLOW, opacity=0.7)
# target.move_to(rect)
# self.add(target)
cpu_target = Rectangle(height=0.46/4,width=0.46/3).set_stroke(width=0.).set_fill(YELLOW, opacity=0.7)
if i == 0:
cpu_target.next_to(cpu_left_col_base[0].get_corner(DOWN+LEFT), buff=0.02, direction=UP)
cpu_target.set_x(cpu_target.get_x()+0.1)
elif i == 3:
cpu_target.next_to(cpu_targs[0], direction=UP, buff=0.)
else:
cpu_target.next_to(cpu_targs[i-1], direction=RIGHT, buff=0.)
self.add(cpu_target)
cpu_targs.append(cpu_target)
checkpoint_base = [mem.copy() for i in range(6)]
checkpoint_rect = VGroup(*checkpoint_base).arrange(RIGHT,buff=0)
checkpoint_text = Text("Loaded Checkpoint", font_size=24)
checkpoint = Group(checkpoint_rect,checkpoint_text).arrange(DOWN, aligned_edge=DOWN, buff=0.4)
checkpoint.move_to([3, .5, 0])
key = Square(side_length=2.2)
key.move_to([-5, 2, 0])
key_text = MarkupText(
f"<b>Key:</b>\n\n<span fgcolor='{YELLOW}'>●</span> Empty Model",
font_size=18,
)
key_text.move_to([-5, 2.4, 0])
self.add(key_text, key)
blue_text = MarkupText(
f"<span fgcolor='{BLUE}'>●</span> Checkpoint",
font_size=18,
)
blue_text.next_to(key_text, DOWN*2.4, aligned_edge=key_text.get_left())
step_2 = MarkupText(
f'Next, a <i><span fgcolor="{BLUE}">second</span></i> model is loaded into memory,\nwith the weights of a <span fgcolor="{BLUE}">single shard</span>.',
font_size=24
)
step_2.move_to([2, 2, 0])
self.play(
Write(step_2),
Write(blue_text)
)
self.play(
Write(checkpoint_text, run_time=1),
Create(checkpoint_rect, run_time=1)
)
first_animations = []
second_animations = []
for i,rect in enumerate(checkpoint_base):
target = fill.copy().set_fill(BLUE, opacity=0.7)
target.move_to(rect)
first_animations.append(GrowFromCenter(target, run_time=1))
cpu_target = target.copy()
cpu_target.generate_target()
if i < 5:
cpu_target.target.move_to(cpu_left_col_base[i+1])
else:
cpu_target.target.move_to(cpu_right_col_base[i-5])
second_animations.append(MoveToTarget(cpu_target, run_time=1.5))
self.play(*first_animations)
self.play(*second_animations)
self.wait()

View File

@ -1,158 +0,0 @@
# Copyright 2022 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from manim import *
class Stage3(Scene):
def construct(self):
mem = Rectangle(height=0.5,width=0.5)
meta_mem = Rectangle(height=0.25,width=0.25)
fill = Rectangle(height=0.46,width=0.46).set_stroke(width=0)
cpu_left_col_base = [mem.copy() for i in range(6)]
cpu_right_col_base = [mem.copy() for i in range(6)]
cpu_left_col = VGroup(*cpu_left_col_base).arrange(UP, buff=0)
cpu_right_col = VGroup(*cpu_right_col_base).arrange(UP, buff=0)
cpu_rects = VGroup(cpu_left_col,cpu_right_col).arrange(RIGHT, buff=0)
cpu_text = Text("CPU", font_size=24)
cpu = Group(cpu_rects,cpu_text).arrange(DOWN, buff=0.5, aligned_edge=DOWN)
cpu.move_to([-2.5,-.5,0])
self.add(cpu)
gpu_base = [mem.copy() for i in range(4)]
gpu_rect = VGroup(*gpu_base).arrange(UP,buff=0)
gpu_text = Text("GPU", font_size=24)
gpu = Group(gpu_rect,gpu_text).arrange(DOWN, buff=0.5, aligned_edge=DOWN)
gpu.move_to([-1,-1,0])
self.add(gpu)
model_base = [mem.copy() for i in range(6)]
model_rect = VGroup(*model_base).arrange(RIGHT,buff=0)
model_text = Text("Model", font_size=24)
model = Group(model_rect,model_text).arrange(DOWN, buff=0.5, aligned_edge=DOWN)
model.move_to([3, -1., 0])
self.add(model)
model_arr = []
model_cpu_arr = []
model_meta_arr = []
for i,rect in enumerate(model_base):
rect.set_stroke(YELLOW)
cpu_target = Rectangle(height=0.46/4,width=0.46/3).set_stroke(width=0.).set_fill(YELLOW, opacity=0.7)
if i == 0:
cpu_target.next_to(cpu_left_col_base[0].get_corner(DOWN+LEFT), buff=0.02, direction=UP)
cpu_target.set_x(cpu_target.get_x()+0.1)
elif i == 3:
cpu_target.next_to(model_cpu_arr[0], direction=UP, buff=0.)
else:
cpu_target.next_to(model_cpu_arr[i-1], direction=RIGHT, buff=0.)
self.add(cpu_target)
model_cpu_arr.append(cpu_target)
self.add(*model_arr, *model_cpu_arr, *model_meta_arr)
checkpoint_base = [mem.copy() for i in range(6)]
checkpoint_rect = VGroup(*checkpoint_base).arrange(RIGHT,buff=0)
checkpoint_text = Text("Loaded Checkpoint", font_size=24)
checkpoint = Group(checkpoint_rect,checkpoint_text).arrange(DOWN, buff=0.5, aligned_edge=DOWN)
checkpoint.move_to([3, .5, 0])
self.add(checkpoint)
ckpt_arr = []
ckpt_cpu_arr = []
for i,rect in enumerate(checkpoint_base):
target = fill.copy().set_fill(BLUE, opacity=0.7)
target.move_to(rect)
ckpt_arr.append(target)
cpu_target = target.copy()
if i < 5:
cpu_target.move_to(cpu_left_col_base[i+1])
else:
cpu_target.move_to(cpu_right_col_base[i-5])
ckpt_cpu_arr.append(cpu_target)
self.add(*ckpt_arr, *ckpt_cpu_arr)
key = Square(side_length=2.2)
key.move_to([-5, 2, 0])
key_text = MarkupText(
f"<b>Key:</b>\n\n<span fgcolor='{YELLOW}'>●</span> Empty Model",
font_size=18,
)
key_text.move_to([-5, 2.4, 0])
self.add(key_text, key)
blue_text = MarkupText(
f"<span fgcolor='{BLUE}'>●</span> Checkpoint",
font_size=18,
)
blue_text.next_to(key_text, DOWN*2.4, aligned_edge=key_text.get_left())
self.add(blue_text)
step_3 = MarkupText(
f'Based on the passed in configuration, weights are stored in\na variety of np.memmaps on disk or to a particular device.',
font_size=24
)
step_3.move_to([2, 2, 0])
disk_left_col_base = [meta_mem.copy() for i in range(6)]
disk_right_col_base = [meta_mem.copy() for i in range(6)]
disk_left_col = VGroup(*disk_left_col_base).arrange(UP, buff=0)
disk_right_col = VGroup(*disk_right_col_base).arrange(UP, buff=0)
disk_rects = VGroup(disk_left_col,disk_right_col).arrange(RIGHT, buff=0)
disk_text = Text("Disk", font_size=24)
disk = Group(disk_rects,disk_text).arrange(DOWN, buff=0.5, aligned_edge=DOWN)
disk.move_to([-4.,-1.25,0])
self.play(
Write(step_3, run_time=3),
Write(disk_text, run_time=1),
Create(disk_rects, run_time=1)
)
animations = []
for i,rect in enumerate(ckpt_cpu_arr):
target = rect.copy()
target.generate_target()
target.target.move_to(disk_left_col_base[i]).scale(0.5)
animations.append(MoveToTarget(target, run_time=1.5))
self.play(*animations)
self.play(FadeOut(step_3))
step_4 = MarkupText(
f'Then, the checkpoint is removed from memory\nthrough garbage collection.',
font_size=24
)
step_4.move_to([2, 2, 0])
self.play(
Write(step_4, run_time=3)
)
self.play(
FadeOut(checkpoint_rect, checkpoint_text, *ckpt_arr, *ckpt_cpu_arr),
)
self.wait()
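
Stage 3's narration covers the offload step: depending on the computed device map, shard weights are written to disk as `np.memmap`s (or placed on a device), after which the in-memory checkpoint can be garbage-collected. A minimal sketch of the disk part, assuming `accelerate` is installed; the folder name and tensors are placeholders:

```python
import gc

import torch

from accelerate.utils import offload_state_dict

state_dict = {"weight": torch.randn(1024, 1024), "bias": torch.randn(1024)}

# Writes one .dat file per tensor (read back later through np.memmap) plus an
# index.json describing shapes and dtypes.
offload_state_dict("offload_dir", state_dict)

# Once on disk, the CPU copy is no longer needed and can be collected.
del state_dict
gc.collect()
```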

View File

@ -1,156 +0,0 @@
# Copyright 2022 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from manim import *
class Stage4(Scene):
def construct(self):
mem = Rectangle(height=0.5,width=0.5)
fill = Rectangle(height=0.46,width=0.46).set_stroke(width=0)
meta_mem = Rectangle(height=0.25,width=0.25)
cpu_left_col_base = [mem.copy() for i in range(6)]
cpu_right_col_base = [mem.copy() for i in range(6)]
cpu_left_col = VGroup(*cpu_left_col_base).arrange(UP, buff=0)
cpu_right_col = VGroup(*cpu_right_col_base).arrange(UP, buff=0)
cpu_rects = VGroup(cpu_left_col,cpu_right_col).arrange(RIGHT, buff=0)
cpu_text = Text("CPU", font_size=24)
cpu = Group(cpu_rects,cpu_text).arrange(DOWN, buff=0.5, aligned_edge=DOWN)
cpu.move_to([-2.5,-.5,0])
self.add(cpu)
gpu_base = [mem.copy() for i in range(4)]
gpu_rect = VGroup(*gpu_base).arrange(UP,buff=0)
gpu_text = Text("GPU", font_size=24)
gpu = Group(gpu_rect,gpu_text).arrange(DOWN, buff=0.5, aligned_edge=DOWN)
gpu.move_to([-1,-1,0])
self.add(gpu)
model_base = [mem.copy() for i in range(6)]
model_rect = VGroup(*model_base).arrange(RIGHT,buff=0)
model_text = Text("Model", font_size=24)
model = Group(model_rect,model_text).arrange(DOWN, buff=0.5, aligned_edge=DOWN)
model.move_to([3, -1., 0])
self.add(model)
model_cpu_arr = []
model_meta_arr = []
for i,rect in enumerate(model_base):
rect.set_stroke(YELLOW)
cpu_target = Rectangle(height=0.46/4,width=0.46/3).set_stroke(width=0.).set_fill(YELLOW, opacity=0.7)
if i == 0:
cpu_target.next_to(cpu_left_col_base[0].get_corner(DOWN+LEFT), buff=0.02, direction=UP)
cpu_target.set_x(cpu_target.get_x()+0.1)
elif i == 3:
cpu_target.next_to(model_cpu_arr[0], direction=UP, buff=0.)
else:
cpu_target.next_to(model_cpu_arr[i-1], direction=RIGHT, buff=0.)
self.add(cpu_target)
model_cpu_arr.append(cpu_target)
self.add(*model_cpu_arr, *model_meta_arr)
disk_left_col_base = [meta_mem.copy() for i in range(6)]
disk_right_col_base = [meta_mem.copy() for i in range(6)]
disk_left_col = VGroup(*disk_left_col_base).arrange(UP, buff=0)
disk_right_col = VGroup(*disk_right_col_base).arrange(UP, buff=0)
disk_rects = VGroup(disk_left_col,disk_right_col).arrange(RIGHT, buff=0)
disk_text = Text("Disk", font_size=24)
disk = Group(disk_rects,disk_text).arrange(DOWN, buff=0.5, aligned_edge=DOWN)
disk.move_to([-4.,-1.25,0])
self.add(disk_text, disk_rects)
cpu_disk_arr = []
for i in range(6):
target = fill.copy().set_fill(BLUE, opacity=0.8)
target.move_to(disk_left_col_base[i]).scale(0.5)
cpu_disk_arr.append(target)
self.add(*cpu_disk_arr)
key = Square(side_length=2.2)
key.move_to([-5, 2, 0])
key_text = MarkupText(
f"<b>Key:</b>\n\n<span fgcolor='{YELLOW}'>●</span> Empty Model",
font_size=18,
)
key_text.move_to([-5, 2.4, 0])
self.add(key_text, key)
blue_text = MarkupText(
f"<span fgcolor='{BLUE}'>●</span> Checkpoint",
font_size=18,
)
blue_text.next_to(key_text, DOWN*2.4, aligned_edge=key_text.get_left())
self.add(blue_text)
step_5 = MarkupText(
f'The offloaded weights are all sent to the CPU.',
font_size=24
)
step_5.move_to([2, 2, 0])
self.play(Write(step_5, run_time=3))
for i in range(6):
rect = cpu_disk_arr[i]
cp2 = rect.copy().set_fill(BLUE, opacity=0.8).scale(2.0)
cp2.generate_target()
cp2.target.move_to(model_base[i])
if i == 0:
rect.set_fill(BLUE, opacity=0.8)
rect.generate_target()
rect.target.move_to(cpu_left_col_base[0]).scale(2.0)
self.remove(*model_meta_arr,
*model_cpu_arr,
)
else:
rect.generate_target()
rect.target.move_to(cpu_left_col_base[i]).scale(2.0)
self.play(
MoveToTarget(rect),
MoveToTarget(cp2),
model_base[i].animate.set_stroke(WHITE)
)
self.play(FadeOut(step_5))
step_5 = MarkupText(
f'Finally, hooks are added to each weight in the model\nto transfer the weights from CPU to GPU\n\t\tand back when needed.',
font_size=24
)
step_5.move_to([2, 2, 0])
self.play(Write(step_5, run_time=3))
arrows = []
animations = []
for i in range(6):
a = Arrow(start=UP, end=DOWN, color=RED, buff=.5)
a.next_to(model_base[i].get_left(), UP, buff=0.2)
arrows.append(a)
animations.append(Write(a))
self.play(*animations)
self.wait()
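
Stage 4 describes the pure CPU-offload variant: all weights stay in CPU RAM and hooks stream each layer's weights to the GPU for its forward pass and back again. A minimal sketch using `cpu_offload`, assuming `accelerate` is installed and a CUDA device is available; the toy model is a placeholder:

```python
import torch
import torch.nn as nn

from accelerate import cpu_offload

model = nn.Sequential(nn.Linear(512, 512), nn.ReLU(), nn.Linear(512, 512))

# Attaches hooks so each submodule's weights are copied to the execution device
# right before its forward pass and dropped back to CPU afterwards.
cpu_offload(model, execution_device=torch.device("cuda:0"))

with torch.no_grad():
    out = model(torch.randn(1, 512))  # input can stay on CPU; hooks handle placement
```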

View File

@ -1,221 +0,0 @@
# Copyright 2022 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from manim import *
class Stage5(Scene):
def construct(self):
mem = Rectangle(height=0.5,width=0.5)
fill = Rectangle(height=0.46,width=0.46).set_stroke(width=0)
meta_mem = Rectangle(height=0.25,width=0.25)
cpu_left_col_base = [mem.copy() for i in range(6)]
cpu_right_col_base = [mem.copy() for i in range(6)]
cpu_left_col = VGroup(*cpu_left_col_base).arrange(UP, buff=0)
cpu_right_col = VGroup(*cpu_right_col_base).arrange(UP, buff=0)
cpu_rects = VGroup(cpu_left_col,cpu_right_col).arrange(RIGHT, buff=0)
cpu_text = Text("CPU", font_size=24)
cpu = Group(cpu_rects,cpu_text).arrange(DOWN, buff=0.5, aligned_edge=DOWN)
cpu.move_to([-2.5,-.5,0])
self.add(cpu)
gpu_base = [mem.copy() for i in range(4)]
gpu_rect = VGroup(*gpu_base).arrange(UP,buff=0)
gpu_text = Text("GPU", font_size=24)
gpu = Group(gpu_rect,gpu_text).arrange(DOWN, buff=0.5, aligned_edge=DOWN)
gpu.move_to([-1,-1,0])
self.add(gpu)
model_base = [mem.copy() for i in range(6)]
model_rect = VGroup(*model_base).arrange(RIGHT,buff=0)
model_text = Text("Model", font_size=24)
model = Group(model_rect,model_text).arrange(DOWN, buff=0.5, aligned_edge=DOWN)
model.move_to([3, -1., 0])
self.add(model)
model_arr = []
model_cpu_arr = []
for i,rect in enumerate(model_base):
target = fill.copy().set_fill(BLUE, opacity=0.8)
target.move_to(rect)
model_arr.append(target)
cpu_target = Rectangle(height=0.46,width=0.46).set_stroke(width=0.).set_fill(BLUE, opacity=0.8)
cpu_target.move_to(cpu_left_col_base[i])
model_cpu_arr.append(cpu_target)
self.add(*model_arr, *model_cpu_arr)
disk_left_col_base = [meta_mem.copy() for i in range(6)]
disk_right_col_base = [meta_mem.copy() for i in range(6)]
disk_left_col = VGroup(*disk_left_col_base).arrange(UP, buff=0)
disk_right_col = VGroup(*disk_right_col_base).arrange(UP, buff=0)
disk_rects = VGroup(disk_left_col,disk_right_col).arrange(RIGHT, buff=0)
disk_text = Text("Disk", font_size=24)
disk = Group(disk_rects,disk_text).arrange(DOWN, buff=0.5, aligned_edge=DOWN)
disk.move_to([-4,-1.25,0])
self.add(disk_text, disk_rects)
key = Square(side_length=2.2)
key.move_to([-5, 2, 0])
key_text = MarkupText(
f"<b>Key:</b>\n\n<span fgcolor='{YELLOW}'>●</span> Empty Model",
font_size=18,
)
key_text.move_to([-5, 2.4, 0])
self.add(key_text, key)
blue_text = MarkupText(
f"<span fgcolor='{BLUE}'>●</span> Checkpoint",
font_size=18,
)
blue_text.next_to(key_text, DOWN*2.4, aligned_edge=key_text.get_left())
self.add(blue_text)
step_6 = MarkupText(
f'Now watch as an input is passed through the model\nand how the memory is utilized and handled.',
font_size=24
)
step_6.move_to([2, 2, 0])
self.play(Write(step_6))
input = Square(0.3)
input.set_fill(RED, opacity=1.)
input.set_stroke(width=0.)
input.next_to(model_base[0], LEFT, buff=.5)
self.play(Write(input))
input.generate_target()
input.target.next_to(model_arr[0], direction=LEFT, buff=0.02)
self.play(MoveToTarget(input))
self.play(FadeOut(step_6))
a = Arrow(start=UP, end=DOWN, color=RED, buff=.5)
a.next_to(model_arr[0].get_left(), UP, buff=0.2)
model_cpu_arr[0].generate_target()
model_cpu_arr[0].target.move_to(gpu_rect[0])
step_7 = MarkupText(
f'As the input reaches a layer, the hook triggers\nand weights are moved from the CPU\nto the GPU and back.',
font_size=24
)
step_7.move_to([2, 2, 0])
self.play(Write(step_7, run_time=3))
circ_kwargs = {"run_time":1, "fade_in":True, "fade_out":True, "buff":0.02}
self.play(
Write(a),
Circumscribe(model_arr[0], color=ORANGE, **circ_kwargs),
Circumscribe(model_cpu_arr[0], color=ORANGE, **circ_kwargs),
Circumscribe(gpu_rect[0], color=ORANGE, **circ_kwargs),
)
self.play(
MoveToTarget(model_cpu_arr[0])
)
a_c = a.copy()
for i in range(6):
a_c.next_to(model_arr[i].get_right()+0.02, UP, buff=0.2)
input.generate_target()
input.target.move_to(model_arr[i].get_right()+0.02)
grp = AnimationGroup(
FadeOut(a, run_time=.5),
MoveToTarget(input, run_time=.5),
FadeIn(a_c, run_time=.5),
lag_ratio=0.2
)
self.play(grp)
model_cpu_arr[i].generate_target()
model_cpu_arr[i].target.move_to(cpu_left_col_base[i])
if i < 5:
model_cpu_arr[i+1].generate_target()
model_cpu_arr[i+1].target.move_to(gpu_rect[0])
if i >= 1:
circ_kwargs["run_time"] = .7
self.play(
Circumscribe(model_arr[i], **circ_kwargs),
Circumscribe(cpu_left_col_base[i], **circ_kwargs),
Circumscribe(cpu_left_col_base[i+1], color=ORANGE, **circ_kwargs),
Circumscribe(gpu_rect[0], color=ORANGE, **circ_kwargs),
Circumscribe(model_arr[i+1], color=ORANGE, **circ_kwargs),
)
if i < 1:
self.play(
MoveToTarget(model_cpu_arr[i]),
MoveToTarget(model_cpu_arr[i+1]),
)
else:
self.play(
MoveToTarget(model_cpu_arr[i], run_time=.7),
MoveToTarget(model_cpu_arr[i+1], run_time=.7),
)
else:
model_cpu_arr[i].generate_target()
model_cpu_arr[i].target.move_to(cpu_left_col_base[-1])
input.generate_target()
input.target.next_to(model_arr[-1].get_right(), RIGHT+0.02, buff=0.2)
self.play(
Circumscribe(model_arr[-1], color=ORANGE, **circ_kwargs),
Circumscribe(cpu_left_col_base[-1], color=ORANGE, **circ_kwargs),
Circumscribe(gpu_rect[0], color=ORANGE, **circ_kwargs),
)
self.play(
MoveToTarget(model_cpu_arr[i])
)
a = a_c
a_c = a_c.copy()
input.generate_target()
input.target.next_to(model_base[-1], RIGHT+0.02, buff=.5)
self.play(
FadeOut(step_7),
FadeOut(a, run_time=.5),
)
step_8 = MarkupText(
f'Inference on a model too large for GPU memory\nis successfully completed.', font_size=24
)
step_8.move_to([2, 2, 0])
self.play(
Write(step_8, run_time=3),
MoveToTarget(input)
)
self.wait()
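
Stage 5 ties it all together: inference runs on a model too large for GPU memory because the hooks shuttle weights on and off the device as the input flows through. An end-to-end sketch with `load_checkpoint_and_dispatch`, assuming `accelerate` is installed and a GPU is available; the model and checkpoint path are placeholders:

```python
import torch
import torch.nn as nn

from accelerate import init_empty_weights, load_checkpoint_and_dispatch

with init_empty_weights():
    model = nn.Sequential(*[nn.Linear(4096, 4096) for _ in range(8)])

# Splits the checkpoint across GPU, CPU and (if needed) disk based on available
# memory, and attaches the hooks shown in the animation.
model = load_checkpoint_and_dispatch(
    model,
    "sharded-checkpoint/",
    device_map="auto",
    offload_folder="offload_dir",
)

with torch.no_grad():
    output = model(torch.randn(1, 4096))
```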

View File

@ -21,10 +21,9 @@ extras["docs"] = []
extras["test_prod"] = ["pytest", "pytest-xdist", "pytest-subtests", "parameterized"]
extras["test_dev"] = ["datasets", "evaluate", "transformers", "scipy", "sklearn", "deepspeed<0.7.0", "tqdm"]
extras["testing"] = extras["test_prod"] + extras["test_dev"]
extras["rich"] = ["rich"]
extras["test_trackers"] = ["wandb", "comet-ml", "tensorboard"]
extras["dev"] = extras["quality"] + extras["testing"] + extras["rich"]
extras["dev"] = extras["quality"] + extras["testing"]
extras["sagemaker"] = [
"sagemaker", # boto3 is a required package in sagemaker
@ -32,7 +31,7 @@ extras["sagemaker"] = [
setup(
name="accelerate",
version="0.13.1",
version="0.12.0",
description="Accelerate",
long_description=open("README.md", "r", encoding="utf-8").read(),
long_description_content_type="text/markdown",

View File

@ -2,7 +2,7 @@
# There's no way to ignore "F401 '...' imported but unused" warnings in this
# module, but to preserve other warnings. So, don't check this module at all.
__version__ = "0.13.1"
__version__ = "0.12.0"
from .accelerator import Accelerator
from .big_modeling import cpu_offload, disk_offload, dispatch_model, init_empty_weights, load_checkpoint_and_dispatch
@ -16,11 +16,6 @@ from .utils import (
InitProcessGroupKwargs,
find_executable_batch_size,
infer_auto_device_map,
is_rich_available,
load_checkpoint_in_model,
synchronize_rng_states,
)
if is_rich_available():
from .utils import rich

View File

@ -125,8 +125,8 @@ class Accelerator:
- `"tensorboard"`
- `"wandb"`
- `"comet_ml"`
If `"all"` is selected, will pick up all available trackers in the environment and initialize them. Can
also accept implementations of `GeneralTracker` for custom trackers, and can be combined with `"all"`.
If `"all`" is selected, will pick up all available trackers in the environment and intialize them. Can also
accept implementations of `GeneralTracker` for custom trackers, and can be combined with `"all"`.
logging_dir (`str`, `os.PathLike`, *optional*):
A path to a directory for storing logs of locally-compatible loggers.
dispatch_batches (`bool`, *optional*):
@ -140,7 +140,7 @@ class Accelerator:
A list of `KwargHandler` to customize how the objects related to distributed training or mixed precision
are created. See [kwargs](kwargs) for more information.
**Available attributes:**
**Attributes:**
- **device** (`torch.device`) -- The device to use.
- **distributed_type** ([`~utils.DistributedType`]) -- The distributed training configuration.
@ -279,7 +279,9 @@ class Accelerator:
self.native_amp = False
err = "{mode} mixed precision requires {requirement}"
if self.state.mixed_precision == "fp16":
self.native_amp = True
self.native_amp = is_torch_version(">=", "1.6")
if not self.native_amp:
raise ValueError(err.format(mode="fp16", requirement="PyTorch >= 1.6"))
if not torch.cuda.is_available() and not parse_flag_from_env("USE_MPS_DEVICE"):
raise ValueError(err.format(mode="fp16", requirement="a GPU"))
kwargs = self.scaler_handler.to_kwargs() if self.scaler_handler is not None else {}
@ -312,7 +314,7 @@ class Accelerator:
# RNG Types
self.rng_types = rng_types
if self.rng_types is None:
self.rng_types = ["generator"]
self.rng_types = ["torch"] if is_torch_version("<=", "1.5.1") else ["generator"]
@property
def use_distributed(self):
@ -461,28 +463,6 @@ class Accelerator:
Args:
model (`torch.nn.Module`):
PyTorch Module that was prepared with `Accelerator.prepare`
Example:
```python
>>> from accelerate import Accelerator
>>> accelerator = Accelerator()
>>> dataloader, model, optimizer = accelerator.prepare(dataloader, model, optimizer)
>>> input_a = next(iter(dataloader))
>>> input_b = next(iter(dataloader))
>>> with accelerator.no_sync():
... outputs = model(input_a)
... loss = loss_func(outputs)
... accelerator.backward(loss)
... # No synchronization across processes, only accumulate gradients
>>> outputs = model(input_b)
>>> accelerator.backward(loss)
>>> # Synchronization across all processes
>>> optimizer.step()
>>> optimizer.zero_grad()
```
"""
context = contextlib.nullcontext
if self.use_distributed:
@ -512,24 +492,6 @@ class Accelerator:
Args:
model (`torch.nn.Module`):
PyTorch Module that was prepared with `Accelerator.prepare`
Example:
```python
>>> from accelerate import Accelerator
>>> accelerator = Accelerator(gradient_accumulation_steps=2)
>>> dataloader, model, optimizer, scheduler = accelerator.prepare(dataloader, model, optimizer, scheduler)
>>> with accelerator.accumulate():
... for input, output in dataloader:
... outputs = model(input)
... loss = loss_func(outputs)
... loss.backward()
... optimizer.step()
... scheduler.step()
... optimizer.zero_grad()
```
"""
self._do_sync()
if self.sync_gradients:
@ -547,15 +509,15 @@ class Accelerator:
if self.is_local_main_process:
print(*args, **kwargs)
def _prepare_one(self, obj, first_pass=False, device_placement=None):
def _prepare_one(self, obj, first_pass=False):
# First pass of preparation: DataLoader, model, optimizer
if first_pass:
if isinstance(obj, torch.utils.data.DataLoader):
return self.prepare_data_loader(obj, device_placement=device_placement)
return self.prepare_data_loader(obj)
elif isinstance(obj, torch.nn.Module):
return self.prepare_model(obj, device_placement=device_placement)
return self.prepare_model(obj)
elif isinstance(obj, torch.optim.Optimizer):
optimizer = self.prepare_optimizer(obj, device_placement=device_placement)
optimizer = self.prepare_optimizer(obj)
return optimizer
# Second pass of preparation: LR scheduler (which need the full list of optimizers)
elif isinstance(obj, torch.optim.lr_scheduler._LRScheduler):
@ -602,39 +564,17 @@ class Accelerator:
self._optimizers = optimizers
return tuple(result)
def prepare(self, *args, device_placement=None):
def prepare(self, *args):
"""
Prepare all objects passed in `args` for distributed training and mixed precision, then return them in the same
order.
Args:
*args (list of objects):
Any of the following type of objects:
Accepts the following type of objects:
- `torch.utils.data.DataLoader`: PyTorch Dataloader
- `torch.nn.Module`: PyTorch Module
- `torch.optim.Optimizer`: PyTorch Optimizer
- `torch.optim.lr_scheduler._LRScheduler`: PyTorch LR Scheduler
device_placement (`List[bool]`, *optional*):
Used to customize whether automatic device placement should be performed for each object passed. Needs
to be a list of the same length as `args`.
<Tip>
You don't need to prepare a model if you only use it for inference without any kind of mixed precision
</Tip>
- `torch.utils.data.DataLoader`: PyTorch Dataloader
- `torch.nn.Module`: PyTorch Module
- `torch.optim.Optimizer`: PyTorch Optimizer
"""
if device_placement is None:
device_placement = [None for _ in args]
elif self.distributed_type == DistributedType.DEEPSPEED:
raise ValueError("You can't customize device placements with DeepSpeed.")
elif len(device_placement) != len(args):
raise ValueError(
f"`device_placement` should be a list with {len(args)} elements (the number of objects passed)."
)
if self.distributed_type == DistributedType.FSDP:
model_count = 0
optimizer_present = False
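
For reference, a minimal sketch of the `prepare` call the docstring above describes; the toy model, optimizer and data are placeholders:

```python
import torch
from torch.utils.data import DataLoader, TensorDataset

from accelerate import Accelerator

accelerator = Accelerator()

model = torch.nn.Linear(10, 2)
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
dataloader = DataLoader(TensorDataset(torch.randn(64, 10), torch.randn(64, 2)), batch_size=8)

# Objects come back in the same order, wrapped for the current distributed setup
# and, by default, moved to the right device.
model, optimizer, dataloader = accelerator.prepare(model, optimizer, dataloader)
```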
@ -665,7 +605,7 @@ class Accelerator:
"The model and the optimizer parameters are not on the same device, which probably means you "
"created an optimizer around your model **before** putting on the device. Make sure the line "
"model.to(device) is before the optimizer creation in your script or remove it entirely and use "
"the flag default value for `device_placement` in your `Accelerator` to let it handle that "
"the flag default value for `devicement_placement` in your `Accelerator` to let it handle that "
"part for you."
)
@ -678,10 +618,8 @@ class Accelerator:
if self.distributed_type == DistributedType.DEEPSPEED:
result = self._prepare_deepspeed(*args)
else:
result = tuple(
self._prepare_one(obj, first_pass=True, device_placement=d) for obj, d in zip(args, device_placement)
)
result = tuple(self._prepare_one(obj, device_placement=d) for obj, d in zip(result, device_placement))
result = tuple(self._prepare_one(obj, first_pass=True) for obj in args)
result = tuple(self._prepare_one(obj) for obj in result)
if tpu_should_fix_optimizer:
# 2. grabbing new model parameters
@ -698,22 +636,9 @@ class Accelerator:
return result if len(result) > 1 else result[0]
def prepare_model(self, model: torch.nn.Module, device_placement=None):
"""
Prepares a PyTorch model for training in any distributed setup. It is recommended to use
[`Accelerator.prepare`] instead.
Args:
model (`torch.nn.Module`):
A PyTorch model to prepare. You don't need to prepare a model if it is used only for inference without
any kind of mixed precision
device_placement (`bool`, *optional*):
Whether or not to place the model on the proper device. Will default to `self.device_placement`.
"""
if device_placement is None:
device_placement = self.device_placement and self.distributed_type != DistributedType.FSDP
def prepare_model(self, model):
self._models.append(model)
if device_placement:
if self.device_placement and self.distributed_type != DistributedType.FSDP:
model = model.to(self.device)
if self.distributed_type == DistributedType.MULTI_GPU:
kwargs = self.ddp_handler.to_kwargs() if self.ddp_handler is not None else {}
@ -760,36 +685,26 @@ class Accelerator:
deepspeed_plugin = self.state.deepspeed_plugin
if deepspeed_plugin.deepspeed_config["train_micro_batch_size_per_gpu"] == "auto":
result = [
self._prepare_one(obj, first_pass=True) if isinstance(obj, torch.utils.data.DataLoader) else obj
for obj in args
]
result = [
self._prepare_one(obj, first_pass=True) if isinstance(obj, torch.utils.data.DataLoader) else obj
for obj in args
]
batch_sizes = [obj.batch_size for obj in args if hasattr(obj, "batch_size")]
if self.split_batches:
batch_sizes = [batch_size // self.num_processes for batch_size in batch_sizes]
if len(batch_sizes) == 0:
raise ValueError(
"You must specify a training or evaluation dataloader in `accelerate.prepare()` when using DeepSpeed."
)
batch_size_per_device = min(batch_sizes) if deepspeed_plugin.is_train_batch_min else max(batch_sizes)
if len(batch_sizes) > 1:
logger.info(
"Since you passed both train and evaluation dataloader, `is_train_batch_min` (here "
f"{deepspeed_plugin.is_train_batch_min} will decide the `train_batch_size` ({batch_size_per_device})."
)
else:
batch_size_per_device = deepspeed_plugin.deepspeed_config["train_micro_batch_size_per_gpu"]
result = [obj for obj in args]
if self.gradient_accumulation_steps != deepspeed_plugin.deepspeed_config["gradient_accumulation_steps"]:
logger.info(
f"Updating DeepSpeed's gradient accumulation steps to {self.gradient_accumulation_steps} from "
f"{deepspeed_plugin.deepspeed_config['gradient_accumulation_steps']}."
batch_sizes = [obj.batch_size for obj in args if hasattr(obj, "batch_size")]
if self.split_batches:
batch_sizes = [batch_size // self.num_processes for batch_size in batch_sizes]
if len(batch_sizes) == 0:
raise ValueError(
"You must specify a training or evaluation dataloader in `accelerate.prepare()` when using DeepSpeed."
)
deepspeed_plugin.deepspeed_config["gradient_accumulation_steps"] = self.gradient_accumulation_steps
batch_size_per_device = min(batch_sizes) if deepspeed_plugin.is_train_batch_min else max(batch_sizes)
if len(batch_sizes) > 1:
logger.info(
"Since you passed both train and evaluation dataloader, `is_train_batch_min` (here "
f"{deepspeed_plugin.is_train_batch_min} will decide the `train_batch_size` ({batch_size_per_device})."
)
config_kwargs = {
"train_micro_batch_size_per_gpu": batch_size_per_device,
"train_batch_size": batch_size_per_device
@ -923,57 +838,24 @@ class Accelerator:
)
return tuple(result)
def prepare_data_loader(self, data_loader: torch.utils.data.DataLoader, device_placement=None):
"""
Prepares a PyTorch DataLoader for training in any distributed setup. It is recommended to use
[`Accelerator.prepare`] instead.
Args:
data_loader (`torch.utils.data.DataLoader`):
A vanilla PyTorch DataLoader to prepare
device_placement (`bool`, *optional*):
Whether or not to place the batches on the proper device in the prepared dataloader. Will default to
`self.device_placement`.
"""
if device_placement is None:
device_placement = self.device_placement if self.distributed_type != DistributedType.TPU else False
def prepare_data_loader(self, data_loader):
return prepare_data_loader(
data_loader,
self.device,
num_processes=self.num_processes,
process_index=self.process_index,
split_batches=self.split_batches,
put_on_device=device_placement,
put_on_device=self.device_placement if self.distributed_type != DistributedType.TPU else False,
rng_types=self.rng_types.copy(),
dispatch_batches=self.dispatch_batches,
)
def prepare_optimizer(self, optimizer: torch.optim.Optimizer, device_placement=None):
"""
Prepares a PyTorch Optimizer for training in any distributed setup. It is recommended to use
[`Accelerator.prepare`] instead.
Args:
optimizer (`torch.optim.Optimizer`):
A vanilla PyTorch optimizer to prepare
device_placement (`bool`, *optional*):
Whether or not to place the optimizer on the proper device. Will default to `self.device_placement`.
"""
if device_placement is None:
device_placement = self.device_placement
optimizer = AcceleratedOptimizer(optimizer, device_placement=device_placement, scaler=self.scaler)
def prepare_optimizer(self, optimizer):
optimizer = AcceleratedOptimizer(optimizer, device_placement=self.device_placement, scaler=self.scaler)
self._optimizers.append(optimizer)
return optimizer
def prepare_scheduler(self, scheduler: torch.optim.lr_scheduler._LRScheduler):
"""
Prepares a PyTorch Scheduler for training in any distributed setup. It is recommended to use
[`Accelerator.prepare`] instead.
Args:
scheduler (`torch.optim.lr_scheduler._LRScheduler`):
A vanilla PyTorch scheduler to prepare
"""
def prepare_scheduler(self, scheduler):
# We try to find the optimizer associated with `scheduler`, the default is the full list.
optimizer = self._optimizers
for opt in self._optimizers:
@ -991,14 +873,9 @@ class Accelerator:
def backward(self, loss, **kwargs):
"""
Scales the gradients in accordance to `Accelerator.gradient_accumulation_steps` and calls the correct
`backward()` based on the configuration.
Should be used in lieu of `loss.backward()`.
Use `accelerator.backward(loss)` in lieu of `loss.backward()`.
"""
if self.distributed_type != DistributedType.DEEPSPEED:
# deepspeed handles loss scaling by gradient_accumulation_steps in its `backward`
loss = loss / self.gradient_accumulation_steps
loss /= self.gradient_accumulation_steps
if self.distributed_type == DistributedType.DEEPSPEED:
self.deepspeed_engine_wrapped.backward(loss, **kwargs)
elif self.scaler is not None:
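
A hedged sketch of the training step the `backward` docstring refers to; all objects below are placeholders and the loss function is assumed to be any standard loss module:

```python
import torch

from accelerate import Accelerator

accelerator = Accelerator()
model = torch.nn.Linear(10, 2)
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
loss_fn = torch.nn.MSELoss()
model, optimizer = accelerator.prepare(model, optimizer)

batch = torch.randn(8, 10).to(accelerator.device)
target = torch.randn(8, 2).to(accelerator.device)

optimizer.zero_grad()
loss = loss_fn(model(batch), target)
# Scales the loss for gradient accumulation / mixed precision and calls the
# right backward() for the configured backend, instead of loss.backward().
accelerator.backward(loss)
optimizer.step()
```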
@ -1029,24 +906,6 @@ class Accelerator:
def clip_grad_norm_(self, parameters, max_norm, norm_type=2):
"""
Should be used in place of `torch.nn.utils.clip_grad_norm_`.
Example:
```python
>>> from accelerate import Accelerator
>>> accelerator = Accelerator(gradient_accumulation_steps=2)
>>> dataloader, model, optimizer, scheduler = accelerator.prepare(dataloader, model, optimizer, scheduler)
>>> for (input, target) in dataloader:
... optimizer.zero_grad()
... output = model(input)
... loss = loss_func(output, target)
... accelerator.backward(loss)
... if accelerator.sync_gradients:
... accelerator.clip_grad_norm_(model.parameters(), max_grad_norm)
... optimizer.step()
```
"""
if self.distributed_type == DistributedType.FSDP:
self.unscale_gradients()
@ -1064,24 +923,6 @@ class Accelerator:
def clip_grad_value_(self, parameters, clip_value):
"""
Should be used in place of `torch.nn.utils.clip_grad_value_`.
Example:
```python
>>> from accelerate import Accelerator
>>> accelerator = Accelerator(gradient_accumulation_steps=2)
>>> dataloader, model, optimizer, scheduler = accelerator.prepare(dataloader, model, optimizer, scheduler)
>>> for (input, target) in dataloader:
... optimizer.zero_grad()
... output = model(input)
... loss = loss_func(output, target)
... accelerator.backward(loss)
... if accelerator.sync_gradients:
... accelerator.clip_grad_value_(model.parameters(), clip_value)
... optimizer.step()
```
"""
if self.distributed_type in [DistributedType.DEEPSPEED, DistributedType.FSDP]:
raise Exception("DeepSpeed and FSDP do not support `clip_grad_value_`. Use `clip_grad_norm_` instead.")
@ -1191,7 +1032,6 @@ class Accelerator:
"""
wait_for_everyone()
@on_main_process
def init_trackers(self, project_name: str, config: Optional[dict] = None, init_kwargs: Optional[dict] = {}):
"""
Initializes a run for all trackers stored in `self.log_with`, potentially with starting configurations
@ -1203,7 +1043,7 @@ class Accelerator:
Optional starting configuration to be logged.
init_kwargs (`dict`, *optional*):
A nested dictionary of kwargs to be passed to a specific tracker's `__init__` function. Should be
formatted like so:
formatted like this:
```python
{"wandb": {"tags": ["tag_a", "tag_b"]}}
```
@ -1252,7 +1092,7 @@ class Accelerator:
The run step. If included, the log will be affiliated with this step.
log_kwargs (`dict`, *optional*):
A nested dictionary of kwargs to be passed to a specific tracker's `log` function. Should be formatted
like so:
like this:
```python
{"wandb": {"tags": ["tag_a", "tag_b"]}}
```
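
A hedged sketch of the tracking API these docstrings describe; the project name, config and tags are placeholders, and the relevant tracker packages (e.g. wandb) are assumed to be installed:

```python
from accelerate import Accelerator

accelerator = Accelerator(log_with="all")

# `init_kwargs` is forwarded to each tracker's constructor, keyed by tracker name.
accelerator.init_trackers(
    "my_project",
    config={"learning_rate": 1e-4},
    init_kwargs={"wandb": {"tags": ["tag_a", "tag_b"]}},
)
accelerator.log({"train_loss": 0.42}, step=1)
accelerator.end_training()
```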
@ -1263,8 +1103,7 @@ class Accelerator:
@on_main_process
def end_training(self):
"""
Runs any special end training behaviors, such as stopping trackers on the main process only. Should always be
called at the end of your script if using experiment tracking.
Runs any special end training behaviors, such as stopping trackers on the main process only.
"""
for tracker in self.trackers:
tracker.finish()
@ -1284,13 +1123,6 @@ class Accelerator:
"""
Saves the current states of the model, optimizer, scaler, RNG generators, and registered objects.
<Tip>
Should only be used when wanting to save a checkpoint during training and restoring the state in the same
environment.
</Tip>
Args:
output_dir (`str` or `os.PathLike`):
The name of the folder to save all relevant weights and states.
@ -1346,12 +1178,6 @@ class Accelerator:
"""
Loads the current states of the model, optimizer, scaler, RNG generators, and registered objects.
<Tip>
Should only be used in conjunction with [`Accelerator.save_state`].
</Tip>
Args:
input_dir (`str` or `os.PathLike`):
The name of the folder all relevant weights and states were saved in.
@ -1454,15 +1280,6 @@ class Accelerator:
return (model_device, optimizer_device)
def get_state_dict(self, model, unwrap=True):
"""
Returns the state dictionary of a model sent through [`Accelerator.prepare`] in full precision
Args:
model (`torch.nn.Module`):
A PyTorch model sent through [`Accelerator.prepare`]
unwrap (`bool`, *optional*, defaults to True):
Whether to return the original underlying state_dict of `model` or to return the wrapped state_dict
"""
is_zero_3 = False
if self.distributed_type == DistributedType.DEEPSPEED:
is_zero_3 = self.deepspeed_config["zero_optimization"]["stage"] == 3

View File

@ -29,7 +29,6 @@ from .utils import (
load_checkpoint_in_model,
offload_state_dict,
)
from .utils.versions import is_torch_version
@contextmanager
@ -44,7 +43,7 @@ def init_empty_weights(include_buffers: bool = False):
Example:
```python
```pyton
import torch.nn as nn
from accelerate import init_empty_weights
@ -60,8 +59,6 @@ def init_empty_weights(include_buffers: bool = False):
</Tip>
"""
if not is_torch_version(">=", "1.9.0"):
raise NotImplementedError("Initializing empty weights to a meta device requires torch >= 1.9.0")
old_register_parameter = nn.Module.register_parameter
if include_buffers:
old_register_buffer = nn.Module.register_buffer
@ -78,35 +75,15 @@ def init_empty_weights(include_buffers: bool = False):
if buffer is not None:
module._buffers[name] = module._buffers[name].to(torch.device("meta"))
# Patch tensor creation
if include_buffers:
tensor_constructors_to_patch = {
torch_function_name: getattr(torch, torch_function_name)
for torch_function_name in ["empty", "zeros", "ones", "full"]
}
else:
tensor_constructors_to_patch = {}
def patch_tensor_constructor(fn):
def wrapper(*args, **kwargs):
kwargs["device"] = torch.device("meta")
return fn(*args, **kwargs)
return wrapper
try:
nn.Module.register_parameter = register_empty_parameter
if include_buffers:
nn.Module.register_buffer = register_empty_buffer
for torch_function_name in tensor_constructors_to_patch.keys():
setattr(torch, torch_function_name, patch_tensor_constructor(getattr(torch, torch_function_name)))
yield
finally:
nn.Module.register_parameter = old_register_parameter
if include_buffers:
nn.Module.register_buffer = old_register_buffer
for torch_function_name, old_torch_function in tensor_constructors_to_patch.items():
setattr(torch, torch_function_name, old_torch_function)
def cpu_offload(
@ -137,8 +114,6 @@ def cpu_offload(
called directly during the forward, for instance if a `dense` linear layer is registered, but at forward,
`dense.weight` and `dense.bias` are used in some operations instead of calling `dense` directly.
"""
if not is_torch_version(">=", "1.9.0"):
raise NotImplementedError("CPU offloading requires torch >= 1.9.0")
if execution_device is None:
execution_device = next(iter(model.parameters())).device
if state_dict is None:
@ -182,8 +157,6 @@ def disk_offload(
called directly during the forward, for instance if a `dense` linear layer is registered, but at forward,
`dense.weight` and `dense.bias` are used in some operations instead of calling `dense` directly.
"""
if not is_torch_version(">=", "1.9.0"):
raise NotImplementedError("Disk offloading requires torch >= 1.9.0")
if not os.path.isdir(offload_dir) or not os.path.isfile(os.path.join(offload_dir, "index.json")):
offload_state_dict(offload_dir, model.state_dict())
if execution_device is None:
@ -235,8 +208,6 @@ def dispatch_model(
called directly during the forward, for instance if a `dense` linear layer is registered, but at forward,
`dense.weight` and `dense.bias` are used in some operations instead of calling `dense` directly.
"""
if not is_torch_version(">=", "1.9.0"):
raise NotImplementedError("Model dispatching requires torch >= 1.9.0")
# Error early if the device map is incomplete.
check_device_map(model, device_map)
@ -324,7 +295,7 @@ def load_checkpoint_and_dispatch(
dtype (`str` or `torch.dtype`, *optional*):
If provided, the weights will be converted to that type when loaded.
offload_state_dict (`bool`, *optional*):
If `True`, will temporarily offload the CPU state dict on the hard drive to avoid getting out of CPU RAM if
If `True`, will temporarily offload the CPU state dict on the hard drive to avoig getting out of CPU RAM if
the weight of the CPU state dict + the biggest shard does not fit. Will default to `True` if the device map
picked contains `"disk"` values.
preload_module_classes (`List[str]`, *optional*):
@ -333,8 +304,6 @@ def load_checkpoint_and_dispatch(
called directly during the forward, for instance if a `dense` linear layer is registered, but at forward,
`dense.weight` and `dense.bias` are used in some operations instead of calling `dense` directly.
"""
if not is_torch_version(">=", "1.9.0"):
raise NotImplementedError("Loading and dispatching requires torch >= 1.9.0")
if isinstance(device_map, str) and device_map not in ["auto", "balanced", "balanced_low_0", "sequential"]:
raise ValueError(
"If passing a string for `device_map`, please choose 'auto', 'balanced', 'balanced_low_0' or "

View File

@ -35,12 +35,8 @@ def get_cluster_input():
machine_rank = 0
num_machines = 1
num_processes = 1
gpu_ids = None
main_process_ip = None
main_process_port = None
rdzv_backend = "static"
same_network = True
if distributed_type in [DistributedType.MULTI_GPU, DistributedType.MULTI_CPU]:
num_machines = _ask_field(
"How many different machines will you use (use more than 1 for multi-node training)? [1]: ",
@ -60,16 +56,6 @@ def get_cluster_input():
"What is the port you will use to communicate with the main process? ",
lambda x: int(x),
)
same_network = _ask_field(
"Are all the machines on the same local network? Answer `no` if nodes are on the cloud and/or on different network hosts [YES/no]: ",
_convert_yes_no_to_bool,
default=True,
error_message="Please enter yes or no.",
)
if not same_network:
rdzv_backend = _ask_field(
"What rendezvous backend will you use? ('static', 'c10d', ...): ", default="static"
)
if distributed_type == DistributedType.NO:
use_cpu = _ask_field(
@ -305,12 +291,6 @@ def get_cluster_input():
else:
num_processes = 1
if distributed_type in [DistributedType.MULTI_GPU, DistributedType.NO] and not use_cpu:
gpu_ids = _ask_field(
"What GPU(s) (by id) should be used for training on this machine as a comma-seperated list? [all]:",
default="all",
)
if distributed_type != DistributedType.TPU:
if distributed_type == DistributedType.DEEPSPEED and use_deepspeed_config:
mixed_precision = "no"
@ -333,7 +313,6 @@ def get_cluster_input():
compute_environment=ComputeEnvironment.LOCAL_MACHINE,
distributed_type=distributed_type,
num_processes=num_processes,
gpu_ids=gpu_ids,
mixed_precision=mixed_precision,
downcast_bf16=downcast_bf16,
machine_rank=machine_rank,
@ -344,6 +323,4 @@ def get_cluster_input():
deepspeed_config=deepspeed_config,
fsdp_config=fsdp_config,
use_cpu=use_cpu,
rdzv_backend=rdzv_backend,
same_network=same_network,
)

View File

@ -135,11 +135,8 @@ class ClusterConfig(BaseConfig):
num_processes: int
machine_rank: int = 0
num_machines: int = 1
gpu_ids: Optional[str] = None
main_process_ip: Optional[str] = None
main_process_port: Optional[int] = None
rdzv_backend: Optional[str] = "static"
same_network: Optional[bool] = False
main_training_function: str = "main"
# args for deepspeed_plugin

View File

@ -157,7 +157,7 @@ def get_sagemaker_input():
)
distributed_type = _ask_field(
"What is the distributed mode? ([0] No distributed training, [1] data parallelism): ",
"Which type of machine are you using? ([0] No distributed training, [1] data parallelism): ",
_convert_sagemaker_distributed_mode,
error_message="Please enter 0 or 1",
)

View File

@ -36,28 +36,15 @@ from accelerate.utils import (
DistributedType,
PrecisionType,
PrepareForLaunch,
_filter_args,
get_launch_prefix,
is_deepspeed_available,
is_rich_available,
is_sagemaker_available,
is_torch_version,
patch_environment,
)
from accelerate.utils.constants import DEEPSPEED_MULTINODE_LAUNCHERS
from accelerate.utils.dataclasses import SageMakerDistributedType
if is_rich_available():
from rich import get_console
from rich.logging import RichHandler
FORMAT = "%(message)s"
logging.basicConfig(format=FORMAT, datefmt="[%X]", handlers=[RichHandler()])
if is_torch_version(">=", "1.9.0"):
import torch.distributed.run as distrib_run
logger = logging.getLogger(__name__)
@ -259,11 +246,6 @@ def launch_command_parser(subparsers=None):
parser.add_argument(
"--num_machines", type=int, default=None, help="The total number of machines used in this training."
)
parser.add_argument(
"--gpu_ids",
default=None,
help="What GPUs (by id) should be used for training on this machine as a comma-seperated list",
)
parser.add_argument(
"--machine_rank", type=int, default=None, help="The rank of the machine on which this script is launched."
)
@ -274,25 +256,6 @@ def launch_command_parser(subparsers=None):
default=None,
help="The port to use to communicate with the machine of rank 0.",
)
# Rendezvous related arguments
parser.add_argument(
"--rdzv_conf",
type=str,
default="",
help="Additional rendezvous configuration (<key1>=<value1>,<key2>=<value2>,...).",
)
parser.add_argument(
"--max_restarts",
type=int,
default=0,
help="Maximum number of worker group restarts before failing.",
)
parser.add_argument(
"--monitor_interval",
type=float,
default=5,
help="Interval, in seconds, to monitor the state of workers.",
)
parser.add_argument(
"--main_training_function",
type=str,
@ -331,12 +294,7 @@ def launch_command_parser(subparsers=None):
"--aws_secret_access_key",
type=str,
default=None,
help="The AWS_SECRET_ACCESS_KEY used to launch the Amazon SageMaker training job.",
)
parser.add_argument(
"--debug",
action="store_true",
help="Whether to print out the torch.distributed stack trace when something fails.",
help="The AWS_SECRET_ACCESS_KEY used to launch the Amazon SageMaker training job",
)
parser.add_argument(
"training_script",
@ -369,10 +327,6 @@ def simple_launcher(args):
current_env = os.environ.copy()
current_env["USE_CPU"] = str(args.cpu or args.use_cpu)
current_env["USE_MPS_DEVICE"] = str(args.use_mps_device)
if args.use_mps_device:
current_env["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
elif args.gpu_ids != "all":
current_env["CUDA_VISIBLE_DEVICES"] = args.gpu_ids
if args.num_machines > 1:
current_env["MASTER_ADDR"] = args.main_process_ip
current_env["MASTER_PORT"] = str(args.main_process_port)
@ -401,40 +355,43 @@ def simple_launcher(args):
def multi_gpu_launcher(args):
num_processes = getattr(args, "num_processes")
num_machines = getattr(args, "num_machines")
main_process_ip = getattr(args, "main_process_ip")
main_process_port = getattr(args, "main_process_port")
if num_machines > 1:
setattr(args, "nproc_per_node", str(num_processes // num_machines))
setattr(args, "nnodes", str(num_machines))
setattr(args, "node_rank", int(args.machine_rank))
if getattr(args, "same_network"):
setattr(args, "master_addr", str(main_process_ip))
setattr(args, "master_port", str(main_process_port))
else:
setattr(args, "rdzv_endpoint", f"{main_process_ip}:{main_process_port}")
cmd = get_launch_prefix()
if args.num_machines > 1:
cmd.extend(
[
"--nproc_per_node",
str(args.num_processes // args.num_machines),
"--nnodes",
str(args.num_machines),
"--node_rank",
str(args.machine_rank),
"--master_addr",
args.main_process_ip,
"--master_port",
str(args.main_process_port),
]
)
else:
setattr(args, "nproc_per_node", str(num_processes))
if main_process_port is not None:
setattr(args, "master_port", str(main_process_port))
cmd.extend(["--nproc_per_node", str(args.num_processes)])
if args.main_process_port is not None:
cmd.extend(["--master_port", str(args.main_process_port)])
if args.module and args.no_python:
raise ValueError("--module and --no_python cannot be used together")
elif args.module:
setattr(args, "module", True)
cmd.append("--module")
elif args.no_python:
setattr(args, "no_python", True)
cmd.append("--no_python")
cmd.append(args.training_script)
cmd.extend(args.training_script_args)
current_env = os.environ.copy()
gpu_ids = getattr(args, "gpu_ids")
if gpu_ids != "all":
current_env["CUDA_VISIBLE_DEVICES"] = gpu_ids
mixed_precision = args.mixed_precision.lower()
try:
mixed_precision = PrecisionType(mixed_precision)
mixed_precision = PrecisionType(args.mixed_precision.lower())
except ValueError:
raise ValueError(f"Unknown mixed_precision mode: {mixed_precision}. Choose between {PrecisionType.list()}.")
raise ValueError(
f"Unknown mixed_precision mode: {args.mixed_precision.lower()}. Choose between {PrecisionType.list()}."
)
if args.fp16:
warnings.warn('--fp16 flag is deprecated. Use "--mixed_precision fp16" instead.', DeprecationWarning)
@ -487,81 +444,66 @@ def multi_gpu_launcher(args):
if args.fsdp_state_dict_type is not None:
current_env["FSDP_STATE_DICT_TYPE"] = str(args.fsdp_state_dict_type)
current_env["OMP_NUM_THREADS"] = str(args.num_cpu_threads_per_process)
if is_torch_version("<", "1.9.0"):
raise NotImplementedError("Multi-node training requires pytorch>=1.9.0")
debug = getattr(args, "debug", False)
args = _filter_args(args)
with patch_environment(**current_env):
try:
distrib_run.run(args)
except:
if debug:
console = get_console()
console.print("\n[bold red]Using --debug, `torch.distributed` Stack Trace:[/bold red]")
console.print_exception(suppress=[__file__], show_locals=False)
process = subprocess.Popen(cmd, env=current_env)
process.wait()
if process.returncode != 0:
raise subprocess.CalledProcessError(returncode=process.returncode, cmd=cmd)
def deepspeed_launcher(args):
if not is_deepspeed_available():
raise ImportError("DeepSpeed is not installed => run `pip3 install deepspeed` or build it from source.")
num_processes = getattr(args, "num_processes")
num_machines = getattr(args, "num_machines")
main_process_ip = getattr(args, "main_process_ip")
main_process_port = getattr(args, "main_process_port")
if num_machines > 1 and args.deepspeed_multinode_launcher != DEEPSPEED_MULTINODE_LAUNCHERS[1]:
cmd = ["deepspeed", "--no_local_rank"]
cmd.extend(["--hostfile", str(args.deepspeed_hostfile), "--launcher", str(args.deepspeed_multinode_launcher)])
if args.deepspeed_exclusion_filter is not None:
cmd = ["deepspeed", "--no_local_rank"]
if args.num_machines > 1:
if args.deepspeed_multinode_launcher == DEEPSPEED_MULTINODE_LAUNCHERS[1]:
cmd = get_launch_prefix()
cmd.extend(
[
"--exclude",
str(args.deepspeed_exclusion_filter),
]
)
elif args.deepspeed_inclusion_filter is not None:
cmd.extend(
[
"--include",
str(args.deepspeed_inclusion_filter),
"--nproc_per_node",
str(args.num_processes // args.num_machines),
"--nnodes",
str(args.num_machines),
"--node_rank",
str(args.machine_rank),
"--master_addr",
args.main_process_ip,
"--master_port",
str(args.main_process_port),
]
)
else:
cmd.extend(["--num_gpus", str(args.num_processes // args.num_machines)])
if args.module and args.no_python:
raise ValueError("--module and --no_python cannot be used together")
elif args.module:
cmd.append("--module")
elif args.no_python:
cmd.append("--no_python")
cmd.append(args.training_script)
cmd.extend(args.training_script_args)
elif num_machines > 1 and args.deepspeed_multinode_launcher == DEEPSPEED_MULTINODE_LAUNCHERS[1]:
setattr(args, "nproc_per_node", str(num_processes // num_machines))
setattr(args, "nnodes", str(num_machines))
setattr(args, "node_rank", int(args.machine_rank))
if getattr(args, "same_network"):
setattr(args, "master_addr", str(main_process_ip))
setattr(args, "master_port", str(main_process_port))
else:
setattr(args, "rdzv_endpoint", f"{main_process_ip}:{main_process_port}")
cmd.extend(
["--hostfile", str(args.deepspeed_hostfile), "--launcher", str(args.deepspeed_multinode_launcher)]
)
if args.deepspeed_exclusion_filter is not None:
cmd.extend(
[
"--exclude",
str(args.deepspeed_exclusion_filter),
]
)
elif args.deepspeed_inclusion_filter is not None:
cmd.extend(
[
"--include",
str(args.deepspeed_inclusion_filter),
]
)
else:
cmd.extend(["--num_gpus", str(args.num_processes // args.num_machines)])
else:
setattr(args, "nproc_per_node", str(num_processes))
if main_process_port is not None:
setattr(args, "master_port", str(main_process_port))
cmd.extend(["--num_gpus", str(args.num_processes)])
if args.module and args.no_python:
raise ValueError("--module and --no_python cannot be used together")
elif args.module:
setattr(args, "module", True)
cmd.append("--module")
elif args.no_python:
setattr(args, "no_python", True)
cmd.append("--no_python")
cmd.append(args.training_script)
cmd.extend(args.training_script_args)
current_env = os.environ.copy()
gpu_ids = getattr(args, "gpu_ids")
if gpu_ids != "all":
current_env["CUDA_VISIBLE_DEVICES"] = gpu_ids
try:
mixed_precision = PrecisionType(args.mixed_precision.lower())
except ValueError:
@ -583,8 +525,7 @@ def deepspeed_launcher(args):
current_env["DEEPSPEED_OFFLOAD_PARAM_DEVICE"] = str(args.offload_param_device).lower()
current_env["DEEPSPEED_ZERO3_INIT"] = str(args.zero3_init_flag).lower()
current_env["DEEPSPEED_ZERO3_SAVE_16BIT_MODEL"] = str(args.zero3_save_16bit_model).lower()
if args.deepspeed_config_file is not None:
current_env["DEEPSPEED_CONFIG_FILE"] = str(args.deepspeed_config_file)
current_env["DEEPSPEED_CONFIG_FILE"] = str(args.deepspeed_config_file).lower()
if args.num_machines > 1 and args.deepspeed_multinode_launcher != DEEPSPEED_MULTINODE_LAUNCHERS[1]:
with open(".deepspeed_env", "a") as f:
@ -593,24 +534,10 @@ def deepspeed_launcher(args):
continue
f.write(f"{key}={value}\n")
process = subprocess.Popen(cmd, env=current_env)
process.wait()
if process.returncode != 0:
raise subprocess.CalledProcessError(returncode=process.returncode, cmd=cmd)
else:
if is_torch_version("<", "1.9.0"):
raise NotImplementedError("Multi-node training requires pytorch>=1.9.0")
debug = getattr(args, "debug", False)
args = _filter_args(args)
with patch_environment(**current_env):
try:
distrib_run.run(args)
except:
if debug:
console = get_console()
console.print("\n[bold red]Using --debug, `torch.distributed` Stack Trace:[/bold red]")
console.print_exception(suppress=[__file__], show_locals=False)
process = subprocess.Popen(cmd, env=current_env)
process.wait()
if process.returncode != 0:
raise subprocess.CalledProcessError(returncode=process.returncode, cmd=cmd)
def tpu_launcher(args):
@ -830,14 +757,6 @@ def launch_command(args):
args.tpu = defaults.distributed_type == DistributedType.TPU
args.use_fsdp = defaults.distributed_type == DistributedType.FSDP
args.use_mps_device = defaults.distributed_type == DistributedType.MPS
if not args.use_mps_device:
if args.gpu_ids is None:
if defaults.gpu_ids is not None:
args.gpu_ids = defaults.gpu_ids
else:
args.gpu_ids = "all"
if len(args.gpu_ids.split(",")) < 2 and args.multi_gpu and (args.gpu_ids != "all"):
args.multi_gpu = False
if defaults.compute_environment == ComputeEnvironment.LOCAL_MACHINE:
# Update args with the defaults
for name, attr in defaults.__dict__.items():
@ -865,8 +784,8 @@ def launch_command(args):
args.mixed_precision = defaults.mixed_precision
else:
if args.num_processes is None:
args.num_processes = torch.cuda.device_count() if args.multi_gpu else 1
warned.append("\t`--num_processes` was set to a value of `{args.num_processes}`")
warned.append("\t`--num_processes` was set to a value of `1`")
args.num_processes = 1
if args.num_machines is None:
warned.append("\t`--num_machines` was set to a value of `1`")
args.num_machines = 1
@ -875,6 +794,14 @@ def launch_command(args):
args.mixed_precision = "no"
if not hasattr(args, "use_cpu"):
args.use_cpu = args.cpu
if args.multi_gpu and args.num_processes == 1:
args.num_processes = torch.cuda.device_count()
if not any("--num_processes" in warn for warn in warned):
warned.append(f"\t`--num_processes` was set to `{args.num_processes}`")
else:
for i, warn in enumerate(warned):
if "--num_processes" in warn:
warned[i] = warn.replace("`1`", f"`{args.num_processes}`")
if args.num_cpu_threads_per_process is None:
local_size = get_int_from_env(
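
The launcher changes above revolve around two ideas: restricting visible GPUs through the child process environment and delegating process spawning to `torch.distributed`. A stripped-down, hedged sketch of the first idea (script name and GPU ids are placeholders):

```python
import os
import subprocess
import sys

gpu_ids = "0,2"                     # e.g. the value given for --gpu_ids
cmd = [sys.executable, "train.py"]  # placeholder training script

env = os.environ.copy()
if gpu_ids != "all":
    # Only these devices are visible to the launched training process.
    env["CUDA_VISIBLE_DEVICES"] = gpu_ids

process = subprocess.Popen(cmd, env=env)
process.wait()
if process.returncode != 0:
    raise subprocess.CalledProcessError(returncode=process.returncode, cmd=cmd)
```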

View File

@ -75,11 +75,11 @@ _PYTORCH_DATALOADER_KWARGS = {
"timeout": 0,
"worker_init_fn": None,
"multiprocessing_context": None,
"generator": None,
}
# kwargs added after by version
_PYTORCH_DATALOADER_ADDITIONAL_KWARGS = {
"1.6.0": {"generator": None},
"1.7.0": {"prefetch_factor": 2, "persistent_workers": False},
}
@ -347,6 +347,7 @@ class DataLoaderShard(DataLoader):
try:
current_batch = next(dataloader_iter)
except StopIteration:
self.gradient_state._iterate_samples_seen(find_batch_size(current_batch))
yield
while True:
try:
@ -363,11 +364,10 @@ class DataLoaderShard(DataLoader):
@property
def total_batch_size(self):
batch_sampler = self.sampler if isinstance(self.sampler, BatchSampler) else self.batch_sampler
return (
batch_sampler.batch_size
if batch_sampler.split_batches
else (batch_sampler.batch_size * batch_sampler.num_processes)
self.batch_sampler.batch_size
if self.batch_sampler.split_batches
else (self.batch_sampler.batch_size * self.batch_sampler.num_processes)
)
@property
@ -412,7 +412,7 @@ class DataLoaderDispatcher(DataLoader):
self.split_batches = split_batches
if is_torch_version("<", "1.8.0"):
raise ImportError(
f"Using `DataLoaderDispatcher` requires PyTorch 1.8.0 minimum. You have {torch.__version__}."
"Using `DataLoaderDispatcher` requires PyTorch 1.8.0 minimum. You have {torch.__version__}."
)
if shuffle:
torch.utils.data.graph_settings.apply_shuffle_settings(dataset, shuffle=shuffle)
@ -462,7 +462,11 @@ class DataLoaderDispatcher(DataLoader):
else:
batch_info = [None, True]
broadcast_object_list(batch_info)
return batch, batch_info
if batch_info[1]:
return batch, batch_info, True
else:
return batch, batch_info, True
return batch, batch_info, False
def __iter__(self):
self.gradient_state._set_end_of_dataloader(False)
@ -473,10 +477,11 @@ class DataLoaderDispatcher(DataLoader):
stop_iteration = False
self._stop_iteration = False
first_batch = None
next_batch, next_batch_info = self._fetch_batches(main_iterator)
next_batch, next_batch_info, next_skip = self._fetch_batches(main_iterator)
while not stop_iteration:
batch, batch_info = next_batch, next_batch_info
batch, batch_info, skip = next_batch, next_batch_info, next_skip
if skip:
continue
if self.state.process_index != 0:
# Initialize tensors on other processes than process 0.
batch = initialize_tensors(batch_info[0])
@ -495,7 +500,7 @@ class DataLoaderDispatcher(DataLoader):
if not stop_iteration:
# We may still be at the end of the dataloader without knowing it yet: if there is nothing left in
# the dataloader since the number of batches is a round multiple of the number of processes.
next_batch, next_batch_info = self._fetch_batches(main_iterator)
next_batch, next_batch_info, next_skip = self._fetch_batches(main_iterator)
# next_batch_info[0] is None when there are no more batches, otherwise we still need to process them.
if self._stop_iteration and next_batch_info[0] is None:
stop_iteration = True
@ -622,7 +627,6 @@ def prepare_data_loader(
new_dataset = dataloader.dataset
# Iterable dataset doesn't like batch_sampler, but data_loader creates a default one for it
new_batch_sampler = dataloader.batch_sampler if not isinstance(new_dataset, IterableDataset) else None
sampler_is_batch_sampler = False
generator = getattr(dataloader, "generator", None)
# No change if no multiprocess
if num_processes != 1 and not dispatch_batches:
@ -639,20 +643,15 @@ def prepare_data_loader(
)
else:
# New batch sampler for the current process.
sampler_is_batch_sampler = isinstance(dataloader.sampler, BatchSampler)
if sampler_is_batch_sampler:
sampler = dataloader.sampler.sampler
else:
sampler = dataloader.batch_sampler.sampler
if hasattr(sampler, "generator"):
if sampler.generator is None:
sampler.generator = torch.Generator()
generator = sampler.generator
generator.manual_seed(int(torch.empty((), dtype=torch.int64).random_().item()))
batch_sampler = dataloader.sampler if sampler_is_batch_sampler else dataloader.batch_sampler
if hasattr(dataloader.sampler, "generator"):
if dataloader.sampler.generator is None:
dataloader.sampler.generator = torch.Generator()
generator = dataloader.sampler.generator
generator.manual_seed(int(torch.empty((), dtype=torch.int64).random_().item()))
elif getattr(dataloader.batch_sampler, "generator", None) is not None:
generator = dataloader.batch_sampler.generator
new_batch_sampler = BatchSamplerShard(
batch_sampler,
dataloader.batch_sampler,
num_processes=num_processes,
process_index=process_index,
split_batches=split_batches,
@ -690,16 +689,6 @@ def prepare_data_loader(
_drop_last=dataloader.drop_last,
**kwargs,
)
elif sampler_is_batch_sampler:
dataloader = DataLoaderShard(
new_dataset,
device=device if put_on_device and state.distributed_type != DistributedType.TPU else None,
sampler=new_batch_sampler,
batch_size=getattr(dataloader, "batch_size", _PYTORCH_DATALOADER_KWARGS["batch_size"]),
rng_types=rng_types,
generator=generator,
**kwargs,
)
else:
dataloader = DataLoaderShard(
new_dataset,

View File

@ -71,7 +71,7 @@ class ModelHook:
def detach_hook(self, module):
"""
To be executed when the hook is detached from a module.
To be executed when the hook is deached from a module.
Args:
module (`torch.nn.Module`): The module detached from this hook.
@ -182,7 +182,7 @@ class AlignDevicesHook(ModelHook):
Args:
execution_device (`torch.device`, *optional*):
The device on which inputs and model weights should be placed before the forward pass.
offload (`bool`, *optional*, defaults to `False`):
offload (`bool`, *optional*, defauts to `False`):
Whether or not the weights should be offloaded after the forward pass.
io_same_device (`bool`, *optional*, defaults to `False`):
Whether or not the output should be placed on the same device as the input was.
@ -319,7 +319,7 @@ def attach_align_device_hook(
The module where we want to attach the hooks.
execution_device (`torch.device`, *optional*):
The device on which inputs and model weights should be placed before the forward pass.
offload (`bool`, *optional*, defaults to `False`):
offload (`bool`, *optional*, defauts to `False`):
Whether or not the weights should be offloaded after the forward pass.
weights_map (`Mapping[str, torch.Tensor]`, *optional*):
When the model weights are offloaded, a (potentially lazy) map from param names to the tensor values.
@ -402,7 +402,7 @@ def attach_align_device_hook_on_blocks(
execution_device (`torch.device` or `Dict[str, torch.device]`, *optional*):
The device on which inputs and model weights should be placed before the forward pass. It can be one device
for the whole module, or a dictionary mapping module name to device.
offload (`bool`, *optional*, defaults to `False`):
offload (`bool`, *optional*, defauts to `False`):
Whether or not the weights should be offloaded after the forward pass. It can be one boolean for the whole
module, or a dictionary mapping module name to boolean.
weights_map (`Mapping[str, torch.Tensor]`, *optional*):

View File

@ -20,7 +20,7 @@ import warnings
import torch
from .state import AcceleratorState
from .utils import PrecisionType, PrepareForLaunch, patch_environment
from .utils import PrecisionType, PrepareForLaunch, is_torch_version, patch_environment
def notebook_launcher(function, args=(), num_processes=None, use_fp16=False, mixed_precision="no", use_port="29500"):
@ -90,6 +90,12 @@ def notebook_launcher(function, args=(), num_processes=None, use_fp16=False, mix
if num_processes > 1:
# Multi-GPU launch
if is_torch_version("<", "1.5.0"):
raise ImportError(
"Using `notebook_launcher` for distributed training on GPUs require torch >= 1.5.0, got "
f"{torch.__version__}."
)
from torch.multiprocessing import start_processes
if len(AcceleratorState._shared_state) > 0:
@ -121,17 +127,12 @@ def notebook_launcher(function, args=(), num_processes=None, use_fp16=False, mix
start_processes(launcher, args=args, nprocs=num_processes, start_method="fork")
else:
# No need for a distributed launch otherwise as it's either CPU, GPU or MPS.
use_mps_device = "false"
if torch.backends.mps.is_available():
print("Launching training on MPS.")
use_mps_device = "true"
elif torch.cuda.is_available():
# No need for a distributed launch otherwise as it's either CPU or one GPU.
if torch.cuda.is_available():
print("Launching training on one GPU.")
else:
print("Launching training on CPU.")
with patch_environment(use_mps_device=use_mps_device):
function(*args)
function(*args)
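Editor's note: the hunk above changes how the single-process path picks a device. A hedged sketch of that selection order (MPS if available, then one GPU, then CPU); the function name is made up and this is not the `notebook_launcher` code itself:

```python
import torch


def pick_single_process_device() -> torch.device:
    mps_backend = getattr(torch.backends, "mps", None)
    if mps_backend is not None and mps_backend.is_available():
        print("Launching training on MPS.")
        return torch.device("mps")
    if torch.cuda.is_available():
        print("Launching training on one GPU.")
        return torch.device("cuda")
    print("Launching training on CPU.")
    return torch.device("cpu")


device = pick_single_process_device()
```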
def debug_launcher(function, args=(), num_processes=2):
@ -153,6 +154,12 @@ def debug_launcher(function, args=(), num_processes=2):
num_processes (`int`, *optional*, defaults to 2):
The number of processes to use for training.
"""
if is_torch_version("<", "1.5.0"):
raise ImportError(
"Using `debug_launcher` for distributed training on GPUs require torch >= 1.5.0, got "
f"{torch.__version__}."
)
from torch.multiprocessing import start_processes
with tempfile.NamedTemporaryFile() as tmp_file:

View File

@ -49,20 +49,15 @@ def get_logger(name: str):
If a log should be called on all processes, pass `main_process_only=False`
E.g.
```python
logger.info("My log", main_process_only=False)
logger.debug("My log", main_process_only=False)
```
Args:
name (`str`):
The name for the logger, such as `__file__`
Example:
```python
>>> from accelerate.logging import get_logger
>>> logger = get_logger(__name__)
>>> logger.info("My log", main_process_only=False)
>>> logger.debug("My log", main_process_only=True)
```
"""
logger = logging.getLogger(name)
return MultiProcessAdapter(logger, {})
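Editor's note: a rough sketch of what `MultiProcessAdapter` adds on top of a standard logger: records are dropped unless the caller is the main process or explicitly passes `main_process_only=False`. The class below is a simplification written for this note, not the Accelerate implementation:

```python
import logging


class MainProcessOnlyAdapter(logging.LoggerAdapter):
    def __init__(self, logger, is_main_process: bool):
        super().__init__(logger, {})
        self.is_main_process = is_main_process

    def log(self, level, msg, *args, main_process_only=True, **kwargs):
        # Only emit on the main process unless the caller opts out.
        if self.is_main_process or not main_process_only:
            self.logger.log(level, msg, *args, **kwargs)


logging.basicConfig(level=logging.DEBUG)
logger = MainProcessOnlyAdapter(logging.getLogger(__name__), is_main_process=False)
logger.debug("My log", main_process_only=False)  # emitted on every process
logger.debug("Main-process-only log")            # dropped on non-main processes
```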

View File

@ -28,7 +28,7 @@ class AcceleratedScheduler:
to avoid making a scheduler step too fast when gradients overflowed and there was no training step (in mixed
precision training)
When performing gradient accumulation scheduler lengths should not be changed accordingly, Accelerate will always
When performing gradient accumulation scheduler lengths should not be changed accordingly, accelerate will always
step the scheduler to account for it.
Args:
@ -69,9 +69,7 @@ class AcceleratedScheduler:
num_processes = AcceleratorState().num_processes
for _ in range(num_processes):
# Special case when using OneCycle and `drop_last` was not used
if hasattr(self.scheduler, "total_steps") and self.scheduler._step_count <= self.scheduler.total_steps:
self.scheduler.step(*args, **kwargs)
else:
if getattr(self.scheduler, "total_steps", 0) <= self.scheduler.last_epoch:
self.scheduler.step(*args, **kwargs)
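Editor's note: the `step` logic above compensates for each process stepping once per optimizer step by stepping `num_processes` times, while the `total_steps` guard keeps `OneCycleLR` from being stepped past its configured end. A self-contained sketch of that guard; the pretend `num_processes` value is an assumption for the example:

```python
import torch

model = torch.nn.Linear(2, 2)
optimizer = torch.optim.SGD(model.parameters(), lr=1.0)
scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer, max_lr=0.1, total_steps=4)
num_processes = 2  # pretend value for the sketch


def accelerated_scheduler_step(scheduler, num_processes):
    for _ in range(num_processes):
        if hasattr(scheduler, "total_steps"):
            # OneCycleLR raises once it has been stepped total_steps times.
            if scheduler._step_count <= scheduler.total_steps:
                scheduler.step()
        else:
            scheduler.step()


for _ in range(4):
    optimizer.step()
    accelerated_scheduler_step(scheduler, num_processes)
print(scheduler.get_last_lr())
```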
# Passthroughs

View File

@ -13,7 +13,6 @@
# limitations under the License.
import os
import warnings
from distutils.util import strtobool
import torch
@ -50,14 +49,14 @@ class AcceleratorState:
"""
Singleton class that has information about the current training environment.
**Available attributes:**
**Attributes:**
- **device** (`torch.device`) -- The device to use.
- **distributed_type** ([`~accelerate.state.DistributedType`]) -- The type of distributed environment currently
in use.
- **local_process_index** (`int`) -- The index of the current process on the current server.
- **mixed_precision** (`str`) -- Whether or not the current script will use mixed precision, and if so the type
of mixed precision being performed.
- **mixed_precision** (`str`) -- Whether or not the current script will use mixed precision. If you are using
mixed precision, define if you want to use FP16 or BF16 (bfloat16) as the floating point.
- **num_processes** (`int`) -- The number of processes currently launched in parallel.
- **process_index** (`int`) -- The index of the current process.
"""
@ -222,14 +221,6 @@ class AcceleratorState:
"and/or you do not have an MPS-enabled device on this machine."
)
else:
from .utils import is_torch_version
if not is_torch_version(">", "1.12.0"):
warnings.warn(
"We strongly recommend to install PyTorch >= 1.13 (nightly version at the time of writing) on your MacOS machine. "
"It has major fixes related to model correctness and performance improvements for transformer based models. "
"Please refer to https://github.com/pytorch/pytorch/issues/82707 for more details."
)
self.device = torch.device("mps")
elif cpu or not torch.cuda.is_available():
self.device = torch.device("cpu")
@ -251,7 +242,7 @@ class AcceleratorState:
if self.distributed_type == DistributedType.DEEPSPEED:
repr += f"ds_config: {self.deepspeed_plugin.deepspeed_config}\n"
else:
repr += f"Mixed precision type: {mixed_precision}\n"
f"Mixed precision type: {mixed_precision}\n"
return repr
# For backward compatibility
@ -278,11 +269,10 @@ class GradientState:
"""
Singleton class that has information related to gradient synchronization for gradient accumulation
**Available attributes:**
**Attributes:**
- **end_of_dataloader** (`bool`) -- Whether we have reached the end of the current dataloader
- **remainder** (`int`) -- The number of extra samples that were added from padding the dataloader
- **sync_gradients** (`bool`) -- Whether the gradients should be synced across all devices
"""
_shared_state = {}
@ -311,5 +301,5 @@ class GradientState:
self.end_of_dataloader = end_of_dataloader
def _set_remainder(self, remainder):
"Private function that sets the number of remaining samples at the end of the dataloader. Users should not have to call this."
"Private function that sets the number of remaining samples at the end of the dataloader"
self.remainder = remainder
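Editor's note: `AcceleratorState` and `GradientState` both rely on the shared-state ("Borg") pattern: every instance points its `__dict__` at one class-level dictionary, so state set anywhere is visible everywhere. A minimal sketch with an illustrative class name:

```python
class GradientStateSketch:
    _shared_state = {}

    def __init__(self):
        self.__dict__ = self._shared_state
        if not getattr(self, "initialized", False):
            self.end_of_dataloader = False
            self.remainder = -1
            self.sync_gradients = True
            self.initialized = True


a = GradientStateSketch()
b = GradientStateSketch()
a.sync_gradients = False
assert b.sync_gradients is False  # both instances share the same state
```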

View File

@ -10,7 +10,6 @@ from .testing import (
require_huggingface_suite,
require_multi_gpu,
require_single_gpu,
require_torch_min_version,
require_tpu,
skip,
slow,

View File

@ -46,9 +46,10 @@ def rng_sync_check():
if state.distributed_type == DistributedType.MULTI_GPU:
synchronize_rng_states(["cuda"])
assert are_the_same_tensors(torch.cuda.get_rng_state()), "RNG states improperly synchronized on GPU."
generator = torch.Generator()
synchronize_rng_states(["generator"], generator=generator)
assert are_the_same_tensors(generator.get_state()), "RNG states improperly synchronized in generator."
if is_torch_version(">=", "1.6.0"):
generator = torch.Generator()
synchronize_rng_states(["generator"], generator=generator)
assert are_the_same_tensors(generator.get_state()), "RNG states improperly synchronized in generator."
if state.local_process_index == 0:
print("All rng are properly synched.")
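Editor's note: what the generator check above verifies is that every process ends up with an identical generator state. In a real run the state is broadcast from rank 0; the local sketch below just copies it between two generators as a stand-in and confirms they produce the same stream:

```python
import torch

g_rank0 = torch.Generator()
g_rank1 = torch.Generator()
g_rank0.manual_seed(1234)

# Stand-in for the broadcast: give "rank 1" the exact state of "rank 0".
g_rank1.set_state(g_rank0.get_state())

assert torch.equal(torch.rand(3, generator=g_rank0), torch.rand(3, generator=g_rank1))
print("Generator states are in sync.")
```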
@ -338,7 +339,7 @@ def main():
if state.local_process_index == 0:
print("\n**DataLoader integration test**")
dl_preparation_check()
if state.distributed_type != DistributedType.TPU and is_torch_version(">=", "1.8.0"):
if state.distributed_type != DistributedType.TPU:
central_dl_preparation_check()
# Trainings are not exactly the same in DeepSpeed and CPU mode

View File

@ -20,7 +20,6 @@ import sys
import tempfile
import unittest
from distutils.util import strtobool
from functools import partial
from pathlib import Path
from typing import List, Union
from unittest import mock
@ -133,16 +132,6 @@ def require_fsdp(test_case):
return unittest.skipUnless(is_torch_version(">=", "1.12.0"), "test requires torch version >= 1.12.0")(test_case)
def require_torch_min_version(test_case=None, version=None):
"""
Decorator marking that a test requires a particular torch version to be tested. These tests are skipped when an
installed torch version is less than the required one.
"""
if test_case is None:
return partial(require_torch_min_version, version=version)
return unittest.skipUnless(is_torch_version(">=", version), f"test requires torch version >= {version}")(test_case)
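Editor's note: for reference, a version-gated skip decorator along the lines of the removed `require_torch_min_version` can be sketched with the `packaging` library directly; the parameter name below is chosen for the example:

```python
import unittest
from functools import partial

import torch
from packaging import version


def require_torch_min_version(test_case=None, min_version=None):
    # Allows usage as @require_torch_min_version(min_version="1.9.0").
    if test_case is None:
        return partial(require_torch_min_version, min_version=min_version)
    satisfied = version.parse(torch.__version__) >= version.parse(min_version)
    return unittest.skipUnless(satisfied, f"test requires torch version >= {min_version}")(test_case)
```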
def require_tensorboard(test_case):
"""
Decorator marking a test that requires tensorboard installed. These tests are skipped when tensorboard isn't

View File

@ -16,14 +16,11 @@
# Provide a project dir name, then each type of logger gets stored in project/{`logging_dir`}
import os
import time
from abc import ABCMeta, abstractmethod, abstractproperty
from typing import List, Optional, Union
import yaml
from .logging import get_logger
from .utils import LoggerType, is_aim_available, is_comet_ml_available, is_tensorboard_available, is_wandb_available
from .utils import LoggerType, is_comet_ml_available, is_tensorboard_available, is_wandb_available
_available_trackers = []
@ -43,11 +40,6 @@ if is_comet_ml_available():
_available_trackers.append(LoggerType.COMETML)
if is_aim_available():
from aim import Run
_available_trackers.append(LoggerType.AIM)
logger = get_logger(__name__)
@ -139,8 +131,8 @@ class TensorBoardTracker(GeneralTracker):
self.run_name = run_name
self.logging_dir = os.path.join(logging_dir, run_name)
self.writer = tensorboard.SummaryWriter(self.logging_dir, **kwargs)
logger.debug(f"Initialized TensorBoard project {self.run_name} logging to {self.logging_dir}")
logger.debug(
logger.info(f"Initialized TensorBoard project {self.run_name} logging to {self.logging_dir}")
logger.info(
"Make sure to log any initial configurations with `self.store_init_configuration` before training!"
)
@ -150,8 +142,7 @@ class TensorBoardTracker(GeneralTracker):
def store_init_configuration(self, values: dict):
"""
Logs `values` as hyperparameters for the run. Should be run at the beginning of your experiment. Stores the
hyperparameters in a yaml file for future use.
Logs `values` as hyperparameters for the run. Should be run at the beginning of your experiment.
Args:
values (Dictionary `str` to `bool`, `str`, `float` or `int`):
@ -160,16 +151,7 @@ class TensorBoardTracker(GeneralTracker):
"""
self.writer.add_hparams(values, metric_dict={})
self.writer.flush()
project_run_name = time.time()
dir_name = os.path.join(self.logging_dir, str(project_run_name))
os.makedirs(dir_name, exist_ok=True)
with open(os.path.join(dir_name, "hparams.yml"), "w") as outfile:
try:
yaml.dump(values, outfile)
except yaml.representer.RepresenterError:
logger.error("Serialization to store hyperparameters failed")
raise
logger.debug("Stored initial configuration hyperparameters to TensorBoard and hparams yaml file")
logger.info("Stored initial configuration hyperparameters to TensorBoard")
def log(self, values: dict, step: Optional[int] = None, **kwargs):
"""
@ -193,14 +175,14 @@ class TensorBoardTracker(GeneralTracker):
elif isinstance(v, dict):
self.writer.add_scalars(k, v, global_step=step, **kwargs)
self.writer.flush()
logger.debug("Successfully logged to TensorBoard")
logger.info("Successfully logged to TensorBoard")
def finish(self):
"""
Closes `TensorBoard` writer
"""
self.writer.close()
logger.debug("TensorBoard writer closed")
logger.info("TensorBoard writer closed")
class WandBTracker(GeneralTracker):
@ -220,8 +202,8 @@ class WandBTracker(GeneralTracker):
def __init__(self, run_name: str, **kwargs):
self.run_name = run_name
self.run = wandb.init(project=self.run_name, **kwargs)
logger.debug(f"Initialized WandB project {self.run_name}")
logger.debug(
logger.info(f"Initialized WandB project {self.run_name}")
logger.info(
"Make sure to log any initial configurations with `self.store_init_configuration` before training!"
)
@ -239,7 +221,7 @@ class WandBTracker(GeneralTracker):
`str`, `float`, `int`, or `None`.
"""
wandb.config.update(values)
logger.debug("Stored initial configuration hyperparameters to WandB")
logger.info("Stored initial configuration hyperparameters to WandB")
def log(self, values: dict, step: Optional[int] = None, **kwargs):
"""
@ -255,14 +237,14 @@ class WandBTracker(GeneralTracker):
Additional key word arguments passed along to the `wandb.log` method.
"""
self.run.log(values, step=step, **kwargs)
logger.debug("Successfully logged to WandB")
logger.info("Successfully logged to WandB")
def finish(self):
"""
Closes `wandb` writer
"""
self.run.finish()
logger.debug("WandB run closed")
logger.info("WandB run closed")
class CometMLTracker(GeneralTracker):
@ -284,8 +266,8 @@ class CometMLTracker(GeneralTracker):
def __init__(self, run_name: str, **kwargs):
self.run_name = run_name
self.writer = Experiment(project_name=run_name, **kwargs)
logger.debug(f"Initialized CometML project {self.run_name}")
logger.debug(
logger.info(f"Initialized CometML project {self.run_name}")
logger.info(
"Make sure to log any initial configurations with `self.store_init_configuration` before training!"
)
@ -303,7 +285,7 @@ class CometMLTracker(GeneralTracker):
`str`, `float`, `int`, or `None`.
"""
self.writer.log_parameters(values)
logger.debug("Stored initial configuration hyperparameters to CometML")
logger.info("Stored initial configuration hyperparameters to CometML")
def log(self, values: dict, step: Optional[int] = None, **kwargs):
"""
@ -328,82 +310,17 @@ class CometMLTracker(GeneralTracker):
self.writer.log_other(k, v, **kwargs)
elif isinstance(v, dict):
self.writer.log_metrics(v, step=step, **kwargs)
logger.debug("Successfully logged to CometML")
logger.info("Successfully logged to CometML")
def finish(self):
"""
Closes `comet-ml` writer
"""
self.writer.end()
logger.debug("CometML run closed")
logger.info("CometML run closed")
class AimTracker(GeneralTracker):
"""
A `Tracker` class that supports `aim`. Should be initialized at the start of your script.
Args:
run_name (`str`):
The name of the experiment run.
kwargs:
Additional key word arguments passed along to the `Run.__init__` method.
"""
name = "aim"
requires_logging_directory = True
def __init__(self, run_name: str, logging_dir: Optional[Union[str, os.PathLike]] = ".", **kwargs):
self.run_name = run_name
self.writer = Run(repo=logging_dir, **kwargs)
self.writer.name = self.run_name
logger.debug(f"Initialized Aim project {self.run_name}")
logger.debug(
"Make sure to log any initial configurations with `self.store_init_configuration` before training!"
)
@property
def tracker(self):
return self.writer
def store_init_configuration(self, values: dict):
"""
Logs `values` as hyperparameters for the run. Should be run at the beginning of your experiment.
Args:
values (`dict`):
Values to be stored as initial hyperparameters as key-value pairs.
"""
self.writer["hparams"] = values
def log(self, values: dict, step: Optional[int], **kwargs):
"""
Logs `values` to the current run.
Args:
values (`dict`):
Values to be logged as key-value pairs.
step (`int`, *optional*):
The run step. If included, the log will be affiliated with this step.
kwargs:
Additional key word arguments passed along to the `Run.track` method.
"""
# Note: replace this with the dictionary support when merged
for key, value in values.items():
self.writer.track(value, name=key, step=step, **kwargs)
def finish(self):
"""
Closes `aim` writer
"""
self.writer.close()
LOGGER_TYPE_TO_CLASS = {
"aim": AimTracker,
"comet_ml": CometMLTracker,
"tensorboard": TensorBoardTracker,
"wandb": WandBTracker,
}
LOGGER_TYPE_TO_CLASS = {"tensorboard": TensorBoardTracker, "wandb": WandBTracker, "comet_ml": CometMLTracker}
def filter_trackers(
@ -424,8 +341,8 @@ def filter_trackers(
- `"tensorboard"`
- `"wandb"`
- `"comet_ml"`
If `"all"` is selected, will pick up all available trackers in the environment and initialize them. Can
also accept implementations of `GeneralTracker` for custom trackers, and can be combined with `"all"`.
If `"all`" is selected, will pick up all available trackers in the environment and intialize them. Can also
accept implementations of `GeneralTracker` for custom trackers, and can be combined with `"all"`.
logging_dir (`str`, `os.PathLike`, *optional*):
A path to a directory for storing logs of locally-compatible loggers.
"""
@ -454,6 +371,6 @@ def filter_trackers(
)
loggers.append(log_type)
else:
logger.debug(f"Tried adding logger {log_type}, but package is unavailable in the system.")
logger.info(f"Tried adding logger {log_type}, but package is unavailable in the system.")
return loggers
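Editor's note: the trackers above all expose the same small surface (`store_init_configuration`, `log`, `finish`). An illustrative file-based tracker with that surface, written only for this note; it is not part of Accelerate and its names are assumptions:

```python
import json
import os
from typing import Optional


class JSONLTracker:
    name = "jsonl"
    requires_logging_directory = True

    def __init__(self, run_name: str, logging_dir: str = "."):
        self.run_name = run_name
        os.makedirs(logging_dir, exist_ok=True)
        self._file = open(os.path.join(logging_dir, f"{run_name}.jsonl"), "a")

    def store_init_configuration(self, values: dict):
        self._file.write(json.dumps({"hparams": values}) + "\n")

    def log(self, values: dict, step: Optional[int] = None):
        self._file.write(json.dumps({"step": step, **values}) + "\n")

    def finish(self):
        self._file.close()


tracker = JSONLTracker("demo", logging_dir="logs")
tracker.store_init_configuration({"lr": 3e-4})
tracker.log({"loss": 0.42}, step=1)
tracker.finish()
```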

View File

@ -2,7 +2,7 @@
# There's no way to ignore "F401 '...' imported but unused" warnings in this
# module, but to preserve other warnings. So, don't check this module at all
from .constants import MODEL_NAME, OPTIMIZER_NAME, RNG_STATE_NAME, SCALER_NAME, SCHEDULER_NAME, TORCH_LAUNCH_PARAMS
from .constants import MODEL_NAME, OPTIMIZER_NAME, RNG_STATE_NAME, SCALER_NAME, SCHEDULER_NAME
from .dataclasses import (
ComputeEnvironment,
DeepSpeedPlugin,
@ -20,7 +20,6 @@ from .dataclasses import (
)
from .imports import (
get_ccl_version,
is_aim_available,
is_apex_available,
is_bf16_available,
is_boto3_available,
@ -28,7 +27,6 @@ from .imports import (
is_comet_ml_available,
is_datasets_available,
is_deepspeed_available,
is_rich_available,
is_sagemaker_available,
is_tensorboard_available,
is_tpu_available,
@ -93,7 +91,7 @@ if is_deepspeed_available():
HfDeepSpeedConfig,
)
from .launch import PrepareForLaunch, _filter_args, get_launch_prefix
from .launch import PrepareForLaunch, get_launch_prefix
from .memory import find_executable_batch_size
from .other import (
extract_model_from_parallel,

View File

@ -31,30 +31,3 @@ FSDP_STATE_DICT_TYPE = ["FULL_STATE_DICT", "LOCAL_STATE_DICT", "SHARDED_STATE_DI
DEEPSPEED_MULTINODE_LAUNCHERS = ["pdsh", "standard", "openmpi", "mvapich"]
STR_OPERATION_TO_FUNC = {">": op.gt, ">=": op.ge, "==": op.eq, "!=": op.ne, "<=": op.le, "<": op.lt}
# These are the args for `torch.distributed.launch` for pytorch < 1.9
TORCH_LAUNCH_PARAMS = [
"nnodes",
"nproc_per_node",
"rdzv_backend",
"rdzv_endpoint",
"rdzv_id",
"rdzv_conf",
"standalone",
"max_restarts",
"monitor_interval",
"start_method",
"role",
"module",
"m",
"no_python",
"run_path",
"log_dir",
"r",
"redirects",
"t",
"tee",
"node_rank",
"master_addr",
"master_port",
]

View File

@ -60,8 +60,6 @@ class DistributedDataParallelKwargs(KwargsHandler):
`gradient_as_bucket_view` is only available in PyTorch 1.7.0 and later versions.
`static_graph` is only available in PyTorch 1.11.0 and later versions.
</Tip>"""
dim: int = 0
@ -70,7 +68,6 @@ class DistributedDataParallelKwargs(KwargsHandler):
find_unused_parameters: bool = False
check_reduction: bool = False
gradient_as_bucket_view: bool = False
static_graph: bool = False
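Editor's note: the fields above are forwarded to `torch.nn.parallel.DistributedDataParallel` through a kwargs handler. A typical usage sketch, assuming the documented `kwargs_handlers` argument of `Accelerator`:

```python
from accelerate import Accelerator, DistributedDataParallelKwargs

# Forward DDP-specific options to the wrapped model via a kwargs handler.
ddp_kwargs = DistributedDataParallelKwargs(find_unused_parameters=True)
accelerator = Accelerator(kwargs_handlers=[ddp_kwargs])
```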
@dataclass
@ -196,7 +193,6 @@ class LoggerType(BaseEnum):
"""
ALL = "all"
AIM = "aim"
TENSORBOARD = "tensorboard"
WANDB = "wandb"
COMETML = "comet_ml"

View File

@ -14,7 +14,6 @@
import importlib
import sys
from functools import lru_cache
import torch
@ -51,7 +50,6 @@ def is_apex_available():
return importlib.util.find_spec("apex") is not None
@lru_cache()
def is_tpu_available(check_device=True):
"Checks if `torch_xla` is installed and potentially if a TPU is in the environment"
if _tpu_available and check_device:
@ -95,10 +93,6 @@ def is_datasets_available():
return importlib.util.find_spec("datasets") is not None
def is_aim_available():
return importlib.util.find_spec("aim") is not None
def is_tensorboard_available():
return importlib.util.find_spec("tensorboard") is not None or importlib.util.find_spec("tensorboardX") is not None
@ -115,10 +109,6 @@ def is_boto3_available():
return importlib.util.find_spec("boto3") is not None
def is_rich_available():
return importlib.util.find_spec("rich") is not None
def is_sagemaker_available():
return importlib.util.find_spec("sagemaker") is not None
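Editor's note: all of these helpers follow one pattern: a soft dependency counts as available if `importlib` can locate it, without importing it. A generic sketch of that pattern:

```python
import importlib.util
from functools import lru_cache


@lru_cache()
def is_package_available(package_name: str) -> bool:
    # find_spec only looks the package up; it does not import it.
    return importlib.util.find_spec(package_name) is not None


print(is_package_available("torch"))
print(is_package_available("some_package_that_is_not_installed"))
```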

View File

@ -21,10 +21,6 @@ from ..utils import is_torch_version
from .dataclasses import DistributedType
if is_torch_version(">=", "1.9.0"):
import torch.distributed.run as distrib_run
def get_launch_prefix():
"""
Grabs the correct launcher for starting a distributed command, such as either `torchrun`, `python -m
@ -39,19 +35,6 @@ def get_launch_prefix():
return cmd
def _filter_args(args):
"""
Filters out all `accelerate` specific args
"""
distrib_args = distrib_run.get_args_parser()
new_args, _ = distrib_args.parse_known_args()
for key, value in vars(args).items():
if key in vars(new_args).keys():
setattr(new_args, key, value)
return new_args
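Editor's note: `_filter_args` above keeps only the options that `torch.distributed.run`'s own parser understands. The same idea in isolation, against a stand-in parser; the names below are invented for the example:

```python
import argparse


def filter_args_for(parser: argparse.ArgumentParser, source: argparse.Namespace) -> argparse.Namespace:
    # Start from the target parser's defaults, then copy over only the keys it knows.
    known, _ = parser.parse_known_args([])
    for key, value in vars(source).items():
        if key in vars(known):
            setattr(known, key, value)
    return known


torch_like_parser = argparse.ArgumentParser()
torch_like_parser.add_argument("--nproc_per_node", type=int, default=1)
accelerate_args = argparse.Namespace(nproc_per_node=4, mixed_precision="fp16")
print(filter_args_for(torch_like_parser, accelerate_args))  # mixed_precision is dropped
```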
class PrepareForLaunch:
"""
Prepare a function that will be launched in a distributed setup.

View File

@ -617,7 +617,7 @@ def load_checkpoint_in_model(
dtype (`str` or `torch.dtype`, *optional*):
If provided, the weights will be converted to that type when loaded.
offload_state_dict (`bool`, *optional*, defaults to `False`):
If `True`, will temporarily offload the CPU state dict on the hard drive to avoid getting out of CPU RAM if
If `True`, will temporarily offload the CPU state dict on the hard drive to avoig getting out of CPU RAM if
the weight of the CPU state dict + the biggest shard does not fit.
"""
if offload_folder is None and device_map is not None and "disk" in device_map.values():

View File

@ -64,7 +64,7 @@ def synchronize_rng_state(rng_type: Optional[RNGType] = None, generator: Optiona
state = AcceleratorState()
if state.distributed_type == DistributedType.TPU:
rng_state = xm.mesh_reduce("random_seed", rng_state, lambda x: x[0])
elif state.distributed_type in [DistributedType.DEEPSPEED, DistributedType.MULTI_GPU, DistributedType.FSDP]:
elif state.distributed_type in [DistributedType.DEEPSPEED, DistributedType.MULTI_GPU]:
rng_state = rng_state.to(state.device)
torch.distributed.broadcast(rng_state, 0)
rng_state = rng_state.cpu()

View File

@ -1,24 +0,0 @@
# Copyright 2022 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .imports import is_rich_available
if is_rich_available():
from rich.traceback import install
install(show_locals=False)
else:
raise ModuleNotFoundError("To use the rich extension, install rich with `pip install rich`")
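Editor's note: the module above is the entire `rich` integration. A hedged variant of the same idea that degrades gracefully when `rich` is missing instead of raising, for illustration only:

```python
import importlib.util

# Enable pretty tracebacks only when rich is installed (sketch; the module
# shown above raises ModuleNotFoundError instead of falling back).
if importlib.util.find_spec("rich") is not None:
    from rich.traceback import install

    install(show_locals=False)
else:
    print("Install rich (`pip install rich`) to get pretty tracebacks.")
```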

View File

@ -35,6 +35,7 @@ from accelerate.test_utils.testing import (
require_cuda,
require_deepspeed,
require_multi_gpu,
skip,
slow,
)
from accelerate.test_utils.training import RegressionDataset
@ -696,6 +697,7 @@ class DeepSpeedIntegrationTest(TempDirTestCase):
with patch_environment(omp_num_threads=1):
execute_subprocess_async(cmd_stage, env=os.environ.copy())
@skip
def test_checkpointing(self):
self.test_file_path = os.path.join(self.test_scripts_folder, "test_checkpointing.py")
cmd = [

View File

@ -27,7 +27,7 @@ from accelerate.big_modeling import (
load_checkpoint_and_dispatch,
)
from accelerate.hooks import remove_hook_from_submodules
from accelerate.test_utils import require_cuda, require_multi_gpu, require_torch_min_version, slow
from accelerate.test_utils import require_cuda, require_multi_gpu, slow
from accelerate.utils import offload_state_dict
from transformers import AutoModelForCausalLM, AutoTokenizer
@ -79,7 +79,6 @@ class ModelWithUnusedSubModulesForTest(nn.Module):
return self.linear4(self.linear3(self.batchnorm(self.linear2(self.linear1(x)))))
@require_torch_min_version(version="1.9.0")
class BigModelingTester(unittest.TestCase):
def test_init_empty_weights(self):
# base use

View File

@ -36,7 +36,6 @@ EXCLUDE_EXAMPLES = [
"gradient_accumulation.py",
"multi_process_metrics.py",
"memory.py",
"automatic_gradient_accumulation.py",
"fsdp_with_peak_mem_tracking.py",
"deepspeed_with_config_support.py",
]

View File

@ -27,7 +27,7 @@ from accelerate.hooks import (
remove_hook_from_module,
remove_hook_from_submodules,
)
from accelerate.test_utils import require_multi_gpu, require_torch_min_version
from accelerate.test_utils import require_multi_gpu
class ModelForTest(nn.Module):
@ -51,7 +51,6 @@ class PostForwardHook(ModelHook):
return output + 1
@require_torch_min_version(version="1.9.0")
class HooksModelTester(unittest.TestCase):
def test_add_and_remove_hooks(self):
test_model = ModelForTest()

View File

@ -26,13 +26,11 @@ from accelerate.test_utils import (
require_huggingface_suite,
require_multi_gpu,
require_single_gpu,
require_torch_min_version,
)
from accelerate.utils import get_launch_prefix, patch_environment
@require_huggingface_suite
@require_torch_min_version(version="1.8.0")
class MetricTester(unittest.TestCase):
def setUp(self):
mod_file = inspect.getfile(accelerate.test_utils)

View File

@ -21,7 +21,6 @@ import torch
import torch.nn as nn
from accelerate.test_utils import require_cuda, require_multi_gpu
from accelerate.test_utils.testing import require_torch_min_version
from accelerate.utils.modeling import (
check_device_map,
clean_device_map,
@ -46,7 +45,6 @@ class ModelForTest(nn.Module):
return self.linear2(self.batchnorm(self.linear1(x)))
@require_torch_min_version(version="1.9.0")
class ModelingUtilsTester(unittest.TestCase):
def check_set_module_tensor_for_device(self, model, device1, device2):
self.assertEqual(model.linear1.weight.device, torch.device(device1))

View File

@ -21,30 +21,12 @@ from accelerate import Accelerator, debug_launcher
from accelerate.test_utils import require_cpu
def one_cycle_test(num_processes=2, step_scheduler_with_optimizer=True, split_batches=False):
accelerator = Accelerator(step_scheduler_with_optimizer=step_scheduler_with_optimizer, split_batches=split_batches)
model = torch.nn.Linear(2, 4)
optimizer = torch.optim.AdamW(model.parameters(), lr=1.0)
scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer, max_lr=0.01, steps_per_epoch=2, epochs=1)
model, optimizer, scheduler = accelerator.prepare(model, optimizer, scheduler)
# Optimizer has stepped
scheduler.step()
if step_scheduler_with_optimizer or (num_processes == 1):
assert (
scheduler.scheduler.last_epoch == num_processes
), f"Last Epoch ({scheduler.scheduler.last_epoch}) != Num Processes ({num_processes})"
else:
assert (
scheduler.scheduler.last_epoch != num_processes
), f"Last Epoch ({scheduler.scheduler.last_epoch}) == Num Processes ({num_processes})"
def lambda_test(num_processes=2, step_scheduler_with_optimizer=True, split_batches=False):
def scheduler_test(num_processes=2, step_scheduler_with_optimizer=True, split_batches=False):
accelerator = Accelerator(step_scheduler_with_optimizer=step_scheduler_with_optimizer, split_batches=split_batches)
model = torch.nn.Linear(2, 4)
optimizer = torch.optim.AdamW(model.parameters(), lr=1.0)
scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda n: 1 - n / 10)
model, optimizer, scheduler = accelerator.prepare(model, optimizer, scheduler)
# Optimizer has stepped
@ -67,30 +49,16 @@ def lambda_test(num_processes=2, step_scheduler_with_optimizer=True, split_batch
@require_cpu
class SchedulerTester(unittest.TestCase):
def test_lambda_scheduler_steps_with_optimizer_single_process(self):
debug_launcher(partial(lambda_test, num_processes=1), num_processes=1)
debug_launcher(partial(lambda_test, num_processes=1, split_batches=True), num_processes=1)
def test_scheduler_steps_with_optimizer_single_process(self):
debug_launcher(partial(scheduler_test, num_processes=1), num_processes=1)
debug_launcher(partial(scheduler_test, num_processes=1, split_batches=True), num_processes=1)
def test_one_cycle_scheduler_steps_with_optimizer_single_process(self):
debug_launcher(partial(one_cycle_test, num_processes=1), num_processes=1)
debug_launcher(partial(one_cycle_test, num_processes=1, split_batches=True), num_processes=1)
def test_scheduler_not_step_with_optimizer_single_process(self):
debug_launcher(partial(scheduler_test, num_processes=1, step_scheduler_with_optimizer=False), num_processes=1)
def test_lambda_scheduler_not_step_with_optimizer_single_process(self):
debug_launcher(partial(lambda_test, num_processes=1, step_scheduler_with_optimizer=False), num_processes=1)
def test_scheduler_steps_with_optimizer_multiprocess(self):
debug_launcher(scheduler_test)
debug_launcher(partial(scheduler_test, num_processes=1, split_batches=True), num_processes=1)
def test_one_cycle_scheduler_not_step_with_optimizer_single_process(self):
debug_launcher(partial(one_cycle_test, num_processes=1, step_scheduler_with_optimizer=False), num_processes=1)
def test_lambda_scheduler_steps_with_optimizer_multiprocess(self):
debug_launcher(lambda_test)
debug_launcher(partial(lambda_test, num_processes=1, split_batches=True), num_processes=1)
def test_one_cycle_scheduler_steps_with_optimizer_multiprocess(self):
debug_launcher(one_cycle_test)
debug_launcher(partial(one_cycle_test, num_processes=1, split_batches=True), num_processes=1)
def test_lambda_scheduler_not_step_with_optimizer_multiprocess(self):
debug_launcher(partial(lambda_test, step_scheduler_with_optimizer=False))
def test_one_cycle_scheduler_not_step_with_optimizer_multiprocess(self):
debug_launcher(partial(one_cycle_test, step_scheduler_with_optimizer=False))
def test_scheduler_not_step_with_optimizer_multiprocess(self):
debug_launcher(partial(scheduler_test, step_scheduler_with_optimizer=False))

View File

@ -85,12 +85,12 @@ class WandBTrackingTest(TempDirTestCase, MockingTestCase):
self.add_mocks(mock.patch.dict(os.environ, {"WANDB_DIR": self.tmpdir}))
@staticmethod
def get_value_from_log(key: str, log: str, key_occurrence: int = 0):
def get_value_from_log(key: str, log: str, key_occurance: int = 0):
"""
Parses wandb log for `key` and returns the value.
If parsing through multiple calls to .log, pass in a `key_occurrence`
If parsing through multiple calls to .log, pass in a `key_occurance`
"""
res = re.findall(rf"(?<={key} )[^\s]+", log)[key_occurrence]
res = re.findall(rf"(?<={key} )[^\s]+", log)[key_occurance]
if '"' in res:
return re.findall(r'"([^"]*)"', res)[0]
else:

View File

@ -1,34 +0,0 @@
import json
from pathlib import Path
import subprocess
failed = []
passed = []
group_info = []
total_num_failed = 0
for log in Path().glob("*.log"):
section_num_failed = 0
with open(log, "r") as f:
for line in f:
line = json.loads(line)
if line.get("nodeid", "") != "":
test = line["nodeid"]
if line.get("duration", None) is not None:
duration = f'{line["duration"]:.4f}'
if line.get("outcome", "") == "failed":
section_num_failed += 1
failed.append([test, duration])
else:
passed.append([test, duration])
group_info.append([str(log), section_num_failed])
if len(failed) > 0:
result = "## Failed Tests:\n"
failed_table = '| Test Location | Test Class | Test Name |\n|---|---|---|\n| '
for test in failed:
failed_table += ' | '.join(test[0].split("::"))
failed_table += " |"
result += failed_table
print(result)
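Editor's note: the script above consumes `pytest-reportlog` output, one JSON object per line with `nodeid`, `outcome` and `duration` fields. A compact sketch of the same parsing loop, assuming `.log` files produced with `pytest --report-log`:

```python
import json
from pathlib import Path

failed, passed = [], []
for log in Path().glob("*.log"):
    with open(log) as f:
        for raw in f:
            record = json.loads(raw)
            nodeid = record.get("nodeid", "")
            if not nodeid:
                continue
            duration = f"{record.get('duration', 0.0):.4f}"
            (failed if record.get("outcome") == "failed" else passed).append((nodeid, duration))

if failed:
    print("## Failed Tests:")
    print("| Test Location | Test Class | Test Name |")
    print("|---|---|---|")
    for nodeid, _ in failed:
        print("| " + " | ".join(nodeid.split("::")) + " |")
```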

View File

@ -28,7 +28,7 @@ BLACK_AVOID_PATTERNS = {}
# Regexes
# Re pattern that catches list introduction (with potential indent)
_re_list = re.compile(r"^(\s*-\s+|\s*\*\s+|\s*\d+\.\s+)")
# Re pattern that catches code block introduction (with potential indent)
# Re pattern that catches code block introduction (with potentinal indent)
_re_code = re.compile(r"^(\s*)```(.*)$")
# Re pattern that catches rst args blocks of the form `Parameters:`.
_re_args = re.compile("^\s*(Args?|Arguments?|Params?|Parameters?):\s*$")
@ -62,7 +62,7 @@ def parse_code_example(code_lines):
Args:
code_lines (`List[str]`): The code lines to parse.
max_len (`int`): The maximum length per line.
max_len (`int`): The maximum lengh per line.
Returns:
(List[`str`], List[`str`]): The list of code samples and the list of outputs.
@ -109,7 +109,7 @@ def format_code_example(code: str, max_len: int, in_docstring: bool = False):
Args:
code (`str`): The code example to format.
max_len (`int`): The maximum length per line.
max_len (`int`): The maximum lengh per line.
in_docstring (`bool`, *optional*, defaults to `False`): Whether or not the code example is inside a docstring.
Returns: