Mirror of https://github.com/huggingface/accelerate.git (synced 2025-11-17 16:04:35 +08:00)

Compare commits: enable-das...v0.27.2 (4 commits)
| Author | SHA1 | Date |
|---|---|---|
| | 5eb7ab9f6a | |
| | 5114a37ea3 | |
| | ad8a2db6fc | |
| | b7087be5f6 | |
@@ -1,13 +0,0 @@
repos:
  - repo: https://github.com/astral-sh/ruff-pre-commit
    rev: v0.2.1
    hooks:
      - id: ruff
        args:
          - --fix
      - id: ruff-format
  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v4.5.0
    hooks:
      - id: check-merge-conflict
      - id: check-yaml
@@ -172,14 +172,6 @@ Follow these steps to start contributing:
$ make quality
```

You can also set up [`pre-commit`](https://pre-commit.com/) to run these checks
automatically as Git commit hooks.

```bash
$ pip install pre-commit
$ pre-commit install
```

Once you're happy with your changes, add changed files using `git add` and
make a commit with `git commit` to record your changes locally:
Makefile (+4 -4)

@@ -12,13 +12,13 @@ extra_quality_checks:

# this target runs checks on all files
quality:
    ruff check $(check_dirs)
    ruff $(check_dirs)
    ruff format --check $(check_dirs)
    doc-builder style src/accelerate docs/source --max_len 119 --check_only

# Format source code automatically and check if there are any problems left that need manual fixing
style:
    ruff check $(check_dirs) --fix
    ruff $(check_dirs) --fix
    ruff format $(check_dirs)
    doc-builder style src/accelerate docs/source --max_len 119
@@ -1,16 +1,3 @@
# Copyright 2023 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import gc
import threading
import time
@@ -4,7 +4,7 @@
# Use base conda image to reduce time
FROM continuumio/miniconda3:latest AS compile-image
# Specify py version
ENV PYTHON_VERSION=3.9
ENV PYTHON_VERSION=3.8
# Install apt libs
RUN apt-get update && \
    apt-get install -y curl git wget && \
@@ -10,11 +10,7 @@
- local: basic_tutorials/overview
title: Overview
- local: basic_tutorials/migration
title: Add Accelerate to your code
- local: basic_tutorials/execution
title: Execution process
- local: basic_tutorials/tpu
title: TPU training
title: Migrating to 🤗 Accelerate
- local: basic_tutorials/launch
title: Launching distributed code
- local: basic_tutorials/notebook

@@ -23,50 +19,41 @@
title: Troubleshooting guide
title: Tutorials
- sections:
- isExpanded: true
sections:
- local: usage_guides/explore
title: Start Here!
- local: usage_guides/model_size_estimator
title: Model memory estimator
- local: usage_guides/quantization
title: Model quantization
- local: usage_guides/tracking
title: Experiment trackers
- local: usage_guides/checkpoint
title: Save and load training states
- local: usage_guides/training_zoo
title: Example Zoo
title: Accelerate
- isExpanded: true
sections:
- local: usage_guides/gradient_accumulation
title: Gradient accumulation
- local: usage_guides/local_sgd
title: Local SGD
- local: usage_guides/low_precision_training
title: Low precision (FP8) training
- local: usage_guides/deepspeed
title: DeepSpeed
- local: usage_guides/fsdp
title: Fully Sharded Data Parallelism
- local: usage_guides/megatron_lm
title: Megatron-LM
- local: usage_guides/sagemaker
title: Amazon SageMaker
- local: usage_guides/mps
title: Apple M1 GPUs
- local: usage_guides/ipex
title: IPEX training with CPU
title: Training
- isExpanded: true
sections:
- local: usage_guides/big_modeling
title: Big Model Inference
- local: usage_guides/distributed_inference
title: Distributed inference
title: Inference
title: How to guides
- local: usage_guides/explore
title: Start Here!
- local: usage_guides/training_zoo
title: Example Zoo
- local: usage_guides/big_modeling
title: How to perform inference on large models with small resources
- local: usage_guides/model_size_estimator
title: Knowing how big of a model you can fit into memory
- local: usage_guides/quantization
title: How to quantize model
- local: usage_guides/distributed_inference
title: How to perform distributed inference with normal resources
- local: usage_guides/gradient_accumulation
title: Performing gradient accumulation
- local: usage_guides/local_sgd
title: Accelerating training with local SGD
- local: usage_guides/checkpoint
title: Saving and loading training states
- local: usage_guides/tracking
title: Using experiment trackers
- local: usage_guides/mps
title: How to use Apple Silicon M1 GPUs
- local: usage_guides/low_precision_training
title: How to train in low precision (FP8)
- local: usage_guides/deepspeed
title: How to use DeepSpeed
- local: usage_guides/fsdp
title: How to use Fully Sharded Data Parallelism
- local: usage_guides/megatron_lm
title: How to use Megatron-LM
- local: usage_guides/sagemaker
title: How to use 🤗 Accelerate with SageMaker
- local: usage_guides/ipex
title: How to use 🤗 Accelerate with Intel® Extension for PyTorch for cpu
title: How-To Guides
- sections:
- local: concept_guides/internal_mechanism
title: 🤗 Accelerate's internal mechanism

@@ -85,7 +72,7 @@
title: Concepts and fundamentals
- sections:
- local: package_reference/accelerator
title: Accelerator
title: Main Accelerator class
- local: package_reference/state
title: Stateful configuration classes
- local: package_reference/cli
@@ -1,128 +0,0 @@
<!--Copyright 2024 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.

⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.
-->

# Execution process

When working with distributed training systems, it is important to manage how and when processes are executed across GPUs. Some processes are completed faster than others, and some processes shouldn't begin if others haven't finished yet. Accelerate provides tools for orchestrating when processes are executed to ensure everything remains synchronized across all devices.

This tutorial will teach you how to execute a process on only one machine and how to delay execution until all processes have reached a certain point.

## Execute on one process

Certain code only needs to be run once on a given machine, such as printing a log statement or only displaying one progress bar on the local main process.

<hfoptions id="local-execution">
<hfoption id="statements">

You should use `accelerator.is_local_main_process` to indicate code that should only be executed once.

```py
from tqdm.auto import tqdm

progress_bar = tqdm(range(args.max_train_steps), disable=not accelerator.is_local_main_process)
```

You could also wrap a statement with `accelerator.is_local_main_process`.

> [!TIP]
> For standalone `print` statements that aren't wrapped in `accelerator.is_local_main_process`, replace `print` with Accelerate's [`~Accelerator.print`] method to only print once per process.

```py
if accelerator.is_local_main_process:
    print("Accelerate is the best")
```

</hfoption>
<hfoption id="function">

For a function that should only be executed once, use [`~Accelerator.on_local_main_process`].

```py
@accelerator.on_local_main_process
def do_my_thing():
    "Something done once per server"
    do_thing_once_per_server()
```

</hfoption>
</hfoptions>

You could also direct Accelerate to execute code once across *all processes* regardless of the number of machines. This is useful if you're uploading a final model to the Hub.

<hfoptions id="main-execution">
<hfoption id="statement">

You should use `accelerator.is_main_process` to indicate code that should only be executed once across all processes.

```py
if accelerator.is_main_process:
    repo.push_to_hub()
```

</hfoption>
<hfoption id="function">

For a function that should only be executed once across all processes, use [`~Accelerator.on_main_process`].

```py
@accelerator.on_main_process
def do_my_thing():
    "Something done once across all processes"
    do_thing_once()
```

</hfoption>
</hfoptions>

## Execute on a specific process

Accelerate can also help you execute functions that should only be executed on a specific process or a local process index.

<hfoptions id="specific-execution">
<hfoption id="specific process">

Use the [`~Accelerator.on_process`] method and specify the process index to execute a function on.

```py
@accelerator.on_process(process_index=0)
def do_my_thing():
    "Something done on process index 0"
    do_thing_on_index_zero()
```

</hfoption>
<hfoption id="local process">

Use the [`~Accelerator.on_local_process`] method and specify the local process index to execute a function on.

```py
@accelerator.on_local_process(local_process_idx=0)
def do_my_thing():
    "Something done on process index 0 on each server"
    do_thing_on_index_zero_on_each_server()
```

</hfoption>
</hfoptions>

## Defer execution

When you run your script on several GPUs at the same time, some code may be executed faster than others. You might need to wait for all processes to reach a certain point before executing the next set of instructions. For instance, you shouldn't save a model before making sure every process is done with training.

To do this, add [`~Accelerator.wait_for_everyone`] in your code. This blocks all processes that have finished first from continuing until all remaining processes have reached the same point (this has no effect if you're running on a single GPU or CPU).

```py
accelerator.wait_for_everyone()
```
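
As a minimal sketch of the pattern described above (the `accelerator`, prepared `model`, and `save_directory` names are assumed to be defined earlier in your script), deferring execution before a save might look like this:

```py
# Assumed names: `accelerator`, `model`, and `save_directory` come from earlier in the script.
# Block until every process has finished its training steps.
accelerator.wait_for_everyone()

# Only then let the main process write the checkpoint.
if accelerator.is_main_process:
    unwrapped_model = accelerator.unwrap_model(model)
    accelerator.save(unwrapped_model.state_dict(), f"{save_directory}/pytorch_model.bin")
```
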
@@ -13,11 +13,21 @@ specific language governing permissions and limitations under the License.
rendered properly in your Markdown viewer.
-->

# Add Accelerate to your code
# Migrating your code to 🤗 Accelerate

Each distributed training framework has its own way of doing things which can require writing a lot of custom code to adapt it to your PyTorch training code and training environment. Accelerate offers a friendly way to interface with these distributed training frameworks without having to learn the specific details of each one. Accelerate takes care of those details for you, so you can focus on the training code and scale it to any distributed training environment.
This tutorial will detail how to easily convert existing PyTorch code to use 🤗 Accelerate!
You'll see that by just changing a few lines of code, 🤗 Accelerate can perform its magic and get you on
your way toward running your code on distributed systems with ease!

In this tutorial, you'll learn how to adapt your existing PyTorch code with Accelerate and get on your way toward training on distributed systems with ease! You'll start with a basic PyTorch training loop (it assumes all the training objects like `model` and `optimizer` have been set up already) and progressively integrate Accelerate into it.
## The base training loop

To begin, write out a very basic PyTorch training loop.

<Tip>

We are under the presumption that `training_dataloader`, `model`, `optimizer`, `scheduler`, and `loss_function` have been defined beforehand.

</Tip>

```python
device = "cuda"
@@ -35,44 +45,50 @@ for batch in training_dataloader:
    scheduler.step()
```

## Accelerator

The [`Accelerator`] is the main class for adapting your code to work with Accelerate. It knows about the distributed setup you're using such as the number of different processes and your hardware type. This class also provides access to many of the necessary methods for enabling your PyTorch code to work in any distributed training environment and for managing and executing processes across devices.

That's why you should always start by importing and creating an [`Accelerator`] instance in your script.
## Add in 🤗 Accelerate

To start using 🤗 Accelerate, first import and create an [`Accelerator`] instance:
```python
from accelerate import Accelerator

accelerator = Accelerator()
```
[`Accelerator`] is the main force behind utilizing all the possible options for distributed training!

The [`Accelerator`] also knows which device to move your PyTorch objects to, so it is recommended to let Accelerate handle this for you.
### Setting the right device

The [`Accelerator`] class knows the right device to move any PyTorch object to at any time, so you should
change the definition of `device` to come from [`Accelerator`]:

```diff
- device = "cuda"
- device = 'cuda'
+ device = accelerator.device
  model.to(device)
```

## Prepare PyTorch objects
### Preparing your objects

Next, you need to prepare your PyTorch objects (model, optimizer, scheduler, etc.) for distributed training. The [`~Accelerator.prepare`] method takes care of placing your model in the appropriate container (like single GPU or multi-GPU) for your training setup, adapting the optimizer and scheduler to use Accelerate's [`~optimizer.AcceleratedOptimizer`] and [`~scheduler.AcceleratedScheduler`], and creating a new dataloader that can be sharded across processes.
Next, you need to pass all of the important objects related to training into [`~Accelerator.prepare`]. 🤗 Accelerate will
make sure everything is set up in the current environment for you to start training:

> [!TIP]
> Accelerate only prepares objects that inherit from their respective PyTorch classes such as `torch.optim.Optimizer`.

The PyTorch objects are returned in the same order they're sent.

```py
model, optimizer, training_dataloader, scheduler = accelerator.prepare(
    model, optimizer, training_dataloader, scheduler
)
```
These objects are returned in the same order they were sent in. By default when using `device_placement=True`, all of the objects that can be sent to the right device will be.
If you need to work with data that isn't passed to [`~Accelerator.prepare`] but should be on the active device, you should pass in the `device` you made earlier.

## Training loop
<Tip warning={true}>

Finally, remove the `to(device)` calls to the inputs and targets in the training loop because Accelerate's DataLoader classes automatically place them on the right device. You should also replace the usual `backward()` pass with Accelerate's [`~Accelerator.backward`] method which scales the gradients for you and uses the appropriate `backward()` method depending on your distributed setup (for example, DeepSpeed or Megatron).
Accelerate will only prepare objects that inherit from their respective PyTorch classes (such as `torch.optim.Optimizer`).

</Tip>

### Modifying the training loop

Finally, three lines of code need to be changed in the training loop. 🤗 Accelerate's DataLoader classes will automatically handle the device placement by default,
and [`~Accelerator.backward`] should be used for performing the backward pass:

```diff
- inputs = inputs.to(device)
@@ -83,13 +99,17 @@ Finally, remove the `to(device)` calls to the inputs and targets in the training
+ accelerator.backward(loss)
```

Put everything together and your new Accelerate training loop should now look like this!
With that, your training loop is now ready to use 🤗 Accelerate!

## The finished code

Below is the final version of the converted code:

```python
from accelerate import Accelerator

accelerator = Accelerator()

device = accelerator.device
model, optimizer, training_dataloader, scheduler = accelerator.prepare(
    model, optimizer, training_dataloader, scheduler
)
@@ -104,118 +124,6 @@ for batch in training_dataloader:
    scheduler.step()
```

## Training features
## More Resources

Accelerate offers additional features, like gradient accumulation, gradient clipping, mixed precision training and more, that you can add to your script to improve your training run. Let's explore these three features.

### Gradient accumulation

Gradient accumulation enables you to train on larger batch sizes by accumulating the gradients over multiple batches before updating the weights. This can be useful for getting around memory limitations. To enable this feature in Accelerate, specify the `gradient_accumulation_steps` parameter in the [`Accelerator`] class and add the [`~Accelerator.accumulate`] context manager to your script.

```diff
+ accelerator = Accelerator(gradient_accumulation_steps=2)
  model, optimizer, training_dataloader = accelerator.prepare(model, optimizer, training_dataloader)

  for input, label in training_dataloader:
+     with accelerator.accumulate(model):
          predictions = model(input)
          loss = loss_function(predictions, label)
          accelerator.backward(loss)
          optimizer.step()
          scheduler.step()
          optimizer.zero_grad()
```

### Gradient clipping

Gradient clipping is a technique to prevent "exploding gradients", and Accelerate offers:

* [`~Accelerator.clip_grad_value_`] to clip gradients to a minimum and maximum value
* [`~Accelerator.clip_grad_norm_`] for normalizing gradients to a certain value (see the sketch below)
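
As a minimal sketch (assuming the training objects above have already been passed through [`~Accelerator.prepare`] and `max_grad_norm` is a hyperparameter you choose), clipping is typically applied after the backward pass and before the optimizer step:

```py
max_grad_norm = 1.0  # assumed value for illustration

for inputs, targets in training_dataloader:
    optimizer.zero_grad()
    outputs = model(inputs)
    loss = loss_function(outputs, targets)
    accelerator.backward(loss)
    # Clip the gradients of the prepared model's parameters before stepping the optimizer.
    accelerator.clip_grad_norm_(model.parameters(), max_grad_norm)
    optimizer.step()
    scheduler.step()
```
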

### Mixed precision

Mixed precision accelerates training by using a lower precision data type like fp16 (half-precision) to calculate the gradients. For the best performance with Accelerate, the loss should be computed inside your model (like in Transformers models) because computations outside of the model are computed in full precision.

Set the mixed precision type to use in the [`Accelerator`], and then use the [`~Accelerator.autocast`] context manager to automatically cast the values to the specified data type.

> [!WARNING]
> Accelerate enables automatic mixed precision, so [`~Accelerator.autocast`] is only needed if there are other mixed precision operations besides those performed on loss by [`~Accelerator.backward`] which already handles the scaling.

```diff
+ accelerator = Accelerator(mixed_precision="fp16")
+ with accelerator.autocast():
      loss = complex_loss_function(outputs, target)
```

## Save and load

Accelerate can also save and load a *model* once training is complete, or you can save the model and optimizer *state*, which could be useful for resuming training.

### Model

Once all processes are complete, unwrap the model with the [`~Accelerator.unwrap_model`] method before saving it because the [`~Accelerator.prepare`] method wrapped your model into the proper interface for distributed training. If you don't unwrap the model, saving the model state dictionary also saves any potential extra layers from the larger model and you won't be able to load the weights back into your base model.

You should use the [`~Accelerator.save_model`] method to unwrap and save the model state dictionary. This method can also save a model into sharded checkpoints or into the [safetensors](https://hf.co/docs/safetensors/index) format.

<hfoptions id="save">
<hfoption id="single checkpoint">

```py
accelerator.wait_for_everyone()
accelerator.save_model(model, save_directory)
```

<Tip>

For models from the [Transformers](https://hf.co/docs/transformers/index) library, save the model with the [`~transformers.PreTrainedModel.save_pretrained`] method so that it can be reloaded with the [`~transformers.PreTrainedModel.from_pretrained`] method.

```py
from transformers import AutoModel

unwrapped_model = accelerator.unwrap_model(model)
unwrapped_model.save_pretrained(
    "path/to/my_model_directory",
    is_main_process=accelerator.is_main_process,
    save_function=accelerator.save,
)

model = AutoModel.from_pretrained("path/to/my_model_directory")
```

</Tip>

To load your weights, use the [`~Accelerator.unwrap_model`] method to unwrap the model first before loading the weights. All model parameters are references to tensors, so this loads your weights inside `model`.

```py
unwrapped_model = accelerator.unwrap_model(model)
path_to_checkpoint = os.path.join(save_directory, "pytorch_model.bin")
unwrapped_model.load_state_dict(torch.load(path_to_checkpoint))
```

</hfoption>
<hfoption id="sharded checkpoint">

Set `safe_serialization=True` to save the model in the safetensors format.

```py
accelerator.wait_for_everyone()
accelerator.save_model(model, save_directory, max_shard_size="1GB", safe_serialization=True)
```

To load a sharded checkpoint or a safetensors-formatted checkpoint, use the [`~accelerate.load_checkpoint_in_model`] method. This method allows you to load a checkpoint onto a specific device.

```py
load_checkpoint_in_model(unwrapped_model, save_directory, device_map={"": device})
```

</hfoption>
</hfoptions>

### State

During training, you may want to save the current state of the model, optimizer, random generators, and potentially learning rate schedulers so they can be restored in the *same script*. You should add the [`~Accelerator.save_state`] and [`~Accelerator.load_state`] methods to your script to save and load states.

To further customize where and how states are saved through [`~Accelerator.save_state`], use the [`~utils.ProjectConfiguration`] class. For example, if `automatic_checkpoint_naming` is enabled, each saved checkpoint is stored at `Accelerator.project_dir/checkpoints/checkpoint_{checkpoint_number}`.

Any other stateful items to be stored should be registered with the [`~Accelerator.register_for_checkpointing`] method so they can be saved and loaded. Every object passed to this method to be stored must have a `load_state_dict` and `state_dict` function.
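
As a minimal sketch (the `my_scheduler` object and the checkpoint path are assumptions for illustration), saving and later restoring a training state could look like this:

```py
from accelerate import Accelerator
from accelerate.utils import ProjectConfiguration

# With `automatic_checkpoint_naming`, checkpoints are written under
# `project_dir/checkpoints/checkpoint_{checkpoint_number}`.
project_config = ProjectConfiguration(project_dir="my_project", automatic_checkpoint_naming=True)
accelerator = Accelerator(project_config=project_config)

# Register any extra stateful object; it must expose `state_dict` and `load_state_dict`.
accelerator.register_for_checkpointing(my_scheduler)

accelerator.save_state()  # saves model, optimizer, RNG states, and registered objects

# Later, in the same script, restore from an assumed checkpoint path:
accelerator.load_state("my_project/checkpoints/checkpoint_0")
```
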
For more ways to migrate to 🤗 Accelerate, check out our [interactive migration tutorial](https://huggingface.co/docs/accelerate/usage_guides/explore) which showcases other items to watch for when using Accelerate and how to handle them quickly.
@@ -1,38 +0,0 @@
<!--Copyright 2024 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.

⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.
-->

# TPU training

A [TPU (Tensor Processing Unit)](https://cloud.google.com/tpu/docs/intro-to-tpu) is a type of hardware specifically designed for training models efficiently. Accelerate supports TPU training, but there are a few things you should be aware of, namely graph compilation. This tutorial briefly discusses compilation, and for more details, take a look at the [Training on TPUs with Accelerate](../concept_guides/training_tpu) guide.

## Compilation

A TPU creates a graph of all the operations in the training step such as the forward pass, backward pass and optimizer step. This is why the first training step always takes a while because building and compiling this graph takes time. But once compilation is complete, it is cached and all subsequent steps are much faster.

The key is to avoid compiling your code again or else training is super slow. This means all your operations must be exactly the same:

* all tensors in your batches must have the same length (for example, no dynamic padding for NLP tasks; see the sketch after this list)
* your code must be static (for example, no layers with for loops that have different lengths depending on the input such as an LSTM)
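
As a minimal sketch (the `tokenizer` and `texts` objects are hypothetical and not part of this guide), padding every batch to a fixed length keeps tensor shapes identical from step to step so the compiled graph can be reused:

```py
# Hypothetical example: always pad to a fixed `max_length` instead of padding to the
# longest sample in each batch, so every training step sees the same tensor shapes.
encodings = tokenizer(
    texts,
    padding="max_length",
    max_length=128,
    truncation=True,
    return_tensors="pt",
)
```
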

## Weight tying

A common language model design is to tie the weights of the embedding and softmax layers. However, moving the model to a TPU (either yourself or passing it to the [`~Accelerator.prepare`] method) breaks the weight tying and you'll need to retie the weights.

To add special behavior (like weight tying) in your script for TPUs, check whether [`~Accelerator.distributed_type`] is `DistributedType.TPU` first. Then you can use the [`~transformers.PreTrainedModel.tie_weights`] method to tie the weights.

```py
if accelerator.distributed_type == DistributedType.TPU:
    model.tie_weights()
```
@@ -34,7 +34,7 @@ MS-AMP O3 | FP8 | FP8 | FP8 | FP16 | FP8 | FP8+FP16

## `TransformersEngine`

`TransformersEngine` is the first solution to trying to train in 8-bit floating point. It works by using drop-in replacement layers for certain ones in a model that utilizes their FP8-engine to reduce the number of bits (such as 32 to 8) without degrading the final accuracy of the model.
`TransformersEngine` is the first solution to trying to train in 8-bit floating point. It works by using drop-in replacement layers for certain ones in a model that utilize their FP8-engine to reduce the number of bits (such as 32 to 8) without degrading the final accuracy of the model.

Specifically, 🤗 Accelerate will find and replace the following layers with `TransformersEngine` versions:

@@ -71,4 +71,4 @@ MS-AMP takes a different approach to `TransformersEngine` by providing three dif

## Combining the two

More experiments need to be performed but it's been noted that combining both MS-AMP and TransformersEngine can lead to the highest throughput by relying on NVIDIA's optimized FP8 operators and utilizing how MS-AMP reduces the memory overhead.
@@ -45,7 +45,7 @@ Why is this important? Under the hood this will set **5** different seed setting
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # ^^ safe to call this function even if cuda is not available
    if is_torch_xla_available():
    if is_tpu_available():
        xm.set_rng_state(seed)
```
@@ -15,8 +15,197 @@ rendered properly in your Markdown viewer.

# Accelerator

The [`Accelerator`] is the main class for enabling distributed training on any type of training setup. Read the [Add Accelerator to your code](../basic_tutorials/migration) tutorial to learn more about how to add the [`Accelerator`] to your script.
The [`Accelerator`] is the main class provided by 🤗 Accelerate.
It serves as the main entry point for the API.

## Accelerator[[api]]
## Quick adaptation of your code

To quickly adapt your script to work on any kind of setup with 🤗 Accelerate just:

1. Initialize an [`Accelerator`] object (that we will call `accelerator` throughout this page) as early as possible in your script.
2. Pass your dataloader(s), model(s), optimizer(s), and scheduler(s) to the [`~Accelerator.prepare`] method.
3. Remove all the `.cuda()` or `.to(device)` calls from your code and let the `accelerator` handle the device placement for you.

<Tip>

Step three is optional, but considered a best practice.

</Tip>

4. Replace `loss.backward()` in your code with `accelerator.backward(loss)`.
5. Gather your predictions and labels with [`~Accelerator.gather`] before storing them or using them for metric computation (see the sketch after the tips below).

<Tip warning={true}>

Step five is mandatory when using distributed evaluation.

</Tip>
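
As a minimal sketch that strings the five steps together (the `model`, `optimizer`, `scheduler`, `training_dataloader`, and `loss_function` objects are assumed to exist already; treat this as an illustration rather than a drop-in script):

```py
from accelerate import Accelerator

accelerator = Accelerator()  # step 1

# step 2: let Accelerate wrap the training objects for the current setup
model, optimizer, training_dataloader, scheduler = accelerator.prepare(
    model, optimizer, training_dataloader, scheduler
)

for inputs, targets in training_dataloader:
    optimizer.zero_grad()
    outputs = model(inputs)            # step 3: no manual `.to(device)` calls needed
    loss = loss_function(outputs, targets)
    accelerator.backward(loss)         # step 4
    optimizer.step()
    scheduler.step()

# step 5: gather predictions and labels from all processes before computing metrics
all_outputs = accelerator.gather(outputs)
all_targets = accelerator.gather(targets)
```
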

In most cases this is all that is needed. The next section lists a few more advanced use cases and nice features
you should search for and replace with the corresponding methods of your `accelerator`:

## Advanced recommendations

### Printing

`print` statements should be replaced by [`~Accelerator.print`] to be printed once per process:

```diff
- print("My thing I want to print!")
+ accelerator.print("My thing I want to print!")
```

### Executing processes

#### Once on a single server

For statements that should be executed once per server, use [`~Accelerator.is_local_main_process`]:

```python
if accelerator.is_local_main_process:
    do_thing_once_per_server()
```

A function can be wrapped using the [`~Accelerator.on_local_main_process`] function to achieve the same
behavior on a function's execution:

```python
@accelerator.on_local_main_process
def do_my_thing():
    "Something done once per server"
    do_thing_once_per_server()
```

#### Only ever once across all servers

For statements that should only ever be executed once, use [`~Accelerator.is_main_process`]:

```python
if accelerator.is_main_process:
    do_thing_once()
```

A function can be wrapped using the [`~Accelerator.on_main_process`] function to achieve the same
behavior on a function's execution:

```python
@accelerator.on_main_process
def do_my_thing():
    "Something done once across all servers"
    do_thing_once()
```

#### On specific processes

If a function should be run on a specific overall or local process index, there are similar decorators
to achieve this:

```python
@accelerator.on_local_process(local_process_idx=0)
def do_my_thing():
    "Something done on process index 0 on each server"
    do_thing_on_index_zero_on_each_server()
```

```python
@accelerator.on_process(process_index=0)
def do_my_thing():
    "Something done on process index 0"
    do_thing_on_index_zero()
```

### Synchronicity control

Use [`~Accelerator.wait_for_everyone`] to make sure all processes join that point before continuing (useful before a model save, for instance).

### Saving and loading

```python
model = MyModel()
model = accelerator.prepare(model)
```

Use [`~Accelerator.save_model`] instead of `torch.save` to save a model. It will remove all model wrappers added during the distributed process, get the `state_dict` of the model and save it. The `state_dict` will be in the same precision as the model being trained.

```diff
- torch.save(state_dict, "my_state.pkl")
+ accelerator.save_model(model, save_directory)
```

[`~Accelerator.save_model`] can also save a model into sharded checkpoints or in the safetensors format.
Here is an example:

```python
accelerator.save_model(model, save_directory, max_shard_size="1GB", safe_serialization=True)
```

#### 🤗 Transformers models

If you are using models from the [🤗 Transformers](https://huggingface.co/docs/transformers/) library, you can use the `.save_pretrained()` method.

```python
from transformers import AutoModel

model = AutoModel.from_pretrained("bert-base-cased")
model = accelerator.prepare(model)

# ...fine-tune with PyTorch...

unwrapped_model = accelerator.unwrap_model(model)
unwrapped_model.save_pretrained(
    "path/to/my_model_directory",
    is_main_process=accelerator.is_main_process,
    save_function=accelerator.save,
)
```

This will ensure your model stays compatible with other 🤗 Transformers functionality like the `.from_pretrained()` method.

```python
from transformers import AutoModel

model = AutoModel.from_pretrained("path/to/my_model_directory")
```

### Operations

Use [`~Accelerator.clip_grad_norm_`] instead of `torch.nn.utils.clip_grad_norm_` and [`~Accelerator.clip_grad_value_`] instead of `torch.nn.utils.clip_grad_value_`.

### Gradient Accumulation

To perform gradient accumulation, use [`~Accelerator.accumulate`] and specify a `gradient_accumulation_steps` value.
This will also automatically ensure the gradients are synced or unsynced during
multi-device training, check if the step should actually be performed, and auto-scale the loss:

```diff
- accelerator = Accelerator()
+ accelerator = Accelerator(gradient_accumulation_steps=2)

  for (input, label) in training_dataloader:
+     with accelerator.accumulate(model):
          predictions = model(input)
          loss = loss_function(predictions, label)
          accelerator.backward(loss)
          optimizer.step()
          scheduler.step()
          optimizer.zero_grad()
```

#### GradientAccumulationPlugin

[[autodoc]] utils.GradientAccumulationPlugin

Instead of passing `gradient_accumulation_steps` you can instantiate a `GradientAccumulationPlugin` and pass it to the [`Accelerator`]'s `__init__`
as `gradient_accumulation_plugin`. You can only pass one of `gradient_accumulation_plugin` or `gradient_accumulation_steps`; passing both will raise an error.

```diff
  from accelerate.utils import GradientAccumulationPlugin

  gradient_accumulation_plugin = GradientAccumulationPlugin(num_steps=2)
- accelerator = Accelerator()
+ accelerator = Accelerator(gradient_accumulation_plugin=gradient_accumulation_plugin)
```

In addition to the number of steps, this also lets you configure whether or not you adjust your learning rate scheduler to account for the change in steps due to accumulation.
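
For instance, a small sketch of that option (the `adjust_scheduler` argument is assumed here from the plugin's documented fields; check the autodoc above to confirm the exact name for your version):

```py
from accelerate import Accelerator
from accelerate.utils import GradientAccumulationPlugin

# Accumulate over 2 steps, but do not rescale how often the LR scheduler is stepped
# (`adjust_scheduler` is an assumption based on the plugin's documented options).
plugin = GradientAccumulationPlugin(num_steps=2, adjust_scheduler=False)
accelerator = Accelerator(gradient_accumulation_plugin=plugin)
```
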

## Overall API documentation

[[autodoc]] Accelerator
@@ -37,7 +37,3 @@ related to distributed training or mixed precision are created.

## InitProcessGroupKwargs

[[autodoc]] InitProcessGroupKwargs

## KwargsHandler

[[autodoc]] utils.KwargsHandler
@@ -62,8 +62,10 @@ These are standalone dataclasses used for checks, such as the type of distribute

These are configurable arguments for specific interactions throughout the PyTorch ecosystem that Accelerate handles under the hood.

[[autodoc]] utils.AutocastKwargs

[[autodoc]] utils.DistributedDataParallelKwargs

[[autodoc]] utils.FP8RecipeKwargs

@@ -72,8 +74,6 @@ These are configurable arguments for specific interactions throughout the PyTorc

[[autodoc]] utils.InitProcessGroupKwargs

[[autodoc]] utils.KwargsHandler

## Plugins

These are plugins that can be passed to the [`Accelerator`] object. While they are defined elsewhere in the documentation,

@@ -95,8 +95,6 @@ These are classes which can be configured and passed through to the appropriate

[[autodoc]] utils.BnbQuantizationConfig

[[autodoc]] utils.DataLoaderConfiguration

[[autodoc]] utils.ProjectConfiguration

## Environmental Variables

@@ -152,7 +150,7 @@ These functionalities check the state of the current working environment includi

[[autodoc]] utils.is_torch_version

[[autodoc]] utils.is_torch_xla_available
[[autodoc]] utils.is_tpu_available

[[autodoc]] utils.is_xpu_available
@@ -9,78 +9,26 @@ Unless required by applicable law or agreed to in writing, software distributed
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.

⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.
-->

# Quicktour
# Quick tour

There are many ways to launch and run your code depending on your training environment ([torchrun](https://pytorch.org/docs/stable/elastic/run.html), [DeepSpeed](https://www.deepspeed.ai/), etc.) and available hardware. Accelerate offers a unified interface for launching and training on different distributed setups, allowing you to focus on your PyTorch training code instead of the intricacies of adapting your code to these different setups. This allows you to easily scale your PyTorch code for training and inference on distributed setups with hardware like GPUs and TPUs. Accelerate also provides Big Model Inference to make loading and running inference with really large models that usually don't fit in memory more accessible.
This guide aims to help you get started with 🤗 Accelerate quickly. It covers the essential steps you need to take to
enable distributed training, as well as the adjustments that you need to make in some common scenarios.

This quicktour introduces the three main features of Accelerate:
To help you navigate, the guide is split into two sections:
* [Getting Started with 🤗 Accelerate](#getting-started-with--accelerate): start here to learn how to modify your script to enable distributed training with 🤗 Accelerate
* [Common adaptations to the base case](#common-adaptations-to-the-base-case): check out this section for common deviations from the baseline scenario and what adjustments may need to be made to support them.

* a unified command line launching interface for distributed training scripts
* a training library for adapting PyTorch training code to run on different distributed setups
* Big Model Inference
## Getting started with 🤗 Accelerate

## Unified launch interface
### Enable distributed training in your script

Accelerate automatically selects the appropriate configuration values for any given distributed training framework (DeepSpeed, FSDP, etc.) through a unified configuration file generated from the [`accelerate config`](../../docs/source/package_reference/cli#accelerate-config) command. You could also pass the configuration values explicitly to the command line, which is helpful in certain situations like if you're using SLURM.
To use 🤗 Accelerate in your own training script, you have to modify four things:

But in most cases, you should always run [`accelerate config`](../../docs/source/package_reference/cli#accelerate-config) first to help Accelerate learn about your training setup.

```bash
accelerate config
```

The [`accelerate config`](../../docs/source/package_reference/cli#accelerate-config) command creates and saves a `default_config.yaml` file in Accelerate's cache folder. This file stores the configuration for your training environment, which helps Accelerate correctly launch your training script based on your machine.

After you've configured your environment, you can test your setup with [`accelerate test`](../../docs/source/package_reference/cli#accelerate-test), which launches a short script to test the distributed environment.

```bash
accelerate test
```

> [!TIP]
> Add `--config_file` to the `accelerate test` or `accelerate launch` command to specify the location of the configuration file if it is saved in a non-default location like the cache.

Once your environment is set up, launch your training script with [`accelerate launch`](../../docs/source/package_reference/cli#accelerate-launch)!

```bash
accelerate launch path_to_script.py --args_for_the_script
```

To learn more, check out the [Launch distributed code](basic_tutorials/launch) tutorial for more information about launching your scripts.

## Adapt training code

The next main feature of Accelerate is the [`Accelerator`] class which adapts your PyTorch code to run on different distributed setups.

You only need to add a few lines of code to your training script to enable it to run on multiple GPUs or TPUs.

```diff
+ from accelerate import Accelerator
+ accelerator = Accelerator()

+ device = accelerator.device
+ model, optimizer, training_dataloader, scheduler = accelerator.prepare(
+     model, optimizer, training_dataloader, scheduler
+ )

  for batch in training_dataloader:
      optimizer.zero_grad()
      inputs, targets = batch
-     inputs = inputs.to(device)
-     targets = targets.to(device)
      outputs = model(inputs)
      loss = loss_function(outputs, targets)
+     accelerator.backward(loss)
      optimizer.step()
      scheduler.step()
```

1. Import and instantiate the [`Accelerator`] class at the beginning of your training script. The [`Accelerator`] class initializes everything necessary for distributed training, and it automatically detects your training environment (a single machine with a GPU, a machine with several GPUs, several machines with multiple GPUs or a TPU, etc.) based on how the code was launched.
1. Import the [`Accelerator`] main class and instantiate one in an `accelerator` object.

```python
from accelerate import Accelerator
@@ -88,16 +36,27 @@ from accelerate import Accelerator
accelerator = Accelerator()
```

2. Remove calls like `.cuda()` on your model and input data. The [`Accelerator`] class automatically places these objects on the appropriate device for you.
Add this at the beginning of your training script as it will initialize everything necessary for distributed training.
You don't need to indicate the kind of environment you are in (a single machine with a GPU, a machine with several GPUs,
or several machines with multiple GPUs or a TPU); the library will detect this automatically.

> [!WARNING]
> This step is *optional* but it is considered best practice to allow Accelerate to handle device placement. You could also deactivate automatic device placement by passing `device_placement=False` when initializing the [`Accelerator`]. If you want to explicitly place objects on a device with `.to(device)`, make sure you use `accelerator.device` instead. For example, if you create an optimizer before placing a model on `accelerator.device`, training fails on a TPU.
2. Remove the `.to(device)` or `.cuda()` calls for your model and input data.

```py
device = accelerator.device
```
The `accelerator` object will handle placing these objects on the right device for you.
If you choose to leave those `.to(device)` calls, make sure to use the device provided by the `accelerator` object: `accelerator.device`.

3. Pass all relevant PyTorch objects for training (optimizer, model, dataloader(s), learning rate scheduler) to the [`~Accelerator.prepare`] method as soon as they're created. This method wraps the model in a container optimized for your distributed setup, uses Accelerate's version of the optimizer and scheduler, and creates a sharded version of your dataloader for distribution across GPUs or TPUs.
<Tip warning={true}>

You can fully deactivate the automatic device placement by passing along `device_placement=False` when
initializing the [`Accelerator`].
However, if you place your objects manually on the proper device, be careful to create your optimizer after putting your
model on `accelerator.device` or your training will fail on TPU.

</Tip>

3. Pass all PyTorch objects relevant to training (optimizer, model, dataloader(s), learning rate scheduler) to the
[`~Accelerator.prepare`] method as soon as these objects are created, before starting your actual
training loop:

```python
model, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
@@ -105,23 +64,55 @@ model, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
)
```

4. Replace `loss.backward()` with [`~Accelerator.backward`] to use the correct `backward()` method for your training setup.
**Important notes**:

```py
accelerator.backward(loss)
```
* You should always pass the learning rate scheduler to [`~Accelerator.prepare`], however if the scheduler should *not* be stepped at each optimization step, pass `step_with_optimizer=False` to the [`Accelerator`] init.
* While you can send your dataloader to [`~Accelerator.prepare`] on its own (and there are cases for doing so, such as distributed inference), it's best to send it to [`~Accelerator.prepare`] together with the model and optimizer.
* If you wish to run distributed evaluation, send your validation dataloader to [`~Accelerator.prepare`] as well. There are some nuances to distributed validation; check the [Distributed evaluation](#add-distributed-evaluation) section of the guide.
* Any instruction using your training dataloader length (for instance if you want to log the number of total training
steps) should go after the call to [`~Accelerator.prepare`] (see the sketch after this list).
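
As a minimal sketch (the epoch count and variable names are assumptions for illustration), anything that depends on the dataloader length is computed only after [`~Accelerator.prepare`]:

```py
model, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
    model, optimizer, train_dataloader, lr_scheduler
)

# The prepared dataloader is sharded across processes, so its length may change;
# compute step counts only after `prepare`.
num_epochs = 3  # assumed value
num_training_steps = num_epochs * len(train_dataloader)
accelerator.print(f"Total optimization steps: {num_training_steps}")
```
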
|
||||
|
||||
Read [Accelerate’s internal mechanisms](../../docs/source/concept_guides/internal_mechanism) guide to learn more details about how Accelerate adapts your code.
|
||||
Passing `DataLoader` objects to the [`~Accelerator.prepare`] method ensures that your dataloader will be sharded across
|
||||
all GPUs/TPU cores available so that each one sees a different portion of the training dataset. In other words, if there are 8 processes and a dataset of 64 items, each process will see 8 of these items per iteration. Also, the random states
|
||||
of all processes will be synchronized at the beginning of each iteration through your dataloader, to make sure the data
|
||||
is shuffled the same way (if you decided to use `shuffle=True` or any kind of random sampler).
|
||||
|
||||
### Distributed evaluation
|
||||
<Tip>
|
||||
|
||||
To perform distributed evaluation, pass your validation dataloader to the [`~Accelerator.prepare`] method:
|
||||
The actual batch size for your training will be the number of devices used multiplied by the batch size you set in
|
||||
your script. For instance, training on 4 GPUs with a batch size of 16 set when creating the training dataloader will
|
||||
train at an actual batch size of 64 (4 * 16).
|
||||
If you want the batch size remain the same regardless of how many GPUs the script is run on, you can use the
|
||||
option `split_batches=True` when creating and initializing [`Accelerator`].
|
||||
Your training dataloader may change length when going through this method: if you run on X GPUs, it will have its
|
||||
length divided by X (since your actual batch size will be multiplied by X), unless you set
|
||||
`split_batches=True`.
|
||||
|
||||
</Tip>
|
||||
|
||||
|
||||
4. Replace the `loss.backward()` line with `accelerator.backward(loss)`.
|
||||
|
||||
And you're all set! With all these changes, your script will run on your local machine as well as on multiple GPUs or a
|
||||
TPU! You can either use your favorite tool to launch the distributed training, or you can use the 🤗 Accelerate
|
||||
launcher.
|
||||
|
||||
### Add distributed evaluation
|
||||
|
||||
You can perform regular evaluation in your training script if you leave your validation dataloader out of the
|
||||
[`~Accelerator.prepare`] method. In this case, you will need to put the input data on the
|
||||
`accelerator.device` manually.
|
||||
|
||||
To perform distributed evaluation, send along your validation dataloader to the [`~Accelerator.prepare`]
|
||||
method:
|
||||
|
||||
```python
|
||||
validation_dataloader = accelerator.prepare(validation_dataloader)
|
||||
```
|
||||
|
||||
Each device in your distributed setup only receives a part of the evaluation data, which means you should group your predictions together with the [`~Accelerator.gather_for_metrics`] method. This method requires all tensors to be the same size on each process, so if your tensors have different sizes on each process (for instance when dynamically padding to the maximum length in a batch), you should use the [`~Accelerator.pad_across_processes`] method to pad you tensor to the largest size across processes.
|
||||
Same as with your training dataloader, each device will only see part of the evaluation data should you run your script
|
||||
on multiple devices. This means you will need to group your predictions together which you can do with
|
||||
the [`~Accelerator.gather_for_metrics`] method.
|
||||
|
||||
```python
|
||||
for inputs, targets in validation_dataloader:
|
||||
@ -132,50 +123,319 @@ for inputs, targets in validation_dataloader:
|
||||
metric.add_batch(all_predictions, all_targets)
|
||||
```
|
||||
|
||||

> [!TIP]
> Data at the end of a dataset may be duplicated so the batch can be equally divided among all workers. The [`~Accelerator.gather_for_metrics`] method automatically removes the duplicated data to calculate a more accurate metric.

<Tip warning={true}>

Similar to the training dataloader, passing your validation dataloader through
[`~Accelerator.prepare`] may change it: if you run on X GPUs, it will have its length divided by X
(since your actual batch size will be multiplied by X), unless you set `split_batches=True`.

</Tip>

Some data at the end of the dataset may be duplicated so the batch can be divided equally among all workers. As a result,
metrics should be calculated through the [`~Accelerator.gather_for_metrics`] method to automatically remove the duplicated
data while gathering and provide a more accurate metric.

<Tip>

If for some reason you don't wish to have this done automatically, [`~Accelerator.gather`] can be used instead to gather
the data across all processes, and the deduplication can then be handled manually.

</Tip>

<Tip warning={true}>

The [`~Accelerator.gather`] and [`~Accelerator.gather_for_metrics`] methods require the tensors to be all the same size on each process. If
you have tensors of different sizes on each process (for instance when dynamically padding to the maximum length in
a batch), you should use the [`~Accelerator.pad_across_processes`] method to pad your tensor to the
biggest size across processes.

</Tip>

### Launch your distributed script

You can use the regular commands to launch your distributed training (like `torch.distributed.run` for
PyTorch) - they are fully compatible with 🤗 Accelerate.

Alternatively, 🤗 Accelerate provides a CLI tool that unifies all launchers, so you only have to remember one command. \
To use it, run a quick configuration setup first on your machine and answer the questions:

```bash
accelerate config
```

At the end of the setup, a *default_config.yaml* file will be saved in your cache folder for 🤗 Accelerate. That cache
folder is (with decreasing order of priority):

- The content of your environment variable `HF_HOME` suffixed with *accelerate*.
- If it does not exist, the content of your environment variable `XDG_CACHE_HOME` suffixed with
  *huggingface/accelerate*.
- If this does not exist either, the folder *~/.cache/huggingface/accelerate*.

By specifying the `--config_file` flag you can specify an alternative location of the configuration file.
Once the configuration setup is complete, you can test your setup by running:

```bash
accelerate test
```

This will launch a short script that will test the distributed environment. If it runs without issues, you are ready for
the next step!

Note that if you specified a location for the config file in the previous step, you need to pass it here as well:

```bash
accelerate test --config_file path_to_config.yaml
```

Now that this is done, you can run your script with the following command:

```bash
accelerate launch path_to_script.py --args_for_the_script
```

If you stored the config file in a non-default location, you can indicate it to the launcher like this:

```bash
accelerate launch --config_file path_to_config.yaml path_to_script.py --args_for_the_script
```

You can override any of the arguments determined by your config file. To see the complete list of parameters that you
can pass in, run `accelerate launch -h`. (Help for more niche arguments is available by passing in partial commands, such as `accelerate launch --multi_gpu -h` for all `multi_gpu` args.)

Check out the [Launch tutorial](basic_tutorials/launch) for more information about launching your scripts.

## Big Model Inference

Accelerate's Big Model Inference has two main features, [`~accelerate.init_empty_weights`] and [`~accelerate.load_checkpoint_and_dispatch`], to load large models for inference that typically don't fit into memory.

> [!TIP]
> Take a look at the [Handling big models for inference](../../docs/source/concept_guides/big_model_inference) guide for a better understanding of how Big Model Inference works under the hood.

### Empty weights initialization

The [`~accelerate.init_empty_weights`] context manager initializes models of any size by creating a *model skeleton* and moving and placing parameters each time they're created to PyTorch's [**meta**](https://pytorch.org/docs/main/meta.html) device. This way, not all weights are immediately loaded and only a small part of the model is loaded into memory at a time.

For example, loading an empty [Mixtral-8x7B](https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1) model takes significantly less memory than fully loading the models and weights on the CPU.

```py
from accelerate import init_empty_weights
from transformers import AutoConfig, AutoModelForCausalLM

config = AutoConfig.from_pretrained("mistralai/Mixtral-8x7B-Instruct-v0.1")
with init_empty_weights():
    model = AutoModelForCausalLM.from_config(config)
```

### Load and dispatch weights

The [`~accelerate.load_checkpoint_and_dispatch`] function loads full or sharded checkpoints into the empty model, and automatically distributes weights across all available devices.

The `device_map` parameter determines where to place each model layer, and specifying `"auto"` places them on the GPU first, then the CPU, and finally the hard drive as memory-mapped tensors if there's still not enough memory. Use the `no_split_module_classes` parameter to indicate which modules shouldn't be split across devices (typically those with a residual connection).

```py
from accelerate import load_checkpoint_and_dispatch

model = load_checkpoint_and_dispatch(
    model, checkpoint="mistralai/Mixtral-8x7B-Instruct-v0.1", device_map="auto", no_split_module_classes=['Block']
)
```
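
Once the checkpoint is dispatched, the model can be used for inference like any other PyTorch module; the inputs just need to be placed on the device holding the first layers. A minimal sketch (the tokenizer, prompt, and `"cuda:0"` placement are illustrative assumptions, not part of the original example):

```py
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("mistralai/Mixtral-8x7B-Instruct-v0.1")
inputs = tokenizer("Hello, my name is", return_tensors="pt").to("cuda:0")
# Generation runs across whatever devices the layers were dispatched to
outputs = model.generate(**inputs, max_new_tokens=20)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```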

## Next steps

Now that you've been introduced to the main Accelerate features, your next steps could include:

* Check out the [tutorials](docs/source/basic_tutorials/overview) for a gentle walkthrough of Accelerate. This is especially useful if you're new to distributed training and the library.
* Dive into the [guides](docs/source/usage_guides/explore) to see how to use Accelerate for specific use-cases.
* Deepen your conceptual understanding of how Accelerate works internally by reading the [concept guides](docs/source/concept_guides/internal_mechanism).
* Look up classes and commands in the [API reference](docs/source/package_reference/accelerator) to see what parameters and options are available.

### Common modifications of the base case

The sections above cover the minimal essential steps needed to move a training script into a distributed setup with 🤗 Accelerate.
Here we describe common modifications of, and deviations from, that base case and the adjustments you need to make to accommodate them.

### Launch distributed training from a notebook

Accelerate has a [`notebook_launcher`] to help you launch your training function from a
notebook. This launcher supports launching training on TPUs on Colab or Kaggle, as well as training on several GPUs and machines
(if the machine on which you are running your notebook has them).

Define a function responsible for your whole training and/or evaluation in a cell of the notebook, then execute a
cell with the following code:

```python
from accelerate import notebook_launcher

notebook_launcher(training_function)
```

<Tip warning={true}>

Your [`Accelerator`] object should only be defined inside the training function. This is because the
initialization should be done inside the launcher only.

</Tip>

Check out the [Notebook Launcher tutorial](basic_tutorials/notebook) for more information about training on TPUs.
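
If your training function takes arguments, or you want to control how many processes are used, `notebook_launcher` accepts those too. A small sketch (the `config`/`args` objects and the two-process count are illustrative assumptions):

```python
notebook_launcher(training_function, args=(config, args), num_processes=2)
```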

### Specifics of training on TPU

If you want to launch your script on TPUs, there are a few caveats you should be aware of. Behind the scenes, the TPUs
will create a graph of all the operations happening in your training step (forward pass, backward pass and optimizer
step). This is why your first step of training will always be very long as building and compiling this graph for
optimizations takes some time.

The good news is that this compilation will be cached so the second step and all the following will be much faster. The
bad news is that it only applies if all of your steps do exactly the same operations, which implies:

- having all tensors of the same length in all your batches
- having static code (i.e., not a for loop whose length could change from step to step)

Having any of the things above change between two steps will trigger a new compilation which will, once again, take a
lot of time. In practice, that means you must take special care to have all the tensors in your inputs of the same
shape (so no dynamic padding, for instance, if you are working on an NLP problem) and should not use layers with for loops that
have different lengths depending on the inputs (such as an LSTM) or the training will be excruciatingly slow.

To introduce special behavior in your script for TPUs you can check the `distributed_type` of your
`accelerator`:

```python docstyle-ignore
from accelerate import DistributedType

if accelerator.distributed_type == DistributedType.TPU:
    # do something of static shape
else:
    # go crazy and be dynamic
```

The [NLP example](https://github.com/huggingface/accelerate/blob/main/examples/nlp_example.py) shows an example in a
situation with dynamic padding.

One last thing to pay close attention to: if your model has tied weights (such as language models which tie the weights
of the embedding matrix with the weights of the decoder), moving this model to the TPU (either yourself or after you
passed your model to [`~Accelerator.prepare`]) will break the tying. You will need to retie the weights
afterwards. You can find an example of this in the [run_clm_no_trainer](https://github.com/huggingface/transformers/blob/master/examples/pytorch/language-modeling/run_clm.py) script in
the Transformers repository.
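
For a 🤗 Transformers model, retying boils down to a single call once the model is on the TPU; a minimal sketch (assuming `model` ties its embedding and decoder weights):

```python
model = accelerator.prepare(model)

# The move to the XLA device broke the tying between embedding and decoder weights; restore it.
if accelerator.distributed_type == DistributedType.TPU:
    model.tie_weights()
```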

Check out the [TPU tutorial](concept_guides/training_tpu) for more information about training on TPUs.

### Execute a statement only on one process

Some of your instructions only need to run for one process on a given server: for instance, a data download or a log
statement. To do this, wrap the statement in a test like this:

```python docstyle-ignore
if accelerator.is_local_main_process:
    # Is executed once per server
```

Another example is progress bars: to avoid having multiple progress bars in your output, you should only display one on
the local main process:

```python
from tqdm.auto import tqdm

progress_bar = tqdm(range(args.max_train_steps), disable=not accelerator.is_local_main_process)
```

The *local* means per machine: if you are running your training on two servers with several GPUs, the instruction will
be executed once on each of those servers. If you need to execute something only once for all processes (and not once per
machine), for instance uploading the final model to the 🤗 model hub, wrap it in a test like this:

```python docstyle-ignore
if accelerator.is_main_process:
    # Is executed once only
```

For printing statements you only want executed once per machine, you can just replace the `print` function with
`accelerator.print`.
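
These checks are also exposed as decorators on the [`Accelerator`] object, which can be handy for helper functions; a small sketch (the function bodies are illustrative):

```python
@accelerator.on_local_main_process
def log_metrics(metrics):
    # Runs once per machine
    print(metrics)


@accelerator.on_main_process
def save_report(report, path="report.txt"):
    # Runs exactly once across all machines
    with open(path, "w") as f:
        f.write(report)
```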

### Defer execution on multiple GPUs

When you run your usual script, instructions are executed in order. Using 🤗 Accelerate to deploy your script on several
GPUs at the same time introduces a complication: while each process executes all instructions in order, some may be
faster than others.

You might need to wait for all processes to have reached a certain point before executing a given instruction. For
instance, you shouldn't save a model before making sure every process is done with training. To do this, add the
following line in your code:

```python
accelerator.wait_for_everyone()
```

This instruction will block all the processes that arrive first until all the other processes have reached that
point (if you run your script on just one GPU or CPU, this won't do anything).
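
Combined with the single-process checks above, this gives the usual pattern for writing artifacts from a distributed run; a minimal sketch (the file name is illustrative):

```python
import torch

accelerator.wait_for_everyone()
if accelerator.is_main_process:
    # Every process has finished the loop above; only the main process writes the file
    torch.save(evaluation_results, "results.pt")
```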

### Save/load a model in a distributed setup

Saving the model you trained might need a bit of adjustment: first you should wait for all processes to reach that
point in the script as shown above, and then, you should unwrap your model before saving it. This is because when going
through the [`~Accelerator.prepare`] method, your model may have been placed inside a bigger model,
which deals with the distributed training. This in turn means that saving your model state dictionary without taking
any precaution will take that potential extra layer into account, and you will end up with weights you can't load back
into your base model. The [`~Accelerator.save_model`] method will help you to achieve that. It will unwrap your model and save
the model state dictionary.

Here is an example:

```python
accelerator.wait_for_everyone()
accelerator.save_model(model, save_directory)
```

The [`~Accelerator.save_model`] method can also save a model into sharded checkpoints or with safetensors format:

```python
accelerator.wait_for_everyone()
accelerator.save_model(model, save_directory, max_shard_size="1GB", safe_serialization=True)
```

If your script contains logic to load a checkpoint, we also recommend you load your weights in the unwrapped model
(this is only useful if you use the load function after making your model go through
[`~Accelerator.prepare`]). Here is an example:

```python
import os
import torch

unwrapped_model = accelerator.unwrap_model(model)
path_to_checkpoint = os.path.join(save_directory, "pytorch_model.bin")
unwrapped_model.load_state_dict(torch.load(path_to_checkpoint))
```

Note that since all the model parameters are references to tensors, this will load your weights inside `model`.

If you want to load a sharded checkpoint or a checkpoint in safetensors format into the model on a specific `device`,
we recommend loading it with the [`~utils.load_checkpoint_in_model`] function. Here's an example:

```python
from accelerate.utils import load_checkpoint_in_model

load_checkpoint_in_model(unwrapped_model, save_directory, device_map={"": device})
```

### Save/load entire states

When training your model, you may want to save the current state of the model, optimizer, random generators, and potentially
learning rate schedulers to be restored in the _same script_.
You can use [`~Accelerator.save_state`] and [`~Accelerator.load_state`] respectively to do so.

To further customize where and how states are saved through [`~Accelerator.save_state`], the [`~utils.ProjectConfiguration`] class can be used. For example,
if `automatic_checkpoint_naming` is enabled, each saved checkpoint will be located at `Accelerator.project_dir/checkpoints/checkpoint_{checkpoint_number}`.

If you have registered any other stateful items to be stored through [`~Accelerator.register_for_checkpointing`], they will also be saved and/or loaded.

<Tip>

Every object passed to [`~Accelerator.register_for_checkpointing`] must have a `load_state_dict` and `state_dict` function to be stored.

</Tip>
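
Putting these pieces together might look like the following sketch (the project directory and the `ema` object are illustrative; any object exposing `state_dict`/`load_state_dict` can be registered):

```python
from accelerate import Accelerator
from accelerate.utils import ProjectConfiguration

project_config = ProjectConfiguration(project_dir="runs/exp1", automatic_checkpoint_naming=True)
accelerator = Accelerator(project_config=project_config)

ema = ExponentialMovingAverage(model.parameters())  # hypothetical stateful object
accelerator.register_for_checkpointing(ema)

# Saves model, optimizer, RNG and registered states to runs/exp1/checkpoints/checkpoint_0
accelerator.save_state()

# Later in the same script, restore everything
accelerator.load_state("runs/exp1/checkpoints/checkpoint_0")
```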

### Use gradient clipping

If you are using gradient clipping in your script, you should replace the calls to
`torch.nn.utils.clip_grad_norm_` or `torch.nn.utils.clip_grad_value_` with [`~Accelerator.clip_grad_norm_`]
and [`~Accelerator.clip_grad_value_`] respectively.
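
For example, clipping the global gradient norm to 1.0 inside the training loop could look like this sketch (the `max_norm` value is just a common choice):

```python
accelerator.backward(loss)
if accelerator.sync_gradients:
    # Only clip when gradients are actually synchronized (i.e. not in the middle of gradient accumulation)
    accelerator.clip_grad_norm_(model.parameters(), max_norm=1.0)
optimizer.step()
optimizer.zero_grad()
```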

### Train with mixed precision

If you are running your training in mixed precision with 🤗 Accelerate, you will get the best result with your loss being
computed inside your model (like in Transformers models, for instance). Every computation outside of the model will be
executed in full precision (which is generally what you want for loss computation, especially if it involves a
softmax). However, you might want to put your loss computation inside the [`~Accelerator.autocast`] context manager:

```python
with accelerator.autocast():
    loss = complex_loss_function(outputs, target)
```

Another caveat with mixed precision training is that the gradient will skip a few updates at the beginning and
sometimes during training: because of the dynamic loss scaling strategy, there are points during training where the
gradients have overflowed, and the loss scaling factor is reduced to avoid this happening again at the next step.

This means that you may update your learning rate scheduler when there was no actual optimizer update, which is fine in general, but may
have an impact when you have very little training data, or if the first learning rate values of your scheduler are very
important. In this case, you can skip the learning rate scheduler updates when the optimizer step was not performed, like
this:

```python
if not accelerator.optimizer_step_was_skipped:
    lr_scheduler.step()
```

### Use gradient accumulation

To perform gradient accumulation use [`~Accelerator.accumulate`] and specify a `gradient_accumulation_steps`.
This will also automatically ensure the gradients are synced or unsynced when on multi-device training, check if the step should
actually be performed, and auto-scale the loss:

```python
accelerator = Accelerator(gradient_accumulation_steps=2)
model, optimizer, training_dataloader = accelerator.prepare(model, optimizer, training_dataloader)

for input, label in training_dataloader:
    with accelerator.accumulate(model):
        predictions = model(input)
        loss = loss_function(predictions, label)
        accelerator.backward(loss)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()
```

@ -52,7 +52,7 @@ will attempt to fill all the space in your GPU(s), then loading them to the CPU,
|
||||
|
||||
<Tip>
|
||||
|
||||
For more details on designing your own device map, see this section of the [concept guide](../concept_guides/big_model_inference#designing-a-device-map)
|
||||
For more details on designing your own device map, see this section of the [concept guide](../concept_guide/big_model_inference#designing-a-device-map)
|
||||
|
||||
</Tip>
|
||||
|
||||
|
||||
@ -9,7 +9,7 @@ Unless required by applicable law or agreed to in writing, software distributed
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
|
||||
⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
|
||||
⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
|
||||
rendered properly in your Markdown viewer.
|
||||
-->
|
||||
|
||||
@ -353,7 +353,7 @@ accelerate launch examples/by_feature/deepspeed_with_config_support.py \
|
||||
```
|
||||
|
||||
**ZeRO++ Config Example**
|
||||
You can use the features of ZeRO++ by using the appropriate config parameters. Note that ZeRO++ is an extension for ZeRO Stage 3. Here is how the config file can be modified, from [DeepSpeed's ZeRO++ tutorial](https://www.deepspeed.ai/tutorials/zeropp/):
|
||||
You can use the the features of ZeRO++ by using the appropriate config parameters. Note that ZeRO++ is an extension for ZeRO Stage 3. Here is how the config file can be modified, from [DeepSpeed's ZeRO++ tutorial](https://www.deepspeed.ai/tutorials/zeropp/):
|
||||
|
||||
```json
|
||||
{
|
||||
@ -519,7 +519,7 @@ ValueError: When using `deepspeed_config_file`, the following accelerate config
|
||||
['gradient_accumulation_steps', 'gradient_clipping', 'zero_stage', 'offload_optimizer_device', 'offload_param_device',
|
||||
'zero3_save_16bit_model', 'mixed_precision'].
|
||||
Please specify them appropriately in the DeepSpeed config file.
|
||||
If you are using an accelerate config file, remove other config variables mentioned in the above specified list.
|
||||
If you are using an accelerate config file, remove others config variables mentioned in the above specified list.
|
||||
The easiest method is to create a new config following the questionnaire via `accelerate config`.
|
||||
It will only ask for the necessary config variables when using `deepspeed_config_file`.
|
||||
```
|
||||
@ -656,7 +656,7 @@ ZeRO Stage-3 has 2 options:
|
||||
Below is the snippet from `examples/by_feature/deepspeed_with_config_support.py` showing this:
|
||||
```python
|
||||
success = model.save_checkpoint(PATH, ckpt_id, checkpoint_state_dict)
|
||||
status_msg = f"checkpointing: PATH={PATH}, ckpt_id={ckpt_id}"
|
||||
status_msg = "checkpointing: PATH={}, ckpt_id={}".format(PATH, ckpt_id)
|
||||
if success:
|
||||
logging.info(f"Success {status_msg}")
|
||||
else:
|
||||
|
||||
@ -88,7 +88,7 @@ achieved by adding one `with LocalSGD` statement and one call `local_sgd.step()`
|
||||
+ local_sgd.step()
|
||||
```
|
||||
|
||||
Under the hood, the Local SGD code **disables** automatic gradient synchronization (but accumulation still works as expected!). Instead it averages model parameters every `local_sgd_steps` steps (as well as at the end of the training loop).
|
||||
Under the hood, the Local SGD code **disables** automatic gradient synchornization (but accumulation still works as expected!). Instead it averages model parameters every `local_sgd_steps` steps (as well as in the end of the training loop).
|
||||
|
||||
## Limitations
|
||||
|
||||
|
||||
@ -57,7 +57,7 @@ Of the two, `MS-AMP` is traditionally the easier one to configure as there is on
|
||||
Currently two levels of optimization are supported in the 🤗 Accelerate integration, `"O1"` and `"O2"` (using the letter 'o', not zero).
|
||||
|
||||
* `"O1"` will cast the weight gradients and `all_reduce` communications to happen in 8-bit, while the rest are done in 16 bit. This reduces the general GPU memory usage and speeds up communication bandwidths.
|
||||
* `"O2"` will also cast first-order optimizer states into 8 bit, while the second order states are in FP16. (Currently just the `Adam` optimizer is supported). This tries its best to minimize final accuracy degradation and will save the highest potential memory.
|
||||
* `"O2"` will also cast first-order optimizer states into 8 bit, while the second order states are in FP16. (Currently just the `Adam` optimizer is supported). This tries it's best to minimize final accuracy degradation and will save the highest potential memory.
|
||||
|
||||
To specify an optimization level, pass it to the `FP8KwargsHandler` by setting the `optimization_level` argument:
|
||||
|
||||
@ -70,7 +70,7 @@ accelerator = Accelerator(mixed_precision="fp8", kwarg_handlers=kwargs)
|
||||
|
||||
## Configuring TransformersEngine
|
||||
|
||||
TransformersEngine has much more available for customizing how and what FP8 calculations are performed. A full list of supported arguments and what they mean are available in [NVIDIA's documentation](https://docs.nvidia.com/deeplearning/transformer-engine/user-guide/api/common.html), however they are restated as part of [`FP8KwargsHandler`]'s docstring for your convenience.
|
||||
TransformersEngine has much more available for customizing how and what FP8 calculations are performed. A full list of supported arguments and what they mean are available in [NVIDIA's documentation](https://docs.nvidia.com/deeplearning/transformer-engine/user-guide/api/common.html), however they are restated as part of [`FP8KwargsHandler`]'s docstring for your convience.
|
||||
|
||||
🤗 Accelerate tries to set sensible defaults, but exploring and tweaking the various parameters yourself can lead to better performance potentially.
|
||||
|
||||
@ -83,10 +83,10 @@ kwargs = [FP8RecipeKwargs(backend="te", ...)]
|
||||
accelerator = Accelerator(mixed_precision="fp8", kwarg_handlers=kwargs)
|
||||
```
|
||||
|
||||
## Further Reading
|
||||
## Futher Reading
|
||||
|
||||
To learn more about training in FP8 please check out the following resources:
|
||||
|
||||
* [Our concept guide](../concept_guides/low_precision_training.md) detailing into more about both TransformersEngine and MS-AMP
|
||||
* [The `transformers-engine` documentation](https://docs.nvidia.com/deeplearning/transformer-engine/user-guide/api/common.html)
|
||||
* [The `MS-AMP` documentation](https://azure.github.io/MS-AMP/docs/)
|
||||
* [The `MS-AMP` documentation](https://azure.github.io/MS-AMP/docs/)
|
||||
@ -9,7 +9,7 @@ Unless required by applicable law or agreed to in writing, software distributed
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
|
||||
⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
|
||||
⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
|
||||
rendered properly in your Markdown viewer.
|
||||
-->
|
||||
|
||||
@ -542,7 +542,7 @@ megatron_lm_plugin = MegatronLMPlugin(other_megatron_args=other_megatron_args)
|
||||
This covers Decoder only, Encode only and Encoder-Decoder model classes.
|
||||
|
||||
2. Only loss is returned from model forward pass as
|
||||
there is quite complex interplay of pipeline, tensor and data parallelism behind the scenes.
|
||||
there is quite complex interplay of pipeline, tensor and data parallelsim behind the scenes.
|
||||
The `model(**batch_data)` call return loss(es) averaged across the data parallel ranks.
|
||||
This is fine for most cases wherein pre-training jobs are run using Megatron-LM features and
|
||||
you can easily compute the `perplexity` using the loss.
|
||||
@ -580,4 +580,4 @@ b. Megatron-LM [GPTModel](https://github.com/NVIDIA/Megatron-LM/blob/main/megatr
|
||||
c. Megatron-LM [T5Model](https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/model/t5_model.py) :
|
||||
🤗 transformers models with `t5` in config's model type, e.g.,
|
||||
[T5](https://huggingface.co/docs/transformers/model_doc/t5) and
|
||||
[MT5](https://huggingface.co/docs/transformers/model_doc/mt5)
|
||||
[MT5](https://huggingface.co/docs/transformers/model_doc/mt5)
|
||||
@ -51,7 +51,7 @@ Below are a few gradio demos related to what was described above. The first is t
|
||||
></iframe>
|
||||
</div>
|
||||
|
||||
A community member has taken the idea and expanded it further, allowing you to filter models directly and see if you can run a particular LLM given GPU constraints and LoRA configurations. To play with it, see [here](https://huggingface.co/spaces/Vokturz/can-it-run-llm) for more details.
|
||||
A community member has taken the idea and expended it further, allowing you to filter models directly and see if you can run a particular LLM given GPU constraints and LoRA configurations. To play with it, see [here](https://huggingface.co/spaces/Vokturz/can-it-run-llm) for more details.
|
||||
|
||||
## The Command
|
||||
|
||||
@ -134,4 +134,4 @@ This calculator will tell you how much memory is needed to purely load the model
|
||||
This calculation is accurate within a few % of the actual value, so it is a very good view of just how much memory it will take. For instance loading `bert-base-cased` actually takes `413.68 MB` when loaded on CUDA in full precision, and the calculator estimates `413.18 MB`.
|
||||
|
||||
When performing inference you can expect to add up to an additional 20% as found by [EleutherAI](https://blog.eleuther.ai/transformer-math/). We'll be conducting research into finding a more accurate estimate to these values, and will update
|
||||
this calculator once done.
|
||||
this calculator once done.
|
||||
@ -1,3 +1,4 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2021 The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
@ -85,7 +86,7 @@ def get_dataloaders(accelerator: Accelerator, batch_size: int = 16):
|
||||
|
||||
def collate_fn(examples):
|
||||
# On TPU it's best to pad everything to the same length or training will be very slow.
|
||||
max_length = 128 if accelerator.distributed_type == DistributedType.XLA else None
|
||||
max_length = 128 if accelerator.distributed_type == DistributedType.TPU else None
|
||||
# When using mixed precision we want round multiples of 8/16
|
||||
if accelerator.mixed_precision == "fp8":
|
||||
pad_to_multiple_of = 16
|
||||
@ -153,7 +154,7 @@ def training_function(config, args):
|
||||
|
||||
# If the batch size is too big we use gradient accumulation
|
||||
gradient_accumulation_steps = 1
|
||||
if batch_size > MAX_GPU_BATCH_SIZE and accelerator.distributed_type != DistributedType.XLA:
|
||||
if batch_size > MAX_GPU_BATCH_SIZE and accelerator.distributed_type != DistributedType.TPU:
|
||||
gradient_accumulation_steps = batch_size // MAX_GPU_BATCH_SIZE
|
||||
batch_size = MAX_GPU_BATCH_SIZE
|
||||
|
||||
|
||||
@ -1,3 +1,4 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
@ -105,7 +106,7 @@ def get_fold_dataloaders(
|
||||
|
||||
def collate_fn(examples):
|
||||
# On TPU it's best to pad everything to the same length or training will be very slow.
|
||||
max_length = 128 if accelerator.distributed_type == DistributedType.XLA else None
|
||||
max_length = 128 if accelerator.distributed_type == DistributedType.TPU else None
|
||||
# When using mixed precision we want round multiples of 8/16
|
||||
if accelerator.mixed_precision == "fp8":
|
||||
pad_to_multiple_of = 16
|
||||
@ -156,7 +157,7 @@ def training_function(config, args):
|
||||
|
||||
# If the batch size is too big we use gradient accumulation
|
||||
gradient_accumulation_steps = 1
|
||||
if batch_size > MAX_GPU_BATCH_SIZE and accelerator.distributed_type != DistributedType.XLA:
|
||||
if batch_size > MAX_GPU_BATCH_SIZE and accelerator.distributed_type != DistributedType.TPU:
|
||||
gradient_accumulation_steps = batch_size // MAX_GPU_BATCH_SIZE
|
||||
batch_size = MAX_GPU_BATCH_SIZE
|
||||
|
||||
|
||||
@ -1,4 +1,5 @@
|
||||
#!/usr/bin/env python
|
||||
# coding=utf-8
|
||||
# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
@ -511,7 +512,7 @@ def main():
|
||||
optimizer = optimizer_cls(optimizer_grouped_parameters, lr=args.learning_rate)
|
||||
|
||||
# On TPU, the tie weights in our model have been disconnected, so we need to restore the ties.
|
||||
if accelerator.distributed_type == DistributedType.XLA:
|
||||
if accelerator.distributed_type == DistributedType.TPU:
|
||||
model.tie_weights()
|
||||
|
||||
# Scheduler and math around the number of training steps.
|
||||
|
||||
@ -1,3 +1,4 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2021 The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
@ -80,7 +81,7 @@ def get_dataloaders(accelerator: Accelerator, batch_size: int = 16):
|
||||
|
||||
def collate_fn(examples):
|
||||
# On TPU it's best to pad everything to the same length or training will be very slow.
|
||||
max_length = 128 if accelerator.distributed_type == DistributedType.XLA else None
|
||||
max_length = 128 if accelerator.distributed_type == DistributedType.TPU else None
|
||||
# When using mixed precision we want round multiples of 8/16
|
||||
if accelerator.mixed_precision == "fp8":
|
||||
pad_to_multiple_of = 16
|
||||
@ -150,7 +151,7 @@ def training_function(config, args):
|
||||
|
||||
# If the batch size is too big we use gradient accumulation
|
||||
gradient_accumulation_steps = 1
|
||||
if batch_size > MAX_GPU_BATCH_SIZE and accelerator.distributed_type != DistributedType.XLA:
|
||||
if batch_size > MAX_GPU_BATCH_SIZE and accelerator.distributed_type != DistributedType.TPU:
|
||||
gradient_accumulation_steps = batch_size // MAX_GPU_BATCH_SIZE
|
||||
batch_size = MAX_GPU_BATCH_SIZE
|
||||
|
||||
|
||||
@ -1,3 +1,4 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2021 The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
@ -208,13 +209,13 @@ def training_function(config, args):
|
||||
|
||||
# If the batch size is too big we use gradient accumulation
|
||||
gradient_accumulation_steps = 1
|
||||
if batch_size > MAX_GPU_BATCH_SIZE and accelerator.distributed_type != DistributedType.XLA:
|
||||
if batch_size > MAX_GPU_BATCH_SIZE and accelerator.distributed_type != DistributedType.TPU:
|
||||
gradient_accumulation_steps = batch_size // MAX_GPU_BATCH_SIZE
|
||||
batch_size = MAX_GPU_BATCH_SIZE
|
||||
|
||||
def collate_fn(examples):
|
||||
# On TPU it's best to pad everything to the same length or training will be very slow.
|
||||
max_length = 128 if accelerator.distributed_type == DistributedType.XLA else None
|
||||
max_length = 128 if accelerator.distributed_type == DistributedType.TPU else None
|
||||
# When using mixed precision we want round multiples of 8/16
|
||||
if accelerator.mixed_precision == "fp8":
|
||||
pad_to_multiple_of = 16
|
||||
@ -333,11 +334,13 @@ def training_function(config, args):
|
||||
accelerator.save_state(output_dir)
|
||||
# New Code #
|
||||
# Printing the GPU memory usage details such as allocated memory, peak memory, and total memory usage
|
||||
accelerator.print(f"Memory before entering the train : {b2mb(tracemalloc.begin)}")
|
||||
accelerator.print(f"Memory consumed at the end of the train (end-begin): {tracemalloc.used}")
|
||||
accelerator.print(f"Peak Memory consumed during the train (max-begin): {tracemalloc.peaked}")
|
||||
accelerator.print("Memory before entering the train : {}".format(b2mb(tracemalloc.begin)))
|
||||
accelerator.print("Memory consumed at the end of the train (end-begin): {}".format(tracemalloc.used))
|
||||
accelerator.print("Peak Memory consumed during the train (max-begin): {}".format(tracemalloc.peaked))
|
||||
accelerator.print(
|
||||
f"Total Peak Memory consumed during the train (max): {tracemalloc.peaked + b2mb(tracemalloc.begin)}"
|
||||
"Total Peak Memory consumed during the train (max): {}".format(
|
||||
tracemalloc.peaked + b2mb(tracemalloc.begin)
|
||||
)
|
||||
)
|
||||
# Logging the peak memory usage of the GPU to the tracker
|
||||
if args.with_tracking:
|
||||
@ -384,11 +387,11 @@ def training_function(config, args):
|
||||
accelerator.save_state(output_dir)
|
||||
# New Code #
|
||||
# Printing the GPU memory usage details such as allocated memory, peak memory, and total memory usage
|
||||
accelerator.print(f"Memory before entering the eval : {b2mb(tracemalloc.begin)}")
|
||||
accelerator.print(f"Memory consumed at the end of the eval (end-begin): {tracemalloc.used}")
|
||||
accelerator.print(f"Peak Memory consumed during the eval (max-begin): {tracemalloc.peaked}")
|
||||
accelerator.print("Memory before entering the eval : {}".format(b2mb(tracemalloc.begin)))
|
||||
accelerator.print("Memory consumed at the end of the eval (end-begin): {}".format(tracemalloc.used))
|
||||
accelerator.print("Peak Memory consumed during the eval (max-begin): {}".format(tracemalloc.peaked))
|
||||
accelerator.print(
|
||||
f"Total Peak Memory consumed during the eval (max): {tracemalloc.peaked + b2mb(tracemalloc.begin)}"
|
||||
"Total Peak Memory consumed during the eval (max): {}".format(tracemalloc.peaked + b2mb(tracemalloc.begin))
|
||||
)
|
||||
# Logging the peak memory usage of the GPU to the tracker
|
||||
if args.with_tracking:
|
||||
|
||||
@ -1,3 +1,4 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2021 The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
@ -80,7 +81,7 @@ def get_dataloaders(accelerator: Accelerator, batch_size: int = 16):
|
||||
|
||||
def collate_fn(examples):
|
||||
# On TPU it's best to pad everything to the same length or training will be very slow.
|
||||
max_length = 128 if accelerator.distributed_type == DistributedType.XLA else None
|
||||
max_length = 128 if accelerator.distributed_type == DistributedType.TPU else None
|
||||
# When using mixed precision we want round multiples of 8/16
|
||||
if accelerator.mixed_precision == "fp8":
|
||||
pad_to_multiple_of = 16
|
||||
@ -125,7 +126,7 @@ def training_function(config, args):
|
||||
accelerator = Accelerator(
|
||||
cpu=args.cpu, mixed_precision=args.mixed_precision, gradient_accumulation_steps=gradient_accumulation_steps
|
||||
)
|
||||
if accelerator.distributed_type == DistributedType.XLA and gradient_accumulation_steps > 1:
|
||||
if accelerator.distributed_type == DistributedType.TPU and gradient_accumulation_steps > 1:
|
||||
raise NotImplementedError(
|
||||
"Gradient accumulation on TPUs is currently not supported. Pass `gradient_accumulation_steps=1`"
|
||||
)
|
||||
|
||||
@ -1,3 +1,4 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
@ -83,7 +84,7 @@ def get_dataloaders(accelerator: Accelerator, batch_size: int = 16):
|
||||
|
||||
def collate_fn(examples):
|
||||
# On TPU it's best to pad everything to the same length or training will be very slow.
|
||||
max_length = 128 if accelerator.distributed_type == DistributedType.XLA else None
|
||||
max_length = 128 if accelerator.distributed_type == DistributedType.TPU else None
|
||||
# When using mixed precision we want round multiples of 8/16
|
||||
if accelerator.mixed_precision == "fp8":
|
||||
pad_to_multiple_of = 16
|
||||
|
||||
@ -1,4 +1,5 @@
|
||||
#!/usr/bin/env python
|
||||
# coding=utf-8
|
||||
# Copyright 2021 The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
@ -505,7 +506,7 @@ def main():
|
||||
)
|
||||
|
||||
# On TPU, the tie weights in our model have been disconnected, so we need to restore the ties.
|
||||
if accelerator.distributed_type == DistributedType.XLA:
|
||||
if accelerator.distributed_type == DistributedType.TPU:
|
||||
model.tie_weights()
|
||||
|
||||
# We need to recalculate our total training steps as the size of the training dataloader may have changed.
|
||||
|
||||
@ -86,7 +86,7 @@ def get_dataloaders(accelerator: Accelerator, batch_size: int = 16):
|
||||
|
||||
def collate_fn(examples):
|
||||
# On TPU it's best to pad everything to the same length or training will be very slow.
|
||||
max_length = 128 if accelerator.distributed_type == DistributedType.XLA else None
|
||||
max_length = 128 if accelerator.distributed_type == DistributedType.TPU else None
|
||||
# When using mixed precision we want round multiples of 8/16
|
||||
if accelerator.mixed_precision == "fp8":
|
||||
pad_to_multiple_of = 16
|
||||
|
||||
@ -1,3 +1,4 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
@ -87,7 +88,7 @@ def get_dataloaders(accelerator: Accelerator, batch_size: int = 16):
|
||||
|
||||
def collate_fn(examples):
|
||||
# On TPU it's best to pad everything to the same length or training will be very slow.
|
||||
max_length = 128 if accelerator.distributed_type == DistributedType.XLA else None
|
||||
max_length = 128 if accelerator.distributed_type == DistributedType.TPU else None
|
||||
# When using mixed precision we want round multiples of 8/16
|
||||
if accelerator.mixed_precision == "fp8":
|
||||
pad_to_multiple_of = 16
|
||||
@ -138,7 +139,7 @@ def training_function(config, args):
|
||||
|
||||
# If the batch size is too big we use gradient accumulation
|
||||
gradient_accumulation_steps = 1
|
||||
if batch_size > MAX_GPU_BATCH_SIZE and accelerator.distributed_type != DistributedType.XLA:
|
||||
if batch_size > MAX_GPU_BATCH_SIZE and accelerator.distributed_type != DistributedType.TPU:
|
||||
gradient_accumulation_steps = batch_size // MAX_GPU_BATCH_SIZE
|
||||
batch_size = MAX_GPU_BATCH_SIZE
|
||||
|
||||
|
||||
@ -1,3 +1,4 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2021 The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
@ -85,7 +86,7 @@ def get_dataloaders(accelerator: Accelerator, batch_size: int = 16):
|
||||
|
||||
def collate_fn(examples):
|
||||
# On TPU it's best to pad everything to the same length or training will be very slow.
|
||||
max_length = 128 if accelerator.distributed_type == DistributedType.XLA else None
|
||||
max_length = 128 if accelerator.distributed_type == DistributedType.TPU else None
|
||||
# When using mixed precision we want round multiples of 8/16
|
||||
if accelerator.mixed_precision == "fp8":
|
||||
pad_to_multiple_of = 16
|
||||
@ -148,7 +149,7 @@ def training_function(config, args):
|
||||
|
||||
# If the batch size is too big we use gradient accumulation
|
||||
gradient_accumulation_steps = 1
|
||||
if batch_size > MAX_GPU_BATCH_SIZE and accelerator.distributed_type != DistributedType.XLA:
|
||||
if batch_size > MAX_GPU_BATCH_SIZE and accelerator.distributed_type != DistributedType.TPU:
|
||||
gradient_accumulation_steps = batch_size // MAX_GPU_BATCH_SIZE
|
||||
batch_size = MAX_GPU_BATCH_SIZE
|
||||
|
||||
|
||||
@ -1,3 +1,4 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2021 The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
|
||||
@ -1,3 +1,4 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2021 The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
@ -102,13 +103,13 @@ def training_function(config, args):
|
||||
|
||||
# If the batch size is too big we use gradient accumulation
|
||||
gradient_accumulation_steps = 1
|
||||
if batch_size > MAX_GPU_BATCH_SIZE and accelerator.distributed_type != DistributedType.XLA:
|
||||
if batch_size > MAX_GPU_BATCH_SIZE and accelerator.distributed_type != DistributedType.TPU:
|
||||
gradient_accumulation_steps = batch_size // MAX_GPU_BATCH_SIZE
|
||||
batch_size = MAX_GPU_BATCH_SIZE
|
||||
|
||||
def collate_fn(examples):
|
||||
# On TPU it's best to pad everything to the same length or training will be very slow.
|
||||
max_length = 128 if accelerator.distributed_type == DistributedType.XLA else None
|
||||
max_length = 128 if accelerator.distributed_type == DistributedType.TPU else None
|
||||
# When using mixed precision we want round multiples of 8/16
|
||||
if accelerator.mixed_precision == "fp8":
|
||||
pad_to_multiple_of = 16
|
||||
|
||||
@ -1,3 +1,4 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2021 The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
|
||||
@ -1,3 +1,4 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
@ -75,4 +76,4 @@ end_time = time.time()
|
||||
if PartialState().is_last_process:
|
||||
output = torch.stack(tuple(output[0]))
|
||||
print(f"Time of first pass: {first_batch}")
|
||||
print(f"Average time per batch: {(end_time - start_time) / 5}")
|
||||
print(f"Average time per batch: {(end_time - start_time)/5}")
|
||||
|
||||
@ -1,3 +1,4 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
@ -74,4 +75,4 @@ end_time = time.time()
|
||||
if PartialState().is_last_process:
|
||||
output = torch.stack(tuple(output[0]))
|
||||
print(f"Time of first pass: {first_batch}")
|
||||
print(f"Average time per batch: {(end_time - start_time) / 5}")
|
||||
print(f"Average time per batch: {(end_time - start_time)/5}")
|
||||
|
||||
@ -1,3 +1,4 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
|
||||
@ -1,3 +1,4 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
@ -86,4 +87,4 @@ end_time = time.time()
|
||||
if PartialState().is_last_process:
|
||||
output = torch.stack(tuple(output[0]))
|
||||
print(f"Time of first pass: {first_batch}")
|
||||
print(f"Average time per batch: {(end_time - start_time) / 5}")
|
||||
print(f"Average time per batch: {(end_time - start_time)/5}")
|
||||
|
||||
@ -1,16 +1,3 @@
|
||||
# Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import argparse
|
||||
|
||||
import runhouse as rh
|
||||
|
||||
@ -1,3 +1,4 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2021 The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
@ -77,8 +78,8 @@ def get_dataloaders(accelerator: Accelerator, batch_size: int = 16):
|
||||
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
|
||||
|
||||
def collate_fn(examples):
|
||||
# For Torchxla, it's best to pad everything to the same length or training will be very slow.
|
||||
max_length = 128 if accelerator.distributed_type == DistributedType.XLA else None
|
||||
# On TPU it's best to pad everything to the same length or training will be very slow.
|
||||
max_length = 128 if accelerator.distributed_type == DistributedType.TPU else None
|
||||
# When using mixed precision we want round multiples of 8/16
|
||||
if accelerator.mixed_precision == "fp8":
|
||||
pad_to_multiple_of = 16
|
||||
@ -123,7 +124,7 @@ def training_function(config, args):
|
||||
|
||||
# If the batch size is too big we use gradient accumulation
|
||||
gradient_accumulation_steps = 1
|
||||
if batch_size > MAX_GPU_BATCH_SIZE and accelerator.distributed_type != DistributedType.XLA:
|
||||
if batch_size > MAX_GPU_BATCH_SIZE and accelerator.distributed_type != DistributedType.TPU:
|
||||
gradient_accumulation_steps = batch_size // MAX_GPU_BATCH_SIZE
|
||||
batch_size = MAX_GPU_BATCH_SIZE
|
||||
|
||||
|
||||
@ -1,35 +1,15 @@
|
||||
[tool.ruff]
|
||||
# Never enforce `E501` (line length violations).
|
||||
ignore = ["E501", "E741", "W605"]
|
||||
select = ["E", "F", "I", "W"]
|
||||
line-length = 119
|
||||
target-version = "py38"
|
||||
|
||||
[tool.ruff.lint]
|
||||
preview = true
|
||||
ignore-init-module-imports = true
|
||||
extend-select = [
|
||||
"B009", # static getattr
|
||||
"B010", # static setattr
|
||||
"CPY", # Copyright
|
||||
"E", # PEP8 errors
|
||||
"F", # PEP8 formatting
|
||||
"I", # Import sorting
|
||||
"TID251", # Banned API
|
||||
"UP", # Pyupgrade
|
||||
"W", # PEP8 warnings
|
||||
]
|
||||
ignore = [
|
||||
"E501", # Line length (handled by ruff-format)
|
||||
"E741", # Ambiguous variable name
|
||||
"W605", # Invalid escape sequence
|
||||
"UP007", # X | Y type annotations
|
||||
]
|
||||
|
||||
[tool.ruff.lint.per-file-ignores]
|
||||
"__init__.py" = [
|
||||
"F401", # Ignore seemingly unused imports (they're meant for re-export)
|
||||
]
|
||||
# Ignore import violations in all `__init__.py` files.
|
||||
[tool.ruff.per-file-ignores]
|
||||
"__init__.py" = ["E402", "F401", "F403", "F811"]
|
||||
"manim_animations/*" = ["ALL"]
|
||||
|
||||
[tool.ruff.lint.isort]
|
||||
[tool.ruff.isort]
|
||||
lines-after-imports = 2
|
||||
known-first-party = ["accelerate"]
|
||||
|
||||
@ -37,8 +17,3 @@ known-first-party = ["accelerate"]
|
||||
exclude = [
|
||||
"manim_animations/*"
|
||||
]
|
||||
|
||||
[tool.ruff.lint.flake8-tidy-imports.banned-api]
|
||||
"os.getenv".msg = "Use os.environ instead"
|
||||
"os.putenv".msg = "Use os.environ instead"
|
||||
"os.unsetenv".msg = "Use os.environ instead"
|
||||
|
||||
14 setup.cfg Normal file
@@ -0,0 +1,14 @@
|
||||
[isort]
|
||||
default_section = FIRSTPARTY
|
||||
ensure_newline_before_comments = True
|
||||
force_grid_wrap = 0
|
||||
include_trailing_comma = True
|
||||
known_first_party = accelerate
|
||||
line_length = 119
|
||||
lines_after_imports = 2
|
||||
multi_line_output = 3
|
||||
use_parentheses = True
|
||||
|
||||
[flake8]
|
||||
ignore = E203, E722, E501, E741, W503, W605
|
||||
max-line-length = 119
|
||||
8 setup.py
@@ -19,10 +19,10 @@ extras = {}
|
||||
extras["quality"] = [
|
||||
"black ~= 23.1", # hf-doc-builder has a hidden dependency on `black`
|
||||
"hf-doc-builder >= 0.3.0",
|
||||
"ruff ~= 0.2.1",
|
||||
"ruff ~= 0.1.15",
|
||||
]
|
||||
extras["docs"] = []
|
||||
extras["test_prod"] = ["pytest>=7.2.0,<=8.0.0", "pytest-xdist", "pytest-subtests", "parameterized"]
|
||||
extras["test_prod"] = ["pytest", "pytest-xdist", "pytest-subtests", "parameterized"]
|
||||
extras["test_dev"] = [
|
||||
"datasets",
|
||||
"evaluate",
|
||||
@ -47,9 +47,9 @@ extras["sagemaker"] = [
|
||||
|
||||
setup(
|
||||
name="accelerate",
|
||||
version="0.28.0.dev",
|
||||
version="0.27.2",
|
||||
description="Accelerate",
|
||||
long_description=open("README.md", encoding="utf-8").read(),
|
||||
long_description=open("README.md", "r", encoding="utf-8").read(),
|
||||
long_description_content_type="text/markdown",
|
||||
keywords="deep learning",
|
||||
license="Apache",
|
||||
|
||||
@ -1,17 +1,4 @@
|
||||
# Copyright 2020 The HuggingFace Team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
__version__ = "0.28.0.dev0"
|
||||
__version__ = "0.27.2"
|
||||
|
||||
from .accelerator import Accelerator
|
||||
from .big_modeling import (
|
||||
@ -29,7 +16,6 @@ from .launchers import debug_launcher, notebook_launcher
|
||||
from .state import PartialState
|
||||
from .utils import (
|
||||
AutocastKwargs,
|
||||
DataLoaderConfiguration,
|
||||
DeepSpeedPlugin,
|
||||
DistributedDataParallelKwargs,
|
||||
DistributedType,
|
||||
|
||||
@ -47,7 +47,6 @@ from .utils import (
|
||||
WEIGHTS_INDEX_NAME,
|
||||
WEIGHTS_NAME,
|
||||
AutocastKwargs,
|
||||
DataLoaderConfiguration,
|
||||
DeepSpeedPlugin,
|
||||
DistributedDataParallelKwargs,
|
||||
DistributedType,
|
||||
@ -83,7 +82,7 @@ from .utils import (
|
||||
is_msamp_available,
|
||||
is_npu_available,
|
||||
is_torch_version,
|
||||
is_torch_xla_available,
|
||||
is_tpu_available,
|
||||
is_xpu_available,
|
||||
load_fsdp_model,
|
||||
load_fsdp_optimizer,
|
||||
@ -134,8 +133,7 @@ if is_megatron_lm_available():
|
||||
from torch.distributed.algorithms.join import Join
|
||||
|
||||
|
||||
if is_torch_xla_available():
|
||||
import torch_xla.amp as xamp
|
||||
if is_tpu_available(check_device=False):
|
||||
import torch_xla.core.xla_model as xm
|
||||
import torch_xla.distributed.xla_multiprocessing as xmp
|
||||
|
||||
@ -151,12 +149,6 @@ except ImportError:
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
# Sentinel values for defaults
|
||||
_split_batches = object()
|
||||
_dispatch_batches = object()
|
||||
_even_batches = object()
|
||||
_use_seedable_sampler = object()
|
||||
|
||||
|
||||
class Accelerator:
|
||||
"""
|
||||
@ -166,6 +158,11 @@ class Accelerator:
|
||||
device_placement (`bool`, *optional*, defaults to `True`):
|
||||
Whether or not the accelerator should put objects on device (tensors yielded by the dataloader, model,
|
||||
etc...).
|
||||
split_batches (`bool`, *optional*, defaults to `False`):
|
||||
Whether or not the accelerator should split the batches yielded by the dataloaders across the devices. If
|
||||
`True` the actual batch size used will be the same on any kind of distributed processes, but it must be a
|
||||
round multiple of the `num_processes` you are using. If `False`, actual batch size used will be the one set
|
||||
in your script multiplied by the number of processes.
|
||||
mixed_precision (`str`, *optional*):
|
||||
Whether or not to use mixed precision training. Choose from 'no','fp16','bf16 or 'fp8'. Will default to the
|
||||
value in the environment variable `ACCELERATE_MIXED_PRECISION`, which will use the default value in the
|
||||
@ -178,15 +175,13 @@ class Accelerator:
|
||||
cpu (`bool`, *optional*):
|
||||
Whether or not to force the script to execute on CPU. Will ignore GPU available if set to `True` and force
|
||||
the execution on one process only.
|
||||
dataloader_config (`DataLoaderConfiguration`, *optional*):
|
||||
A configuration for how the dataloaders should be handled in distributed scenarios.
|
||||
deepspeed_plugin ([`~utils.DeepSpeedPlugin`], *optional*):
|
||||
deepspeed_plugin (`DeepSpeedPlugin`, *optional*):
|
||||
Tweak your DeepSpeed related args using this argument. This argument is optional and can be configured
|
||||
directly using *accelerate config*
|
||||
fsdp_plugin ([`~utils.FullyShardedDataParallelPlugin`], *optional*):
|
||||
fsdp_plugin (`FullyShardedDataParallelPlugin`, *optional*):
|
||||
Tweak your FSDP related args using this argument. This argument is optional and can be configured directly
|
||||
using *accelerate config*
|
||||
megatron_lm_plugin ([`~utils.MegatronLMPlugin`], *optional*):
|
||||
megatron_lm_plugin (`MegatronLMPlugin`, *optional*):
|
||||
Tweak your MegatronLM related args using this argument. This argument is optional and can be configured
|
||||
directly using *accelerate config*
|
||||
rng_types (list of `str` or [`~utils.RNGType`]):
|
||||
@ -209,20 +204,33 @@ class Accelerator:
|
||||
- `"comet_ml"`
|
||||
If `"all"` is selected, will pick up all available trackers in the environment and initialize them. Can
|
||||
also accept implementations of `GeneralTracker` for custom trackers, and can be combined with `"all"`.
|
||||
project_config ([`~utils.ProjectConfiguration`], *optional*):
|
||||
project_config (`ProjectConfiguration`, *optional*):
|
||||
A configuration for how saving the state can be handled.
|
||||
project_dir (`str`, `os.PathLike`, *optional*):
|
||||
A path to a directory for storing data such as logs of locally-compatible loggers and potentially saved
|
||||
checkpoints.
|
||||
dispatch_batches (`bool`, *optional*):
|
||||
If set to `True`, the dataloader prepared by the Accelerator is only iterated through on the main process
|
||||
and then the batches are split and broadcast to each process. Will default to `True` for `DataLoader` whose
|
||||
underlying dataset is an `IterableDataset`, `False` otherwise.
|
||||
even_batches (`bool`, *optional*, defaults to `True`):
|
||||
If set to `True`, in cases where the total batch size across all processes does not exactly divide the
|
||||
dataset, samples at the start of the dataset will be duplicated so the batch can be divided equally among
|
||||
all workers.
|
||||
use_seedable_sampler (`bool`, *optional*, defaults to `False`):
|
||||
Whether or not use a fully seedable random sampler ([`~data_loader.SeedableRandomSampler`]). Ensures
|
||||
training results are fully reproducable using a different sampling technique. While seed-to-seed results
|
||||
may differ, on average the differences are neglible when using multiple different seeds to compare. Should
|
||||
also be ran with [`~utils.set_seed`] each time for the best results.
|
||||
step_scheduler_with_optimizer (`bool`, *optional`, defaults to `True`):
|
||||
Set `True` if the learning rate scheduler is stepped at the same time as the optimizer, `False` if only
|
||||
done under certain circumstances (at the end of each epoch, for instance).
|
||||
kwargs_handlers (list of [`~utils.KwargsHandler`], *optional*)
|
||||
A list of [`~utils.KwargsHandler`] to customize how the objects related to distributed training or mixed
|
||||
precision are created. See [kwargs](kwargs) for more information.
|
||||
dynamo_backend (`str` or [`~utils.DynamoBackend`], *optional*, defaults to `"no"`):
|
||||
kwargs_handlers (`list[KwargHandler]`, *optional*)
|
||||
A list of `KwargHandler` to customize how the objects related to distributed training or mixed precision
|
||||
are created. See [kwargs](kwargs) for more information.
|
||||
dynamo_backend (`str` or `DynamoBackend`, *optional*, defaults to `"no"`):
|
||||
Set to one of the possible dynamo backends to optimize your training with torch dynamo.
|
||||
gradient_accumulation_plugin ([`~utils.GradientAccumulationPlugin`], *optional*):
|
||||
gradient_accumulation_plugin (`GradientAccumulationPlugin`, *optional*):
|
||||
A configuration for how gradient accumulation should be handled, if more tweaking than just the
|
||||
`gradient_accumulation_steps` is needed.
|
||||
|
||||
@@ -245,11 +253,10 @@ class Accelerator:
def __init__(
self,
device_placement: bool = True,
split_batches: bool = _split_batches,
split_batches: bool = False,
mixed_precision: PrecisionType | str | None = None,
gradient_accumulation_steps: int = 1,
cpu: bool = False,
dataloader_config: DataLoaderConfiguration | None = None,
deepspeed_plugin: DeepSpeedPlugin | None = None,
fsdp_plugin: FullyShardedDataParallelPlugin | None = None,
megatron_lm_plugin: MegatronLMPlugin | None = None,
@@ -258,9 +265,9 @@ class Accelerator:
project_dir: str | os.PathLike | None = None,
project_config: ProjectConfiguration | None = None,
gradient_accumulation_plugin: GradientAccumulationPlugin | None = None,
dispatch_batches: bool | None = _dispatch_batches,
even_batches: bool = _even_batches,
use_seedable_sampler: bool = _use_seedable_sampler,
dispatch_batches: bool | None = None,
even_batches: bool = True,
use_seedable_sampler: bool = False,
step_scheduler_with_optimizer: bool = True,
kwargs_handlers: list[KwargsHandler] | None = None,
dynamo_backend: DynamoBackend | str | None = None,
@@ -390,7 +397,7 @@ class Accelerator:
if (
(mixed_precision != "bf16")
and getattr(self.state, "downcast_bfloat", False)
and (self.state.distributedType != DistributedType.XLA)
and (self.state.distributedType != DistributedType.TPU)
):
raise ValueError("Can only use `downcast_bf16` when using `mixed_precision='bf16'` and on a TPU")
@@ -407,56 +414,36 @@ class Accelerator:
self.gradient_state = GradientState(
gradient_accumulation_plugin=gradient_accumulation_plugin,
)
if self.state.distributed_type == DistributedType.TPU:
if self.gradient_state.num_steps != 1:
raise ValueError(
"Gradient accumulation is not supported on TPU. Please set `gradient_accumulation_steps` to 1 and don't pass in a `GradientAccumulationPlugin` object."
)

self.device_placement = device_placement
if dataloader_config is None:
dataloader_config = DataLoaderConfiguration()
self.dataloader_config = dataloader_config
# Deal with deprecated args
# TODO: Remove in v1.0.0
deprecated_dl_args = {}
if dispatch_batches is not _dispatch_batches:
deprecated_dl_args["dispatch_batches"] = dispatch_batches
self.dataloader_config.dispatch_batches = dispatch_batches
if split_batches is not _split_batches:
deprecated_dl_args["split_batches"] = split_batches
self.dataloader_config.split_batches = split_batches
if even_batches is not _even_batches:
deprecated_dl_args["even_batches"] = even_batches
self.dataloader_config.even_batches = even_batches
if use_seedable_sampler is not _use_seedable_sampler:
deprecated_dl_args["use_seedable_sampler"] = use_seedable_sampler
self.dataloader_config.use_seedable_sampler = use_seedable_sampler
if len(deprecated_dl_args) > 0:
values = ", ".join([f"{k}={v}" for k, v in deprecated_dl_args.items()])
warnings.warn(
f"Passing the following arguments to `Accelerator` is deprecated and will be removed in version 1.0 of Accelerate: {deprecated_dl_args.keys()}. "
"Please pass an `accelerate.DataLoaderConfiguration` instead: \n"
f"dataloader_config = DataLoaderConfiguration({values})",
FutureWarning,
)
self.split_batches = split_batches
self.dispatch_batches = dispatch_batches
self.even_batches = even_batches
self.use_seedable_sampler = use_seedable_sampler
self.step_scheduler_with_optimizer = step_scheduler_with_optimizer

# Mixed precision attributes
self.scaler = None
self.native_amp = False
err = "{mode} mixed precision requires {requirement}"
if (
self.state.mixed_precision == "fp16"
and self.device.type != "cpu"
and self.distributed_type not in (DistributedType.DEEPSPEED, DistributedType.MEGATRON_LM)
):
self.native_amp = True
if self.device.type not in ("xpu", "cuda", "mps", "npu", "xla") or is_torch_xla_available(
check_is_tpu=True
):
raise ValueError(f"fp16 mixed precision requires a GPU (not {self.device.type!r}).")
if self.device.type not in ("xpu", "cuda", "mps", "npu"):
raise ValueError(err.format(mode="fp16", requirement="a GPU"))
kwargs = self.scaler_handler.to_kwargs() if self.scaler_handler is not None else {}
if self.distributed_type == DistributedType.FSDP:
from torch.distributed.fsdp.sharded_grad_scaler import ShardedGradScaler

self.scaler = ShardedGradScaler(**kwargs)
elif is_torch_xla_available(check_is_gpu=True):
self.scaler = xamp.GradScaler(**kwargs)
elif is_npu_available():
self.scaler = torch.npu.amp.GradScaler(**kwargs)
else:
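The deprecation warning above points callers at `DataLoaderConfiguration`. As a minimal sketch of the replacement call, assuming the `from accelerate import DataLoaderConfiguration` import path that appears later in this diff:

```python
from accelerate import Accelerator, DataLoaderConfiguration

# Replaces Accelerator(dispatch_batches=..., split_batches=..., even_batches=..., use_seedable_sampler=...)
dataloader_config = DataLoaderConfiguration(
    dispatch_batches=True,      # iterate the dataloader on the main process and broadcast the splits
    split_batches=False,
    even_batches=True,
    use_seedable_sampler=True,
)
accelerator = Accelerator(dataloader_config=dataloader_config)
```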
@@ -470,8 +457,8 @@ class Accelerator:
self.native_amp = True
else:
self.native_amp = is_bf16_available(True)
if mixed_precision == "bf16" and not self.native_amp and not is_torch_xla_available():
raise ValueError("bf16 mixed precision requires PyTorch >= 1.10 and a supported device.")
if mixed_precision == "bf16" and not self.native_amp and not is_tpu_available():
raise ValueError(err.format(mode="bf16", requirement="PyTorch >= 1.10 and a supported device."))

# Start of internal step tracking
self.step = 0
@ -524,26 +511,6 @@ class Accelerator:
|
||||
def device(self):
|
||||
return self.state.device
|
||||
|
||||
@property
|
||||
def split_batches(self):
|
||||
return self.dataloader_config.split_batches
|
||||
|
||||
@property
|
||||
def dispatch_batches(self):
|
||||
return self.dataloader_config.dispatch_batches
|
||||
|
||||
@property
|
||||
def even_batches(self):
|
||||
return self.dataloader_config.even_batches
|
||||
|
||||
@even_batches.setter
|
||||
def even_batches(self, value: bool):
|
||||
self.dataloader_config.even_batches = value
|
||||
|
||||
@property
|
||||
def use_seedable_sampler(self):
|
||||
return self.dataloader_config.use_seedable_sampler
|
||||
|
||||
@property
|
||||
def project_dir(self):
|
||||
return self.project_configuration.project_dir
|
||||
@ -1227,7 +1194,7 @@ class Accelerator:
|
||||
# On TPUs, putting the model on the XLA device will create new parameters, so the corresponding optimizer will
|
||||
# have parameters disconnected from the model (so no training :-( ).
|
||||
# If the model and optimizer have parameters on different devices we raise an error.
|
||||
if self.distributed_type == DistributedType.XLA:
|
||||
if self.distributed_type == DistributedType.TPU:
|
||||
model_device, optimizer_device = self._get_devices()
|
||||
if model_device is not None and optimizer_device is not None and model_device != optimizer_device:
|
||||
raise ValueError(
|
||||
@ -1239,7 +1206,7 @@ class Accelerator:
|
||||
)
|
||||
|
||||
# If we're dealing with device placement, this deals with that by...
|
||||
tpu_should_fix_optimizer = self.device_placement and self.distributed_type == DistributedType.XLA
|
||||
tpu_should_fix_optimizer = self.device_placement and self.distributed_type == DistributedType.TPU
|
||||
if tpu_should_fix_optimizer or (self.mixed_precision == "fp8" and self.fp8_recipe_handler.backend == "TE"):
|
||||
# 1. grabbing old model parameters
|
||||
old_named_params = self._get_named_parameters(*args)
|
||||
@ -1278,7 +1245,7 @@ class Accelerator:
|
||||
item in container
|
||||
for container in (self._dataloaders, self._models, self._optimizers, self._schedulers)
|
||||
):
|
||||
item._is_accelerate_prepared = True
|
||||
setattr(item, "_is_accelerate_prepared", True)
|
||||
|
||||
return result if len(result) > 1 else result[0]
|
||||
|
||||
@ -1440,7 +1407,7 @@ class Accelerator:
|
||||
elif self.distributed_type == DistributedType.MULTI_CPU:
|
||||
kwargs = self.ddp_handler.to_kwargs() if self.ddp_handler is not None else {}
|
||||
model = torch.nn.parallel.DistributedDataParallel(model, **kwargs)
|
||||
elif self.distributed_type == DistributedType.XLA and self.state.fork_launched:
|
||||
elif self.distributed_type == DistributedType.TPU and self.state.fork_launched:
|
||||
model = xmp.MpModelWrapper(model).to(self.device)
|
||||
# torch.compile should be called last and only if the model isn't already compiled.
|
||||
if self.state.dynamo_plugin.backend != DynamoBackend.NO and not is_compiled_module(model):
|
||||
@ -1877,7 +1844,7 @@ class Accelerator:
|
||||
self._dataloaders.append(data_loader)
|
||||
return data_loader
|
||||
if device_placement is None:
|
||||
device_placement = self.device_placement if self.distributed_type != DistributedType.XLA else False
|
||||
device_placement = self.device_placement if self.distributed_type != DistributedType.TPU else False
|
||||
prepared_data_loader = prepare_data_loader(
|
||||
data_loader,
|
||||
self.device,
|
||||
@ -2090,6 +2057,10 @@ class Accelerator:
|
||||
for opt in optimizer:
|
||||
while isinstance(opt, AcceleratedOptimizer):
|
||||
opt = opt.optimizer
|
||||
# Reduce gradients first for XLA
|
||||
if self.distributed_type == DistributedType.TPU:
|
||||
gradients = xm._fetch_gradients(opt)
|
||||
self.reduce(gradients, scale=1.0 / self.num_processes)
|
||||
self.scaler.unscale_(opt)
|
||||
|
||||
def clip_grad_norm_(self, parameters, max_norm, norm_type=2):
|
||||
@ -2127,19 +2098,6 @@ class Accelerator:
|
||||
# `accelerator.backward(loss)` is doing that automatically. Therefore, its implementation is not needed
|
||||
# We cannot return the gradient norm because DeepSpeed does it.
|
||||
return None
|
||||
elif self.distributed_type == DistributedType.XLA:
|
||||
# Reduce gradients first for XLA
|
||||
for acc_opt in self._optimizers:
|
||||
if not acc_opt.gradient_state.is_xla_gradients_synced:
|
||||
opt = acc_opt
|
||||
while isinstance(opt, AcceleratedOptimizer):
|
||||
opt = opt.optimizer
|
||||
gradients = xm._fetch_gradients(opt)
|
||||
# Use xm.all_reduce to perform an in-place all-reduce. Recusrsive all-reduce each tensor
|
||||
# one by one in self.reduce is non-inplace.
|
||||
xm.all_reduce("sum", gradients, scale=1.0 / self.num_processes)
|
||||
# Set is_xla_gradients_synced to True to avoid all-reduce twice in the AcceleratedOptimizer step.
|
||||
acc_opt.gradient_state.is_xla_gradients_synced = True
|
||||
self.unscale_gradients()
|
||||
return torch.nn.utils.clip_grad_norm_(parameters, max_norm, norm_type=norm_type)
|
||||
|
||||
@ -2428,7 +2386,7 @@ class Accelerator:
|
||||
self.trackers.append(tracker)
|
||||
else:
|
||||
tracker_init = LOGGER_TYPE_TO_CLASS[str(tracker)]
|
||||
if tracker_init.requires_logging_directory:
|
||||
if getattr(tracker_init, "requires_logging_directory"):
|
||||
# We can skip this check since it was done in `__init__`
|
||||
self.trackers.append(
|
||||
tracker_init(project_name, self.logging_dir, **init_kwargs.get(str(tracker), {}))
|
||||
@ -2757,7 +2715,7 @@ class Accelerator:
|
||||
os.makedirs(output_dir, exist_ok=True)
|
||||
logger.info(f"Saving current state to {output_dir}")
|
||||
|
||||
if self.distributed_type == DistributedType.XLA:
|
||||
if self.distributed_type == DistributedType.TPU:
|
||||
# Finish running the previous step before checkpointing
|
||||
xm.mark_step()
|
||||
|
||||
@ -3181,7 +3139,6 @@ class Accelerator:
|
||||
autocast_handler = self.autocast_handler
|
||||
autocast_context = get_mixed_precision_context_manager(self.native_amp, autocast_handler)
|
||||
autocast_context.__enter__()
|
||||
# TODO: should the `yield` be in a try/finally block?
|
||||
yield
|
||||
autocast_context.__exit__(*sys.exc_info())
|
||||
|
||||
|
||||
@ -441,13 +441,7 @@ def dispatch_model(
|
||||
def add_warning(fn, model):
|
||||
@wraps(fn)
|
||||
def wrapper(*args, **kwargs):
|
||||
warning_msg = "You shouldn't move a model that is dispatched using accelerate hooks."
|
||||
if str(fn.__name__) == "to":
|
||||
to_device = torch._C._nn._parse_to(*args, **kwargs)[0]
|
||||
if to_device is not None:
|
||||
logger.warning(warning_msg)
|
||||
else:
|
||||
logger.warning(warning_msg)
|
||||
logger.warning("You shouldn't move a model when it is dispatched on multiple devices.")
|
||||
for param in model.parameters():
|
||||
if param.device == torch.device("meta"):
|
||||
raise RuntimeError("You can't move a model that has some modules offloaded to cpu or disk.")
|
||||
|
||||
@ -32,13 +32,13 @@ from .utils import (
|
||||
SCHEDULER_NAME,
|
||||
WEIGHTS_NAME,
|
||||
get_pretty_name,
|
||||
is_torch_xla_available,
|
||||
is_tpu_available,
|
||||
is_xpu_available,
|
||||
save,
|
||||
)
|
||||
|
||||
|
||||
if is_torch_xla_available():
|
||||
if is_tpu_available(check_device=False):
|
||||
import torch_xla.core.xla_model as xm
|
||||
|
||||
from .logging import get_logger
|
||||
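The import hunk above swaps `is_tpu_available(check_device=False)` for `is_torch_xla_available()`. A hedged sketch of the guarded-import pattern both variants implement (using the newer function name; the helper below is hypothetical):

```python
from accelerate.utils import is_torch_xla_available  # on v0.27.x this role is played by is_tpu_available

if is_torch_xla_available():
    import torch_xla.core.xla_model as xm  # only importable when torch_xla is installed


def xla_device_or_none():
    # Hypothetical helper: return the process-local XLA device when torch_xla is present, else None.
    return xm.xla_device() if is_torch_xla_available() else None
```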
@ -142,7 +142,7 @@ def save_accelerator_state(
|
||||
states["torch_xpu_manual_seed"] = torch.xpu.get_rng_state_all()
|
||||
else:
|
||||
states["torch_cuda_manual_seed"] = torch.cuda.get_rng_state_all()
|
||||
if is_torch_xla_available():
|
||||
if is_tpu_available():
|
||||
states["xm_seed"] = xm.get_rng_state()
|
||||
output_states_file = output_dir.joinpath(states_name)
|
||||
torch.save(states, output_states_file)
|
||||
@ -249,7 +249,7 @@ def load_accelerator_state(
|
||||
torch.xpu.set_rng_state_all(states["torch_xpu_manual_seed"])
|
||||
else:
|
||||
torch.cuda.set_rng_state_all(states["torch_cuda_manual_seed"])
|
||||
if is_torch_xla_available():
|
||||
if is_tpu_available():
|
||||
xm.set_rng_state(states["xm_seed"])
|
||||
logger.info("All random states loaded successfully")
|
||||
except Exception:
|
||||
|
||||
@ -1,13 +0,0 @@
|
||||
# Copyright 2020 The HuggingFace Team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
@ -12,7 +12,9 @@
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License
|
||||
# limitations under the License.
|
||||
|
||||
from argparse import ArgumentParser
|
||||
|
||||
from accelerate.commands.config import get_config_parser
|
||||
from accelerate.commands.env import env_command_parser
|
||||
@ -20,13 +22,10 @@ from accelerate.commands.estimate import estimate_command_parser
|
||||
from accelerate.commands.launch import launch_command_parser
|
||||
from accelerate.commands.test import test_command_parser
|
||||
from accelerate.commands.tpu import tpu_command_parser
|
||||
from accelerate.commands.utils import ArgumentParserWithDashSupport
|
||||
|
||||
|
||||
def main():
|
||||
parser = ArgumentParserWithDashSupport(
|
||||
"Accelerate CLI tool", usage="accelerate <command> [<args>]", allow_abbrev=False
|
||||
)
|
||||
parser = ArgumentParser("Accelerate CLI tool", usage="accelerate <command> [<args>]", allow_abbrev=False)
|
||||
subparsers = parser.add_subparsers(help="accelerate command helpers")
|
||||
|
||||
# Register commands
|
||||
|
||||
@ -126,7 +126,7 @@ def get_cluster_input():
|
||||
if (
|
||||
not use_cpu
|
||||
and is_xpu_available()
|
||||
and distributed_type not in [DistributedType.MULTI_GPU, DistributedType.MULTI_NPU, DistributedType.XLA]
|
||||
and distributed_type not in [DistributedType.MULTI_GPU, DistributedType.MULTI_NPU, DistributedType.TPU]
|
||||
):
|
||||
ipex_config["use_xpu"] = _ask_field(
|
||||
"Do you want to use XPU plugin to speed up training on XPU? [yes/NO]:",
|
||||
@ -481,7 +481,7 @@ def get_cluster_input():
|
||||
DistributedType.MULTI_XPU,
|
||||
DistributedType.MULTI_GPU,
|
||||
DistributedType.MULTI_NPU,
|
||||
DistributedType.XLA,
|
||||
DistributedType.TPU,
|
||||
]:
|
||||
machine_type = str(distributed_type).split(".")[1].replace("MULTI_", "")
|
||||
if machine_type == "TPU":
|
||||
@ -529,7 +529,7 @@ def get_cluster_input():
|
||||
default="all",
|
||||
)
|
||||
|
||||
if distributed_type == DistributedType.XLA:
|
||||
if distributed_type == DistributedType.TPU:
|
||||
mixed_precision = "no"
|
||||
main_training_function = _ask_field(
|
||||
"What is the name of the function in your script that should be launched in all parallel scripts? [main]: ",
|
||||
@ -620,7 +620,7 @@ def get_cluster_input():
|
||||
"Torch dynamo used without mixed precision requires TF32 to be efficient. Accelerate will enable it by default when launching your scripts."
|
||||
)
|
||||
|
||||
if distributed_type == DistributedType.XLA and mixed_precision == "bf16":
|
||||
if distributed_type == DistributedType.TPU and mixed_precision == "bf16":
|
||||
tpu_downcast_bf16 = _ask_field(
|
||||
"Should `torch.float` be cast as `bfloat16` and `torch.double` remain `float32` on TPUs?", default="no"
|
||||
)
|
||||
|
||||
@ -27,7 +27,7 @@ from ...utils.constants import SAGEMAKER_PYTHON_VERSION, SAGEMAKER_PYTORCH_VERSI
|
||||
|
||||
|
||||
hf_cache_home = os.path.expanduser(
|
||||
os.environ.get("HF_HOME", os.path.join(os.environ.get("XDG_CACHE_HOME", "~/.cache"), "huggingface"))
|
||||
os.getenv("HF_HOME", os.path.join(os.getenv("XDG_CACHE_HOME", "~/.cache"), "huggingface"))
|
||||
)
|
||||
cache_dir = os.path.join(hf_cache_home, "accelerate")
|
||||
default_json_config_file = os.path.join(cache_dir, "default_config.yaml")
|
||||
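For orientation, the cache-path logic in this hunk resolves the default config file roughly as follows (a sketch mirroring the lines above, not the exact module code):

```python
import os

hf_cache_home = os.path.expanduser(
    os.environ.get("HF_HOME", os.path.join(os.environ.get("XDG_CACHE_HOME", "~/.cache"), "huggingface"))
)
cache_dir = os.path.join(hf_cache_home, "accelerate")
default_config_file = os.path.join(cache_dir, "default_config.yaml")
print(default_config_file)  # typically ~/.cache/huggingface/accelerate/default_config.yaml
```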
@ -45,13 +45,13 @@ def load_config_from_file(config_file):
|
||||
if not os.path.isfile(config_file):
|
||||
raise FileNotFoundError(
|
||||
f"The passed configuration file `{config_file}` does not exist. "
|
||||
"Please pass an existing file to `accelerate launch`, or use the default one "
|
||||
"Please pass an existing file to `accelerate launch`, or use the the default one "
|
||||
"created through `accelerate config` and run `accelerate launch` "
|
||||
"without the `--config_file` argument."
|
||||
)
|
||||
else:
|
||||
config_file = default_config_file
|
||||
with open(config_file, encoding="utf-8") as f:
|
||||
with open(config_file, "r", encoding="utf-8") as f:
|
||||
if config_file.endswith(".json"):
|
||||
if (
|
||||
json.load(f).get("compute_environment", ComputeEnvironment.LOCAL_MACHINE)
|
||||
@ -94,7 +94,7 @@ class BaseConfig:
|
||||
@classmethod
|
||||
def from_json_file(cls, json_file=None):
|
||||
json_file = default_json_config_file if json_file is None else json_file
|
||||
with open(json_file, encoding="utf-8") as f:
|
||||
with open(json_file, "r", encoding="utf-8") as f:
|
||||
config_dict = json.load(f)
|
||||
if "compute_environment" not in config_dict:
|
||||
config_dict["compute_environment"] = ComputeEnvironment.LOCAL_MACHINE
|
||||
@ -126,7 +126,7 @@ class BaseConfig:
|
||||
@classmethod
|
||||
def from_yaml_file(cls, yaml_file=None):
|
||||
yaml_file = default_yaml_config_file if yaml_file is None else yaml_file
|
||||
with open(yaml_file, encoding="utf-8") as f:
|
||||
with open(yaml_file, "r", encoding="utf-8") as f:
|
||||
config_dict = yaml.safe_load(f)
|
||||
if "compute_environment" not in config_dict:
|
||||
config_dict["compute_environment"] = ComputeEnvironment.LOCAL_MACHINE
|
||||
|
||||
@ -68,7 +68,7 @@ def _convert_compute_environment(value):
|
||||
|
||||
def _convert_distributed_mode(value):
|
||||
value = int(value)
|
||||
return DistributedType(["NO", "MULTI_CPU", "MULTI_XPU", "MULTI_GPU", "MULTI_NPU", "XLA"][value])
|
||||
return DistributedType(["NO", "MULTI_CPU", "MULTI_XPU", "MULTI_GPU", "MULTI_NPU", "TPU"][value])
|
||||
|
||||
|
||||
def _convert_dynamo_backend(value):
|
||||
|
||||
@ -105,8 +105,6 @@ def create_empty_model(model_name: str, library_name: str, trust_remote_code: bo
|
||||
f"To check `{model_name}`, `transformers` must be installed. Please install it via `pip install transformers`"
|
||||
)
|
||||
print(f"Loading pretrained config for `{model_name}` from `transformers`...")
|
||||
if model_info.config is None:
|
||||
raise RuntimeError(f"Tried to load `{model_name}` with `transformers` but it does not have any metadata.")
|
||||
|
||||
auto_map = model_info.config.get("auto_map", False)
|
||||
config = AutoConfig.from_pretrained(model_name, trust_remote_code=trust_remote_code, token=access_token)
|
||||
|
||||
@ -28,7 +28,6 @@ import torch
|
||||
from accelerate.commands.config import default_config_file, load_config_from_file
|
||||
from accelerate.commands.config.config_args import SageMakerConfig
|
||||
from accelerate.commands.config.config_utils import DYNAMO_BACKENDS
|
||||
from accelerate.commands.utils import ArgumentParserWithDashSupport
|
||||
from accelerate.state import get_int_from_env
|
||||
from accelerate.utils import (
|
||||
ComputeEnvironment,
|
||||
@ -42,7 +41,7 @@ from accelerate.utils import (
|
||||
is_rich_available,
|
||||
is_sagemaker_available,
|
||||
is_torch_version,
|
||||
is_torch_xla_available,
|
||||
is_tpu_available,
|
||||
is_xpu_available,
|
||||
patch_environment,
|
||||
prepare_deepspeed_cmd_env,
|
||||
@ -106,19 +105,19 @@ class _CustomHelpAction(argparse._HelpAction):
|
||||
for i, arg in enumerate(opts):
|
||||
# If the argument's container is outside of the used titles, hide it
|
||||
if arg.container.title not in titles + used_titles:
|
||||
opts[i].help = argparse.SUPPRESS
|
||||
setattr(opts[i], "help", argparse.SUPPRESS)
|
||||
# If the argument is hardware selection, but not being passed, hide it
|
||||
elif arg.container.title == "Hardware Selection Arguments":
|
||||
if set(arg.option_strings).isdisjoint(set(args)):
|
||||
opts[i].help = argparse.SUPPRESS
|
||||
setattr(opts[i], "help", argparse.SUPPRESS)
|
||||
else:
|
||||
opts[i].help = arg.help + " (currently selected)"
|
||||
setattr(opts[i], "help", arg.help + " (currently selected)")
|
||||
# If the argument is a training paradigm, but not being passed, hide it
|
||||
elif arg.container.title == "Training Paradigm Arguments":
|
||||
if set(arg.option_strings).isdisjoint(set(used_platforms)):
|
||||
opts[i].help = argparse.SUPPRESS
|
||||
setattr(opts[i], "help", argparse.SUPPRESS)
|
||||
else:
|
||||
opts[i].help = arg.help + " (currently selected)"
|
||||
setattr(opts[i], "help", arg.help + " (currently selected)")
|
||||
for i, group in enumerate(list(parser._action_groups)):
|
||||
# If all arguments in the group are hidden, hide the group
|
||||
if all([arg.help == argparse.SUPPRESS for arg in group._group_actions]):
|
||||
@ -128,13 +127,10 @@ class _CustomHelpAction(argparse._HelpAction):
|
||||
|
||||
|
||||
def launch_command_parser(subparsers=None):
|
||||
description = "Launch a python script in a distributed scenario. Arguments can be passed in with either hyphens (`a-b`) or underscores (`a_b`)"
|
||||
if subparsers is not None:
|
||||
parser = subparsers.add_parser("launch", description=description, add_help=False, allow_abbrev=False)
|
||||
parser = subparsers.add_parser("launch", add_help=False, allow_abbrev=False)
|
||||
else:
|
||||
parser = ArgumentParserWithDashSupport(
|
||||
"Accelerate launch command", description=description, add_help=False, allow_abbrev=False
|
||||
)
|
||||
parser = argparse.ArgumentParser("Accelerate launch command", add_help=False, allow_abbrev=False)
|
||||
|
||||
parser.register("action", "help", _CustomHelpAction)
|
||||
parser.add_argument("-h", "--help", action="help", help="Show this help message and exit.")
|
||||
@ -875,7 +871,7 @@ def _validate_launch_command(args):
|
||||
in (DistributedType.MULTI_GPU, DistributedType.MULTI_NPU, DistributedType.MULTI_XPU)
|
||||
else False
|
||||
)
|
||||
args.tpu = defaults.distributed_type == DistributedType.XLA
|
||||
args.tpu = defaults.distributed_type == DistributedType.TPU
|
||||
args.use_fsdp = defaults.distributed_type == DistributedType.FSDP
|
||||
args.use_megatron_lm = defaults.distributed_type == DistributedType.MEGATRON_LM
|
||||
args.tpu_use_cluster = defaults.tpu_use_cluster if args.tpu else False
|
||||
@ -928,16 +924,14 @@ def _validate_launch_command(args):
|
||||
args.mixed_precision = defaults.mixed_precision
|
||||
mp_from_config_flag = True
|
||||
else:
|
||||
native_amp = False
|
||||
err = "{mode} mixed precision requires {requirement}"
|
||||
if args.use_cpu or (args.use_xpu and torch.xpu.is_available()):
|
||||
native_amp = is_torch_version(">=", "1.10")
|
||||
else:
|
||||
native_amp = is_bf16_available(True)
|
||||
if (
|
||||
args.mixed_precision == "bf16"
|
||||
and not native_amp
|
||||
and not (args.tpu and is_torch_xla_available(check_is_tpu=True))
|
||||
):
|
||||
raise ValueError("bf16 mixed precision requires PyTorch >= 1.10 and a supported device.")
|
||||
if args.mixed_precision == "bf16" and not native_amp and not (args.tpu and is_tpu_available()):
|
||||
raise ValueError(err.format(mode="bf16", requirement="PyTorch >= 1.10 and a supported device."))
|
||||
|
||||
# Silently set the default here
|
||||
if args.dynamo_backend is None:
|
||||
|
||||
@ -1,14 +1 @@
|
||||
# Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
from .selection_menu import BulletMenu
|
||||
|
||||
@ -30,7 +30,7 @@ def mark(key: str):
|
||||
def decorator(func):
|
||||
handle = getattr(func, "handle_key", [])
|
||||
handle += [key]
|
||||
func.handle_key = handle
|
||||
setattr(func, "handle_key", handle)
|
||||
return func
|
||||
|
||||
return decorator
|
||||
@ -44,7 +44,7 @@ def mark_multiple(*keys: List[str]):
|
||||
def decorator(func):
|
||||
handle = getattr(func, "handle_key", [])
|
||||
handle += keys
|
||||
func.handle_key = handle
|
||||
setattr(func, "handle_key", handle)
|
||||
return func
|
||||
|
||||
return decorator
|
||||
@ -58,8 +58,8 @@ class KeyHandler(type):
|
||||
def __new__(cls, name, bases, attrs):
|
||||
new_cls = super().__new__(cls, name, bases, attrs)
|
||||
if not hasattr(new_cls, "key_handler"):
|
||||
new_cls.key_handler = {}
|
||||
new_cls.handle_input = KeyHandler.handle_input
|
||||
setattr(new_cls, "key_handler", {})
|
||||
setattr(new_cls, "handle_input", KeyHandler.handle_input)
|
||||
|
||||
for value in attrs.values():
|
||||
handled_keys = getattr(value, "handle_key", [])
|
||||
|
||||
@ -16,6 +16,7 @@
|
||||
Utilities relating to parsing raw characters from the keyboard, based on https://github.com/bchao1/bullet
|
||||
"""
|
||||
|
||||
|
||||
import os
|
||||
import string
|
||||
import sys
|
||||
|
||||
@ -15,7 +15,6 @@
|
||||
"""
|
||||
Main driver for the selection menu, based on https://github.com/bchao1/bullet
|
||||
"""
|
||||
|
||||
import builtins
|
||||
import sys
|
||||
|
||||
|
||||
@ -17,7 +17,7 @@
|
||||
import argparse
|
||||
import os
|
||||
|
||||
from accelerate.test_utils import execute_subprocess_async, path_in_accelerate_package
|
||||
from accelerate.test_utils import execute_subprocess_async
|
||||
|
||||
|
||||
def test_command_parser(subparsers=None):
|
||||
@ -43,14 +43,14 @@ def test_command_parser(subparsers=None):
|
||||
|
||||
|
||||
def test_command(args):
|
||||
script_name = path_in_accelerate_package("test_utils", "scripts", "test_script.py")
|
||||
script_name = os.path.sep.join(__file__.split(os.path.sep)[:-2] + ["test_utils", "scripts", "test_script.py"])
|
||||
|
||||
if args.config_file is None:
|
||||
test_args = [script_name]
|
||||
test_args = script_name
|
||||
else:
|
||||
test_args = f"--config_file={args.config_file} {script_name}".split()
|
||||
test_args = f"--config_file={args.config_file} {script_name}"
|
||||
|
||||
cmd = ["accelerate-launch"] + test_args
|
||||
cmd = ["accelerate-launch"] + test_args.split()
|
||||
result = execute_subprocess_async(cmd, env=os.environ.copy())
|
||||
if result.returncode == 0:
|
||||
print("Test is a success! You are ready for your distributed training!")
|
||||
|
||||
@ -112,7 +112,7 @@ def tpu_command_launcher(args):
|
||||
raise ValueError("You must specify either a command file or a command to run on the pod.")
|
||||
|
||||
if args.command_file:
|
||||
with open(args.command_file) as f:
|
||||
with open(args.command_file, "r") as f:
|
||||
args.command = [f.read().splitlines()]
|
||||
|
||||
# To turn list of lists into list of strings
|
||||
|
||||
@ -1,90 +0,0 @@
|
||||
# Copyright 2024 The HuggingFace Team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import argparse
|
||||
|
||||
|
||||
class ArgumentParserWithDashSupport(argparse.ArgumentParser):
|
||||
"""
|
||||
argparse subclass that allows for seamless use of `--a-b` or `--a_b` style arguments automatically.
|
||||
|
||||
Based on the implementation here:
|
||||
https://stackoverflow.com/questions/53527387/make-argparse-treat-dashes-and-underscore-identically
|
||||
"""
|
||||
|
||||
def _parse_optional(self, arg_string):
|
||||
# Conditions to not change anything/it's a positional:
|
||||
# - Empty string, positional
|
||||
# - Doesn't start with a prefix
|
||||
# - Single character
|
||||
if (not arg_string) or (arg_string[0] not in self.prefix_chars) or (len(arg_string) == 1):
|
||||
return None
|
||||
|
||||
option_tuples = self._get_option_tuples(arg_string)
|
||||
|
||||
# If multiple matches, it was ambigous, raise an error
|
||||
if len(option_tuples) > 1:
|
||||
options = ", ".join([option_string for _, option_string, _ in option_tuples])
|
||||
self.error(f"ambiguous option: {arg_string} could match {options}")
|
||||
|
||||
# If exactly one match, return it
|
||||
elif len(option_tuples) == 1:
|
||||
(option_tuple,) = option_tuples
|
||||
return option_tuple
|
||||
|
||||
# If not found, but looks like a negative number, probably posisional
|
||||
if self._negative_number_matcher.match(arg_string) and not self._has_negative_number_optionals:
|
||||
return None
|
||||
|
||||
# If it has a space, probably positional
|
||||
if " " in arg_string:
|
||||
return None
|
||||
|
||||
# Otherwise meant to be optional though no such option,
|
||||
# but could be valid in a subparser so just return it
|
||||
return None, arg_string, None
|
||||
|
||||
def _get_option_tuples(self, option_string):
|
||||
result = []
|
||||
explicit_arg = None
|
||||
if "=" in option_string:
|
||||
option_prefix, explicit_arg = option_string.split("=", 1)
|
||||
else:
|
||||
option_prefix = option_string
|
||||
# Assuming it's a perfect match
|
||||
if option_prefix in self._option_string_actions:
|
||||
action = self._option_string_actions[option_prefix]
|
||||
result.append((action, option_prefix, explicit_arg))
|
||||
else:
|
||||
# Imperfect match, have to go dig
|
||||
chars = self.prefix_chars
|
||||
if option_string[0] in chars and option_string[1] not in chars:
|
||||
# short option: if single character, can be concatenated with arguments
|
||||
short_option_prefix = option_string[:2]
|
||||
short_explicit_arg = option_string[2:]
|
||||
if short_option_prefix in self._option_string_actions:
|
||||
action = self._option_string_actions[short_option_prefix]
|
||||
result.append((action, short_option_prefix, short_explicit_arg))
|
||||
|
||||
# Finally check for `-` vs `_`
|
||||
underscored = {k.replace("-", "_"): k for k in self._option_string_actions}
|
||||
option_prefix = option_prefix.replace("-", "_")
|
||||
if option_prefix in underscored:
|
||||
action = self._option_string_actions[underscored[option_prefix]]
|
||||
result.append((action, underscored[option_prefix], explicit_arg))
|
||||
elif self.allow_abbrev:
|
||||
for option_string in underscored:
|
||||
if option_string.startswith(option_prefix):
|
||||
action = self._option_string_actions[underscored[option_string]]
|
||||
result.append((action, underscored[option_string], explicit_arg))
|
||||
return result
|
||||
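As a usage sketch for the `ArgumentParserWithDashSupport` class shown above (the `--num_processes` flag is purely illustrative, and the class exists only on the branch that ships `accelerate/commands/utils.py`):

```python
from accelerate.commands.utils import ArgumentParserWithDashSupport

parser = ArgumentParserWithDashSupport("demo", allow_abbrev=False)
parser.add_argument("--num_processes", type=int, default=1)

# Both spellings resolve to the same option via _get_option_tuples:
assert parser.parse_args(["--num-processes", "2"]).num_processes == 2
assert parser.parse_args(["--num_processes", "2"]).num_processes == 2
```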
@ -20,7 +20,7 @@ import torch
|
||||
from torch.utils.data import BatchSampler, DataLoader, IterableDataset, RandomSampler
|
||||
|
||||
from .logging import get_logger
|
||||
from .state import AcceleratorState, DistributedType, GradientState, is_torch_xla_available
|
||||
from .state import AcceleratorState, DistributedType, GradientState, is_tpu_available
|
||||
from .utils import (
|
||||
RNGType,
|
||||
broadcast,
|
||||
@ -501,7 +501,7 @@ class DataLoaderShard(DataLoader, DataLoaderStateMixin):
|
||||
return len(self.dataset)
|
||||
|
||||
|
||||
if is_torch_xla_available():
|
||||
if is_tpu_available(check_device=False):
|
||||
import torch_xla.distributed.parallel_loader as xpl
|
||||
|
||||
class MpDeviceLoaderWrapper(xpl.MpDeviceLoader):
|
||||
@ -942,7 +942,7 @@ def prepare_data_loader(
|
||||
elif sampler_is_batch_sampler:
|
||||
dataloader = DataLoaderShard(
|
||||
new_dataset,
|
||||
device=device if put_on_device and state.distributed_type != DistributedType.XLA else None,
|
||||
device=device if put_on_device and state.distributed_type != DistributedType.TPU else None,
|
||||
sampler=new_batch_sampler,
|
||||
batch_size=dataloader.batch_size,
|
||||
rng_types=rng_types,
|
||||
@ -953,7 +953,7 @@ def prepare_data_loader(
|
||||
else:
|
||||
dataloader = DataLoaderShard(
|
||||
new_dataset,
|
||||
device=device if put_on_device and state.distributed_type != DistributedType.XLA else None,
|
||||
device=device if put_on_device and state.distributed_type != DistributedType.TPU else None,
|
||||
batch_sampler=new_batch_sampler,
|
||||
rng_types=rng_types,
|
||||
synchronized_generator=synchronized_generator,
|
||||
@ -966,7 +966,7 @@ def prepare_data_loader(
|
||||
dataloader.sampler.sampler = sampler
|
||||
else:
|
||||
dataloader.batch_sampler.sampler = sampler
|
||||
if state.distributed_type == DistributedType.XLA:
|
||||
if state.distributed_type == DistributedType.TPU:
|
||||
return MpDeviceLoaderWrapper(dataloader, device)
|
||||
return dataloader
|
||||
|
||||
|
||||
@ -374,7 +374,7 @@ class AlignDevicesHook(ModelHook):
|
||||
# this dictionary to allow the garbage collector to do its job.
|
||||
for value_pointer, device in self.tied_pointers_to_remove:
|
||||
del self.tied_params_map[value_pointer][device]
|
||||
self.tied_pointers_to_remove = set()
|
||||
self.tied_pointers_to_remove = None
|
||||
|
||||
if self.io_same_device and self.input_device is not None:
|
||||
output = send_to_device(output, self.input_device, skip_keys=self.skip_keys)
|
||||
|
||||
@ -1,16 +1,3 @@
|
||||
# Copyright 2024 The HuggingFace Team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import math
|
||||
from types import MethodType
|
||||
from typing import Any, Dict, List, Optional, Tuple, Union
|
||||
@ -88,10 +75,8 @@ def build_pipeline(model, split_points, args, kwargs, num_chunks):
|
||||
annotate_split_points(model, {split_point: PipeSplitWrapper.SplitPoint.BEGINNING for split_point in split_points})
|
||||
found_batch_size = find_pippy_batch_size(args, kwargs)
|
||||
if found_batch_size != num_chunks:
|
||||
if args is not None:
|
||||
args = pad_input_tensors(args, found_batch_size, num_chunks)
|
||||
if kwargs is not None:
|
||||
kwargs = pad_input_tensors(kwargs, found_batch_size, num_chunks)
|
||||
args = pad_input_tensors(args, found_batch_size, num_chunks)
|
||||
kwargs = pad_input_tensors(kwargs, found_batch_size, num_chunks)
|
||||
pipe = Pipe.from_tracing(model, num_chunks=num_chunks, example_args=args, example_kwargs=kwargs)
|
||||
stage = PipelineStage(pipe, state.local_process_index, device=state.device)
|
||||
|
||||
|
||||
@ -18,10 +18,10 @@ import warnings
|
||||
import torch
|
||||
|
||||
from .state import AcceleratorState, GradientState
|
||||
from .utils import DistributedType, honor_type, is_torch_xla_available
|
||||
from .utils import DistributedType, honor_type, is_tpu_available
|
||||
|
||||
|
||||
if is_torch_xla_available():
|
||||
if is_tpu_available(check_device=False):
|
||||
import torch_xla.core.xla_model as xm
|
||||
|
||||
|
||||
@ -68,7 +68,7 @@ class AcceleratedOptimizer(torch.optim.Optimizer):
|
||||
# Handle device placement
|
||||
if device_placement:
|
||||
state_dict = self.optimizer.state_dict()
|
||||
if self.accelerator_state.distributed_type == DistributedType.XLA:
|
||||
if self.accelerator_state.distributed_type == DistributedType.TPU:
|
||||
xm.send_cpu_data_to_device(state_dict, self.accelerator_state.device)
|
||||
else:
|
||||
state_dict = move_to_device(state_dict, self.accelerator_state.device)
|
||||
@ -102,7 +102,7 @@ class AcceleratedOptimizer(torch.optim.Optimizer):
|
||||
self.optimizer.add_param_group(param_group)
|
||||
|
||||
def load_state_dict(self, state_dict):
|
||||
if self.accelerator_state.distributed_type == DistributedType.XLA and self.device_placement:
|
||||
if self.accelerator_state.distributed_type == DistributedType.TPU and self.device_placement:
|
||||
xm.send_cpu_data_to_device(state_dict, self.accelerator_state.device)
|
||||
self.optimizer.load_state_dict(state_dict)
|
||||
|
||||
@ -122,15 +122,11 @@ class AcceleratedOptimizer(torch.optim.Optimizer):
|
||||
self.optimizer.zero_grad()
|
||||
|
||||
def step(self, closure=None):
|
||||
if (
|
||||
not self.gradient_state.is_xla_gradients_synced
|
||||
and self.accelerator_state.distributed_type == DistributedType.XLA
|
||||
):
|
||||
gradients = xm._fetch_gradients(self.optimizer)
|
||||
xm.all_reduce("sum", gradients, scale=1.0 / xm.xrt_world_size())
|
||||
self.gradient_state.is_xla_gradients_synced = True
|
||||
if self.gradient_state.sync_gradients:
|
||||
if self.scaler is not None:
|
||||
if self.accelerator_state.distributed_type == DistributedType.TPU:
|
||||
optimizer_args = {"closure": closure} if closure is not None else {}
|
||||
xm.optimizer_step(self.optimizer, optimizer_args=optimizer_args)
|
||||
elif self.scaler is not None:
|
||||
self.optimizer.step = self._optimizer_patched_step_method
|
||||
|
||||
self.scaler.step(self.optimizer, closure)
|
||||
@ -147,8 +143,6 @@ class AcceleratedOptimizer(torch.optim.Optimizer):
|
||||
self._accelerate_step_called = False
|
||||
else:
|
||||
self.optimizer.step(closure)
|
||||
if self.accelerator_state.distributed_type == DistributedType.XLA:
|
||||
self.gradient_state.is_xla_gradients_synced = False
|
||||
|
||||
def _switch_parameters(self, parameters_map):
|
||||
for param_group in self.optimizer.param_groups:
|
||||
|
||||
@ -39,7 +39,7 @@ from .utils import (
|
||||
is_ipex_available,
|
||||
is_mps_available,
|
||||
is_npu_available,
|
||||
is_torch_xla_available,
|
||||
is_tpu_available,
|
||||
is_xpu_available,
|
||||
parse_choice_from_env,
|
||||
parse_flag_from_env,
|
||||
@ -47,7 +47,7 @@ from .utils import (
|
||||
from .utils.dataclasses import SageMakerDistributedType
|
||||
|
||||
|
||||
if is_torch_xla_available():
|
||||
if is_tpu_available(check_device=False):
|
||||
import torch_xla.core.xla_model as xm
|
||||
|
||||
|
||||
@ -98,7 +98,7 @@ class ThreadLocalSharedDict(threading.local):
|
||||
|
||||
|
||||
# Prefer global shared dictionary, except when using TPU.
|
||||
SharedDict = dict if not is_torch_xla_available() else ThreadLocalSharedDict
|
||||
SharedDict = dict if not is_tpu_available(check_device=False) else ThreadLocalSharedDict
|
||||
|
||||
|
||||
# Inspired by Alex Martelli's 'Borg'.
|
||||
@ -157,16 +157,12 @@ class PartialState:
|
||||
if self.device is None:
|
||||
self.device = torch.device("cuda", self.local_process_index)
|
||||
torch.cuda.set_device(self.device)
|
||||
elif is_torch_xla_available() and not cpu:
|
||||
self.distributed_type = DistributedType.XLA
|
||||
self.device = xm.xla_device()
|
||||
xm.set_replication(self.device, [self.device])
|
||||
elif is_tpu_available() and not cpu:
|
||||
self.distributed_type = DistributedType.TPU
|
||||
self.num_processes = xm.xrt_world_size()
|
||||
self.process_index = xm.get_ordinal()
|
||||
if is_torch_xla_available(check_is_tpu=True):
|
||||
self.local_process_index = xm.get_local_ordinal()
|
||||
else:
|
||||
self.local_process_index = int(os.environ.get("LOCAL_RANK", -1))
|
||||
self.local_process_index = xm.get_local_ordinal()
|
||||
self.device = xm.xla_device()
|
||||
elif (
|
||||
os.environ.get("ACCELERATE_USE_DEEPSPEED", "false") == "true"
|
||||
and int(os.environ.get("LOCAL_RANK", -1)) != -1
|
||||
@ -251,10 +247,7 @@ class PartialState:
|
||||
if self.device is None:
|
||||
self.device = torch.device("npu", self.local_process_index)
|
||||
torch.npu.set_device(self.device)
|
||||
elif (
|
||||
get_int_from_env(["PMI_SIZE", "OMPI_COMM_WORLD_SIZE", "MV2_COMM_WORLD_SIZE", "WORLD_SIZE"], 1) > 1
|
||||
or int(os.environ.get("LOCAL_RANK", -1)) != -1
|
||||
):
|
||||
elif get_int_from_env(["PMI_SIZE", "OMPI_COMM_WORLD_SIZE", "MV2_COMM_WORLD_SIZE", "WORLD_SIZE"], 1) > 1:
|
||||
if not cpu and is_xpu_available():
|
||||
self.distributed_type = DistributedType.MULTI_XPU
|
||||
else:
|
||||
@ -339,6 +332,7 @@ class PartialState:
|
||||
|
||||
if self.device is None:
|
||||
self.device = torch.device("cpu") if cpu else self.default_device
|
||||
|
||||
self.fork_launched = parse_flag_from_env("FORK_LAUNCHED", 0)
|
||||
|
||||
def __repr__(self) -> str:
|
||||
@ -419,7 +413,7 @@ class PartialState:
|
||||
DistributedType.FSDP,
|
||||
):
|
||||
torch.distributed.barrier()
|
||||
elif self.distributed_type == DistributedType.XLA:
|
||||
elif self.distributed_type == DistributedType.TPU:
|
||||
xm.rendezvous("accelerate.utils.wait_for_everyone")
|
||||
|
||||
def _goes_first(self, is_main: bool):
|
||||
@ -806,7 +800,7 @@ class AcceleratorState:
|
||||
)
|
||||
# deepspeed handles mixed_precision using deepspeed_config
|
||||
self._mixed_precision = "no" if self.distributed_type == DistributedType.DEEPSPEED else mixed_precision
|
||||
if self.distributed_type == DistributedType.XLA and is_torch_xla_available(check_is_tpu=True):
|
||||
if self.distributed_type == DistributedType.TPU:
|
||||
if mixed_precision == "bf16":
|
||||
if os.environ.get("ACCELERATE_DOWNCAST_BF16"):
|
||||
os.environ["XLA_USE_BF16"] = str(0)
|
||||
@ -846,6 +840,7 @@ class AcceleratorState:
|
||||
if self._mixed_precision != "no":
|
||||
fsdp_plugin.set_mixed_precision(self._mixed_precision)
|
||||
self.fsdp_plugin = fsdp_plugin
|
||||
|
||||
if (
|
||||
self.dynamo_plugin.backend != DynamoBackend.NO
|
||||
and self._mixed_precision == "no"
|
||||
@ -1016,10 +1011,6 @@ class GradientState:
|
||||
accumulation
|
||||
- **sync_with_dataloader** (`bool`) -- Whether the gradients should be synced at the end of the dataloader
|
||||
iteration and the number of total steps reset
|
||||
- **is_xla_gradients_synced** (`bool`) -- Whether the XLA gradients have been synchronized. It is initialized
|
||||
as false. Once gradients have been reduced before the optimizer step, this flag is set to true. Subsequently,
|
||||
after each step, the flag is reset to false. FSDP will always synchronize the gradients, hence
|
||||
is_xla_gradients_synced is always true.
|
||||
"""
|
||||
|
||||
_shared_state = SharedDict()
|
||||
@ -1033,7 +1024,6 @@ class GradientState:
|
||||
self.plugin_kwargs = (
|
||||
gradient_accumulation_plugin.to_kwargs() if gradient_accumulation_plugin is not None else {}
|
||||
)
|
||||
self._is_xla_gradients_synced = False
|
||||
|
||||
# Plugin args are different and can be updated
|
||||
if gradient_accumulation_plugin is not None and self.plugin_kwargs != gradient_accumulation_plugin.to_kwargs():
|
||||
@ -1081,28 +1071,9 @@ class GradientState:
|
||||
f"Gradient accumulation plugin: {self.plugin_kwargs}\n"
|
||||
)
|
||||
|
||||
@property
|
||||
def is_xla_gradients_synced(self):
|
||||
"Returns the value of is_xla_gradients_synced. FSDP will always synchronize the gradients, hence is_xla_gradients_synced is always true."
|
||||
if parse_flag_from_env("ACCELERATE_USE_FSDP", default=False):
|
||||
return True
|
||||
return self._is_xla_gradients_synced
|
||||
|
||||
@is_xla_gradients_synced.setter
|
||||
def is_xla_gradients_synced(self, is_synced):
|
||||
"Set the _is_xla_gradients_synced attribute."
|
||||
self._is_xla_gradients_synced = is_synced
|
||||
|
||||
def _set_sync_gradients(self, sync_gradients):
|
||||
"Private function that sets whether gradients should be synchronized. Users should not have to call this."
|
||||
self.sync_gradients = sync_gradients
|
||||
# Allow grad-sync to automatically work on TPUs
|
||||
if (
|
||||
self.sync_gradients
|
||||
and is_torch_xla_available(check_is_tpu=True)
|
||||
and PartialState().distributed_type == DistributedType.XLA
|
||||
):
|
||||
xm.mark_step()
|
||||
|
||||
def _add_dataloader(self, dataloader):
|
||||
"Private function that adds a dataloader to `self.dataloader_references` and sets `in_dataloader` to `True`. Users should not have to call this."
|
||||
|
||||
@ -1,25 +1,9 @@
|
||||
# Copyright 2020 The HuggingFace Team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
from .testing import (
|
||||
DEFAULT_LAUNCH_COMMAND,
|
||||
are_the_same_tensors,
|
||||
assert_exception,
|
||||
device_count,
|
||||
execute_subprocess_async,
|
||||
get_launch_command,
|
||||
memory_allocated_func,
|
||||
path_in_accelerate_package,
|
||||
require_bnb,
|
||||
require_cpu,
|
||||
require_cuda,
|
||||
@ -29,7 +13,6 @@ from .testing import (
|
||||
require_multi_gpu,
|
||||
require_multi_xpu,
|
||||
require_non_cpu,
|
||||
require_non_torch_xla,
|
||||
require_non_xpu,
|
||||
require_npu,
|
||||
require_pippy,
|
||||
|
||||
@ -84,14 +84,14 @@ def compare_against_test(base_filename: str, feature_filename: str, parser_only:
|
||||
functionalities off of "examples/nlp_example.py", so if `base_filename` is a script other than
|
||||
`complete_nlp_example.py`, the template script should be included here. Such as `examples/cv_example.py`
|
||||
"""
|
||||
with open(base_filename) as f:
|
||||
with open(base_filename, "r") as f:
|
||||
base_file_contents = f.readlines()
|
||||
with open(os.path.abspath(os.path.join("examples", "nlp_example.py"))) as f:
|
||||
with open(os.path.abspath(os.path.join("examples", "nlp_example.py")), "r") as f:
|
||||
full_file_contents = f.readlines()
|
||||
with open(feature_filename) as f:
|
||||
with open(feature_filename, "r") as f:
|
||||
feature_file_contents = f.readlines()
|
||||
if secondary_filename is not None:
|
||||
with open(secondary_filename) as f:
|
||||
with open(secondary_filename, "r") as f:
|
||||
secondary_file_contents = f.readlines()
|
||||
|
||||
# This is our base, we remove all the code from here in our `full_filename` and `feature_filename` to find the new content
|
||||
|
||||
@ -1,13 +0,0 @@
|
||||
# Copyright 2020 The HuggingFace Team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
@ -1,13 +0,0 @@
|
||||
# Copyright 2020 The HuggingFace Team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
@ -1,3 +1,4 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
@ -60,7 +61,7 @@ def get_dataloaders(accelerator: Accelerator, batch_size: int = 16, model_name:
|
||||
|
||||
def collate_fn(examples):
|
||||
# On TPU it's best to pad everything to the same length or training will be very slow.
|
||||
if accelerator.distributed_type == DistributedType.XLA:
|
||||
if accelerator.distributed_type == DistributedType.TPU:
|
||||
return tokenizer.pad(examples, padding="max_length", max_length=128, return_tensors="pt")
|
||||
return tokenizer.pad(examples, padding="longest", return_tensors="pt")
|
||||
|
||||
@ -181,7 +182,7 @@ def training_function(config, args):
|
||||
accelerator.print("resumed checkpoint performance:", accuracy)
|
||||
accelerator.print("resumed checkpoint's scheduler's lr:", lr_scheduler.get_lr()[0])
|
||||
accelerator.print("resumed optimizers's lr:", optimizer.param_groups[0]["lr"])
|
||||
with open(os.path.join(args.output_dir, f"state_{starting_epoch - 1}.json")) as f:
|
||||
with open(os.path.join(args.output_dir, f"state_{starting_epoch-1}.json"), "r") as f:
|
||||
resumed_state = json.load(f)
|
||||
assert resumed_state["accuracy"] == accuracy, "Accuracy mismatch, loading from checkpoint failed"
|
||||
assert (
|
||||
|
||||
@ -25,10 +25,10 @@ from datasets import load_dataset
|
||||
from torch.utils.data import DataLoader, IterableDataset
|
||||
from transformers import AutoModelForSequenceClassification, AutoTokenizer
|
||||
|
||||
from accelerate import Accelerator, DataLoaderConfiguration, DistributedType
|
||||
from accelerate import Accelerator
|
||||
from accelerate.data_loader import DataLoaderDispatcher
|
||||
from accelerate.test_utils import RegressionDataset, RegressionModel, torch_device
|
||||
from accelerate.utils import is_torch_xla_available, set_seed
|
||||
from accelerate.utils import set_seed
|
||||
|
||||
|
||||
os.environ["TRANSFORMERS_NO_ADVISORY_WARNINGS"] = "true"
|
||||
@ -36,7 +36,7 @@ os.environ["TRANSFORMERS_NO_ADVISORY_WARNINGS"] = "true"
|
||||
|
||||
class ListHandler(logging.Handler):
|
||||
def __init__(self, *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
super(ListHandler, self).__init__(*args, **kwargs)
|
||||
self.logs = []
|
||||
|
||||
def emit(self, record):
|
||||
@ -81,8 +81,7 @@ def get_dataloader(accelerator: Accelerator, use_longest=False):
|
||||
|
||||
|
||||
def get_mrpc_setup(dispatch_batches, split_batches):
|
||||
dataloader_config = DataLoaderConfiguration(dispatch_batches=dispatch_batches, split_batches=split_batches)
|
||||
accelerator = Accelerator(dataloader_config=dataloader_config)
|
||||
accelerator = Accelerator(dispatch_batches=dispatch_batches, split_batches=split_batches)
|
||||
dataloader = get_dataloader(accelerator, not dispatch_batches)
|
||||
model = AutoModelForSequenceClassification.from_pretrained(
|
||||
"hf-internal-testing/mrpc-bert-base-cased", return_dict=True
|
||||
@ -113,8 +112,8 @@ def generate_predictions(model, dataloader, accelerator):
|
||||
def test_torch_metrics(
|
||||
accelerator: Accelerator, num_samples=82, dispatch_batches=False, split_batches=False, batch_size=16
|
||||
):
|
||||
_, ddp_model, dataloader = get_basic_setup(accelerator, num_samples, batch_size)
|
||||
logits, _ = generate_predictions(ddp_model, dataloader, accelerator)
|
||||
model, ddp_model, dataloader = get_basic_setup(accelerator, num_samples, batch_size)
|
||||
logits, targs = generate_predictions(ddp_model, dataloader, accelerator)
|
||||
assert (
|
||||
len(logits) == num_samples
|
||||
), f"Unexpected number of inputs:\n Expected: {num_samples}\n Actual: {len(logits)}"
|
||||
@ -162,7 +161,8 @@ def test_gather_for_metrics_with_non_tensor_objects_iterable_dataset():
|
||||
return len(self.data)
|
||||
|
||||
def __iter__(self):
|
||||
yield from self.data
|
||||
for element in self.data:
|
||||
yield element
|
||||
|
||||
iterable_dataset = DummyIterableDataset([n for n in range(30)])
|
||||
dataloader = DataLoader(iterable_dataset, batch_size=4)
|
||||
@ -194,7 +194,8 @@ def test_gather_for_metrics_with_iterable_dataset():
|
||||
return len(self.data)
|
||||
|
||||
def __iter__(self):
|
||||
yield from self.data
|
||||
for element in self.data:
|
||||
yield element
|
||||
|
||||
iterable_dataset = DummyIterableDataset(torch.as_tensor(range(30)))
|
||||
dataloader = DataLoader(iterable_dataset, batch_size=4)
|
||||
@ -241,26 +242,19 @@ def test_gather_for_metrics_drop_last():
|
||||
|
||||
|
||||
def main():
|
||||
dataloader_config = DataLoaderConfiguration(split_batches=False, dispatch_batches=False)
|
||||
accelerator = Accelerator(dataloader_config=dataloader_config)
|
||||
accelerator = Accelerator(split_batches=False, dispatch_batches=False)
|
||||
if accelerator.is_local_main_process:
|
||||
datasets.utils.logging.set_verbosity_warning()
|
||||
transformers.utils.logging.set_verbosity_warning()
|
||||
else:
|
||||
datasets.utils.logging.set_verbosity_error()
|
||||
transformers.utils.logging.set_verbosity_error()
|
||||
# TorchXLA does not support batch dispatching. 'put_on_device' is always False for
|
||||
# TorchXLA, which can cause a value error in 'prepare_data_loader' function.
|
||||
dispatch_batches_options = [False] if accelerator.state.distributed_type == DistributedType.XLA else [True, False]
|
||||
|
||||
# Temporarily close this test for TorchXLA due to the 'Cannot set version_counter for
|
||||
# inference tensor' error in inference mode. Reopen it after TorchXLA fixes this bug.
|
||||
# These are a bit slower so they should only be ran on the GPU or TPU
|
||||
if accelerator.device.type != "cpu" and not is_torch_xla_available():
|
||||
if accelerator.device.type != "cpu":
|
||||
if accelerator.is_local_main_process:
|
||||
print("**Testing gather_for_metrics**")
|
||||
for split_batches in [True, False]:
|
||||
for dispatch_batches in dispatch_batches_options:
|
||||
for dispatch_batches in [True, False]:
|
||||
if accelerator.is_local_main_process:
|
||||
print(f"With: `split_batches={split_batches}`, `dispatch_batches={dispatch_batches}`")
|
||||
test_mrpc(dispatch_batches, split_batches)
|
||||
@ -269,23 +263,15 @@ def main():
|
||||
test_gather_for_metrics_with_iterable_dataset()
|
||||
print("test gather_for_metrics_with_non_tensor_objects_iterable_dataset")
|
||||
test_gather_for_metrics_with_non_tensor_objects_iterable_dataset()
|
||||
|
||||
# MpDeviceLoader in TorchXLA is an asynchronous loader that preloads several batches into cache.
|
||||
# This can cause the 'end_of_dataloader' of DataLoaderStateMixin to be set earlier than intended.
|
||||
# Skip this test when TorchXLA is enabled.
|
||||
if accelerator.state.distributed_type != DistributedType.XLA:
|
||||
if accelerator.is_local_main_process:
|
||||
print("**Test torch metrics**")
|
||||
for split_batches in [True, False]:
|
||||
for dispatch_batches in dispatch_batches_options:
|
||||
dataloader_config = DataLoaderConfiguration(
|
||||
split_batches=split_batches, dispatch_batches=dispatch_batches
|
||||
)
|
||||
accelerator = Accelerator(dataloader_config=dataloader_config)
|
||||
if accelerator.is_local_main_process:
|
||||
print(f"With: `split_batches={split_batches}`, `dispatch_batches={dispatch_batches}`, length=99")
|
||||
test_torch_metrics(accelerator, 99)
|
||||
accelerator.state._reset_state()
|
||||
if accelerator.is_local_main_process:
|
||||
print("**Test torch metrics**")
|
||||
for split_batches in [True, False]:
|
||||
for dispatch_batches in [True, False]:
|
||||
accelerator = Accelerator(split_batches=split_batches, dispatch_batches=dispatch_batches)
|
||||
if accelerator.is_local_main_process:
|
||||
print(f"With: `split_batches={split_batches}`, `dispatch_batches={dispatch_batches}`, length=99")
|
||||
test_torch_metrics(accelerator, 99)
|
||||
accelerator.state._reset_state()
|
||||
if accelerator.is_local_main_process:
|
||||
print("**Test last batch is not dropped when perfectly divisible**")
|
||||
accelerator = Accelerator()
|
||||
|
||||
@ -1,3 +1,4 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
@ -116,7 +117,7 @@ def get_dataloaders(
|
||||
|
||||
def collate_fn(examples):
|
||||
# On TPU it's best to pad everything to the same length or training will be very slow.
|
||||
if accelerator.distributed_type == DistributedType.XLA:
|
||||
if accelerator.distributed_type == DistributedType.TPU:
|
||||
return tokenizer.pad(examples, padding="max_length", max_length=128, return_tensors="pt")
|
||||
return tokenizer.pad(examples, padding="longest", return_tensors="pt")
|
||||
|
||||
@ -208,11 +209,13 @@ def training_function(config, args):
|
||||
overall_step += 1
|
||||
|
||||
# Printing the GPU memory usage details such as allocated memory, peak memory, and total memory usage
|
||||
accelerator.print(f"Memory before entering the train : {b2mb(tracemalloc.begin)}")
|
||||
accelerator.print(f"Memory consumed at the end of the train (end-begin): {tracemalloc.used}")
|
||||
accelerator.print(f"Peak Memory consumed during the train (max-begin): {tracemalloc.peaked}")
|
||||
accelerator.print("Memory before entering the train : {}".format(b2mb(tracemalloc.begin)))
|
||||
accelerator.print("Memory consumed at the end of the train (end-begin): {}".format(tracemalloc.used))
|
||||
accelerator.print("Peak Memory consumed during the train (max-begin): {}".format(tracemalloc.peaked))
|
||||
accelerator.print(
|
||||
f"Total Peak Memory consumed during the train (max): {tracemalloc.peaked + b2mb(tracemalloc.begin)}"
|
||||
"Total Peak Memory consumed during the train (max): {}".format(
|
||||
tracemalloc.peaked + b2mb(tracemalloc.begin)
|
||||
)
|
||||
)
|
||||
train_total_peak_memory[f"epoch-{epoch}"] = tracemalloc.peaked + b2mb(tracemalloc.begin)
|
||||
if args.peak_memory_upper_bound is not None:
|
||||
|
||||
@ -1,3 +1,4 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
@ -60,7 +61,7 @@ def get_dataloaders(accelerator: Accelerator, batch_size: int = 16, model_name:
|
||||
|
||||
def collate_fn(examples):
|
||||
# On TPU it's best to pad everything to the same length or training will be very slow.
|
||||
if accelerator.distributed_type == DistributedType.XLA:
|
||||
if accelerator.distributed_type == DistributedType.TPU:
|
||||
return tokenizer.pad(examples, padding="max_length", max_length=128, return_tensors="pt")
|
||||
return tokenizer.pad(examples, padding="longest", return_tensors="pt")
|
||||
|
||||
|
||||
@ -1,3 +1,4 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
|
||||
@ -1,16 +1,3 @@
|
||||
# Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import torch
|
||||
|
||||
|
||||
|
||||
@ -22,7 +22,7 @@ from unittest.mock import Mock
|
||||
import torch
|
||||
from torch.utils.data import DataLoader, IterableDataset, TensorDataset
|
||||
|
||||
from accelerate.accelerator import Accelerator, DataLoaderConfiguration
|
||||
from accelerate.accelerator import Accelerator
|
||||
from accelerate.utils.dataclasses import DistributedType
|
||||
|
||||
|
||||
@ -31,12 +31,12 @@ class DummyIterableDataset(IterableDataset):
|
||||
self.data = data
|
||||
|
||||
def __iter__(self):
|
||||
yield from self.data
|
||||
for element in self.data:
|
||||
yield element
|
||||
|
||||
|
||||
def create_accelerator(even_batches=True):
|
||||
dataloader_config = DataLoaderConfiguration(even_batches=even_batches)
|
||||
accelerator = Accelerator(dataloader_config=dataloader_config)
|
||||
accelerator = Accelerator(even_batches=even_batches)
|
||||
assert accelerator.num_processes == 2, "this script expects that two GPUs are available"
|
||||
return accelerator
|
||||
|
||||
|
||||
@ -1,20 +1,4 @@
|
||||
# Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""
|
||||
Test file to ensure that in general certain situational setups for notebooks work.
|
||||
"""
|
||||
|
||||
# Test file to ensure that in general certain situational setups for notebooks work.
|
||||
import os
|
||||
|
||||
from pytest import raises
|
||||
|
||||
@ -41,9 +41,6 @@ def test_gather(state):
|
||||
|
||||
|
||||
def test_gather_object(state):
|
||||
# Gather objects in TorchXLA is not supported.
|
||||
if state.distributed_type == DistributedType.XLA:
|
||||
return
|
||||
obj = [state.process_index]
|
||||
gathered_obj = gather_object(obj)
|
||||
assert len(gathered_obj) == state.num_processes, f"{gathered_obj}, {len(gathered_obj)} != {state.num_processes}"
|
||||
@ -51,9 +48,6 @@ def test_gather_object(state):
|
||||
|
||||
|
||||
def test_gather_non_contigous(state):
|
||||
# Skip this test because the 'is_contiguous' function of XLA tensor always returns True.
|
||||
if state.distributed_type == DistributedType.XLA:
|
||||
return
|
||||
# Create a non-contiguous tensor
|
||||
tensor = torch.arange(12).view(4, 3).t().to(state.device)
|
||||
assert not tensor.is_contiguous()
|
||||
@ -102,8 +96,8 @@ def test_reduce_mean(state):
|
||||
|
||||
|
||||
def test_op_checker(state):
|
||||
# Must be in a distributed state, and gathering is currently not supported in TorchXLA.
|
||||
if state.distributed_type in [DistributedType.NO, DistributedType.XLA]:
|
||||
# Must be in a distributed state
|
||||
if state.distributed_type == DistributedType.NO:
|
||||
return
|
||||
state.debug = True
|
||||
# `pad_across_processes`
|
||||
@ -137,14 +131,14 @@ def test_op_checker(state):
|
||||
|
||||
|
||||
def test_copy_tensor_to_devices(state):
|
||||
if state.distributed_type not in [DistributedType.MULTI_GPU, DistributedType.XLA]:
|
||||
if state.distributed_type not in [DistributedType.MULTI_GPU, DistributedType.TPU]:
|
||||
return
|
||||
if state.is_main_process:
|
||||
tensor = torch.tensor([1, 2, 3], dtype=torch.int).to(state.device)
|
||||
else:
|
||||
tensor = None
|
||||
tensor = copy_tensor_to_devices(tensor)
|
||||
assert torch.allclose(tensor, torch.tensor([1, 2, 3], dtype=torch.int, device=state.device))
|
||||
assert torch.allclose(tensor, torch.tensor([1, 2, 3], dtype=torch.int, device="cuda"))
|
||||
|
||||
|
||||
def _mp_fn(index):
|
||||
|
||||
@ -30,7 +30,6 @@ from accelerate.data_loader import SeedableRandomSampler, prepare_data_loader
|
||||
from accelerate.state import AcceleratorState
|
||||
from accelerate.test_utils import RegressionDataset, are_the_same_tensors
|
||||
from accelerate.utils import (
|
||||
DataLoaderConfiguration,
|
||||
DistributedType,
|
||||
gather,
|
||||
is_bf16_available,
|
||||
@ -96,7 +95,7 @@ def process_execution_check():
|
||||
accelerator.wait_for_everyone()
|
||||
|
||||
if accelerator.is_main_process:
|
||||
with open(path) as f:
|
||||
with open(path, "r") as f:
|
||||
text = "".join(f.readlines())
|
||||
try:
|
||||
assert text.startswith("Currently in the main process\n"), "Main process was not first"
|
||||
@ -356,9 +355,7 @@ def check_seedable_sampler():
|
||||
set_seed(42)
|
||||
train_set = RegressionDataset(length=10, seed=42)
|
||||
train_dl = DataLoader(train_set, batch_size=2, shuffle=True)
|
||||
|
||||
config = DataLoaderConfiguration(use_seedable_sampler=True)
|
||||
accelerator = Accelerator(dataloader_config=config)
|
||||
accelerator = Accelerator(use_seedable_sampler=True)
|
||||
train_dl = accelerator.prepare(train_dl)
|
||||
original_items = []
|
||||
for _ in range(3):
|
||||
@ -413,7 +410,7 @@ def training_check(use_seedable_sampler=False):
|
||||
train_dl, model, optimizer = accelerator.prepare(train_dl, model, optimizer)
|
||||
set_seed(42)
|
||||
generator.manual_seed(42)
|
||||
for _ in range(3):
|
||||
for epoch in range(3):
|
||||
for batch in train_dl:
|
||||
model.zero_grad()
|
||||
output = model(batch["x"])
|
||||
@ -427,8 +424,7 @@ def training_check(use_seedable_sampler=False):
|
||||
|
||||
accelerator.print("Training yielded the same results on one CPU or distributed setup with no batch split.")
|
||||
|
||||
dataloader_config = DataLoaderConfiguration(split_batches=True, use_seedable_sampler=use_seedable_sampler)
|
||||
accelerator = Accelerator(dataloader_config=dataloader_config)
|
||||
accelerator = Accelerator(split_batches=True, use_seedable_sampler=use_seedable_sampler)
|
||||
train_dl = generate_baseline_dataloader(
|
||||
train_set, generator, batch_size * state.num_processes, use_seedable_sampler
|
||||
)
|
||||
@ -456,8 +452,7 @@ def training_check(use_seedable_sampler=False):
|
||||
# Mostly a test that FP16 doesn't crash as the operation inside the model is not converted to FP16
|
||||
print("FP16 training check.")
|
||||
AcceleratorState._reset_state()
|
||||
dataloader_config = DataLoaderConfiguration(use_seedable_sampler=use_seedable_sampler)
|
||||
accelerator = Accelerator(mixed_precision="fp16", dataloader_config=dataloader_config)
|
||||
accelerator = Accelerator(mixed_precision="fp16", use_seedable_sampler=use_seedable_sampler)
|
||||
train_dl = generate_baseline_dataloader(train_set, generator, batch_size, use_seedable_sampler)
|
||||
model = RegressionModel()
|
||||
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
|
||||
@ -497,8 +492,7 @@ def training_check(use_seedable_sampler=False):
|
||||
# Mostly a test that BF16 doesn't crash as the operation inside the model is not converted to BF16
|
||||
print("BF16 training check.")
|
||||
AcceleratorState._reset_state()
|
||||
dataloader_config = DataLoaderConfiguration(use_seedable_sampler=use_seedable_sampler)
|
||||
accelerator = Accelerator(mixed_precision="bf16", dataloader_config=dataloader_config)
|
||||
accelerator = Accelerator(mixed_precision="bf16", use_seedable_sampler=use_seedable_sampler)
|
||||
train_dl = generate_baseline_dataloader(train_set, generator, batch_size, use_seedable_sampler)
|
||||
model = RegressionModel()
|
||||
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
|
||||
@ -522,8 +516,7 @@ def training_check(use_seedable_sampler=False):
|
||||
if is_ipex_available():
|
||||
print("ipex BF16 training check.")
|
||||
AcceleratorState._reset_state()
|
||||
dataloader_config = DataLoaderConfiguration(use_seedable_sampler=use_seedable_sampler)
|
||||
accelerator = Accelerator(mixed_precision="bf16", cpu=True, dataloader_config=dataloader_config)
|
||||
accelerator = Accelerator(mixed_precision="bf16", cpu=True, use_seedable_sampler=use_seedable_sampler)
|
||||
train_dl = generate_baseline_dataloader(train_set, generator, batch_size, use_seedable_sampler)
|
||||
model = RegressionModel()
|
||||
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
|
||||
@ -547,8 +540,7 @@ def training_check(use_seedable_sampler=False):
|
||||
if is_xpu_available():
|
||||
print("xpu BF16 training check.")
|
||||
AcceleratorState._reset_state()
|
||||
dataloader_config = DataLoaderConfiguration(use_seedable_sampler=use_seedable_sampler)
|
||||
accelerator = Accelerator(mixed_precision="bf16", cpu=False, dataloader_config=dataloader_config)
|
||||
accelerator = Accelerator(mixed_precision="bf16", cpu=False, use_seedable_sampler=use_seedable_sampler)
|
||||
train_dl = generate_baseline_dataloader(train_set, generator, batch_size, use_seedable_sampler)
|
||||
model = RegressionModel()
|
||||
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
|
||||
@ -693,10 +685,10 @@ def main():
|
||||
if state.local_process_index == 0:
|
||||
print("\n**DataLoader integration test**")
|
||||
dl_preparation_check()
|
||||
if state.distributed_type != DistributedType.XLA:
|
||||
if state.distributed_type != DistributedType.TPU:
|
||||
central_dl_preparation_check()
|
||||
custom_sampler_check()
|
||||
check_seedable_sampler()
|
||||
custom_sampler_check()
|
||||
check_seedable_sampler()
|
||||
|
||||
# Trainings are not exactly the same in DeepSpeed and CPU mode
|
||||
if state.distributed_type == DistributedType.DEEPSPEED:
|
||||
|
||||
@ -315,8 +315,7 @@ def main():
|
||||
state = accelerator.state
|
||||
if state.local_process_index == 0:
|
||||
print("**Test `accumulate` gradient accumulation with dataloader break**")
|
||||
if state.distributed_type != DistributedType.XLA:
|
||||
test_dataloader_break()
|
||||
test_dataloader_break()
|
||||
if state.distributed_type == DistributedType.NO:
|
||||
if state.local_process_index == 0:
|
||||
print("**Test NOOP `no_sync` context manager**")
|
||||
|
||||
@ -13,7 +13,6 @@
|
||||
# limitations under the License.
|
||||
|
||||
import asyncio
|
||||
import inspect
|
||||
import os
|
||||
import shutil
|
||||
import subprocess
|
||||
@ -28,8 +27,6 @@ from unittest import mock
|
||||
|
||||
import torch
|
||||
|
||||
import accelerate
|
||||
|
||||
from ..state import AcceleratorState, PartialState
|
||||
from ..utils import (
|
||||
gather,
|
||||
@ -47,7 +44,7 @@ from ..utils import (
|
||||
is_tensorboard_available,
|
||||
is_timm_available,
|
||||
is_torch_version,
|
||||
is_torch_xla_available,
|
||||
is_tpu_available,
|
||||
is_transformers_available,
|
||||
is_wandb_available,
|
||||
is_xpu_available,
|
||||
@ -56,9 +53,7 @@ from ..utils import (
|
||||
|
||||
|
||||
def get_backend():
|
||||
if is_torch_xla_available():
|
||||
return "xla", torch.cuda.device_count(), torch.cuda.memory_allocated
|
||||
elif is_cuda_available():
|
||||
if is_cuda_available():
|
||||
return "cuda", torch.cuda.device_count(), torch.cuda.memory_allocated
|
||||
elif is_mps_available():
|
||||
return "mps", 1, torch.mps.current_allocated_memory()
|
||||
@ -73,28 +68,6 @@ def get_backend():
|
||||
torch_device, device_count, memory_allocated_func = get_backend()
|
||||
|
||||
|
||||
def get_launch_command(**kwargs) -> list:
|
||||
"""
|
||||
Wraps around `kwargs` to help simplify launching from `subprocess`.
|
||||
|
||||
Example:
|
||||
```python
|
||||
# returns ['accelerate', 'launch', '--num_processes=2', '--device_count=2']
|
||||
get_launch_command(num_processes=2, device_count=2)
|
||||
```
|
||||
"""
|
||||
command = ["accelerate", "launch"]
|
||||
for k, v in kwargs.items():
|
||||
if isinstance(v, bool) and v:
|
||||
command.append(f"--{k}")
|
||||
elif v is not None:
|
||||
command.append(f"--{k}={v}")
|
||||
return command
|
||||
|
||||
|
||||
DEFAULT_LAUNCH_COMMAND = get_launch_command(num_processes=device_count)
|
||||
|
||||
|
||||
def parse_flag_from_env(key, default=False):
|
||||
try:
|
||||
value = os.environ[key]
|
||||
@ -144,10 +117,9 @@ def require_non_cpu(test_case):
|
||||
|
||||
def require_cuda(test_case):
|
||||
"""
|
||||
Decorator marking a test that requires CUDA. These tests are skipped when there are no GPU available or when
|
||||
TorchXLA is available.
|
||||
Decorator marking a test that requires CUDA. These tests are skipped when there are no GPU available.
|
||||
"""
|
||||
return unittest.skipUnless(is_cuda_available() and not is_torch_xla_available(), "test requires a GPU")(test_case)
|
||||
return unittest.skipUnless(is_cuda_available(), "test requires a GPU")(test_case)
|
||||
|
||||
|
||||
def require_xpu(test_case):
|
||||
@ -184,8 +156,7 @@ def require_huggingface_suite(test_case):
|
||||
Decorator marking a test that requires transformers and datasets. These tests are skipped when they are not.
|
||||
"""
|
||||
return unittest.skipUnless(
|
||||
is_transformers_available() and is_datasets_available(),
|
||||
"test requires the Hugging Face suite",
|
||||
is_transformers_available() and is_datasets_available(), "test requires the Hugging Face suite"
|
||||
)(test_case)
|
||||
|
||||
|
||||
@ -214,15 +185,7 @@ def require_tpu(test_case):
|
||||
"""
|
||||
Decorator marking a test that requires TPUs. These tests are skipped when there are no TPUs available.
|
||||
"""
|
||||
return unittest.skipUnless(is_torch_xla_available(check_is_tpu=True), "test requires TPU")(test_case)
|
||||
|
||||
|
||||
def require_non_torch_xla(test_case):
|
||||
"""
|
||||
Decorator marking a test as requiring an environment without TorchXLA. These tests are skipped when TorchXLA is
|
||||
available.
|
||||
"""
|
||||
return unittest.skipUnless(not is_torch_xla_available(), "test requires an env without TorchXLA")(test_case)
|
||||
return unittest.skipUnless(is_tpu_available(), "test requires TPU")(test_case)
|
||||
|
||||
|
||||
def require_single_device(test_case):
|
||||
@ -380,7 +343,7 @@ class TempDirTestCase(unittest.TestCase):
|
||||
@classmethod
|
||||
def setUpClass(cls):
|
||||
"Creates a `tempfile.TemporaryDirectory` and stores it in `cls.tmpdir`"
|
||||
cls.tmpdir = Path(tempfile.mkdtemp())
|
||||
cls.tmpdir = tempfile.mkdtemp()
|
||||
|
||||
@classmethod
|
||||
def tearDownClass(cls):
|
||||
@ -391,7 +354,7 @@ class TempDirTestCase(unittest.TestCase):
|
||||
def setUp(self):
|
||||
"Destroy all contents in `self.tmpdir`, but not `self.tmpdir`"
|
||||
if self.clear_on_setup:
|
||||
for path in self.tmpdir.glob("**/*"):
|
||||
for path in Path(self.tmpdir).glob("**/*"):
|
||||
if path.is_file():
|
||||
path.unlink()
|
||||
elif path.is_dir():
|
||||
@ -513,11 +476,7 @@ async def _stream_subprocess(cmd, env=None, stdin=None, timeout=None, quiet=Fals
|
||||
return _RunOutput(await p.wait(), out, err)
|
||||
|
||||
|
||||
def execute_subprocess_async(cmd: list, env=None, stdin=None, timeout=180, quiet=False, echo=True) -> _RunOutput:
|
||||
# Cast every path in `cmd` to a string
|
||||
for i, c in enumerate(cmd):
|
||||
if isinstance(c, Path):
|
||||
cmd[i] = str(c)
|
||||
def execute_subprocess_async(cmd, env=None, stdin=None, timeout=180, quiet=False, echo=True) -> _RunOutput:
|
||||
loop = asyncio.get_event_loop()
|
||||
result = loop.run_until_complete(
|
||||
_stream_subprocess(cmd, env=env, stdin=stdin, timeout=timeout, quiet=quiet, echo=echo)
|
||||
@ -543,10 +502,6 @@ def run_command(command: List[str], return_stdout=False, env=None):
|
||||
Runs `command` with `subprocess.check_output` and will potentially return the `stdout`. Will also properly capture
|
||||
if an error occured while running `command`
|
||||
"""
|
||||
# Cast every path in `command` to a string
|
||||
for i, c in enumerate(command):
|
||||
if isinstance(c, Path):
|
||||
command[i] = str(c)
|
||||
if env is None:
|
||||
env = os.environ.copy()
|
||||
try:
|
||||
@ -561,21 +516,6 @@ def run_command(command: List[str], return_stdout=False, env=None):
|
||||
) from e
|
||||
|
||||
|
||||
def path_in_accelerate_package(*components: str) -> Path:
|
||||
"""
|
||||
Get a path within the `accelerate` package's directory.
|
||||
|
||||
Args:
|
||||
*components: Components of the path to join after the package directory.
|
||||
|
||||
Returns:
|
||||
`Path`: The path to the requested file or directory.
|
||||
"""
|
||||
|
||||
accelerate_package_dir = Path(inspect.getfile(accelerate)).parent
|
||||
return accelerate_package_dir.joinpath(*components)
|
||||
|
||||
|
||||
@contextmanager
|
||||
def assert_exception(exception_class: Exception, msg: str = None) -> bool:
|
||||
"""
|
||||
|
||||
@ -90,7 +90,7 @@ def mocked_dataloaders(accelerator, batch_size: int = 16):
|
||||
|
||||
def collate_fn(examples):
|
||||
# On TPU it's best to pad everything to the same length or training will be very slow.
|
||||
if accelerator.distributed_type == DistributedType.XLA:
|
||||
if accelerator.distributed_type == DistributedType.TPU:
|
||||
return tokenizer.pad(examples, padding="max_length", max_length=128, return_tensors="pt")
|
||||
return tokenizer.pad(examples, padding="longest", return_tensors="pt")
|
||||
|
||||
|
||||
@ -618,13 +618,13 @@ class MLflowTracker(GeneralTracker):
|
||||
run_name: Optional[str] = None,
|
||||
description: Optional[str] = None,
|
||||
):
|
||||
experiment_name = os.environ.get("MLFLOW_EXPERIMENT_NAME", experiment_name)
|
||||
run_id = os.environ.get("MLFLOW_RUN_ID", run_id)
|
||||
tags = os.environ.get("MLFLOW_TAGS", tags)
|
||||
experiment_name = os.getenv("MLFLOW_EXPERIMENT_NAME", experiment_name)
|
||||
run_id = os.getenv("MLFLOW_RUN_ID", run_id)
|
||||
tags = os.getenv("MLFLOW_TAGS", tags)
|
||||
if isinstance(tags, str):
|
||||
tags = json.loads(tags)
|
||||
|
||||
nested_run = os.environ.get("MLFLOW_NESTED_RUN", nested_run)
|
||||
nested_run = os.getenv("MLFLOW_NESTED_RUN", nested_run)
|
||||
|
||||
import mlflow
|
||||
|
||||
@ -1011,7 +1011,7 @@ def filter_trackers(
|
||||
if log_type not in loggers:
|
||||
if log_type in get_available_trackers():
|
||||
tracker_init = LOGGER_TYPE_TO_CLASS[str(log_type)]
|
||||
if tracker_init.requires_logging_directory:
|
||||
if getattr(tracker_init, "requires_logging_directory"):
|
||||
if logging_dir is None:
|
||||
raise ValueError(
|
||||
f"Logging with `{log_type}` requires a `logging_dir` to be passed in."
|
||||
|
||||
@ -1,16 +1,3 @@
|
||||
# Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
from .constants import (
|
||||
MODEL_NAME,
|
||||
OPTIMIZER_NAME,
|
||||
@ -31,7 +18,6 @@ from .dataclasses import (
|
||||
BnbQuantizationConfig,
|
||||
ComputeEnvironment,
|
||||
CustomDtype,
|
||||
DataLoaderConfiguration,
|
||||
DeepSpeedPlugin,
|
||||
DistributedDataParallelKwargs,
|
||||
DistributedType,
|
||||
@ -89,7 +75,7 @@ from .imports import (
|
||||
is_sagemaker_available,
|
||||
is_tensorboard_available,
|
||||
is_timm_available,
|
||||
is_torch_xla_available,
|
||||
is_tpu_available,
|
||||
is_transformer_engine_available,
|
||||
is_transformers_available,
|
||||
is_wandb_available,
|
||||
|
||||
@ -282,30 +282,6 @@ class BaseEnum(enum.Enum, metaclass=EnumWithContains):
|
||||
return list(map(str, cls))
|
||||
|
||||
|
||||
class DeprecatedFieldDescriptor:
|
||||
"""
|
||||
Descriptor for deprecated fields in an enum class.
|
||||
|
||||
Args:
|
||||
field_name (`str`):
|
||||
The name of the deprecated field.
|
||||
replaced_with (`str`):
|
||||
The name of the field that replaces the deprecated one.
|
||||
"""
|
||||
|
||||
def __init__(self, field_name, replaced_with):
|
||||
self.field_name = field_name
|
||||
self.replaced_with = replaced_with
|
||||
|
||||
def __get__(self, instance, owner):
|
||||
warnings.warn(
|
||||
f"The `{self.field_name}` of `{owner}` is deprecated and will be removed in v1.0.0. "
|
||||
f"Please use the `{self.replaced_with}` instead.",
|
||||
FutureWarning,
|
||||
)
|
||||
return getattr(owner, self.replaced_with)
|
||||
|
||||
|
||||
class DistributedType(str, enum.Enum):
|
||||
"""
|
||||
Represents a type of distributed environment.
|
||||
@ -318,8 +294,7 @@ class DistributedType(str, enum.Enum):
|
||||
- **MULTI_NPU** -- Distributed on multiple NPUs.
|
||||
- **MULTI_XPU** -- Distributed on multiple XPUs.
|
||||
- **DEEPSPEED** -- Using DeepSpeed.
|
||||
- **XLA** -- Using TorchXLA.
|
||||
- **TPU** -- This field will be deprecated in v0.27.0. Use XLA instead.
|
||||
- **TPU** -- Distributed on TPUs.
|
||||
"""
|
||||
|
||||
# Subclassing str as well as Enum allows the `DistributedType` to be JSON-serializable out of the box.
|
||||
@ -330,9 +305,8 @@ class DistributedType(str, enum.Enum):
|
||||
MULTI_XPU = "MULTI_XPU"
|
||||
DEEPSPEED = "DEEPSPEED"
|
||||
FSDP = "FSDP"
|
||||
XLA = "XLA"
|
||||
TPU = "TPU"
|
||||
MEGATRON_LM = "MEGATRON_LM"
|
||||
TPU = DeprecatedFieldDescriptor("TPU", "XLA")
|
||||
|
||||
|
||||
class SageMakerDistributedType(str, enum.Enum):
|
||||
@ -469,7 +443,6 @@ class CustomDtype(enum.Enum):
|
||||
|
||||
FP8 = "fp8"
|
||||
INT4 = "int4"
|
||||
INT2 = "int2"
|
||||
|
||||
|
||||
# data classes
|
||||
@ -481,48 +454,6 @@ class TensorInformation:
|
||||
dtype: torch.dtype
|
||||
|
||||
|
||||
@dataclass
|
||||
class DataLoaderConfiguration:
|
||||
"""
|
||||
Configuration for dataloader-related items when calling `accelerator.prepare`.
|
||||
"""
|
||||
|
||||
split_batches: bool = field(
|
||||
default=False,
|
||||
metadata={
|
||||
"help": "Whether or not the accelerator should split the batches yielded by the dataloaders across the devices. If"
|
||||
" `True` the actual batch size used will be the same on any kind of distributed processes, but it must be a"
|
||||
" round multiple of the `num_processes` you are using. If `False`, actual batch size used will be the one set"
|
||||
" in your script multiplied by the number of processes."
|
||||
},
|
||||
)
|
||||
dispatch_batches: bool = field(
|
||||
default=None,
|
||||
metadata={
|
||||
"help": "If set to `True`, the dataloader prepared by the Accelerator is only iterated through on the main process"
|
||||
" and then the batches are split and broadcast to each process. Will default to `True` for `DataLoader` whose"
|
||||
" underlying dataset is an `IterableDataslet`, `False` otherwise."
|
||||
},
|
||||
)
|
||||
even_batches: bool = field(
|
||||
default=True,
|
||||
metadata={
|
||||
"help": "If set to `True`, in cases where the total batch size across all processes does not exactly divide the"
|
||||
" dataset, samples at the start of the dataset will be duplicated so the batch can be divided equally among"
|
||||
" all workers."
|
||||
},
|
||||
)
|
||||
use_seedable_sampler: bool = field(
|
||||
default=False,
|
||||
metadata={
|
||||
"help": "Whether or not use a fully seedable random sampler ([`data_loader.SeedableRandomSampler`])."
|
||||
"Ensures training results are fully reproducable using a different sampling technique. "
|
||||
"While seed-to-seed results may differ, on average the differences are neglible when using"
|
||||
"multiple different seeds to compare. Should also be ran with [`~utils.set_seed`] for the best results."
|
||||
},
|
||||
)
|
||||
|
||||
|
||||
@dataclass
|
||||
class ProjectConfiguration:
|
||||
"""
|
||||
@ -574,26 +505,7 @@ class ProjectConfiguration:
|
||||
@dataclass
|
||||
class GradientAccumulationPlugin(KwargsHandler):
|
||||
"""
|
||||
A plugin to configure gradient accumulation behavior. You can only pass one of `gradient_accumulation_plugin` or
|
||||
`gradient_accumulation_steps` to [`Accelerator`]. Passing both raises an error.
|
||||
|
||||
Parameters:
|
||||
num_steps (`int`):
|
||||
The number of steps to accumulate gradients for.
|
||||
adjust_scheduler (`bool`, *optional*, defaults to `True`):
|
||||
Whether to adjust the scheduler steps to account for the number of steps being accumulated. Should be
|
||||
`True` if the used scheduler was not adjusted for gradient accumulation.
|
||||
sync_with_dataloader (`bool`, *optional*, defaults to `True`):
|
||||
Whether to synchronize setting the gradients when at the end of the dataloader.
|
||||
|
||||
Example:
|
||||
|
||||
```python
|
||||
from accelerate.utils import GradientAccumulationPlugin
|
||||
|
||||
gradient_accumulation_plugin = GradientAccumulationPlugin(num_steps=2)
|
||||
accelerator = Accelerator(gradient_accumulation_plugin=gradient_accumulation_plugin)
|
||||
```
|
||||
A plugin to configure gradient accumulation behavior.
|
||||
"""
|
||||
|
||||
num_steps: int = field(default=None, metadata={"help": "The number of steps to accumulate gradients for."})
|
||||
|
||||
@ -13,6 +13,7 @@
|
||||
# limitations under the License.
|
||||
|
||||
import base64
|
||||
import io
|
||||
import json
|
||||
import os
|
||||
from copy import deepcopy
|
||||
@ -44,7 +45,7 @@ class HfDeepSpeedConfig:
|
||||
# modified it, it will not be accepted here again, since `auto` values would have been overridden
|
||||
config = deepcopy(config_file_or_dict)
|
||||
elif os.path.exists(config_file_or_dict):
|
||||
with open(config_file_or_dict, encoding="utf-8") as f:
|
||||
with io.open(config_file_or_dict, "r", encoding="utf-8") as f:
|
||||
config = json.load(f)
|
||||
else:
|
||||
try:
|
||||
|
||||
@ -17,7 +17,7 @@ import platform
|
||||
import subprocess
|
||||
import sys
|
||||
from shutil import which
|
||||
from typing import List
|
||||
from typing import Dict
|
||||
|
||||
import torch
|
||||
|
||||
@ -57,9 +57,9 @@ def parse_choice_from_env(key, default="no"):
|
||||
return value
|
||||
|
||||
|
||||
def are_libraries_initialized(*library_names: str) -> List[str]:
|
||||
def are_libraries_initialized(*library_names: str) -> Dict[str, bool]:
|
||||
"""
|
||||
Checks if any of `library_names` are imported in the environment. Will return any names that are.
|
||||
Checks if any of `library_names` are imported in the environment. Will return results as a `key:bool` pair.
|
||||
"""
|
||||
return [lib_name for lib_name in library_names if lib_name in sys.modules.keys()]
|
||||
|
||||
|
||||
@ -26,21 +26,13 @@ from .environment import parse_flag_from_env, str_to_bool
|
||||
from .versions import compare_versions, is_torch_version
|
||||
|
||||
|
||||
# Try to run Torch native job in an environment with TorchXLA installed by setting this value to 0.
|
||||
USE_TORCH_XLA = parse_flag_from_env("USE_TORCH_XLA", default=True)
|
||||
try:
|
||||
import torch_xla.core.xla_model as xm # noqa: F401
|
||||
|
||||
_torch_xla_available = False
|
||||
if USE_TORCH_XLA:
|
||||
try:
|
||||
import torch_xla.core.xla_model as xm # noqa: F401
|
||||
import torch_xla.runtime
|
||||
_tpu_available = True
|
||||
except ImportError:
|
||||
_tpu_available = False
|
||||
|
||||
_torch_xla_available = True
|
||||
except ImportError:
|
||||
pass
|
||||
|
||||
# Keep it for is_tpu_available. It will be removed along with is_tpu_available.
|
||||
_tpu_available = _torch_xla_available
|
||||
|
||||
# Cache this result has it's a C FFI call which can be pretty time-consuming
|
||||
_torch_distributed_available = torch.distributed.is_available()
|
||||
@ -109,11 +101,6 @@ def is_cuda_available():
|
||||
@lru_cache
|
||||
def is_tpu_available(check_device=True):
|
||||
"Checks if `torch_xla` is installed and potentially if a TPU is in the environment"
|
||||
warnings.warn(
|
||||
"`is_tpu_available` is deprecated and will be removed in v0.27.0. "
|
||||
"Please use the `is_torch_xla_available` instead.",
|
||||
FutureWarning,
|
||||
)
|
||||
# Due to bugs on the amp series GPUs, we disable torch-xla on them
|
||||
if is_cuda_available():
|
||||
return False
|
||||
@ -128,24 +115,6 @@ def is_tpu_available(check_device=True):
|
||||
return _tpu_available
|
||||
|
||||
|
||||
@lru_cache
|
||||
def is_torch_xla_available(check_is_tpu=False, check_is_gpu=False):
|
||||
"""
|
||||
Check if `torch_xla` is available. To train a native pytorch job in an environment with torch xla installed, set
|
||||
the USE_TORCH_XLA to false.
|
||||
"""
|
||||
assert not (check_is_tpu and check_is_gpu), "The check_is_tpu and check_is_gpu cannot both be true."
|
||||
|
||||
if not _torch_xla_available:
|
||||
return False
|
||||
elif check_is_gpu:
|
||||
return torch_xla.runtime.device_type() in ["GPU", "CUDA"]
|
||||
elif check_is_tpu:
|
||||
return torch_xla.runtime.device_type() == "TPU"
|
||||
|
||||
return True
|
||||
|
||||
|
||||
def is_deepspeed_available():
|
||||
return _is_package_available("deepspeed")
|
||||
|
||||
@ -160,7 +129,7 @@ def is_pippy_available():
|
||||
|
||||
def is_bf16_available(ignore_tpu=False):
|
||||
"Checks if bf16 is supported, optionally ignoring the TPU"
|
||||
if is_torch_xla_available(check_is_tpu=True):
|
||||
if is_tpu_available():
|
||||
return not ignore_tpu
|
||||
if is_cuda_available():
|
||||
return torch.cuda.is_bf16_supported()
|
||||
|
||||
@ -27,7 +27,6 @@ from ..utils import (
|
||||
PrecisionType,
|
||||
is_ipex_available,
|
||||
is_npu_available,
|
||||
is_torch_xla_available,
|
||||
is_xpu_available,
|
||||
)
|
||||
from ..utils.constants import DEEPSPEED_MULTINODE_LAUNCHERS
|
||||
@ -109,23 +108,23 @@ def prepare_multi_gpu_env(args: argparse.Namespace) -> Dict[str, str]:
|
||||
"""
|
||||
Prepares and returns an environment with the correct multi-GPU environment variables.
|
||||
"""
|
||||
num_processes = args.num_processes
|
||||
num_machines = args.num_machines
|
||||
main_process_ip = args.main_process_ip
|
||||
main_process_port = args.main_process_port
|
||||
num_processes = getattr(args, "num_processes")
|
||||
num_machines = getattr(args, "num_machines")
|
||||
main_process_ip = getattr(args, "main_process_ip")
|
||||
main_process_port = getattr(args, "main_process_port")
|
||||
if num_machines > 1:
|
||||
args.nproc_per_node = str(num_processes // num_machines)
|
||||
args.nnodes = str(num_machines)
|
||||
args.node_rank = int(args.machine_rank)
|
||||
setattr(args, "nproc_per_node", str(num_processes // num_machines))
|
||||
setattr(args, "nnodes", str(num_machines))
|
||||
setattr(args, "node_rank", int(args.machine_rank))
|
||||
if getattr(args, "same_network", False):
|
||||
args.master_addr = str(main_process_ip)
|
||||
args.master_port = str(main_process_port)
|
||||
setattr(args, "master_addr", str(main_process_ip))
|
||||
setattr(args, "master_port", str(main_process_port))
|
||||
else:
|
||||
args.rdzv_endpoint = f"{main_process_ip}:{main_process_port}"
|
||||
setattr(args, "rdzv_endpoint", f"{main_process_ip}:{main_process_port}")
|
||||
else:
|
||||
args.nproc_per_node = str(num_processes)
|
||||
setattr(args, "nproc_per_node", str(num_processes))
|
||||
if main_process_port is not None:
|
||||
args.master_port = str(main_process_port)
|
||||
setattr(args, "master_port", str(main_process_port))
|
||||
|
||||
if main_process_port is None:
|
||||
main_process_port = 29500
|
||||
@ -136,16 +135,16 @@ def prepare_multi_gpu_env(args: argparse.Namespace) -> Dict[str, str]:
|
||||
if need_port_check and is_port_in_use(main_process_port):
|
||||
raise ConnectionError(
|
||||
f"Tried to launch distributed communication on port `{main_process_port}`, but another process is utilizing it. "
|
||||
"Please specify a different port (such as using the `--main_process_port` flag or specifying a different `main_process_port` in your config file)"
|
||||
"Please specify a different port (such as using the `----main_process_port` flag or specifying a different `main_process_port` in your config file)"
|
||||
" and rerun your script. To automatically use the next open port (on a single node), you can set this to `0`."
|
||||
)
|
||||
|
||||
if args.module and args.no_python:
|
||||
raise ValueError("--module and --no_python cannot be used together")
|
||||
elif args.module:
|
||||
args.module = True
|
||||
setattr(args, "module", True)
|
||||
elif args.no_python:
|
||||
args.no_python = True
|
||||
setattr(args, "no_python", True)
|
||||
|
||||
current_env = os.environ.copy()
|
||||
if args.debug:
|
||||
@ -228,16 +227,16 @@ def prepare_deepspeed_cmd_env(args: argparse.Namespace) -> Tuple[List[str], Dict
|
||||
"""
|
||||
Prepares and returns the command list and an environment with the correct DeepSpeed environment variables.
|
||||
"""
|
||||
num_processes = args.num_processes
|
||||
num_machines = args.num_machines
|
||||
main_process_ip = args.main_process_ip
|
||||
main_process_port = args.main_process_port
|
||||
num_processes = getattr(args, "num_processes")
|
||||
num_machines = getattr(args, "num_machines")
|
||||
main_process_ip = getattr(args, "main_process_ip")
|
||||
main_process_port = getattr(args, "main_process_port")
|
||||
cmd = None
|
||||
|
||||
# make sure launcher is not None
|
||||
if args.deepspeed_multinode_launcher is None:
|
||||
# set to default pdsh
|
||||
args.deepspeed_multinode_launcher = DEEPSPEED_MULTINODE_LAUNCHERS[0]
|
||||
setattr(args, "deepspeed_multinode_launcher", DEEPSPEED_MULTINODE_LAUNCHERS[0])
|
||||
|
||||
if num_machines > 1 and args.deepspeed_multinode_launcher != DEEPSPEED_MULTINODE_LAUNCHERS[1]:
|
||||
cmd = ["deepspeed", "--no_local_rank"]
|
||||
@ -268,18 +267,18 @@ def prepare_deepspeed_cmd_env(args: argparse.Namespace) -> Tuple[List[str], Dict
|
||||
cmd.append(args.training_script)
|
||||
cmd.extend(args.training_script_args)
|
||||
elif num_machines > 1 and args.deepspeed_multinode_launcher == DEEPSPEED_MULTINODE_LAUNCHERS[1]:
|
||||
args.nproc_per_node = str(num_processes // num_machines)
|
||||
args.nnodes = str(num_machines)
|
||||
args.node_rank = int(args.machine_rank)
|
||||
setattr(args, "nproc_per_node", str(num_processes // num_machines))
|
||||
setattr(args, "nnodes", str(num_machines))
|
||||
setattr(args, "node_rank", int(args.machine_rank))
|
||||
if getattr(args, "same_network", False):
|
||||
args.master_addr = str(main_process_ip)
|
||||
args.master_port = str(main_process_port)
|
||||
setattr(args, "master_addr", str(main_process_ip))
|
||||
setattr(args, "master_port", str(main_process_port))
|
||||
else:
|
||||
args.rdzv_endpoint = f"{main_process_ip}:{main_process_port}"
|
||||
setattr(args, "rdzv_endpoint", f"{main_process_ip}:{main_process_port}")
|
||||
else:
|
||||
args.nproc_per_node = str(num_processes)
|
||||
setattr(args, "nproc_per_node", str(num_processes))
|
||||
if main_process_port is not None:
|
||||
args.master_port = str(main_process_port)
|
||||
setattr(args, "master_port", str(main_process_port))
|
||||
|
||||
if main_process_port is None:
|
||||
main_process_port = 29500
|
||||
@ -297,9 +296,9 @@ def prepare_deepspeed_cmd_env(args: argparse.Namespace) -> Tuple[List[str], Dict
|
||||
if args.module and args.no_python:
|
||||
raise ValueError("--module and --no_python cannot be used together")
|
||||
elif args.module:
|
||||
args.module = True
|
||||
setattr(args, "module", True)
|
||||
elif args.no_python:
|
||||
args.no_python = True
|
||||
setattr(args, "no_python", True)
|
||||
|
||||
current_env = os.environ.copy()
|
||||
if args.debug:
|
||||
@ -348,7 +347,7 @@ def prepare_tpu(
|
||||
"""
|
||||
Prepares and returns an environment with the correct TPU environment variables.
|
||||
"""
|
||||
if args.mixed_precision == "bf16" and is_torch_xla_available(check_is_tpu=True):
|
||||
if args.mixed_precision == "bf16":
|
||||
if args.downcast_bf16:
|
||||
current_env["XLA_DOWNCAST_BF16"] = "1"
|
||||
else:
|
||||
@ -418,7 +417,9 @@ def prepare_sagemager_args_inputs(
|
||||
os.environ["AWS_ACCESS_KEY_ID"] = args.aws_access_key_id
|
||||
os.environ["AWS_SECRET_ACCESS_KEY"] = args.aws_secret_access_key
|
||||
else:
|
||||
raise OSError("You need to provide an aws_access_key_id and aws_secret_access_key when not using aws_profile")
|
||||
raise EnvironmentError(
|
||||
"You need to provide an aws_access_key_id and aws_secret_access_key when not using aws_profile"
|
||||
)
|
||||
|
||||
# extract needed arguments
|
||||
source_dir = os.path.dirname(args.training_script)
|
||||
|
||||
@ -237,7 +237,7 @@ class MegatronLMDummyDataLoader:
|
||||
data_sharding=args.data_sharding,
|
||||
)
|
||||
else:
|
||||
raise Exception(f"{args.dataloader_type} dataloader type is not supported.")
|
||||
raise Exception("{} dataloader type is not supported.".format(args.dataloader_type))
|
||||
|
||||
# Torch dataloader.
|
||||
return torch.utils.data.DataLoader(
|
||||
@ -247,7 +247,8 @@ class MegatronLMDummyDataLoader:
|
||||
def build_train_valid_test_data_iterators(self):
|
||||
def cyclic_iter(iter):
|
||||
while True:
|
||||
yield from iter
|
||||
for x in iter:
|
||||
yield x
|
||||
|
||||
args = get_args()
|
||||
|
||||
@ -280,9 +281,9 @@ class MegatronLMDummyDataLoader:
|
||||
test_iters * args.global_batch_size,
|
||||
]
|
||||
print_rank_0(" > datasets target sizes (minimum size):")
|
||||
print_rank_0(f" train: {train_val_test_num_samples[0]}")
|
||||
print_rank_0(f" validation: {train_val_test_num_samples[1]}")
|
||||
print_rank_0(f" test: {train_val_test_num_samples[2]}")
|
||||
print_rank_0(" train: {}".format(train_val_test_num_samples[0]))
|
||||
print_rank_0(" validation: {}".format(train_val_test_num_samples[1]))
|
||||
print_rank_0(" test: {}".format(train_val_test_num_samples[2]))
|
||||
|
||||
# Build the datasets.
|
||||
train_valid_test_datasets_provider = self.get_train_valid_test_datasets_provider()
|
||||
@ -844,7 +845,8 @@ def initialize(accelerator, extra_args_provider=None, args_defaults={}):
|
||||
if getattr(args, key, None) is not None:
|
||||
if args.rank == 0:
|
||||
print(
|
||||
f"WARNING: overriding default arguments for " f"{key}:{getattr(args, key)} with {key}:{value}",
|
||||
"WARNING: overriding default arguments for {key}:{v} \
|
||||
with {key}:{v2}".format(key=key, v=getattr(args, key), v2=value),
|
||||
flush=True,
|
||||
)
|
||||
setattr(args, key, value)
|
||||
@ -887,7 +889,7 @@ def initialize(accelerator, extra_args_provider=None, args_defaults={}):
|
||||
|
||||
# Random seeds for reproducibility.
|
||||
if args.rank == 0:
|
||||
print(f"> setting random seeds to {args.seed} ...")
|
||||
print("> setting random seeds to {} ...".format(args.seed))
|
||||
_set_random_seed(args.seed, args.data_parallel_random_init)
|
||||
|
||||
args = get_args()
|
||||
@ -924,7 +926,7 @@ class MegatronEngine(torch.nn.Module):
|
||||
"""
|
||||
|
||||
def __init__(self, accelerator, model, optimizer, scheduler):
|
||||
super().__init__()
|
||||
super(MegatronEngine, self).__init__()
|
||||
self.module = model
|
||||
self.base_model = model[0]
|
||||
self.optimizer = optimizer
|
||||
|
||||
@ -23,7 +23,7 @@ import inspect
|
||||
|
||||
import torch
|
||||
|
||||
from .imports import is_mps_available, is_npu_available, is_xpu_available
|
||||
from .imports import is_npu_available, is_xpu_available
|
||||
|
||||
|
||||
def release_memory(*objects):
|
||||
@ -57,8 +57,6 @@ def release_memory(*objects):
|
||||
torch.xpu.empty_cache()
|
||||
elif is_npu_available():
|
||||
torch.npu.empty_cache()
|
||||
elif is_mps_available():
|
||||
torch.mps.empty_cache()
|
||||
else:
|
||||
torch.cuda.empty_cache()
|
||||
return objects
|
||||
|
||||
@ -32,7 +32,7 @@ import torch.nn as nn
|
||||
from ..state import AcceleratorState
|
||||
from .constants import SAFE_WEIGHTS_NAME, WEIGHTS_NAME
|
||||
from .dataclasses import AutocastKwargs, CustomDtype, DistributedType
|
||||
from .imports import is_mps_available, is_npu_available, is_peft_available, is_torch_xla_available, is_xpu_available
|
||||
from .imports import is_mps_available, is_npu_available, is_peft_available, is_xpu_available
|
||||
from .offload import load_offloaded_weight, offload_weight, save_offload_index
|
||||
from .tqdm import is_tqdm_available, tqdm
|
||||
from .versions import compare_versions
|
||||
@ -100,7 +100,7 @@ def convert_file_size_to_int(size: Union[int, str]):
|
||||
1048576
|
||||
```
|
||||
"""
|
||||
mem_size = -1
|
||||
mem_size = 0
|
||||
err_msg = (
|
||||
f"`size` {size} is not in a valid format. Use an integer for bytes, or a string with an unit (like '5.0GB')."
|
||||
)
|
||||
@ -125,7 +125,7 @@ def convert_file_size_to_int(size: Union[int, str]):
|
||||
except ValueError:
|
||||
raise ValueError(err_msg)
|
||||
|
||||
if mem_size < 0:
|
||||
if mem_size <= 0:
|
||||
raise ValueError(err_msg)
|
||||
return mem_size
|
||||
|
||||
@ -143,8 +143,6 @@ def dtype_byte_size(dtype: torch.dtype):
|
||||
"""
|
||||
if dtype == torch.bool:
|
||||
return 1 / 8
|
||||
elif dtype == CustomDtype.INT2:
|
||||
return 1 / 4
|
||||
elif dtype == CustomDtype.INT4:
|
||||
return 1 / 2
|
||||
elif dtype == CustomDtype.FP8:
|
||||
@ -404,14 +402,12 @@ def set_module_tensor_to_device(
|
||||
new_value.SCB = new_value.SCB.to("cpu")
|
||||
else:
|
||||
new_value = param_cls(new_value, requires_grad=old_value.requires_grad, **kwargs).to(device)
|
||||
elif param_cls.__name__ in ["QTensor"]:
|
||||
new_value = torch.nn.Parameter(new_value, requires_grad=old_value.requires_grad).to(device)
|
||||
else:
|
||||
new_value = param_cls(new_value, requires_grad=old_value.requires_grad).to(device)
|
||||
|
||||
module._parameters[tensor_name] = new_value
|
||||
if fp16_statistics is not None:
|
||||
module._parameters[tensor_name].SCB = fp16_statistics.to(device)
|
||||
setattr(module._parameters[tensor_name], "SCB", fp16_statistics.to(device))
|
||||
del fp16_statistics
|
||||
# as we put the weight to meta, it doesn't have SCB attr anymore. make sure that it is not a meta weight
|
||||
if (
|
||||
@ -476,7 +472,8 @@ def named_module_tensors(
|
||||
Whether or not to remove the non persistent buffer from the buffers. Useful only when include_buffers =
|
||||
True
|
||||
"""
|
||||
yield from module.named_parameters(recurse=recurse)
|
||||
for named_parameter in module.named_parameters(recurse=recurse):
|
||||
yield named_parameter
|
||||
|
||||
if include_buffers:
|
||||
non_persistent_buffers = set()
|
||||
@ -1153,11 +1150,7 @@ def infer_auto_device_map(
|
||||
# Case 1 -> We're too big!
|
||||
if current_max_size is not None and current_memory_used + module_size > current_max_size:
|
||||
# Split or not split?
|
||||
modules_children = (
|
||||
[]
|
||||
if isinstance(module, nn.Parameter) or isinstance(module, torch.Tensor)
|
||||
else list(module.named_children())
|
||||
)
|
||||
modules_children = [] if isinstance(module, nn.Parameter) else list(module.named_children())
|
||||
if verbose:
|
||||
print(
|
||||
f"Not enough space on {devices[current_device]} to put {name} (space available "
|
||||
@ -1555,7 +1548,7 @@ def load_checkpoint_in_model(
|
||||
|
||||
if index_filename is not None:
|
||||
checkpoint_folder = os.path.split(index_filename)[0]
|
||||
with open(index_filename) as f:
|
||||
with open(index_filename, "r") as f:
|
||||
index = json.loads(f.read())
|
||||
|
||||
if "weight_map" in index:
|
||||
@ -1668,13 +1661,8 @@ def get_mixed_precision_context_manager(native_amp: bool = False, autocast_kwarg
|
||||
else:
|
||||
autocast_kwargs = autocast_kwargs.to_kwargs()
|
||||
if native_amp:
|
||||
device_type = (
|
||||
"cuda"
|
||||
if (state.distributed_type == DistributedType.XLA and is_torch_xla_available(check_is_gpu=True))
|
||||
else state.device.type
|
||||
)
|
||||
if state.mixed_precision == "fp16":
|
||||
return torch.autocast(device_type=device_type, dtype=torch.float16, **autocast_kwargs)
|
||||
return torch.autocast(device_type=state.device.type, dtype=torch.float16, **autocast_kwargs)
|
||||
elif state.mixed_precision == "bf16" and state.distributed_type in [
|
||||
DistributedType.NO,
|
||||
DistributedType.MULTI_CPU,
|
||||
@ -1682,10 +1670,9 @@ def get_mixed_precision_context_manager(native_amp: bool = False, autocast_kwarg
|
||||
DistributedType.MULTI_NPU,
|
||||
DistributedType.MULTI_XPU,
|
||||
DistributedType.FSDP,
|
||||
DistributedType.XLA,
|
||||
]:
|
||||
return torch.autocast(device_type=device_type, dtype=torch.bfloat16, **autocast_kwargs)
|
||||
return torch.autocast(device_type=state.device.type, dtype=torch.bfloat16, **autocast_kwargs)
|
||||
else:
|
||||
return torch.autocast(device_type=device_type, **autocast_kwargs)
|
||||
return torch.autocast(device_type=state.device.type, **autocast_kwargs)
|
||||
else:
|
||||
return contextlib.nullcontext()
|
||||
|
||||
@ -72,7 +72,7 @@ def save_offload_index(index, offload_folder):
|
||||
|
||||
offload_index_file = os.path.join(offload_folder, "index.json")
|
||||
if os.path.isfile(offload_index_file):
|
||||
with open(offload_index_file, encoding="utf-8") as f:
|
||||
with open(offload_index_file, "r", encoding="utf-8") as f:
|
||||
current_index = json.load(f)
|
||||
else:
|
||||
current_index = {}
|
||||
|
||||
@ -11,6 +11,7 @@
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
"""
|
||||
A set of basic tensor ops compatible with tpu, gpu, and multigpu
|
||||
"""
|
||||
@ -29,14 +30,15 @@ from .imports import (
|
||||
is_npu_available,
|
||||
is_torch_distributed_available,
|
||||
is_torch_version,
|
||||
is_torch_xla_available,
|
||||
is_tpu_available,
|
||||
is_xpu_available,
|
||||
)
|
||||
|
||||
|
||||
if is_torch_xla_available():
|
||||
if is_tpu_available(check_device=False):
|
||||
import torch_xla.core.xla_model as xm
|
||||
|
||||
|
||||
if is_torch_distributed_available():
|
||||
from torch.distributed import ReduceOp
|
||||
|
||||
@ -64,10 +66,17 @@ def is_tensor_information(tensor_info):
|
||||
|
||||
def is_namedtuple(data):
|
||||
"""
|
||||
Checks if `data` is a `namedtuple` or not. Can have false positives, but only if a user is trying to mimic a
|
||||
Checks if `x` is a `namedtuple` or not. Can have false positives, but only if a user is trying to mimic a
|
||||
`namedtuple` perfectly.
|
||||
"""
|
||||
return isinstance(data, tuple) and hasattr(data, "_asdict") and hasattr(data, "_fields")
|
||||
data_type = type(data)
|
||||
bases = data_type.__bases__
|
||||
if len(bases) != 1 or bases[0] != tuple:
|
||||
return False
|
||||
fields = getattr(data_type, "_fields", None)
|
||||
if not isinstance(fields, tuple):
|
||||
return False
|
||||
return all(isinstance(member, str) for member in fields)
|
||||
|
||||
|
||||
def honor_type(obj, generator):
|
||||
@ -160,30 +169,19 @@ def send_to_device(tensor, device, non_blocking=False, skip_keys=None):
|
||||
for k, t in tensor.items()
|
||||
}
|
||||
)
|
||||
elif is_torch_tensor(tensor) or hasattr(tensor, "to"):
|
||||
# `torch.Tensor.to("npu")` could not find context when called for the first time (see this [issue](https://gitee.com/ascend/pytorch/issues/I8KECW?from=project-issue)).
|
||||
if device == "npu":
|
||||
device = "npu:0"
|
||||
if device == "xpu":
|
||||
device = "xpu:0"
|
||||
try:
|
||||
return tensor.to(device, non_blocking=non_blocking)
|
||||
except TypeError: # .to() doesn't accept non_blocking as kwarg
|
||||
return tensor.to(device)
|
||||
except AssertionError as error:
|
||||
elif hasattr(tensor, "to"):
|
||||
if is_npu_available():
|
||||
# `torch.Tensor.to(<int num>)` is not supported by `torch_npu` (see this [issue](https://github.com/Ascend/pytorch/issues/16)).
|
||||
# This call is inside the try-block since is_npu_available is not supported by torch.compile.
|
||||
if is_npu_available():
|
||||
if isinstance(device, int):
|
||||
device = f"npu:{device}"
|
||||
else:
|
||||
raise error
|
||||
except Exception as error:
if is_xpu_available():
if isinstance(device, int):
device = f"xpu:{device}"
else:
raise error
if isinstance(device, int):
device = f"npu:{device}"
# `torch.Tensor.to("npu")` could not find context when called for the first time (see this [issue](https://gitee.com/ascend/pytorch/issues/I8KECW?from=project-issue)).
elif device == torch.device("npu"):
device = "npu:0"
elif is_xpu_available():
if isinstance(device, int):
device = f"xpu:{device}"
elif device == torch.device("xpu"):
device = "xpu:0"
try:
return tensor.to(device, non_blocking=non_blocking)
except TypeError: # .to() doesn't accept non_blocking as kwarg
@@ -433,7 +431,7 @@ def gather(tensor):
Returns:
The same data structure as `tensor` with all tensors sent to the proper device.
"""
if PartialState().distributed_type == DistributedType.XLA:
if PartialState().distributed_type == DistributedType.TPU:
return _tpu_gather(tensor)
elif PartialState().distributed_type in TORCH_DISTRIBUTED_OPERATION_TYPES:
return _gpu_gather(tensor)
@@ -459,7 +457,7 @@ def gather_object(object: Any):
Returns:
The same data structure as `object` with all the objects sent to every device.
"""
if PartialState().distributed_type == DistributedType.XLA:
if PartialState().distributed_type == DistributedType.TPU:
raise NotImplementedError("gather objects in TPU is not supported")
elif PartialState().distributed_type in TORCH_DISTRIBUTED_OPERATION_TYPES:
return _gpu_gather_object(object)
@@ -555,7 +553,7 @@ def broadcast(tensor, from_process: int = 0):
Returns:
The same data structure as `tensor` with all tensors broadcasted to the proper device.
"""
if PartialState().distributed_type == DistributedType.XLA:
if PartialState().distributed_type == DistributedType.TPU:
return _tpu_broadcast(tensor, src=from_process, name="accelerate.utils.broadcast")
elif PartialState().distributed_type in TORCH_DISTRIBUTED_OPERATION_TYPES:
return _gpu_broadcast(tensor, src=from_process)
@@ -576,7 +574,7 @@ def broadcast_object_list(object_list, from_process: int = 0):
Returns:
The same list containing the objects from process 0.
"""
if PartialState().distributed_type == DistributedType.XLA:
if PartialState().distributed_type == DistributedType.TPU:
for i, obj in enumerate(object_list):
object_list[i] = xm.mesh_reduce("accelerate.utils.broadcast_object_list", obj, lambda x: x[from_process])
elif PartialState().distributed_type in TORCH_DISTRIBUTED_OPERATION_TYPES:
@@ -746,7 +744,7 @@ def reduce(tensor, reduction="mean", scale=1.0):
cloned_tensor = tensor.clone()
if state.distributed_type == DistributedType.NO:
return cloned_tensor
if state.distributed_type == DistributedType.XLA:
if state.distributed_type == DistributedType.TPU:
# Some processes may have different HLO graphs than other
# processes, for example in the breakpoint API
# accelerator.set_trigger(). Use mark_step to make HLOs
@@ -781,10 +779,7 @@ def convert_to_fp32(tensor):
return tensor.float()

def _is_fp16_bf16_tensor(tensor):
return (is_torch_tensor(tensor) or hasattr(tensor, "dtype")) and tensor.dtype in (
torch.float16,
torch.bfloat16,
)
return hasattr(tensor, "dtype") and tensor.dtype in (torch.float16, torch.bfloat16)

return recursively_apply(_convert_to_fp32, tensor, test_type=_is_fp16_bf16_tensor)


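For reference, a minimal sketch of how the collective utilities touched in these hunks (`gather`, `reduce`) are typically called from user code; the tensor values and shapes are illustrative only:

```python
# Minimal sketch (not part of this diff): gather per-process values so they can be
# logged once. Assumes the script is started through a distributed launcher.
import torch

from accelerate import Accelerator
from accelerate.utils import gather, reduce

accelerator = Accelerator()

# Dummy per-process metric; in practice this would be a loss or prediction tensor.
local_metric = torch.tensor([1.0, 2.0], device=accelerator.device)

all_metrics = gather(local_metric)                    # concatenated across processes
mean_metric = reduce(local_metric, reduction="mean")  # averaged across processes

if accelerator.is_main_process:
    print(all_metrics.shape, mean_metric)
```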
@@ -31,7 +31,7 @@ from ..logging import get_logger
from ..state import PartialState
from .constants import FSDP_PYTORCH_VERSION
from .dataclasses import DistributedType
from .imports import is_deepspeed_available, is_torch_distributed_available, is_torch_xla_available
from .imports import is_deepspeed_available, is_torch_distributed_available, is_tpu_available
from .modeling import id_tensor_storage
from .transformer_engine import convert_model
from .versions import is_torch_version
@@ -40,7 +40,7 @@ from .versions import is_torch_version
logger = get_logger(__name__)


if is_torch_xla_available():
if is_tpu_available(check_device=False):
import torch_xla.core.xla_model as xm


@@ -87,7 +87,7 @@ def extract_model_from_parallel(model, keep_fp32_wrapper: bool = True):
model = model.module

if not keep_fp32_wrapper:
forward = model.forward
forward = getattr(model, "forward")
original_forward = model.__dict__.pop("_original_forward", None)
if original_forward is not None:
while hasattr(forward, "__wrapped__"):
@@ -167,12 +167,6 @@ def save(obj, f, save_on_each_node: bool = False, safe_serialization: bool = Fal
safe_serialization (`bool`, *optional*, defaults to `False`):
Whether to save `obj` using `safetensors` or the traditional PyTorch way (that uses `pickle`).
"""
# When TorchXLA is enabled, it's necessary to transfer all data to the CPU before saving.
# Another issue arises with `id_tensor_storage`, which treats all XLA tensors as identical.
# If tensors remain on XLA, calling `clean_state_dict_for_safetensors` will result in only
# one XLA tensor remaining.
if PartialState().distributed_type == DistributedType.XLA:
obj = xm._maybe_convert_to_cpu(obj)
# Check if it's a model and remove duplicates
if safe_serialization:
save_func = partial(safe_save_file, metadata={"format": "pt"})
@@ -181,7 +175,9 @@ def save(obj, f, save_on_each_node: bool = False, safe_serialization: bool = Fal
else:
save_func = torch.save

if PartialState().is_main_process and not save_on_each_node:
if PartialState().distributed_type == DistributedType.TPU:
xm.save(obj, f)
elif PartialState().is_main_process and not save_on_each_node:
save_func(obj, f)
elif PartialState().is_local_main_process and save_on_each_node:
save_func(obj, f)
@@ -190,9 +186,9 @@ def save(obj, f, save_on_each_node: bool = False, safe_serialization: bool = Fal
@contextmanager
def clear_environment():
"""
A context manager that will temporarily clear environment variables.
A context manager that will cache origin `os.environ` and replace it with a empty dictionary in this context.

When this context exits, the previous environment variables will be back.
When this context exits, the cached `os.environ` will be back.

Example:

@@ -212,14 +208,12 @@ def clear_environment():
bar
```
"""
_old_os_environ = os.environ.copy()
os.environ.clear()
_old_os_environ = os.environ
os.environ = dict()

try:
yield
finally:
os.environ.clear() # clear any added keys,
os.environ.update(_old_os_environ) # then restore previous environment
yield

os.environ = _old_os_environ


@contextmanager
@@ -247,16 +241,15 @@ def patch_environment(**kwargs):
existing_vars[key] = os.environ[key]
os.environ[key] = str(value)

try:
yield
finally:
for key in kwargs:
key = key.upper()
if key in existing_vars:
# restore previous value
os.environ[key] = existing_vars[key]
else:
os.environ.pop(key, None)
yield

for key in kwargs:
key = key.upper()
if key in existing_vars:
# restore previous value
os.environ[key] = existing_vars[key]
else:
os.environ.pop(key, None)


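For reference, a minimal usage sketch of the two context managers changed above, assuming both helpers are exported from `accelerate.utils`; the variable names and values are illustrative:

```python
# Minimal sketch (not part of this diff) of the two context managers above.
import os

from accelerate.utils import clear_environment, patch_environment

with patch_environment(cuda_visible_devices="0", master_port="29555"):
    # Keys are upper-cased by the helper, so this reads CUDA_VISIBLE_DEVICES.
    print(os.environ["CUDA_VISIBLE_DEVICES"])

with clear_environment():
    # Starts from an empty environment inside the block.
    print("CUDA_VISIBLE_DEVICES" in os.environ)

# On exit both managers restore the previous environment variables.
```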
def get_pretty_name(obj):

@@ -21,10 +21,10 @@ import torch
from ..state import AcceleratorState
from .constants import CUDA_DISTRIBUTED_TYPES
from .dataclasses import DistributedType, RNGType
from .imports import is_npu_available, is_torch_xla_available, is_xpu_available
from .imports import is_npu_available, is_tpu_available, is_xpu_available


if is_torch_xla_available():
if is_tpu_available(check_device=False):
import torch_xla.core.xla_model as xm


@@ -50,7 +50,7 @@ def set_seed(seed: int, device_specific: bool = False):
else:
torch.cuda.manual_seed_all(seed)
# ^^ safe to call this function even if cuda is not available
if is_torch_xla_available():
if is_tpu_available():
xm.set_rng_state(seed)


@@ -61,7 +61,7 @@ def synchronize_rng_state(rng_type: Optional[RNGType] = None, generator: Optiona
elif rng_type == RNGType.CUDA:
rng_state = torch.cuda.get_rng_state()
elif rng_type == RNGType.XLA:
assert is_torch_xla_available(), "Can't synchronize XLA seeds as torch_xla is unavailable."
assert is_tpu_available(), "Can't synchronize XLA seeds on an environment without TPUs."
rng_state = torch.tensor(xm.get_rng_state())
elif rng_type == RNGType.NPU:
assert is_npu_available(), "Can't synchronize NPU seeds on an environment without NPUs."
@@ -75,7 +75,7 @@ def synchronize_rng_state(rng_type: Optional[RNGType] = None, generator: Optiona

# Broadcast the rng state from device 0 to other devices
state = AcceleratorState()
if state.distributed_type == DistributedType.XLA:
if state.distributed_type == DistributedType.TPU:
rng_state = rng_state.to(xm.xla_device())
xm.collective_broadcast([rng_state])
xm.mark_step()

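For reference, a minimal sketch of the seeding helper exercised above; the seed value is arbitrary:

```python
# Minimal sketch (not part of this diff) of the seeding helper above.
from accelerate.utils import set_seed

set_seed(42)                        # seeds python, numpy and torch (plus XLA/NPU/XPU if present)
set_seed(42, device_specific=True)  # additionally offsets the seed by the process index
```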
@@ -11,7 +9,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import inspect
import io
import itertools
import json
import os
@@ -27,6 +29,7 @@ from transformers.testing_utils import mockenv_context
from transformers.trainer_utils import set_seed
from transformers.utils import is_torch_bf16_available

import accelerate
from accelerate.accelerator import Accelerator
from accelerate.scheduler import AcceleratedScheduler
from accelerate.state import AcceleratorState
@@ -34,7 +37,6 @@ from accelerate.test_utils.testing import (
AccelerateTestCase,
TempDirTestCase,
execute_subprocess_async,
path_in_accelerate_package,
require_deepspeed,
require_multi_device,
require_non_cpu,
@@ -116,9 +118,9 @@ class DeepSpeedConfigIntegration(AccelerateTestCase):
)

# use self.get_config_dict(stage) to use these to ensure the original is not modified
with open(self.ds_config_file[ZERO2], encoding="utf-8") as f:
with io.open(self.ds_config_file[ZERO2], "r", encoding="utf-8") as f:
config_zero2 = json.load(f)
with open(self.ds_config_file[ZERO3], encoding="utf-8") as f:
with io.open(self.ds_config_file[ZERO3], "r", encoding="utf-8") as f:
config_zero3 = json.load(f)
# The following setting slows things down, so don't enable it by default unless needed by a test.
# It's in the file as a demo for users since we want everything to work out of the box even if slower.
@@ -151,7 +153,7 @@ class DeepSpeedConfigIntegration(AccelerateTestCase):
zero3_save_16bit_model=True,
zero3_init_flag=True,
)
assert not deepspeed_plugin.zero3_init_flag
self.assertFalse(deepspeed_plugin.zero3_init_flag)
deepspeed_plugin.deepspeed_config = None

# Test zero3_init_flag will be set to True only when ZeRO stage == 3
@@ -164,15 +166,15 @@ class DeepSpeedConfigIntegration(AccelerateTestCase):
zero3_save_16bit_model=True,
zero3_init_flag=True,
)
assert deepspeed_plugin.zero3_init_flag
self.assertTrue(deepspeed_plugin.zero3_init_flag)
deepspeed_plugin.deepspeed_config = None

# Test config files are loaded correctly
deepspeed_plugin = DeepSpeedPlugin(hf_ds_config=self.ds_config_file[stage], zero3_init_flag=True)
if stage == ZERO2:
assert not deepspeed_plugin.zero3_init_flag
self.assertFalse(deepspeed_plugin.zero3_init_flag)
elif stage == ZERO3:
assert deepspeed_plugin.zero3_init_flag
self.assertTrue(deepspeed_plugin.zero3_init_flag)

# Test `gradient_accumulation_steps` is set to 1 if unavailable in config file
with tempfile.TemporaryDirectory() as dirpath:
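For reference, a minimal sketch of the two `DeepSpeedPlugin` construction paths these tests exercise; the stage, step counts and file name are illustrative placeholders, not values from the test configs, and building the `Accelerator` this way assumes `deepspeed` is installed:

```python
# Minimal sketch (not part of this diff); numbers and file name are placeholders.
from accelerate import Accelerator
from accelerate.utils import DeepSpeedPlugin

# Configure the plugin from keyword arguments ...
plugin = DeepSpeedPlugin(
    zero_stage=3,
    gradient_accumulation_steps=2,
    gradient_clipping=1.0,
    zero3_save_16bit_model=True,
    zero3_init_flag=True,
)

# ... or from a DeepSpeed JSON config file (fields left as "auto" are filled at prepare time).
plugin_from_file = DeepSpeedPlugin(hf_ds_config="ds_config.json", zero3_init_flag=True)

# Requires deepspeed to be installed.
accelerator = Accelerator(deepspeed_plugin=plugin)
```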
@@ -181,7 +183,7 @@ class DeepSpeedConfigIntegration(AccelerateTestCase):
with open(os.path.join(dirpath, "ds_config.json"), "w") as out_file:
json.dump(ds_config, out_file)
deepspeed_plugin = DeepSpeedPlugin(hf_ds_config=os.path.join(dirpath, "ds_config.json"))
assert deepspeed_plugin.deepspeed_config["gradient_accumulation_steps"] == 1
self.assertEqual(deepspeed_plugin.deepspeed_config["gradient_accumulation_steps"], 1)
deepspeed_plugin.deepspeed_config = None

# Test `ValueError` is raised if `zero_optimization` is unavailable in config file
@@ -192,7 +194,9 @@ class DeepSpeedConfigIntegration(AccelerateTestCase):
json.dump(ds_config, out_file)
with self.assertRaises(ValueError) as cm:
deepspeed_plugin = DeepSpeedPlugin(hf_ds_config=os.path.join(dirpath, "ds_config.json"))
assert "Please specify the ZeRO optimization config in the DeepSpeed config." in str(cm.exception)
self.assertTrue(
"Please specify the ZeRO optimization config in the DeepSpeed config." in str(cm.exception)
)
deepspeed_plugin.deepspeed_config = None

# Test `deepspeed_config_process`
@@ -217,7 +221,7 @@ class DeepSpeedConfigIntegration(AccelerateTestCase):
for ds_key_long, value in kwargs.items():
config, ds_key = deepspeed_plugin.hf_ds_config.find_config_node(ds_key_long)
if config.get(ds_key) is not None:
assert config.get(ds_key) == value
self.assertEqual(config.get(ds_key), value)

# Test mismatches
mismatches = {
@@ -230,14 +234,17 @@ class DeepSpeedConfigIntegration(AccelerateTestCase):
new_kwargs.update(mismatches)
deepspeed_plugin.deepspeed_config_process(**new_kwargs)
for key in mismatches.keys():
assert key in str(cm.exception), f"{key} is not in the exception message: {cm.exception}"
self.assertTrue(
key in str(cm.exception),
f"{key} is not in the exception message:\n{cm.exception}",
)

# Test `ValueError` is raised if some config file fields with `auto` value is missing in `kwargs`
deepspeed_plugin.deepspeed_config["optimizer"]["params"]["lr"] = "auto"
with self.assertRaises(ValueError) as cm:
del kwargs["optimizer.params.lr"]
deepspeed_plugin.deepspeed_config_process(**kwargs)
assert "`optimizer.params.lr` not found in kwargs." in str(cm.exception)
self.assertTrue("`optimizer.params.lr` not found in kwargs." in str(cm.exception))

@parameterized.expand([FP16, BF16], name_func=parameterized_custom_name_func)
def test_accelerate_state_deepspeed(self, dtype):
@@ -253,7 +260,7 @@ class DeepSpeedConfigIntegration(AccelerateTestCase):
)
with mockenv_context(**self.dist_env):
state = Accelerator(mixed_precision=dtype, deepspeed_plugin=deepspeed_plugin).state
assert state.deepspeed_plugin.deepspeed_config[dtype]["enabled"]
self.assertTrue(state.deepspeed_plugin.deepspeed_config[dtype]["enabled"])

def test_init_zero3(self):
deepspeed_plugin = DeepSpeedPlugin(
@@ -270,7 +277,7 @@ class DeepSpeedConfigIntegration(AccelerateTestCase):
accelerator = Accelerator(deepspeed_plugin=deepspeed_plugin)  # noqa: F841
from transformers.deepspeed import is_deepspeed_zero3_enabled

assert is_deepspeed_zero3_enabled()
self.assertTrue(is_deepspeed_zero3_enabled())

@parameterized.expand(optim_scheduler_params, name_func=parameterized_custom_name_func)
def test_prepare_deepspeed(self, optim_type, scheduler_type):
@@ -326,14 +333,15 @@ class DeepSpeedConfigIntegration(AccelerateTestCase):
model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = accelerator.prepare(
model, dummy_optimizer, train_dataloader, eval_dataloader, lr_scheduler
)
assert "You cannot create a `DummyOptim` without specifying an optimizer in the config file." in str(
cm.exception
self.assertTrue(
"You cannot create a `DummyOptim` without specifying an optimizer in the config file."
in str(cm.exception)
)
with self.assertRaises(ValueError) as cm:
model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = accelerator.prepare(
model, optimizer, train_dataloader, eval_dataloader, dummy_lr_scheduler
)
assert (
self.assertTrue(
"Either specify a scheduler in the config file or "
"pass in the `lr_scheduler_callable` parameter when using `accelerate.utils.DummyScheduler`."
in str(cm.exception)
@@ -341,7 +349,7 @@ class DeepSpeedConfigIntegration(AccelerateTestCase):

with self.assertRaises(ValueError) as cm:
model, optimizer, lr_scheduler = accelerator.prepare(model, optimizer, lr_scheduler)
assert (
self.assertTrue(
"When using DeepSpeed, `accelerate.prepare()` requires you to pass at least one of training or evaluation dataloaders "
"with `batch_size` attribute returning an integer value "
"or alternatively set an integer value in `train_micro_batch_size_per_gpu` in the deepspeed config file "
@@ -352,12 +360,12 @@ class DeepSpeedConfigIntegration(AccelerateTestCase):
model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = accelerator.prepare(
model, optimizer, train_dataloader, eval_dataloader, lr_scheduler
)
assert accelerator.deepspeed_config["zero_allow_untested_optimizer"]
assert accelerator.deepspeed_config["train_batch_size"], 16
assert type(model) is DeepSpeedEngine
assert type(optimizer) is DeepSpeedOptimizerWrapper
assert type(lr_scheduler) is AcceleratedScheduler
assert type(accelerator.deepspeed_engine_wrapped) is DeepSpeedEngineWrapper
self.assertTrue(accelerator.deepspeed_config["zero_allow_untested_optimizer"])
self.assertTrue(accelerator.deepspeed_config["train_batch_size"], 16)
self.assertEqual(type(model), DeepSpeedEngine)
self.assertEqual(type(optimizer), DeepSpeedOptimizerWrapper)
self.assertEqual(type(lr_scheduler), AcceleratedScheduler)
self.assertEqual(type(accelerator.deepspeed_engine_wrapped), DeepSpeedEngineWrapper)

elif optim_type == DS_OPTIMIZER and scheduler_type == DS_SCHEDULER:
# Test DeepSpeed optimizer + DeepSpeed scheduler
@@ -388,33 +396,36 @@ class DeepSpeedConfigIntegration(AccelerateTestCase):
model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = accelerator.prepare(
model, optimizer, train_dataloader, eval_dataloader, dummy_lr_scheduler
)
assert "You cannot specify an optimizer in the config file and in the code at the same time" in str(
cm.exception
self.assertTrue(
"You cannot specify an optimizer in the config file and in the code at the same time"
in str(cm.exception)
)

with self.assertRaises(ValueError) as cm:
model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = accelerator.prepare(
model, dummy_optimizer, train_dataloader, eval_dataloader, lr_scheduler
)
assert "You cannot specify a scheduler in the config file and in the code at the same time" in str(
cm.exception
self.assertTrue(
"You cannot specify a scheduler in the config file and in the code at the same time"
in str(cm.exception)
)

with self.assertRaises(ValueError) as cm:
model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = accelerator.prepare(
model, dummy_optimizer, train_dataloader, eval_dataloader, lr_scheduler
)
assert "You cannot specify a scheduler in the config file and in the code at the same time" in str(
cm.exception
self.assertTrue(
"You cannot specify a scheduler in the config file and in the code at the same time"
in str(cm.exception)
)

model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = accelerator.prepare(
model, dummy_optimizer, train_dataloader, eval_dataloader, dummy_lr_scheduler
)
assert type(model) is DeepSpeedEngine
assert type(optimizer) is DeepSpeedOptimizerWrapper
assert type(lr_scheduler) is DeepSpeedSchedulerWrapper
assert type(accelerator.deepspeed_engine_wrapped) is DeepSpeedEngineWrapper
self.assertTrue(type(model) == DeepSpeedEngine)
self.assertTrue(type(optimizer) == DeepSpeedOptimizerWrapper)
self.assertTrue(type(lr_scheduler) == DeepSpeedSchedulerWrapper)
self.assertTrue(type(accelerator.deepspeed_engine_wrapped) == DeepSpeedEngineWrapper)

elif optim_type == CUSTOM_OPTIMIZER and scheduler_type == DS_SCHEDULER:
# Test custom optimizer + DeepSpeed scheduler
@@ -445,11 +456,11 @@ class DeepSpeedConfigIntegration(AccelerateTestCase):
model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = accelerator.prepare(
model, optimizer, train_dataloader, eval_dataloader, dummy_lr_scheduler
)
assert type(model) is DeepSpeedEngine
assert type(optimizer) is DeepSpeedOptimizerWrapper
assert type(lr_scheduler) is DeepSpeedSchedulerWrapper
assert type(accelerator.deepspeed_engine_wrapped) is DeepSpeedEngineWrapper
elif optim_type == DS_OPTIMIZER and scheduler_type is CUSTOM_SCHEDULER:
self.assertTrue(type(model) == DeepSpeedEngine)
self.assertTrue(type(optimizer) == DeepSpeedOptimizerWrapper)
self.assertTrue(type(lr_scheduler) == DeepSpeedSchedulerWrapper)
self.assertTrue(type(accelerator.deepspeed_engine_wrapped) == DeepSpeedEngineWrapper)
elif optim_type == DS_OPTIMIZER and scheduler_type == CUSTOM_SCHEDULER:
# Test deepspeed optimizer + custom scheduler
deepspeed_plugin = DeepSpeedPlugin(hf_ds_config=self.ds_config_file[ZERO2])
with mockenv_context(**self.dist_env):
@@ -479,7 +490,7 @@ class DeepSpeedConfigIntegration(AccelerateTestCase):
model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = accelerator.prepare(
model, dummy_optimizer, train_dataloader, eval_dataloader, lr_scheduler
)
assert (
self.assertTrue(
"You can only specify `accelerate.utils.DummyScheduler` in the code when using `accelerate.utils.DummyOptim`."
in str(cm.exception)
)
@@ -489,7 +500,7 @@ class DeepSpeedConfigIntegration(AccelerateTestCase):
model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = accelerator.prepare(
model, dummy_optimizer, train_dataloader, eval_dataloader, dummy_lr_scheduler
)
assert (
self.assertTrue(
"Either specify a scheduler in the config file or "
"pass in the `lr_scheduler_callable` parameter when using `accelerate.utils.DummyScheduler`."
in str(cm.exception)
@@ -543,7 +554,7 @@ class DeepSpeedConfigIntegration(AccelerateTestCase):
model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = accelerator.prepare(
model, optimizer, train_dataloader, eval_dataloader, lr_scheduler
)
assert (
self.assertTrue(
"At least one of the dataloaders passed to `accelerate.prepare()` has `None` as batch size. "
"Please set an integer value in `train_micro_batch_size_per_gpu` in the deepspeed config file "
"or assign integer value to `AcceleratorState().deepspeed_plugin.deepspeed_config['train_micro_batch_size_per_gpu']`."
@@ -599,7 +610,7 @@ class DeepSpeedConfigIntegration(AccelerateTestCase):
"set `zero3_save_16bit_model` to True when using `accelerate config`. "
"To save the full checkpoint, run `model.save_checkpoint(save_dir)` and use `zero_to_fp32.py` to recover weights."
)
assert msg in str(cm.exception)
self.assertTrue(msg in str(cm.exception))

def test_autofill_dsconfig(self):
deepspeed_plugin = DeepSpeedPlugin(
@@ -622,22 +633,31 @@ class DeepSpeedConfigIntegration(AccelerateTestCase):
model, _, train_dataloader, eval_dataloader, _ = accelerator.prepare(
model, dummy_optimizer, train_dataloader, eval_dataloader, dummy_lr_scheduler
)
config = accelerator.deepspeed_config
assert config["train_micro_batch_size_per_gpu"] == 16
assert config["train_batch_size"] == 16
self.assertEqual(accelerator.deepspeed_config["train_micro_batch_size_per_gpu"], 16)
self.assertEqual(accelerator.deepspeed_config["train_batch_size"], 16)

assert config["optimizer"]["params"]["lr"] == 5e-05
assert config["optimizer"]["params"]["weight_decay"] == 1e-4
self.assertEqual(accelerator.deepspeed_config["optimizer"]["params"]["lr"], 5e-5)
self.assertEqual(accelerator.deepspeed_config["optimizer"]["params"]["weight_decay"], 1e-4)

assert config["scheduler"]["params"]["warmup_min_lr"] == 0.0
assert config["scheduler"]["params"]["warmup_max_lr"] == 5e-05
assert config["scheduler"]["params"]["warmup_num_steps"] == 10
self.assertEqual(accelerator.deepspeed_config["scheduler"]["params"]["warmup_min_lr"], 0.0)
self.assertEqual(accelerator.deepspeed_config["scheduler"]["params"]["warmup_max_lr"], 5e-5)
self.assertEqual(accelerator.deepspeed_config["scheduler"]["params"]["warmup_num_steps"], 10)

assert config["gradient_clipping"] == 1.0
assert config["zero_optimization"]["reduce_bucket_size"] == (hidden_size * hidden_size)
assert config["zero_optimization"]["stage3_prefetch_bucket_size"] == ((0.9 * hidden_size) * hidden_size)
assert config["zero_optimization"]["stage3_param_persistence_threshold"] == (10 * hidden_size)
assert not config["zero_optimization"]["stage3_gather_16bit_weights_on_model_save"]
self.assertEqual(accelerator.deepspeed_config["gradient_clipping"], 1.0)
self.assertEqual(
accelerator.deepspeed_config["zero_optimization"]["reduce_bucket_size"], hidden_size * hidden_size
)
self.assertEqual(
accelerator.deepspeed_config["zero_optimization"]["stage3_prefetch_bucket_size"],
0.9 * hidden_size * hidden_size,
)
self.assertEqual(
accelerator.deepspeed_config["zero_optimization"]["stage3_param_persistence_threshold"],
10 * hidden_size,
)
self.assertFalse(
accelerator.deepspeed_config["zero_optimization"]["stage3_gather_16bit_weights_on_model_save"]
)

@parameterized.expand(model_types, name_func=parameterized_custom_name_func)
def test_autofill_comm_buffers_dsconfig(self, model_type):
@@ -679,22 +699,29 @@ class DeepSpeedConfigIntegration(AccelerateTestCase):
model, optimizer, train_dataloader, eval_dataloader, lr_scheduler
)
msg = "Can't find `model.config` entry"
assert msg in str(cm.exception)
self.assertTrue(msg in str(cm.exception))
elif model_type == CONFIG_WITH_NO_HIDDEN_SIZE:
with self.assertRaises(ValueError) as cm:
model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = accelerator.prepare(
model, optimizer, train_dataloader, eval_dataloader, lr_scheduler
)
msg = "Can find neither `model.config.hidden_size` nor `model.config.hidden_sizes`"
assert msg in str(cm.exception)
self.assertTrue(msg in str(cm.exception))
else:
model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = accelerator.prepare(
model, optimizer, train_dataloader, eval_dataloader, lr_scheduler
)
zero_opt = accelerator.deepspeed_config["zero_optimization"]
assert zero_opt["reduce_bucket_size"] == (hidden_size * hidden_size)
assert zero_opt["stage3_prefetch_bucket_size"] == (0.9 * hidden_size) * hidden_size
assert zero_opt["stage3_param_persistence_threshold"] == (10 * hidden_size)
self.assertEqual(
accelerator.deepspeed_config["zero_optimization"]["reduce_bucket_size"], hidden_size * hidden_size
)
self.assertEqual(
accelerator.deepspeed_config["zero_optimization"]["stage3_prefetch_bucket_size"],
0.9 * hidden_size * hidden_size,
)
self.assertEqual(
accelerator.deepspeed_config["zero_optimization"]["stage3_param_persistence_threshold"],
10 * hidden_size,
)

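For reference, a minimal sketch of the `DummyOptim`/`DummyScheduler` flow that the `prepare` tests above revolve around; the model, dataloader and config file name are placeholders, and the snippet assumes a DeepSpeed config whose optimizer and scheduler sections are defined (or set to `auto`):

```python
# Minimal sketch (not part of this diff); model, data and config file are placeholders.
import torch
from torch.utils.data import DataLoader, TensorDataset

from accelerate import Accelerator
from accelerate.utils import DeepSpeedPlugin, DummyOptim, DummyScheduler

accelerator = Accelerator(deepspeed_plugin=DeepSpeedPlugin(hf_ds_config="ds_config_zero2.json"))

model = torch.nn.Linear(16, 16)
train_dataloader = DataLoader(TensorDataset(torch.randn(64, 16)), batch_size=16)

# Stand-ins for the optimizer/scheduler sections of the DeepSpeed config file.
optimizer = DummyOptim(model.parameters(), lr=5e-5)
lr_scheduler = DummyScheduler(optimizer, total_num_steps=1000, warmup_num_steps=10)

model, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
    model, optimizer, train_dataloader, lr_scheduler
)
```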
@parameterized.expand([FP16, BF16], name_func=parameterized_custom_name_func)
def test_autofill_dsconfig_from_ds_plugin(self, dtype):
@@ -724,21 +751,25 @@ class DeepSpeedConfigIntegration(AccelerateTestCase):

with mockenv_context(**self.dist_env):
accelerator = Accelerator(deepspeed_plugin=deepspeed_plugin, mixed_precision=dtype)
config = accelerator.state.deepspeed_plugin.deepspeed_config
assert config["gradient_clipping"] == 1.0
assert config["gradient_accumulation_steps"] == 2
assert config["zero_optimization"]["stage"] == 2
assert config["zero_optimization"]["offload_optimizer"]["device"] == "cpu"
assert config["zero_optimization"]["offload_param"]["device"] == "cpu"
assert config["zero_optimization"]["stage3_gather_16bit_weights_on_model_save"]
assert config[dtype]["enabled"]
deepspeed_plugin = accelerator.state.deepspeed_plugin
self.assertEqual(deepspeed_plugin.deepspeed_config["gradient_clipping"], 1.0)
self.assertEqual(deepspeed_plugin.deepspeed_config["gradient_accumulation_steps"], 2)
self.assertEqual(deepspeed_plugin.deepspeed_config["zero_optimization"]["stage"], 2)
self.assertEqual(
deepspeed_plugin.deepspeed_config["zero_optimization"]["offload_optimizer"]["device"], "cpu"
)
self.assertEqual(deepspeed_plugin.deepspeed_config["zero_optimization"]["offload_param"]["device"], "cpu")
self.assertTrue(
deepspeed_plugin.deepspeed_config["zero_optimization"]["stage3_gather_16bit_weights_on_model_save"]
)
self.assertTrue(deepspeed_plugin.deepspeed_config[dtype]["enabled"])

AcceleratorState._reset_state(True)
diff_dtype = "bf16" if dtype == "fp16" else "fp16"
with mockenv_context(**self.dist_env):
with self.assertRaises(ValueError) as cm:
accelerator = Accelerator(deepspeed_plugin=deepspeed_plugin, mixed_precision=diff_dtype)
assert (
self.assertTrue(
f"`--mixed_precision` arg cannot be set to `{diff_dtype}` when `{dtype}` is set in the DeepSpeed config file."
in str(cm.exception)
)
@@ -749,7 +780,7 @@ class DeepSpeedConfigIntegration(AccelerateTestCase):
with mockenv_context(**self.dist_env):
accelerator = Accelerator(deepspeed_plugin=deepspeed_plugin, mixed_precision=dtype)
deepspeed_plugin = accelerator.state.deepspeed_plugin
assert deepspeed_plugin.deepspeed_config["gradient_accumulation_steps"] == 4
self.assertEqual(deepspeed_plugin.deepspeed_config["gradient_accumulation_steps"], 4)

# filling the `auto` gradient_accumulation_steps via Accelerator's value
AcceleratorState._reset_state(True)
@@ -777,7 +808,7 @@ class DeepSpeedConfigIntegration(AccelerateTestCase):
model, dummy_optimizer, train_dataloader, eval_dataloader, dummy_lr_scheduler
)
deepspeed_plugin = accelerator.state.deepspeed_plugin
assert deepspeed_plugin.deepspeed_config["gradient_accumulation_steps"] == 8
self.assertEqual(deepspeed_plugin.deepspeed_config["gradient_accumulation_steps"], 8)

def test_ds_config_assertions(self):
ambiguous_env = self.dist_env.copy()
@@ -798,7 +829,7 @@ class DeepSpeedConfigIntegration(AccelerateTestCase):
zero3_save_16bit_model=True,
)
_ = Accelerator(deepspeed_plugin=deepspeed_plugin, mixed_precision=FP16)
assert (
self.assertTrue(
"If you are using an accelerate config file, remove others config variables mentioned in the above specified list."
in str(cm.exception)
)
@@ -809,10 +840,13 @@ class DeepSpeedConfigIntegration(AccelerateTestCase):
hf_ds_config=self.ds_config_file[stage],
zero3_init_flag=True,
)
assert deepspeed_plugin.zero_stage == int(stage.replace("zero", ""))
self.assertEqual(deepspeed_plugin.zero_stage, int(stage.replace("zero", "")))

def test_basic_run(self):
test_file_path = path_in_accelerate_package("test_utils", "scripts", "external_deps", "test_performance.py")
mod_file = inspect.getfile(accelerate.test_utils)
test_file_path = os.path.sep.join(
mod_file.split(os.path.sep)[:-1] + ["scripts", "external_deps", "test_performance.py"]
)
with tempfile.TemporaryDirectory() as dirpath:
cmd = [
"accelerate",
@@ -839,8 +873,6 @@ class DeepSpeedConfigIntegration(AccelerateTestCase):
@require_multi_device
@slow
class DeepSpeedIntegrationTest(TempDirTestCase):
test_scripts_folder = path_in_accelerate_package("test_utils", "scripts", "external_deps")

def setUp(self):
super().setUp()
self._test_file_path = inspect.getfile(self.__class__)
@@ -867,8 +899,11 @@ class DeepSpeedIntegrationTest(TempDirTestCase):
self.n_train = 160
self.n_val = 160

mod_file = inspect.getfile(accelerate.test_utils)
self.test_scripts_folder = os.path.sep.join(mod_file.split(os.path.sep)[:-1] + ["scripts", "external_deps"])

def test_performance(self):
self.test_file_path = self.test_scripts_folder / "test_performance.py"
self.test_file_path = os.path.join(self.test_scripts_folder, "test_performance.py")
cmd = [
"accelerate",
"launch",
@@ -889,7 +924,7 @@ class DeepSpeedIntegrationTest(TempDirTestCase):
cmd_stage.extend([f"--zero_stage={stage}"])
cmd_stage.extend(["--offload_optimizer_device=none", "--offload_param_device=none"])
if self.zero3_offload_config:
with open(self.ds_config_file[ZERO3], encoding="utf-8") as f:
with io.open(self.ds_config_file[ZERO3], "r", encoding="utf-8") as f:
ds_config = json.load(f)
del ds_config["bf16"]
del ds_config["optimizer"]["params"]["torch_adam"]
@@ -912,7 +947,7 @@ class DeepSpeedIntegrationTest(TempDirTestCase):
execute_subprocess_async(cmd_stage, env=os.environ.copy())

def test_checkpointing(self):
self.test_file_path = self.test_scripts_folder / "test_checkpointing.py"
self.test_file_path = os.path.join(self.test_scripts_folder, "test_checkpointing.py")
cmd = [
"accelerate",
"launch",
@@ -933,7 +968,7 @@ class DeepSpeedIntegrationTest(TempDirTestCase):
cmd_stage.extend([f"--zero_stage={stage}"])
cmd_stage.extend(["--offload_optimizer_device=none", "--offload_param_device=none"])
if self.zero3_offload_config:
with open(self.ds_config_file[ZERO3], encoding="utf-8") as f:
with io.open(self.ds_config_file[ZERO3], "r", encoding="utf-8") as f:
ds_config = json.load(f)
del ds_config["bf16"]
del ds_config["optimizer"]["params"]["torch_adam"]
@@ -966,7 +1001,7 @@ class DeepSpeedIntegrationTest(TempDirTestCase):
execute_subprocess_async(cmd_stage, env=os.environ.copy())

def test_peak_memory_usage(self):
self.test_file_path = self.test_scripts_folder / "test_peak_memory_usage.py"
self.test_file_path = os.path.join(self.test_scripts_folder, "test_peak_memory_usage.py")
cmd = [
"accelerate",
"launch",
@@ -992,8 +1027,8 @@ class DeepSpeedIntegrationTest(TempDirTestCase):
]
)
for i in range(3):
if f"stage_{i + 1}" in spec:
cmd_stage.extend([f"--zero_stage={i + 1}"])
if f"stage_{i+1}" in spec:
cmd_stage.extend([f"--zero_stage={i+1}"])
break
cmd_stage.extend(
[
@@ -1004,7 +1039,7 @@ class DeepSpeedIntegrationTest(TempDirTestCase):
]
)
if "cpu_offload" in spec:
with open(self.ds_config_file[ZERO3], encoding="utf-8") as f:
with io.open(self.ds_config_file[ZERO3], "r", encoding="utf-8") as f:
ds_config = json.load(f)
del ds_config["bf16"]
del ds_config["fp16"]
@@ -1029,7 +1064,7 @@ class DeepSpeedIntegrationTest(TempDirTestCase):
execute_subprocess_async(cmd_stage, env=os.environ.copy())

def test_lr_scheduler(self):
self.test_file_path = self.test_scripts_folder / "test_performance.py"
self.test_file_path = os.path.join(self.test_scripts_folder, "test_performance.py")
cmd = [
"accelerate",
"launch",

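For reference, a minimal sketch of the kind of launcher command the DeepSpeed integration tests above assemble and execute; the script name and flag values are illustrative:

```python
# Minimal sketch (not part of this diff); script name and flag values are illustrative.
import os
import subprocess

cmd = [
    "accelerate",
    "launch",
    "--num_processes=2",
    "--num_machines=1",
    "--machine_rank=0",
    "--use_deepspeed",
    "--zero_stage=2",
    "--offload_optimizer_device=none",
    "--offload_param_device=none",
    "test_performance.py",
]
subprocess.run(cmd, env=os.environ.copy(), check=True)
```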
@@ -13,6 +13,7 @@
# limitations under the License.


import inspect
import os

import torch
@@ -20,18 +21,16 @@ from transformers import AutoModel
from transformers.testing_utils import mockenv_context
from transformers.trainer_utils import set_seed

import accelerate
from accelerate.accelerator import Accelerator
from accelerate.state import AcceleratorState
from accelerate.test_utils.testing import (
AccelerateTestCase,
TempDirTestCase,
execute_subprocess_async,
get_launch_command,
path_in_accelerate_package,
require_fsdp,
require_multi_device,
require_non_cpu,
require_non_torch_xla,
slow,
)
from accelerate.utils.constants import (
@@ -54,7 +53,6 @@ dtypes = [FP16, BF16]

@require_fsdp
@require_non_cpu
@require_non_torch_xla
class FSDPPluginIntegration(AccelerateTestCase):
def setUp(self):
super().setUp()
@@ -77,7 +75,7 @@ class FSDPPluginIntegration(AccelerateTestCase):
env["FSDP_SHARDING_STRATEGY"] = f"{i + 1}"
with mockenv_context(**env):
fsdp_plugin = FullyShardedDataParallelPlugin()
assert fsdp_plugin.sharding_strategy == ShardingStrategy(i + 1)
self.assertEqual(fsdp_plugin.sharding_strategy, ShardingStrategy(i + 1))

# check that giving names works fine
for i, strategy in enumerate(FSDP_SHARDING_STRATEGY):
@@ -85,7 +83,7 @@ class FSDPPluginIntegration(AccelerateTestCase):
env["FSDP_SHARDING_STRATEGY"] = strategy
with mockenv_context(**env):
fsdp_plugin = FullyShardedDataParallelPlugin()
assert fsdp_plugin.sharding_strategy == ShardingStrategy(i + 1)
self.assertEqual(fsdp_plugin.sharding_strategy, ShardingStrategy(i + 1))

def test_backward_prefetch(self):
from torch.distributed.fsdp.fully_sharded_data_parallel import BackwardPrefetch
@@ -96,9 +94,9 @@ class FSDPPluginIntegration(AccelerateTestCase):
with mockenv_context(**env):
fsdp_plugin = FullyShardedDataParallelPlugin()
if prefetch_policy == "NO_PREFETCH":
assert fsdp_plugin.backward_prefetch is None
self.assertIsNone(fsdp_plugin.backward_prefetch)
else:
assert fsdp_plugin.backward_prefetch == BackwardPrefetch(i + 1)
self.assertEqual(fsdp_plugin.backward_prefetch, BackwardPrefetch(i + 1))

def test_state_dict_type(self):
from torch.distributed.fsdp.fully_sharded_data_parallel import StateDictType
@@ -108,10 +106,10 @@ class FSDPPluginIntegration(AccelerateTestCase):
env["FSDP_STATE_DICT_TYPE"] = state_dict_type
with mockenv_context(**env):
fsdp_plugin = FullyShardedDataParallelPlugin()
assert fsdp_plugin.state_dict_type == StateDictType(i + 1)
self.assertEqual(fsdp_plugin.state_dict_type, StateDictType(i + 1))
if state_dict_type == "FULL_STATE_DICT":
assert fsdp_plugin.state_dict_config.offload_to_cpu
assert fsdp_plugin.state_dict_config.rank0_only
self.assertTrue(fsdp_plugin.state_dict_config.offload_to_cpu)
self.assertTrue(fsdp_plugin.state_dict_config.rank0_only)

def test_auto_wrap_policy(self):
model = AutoModel.from_pretrained(BERT_BASE_CASED)
@@ -126,9 +124,9 @@ class FSDPPluginIntegration(AccelerateTestCase):
fsdp_plugin = FullyShardedDataParallelPlugin()
fsdp_plugin.set_auto_wrap_policy(model)
if policy == "NO_WRAP":
assert fsdp_plugin.auto_wrap_policy is None
self.assertIsNone(fsdp_plugin.auto_wrap_policy)
else:
assert fsdp_plugin.auto_wrap_policy is not None
self.assertIsNotNone(fsdp_plugin.auto_wrap_policy)

env = self.dist_env.copy()
env["FSDP_AUTO_WRAP_POLICY"] = "TRANSFORMER_BASED_WRAP"
@@ -137,7 +135,7 @@ class FSDPPluginIntegration(AccelerateTestCase):
fsdp_plugin = FullyShardedDataParallelPlugin()
with self.assertRaises(Exception) as cm:
fsdp_plugin.set_auto_wrap_policy(model)
assert "Could not find the transformer layer class to wrap in the model." in str(cm.exception)
self.assertTrue("Could not find the transformer layer class to wrap in the model." in str(cm.exception))

env = self.dist_env.copy()
env["FSDP_AUTO_WRAP_POLICY"] = "SIZE_BASED_WRAP"
@@ -145,7 +143,7 @@ class FSDPPluginIntegration(AccelerateTestCase):
with mockenv_context(**env):
fsdp_plugin = FullyShardedDataParallelPlugin()
fsdp_plugin.set_auto_wrap_policy(model)
assert fsdp_plugin.auto_wrap_policy is None
self.assertIsNone(fsdp_plugin.auto_wrap_policy)

def test_mixed_precision(self):
from torch.distributed.fsdp.fully_sharded_data_parallel import MixedPrecision
@@ -161,11 +159,11 @@ class FSDPPluginIntegration(AccelerateTestCase):
elif mp_dtype == "bf16":
dtype = torch.bfloat16
mp_policy = MixedPrecision(param_dtype=dtype, reduce_dtype=dtype, buffer_dtype=dtype)
assert accelerator.state.fsdp_plugin.mixed_precision_policy == mp_policy
self.assertEqual(accelerator.state.fsdp_plugin.mixed_precision_policy, mp_policy)
if mp_dtype == FP16:
assert isinstance(accelerator.scaler, ShardedGradScaler)
self.assertTrue(isinstance(accelerator.scaler, ShardedGradScaler))
elif mp_dtype == BF16:
assert accelerator.scaler is None
self.assertIsNone(accelerator.scaler)
AcceleratorState._reset_state(True)

def test_cpu_offload(self):
@@ -176,17 +174,13 @@ class FSDPPluginIntegration(AccelerateTestCase):
env["FSDP_OFFLOAD_PARAMS"] = str(flag).lower()
with mockenv_context(**env):
fsdp_plugin = FullyShardedDataParallelPlugin()
assert fsdp_plugin.cpu_offload == CPUOffload(offload_params=flag)
self.assertEqual(fsdp_plugin.cpu_offload, CPUOffload(offload_params=flag))


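For reference, a minimal sketch of the environment-variable driven `FullyShardedDataParallelPlugin` configuration these tests rely on; the chosen values are illustrative:

```python
# Minimal sketch (not part of this diff); values are illustrative.
import os

from accelerate.utils import FullyShardedDataParallelPlugin

os.environ["FSDP_SHARDING_STRATEGY"] = "FULL_SHARD"      # or "1"
os.environ["FSDP_BACKWARD_PREFETCH"] = "BACKWARD_PRE"    # or "NO_PREFETCH"
os.environ["FSDP_STATE_DICT_TYPE"] = "FULL_STATE_DICT"
os.environ["FSDP_OFFLOAD_PARAMS"] = "false"

fsdp_plugin = FullyShardedDataParallelPlugin()
print(fsdp_plugin.sharding_strategy, fsdp_plugin.state_dict_type)
```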
# Skip this test when TorchXLA is available because accelerate.launch does not support TorchXLA FSDP.
@require_non_torch_xla
@require_fsdp
@require_multi_device
@slow
class FSDPIntegrationTest(TempDirTestCase):
test_scripts_folder = path_in_accelerate_package("test_utils", "scripts", "external_deps")

def setUp(self):
super().setUp()
self.performance_lower_bound = 0.82
@@ -205,9 +199,12 @@ class FSDPIntegrationTest(TempDirTestCase):
self.n_train = 160
self.n_val = 160

mod_file = inspect.getfile(accelerate.test_utils)
self.test_scripts_folder = os.path.sep.join(mod_file.split(os.path.sep)[:-1] + ["scripts", "external_deps"])

def test_performance(self):
self.test_file_path = self.test_scripts_folder / "test_performance.py"
cmd = get_launch_command(num_processes=2, num_machines=1, machine_rank=0, use_fsdp=True)
self.test_file_path = os.path.join(self.test_scripts_folder, "test_performance.py")
cmd = ["accelerate", "launch", "--num_processes=2", "--num_machines=1", "--machine_rank=0", "--use_fsdp"]
for config in self.performance_configs:
cmd_config = cmd.copy()
for i, strategy in enumerate(FSDP_SHARDING_STRATEGY):
@@ -244,15 +241,17 @@ class FSDPIntegrationTest(TempDirTestCase):
execute_subprocess_async(cmd_config, env=os.environ.copy())

def test_checkpointing(self):
self.test_file_path = self.test_scripts_folder / "test_checkpointing.py"
cmd = get_launch_command(
num_processes=2,
num_machines=1,
machine_rank=0,
use_fsdp=True,
mixed_precision="fp16",
fsdp_transformer_layer_cls_to_wrap="BertLayer",
)
self.test_file_path = os.path.join(self.test_scripts_folder, "test_checkpointing.py")
cmd = [
"accelerate",
"launch",
"--num_processes=2",
"--num_machines=1",
"--machine_rank=0",
"--use_fsdp",
"--mixed_precision=fp16",
"--fsdp_transformer_layer_cls_to_wrap=BertLayer",
]

for i, strategy in enumerate(FSDP_SHARDING_STRATEGY):
cmd_config = cmd.copy()
@@ -289,8 +288,14 @@ class FSDPIntegrationTest(TempDirTestCase):
execute_subprocess_async(cmd_config, env=os.environ.copy())

def test_peak_memory_usage(self):
self.test_file_path = self.test_scripts_folder / "test_peak_memory_usage.py"
cmd = get_launch_command(num_processes=2, num_machines=1, machine_rank=0)
self.test_file_path = os.path.join(self.test_scripts_folder, "test_peak_memory_usage.py")
cmd = [
"accelerate",
"launch",
"--num_processes=2",
"--num_machines=1",
"--machine_rank=0",
]
for spec, peak_mem_upper_bound in self.peak_memory_usage_upper_bound.items():
cmd_config = cmd.copy()
if "fp16" in spec:
Some files were not shown because too many files have changed in this diff.