mirror of https://github.com/huggingface/accelerate.git
synced 2025-11-13 21:59:16 +08:00
Compare commits
8 Commits
context-pa...v0.29.3
| Author | SHA1 | Date |
|---|---|---|
| | e82de1215a | |
| | 02f6abcfd2 | |
| | fa0bd4005c | |
| | 39e0a8ef59 | |
| | 759a9336ce | |
| | 210778370e | |
| | 12eed81eb8 | |
| | ec88c8f54a | |
setup.py
```diff
@@ -47,7 +47,7 @@ extras["sagemaker"] = [
 
 setup(
     name="accelerate",
-    version="0.29.0.dev",
+    version="0.29.3",
     description="Accelerate",
     long_description=open("README.md", encoding="utf-8").read(),
     long_description_content_type="text/markdown",
```
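The release itself is just this version bump, mirrored in `src/accelerate/__init__.py` below. A minimal post-install check (a sketch, assuming `pip install accelerate==0.29.3` has been run):

```python
# Confirm the installed wheel matches the tag; `__version__` comes from
# the companion bump in src/accelerate/__init__.py.
import accelerate

assert accelerate.__version__ == "0.29.3", accelerate.__version__
```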
src/accelerate/__init__.py

```diff
@@ -11,7 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-__version__ = "0.29.0.dev0"
+__version__ = "0.29.3"
 
 from .accelerator import Accelerator
 from .big_modeling import (
```
src/accelerate/big_modeling.py

```diff
@@ -508,6 +508,7 @@ def load_checkpoint_and_dispatch(
     skip_keys: Optional[Union[str, List[str]]] = None,
     preload_module_classes: Optional[List[str]] = None,
     force_hooks: bool = False,
+    strict: bool = False,
 ):
     """
     Loads a (potentially sharded) checkpoint inside a model, potentially sending weights to a given device as they are
```
```diff
@@ -554,6 +555,9 @@ def load_checkpoint_and_dispatch(
         force_hooks (`bool`, *optional*, defaults to `False`):
             Whether or not to force device hooks to be attached to the model even if all layers are dispatched to a
             single device.
+        strict (`bool`, *optional*, defaults to `False`):
+            Whether to strictly enforce that the keys in the checkpoint state_dict match the keys of the model's
+            state_dict.
 
     Example:
 
```
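The documented `strict` flag is forwarded to the actual loading call in the next hunk. A hedged sketch of opting in; the model and checkpoint path below are placeholders, not part of the patch:

```python
# Hypothetical usage of the new `strict` flag: with strict=True, checkpoint
# keys that don't match the model's state_dict raise instead of being
# silently skipped. "my_checkpoint/" is a placeholder path.
import torch
from accelerate import init_empty_weights, load_checkpoint_and_dispatch

with init_empty_weights():
    model = torch.nn.Sequential(torch.nn.Linear(4, 4), torch.nn.Linear(4, 2))

model = load_checkpoint_and_dispatch(
    model,
    checkpoint="my_checkpoint/",  # placeholder
    device_map="auto",
    strict=True,  # new in this release
)
```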
```diff
@@ -608,6 +612,7 @@ def load_checkpoint_and_dispatch(
         dtype=dtype,
         offload_state_dict=offload_state_dict,
         offload_buffers=offload_buffers,
+        strict=strict,
     )
     if device_map is None:
         return model
```
src/accelerate/state.py

```diff
@@ -179,22 +179,14 @@ class PartialState:
             )
 
         # Sets up self.backend + imports
-        backend, distributed_type = self._prepare_backend(cpu, use_sagemaker_dp, kwargs.pop("backend", None))
+        original_backend = kwargs.pop("backend", None)
+        backend, distributed_type = self._prepare_backend(cpu, use_sagemaker_dp, original_backend)
+        if original_backend is not None and backend != original_backend:
+            raise ValueError(f"Your assigned backend {original_backend} is not available, please use {backend}")
         self.backend = backend
         self.distributed_type = distributed_type
         use_deepspeed = False
-        if not cpu:
-            # Deal with XLA
-            if is_torch_xla_available():
-                self.device = xm.xla_device()
-                xm.set_replication(self.device, xm.get_xla_supported_devices())
-                self.num_processes = xm.xrt_world_size()
-                self.process_index = xm.get_ordinal()
-                if is_torch_xla_available(check_is_tpu=True):
-                    self.local_process_index = xm.get_local_ordinal()
-                else:
-                    self.local_process_index = int(os.environ.get("LOCAL_RANK", -1))
-                self.distributed_type = DistributedType.XLA
         if not cpu and self.backend != "xla":
             if int(os.environ.get("LOCAL_RANK", -1)) != -1:
                 # Deal with spawning deepspeed
                 if os.environ.get("ACCELERATE_USE_DEEPSPEED", "false") == "true":
```
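Besides moving the XLA setup out of this early block (it reappears below as an `elif` on the resolved backend), the new `ValueError` makes an unsatisfiable backend request fail loudly instead of being silently replaced. A hedged sketch of the behavior this guards; passing `backend` as a keyword is inferred from the `kwargs.pop("backend", None)` above:

```python
# Hypothetical repro on a machine where "nccl" cannot be resolved: the
# requested backend no longer gets silently swapped for another one.
from accelerate import PartialState

try:
    state = PartialState(backend="nccl")
except ValueError as err:
    print(err)  # Your assigned backend nccl is not available, please use ...
```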
```diff
@@ -204,7 +196,7 @@ class PartialState:
                     )
                 from deepspeed import comm as dist
 
-                if is_xpu_available and is_ccl_available():
+                if is_xpu_available() and is_ccl_available():
                     os.environ["CCL_PROCESS_LAUNCHER"] = "none"
                     os.environ["CCL_LOCAL_SIZE"] = os.environ.get("LOCAL_WORLD_SIZE", "1")
                     os.environ["CCL_LOCAL_RANK"] = os.environ.get("LOCAL_RANK", "0")
```
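The one-character fix above deserves a note: the old code tested the function object itself, which is always truthy, so the CCL environment variables were set whenever `is_ccl_available()` passed, XPU present or not. In isolation:

```python
# A bare function reference is always truthy; only calling it returns the
# actual availability flag.
def is_xpu_available() -> bool:
    return False

print(bool(is_xpu_available))    # True  -- the old, buggy check
print(bool(is_xpu_available()))  # False -- the fixed check
```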
```diff
@@ -270,6 +262,16 @@ class PartialState:
             self.num_processes = 1
             self.process_index = 0
             self.local_process_index = 0
+        elif self.backend == "xla":
+            # XLA needs device setting first for `set_replication`
+            self.set_device()
+            xm.set_replication(self.device, xm.get_xla_supported_devices())
+            self.num_processes = xm.xrt_world_size()
+            self.process_index = xm.get_ordinal()
+            if is_torch_xla_available(check_is_tpu=True):
+                self.local_process_index = xm.get_local_ordinal()
+            else:
+                self.local_process_index = int(os.environ.get("LOCAL_RANK", -1))
         else:
             self.num_processes = torch.distributed.get_world_size()
             self.process_index = torch.distributed.get_rank()
```
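The new comment records the ordering constraint this branch relies on: `xm.set_replication` needs a concrete device, so `set_device()` runs first. The same sequence in isolation (a sketch, assuming a working `torch_xla` install):

```python
# Minimal sketch of the call order the new elif branch depends on.
import torch_xla.core.xla_model as xm

device = xm.xla_device()  # resolve the device first...
xm.set_replication(device, xm.get_xla_supported_devices())  # ...then replicate
print(xm.xrt_world_size(), xm.get_ordinal())
```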
```diff
@@ -284,16 +286,17 @@ class PartialState:
         # Set CPU affinity if enabled
         if parse_flag_from_env("ACCELERATE_CPU_AFFINITY", False):
             set_numa_affinity(self.local_process_index)
-        self.fork_launched = parse_flag_from_env("FORK_LAUNCHED", 0)
-
-        # Check for old RTX 4000's that can't use P2P or IB and are on old drivers
-        if self.device.type == "cuda" and not check_cuda_p2p_ib_support():
-            if "NCCL_P2P_DISABLE" not in os.environ or "NCCL_IB_DISABLE" not in os.environ:
-                raise NotImplementedError(
-                    "Using RTX 4000 series doesn't support faster communication broadband via P2P or IB. "
-                    'Please set `NCCL_P2P_DISABLE="1"` and `NCCL_IB_DISABLE="1"` or use `accelerate launch` which '
-                    "will do this automatically."
-                )
+
+        # Check for old RTX 4000's that can't use P2P or IB and are on old drivers
+        if self.device.type == "cuda" and not check_cuda_p2p_ib_support():
+            if "NCCL_P2P_DISABLE" not in os.environ or "NCCL_IB_DISABLE" not in os.environ:
+                raise NotImplementedError(
+                    "Using RTX 4000 series doesn't support faster communication broadband via P2P or IB. "
+                    'Please set `NCCL_P2P_DISABLE="1"` and `NCCL_IB_DISABLE="1"` or use `accelerate launch` which '
+                    "will do this automatically."
+                )
+        # Important: This should be the *only* code outside of `self.initialized!`
+        self.fork_launched = parse_flag_from_env("FORK_LAUNCHED", 0)
 
     def __repr__(self) -> str:
         return (
```
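The RTX 4000 check itself is unchanged; the block only moves so that `fork_launched` is assigned last, as the new comment demands. For reference, the workaround the error message asks for, set before any process-group initialization (`accelerate launch` does this automatically):

```python
# Disable peer-to-peer and InfiniBand NCCL transports on affected GPUs.
import os

os.environ["NCCL_P2P_DISABLE"] = "1"
os.environ["NCCL_IB_DISABLE"] = "1"
```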
```diff
@@ -715,19 +718,22 @@ class PartialState:
 
             backend = "smddp"
             distributed_type = DistributedType.MULTI_GPU
-        elif int(os.environ.get("LOCAL_RANK", -1)) != -1:
-            if not cpu:
-                if is_mlu_available():
-                    backend = "cncl"
-                    distributed_type = DistributedType.MULTI_MLU
-                elif torch.cuda.is_available():
-                    if backend is None:
-                        backend = "nccl"
-                    distributed_type = DistributedType.MULTI_GPU
-                elif is_npu_available():
-                    backend = "hccl"
-                    distributed_type = DistributedType.MULTI_NPU
-        if backend is None and (
+        elif is_torch_xla_available():
+            backend = "xla"
+            distributed_type = DistributedType.XLA
+        elif int(os.environ.get("LOCAL_RANK", -1)) != -1 and not cpu:
+            if is_mlu_available():
+                backend = "cncl"
+                distributed_type = DistributedType.MULTI_MLU
+            elif torch.cuda.is_available():
+                if backend is None:
+                    backend = "nccl"
+                distributed_type = DistributedType.MULTI_GPU
+            elif is_npu_available():
+                backend = "hccl"
+                distributed_type = DistributedType.MULTI_NPU
+
+        if distributed_type is None and (
             int(os.environ.get("LOCAL_RANK", -1)) != -1
             or get_int_from_env(["PMI_SIZE", "OMPI_COMM_WORLD_SIZE", "MV2_COMM_WORLD_SIZE", "WORLD_SIZE"], 1) > 1
         ):
```
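The reordered chain gives XLA priority over the generic `LOCAL_RANK` path, and the final guard now keys off `distributed_type` rather than `backend`, so an explicitly requested backend no longer skips the CPU branch. A toy sketch of the new priority (not the library function):

```python
# Toy model of the revised dispatch order in _prepare_backend.
def pick_backend(sagemaker_dp: bool, xla_available: bool, local_rank: int, cpu: bool):
    if sagemaker_dp:
        return "smddp"
    if xla_available:
        return "xla"
    if local_rank != -1 and not cpu:
        return "nccl"  # or "cncl"/"hccl" depending on the accelerator
    return None

print(pick_backend(False, True, 0, False))   # 'xla', even with LOCAL_RANK set
print(pick_backend(False, False, 0, False))  # 'nccl'
```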
```diff
@@ -735,8 +741,11 @@ class PartialState:
                 distributed_type = DistributedType.MULTI_XPU
             else:
                 distributed_type = DistributedType.MULTI_CPU
-            if is_ccl_available() and (
-                get_int_from_env(["CCL_WORKER_COUNT"], 0) > 0 or distributed_type == DistributedType.MULTI_XPU
+
+            if (
+                backend in (None, "ccl")
+                and is_ccl_available()
+                and (get_int_from_env(["CCL_WORKER_COUNT"], 0) > 0 or distributed_type == DistributedType.MULTI_XPU)
             ):
                 if get_ccl_version() >= "1.12":
                     import oneccl_bindings_for_pytorch  # noqa: F401
```
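With the widened guard, CCL is only auto-selected when the caller asked for it or expressed no preference; previously an importable oneCCL would win unconditionally. The predicate, extracted for illustration:

```python
# The tightened selection predicate from the hunk above, in isolation.
def should_use_ccl(requested, ccl_available, ccl_workers, multi_xpu):
    return (
        requested in (None, "ccl")
        and ccl_available
        and (ccl_workers > 0 or multi_xpu)
    )

print(should_use_ccl("gloo", True, 2, False))  # False: explicit request wins
print(should_use_ccl(None, True, 2, False))    # True: auto-selection
```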
```diff
@@ -744,12 +753,13 @@ class PartialState:
                     import torch_ccl  # noqa: F401
 
                 backend = "ccl"
-            elif torch.distributed.is_mpi_available():
+            elif backend in (None, "mpi") and torch.distributed.is_mpi_available():
                 backend = "mpi"
             else:
                 backend = "gloo"
+        if distributed_type is None:
+            distributed_type = DistributedType.NO
 
         return backend, distributed_type
 
     def set_device(self):
```
```diff
@@ -758,17 +768,20 @@ class PartialState:
         """
         if self.device is not None:
             return
-        if self.num_processes == 1:
+        if self.distributed_type == DistributedType.NO:
             self.device = torch.device("cpu") if self._cpu else self.default_device
             return
         device = str(self.distributed_type).split(".")[-1].replace("MULTI_", "").lower()
-        if device not in ("cpu", "gpu", "mlu", "npu", "xpu"):
+        if device not in ("cpu", "gpu", "mlu", "npu", "xpu", "xla"):
             raise ValueError(
                 f"Can't set device for {self.distributed_type} ({device}), verify we should be calling `_set_device()` for it!"
             )
-        if device == "gpu":
-            device = "cuda"
-        self.device = torch.device(device, self.local_process_index)
+        if device == "xla":
+            self.device = xm.xla_device()
+        else:
+            if device == "gpu":
+                device = "cuda"
+            self.device = torch.device(device, self.local_process_index)
         if self.device is not None:
             if device == "xpu":
                 torch.xpu.set_device(self.device)
```
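`set_device` now accepts "xla" and routes it to `xm.xla_device()`, since XLA devices are not indexed like `torch.device(type, index)`. The name derivation the method performs, on its own:

```python
# How set_device derives a device string from a DistributedType name.
for name in ("MULTI_GPU", "MULTI_XPU", "XLA"):
    device = name.replace("MULTI_", "").lower()
    if device == "gpu":
        device = "cuda"  # "gpu" is normalized to the torch device type
    print(name, "->", device)  # MULTI_GPU -> cuda, MULTI_XPU -> xpu, XLA -> xla
```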
src/accelerate/test_utils/scripts/test_script.py

```diff
@@ -22,7 +22,6 @@ from copy import deepcopy
 from pathlib import Path
 
 import numpy as np
-import pytest
 import torch
 from torch.utils.data import DataLoader, Dataset
 
```
```diff
@@ -711,6 +710,8 @@ def test_trigger():
 
 
 def test_reinstantiated_state():
+    import pytest
+
     AcceleratorState._reset_state()
     simple_model = torch.nn.Linear(1, 1)
     # First define an accelerator
```
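The module-level `import pytest` removed in the previous hunk reappears here inside the test body, presumably keeping the script importable where pytest is not installed. The visible setup as a standalone sketch (the test's assertions are truncated in this view and are not reproduced):

```python
# Setup mirrored from the hunk above; the actual assertions are not shown.
import torch
from accelerate import Accelerator
from accelerate.state import AcceleratorState

AcceleratorState._reset_state()
simple_model = torch.nn.Linear(1, 1)
accelerator = Accelerator()  # "First define an accelerator"
```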