XPU enablement for remaining cases (#3654)

* 1. Enable XPU for the notebook launcher; 2. expand CUDA-only DeepSpeed unit tests to XPU; 3. expand the profiler example to XPU

Signed-off-by: YAO Matrix <matrix.yao@intel.com>

* fix style

Signed-off-by: YAO Matrix <matrix.yao@intel.com>

* rename

Signed-off-by: YAO Matrix <matrix.yao@intel.com>

* Update profiler.py

* Apply style fixes

---------

Signed-off-by: YAO Matrix <matrix.yao@intel.com>
Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
Yao Matrix authored on 2025-07-08 00:10:53 +08:00, committed by GitHub
parent 07ce74868c
commit 1ac8643df7
3 changed files with 30 additions and 26 deletions
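
The thread running through all three changes is replacing hard-coded CUDA assumptions with whatever device `Accelerator` resolves to at runtime. A minimal sketch of that idea, separate from the commit itself (the printed values depend on the hardware present):

```python
# Illustrative only: Accelerator picks whichever backend is present, so the same
# script reports "cuda" on an NVIDIA GPU, "xpu" on an Intel XPU, or "cpu".
from accelerate import Accelerator

accelerator = Accelerator()
print(f"device: {accelerator.device}")            # e.g. cuda:0, xpu:0, or cpu
print(f"device type: {accelerator.device.type}")  # feeds keys like f"self_{...}_time_total"
```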

@@ -31,8 +31,8 @@ from accelerate.utils import ProfileKwargs
 #
 # This example trains a Bert base model on GLUE MRPC
 # in any of the following settings (with the same script):
-#   - single CPU or single GPU
-#   - multi GPUS (using PyTorch distributed mode)
+#   - single CPU or single device (CUDA GPU, Intel XPU etc.)
+#   - multi devices (using PyTorch distributed mode)
 #   - (multi) TPUs
 #   - fp16 (mixed-precision) or fp32 (normal precision)
 #
@@ -183,7 +183,8 @@ def training_function(config, args):
     # New Code #
     accelerator.print(
         prof.key_averages().table(
-            sort_by="self_cpu_time_total" if args.cpu else "self_cuda_time_total", row_limit=-1
+            sort_by="self_cpu_time_total" if args.cpu else f"self_{accelerator.device.type}_time_total",
+            row_limit=-1,
         )
     )
@@ -215,7 +216,7 @@ def main():
         choices=["no", "fp16", "bf16", "fp8"],
         help="Whether to use mixed precision. Choose"
         "between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >= 1.10."
-        "and an Nvidia Ampere GPU.",
+        "and an Nvidia Ampere GPU or an Intel XPU.",
     )
     # New Code #
     parser.add_argument(
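
For reference, a standalone sketch of the device-agnostic sort key introduced above, written against plain `torch.profiler` rather than the example script; it assumes a PyTorch build recent enough to expose `torch.accelerator` and `ProfilerActivity.XPU`:

```python
import torch
from torch.profiler import ProfilerActivity, profile

# Resolve the active accelerator, falling back to CPU-only profiling.
device = torch.device("cpu")
activities = [ProfilerActivity.CPU]
if hasattr(torch, "accelerator") and torch.accelerator.is_available():
    device = torch.accelerator.current_accelerator()
    if device.type == "cuda":
        activities.append(ProfilerActivity.CUDA)
    elif device.type == "xpu":
        activities.append(ProfilerActivity.XPU)

x = torch.randn(512, 512, device=device)
with profile(activities=activities) as prof:
    (x @ x).sum()

# Same pattern as the diff: "self_cuda_time_total" on NVIDIA, "self_xpu_time_total"
# on Intel XPU, CPU time otherwise.
sort_key = "self_cpu_time_total" if device.type == "cpu" else f"self_{device.type}_time_total"
print(prof.key_averages().table(sort_by=sort_key, row_limit=10))
```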

@@ -60,8 +60,8 @@ def notebook_launcher(
     <Tip warning={true}>

-    To use this function absolutely zero calls to a CUDA device must be made in the notebook session before calling. If
-    any have been made, you will need to restart the notebook and make sure no cells use any CUDA capability.
+    To use this function absolutely zero calls to a device must be made in the notebook session before calling. If any
+    have been made, you will need to restart the notebook and make sure no cells use any device capability.

     Setting `ACCELERATE_DEBUG_MODE="1"` in your environment will run a test before truly launching to ensure that none
     of those calls have been made.
@@ -76,11 +76,11 @@ def notebook_launcher(
             Tuple of arguments to pass to the function (it will receive `*args`).
         num_processes (`int`, *optional*):
             The number of processes to use for training. Will default to 8 in Colab/Kaggle if a TPU is available, to
-            the number of GPUs available otherwise.
+            the number of devices available otherwise.
         mixed_precision (`str`, *optional*, defaults to `"no"`):
-            If `fp16` or `bf16`, will use mixed precision training on multi-GPU.
+            If `fp16` or `bf16`, will use mixed precision training on multi-device.
         use_port (`str`, *optional*, defaults to `"29500"`):
-            The port to use to communicate between processes when launching a multi-GPU training.
+            The port to use to communicate between processes when launching a multi-device training.
         master_addr (`str`, *optional*, defaults to `"127.0.0.1"`):
             The address to use for communication between processes.
         node_rank (`int`, *optional*, defaults to 0):
@@ -105,7 +105,7 @@ def notebook_launcher(
     Example:

     ```python
-    # Assume this is defined in a Jupyter Notebook on an instance with two GPUs
+    # Assume this is defined in a Jupyter Notebook on an instance with two devices

     from accelerate import notebook_launcher
@@ -158,27 +158,27 @@ def notebook_launcher(
     else:
         if num_processes is None:
             raise ValueError(
-                "You have to specify the number of GPUs you would like to use, add `num_processes=...` to your call."
+                "You have to specify the number of devices you would like to use, add `num_processes=...` to your call."
             )
         if node_rank >= num_nodes:
             raise ValueError("The node_rank must be less than the number of nodes.")
         if num_processes > 1:
-            # Multi-GPU launch
+            # Multi-device launch
             from torch.distributed.launcher.api import LaunchConfig, elastic_launch
             from torch.multiprocessing import start_processes
             from torch.multiprocessing.spawn import ProcessRaisedException

             if len(AcceleratorState._shared_state) > 0:
                 raise ValueError(
-                    "To launch a multi-GPU training from your notebook, the `Accelerator` should only be initialized "
+                    "To launch a multi-device training from your notebook, the `Accelerator` should only be initialized "
                     "inside your training function. Restart your notebook and make sure no cells initializes an "
                     "`Accelerator`."
                 )
-            # Check for specific libraries known to initialize CUDA that users constantly use
+            # Check for specific libraries known to initialize device that users constantly use
             problematic_imports = are_libraries_initialized("bitsandbytes")
             if len(problematic_imports) > 0:
                 err = (
-                    "Could not start distributed process. Libraries known to initialize CUDA upon import have been "
+                    "Could not start distributed process. Libraries known to initialize device upon import have been "
                     "imported already. Please keep these imports inside your training function to try and help with this:"
                 )
                 for lib_name in problematic_imports:
@@ -203,24 +203,26 @@ def notebook_launcher(
             # process here (the other ones will be set be the launcher).
             with patch_environment(**patched_env):
                 # First dummy launch
+                device_type = torch.accelerator.current_accelerator().type if hasattr(torch, "accelerator") else "cuda"
+                distributed_type = "MULTI_XPU" if device_type == "xpu" else "MULTI_GPU"
                 if os.environ.get("ACCELERATE_DEBUG_MODE", "false").lower() == "true":
-                    launcher = PrepareForLaunch(test_launch, distributed_type="MULTI_GPU")
+                    launcher = PrepareForLaunch(test_launch, distributed_type=distributed_type)
                     try:
                         start_processes(launcher, args=(), nprocs=num_processes, start_method="fork")
                     except ProcessRaisedException as e:
                         err = "An issue was found when verifying a stable environment for the notebook launcher."
-                        if "Cannot re-initialize CUDA in forked subprocess" in e.args[0]:
+                        if f"Cannot re-initialize {device_type.upper()} in forked subprocess" in e.args[0]:
                             raise RuntimeError(
                                 f"{err}"
                                 "This likely stems from an outside import causing issues once the `notebook_launcher()` is called. "
                                 "Please review your imports and test them when running the `notebook_launcher()` to identify "
-                                "which one is problematic and causing CUDA to be initialized."
+                                f"which one is problematic and causing {device_type.upper()} to be initialized."
                             ) from e
                         else:
                             raise RuntimeError(f"{err} The following error was raised: {e}") from e
                 # Now the actual launch
-                launcher = PrepareForLaunch(function, distributed_type="MULTI_GPU")
-                print(f"Launching training on {num_processes} GPUs.")
+                launcher = PrepareForLaunch(function, distributed_type=distributed_type)
+                print(f"Launching training on {num_processes} {device_type.upper()}s.")
                 try:
                     if rdzv_conf is None:
                         rdzv_conf = {}
@@ -244,23 +246,25 @@ def notebook_launcher(
                         launch_config_kwargs["log_line_prefix_template"] = log_line_prefix_template
                     elastic_launch(config=LaunchConfig(**launch_config_kwargs), entrypoint=function)(*args)
                 except ProcessRaisedException as e:
-                    if "Cannot re-initialize CUDA in forked subprocess" in e.args[0]:
+                    if f"Cannot re-initialize {device_type.upper()} in forked subprocess" in e.args[0]:
                         raise RuntimeError(
-                            "CUDA has been initialized before the `notebook_launcher` could create a forked subprocess. "
+                            f"{device_type.upper()} has been initialized before the `notebook_launcher` could create a forked subprocess. "
                             "This likely stems from an outside import causing issues once the `notebook_launcher()` is called. "
                             "Please review your imports and test them when running the `notebook_launcher()` to identify "
-                            "which one is problematic and causing CUDA to be initialized."
+                            f"which one is problematic and causing {device_type.upper()} to be initialized."
                         ) from e
                     else:
                         raise RuntimeError(f"An issue was found when launching the training: {e}") from e
     else:
-        # No need for a distributed launch otherwise as it's either CPU, GPU or MPS.
+        # No need for a distributed launch otherwise as it's either CPU, GPU, XPU or MPS.
         if is_mps_available():
             os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
             print("Launching training on MPS.")
         elif torch.cuda.is_available():
             print("Launching training on one GPU.")
+        elif torch.xpu.is_available():
+            print("Launching training on one XPU.")
         else:
             print("Launching training on CPU.")
         function(*args)
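
The device detection added to `notebook_launcher` above can be exercised on its own. A minimal sketch of the same pattern (`resolve_launch_target` is an illustrative helper, not an accelerate API; the `"cuda"` fallback mirrors the diff for older PyTorch builds without `torch.accelerator`):

```python
import torch


def resolve_launch_target() -> tuple[str, str]:
    """Pick the device type first, then derive the distributed mode name from it."""
    if hasattr(torch, "accelerator") and torch.accelerator.current_accelerator() is not None:
        device_type = torch.accelerator.current_accelerator().type
    else:
        device_type = "cuda"  # same fallback as the diff on older PyTorch
    distributed_type = "MULTI_XPU" if device_type == "xpu" else "MULTI_GPU"
    return device_type, distributed_type


device_type, distributed_type = resolve_launch_target()
print(f"Would launch as {distributed_type} on {device_type.upper()} devices.")
```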

@@ -22,7 +22,7 @@ from transformers import AutoModel
 from transformers.trainer_utils import set_seed

 from accelerate.accelerator import Accelerator
-from accelerate.test_utils.testing import AccelerateTestCase, require_cuda, require_deepspeed
+from accelerate.test_utils.testing import AccelerateTestCase, require_deepspeed
 from accelerate.test_utils.training import RegressionDataset
 from accelerate.utils import patch_environment
 from accelerate.utils.dataclasses import DeepSpeedPlugin
@@ -37,7 +37,6 @@ FP16 = "fp16"


 @require_deepspeed
-@require_cuda
 class DeepSpeedGradientAccumulationTest(AccelerateTestCase):
     def setUp(self):
         super().setUp()
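
Dropping `@require_cuda` is what lets the class above run on XPU as well as CUDA. As a rough illustration of that kind of device-agnostic guard, here is a hypothetical, self-contained `unittest` sketch (the helper and class names are made up, not accelerate test utilities):

```python
import unittest

import torch


def has_accelerator() -> bool:
    # Hypothetical helper: True if a CUDA GPU or an Intel XPU is visible.
    return torch.cuda.is_available() or (hasattr(torch, "xpu") and torch.xpu.is_available())


@unittest.skipUnless(has_accelerator(), "requires a CUDA GPU or an Intel XPU")
class AcceleratorSmokeTest(unittest.TestCase):
    def test_matmul_on_accelerator(self):
        device = "cuda" if torch.cuda.is_available() else "xpu"
        x = torch.ones(2, 2, device=device)
        self.assertEqual((x @ x).sum().item(), 8.0)


if __name__ == "__main__":
    unittest.main()
```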