XPU enablement for remaining cases (#3654)

* 1. Enable XPU for the notebook launcher; 2. expand CUDA-only DeepSpeed unit tests to XPU; 3. expand the profiler example to XPU

Signed-off-by: YAO Matrix <matrix.yao@intel.com>

* fix style

Signed-off-by: YAO Matrix <matrix.yao@intel.com>

* rename

Signed-off-by: YAO Matrix <matrix.yao@intel.com>

* Update profiler.py

* Apply style fixes

---------

Signed-off-by: YAO Matrix <matrix.yao@intel.com>
Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
Yao Matrix authored on 2025-07-08 00:10:53 +08:00, committed by GitHub
parent 07ce74868c
commit 1ac8643df7
3 changed files with 30 additions and 26 deletions
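
The thread running through all three changes is replacing hard-coded CUDA assumptions with whatever device `Accelerator` resolves to at runtime. A minimal sketch of that idea, separate from the commit itself (the printed values depend on the hardware present):

```python
# Illustrative only: Accelerator picks whichever backend is present, so the same
# script reports "cuda" on an NVIDIA GPU, "xpu" on an Intel XPU, or "cpu".
from accelerate import Accelerator

accelerator = Accelerator()
print(f"device: {accelerator.device}")            # e.g. cuda:0, xpu:0, or cpu
print(f"device type: {accelerator.device.type}")  # feeds keys like f"self_{...}_time_total"
```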

@@ -31,8 +31,8 @@ from accelerate.utils import ProfileKwargs
 #
 # This example trains a Bert base model on GLUE MRPC
 # in any of the following settings (with the same script):
-#   - single CPU or single GPU
-#   - multi GPUS (using PyTorch distributed mode)
+#   - single CPU or single device (CUDA GPU, Intel XPU etc.)
+#   - multi devices (using PyTorch distributed mode)
 #   - (multi) TPUs
 #   - fp16 (mixed-precision) or fp32 (normal precision)
 #
@@ -183,7 +183,8 @@ def training_function(config, args):
     # New Code #
     accelerator.print(
         prof.key_averages().table(
-            sort_by="self_cpu_time_total" if args.cpu else "self_cuda_time_total", row_limit=-1
+            sort_by="self_cpu_time_total" if args.cpu else f"self_{accelerator.device.type}_time_total",
+            row_limit=-1,
         )
     )
@@ -215,7 +216,7 @@ def main():
         choices=["no", "fp16", "bf16", "fp8"],
         help="Whether to use mixed precision. Choose"
         "between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >= 1.10."
-        "and an Nvidia Ampere GPU.",
+        "and an Nvidia Ampere GPU or an Intel XPU.",
     )
     # New Code #
     parser.add_argument(
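
For reference, a standalone sketch of the device-agnostic sort key introduced above, written against plain `torch.profiler` rather than the example script; it assumes a PyTorch build recent enough to expose `torch.accelerator` and `ProfilerActivity.XPU`:

```python
import torch
from torch.profiler import ProfilerActivity, profile

# Resolve the active accelerator, falling back to CPU-only profiling.
device = torch.device("cpu")
activities = [ProfilerActivity.CPU]
if hasattr(torch, "accelerator") and torch.accelerator.is_available():
    device = torch.accelerator.current_accelerator()
    if device.type == "cuda":
        activities.append(ProfilerActivity.CUDA)
    elif device.type == "xpu":
        activities.append(ProfilerActivity.XPU)

x = torch.randn(512, 512, device=device)
with profile(activities=activities) as prof:
    (x @ x).sum()

# Same pattern as the diff: "self_cuda_time_total" on NVIDIA, "self_xpu_time_total"
# on Intel XPU, CPU time otherwise.
sort_key = "self_cpu_time_total" if device.type == "cpu" else f"self_{device.type}_time_total"
print(prof.key_averages().table(sort_by=sort_key, row_limit=10))
```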

@@ -60,8 +60,8 @@ def notebook_launcher(
     <Tip warning={true}>

-    To use this function absolutely zero calls to a CUDA device must be made in the notebook session before calling. If
-    any have been made, you will need to restart the notebook and make sure no cells use any CUDA capability.
+    To use this function absolutely zero calls to a device must be made in the notebook session before calling. If any
+    have been made, you will need to restart the notebook and make sure no cells use any device capability.

     Setting `ACCELERATE_DEBUG_MODE="1"` in your environment will run a test before truly launching to ensure that none
     of those calls have been made.
@@ -76,11 +76,11 @@ def notebook_launcher(
             Tuple of arguments to pass to the function (it will receive `*args`).
         num_processes (`int`, *optional*):
             The number of processes to use for training. Will default to 8 in Colab/Kaggle if a TPU is available, to
-            the number of GPUs available otherwise.
+            the number of devices available otherwise.
         mixed_precision (`str`, *optional*, defaults to `"no"`):
-            If `fp16` or `bf16`, will use mixed precision training on multi-GPU.
+            If `fp16` or `bf16`, will use mixed precision training on multi-device.
         use_port (`str`, *optional*, defaults to `"29500"`):
-            The port to use to communicate between processes when launching a multi-GPU training.
+            The port to use to communicate between processes when launching a multi-device training.
         master_addr (`str`, *optional*, defaults to `"127.0.0.1"`):
             The address to use for communication between processes.
         node_rank (`int`, *optional*, defaults to 0):
@@ -105,7 +105,7 @@ def notebook_launcher(
     Example:

     ```python
-    # Assume this is defined in a Jupyter Notebook on an instance with two GPUs
+    # Assume this is defined in a Jupyter Notebook on an instance with two devices

     from accelerate import notebook_launcher
@@ -158,27 +158,27 @@ def notebook_launcher(
     else:
         if num_processes is None:
             raise ValueError(
-                "You have to specify the number of GPUs you would like to use, add `num_processes=...` to your call."
+                "You have to specify the number of devices you would like to use, add `num_processes=...` to your call."
             )
         if node_rank >= num_nodes:
             raise ValueError("The node_rank must be less than the number of nodes.")
         if num_processes > 1:
-            # Multi-GPU launch
+            # Multi-device launch
             from torch.distributed.launcher.api import LaunchConfig, elastic_launch
             from torch.multiprocessing import start_processes
             from torch.multiprocessing.spawn import ProcessRaisedException

             if len(AcceleratorState._shared_state) > 0:
                 raise ValueError(
-                    "To launch a multi-GPU training from your notebook, the `Accelerator` should only be initialized "
+                    "To launch a multi-device training from your notebook, the `Accelerator` should only be initialized "
                     "inside your training function. Restart your notebook and make sure no cells initializes an "
                     "`Accelerator`."
                 )
-            # Check for specific libraries known to initialize CUDA that users constantly use
+            # Check for specific libraries known to initialize device that users constantly use
             problematic_imports = are_libraries_initialized("bitsandbytes")
             if len(problematic_imports) > 0:
                 err = (
-                    "Could not start distributed process. Libraries known to initialize CUDA upon import have been "
+                    "Could not start distributed process. Libraries known to initialize device upon import have been "
                     "imported already. Please keep these imports inside your training function to try and help with this:"
                 )
                 for lib_name in problematic_imports:
@@ -203,24 +203,26 @@ def notebook_launcher(
             # process here (the other ones will be set be the launcher).
             with patch_environment(**patched_env):
                 # First dummy launch
+                device_type = torch.accelerator.current_accelerator().type if hasattr(torch, "accelerator") else "cuda"
+                distributed_type = "MULTI_XPU" if device_type == "xpu" else "MULTI_GPU"
                 if os.environ.get("ACCELERATE_DEBUG_MODE", "false").lower() == "true":
-                    launcher = PrepareForLaunch(test_launch, distributed_type="MULTI_GPU")
+                    launcher = PrepareForLaunch(test_launch, distributed_type=distributed_type)
                     try:
                         start_processes(launcher, args=(), nprocs=num_processes, start_method="fork")
                     except ProcessRaisedException as e:
                         err = "An issue was found when verifying a stable environment for the notebook launcher."
-                        if "Cannot re-initialize CUDA in forked subprocess" in e.args[0]:
+                        if f"Cannot re-initialize {device_type.upper()} in forked subprocess" in e.args[0]:
                             raise RuntimeError(
                                 f"{err}"
                                 "This likely stems from an outside import causing issues once the `notebook_launcher()` is called. "
                                 "Please review your imports and test them when running the `notebook_launcher()` to identify "
-                                "which one is problematic and causing CUDA to be initialized."
+                                f"which one is problematic and causing {device_type.upper()} to be initialized."
                             ) from e
                         else:
                             raise RuntimeError(f"{err} The following error was raised: {e}") from e
                 # Now the actual launch
-                launcher = PrepareForLaunch(function, distributed_type="MULTI_GPU")
-                print(f"Launching training on {num_processes} GPUs.")
+                launcher = PrepareForLaunch(function, distributed_type=distributed_type)
+                print(f"Launching training on {num_processes} {device_type.upper()}s.")
                 try:
                     if rdzv_conf is None:
                         rdzv_conf = {}
@@ -244,23 +246,25 @@ def notebook_launcher(
                         launch_config_kwargs["log_line_prefix_template"] = log_line_prefix_template
                     elastic_launch(config=LaunchConfig(**launch_config_kwargs), entrypoint=function)(*args)
                 except ProcessRaisedException as e:
-                    if "Cannot re-initialize CUDA in forked subprocess" in e.args[0]:
+                    if f"Cannot re-initialize {device_type.upper()} in forked subprocess" in e.args[0]:
                         raise RuntimeError(
-                            "CUDA has been initialized before the `notebook_launcher` could create a forked subprocess. "
+                            f"{device_type.upper()} has been initialized before the `notebook_launcher` could create a forked subprocess. "
                             "This likely stems from an outside import causing issues once the `notebook_launcher()` is called. "
                             "Please review your imports and test them when running the `notebook_launcher()` to identify "
-                            "which one is problematic and causing CUDA to be initialized."
+                            f"which one is problematic and causing {device_type.upper()} to be initialized."
                         ) from e
                     else:
                         raise RuntimeError(f"An issue was found when launching the training: {e}") from e
     else:
-        # No need for a distributed launch otherwise as it's either CPU, GPU or MPS.
+        # No need for a distributed launch otherwise as it's either CPU, GPU, XPU or MPS.
         if is_mps_available():
             os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
             print("Launching training on MPS.")
         elif torch.cuda.is_available():
             print("Launching training on one GPU.")
+        elif torch.xpu.is_available():
+            print("Launching training on one XPU.")
         else:
             print("Launching training on CPU.")
         function(*args)
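
The device detection added to `notebook_launcher` above can be exercised on its own. A minimal sketch of the same pattern (`resolve_launch_target` is an illustrative helper, not an accelerate API; the `"cuda"` fallback mirrors the diff for older PyTorch builds without `torch.accelerator`):

```python
import torch


def resolve_launch_target() -> tuple[str, str]:
    """Pick the device type first, then derive the distributed mode name from it."""
    if hasattr(torch, "accelerator") and torch.accelerator.current_accelerator() is not None:
        device_type = torch.accelerator.current_accelerator().type
    else:
        device_type = "cuda"  # same fallback as the diff on older PyTorch
    distributed_type = "MULTI_XPU" if device_type == "xpu" else "MULTI_GPU"
    return device_type, distributed_type


device_type, distributed_type = resolve_launch_target()
print(f"Would launch as {distributed_type} on {device_type.upper()} devices.")
```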

@@ -22,7 +22,7 @@ from transformers import AutoModel
 from transformers.trainer_utils import set_seed

 from accelerate.accelerator import Accelerator
-from accelerate.test_utils.testing import AccelerateTestCase, require_cuda, require_deepspeed
+from accelerate.test_utils.testing import AccelerateTestCase, require_deepspeed
 from accelerate.test_utils.training import RegressionDataset
 from accelerate.utils import patch_environment
 from accelerate.utils.dataclasses import DeepSpeedPlugin
@@ -37,7 +37,6 @@ FP16 = "fp16"


 @require_deepspeed
-@require_cuda
 class DeepSpeedGradientAccumulationTest(AccelerateTestCase):
     def setUp(self):
         super().setUp()
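
Dropping `@require_cuda` is what lets the class above run on XPU as well as CUDA. As a rough illustration of that kind of device-agnostic guard, here is a hypothetical, self-contained `unittest` sketch (the helper and class names are made up, not accelerate test utilities):

```python
import unittest

import torch


def has_accelerator() -> bool:
    # Hypothetical helper: True if a CUDA GPU or an Intel XPU is visible.
    return torch.cuda.is_available() or (hasattr(torch, "xpu") and torch.xpu.is_available())


@unittest.skipUnless(has_accelerator(), "requires a CUDA GPU or an Intel XPU")
class AcceleratorSmokeTest(unittest.TestCase):
    def test_matmul_on_accelerator(self):
        device = "cuda" if torch.cuda.is_available() else "xpu"
        x = torch.ones(2, 2, device=device)
        self.assertEqual((x @ x).sum().item(), 8.0)


if __name__ == "__main__":
    unittest.main()
```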