Mirror of https://github.com/huggingface/accelerate.git (synced 2025-10-20 10:03:46 +08:00)
xpu enablement on left cases (#3654)
* 1. enable XPU for the launcher; 2. expand the CUDA-only DeepSpeed unit tests to XPU; 3. expand the profiler example to XPU
* fix style
* rename
* Update profiler.py
* Apply style fixes

---------

Signed-off-by: YAO Matrix <matrix.yao@intel.com>
Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
@@ -31,8 +31,8 @@ from accelerate.utils import ProfileKwargs
 #
 # This example trains a Bert base model on GLUE MRPC
 # in any of the following settings (with the same script):
-# - single CPU or single GPU
-# - multi GPUS (using PyTorch distributed mode)
+# - single CPU or single device (CUDA GPU, Intel XPU etc.)
+# - multi devices (using PyTorch distributed mode)
 # - (multi) TPUs
 # - fp16 (mixed-precision) or fp32 (normal precision)
 #
@@ -183,7 +183,8 @@ def training_function(config, args):
     # New Code #
     accelerator.print(
         prof.key_averages().table(
-            sort_by="self_cpu_time_total" if args.cpu else "self_cuda_time_total", row_limit=-1
+            sort_by="self_cpu_time_total" if args.cpu else f"self_{accelerator.device.type}_time_total",
+            row_limit=-1,
         )
     )

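The hunk above derives the profiler sort key from the active device instead of hard-coding CUDA. Below is a minimal standalone sketch of the same idea outside of Accelerate; the device pick and the `ProfilerActivity.XPU` guard are assumptions about the local PyTorch build, and the `self_xpu_time_total` key is assumed to be recognized by the profiler, just as the diff itself assumes.

```python
import torch
from torch.profiler import ProfilerActivity, profile

# Pick whichever accelerator is present; fall back to CPU (assumed environment).
if hasattr(torch, "xpu") and torch.xpu.is_available():
    device = torch.device("xpu")
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

activities = [ProfilerActivity.CPU]
if device.type == "cuda":
    activities.append(ProfilerActivity.CUDA)
elif device.type == "xpu" and hasattr(ProfilerActivity, "XPU"):
    # Only on PyTorch builds that expose an XPU profiler activity.
    activities.append(ProfilerActivity.XPU)

with profile(activities=activities) as prof:
    # Tiny workload so the table has something to show.
    a = torch.randn(512, 512, device=device)
    b = torch.randn(512, 512, device=device)
    (a @ b).sum()

# Same pattern as the diff: CPU key on CPU, otherwise build the key from the device type.
sort_key = "self_cpu_time_total" if device.type == "cpu" else f"self_{device.type}_time_total"
print(prof.key_averages().table(sort_by=sort_key, row_limit=-1))
```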
@@ -215,7 +216,7 @@ def main():
         choices=["no", "fp16", "bf16", "fp8"],
         help="Whether to use mixed precision. Choose"
         "between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >= 1.10."
-        "and an Nvidia Ampere GPU.",
+        "and an Nvidia Ampere GPU or an Intel XPU.",
     )
     # New Code #
     parser.add_argument(
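For context, this is roughly how such a `--mixed_precision` flag is wired into Accelerate in the example scripts. The snippet is an illustrative sketch, not the example script itself; the empty `parse_args([])` list is only there so it runs standalone.

```python
import argparse

from accelerate import Accelerator

parser = argparse.ArgumentParser(description="Illustrative mixed-precision flag")
parser.add_argument(
    "--mixed_precision",
    type=str,
    default=None,
    choices=["no", "fp16", "bf16", "fp8"],
    help="Whether to use mixed precision. Choose between fp16 and bf16 (bfloat16). "
    "bf16 requires PyTorch >= 1.10 and an Nvidia Ampere GPU or an Intel XPU.",
)
args = parser.parse_args([])  # empty list so the sketch runs without CLI input

# The flag is simply forwarded to the Accelerator, which validates it for the backend.
accelerator = Accelerator(mixed_precision=args.mixed_precision)
print(accelerator.device)
```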
@@ -60,8 +60,8 @@ def notebook_launcher(

     <Tip warning={true}>

-    To use this function absolutely zero calls to a CUDA device must be made in the notebook session before calling. If
-    any have been made, you will need to restart the notebook and make sure no cells use any CUDA capability.
+    To use this function absolutely zero calls to a device must be made in the notebook session before calling. If any
+    have been made, you will need to restart the notebook and make sure no cells use any device capability.

     Setting `ACCELERATE_DEBUG_MODE="1"` in your environment will run a test before truly launching to ensure that none
     of those calls have been made.
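The `ACCELERATE_DEBUG_MODE` check mentioned in the Tip is opt-in. A sketch of turning it on from a notebook cell before calling `notebook_launcher`:

```python
import os

# Ask notebook_launcher to do a dummy forked launch first, so any device state
# already initialized in the session is reported before real training starts.
os.environ["ACCELERATE_DEBUG_MODE"] = "1"
```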
@@ -76,11 +76,11 @@ def notebook_launcher(
             Tuple of arguments to pass to the function (it will receive `*args`).
         num_processes (`int`, *optional*):
             The number of processes to use for training. Will default to 8 in Colab/Kaggle if a TPU is available, to
-            the number of GPUs available otherwise.
+            the number of devices available otherwise.
         mixed_precision (`str`, *optional*, defaults to `"no"`):
-            If `fp16` or `bf16`, will use mixed precision training on multi-GPU.
+            If `fp16` or `bf16`, will use mixed precision training on multi-device.
         use_port (`str`, *optional*, defaults to `"29500"`):
-            The port to use to communicate between processes when launching a multi-GPU training.
+            The port to use to communicate between processes when launching a multi-device training.
         master_addr (`str`, *optional*, defaults to `"127.0.0.1"`):
             The address to use for communication between processes.
         node_rank (`int`, *optional*, defaults to 0):
@@ -105,7 +105,7 @@ def notebook_launcher(
     Example:

     ```python
-    # Assume this is defined in a Jupyter Notebook on an instance with two GPUs
+    # Assume this is defined in a Jupyter Notebook on an instance with two devices
     from accelerate import notebook_launcher


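Following the docstring example above, here is a minimal end-to-end usage sketch, assuming a notebook attached to two devices of the same type (e.g. two GPUs or two XPUs) and no prior device use in the session:

```python
from accelerate import Accelerator, notebook_launcher


def training_loop():
    # The Accelerator must be created inside the launched function, not in a
    # notebook cell, as the errors later in this diff enforce.
    accelerator = Accelerator()
    accelerator.print(f"Process {accelerator.process_index} on {accelerator.device}")


# Two processes, one per device; notebook_launcher forks them for us.
notebook_launcher(training_loop, args=(), num_processes=2)
```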
@@ -158,27 +158,27 @@ def notebook_launcher(
     else:
         if num_processes is None:
             raise ValueError(
-                "You have to specify the number of GPUs you would like to use, add `num_processes=...` to your call."
+                "You have to specify the number of devices you would like to use, add `num_processes=...` to your call."
             )
         if node_rank >= num_nodes:
             raise ValueError("The node_rank must be less than the number of nodes.")
         if num_processes > 1:
-            # Multi-GPU launch
+            # Multi-device launch
             from torch.distributed.launcher.api import LaunchConfig, elastic_launch
             from torch.multiprocessing import start_processes
             from torch.multiprocessing.spawn import ProcessRaisedException

             if len(AcceleratorState._shared_state) > 0:
                 raise ValueError(
-                    "To launch a multi-GPU training from your notebook, the `Accelerator` should only be initialized "
+                    "To launch a multi-device training from your notebook, the `Accelerator` should only be initialized "
                     "inside your training function. Restart your notebook and make sure no cells initializes an "
                     "`Accelerator`."
                 )
-            # Check for specific libraries known to initialize CUDA that users constantly use
+            # Check for specific libraries known to initialize device that users constantly use
             problematic_imports = are_libraries_initialized("bitsandbytes")
             if len(problematic_imports) > 0:
                 err = (
-                    "Could not start distributed process. Libraries known to initialize CUDA upon import have been "
+                    "Could not start distributed process. Libraries known to initialize device upon import have been "
                     "imported already. Please keep these imports inside your training function to try and help with this:"
                 )
                 for lib_name in problematic_imports:
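The `are_libraries_initialized("bitsandbytes")` call above guards against imports that touch the device before the fork. Conceptually it is just a `sys.modules` lookup; the re-implementation below is purely illustrative (the real helper ships in Accelerate's utils), and the second argument is only there so the sketch prints a non-empty result:

```python
import sys


def are_libraries_initialized_sketch(*library_names: str) -> list[str]:
    # A library that initializes CUDA/XPU at import time is "problematic" for a
    # forked launch, so all we need to know is whether it is already imported.
    return [name for name in library_names if name in sys.modules]


print(are_libraries_initialized_sketch("bitsandbytes", "os"))
```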
@@ -203,24 +203,26 @@ def notebook_launcher(
             # process here (the other ones will be set be the launcher).
             with patch_environment(**patched_env):
                 # First dummy launch
+                device_type = torch.accelerator.current_accelerator().type if hasattr(torch, "accelerator") else "cuda"
+                distributed_type = "MULTI_XPU" if device_type == "xpu" else "MULTI_GPU"
                 if os.environ.get("ACCELERATE_DEBUG_MODE", "false").lower() == "true":
-                    launcher = PrepareForLaunch(test_launch, distributed_type="MULTI_GPU")
+                    launcher = PrepareForLaunch(test_launch, distributed_type=distributed_type)
                     try:
                         start_processes(launcher, args=(), nprocs=num_processes, start_method="fork")
                     except ProcessRaisedException as e:
                         err = "An issue was found when verifying a stable environment for the notebook launcher."
-                        if "Cannot re-initialize CUDA in forked subprocess" in e.args[0]:
+                        if f"Cannot re-initialize {device_type.upper()} in forked subprocess" in e.args[0]:
                             raise RuntimeError(
                                 f"{err}"
                                 "This likely stems from an outside import causing issues once the `notebook_launcher()` is called. "
                                 "Please review your imports and test them when running the `notebook_launcher()` to identify "
-                                "which one is problematic and causing CUDA to be initialized."
+                                f"which one is problematic and causing {device_type.upper()} to be initialized."
                             ) from e
                         else:
                             raise RuntimeError(f"{err} The following error was raised: {e}") from e
                 # Now the actual launch
-                launcher = PrepareForLaunch(function, distributed_type="MULTI_GPU")
-                print(f"Launching training on {num_processes} GPUs.")
+                launcher = PrepareForLaunch(function, distributed_type=distributed_type)
+                print(f"Launching training on {num_processes} {device_type.upper()}s.")
                 try:
                     if rdzv_conf is None:
                         rdzv_conf = {}
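The two added lines are the heart of the XPU enablement in the launcher: detect the accelerator type once, then derive the distributed mode from it. A standalone sketch of that logic follows; the `None` guard is an addition of this sketch for CPU-only machines (where `current_accelerator()` has nothing to return and this launcher path would not be hit anyway):

```python
import torch

# Mirror of the detection added above, with a small guard so the sketch also
# runs on machines without any accelerator or on older PyTorch builds.
if hasattr(torch, "accelerator") and torch.accelerator.current_accelerator() is not None:
    device_type = torch.accelerator.current_accelerator().type
else:
    device_type = "cuda"

distributed_type = "MULTI_XPU" if device_type == "xpu" else "MULTI_GPU"
print(device_type, distributed_type)
```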
@@ -244,23 +246,25 @@ def notebook_launcher(
                         launch_config_kwargs["log_line_prefix_template"] = log_line_prefix_template
                     elastic_launch(config=LaunchConfig(**launch_config_kwargs), entrypoint=function)(*args)
                 except ProcessRaisedException as e:
-                    if "Cannot re-initialize CUDA in forked subprocess" in e.args[0]:
+                    if f"Cannot re-initialize {device_type.upper()} in forked subprocess" in e.args[0]:
                         raise RuntimeError(
-                            "CUDA has been initialized before the `notebook_launcher` could create a forked subprocess. "
+                            f"{device_type.upper()} has been initialized before the `notebook_launcher` could create a forked subprocess. "
                             "This likely stems from an outside import causing issues once the `notebook_launcher()` is called. "
                             "Please review your imports and test them when running the `notebook_launcher()` to identify "
-                            "which one is problematic and causing CUDA to be initialized."
+                            f"which one is problematic and causing {device_type.upper()} to be initialized."
                         ) from e
                     else:
                         raise RuntimeError(f"An issue was found when launching the training: {e}") from e

         else:
-            # No need for a distributed launch otherwise as it's either CPU, GPU or MPS.
+            # No need for a distributed launch otherwise as it's either CPU, GPU, XPU or MPS.
             if is_mps_available():
                 os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
                 print("Launching training on MPS.")
             elif torch.cuda.is_available():
                 print("Launching training on one GPU.")
+            elif torch.xpu.is_available():
+                print("Launching training on one XPU.")
             else:
                 print("Launching training on CPU.")
             function(*args)
@@ -22,7 +22,7 @@ from transformers import AutoModel
 from transformers.trainer_utils import set_seed

 from accelerate.accelerator import Accelerator
-from accelerate.test_utils.testing import AccelerateTestCase, require_cuda, require_deepspeed
+from accelerate.test_utils.testing import AccelerateTestCase, require_deepspeed
 from accelerate.test_utils.training import RegressionDataset
 from accelerate.utils import patch_environment
 from accelerate.utils.dataclasses import DeepSpeedPlugin
@@ -37,7 +37,6 @@ FP16 = "fp16"


 @require_deepspeed
-@require_cuda
 class DeepSpeedGradientAccumulationTest(AccelerateTestCase):
     def setUp(self):
         super().setUp()