From 33cd94500e4cf7a495e915eda69d28d75737fa0d Mon Sep 17 00:00:00 2001 From: Yao Matrix Date: Fri, 15 Aug 2025 09:40:47 -0700 Subject: [PATCH] fix xpu device_id AttributeError issue (#7488) # Reproduce w/ PyTorch 2.8 ``` $ git clone https://github.com/huggingface/trl.git $ cd ./trl $ accelerate launch --config_file examples/accelerate_configs/deepspeed_zero3.yaml examples/scripts/sft_gpt_oss.py --torch_dtype bfloat16 --model_name_or_path openai/gpt-oss-20b --packing true packing_strategy wrapped --run_name 20b-full-eager --attn_implementation sdpa --dataset_num_proc 6 --dataset_name HuggingFaceH4/Multilingual-Thinking --gradient_checkpointing --max_length 4096 --per_device_train_batch_size 1 --num_train_epochs 1 --logging_steps 1 --warmup_ratio 0.03 --lr_scheduler_type cosine_with_min_lr --lr_scheduler_kwargs '{"min_lr_rate": 0.1}' --output_dir gpt-oss-20b-multilingual-reasoner --report_to trackio --seed 42 ``` # Issue > File "/workspace/accelerate/src/accelerate/state.py", line 216, in __init__ > dist.init_distributed(dist_backend=self.backend, auto_mpi_discovery=False, **kwargs) > File "/usr/local/lib/python3.12/dist-packages/deepspeed/comm/comm.py", line 854, in init_distributed > cdb = TorchBackend(dist_backend, timeout, init_method, rank, world_size) > ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ > File "/usr/local/lib/python3.12/dist-packages/deepspeed/comm/torch.py", line 120, in __init__ > self.init_process_group(backend, timeout, init_method, rank, world_size) > File "/usr/local/lib/python3.12/dist-packages/deepspeed/comm/torch.py", line 164, in init_process_group > torch.distributed.init_process_group(backend, **kwargs) > File "/usr/local/lib/python3.12/dist-packages/torch/distributed/c10d_logger.py", line 81, in wrapper > return func(*args, **kwargs) > ^^^^^^^^^^^^^^^^^^^^^ > File "/usr/local/lib/python3.12/dist-packages/torch/distributed/c10d_logger.py", line 95, in wrapper > func_return = func(*args, **kwargs) > ^^^^^^^^^^^^^^^^^^^^^ > File "/usr/local/lib/python3.12/dist-packages/torch/distributed/distributed_c10d.py", line 1685, in init_process_group > if device_id is not None and device_id.type != "cpu": > AttributeError: 'device' object has no attribute 'type' # Root Cause `torch.xpu.device` in PyTorch is a context manager in PyTorch rather than a device class, it doesn't have attribute `type` # Fix switch to use `torch.device` Signed-off-by: Yao, Matrix Co-authored-by: Olatunji Ruwase --- accelerator/xpu_accelerator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/accelerator/xpu_accelerator.py b/accelerator/xpu_accelerator.py index 1cc436683..09f58abdd 100644 --- a/accelerator/xpu_accelerator.py +++ b/accelerator/xpu_accelerator.py @@ -61,7 +61,7 @@ class XPU_Accelerator(DeepSpeedAccelerator): return 'xpu:{}'.format(device_index) def device(self, device_index=None): - return torch.xpu.device(device_index) + return torch.device('xpu', device_index) def set_device(self, device_index): torch.xpu.set_device(device_index)