From e98a59ec2db55908936e18fce42cdc5ae5ed48d2 Mon Sep 17 00:00:00 2001
From: Yao Matrix
Date: Fri, 8 Aug 2025 03:06:22 -0700
Subject: [PATCH] DOC Make docs more device agnostic (e.g. XPU) (#2728)

Also adjusted some more examples.

Signed-off-by: Yao, Matrix
---
 README.md | 4 ++--
 docs/source/developer_guides/lora.md | 8 +++++---
 docs/source/developer_guides/model_merging.md | 10 +++++++---
 docs/source/developer_guides/quantization.md | 4 +++-
 docs/source/task_guides/ia3.md | 10 ++++++----
 .../boft_controlnet/utils/pipeline_controlnet.py | 5 ++++-
 examples/fp4_finetuning/finetune_fp4_opt_bnb_peft.py | 3 ++-
 examples/qalora_finetuning/qalora_gptq_finetuning.py | 12 +++++++++---
 8 files changed, 38 insertions(+), 18 deletions(-)

diff --git a/README.md b/README.md
index 85a4c115..77a61c68 100644
--- a/README.md
+++ b/README.md
@@ -42,7 +42,7 @@ Prepare a model for training with a PEFT method such as LoRA by wrapping the bas
 from transformers import AutoModelForCausalLM
 from peft import LoraConfig, TaskType, get_peft_model
 
-device = "cuda"
+device = torch.accelerator.current_accelerator().type if hasattr(torch, "accelerator") else "cuda"
 model_id = "Qwen/Qwen2.5-3B-Instruct"
 model = AutoModelForCausalLM.from_pretrained(model_id, device_map=device)
 peft_config = LoraConfig(
@@ -65,7 +65,7 @@ To load a PEFT model for inference:
 from transformers import AutoModelForCausalLM, AutoTokenizer
 from peft import PeftModel
 
-device = "cuda"
+device = torch.accelerator.current_accelerator().type if hasattr(torch, "accelerator") else "cuda"
 model_id = "Qwen/Qwen2.5-3B-Instruct"
 tokenizer = AutoTokenizer.from_pretrained(model_id)
 model = AutoModelForCausalLM.from_pretrained(model_id, device_map=device)
diff --git a/docs/source/developer_guides/lora.md b/docs/source/developer_guides/lora.md
index df3f58ff..06311da6 100644
--- a/docs/source/developer_guides/lora.md
+++ b/docs/source/developer_guides/lora.md
@@ -109,7 +109,7 @@ peft_config = LoraConfig(
 ```
 The parameter `rho` (≥ 1.0) determines how much redistribution is allowed. When `rho=1.0` and `r=16`, LoRA adapters are limited to exactly 16 ranks, preventing any redistribution from occurring. A recommended value for EVA with redistribution is 2.0, meaning the maximum rank allowed for a layer is 2r.
 
-It is recommended to perform EVA initialization on a GPU as it is much faster. To optimize the amount of available memory for EVA, you can use the `low_cpu_mem_usage` flag in [`get_peft_model`]:
+It is recommended to perform EVA initialization on an accelerator (e.g. CUDA GPU, Intel XPU) as it is much faster. To optimize the amount of available memory for EVA, you can use the `low_cpu_mem_usage` flag in [`get_peft_model`]:
 ```python
 peft_model = get_peft_model(model, peft_config, low_cpu_mem_usage=True)
 ```
@@ -203,7 +203,7 @@ model = PeftModel.from_pretrained(base_model, peft_model_id, ephemeral_gpu_offlo
 DoRA is optimized (computes faster and takes less memory) for models in the evaluation mode, or when dropout is set to 0. We reuse the base result at those times to get the speedup.
Running [dora finetuning](https://github.com/huggingface/peft/blob/main/examples/dora_finetuning/dora_finetuning.py) -with `CUDA_VISIBLE_DEVICES=0 time python examples/dora_finetuning/dora_finetuning.py --quantize --lora_dropout 0 --batch_size 16 --eval_step 2 --use_dora` +with `CUDA_VISIBLE_DEVICES=0 ZE_AFFINITY_MASK=0 time python examples/dora_finetuning/dora_finetuning.py --quantize --lora_dropout 0 --batch_size 16 --eval_step 2 --use_dora` on a 4090 with gradient accumulation set to 2 and max step to 20 resulted with the following observations: | | Without Optimization | With Optimization | @@ -471,11 +471,13 @@ There are several supported methods for `combination_type`. Refer to the [docume Now, perform inference: ```python +device = torch.accelerator.current_accelerator().type if hasattr(torch, "accelerator") else "cuda" + tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1") prompt = "Hey, are you conscious? Can you talk to me?" inputs = tokenizer(prompt, return_tensors="pt") -inputs = {k: v.to("cuda") for k, v in inputs.items()} +inputs = {k: v.to(device) for k, v in inputs.items()} with torch.no_grad(): generate_ids = model.generate(**inputs, max_length=30) diff --git a/docs/source/developer_guides/model_merging.md b/docs/source/developer_guides/model_merging.md index 408baf5d..31cda64a 100644 --- a/docs/source/developer_guides/model_merging.md +++ b/docs/source/developer_guides/model_merging.md @@ -99,12 +99,13 @@ Now you can use the merged model as an instruction-tuned model to write ad copy ```py +device = torch.accelerator.current_accelerator().type if hasattr(torch, "accelerator") else "cuda" messages = [ {"role": "user", "content": "Write an essay about Generative AI."}, ] text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False) inputs = tokenizer(text, return_tensors="pt") -inputs = {k: v.to("cuda") for k, v in inputs.items()} +inputs = {k: v.to(device) for k, v in inputs.items()} outputs = model.generate(**inputs, max_new_tokens=256, do_sample=True, top_p=0.95, temperature=0.2, repetition_penalty=1.2, eos_token_id=tokenizer.eos_token_id) print(tokenizer.decode(outputs[0])) ``` @@ -113,13 +114,14 @@ print(tokenizer.decode(outputs[0])) ```py +device = torch.accelerator.current_accelerator().type if hasattr(torch, "accelerator") else "cuda" messages = [ {"role": "system", "content": "Create a text ad given the following product and description."}, {"role": "user", "content": "Product: Sony PS5 PlayStation Console\nDescription: The PS5 console unleashes new gaming possibilities that you never anticipated."}, ] text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False) inputs = tokenizer(text, return_tensors="pt") -inputs = {k: v.to("cuda") for k, v in inputs.items()} +inputs = {k: v.to(device) for k, v in inputs.items()} outputs = model.generate(**inputs, max_new_tokens=128, do_sample=True, top_p=0.95, temperature=0.2, repetition_penalty=1.2, eos_token_id=tokenizer.eos_token_id) print(tokenizer.decode(outputs[0])) ``` @@ -128,13 +130,15 @@ print(tokenizer.decode(outputs[0])) ```py +device = torch.accelerator.current_accelerator().type if hasattr(torch, "accelerator") else "cuda" + text = """Table: 2-11365528-2 Columns: ['Team', 'Head Coach', 'President', 'Home Ground', 'Location'] Natural Query: Who is the Head Coach of the team whose President is Mario Volarevic? 
SQL Query:""" inputs = tokenizer(text, return_tensors="pt") -inputs = {k: v.to("cuda") for k, v in inputs.items()} +inputs = {k: v.to(device) for k, v in inputs.items()} outputs = model.generate(**inputs, max_new_tokens=64, repetition_penalty=1.1, eos_token_id=tokenizer("").input_ids[-1]) print(tokenizer.decode(outputs[0])) ``` diff --git a/docs/source/developer_guides/quantization.md b/docs/source/developer_guides/quantization.md index a094f82c..b14abecc 100644 --- a/docs/source/developer_guides/quantization.md +++ b/docs/source/developer_guides/quantization.md @@ -197,7 +197,9 @@ The models that are quantized using Half-Quadratic Quantization of Large Machine ```python from hqq.engine.hf import HQQModelForCausalLM -quantized_model = HQQModelForCausalLM.from_quantized(save_dir_or_hfhub, device='cuda') +device = torch.accelerator.current_accelerator().type if hasattr(torch, "accelerator") else "cuda" + +quantized_model = HQQModelForCausalLM.from_quantized(save_dir_or_hfhub, device=device) peft_config = LoraConfig(...) quantized_model = get_peft_model(quantized_model, peft_config) ``` diff --git a/docs/source/task_guides/ia3.md b/docs/source/task_guides/ia3.md index 87b5c12d..f3dd9c70 100644 --- a/docs/source/task_guides/ia3.md +++ b/docs/source/task_guides/ia3.md @@ -92,7 +92,7 @@ processed_ds = ds.map( ) ``` -Create a training and evaluation [`DataLoader`](https://pytorch.org/docs/stable/data.html#torch.utils.data.DataLoader), and set `pin_memory=True` to speed up data transfer to the GPU during training if your dataset samples are on a CPU. +Create a training and evaluation [`DataLoader`](https://pytorch.org/docs/stable/data.html#torch.utils.data.DataLoader), and set `pin_memory=True` to speed up data transfer to the accelerator during training if your dataset samples are on a CPU. ```py from torch.utils.data import DataLoader @@ -159,12 +159,12 @@ lr_scheduler = get_linear_schedule_with_warmup( ) ``` -Move the model to the GPU and create a training loop that reports the loss and perplexity for each epoch. +Move the model to the accelerator and create a training loop that reports the loss and perplexity for each epoch. 
```py from tqdm import tqdm -device = "cuda" +device = torch.accelerator.current_accelerator().type if hasattr(torch, "accelerator") else "cuda" model = model.to(device) for epoch in range(num_epochs): @@ -219,7 +219,9 @@ To load the model for inference, use the [`~AutoPeftModelForSeq2SeqLM.from_pretr ```py from peft import AutoPeftModelForSeq2SeqLM -model = AutoPeftModelForSeq2SeqLM.from_pretrained("/mt0-large-ia3").to("cuda") +device = torch.accelerator.current_accelerator().type if hasattr(torch, "accelerator") else "cuda" + +model = AutoPeftModelForSeq2SeqLM.from_pretrained("/mt0-large-ia3").to(device) tokenizer = AutoTokenizer.from_pretrained("bigscience/mt0-large") i = 15 diff --git a/examples/boft_controlnet/utils/pipeline_controlnet.py b/examples/boft_controlnet/utils/pipeline_controlnet.py index 1846a59b..d4f5f35e 100644 --- a/examples/boft_controlnet/utils/pipeline_controlnet.py +++ b/examples/boft_controlnet/utils/pipeline_controlnet.py @@ -426,7 +426,10 @@ class LightControlNetPipeline(StableDiffusionControlNetPipeline): if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None: self.unet.to("cpu") self.controlnet.to("cpu") - torch.cuda.empty_cache() + if torch.cuda.is_available(): + torch.cuda.empty_cache() + elif torch.xpu.is_available(): + torch.xpu.empty_cache() if not output_type == "latent": image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0] diff --git a/examples/fp4_finetuning/finetune_fp4_opt_bnb_peft.py b/examples/fp4_finetuning/finetune_fp4_opt_bnb_peft.py index 8b695bbc..6773e909 100755 --- a/examples/fp4_finetuning/finetune_fp4_opt_bnb_peft.py +++ b/examples/fp4_finetuning/finetune_fp4_opt_bnb_peft.py @@ -9,7 +9,8 @@ from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig from peft import LoraConfig, get_peft_model -os.environ["CUDA_VISIBLE_DEVICES"] = "0" +os.environ["CUDA_VISIBLE_DEVICES"] = "0" # force to use CUDA GPU device 0 +os.environ["ZE_AFFINITY_MASK"] = "0" # force to use Intel XPU device 0 # -*- coding: utf-8 -*- """Finetune-opt-bnb-peft.ipynb diff --git a/examples/qalora_finetuning/qalora_gptq_finetuning.py b/examples/qalora_finetuning/qalora_gptq_finetuning.py index a396d3ef..7a39d264 100644 --- a/examples/qalora_finetuning/qalora_gptq_finetuning.py +++ b/examples/qalora_finetuning/qalora_gptq_finetuning.py @@ -62,7 +62,10 @@ def load_or_quantize_model( print(f"Model {base_model} is not GPTQ-quantized. Will quantize it.") # Clean up the test model to free memory del test_model - torch.cuda.empty_cache() if torch.cuda.is_available() else None + if torch.cuda.is_available(): + torch.cuda.empty_cache() + elif torch.xpu.is_available(): + torch.xpu.empty_cache() except Exception as e: print(f"Could not load model {base_model} directly: {e}") @@ -253,8 +256,11 @@ def train_model( label_names=["labels"], ) - # Clear CUDA cache to free memory - torch.cuda.empty_cache() + # Clear accelerator cache to free memory + if torch.cuda.is_available(): + torch.cuda.empty_cache() + elif torch.xpu.is_available(): + torch.xpu.empty_cache() # Initialize trainer trainer = Trainer(
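
A quick illustration of the pattern this patch applies: the device-selection one-liner and the paired CUDA/XPU cache-clearing branches can also be factored into small helpers when reusing the approach in your own scripts. The sketch below is illustrative only — `get_device` and `empty_cache` are hypothetical names, not part of this PR or of PEFT's API — and the `hasattr` guards cover PyTorch builds that predate `torch.accelerator` or `torch.xpu`:

```python
# Illustrative sketch (not part of this patch): helpers mirroring the device-agnostic
# pattern the diff applies inline throughout the docs and examples.
import torch


def get_device() -> str:
    """Return the available accelerator type ("cuda", "xpu", ...), falling back to "cpu"."""
    if hasattr(torch, "accelerator") and torch.accelerator.is_available():
        # Recent PyTorch exposes a generic accelerator API covering CUDA, XPU, MPS, etc.
        return torch.accelerator.current_accelerator().type
    if torch.cuda.is_available():
        return "cuda"
    return "cpu"


def empty_cache() -> None:
    """Release cached memory on whichever accelerator is in use, if any."""
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    elif hasattr(torch, "xpu") and torch.xpu.is_available():
        torch.xpu.empty_cache()


if __name__ == "__main__":
    device = get_device()
    x = torch.ones(2, 2, device=device)  # lands on CUDA, XPU, or CPU transparently
    print(device, x.device)
    empty_cache()
```

The doc snippets changed above keep the one-liner inline so each remains copy-pasteable on its own; note that any snippet using the one-liner also needs `import torch` in scope.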