FEAT Add LoRA INC support (#2499)

Adds support for loading LoRA adapters (via PEFT) into models quantized with Intel Neural Compressor (INC).

---------

Signed-off-by: Daniel Socek <daniel.socek@intel.com>
Daniel Socek
2025-04-28 12:39:37 -04:00
committed by GitHub
parent 453a6ff336
commit 003cf20bcd
6 changed files with 195 additions and 1 deletion


@@ -0,0 +1,67 @@
"""
This example demonstrates loading a LoRA adapter (via PEFT) into an FP8 INC-quantized FLUX model.
More info on Intel Neural Compressor (INC) FP8 quantization is available at:
https://github.com/intel/neural-compressor/tree/master/examples/helloworld/fp8_example
Requirements:
pip install optimum-habana sentencepiece neural-compressor[pt] peft
"""

import importlib
import torch

from neural_compressor.torch.quantization import FP8Config, convert, finalize_calibration, prepare


# Checks if HPU device is available
# Adapted from https://github.com/huggingface/accelerate/blob/b451956fd69a135efc283aadaa478f0d33fcbe6a/src/accelerate/utils/imports.py#L435
def is_hpu_available():
    if (
        importlib.util.find_spec("habana_frameworks") is None
        or importlib.util.find_spec("habana_frameworks.torch") is None
    ):
        return False
    import habana_frameworks.torch  # noqa: F401

    return hasattr(torch, "hpu") and torch.hpu.is_available()


# Ensure HPU device is available before proceeding
if is_hpu_available():
    from optimum.habana.diffusers import GaudiFluxPipeline
else:
    raise RuntimeError("HPU device not found. This code requires Intel Gaudi device to run.")

# Example: FLUX model inference on HPU via optimum-habana pipeline
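# use_hpu_graphs enables HPU graph capture to reduce host overhead; sdp_on_bf16 allows
# scaled-dot-product attention to run in bf16; gaudi_config names a Gaudi configuration on the Hugging Face Hub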
hpu_configs = {
    "use_habana": True,
    "use_hpu_graphs": True,
    "sdp_on_bf16": True,
    "gaudi_config": "Habana/stable-diffusion",
}
pipe = GaudiFluxPipeline.from_pretrained("black-forest-labs/FLUX.1-dev", torch_dtype=torch.bfloat16, **hpu_configs)
prompt = "A picture of sks dog in a bucket"
# Quantize FLUX transformer to FP8 using INC (Intel Neural Compressor)
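# AUTO mode runs measurement (calibration) and quantization in one flow; the maxabs observer
# records per-tensor maximum absolute values used to derive the FP8 scales, and the
# calibration statistics are dumped to dump_stats_path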
quant_configs = {
    "mode": "AUTO",
    "observer": "maxabs",
    "scale_method": "maxabs_hw",
    "allowlist": {"types": [], "names": []},
    "blocklist": {"types": [], "names": []},
    "dump_stats_path": "/tmp/hqt_output/measure",
}
config = FP8Config(**quant_configs)
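# prepare() instruments the transformer with INC measurement observers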
pipe.transformer = prepare(pipe.transformer, config)
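# Calibration pass: one inference run collects the activation statistics needed for the FP8 scales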
pipe(prompt)
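# finalize_calibration() saves the collected statistics; convert() then swaps supported modules to FP8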
finalize_calibration(pipe.transformer)
pipe.transformer = convert(pipe.transformer)
# Load LoRA weights with PEFT
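# The adapter is applied on top of the already FP8-converted transformer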
pipe.load_lora_weights("dsocek/lora-flux-dog", adapter_name="user_lora")
# Run inference
image = pipe(prompt).images[0]
image.save("dog.png")