Mirror of https://github.com/huggingface/peft.git (synced 2025-10-20 15:33:48 +08:00)

Compare commits: 37 commits, b774fd901e...main
Commits (SHA1): 2813b9c4bf, 8d8aa0b716, 182f4c945a, 1a1f97263d, 87b90f045e, 086f187a4d, ec5a1b2ce6, 9b8cf2a0c3, 6392935921, 25f97e663a, 61a11f9180, 2f9f759587, 2410f458c8, 879587f3db, f8aca0a0c2, e9f5707e3f, 2c29cf7936, 31989eab83, b0954e0daa, f00d94a170, 24aebeec21, 815956b9b8, ffa971a68c, 4469af57a0, e596112b7b, 046e32bf16, 190f9873b1, 6030f9160e, ae671baec9, 7b2a5b1f02, 530d7bbf1e, 9da3f77960, c15daaa5aa, 4f868bd7c9, 50329a7138, f6b0a2dd43, f1b83646a6
.github/workflows/tests-main.yml (vendored, 6 changes)
@@ -6,9 +6,6 @@ on:
paths-ignore:
- 'docs/**'

env:
TRANSFORMERS_IS_CI: 1

permissions: {}

jobs:
@@ -31,6 +28,9 @@ jobs:
pip install -U git+https://github.com/huggingface/transformers.git
pip install -e .[test]
- name: Test with pytest
env:
TRANSFORMERS_IS_CI: 1
HF_TOKEN: ${{ secrets.HF_TOKEN }}
run: |
make test
- name: Post to Slack
.github/workflows/tests.yml (vendored, 15 changes)
@@ -11,7 +11,6 @@ on:

env:
HF_HOME: .cache/huggingface
TRANSFORMERS_IS_CI: 1

permissions: {}

@@ -41,8 +40,11 @@ jobs:
strategy:
fail-fast: false
matrix:
python-version: ["3.9", "3.10", "3.11", "3.12"]
python-version: ["3.10", "3.11", "3.12", "3.13"]
os: ["ubuntu-latest", "macos-13", "windows-latest"]
exclude:
- os: macos-13
python-version: "3.13"
runs-on: ${{ matrix.os }}
steps:
- uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
@@ -79,17 +81,14 @@ jobs:
pip install setuptools
# cpu version of pytorch
pip install -e .[test]
- name: Downgrade numpy on MacOS and Windows
# TODO: remove numpy downgrade on MacOS & Windows once torch fixes numpy 2.0 issue
shell: bash
if: matrix.os == 'windows-latest' || matrix.os == 'macos-13'
run: |
pip install --force-reinstall -U "numpy<2.0.0"
- name: Test with pytest
# MacOS tests are currently too flaky and will fail almost each time. Thus, continue (green checkmark) even if
# they fail, but add a notice so that the failure is not completely silent
continue-on-error: ${{ matrix.os == 'macos-13' }}
shell: bash
env:
HF_TOKEN: ${{ secrets.HF_TOKEN }}
TRANSFORMERS_IS_CI: 1
run: |
set +e
make test
@@ -134,6 +134,10 @@
title: MiSS
- local: package_reference/road
title: RoAd
- local: package_reference/waveft
title: WaveFT
- local: package_reference/delora
title: DeLoRA

title: Adapters
- sections:
@@ -143,5 +147,7 @@
title: Helpers
- local: package_reference/hotswap
title: Hotswapping adapters
- local: package_reference/functional
title: Functions for PEFT integration
title: Utilities
title: API reference
@@ -263,11 +263,11 @@ model = AutoModelForCausalLM.from_pretrained(
quantization_config=bnb_config,
trust_remote_code=True,
attn_implementation="flash_attention_2" if args.use_flash_attn else "eager",
- torch_dtype=quant_storage_dtype or torch.float32,
+ dtype=quant_storage_dtype or torch.float32,
)
```

Notice that `torch_dtype` for `AutoModelForCausalLM` is same as the `bnb_4bit_quant_storage` data type. That's it. Everything else is handled by Trainer and TRL.
Notice that `dtype` for `AutoModelForCausalLM` is same as the `bnb_4bit_quant_storage` data type. That's it. Everything else is handled by Trainer and TRL.

## Memory usage
@@ -276,11 +276,8 @@ In the above example, the memory consumed per GPU is **36.6 GB**. Therefore, wha
# Use PEFT and DeepSpeed with ZeRO3 and CPU Offloading for finetuning large models on a single GPU
This section of the guide will help you learn how to use our DeepSpeed [training script](https://github.com/huggingface/peft/blob/main/examples/conditional_generation/peft_lora_seq2seq_accelerate_ds_zero3_offload.py). You'll configure the script to train a large model for conditional generation with ZeRO-3 and CPU Offload.

<Tip>

💡 To help you get started, check out our example training scripts for [causal language modeling](https://github.com/huggingface/peft/blob/main/examples/causal_language_modeling/peft_lora_clm_accelerate_ds_zero3_offload.py) and [conditional generation](https://github.com/huggingface/peft/blob/main/examples/conditional_generation/peft_lora_seq2seq_accelerate_ds_zero3_offload.py). You can adapt these scripts for your own applications or even use them out of the box if your task is similar to the one in the scripts.

</Tip>
> [!TIP]
> 💡 To help you get started, check out our example training scripts for [causal language modeling](https://github.com/huggingface/peft/blob/main/examples/causal_language_modeling/peft_lora_clm_accelerate_ds_zero3_offload.py) and [conditional generation](https://github.com/huggingface/peft/blob/main/examples/conditional_generation/peft_lora_seq2seq_accelerate_ds_zero3_offload.py). You can adapt these scripts for your own applications or even use them out of the box if your task is similar to the one in the scripts.

## Configuration

@@ -338,11 +335,8 @@ Let's dive a little deeper into the script so you can see what's going on, and u

Within the [`main`](https://github.com/huggingface/peft/blob/2822398fbe896f25d4dac5e468624dc5fd65a51b/examples/conditional_generation/peft_lora_seq2seq_accelerate_ds_zero3_offload.py#L103) function, the script creates an [`~accelerate.Accelerator`] class to initialize all the necessary requirements for distributed training.

<Tip>

💡 Feel free to change the model and dataset inside the `main` function. If your dataset format is different from the one in the script, you may also need to write your own preprocessing function.

</Tip>
> [!TIP]
> 💡 Feel free to change the model and dataset inside the `main` function. If your dataset format is different from the one in the script, you may also need to write your own preprocessing function.

The script also creates a configuration for the 🤗 PEFT method you're using, which in this case is LoRA. The [`LoraConfig`] specifies the task type and important parameters such as the dimension of the low-rank matrices, the matrices scaling factor, and the dropout probability of the LoRA layers. If you want to use a different 🤗 PEFT method, make sure you replace `LoraConfig` with the appropriate [class](../package_reference/tuners).
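To make that concrete, here is a minimal sketch of such a configuration; the exact values (`r`, `lora_alpha`, `lora_dropout`) are illustrative assumptions rather than the ones hard-coded in the training script:

```python
from peft import LoraConfig, TaskType

# illustrative values; the training script may use different ones
peft_config = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,  # conditional generation task
    r=8,                              # dimension of the low-rank matrices
    lora_alpha=32,                    # scaling factor for the matrices
    lora_dropout=0.1,                 # dropout probability of the LoRA layers
)
```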
@@ -439,20 +433,17 @@ dataset['train'][label_column][:10]=['no complaint', 'no complaint', 'complaint'
2. When using CPU offloading, the major gains from using PEFT to shrink the optimizer states and gradients to that of the adapter weights would be realized on CPU RAM and there won't be savings with respect to GPU memory.
3. DeepSpeed Stage 3 and qlora when used with CPU offloading leads to more GPU memory usage when compared to disabling CPU offloading.

<Tip>

💡 When you have code that requires merging (and unmerging) of weights, try to manually collect the parameters with DeepSpeed Zero-3 beforehand:

```python
import deepspeed

is_ds_zero_3 = ... # check if Zero-3

with deepspeed.zero.GatheredParameters(list(model.parameters()), enabled= is_ds_zero_3):
    model.merge_adapter()
    # do whatever is needed, then unmerge in the same context if unmerging is required
    ...
    model.unmerge_adapter()
```

</Tip>
> [!TIP]
> 💡 When you have code that requires merging (and unmerging) of weights, try to manually collect the parameters with DeepSpeed Zero-3 beforehand:
>
> ```python
> import deepspeed
>
> is_ds_zero_3 = ... # check if Zero-3
>
> with deepspeed.zero.GatheredParameters(list(model.parameters()), enabled= is_ds_zero_3):
>     model.merge_adapter()
>     # do whatever is needed, then unmerge in the same context if unmerging is required
>     ...
>     model.unmerge_adapter()
> ```
@@ -264,11 +264,11 @@ model = AutoModelForCausalLM.from_pretrained(
quantization_config=bnb_config,
trust_remote_code=True,
attn_implementation="flash_attention_2" if args.use_flash_attn else "eager",
- torch_dtype=quant_storage_dtype or torch.float32,
+ dtype=quant_storage_dtype or torch.float32,
)
```

Notice that `torch_dtype` for `AutoModelForCausalLM` is same as the `bnb_4bit_quant_storage` data type. That's it. Everything else is handled by Trainer and TRL.
Notice that `dtype` for `AutoModelForCausalLM` is same as the `bnb_4bit_quant_storage` data type. That's it. Everything else is handled by Trainer and TRL.

## Memory usage
@@ -22,11 +22,8 @@ This guide will give you a brief overview of the adapter methods supported by PE

## Low-Rank Adaptation (LoRA)

<Tip>

LoRA is one of the most popular PEFT methods and a good starting point if you're just getting started with PEFT. It was originally developed for large language models but it is a tremendously popular training method for diffusion models because of its efficiency and effectiveness.

</Tip>
> [!TIP]
> LoRA is one of the most popular PEFT methods and a good starting point if you're just getting started with PEFT. It was originally developed for large language models but it is a tremendously popular training method for diffusion models because of its efficiency and effectiveness.

As mentioned briefly earlier, [LoRA](https://hf.co/papers/2106.09685) is a technique that accelerates finetuning large models while consuming less memory.
@@ -129,21 +129,15 @@ Let's break this down:
- By default, LoRA isn't applied to BERT's embedding layer, so there are _no entries_ for `lora_A_embedding` and `lora_B_embedding`.
- The keys of the `state_dict` always start with `"base_model.model."`. The reason is that, in PEFT, we wrap the base model inside a tuner-specific model (`LoraModel` in this case), which itself is wrapped in a general PEFT model (`PeftModel`). For this reason, these two prefixes are added to the keys. When converting to the PEFT format, it is required to add these prefixes.

<Tip>

This last point is not true for prefix tuning techniques like prompt tuning. There, the extra embeddings are directly stored in the `state_dict` without any prefixes added to the keys.

</Tip>
> [!TIP]
> This last point is not true for prefix tuning techniques like prompt tuning. There, the extra embeddings are directly stored in the `state_dict` without any prefixes added to the keys.

When inspecting the parameter names in the loaded model, you might be surprised to find that they look a bit different, e.g. `base_model.model.encoder.layer.0.attention.self.query.lora_A.default.weight`. The difference is the *`.default`* part in the second to last segment. This part exists because PEFT generally allows the addition of multiple adapters at once (using an `nn.ModuleDict` or `nn.ParameterDict` to store them). For example, if you add another adapter called "other", the key for that adapter would be `base_model.model.encoder.layer.0.attention.self.query.lora_A.other.weight`.

When you call [`~PeftModel.save_pretrained`], the adapter name is stripped from the keys. The reason is that the adapter name is not an important part of the model architecture; it is just an arbitrary name. When loading the adapter, you could choose a totally different name, and the model would still work the same way. This is why the adapter name is not stored in the checkpoint file.

<Tip>

If you call `save_pretrained("some/path")` and the adapter name is not `"default"`, the adapter is stored in a sub-directory with the same name as the adapter. So if the name is "other", it would be stored inside of `some/path/other`.

</Tip>
> [!TIP]
> If you call `save_pretrained("some/path")` and the adapter name is not `"default"`, the adapter is stored in a sub-directory with the same name as the adapter. So if the name is "other", it would be stored inside of `some/path/other`.
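A small sketch of how to observe both naming schemes, assuming a BERT base model as in the surrounding example (the printed keys are abbreviated for illustration):

```python
from transformers import AutoModel
from peft import LoraConfig, get_peft_model, get_peft_model_state_dict

base_model = AutoModel.from_pretrained("bert-base-uncased")  # assumed example checkpoint
peft_model = get_peft_model(base_model, LoraConfig(target_modules=["query"]))

# in-memory parameter names include the adapter name ("default")
print([name for name, _ in peft_model.named_parameters() if "lora_A" in name][0])
# e.g. base_model.model.encoder.layer.0.attention.self.query.lora_A.default.weight

# the state_dict produced for saving has the adapter name stripped
print(next(iter(get_peft_model_state_dict(peft_model))))
# e.g. base_model.model.encoder.layer.0.attention.self.query.lora_A.weight
```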
In some circumstances, deciding which values to add to the checkpoint file can become a bit more complicated. For example, in PEFT, DoRA is implemented as a special case of LoRA. If you want to convert a DoRA model to PEFT, you should create a LoRA checkpoint with extra entries for DoRA. You can see this in the `__init__` of the previous `LoraLayer` code:

@@ -48,12 +48,9 @@ class MLP(nn.Module):

This is a straightforward multilayer perceptron with an input layer, a hidden layer, and an output layer.

<Tip>

For this toy example, we choose an exceedingly large number of hidden units to highlight the efficiency gains
from PEFT, but those gains are in line with more realistic examples.

</Tip>
> [!TIP]
> For this toy example, we choose an exceedingly large number of hidden units to highlight the efficiency gains
> from PEFT, but those gains are in line with more realistic examples.

There are a few linear layers in this model that could be tuned with LoRA. When working with common 🤗 Transformers
models, PEFT will know which layers to apply LoRA to, but in this case, it is up to us as a user to choose the layers.
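As a rough sketch of what choosing the layers could look like for this MLP (the module names `seq.0`, `seq.2`, and `seq.4` are assumptions about how the layers are registered; print the model first to confirm them):

```python
from peft import LoraConfig, get_peft_model

# assumed layout: the linear layers live in an nn.Sequential attribute called `seq`
config = LoraConfig(
    target_modules=["seq.0", "seq.2"],  # apply LoRA to the two large hidden linear layers
    modules_to_save=["seq.4"],          # train the small output layer fully and save it with the adapter
)
peft_model = get_peft_model(MLP(), config)
peft_model.print_trainable_parameters()
```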
@@ -272,11 +269,8 @@ peft_model = get_peft_model(base_model, config)
# do training
```

<Tip>

When you call [`get_peft_model`], you will see a warning because PEFT does not recognize the targeted module type. In this case, you can ignore this warning.

</Tip>
> [!TIP]
> When you call [`get_peft_model`], you will see a warning because PEFT does not recognize the targeted module type. In this case, you can ignore this warning.

By supplying a custom mapping, PEFT first checks the base model's layers against the custom mapping and dispatches to the custom LoRA layer type if there is a match. If there is no match, PEFT checks the built-in LoRA layer types for a match.
@@ -119,11 +119,8 @@ initialize_lora_eva_weights(peft_model, dataloader)
```
EVA works out of the box with bitsandbytes. Simply initialize the model with `quantization_config` and call [`initialize_lora_eva_weights`] as usual.

<Tip>

For further instructions on using EVA, please refer to our [documentation](https://github.com/huggingface/peft/tree/main/examples/eva_finetuning).

</Tip>
> [!TIP]
> For further instructions on using EVA, please refer to our [documentation](https://github.com/huggingface/peft/tree/main/examples/eva_finetuning).

### LoftQ

@@ -158,11 +155,8 @@ At the moment, `replace_lora_weights_loftq` has these additional limitations:
- Model files must be stored as a `safetensors` file.
- Only bitsandbytes 4bit quantization is supported.

<Tip>

Learn more about how PEFT works with quantization in the [Quantization](quantization) guide.

</Tip>
> [!TIP]
> Learn more about how PEFT works with quantization in the [Quantization](quantization) guide.
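Within those limits, applying LoftQ to an existing LoRA model is a single call; a hedged sketch (the 4-bit base model and LoRA setup are assumed to already exist):

```python
from peft import replace_lora_weights_loftq

# peft_model: a LoRA model whose base weights were loaded in 4 bit with bitsandbytes
# and are available locally as a safetensors checkpoint
replace_lora_weights_loftq(peft_model)
```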
### Rank-stabilized LoRA

@@ -545,7 +539,7 @@ from peft import PeftModel
import torch

base_model = AutoModelForCausalLM.from_pretrained(
"mistralai/Mistral-7B-v0.1", torch_dtype=torch.float16, device_map="auto"
"mistralai/Mistral-7B-v0.1", dtype=torch.float16, device_map="auto"
)
```

@@ -570,11 +564,8 @@ model.add_weighted_adapter(
model.set_adapter(weighted_adapter_name)
```

<Tip>

There are several supported methods for `combination_type`. Refer to the [documentation](../package_reference/lora#peft.LoraModel.add_weighted_adapter) for more details. Note that "svd" as the `combination_type` is not supported when using `torch.float16` or `torch.bfloat16` as the datatype.

</Tip>
> [!TIP]
> There are several supported methods for `combination_type`. Refer to the [documentation](../package_reference/lora#peft.LoraModel.add_weighted_adapter) for more details. Note that "svd" as the `combination_type` is not supported when using `torch.float16` or `torch.bfloat16` as the datatype.

Now, perform inference:
@@ -792,43 +783,40 @@ model = create_arrow_model(
```
To encode general knowledge, GenKnowSub subtracts the average of the provided general adapters from each task-specific adapter once, before routing begins. Furthermore, the ability to add or remove adapters after calling ```create_arrow_model``` (as described in the Arrow section) is still supported in this case.

<Tip>

**Things to keep in mind when using Arrow + GenKnowSub:**

- All LoRA adapters (task-specific and general) must share the same ```rank``` and ```target_modules```.

- Any inconsistency in these settings will raise an error in ```create_arrow_model```.

- Having different scaling factors (```lora_alpha```) across task adapters is supported — Arrow handles them automatically.

- Merging the ```"arrow_router"``` is not supported, due to its dynamic routing behavior.

- In create_arrow_model, task adapters are loaded as ```task_i``` and general adapters as ```gks_j``` (where ```i``` and ```j``` are indices). The function ensures consistency of ```target_modules```, ```rank```, and whether adapters are applied to ```Linear``` or ```Linear4bit``` layers. It then adds the ```"arrow_router"``` module and activates it. Any customization of this process requires overriding ```create_arrow_model```.

- This implementation is compatible with 4-bit quantization (via bitsandbytes):

```py
from transformers import AutoModelForCausalLM, BitsAndBytesConfig
import torch

# Quantisation config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=False,
)

# Loading the model
base_model = AutoModelForCausalLM.from_pretrained(
    "microsoft/Phi-3-mini-4k-instruct",
    torch_dtype=torch.bfloat16,
    device_map="auto",
    quantization_config=bnb_config,
)

# Now call create_arrow_model() as we explained before.
```

</Tip>
> [!TIP]
> **Things to keep in mind when using Arrow + GenKnowSub:**
>
> - All LoRA adapters (task-specific and general) must share the same ```rank``` and ```target_modules```.
>
> - Any inconsistency in these settings will raise an error in ```create_arrow_model```.
>
> - Having different scaling factors (```lora_alpha```) across task adapters is supported — Arrow handles them automatically.
>
> - Merging the ```"arrow_router"``` is not supported, due to its dynamic routing behavior.
>
> - In create_arrow_model, task adapters are loaded as ```task_i``` and general adapters as ```gks_j``` (where ```i``` and ```j``` are indices). The function ensures consistency of ```target_modules```, ```rank```, and whether adapters are applied to ```Linear``` or ```Linear4bit``` layers. It then adds the ```"arrow_router"``` module and activates it. Any customization of this process requires overriding ```create_arrow_model```.
>
> - This implementation is compatible with 4-bit quantization (via bitsandbytes):
>
> ```py
> from transformers import AutoModelForCausalLM, BitsAndBytesConfig
> import torch
>
> # Quantisation config
> bnb_config = BitsAndBytesConfig(
>     load_in_4bit=True,
>     bnb_4bit_quant_type="nf4",
>     bnb_4bit_compute_dtype=torch.bfloat16,
>     bnb_4bit_use_double_quant=False,
> )
>
> # Loading the model
> base_model = AutoModelForCausalLM.from_pretrained(
>     "microsoft/Phi-3-mini-4k-instruct",
>     dtype=torch.bfloat16,
>     device_map="auto",
>     quantization_config=bnb_config,
> )
>
> # Now call create_arrow_model() as we explained before.
> ```
@@ -144,7 +144,7 @@ The models support LoRA adapter tuning. To tune the quantized model you'll need
```py
quantized_model = AutoModelForCausalLM.from_pretrained(
"BlackSamorez/Mixtral-8x7b-AQLM-2Bit-1x16-hf-test-dispatch",
torch_dtype="auto", device_map="auto", low_cpu_mem_usage=True,
dtype="auto", device_map="auto", low_cpu_mem_usage=True,
)

peft_config = LoraConfig(...)
@@ -43,7 +43,7 @@ python -m pip install git+https://github.com/huggingface/peft

### ValueError: Attempting to unscale FP16 gradients

This error probably occurred because the model was loaded with `torch_dtype=torch.float16` and then used in an automatic mixed precision (AMP) context, e.g. by setting `fp16=True` in the [`~transformers.Trainer`] class from 🤗 Transformers. The reason is that when using AMP, trainable weights should never use fp16. To make this work without loading the whole model in fp32, add the following to your code:
This error probably occurred because the model was loaded with `dtype=torch.float16` and then used in an automatic mixed precision (AMP) context, e.g. by setting `fp16=True` in the [`~transformers.Trainer`] class from 🤗 Transformers. The reason is that when using AMP, trainable weights should never use fp16. To make this work without loading the whole model in fp32, add the following to your code:

```python
peft_model = get_peft_model(...)
@@ -71,11 +71,8 @@ trainer = Trainer(model=peft_model, fp16=True, ...)
trainer.train()
```

<Tip>

Starting from PEFT version v0.12.0, PEFT automatically promotes the dtype of adapter weights from `torch.float16` and `torch.bfloat16` to `torch.float32` where appropriate. To _prevent_ this behavior, you can pass `autocast_adapter_dtype=False` to [`~get_peft_model`], to [`~PeftModel.from_pretrained`], and to [`~PeftModel.load_adapter`].

</Tip>
> [!TIP]
> Starting from PEFT version v0.12.0, PEFT automatically promotes the dtype of adapter weights from `torch.float16` and `torch.bfloat16` to `torch.float32` where appropriate. To _prevent_ this behavior, you can pass `autocast_adapter_dtype=False` to [`~get_peft_model`], to [`~PeftModel.from_pretrained`], and to [`~PeftModel.load_adapter`].
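For example, a short sketch of opting out of the automatic upcasting (`base_model` and `lora_config` are assumed to be defined already):

```python
from peft import get_peft_model

# keep the adapter weights in the base model's half precision instead of float32
peft_model = get_peft_model(base_model, lora_config, autocast_adapter_dtype=False)
```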
### Selecting the dtype of the adapter

@@ -137,11 +134,8 @@ You should probably TRAIN this model on a down-stream task to be able to use it

The mentioned layers should be added to `modules_to_save` in the config to avoid the described problem.

<Tip>

As an example, when loading a model that is using the DeBERTa architecture for sequence classification, you'll see a warning that the following weights are newly initialized: `['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']`. From this, it follows that the `classifier` and `pooler` layers should be added to: `modules_to_save=["classifier", "pooler"]`.

</Tip>
> [!TIP]
> As an example, when loading a model that is using the DeBERTa architecture for sequence classification, you'll see a warning that the following weights are newly initialized: `['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']`. From this, it follows that the `classifier` and `pooler` layers should be added to: `modules_to_save=["classifier", "pooler"]`.
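Following that example, a minimal config sketch for the DeBERTa case (LoRA settings other than `modules_to_save` are left at their defaults):

```python
from peft import LoraConfig, TaskType

config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    # train the newly initialized head layers fully and store them with the adapter
    modules_to_save=["classifier", "pooler"],
)
```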
### Extending the vocabulary

@@ -300,7 +294,7 @@ It is possible to get this information for non-PEFT models if they are using PEF

>>> path = "runwayml/stable-diffusion-v1-5"
>>> lora_id = "takuma104/lora-test-text-encoder-lora-target"
>>> pipe = StableDiffusionPipeline.from_pretrained(path, torch_dtype=torch.float16)
>>> pipe = StableDiffusionPipeline.from_pretrained(path, dtype=torch.float16)
>>> pipe.load_lora_weights(lora_id, adapter_name="adapter-1")
>>> pipe.load_lora_weights(lora_id, adapter_name="adapter-2")
>>> pipe.set_lora_device(["adapter-2"], "cuda")
@@ -345,11 +339,8 @@ TunerModelStatus(

Loading adapters like LoRA weights should generally be fast compared to loading the base model. However, there can be use cases where the adapter weights are quite large or where users need to load a large number of adapters -- the loading time can add up in this case. The reason for this is that the adapter weights are first initialized and then overridden by the loaded weights, which is wasteful. To speed up the loading time, you can pass the `low_cpu_mem_usage=True` argument to [`~PeftModel.from_pretrained`] and [`~PeftModel.load_adapter`].

<Tip>

If this option works well across different use cases, it may become the default for adapter loading in the future.

</Tip>
> [!TIP]
> If this option works well across different use cases, it may become the default for adapter loading in the future.
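A one-line sketch of the faster loading path (the base model and adapter path are placeholders):

```python
from peft import PeftModel

peft_model = PeftModel.from_pretrained(base_model, adapter_path, low_cpu_mem_usage=True)
```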
## Reproducibility
@@ -401,3 +392,67 @@ If it is not possible for you to upgrade PEFT, there is a workaround you can try
Assume the error message says that the unknown keyword argument is named `foobar`. Search inside the `adapter_config.json` of this PEFT adapter for the `foobar` entry and delete it from the file. Then save the file and try loading the model again.

This solution works most of the time. As long as it is the default value for `foobar`, it can be ignored. However, when it is set to some other value, you will get incorrect results. Upgrading PEFT is the recommended solution.
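A small sketch of that workaround applied to a locally downloaded adapter (`foobar` stands in for whatever argument the error names):

```python
import json

config_path = "path/to/adapter/adapter_config.json"  # local copy of the adapter
with open(config_path) as f:
    config = json.load(f)

config.pop("foobar", None)  # drop the entry this PEFT version does not know about

with open(config_path, "w") as f:
    json.dump(config, f, indent=2)
```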
## Adapter handling

### Using multiple adapters at the same time

PEFT allows you to create more than one adapter on the same model. This can be useful in many situations. For example, for inference, you may want to serve two fine-tuned models from the same base model instead of loading the base model once for each fine-tuned model, which would cost more memory. Moreover, multiple adapters can be activated at the same time. This way, the model may leverage the learnings from all those adapters at the same time. As an example, if you have a diffusion model, you may want to use one LoRA adapter to change the style and a different one to change the subject.

Activating multiple adapters at the same time is generally possible on all PEFT methods (LoRA, LoHa, IA³, etc.) except for prompt learning methods (p-tuning, prefix tuning, etc.). The following example illustrates how to achieve this:

```python
from transformers import AutoModelForCausalLM
from peft import PeftModel

model_id = ...
base_model = AutoModelForCausalLM.from_pretrained(model_id)
model = PeftModel.from_pretrained(base_model, lora_path_0)  # default adapter_name is 'default'
model.load_adapter(lora_path_1, adapter_name="other")
# the 'other' adapter was loaded but it's not active yet, so to activate both adapters:
model.base_model.set_adapter(["default", "other"])
```

> [!TIP]
> In the example above, you can see that we need to call `model.base_model.set_adapter(["default", "other"])`. Why can we not call `model.set_adapter(["default", "other"])`? This is unfortunately not possible because, as explained earlier, some PEFT methods don't support activating more than one adapter at a time.

It is also possible to train two adapters at the same time, but you should be careful to ensure that the weights of both adapters are known to the optimizer. Otherwise, only one adapter will receive updates.

```python
from transformers import AutoModelForCausalLM
from peft import LoraConfig, get_peft_model

model_id = ...
base_model = AutoModelForCausalLM.from_pretrained(model_id)
lora_config_0 = LoraConfig(...)
lora_config_1 = LoraConfig(...)
model = get_peft_model(base_model, lora_config_0)
model.add_adapter(adapter_name="other", peft_config=lora_config_1)
```

If we were now to call:

```python
from transformers import Trainer

trainer = Trainer(model=model, ...)
trainer.train()
```

or

```python
optimizer = torch.optim.AdamW([param for param in model.parameters() if param.requires_grad], ...)
```

then the second LoRA adapter (`"other"`) would not be trained. This is because it is inactive at this moment, which means the `requires_grad` attribute on its parameters is set to `False` and the optimizer will ignore it. Therefore, make sure to activate all adapters that should be trained _before_ initializing the optimizer:

```python
# activate all adapters
model.base_model.set_adapter(["default", "other"])
trainer = Trainer(model=model, ...)
trainer.train()
```

> [!TIP]
> This section deals with using multiple adapters _of the same type_ on the same model, for example, using multiple LoRA adapters at the same time. It does not apply to using _different types_ of adapters on the same model, for example one LoRA adapter and one LoHa adapter. For this, please check [`PeftMixedModel`](https://huggingface.co/docs/peft/developer_guides/mixed_models).
docs/source/package_reference/delora.md (new file, 35 lines)
@@ -0,0 +1,35 @@
<!--Copyright 2025 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.

⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.

-->

# DeLoRA: Decoupled Low-rank Adaptation

[DeLoRA](https://huggingface.co/papers/2503.18225) is a parameter-efficient fine-tuning technique that implicitly maintains a Frobenius boundary with respect to the pretrained weights by normalizing and scaling learnable low-rank matrices. This effectively decouples the learning of directions (BA term) and magnitude (boundary term) of the weight updates, avoiding catastrophic shifts in the adapted weights and enhancing robustness to hyperparameter choices.

Note:
- use a 10-100x larger learning rate than with standard LoRA variants (typical values are around 1e-3 to 1e-2)
- do not set the initial boundary parameter lambda too small (typical values are around 10 to 15)
- setting different lambdas for different layers is possible
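A minimal configuration sketch reflecting these notes; the values mirror the example script shipped with this release and are not the only reasonable choices:

```python
from peft import DeloraConfig, get_peft_model

# delora_lambda replaces lora_alpha and bounds the Frobenius norm of the update
config = DeloraConfig(r=32, delora_lambda=15, target_modules=["q_proj", "v_proj"])
peft_model = get_peft_model(base_model, config)  # base_model: any causal LM loaded with transformers
peft_model.print_trainable_parameters()
```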
The abstract from the paper is:

> Parameter-Efficient FineTuning (PEFT) methods have recently gained significant popularity thanks to the widespread availability of large-scale pretrained models. These methods allow for quick adaptation to downstream tasks with minimal computational cost. However, popular finetuning methods such as LoRA exhibit limited robustness when it comes to hyperparameter choices or extended training regimes, preventing optimal out-of-the-box performance. In contrast, bounded approaches, such as ETHER, provide greater robustness but are limited to extremely low-rank adaptations and fixed-strength transformations, reducing their adaptation expressive power. In this work, we propose Decoupled Low-rank Adaptation (DeLoRA), a novel finetuning method that normalizes and scales learnable low-rank matrices. By bounding the distance of the transformation, DeLoRA effectively decouples the angular learning from the adaptation strength, enhancing robustness without compromising performance. Through evaluations on subject-driven image generation, natural language understanding, and instruction tuning, we show that DeLoRA matches or surpasses performance of competing PEFT methods, while exhibiting stronger robustness.

## DeloraConfig

[[autodoc]] tuners.delora.config.DeloraConfig

## DeloraModel

[[autodoc]] tuners.delora.model.DeloraModel
docs/source/package_reference/functional.md (new file, 37 lines)
@@ -0,0 +1,37 @@
<!--⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.
-->

# Functions for PEFT integration

A collection of functions that could be useful for non-PeftModel models, e.g. for transformers or diffusers integrations.

The functions provided here can be considered "public API" of PEFT and hence are safe to be used by packages that provide PEFT integrations.
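As a rough sketch of how an integration might combine two of the functions documented below (the base model is a placeholder; exact signatures are given in the entries that follow):

```python
from peft import LoraConfig
from peft.functional import get_peft_model_state_dict, inject_adapter_in_model

base_model = ...  # any torch.nn.Module, e.g. a transformers model

# attach LoRA layers in place, without wrapping the model in a PeftModel
lora_config = LoraConfig(target_modules=["q_proj", "v_proj"])
model = inject_adapter_in_model(lora_config, base_model)

# later, extract only the adapter weights, e.g. to save them separately
adapter_state_dict = get_peft_model_state_dict(model)
```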
## Cast the adapter weight dtypes
[[autodoc]] functional.cast_adapter_dtype
    - all

## Delete the PEFT adapter from model
[[autodoc]] functional.delete_adapter
    - all

## Get the state dict of the PEFT adapter
[[autodoc]] functional.get_peft_model_state_dict
    - all

## Inject a PEFT adapter into the model based on a PEFT config
[[autodoc]] functional.inject_adapter_in_model
    - all

## Set the active PEFT adapter(s) of the model
[[autodoc]] functional.set_adapter
    - all

## Set the `requires_grad` attribute of the specified adapters
[[autodoc]] functional.set_requires_grad
    - all

## Load the weights of the PEFT state dict into the model
[[autodoc]] functional.set_peft_model_state_dict
    - all
docs/source/package_reference/waveft.md (new file, 35 lines)
@@ -0,0 +1,35 @@
<!--Copyright 2025 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.

⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.

-->

# WaveFT: Wavelet Fine-Tuning

[WaveFT](https://arxiv.org/abs/2505.12532) is a novel parameter-efficient fine-tuning (PEFT) method that introduces sparse updates in the **wavelet domain** of residual matrices. Unlike LoRA, which is constrained by discrete low-rank choices, WaveFT enables fine-grained control over the number of trainable parameters by directly learning a sparse set of coefficients in the transformed space. These coefficients are then mapped back to the weight domain via the Inverse Discrete Wavelet Transform (IDWT), producing high-rank updates without incurring inference overhead.

WaveFT currently has the following constraint:

- Only `nn.Linear` layers are supported.
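A minimal, hedged configuration sketch; apart from `target_modules`, the method-specific parameters are left at their defaults here:

```python
from peft import WaveFTConfig, get_peft_model

# target only nn.Linear projection layers, per the constraint above
config = WaveFTConfig(target_modules=["q_proj", "v_proj"])
peft_model = get_peft_model(base_model, config)  # base_model: a transformers model with q_proj/v_proj linear layers
peft_model.print_trainable_parameters()
```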
The abstract from the paper is:

> Efficiently adapting large foundation models is critical, especially with tight compute and memory budgets. Parameter-Efficient Fine-Tuning (PEFT) methods such as LoRA offer limited granularity and effectiveness in few-parameter regimes. We propose Wavelet Fine-Tuning (WaveFT), a novel PEFT method that learns highly sparse updates in the wavelet domain of residual matrices. WaveFT allows precise control of trainable parameters, offering fine-grained capacity adjustment and excelling with remarkably low parameter count, potentially far fewer than LoRA’s minimum—ideal for extreme parameter-efficient scenarios. Evaluated on personalized text-to-image generation using Stable Diffusion XL as baseline, WaveFT significantly outperforms LoRA and other PEFT methods, especially at low parameter counts; achieving superior subject fidelity, prompt alignment, and image diversity.

## WaveFTConfig

[[autodoc]] tuners.waveft.config.WaveFTConfig

## WaveFTModel

[[autodoc]] tuners.waveft.model.WaveFTModel
@@ -36,11 +36,8 @@ from peft import LoraConfig, TaskType
peft_config = LoraConfig(task_type=TaskType.SEQ_2_SEQ_LM, inference_mode=False, r=8, lora_alpha=32, lora_dropout=0.1)
```

<Tip>

See the [`LoraConfig`] reference for more details about other parameters you can adjust, such as the modules to target or the bias type.

</Tip>
> [!TIP]
> See the [`LoraConfig`] reference for more details about other parameters you can adjust, such as the modules to target or the bias type.

Once the [`LoraConfig`] is setup, create a [`PeftModel`] with the [`get_peft_model`] function. It takes a base model - which you can load from the Transformers library - and the [`LoraConfig`] containing the parameters for how to configure a model for training with LoRA.
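For instance, a short sketch using the configuration above (the `bigscience/mt0-large` checkpoint is an assumption that matches the seq2seq task type):

```python
from transformers import AutoModelForSeq2SeqLM
from peft import get_peft_model

model = AutoModelForSeq2SeqLM.from_pretrained("bigscience/mt0-large")
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()
```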
@@ -124,11 +121,8 @@ Both methods only save the extra PEFT weights that were trained, meaning it is s

## Inference

<Tip>

Take a look at the [AutoPeftModel](package_reference/auto_class) API reference for a complete list of available `AutoPeftModel` classes.

</Tip>
> [!TIP]
> Take a look at the [AutoPeftModel](package_reference/auto_class) API reference for a complete list of available `AutoPeftModel` classes.

Easily load any PEFT-trained model for inference with the [`AutoPeftModel`] class and the [`~transformers.PreTrainedModel.from_pretrained`] method:
@@ -20,11 +20,8 @@ rendered properly in your Markdown viewer.

This guide will show you how to train a sequence-to-sequence model with IA3 to *generate a sentiment* given some financial news.

<Tip>

Some familiarity with the general process of training a sequence-to-sequence model would be really helpful and allow you to focus on how to apply IA3. If you’re new, we recommend taking a look at the [Translation](https://huggingface.co/docs/transformers/tasks/translation) and [Summarization](https://huggingface.co/docs/transformers/tasks/summarization) guides first from the Transformers documentation. When you’re ready, come back and see how easy it is to drop PEFT in to your training!

</Tip>
> [!TIP]
> Some familiarity with the general process of training a sequence-to-sequence model would be really helpful and allow you to focus on how to apply IA3. If you’re new, we recommend taking a look at the [Translation](https://huggingface.co/docs/transformers/tasks/translation) and [Summarization](https://huggingface.co/docs/transformers/tasks/summarization) guides first from the Transformers documentation. When you’re ready, come back and see how easy it is to drop PEFT in to your training!

## Dataset

@@ -123,11 +120,8 @@ model = AutoModelForSeq2SeqLM.from_pretrained("bigscience/mt0-large")

All PEFT methods need a configuration that contains and specifies all the parameters for how the PEFT method should be applied. Create an [`IA3Config`] with the task type and set the inference mode to `False`. You can find additional parameters for this configuration in the [API reference](../package_reference/ia3#ia3config).

<Tip>

Call the [`~PeftModel.print_trainable_parameters`] method to compare the number of trainable parameters of [`PeftModel`] versus the number of parameters in the base model!

</Tip>
> [!TIP]
> Call the [`~PeftModel.print_trainable_parameters`] method to compare the number of trainable parameters of [`PeftModel`] versus the number of parameters in the base model!

Once the configuration is setup, pass it to the [`get_peft_model`] function along with the base model to create a trainable [`PeftModel`].
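A short sketch of those two steps, reusing the `bigscience/mt0-large` model loaded above:

```python
from peft import IA3Config, TaskType, get_peft_model

peft_config = IA3Config(task_type=TaskType.SEQ_2_SEQ_LM, inference_mode=False)
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()
```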
@@ -24,11 +24,8 @@ Additionally, PEFT supports the [X-LoRA](../conceptual_guides/adapter#mixture-of

This guide will show you how to quickly train an image classification model - with a low-rank decomposition method - to identify the class of food shown in an image.

<Tip>

Some familiarity with the general process of training an image classification model would be really helpful and allow you to focus on the low-rank decomposition methods. If you're new, we recommend taking a look at the [Image classification](https://huggingface.co/docs/transformers/tasks/image_classification) guide first from the Transformers documentation. When you're ready, come back and see how easy it is to drop PEFT in to your training!

</Tip>
> [!TIP]
> Some familiarity with the general process of training an image classification model would be really helpful and allow you to focus on the low-rank decomposition methods. If you're new, we recommend taking a look at the [Image classification](https://huggingface.co/docs/transformers/tasks/image_classification) guide first from the Transformers documentation. When you're ready, come back and see how easy it is to drop PEFT in to your training!

Before you begin, make sure you have all the necessary libraries installed.

@@ -150,11 +147,8 @@ model = AutoModelForImageClassification.from_pretrained(

Every PEFT method requires a configuration that holds all the parameters specifying how the PEFT method should be applied. Once the configuration is setup, pass it to the [`~peft.get_peft_model`] function along with the base model to create a trainable [`PeftModel`].

<Tip>

Call the [`~PeftModel.print_trainable_parameters`] method to compare the number of parameters of [`PeftModel`] versus the number of parameters in the base model!

</Tip>
> [!TIP]
> Call the [`~PeftModel.print_trainable_parameters`] method to compare the number of parameters of [`PeftModel`] versus the number of parameters in the base model!

<hfoptions id="loras">
<hfoption id="LoRA">
@@ -22,11 +22,8 @@ The PEFT library supports several types of prompting methods (p-tuning, prefix t

This guide will show you how to train a causal language model - with a soft prompting method - to *generate a classification* for whether a tweet is a complaint or not.

<Tip>

Some familiarity with the general process of training a causal language model would be really helpful and allow you to focus on the soft prompting methods. If you're new, we recommend taking a look at the [Causal language modeling](https://huggingface.co/docs/transformers/tasks/language_modeling) guide first from the Transformers documentation. When you're ready, come back and see how easy it is to drop PEFT in to your training!

</Tip>
> [!TIP]
> Some familiarity with the general process of training a causal language model would be really helpful and allow you to focus on the soft prompting methods. If you're new, we recommend taking a look at the [Causal language modeling](https://huggingface.co/docs/transformers/tasks/language_modeling) guide first from the Transformers documentation. When you're ready, come back and see how easy it is to drop PEFT in to your training!

Before you begin, make sure you have all the necessary libraries installed.

@@ -146,11 +143,8 @@ model = AutoModelForCausalLM.from_pretrained("bigscience/bloomz-560m")

For any PEFT method, you'll need to create a configuration which contains all the parameters that specify how the PEFT method should be applied. Once the configuration is setup, pass it to the [`~peft.get_peft_model`] function along with the base model to create a trainable [`PeftModel`].

<Tip>

Call the [`~PeftModel.print_trainable_parameters`] method to compare the number of trainable parameters of [`PeftModel`] versus the number of parameters in the base model!

</Tip>
> [!TIP]
> Call the [`~PeftModel.print_trainable_parameters`] method to compare the number of trainable parameters of [`PeftModel`] versus the number of parameters in the base model!

<hfoptions id="configurations">
<hfoption id="p-tuning">
@@ -22,11 +22,8 @@ The PEFT library is designed to help you quickly train large models on free or l

## PEFT configurations

<Tip>

Learn more about the parameters you can configure for each PEFT method in their respective API reference page.

</Tip>
> [!TIP]
> Learn more about the parameters you can configure for each PEFT method in their respective API reference page.

A configuration stores important parameters that specify how a particular PEFT method should be applied.

@@ -158,15 +155,12 @@ model = AutoModelForCausalLM.from_pretrained(config.base_model_name_or_path)
lora_model = PeftModel.from_pretrained(model, "ybelkada/opt-350m-lora")
```

<Tip>

By default, the [`PeftModel`] is set for inference, but if you'd like to train the adapter some more you can set `is_trainable=True`.

```py
lora_model = PeftModel.from_pretrained(model, "ybelkada/opt-350m-lora", is_trainable=True)
```

</Tip>
> [!TIP]
> By default, the [`PeftModel`] is set for inference, but if you'd like to train the adapter some more you can set `is_trainable=True`.
>
> ```py
> lora_model = PeftModel.from_pretrained(model, "ybelkada/opt-350m-lora", is_trainable=True)
> ```

The [`PeftModel.from_pretrained`] method is the most flexible way to load a [`PeftModel`] because it doesn't matter what model framework was used (Transformers, timm, a generic PyTorch model). Other classes, like [`AutoPeftModel`], are just a convenient wrapper around the base [`PeftModel`], and make it easier to load PEFT models directly from the Hub or locally where the PEFT weights are stored.
@@ -303,7 +303,7 @@ if __name__ == "__main__":
# Loading the model
base_model = AutoModelForCausalLM.from_pretrained(
MODEL_NAME,
torch_dtype=torch.bfloat16,
dtype=torch.bfloat16,
device_map="auto",
quantization_config=bnb_config,
)

@@ -22,7 +22,6 @@ from pathlib import Path

import numpy as np
import torch
import torch.utils.checkpoint
from accelerate import Accelerator
from diffusers import DDIMScheduler
from diffusers.utils import check_min_version
@@ -85,7 +84,7 @@ def main(args):
args.pretrained_model_name_or_path,
controlnet=controlnet,
unet=unet.model,
torch_dtype=torch.float32,
dtype=torch.float32,
requires_safety_checker=False,
).to(device)
|
@ -139,16 +139,16 @@ def main(args):
|
||||
cur_class_images = len(list(class_images_dir.iterdir()))
|
||||
|
||||
if cur_class_images < args.num_class_images:
|
||||
torch_dtype = torch.float16 if accelerator.device.type in ["cuda", "xpu"] else torch.float32
|
||||
dtype = torch.float16 if accelerator.device.type in ["cuda", "xpu"] else torch.float32
|
||||
if args.prior_generation_precision == "fp32":
|
||||
torch_dtype = torch.float32
|
||||
dtype = torch.float32
|
||||
elif args.prior_generation_precision == "fp16":
|
||||
torch_dtype = torch.float16
|
||||
dtype = torch.float16
|
||||
elif args.prior_generation_precision == "bf16":
|
||||
torch_dtype = torch.bfloat16
|
||||
dtype = torch.bfloat16
|
||||
pipeline = DiffusionPipeline.from_pretrained(
|
||||
args.pretrained_model_name_or_path,
|
||||
torch_dtype=torch_dtype,
|
||||
dtype=dtype,
|
||||
safety_checker=None,
|
||||
revision=args.revision,
|
||||
)
|
||||
|
@@ -11,7 +11,7 @@ from transformers import AutoTokenizer, AutoModelForCausalLM
from trl import SFTConfig, SFTTrainer
from datasets import load_dataset

model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf", torch_dtype=torch.bfloat16, device_map="auto")
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf", dtype=torch.bfloat16, device_map="auto")
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
tokenizer.pad_token_id = tokenizer.eos_token_id
bone_config = BoneConfig(
@@ -47,7 +47,7 @@ from peft import PeftModel
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(
"meta-llama/Llama-2-7b-hf", torch_dtype=torch.bfloat16, device_map="auto"
"meta-llama/Llama-2-7b-hf", dtype=torch.bfloat16, device_map="auto"
)
peft_model = PeftModel.from_pretrained(model, "bone-llama-2-7b")
```
|
@ -57,7 +57,7 @@ elif script_args.base_model_name_or_path is not None:
|
||||
print(f"No available pre-processed model, manually initialize a Bone using {script_args.base_model_name_or_path}.")
|
||||
model = AutoModelForCausalLM.from_pretrained(
|
||||
script_args.base_model_name_or_path,
|
||||
torch_dtype=(
|
||||
dtype=(
|
||||
torch.float16
|
||||
if script_args.bits == "fp16"
|
||||
else (torch.bfloat16 if script_args.bits == "bf16" else torch.float32)
|
||||
|
@ -78,7 +78,7 @@ from peft.tuners.lora.corda import preprocess_corda
|
||||
from trl import SFTConfig, SFTTrainer
|
||||
from datasets import load_dataset
|
||||
|
||||
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf", torch_dtype=torch.bfloat16, device_map="auto")
|
||||
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf", dtype=torch.bfloat16, device_map="auto")
|
||||
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
|
||||
tokenizer.pad_token_id = tokenizer.eos_token_id
|
||||
sampled_dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="train[:256]")
|
||||
@ -236,7 +236,7 @@ from peft import PeftModel
|
||||
from transformers import AutoModelForCausalLM
|
||||
|
||||
model = AutoModelForCausalLM.from_pretrained(
|
||||
"meta-llama/Llama-2-7b-hf", torch_dtype=torch.bfloat16, device_map="auto"
|
||||
"meta-llama/Llama-2-7b-hf", dtype=torch.bfloat16, device_map="auto"
|
||||
)
|
||||
# No SVD is performed during this step, and the base model remains unaltered.
|
||||
peft_model = PeftModel.from_pretrained(model, "corda-llama-2-7b-lora")
|
||||
|
@@ -229,7 +229,7 @@ def train():
print("Train in Full Finetuning mode")
model = transformers.AutoModelForCausalLM.from_pretrained(
script_args.model_name_or_path,
torch_dtype=torch.bfloat16,
dtype=torch.bfloat16,
device_map="auto",
)
trainable_params, all_param = get_nb_trainable_parameters(model)

@@ -49,7 +49,7 @@ def main(args):
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)

model = AutoModelForCausalLM.from_pretrained(
model_id, device_map="auto", torch_dtype=torch.float16, trust_remote_code=True
model_id, device_map="auto", dtype=torch.float16, trust_remote_code=True
)

# Collect data

@@ -553,7 +553,7 @@
"base_model = AutoModelForCausalLM.from_pretrained(\n",
"    model_id,\n",
"    cache_dir='.',\n",
"    torch_dtype=torch.float16,\n",
"    dtype=torch.float16,\n",
"    device_map='auto'\n",
")\n",
"\n",
examples/delora_finetuning/README.md (new file, 102 lines)
@ -0,0 +1,102 @@
|
||||
# DeLoRA: Decoupled Low-Rank Adaptation
|
||||
|
||||
## Introduction
|
||||
[DeLoRA](https://huggingface.co/papers/2503.18225) tackles finetuning in a Frobenius-norm bounded setup: this allows to prevent divergence from the pretrained model, effectively decoupling the learning of angles and magnitudes.
|
||||
|
||||
This is done by (i) normalization of the BA low-rank matrices, which bound the updates' Frobenius norm, (ii) learnable scaling lambda, which controls the update's boundary/magnitude, (iii) layer-wise scaling of ||W||, to adapt each update's norm to the original weights' norm.
|
||||
|
||||
## Quick start

With respect to your standard PEFT training procedure with LoRA, simply swap your `LoraConfig` for a `DeloraConfig`. Note however that the `lora_alpha` parameter is replaced by `delora_lambda`, which sets an upper bound on the Frobenius norm of the weight change.

```python
import torch
from peft import DeloraConfig, get_peft_model
from transformers import AutoTokenizer, AutoModelForCausalLM
from trl import SFTConfig, SFTTrainer
from datasets import load_dataset

model = AutoModelForCausalLM.from_pretrained("meta-llama/Meta-Llama-3-8B", dtype=torch.bfloat16, device_map="auto")
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B")
tokenizer.pad_token_id = tokenizer.eos_token_id
delora_config = DeloraConfig(r=32, delora_lambda=15)

peft_model = get_peft_model(model, delora_config)
peft_model.print_trainable_parameters()

dataset = load_dataset("imdb", split="train[:1%]")

training_args = SFTConfig(dataset_text_field="text", max_seq_length=128)
trainer = SFTTrainer(
    model=peft_model,
    args=training_args,
    train_dataset=dataset,
    processing_class=tokenizer,
)
trainer.train()
peft_model.save_pretrained("delora-llama-3-8b")
```

To use the fine-tuned DeLoRA modules, simply load them as follows:
```python
import torch
from peft import PeftModel
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Meta-Llama-3-8B", dtype=torch.bfloat16, device_map="auto"
)
peft_model = PeftModel.from_pretrained(model, "delora-llama-3-8b")
```

## Advanced Usage
In this script, the default DeLoRA target layers are the query and value projections of the Llama model. Adding adapters to more layers will increase memory usage. If you wish to apply DeLoRA to a different set of layers, you can specify them as follows:
```bash
python examples/delora_finetuning/delora_finetuning.py --base_model meta-llama/Meta-Llama-3-8B --target_modules "q_proj,k_proj,v_proj,o_proj"
```

Using different lambdas for different layers is also possible by setting `lambda_pattern`, as sketched below.
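For example, a minimal sketch, assuming `lambda_pattern` follows the same module-name-pattern-to-value mapping convention as LoRA's `rank_pattern`/`alpha_pattern` (check the `DeloraConfig` documentation for the exact format):

```python
from peft import DeloraConfig

# Hypothetical illustration: keep lambda=15 as the default bound,
# but allow a larger boundary for the MLP down projection.
delora_config = DeloraConfig(
    r=32,
    delora_lambda=15,
    target_modules=["q_proj", "v_proj", "down_proj"],
    lambda_pattern={"down_proj": 30},
)
```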
### Fine-tune
```bash
python delora_finetuning.py \
    --base_model "PATH_TO_MODEL" \
    --data_path "PATH_TO_DATASET" \
    --output_dir "PATH_TO_OUTPUT_DIR" \
    --batch_size 1 \
    --num_epochs 3 \
    --learning_rate 3e-3 \
    --cutoff_len 512 \
    --val_set_size 500 \
    --eval_step 10 \
    --save_step 100 \
    --device "auto" \
    --rank 32 \
    --delora_lambda 15 \
    --module_dropout 0.1 \
    --target_modules "q_proj,v_proj" \
    --hub_model_id "YOUR_HF_REPO" \
    --push_to_hub
```

## Additional Notes
### Best practices
- use a 10-100x larger learning rate than with standard LoRA variants (typical values are in the 1e-3 to 1e-2 range)
- do not set the initial boundary parameter lambda too small (typical values are around 10-15)

Both recommendations are illustrated in the short sketch below.
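As a rough starting point with the quick-start setup (illustrative values only, not prescribed by the paper; tune them for your task):

```python
from peft import DeloraConfig
from trl import SFTConfig

# Boundary parameter lambda: avoid values that are too small (typically around 10-15)
delora_config = DeloraConfig(r=32, delora_lambda=15)

# Learning rate roughly 10-100x larger than what is typical for LoRA (e.g. 2e-4 -> 3e-3)
training_args = SFTConfig(
    dataset_text_field="text",
    max_seq_length=128,
    learning_rate=3e-3,
)
```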
### DeLoRA vs DoRA
DeLoRA might feel quite similar to DoRA (given the shared goal of decoupling angular from magnitude learning); however, it presents key differences: (i) DoRA applies its normalization and scaling operations to the fully finetuned weights ($W + \Delta W$), and (ii) DoRA's normalization is performed on the column space of the weight matrices.

Conversely, DeLoRA (i) introduces the normalization and scaling operations directly on the weight update $\Delta W$, better preventing divergence from the pretrained model, and (ii) normalizes the inner low-dimensional space, which enforces a Frobenius-norm boundary on the weight update.
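In update form (a rough side-by-side sketch; DoRA is written in its column-norm formulation, while the DeLoRA expression is the same simplified form used in the introduction above, not the exact implementation):

$$\text{DoRA:}\quad W' = m \cdot \frac{W_0 + BA}{\lVert W_0 + BA \rVert_c}, \qquad \text{DeLoRA:}\quad W' = W_0 + \Delta W, \;\; \Delta W \propto \lambda\,\lVert W_0 \rVert\,\frac{BA}{\lVert BA \rVert_F},$$

where $\lVert \cdot \rVert_c$ denotes column-wise normalization and $m$ is DoRA's learnable magnitude vector: DoRA renormalizes the merged weights, whereas DeLoRA constrains only the update itself.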
## Citation
```
@inproceedings{bini2025decouplinganglesstrengthlowrank,
    title={Decoupling Angles and Strength in Low-rank Adaptation},
    author={Massimo Bini and Leander Girrbach and Zeynep Akata},
    year={2025},
    booktitle={International Conference on Learning Representations (ICLR)},
}
```
189
examples/delora_finetuning/delora_finetuning.py
Normal file
@ -0,0 +1,189 @@
|
||||
# This script is based on examples/randlora_finetuning/randlora_finetuning.py
|
||||
import os
|
||||
|
||||
import torch
|
||||
from datasets import load_dataset
|
||||
from transformers import (
|
||||
AutoModelForCausalLM,
|
||||
AutoTokenizer,
|
||||
DataCollatorForLanguageModeling,
|
||||
Trainer,
|
||||
TrainingArguments,
|
||||
)
|
||||
|
||||
from peft import DeloraConfig, get_peft_model
|
||||
|
||||
|
||||
def train_model(
|
||||
base_model: str,
|
||||
data_path: str,
|
||||
output_dir: str,
|
||||
batch_size: int,
|
||||
num_epochs: int,
|
||||
learning_rate: float,
|
||||
cutoff_len: int,
|
||||
val_set_size: int,
|
||||
eval_step: int,
|
||||
save_step: int,
|
||||
device: str,
|
||||
rank: int,
|
||||
delora_lambda: int,
|
||||
module_dropout: float,
|
||||
target_modules: str,
|
||||
hub_model_id: str,
|
||||
push_to_hub: bool,
|
||||
):
|
||||
os.environ["TOKENIZERS_PARALLELISM"] = "false"
|
||||
hf_token = os.getenv("HF_TOKEN")
|
||||
|
||||
# Setup device
|
||||
device = torch.device(device)
|
||||
print(f"Using device: {device}")
|
||||
|
||||
# load tokenizer
|
||||
tokenizer = AutoTokenizer.from_pretrained(base_model, token=hf_token)
|
||||
|
||||
# Compute type
|
||||
device_type = device.type
|
||||
device_module = getattr(torch, device_type, torch.cuda)
|
||||
bf16_supported = device_module.is_available() and device_module.is_bf16_supported()
|
||||
dtype = torch.bfloat16 if bf16_supported else torch.float32
|
||||
|
||||
# Load the base model
|
||||
model = AutoModelForCausalLM.from_pretrained(
|
||||
base_model,
|
||||
dtype=dtype,
|
||||
)
|
||||
|
||||
# DeLoRA config for the PEFT model
|
||||
peft_config = DeloraConfig(
|
||||
r=rank,
|
||||
delora_lambda=delora_lambda,
|
||||
target_modules=(target_modules.split(",") if target_modules else None),
|
||||
module_dropout=module_dropout,
|
||||
bias="none",
|
||||
)
|
||||
|
||||
# get the peft model with DeLoRA config
|
||||
model = get_peft_model(model, peft_config)
|
||||
|
||||
model.to(device) # MODEL TO ACCELERATOR
|
||||
tokenizer.pad_token = tokenizer.eos_token
|
||||
|
||||
# Load the dataset
|
||||
dataset = load_dataset(data_path)
|
||||
|
||||
def tokenize_function(examples):
|
||||
inputs = tokenizer(examples["text"], padding="max_length", truncation=True, max_length=cutoff_len)
|
||||
inputs["labels"] = inputs["input_ids"].copy() # setting labels for a language modeling task
|
||||
return inputs
|
||||
|
||||
# Tokenize the dataset and prepare for training
|
||||
tokenized_datasets = dataset.map(tokenize_function, batched=True, remove_columns=dataset["train"].column_names)
|
||||
|
||||
# Data collator to dynamically pad the batched examples
|
||||
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)
|
||||
|
||||
# Compute the total number of training steps, used for the warmup schedule
|
||||
max_steps = int((len(dataset["train"]) // batch_size) * num_epochs)
|
||||
|
||||
# Define training arguments
|
||||
training_args = TrainingArguments(
|
||||
output_dir=output_dir,
|
||||
num_train_epochs=num_epochs,
|
||||
per_device_train_batch_size=batch_size,
|
||||
per_device_eval_batch_size=batch_size,
|
||||
warmup_steps=int(max_steps * 0.1),  # 10% of total training steps
|
||||
weight_decay=0.0,
|
||||
logging_steps=eval_step,
|
||||
save_steps=save_step,
|
||||
save_total_limit=2,
|
||||
push_to_hub=push_to_hub,
|
||||
hub_model_id=hub_model_id,
|
||||
gradient_accumulation_steps=16,
|
||||
learning_rate=learning_rate,
|
||||
hub_token=hf_token,
|
||||
label_names=["labels"],
|
||||
)
|
||||
|
||||
# Clear accelerator cache to free memory
|
||||
device_module.empty_cache()
|
||||
|
||||
# Initialize the Trainer
|
||||
trainer = Trainer(
|
||||
model=model,
|
||||
args=training_args,
|
||||
train_dataset=tokenized_datasets["train"],
|
||||
eval_dataset=tokenized_datasets["test"],
|
||||
data_collator=data_collator,
|
||||
)
|
||||
|
||||
# Start model training
|
||||
trainer.train()
|
||||
|
||||
# Save and push the trained model and tokenizer
|
||||
if push_to_hub:
|
||||
# Push the main model to the hub
|
||||
trainer.push_to_hub(commit_message="Fine-tuned model")
|
||||
|
||||
# Save the model and tokenizer locally
|
||||
model.save_pretrained(output_dir)
|
||||
tokenizer.save_pretrained(output_dir)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
import argparse
|
||||
|
||||
parser = argparse.ArgumentParser(description="Fine-tune LLaMA with DeLoRA")
|
||||
parser.add_argument("--base_model", type=str, default="huggyllama/llama-7b", help="Base model path or name")
|
||||
parser.add_argument(
|
||||
"--data_path", type=str, default="timdettmers/openassistant-guanaco", help="Dataset path or name"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--output_dir", type=str, default="path/to/output", help="Output directory for the fine-tuned model"
|
||||
)
|
||||
parser.add_argument("--batch_size", type=int, default=1, help="Batch size")
|
||||
parser.add_argument("--num_epochs", type=int, default=1, help="Number of training epochs")
|
||||
parser.add_argument("--learning_rate", type=float, default=3e-3, help="Learning rate")
|
||||
parser.add_argument("--cutoff_len", type=int, default=512, help="Cutoff length for tokenization")
|
||||
parser.add_argument("--val_set_size", type=int, default=500, help="Validation set size")
|
||||
parser.add_argument("--eval_step", type=int, default=10, help="Evaluation step interval")
|
||||
parser.add_argument("--save_step", type=int, default=100, help="Save step interval")
|
||||
parser.add_argument("--device", type=str, default="auto", help="Device to use for training")
|
||||
parser.add_argument("--rank", type=int, default=32, help="DeLoRA basis rank")
|
||||
parser.add_argument("--delora_lambda", type=int, default=640, help="DeLoRA alpha")
|
||||
parser.add_argument("--module_dropout", type=float, default=0.05, help="DeLoRA dropout rate")
|
||||
parser.add_argument(
|
||||
"--target_modules", type=str, default=None, help="Comma-separated list of target modules for DeLoRA"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--hub_model_id",
|
||||
type=str,
|
||||
default="path/to/repo",
|
||||
help="Repository name to push the model on the Hugging Face Hub",
|
||||
)
|
||||
parser.add_argument("--push_to_hub", action="store_true", help="Whether to push the model to Hugging Face Hub")
|
||||
args = parser.parse_args()
|
||||
|
||||
if args.device == "auto":
|
||||
args.device = torch.accelerator.current_accelerator().type if hasattr(torch, "accelerator") else "cuda"
|
||||
|
||||
train_model(
|
||||
base_model=args.base_model,
|
||||
data_path=args.data_path,
|
||||
output_dir=args.output_dir,
|
||||
batch_size=args.batch_size,
|
||||
num_epochs=args.num_epochs,
|
||||
learning_rate=args.learning_rate,
|
||||
cutoff_len=args.cutoff_len,
|
||||
val_set_size=args.val_set_size,
|
||||
eval_step=args.eval_step,
|
||||
save_step=args.save_step,
|
||||
device=args.device,
|
||||
rank=args.rank,
|
||||
delora_lambda=args.delora_lambda,
|
||||
module_dropout=args.module_dropout,
|
||||
target_modules=args.target_modules,
|
||||
hub_model_id=args.hub_model_id,
|
||||
push_to_hub=args.push_to_hub,
|
||||
)
|
@ -6,7 +6,7 @@
|
||||
"id": "CV_gQs58bsvM"
|
||||
},
|
||||
"source": [
|
||||
"# Fine-tuning [Llama-3-8B](https://huggingface.co/meta-llama/Meta-Llama-3-8B) on [timdettmers/openassistant-guanaco](https://huggingface.co/datasets/timdettmers/openassistant-guanaco) Dataset using QDora (quantized Lora w/ use_dora=True) on T4 Free Colab GPU."
|
||||
"# Fine-tuning [Llama-3-8B](https://huggingface.co/meta-llama/Meta-Llama-3-8B) on [timdettmers/openassistant-guanaco](https://huggingface.co/datasets/timdettmers/openassistant-guanaco) Dataset using QDora (quantized Lora w/ use_dora=True)."
|
||||
]
|
||||
},
|
||||
{
|
||||
@ -1010,6 +1010,7 @@
|
||||
"top_p = 0.9\n",
|
||||
"temperature = 0.7\n",
|
||||
"user_question = \"What is the purpose of quantization in LLMs?\"\n",
|
||||
"device = torch.accelerator.current_accelerator().type if hasattr(torch, \"accelerator\") else \"cuda\"\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"prompt = (\n",
|
||||
@ -1021,7 +1022,7 @@
|
||||
"\n",
|
||||
"\n",
|
||||
"def generate(model, user_question, max_new_tokens=max_new_tokens, top_p=top_p, temperature=temperature):\n",
|
||||
" inputs = tokenizer(prompt.format(user_question=user_question), return_tensors=\"pt\").to(\"cuda\")\n",
|
||||
" inputs = tokenizer(prompt.format(user_question=user_question), return_tensors=\"pt\").to(device)\n",
|
||||
"\n",
|
||||
" outputs = model.generate(\n",
|
||||
" **inputs,\n",
|
||||
|
@ -13,7 +13,7 @@ from peft import LoraConfig, get_peft_model
|
||||
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer
|
||||
from datasets import load_dataset
|
||||
|
||||
model = AutoModelForCausalLM.from_pretrained("huggyllama/llama-7b", device_map="cuda")
|
||||
model = AutoModelForCausalLM.from_pretrained("huggyllama/llama-7b", device_map="auto")
|
||||
tokenizer = AutoTokenizer.from_pretrained("huggyllama/llama-7b")
|
||||
dataset = load_dataset("timdettmers/openassistant-guanaco", split="train")
|
||||
lora_config = LoraConfig(
|
||||
@ -70,7 +70,6 @@ python dora_finetuning.py \
|
||||
--quantize \
|
||||
--eval_step 10 \
|
||||
--save_step 100 \
|
||||
--device "cuda:0" \
|
||||
--lora_r 16 \
|
||||
--lora_alpha 32 \
|
||||
--lora_dropout 0.05 \
|
||||
|
@ -39,7 +39,10 @@ def train_model(
|
||||
hf_token = os.getenv("HF_TOKEN")
|
||||
|
||||
# Setup device
|
||||
device = torch.device(device)
|
||||
if device == "auto":
|
||||
device = torch.accelerator.current_accelerator().type if hasattr(torch, "accelerator") else "cuda"
|
||||
else:
|
||||
device = torch.device(device)
|
||||
print(f"Using device: {device}")
|
||||
|
||||
# load tokenizer
|
||||
@ -47,14 +50,16 @@ def train_model(
|
||||
|
||||
# QDoRA (quantized dora): IF YOU WANNA QUANTIZE THE MODEL
|
||||
if quantize:
|
||||
if (torch.cuda.is_available() and torch.cuda.is_bf16_supported()) or torch.xpu.is_available():
|
||||
bnb_4bit_compute_dtype = torch.bfloat16
|
||||
else:
|
||||
bnb_4bit_compute_dtype = torch.float16
|
||||
model = AutoModelForCausalLM.from_pretrained(
|
||||
base_model,
|
||||
token=hf_token,
|
||||
quantization_config=BitsAndBytesConfig(
|
||||
load_in_4bit=True,
|
||||
bnb_4bit_compute_dtype=(
|
||||
torch.bfloat16 if torch.cuda.is_available() and torch.cuda.is_bf16_supported() else torch.float16
|
||||
),
|
||||
bnb_4bit_compute_dtype=bnb_4bit_compute_dtype,
|
||||
bnb_4bit_use_double_quant=True,
|
||||
bnb_4bit_quant_type="nf4",
|
||||
),
|
||||
@ -117,8 +122,11 @@ def train_model(
|
||||
hub_token=hf_token,
|
||||
)
|
||||
|
||||
# Clear CUDA cache to free memory
|
||||
torch.cuda.empty_cache()
|
||||
# Clear device cache to free memory
|
||||
if torch.cuda.is_available():
|
||||
torch.cuda.empty_cache()
|
||||
elif torch.xpu.is_available():
|
||||
torch.xpu.empty_cache()
|
||||
|
||||
# Initialize the Trainer
|
||||
trainer = Trainer(
|
||||
@ -162,7 +170,7 @@ if __name__ == "__main__":
|
||||
parser.add_argument("--quantize", action="store_true", help="Use quantization")
|
||||
parser.add_argument("--eval_step", type=int, default=10, help="Evaluation step interval")
|
||||
parser.add_argument("--save_step", type=int, default=100, help="Save step interval")
|
||||
parser.add_argument("--device", type=str, default="cuda:0", help="Device to use for training")
|
||||
parser.add_argument("--device", type=str, default="auto", help="Device to use for training")
|
||||
parser.add_argument("--lora_r", type=int, default=8, help="LoRA rank")
|
||||
parser.add_argument("--lora_alpha", type=int, default=16, help="LoRA alpha")
|
||||
parser.add_argument("--lora_dropout", type=float, default=0.05, help="LoRA dropout rate")
|
||||
|
@ -13,30 +13,26 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"execution_count": 1,
|
||||
"metadata": {
|
||||
"colab": {
|
||||
"base_uri": "https://localhost:8080/"
|
||||
},
|
||||
"execution": {
|
||||
"iopub.execute_input": "2024-11-01T04:23:43.891513Z",
|
||||
"iopub.status.busy": "2024-11-01T04:23:43.890672Z",
|
||||
"iopub.status.idle": "2024-11-01T04:24:09.121446Z",
|
||||
"shell.execute_reply": "2024-11-01T04:24:09.119925Z",
|
||||
"shell.execute_reply.started": "2024-11-01T04:23:43.891464Z"
|
||||
},
|
||||
"id": "o52TJHcYD25q",
|
||||
"outputId": "c5482c79-ff56-4ffa-d20c-46c3d30d2cd5",
|
||||
"trusted": true
|
||||
"outputId": "c5482c79-ff56-4ffa-d20c-46c3d30d2cd5"
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"\n",
|
||||
"[notice] A new release of pip is available: 24.0 -> 24.3.1\n",
|
||||
"[notice] To update, run: python.exe -m pip install --upgrade pip\n"
|
||||
"\u001b[33m DEPRECATION: Building 'rouge-score' using the legacy setup.py bdist_wheel mechanism, which will be removed in a future version. pip 25.3 will enforce this behaviour change. A possible replacement is to use the standardized build interface by setting the `--use-pep517` option, (possibly combined with `--no-build-isolation`), or adding a `pyproject.toml` file to the source tree of 'rouge-score'. Discussion can be found at https://github.com/pypa/pip/issues/6334\u001b[0m\u001b[33m\n",
|
||||
"\u001b[0m\u001b[33m DEPRECATION: Building 'sqlitedict' using the legacy setup.py bdist_wheel mechanism, which will be removed in a future version. pip 25.3 will enforce this behaviour change. A possible replacement is to use the standardized build interface by setting the `--use-pep517` option, (possibly combined with `--no-build-isolation`), or adding a `pyproject.toml` file to the source tree of 'sqlitedict'. Discussion can be found at https://github.com/pypa/pip/issues/6334\u001b[0m\u001b[33m\n",
|
||||
"\u001b[0m\u001b[33m DEPRECATION: Building 'word2number' using the legacy setup.py bdist_wheel mechanism, which will be removed in a future version. pip 25.3 will enforce this behaviour change. A possible replacement is to use the standardized build interface by setting the `--use-pep517` option, (possibly combined with `--no-build-isolation`), or adding a `pyproject.toml` file to the source tree of 'word2number'. Discussion can be found at https://github.com/pypa/pip/issues/6334\u001b[0m\u001b[33m\n",
|
||||
"\u001b[0m\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager, possibly rendering your system unusable. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv. Use the --root-user-action option if you know what you are doing and want to suppress this warning.\u001b[0m\u001b[33m\n",
|
||||
"\u001b[0m\n",
|
||||
"\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m25.1.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m25.2\u001b[0m\n",
|
||||
"\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpython3 -m pip install --upgrade pip\u001b[0m\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
@ -56,64 +52,184 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"execution_count": 3,
|
||||
"metadata": {
|
||||
"colab": {
|
||||
"base_uri": "https://localhost:8080/"
|
||||
},
|
||||
"execution": {
|
||||
"iopub.execute_input": "2024-11-01T04:24:09.124656Z",
|
||||
"iopub.status.busy": "2024-11-01T04:24:09.123802Z",
|
||||
"iopub.status.idle": "2024-11-01T04:31:18.166404Z",
|
||||
"shell.execute_reply": "2024-11-01T04:31:18.165468Z",
|
||||
"shell.execute_reply.started": "2024-11-01T04:24:09.124605Z"
|
||||
},
|
||||
"id": "hwJIYD5KD25q",
|
||||
"outputId": "51e69f81-d048-46b2-9699-658d3ffc5f08",
|
||||
"trusted": true
|
||||
"outputId": "51e69f81-d048-46b2-9699-658d3ffc5f08"
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"2024-11-01:20:45:03,210 INFO [evaluator.py:164] Setting random seed to 0 | Setting numpy seed to 1234 | Setting torch manual seed to 1234 | Setting fewshot manual seed to 1234\n",
|
||||
"2024-11-01:20:45:03,211 INFO [evaluator.py:188] Initializing hf model, with arguments: {'pretrained': 'bert-base-cased', 'dtype': 'bfloat16'}\n",
|
||||
"2024-11-01:20:45:03,213 INFO [huggingface.py:129] Using device 'cuda:0'\n",
|
||||
"2024-11-01:20:45:03,450 INFO [huggingface.py:481] Using model type 'default'\n",
|
||||
"2024-11-01:20:45:03,741 INFO [huggingface.py:365] Model parallel was set to False, max memory was not set, and device map was set to {'': 'cuda:0'}\n",
|
||||
"If you want to use `BertLMHeadModel` as a standalone, add `is_decoder=True.`\n",
|
||||
"2024-11-01:20:45:15,862 INFO [task.py:415] Building contexts for hellaswag on rank 0...\n",
|
||||
"100%|██████████| 10042/10042 [00:02<00:00, 4477.77it/s]\n",
|
||||
"2024-11-01:20:45:18,875 INFO [evaluator.py:489] Running loglikelihood requests\n",
|
||||
"Running loglikelihood requests: 100%|██████████| 40168/40168 [00:34<00:00, 1152.65it/s]\n"
|
||||
"If you want to use `BertLMHeadModel` as a standalone, add `is_decoder=True.`\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"application/vnd.jupyter.widget-view+json": {
|
||||
"model_id": "7b1ea8948a0747bc98795d6459270044",
|
||||
"version_major": 2,
|
||||
"version_minor": 0
|
||||
},
|
||||
"text/plain": [
|
||||
"README.md: 0.00B [00:00, ?B/s]"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"application/vnd.jupyter.widget-view+json": {
|
||||
"model_id": "4ec51e06812446899b66826c41697f8d",
|
||||
"version_major": 2,
|
||||
"version_minor": 0
|
||||
},
|
||||
"text/plain": [
|
||||
"data/train-00000-of-00001.parquet: 0%| | 0.00/24.4M [00:00<?, ?B/s]"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"application/vnd.jupyter.widget-view+json": {
|
||||
"model_id": "fbd73be60e5a4bc68d1347504a6b7070",
|
||||
"version_major": 2,
|
||||
"version_minor": 0
|
||||
},
|
||||
"text/plain": [
|
||||
"data/test-00000-of-00001.parquet: 0%| | 0.00/6.11M [00:00<?, ?B/s]"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"application/vnd.jupyter.widget-view+json": {
|
||||
"model_id": "0c887d06a56a410eae563e11a3080a52",
|
||||
"version_major": 2,
|
||||
"version_minor": 0
|
||||
},
|
||||
"text/plain": [
|
||||
"data/validation-00000-of-00001.parquet: 0%| | 0.00/6.32M [00:00<?, ?B/s]"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"application/vnd.jupyter.widget-view+json": {
|
||||
"model_id": "adb9c60c23f74f7d9af72ea2e34bc22c",
|
||||
"version_major": 2,
|
||||
"version_minor": 0
|
||||
},
|
||||
"text/plain": [
|
||||
"Generating train split: 0%| | 0/39905 [00:00<?, ? examples/s]"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"application/vnd.jupyter.widget-view+json": {
|
||||
"model_id": "b3418259aeaf4d459d5ed4fe9b8434fc",
|
||||
"version_major": 2,
|
||||
"version_minor": 0
|
||||
},
|
||||
"text/plain": [
|
||||
"Generating test split: 0%| | 0/10003 [00:00<?, ? examples/s]"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"application/vnd.jupyter.widget-view+json": {
|
||||
"model_id": "da448b2a7a534fec9045c086fbda5d0d",
|
||||
"version_major": 2,
|
||||
"version_minor": 0
|
||||
},
|
||||
"text/plain": [
|
||||
"Generating validation split: 0%| | 0/10042 [00:00<?, ? examples/s]"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"application/vnd.jupyter.widget-view+json": {
|
||||
"model_id": "83f04bb57ab94be58080e8c67675a4e5",
|
||||
"version_major": 2,
|
||||
"version_minor": 0
|
||||
},
|
||||
"text/plain": [
|
||||
"Map: 0%| | 0/39905 [00:00<?, ? examples/s]"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"application/vnd.jupyter.widget-view+json": {
|
||||
"model_id": "f08ebbc81eaa45818c85e51169ac48cf",
|
||||
"version_major": 2,
|
||||
"version_minor": 0
|
||||
},
|
||||
"text/plain": [
|
||||
"Map: 0%| | 0/10042 [00:00<?, ? examples/s]"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"100%|█████████████████████████████████████████████████████████████████████████████████████████████| 10042/10042 [00:02<00:00, 4111.19it/s]\n",
|
||||
"Running loglikelihood requests: 0%| | 0/40168 [00:00<?, ?it/s]We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.\n",
|
||||
"Running loglikelihood requests: 100%|██████████████████████████████████████████████████████████████| 40168/40168 [02:40<00:00, 250.28it/s]\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"{'hellaswag': {'alias': 'hellaswag',\n",
|
||||
" 'acc,none': 0.24905397331208923,\n",
|
||||
" 'acc_stderr,none': 0.004315812968431576,\n",
|
||||
" 'acc_norm,none': 0.2439753037243577,\n",
|
||||
" 'acc_norm_stderr,none': 0.004286002710084076}}"
|
||||
" 'acc,none': 0.24915355506871142,\n",
|
||||
" 'acc_stderr,none': 0.004316389476434537,\n",
|
||||
" 'acc_norm,none': 0.244672376020713,\n",
|
||||
" 'acc_norm_stderr,none': 0.004290142029921662}}"
|
||||
]
|
||||
},
|
||||
"execution_count": 4,
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import torch\n",
|
||||
"import lm_eval\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"device = torch.accelerator.current_accelerator().type if hasattr(torch, \"accelerator\") else \"cuda\"\n",
|
||||
"output = lm_eval.simple_evaluate(model = 'hf',\n",
|
||||
" model_args = {\n",
|
||||
" 'pretrained' : 'bert-base-cased',\n",
|
||||
" 'dtype' : 'bfloat16'},\n",
|
||||
" tasks = 'hellaswag',\n",
|
||||
" device = 'cuda:0',\n",
|
||||
" device = device,\n",
|
||||
" batch_size = 128,\n",
|
||||
" log_samples = False)\n",
|
||||
"output[\"results\"]"
|
||||
@ -130,17 +246,9 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"execution_count": 4,
|
||||
"metadata": {
|
||||
"execution": {
|
||||
"iopub.execute_input": "2024-11-01T04:31:18.168035Z",
|
||||
"iopub.status.busy": "2024-11-01T04:31:18.167698Z",
|
||||
"iopub.status.idle": "2024-11-01T04:31:19.278584Z",
|
||||
"shell.execute_reply": "2024-11-01T04:31:19.277820Z",
|
||||
"shell.execute_reply.started": "2024-11-01T04:31:18.168000Z"
|
||||
},
|
||||
"id": "FmtVeh7QD25r",
|
||||
"trusted": true
|
||||
"id": "FmtVeh7QD25r"
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
@ -155,21 +263,13 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 10,
|
||||
"execution_count": 5,
|
||||
"metadata": {
|
||||
"colab": {
|
||||
"base_uri": "https://localhost:8080/"
|
||||
},
|
||||
"execution": {
|
||||
"iopub.execute_input": "2024-11-01T04:31:19.282259Z",
|
||||
"iopub.status.busy": "2024-11-01T04:31:19.281242Z",
|
||||
"iopub.status.idle": "2024-11-01T04:31:19.565464Z",
|
||||
"shell.execute_reply": "2024-11-01T04:31:19.564510Z",
|
||||
"shell.execute_reply.started": "2024-11-01T04:31:19.282213Z"
|
||||
},
|
||||
"id": "rHF7tzN9D25r",
|
||||
"outputId": "352ad9ab-2efc-41f8-c3d5-a7da05d9529b",
|
||||
"trusted": true
|
||||
"outputId": "352ad9ab-2efc-41f8-c3d5-a7da05d9529b"
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
@ -177,7 +277,8 @@
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']\n",
|
||||
"You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
|
||||
"You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n",
|
||||
"The 8-bit optimizer is not available on your device, only available on CUDA for now.\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
@ -211,7 +312,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 11,
|
||||
"execution_count": 6,
|
||||
"metadata": {
|
||||
"colab": {
|
||||
"base_uri": "https://localhost:8080/",
|
||||
@ -329,24 +430,149 @@
|
||||
"e3082a5e8a4144f5982ad478d9a54a2c"
|
||||
]
|
||||
},
|
||||
"execution": {
|
||||
"iopub.execute_input": "2024-11-01T04:31:19.567617Z",
|
||||
"iopub.status.busy": "2024-11-01T04:31:19.567073Z",
|
||||
"iopub.status.idle": "2024-11-01T04:32:36.026219Z",
|
||||
"shell.execute_reply": "2024-11-01T04:32:36.025274Z",
|
||||
"shell.execute_reply.started": "2024-11-01T04:31:19.567569Z"
|
||||
},
|
||||
"id": "8cZUKSQLD25r",
|
||||
"outputId": "0c0120e6-28f0-4496-a395-6d48d1b159e5",
|
||||
"trusted": true
|
||||
"outputId": "0c0120e6-28f0-4496-a395-6d48d1b159e5"
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Map: 100%|██████████| 25000/25000 [00:06<00:00, 3799.73 examples/s]\n"
|
||||
]
|
||||
"data": {
|
||||
"application/vnd.jupyter.widget-view+json": {
|
||||
"model_id": "ca48589e7f2f46b49b0c6f0f643cbcc8",
|
||||
"version_major": 2,
|
||||
"version_minor": 0
|
||||
},
|
||||
"text/plain": [
|
||||
"README.md: 0.00B [00:00, ?B/s]"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"application/vnd.jupyter.widget-view+json": {
|
||||
"model_id": "0ea02d82b14141c0a0237e8036404c84",
|
||||
"version_major": 2,
|
||||
"version_minor": 0
|
||||
},
|
||||
"text/plain": [
|
||||
"train-00000-of-00001.parquet: 0%| | 0.00/21.0M [00:00<?, ?B/s]"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"application/vnd.jupyter.widget-view+json": {
|
||||
"model_id": "c26d8db5c6ac402aa0d1042df7c10858",
|
||||
"version_major": 2,
|
||||
"version_minor": 0
|
||||
},
|
||||
"text/plain": [
|
||||
"test-00000-of-00001.parquet: 0%| | 0.00/20.5M [00:00<?, ?B/s]"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"application/vnd.jupyter.widget-view+json": {
|
||||
"model_id": "a6b57f45ca0e46edb407d7763e7cc141",
|
||||
"version_major": 2,
|
||||
"version_minor": 0
|
||||
},
|
||||
"text/plain": [
|
||||
"unsupervised-00000-of-00001.parquet: 0%| | 0.00/42.0M [00:00<?, ?B/s]"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"application/vnd.jupyter.widget-view+json": {
|
||||
"model_id": "2cb938dfb23240a9b5e2a85c3e6796a6",
|
||||
"version_major": 2,
|
||||
"version_minor": 0
|
||||
},
|
||||
"text/plain": [
|
||||
"Generating train split: 0%| | 0/25000 [00:00<?, ? examples/s]"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"application/vnd.jupyter.widget-view+json": {
|
||||
"model_id": "7e1d3d3c1e414338b44e886a0ed29b8b",
|
||||
"version_major": 2,
|
||||
"version_minor": 0
|
||||
},
|
||||
"text/plain": [
|
||||
"Generating test split: 0%| | 0/25000 [00:00<?, ? examples/s]"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"application/vnd.jupyter.widget-view+json": {
|
||||
"model_id": "c6147cbc8124478ea9eb921a2789d20e",
|
||||
"version_major": 2,
|
||||
"version_minor": 0
|
||||
},
|
||||
"text/plain": [
|
||||
"Generating unsupervised split: 0%| | 0/50000 [00:00<?, ? examples/s]"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"application/vnd.jupyter.widget-view+json": {
|
||||
"model_id": "ed1329e2506a48b7b30f05a3ded2c230",
|
||||
"version_major": 2,
|
||||
"version_minor": 0
|
||||
},
|
||||
"text/plain": [
|
||||
"Map: 0%| | 0/25000 [00:00<?, ? examples/s]"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"application/vnd.jupyter.widget-view+json": {
|
||||
"model_id": "1c6d66613cfc44d391d9a8bb3c58e732",
|
||||
"version_major": 2,
|
||||
"version_minor": 0
|
||||
},
|
||||
"text/plain": [
|
||||
"Map: 0%| | 0/25000 [00:00<?, ? examples/s]"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"application/vnd.jupyter.widget-view+json": {
|
||||
"model_id": "26681307716846d68eca88409b126248",
|
||||
"version_major": 2,
|
||||
"version_minor": 0
|
||||
},
|
||||
"text/plain": [
|
||||
"Map: 0%| | 0/50000 [00:00<?, ? examples/s]"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
@ -364,17 +590,9 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 12,
|
||||
"execution_count": 7,
|
||||
"metadata": {
|
||||
"execution": {
|
||||
"iopub.execute_input": "2024-11-01T04:32:36.027737Z",
|
||||
"iopub.status.busy": "2024-11-01T04:32:36.027412Z",
|
||||
"iopub.status.idle": "2024-11-01T04:32:36.033004Z",
|
||||
"shell.execute_reply": "2024-11-01T04:32:36.032106Z",
|
||||
"shell.execute_reply.started": "2024-11-01T04:32:36.027702Z"
|
||||
},
|
||||
"id": "bA3k0iVED25r",
|
||||
"trusted": true
|
||||
"id": "bA3k0iVED25r"
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
@ -389,225 +607,117 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 13,
|
||||
"execution_count": 8,
|
||||
"metadata": {
|
||||
"colab": {
|
||||
"base_uri": "https://localhost:8080/",
|
||||
"height": 380
|
||||
},
|
||||
"execution": {
|
||||
"iopub.execute_input": "2024-11-01T04:32:36.035037Z",
|
||||
"iopub.status.busy": "2024-11-01T04:32:36.034215Z",
|
||||
"iopub.status.idle": "2024-11-01T06:29:13.969507Z",
|
||||
"shell.execute_reply": "2024-11-01T06:29:13.968477Z",
|
||||
"shell.execute_reply.started": "2024-11-01T04:32:36.035004Z"
|
||||
},
|
||||
"id": "DFG74c3kD25s",
|
||||
"outputId": "5dd9f988-95db-4efb-e632-5f741801910a",
|
||||
"trusted": true
|
||||
"outputId": "5dd9f988-95db-4efb-e632-5f741801910a"
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
" 13%|█▎ | 500/3910 [08:16<56:48, 1.00it/s] "
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"{'loss': 0.3296, 'grad_norm': 1.0583174228668213, 'learning_rate': 0.0017442455242966753, 'epoch': 0.64}\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
" \n",
|
||||
" 20%|██ | 782/3910 [17:50<37:42, 1.38it/s]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"{'eval_loss': 0.2824118435382843, 'eval_accuracy': 0.8816, 'eval_runtime': 308.5134, 'eval_samples_per_second': 81.034, 'eval_steps_per_second': 1.267, 'epoch': 1.0}\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
" 26%|██▌ | 1000/3910 [21:14<45:16, 1.07it/s] "
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"{'loss': 0.2756, 'grad_norm': 1.9588807821273804, 'learning_rate': 0.0014884910485933505, 'epoch': 1.28}\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
" 38%|███▊ | 1500/3910 [29:05<38:42, 1.04it/s]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"{'loss': 0.2461, 'grad_norm': 0.7111016511917114, 'learning_rate': 0.0012327365728900255, 'epoch': 1.92}\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
" \n",
|
||||
" 40%|████ | 1564/3910 [35:10<28:13, 1.39it/s]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"{'eval_loss': 0.2278510481119156, 'eval_accuracy': 0.9102, 'eval_runtime': 304.8594, 'eval_samples_per_second': 82.005, 'eval_steps_per_second': 1.283, 'epoch': 2.0}\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
" 51%|█████ | 2000/3910 [42:06<29:44, 1.07it/s] "
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"{'loss': 0.2154, 'grad_norm': 1.8200898170471191, 'learning_rate': 0.000976982097186701, 'epoch': 2.56}\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
" \n",
|
||||
" 60%|██████ | 2346/3910 [52:29<18:45, 1.39it/s]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"{'eval_loss': 0.20936396718025208, 'eval_accuracy': 0.91996, 'eval_runtime': 299.637, 'eval_samples_per_second': 83.434, 'eval_steps_per_second': 1.305, 'epoch': 3.0}\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
" 64%|██████▍ | 2500/3910 [54:57<22:19, 1.05it/s] "
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"{'loss': 0.1988, 'grad_norm': 2.2769970893859863, 'learning_rate': 0.000721227621483376, 'epoch': 3.2}\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
" 77%|███████▋ | 3000/3910 [1:02:52<14:23, 1.05it/s]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"{'loss': 0.1635, 'grad_norm': 1.54856538772583, 'learning_rate': 0.00046547314578005116, 'epoch': 3.84}\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
" \n",
|
||||
" 80%|████████ | 3128/3910 [1:09:56<09:30, 1.37it/s]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"{'eval_loss': 0.21944810450077057, 'eval_accuracy': 0.92164, 'eval_runtime': 303.6132, 'eval_samples_per_second': 82.342, 'eval_steps_per_second': 1.288, 'epoch': 4.0}\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
" 90%|████████▉ | 3500/3910 [1:15:48<06:22, 1.07it/s] "
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"{'loss': 0.1355, 'grad_norm': 1.006062626838684, 'learning_rate': 0.00020971867007672635, 'epoch': 4.48}\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
" \n",
|
||||
"100%|██████████| 3910/3910 [1:27:17<00:00, 1.36it/s]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"{'eval_loss': 0.2205527275800705, 'eval_accuracy': 0.92556, 'eval_runtime': 302.417, 'eval_samples_per_second': 82.667, 'eval_steps_per_second': 1.293, 'epoch': 5.0}\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"100%|██████████| 3910/3910 [1:27:17<00:00, 1.34s/it]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"{'train_runtime': 5237.7573, 'train_samples_per_second': 23.865, 'train_steps_per_second': 0.747, 'train_loss': 0.213202116983321, 'epoch': 5.0}\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"\n"
|
||||
"No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"\n",
|
||||
" <div>\n",
|
||||
" \n",
|
||||
" <progress value='3910' max='3910' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
|
||||
" [3910/3910 40:13, Epoch 5/5]\n",
|
||||
" </div>\n",
|
||||
" <table border=\"1\" class=\"dataframe\">\n",
|
||||
" <thead>\n",
|
||||
" <tr style=\"text-align: left;\">\n",
|
||||
" <th>Epoch</th>\n",
|
||||
" <th>Training Loss</th>\n",
|
||||
" <th>Validation Loss</th>\n",
|
||||
" <th>Accuracy</th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>0.353800</td>\n",
|
||||
" <td>0.261258</td>\n",
|
||||
" <td>0.901160</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <td>2</td>\n",
|
||||
" <td>0.277400</td>\n",
|
||||
" <td>0.221651</td>\n",
|
||||
" <td>0.912480</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <td>3</td>\n",
|
||||
" <td>0.244500</td>\n",
|
||||
" <td>0.216107</td>\n",
|
||||
" <td>0.918200</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <td>4</td>\n",
|
||||
" <td>0.197000</td>\n",
|
||||
" <td>0.215257</td>\n",
|
||||
" <td>0.920040</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <td>5</td>\n",
|
||||
" <td>0.157700</td>\n",
|
||||
" <td>0.215050</td>\n",
|
||||
" <td>0.923240</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table><p>"
|
||||
],
|
||||
"text/plain": [
|
||||
"TrainOutput(global_step=3910, training_loss=0.213202116983321, metrics={'train_runtime': 5237.7573, 'train_samples_per_second': 23.865, 'train_steps_per_second': 0.747, 'total_flos': 3.300271872e+16, 'train_loss': 0.213202116983321, 'epoch': 5.0})"
|
||||
"<IPython.core.display.HTML object>"
|
||||
]
|
||||
},
|
||||
"execution_count": 13,
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"application/vnd.jupyter.widget-view+json": {
|
||||
"model_id": "7298a140779d4fd88a65a191af265821",
|
||||
"version_major": 2,
|
||||
"version_minor": 0
|
||||
},
|
||||
"text/plain": [
|
||||
"Downloading builder script: 0.00B [00:00, ?B/s]"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"application/vnd.jupyter.widget-view+json": {
|
||||
"model_id": "3c6b99b4b5854527a8b34b92a8d2986b",
|
||||
"version_major": 2,
|
||||
"version_minor": 0
|
||||
},
|
||||
"text/plain": [
|
||||
"Downloading builder script: 0.00B [00:00, ?B/s]"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"TrainOutput(global_step=3910, training_loss=0.24082870385835847, metrics={'train_runtime': 2416.0772, 'train_samples_per_second': 51.737, 'train_steps_per_second': 1.618, 'total_flos': 3.300271872e+16, 'train_loss': 0.24082870385835847, 'epoch': 5.0})"
|
||||
]
|
||||
},
|
||||
"execution_count": 8,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
@ -652,51 +762,11 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 16,
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"execution": {
|
||||
"iopub.execute_input": "2024-11-01T06:40:42.417713Z",
|
||||
"iopub.status.busy": "2024-11-01T06:40:42.416565Z",
|
||||
"iopub.status.idle": "2024-11-01T06:47:21.642618Z",
|
||||
"shell.execute_reply": "2024-11-01T06:47:21.641637Z",
|
||||
"shell.execute_reply.started": "2024-11-01T06:40:42.417667Z"
|
||||
},
|
||||
"id": "7tgAq7nLD25s",
|
||||
"trusted": true
|
||||
"id": "7tgAq7nLD25s"
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"2024-11-01:23:37:57,640 INFO [evaluator.py:164] Setting random seed to 0 | Setting numpy seed to 1234 | Setting torch manual seed to 1234 | Setting fewshot manual seed to 1234\n",
|
||||
"2024-11-01:23:37:57,641 INFO [evaluator.py:188] Initializing hf model, with arguments: {'pretrained': 'bert-base-cased', 'peft': './bert-lora-imdb/checkpoint-3910', 'dtype': 'bfloat16'}\n",
|
||||
"2024-11-01:23:37:57,643 INFO [huggingface.py:129] Using device 'cuda:0'\n",
|
||||
"2024-11-01:23:37:57,891 INFO [huggingface.py:481] Using model type 'default'\n",
|
||||
"2024-11-01:23:37:58,161 INFO [huggingface.py:365] Model parallel was set to False, max memory was not set, and device map was set to {'': 'cuda:0'}\n",
|
||||
"If you want to use `BertLMHeadModel` as a standalone, add `is_decoder=True.`\n",
|
||||
"2024-11-01:23:38:10,295 INFO [task.py:415] Building contexts for hellaswag on rank 0...\n",
|
||||
"100%|██████████| 10042/10042 [00:02<00:00, 4453.89it/s]\n",
|
||||
"2024-11-01:23:38:13,313 INFO [evaluator.py:489] Running loglikelihood requests\n",
|
||||
"Running loglikelihood requests: 100%|██████████| 40168/40168 [00:44<00:00, 893.15it/s] \n",
|
||||
"2024-11-01:23:39:12,119 WARNING [huggingface.py:1353] Failed to get model SHA for ./bert-lora-imdb/checkpoint-3910 at revision main. Error: Repo id must be in the form 'repo_name' or 'namespace/repo_name': './bert-lora-imdb/checkpoint-3910'. Use `repo_type` argument if needed.\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"{'hellaswag': {'alias': 'hellaswag',\n",
|
||||
" 'acc,none': 0.2535351523600876,\n",
|
||||
" 'acc_stderr,none': 0.00434145484189232,\n",
|
||||
" 'acc_norm,none': 0.24875522804222266,\n",
|
||||
" 'acc_norm_stderr,none': 0.0043140816086246455}}"
|
||||
]
|
||||
},
|
||||
"execution_count": 16,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# use the path of your checkpoint here\n",
|
||||
"output = lm_eval.simple_evaluate(model = 'hf',\n",
|
||||
@ -705,7 +775,7 @@
|
||||
" 'peft' : './bert-lora-imdb/checkpoint-3910',\n",
|
||||
" 'dtype' : 'bfloat16'},\n",
|
||||
" tasks = 'hellaswag',\n",
|
||||
" device = 'cuda:0',\n",
|
||||
" device = device,\n",
|
||||
" batch_size = 128,\n",
|
||||
" log_samples = False)\n",
|
||||
"\n",
|
||||
@ -736,7 +806,7 @@
|
||||
"sourceType": "notebook"
|
||||
},
|
||||
"kernelspec": {
|
||||
"display_name": ".peft",
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
@ -750,7 +820,7 @@
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.9"
|
||||
"version": "3.11.13"
|
||||
},
|
||||
"widgets": {
|
||||
"application/vnd.jupyter.widget-state+json": {
|
||||
|
@ -55,7 +55,7 @@ model = AutoModelForCausalLM.from_pretrained(
|
||||
bnb_4bit_use_double_quant=True,
|
||||
bnb_4bit_quant_type="nf4",
|
||||
),
|
||||
torch_dtype=torch.float16,
|
||||
dtype=torch.float16,
|
||||
)
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m")
|
||||
|
@ -141,16 +141,16 @@ def main(args):
|
||||
cur_class_images = len(list(class_images_dir.iterdir()))
|
||||
|
||||
if cur_class_images < args.num_class_images:
|
||||
torch_dtype = torch.float16 if accelerator.device.type in ["cuda", "xpu"] else torch.float32
|
||||
dtype = torch.float16 if accelerator.device.type in ["cuda", "xpu"] else torch.float32
|
||||
if args.prior_generation_precision == "fp32":
|
||||
torch_dtype = torch.float32
|
||||
dtype = torch.float32
|
||||
elif args.prior_generation_precision == "fp16":
|
||||
torch_dtype = torch.float16
|
||||
dtype = torch.float16
|
||||
elif args.prior_generation_precision == "bf16":
|
||||
torch_dtype = torch.bfloat16
|
||||
dtype = torch.bfloat16
|
||||
pipeline = DiffusionPipeline.from_pretrained(
|
||||
args.pretrained_model_name_or_path,
|
||||
torch_dtype=torch_dtype,
|
||||
dtype=dtype,
|
||||
safety_checker=None,
|
||||
revision=args.revision,
|
||||
)
|
||||
|
@ -196,7 +196,7 @@
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Overriding torch_dtype=None with `torch_dtype=torch.float16` due to requirements of `bitsandbytes` to enable model loading in mixed int8. Either pass torch_dtype=torch.float16 or don't pass this argument at all to remove this warning.\n"
|
||||
"Overriding dtype=None with `dtype=torch.float16` due to requirements of `bitsandbytes` to enable model loading in mixed int8. Either pass dtype=torch.float16 or don't pass this argument at all to remove this warning.\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
@ -1201,7 +1201,7 @@
|
||||
"peft_model_id = \"ybelkada/flan-t5-large-financial-phrasebank-lora\"\n",
|
||||
"config = PeftConfig.from_pretrained(peft_model_id)\n",
|
||||
"\n",
|
||||
"model = AutoModelForSeq2SeqLM.from_pretrained(config.base_model_name_or_path, torch_dtype=\"auto\", device_map=\"auto\")\n",
|
||||
"model = AutoModelForSeq2SeqLM.from_pretrained(config.base_model_name_or_path, dtype=\"auto\", device_map=\"auto\")\n",
|
||||
"tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)\n",
|
||||
"\n",
|
||||
"# Load the Lora model\n",
|
||||
|
@ -24,7 +24,7 @@ MODEL_ID = "LoftQ/Mistral-7B-v0.1-4bit-64rank"
|
||||
|
||||
base_model = AutoModelForCausalLM.from_pretrained(
|
||||
MODEL_ID,
|
||||
torch_dtype=torch.bfloat16, # you may change it with different models
|
||||
dtype=torch.bfloat16, # you may change it with different models
|
||||
quantization_config=BitsAndBytesConfig(
|
||||
load_in_4bit=True,
|
||||
bnb_4bit_compute_dtype=torch.bfloat16, # bfloat16 is recommended
|
||||
@ -81,7 +81,7 @@ MODEL_DIR = "model_zoo/loftq/Llama-2-7b-hf-4bit-16rank"
|
||||
|
||||
base_model = AutoModelForCausalLM.from_pretrained(
|
||||
MODEL_DIR,
|
||||
torch_dtype=torch.bfloat16,
|
||||
dtype=torch.bfloat16,
|
||||
quantization_config=BitsAndBytesConfig(
|
||||
load_in_4bit=True,
|
||||
bnb_4bit_compute_dtype=torch.bfloat16,
|
||||
|
@ -454,7 +454,7 @@ def main():
|
||||
load_in_4bit=True,
|
||||
bnb_4bit_use_double_quant=False,
|
||||
bnb_4bit_quant_type="nf4",
|
||||
bnb_4bit_compute_dtype=config.torch_dtype,
|
||||
bnb_4bit_compute_dtype=config.dtype,
|
||||
),
|
||||
)
|
||||
else:
|
||||
|
@ -628,16 +628,16 @@ def main(args):
|
||||
cur_class_images = len(list(class_images_dir.iterdir()))
|
||||
|
||||
if cur_class_images < args.num_class_images:
|
||||
torch_dtype = torch.float16 if accelerator.device.type in ["cuda", "xpu"] else torch.float32
|
||||
dtype = torch.float16 if accelerator.device.type in ["cuda", "xpu"] else torch.float32
|
||||
if args.prior_generation_precision == "fp32":
|
||||
torch_dtype = torch.float32
|
||||
dtype = torch.float32
|
||||
elif args.prior_generation_precision == "fp16":
|
||||
torch_dtype = torch.float16
|
||||
dtype = torch.float16
|
||||
elif args.prior_generation_precision == "bf16":
|
||||
torch_dtype = torch.bfloat16
|
||||
dtype = torch.bfloat16
|
||||
pipeline = DiffusionPipeline.from_pretrained(
|
||||
args.pretrained_model_name_or_path,
|
||||
torch_dtype=torch_dtype,
|
||||
dtype=dtype,
|
||||
safety_checker=None,
|
||||
revision=args.revision,
|
||||
)
|
||||
|
@ -72,14 +72,14 @@ def train_model(
|
||||
bnb_4bit_use_double_quant=False,
|
||||
bnb_4bit_quant_type="nf4",
|
||||
),
|
||||
torch_dtype=compute_dtype,
|
||||
dtype=compute_dtype,
|
||||
device_map=device_map,
|
||||
)
|
||||
# setup for quantized training
|
||||
model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=True)
|
||||
else:
|
||||
model = AutoModelForCausalLM.from_pretrained(
|
||||
base_model_name_or_path, torch_dtype=compute_dtype, device_map=device_map
|
||||
base_model_name_or_path, dtype=compute_dtype, device_map=device_map
|
||||
)
|
||||
|
||||
# LoRA config for the PEFT model
|
||||
|
@ -11,7 +11,7 @@ from transformers import AutoTokenizer, AutoModelForCausalLM
|
||||
from trl import SFTConfig, SFTTrainer
|
||||
from datasets import load_dataset
|
||||
|
||||
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf", torch_dtype=torch.bfloat16, device_map="auto")
|
||||
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf", dtype=torch.bfloat16, device_map="auto")
|
||||
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
|
||||
tokenizer.pad_token_id = tokenizer.eos_token_id
|
||||
|
||||
@ -55,7 +55,7 @@ from peft import PeftModel
|
||||
from transformers import AutoModelForCausalLM
|
||||
|
||||
model = AutoModelForCausalLM.from_pretrained(
|
||||
"meta-llama/Llama-2-7b-hf", torch_dtype=torch.bfloat16, device_map="auto"
|
||||
"meta-llama/Llama-2-7b-hf", dtype=torch.bfloat16, device_map="auto"
|
||||
)
|
||||
peft_model = PeftModel.from_pretrained(model, "miss-llama-2-7b")
|
||||
```
|
||||
|
@ -59,7 +59,7 @@ elif script_args.base_model_name_or_path is not None:
|
||||
print(f"No available pre-processed model, manually initialize a MiSS using {script_args.base_model_name_or_path}.")
|
||||
model = AutoModelForCausalLM.from_pretrained(
|
||||
script_args.base_model_name_or_path,
|
||||
torch_dtype=(
|
||||
dtype=(
|
||||
torch.float16
|
||||
if script_args.bits == "fp16"
|
||||
else (torch.bfloat16 if script_args.bits == "bf16" else torch.float32)
|
||||
|
@ -689,7 +689,7 @@
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"model = model.to(dtype=torch.float16, device=device)\n",
|
||||
"model = model.to(torch_dtype=torch.float16, device=device)\n",
|
||||
"\n",
|
||||
"pipe = DiffusionPipeline.from_pretrained(\n",
|
||||
" model_id, unet=model, variant=\"fp16\", torch_dtype=torch.float16,\n",
|
||||
@ -796,7 +796,7 @@
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"model = model.to(dtype=torch.float16, device=device)\n",
|
||||
"model = model.to(torch_dtype=torch.float16, device=device)\n",
|
||||
"\n",
|
||||
"pipe = DiffusionPipeline.from_pretrained(\n",
|
||||
" model_id, unet=model, variant=\"fp16\", torch_dtype=torch.float16,\n",
|
||||
@ -868,7 +868,7 @@
|
||||
"del pipe\n",
|
||||
"\n",
|
||||
"pipe = DiffusionPipeline.from_pretrained(\n",
|
||||
" model_id, variant=\"fp16\", torch_dtype=torch.float16,\n",
|
||||
" model_id, variant=\"fp16\", dtype=torch.float16,\n",
|
||||
").to(device)\n",
|
||||
"\n",
|
||||
"prompt = \"toy_face of a hacker with a hoodie, pixel art\"\n",
|
||||
|
@ -638,16 +638,16 @@ def main(args):
|
||||
cur_class_images = len(list(class_images_dir.iterdir()))
|
||||
|
||||
if cur_class_images < args.num_class_images:
|
||||
torch_dtype = torch.float16 if accelerator.device.type in ["cuda", "xpu"] else torch.float32
|
||||
dtype = torch.float16 if accelerator.device.type in ["cuda", "xpu"] else torch.float32
|
||||
if args.prior_generation_precision == "fp32":
|
||||
torch_dtype = torch.float32
|
||||
dtype = torch.float32
|
||||
elif args.prior_generation_precision == "fp16":
|
||||
torch_dtype = torch.float16
|
||||
dtype = torch.float16
|
||||
elif args.prior_generation_precision == "bf16":
|
||||
torch_dtype = torch.bfloat16
|
||||
dtype = torch.bfloat16
|
||||
pipeline = DiffusionPipeline.from_pretrained(
|
||||
args.pretrained_model_name_or_path,
|
||||
torch_dtype=torch_dtype,
|
||||
dtype=dtype,
|
||||
safety_checker=None,
|
||||
revision=args.revision,
|
||||
)
|
||||
|
@ -11,7 +11,7 @@ from transformers import AutoTokenizer, AutoModelForCausalLM
|
||||
from trl import SFTConfig, SFTTrainer
|
||||
from datasets import load_dataset
|
||||
|
||||
model = AutoModelForCausalLM.from_pretrained("facebook/opt-350m", torch_dtype=torch.bfloat16, device_map="auto")
|
||||
model = AutoModelForCausalLM.from_pretrained("facebook/opt-350m", dtype=torch.bfloat16, device_map="auto")
|
||||
tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m")
|
||||
dataset = load_dataset("imdb", split="train[:1%]")
|
||||
lora_config = LoraConfig(
|
||||
|
@ -44,7 +44,7 @@ def train(
|
||||
lora_alpha: int = 16,
|
||||
lora_dropout: float = 0.05,
|
||||
lora_target_modules: list[str] = None,
|
||||
torch_dtype: str = "float16",
|
||||
dtype: str = "float16",
|
||||
init_lora_weights="olora",
|
||||
seed: Optional[int] = None,
|
||||
):
|
||||
@ -57,7 +57,7 @@ def train(
|
||||
# Set seed
|
||||
if seed is not None:
|
||||
set_seed(seed)
|
||||
model_kwargs = {"torch_dtype": getattr(torch, torch_dtype), "device_map": device_map}
|
||||
model_kwargs = {"dtype": getattr(torch, dtype), "device_map": device_map}
|
||||
if quantize:
|
||||
model_kwargs["quantization_config"] = BitsAndBytesConfig(
|
||||
load_in_4bit=True,
|
||||
@ -170,7 +170,7 @@ if __name__ == "__main__":
|
||||
parser.add_argument("--lora_alpha", type=int, default=16)
|
||||
parser.add_argument("--lora_dropout", type=float, default=0.05)
|
||||
parser.add_argument("--lora_target_modules", type=str, default=None)
|
||||
parser.add_argument("--torch_dtype", type=str, default="float16")
|
||||
parser.add_argument("--dtype", type=str, default="float16")
|
||||
parser.add_argument("--init_lora_weights", type=str, default="olora")
|
||||
parser.add_argument("--seed", type=int, default=None)
|
||||
|
||||
@ -193,7 +193,7 @@ if __name__ == "__main__":
|
||||
lora_alpha=args.lora_alpha,
|
||||
lora_dropout=args.lora_dropout,
|
||||
lora_target_modules=args.lora_target_modules,
|
||||
torch_dtype=args.torch_dtype,
|
||||
dtype=args.dtype,
|
||||
init_lora_weights=args.init_lora_weights,
|
||||
seed=args.seed,
|
||||
)
|
||||
|
@@ -6,10 +6,11 @@ PiSSA represents a matrix $W\in\mathbb{R}^{m\times n}$ within the model by the p
 ```python
 import torch
 from peft import LoraConfig, get_peft_model
-from transformers import AutoTokenizer, AutoModelForCausalLMfrom trl import SFTConfig, SFTTrainer
+from transformers import AutoTokenizer, AutoModelForCausalLM
+from trl import SFTConfig, SFTTrainer
 from datasets import load_dataset

-model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf", torch_dtype=torch.bfloat16, device_map="auto")
+model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf", dtype=torch.bfloat16, device_map="auto")
 tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
 tokenizer.pad_token_id = tokenizer.eos_token_id
 lora_config = LoraConfig(
@@ -42,7 +43,7 @@ from peft import PeftModel
 from transformers import AutoModelForCausalLM

 model = AutoModelForCausalLM.from_pretrained(
-    "meta-llama/Llama-2-7b-hf", torch_dtype=torch.bfloat16, device_map="auto"
+    "meta-llama/Llama-2-7b-hf", dtype=torch.bfloat16, device_map="auto"
 )
 # Performs SVD again to initialize the residual model and loads the state_dict of the fine-tuned PiSSA modules.
 peft_model = PeftModel.from_pretrained(model, "pissa-llama-2-7b")
@@ -82,7 +83,7 @@ from peft import PeftModel
 from transformers import AutoModelForCausalLM

 model = AutoModelForCausalLM.from_pretrained(
-    "meta-llama/Llama-2-7b-hf", torch_dtype=torch.bfloat16, device_map="auto"
+    "meta-llama/Llama-2-7b-hf", dtype=torch.bfloat16, device_map="auto"
 )
 # No SVD is performed during this step, and the base model remains unaltered.
 peft_model = PeftModel.from_pretrained(model, "pissa-llama-2-7b-lora")
@ -75,7 +75,7 @@ if script_args.bits in ["nf4", "fp4", "int8"]:
|
||||
elif script_args.residual_model_name_or_path is not None:
|
||||
res_model = AutoModelForCausalLM.from_pretrained(
|
||||
script_args.residual_model_name_or_path,
|
||||
torch_dtype=(
|
||||
dtype=(
|
||||
torch.float16
|
||||
if script_args.bits == "fp16"
|
||||
else (torch.bfloat16 if script_args.bits == "bf16" else torch.float32)
|
||||
@ -94,7 +94,7 @@ elif script_args.base_model_name_or_path is not None:
|
||||
)
|
||||
model = AutoModelForCausalLM.from_pretrained(
|
||||
script_args.base_model_name_or_path,
|
||||
torch_dtype=(
|
||||
dtype=(
|
||||
torch.float16
|
||||
if script_args.bits == "fp16"
|
||||
else (torch.bfloat16 if script_args.bits == "bf16" else torch.float32)
|
||||
|
@ -39,7 +39,7 @@ print(script_args)
|
||||
|
||||
model = AutoModelForCausalLM.from_pretrained(
|
||||
script_args.base_model_name_or_path,
|
||||
torch_dtype=(
|
||||
dtype=(
|
||||
torch.float16
|
||||
if script_args.bits == "fp16"
|
||||
else (torch.bfloat16 if script_args.bits == "bf16" else torch.float32)
|
||||
|
@ -44,7 +44,7 @@ def load_or_quantize_model(
|
||||
test_model = AutoModelForCausalLM.from_pretrained(
|
||||
base_model,
|
||||
device_map="auto",
|
||||
torch_dtype=torch.float16,
|
||||
dtype=torch.float16,
|
||||
trust_remote_code=True, # Some GPTQ models might need this
|
||||
)
|
||||
|
||||
@ -95,7 +95,7 @@ def load_or_quantize_model(
|
||||
|
||||
# Load and quantize the model
|
||||
model = AutoModelForCausalLM.from_pretrained(
|
||||
base_model, device_map="auto", quantization_config=gptq_config, torch_dtype=torch.float16
|
||||
base_model, device_map="auto", quantization_config=gptq_config, dtype=torch.float16
|
||||
)
|
||||
|
||||
# Save the quantized model to cache
|
||||
|
@ -52,7 +52,7 @@ def train_model(
|
||||
device_type = device.type
|
||||
device_module = getattr(torch, device_type, torch.cuda)
|
||||
bf16_suppotrted = device_module.is_available() and device_module.is_bf16_supported()
|
||||
torch_dtype = torch.bfloat16 if bf16_suppotrted else torch.float16
|
||||
dtype = torch.bfloat16 if bf16_suppotrted else torch.float16
|
||||
|
||||
# QRandLora (quantized randlora): IF YOU WANNA QUANTIZE THE MODEL
|
||||
if quantize:
|
||||
@ -65,14 +65,14 @@ def train_model(
|
||||
bnb_4bit_use_double_quant=True,
|
||||
bnb_4bit_quant_type="nf4",
|
||||
),
|
||||
torch_dtype=torch_dtype,
|
||||
dtype=dtype,
|
||||
)
|
||||
# setup for quantized training
|
||||
model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=True)
|
||||
else:
|
||||
model = AutoModelForCausalLM.from_pretrained(
|
||||
base_model,
|
||||
torch_dtype=torch_dtype,
|
||||
dtype=dtype,
|
||||
token=hf_token,
|
||||
)
|
||||
# LoRa config for the PEFT model
|
||||
|
@ -207,7 +207,7 @@
|
||||
"source": [
|
||||
"quant_config = TorchAoConfig(quant_type=\"int8_dynamic_activation_int8_weight\")\n",
|
||||
"model = AutoModelForSequenceClassification.from_pretrained(\n",
|
||||
" model_name_or_path, return_dict=True, device_map=0, torch_dtype=torch.bfloat16, quantization_config=quant_config\n",
|
||||
" model_name_or_path, return_dict=True, device_map=0, dtype=torch.bfloat16, quantization_config=quant_config\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
|
@ -207,7 +207,7 @@
|
||||
"source": [
|
||||
"quant_config = TorchAoConfig(quant_type=\"int8_weight_only\")\n",
|
||||
"model = AutoModelForSequenceClassification.from_pretrained(\n",
|
||||
" model_name_or_path, return_dict=True, device_map=0, torch_dtype=torch.bfloat16, quantization_config=quant_config\n",
|
||||
" model_name_or_path, return_dict=True, device_map=0, dtype=torch.bfloat16, quantization_config=quant_config\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
|
@ -129,14 +129,12 @@ def create_and_prepare_model(args, data_args, training_args):
|
||||
load_in_4bit=args.use_4bit_quantization,
|
||||
)
|
||||
else:
|
||||
torch_dtype = (
|
||||
quant_storage_dtype if quant_storage_dtype and quant_storage_dtype.is_floating_point else torch.float32
|
||||
)
|
||||
dtype = quant_storage_dtype if quant_storage_dtype and quant_storage_dtype.is_floating_point else torch.float32
|
||||
|
||||
# Prepare model loading arguments
|
||||
model_kwargs = {
|
||||
"trust_remote_code": True,
|
||||
"torch_dtype": torch_dtype,
|
||||
"dtype": dtype,
|
||||
}
|
||||
if args.use_flash_attn:
|
||||
if torch.xpu.is_available():
|
||||
|
@ -11,7 +11,7 @@ from transformers import AutoTokenizer, AutoModelForCausalLM
|
||||
from trl import SFTConfig, SFTTrainer
|
||||
from datasets import load_dataset
|
||||
|
||||
model = AutoModelForCausalLM.from_pretrained("facebook/opt-350m", torch_dtype=torch.bfloat16, device_map="auto")
|
||||
model = AutoModelForCausalLM.from_pretrained("facebook/opt-350m", dtype=torch.bfloat16, device_map="auto")
|
||||
tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m")
|
||||
dataset = load_dataset("imdb", split="train[:1%]")
|
||||
shira_config = ShiraConfig(
|
||||
|
@ -42,7 +42,7 @@ def train(
|
||||
device_map: str = "auto",
|
||||
shira_r: int = 32,
|
||||
shira_target_modules: list[str] = None,
|
||||
torch_dtype: str = "float16",
|
||||
dtype: str = "float16",
|
||||
seed: Optional[int] = None,
|
||||
use_custom_random_mask_function_with_custom_kwargs: Optional[bool] = False,
|
||||
):
|
||||
@ -55,7 +55,7 @@ def train(
|
||||
# Set seed
|
||||
if seed is not None:
|
||||
set_seed(seed)
|
||||
model_kwargs = {"torch_dtype": getattr(torch, torch_dtype), "device_map": device_map}
|
||||
model_kwargs = {"dtype": getattr(torch, dtype), "device_map": device_map}
|
||||
model = AutoModelForCausalLM.from_pretrained(base_model, **model_kwargs)
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)
|
||||
@ -191,7 +191,7 @@ if __name__ == "__main__":
|
||||
parser.add_argument("--device_map", type=str, default="auto")
|
||||
parser.add_argument("--shira_r", type=int, default=32)
|
||||
parser.add_argument("--shira_target_modules", type=str, default=None)
|
||||
parser.add_argument("--torch_dtype", type=str, default="float16")
|
||||
parser.add_argument("--dtype", type=str, default="float16")
|
||||
parser.add_argument("--seed", type=int, default=None)
|
||||
parser.add_argument("--use_custom_random_mask_function_with_custom_kwargs", action="store_true")
|
||||
|
||||
@ -211,7 +211,7 @@ if __name__ == "__main__":
|
||||
device_map=args.device_map,
|
||||
shira_r=args.shira_r,
|
||||
shira_target_modules=args.shira_target_modules,
|
||||
torch_dtype=args.torch_dtype,
|
||||
dtype=args.dtype,
|
||||
seed=args.seed,
|
||||
use_custom_random_mask_function_with_custom_kwargs=args.use_custom_random_mask_function_with_custom_kwargs,
|
||||
)
|
||||
|
@@ -802,16 +802,16 @@ def main(args):
         cur_class_images = len(list(class_images_dir.iterdir()))

         if cur_class_images < args.num_class_images:
-            torch_dtype = torch.float16 if accelerator.device.type in ["cuda", "xpu"] else torch.float32
+            dtype = torch.float16 if accelerator.device.type in ["cuda", "xpu"] else torch.float32
             if args.prior_generation_precision == "fp32":
-                torch_dtype = torch.float32
+                dtype = torch.float32
             elif args.prior_generation_precision == "fp16":
-                torch_dtype = torch.float16
+                dtype = torch.float16
             elif args.prior_generation_precision == "bf16":
-                torch_dtype = torch.bfloat16
+                dtype = torch.bfloat16
             pipeline = DiffusionPipeline.from_pretrained(
                 args.pretrained_model_name_or_path,
-                torch_dtype=torch_dtype,
+                dtype=dtype,
                 safety_checker=None,
                 revision=args.revision,
             )
64  examples/waveft_finetuning/README.md  Normal file
@@ -0,0 +1,64 @@

# WaveFT: Wavelet Fine-Tuning

## Introduction
[WaveFT](https://arxiv.org/abs/2505.12532) is a novel parameter-efficient fine-tuning (PEFT) method that introduces sparse updates in the **wavelet domain** of residual matrices. Unlike LoRA, which is constrained by discrete low-rank choices, WaveFT enables fine-grained control over the number of trainable parameters by directly learning a sparse set of coefficients in the transformed space. These coefficients are then mapped back to the weight domain via the Inverse Discrete Wavelet Transform (IDWT), producing high-rank updates without incurring inference overhead.
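To make the IDWT mapping concrete, the sketch below inverts a handful of nonzero wavelet-domain coefficients into a dense weight-space update. It is only an illustration of the idea, not the PEFT implementation: the single-level `db1` coefficient layout and the use of PyWavelets (`pywt`) are simplifying assumptions.

```python
import numpy as np
import pywt  # PyWavelets, used here purely for illustration

out_features, in_features = 64, 64
n_frequency = 128  # number of nonzero coefficients, analogous to WaveFT's n_frequency

# Sparse coefficient matrix in the wavelet domain: all zeros except a few entries
# (random numbers stand in for learned values here).
rng = np.random.default_rng(777)
coeffs = np.zeros((out_features, in_features))
rows = rng.integers(0, out_features, size=n_frequency)
cols = rng.integers(0, in_features, size=n_frequency)
coeffs[rows, cols] = rng.normal(size=n_frequency)

# Interpret the matrix as single-level 2D wavelet coefficients (approximation + details)
# and apply the inverse DWT to obtain a dense weight-space update.
h, w = out_features // 2, in_features // 2
ll, lh, hl, hh = coeffs[:h, :w], coeffs[:h, w:], coeffs[h:, :w], coeffs[h:, w:]
delta_w = pywt.idwt2((ll, (lh, hl, hh)), wavelet="db1")

print(delta_w.shape)  # (64, 64): dense, typically high-rank, despite only 128 trainable values
```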
## Quick start
```python
import torch
from peft import WaveFTConfig, get_peft_model
from transformers import AutoTokenizer, AutoModelForCausalLM
from trl import SFTConfig, SFTTrainer
from datasets import load_dataset

model = AutoModelForCausalLM.from_pretrained("facebook/opt-350m", dtype=torch.bfloat16, device_map="auto")
tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m")
dataset = load_dataset("imdb", split="train[:1%]")
waveft_config = WaveFTConfig(
    n_frequency=2592,
)
peft_model = get_peft_model(model, waveft_config)
training_args = SFTConfig(dataset_text_field="text", max_seq_length=128)
trainer = SFTTrainer(
    model=peft_model,
    train_dataset=dataset,
    processing_class=tokenizer,
)
trainer.train()
peft_model.save_pretrained("waveft-opt-350m")
```

For more options and a more detailed example, refer to the WaveFT fine-tuning script. Run it with:
```bash
python3 examples/waveft_finetuning/waveft_finetuning.py --base_model facebook/opt-350m
```

If you want to run DDP with [accelerate](https://huggingface.co/docs/accelerate/en/index), run `accelerate config` to set up your DDP configuration, then run:
```bash
accelerate launch examples/waveft_finetuning/waveft_finetuning.py --base_model facebook/opt-350m
```
Add `--device_map cpu` if you want to run the fine-tuning on CPU.

## Use the model
You can load and use the model like any other 🤗 PEFT model
```python
from peft import PeftModel
from transformers import AutoTokenizer, AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("facebook/opt-350m")
tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m")
waveft_model = PeftModel.from_pretrained(model, "waveft-opt-350m")
```

## Citation
@misc{bilican2025exploringsparsityparameterefficient,
      title={Exploring Sparsity for Parameter Efficient Fine Tuning Using Wavelets},
      author={Ahmet Bilican and M. Akın Yılmaz and A. Murat Tekalp and R. Gökberk Cinbiş},
      year={2025},
      eprint={2505.12532},
      archivePrefix={arXiv},
      primaryClass={cs.CV},
      url={https://arxiv.org/abs/2505.12532},
}
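A quick way to sanity-check the setup from the quick start above is to look at the trainable parameter count after wrapping the model; the exact numbers depend on `n_frequency` and the targeted modules.

```python
# Continues the quick start above: only the sparse wavelet coefficients are trainable.
peft_model.print_trainable_parameters()
# prints something along the lines of: trainable params: ... || all params: ... || trainable%: ...
```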
189  examples/waveft_finetuning/waveft_finetuning.py  Normal file
@@ -0,0 +1,189 @@
# Copyright 2025-present the HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


import os
from typing import Optional

import torch
import transformers
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, set_seed

from peft import (
    WaveFTConfig,
    get_peft_model,
)


def train(
    base_model: str,
    data_path: str = "yahma/alpaca-cleaned",
    output_dir: str = "waveft",
    batch_size: int = 16,
    num_epochs: int = 1,
    learning_rate: float = 3e-4,
    cutoff_len: int = 256,
    val_set_size: int = 16,
    eval_step: int = 100,
    save_step: int = 100,
    device_map: str = "auto",
    waveft_n_frequency: int = 2592,
    waveft_target_modules: list[str] = None,
    waveft_scaling: float = 25.0,
    waveft_wavelet_family: str = "db1",
    waveft_use_idwt: bool = True,
    dtype: str = "float16",
    seed: Optional[int] = None,
):
    # Set device_map to the right place when enabling DDP.
    world_size = int(os.environ.get("WORLD_SIZE", 0)) or int(os.environ.get("PMI_SIZE", 0))
    if world_size > 1 and device_map != "cpu":
        from accelerate import Accelerator

        device_map = {"": Accelerator().process_index}
    # Set seed
    if seed is not None:
        set_seed(seed)
    model_kwargs = {"dtype": getattr(torch, dtype), "device_map": device_map}
    model = AutoModelForCausalLM.from_pretrained(base_model, **model_kwargs)

    tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)
    # For some tokenizer with no pad token like llama
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    def tokenize(prompt, add_eos_token=True):
        result = tokenizer(
            prompt,
            truncation=True,
            max_length=cutoff_len,
            padding=False,
            return_tensors=None,
        )
        if (
            result["input_ids"][-1] != tokenizer.eos_token_id
            and len(result["input_ids"]) < cutoff_len
            and add_eos_token
        ):
            result["input_ids"].append(tokenizer.eos_token_id)
            result["attention_mask"].append(1)

        result["labels"] = result["input_ids"].copy()

        return result

    def generate_and_tokenize_prompt(example):
        full_prompt = generate_prompt(example)
        tokenized_full_prompt = tokenize(full_prompt)
        return tokenized_full_prompt

    config = WaveFTConfig(
        n_frequency=waveft_n_frequency,
        scaling=waveft_scaling,
        wavelet_family=waveft_wavelet_family,
        use_idwt=waveft_use_idwt,
        target_modules=waveft_target_modules,
        task_type="CAUSAL_LM",
    )

    model = get_peft_model(model, config)

    data = load_dataset(data_path)

    train_val = data["train"].train_test_split(test_size=val_set_size, shuffle=True, seed=42)
    train_data = train_val["train"].shuffle().map(generate_and_tokenize_prompt)
    val_data = train_val["test"].shuffle().map(generate_and_tokenize_prompt)

    trainer = transformers.Trainer(
        model=model,
        train_dataset=train_data,
        eval_dataset=val_data,
        args=transformers.TrainingArguments(
            per_device_train_batch_size=batch_size,
            warmup_steps=100,
            num_train_epochs=num_epochs,
            learning_rate=learning_rate,
            logging_steps=100,
            optim="adamw_torch",
            eval_strategy="steps",
            save_strategy="steps",
            eval_steps=eval_step,
            save_steps=save_step,
            output_dir=output_dir,
            save_total_limit=3,
            load_best_model_at_end=True,
            ddp_find_unused_parameters=False if world_size > 1 else None,
        ),
        data_collator=transformers.DataCollatorForSeq2Seq(
            tokenizer, pad_to_multiple_of=8, return_tensors="pt", padding=True
        ),
    )
    trainer.train()
    model.save_pretrained(output_dir)


def generate_prompt(example):
    return f"""Below is an instruction that describes a task. Write a response that appropriately completes the request.
### Instruction:
{example["instruction"]}
### Response:
{example["output"]}"""


if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--base_model", type=str)
    parser.add_argument("--data_path", type=str, default="yahma/alpaca-cleaned")
    parser.add_argument("--output_dir", type=str, default="waveft")
    parser.add_argument("--batch_size", type=int, default=16)
    parser.add_argument("--num_epochs", type=int, default=1)
    parser.add_argument("--learning_rate", type=float, default=3e-4)
    parser.add_argument("--cutoff_len", type=int, default=256)
    parser.add_argument("--val_set_size", type=int, default=16)
    parser.add_argument("--eval_step", type=int, default=100)
    parser.add_argument("--save_step", type=int, default=100)
    parser.add_argument("--device_map", type=str, default="auto")
    parser.add_argument("--waveft_n_frequency", type=int, default=2592)
    parser.add_argument("--waveft_target_modules", type=str, default=None)
    parser.add_argument("--waveft_scaling", type=float, default=25.0)
    parser.add_argument("--waveft_wavelet_family", type=str, default="db1")
    parser.add_argument("--waveft_use_idwt", action="store_true", default=True)
    parser.add_argument("--dtype", type=str, default="float16")
    parser.add_argument("--seed", type=int, default=None)

    args = parser.parse_args()

    train(
        base_model=args.base_model,
        data_path=args.data_path,
        output_dir=args.output_dir,
        batch_size=args.batch_size,
        num_epochs=args.num_epochs,
        learning_rate=args.learning_rate,
        cutoff_len=args.cutoff_len,
        val_set_size=args.val_set_size,
        eval_step=args.eval_step,
        save_step=args.save_step,
        device_map=args.device_map,
        waveft_n_frequency=args.waveft_n_frequency,
        waveft_target_modules=args.waveft_target_modules,
        waveft_scaling=args.waveft_scaling,
        waveft_wavelet_family=args.waveft_wavelet_family,
        waveft_use_idwt=args.waveft_use_idwt,
        dtype=args.dtype,
        seed=args.seed,
    )
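Besides the CLI entry point, the script's `train()` function can also be called directly. A minimal sketch, assuming the script's directory is on the import path (which is not how the example is normally invoked):

```python
# Hypothetical programmatic use of the script above; normally it is run via its CLI.
from waveft_finetuning import train

train(
    base_model="facebook/opt-350m",
    waveft_n_frequency=2592,
    waveft_target_modules=["q_proj", "v_proj"],
    dtype="bfloat16",
    seed=0,
)
```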
@ -0,0 +1,20 @@
|
||||
{
|
||||
"lambda_pattern": {},
|
||||
"auto_mapping": null,
|
||||
"base_model_name_or_path": null,
|
||||
"bias": "none",
|
||||
"exclude_modules": null,
|
||||
"inference_mode": false,
|
||||
"init_weights": true,
|
||||
"layers_pattern": null,
|
||||
"layers_to_transform": null,
|
||||
"delora_lambda": 15,
|
||||
"module_dropout": 0.0,
|
||||
"modules_to_save": null,
|
||||
"peft_type": "DELORA",
|
||||
"r": 32,
|
||||
"rank_pattern": {},
|
||||
"revision": null,
|
||||
"target_modules": null,
|
||||
"task_type": "CAUSAL_LM"
|
||||
}
|
@ -0,0 +1,6 @@
|
||||
{
|
||||
"optimizer_kwargs": {
|
||||
"lr": 1e-3
|
||||
}
|
||||
}
|
||||
|
@ -0,0 +1,30 @@
|
||||
{
|
||||
"alpha_pattern": {},
|
||||
"auto_mapping": null,
|
||||
"base_model_name_or_path": null,
|
||||
"bias": "none",
|
||||
"corda_config": null,
|
||||
"eva_config": null,
|
||||
"exclude_modules": null,
|
||||
"fan_in_fan_out": false,
|
||||
"inference_mode": false,
|
||||
"init_lora_weights": true,
|
||||
"layer_replication": null,
|
||||
"layers_pattern": null,
|
||||
"layers_to_transform": null,
|
||||
"loftq_config": {},
|
||||
"lora_alpha": 20,
|
||||
"lora_bias": false,
|
||||
"lora_dropout": 0.0,
|
||||
"megatron_config": null,
|
||||
"megatron_core": "megatron.core",
|
||||
"modules_to_save": null,
|
||||
"peft_type": "LORA",
|
||||
"r": 10,
|
||||
"rank_pattern": {},
|
||||
"revision": null,
|
||||
"target_modules": ["gate_proj", "up_proj", "down_proj"],
|
||||
"task_type": "CAUSAL_LM",
|
||||
"use_dora": false,
|
||||
"use_rslora": false
|
||||
}
|
@ -0,0 +1,18 @@
|
||||
{
|
||||
"auto_mapping": null,
|
||||
"base_model_name_or_path": null,
|
||||
"bias": "none",
|
||||
"exclude_modules": null,
|
||||
"inference_mode": false,
|
||||
"init_weights": "bat",
|
||||
"layers_pattern": null,
|
||||
"layers_to_transform": null,
|
||||
"mini_r": 1,
|
||||
"miss_dropout": 0.0,
|
||||
"modules_to_save": null,
|
||||
"peft_type": "MISS",
|
||||
"r": 64,
|
||||
"revision": null,
|
||||
"target_modules": null,
|
||||
"task_type": null
|
||||
}
|
@ -0,0 +1,18 @@
|
||||
{
|
||||
"auto_mapping": null,
|
||||
"base_model_name_or_path": null,
|
||||
"bias": "none",
|
||||
"exclude_modules": null,
|
||||
"inference_mode": false,
|
||||
"init_weights": true,
|
||||
"layers_pattern": null,
|
||||
"layers_to_transform": null,
|
||||
"mini_r": 1,
|
||||
"miss_dropout": 0.0,
|
||||
"modules_to_save": null,
|
||||
"peft_type": "MISS",
|
||||
"r": 64,
|
||||
"revision": null,
|
||||
"target_modules": null,
|
||||
"task_type": null
|
||||
}
|
@ -0,0 +1,18 @@
|
||||
{
|
||||
"auto_mapping": null,
|
||||
"base_model_name_or_path": null,
|
||||
"bias": "none",
|
||||
"exclude_modules": null,
|
||||
"inference_mode": false,
|
||||
"init_weights": "mini",
|
||||
"layers_pattern": null,
|
||||
"layers_to_transform": null,
|
||||
"mini_r": 64,
|
||||
"miss_dropout": 0.0,
|
||||
"modules_to_save": null,
|
||||
"peft_type": "MISS",
|
||||
"r": 64,
|
||||
"revision": null,
|
||||
"target_modules": null,
|
||||
"task_type": null
|
||||
}
|
@ -0,0 +1,17 @@
|
||||
{
|
||||
"auto_mapping": null,
|
||||
"base_model_name_or_path": null,
|
||||
"inference_mode": false,
|
||||
"num_attention_heads": 24,
|
||||
"num_layers": 28,
|
||||
"num_transformer_submodules": 1,
|
||||
"num_virtual_tokens": 200,
|
||||
"peft_type": "PROMPT_TUNING",
|
||||
"prompt_tuning_init": "SAMPLE_VOCAB",
|
||||
"prompt_tuning_init_text": null,
|
||||
"revision": null,
|
||||
"task_type": "CAUSAL_LM",
|
||||
"token_dim": 3072,
|
||||
"tokenizer_kwargs": null,
|
||||
"tokenizer_name_or_path": null
|
||||
}
|
@ -0,0 +1,6 @@
|
||||
{
|
||||
"optimizer_kwargs": {
|
||||
"lr": 1e-3
|
||||
}
|
||||
}
|
||||
|
@@ -0,0 +1,26 @@
{
  "auto_mapping": null,
  "base_model_name_or_path": null,
  "bias": "none",
  "exclude_modules": null,
  "fan_in_fan_out": false,
  "inference_mode": false,
  "init_weights": true,
  "layers_pattern": null,
  "layers_to_transform": null,
  "modules_to_save": null,
  "n_frequency": 5000,
  "n_frequency_pattern": {},
  "peft_type": "WAVEFT",
  "proportional_parameters": false,
  "random_loc_seed": 777,
  "revision": null,
  "scaling": 25.0,
  "target_modules": [
    "q_proj",
    "v_proj"
  ],
  "task_type": "CAUSAL_LM",
  "use_idwt": true,
  "wavelet_family": "db1"
}
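For reference, the adapter described by the JSON fixture above could be expressed in Python with `WaveFTConfig`. This is a sketch mirroring the fields shown; whether every serialized field is accepted as a constructor argument is an assumption based on how PEFT configs are normally round-tripped.

```python
from peft import WaveFTConfig

# Mirrors the adapter_config.json fixture above.
config = WaveFTConfig(
    n_frequency=5000,
    scaling=25.0,
    wavelet_family="db1",
    use_idwt=True,
    random_loc_seed=777,
    target_modules=["q_proj", "v_proj"],
    task_type="CAUSAL_LM",
)
```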
352
method_comparison/MetaMathQA/results/miss--llama-3.2-3B-bat.json
Normal file
352
method_comparison/MetaMathQA/results/miss--llama-3.2-3B-bat.json
Normal file
@ -0,0 +1,352 @@
|
||||
{
|
||||
"run_info": {
|
||||
"created_at": "2025-08-14T11:55:49+00:00",
|
||||
"total_time": 2808.721444314,
|
||||
"experiment_name": "miss/llama-3.2-3B-bat",
|
||||
"peft_branch": "main",
|
||||
"train_config": {
|
||||
"model_id": "meta-llama/Llama-3.2-3B",
|
||||
"dtype": "bfloat16",
|
||||
"max_seq_length": 768,
|
||||
"batch_size": 4,
|
||||
"batch_size_eval": 50,
|
||||
"max_steps": 5000,
|
||||
"eval_steps": 250,
|
||||
"compile": false,
|
||||
"query_template": "Question: {query} Think step by step.\nAnswer:",
|
||||
"seed": 0,
|
||||
"grad_norm_clip": 1.0,
|
||||
"optimizer_type": "AdamW",
|
||||
"optimizer_kwargs": {
|
||||
"lr": 0.0001,
|
||||
"weight_decay": 0.1
|
||||
},
|
||||
"lr_scheduler": "cosine",
|
||||
"use_amp": false,
|
||||
"autocast_adapter_dtype": true,
|
||||
"generation_kwargs": {
|
||||
"max_length": 800,
|
||||
"max_new_tokens": 300
|
||||
},
|
||||
"attn_implementation": null
|
||||
},
|
||||
"peft_config": {
|
||||
"task_type": null,
|
||||
"peft_type": "MISS",
|
||||
"auto_mapping": null,
|
||||
"base_model_name_or_path": "meta-llama/Llama-3.2-3B",
|
||||
"revision": null,
|
||||
"inference_mode": false,
|
||||
"r": 64,
|
||||
"miss_dropout": 0.0,
|
||||
"mini_r": 1,
|
||||
"target_modules": [
|
||||
"v_proj",
|
||||
"q_proj"
|
||||
],
|
||||
"exclude_modules": null,
|
||||
"init_weights": "bat",
|
||||
"layers_to_transform": null,
|
||||
"layers_pattern": null,
|
||||
"bias": "none",
|
||||
"modules_to_save": null
|
||||
},
|
||||
"error_msg": ""
|
||||
},
|
||||
"train_info": {
|
||||
"accelerator_memory_reserved_avg": 14713719934,
|
||||
"accelerator_memory_max": 25251807232,
|
||||
"accelerator_memory_reserved_99th": 20472733368,
|
||||
"train_time": 2466.149786608999,
|
||||
"file_size": 29367552,
|
||||
"num_trainable_params": 7340032,
|
||||
"num_total_params": 3220089856,
|
||||
"status": "success",
|
||||
"metrics": [
|
||||
{
|
||||
"step": 250,
|
||||
"valid accuracy": 0.32,
|
||||
"train loss": 0.8741402707099915,
|
||||
"train samples": 1000,
|
||||
"train time": 44.507981576001725,
|
||||
"eval time": 16.603345405999903,
|
||||
"tokens / sec": 4756.8771376088835,
|
||||
"mem allocated avg": 6898417197.056,
|
||||
"mem reserved avg": 14772422574.08,
|
||||
"elapsed time": 128.87205576299993
|
||||
},
|
||||
{
|
||||
"step": 500,
|
||||
"valid accuracy": 0.42,
|
||||
"train loss": 0.6949697629213333,
|
||||
"train samples": 2000,
|
||||
"train time": 43.6579733309992,
|
||||
"eval time": 12.170993550999924,
|
||||
"tokens / sec": 4764.192749467687,
|
||||
"mem allocated avg": 6890132037.632,
|
||||
"mem reserved avg": 14662515032.064,
|
||||
"elapsed time": 244.05737383899998
|
||||
},
|
||||
{
|
||||
"step": 750,
|
||||
"valid accuracy": 0.38,
|
||||
"train loss": 0.667268633723259,
|
||||
"train samples": 3000,
|
||||
"train time": 44.76929137299828,
|
||||
"eval time": 8.243386759000032,
|
||||
"tokens / sec": 4789.0192903368525,
|
||||
"mem allocated avg": 6900972326.912,
|
||||
"mem reserved avg": 14823525974.016,
|
||||
"elapsed time": 357.2643382499999
|
||||
},
|
||||
{
|
||||
"step": 1000,
|
||||
"valid accuracy": 0.48,
|
||||
"train loss": 0.6478440872430802,
|
||||
"train samples": 4000,
|
||||
"train time": 43.91589877199954,
|
||||
"eval time": 9.950706549000074,
|
||||
"tokens / sec": 4743.976687842116,
|
||||
"mem allocated avg": 6892131758.08,
|
||||
"mem reserved avg": 14678444998.656,
|
||||
"elapsed time": 470.61746281599994
|
||||
},
|
||||
{
|
||||
"step": 1250,
|
||||
"valid accuracy": 0.4,
|
||||
"train loss": 0.6435494017601013,
|
||||
"train samples": 5000,
|
||||
"train time": 44.14956537599949,
|
||||
"eval time": 16.547810228000117,
|
||||
"tokens / sec": 4723.444007296278,
|
||||
"mem allocated avg": 6892566360.064,
|
||||
"mem reserved avg": 14674737233.92,
|
||||
"elapsed time": 591.057877963
|
||||
},
|
||||
{
|
||||
"step": 1500,
|
||||
"valid accuracy": 0.44,
|
||||
"train loss": 0.6368351166248322,
|
||||
"train samples": 6000,
|
||||
"train time": 44.08414804900008,
|
||||
"eval time": 16.39257521799982,
|
||||
"tokens / sec": 4748.441543371237,
|
||||
"mem allocated avg": 6893236697.088,
|
||||
"mem reserved avg": 14706580389.888,
|
||||
"elapsed time": 711.4482007859999
|
||||
},
|
||||
{
|
||||
"step": 1750,
|
||||
"valid accuracy": 0.48,
|
||||
"train loss": 0.6278127529621125,
|
||||
"train samples": 7000,
|
||||
"train time": 44.35628801999951,
|
||||
"eval time": 16.51757288099998,
|
||||
"tokens / sec": 4719.849413584954,
|
||||
"mem allocated avg": 6894834587.648,
|
||||
"mem reserved avg": 14716881600.512,
|
||||
"elapsed time": 832.303061434
|
||||
},
|
||||
{
|
||||
"step": 2000,
|
||||
"valid accuracy": 0.44,
|
||||
"train loss": 0.6281237225532532,
|
||||
"train samples": 8000,
|
||||
"train time": 43.95804043099747,
|
||||
"eval time": 16.465996583000106,
|
||||
"tokens / sec": 4724.869397352412,
|
||||
"mem allocated avg": 6891602710.528,
|
||||
"mem reserved avg": 14655669927.936,
|
||||
"elapsed time": 952.480474365
|
||||
},
|
||||
{
|
||||
"step": 2250,
|
||||
"valid accuracy": 0.42,
|
||||
"train loss": 0.6159191156625747,
|
||||
"train samples": 9000,
|
||||
"train time": 44.99231110500091,
|
||||
"eval time": 16.5404373570002,
|
||||
"tokens / sec": 4777.4385160692145,
|
||||
"mem allocated avg": 6903352731.648,
|
||||
"mem reserved avg": 14850520514.56,
|
||||
"elapsed time": 1074.326083797
|
||||
},
|
||||
{
|
||||
"step": 2500,
|
||||
"valid accuracy": 0.44,
|
||||
"train loss": 0.6119081476926803,
|
||||
"train samples": 10000,
|
||||
"train time": 43.74939265700118,
|
||||
"eval time": 16.33099729599985,
|
||||
"tokens / sec": 4707.882498273705,
|
||||
"mem allocated avg": 6887975004.16,
|
||||
"mem reserved avg": 14597494931.456,
|
||||
"elapsed time": 1194.094911997
|
||||
},
|
||||
{
|
||||
"step": 2750,
|
||||
"valid accuracy": 0.44,
|
||||
"train loss": 0.6010881408452987,
|
||||
"train samples": 11000,
|
||||
"train time": 43.686495668999896,
|
||||
"eval time": 11.229614545000004,
|
||||
"tokens / sec": 4850.0342441142875,
|
||||
"mem allocated avg": 6899207546.88,
|
||||
"mem reserved avg": 14785391362.048,
|
||||
"elapsed time": 1308.783695182
|
||||
},
|
||||
{
|
||||
"step": 3000,
|
||||
"valid accuracy": 0.5,
|
||||
"train loss": 0.5899516706466674,
|
||||
"train samples": 12000,
|
||||
"train time": 43.49030302700089,
|
||||
"eval time": 16.45857661900004,
|
||||
"tokens / sec": 4799.483688821613,
|
||||
"mem allocated avg": 6894123913.216,
|
||||
"mem reserved avg": 14693427052.544,
|
||||
"elapsed time": 1428.4006117669999
|
||||
},
|
||||
{
|
||||
"step": 3250,
|
||||
"valid accuracy": 0.52,
|
||||
"train loss": 0.5989595657587051,
|
||||
"train samples": 13000,
|
||||
"train time": 44.46332806799887,
|
||||
"eval time": 16.496417500999996,
|
||||
"tokens / sec": 4743.257177633304,
|
||||
"mem allocated avg": 6895596777.472,
|
||||
"mem reserved avg": 14723995140.096,
|
||||
"elapsed time": 1549.445265484
|
||||
},
|
||||
{
|
||||
"step": 3500,
|
||||
"valid accuracy": 0.46,
|
||||
"train loss": 0.579978278040886,
|
||||
"train samples": 14000,
|
||||
"train time": 43.63575344299579,
|
||||
"eval time": 10.30441635599982,
|
||||
"tokens / sec": 4806.838050224342,
|
||||
"mem allocated avg": 6893774680.064,
|
||||
"mem reserved avg": 14699450073.088,
|
||||
"elapsed time": 1663.316950223
|
||||
},
|
||||
{
|
||||
"step": 3750,
|
||||
"valid accuracy": 0.44,
|
||||
"train loss": 0.5772325273752212,
|
||||
"train samples": 15000,
|
||||
"train time": 45.25726027099972,
|
||||
"eval time": 16.524598716000128,
|
||||
"tokens / sec": 4788.2483098266675,
|
||||
"mem allocated avg": 6905177583.616,
|
||||
"mem reserved avg": 14889795977.216,
|
||||
"elapsed time": 1785.1977310290001
|
||||
},
|
||||
{
|
||||
"step": 4000,
|
||||
"valid accuracy": 0.4,
|
||||
"train loss": 0.5859311088323593,
|
||||
"train samples": 16000,
|
||||
"train time": 43.383903580999686,
|
||||
"eval time": 16.386461492000308,
|
||||
"tokens / sec": 4710.802466597467,
|
||||
"mem allocated avg": 6886734053.376,
|
||||
"mem reserved avg": 14584660361.216,
|
||||
"elapsed time": 1904.6209389110002
|
||||
},
|
||||
{
|
||||
"step": 4250,
|
||||
"valid accuracy": 0.5,
|
||||
"train loss": 0.5724418247938157,
|
||||
"train samples": 17000,
|
||||
"train time": 44.42285394400233,
|
||||
"eval time": 9.048803244000283,
|
||||
"tokens / sec": 4758.564145078759,
|
||||
"mem allocated avg": 6896789555.2,
|
||||
"mem reserved avg": 14740688470.016,
|
||||
"elapsed time": 2018.321323589
|
||||
},
|
||||
{
|
||||
"step": 4500,
|
||||
"valid accuracy": 0.46,
|
||||
"train loss": 0.5792494393587112,
|
||||
"train samples": 18000,
|
||||
"train time": 43.636566284001674,
|
||||
"eval time": 16.3964514889999,
|
||||
"tokens / sec": 4762.4737163655245,
|
||||
"mem allocated avg": 6892818855.936,
|
||||
"mem reserved avg": 14655921586.176,
|
||||
"elapsed time": 2137.859151554
|
||||
},
|
||||
{
|
||||
"step": 4750,
|
||||
"valid accuracy": 0.46,
|
||||
"train loss": 0.5680228790044785,
|
||||
"train samples": 19000,
|
||||
"train time": 43.96985955700529,
|
||||
"eval time": 16.500367100000403,
|
||||
"tokens / sec": 4774.61156608476,
|
||||
"mem allocated avg": 6894185185.28,
|
||||
"mem reserved avg": 14706722996.224,
|
||||
"elapsed time": 2258.0618387639997
|
||||
},
|
||||
{
|
||||
"step": 5000,
|
||||
"valid accuracy": 0.44,
|
||||
"train loss": 0.5760680929422378,
|
||||
"train samples": 20000,
|
||||
"train time": 43.83249596400128,
|
||||
"eval time": 16.474086973999874,
|
||||
"tokens / sec": 4751.7257555001215,
|
||||
"mem allocated avg": 6891346642.944,
|
||||
"mem reserved avg": 14655552487.424,
|
||||
"elapsed time": 2377.7959423069997
|
||||
},
|
||||
{
|
||||
"step": 5000,
|
||||
"test accuracy": 0.5049279757391963,
|
||||
"train loss": 0.5760680929422378,
|
||||
"train samples": 20000,
|
||||
"train total tokens": 4198051
|
||||
}
|
||||
]
|
||||
},
|
||||
"meta_info": {
|
||||
"model_info": {
|
||||
"sha": "13afe5124825b4f3751f836b40dafda64c1ed062",
|
||||
"created_at": "2024-09-18T15:23:48+00:00"
|
||||
},
|
||||
"dataset_info": {
|
||||
"metamath": {
|
||||
"sha": "aa4f34d3d2d3231299b5b03d9b3e5a20da45aa18",
|
||||
"created_at": "2023-09-21T17:22:46+00:00"
|
||||
},
|
||||
"gsm8k": {
|
||||
"sha": "e53f048856ff4f594e959d75785d2c2d37b678ee",
|
||||
"created_at": "2022-04-12T10:22:10+00:00"
|
||||
}
|
||||
},
|
||||
"package_info": {
|
||||
"transformers-version": "4.52.4",
|
||||
"transformers-commit-hash": null,
|
||||
"peft-version": "0.17.1.dev0",
|
||||
"peft-commit-hash": "47961bb54706e45fd3b5460baa4921a48bcdce35",
|
||||
"datasets-version": "3.6.0",
|
||||
"datasets-commit-hash": null,
|
||||
"bitsandbytes-version": "0.46.0",
|
||||
"bitsandbytes-commit-hash": null,
|
||||
"torch-version": "2.7.1+cu126",
|
||||
"torch-commit-hash": null
|
||||
},
|
||||
"system_info": {
|
||||
"system": "Linux",
|
||||
"release": "6.14.0-1010-aws",
|
||||
"version": "#10~24.04.1-Ubuntu SMP Fri Jul 18 20:44:30 UTC 2025",
|
||||
"machine": "x86_64",
|
||||
"processor": "x86_64",
|
||||
"accelerator": "NVIDIA L40S"
|
||||
},
|
||||
"pytorch_info": "PyTorch built with:\n - GCC 11.2\n - C++ Version: 201703\n - Intel(R) oneAPI Math Kernel Library Version 2024.2-Product Build 20240605 for Intel(R) 64 architecture applications\n - Intel(R) MKL-DNN v3.7.1 (Git Hash 8d263e693366ef8db40acc569cc7d8edf644556d)\n - OpenMP 201511 (a.k.a. OpenMP 4.5)\n - LAPACK is enabled (usually provided by MKL)\n - NNPACK is enabled\n - CPU capability usage: AVX2\n - CUDA Runtime 12.6\n - NVCC architecture flags: -gencode;arch=compute_50,code=sm_50;-gencode;arch=compute_60,code=sm_60;-gencode;arch=compute_70,code=sm_70;-gencode;arch=compute_75,code=sm_75;-gencode;arch=compute_80,code=sm_80;-gencode;arch=compute_86,code=sm_86;-gencode;arch=compute_90,code=sm_90\n - CuDNN 90.7.1 (built against CUDA 12.8)\n - Built with CuDNN 90.5.1\n - Magma 2.6.1\n - Build settings: BLAS_INFO=mkl, BUILD_TYPE=Release, COMMIT_SHA=e2d141dbde55c2a4370fac5165b0561b6af4798b, CUDA_VERSION=12.6, CUDNN_VERSION=9.5.1, CXX_COMPILER=/opt/rh/gcc-toolset-11/root/usr/bin/c++, CXX_FLAGS= -D_GLIBCXX_USE_CXX11_ABI=1 -fvisibility-inlines-hidden -DUSE_PTHREADPOOL -DNDEBUG -DUSE_KINETO -DLIBKINETO_NOROCTRACER -DLIBKINETO_NOXPUPTI=ON -DUSE_FBGEMM -DUSE_PYTORCH_QNNPACK -DUSE_XNNPACK -DSYMBOLICATE_MOBILE_DEBUG_HANDLE -O2 -fPIC -Wall -Wextra -Werror=return-type -Werror=non-virtual-dtor -Werror=range-loop-construct -Werror=bool-operation -Wnarrowing -Wno-missing-field-initializers -Wno-unknown-pragmas -Wno-unused-parameter -Wno-strict-overflow -Wno-strict-aliasing -Wno-stringop-overflow -Wsuggest-override -Wno-psabi -Wno-error=old-style-cast -fdiagnostics-color=always -faligned-new -Wno-maybe-uninitialized -fno-math-errno -fno-trapping-math -Werror=format -Wno-stringop-overflow, LAPACK_INFO=mkl, PERF_WITH_AVX=1, PERF_WITH_AVX2=1, TORCH_VERSION=2.7.1, USE_CUDA=ON, USE_CUDNN=ON, USE_CUSPARSELT=1, USE_GFLAGS=OFF, USE_GLOG=OFF, USE_GLOO=ON, USE_MKL=ON, USE_MKLDNN=ON, USE_MPI=OFF, USE_NCCL=1, USE_NNPACK=ON, USE_OPENMP=ON, USE_ROCM=OFF, USE_ROCM_KERNEL_ASSERT=OFF, \n"
|
||||
}
|
||||
}
|
@ -0,0 +1,352 @@
|
||||
{
|
||||
"run_info": {
|
||||
"created_at": "2025-08-14T12:42:42+00:00",
|
||||
"total_time": 1917.9635583239997,
|
||||
"experiment_name": "miss/llama-3.2-3B-default",
|
||||
"peft_branch": "main",
|
||||
"train_config": {
|
||||
"model_id": "meta-llama/Llama-3.2-3B",
|
||||
"dtype": "bfloat16",
|
||||
"max_seq_length": 768,
|
||||
"batch_size": 4,
|
||||
"batch_size_eval": 50,
|
||||
"max_steps": 5000,
|
||||
"eval_steps": 250,
|
||||
"compile": false,
|
||||
"query_template": "Question: {query} Think step by step.\nAnswer:",
|
||||
"seed": 0,
|
||||
"grad_norm_clip": 1.0,
|
||||
"optimizer_type": "AdamW",
|
||||
"optimizer_kwargs": {
|
||||
"lr": 0.0001,
|
||||
"weight_decay": 0.1
|
||||
},
|
||||
"lr_scheduler": "cosine",
|
||||
"use_amp": false,
|
||||
"autocast_adapter_dtype": true,
|
||||
"generation_kwargs": {
|
||||
"max_length": 800,
|
||||
"max_new_tokens": 300
|
||||
},
|
||||
"attn_implementation": null
|
||||
},
|
||||
"peft_config": {
|
||||
"task_type": null,
|
||||
"peft_type": "MISS",
|
||||
"auto_mapping": null,
|
||||
"base_model_name_or_path": "meta-llama/Llama-3.2-3B",
|
||||
"revision": null,
|
||||
"inference_mode": false,
|
||||
"r": 64,
|
||||
"miss_dropout": 0.0,
|
||||
"mini_r": 1,
|
||||
"target_modules": [
|
||||
"q_proj",
|
||||
"v_proj"
|
||||
],
|
||||
"exclude_modules": null,
|
||||
"init_weights": true,
|
||||
"layers_to_transform": null,
|
||||
"layers_pattern": null,
|
||||
"bias": "none",
|
||||
"modules_to_save": null
|
||||
},
|
||||
"error_msg": ""
|
||||
},
|
||||
"train_info": {
|
||||
"accelerator_memory_reserved_avg": 11170868939,
|
||||
"accelerator_memory_max": 20248002560,
|
||||
"accelerator_memory_reserved_99th": 16301393182,
|
||||
"train_time": 1713.3205038909991,
|
||||
"file_size": 29367496,
|
||||
"num_trainable_params": 7340032,
|
||||
"num_total_params": 3220089856,
|
||||
"status": "success",
|
||||
"metrics": [
|
||||
{
|
||||
"step": 250,
|
||||
"valid accuracy": 0.36,
|
||||
"train loss": 0.8771686832904816,
|
||||
"train samples": 1000,
|
||||
"train time": 29.625120898993828,
|
||||
"eval time": 11.058316758999808,
|
||||
"tokens / sec": 7146.603746254777,
|
||||
"mem allocated avg": 6894354876.416,
|
||||
"mem reserved avg": 11212691603.456,
|
||||
"elapsed time": 89.49706801999992
|
||||
},
|
||||
{
|
||||
"step": 500,
|
||||
"valid accuracy": 0.42,
|
||||
"train loss": 0.6949640859365463,
|
||||
"train samples": 2000,
|
||||
"train time": 29.06092399400177,
|
||||
"eval time": 5.4939734129998214,
|
||||
"tokens / sec": 7157.205326400859,
|
||||
"mem allocated avg": 6887297284.096,
|
||||
"mem reserved avg": 11116172279.808,
|
||||
"elapsed time": 166.80778670399968
|
||||
},
|
||||
{
|
||||
"step": 750,
|
||||
"valid accuracy": 0.42,
|
||||
"train loss": 0.6677889958620071,
|
||||
"train samples": 3000,
|
||||
"train time": 29.654036860997167,
|
||||
"eval time": 6.225696284000151,
|
||||
"tokens / sec": 7230.078016190556,
|
||||
"mem allocated avg": 6897885888.512,
|
||||
"mem reserved avg": 11257109282.816,
|
||||
"elapsed time": 245.76960384799986
|
||||
},
|
||||
{
|
||||
"step": 1000,
|
||||
"valid accuracy": 0.38,
|
||||
"train loss": 0.6483739440441132,
|
||||
"train samples": 4000,
|
||||
"train time": 28.837856293007462,
|
||||
"eval time": 10.98694702900002,
|
||||
"tokens / sec": 7224.392752470884,
|
||||
"mem allocated avg": 6888501639.168,
|
||||
"mem reserved avg": 11141564596.224,
|
||||
"elapsed time": 328.02559429799976
|
||||
},
|
||||
{
|
||||
"step": 1250,
|
||||
"valid accuracy": 0.46,
|
||||
"train loss": 0.6433384964466095,
|
||||
"train samples": 5000,
|
||||
"train time": 28.81160366599852,
|
||||
"eval time": 7.707165779999741,
|
||||
"tokens / sec": 7237.986556302045,
|
||||
"mem allocated avg": 6888334700.544,
|
||||
"mem reserved avg": 11139123511.296,
|
||||
"elapsed time": 407.06604839199963
|
||||
},
|
||||
{
|
||||
"step": 1500,
|
||||
"valid accuracy": 0.5,
|
||||
"train loss": 0.6369507477283478,
|
||||
"train samples": 6000,
|
||||
"train time": 28.99961056100119,
|
||||
"eval time": 8.123675749000085,
|
||||
"tokens / sec": 7218.407280320836,
|
||||
"mem allocated avg": 6890289985.536,
|
||||
"mem reserved avg": 11163484028.928,
|
||||
"elapsed time": 486.7935630989996
|
||||
},
|
||||
{
|
||||
"step": 1750,
|
||||
"valid accuracy": 0.42,
|
||||
"train loss": 0.6278414962291717,
|
||||
"train samples": 7000,
|
||||
"train time": 29.449354215004405,
|
||||
"eval time": 11.046255440000095,
|
||||
"tokens / sec": 7108.984410032798,
|
||||
"mem allocated avg": 6891426932.736,
|
||||
"mem reserved avg": 11175706230.784,
|
||||
"elapsed time": 570.2098619899998
|
||||
},
|
||||
{
|
||||
"step": 2000,
|
||||
"valid accuracy": 0.42,
|
||||
"train loss": 0.62835728931427,
|
||||
"train samples": 8000,
|
||||
"train time": 28.844003398995028,
|
||||
"eval time": 11.063917559999936,
|
||||
"tokens / sec": 7200.664801170994,
|
||||
"mem allocated avg": 6888937164.8,
|
||||
"mem reserved avg": 11125752070.144,
|
||||
"elapsed time": 652.66749592
|
||||
},
|
||||
{
|
||||
"step": 2250,
|
||||
"valid accuracy": 0.46,
|
||||
"train loss": 0.616273587346077,
|
||||
"train samples": 9000,
|
||||
"train time": 29.490800742004012,
|
||||
"eval time": 8.136742810000214,
|
||||
"tokens / sec": 7288.645767215389,
|
||||
"mem allocated avg": 6899370121.216,
|
||||
"mem reserved avg": 11286914007.04,
|
||||
"elapsed time": 733.5518891469997
|
||||
},
|
||||
{
|
||||
"step": 2500,
|
||||
"valid accuracy": 0.48,
|
||||
"train loss": 0.6127588752508163,
|
||||
"train samples": 10000,
|
||||
"train time": 28.812003271001686,
|
||||
"eval time": 11.006928690999757,
|
||||
"tokens / sec": 7148.652527306175,
|
||||
"mem allocated avg": 6884932614.144,
|
||||
"mem reserved avg": 11077299470.336,
|
||||
"elapsed time": 815.9489541349999
|
||||
},
|
||||
{
|
||||
"step": 2750,
|
||||
"valid accuracy": 0.48,
|
||||
"train loss": 0.6011098005771637,
|
||||
"train samples": 11000,
|
||||
"train time": 29.451534630989954,
|
||||
"eval time": 11.065294603999973,
|
||||
"tokens / sec": 7194.226129630993,
|
||||
"mem allocated avg": 6895703631.872,
|
||||
"mem reserved avg": 11229007446.016,
|
||||
"elapsed time": 899.4794512619997
|
||||
},
|
||||
{
|
||||
"step": 3000,
|
||||
"valid accuracy": 0.5,
|
||||
"train loss": 0.590887265920639,
|
||||
"train samples": 12000,
|
||||
"train time": 29.118879764002486,
|
||||
"eval time": 11.043336514999282,
|
||||
"tokens / sec": 7168.235924310477,
|
||||
"mem allocated avg": 6890226739.2,
|
||||
"mem reserved avg": 11156563427.328,
|
||||
"elapsed time": 982.334967583
|
||||
},
|
||||
{
|
||||
"step": 3250,
|
||||
"valid accuracy": 0.52,
|
||||
"train loss": 0.6000960898399353,
|
||||
"train samples": 13000,
|
||||
"train time": 29.13528394500463,
|
||||
"eval time": 11.077541423999719,
|
||||
"tokens / sec": 7238.680096548703,
|
||||
"mem allocated avg": 6892138940.416,
|
||||
"mem reserved avg": 11182651998.208,
|
||||
"elapsed time": 1065.5038535119998
|
||||
},
|
||||
{
|
||||
"step": 3500,
|
||||
"valid accuracy": 0.46,
|
||||
"train loss": 0.5813224712610244,
|
||||
"train samples": 14000,
|
||||
"train time": 29.210709365002003,
|
||||
"eval time": 11.022432370000388,
|
||||
"tokens / sec": 7180.585633134473,
|
||||
"mem allocated avg": 6891394273.28,
|
||||
"mem reserved avg": 11167116296.192,
|
||||
"elapsed time": 1148.6551861069997
|
||||
},
|
||||
{
|
||||
"step": 3750,
|
||||
"valid accuracy": 0.5,
|
||||
"train loss": 0.5779635999202728,
|
||||
"train samples": 15000,
|
||||
"train time": 29.93350169399855,
|
||||
"eval time": 11.012248770000042,
|
||||
"tokens / sec": 7239.48043951862,
|
||||
"mem allocated avg": 6900874864.64,
|
||||
"mem reserved avg": 11322674642.944,
|
||||
"elapsed time": 1233.146194169
|
||||
},
|
||||
{
|
||||
"step": 4000,
|
||||
"valid accuracy": 0.5,
|
||||
"train loss": 0.5870059201717377,
|
||||
"train samples": 16000,
|
||||
"train time": 28.73894150599881,
|
||||
"eval time": 11.028763495000021,
|
||||
"tokens / sec": 7111.361424266106,
|
||||
"mem allocated avg": 6883623936.0,
|
||||
"mem reserved avg": 11058022449.152,
|
||||
"elapsed time": 1315.630321268
|
||||
},
|
||||
{
|
||||
"step": 4250,
|
||||
"valid accuracy": 0.48,
|
||||
"train loss": 0.5732149496078491,
|
||||
"train samples": 17000,
|
||||
"train time": 29.274482168998475,
|
||||
"eval time": 11.023004681000202,
|
||||
"tokens / sec": 7220.930460175991,
|
||||
"mem allocated avg": 6893432758.272,
|
||||
"mem reserved avg": 11193867567.104,
|
||||
"elapsed time": 1399.0288222240001
|
||||
},
|
||||
{
|
||||
"step": 4500,
|
||||
"valid accuracy": 0.48,
|
||||
"train loss": 0.5802423723936081,
|
||||
"train samples": 18000,
|
||||
"train time": 28.807760504997532,
|
||||
"eval time": 11.07264679400032,
|
||||
"tokens / sec": 7213.958890138232,
|
||||
"mem allocated avg": 6888416004.096,
|
||||
"mem reserved avg": 11124485390.336,
|
||||
"elapsed time": 1481.5334540769995
|
||||
},
|
||||
{
|
||||
"step": 4750,
|
||||
"valid accuracy": 0.52,
|
||||
"train loss": 0.5696245921850205,
|
||||
"train samples": 19000,
|
||||
"train time": 29.20943511798214,
|
||||
"eval time": 11.082792330000302,
|
||||
"tokens / sec": 7187.369394581538,
|
||||
"mem allocated avg": 6890813089.792,
|
||||
"mem reserved avg": 11168844349.44,
|
||||
"elapsed time": 1565.0750862589998
|
||||
},
|
||||
{
|
||||
"step": 5000,
|
||||
"valid accuracy": 0.5,
|
||||
"train loss": 0.5774132673740386,
|
||||
"train samples": 20000,
|
||||
"train time": 29.084354959996745,
|
||||
"eval time": 11.092419973000688,
|
||||
"tokens / sec": 7161.238414483417,
|
||||
"mem allocated avg": 6887869800.448,
|
||||
"mem reserved avg": 11118328152.064,
|
||||
"elapsed time": 1648.4280528519998
|
||||
},
|
||||
{
|
||||
"step": 5000,
|
||||
"test accuracy": 0.5087187263078089,
|
||||
"train loss": 0.5774132673740386,
|
||||
"train samples": 20000,
|
||||
"train total tokens": 4198051
|
||||
}
|
||||
]
|
||||
},
|
||||
"meta_info": {
|
||||
"model_info": {
|
||||
"sha": "13afe5124825b4f3751f836b40dafda64c1ed062",
|
||||
"created_at": "2024-09-18T15:23:48+00:00"
|
||||
},
|
||||
"dataset_info": {
|
||||
"metamath": {
|
||||
"sha": "aa4f34d3d2d3231299b5b03d9b3e5a20da45aa18",
|
||||
"created_at": "2023-09-21T17:22:46+00:00"
|
||||
},
|
||||
"gsm8k": {
|
||||
"sha": "e53f048856ff4f594e959d75785d2c2d37b678ee",
|
||||
"created_at": "2022-04-12T10:22:10+00:00"
|
||||
}
|
||||
},
|
||||
"package_info": {
|
||||
"transformers-version": "4.52.4",
|
||||
"transformers-commit-hash": null,
|
||||
"peft-version": "0.17.1.dev0",
|
||||
"peft-commit-hash": "47961bb54706e45fd3b5460baa4921a48bcdce35",
|
||||
"datasets-version": "3.6.0",
|
||||
"datasets-commit-hash": null,
|
||||
"bitsandbytes-version": "0.46.0",
|
||||
"bitsandbytes-commit-hash": null,
|
||||
"torch-version": "2.7.1+cu126",
|
||||
"torch-commit-hash": null
|
||||
},
|
||||
"system_info": {
|
||||
"system": "Linux",
|
||||
"release": "6.14.0-1010-aws",
|
||||
"version": "#10~24.04.1-Ubuntu SMP Fri Jul 18 20:44:30 UTC 2025",
|
||||
"machine": "x86_64",
|
||||
"processor": "x86_64",
|
||||
"accelerator": "NVIDIA L40S"
|
||||
},
|
||||
"pytorch_info": "PyTorch built with:\n - GCC 11.2\n - C++ Version: 201703\n - Intel(R) oneAPI Math Kernel Library Version 2024.2-Product Build 20240605 for Intel(R) 64 architecture applications\n - Intel(R) MKL-DNN v3.7.1 (Git Hash 8d263e693366ef8db40acc569cc7d8edf644556d)\n - OpenMP 201511 (a.k.a. OpenMP 4.5)\n - LAPACK is enabled (usually provided by MKL)\n - NNPACK is enabled\n - CPU capability usage: AVX2\n - CUDA Runtime 12.6\n - NVCC architecture flags: -gencode;arch=compute_50,code=sm_50;-gencode;arch=compute_60,code=sm_60;-gencode;arch=compute_70,code=sm_70;-gencode;arch=compute_75,code=sm_75;-gencode;arch=compute_80,code=sm_80;-gencode;arch=compute_86,code=sm_86;-gencode;arch=compute_90,code=sm_90\n - CuDNN 90.7.1 (built against CUDA 12.8)\n - Built with CuDNN 90.5.1\n - Magma 2.6.1\n - Build settings: BLAS_INFO=mkl, BUILD_TYPE=Release, COMMIT_SHA=e2d141dbde55c2a4370fac5165b0561b6af4798b, CUDA_VERSION=12.6, CUDNN_VERSION=9.5.1, CXX_COMPILER=/opt/rh/gcc-toolset-11/root/usr/bin/c++, CXX_FLAGS= -D_GLIBCXX_USE_CXX11_ABI=1 -fvisibility-inlines-hidden -DUSE_PTHREADPOOL -DNDEBUG -DUSE_KINETO -DLIBKINETO_NOROCTRACER -DLIBKINETO_NOXPUPTI=ON -DUSE_FBGEMM -DUSE_PYTORCH_QNNPACK -DUSE_XNNPACK -DSYMBOLICATE_MOBILE_DEBUG_HANDLE -O2 -fPIC -Wall -Wextra -Werror=return-type -Werror=non-virtual-dtor -Werror=range-loop-construct -Werror=bool-operation -Wnarrowing -Wno-missing-field-initializers -Wno-unknown-pragmas -Wno-unused-parameter -Wno-strict-overflow -Wno-strict-aliasing -Wno-stringop-overflow -Wsuggest-override -Wno-psabi -Wno-error=old-style-cast -fdiagnostics-color=always -faligned-new -Wno-maybe-uninitialized -fno-math-errno -fno-trapping-math -Werror=format -Wno-stringop-overflow, LAPACK_INFO=mkl, PERF_WITH_AVX=1, PERF_WITH_AVX2=1, TORCH_VERSION=2.7.1, USE_CUDA=ON, USE_CUDNN=ON, USE_CUSPARSELT=1, USE_GFLAGS=OFF, USE_GLOG=OFF, USE_GLOO=ON, USE_MKL=ON, USE_MKLDNN=ON, USE_MPI=OFF, USE_NCCL=1, USE_NNPACK=ON, USE_OPENMP=ON, USE_ROCM=OFF, USE_ROCM_KERNEL_ASSERT=OFF, \n"
|
||||
}
|
||||
}
|
@ -0,0 +1,352 @@
|
||||
{
|
||||
"run_info": {
|
||||
"created_at": "2025-08-14T13:14:44+00:00",
|
||||
"total_time": 1939.2463944070005,
|
||||
"experiment_name": "miss/llama-3.2-3B-mini",
|
||||
"peft_branch": "main",
|
||||
"train_config": {
|
||||
"model_id": "meta-llama/Llama-3.2-3B",
|
||||
"dtype": "bfloat16",
|
||||
"max_seq_length": 768,
|
||||
"batch_size": 4,
|
||||
"batch_size_eval": 50,
|
||||
"max_steps": 5000,
|
||||
"eval_steps": 250,
|
||||
"compile": false,
|
||||
"query_template": "Question: {query} Think step by step.\nAnswer:",
|
||||
"seed": 0,
|
||||
"grad_norm_clip": 1.0,
|
||||
"optimizer_type": "AdamW",
|
||||
"optimizer_kwargs": {
|
||||
"lr": 0.0001,
|
||||
"weight_decay": 0.1
|
||||
},
|
||||
"lr_scheduler": "cosine",
|
||||
"use_amp": false,
|
||||
"autocast_adapter_dtype": true,
|
||||
"generation_kwargs": {
|
||||
"max_length": 800,
|
||||
"max_new_tokens": 300
|
||||
},
|
||||
"attn_implementation": null
|
||||
},
|
||||
"peft_config": {
|
||||
"task_type": null,
|
||||
"peft_type": "MISS",
|
||||
"auto_mapping": null,
|
||||
"base_model_name_or_path": "meta-llama/Llama-3.2-3B",
|
||||
"revision": null,
|
||||
"inference_mode": false,
|
||||
"r": 64,
|
||||
"miss_dropout": 0.0,
|
||||
"mini_r": 64,
|
||||
"target_modules": [
|
||||
"q_proj",
|
||||
"v_proj"
|
||||
],
|
||||
"exclude_modules": null,
|
||||
"init_weights": "mini",
|
||||
"layers_to_transform": null,
|
||||
"layers_pattern": null,
|
||||
"bias": "none",
|
||||
"modules_to_save": null
|
||||
},
|
||||
"error_msg": ""
|
||||
},
|
||||
"train_info": {
|
||||
"accelerator_memory_reserved_avg": 11076096963,
|
||||
"accelerator_memory_max": 20189282304,
|
||||
"accelerator_memory_reserved_99th": 16251103477,
|
||||
"train_time": 1757.4722608399989,
|
||||
"file_size": 924568,
|
||||
"num_trainable_params": 229376,
|
||||
"num_total_params": 3212979200,
|
||||
"status": "success",
|
||||
"metrics": [
|
||||
{
|
||||
"step": 250,
|
||||
"valid accuracy": 0.34,
|
||||
"train loss": 1.0204485692977905,
|
||||
"train samples": 1000,
|
||||
"train time": 30.37152520300151,
|
||||
"eval time": 11.248587610999493,
|
||||
"tokens / sec": 6970.970294869372,
|
||||
"mem allocated avg": 6780477966.336,
|
||||
"mem reserved avg": 11118412038.144,
|
||||
"elapsed time": 90.66597219600044
|
||||
},
|
||||
{
|
||||
"step": 500,
|
||||
"valid accuracy": 0.34,
|
||||
"train loss": 0.747962894320488,
|
||||
"train samples": 2000,
|
||||
"train time": 29.572977570002877,
|
||||
"eval time": 11.171488900999975,
|
||||
"tokens / sec": 7033.278928631729,
|
||||
"mem allocated avg": 6773512382.464,
|
||||
"mem reserved avg": 11022605746.176,
|
||||
"elapsed time": 174.9062917200008
|
||||
},
|
||||
{
|
||||
"step": 750,
|
||||
"valid accuracy": 0.36,
|
||||
"train loss": 0.7062408643960952,
|
||||
"train samples": 3000,
|
||||
"train time": 30.206997992999277,
|
||||
"eval time": 8.297702855000352,
|
||||
"tokens / sec": 7097.726164304351,
|
||||
"mem allocated avg": 6784103079.936,
|
||||
"mem reserved avg": 11160933892.096,
|
||||
"elapsed time": 257.1565456070002
|
||||
},
|
||||
{
|
||||
"step": 1000,
|
||||
"valid accuracy": 0.42,
|
||||
"train loss": 0.688418450832367,
|
||||
"train samples": 4000,
|
||||
"train time": 29.89673271099673,
|
||||
"eval time": 8.431126997999854,
|
||||
"tokens / sec": 6968.520674614356,
|
||||
"mem allocated avg": 6774552799.232,
|
||||
"mem reserved avg": 11046932709.376,
|
||||
"elapsed time": 338.98425150300045
|
||||
},
|
||||
{
|
||||
"step": 1250,
|
||||
"valid accuracy": 0.26,
|
||||
"train loss": 0.6864906589984894,
|
||||
"train samples": 5000,
|
||||
"train time": 29.511754502003896,
|
||||
"eval time": 11.189089829000295,
|
||||
"tokens / sec": 7066.269136450018,
|
||||
"mem allocated avg": 6774476697.6,
|
||||
"mem reserved avg": 11043107504.128,
|
||||
"elapsed time": 423.2539680640002
|
||||
},
|
||||
{
|
||||
"step": 1500,
|
||||
"valid accuracy": 0.34,
|
||||
"train loss": 0.6819815402030944,
|
||||
"train samples": 6000,
|
||||
"train time": 29.53373699200074,
|
||||
"eval time": 10.82430943000054,
|
||||
"tokens / sec": 7087.860234439605,
|
||||
"mem allocated avg": 6776410671.104,
|
||||
"mem reserved avg": 11066327171.072,
|
||||
"elapsed time": 507.00703521000014
|
||||
},
|
||||
{
|
||||
"step": 1750,
|
||||
"valid accuracy": 0.24,
|
||||
"train loss": 0.6748508417606354,
|
||||
"train samples": 7000,
|
||||
"train time": 29.92787808700814,
|
||||
"eval time": 11.135467017999872,
|
||||
"tokens / sec": 6995.317188587526,
|
||||
"mem allocated avg": 6777799206.912,
|
||||
"mem reserved avg": 11081728655.36,
|
||||
"elapsed time": 591.3889150910009
|
||||
},
|
||||
{
|
||||
"step": 2000,
|
||||
"valid accuracy": 0.32,
|
||||
"train loss": 0.6793494290113449,
|
||||
"train samples": 8000,
|
||||
"train time": 29.828631155996845,
|
||||
"eval time": 7.671241181000369,
|
||||
"tokens / sec": 6962.974563391727,
|
||||
"mem allocated avg": 6775091949.568,
|
||||
"mem reserved avg": 11030155493.376,
|
||||
"elapsed time": 672.1807907450002
|
||||
},
|
||||
{
|
||||
"step": 2250,
|
||||
"valid accuracy": 0.36,
|
||||
"train loss": 0.6712708432674408,
|
||||
"train samples": 9000,
|
||||
"train time": 30.12409129900061,
|
||||
"eval time": 7.389505904999169,
|
||||
"tokens / sec": 7135.418554754249,
|
||||
"mem allocated avg": 6785428178.944,
|
||||
"mem reserved avg": 11193422970.88,
|
||||
"elapsed time": 753.4013944540002
|
||||
},
|
||||
{
|
||||
"step": 2500,
|
||||
"valid accuracy": 0.36,
|
||||
"train loss": 0.670761358499527,
|
||||
"train samples": 10000,
|
||||
"train time": 29.392454811994867,
|
||||
"eval time": 11.273840481000661,
|
||||
"tokens / sec": 7007.478664760802,
|
||||
"mem allocated avg": 6770948311.04,
|
||||
"mem reserved avg": 10981837111.296,
|
||||
"elapsed time": 837.4394236850003
|
||||
},
|
||||
{
|
||||
"step": 2750,
|
||||
"valid accuracy": 0.36,
|
||||
"train loss": 0.6636076529026032,
|
||||
"train samples": 11000,
|
||||
"train time": 30.132865259004575,
|
||||
"eval time": 7.065399131999584,
|
||||
"tokens / sec": 7031.558339334618,
|
||||
"mem allocated avg": 6781682612.224,
|
||||
"mem reserved avg": 11132194521.088,
|
||||
"elapsed time": 918.3523091420002
|
||||
},
|
||||
{
|
||||
"step": 3000,
|
||||
"valid accuracy": 0.3,
|
||||
"train loss": 0.6547267787456512,
|
||||
"train samples": 12000,
|
||||
"train time": 29.80804876098864,
|
||||
"eval time": 6.651864860000387,
|
||||
"tokens / sec": 7002.504648112936,
|
||||
"mem allocated avg": 6776379066.368,
|
||||
"mem reserved avg": 11060597751.808,
|
||||
"elapsed time": 998.1520945420007
|
||||
},
|
||||
{
|
||||
"step": 3250,
|
||||
"valid accuracy": 0.36,
|
||||
"train loss": 0.6653184123039245,
|
||||
"train samples": 13000,
|
||||
"train time": 29.843793005994485,
|
||||
"eval time": 11.134645372000705,
|
||||
"tokens / sec": 7066.829607001965,
|
||||
"mem allocated avg": 6778676955.136,
|
||||
"mem reserved avg": 11088607313.92,
|
||||
"elapsed time": 1082.892349787
|
||||
},
|
||||
{
|
||||
"step": 3500,
|
||||
"valid accuracy": 0.4,
|
||||
"train loss": 0.6504588623046875,
|
||||
"train samples": 14000,
|
||||
"train time": 30.091547277996142,
|
||||
"eval time": 11.186960818999978,
|
||||
"tokens / sec": 6970.395974067295,
|
||||
"mem allocated avg": 6777435619.328,
|
||||
"mem reserved avg": 11074858385.408,
|
||||
"elapsed time": 1168.1813894270008
|
||||
},
|
||||
{
|
||||
"step": 3750,
|
||||
"valid accuracy": 0.38,
|
||||
"train loss": 0.6486766980886459,
|
||||
"train samples": 15000,
|
||||
"train time": 30.235947965997184,
|
||||
"eval time": 6.424060680000366,
|
||||
"tokens / sec": 7167.064854182855,
|
||||
"mem allocated avg": 6787226097.664,
|
||||
"mem reserved avg": 11226214039.552,
|
||||
"elapsed time": 1249.1344440330004
|
||||
},
|
||||
{
|
||||
"step": 4000,
|
||||
"valid accuracy": 0.34,
|
||||
"train loss": 0.6649546232223511,
|
||||
"train samples": 16000,
|
||||
"train time": 29.315789502004918,
|
||||
"eval time": 6.29557701000067,
|
||||
"tokens / sec": 6971.430872977951,
|
||||
"mem allocated avg": 6769964711.936,
|
||||
"mem reserved avg": 10964573356.032,
|
||||
"elapsed time": 1328.0223749040006
|
||||
},
|
||||
{
|
||||
"step": 4250,
|
||||
"valid accuracy": 0.38,
|
||||
"train loss": 0.6468708947896957,
|
||||
"train samples": 17000,
|
||||
"train time": 29.780288893992292,
|
||||
"eval time": 6.263248704000034,
|
||||
"tokens / sec": 7098.2857403591015,
|
||||
"mem allocated avg": 6779703865.344,
|
||||
"mem reserved avg": 11102406574.08,
|
||||
"elapsed time": 1408.0423461730006
|
||||
},
|
||||
{
|
||||
"step": 4500,
|
||||
"valid accuracy": 0.36,
|
||||
"train loss": 0.6554104331731796,
|
||||
"train samples": 18000,
|
||||
"train time": 29.55899746599971,
|
||||
"eval time": 8.381054077999579,
|
||||
"tokens / sec": 7030.617335349179,
|
||||
"mem allocated avg": 6774673686.528,
|
||||
"mem reserved avg": 11030071607.296,
|
||||
"elapsed time": 1489.5959585560004
|
||||
},
|
||||
{
|
||||
"step": 4750,
|
||||
"valid accuracy": 0.3,
|
||||
"train loss": 0.6466003597974778,
|
||||
"train samples": 19000,
|
||||
"train time": 29.626044395983627,
|
||||
"eval time": 8.314826920000087,
|
||||
"tokens / sec": 7086.298703733166,
|
||||
"mem allocated avg": 6776780376.064,
|
||||
"mem reserved avg": 11071855263.744,
|
||||
"elapsed time": 1571.4978439640008
|
||||
},
|
||||
{
|
||||
"step": 5000,
|
||||
"valid accuracy": 0.36,
|
||||
"train loss": 0.6535431078672409,
|
||||
"train samples": 20000,
|
||||
"train time": 29.328363572000853,
|
||||
"eval time": 8.339948383999399,
|
||||
"tokens / sec": 7101.657734454723,
|
||||
"mem allocated avg": 6774118805.504,
|
||||
"mem reserved avg": 11025097162.752,
|
||||
"elapsed time": 1652.8993119440001
|
||||
},
|
||||
{
|
||||
"step": 5000,
|
||||
"test accuracy": 0.3912054586808188,
|
||||
"train loss": 0.6535431078672409,
|
||||
"train samples": 20000,
|
||||
"train total tokens": 4198051
|
||||
}
|
||||
]
|
||||
},
|
||||
"meta_info": {
|
||||
"model_info": {
|
||||
"sha": "13afe5124825b4f3751f836b40dafda64c1ed062",
|
||||
"created_at": "2024-09-18T15:23:48+00:00"
|
||||
},
|
||||
"dataset_info": {
|
||||
"metamath": {
|
||||
"sha": "aa4f34d3d2d3231299b5b03d9b3e5a20da45aa18",
|
||||
"created_at": "2023-09-21T17:22:46+00:00"
|
||||
},
|
||||
"gsm8k": {
|
||||
"sha": "e53f048856ff4f594e959d75785d2c2d37b678ee",
|
||||
"created_at": "2022-04-12T10:22:10+00:00"
|
||||
}
|
||||
},
|
||||
"package_info": {
|
||||
"transformers-version": "4.52.4",
|
||||
"transformers-commit-hash": null,
|
||||
"peft-version": "0.17.1.dev0",
|
||||
"peft-commit-hash": "47961bb54706e45fd3b5460baa4921a48bcdce35",
|
||||
"datasets-version": "3.6.0",
|
||||
"datasets-commit-hash": null,
|
||||
"bitsandbytes-version": "0.46.0",
|
||||
"bitsandbytes-commit-hash": null,
|
||||
"torch-version": "2.7.1+cu126",
|
||||
"torch-commit-hash": null
|
||||
},
|
||||
"system_info": {
|
||||
"system": "Linux",
|
||||
"release": "6.14.0-1010-aws",
|
||||
"version": "#10~24.04.1-Ubuntu SMP Fri Jul 18 20:44:30 UTC 2025",
|
||||
"machine": "x86_64",
|
||||
"processor": "x86_64",
|
||||
"accelerator": "NVIDIA L40S"
|
||||
},
|
||||
"pytorch_info": "PyTorch built with:\n - GCC 11.2\n - C++ Version: 201703\n - Intel(R) oneAPI Math Kernel Library Version 2024.2-Product Build 20240605 for Intel(R) 64 architecture applications\n - Intel(R) MKL-DNN v3.7.1 (Git Hash 8d263e693366ef8db40acc569cc7d8edf644556d)\n - OpenMP 201511 (a.k.a. OpenMP 4.5)\n - LAPACK is enabled (usually provided by MKL)\n - NNPACK is enabled\n - CPU capability usage: AVX2\n - CUDA Runtime 12.6\n - NVCC architecture flags: -gencode;arch=compute_50,code=sm_50;-gencode;arch=compute_60,code=sm_60;-gencode;arch=compute_70,code=sm_70;-gencode;arch=compute_75,code=sm_75;-gencode;arch=compute_80,code=sm_80;-gencode;arch=compute_86,code=sm_86;-gencode;arch=compute_90,code=sm_90\n - CuDNN 90.7.1 (built against CUDA 12.8)\n - Built with CuDNN 90.5.1\n - Magma 2.6.1\n - Build settings: BLAS_INFO=mkl, BUILD_TYPE=Release, COMMIT_SHA=e2d141dbde55c2a4370fac5165b0561b6af4798b, CUDA_VERSION=12.6, CUDNN_VERSION=9.5.1, CXX_COMPILER=/opt/rh/gcc-toolset-11/root/usr/bin/c++, CXX_FLAGS= -D_GLIBCXX_USE_CXX11_ABI=1 -fvisibility-inlines-hidden -DUSE_PTHREADPOOL -DNDEBUG -DUSE_KINETO -DLIBKINETO_NOROCTRACER -DLIBKINETO_NOXPUPTI=ON -DUSE_FBGEMM -DUSE_PYTORCH_QNNPACK -DUSE_XNNPACK -DSYMBOLICATE_MOBILE_DEBUG_HANDLE -O2 -fPIC -Wall -Wextra -Werror=return-type -Werror=non-virtual-dtor -Werror=range-loop-construct -Werror=bool-operation -Wnarrowing -Wno-missing-field-initializers -Wno-unknown-pragmas -Wno-unused-parameter -Wno-strict-overflow -Wno-strict-aliasing -Wno-stringop-overflow -Wsuggest-override -Wno-psabi -Wno-error=old-style-cast -fdiagnostics-color=always -faligned-new -Wno-maybe-uninitialized -fno-math-errno -fno-trapping-math -Werror=format -Wno-stringop-overflow, LAPACK_INFO=mkl, PERF_WITH_AVX=1, PERF_WITH_AVX2=1, TORCH_VERSION=2.7.1, USE_CUDA=ON, USE_CUDNN=ON, USE_CUSPARSELT=1, USE_GFLAGS=OFF, USE_GLOG=OFF, USE_GLOO=ON, USE_MKL=ON, USE_MKLDNN=ON, USE_MPI=OFF, USE_NCCL=1, USE_NNPACK=ON, USE_OPENMP=ON, USE_ROCM=OFF, USE_ROCM_KERNEL_ASSERT=OFF, \n"
}
}
@@ -25,11 +25,12 @@ import random
import sys
import textwrap
import time
from contextlib import AbstractContextManager, nullcontext
from contextlib import nullcontext
from functools import partial
from typing import Any, Callable, Literal, Optional

import torch
from data import get_train_valid_test_datasets
from torch import nn
from torch.amp import GradScaler, autocast
from tqdm import tqdm

@@ -53,9 +54,8 @@ from utils import (
    validate_experiment_path,
)

from data import get_train_valid_test_datasets
from peft import AdaLoraConfig, PeftConfig
from peft.utils import infer_device, CONFIG_NAME
from peft.utils import CONFIG_NAME, infer_device


# # suppress all warnings

@@ -44,7 +44,8 @@ from transformers import (
import peft
from peft import PeftConfig, get_peft_model, prepare_model_for_kbit_training
from peft.optimizers import create_lorafa_optimizer, create_loraplus_optimizer
from peft.utils import infer_device, SAFETENSORS_WEIGHTS_NAME
from peft.utils import SAFETENSORS_WEIGHTS_NAME, infer_device


device = infer_device()

@@ -33,6 +33,7 @@ metric_preferences = {
    "file_size": "lower",
    "test_accuracy": "higher",
    "train_loss": "lower",
    "num_trainable_params": "lower",
}

@@ -51,6 +51,7 @@ def preprocess(rows, task_name: str, print_fn=print):
        "total_time": run_info["total_time"],
        "train_time": train_info["train_time"],
        "file_size": train_info["file_size"],
        "num_trainable_params": train_info["num_trainable_params"],
        "test_accuracy": train_metrics["test accuracy"],
        "train_loss": train_metrics["train loss"],
        "train_samples": train_metrics["train samples"],

@@ -103,6 +104,7 @@ def load_df(path, task_name, print_fn=print):
        "train_loss": float,
        "train_samples": int,
        "train_total_tokens": int,
        "num_trainable_params": int,
        "peft_version": "string",
        "peft_branch": "string",
        "transformers_version": "string",

@@ -131,6 +133,7 @@ def load_df(path, task_name, print_fn=print):
        "accelerator_memory_max",
        "accelerator_memory_reserved_99th",
        "accelerator_memory_reserved_avg",
        "num_trainable_params",
        "file_size",
        "created_at",
        "task_name",

@@ -138,7 +141,6 @@ def load_df(path, task_name, print_fn=print):
    other_columns = [col for col in df if col not in important_columns]
    df = df[important_columns + other_columns]

    size_before_drop_dups = len(df)
    columns = ["experiment_name", "model_id", "peft_type", "created_at"]
    # we want to keep only the most recent run for each experiment
    df = df.sort_values("created_at").drop_duplicates(columns, keep="last")

@@ -24,11 +24,12 @@ import subprocess
from dataclasses import asdict, dataclass, field
from enum import Enum
from typing import Any, Callable, Optional
from peft.utils import infer_device

import psutil
import torch

from peft.utils import infer_device


FILE_NAME_BENCHMARK_PARAMS = "benchmark_params.json"
FILE_NAME_DEFAULT_CONFIG = "default_benchmark_params.json"

@@ -49,3 +49,8 @@ markers = [
    "regression: whether to run regression suite test",
    "bitsandbytes: select bitsandbytes integration tests"
]

filterwarnings = [
    "error::DeprecationWarning:transformers",
    "error::FutureWarning:transformers",
]
4 setup.py

@@ -56,7 +56,7 @@ setup(
    packages=find_packages("src"),
    package_data={"peft": ["py.typed", "tuners/boft/fbd/fbd_cuda.cpp", "tuners/boft/fbd/fbd_cuda_kernel.cu"]},
    entry_points={},
    python_requires=">=3.9.0",
    python_requires=">=3.10.0",
    install_requires=[
        "numpy>=1.17",
        "packaging>=20.0",

@@ -78,10 +78,10 @@ setup(
        "License :: OSI Approved :: Apache Software License",
        "Operating System :: OS Independent",
        "Programming Language :: Python :: 3",
        "Programming Language :: Python :: 3.9",
        "Programming Language :: Python :: 3.10",
        "Programming Language :: Python :: 3.11",
        "Programming Language :: Python :: 3.12",
        "Programming Language :: Python :: 3.13",
        "Topic :: Scientific/Engineering :: Artificial Intelligence",
    ],
)
@@ -59,6 +59,8 @@ from .tuners import (
    C3AModel,
    CPTConfig,
    CPTEmbedding,
    DeloraConfig,
    DeloraModel,
    EvaConfig,
    FourierFTConfig,
    FourierFTModel,

@@ -104,6 +106,8 @@ from .tuners import (
    VBLoRAModel,
    VeraConfig,
    VeraModel,
    WaveFTConfig,
    WaveFTModel,
    XLoraConfig,
    XLoraModel,
    create_arrow_model,

@@ -152,6 +156,8 @@ __all__ = [
    "C3AModel",
    "CPTConfig",
    "CPTEmbedding",
    "DeloraConfig",
    "DeloraModel",
    "EvaConfig",
    "FourierFTConfig",
    "FourierFTModel",

@@ -211,6 +217,8 @@ __all__ = [
    "VBLoRAModel",
    "VeraConfig",
    "VeraModel",
    "WaveFTConfig",
    "WaveFTModel",
    "XLoraConfig",
    "XLoraModel",
    "bloom_model_postprocess_past_key_value",
@@ -11,6 +11,9 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import annotations

import importlib.metadata
import inspect
import json
import os

@@ -18,9 +21,12 @@ import warnings
from dataclasses import asdict, dataclass, field
from typing import Optional, Union

import packaging.version
from huggingface_hub import hf_hub_download
from transformers.utils import PushToHubMixin, http_user_agent

from peft import __version__

from .utils import CONFIG_NAME, PeftType, TaskType


@@ -43,6 +49,30 @@ def _check_and_remove_unused_kwargs(cls, kwargs):
    return kwargs, unexpected_kwargs


def _is_dev_version(version: str) -> bool:
    # check if the given version is a dev version
    return packaging.version.Version(version).dev is not None


def _get_commit_hash(pkg_name: str) -> str | None:
    # If PEFT was installed from a specific commit hash, try to get it. This works e.g. when installing PEFT with `pip
    # install git+https://github.com/huggingface/peft.git@<HASH>`. This works not for other means, like editable
    # installs.
    try:
        dist = importlib.metadata.distribution(pkg_name)
    except importlib.metadata.PackageNotFoundError:
        return None

    # See: https://packaging.python.org/en/latest/specifications/direct-url/
    for path in dist.files or []:
        if path.name == "direct_url.json":
            direct_url = json.loads((dist.locate_file(path)).read_text())
            vcs_info = direct_url.get("vcs_info")
            if vcs_info and "commit_id" in vcs_info:
                return vcs_info["commit_id"]
    return None
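For context, the helper above reads the metadata that pip records for VCS installs (PEP 610, `direct_url.json`). The sketch below is illustrative only: the commit hash is a placeholder, and the lookup simply mirrors the code above rather than adding new behavior.

```python
# Illustrative sketch: what pip writes to direct_url.json for a VCS install
# (the commit hash below is a placeholder, not a real PEFT commit).
import importlib.metadata
import json

example_direct_url = {
    "url": "https://github.com/huggingface/peft.git",
    "vcs_info": {"vcs": "git", "commit_id": "0123456789abcdef0123456789abcdef01234567"},
}
print(example_direct_url["vcs_info"]["commit_id"])  # the value _get_commit_hash would return

# For a package that really was installed from git, the same record can be read directly:
dist = importlib.metadata.distribution("peft")
for path in dist.files or []:
    if path.name == "direct_url.json":
        print(json.loads(dist.locate_file(path).read_text()).get("vcs_info"))
```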

@dataclass
class PeftConfigMixin(PushToHubMixin):
    r"""

@@ -60,6 +90,7 @@ class PeftConfigMixin(PushToHubMixin):
    auto_mapping: Optional[dict] = field(
        default=None, metadata={"help": "An auto mapping dict to help retrieve the base model class if needed."}
    )
    peft_version: Optional[str] = field(default=None, metadata={"help": "PEFT version, leave empty to auto-fill."})

    def __post_init__(self):
        # check for invalid task type

@@ -67,6 +98,30 @@ class PeftConfigMixin(PushToHubMixin):
            raise ValueError(
                f"Invalid task type: '{self.task_type}'. Must be one of the following task types: {', '.join(TaskType)}."
            )
        if self.peft_version is None:
            self.peft_version = self._get_peft_version()

    @staticmethod
    def _get_peft_version() -> str:
        # gets the current peft version; if it's a dev version, try to get the commit hash too, as the dev version is
        # ambiguous
        version = __version__
        if not _is_dev_version(version):
            return version

        try:
            git_hash = _get_commit_hash("peft")
            if git_hash is None:
                git_hash = "UNKNOWN"
        except Exception:
            # Broad exception: We never want to break user code just because the git_hash could not be determined
            warnings.warn(
                "A dev version of PEFT is used but there was an error while trying to determine the commit hash. "
                "Please open an issue: https://github.com/huggingface/peft/issues"
            )
            git_hash = "UNKNOWN"
        version = version + f"@{git_hash}"
        return version
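As a rough illustration of the resulting format (the version number is an example; the commit hash is the one reported elsewhere in this comparison): a release install yields a plain version string, while a dev install gets the commit hash, or `UNKNOWN`, appended.

```python
# Rough illustration of the strings _get_peft_version can produce (format only):
#   release install             -> "0.17.1"
#   dev install, known commit   -> "0.17.1.dev0@47961bb54706e45fd3b5460baa4921a48bcdce35"
#   dev install, unknown commit -> "0.17.1.dev0@UNKNOWN"
from peft import LoraConfig

# With a PEFT build that includes this change, the field is auto-filled in __post_init__:
print(LoraConfig().peft_version)
```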

    def to_dict(self) -> dict:
        r"""
34 src/peft/functional.py Normal file

@@ -0,0 +1,34 @@
# Copyright 2025-present the HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Functions that are useful for integration with non-PeftModel models, e.g. transformers or diffusers.

The functions provided here can be considered "public API" of PEFT and hence are safe to be used by packages that
provide PEFT integrations.
"""

from peft.mapping import inject_adapter_in_model
from peft.tuners.tuners_utils import cast_adapter_dtype, delete_adapter, set_adapter, set_requires_grad
from peft.utils import get_peft_model_state_dict, set_peft_model_state_dict


__all__ = [
    "cast_adapter_dtype",
    "delete_adapter",
    "get_peft_model_state_dict",
    "inject_adapter_in_model",
    "set_adapter",
    "set_peft_model_state_dict",
    "set_requires_grad",
]
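To make the intent of this new module concrete, here is a minimal sketch of the integration flow it is meant to support. The model class and layer names are made up for the example, and it assumes a PEFT build that ships `peft.functional` (added in this diff).

```python
# Minimal sketch of the low-level integration flow via peft.functional.
import torch
from torch import nn

from peft import LoraConfig
from peft.functional import (
    get_peft_model_state_dict,
    inject_adapter_in_model,
    set_peft_model_state_dict,
)


class TinyModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.linear = nn.Linear(16, 16)
        self.lm_head = nn.Linear(16, 4)

    def forward(self, x):
        return self.lm_head(self.linear(x))


config = LoraConfig(r=8, target_modules=["linear"])
model = TinyModel()
# Mutates `model` in place and returns it; no PeftModel wrapper is created.
model = inject_adapter_in_model(config, model)

# Only the adapter weights end up in this state dict (e.g. lora_A / lora_B tensors).
adapter_state = get_peft_model_state_dict(model)
# ... later, e.g. after re-creating the model and injecting the adapter again:
set_peft_model_state_dict(model, adapter_state)
```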
@@ -52,13 +52,16 @@ def inject_adapter_in_model(
    state_dict: Optional[dict[str, torch.Tensor]] = None,
) -> torch.nn.Module:
    r"""
    A simple API to create and inject adapter in-place into a model. Currently the API does not support prompt learning
    methods and adaption prompt. Make sure to have the correct `target_names` set in the `peft_config` object. The API
    calls `get_peft_model` under the hood but would be restricted only to non-prompt learning methods.
    Create PEFT layers and inject them into the model in-place.

    Currently the API does not support prompt learning methods and adaption prompt.

    This function is similar to [`get_peft_model`] but it does not return a [`PeftModel`] instance. Instead, it returns
    the original, mutated instance of the passed model.

    Args:
        peft_config (`PeftConfig`):
            Configuration object containing the parameters of the Peft model.
            Configuration object containing the parameters of the PEFT model.
        model (`torch.nn.Module`):
            The input model where the adapter will be injected.
        adapter_name (`str`, `optional`, defaults to `"default"`):

@@ -66,9 +69,9 @@ def inject_adapter_in_model(
        low_cpu_mem_usage (`bool`, `optional`, defaults to `False`):
            Create empty adapter weights on meta device. Useful to speed up the loading process.
        state_dict (`dict`, *optional*, defaults to `None`)
            If a state_dict is passed here, the adapters will be injected based on the entries of the state_dict. This
            can be useful when the exact `target_modules` of the PEFT method is unknown, for instance because the
            checkpoint was created without meta data. Note that the values from the state_dict are not used, only the
            If a `state_dict` is passed here, the adapters will be injected based on the entries of the state_dict.
            This can be useful when the exact `target_modules` of the PEFT method is unknown, for instance because the
            checkpoint was created without meta data. Note that the values from the `state_dict` are not used, only the
            keys are used to determine the correct layers that should be adapted.
    """
    if peft_config.is_prompt_learning or peft_config.is_adaption_prompt:
@@ -101,7 +101,7 @@ def get_peft_model(
    prefix = PEFT_TYPE_TO_PREFIX_MAPPING.get(peft_config.peft_type)
    if prefix and adapter_name in prefix:
        warnings.warn(
            f"Adapter name {adapter_name} should not be contained in the prefix {prefix}."
            f"Adapter name '{adapter_name}' should not be contained in the prefix '{prefix}'. "
            "This may lead to reinitialization of the adapter weights during loading."
        )
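A minimal sketch of the situation this warning guards against (the model id and adapter names below are only examples): the check is `adapter_name in prefix`, so an adapter literally named "lora" is a substring of the LoRA prefix "lora_" and triggers the warning.

```python
# Illustrative sketch of the condition that triggers the warning above.
from peft import LoraConfig, get_peft_model
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m")
config = LoraConfig(task_type="CAUSAL_LM")

# The LoRA prefix is "lora_", and "lora" is contained in it -> warning: the resulting
# parameter keys become ambiguous when the adapter is loaded again later.
peft_model = get_peft_model(model, config, adapter_name="lora")

# A name that is not a substring of the prefix is safe:
# peft_model = get_peft_model(model, config, adapter_name="my_adapter")
```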
@@ -71,12 +71,8 @@ class PeftMixedModel(PushToHubMixin, torch.nn.Module):
    This class does not support loading/saving, and it shouldn't usually be initialized directly. Instead, use
    `get_peft_model` with the argument `mixed=True`.

    <Tip>

    Read the [Mixed adapter types](https://huggingface.co/docs/peft/en/developer_guides/mixed_models) guide to learn
    more about using different adapter types.

    </Tip>
    > [!TIP] > Read the [Mixed adapter types](https://huggingface.co/docs/peft/en/developer_guides/mixed_models) guide
    to learn > more about using different adapter types.

    Example:

@@ -224,12 +220,8 @@ class PeftMixedModel(PushToHubMixin, torch.nn.Module):
                Create empty adapter weights on meta device. Useful to speed up the process when loading saved
                adapters.

                <Tip>

                Don't use `low_cpu_mem_usage=True` when creating a new PEFT adapter for training (training is untested
                and discouraged for PeftMixedModel in general).

                </Tip>
                > [!TIP] > Don't use `low_cpu_mem_usage=True` when creating a new PEFT adapter for training (training
                is untested > and discouraged for PeftMixedModel in general).
        """
        _check_config_compatible(peft_config)

@@ -430,15 +422,17 @@ class PeftMixedModel(PushToHubMixin, torch.nn.Module):

        # load the config
        if config is None:
            config = PEFT_TYPE_TO_CONFIG_MAPPING[
                PeftConfig._get_peft_type(
                    model_id,
                    subfolder=kwargs.get("subfolder", None),
                    revision=kwargs.get("revision", None),
                    cache_dir=kwargs.get("cache_dir", None),
                    use_auth_token=kwargs.get("use_auth_token", None),
                )
            ].from_pretrained(model_id, **kwargs)
            hf_kwargs = {
                "subfolder": kwargs.get("subfolder", None),
                "revision": kwargs.get("revision", None),
                "cache_dir": kwargs.get("cache_dir", None),
                "token": kwargs.get("token", None),
            }
            if use_auth_token := kwargs.get("use_auth_token", None):
                hf_kwargs["use_auth_token"] = use_auth_token
            config = PEFT_TYPE_TO_CONFIG_MAPPING[PeftConfig._get_peft_type(model_id, **hf_kwargs)].from_pretrained(
                model_id, **kwargs
            )
        elif isinstance(config, PeftConfig):
            config.inference_mode = not is_trainable
        else:
@@ -19,6 +19,7 @@ import copy
import inspect
import os
import warnings
from collections.abc import Sequence
from contextlib import contextmanager, nullcontext
from copy import deepcopy
from dataclasses import dataclass

@@ -40,9 +41,10 @@ from transformers.utils import PushToHubMixin

from peft.tuners.lora.variants import get_alora_offsets_for_forward, get_alora_offsets_for_generate
from peft.tuners.tuners_utils import BaseTuner, BaseTunerLayer
from peft.utils import AuxiliaryTrainingWrapper
from peft.utils.constants import DUMMY_MODEL_CONFIG
from peft.utils.integrations import init_empty_weights
from peft.utils.other import create_attention_mask, set_additional_trainable_modules
from peft.utils.other import TrainableTokensWrapper, create_attention_mask, set_additional_trainable_modules

from . import __version__
from .config import PeftConfig

@@ -82,11 +84,7 @@ class PeftModel(PushToHubMixin, torch.nn.Module):
        low_cpu_mem_usage (`bool`, `optional`, defaults to `False`):
            Create empty adapter weights on meta device. Useful to speed up the loading loading process.

            <Tip>

            Don't use `low_cpu_mem_usage=True` when creating a new PEFT adapter for training.

            </Tip>
            > [!TIP] > Don't use `low_cpu_mem_usage=True` when creating a new PEFT adapter for training.

    **Attributes**:
        - **base_model** ([`torch.nn.Module`]) -- The base transformer model used for Peft.

@@ -437,16 +435,17 @@ class PeftModel(PushToHubMixin, torch.nn.Module):

        # load the config
        if config is None:
            config = PEFT_TYPE_TO_CONFIG_MAPPING[
                PeftConfig._get_peft_type(
                    model_id,
                    subfolder=kwargs.get("subfolder", None),
                    revision=kwargs.get("revision", None),
                    cache_dir=kwargs.get("cache_dir", None),
                    use_auth_token=kwargs.get("use_auth_token", None),
                    token=kwargs.get("token", None),
                )
            ].from_pretrained(model_id, **kwargs)
            hf_kwargs = {
                "subfolder": kwargs.get("subfolder", None),
                "revision": kwargs.get("revision", None),
                "cache_dir": kwargs.get("cache_dir", None),
                "token": kwargs.get("token", None),
            }
            if use_auth_token := kwargs.get("use_auth_token", None):
                hf_kwargs["use_auth_token"] = use_auth_token
            config = PEFT_TYPE_TO_CONFIG_MAPPING[PeftConfig._get_peft_type(model_id, **hf_kwargs)].from_pretrained(
                model_id, **kwargs
            )
        elif isinstance(config, PeftConfig):
            config.inference_mode = not is_trainable
        else:

@@ -578,10 +577,10 @@ class PeftModel(PushToHubMixin, torch.nn.Module):

        prefix = PEFT_TYPE_TO_PREFIX_MAPPING.get(config.peft_type)
        if prefix and adapter_name in prefix:
            warn_message += (
                f"Adapter name {adapter_name} should not be contained in the prefix {prefix}."
                "This could be the potential reason for missing adapter keys."
            )
            warn_message = (
                f"Adapter name '{adapter_name}' should not be contained in the prefix '{prefix}'. "
                "This could be the potential reason for missing adapter keys. "
            ) + warn_message

        warnings.warn(warn_message)

@@ -999,7 +998,7 @@ class PeftModel(PushToHubMixin, torch.nn.Module):
        prefix = PEFT_TYPE_TO_PREFIX_MAPPING.get(peft_config.peft_type)
        if prefix and adapter_name in prefix:
            warnings.warn(
                f"Adapter name {adapter_name} should not be contained in the prefix {prefix}."
                f"Adapter name '{adapter_name}' should not be contained in the prefix '{prefix}'. "
                "This may lead to reinitialization of the adapter weights during loading."
            )

@@ -1460,6 +1459,26 @@ class PeftModel(PushToHubMixin, torch.nn.Module):
        # handle auxiliary modules
        _set_adapter(self, adapter_name)

    def set_requires_grad(self, adapter_names: str | Sequence[str], requires_grad: bool = True) -> None:
        """
        Enable or disable gradients on the given adapter(s).

        Note: Not supported for prompt learning methods like prompt tuning.

        Args:
            adapter_name (`str` or `Sequence[str]`):
                The name of the adapter(s) whose gradients should be enabled/disabled.
            requires_grad (`bool`, *optional*)
                Whether to enable (`True`, default) or disable (`False`).
        """
        if self.active_peft_config.is_prompt_learning:
            raise TypeError(
                "Setting `requires_grad` is not supported for prompt learning methods like "
                f"{self.active_peft_config.peft_type.value}."
            )

        self.base_model.set_requires_grad(adapter_names=adapter_names, requires_grad=requires_grad)
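A short sketch of how the new method can be used, for instance to train one adapter while keeping another frozen. The adapter names and model id are illustrative, and this assumes a PEFT build that includes `set_requires_grad`.

```python
# Minimal sketch: freeze one adapter, train another, using the new set_requires_grad API.
from peft import LoraConfig, get_peft_model
from transformers import AutoModelForCausalLM

base = AutoModelForCausalLM.from_pretrained("facebook/opt-125m")
model = get_peft_model(base, LoraConfig(task_type="CAUSAL_LM"), adapter_name="adapter_a")
model.add_adapter("adapter_b", LoraConfig(task_type="CAUSAL_LM"))

# Train only adapter_b: disable grads on adapter_a, enable them on adapter_b.
model.set_requires_grad("adapter_a", requires_grad=False)
model.set_requires_grad("adapter_b", requires_grad=True)
```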

    @property
    def base_model_torch_dtype(self):
        return getattr(self.base_model, "dtype", None)

@@ -3047,7 +3066,11 @@ def get_layer_status(model: torch.nn.Module) -> list[TunerLayerStatus]:

    layer_status: list[TunerLayerStatus] = []
    for name, module in base_model.named_modules():
        if not isinstance(module, BaseTunerLayer):
        if not isinstance(module, (BaseTunerLayer, AuxiliaryTrainingWrapper)):
            continue
        if isinstance(module, TrainableTokensWrapper):
            # Skip TrainableTokensWrapper, since it wraps TrainableTokensLayer, which is the actual PEFT layer we're
            # interested in.
            continue

        # determine if all submodules/parameters if this module require grad or not
@@ -18,6 +18,7 @@ from .boft import BOFTConfig, BOFTModel
from .bone import BoneConfig, BoneModel
from .c3a import C3AConfig, C3AModel
from .cpt import CPTConfig, CPTEmbedding
from .delora import DeloraConfig, DeloraModel
from .fourierft import FourierFTConfig, FourierFTModel
from .hra import HRAConfig, HRAModel
from .ia3 import IA3Config, IA3Model

@@ -49,6 +50,7 @@ from .shira import ShiraConfig, ShiraModel
from .trainable_tokens import TrainableTokensConfig, TrainableTokensModel
from .vblora import VBLoRAConfig, VBLoRAModel
from .vera import VeraConfig, VeraModel
from .waveft import WaveFTConfig, WaveFTModel
from .xlora import XLoraConfig, XLoraModel


@@ -66,6 +68,8 @@ __all__ = [
    "C3AModel",
    "CPTConfig",
    "CPTEmbedding",
    "DeloraConfig",
    "DeloraModel",
    "EvaConfig",
    "FourierFTConfig",
    "FourierFTModel",

@@ -113,6 +117,8 @@ __all__ = [
    "VBLoRAModel",
    "VeraConfig",
    "VeraModel",
    "WaveFTConfig",
    "WaveFTModel",
    "XLoraConfig",
    "XLoraModel",
    "create_arrow_model",
@@ -63,7 +63,8 @@ class AdaLoraModel(LoraModel):
        - **peft_config** ([`AdaLoraConfig`]): The configuration of the AdaLora model.
    """

    # Note: don't redefine prefix here, it should be inherited from LoraModel
    # Note: don't redefine prefix or tuner_layer_cls here, it should be inherited from LoraModel
    target_module_mapping = TRANSFORMERS_MODELS_TO_ADALORA_TARGET_MODULES_MAPPING

    def __init__(self, model, config, adapter_name, **kwargs):
        super().__init__(model, config, adapter_name, **kwargs)

@@ -221,25 +222,6 @@ class AdaLoraModel(LoraModel):

        return new_module

    @staticmethod
    def _prepare_adapter_config(peft_config, model_config):
        if peft_config.target_modules is None:
            if model_config["model_type"] not in TRANSFORMERS_MODELS_TO_ADALORA_TARGET_MODULES_MAPPING:
                raise ValueError("Please specify `target_modules` in `peft_config`")
            peft_config.target_modules = TRANSFORMERS_MODELS_TO_ADALORA_TARGET_MODULES_MAPPING[
                model_config["model_type"]
            ]
        return peft_config

    def __getattr__(self, name: str):
        """Forward missing attributes to the wrapped module."""
        try:
            return super().__getattr__(name)  # defer to nn.Module's logic
        except AttributeError:
            if name == "model":  # see #1892: prevent infinite recursion if class is not initialized
                raise
            return getattr(self.model, name)

    def forward(self, *args, **kwargs):
        outputs = self.model.forward(*args, **kwargs)
@ -16,27 +16,15 @@
|
||||
# via Butterfly Factorization" (https://huggingface.co/papers/2311.06243) in ICLR 2024.
|
||||
|
||||
import warnings
|
||||
from dataclasses import asdict
|
||||
from enum import Enum
|
||||
from typing import Optional
|
||||
|
||||
import torch
|
||||
from torch import nn
|
||||
from tqdm import tqdm
|
||||
|
||||
from peft.tuners.tuners_utils import (
|
||||
BaseTuner,
|
||||
BaseTunerLayer,
|
||||
check_target_module_exists,
|
||||
onload_layer,
|
||||
)
|
||||
from peft.utils import (
|
||||
TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING,
|
||||
ModulesToSaveWrapper,
|
||||
_get_submodules,
|
||||
)
|
||||
from peft.utils import TRANSFORMERS_MODELS_TO_BOFT_TARGET_MODULES_MAPPING
|
||||
|
||||
from .config import BOFTConfig
|
||||
from .layer import BOFTLayer, Conv2d, Linear
|
||||
|
||||
|
||||
@ -73,25 +61,8 @@ class BOFTModel(BaseTuner):
|
||||
"""
|
||||
|
||||
prefix: str = "boft_"
|
||||
|
||||
def _check_new_adapter_config(self, config: BOFTConfig) -> None:
|
||||
"""
|
||||
A helper method to check the config when a new adapter is being added.
|
||||
|
||||
Raise a ValueError if there is something wrong with the config or if it conflicts with existing adapters.
|
||||
|
||||
"""
|
||||
# TODO: there should be a check if any of the existing adapters actually has bias != "none", or else the check
|
||||
# does not fully correspond to the error message.
|
||||
if (len(self.peft_config) > 1) and (config.bias != "none"):
|
||||
raise ValueError(
|
||||
f"{self.__class__.__name__} supports only 1 adapter with bias. When using multiple adapters, "
|
||||
"set bias to 'none' for all adapters."
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
def _check_target_module_exists(boft_config, key):
|
||||
return check_target_module_exists(boft_config, key)
|
||||
tuner_layer_cls = BOFTLayer
|
||||
target_module_mapping = TRANSFORMERS_MODELS_TO_BOFT_TARGET_MODULES_MAPPING
|
||||
|
||||
def _create_and_replace(
|
||||
self,
|
||||
@ -134,55 +105,6 @@ class BOFTModel(BaseTuner):
|
||||
init_weights=boft_config.init_weights,
|
||||
)
|
||||
|
||||
def _replace_module(self, parent, child_name, new_module, child):
|
||||
setattr(parent, child_name, new_module)
|
||||
# It's not necessary to set requires_grad here, as that is handled by
|
||||
# _mark_only_adapters_as_trainable
|
||||
|
||||
# child layer wraps the original module, unpack it
|
||||
if hasattr(child, "base_layer"):
|
||||
child = child.base_layer
|
||||
|
||||
if not hasattr(new_module, "base_layer"):
|
||||
new_module.weight = child.weight
|
||||
if hasattr(child, "bias"):
|
||||
new_module.bias = child.bias
|
||||
|
||||
if getattr(child, "state", None) is not None:
|
||||
if hasattr(new_module, "base_layer"):
|
||||
new_module.base_layer.state = child.state
|
||||
else:
|
||||
new_module.state = child.state
|
||||
new_module.to(child.weight.device)
|
||||
|
||||
meta = torch.device("meta")
|
||||
# dispatch to correct device
|
||||
for name, module in new_module.named_modules():
|
||||
if self.prefix in name:
|
||||
if not any(p.device == meta for p in module.parameters()):
|
||||
module.to(child.weight.device)
|
||||
|
||||
def _mark_only_adapters_as_trainable(self, model: nn.Module) -> None:
|
||||
for n, p in model.named_parameters():
|
||||
if self.prefix not in n:
|
||||
p.requires_grad = False
|
||||
|
||||
for active_adapter in self.active_adapters:
|
||||
bias = self.peft_config[active_adapter].bias
|
||||
if bias == "none":
|
||||
continue
|
||||
|
||||
if bias == "all":
|
||||
for n, p in model.named_parameters():
|
||||
if "bias" in n:
|
||||
p.requires_grad = True
|
||||
elif bias == "boft_only":
|
||||
for name, m in model.named_modules():
|
||||
if isinstance(m, BOFTLayer) and hasattr(m, "bias") and m.bias is not None:
|
||||
m.bias.requires_grad = True
|
||||
else:
|
||||
raise NotImplementedError(f"Requested bias: {bias}, is not implemented.")
|
||||
|
||||
@staticmethod
|
||||
def _create_new_module(boft_config, adapter_name, target, **kwargs):
|
||||
if isinstance(target, BaseTunerLayer):
|
||||
@ -207,146 +129,3 @@ class BOFTModel(BaseTuner):
|
||||
)
|
||||
|
||||
return new_module
|
||||
|
||||
def __getattr__(self, name: str):
|
||||
"""Forward missing attributes to the wrapped module."""
|
||||
try:
|
||||
return super().__getattr__(name) # defer to nn.Module's logic
|
||||
except AttributeError:
|
||||
if name == "model": # see #1892: prevent infinite recursion if class is not initialized
|
||||
raise
|
||||
return getattr(self.model, name)
|
||||
|
||||
def get_peft_config_as_dict(self, inference: bool = False):
|
||||
config_dict = {}
|
||||
for key, value in self.peft_config.items():
|
||||
config = {k: v.value if isinstance(v, Enum) else v for k, v in asdict(value).items()}
|
||||
if inference:
|
||||
config["inference_mode"] = True
|
||||
config_dict[key] = config
|
||||
return config
|
||||
|
||||
def _set_adapter_layers(self, enabled=True):
|
||||
for module in self.model.modules():
|
||||
if isinstance(module, (BaseTunerLayer, ModulesToSaveWrapper)):
|
||||
module.enable_adapters(enabled)
|
||||
|
||||
def enable_adapter_layers(self):
|
||||
self._set_adapter_layers(enabled=True)
|
||||
|
||||
def disable_adapter_layers(self):
|
||||
for active_adapter in self.active_adapters:
|
||||
val = self.peft_config[active_adapter].bias
|
||||
if val != "none":
|
||||
msg = (
|
||||
f"Careful, disabling adapter layers with bias configured to be '{val}' does not produce the same "
|
||||
"output as the base model would without adaption."
|
||||
)
|
||||
warnings.warn(msg)
|
||||
self._set_adapter_layers(enabled=False)
|
||||
|
||||
def set_adapter(self, adapter_name, inference_mode: bool = False):
|
||||
self.set_auxiliary_adapters(adapter_name, inference_mode=inference_mode)
|
||||
for module in self.model.modules():
|
||||
if isinstance(module, BOFTLayer):
|
||||
if module.merged:
|
||||
warnings.warn("Adapter cannot be set when the model is merged. Unmerging the model first.")
|
||||
module.unmerge()
|
||||
module.set_adapter(adapter_name, inference_mode=inference_mode)
|
||||
self.active_adapter = adapter_name
|
||||
|
||||
@staticmethod
|
||||
def _prepare_adapter_config(peft_config, model_config):
|
||||
if peft_config.target_modules is None:
|
||||
if model_config["model_type"] not in TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING:
|
||||
raise ValueError("Please specify `target_modules` in `peft_config`")
|
||||
peft_config.target_modules = set(
|
||||
TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING[model_config["model_type"]]
|
||||
)
|
||||
return peft_config
|
||||
|
||||
def _unload_and_optionally_merge(
|
||||
self,
|
||||
merge=True,
|
||||
progressbar: bool = False,
|
||||
safe_merge: bool = False,
|
||||
adapter_names: Optional[list[str]] = None,
|
||||
):
|
||||
if merge:
|
||||
self._check_merge_allowed()
|
||||
|
||||
key_list = [key for key, _ in self.model.named_modules() if self.prefix not in key]
|
||||
desc = "Unloading " + ("and merging " if merge else "") + "model"
|
||||
for key in tqdm(key_list, disable=not progressbar, desc=desc):
|
||||
try:
|
||||
parent, target, target_name = _get_submodules(self.model, key)
|
||||
except AttributeError:
|
||||
continue
|
||||
with onload_layer(target):
|
||||
if hasattr(target, "base_layer"):
|
||||
if merge:
|
||||
target.merge(safe_merge=safe_merge, adapter_names=adapter_names)
|
||||
self._replace_module(parent, target_name, target.get_base_layer(), target)
|
||||
elif isinstance(target, ModulesToSaveWrapper):
|
||||
# save any additional trainable modules part of `modules_to_save`
|
||||
new_module = target.modules_to_save[target.active_adapter]
|
||||
if hasattr(new_module, "base_layer"):
|
||||
# check if the module is itself a tuner layer
|
||||
if merge:
|
||||
new_module.merge(safe_merge=safe_merge, adapter_names=adapter_names)
|
||||
new_module = new_module.get_base_layer()
|
||||
setattr(parent, target_name, new_module)
|
||||
|
||||
return self.model
|
||||
|
||||
def delete_adapter(self, adapter_name: str) -> None:
|
||||
"""
|
||||
Deletes an existing adapter.
|
||||
|
||||
Args:
|
||||
adapter_name (str): Name of the adapter to be deleted.
|
||||
"""
|
||||
if adapter_name not in list(self.peft_config.keys()):
|
||||
raise ValueError(f"Adapter {adapter_name} does not exist")
|
||||
del self.peft_config[adapter_name]
|
||||
|
||||
key_list = [key for key, _ in self.model.named_modules() if self.prefix not in key]
|
||||
new_adapter = None
|
||||
for key in key_list:
|
||||
_, target, _ = _get_submodules(self.model, key)
|
||||
if isinstance(target, BOFTLayer):
|
||||
target.delete_adapter(adapter_name)
|
||||
if new_adapter is None:
|
||||
new_adapter = target.active_adapters[:]
|
||||
|
||||
self.active_adapter = new_adapter or []
|
||||
self._delete_auxiliary_adapter(adapter_name, new_active_adapters=new_adapter)
|
||||
|
||||
def merge_and_unload(
|
||||
self, progressbar: bool = False, safe_merge: bool = False, adapter_names: Optional[list[str]] = None
|
||||
) -> torch.nn.Module:
|
||||
r"""
|
||||
This method merges the BOFT layers into the base model. This is needed if someone wants to use the base model
|
||||
as a standalone model.
|
||||
|
||||
Args:
|
||||
progressbar (`bool`):
|
||||
whether to show a progressbar indicating the unload and merge process
|
||||
safe_merge (`bool`):
|
||||
whether to activate the safe merging check to check if there is any potential Nan in the adapter
|
||||
weights
|
||||
adapter_names (`List[str]`, *optional*):
|
||||
The list of adapter names that should be merged. If None, all active adapters will be merged. Defaults
|
||||
to `None`.
|
||||
|
||||
"""
|
||||
return self._unload_and_optionally_merge(
|
||||
progressbar=progressbar, safe_merge=safe_merge, adapter_names=adapter_names
|
||||
)
|
||||
|
||||
def unload(self) -> torch.nn.Module:
|
||||
"""
|
||||
Gets back the base model by removing all the boft modules without merging. This gives back the original base
|
||||
model.
|
||||
"""
|
||||
return self._unload_and_optionally_merge(merge=False)
|
||||
|
@ -96,7 +96,7 @@ class BoneConfig(PeftConfig):
|
||||
"help": "The layer pattern name, used only if `layers_to_transform` is different to None and if the layer pattern is not in the common layers pattern."
|
||||
},
|
||||
)
|
||||
bias: str = field(default="none", metadata={"help": "Bias type for Bone. Can be 'none', 'all' or 'Bone_only'"})
|
||||
bias: str = field(default="none", metadata={"help": "Bias type for Bone. Can be 'none', 'all' or 'bone_only'"})
|
||||
modules_to_save: Optional[list[str]] = field(
|
||||
default=None,
|
||||
metadata={
|
||||
|
@ -12,23 +12,12 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import warnings
|
||||
from dataclasses import asdict
|
||||
from enum import Enum
|
||||
from typing import Optional
|
||||
|
||||
import torch
|
||||
from torch import nn
|
||||
from tqdm import tqdm
|
||||
|
||||
from peft.tuners.tuners_utils import BaseTuner, BaseTunerLayer, check_target_module_exists
|
||||
from peft.utils import (
|
||||
TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING,
|
||||
ModulesToSaveWrapper,
|
||||
_get_submodules,
|
||||
)
|
||||
from peft.tuners.tuners_utils import BaseTuner, BaseTunerLayer
|
||||
from peft.utils import TRANSFORMERS_MODELS_TO_BONE_TARGET_MODULES_MAPPING
|
||||
|
||||
from .config import BoneConfig
|
||||
from .layer import BoneLayer, BoneLinear
|
||||
|
||||
|
||||
@ -83,25 +72,8 @@ class BoneModel(BaseTuner):
|
||||
"""
|
||||
|
||||
prefix: str = "bone_"
|
||||
|
||||
def _check_new_adapter_config(self, config: BoneConfig) -> None:
|
||||
"""
|
||||
A helper method to check the config when a new adapter is being added.
|
||||
|
||||
Raise a ValueError if there is something wrong with the config or if it conflicts with existing adapters.
|
||||
|
||||
"""
|
||||
# TODO: there should be a check if any of the existing adapters actually has bias != "none", or else the check
|
||||
# does not fully correspond to the error message.
|
||||
if (len(self.peft_config) > 1) and (config.bias != "none"):
|
||||
raise ValueError(
|
||||
f"{self.__class__.__name__} supports only 1 adapter with bias. When using multiple adapters, "
|
||||
"set bias to 'none' for all adapters."
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
def _check_target_module_exists(bone_config, key):
|
||||
return check_target_module_exists(bone_config, key)
|
||||
tuner_layer_cls = BoneLayer
|
||||
target_module_mapping = TRANSFORMERS_MODELS_TO_BONE_TARGET_MODULES_MAPPING
|
||||
|
||||
def _create_and_replace(
|
||||
self,
|
||||
@ -137,55 +109,6 @@ class BoneModel(BaseTuner):
|
||||
init_weights=bone_config.init_weights,
|
||||
)
|
||||
|
||||
def _replace_module(self, parent, child_name, new_module, child):
|
||||
setattr(parent, child_name, new_module)
|
||||
# It's not necessary to set requires_grad here, as that is handled by
|
||||
# _mark_only_adapters_as_trainable
|
||||
|
||||
# child layer wraps the original module, unpack it
|
||||
if hasattr(child, "base_layer"):
|
||||
child = child.base_layer
|
||||
|
||||
if not hasattr(new_module, "base_layer"):
|
||||
new_module.weight = child.weight
|
||||
if hasattr(child, "bias"):
|
||||
new_module.bias = child.bias
|
||||
|
||||
if getattr(child, "state", None) is not None:
|
||||
if hasattr(new_module, "base_layer"):
|
||||
new_module.base_layer.state = child.state
|
||||
else:
|
||||
new_module.state = child.state
|
||||
new_module.to(child.weight.device)
|
||||
|
||||
meta = torch.device("meta")
|
||||
# dispatch to correct device
|
||||
for name, module in new_module.named_modules():
|
||||
if self.prefix in name:
|
||||
if not any(p.device == meta for p in module.parameters()):
|
||||
module.to(child.weight.device)
|
||||
|
||||
def _mark_only_adapters_as_trainable(self, model: nn.Module) -> None:
|
||||
for n, p in model.named_parameters():
|
||||
if self.prefix not in n:
|
||||
p.requires_grad = False
|
||||
|
||||
for active_adapter in self.active_adapters:
|
||||
bias = self.peft_config[active_adapter].bias
|
||||
if bias == "none":
|
||||
continue
|
||||
|
||||
if bias == "all":
|
||||
for n, p in model.named_parameters():
|
||||
if "bias" in n:
|
||||
p.requires_grad = True
|
||||
elif bias == "bone_only":
|
||||
for name, m in model.named_modules():
|
||||
if isinstance(m, BoneLayer) and hasattr(m, "bias") and m.bias is not None:
|
||||
m.bias.requires_grad = True
|
||||
else:
|
||||
raise NotImplementedError(f"Requested bias: {bias}, is not implemented.")
|
||||
|
||||
@staticmethod
|
||||
def _create_new_module(bone_config, adapter_name, target, **kwargs):
|
||||
if isinstance(target, BaseTunerLayer):
|
||||
@ -201,138 +124,3 @@ class BoneModel(BaseTuner):
|
||||
)
|
||||
|
||||
return new_module
|
||||
|
||||
def __getattr__(self, name: str):
|
||||
"""Forward missing attributes to the wrapped module."""
|
||||
try:
|
||||
return super().__getattr__(name) # defer to nn.Module's logic
|
||||
except AttributeError:
|
||||
if name == "base_model":
|
||||
raise
|
||||
return getattr(self.model, name)
|
||||
|
||||
def get_peft_config_as_dict(self, inference: bool = False):
|
||||
config_dict = {}
|
||||
for key, value in self.peft_config.items():
|
||||
config = {k: v.value if isinstance(v, Enum) else v for k, v in asdict(value).items()}
|
||||
if inference:
|
||||
config["inference_mode"] = True
|
||||
config_dict[key] = config
|
||||
return config
|
||||
|
||||
def _set_adapter_layers(self, enabled=True):
|
||||
for module in self.model.modules():
|
||||
if isinstance(module, (BaseTunerLayer, ModulesToSaveWrapper)):
|
||||
module.enable_adapters(enabled)
|
||||
|
||||
def enable_adapter_layers(self):
|
||||
self._set_adapter_layers(enabled=True)
|
||||
|
||||
def disable_adapter_layers(self):
|
||||
for active_adapter in self.active_adapters:
|
||||
val = self.peft_config[active_adapter].bias
|
||||
if val != "none":
|
||||
msg = (
|
||||
f"Careful, disabling adapter layers with bias configured to be '{val}' does not produce the same "
|
||||
"output as the base model would without adaption."
|
||||
)
|
||||
warnings.warn(msg)
|
||||
self._set_adapter_layers(enabled=False)
|
||||
|
||||
def set_adapter(self, adapter_name, inference_mode: bool = False):
|
||||
self.set_auxiliary_adapters(adapter_name, inference_mode=inference_mode)
|
||||
for module in self.model.modules():
|
||||
if isinstance(module, BoneLayer):
|
||||
if module.merged:
|
||||
warnings.warn("Adapter cannot be set when the model is merged. Unmerging the model first.")
|
||||
module.unmerge()
|
||||
module.set_adapter(adapter_name, inference_mode=inference_mode)
|
||||
self.active_adapter = adapter_name
|
||||
|
||||
@staticmethod
|
||||
def _prepare_adapter_config(peft_config, model_config):
|
||||
if peft_config.target_modules is None:
|
||||
if model_config["model_type"] not in TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING:
|
||||
raise ValueError("Please specify `target_modules` in `peft_config`")
|
||||
peft_config.target_modules = set(
|
||||
TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING[model_config["model_type"]]
|
||||
)
|
||||
return peft_config
|
||||
|
||||
def _unload_and_optionally_merge(
|
||||
self,
|
||||
merge=True,
|
||||
progressbar: bool = False,
|
||||
safe_merge: bool = False,
|
||||
adapter_names: Optional[list[str]] = None,
|
||||
):
|
||||
self._unloading_checks(adapter_names)
|
||||
key_list = [key for key, _ in self.model.named_modules() if self.prefix not in key]
|
||||
desc = "Unloading " + ("and merging " if merge else "") + "model"
|
||||
for key in tqdm(key_list, disable=not progressbar, desc=desc):
|
||||
try:
|
||||
parent, target, target_name = _get_submodules(self.model, key)
|
||||
except AttributeError:
|
||||
continue
|
||||
|
||||
if hasattr(target, "base_layer"):
|
||||
if merge:
|
||||
target.merge(safe_merge=safe_merge, adapter_names=adapter_names)
|
||||
self._replace_module(parent, target_name, target.get_base_layer(), target)
|
||||
elif isinstance(target, ModulesToSaveWrapper):
|
||||
# save any additional trainable modules part of `modules_to_save`
|
||||
setattr(parent, target_name, target.modules_to_save[target.active_adapter])
|
||||
|
||||
return self.model
|
||||
|
||||
def delete_adapter(self, adapter_name: str) -> None:
|
||||
"""
|
||||
Deletes an existing adapter.
|
||||
|
||||
Args:
|
||||
adapter_name (str): Name of the adapter to be deleted.
|
||||
"""
|
||||
if adapter_name not in list(self.peft_config.keys()):
|
||||
raise ValueError(f"Adapter {adapter_name} does not exist")
|
||||
del self.peft_config[adapter_name]
|
||||
|
||||
key_list = [key for key, _ in self.model.named_modules() if self.prefix not in key]
|
||||
new_adapter = None
|
||||
for key in key_list:
|
||||
_, target, _ = _get_submodules(self.model, key)
|
||||
if isinstance(target, BoneLayer):
|
||||
target.delete_adapter(adapter_name)
|
||||
if new_adapter is None:
|
||||
new_adapter = target.active_adapters[:]
|
||||
|
||||
self.active_adapter = new_adapter or []
|
||||
self._delete_auxiliary_adapter(adapter_name, new_active_adapters=new_adapter)
|
||||
|
||||
def merge_and_unload(
|
||||
self, progressbar: bool = False, safe_merge: bool = False, adapter_names: Optional[list[str]] = None
|
||||
) -> torch.nn.Module:
|
||||
r"""
|
||||
This method merges the Bone layers into the base model. This is needed if someone wants to use the base model
|
||||
as a standalone model.
|
||||
|
||||
Args:
|
||||
progressbar (`bool`):
|
||||
whether to show a progressbar indicating the unload and merge process
|
||||
safe_merge (`bool`):
|
||||
whether to activate the safe merging check to check if there is any potential Nan in the adapter
|
||||
weights
|
||||
adapter_names (`List[str]`, *optional*):
|
||||
The list of adapter names that should be merged. If None, all active adapters will be merged. Defaults
|
||||
to `None`.
|
||||
|
||||
"""
|
||||
return self._unload_and_optionally_merge(
|
||||
progressbar=progressbar, safe_merge=safe_merge, adapter_names=adapter_names
|
||||
)
|
||||
|
||||
def unload(self) -> torch.nn.Module:
|
||||
"""
|
||||
Gets back the base model by removing all the bone modules without merging. This gives back the original base
|
||||
model.
|
||||
"""
|
||||
return self._unload_and_optionally_merge(merge=False)
|
||||
|
@ -14,23 +14,15 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
import warnings
|
||||
from dataclasses import asdict
|
||||
from enum import Enum
|
||||
from itertools import chain
|
||||
from typing import Optional
|
||||
|
||||
import torch
|
||||
from tqdm import tqdm
|
||||
|
||||
from peft.tuners.tuners_utils import BaseTuner, BaseTunerLayer, check_target_module_exists
|
||||
from peft.tuners.tuners_utils import BaseTuner, BaseTunerLayer
|
||||
from peft.utils import (
|
||||
TRANSFORMERS_MODELS_TO_C3A_TARGET_MODULES_MAPPING,
|
||||
ModulesToSaveWrapper,
|
||||
_get_submodules,
|
||||
)
|
||||
|
||||
from .config import C3AConfig
|
||||
from .layer import C3ALayer, C3ALinear
|
||||
|
||||
|
||||
@ -54,25 +46,8 @@ class C3AModel(BaseTuner):
|
||||
"""
|
||||
|
||||
prefix: str = "c3a_"
|
||||
|
||||
def _check_new_adapter_config(self, config: C3AConfig) -> None:
|
||||
"""
|
||||
A helper method to check the config when a new adapter is being added.
|
||||
|
||||
Raise a ValueError if there is something wrong with the config or if it conflicts with existing adapters.
|
||||
|
||||
"""
|
||||
# TODO: there should be a check if any of the existing adapters actually has bias != "none", or else the check
|
||||
# does not fully correspond to the error message.
|
||||
if (len(self.peft_config) > 1) and (config.bias != "none"):
|
||||
raise ValueError(
|
||||
f"{self.__class__.__name__} supports only 1 adapter with bias. When using multiple adapters, "
|
||||
"set bias to 'none' for all adapters."
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
def _check_target_module_exists(c3a_config, key):
|
||||
return check_target_module_exists(c3a_config, key)
|
||||
tuner_layer_cls = C3ALayer
|
||||
target_module_mapping = TRANSFORMERS_MODELS_TO_C3A_TARGET_MODULES_MAPPING
|
||||
|
||||
def _create_and_replace(
|
||||
self,
|
||||
@ -109,55 +84,6 @@ class C3AModel(BaseTuner):
|
||||
new_module.requires_grad_(False)
|
||||
self._replace_module(parent, target_name, new_module, target)
|
||||
|
||||
def _replace_module(self, parent, child_name, new_module, child):
|
||||
setattr(parent, child_name, new_module)
|
||||
# It's not necessary to set requires_grad here, as that is handled by
|
||||
# _mark_only_adapters_as_trainable
|
||||
|
||||
# child layer wraps the original module, unpack it
|
||||
if hasattr(child, "base_layer"):
|
||||
child = child.base_layer
|
||||
|
||||
if not hasattr(new_module, "base_layer"):
|
||||
new_module.weight = child.weight
|
||||
if hasattr(child, "bias"):
|
||||
new_module.bias = child.bias
|
||||
|
||||
if getattr(child, "state", None) is not None:
|
||||
if hasattr(new_module, "base_layer"):
|
||||
new_module.base_layer.state = child.state
|
||||
else:
|
||||
new_module.state = child.state
|
||||
new_module.to(child.weight.device)
|
||||
|
||||
meta = torch.device("meta")
|
||||
# dispatch to correct device
|
||||
for name, module in new_module.named_modules():
|
||||
if self.prefix in name:
|
||||
if not any(p.device == meta for p in module.parameters()):
|
||||
module.to(child.weight.device)
|
||||
|
||||
def _mark_only_adapters_as_trainable(self, model: torch.nn.Module) -> None:
|
||||
for n, p in model.named_parameters():
|
||||
if self.prefix not in n:
|
||||
p.requires_grad = False
|
||||
|
||||
for active_adapter in self.active_adapters:
|
||||
bias = self.peft_config[active_adapter].bias
|
||||
if bias == "none":
|
||||
continue
|
||||
|
||||
if bias == "all":
|
||||
for n, p in model.named_parameters():
|
||||
if "bias" in n:
|
||||
p.requires_grad = True
|
||||
elif bias == "c3a_only":
|
||||
for m in model.modules():
|
||||
if isinstance(m, C3ALayer) and hasattr(m, "bias") and m.bias is not None:
|
||||
m.bias.requires_grad = True
|
||||
else:
|
||||
raise NotImplementedError(f"Requested bias: {bias}, is not implemented.")
|
||||
|
||||
@staticmethod
|
||||
def _create_new_module(c3a_config, adapter_name, target, **kwargs):
|
||||
if isinstance(target, BaseTunerLayer):
|
||||
@ -169,127 +95,3 @@ class C3AModel(BaseTuner):
|
||||
new_module = C3ALinear(target, adapter_name, **kwargs)
|
||||
|
||||
return new_module
|
||||
|
||||
def __getattr__(self, name: str):
|
||||
"""Forward missing attributes to the wrapped module."""
|
||||
try:
|
||||
return super().__getattr__(name) # defer to nn.Module's logic
|
||||
except AttributeError:
|
||||
return getattr(self.model, name)
|
||||
|
||||
def get_peft_config_as_dict(self, inference: bool = False):
|
||||
config_dict = {}
|
||||
for key, value in self.peft_config.items():
|
||||
config = {k: v.value if isinstance(v, Enum) else v for k, v in asdict(value).items()}
|
||||
if inference:
|
||||
config["inference_mode"] = True
|
||||
config_dict[key] = config
|
||||
return config
|
||||
|
||||
def _set_adapter_layers(self, enabled: bool = True) -> None:
|
||||
for module in self.model.modules():
|
||||
if isinstance(module, (BaseTunerLayer, ModulesToSaveWrapper)):
|
||||
module.enable_adapters(enabled)
|
||||
|
||||
def enable_adapter_layers(self) -> None:
|
||||
"""Enable all adapters.
|
||||
|
||||
Call this if you have previously disabled all adapters and want to re-enable them.
|
||||
"""
|
||||
self._set_adapter_layers(enabled=True)
|
||||
|
||||
def disable_adapter_layers(self) -> None:
|
||||
"""Disable all adapters.
|
||||
|
||||
When disabling all adapters, the model output corresponds to the output of the base model.
|
||||
"""
|
||||
for active_adapter in self.active_adapters:
|
||||
val = self.peft_config[active_adapter].bias
|
||||
if val != "none":
|
||||
msg = (
|
||||
f"Careful, disabling adapter layers with bias configured to be '{val}' does not produce the same "
|
||||
"output as the the base model would without adaption."
|
||||
)
|
||||
warnings.warn(msg)
|
||||
self._set_adapter_layers(enabled=False)
|
||||
|
||||
def set_adapter(self, adapter_name: str | list[str], inference_mode: bool = False) -> None:
|
||||
"""Set the active adapter(s).
|
||||
|
||||
Args:
|
||||
adapter_name (`str` or `list[str]`):
|
||||
Name(s) of the adapter(s) to be activated.
|
||||
inference_mode (bool, optional):
|
||||
Whether the activated adapter should be frozen (i.e. `requires_grad=False`). Default is False.
|
||||
"""
|
||||
self.set_auxiliary_adapters(adapter_name, inference_mode=inference_mode)
|
||||
for module in self.model.modules():
|
||||
if isinstance(module, C3ALayer):
|
||||
if module.merged:
|
||||
warnings.warn("Adapter cannot be set when the model is merged. Unmerging the model first.")
|
||||
module.unmerge()
|
||||
module.set_adapter(adapter_name, inference_mode=inference_mode)
|
||||
self.active_adapter = adapter_name
|
||||
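The `inference_mode` flag added to `set_adapter` lets an adapter be activated without making it trainable. A minimal, hedged sketch (the `peft_model` variable, the adapter name, and the `"c3a_"` parameter prefix are placeholders/assumptions, not taken from this diff):

```python
# Activate the "default" C3A adapter for evaluation only; per the docstring
# above, inference_mode=True freezes the adapter (requires_grad=False).
peft_model.base_model.set_adapter("default", inference_mode=True)
assert not any(
    p.requires_grad for n, p in peft_model.named_parameters() if "c3a_" in n
)
```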
|
||||
@staticmethod
|
||||
def _prepare_adapter_config(peft_config, model_config):
|
||||
if peft_config.target_modules is None:
|
||||
if model_config["model_type"] not in TRANSFORMERS_MODELS_TO_C3A_TARGET_MODULES_MAPPING:
|
||||
raise ValueError("Please specify `target_modules` in `peft_config`")
|
||||
peft_config.target_modules = set(
|
||||
TRANSFORMERS_MODELS_TO_C3A_TARGET_MODULES_MAPPING[model_config["model_type"]]
|
||||
)
|
||||
return peft_config
|
||||
|
||||
def _unload_and_optionally_merge(
|
||||
self,
|
||||
merge=True,
|
||||
progressbar: bool = False,
|
||||
safe_merge: bool = False,
|
||||
adapter_names: Optional[list[str]] = None,
|
||||
):
|
||||
key_list = [key for key, _ in self.model.named_modules() if self.prefix not in key]
|
||||
desc = "Unloading " + ("and merging " if merge else "") + "model"
|
||||
for key in tqdm(key_list, disable=not progressbar, desc=desc):
|
||||
try:
|
||||
parent, target, target_name = _get_submodules(self.model, key)
|
||||
except AttributeError:
|
||||
continue
|
||||
|
||||
if hasattr(target, "base_layer"):
|
||||
if merge:
|
||||
target.merge(safe_merge=safe_merge, adapter_names=adapter_names)
|
||||
self._replace_module(parent, target_name, target.get_base_layer(), target)
|
||||
elif isinstance(target, ModulesToSaveWrapper):
|
||||
# save any additional trainable modules part of `modules_to_save`
|
||||
setattr(parent, target_name, target.modules_to_save[target.active_adapter])
|
||||
|
||||
return self.model
|
||||
|
||||
def merge_and_unload(
|
||||
self, progressbar: bool = False, safe_merge: bool = False, adapter_names: Optional[list[str]] = None
|
||||
) -> torch.nn.Module:
|
||||
r"""
|
||||
This method merges the C3A layers into the base model. This is needed if someone wants to use the base model as
|
||||
a standalone model.
|
||||
|
||||
Args:
|
||||
progressbar (`bool`):
|
||||
whether to show a progressbar indicating the unload and merge process
|
||||
safe_merge (`bool`):
|
||||
whether to activate the safe merging check to check if there is any potential Nan in the adapter
|
||||
weights
|
||||
adapter_names (`list[str]`, *optional*):
|
||||
The list of adapter names that should be merged. If None, all active adapters will be merged. Defaults
|
||||
to `None`.
|
||||
"""
|
||||
return self._unload_and_optionally_merge(
|
||||
progressbar=progressbar, safe_merge=safe_merge, adapter_names=adapter_names
|
||||
)
|
||||
|
||||
def unload(self) -> torch.nn.Module:
|
||||
"""
|
||||
Gets back the base model by removing all the C3A modules without merging. This gives back the original base
|
||||
model.
|
||||
"""
|
||||
return self._unload_and_optionally_merge(merge=False)
|
||||
|
23
src/peft/tuners/delora/__init__.py
Normal file
@ -0,0 +1,23 @@
|
||||
# Copyright 2025-present the HuggingFace Inc. team.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
from peft.utils import register_peft_method
|
||||
|
||||
from .config import DeloraConfig
|
||||
from .layer import DeloraLayer, DeloraLinear
|
||||
from .model import DeloraModel
|
||||
|
||||
|
||||
__all__ = ["DeloraConfig", "DeloraLayer", "DeloraLinear", "DeloraModel"]
|
||||
|
||||
register_peft_method(name="delora", model_cls=DeloraModel, config_cls=DeloraConfig)
|
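Since `register_peft_method` wires DeLoRA into the normal PEFT machinery, the adapter can be applied through the usual entry points. A hedged usage sketch (the base model id and hyperparameters are illustrative, not taken from this diff):

```python
from transformers import AutoModelForCausalLM

from peft import get_peft_model
from peft.tuners.delora import DeloraConfig

# Wrap a small causal LM with a DeLoRA adapter on its attention projections.
base = AutoModelForCausalLM.from_pretrained("facebook/opt-125m")
config = DeloraConfig(r=8, delora_lambda=15, target_modules=["q_proj", "v_proj"])
peft_model = get_peft_model(base, config)
peft_model.print_trainable_parameters()
```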
154
src/peft/tuners/delora/config.py
Normal file
@ -0,0 +1,154 @@
|
||||
# Copyright 2025-present the HuggingFace Inc. team.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Optional, Union
|
||||
|
||||
from peft.config import PeftConfig
|
||||
from peft.utils import PeftType
|
||||
|
||||
|
||||
@dataclass
|
||||
class DeloraConfig(PeftConfig):
|
||||
"""
|
||||
This is the configuration class to store the configuration of a [`DeloraModel`].
|
||||
|
||||
Args:
|
||||
r (`int`):
|
||||
The rank of the DeLoRA adapter.
|
||||
delora_lambda (`int`):
|
||||
The initial value of the boundary of the DeLoRA adapter. This variable sets an upper bound to the Frobenius
|
||||
norm of the weight change, preventing the finetuned model from deviating too much from the original model.
|
||||
module_dropout (`float`):
|
||||
The dropout probability for disabling DeLoRA modules during training.
|
||||
target_modules (`Optional[Union[List[str], str]]`):
|
||||
The names of the modules to apply the adapter to. If this is specified, only the modules with the specified
|
||||
names will be replaced. When passing a string, a regex match will be performed. When passing a list of
|
||||
strings, either an exact match will be performed or it is checked if the name of the module ends with any
|
||||
of the passed strings. If this is specified as 'all-linear', then all linear/Conv1D modules are chosen,
|
||||
excluding the output layer. If this is not specified, modules will be chosen according to the model
|
||||
architecture. If the architecture is not known, an error will be raised -- in this case, you should specify
|
||||
the target modules manually.
|
||||
exclude_modules (`Optional[Union[List[str], str]]`):
|
||||
The names of the modules to not apply the adapter. When passing a string, a regex match will be performed.
|
||||
When passing a list of strings, either an exact match will be performed or it is checked if the name of the
|
||||
module ends with any of the passed strings.
|
||||
bias (`str`):
|
||||
Bias type for DeLoRA. Can be 'none', 'all' or 'delora_only'. If 'all' or 'delora_only', the corresponding
|
||||
biases will be updated during training. Be aware that this means that, even when disabling the adapters,
|
||||
the model will not produce the same output as the base model would have without adaptation.
|
||||
init_weights (`bool`):
|
||||
Whether to perform initialization of adapter weights. If `True` (default): A is initialized with kaiming
|
||||
uniform initialization, while B is initialized with zeros. If `False`: A and B are both initialized with
|
||||
kaiming uniform, immediately contributing a non-zero delta. This is generally discouraged for normal use.
|
||||
layers_to_transform (`Union[List[int], int]`):
|
||||
The layer indices to transform. If a list of ints is passed, it will apply the adapter to the layer indices
|
||||
that are specified in this list. If a single integer is passed, it will apply the transformations on the
|
||||
layer at this index.
|
||||
layers_pattern (`Optional[Union[List[str], str]]`):
|
||||
The layer pattern name, used only if `layers_to_transform` is different from `None`. This should target the
|
||||
`nn.ModuleList` of the model, which is often called `'layers'` or `'h'`.
|
||||
rank_pattern (`dict`):
|
||||
The mapping from layer names or regexp expression to ranks which are different from the default rank
|
||||
specified by `r`. For example, `{'^model.decoder.layers.0.encoder_attn.k_proj': 16}`.
|
||||
lambda_pattern (`dict`):
|
||||
The mapping from layer names or regexp expression to lambdas which are different from the default lambda
|
||||
specified by `delora_lambda`. For example, `{'^model.decoder.layers.0.encoder_attn.k_proj': 16}`.
|
||||
modules_to_save (`Optional[List[str]]`):
|
||||
List of modules apart from adapter layers to be set as trainable and saved in the final checkpoint.
|
||||
"""
|
||||
|
||||
r: int = field(default=8, metadata={"help": "DeLoRA rank"})
|
||||
delora_lambda: int = field(
|
||||
default=15,
|
||||
metadata={
|
||||
"help": "The initial value of the boundary of the DeLoRA adapter. This variable sets an upper bound to the "
|
||||
"Frobenius norm of the weight change, avoiding the finetuned model to deviate too much from the original model."
|
||||
},
|
||||
)
|
||||
module_dropout: float = field(
|
||||
default=0.0, metadata={"help": "The dropout probability for disabling DeLoRA modules during training"}
|
||||
)
|
||||
target_modules: Optional[Union[list[str], str]] = field(
|
||||
default=None,
|
||||
metadata={
|
||||
"help": "List of module names or regex expression of the module names to replace with DeLoRA."
|
||||
"For example, ['q', 'v'] or '.*decoder.*(SelfAttention|EncDecAttention).*(q|v)$' "
|
||||
"This can also be a wildcard 'all-linear' which matches all linear layers except the output layer."
|
||||
},
|
||||
)
|
||||
exclude_modules: Optional[Union[list[str], str]] = field(
|
||||
default=None,
|
||||
metadata={"help": "List of module names or regex expression of the module names to exclude from DeLoRA."},
|
||||
)
|
||||
bias: str = field(default="none", metadata={"help": "Bias type for DeLoRA. Can be 'none' or 'all'"})
|
||||
init_weights: bool = field(
|
||||
default=True,
|
||||
metadata={
|
||||
"help": "Whether to perform initialization of adapter weights. If `True` (default): A is initialized with kaiming uniform "
|
||||
"initialization, while B is initialized with zeros. If `False`: A and B are both initialized with kaiming uniform, "
|
||||
"immediately contributing a non-zero delta. This is generally discouraged for normal use."
|
||||
},
|
||||
)
|
||||
layers_to_transform: Optional[Union[list[int], int]] = field(
|
||||
default=None,
|
||||
metadata={
|
||||
"help": "The layer indexes to transform, is this argument is specified, PEFT will transform only the layers indexes that "
|
||||
"are specified inside this list. If a single integer is passed, PEFT will transform only the layer at this index."
|
||||
},
|
||||
)
|
||||
layers_pattern: Optional[Union[list[str], str]] = field(
|
||||
default=None,
|
||||
metadata={
|
||||
"help": "The layer pattern name, used only if `layers_to_transform` is different to None and if the layer pattern is not in the "
|
||||
"common layers pattern. This should target the `nn.ModuleList` of the model, which is often called `'layers'` or `'h'`."
|
||||
},
|
||||
)
|
||||
rank_pattern: Optional[dict] = field(
|
||||
default_factory=dict,
|
||||
metadata={
|
||||
"help": "The mapping from layer names or regexp expression to ranks which are different from the default rank specified "
|
||||
"by `r`. For example, `{'^model.decoder.layers.0.encoder_attn.k_proj': 16}`."
|
||||
},
|
||||
)
|
||||
lambda_pattern: Optional[dict] = field(
|
||||
default_factory=dict,
|
||||
metadata={
|
||||
"help": "The mapping from layer names or regexp expression to lambdas which are different from the default lambda specified by `delora_lambda`."
|
||||
},
|
||||
)
|
||||
modules_to_save: Optional[list[str]] = field(
|
||||
default=None,
|
||||
metadata={
|
||||
"help": "List of modules apart from DeLoRA layers to be set as trainable and saved in the final checkpoint. "
|
||||
"For example, in Sequence Classification or Token Classification tasks, the final layer `classifier/score` "
|
||||
"are randomly initialized and as such need to be trainable and saved."
|
||||
},
|
||||
)
|
||||
|
||||
def __post_init__(self):
|
||||
super().__post_init__()
|
||||
# PeftType enum members are uppercase; use DELORA
|
||||
self.peft_type = PeftType.DELORA
|
||||
self.target_modules = (
|
||||
set(self.target_modules) if isinstance(self.target_modules, list) else self.target_modules
|
||||
)
|
||||
# if target_modules is a regex expression, then layers_to_transform should be None
|
||||
if isinstance(self.target_modules, str) and self.layers_to_transform is not None:
|
||||
raise ValueError("`layers_to_transform` cannot be used when `target_modules` is a str.")
|
||||
|
||||
# check for layers_to_transform and layers_pattern
|
||||
if self.layers_pattern and not self.layers_to_transform:
|
||||
raise ValueError("When `layers_pattern` is specified, `layers_to_transform` must also be specified. ")
|
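As described in the docstring, `rank_pattern` and `lambda_pattern` allow per-module overrides of `r` and `delora_lambda`. A hedged sketch (the regex targets a hypothetical decoder layer name):

```python
from peft.tuners.delora import DeloraConfig

# Layers matching the pattern get rank 16 and a larger boundary; everything
# else falls back to r=8 / delora_lambda=15.
config = DeloraConfig(
    r=8,
    delora_lambda=15,
    target_modules=["q_proj", "v_proj"],
    rank_pattern={"^model.decoder.layers.0.self_attn.q_proj": 16},
    lambda_pattern={"^model.decoder.layers.0.self_attn.q_proj": 30},
)
```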
269
src/peft/tuners/delora/layer.py
Normal file
@ -0,0 +1,269 @@
|
||||
# Copyright 2025-present the HuggingFace Inc. team.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
from __future__ import annotations
|
||||
|
||||
import math
|
||||
import warnings
|
||||
from typing import Any, Optional
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
|
||||
from peft.tuners._buffer_dict import BufferDict
|
||||
from peft.tuners.tuners_utils import BaseTunerLayer, check_adapters_to_merge
|
||||
|
||||
|
||||
class DeloraLayer(BaseTunerLayer):
|
||||
# All names of layers that may contain (trainable) adapter weights
|
||||
adapter_layer_names = (
|
||||
"delora_A",
|
||||
"delora_B",
|
||||
"delora_lambda",
|
||||
)
|
||||
# All names of other parameters that may contain adapter-related parameters
|
||||
other_param_names = (
|
||||
"r",
|
||||
"module_dropout",
|
||||
"delora_w_norm",
|
||||
)
|
||||
|
||||
def __init__(self, base_layer: nn.Module, **kwargs) -> None:
|
||||
self.base_layer = base_layer
|
||||
self.r = {}
|
||||
self.module_dropout = nn.ModuleDict({})
|
||||
self.delora_A = nn.ParameterDict({})
|
||||
self.delora_B = nn.ParameterDict({})
|
||||
self.delora_lambda = nn.ParameterDict({})
|
||||
# Use persistent buffers so they are included in state_dict and saved.
|
||||
self.delora_w_norm = BufferDict({}, persistent=True)
|
||||
# Mark the weight as unmerged
|
||||
self._disable_adapters = False
|
||||
self.merged_adapters = []
|
||||
self.kwargs = kwargs
|
||||
|
||||
base_layer_mod = self.get_base_layer()
|
||||
if isinstance(base_layer_mod, nn.Linear):
|
||||
self.in_features, self.out_features = base_layer_mod.in_features, base_layer_mod.out_features
|
||||
else:
|
||||
raise ValueError(f"Unsupported layer type {type(base_layer_mod)}")
|
||||
|
||||
@staticmethod
|
||||
def _compute_delta(
|
||||
A: torch.Tensor, B: torch.Tensor, delora_lambda: torch.Tensor, r: int, w_norm: torch.Tensor
|
||||
) -> torch.Tensor:
|
||||
"""Compute delta = B @ diag(delora_lambda/r / (||A_i||*||B^j||)) @ A, scaled by provided w_norm (per-input channel)"""
|
||||
An = torch.clamp(A.norm(dim=1), min=1e-4)
|
||||
Bn = torch.clamp(B.norm(dim=0), min=1e-4)
|
||||
diag = torch.diag_embed(delora_lambda / r / (An * Bn))
|
||||
delta = B @ diag @ A
|
||||
delta = delta * w_norm.unsqueeze(0)
|
||||
return delta
|
||||
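Because every rank-one term in `_compute_delta` is normalized to unit Frobenius norm and scaled by `delora_lambda / r`, the Frobenius norm of the dense update (before the per-column `w_norm` scaling) cannot exceed `delora_lambda`. A small, self-contained numerical check of that bound (toy shapes, not part of this diff):

```python
import torch

r, in_features, out_features, lam = 4, 32, 16, 15.0
A = torch.randn(r, in_features)
B = torch.randn(out_features, r)

An = torch.clamp(A.norm(dim=1), min=1e-4)
Bn = torch.clamp(B.norm(dim=0), min=1e-4)
delta = B @ torch.diag(lam / r / (An * Bn)) @ A

# Triangle inequality over r unit-norm rank-one terms, each scaled by lam/r.
assert delta.norm(p="fro") <= lam + 1e-5
```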
|
||||
def get_delta_weight(self, adapter: str) -> torch.Tensor:
|
||||
if adapter not in self.delora_A or adapter not in self.delora_B:
|
||||
raise ValueError(f"Adapter {adapter} not found.")
|
||||
|
||||
delta = self._compute_delta(
|
||||
self.delora_A[adapter],
|
||||
self.delora_B[adapter],
|
||||
self.delora_lambda[adapter],
|
||||
self.r[adapter],
|
||||
self.delora_w_norm[adapter],
|
||||
)
|
||||
return delta
|
||||
|
||||
def update_layer(
|
||||
self,
|
||||
adapter_name: str,
|
||||
r: int,
|
||||
delora_lambda: float,
|
||||
module_dropout: float,
|
||||
init_weights: bool = True,
|
||||
inference_mode: bool = False,
|
||||
**kwargs: Any,
|
||||
) -> None:
|
||||
"""Internal function to create delora adapter
|
||||
|
||||
Args:
|
||||
adapter_name (`str`): Name for the adapter to add.
|
||||
r (`int`): Rank for the added adapter.
|
||||
delora_lambda (`float`): Boundary for the adapter's norm.
|
||||
module_dropout (`float`): The dropout probability for disabling adapter during training.
|
||||
init_weights (`bool`): Whether to initialize weights.
|
||||
"""
|
||||
if r <= 0:
|
||||
raise ValueError(f"`r` should be a positive integer value but the value passed is {r}")
|
||||
|
||||
self.r[adapter_name] = r
|
||||
self.delora_A[adapter_name] = nn.Parameter(torch.empty(r, self.in_features))
|
||||
self.delora_B[adapter_name] = nn.Parameter(torch.empty(self.out_features, r))
|
||||
self.delora_lambda[adapter_name] = nn.Parameter(torch.empty(1))
|
||||
if module_dropout > 0.0:
|
||||
module_dropout_layer = nn.Dropout(p=module_dropout)
|
||||
else:
|
||||
module_dropout_layer = nn.Identity()
|
||||
self.module_dropout.update(nn.ModuleDict({adapter_name: module_dropout_layer}))
|
||||
|
||||
# Initialize weights
|
||||
self.reset_delora_parameters(adapter_name, init_weights, delora_lambda)
|
||||
|
||||
# Move new weights to device
|
||||
self._move_adapter_to_device_of_base_layer(adapter_name)
|
||||
self.set_adapter(self.active_adapters, inference_mode=inference_mode)
|
||||
|
||||
def reset_delora_parameters(
|
||||
self,
|
||||
adapter_name: str,
|
||||
init_weights: bool = True,
|
||||
delora_lambda: float = 15.0,
|
||||
) -> None:
|
||||
if adapter_name not in self.delora_A.keys():
|
||||
return
|
||||
|
||||
if init_weights is True:
|
||||
nn.init.kaiming_uniform_(self.delora_A[adapter_name], a=math.sqrt(5))
|
||||
nn.init.zeros_(self.delora_B[adapter_name])
|
||||
else:
|
||||
nn.init.kaiming_uniform_(self.delora_A[adapter_name], a=math.sqrt(5))
|
||||
nn.init.kaiming_uniform_(self.delora_B[adapter_name], a=math.sqrt(5))
|
||||
|
||||
self.delora_lambda[adapter_name].data.fill_(float(delora_lambda))
|
||||
|
||||
# capture a fixed norm for this adapter to use for future delta computations
|
||||
with torch.no_grad():
|
||||
w = self.get_base_layer().weight
|
||||
if w.device.type != "meta":
|
||||
w_norm = torch.norm(w.data, dim=0).detach()
|
||||
else:
|
||||
# For meta tensors, we can't compute the norm, so use a default value
|
||||
w_norm = torch.ones(w.shape[1], device=w.device)
|
||||
self.delora_w_norm[adapter_name] = w_norm
|
||||
|
||||
|
||||
class DeloraLinear(nn.Module, DeloraLayer):
|
||||
# DeLoRA implemented in a dense layer
|
||||
def __init__(
|
||||
self,
|
||||
base_layer,
|
||||
adapter_name: str,
|
||||
r: int,
|
||||
delora_lambda: float,
|
||||
module_dropout: float,
|
||||
init_weights: bool = True,
|
||||
**kwargs,
|
||||
) -> None:
|
||||
super().__init__()
|
||||
DeloraLayer.__init__(self, base_layer, **kwargs)
|
||||
self._active_adapter = adapter_name
|
||||
self.update_layer(adapter_name, r, delora_lambda, module_dropout, init_weights)
|
||||
|
||||
def merge(self, safe_merge: bool = False, adapter_names: Optional[list[str]] = None) -> None:
|
||||
"""
|
||||
Merge the active adapter weights into the base weights
|
||||
|
||||
Args:
|
||||
safe_merge (`bool`, *optional*):
|
||||
If True, the merge operation will be performed in a copy of the original weights and check for NaNs
|
||||
before merging the weights. This is useful if you want to check if the merge operation will produce
|
||||
NaNs. Defaults to `False`.
|
||||
adapter_names (`list[str]`, *optional*):
|
||||
The list of adapter names that should be merged. If None, all active adapters will be merged. Defaults
|
||||
to `None`.
|
||||
"""
|
||||
adapter_names = check_adapters_to_merge(self, adapter_names)
|
||||
if not adapter_names:
|
||||
return
|
||||
|
||||
for active_adapter in adapter_names:
|
||||
if active_adapter in self.delora_A.keys():
|
||||
base_layer = self.get_base_layer()
|
||||
delta_weight = (
|
||||
self.get_delta_weight(active_adapter)
|
||||
.detach()
|
||||
.to(dtype=base_layer.weight.dtype, device=base_layer.weight.device)
|
||||
)
|
||||
with torch.no_grad():
|
||||
if safe_merge:
|
||||
orig_weights = base_layer.weight.data.clone()
|
||||
orig_weights = orig_weights + delta_weight
|
||||
|
||||
if not torch.isfinite(orig_weights).all():
|
||||
raise ValueError(
|
||||
f"NaNs detected in merged weights for adapter {active_adapter}; aborting merge"
|
||||
)
|
||||
|
||||
base_layer.weight.data = orig_weights
|
||||
else:
|
||||
base_layer.weight.data.add_(delta_weight)
|
||||
|
||||
self.merged_adapters.append(active_adapter)
|
||||
|
||||
def unmerge(self) -> None:
|
||||
"""
|
||||
Unmerge all merged adapter layers from the base weights.
|
||||
"""
|
||||
if not self.merged:
|
||||
warnings.warn("Already unmerged. Nothing to do.")
|
||||
return
|
||||
while len(self.merged_adapters) > 0:
|
||||
active_adapter = self.merged_adapters.pop()
|
||||
if active_adapter in self.delora_A.keys():
|
||||
self.get_base_layer().weight.data -= self.get_delta_weight(active_adapter)
|
||||
|
||||
def forward(self, x: torch.Tensor, *args: Any, **kwargs: Any) -> torch.Tensor:
|
||||
previous_dtype = x.dtype
|
||||
|
||||
if self.disable_adapters:
|
||||
if self.merged:
|
||||
self.unmerge()
|
||||
result = self.base_layer(x, *args, **kwargs)
|
||||
elif self.merged:
|
||||
result = self.base_layer(x, *args, **kwargs)
|
||||
else:
|
||||
if not self.active_adapters:
|
||||
return self.base_layer(x, *args, **kwargs).to(previous_dtype)
|
||||
|
||||
base_out = self.base_layer(x, *args, **kwargs)
|
||||
add_out = torch.zeros_like(base_out)
|
||||
|
||||
for adapter in self.active_adapters:
|
||||
if adapter not in self.delora_A:
|
||||
continue
|
||||
|
||||
x_d = self.module_dropout[adapter](x)
|
||||
|
||||
# Decomposed delta calculation
|
||||
# 1. (x * w_norm) @ A.T
|
||||
h = nn.functional.linear(x_d * self.delora_w_norm[adapter], self.delora_A[adapter])
|
||||
|
||||
# 2. h @ diag
|
||||
An = torch.clamp(self.delora_A[adapter].norm(dim=1), min=1e-4)
|
||||
Bn = torch.clamp(self.delora_B[adapter].norm(dim=0), min=1e-4)
|
||||
scaling = (self.delora_lambda[adapter] / self.r[adapter]) / (An * Bn)
|
||||
|
||||
h = h * scaling
|
||||
|
||||
# 3. h @ B.T
|
||||
h = nn.functional.linear(h, self.delora_B[adapter])
|
||||
|
||||
add_out += h
|
||||
|
||||
result = base_out + add_out.to(base_out.dtype)
|
||||
|
||||
result = result.to(previous_dtype)
|
||||
return result
|
||||
|
||||
def __repr__(self) -> str:
|
||||
rep = super().__repr__()
|
||||
return "delora." + rep
|
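The `forward` above never materializes the dense update; it applies `A`, the diagonal scaling, and `B` in sequence. A self-contained sanity sketch (toy shapes; the only thing assumed from this diff is the `_compute_delta` staticmethod) showing that the decomposed path reproduces `x @ delta.T`:

```python
import torch

from peft.tuners.delora.layer import DeloraLayer

r, in_features, out_features, lam = 4, 32, 16, 15.0
A = torch.randn(r, in_features)
B = torch.randn(out_features, r)
w_norm = torch.rand(in_features) + 0.5
x = torch.randn(2, in_features)

dense = DeloraLayer._compute_delta(A, B, torch.tensor([lam]), r, w_norm)

An = torch.clamp(A.norm(dim=1), min=1e-4)
Bn = torch.clamp(B.norm(dim=0), min=1e-4)
h = (x * w_norm) @ A.T              # step 1: (x * w_norm) @ A.T
h = h * (lam / r / (An * Bn))       # step 2: diagonal scaling
decomposed = h @ B.T                # step 3: h @ B.T

assert torch.allclose(decomposed, x @ dense.T, atol=1e-4)
```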
105
src/peft/tuners/delora/model.py
Normal file
@ -0,0 +1,105 @@
|
||||
# Copyright 2025-present the HuggingFace Inc. team.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
from __future__ import annotations
|
||||
|
||||
import torch
|
||||
|
||||
from peft.tuners.tuners_utils import BaseTuner, BaseTunerLayer
|
||||
from peft.utils import (
|
||||
TRANSFORMERS_MODELS_TO_DELORA_TARGET_MODULES_MAPPING,
|
||||
)
|
||||
from peft.utils.other import get_pattern_key
|
||||
|
||||
from .config import DeloraConfig
|
||||
from .layer import DeloraLayer, DeloraLinear
|
||||
|
||||
|
||||
class DeloraModel(BaseTuner):
|
||||
"""
|
||||
Creates DeLoRA model from a pretrained transformers model.
|
||||
|
||||
The method is described in detail in [TODO].
|
||||
|
||||
Args:
|
||||
model ([`torch.nn.Module`]): The model to be adapted.
|
||||
config ([`DeloraConfig`]): The configuration of the DeLoRA model.
|
||||
adapter_name (`str`): The name of the adapter, defaults to `"default"`.
|
||||
|
||||
Returns:
|
||||
`torch.nn.Module`: The DeLoRA model.
|
||||
|
||||
**Attributes**:
|
||||
- **model** ([`~transformers.PreTrainedModel`]) -- The model to be adapted.
|
||||
- **peft_config** ([`DeloraConfig`]): The configuration of the DeLoRA model.
|
||||
"""
|
||||
|
||||
prefix: str = "delora_"
|
||||
tuner_layer_cls = DeloraLayer
|
||||
target_module_mapping = TRANSFORMERS_MODELS_TO_DELORA_TARGET_MODULES_MAPPING
|
||||
|
||||
def _check_new_adapter_config(self, config: DeloraConfig) -> None:
|
||||
"""
|
||||
A helper method to check the config when a new adapter is being added.
|
||||
|
||||
Raise a ValueError if there is something wrong with the config or if it conflicts with existing adapters.
|
||||
|
||||
"""
|
||||
super()._check_new_adapter_config(config)
|
||||
|
||||
def _create_and_replace(
|
||||
self,
|
||||
delora_config,
|
||||
adapter_name,
|
||||
target,
|
||||
target_name,
|
||||
parent,
|
||||
current_key,
|
||||
**optional_kwargs,
|
||||
):
|
||||
if current_key is None:
|
||||
raise ValueError("Current Key shouldn't be `None`")
|
||||
|
||||
# Regexp matching - Find key which matches current target_name in patterns provided
|
||||
r_key = get_pattern_key(delora_config.rank_pattern.keys(), current_key)
|
||||
lambda_key = get_pattern_key(delora_config.lambda_pattern.keys(), current_key)
|
||||
r = delora_config.rank_pattern.get(r_key, delora_config.r)
|
||||
delora_lambda = delora_config.lambda_pattern.get(lambda_key, delora_config.delora_lambda)
|
||||
|
||||
kwargs = {
|
||||
"r": r,
|
||||
"delora_lambda": delora_lambda,
|
||||
"module_dropout": delora_config.module_dropout,
|
||||
"init_weights": delora_config.init_weights,
|
||||
}
|
||||
|
||||
if isinstance(target, DeloraLinear):
|
||||
target.update_layer(adapter_name, **kwargs)
|
||||
else:
|
||||
new_module = self._create_new_module(delora_config, adapter_name, target, **kwargs)
|
||||
if adapter_name != self.active_adapter:
|
||||
# adding an additional adapter: it is not automatically trainable
|
||||
new_module.requires_grad_(False)
|
||||
self._replace_module(parent, target_name, new_module, target)
|
||||
|
||||
@staticmethod
|
||||
def _create_new_module(delora_config, adapter_name, target, **kwargs):
|
||||
if isinstance(target, BaseTunerLayer):
|
||||
target_base_layer = target.get_base_layer()
|
||||
else:
|
||||
target_base_layer = target
|
||||
|
||||
if isinstance(target_base_layer, torch.nn.Linear):
|
||||
new_module = DeloraLinear(target, adapter_name, **kwargs)
|
||||
|
||||
return new_module
|
@ -15,23 +15,16 @@ from __future__ import annotations
|
||||
|
||||
import re
|
||||
import warnings
|
||||
from dataclasses import asdict
|
||||
from enum import Enum
|
||||
from itertools import chain
|
||||
from typing import Optional
|
||||
|
||||
import torch
|
||||
from tqdm import tqdm
|
||||
from transformers.pytorch_utils import Conv1D
|
||||
|
||||
from peft.tuners.tuners_utils import BaseTuner, BaseTunerLayer, check_target_module_exists
|
||||
from peft.tuners.tuners_utils import BaseTuner, BaseTunerLayer
|
||||
from peft.utils import (
|
||||
TRANSFORMERS_MODELS_TO_FOURIERFT_TARGET_MODULES_MAPPING,
|
||||
ModulesToSaveWrapper,
|
||||
_get_submodules,
|
||||
)
|
||||
|
||||
from .config import FourierFTConfig
|
||||
from .layer import FourierFTLayer, FourierFTLinear
|
||||
|
||||
|
||||
@ -57,25 +50,8 @@ class FourierFTModel(BaseTuner):
|
||||
"""
|
||||
|
||||
prefix: str = "fourierft_"
|
||||
|
||||
def _check_new_adapter_config(self, config: FourierFTConfig) -> None:
|
||||
"""
|
||||
A helper method to check the config when a new adapter is being added.
|
||||
|
||||
Raise a ValueError if there is something wrong with the config or if it conflicts with existing adapters.
|
||||
|
||||
"""
|
||||
# TODO: there should be a check if any of the existing adapters actually has bias != "none", or else the check
|
||||
# does not fully correspond to the error message.
|
||||
if (len(self.peft_config) > 1) and (config.bias != "none"):
|
||||
raise ValueError(
|
||||
f"{self.__class__.__name__} supports only 1 adapter with bias. When using multiple adapters, "
|
||||
"set bias to 'none' for all adapters."
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
def _check_target_module_exists(fourierft_config, key):
|
||||
return check_target_module_exists(fourierft_config, key)
|
||||
tuner_layer_cls = FourierFTLayer
|
||||
target_module_mapping = TRANSFORMERS_MODELS_TO_FOURIERFT_TARGET_MODULES_MAPPING
|
||||
|
||||
def _create_and_replace(
|
||||
self,
|
||||
@ -120,55 +96,6 @@ class FourierFTModel(BaseTuner):
|
||||
new_module.requires_grad_(False)
|
||||
self._replace_module(parent, target_name, new_module, target)
|
||||
|
||||
def _replace_module(self, parent, child_name, new_module, child):
|
||||
setattr(parent, child_name, new_module)
|
||||
# It's not necessary to set requires_grad here, as that is handled by
|
||||
# _mark_only_adapters_as_trainable
|
||||
|
||||
# child layer wraps the original module, unpack it
|
||||
if hasattr(child, "base_layer"):
|
||||
child = child.base_layer
|
||||
|
||||
if not hasattr(new_module, "base_layer"):
|
||||
new_module.weight = child.weight
|
||||
if hasattr(child, "bias"):
|
||||
new_module.bias = child.bias
|
||||
|
||||
if getattr(child, "state", None) is not None:
|
||||
if hasattr(new_module, "base_layer"):
|
||||
new_module.base_layer.state = child.state
|
||||
else:
|
||||
new_module.state = child.state
|
||||
new_module.to(child.weight.device)
|
||||
|
||||
meta = torch.device("meta")
|
||||
# dispatch to correct device
|
||||
for name, module in new_module.named_modules():
|
||||
if "fourierft_" in name:
|
||||
if not any(p.device == meta for p in module.parameters()):
|
||||
module.to(child.weight.device)
|
||||
|
||||
def _mark_only_adapters_as_trainable(self, model: torch.nn.Module) -> None:
|
||||
for n, p in model.named_parameters():
|
||||
if self.prefix not in n:
|
||||
p.requires_grad = False
|
||||
|
||||
for active_adapter in self.active_adapters:
|
||||
bias = self.peft_config[active_adapter].bias
|
||||
if bias == "none":
|
||||
continue
|
||||
|
||||
if bias == "all":
|
||||
for n, p in model.named_parameters():
|
||||
if "bias" in n:
|
||||
p.requires_grad = True
|
||||
elif bias == "fourier_only":
|
||||
for m in model.modules():
|
||||
if isinstance(m, FourierFTLayer) and hasattr(m, "bias") and m.bias is not None:
|
||||
m.bias.requires_grad = True
|
||||
else:
|
||||
raise NotImplementedError(f"Requested bias: {bias}, is not implemented.")
|
||||
|
||||
@staticmethod
|
||||
def _create_new_module(fourierft_config, adapter_name, target, **kwargs):
|
||||
if isinstance(target, BaseTunerLayer):
|
||||
@ -199,153 +126,3 @@ class FourierFTModel(BaseTuner):
|
||||
new_module = FourierFTLinear(target, adapter_name, **kwargs)
|
||||
|
||||
return new_module
|
||||
|
||||
def __getattr__(self, name: str):
|
||||
"""Forward missing attributes to the wrapped module."""
|
||||
try:
|
||||
return super().__getattr__(name) # defer to nn.Module's logic
|
||||
except AttributeError:
|
||||
if name == "model":
|
||||
raise
|
||||
return getattr(self.model, name)
|
||||
|
||||
def get_peft_config_as_dict(self, inference: bool = False):
|
||||
config_dict = {}
|
||||
for key, value in self.peft_config.items():
|
||||
config = {k: v.value if isinstance(v, Enum) else v for k, v in asdict(value).items()}
|
||||
if inference:
|
||||
config["inference_mode"] = True
|
||||
config_dict[key] = config
|
||||
return config_dict
|
||||
|
||||
def _set_adapter_layers(self, enabled: bool = True) -> None:
|
||||
for module in self.model.modules():
|
||||
if isinstance(module, (BaseTunerLayer, ModulesToSaveWrapper)):
|
||||
module.enable_adapters(enabled)
|
||||
|
||||
def enable_adapter_layers(self) -> None:
|
||||
"""Enable all adapters.
|
||||
|
||||
Call this if you have previously disabled all adapters and want to re-enable them.
|
||||
"""
|
||||
self._set_adapter_layers(enabled=True)
|
||||
|
||||
def disable_adapter_layers(self) -> None:
|
||||
"""Disable all adapters.
|
||||
|
||||
When disabling all adapters, the model output corresponds to the output of the base model.
|
||||
"""
|
||||
for active_adapter in self.active_adapters:
|
||||
val = self.peft_config[active_adapter].bias
|
||||
if val != "none":
|
||||
msg = (
|
||||
f"Careful, disabling adapter layers with bias configured to be '{val}' does not produce the same "
|
||||
"output as the base model would without adaption."
|
||||
)
|
||||
warnings.warn(msg)
|
||||
self._set_adapter_layers(enabled=False)
|
||||
|
||||
def set_adapter(self, adapter_name: str | list[str], inference_mode: bool = False) -> None:
|
||||
"""Set the active adapter(s).
|
||||
|
||||
Args:
|
||||
adapter_name (`str` or `list[str]`):
|
||||
Name(s) of the adapter(s) to be activated.
|
||||
inference_mode (bool, optional):
|
||||
Whether the activated adapter should be frozen (i.e. `requires_grad=False`). Default is False.
|
||||
"""
|
||||
self.set_auxiliary_adapters(adapter_name, inference_mode=inference_mode)
|
||||
for module in self.model.modules():
|
||||
if isinstance(module, FourierFTLayer):
|
||||
if module.merged:
|
||||
warnings.warn("Adapter cannot be set when the model is merged. Unmerging the model first.")
|
||||
module.unmerge()
|
||||
module.set_adapter(adapter_name, inference_mode=inference_mode)
|
||||
self.active_adapter = adapter_name
|
||||
|
||||
@staticmethod
|
||||
def _prepare_adapter_config(peft_config, model_config):
|
||||
if peft_config.target_modules is None:
|
||||
if model_config["model_type"] not in TRANSFORMERS_MODELS_TO_FOURIERFT_TARGET_MODULES_MAPPING:
|
||||
raise ValueError("Please specify `target_modules` in `peft_config`")
|
||||
peft_config.target_modules = set(
|
||||
TRANSFORMERS_MODELS_TO_FOURIERFT_TARGET_MODULES_MAPPING[model_config["model_type"]]
|
||||
)
|
||||
return peft_config
|
||||
|
||||
def _unload_and_optionally_merge(
|
||||
self,
|
||||
merge=True,
|
||||
progressbar: bool = False,
|
||||
safe_merge: bool = False,
|
||||
adapter_names: Optional[list[str]] = None,
|
||||
):
|
||||
key_list = [key for key, _ in self.model.named_modules() if self.prefix not in key]
|
||||
desc = "Unloading " + ("and merging " if merge else "") + "model"
|
||||
for key in tqdm(key_list, disable=not progressbar, desc=desc):
|
||||
try:
|
||||
parent, target, target_name = _get_submodules(self.model, key)
|
||||
except AttributeError:
|
||||
continue
|
||||
|
||||
if hasattr(target, "base_layer"):
|
||||
if merge:
|
||||
target.merge(safe_merge=safe_merge, adapter_names=adapter_names)
|
||||
self._replace_module(parent, target_name, target.get_base_layer(), target)
|
||||
elif isinstance(target, ModulesToSaveWrapper):
|
||||
# save any additional trainable modules part of `modules_to_save`
|
||||
setattr(parent, target_name, target.modules_to_save[target.active_adapter])
|
||||
|
||||
return self.model
|
||||
|
||||
def delete_adapter(self, adapter_name: str):
|
||||
"""
|
||||
Deletes an existing adapter.
|
||||
|
||||
Args:
|
||||
adapter_name (str): Name of the adapter to be deleted.
|
||||
"""
|
||||
if adapter_name not in list(self.peft_config.keys()):
|
||||
raise ValueError(f"Adapter {adapter_name} does not exist")
|
||||
del self.peft_config[adapter_name]
|
||||
|
||||
# we cannot use self.prefix as we want to include non-trainable fourierft parameters
|
||||
key_list = [key for key, _ in self.model.named_modules() if "fourierft" not in key]
|
||||
new_adapter = None
|
||||
for key in key_list:
|
||||
_, target, _ = _get_submodules(self.model, key)
|
||||
if isinstance(target, FourierFTLayer):
|
||||
target.delete_adapter(adapter_name)
|
||||
if new_adapter is None:
|
||||
new_adapter = target.active_adapter[:]
|
||||
|
||||
self.active_adapter = new_adapter or []
|
||||
self._delete_auxiliary_adapter(adapter_name, new_active_adapters=new_adapter)
|
||||
|
||||
def merge_and_unload(
|
||||
self, progressbar: bool = False, safe_merge: bool = False, adapter_names: Optional[list[str]] = None
|
||||
) -> torch.nn.Module:
|
||||
r"""
|
||||
This method merges the Fourier layers into the base model. This is needed if someone wants to use the base
|
||||
model as a standalone model.
|
||||
|
||||
Args:
|
||||
progressbar (`bool`):
|
||||
whether to show a progressbar indicating the unload and merge process
|
||||
safe_merge (`bool`):
|
||||
whether to activate the safe merging check to check if there is any potential Nan in the adapter
|
||||
weights
|
||||
adapter_names (`List[str]`, *optional*):
|
||||
The list of adapter names that should be merged. If None, all active adapters will be merged. Defaults
|
||||
to `None`.
|
||||
"""
|
||||
return self._unload_and_optionally_merge(
|
||||
progressbar=progressbar, safe_merge=safe_merge, adapter_names=adapter_names
|
||||
)
|
||||
|
||||
def unload(self) -> torch.nn.Module:
|
||||
"""
|
||||
Gets back the base model by removing all the Fourier modules without merging. This gives back the original base
|
||||
model.
|
||||
"""
|
||||
return self._unload_and_optionally_merge(merge=False)
|
||||
|
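`merge_and_unload` and `unload` give back a plain `transformers` model, with or without the FourierFT update folded into the base weights. A hedged usage sketch (the adapter path is a placeholder):

```python
from transformers import AutoModelForCausalLM

from peft import PeftModel

base = AutoModelForCausalLM.from_pretrained("gpt2")
model = PeftModel.from_pretrained(base, "path/to/fourierft-adapter")  # hypothetical checkpoint
# Fold the adapter into the base weights, with the optional NaN check enabled.
merged = model.merge_and_unload(progressbar=True, safe_merge=True)
```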
@ -12,23 +12,12 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import warnings
|
||||
from dataclasses import asdict
|
||||
from enum import Enum
|
||||
from typing import Optional
|
||||
|
||||
import torch
|
||||
from torch import nn
|
||||
from tqdm import tqdm
|
||||
|
||||
from peft.tuners.tuners_utils import BaseTuner, BaseTunerLayer, check_target_module_exists
|
||||
from peft.utils import (
|
||||
TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING,
|
||||
ModulesToSaveWrapper,
|
||||
_get_submodules,
|
||||
)
|
||||
from peft.tuners.tuners_utils import BaseTuner, BaseTunerLayer
|
||||
from peft.utils import TRANSFORMERS_MODELS_TO_HRA_TARGET_MODULES_MAPPING
|
||||
|
||||
from .config import HRAConfig
|
||||
from .layer import HRAConv2d, HRALayer, HRALinear
|
||||
|
||||
|
||||
@ -83,25 +72,8 @@ class HRAModel(BaseTuner):
|
||||
"""
|
||||
|
||||
prefix: str = "hra_"
|
||||
|
||||
def _check_new_adapter_config(self, config: HRAConfig) -> None:
|
||||
"""
|
||||
A helper method to check the config when a new adapter is being added.
|
||||
|
||||
Raise a ValueError if there is something wrong with the config or if it conflicts with existing adapters.
|
||||
|
||||
"""
|
||||
# TODO: there should be a check if any of the existing adapters actually has bias != "none", or else the check
|
||||
# does not fully correspond to the error message.
|
||||
if (len(self.peft_config) > 1) and (config.bias != "none"):
|
||||
raise ValueError(
|
||||
f"{self.__class__.__name__} supports only 1 adapter with bias. When using multiple adapters, "
|
||||
"set bias to 'none' for all adapters."
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
def _check_target_module_exists(hra_config, key):
|
||||
return check_target_module_exists(hra_config, key)
|
||||
tuner_layer_cls = HRALayer
|
||||
target_module_mapping = TRANSFORMERS_MODELS_TO_HRA_TARGET_MODULES_MAPPING
|
||||
|
||||
def _create_and_replace(
|
||||
self,
|
||||
@ -139,55 +111,6 @@ class HRAModel(BaseTuner):
|
||||
init_weights=hra_config.init_weights,
|
||||
)
|
||||
|
||||
def _replace_module(self, parent, child_name, new_module, child):
|
||||
setattr(parent, child_name, new_module)
|
||||
# It's not necessary to set requires_grad here, as that is handled by
|
||||
# _mark_only_adapters_as_trainable
|
||||
|
||||
# child layer wraps the original module, unpack it
|
||||
if hasattr(child, "base_layer"):
|
||||
child = child.base_layer
|
||||
|
||||
if not hasattr(new_module, "base_layer"):
|
||||
new_module.weight = child.weight
|
||||
if hasattr(child, "bias"):
|
||||
new_module.bias = child.bias
|
||||
|
||||
if getattr(child, "state", None) is not None:
|
||||
if hasattr(new_module, "base_layer"):
|
||||
new_module.base_layer.state = child.state
|
||||
else:
|
||||
new_module.state = child.state
|
||||
new_module.to(child.weight.device)
|
||||
|
||||
meta = torch.device("meta")
|
||||
# dispatch to correct device
|
||||
for name, module in new_module.named_modules():
|
||||
if self.prefix in name:
|
||||
if not any(p.device == meta for p in module.parameters()):
|
||||
module.to(child.weight.device)
|
||||
|
||||
def _mark_only_adapters_as_trainable(self, model: nn.Module) -> None:
|
||||
for n, p in model.named_parameters():
|
||||
if self.prefix not in n:
|
||||
p.requires_grad = False
|
||||
|
||||
for active_adapter in self.active_adapters:
|
||||
bias = self.peft_config[active_adapter].bias
|
||||
if bias == "none":
|
||||
continue
|
||||
|
||||
if bias == "all":
|
||||
for n, p in model.named_parameters():
|
||||
if "bias" in n:
|
||||
p.requires_grad = True
|
||||
elif bias == "hra_only":
|
||||
for name, m in model.named_modules():
|
||||
if isinstance(m, HRALayer) and hasattr(m, "bias") and m.bias is not None:
|
||||
m.bias.requires_grad = True
|
||||
else:
|
||||
raise NotImplementedError(f"Requested bias: {bias}, is not implemented.")
|
||||
|
||||
@staticmethod
|
||||
def _create_new_module(hra_config, adapter_name, target, **kwargs):
|
||||
if isinstance(target, BaseTunerLayer):
|
||||
@ -206,138 +129,3 @@ class HRAModel(BaseTuner):
|
||||
)
|
||||
|
||||
return new_module
|
||||
|
||||
def __getattr__(self, name: str):
|
||||
"""Forward missing attributes to the wrapped module."""
|
||||
try:
|
||||
return super().__getattr__(name) # defer to nn.Module's logic
|
||||
except AttributeError:
|
||||
if name == "base_model":
|
||||
raise
|
||||
return getattr(self.model, name)
|
||||
|
||||
def get_peft_config_as_dict(self, inference: bool = False):
|
||||
config_dict = {}
|
||||
for key, value in self.peft_config.items():
|
||||
config = {k: v.value if isinstance(v, Enum) else v for k, v in asdict(value).items()}
|
||||
if inference:
|
||||
config["inference_mode"] = True
|
||||
config_dict[key] = config
|
||||
return config_dict
|
||||
|
||||
def _set_adapter_layers(self, enabled=True):
|
||||
for module in self.model.modules():
|
||||
if isinstance(module, (BaseTunerLayer, ModulesToSaveWrapper)):
|
||||
module.enable_adapters(enabled)
|
||||
|
||||
def enable_adapter_layers(self):
|
||||
self._set_adapter_layers(enabled=True)
|
||||
|
||||
def disable_adapter_layers(self):
|
||||
for active_adapter in self.active_adapters:
|
||||
val = self.peft_config[active_adapter].bias
|
||||
if val != "none":
|
||||
msg = (
|
||||
f"Careful, disabling adapter layers with bias configured to be '{val}' does not produce the same "
|
||||
"output as the base model would without adaption."
|
||||
)
|
||||
warnings.warn(msg)
|
||||
self._set_adapter_layers(enabled=False)
|
||||
|
||||
def set_adapter(self, adapter_name, inference_mode: bool = False):
|
||||
self.set_auxiliary_adapters(adapter_name, inference_mode=inference_mode)
|
||||
for module in self.model.modules():
|
||||
if isinstance(module, HRALayer):
|
||||
if module.merged:
|
||||
warnings.warn("Adapter cannot be set when the model is merged. Unmerging the model first.")
|
||||
module.unmerge()
|
||||
module.set_adapter(adapter_name, inference_mode=inference_mode)
|
||||
self.active_adapter = adapter_name
|
||||
|
||||
@staticmethod
|
||||
def _prepare_adapter_config(peft_config, model_config):
|
||||
if peft_config.target_modules is None:
|
||||
if model_config["model_type"] not in TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING:
|
||||
raise ValueError("Please specify `target_modules` in `peft_config`")
|
||||
peft_config.target_modules = set(
|
||||
TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING[model_config["model_type"]]
|
||||
)
|
||||
return peft_config
|
||||
|
||||
def _unload_and_optionally_merge(
|
||||
self,
|
||||
merge=True,
|
||||
progressbar: bool = False,
|
||||
safe_merge: bool = False,
|
||||
adapter_names: Optional[list[str]] = None,
|
||||
):
|
||||
self._unloading_checks(adapter_names)
|
||||
key_list = [key for key, _ in self.model.named_modules() if self.prefix not in key]
|
||||
desc = "Unloading " + ("and merging " if merge else "") + "model"
|
||||
for key in tqdm(key_list, disable=not progressbar, desc=desc):
|
||||
try:
|
||||
parent, target, target_name = _get_submodules(self.model, key)
|
||||
except AttributeError:
|
||||
continue
|
||||
|
||||
if hasattr(target, "base_layer"):
|
||||
if merge:
|
||||
target.merge(safe_merge=safe_merge, adapter_names=adapter_names)
|
||||
self._replace_module(parent, target_name, target.get_base_layer(), target)
|
||||
elif isinstance(target, ModulesToSaveWrapper):
|
||||
# save any additional trainable modules part of `modules_to_save`
|
||||
setattr(parent, target_name, target.modules_to_save[target.active_adapter])
|
||||
|
||||
return self.model
|
||||
|
||||
def delete_adapter(self, adapter_name: str) -> None:
|
||||
"""
|
||||
Deletes an existing adapter.
|
||||
|
||||
Args:
|
||||
adapter_name (str): Name of the adapter to be deleted.
|
||||
"""
|
||||
if adapter_name not in list(self.peft_config.keys()):
|
||||
raise ValueError(f"Adapter {adapter_name} does not exist")
|
||||
del self.peft_config[adapter_name]
|
||||
|
||||
key_list = [key for key, _ in self.model.named_modules() if self.prefix not in key]
|
||||
new_adapter = None
|
||||
for key in key_list:
|
||||
_, target, _ = _get_submodules(self.model, key)
|
||||
if isinstance(target, HRALayer):
|
||||
target.delete_adapter(adapter_name)
|
||||
if new_adapter is None:
|
||||
new_adapter = target.active_adapters[:]
|
||||
|
||||
self.active_adapter = new_adapter or []
|
||||
self._delete_auxiliary_adapter(adapter_name, new_active_adapters=new_adapter)
|
||||
|
||||
def merge_and_unload(
|
||||
self, progressbar: bool = False, safe_merge: bool = False, adapter_names: Optional[list[str]] = None
|
||||
) -> torch.nn.Module:
|
||||
r"""
|
||||
This method merges the HRA layers into the base model. This is needed if someone wants to use the base model as
|
||||
a standalone model.
|
||||
|
||||
Args:
|
||||
progressbar (`bool`):
|
||||
whether to show a progressbar indicating the unload and merge process
|
||||
safe_merge (`bool`):
|
||||
whether to activate the safe merging check to check if there is any potential Nan in the adapter
|
||||
weights
|
||||
adapter_names (`List[str]`, *optional*):
|
||||
The list of adapter names that should be merged. If None, all active adapters will be merged. Defaults
|
||||
to `None`.
|
||||
|
||||
"""
|
||||
return self._unload_and_optionally_merge(
|
||||
progressbar=progressbar, safe_merge=safe_merge, adapter_names=adapter_names
|
||||
)
|
||||
|
||||
def unload(self) -> torch.nn.Module:
|
||||
"""
|
||||
Gets back the base model by removing all the hra modules without merging. This gives back the original base
|
||||
model.
|
||||
"""
|
||||
return self._unload_and_optionally_merge(merge=False)
|
||||
|
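`delete_adapter` removes a named HRA adapter from every layer and resets the list of active adapters. A hedged sketch (model and adapter names are placeholders):

```python
# Drop an experimental HRA adapter by name, then merge what remains active.
model.delete_adapter("ablation")
standalone = model.merge_and_unload(safe_merge=True)
```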
@ -15,16 +15,13 @@ from __future__ import annotations
|
||||
|
||||
import re
|
||||
import warnings
|
||||
from dataclasses import asdict, replace
|
||||
from enum import Enum
|
||||
from typing import Optional
|
||||
from dataclasses import replace
|
||||
|
||||
import torch
|
||||
from torch import nn
|
||||
from transformers.pytorch_utils import Conv1D
|
||||
|
||||
from peft.import_utils import is_bnb_4bit_available, is_bnb_available
|
||||
from peft.tuners.tuners_utils import BaseTuner, BaseTunerLayer, check_target_module_exists
|
||||
from peft.tuners.tuners_utils import BaseTuner, BaseTunerLayer
|
||||
from peft.utils import (
|
||||
TRANSFORMERS_MODELS_TO_IA3_FEEDFORWARD_MODULES_MAPPING,
|
||||
TRANSFORMERS_MODELS_TO_IA3_TARGET_MODULES_MAPPING,
|
||||
@ -74,6 +71,7 @@ class IA3Model(BaseTuner):
|
||||
"""
|
||||
|
||||
prefix: str = "ia3_"
|
||||
tuner_layer_cls = IA3Layer
|
||||
|
||||
@staticmethod
|
||||
def _create_new_module(ia3_config, adapter_name, target, **kwargs):
|
||||
@ -143,15 +141,6 @@ class IA3Model(BaseTuner):
|
||||
)
|
||||
return new_module
|
||||
|
||||
@staticmethod
|
||||
def _check_target_module_exists(ia3_config, key):
|
||||
return check_target_module_exists(ia3_config, key)
|
||||
|
||||
def _mark_only_adapters_as_trainable(self, model: nn.Module) -> None:
|
||||
for n, p in model.named_parameters():
|
||||
if self.prefix not in n:
|
||||
p.requires_grad = False
|
||||
|
||||
def _create_and_replace(
|
||||
self,
|
||||
ia3_config,
|
||||
@ -196,88 +185,6 @@ class IA3Model(BaseTuner):
|
||||
is_feedforward = any(key.endswith(target_key) for target_key in ia3_config.feedforward_modules)
|
||||
return is_feedforward
|
||||
|
||||
def _replace_module(self, parent, child_name, new_module, child):
|
||||
setattr(parent, child_name, new_module)
|
||||
|
||||
# child layer wraps the original module, unpack it
|
||||
if hasattr(child, "base_layer"):
|
||||
child = child.base_layer
|
||||
|
||||
# layers with base_layer don't need the weight to be copied, as they have a reference already
|
||||
if not hasattr(new_module, "base_layer"):
|
||||
new_module.weight = child.weight
|
||||
if hasattr(child, "bias"):
|
||||
new_module.bias = child.bias
|
||||
|
||||
if getattr(child, "state", None) is not None:
|
||||
if hasattr(new_module, "base_layer"):
|
||||
new_module.base_layer.state = child.state
|
||||
else:
|
||||
new_module.state = child.state
|
||||
new_module.to(child.weight.device)
|
||||
|
||||
meta = torch.device("meta")
|
||||
# dispatch to correct device
|
||||
for name, module in new_module.named_modules():
|
||||
if self.prefix in name:
|
||||
if not any(p.device == meta for p in module.parameters()):
|
||||
module.to(child.weight.device)
|
||||
|
||||
def __getattr__(self, name: str):
|
||||
"""Forward missing attributes to the wrapped module."""
|
||||
try:
|
||||
return super().__getattr__(name) # defer to nn.Module's logic
|
||||
except AttributeError:
|
||||
if name == "model": # see #1892: prevent infinite recursion if class is not initialized
|
||||
raise
|
||||
return getattr(self.model, name)
|
||||
|
||||
def get_peft_config_as_dict(self, inference: bool = False):
|
||||
config_dict = {}
|
||||
for key, value in self.peft_config.items():
|
||||
config = {k: v.value if isinstance(v, Enum) else v for k, v in asdict(value).items()}
|
||||
if inference:
|
||||
config["inference_mode"] = True
|
||||
config_dict[key] = config
|
||||
return config_dict
|
||||
|
||||
def _set_adapter_layers(self, enabled=True):
|
||||
for module in self.model.modules():
|
||||
if isinstance(module, (IA3Layer, ModulesToSaveWrapper)):
|
||||
module.enable_adapters(enabled)
|
||||
|
||||
def enable_adapter_layers(self) -> None:
|
||||
"""Enable all adapters.
|
||||
|
||||
Call this if you have previously disabled all adapters and want to re-enable them.
|
||||
"""
|
||||
self._set_adapter_layers(enabled=True)
|
||||
|
||||
def disable_adapter_layers(self) -> None:
|
||||
"""Disable all adapters.
|
||||
|
||||
When disabling all adapters, the model output corresponds to the output of the base model.
|
||||
"""
|
||||
self._set_adapter_layers(enabled=False)
|
||||
|
||||
def set_adapter(self, adapter_name: str | list[str], inference_mode: bool = False) -> None:
|
||||
"""Set the active adapter(s).
|
||||
|
||||
Args:
|
||||
adapter_name (`str` or `list[str]`):
|
||||
Name(s) of the adapter(s) to be activated.
|
||||
inference_mode (bool, optional):
|
||||
Whether the activated adapter should be frozen (i.e. `requires_grad=False`). Default is False.
|
||||
"""
|
||||
self.set_auxiliary_adapters(adapter_name, inference_mode=inference_mode)
|
||||
for module in self.model.modules():
|
||||
if isinstance(module, IA3Layer):
|
||||
if module.merged:
|
||||
warnings.warn("Adapter cannot be set when the model is merged. Unmerging the model first.")
|
||||
module.unmerge()
|
||||
module.set_adapter(adapter_name, inference_mode=inference_mode)
|
||||
self.active_adapter = adapter_name
|
||||
|
||||
@staticmethod
|
||||
def _prepare_adapter_config(peft_config, model_config):
|
||||
if peft_config.target_modules is None:
|
||||
@ -294,9 +201,7 @@ class IA3Model(BaseTuner):
|
||||
)
|
||||
return peft_config
|
||||
|
||||
def _unload_and_optionally_merge(
|
||||
self, merge: bool = True, safe_merge: bool = False, adapter_names: Optional[list[str]] = None
|
||||
):
|
||||
def _unload_and_optionally_merge(self, *args, **kwargs):
|
||||
r"""
|
||||
This method merges the (IA)^3 layers into the base model. This is needed if someone wants to use the base model
|
||||
as a standalone model.
|
||||
@ -316,86 +221,7 @@ class IA3Model(BaseTuner):
|
||||
if getattr(self.model, "is_loaded_in_4bit", False):
|
||||
raise ValueError("Cannot merge ia3 layers when the model is loaded in 4-bit mode")
|
||||
|
||||
self._unloading_checks(adapter_names)
|
||||
key_list = [key for key, _ in self.model.named_modules() if self.prefix not in key]
|
||||
for key in key_list:
|
||||
try:
|
||||
parent, target, target_name = _get_submodules(self.model, key)
|
||||
except AttributeError:
|
||||
continue
|
||||
|
||||
if hasattr(target, "base_layer"):
|
||||
if merge:
|
||||
target.merge(safe_merge=safe_merge, adapter_names=adapter_names)
|
||||
self._replace_module(parent, target_name, target.get_base_layer(), target)
|
||||
elif isinstance(target, ModulesToSaveWrapper):
|
||||
# save any additional trainable modules part of `modules_to_save`
|
||||
new_module = target.modules_to_save[target.active_adapter]
|
||||
if hasattr(new_module, "base_layer"):
|
||||
# check if the module is itself a tuner layer
|
||||
if merge:
|
||||
new_module.merge(safe_merge=safe_merge, adapter_names=adapter_names)
|
||||
new_module = new_module.get_base_layer()
|
||||
setattr(parent, target_name, new_module)
|
||||
|
||||
return self.model
|
||||
|
||||
def merge_and_unload(self, safe_merge: bool = False, adapter_names: Optional[list[str]] = None) -> torch.nn.Module:
|
||||
r"""
|
||||
This method merges the IA³ layers into the base model. This is needed if someone wants to use the base model as
|
||||
a standalone model.
|
||||
|
||||
Args:
|
||||
safe_merge (`bool`):
|
||||
whether to activate the safe merging check to check if there is any potential Nan in the adapter
|
||||
weights
|
||||
adapter_names (`List[str]`, *optional*):
|
||||
The list of adapter names that should be merged. If None, all active adapters will be merged. Defaults
|
||||
to `None`.
|
||||
|
||||
Example:
|
||||
|
||||
```py
|
||||
>>> from transformers import AutoModelForCausalLM
|
||||
>>> from peft import PeftModel
|
||||
|
||||
>>> base_model = AutoModelForCausalLM.from_pretrained("tiiuae/falcon-40b")
|
||||
>>> peft_model_id = "smangrul/falcon-40B-int4-peft-lora-sfttrainer-sample"
|
||||
>>> model = PeftModel.from_pretrained(base_model, peft_model_id)
|
||||
>>> merged_model = model.merge_and_unload()
|
||||
```
|
||||
"""
|
||||
return self._unload_and_optionally_merge(safe_merge=safe_merge, adapter_names=adapter_names)
|
||||
|
||||
def unload(self) -> torch.nn.Module:
|
||||
"""
|
||||
Gets back the base model by removing all the IA³ modules without merging. This gives back the original base
|
||||
model.
|
||||
"""
|
||||
return self._unload_and_optionally_merge(merge=False)
|
||||
|
||||
def delete_adapter(self, adapter_name: str) -> None:
|
||||
"""
|
||||
Deletes an existing adapter.
|
||||
|
||||
Args:
|
||||
adapter_name (str): Name of the adapter to be deleted.
|
||||
"""
|
||||
if adapter_name not in self.peft_config:
|
||||
raise ValueError(f"Adapter {adapter_name} does not exist")
|
||||
del self.peft_config[adapter_name]
|
||||
|
||||
key_list = [key for key, _ in self.model.named_modules() if self.prefix not in key]
|
||||
new_adapter = None
|
||||
for key in key_list:
|
||||
_, target, _ = _get_submodules(self.model, key)
|
||||
if isinstance(target, IA3Layer):
|
||||
target.delete_adapter(adapter_name)
|
||||
if new_adapter is None:
|
||||
new_adapter = target.active_adapters[:]
|
||||
|
||||
self.active_adapter = new_adapter or []
|
||||
self._delete_auxiliary_adapter(adapter_name, new_active_adapters=new_adapter)
|
||||
return super()._unload_and_optionally_merge(*args, **kwargs)
|
||||
|
||||
def _check_add_weighted_adapter(self, adapters: list[str]) -> tuple[str, str]:
|
||||
"""
|
||||
|
@ -19,7 +19,7 @@ from typing import Optional
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
|
||||
from peft.tuners.tuners_utils import BaseTunerLayer, check_adapters_to_merge
|
||||
from peft.tuners.tuners_utils import BaseTunerLayer, _get_in_out_features, check_adapters_to_merge
|
||||
|
||||
|
||||
class LNTuningLayer(nn.Module, BaseTunerLayer):
|
||||
@ -37,6 +37,10 @@ class LNTuningLayer(nn.Module, BaseTunerLayer):
|
||||
self._active_adapter = adapter_name
|
||||
self.merged_adapters = []
|
||||
|
||||
in_features, out_features = _get_in_out_features(self.get_base_layer())
|
||||
self.in_features = in_features
|
||||
self.out_features = out_features
|
||||
|
||||
def update_layer(self, layer: nn.Module, adapter_name: str, inference_mode: bool = False, **kwargs):
|
||||
self.ln_tuning_layers[adapter_name] = deepcopy(layer)
|
||||
self.set_adapter(adapter_name, inference_mode=inference_mode)
|
||||
|
Some files were not shown because too many files have changed in this diff.