Mirror of https://github.com/huggingface/peft.git (synced 2025-10-29 21:44:31 +08:00)

Compare commits: smangrul/a...v0.6.2 (114 commits)
| SHA1 | Author | Date | |
|---|---|---|---|
| 32357c2dd2 | |||
| 79298c7c24 | |||
| b25ce8a0cd | |||
| 5d84484079 | |||
| 49ddefa834 | |||
| 3af469eeea | |||
| 5e7e5ad836 | |||
| 9d8287f3e3 | |||
| 2efd02769b | |||
| 669dd4edeb | |||
| b5641cc744 | |||
| c5d94855cd | |||
| face67dfeb | |||
| d9094cebea | |||
| 493ae58beb | |||
| ed4ce9fc94 | |||
| 4c48970cb0 | |||
| 46e03602ed | |||
| 45343a4ccc | |||
| 276c91b143 | |||
| cfe35a7878 | |||
| d47d23aa0e | |||
| 02f0a4ca59 | |||
| 23cfbf22eb | |||
| 9da72d25ed | |||
| 0ad95fa361 | |||
| 6960076699 | |||
| bdeb06b16c | |||
| 884b1ac3a8 | |||
| 207229ad5e | |||
| 2464c572eb | |||
| 8b21a4e5ab | |||
| 894e68a408 | |||
| 7594903444 | |||
| 1d0535e255 | |||
| 56556faa17 | |||
| 15a013af5f | |||
| 45565f4357 | |||
| aaa7e9f44a | |||
| 07f2b82dae | |||
| eced2edff8 | |||
| e98df91906 | |||
| 0c16918c34 | |||
| c2c544dc9f | |||
| d7f520a320 | |||
| d17266d599 | |||
| dfd99f61f8 | |||
| dbd40d96a1 | |||
| 99f792e8a3 | |||
| a7fb9fb090 | |||
| a977ce69a5 | |||
| 3d0edccc4a | |||
| 763511dc28 | |||
| 1367bc6f0d | |||
| 88dfc5d2a8 | |||
| 7a5f17f39e | |||
| 52ff0cde9f | |||
| cacee957e6 | |||
| bedcaa4f82 | |||
| f66c3859b0 | |||
| 69665f24e9 | |||
| 08b6665167 | |||
| d54a23d30e | |||
| 9856f79cf9 | |||
| 634bd197f2 | |||
| 1af8ca484b | |||
| 1c0654b9a5 | |||
| 1dc4a6761e | |||
| f3d4fef6e6 | |||
| 39264a0141 | |||
| ba0477f298 | |||
| 139624750a | |||
| 1bbde1bfe0 | |||
| 6b4554e643 | |||
| c8c936eddf | |||
| 93d0c03d5b | |||
| 5bdbf2bcd6 | |||
| 4c611f40b4 | |||
| 8bdd4848f4 | |||
| b786b884f6 | |||
| 0fa63fb4a2 | |||
| f5aae1b47d | |||
| 6d140bad39 | |||
| 1f55957402 | |||
| 08368a1fba | |||
| 20d9c175e2 | |||
| d4dbf684e0 | |||
| 0c9354bda9 | |||
| f113af0b9e | |||
| 43381008d6 | |||
| 7d99466446 | |||
| ecaaae8719 | |||
| 0b2f950cc2 | |||
| 85013987aa | |||
| a23b9213f4 | |||
| 140a69bb90 | |||
| 8c17d556a8 | |||
| 0e37b85609 | |||
| 6e783780ca | |||
| fd1c0f66eb | |||
| a4ca8fa3b6 | |||
| 3d9ceb5162 | |||
| bbaafc2fef | |||
| 573cb35036 | |||
| 6c44096c7b | |||
| 4b371b489b | |||
| 87c1d2410e | |||
| 2439203eff | |||
| 312d294fdd | |||
| 369a0fba85 | |||
| 438b16b8c9 | |||
| dbe7e644f1 | |||
| a916465ad0 | |||
| 412d7bc985 | |||
8 .github/workflows/build_docker_images.yml (vendored)
@ -28,7 +28,7 @@ jobs:
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@v1
|
||||
- name: Check out code
|
||||
uses: actions/checkout@v2
|
||||
uses: actions/checkout@v3
|
||||
- name: Login to DockerHub
|
||||
uses: docker/login-action@v2
|
||||
with:
|
||||
@ -59,7 +59,7 @@ jobs:
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@v1
|
||||
- name: Check out code
|
||||
uses: actions/checkout@v2
|
||||
uses: actions/checkout@v3
|
||||
- name: Login to DockerHub
|
||||
uses: docker/login-action@v1
|
||||
with:
|
||||
@ -67,8 +67,8 @@ jobs:
|
||||
password: ${{ secrets.DOCKERHUB_PASSWORD }}
|
||||
|
||||
- name: Build and Push GPU
|
||||
uses: docker/build-push-action@v2
|
||||
uses: docker/build-push-action@v4
|
||||
with:
|
||||
context: ./docker/peft-gpu
|
||||
push: true
|
||||
tags: huggingface/peft-gpu
|
||||
tags: huggingface/peft-gpu
|
||||
|
||||
82 .github/workflows/integrations_tests.yml (vendored, Normal file)
@ -0,0 +1,82 @@
|
||||
name: integration tests
|
||||
|
||||
on:
|
||||
workflow_dispatch:
|
||||
inputs:
|
||||
branch:
|
||||
description: 'Branch to test on'
|
||||
required: true
|
||||
|
||||
jobs:
|
||||
run_transformers_integration_tests:
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
transformers-version: ['main', 'latest']
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
with:
|
||||
ref: ${{ github.event.inputs.branch }}
|
||||
repository: ${{ github.event.pull_request.head.repo.full_name }}
|
||||
- name: Set up Python
|
||||
uses: actions/setup-python@v4
|
||||
with:
|
||||
python-version: "3.10"
|
||||
cache: "pip"
|
||||
cache-dependency-path: "setup.py"
|
||||
- name: print environment variables
|
||||
run: |
|
||||
echo "env.CI_BRANCH = ${{ env.CI_BRANCH }}"
|
||||
echo "env.CI_SHA = ${{ env.CI_SHA }}"
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
python -m pip install --upgrade pip
|
||||
python -m pip install .[test]
|
||||
if [ "${{ matrix.transformers-version }}" == "main" ]; then
|
||||
pip install -U git+https://github.com/huggingface/transformers.git
|
||||
else
|
||||
echo "Nothing to do as transformers latest already installed"
|
||||
fi
|
||||
|
||||
- name: Test transformers integration
|
||||
run: |
|
||||
cd .. && git clone https://github.com/huggingface/transformers.git && cd transformers/ && git rev-parse HEAD
|
||||
RUN_SLOW=1 pytest tests/peft_integration/test_peft_integration.py
|
||||
run_diffusers_integration_tests:
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
# For now diffusers integration is not on PyPI
|
||||
diffusers-version: ['main']
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
with:
|
||||
ref: ${{ github.event.inputs.branch }}
|
||||
repository: ${{ github.event.pull_request.head.repo.full_name }}
|
||||
- name: Set up Python
|
||||
uses: actions/setup-python@v4
|
||||
with:
|
||||
python-version: "3.10"
|
||||
cache: "pip"
|
||||
cache-dependency-path: "setup.py"
|
||||
- name: print environment variables
|
||||
run: |
|
||||
echo "env.CI_BRANCH = ${{ env.CI_BRANCH }}"
|
||||
echo "env.CI_SHA = ${{ env.CI_SHA }}"
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
python -m pip install --upgrade pip
|
||||
python -m pip install .[test]
|
||||
|
||||
if [ "${{ matrix.diffusers-version }}" == "main" ]; then
|
||||
pip install -U git+https://github.com/huggingface/diffusers.git
|
||||
else
|
||||
echo "Nothing to do as diffusers latest already installed"
|
||||
fi
|
||||
|
||||
- name: Test diffusers integration
|
||||
run: |
|
||||
cd .. && git clone https://github.com/huggingface/diffusers.git && cd diffusers/ && git rev-parse HEAD
|
||||
pytest tests/lora/test_lora_layers_peft.py
|
||||
6 .github/workflows/nightly.yml (vendored)
@ -8,6 +8,8 @@ on:
|
||||
env:
|
||||
RUN_SLOW: "yes"
|
||||
IS_GITHUB_CI: "1"
|
||||
# To be able to run tests on CUDA 12.2
|
||||
NVIDIA_DISABLE_REQUIRE: "1"
|
||||
SLACK_API_TOKEN: ${{ secrets.SLACK_API_TOKEN }}
|
||||
|
||||
|
||||
@ -19,7 +21,7 @@ jobs:
|
||||
TEST_TYPE: "single_gpu"
|
||||
container:
|
||||
image: huggingface/peft-gpu:latest
|
||||
options: --gpus all --shm-size "16gb"
|
||||
options: --gpus all --shm-size "16gb" -e NVIDIA_DISABLE_REQUIRE=true
|
||||
defaults:
|
||||
run:
|
||||
working-directory: peft/
|
||||
@ -61,7 +63,7 @@ jobs:
|
||||
TEST_TYPE: "multi_gpu"
|
||||
container:
|
||||
image: huggingface/peft-gpu:latest
|
||||
options: --gpus all --shm-size "16gb"
|
||||
options: --gpus all --shm-size "16gb" -e NVIDIA_DISABLE_REQUIRE=true
|
||||
defaults:
|
||||
run:
|
||||
working-directory: peft/
|
||||
|
||||
43 .github/workflows/torch_compile_tests.yml (vendored, Normal file)
@ -0,0 +1,43 @@
|
||||
name: torch compile tests
|
||||
|
||||
# see peft/tests/__init__.py
|
||||
|
||||
on:
|
||||
workflow_dispatch:
|
||||
inputs:
|
||||
branch:
|
||||
description: 'Branch to test on'
|
||||
required: true
|
||||
pytorch_nightly:
|
||||
description: 'Whether to use PyTorch nightly (true/false)'
|
||||
required: false
|
||||
default: false
|
||||
|
||||
jobs:
|
||||
run_tests_with_compile:
|
||||
runs-on: ubuntu-latest
|
||||
env:
|
||||
PEFT_DEBUG_WITH_TORCH_COMPILE: 1
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
with:
|
||||
ref: ${{ github.event.inputs.branch }}
|
||||
repository: ${{ github.event.pull_request.head.repo.full_name }}
|
||||
- name: Set up Python
|
||||
uses: actions/setup-python@v4
|
||||
with:
|
||||
python-version: "3.10"
|
||||
cache: "pip"
|
||||
cache-dependency-path: "setup.py"
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
python -m pip install --upgrade pip
|
||||
python -m pip install .[test]
|
||||
if [ "${{ github.event.inputs.pytorch_nightly }}" = "true" ]; then
|
||||
python -m pip install --upgrade --pre torch --index-url https://download.pytorch.org/whl/nightly/cpu
|
||||
fi
|
||||
- name: Test compile with pytest
|
||||
run: |
|
||||
echo "PEFT_DEBUG_WITH_TORCH_COMPILE=$PEFT_DEBUG_WITH_TORCH_COMPILE"
|
||||
git status
|
||||
make test
|
||||
68 README.md
@ -30,7 +30,10 @@ Supported methods:
|
||||
3. P-Tuning: [GPT Understands, Too](https://arxiv.org/abs/2103.10385)
|
||||
4. Prompt Tuning: [The Power of Scale for Parameter-Efficient Prompt Tuning](https://arxiv.org/abs/2104.08691)
|
||||
5. AdaLoRA: [Adaptive Budget Allocation for Parameter-Efficient Fine-Tuning](https://arxiv.org/abs/2303.10512)
|
||||
6. $(IA)^3$ : [Infused Adapter by Inhibiting and Amplifying Inner Activations](https://arxiv.org/abs/2205.05638)
|
||||
6. $(IA)^3$: [Few-Shot Parameter-Efficient Fine-Tuning is Better and Cheaper than In-Context Learning](https://arxiv.org/abs/2205.05638)
|
||||
7. MultiTask Prompt Tuning: [Multitask Prompt Tuning Enables Parameter-Efficient Transfer Learning](https://arxiv.org/abs/2303.02861)
|
||||
8. LoHa: [FedPara: Low-Rank Hadamard Product for Communication-Efficient Federated Learning](https://arxiv.org/abs/2108.06098)
|
||||
9. LoKr: [KronA: Parameter Efficient Tuning with Kronecker Adapter](https://arxiv.org/abs/2212.10650) based on [Navigating Text-To-Image Customization:From LyCORIS Fine-Tuning to Model Evaluation](https://arxiv.org/abs/2309.14859) implementation
|
||||
|
||||
## Getting started
|
||||
|
||||
@ -55,7 +58,7 @@ model.print_trainable_parameters()
|
||||
### Get comparable performance to full finetuning by adapting LLMs to downstream tasks using consumer hardware
|
||||
|
||||
GPU memory required for adapting LLMs on the few-shot dataset [`ought/raft/twitter_complaints`](https://huggingface.co/datasets/ought/raft/viewer/twitter_complaints). Here, settings considered
|
||||
are full finetuning, PEFT-LoRA using plain PyTorch and PEFT-LoRA using DeepSpeed with CPU Offloading.
|
||||
are full finetuning, PEFT-LoRA using plain PyTorch and PEFT-LoRA using DeepSpeed with CPU Offloading.
|
||||
|
||||
Hardware: Single A100 80GB GPU with CPU RAM above 64GB
|
||||
|
||||
@ -67,7 +70,7 @@ Hardware: Single A100 80GB GPU with CPU RAM above 64GB
|
||||
|
||||
Performance of PEFT-LoRA tuned [`bigscience/T0_3B`](https://huggingface.co/bigscience/T0_3B) on [`ought/raft/twitter_complaints`](https://huggingface.co/datasets/ought/raft/viewer/twitter_complaints) leaderboard.
|
||||
A point to note is that we didn't try to squeeze performance by playing around with input instruction templates, LoRA hyperparams and other training related hyperparams. Also, we didn't use the larger 13B [mt0-xxl](https://huggingface.co/bigscience/mt0-xxl) model.
|
||||
So, we are already seeing comparable performance to SoTA with parameter efficient tuning. Also, the final checkpoint size is just `19MB` in comparison to `11GB` size of the backbone [`bigscience/T0_3B`](https://huggingface.co/bigscience/T0_3B) model.
|
||||
So, we are already seeing comparable performance to SoTA with parameter efficient tuning. Also, the final additional checkpoint size is just `19MB` in comparison to `11GB` size of the backbone [`bigscience/T0_3B`](https://huggingface.co/bigscience/T0_3B) model, but one still has to load the original full size model.
|
||||
|
||||
| Submission Name | Accuracy |
|
||||
| --------- | ---- |
|
||||
@ -132,9 +135,11 @@ Try out the 🤗 Gradio Space which should run seamlessly on a T4 instance:
|
||||
**NEW** ✨ Multi Adapter support and combining multiple LoRA adapters in a weighted combination
|
||||

|
||||
|
||||
**NEW** ✨ Dreambooth training for Stable Diffusion using LoHa and LoKr adapters [`examples/stable_diffusion/train_dreambooth.py`](examples/stable_diffusion/train_dreambooth.py)
|
||||
|
||||
### Parameter Efficient Tuning of LLMs for RLHF components such as Ranker and Policy
|
||||
- Here is an example in [trl](https://github.com/lvwerra/trl) library using PEFT+INT8 for tuning policy model: [gpt2-sentiment_peft.py](https://github.com/lvwerra/trl/blob/main/examples/sentiment/scripts/gpt2-sentiment_peft.py) and corresponding [Blog](https://huggingface.co/blog/trl-peft)
|
||||
- Example using PEFT for Instrction finetuning, reward model and policy : [stack_llama](https://github.com/lvwerra/trl/tree/main/examples/stack_llama/scripts) and corresponding [Blog](https://huggingface.co/blog/stackllama)
|
||||
- Example using PEFT for Instruction finetuning, reward model and policy : [stack_llama](https://github.com/lvwerra/trl/tree/main/examples/research_projects/stack_llama/scripts) and corresponding [Blog](https://huggingface.co/blog/stackllama)
|
||||
|
||||
### INT8 training of large models in Colab using PEFT LoRA and bits_and_bytes
|
||||
|
||||
@ -148,7 +153,7 @@ Save storage by avoiding full finetuning of models on each of the downstream tas
|
||||
With PEFT methods, users only need to store tiny checkpoints in the order of `MBs` all the while retaining
|
||||
performance comparable to full finetuning.
|
||||
|
||||
An example of using LoRA for the task of adapting `LayoutLMForTokenClassification` on `FUNSD` dataset is given in `~examples/token_classification/PEFT_LoRA_LayoutLMForTokenClassification_on_FUNSD.py`. We can observe that with only `0.62 %` of parameters being trainable, we achieve performance (F1 0.777) comparable to full finetuning (F1 0.786) (without any hyerparam tuning runs for extracting more performance), and the checkpoint of this is only `2.8MB`. Now, if there are `N` such datasets, just have these PEFT models one for each dataset and save a lot of storage without having to worry about the problem of catastrophic forgetting or overfitting of backbone/base model.
|
||||
An example of using LoRA for the task of adapting `LayoutLMForTokenClassification` on `FUNSD` dataset is given in `~examples/token_classification/PEFT_LoRA_LayoutLMForTokenClassification_on_FUNSD.py`. We can observe that with only `0.62 %` of parameters being trainable, we achieve performance (F1 0.777) comparable to full finetuning (F1 0.786) (without any hyperparam tuning runs for extracting more performance), and the checkpoint of this is only `2.8MB`. Now, if there are `N` such datasets, just have these PEFT models one for each dataset and save a lot of storage without having to worry about the problem of catastrophic forgetting or overfitting of backbone/base model.
|
||||
|
||||
Another example is fine-tuning [`roberta-large`](https://huggingface.co/roberta-large) on [`MRPC` GLUE](https://huggingface.co/datasets/glue/viewer/mrpc) dataset using different PEFT methods. The notebooks are given in `~examples/sequence_classification`.
|
||||
|
||||
@ -269,9 +274,9 @@ An example is provided in `~examples/causal_language_modeling/peft_lora_clm_acce
|
||||
|
||||
### Text-to-Image Generation
|
||||
|
||||
| Model | LoRA | Prefix Tuning | P-Tuning | Prompt Tuning | IA3 |
|
||||
| --------- | ---- | ---- | ---- | ---- | ---- |
|
||||
| Stable Diffusion | ✅ | | | | |
|
||||
| Model | LoRA | LoHa | LoKr | Prefix Tuning | P-Tuning | Prompt Tuning | IA3 |
|
||||
| --------- | ---- | ---- | ---- | ---- | ---- | ---- | ---- |
|
||||
| Stable Diffusion | ✅ | ✅ | ✅ | | | |
|
||||
|
||||
|
||||
### Image Classification
|
||||
@ -355,12 +360,45 @@ any GPU memory savings. Please refer issue [[FSDP] FSDP with CPU offload consume
|
||||
|
||||
2. When using ZeRO3 with zero3_init_flag=True, if you find the gpu memory increase with training steps. we might need to update deepspeed after [deepspeed commit 42858a9891422abc](https://github.com/microsoft/DeepSpeed/commit/42858a9891422abcecaa12c1bd432d28d33eb0d4) . The related issue is [[BUG] Peft Training with Zero.Init() and Zero3 will increase GPU memory every forward step ](https://github.com/microsoft/DeepSpeed/issues/3002)
|
||||
|
||||
## Backlog:
|
||||
- [x] Add tests
|
||||
- [x] Multi Adapter training and inference support
|
||||
- [x] Add more use cases and examples
|
||||
- [x] Integrate`(IA)^3`, `AdaptionPrompt`
|
||||
- [ ] Explore and possibly integrate methods like `Bottleneck Adapters`, ...
|
||||
## 🤗 PEFT as a utility library
|
||||
|
||||
Inject trainable adapters into any `torch` model using the `inject_adapter_in_model` method. Note that the method makes no further changes to the model.
|
||||
|
||||
```python
|
||||
import torch
|
||||
from peft import inject_adapter_in_model, LoraConfig
|
||||
|
||||
class DummyModel(torch.nn.Module):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.embedding = torch.nn.Embedding(10, 10)
|
||||
self.linear = torch.nn.Linear(10, 10)
|
||||
self.lm_head = torch.nn.Linear(10, 10)
|
||||
|
||||
def forward(self, input_ids):
|
||||
x = self.embedding(input_ids)
|
||||
x = self.linear(x)
|
||||
x = self.lm_head(x)
|
||||
return x
|
||||
|
||||
lora_config = LoraConfig(
|
||||
lora_alpha=16,
|
||||
lora_dropout=0.1,
|
||||
r=64,
|
||||
bias="none",
|
||||
target_modules=["linear"],
|
||||
)
|
||||
|
||||
model = DummyModel()
|
||||
model = inject_adapter_in_model(lora_config, model)
|
||||
|
||||
dummy_inputs = torch.LongTensor([[0, 1, 2, 3, 4, 5, 6, 7]])
|
||||
dummy_outputs = model(dummy_inputs)
|
||||
```
|
||||
|
||||
## Contributing
|
||||
|
||||
If you would like to contribute to PEFT, please check out our [contributing guide](https://huggingface.co/docs/peft/developer_guides/contributing).
|
||||
|
||||
## Citing 🤗 PEFT
|
||||
|
||||
@ -369,7 +407,7 @@ If you use 🤗 PEFT in your publication, please cite it by using the following
|
||||
```bibtex
|
||||
@Misc{peft,
|
||||
title = {PEFT: State-of-the-art Parameter-Efficient Fine-Tuning methods},
|
||||
author = {Sourab Mangrulkar and Sylvain Gugger and Lysandre Debut and Younes Belkada and Sayak Paul},
|
||||
author = {Sourab Mangrulkar and Sylvain Gugger and Lysandre Debut and Younes Belkada and Sayak Paul and Benjamin Bossan},
|
||||
howpublished = {\url{https://github.com/huggingface/peft}},
|
||||
year = {2022}
|
||||
}
|
||||
|
||||
@ -29,9 +29,20 @@ ENV PATH /opt/conda/envs/peft/bin:$PATH
|
||||
# Activate our bash shell
|
||||
RUN chsh -s /bin/bash
|
||||
SHELL ["/bin/bash", "-c"]
|
||||
|
||||
# Stage 2
|
||||
FROM nvidia/cuda:12.2.2-devel-ubuntu22.04 AS build-image
|
||||
COPY --from=compile-image /opt/conda /opt/conda
|
||||
ENV PATH /opt/conda/bin:$PATH
|
||||
|
||||
RUN chsh -s /bin/bash
|
||||
SHELL ["/bin/bash", "-c"]
|
||||
RUN source activate peft && \
|
||||
python3 -m pip install --no-cache-dir bitsandbytes optimum auto-gptq
|
||||
|
||||
# Activate the conda env and install transformers + accelerate from source
|
||||
RUN source activate peft && \
|
||||
python3 -m pip install --no-cache-dir \
|
||||
python3 -m pip install -U --no-cache-dir \
|
||||
librosa \
|
||||
"soundfile>=0.12.1" \
|
||||
scipy \
|
||||
@ -39,12 +50,7 @@ RUN source activate peft && \
|
||||
git+https://github.com/huggingface/accelerate \
|
||||
peft[test]@git+https://github.com/huggingface/peft
|
||||
|
||||
RUN python3 -m pip install --no-cache-dir bitsandbytes
|
||||
|
||||
# Stage 2
|
||||
FROM nvidia/cuda:11.3.1-devel-ubuntu20.04 AS build-image
|
||||
COPY --from=compile-image /opt/conda /opt/conda
|
||||
ENV PATH /opt/conda/bin:$PATH
|
||||
RUN pip freeze | grep transformers
|
||||
|
||||
# Install apt libs
|
||||
RUN apt-get update && \
|
||||
|
||||
@ -32,6 +32,12 @@
|
||||
sections:
|
||||
- local: developer_guides/custom_models
|
||||
title: Working with custom models
|
||||
- local: developer_guides/low_level_api
|
||||
title: PEFT low level API
|
||||
- local: developer_guides/contributing
|
||||
title: Contributing to PEFT
|
||||
- local: developer_guides/troubleshooting
|
||||
title: Troubleshooting
|
||||
|
||||
- title: 🤗 Accelerate integrations
|
||||
sections:
|
||||
|
||||
@ -28,10 +28,13 @@ Being similar to LoRA, IA3 carries many of the same advantages:
|
||||
* Performance of models fine-tuned using IA3 is comparable to the performance of fully fine-tuned models.
|
||||
* IA3 does not add any inference latency because adapter weights can be merged with the base model.
|
||||
|
||||
In principle, IA3 can be applied to any subset of weight matrices in a neural network to reduce the number of trainable
|
||||
parameters. Following the authors' implementation, IA3 weights are added to the key, value and feedforward layers
|
||||
of a Transformer model. Given the target layers for injecting IA3 parameters, the number of trainable parameters
|
||||
can be determined based on the size of the weight matrices.
|
||||
In principle, IA3 can be applied to any subset of weight matrices in a neural network to reduce the number of trainable
|
||||
parameters. Following the authors' implementation, IA3 weights are added to the key, value and feedforward layers
|
||||
of a Transformer model. To be specific, for transformer models, IA3 weights are added to the outputs of key and value layers, and to the input of the second feedforward layer
|
||||
in each transformer block.
|
||||
|
||||
Given the target layers for injecting IA3 parameters, the number of trainable parameters
|
||||
can be determined based on the size of the weight matrices.
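
To make that parameter count concrete, here is a small back-of-the-envelope sketch (not part of the original docs); the layer sizes are illustrative assumptions for a typical transformer block:

```python
# Rough sketch: counting IA3 parameters for one transformer block.
# The sizes below are assumptions, not values taken from any specific model.
d_model = 768  # output dimension of the key/value projections (assumed)
d_ff = 3072    # input dimension of the second feedforward layer (assumed)

# IA3 learns one rescaling vector per targeted layer:
#   l_k and l_v rescale the key/value outputs  -> d_model entries each
#   l_ff rescales the second feedforward input -> d_ff entries
ia3_params_per_block = 2 * d_model + d_ff
print(ia3_params_per_block)  # 4608 trainable parameters per block
```
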
## Common IA3 parameters in PEFT
|
||||
@ -43,10 +46,19 @@ As with other methods supported by PEFT, to fine-tune a model using IA3, you nee
|
||||
3. Wrap the base model with `get_peft_model()` to get a trainable `PeftModel`.
|
||||
4. Train the `PeftModel` as you normally would train the base model.
|
||||
|
||||
`IA3Config` allows you to control how IA3 is applied to the base model through the following parameters:
|
||||
`IA3Config` allows you to control how IA3 is applied to the base model through the following parameters:
|
||||
|
||||
- `target_modules`: The modules (for example, attention blocks) to apply the IA3 vectors.
|
||||
- `feedforward_modules`: The list of modules to be treated as feedforward layers in `target_modules`. While learned vectors are multiplied with
|
||||
the output activation for attention blocks, the vectors are multiplied with the input for classic feedforward layers.
|
||||
- `feedforward_modules`: The list of modules to be treated as feedforward layers in `target_modules`. While learned vectors are multiplied with
|
||||
the output activation for attention blocks, the vectors are multiplied with the input for classic feedforward layers. Note that `feedforward_modules` must be a subset of `target_modules`.
|
||||
- `modules_to_save`: List of modules apart from IA3 layers to be set as trainable and saved in the final checkpoint. These typically include model's custom head that is randomly initialized for the fine-tuning task.
|
||||
|
||||
## Example Usage
|
||||
|
||||
For the task of sequence classification, one can initialize the IA3 config for a Llama model as follows:
|
||||
|
||||
```py
|
||||
peft_config = IA3Config(
|
||||
task_type=TaskType.SEQ_CLS, target_modules=["k_proj", "v_proj", "down_proj"], feedforward_modules=["down_proj"]
|
||||
)
|
||||
```
|
||||
@ -77,6 +77,8 @@ As with other methods supported by PEFT, to fine-tune a model using LoRA, you ne
|
||||
- `modules_to_save`: List of modules apart from LoRA layers to be set as trainable and saved in the final checkpoint. These typically include model's custom head that is randomly initialized for the fine-tuning task.
|
||||
- `layers_to_transform`: List of layers to be transformed by LoRA. If not specified, all layers in `target_modules` are transformed.
|
||||
- `layers_pattern`: Pattern to match layer names in `target_modules`, if `layers_to_transform` is specified. By default `PeftModel` will look at common layer pattern (`layers`, `h`, `blocks`, etc.), use it for exotic and custom models.
|
||||
- `rank_pattern`: The mapping from layer names or regexp expression to ranks which are different from the default rank specified by `r`.
|
||||
- `alpha_pattern`: The mapping from layer names or regexp expression to alphas which are different from the default alpha specified by `lora_alpha`.
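
To illustrate the last two options, here is a minimal, hedged configuration sketch; the module names (`q_proj`, `v_proj`, `down_proj`) are assumptions that depend on the base model's architecture:

```python
from peft import LoraConfig

config = LoraConfig(
    r=8,            # default rank for all targeted modules
    lora_alpha=16,  # default alpha
    target_modules=["q_proj", "v_proj", "down_proj"],
    # Override the defaults for specific modules; keys may be layer names or regexes.
    rank_pattern={"down_proj": 32},
    alpha_pattern={"down_proj": 64},
)
```
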
## LoRA examples
|
||||
|
||||
|
||||
89 docs/source/developer_guides/contributing.mdx (Normal file)
@ -0,0 +1,89 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# Contributing to PEFT
|
||||
|
||||
We are happy to accept contributions to PEFT. If you plan to contribute, please read this document to make the process as smooth as possible.
|
||||
|
||||
## Installation
|
||||
|
||||
The installation instructions can be found [here](https://huggingface.co/docs/peft/install). If you want to provide code contributions to PEFT, you should choose the "source" installation method.
|
||||
|
||||
If you are new to creating a pull request, follow [these instructions from GitHub](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/proposing-changes-to-your-work-with-pull-requests/creating-a-pull-request).
|
||||
|
||||
## Running tests and code quality checks
|
||||
|
||||
Regardless of the type of contribution (unless it’s only about the docs), you should run tests and code quality checks before creating a PR to ensure that your contribution doesn’t break anything and follows the standards of the project.
|
||||
|
||||
We provide a Makefile to facilitate those steps. Run the code below for the unit test:
|
||||
|
||||
```sh
|
||||
make test
|
||||
```
|
||||
|
||||
Run one of the following to either check or check and fix code quality and style:
|
||||
|
||||
```sh
|
||||
make quality # just check
|
||||
make style # check and fix
|
||||
```
|
||||
|
||||
|
||||
Running all the tests can take a couple of minutes. Therefore, during development, it can be useful to run only those tests specific to your change:
|
||||
|
||||
```sh
|
||||
pytest tests/ -k <name-of-test>
|
||||
```
|
||||
|
||||
This should finish much quicker and allow faster iteration. Before creating the PR, however, please still run the whole test suite, as some changes can inadvertently break tests that at first glance are unrelated.
|
||||
|
||||
If your change is specific to a hardware setting (e.g. it requires CUDA), take a look at `tests/test_gpu_examples.py` and `tests/test_common_gpu.py` – maybe it makes sense to add a test there.
|
||||
|
||||
It can happen that while you’re working on your PR, the underlying code base changes due to other changes being merged. If that happens – especially when there is a merge conflict – please update your branch to be on the latest changes. This can be a merge or a rebase, whatever you prefer. We will squash and merge the PR once it’s ready.
|
||||
|
||||
## PR description
|
||||
|
||||
When opening the PR, please provide a nice description of the change you provide. If it relates to other issues or PRs, please reference them. Providing a good description will not only help the reviewers review your code better and faster, it can also later be used (as a basis) for the commit message, which helps with long term maintenance of the project.
|
||||
|
||||
If your code makes some non-trivial changes, it can also be a good idea to add comments to the code to explain those changes. For example, if you had to iterate on your implementation multiple times because the most obvious way didn’t work, it’s a good indication that a code comment is needed.
|
||||
|
||||
## Providing a bugfix
|
||||
|
||||
Please give a description of the circumstances that led to the bug. If there is an existing issue, please link to it (e.g. “Resolves #12345”).
|
||||
|
||||
Ideally, when a bugfix is provided, it should be accompanied by a test for this bug. The test should fail with the current code and pass with the bugfix. Add a comment to the test that references the issue or PR. Without such a test, it is difficult to prevent regressions in the future.
|
||||
|
||||
## Adding a new fine-tuning method
|
||||
|
||||
New parameter-efficient fine-tuning methods are developed all the time. If you would like to add a new, promising method to PEFT, please follow these steps.
|
||||
|
||||
**Requirements**
|
||||
|
||||
1. Please add a link to the source (usually a paper) of the method.
|
||||
2. Some evidence should be provided that there is general interest in using the method. We will not add new methods that are freshly published but without evidence that there is demand for it.
|
||||
3. Ideally, we want to not only add the implementation of the new method, but also examples (notebooks, scripts), documentation, and an extensive test suite that proves that the method works with a variety of tasks. However, this can be very daunting. Therefore, it is also acceptable to only provide the implementation and at least one working example. Documentation and tests can be added in follow up PRs.
|
||||
|
||||
**Steps**
|
||||
|
||||
Before you start to implement the new method, please open an issue on GitHub with your proposal. That way, the maintainers can give you some early feedback.
|
||||
|
||||
When implementing the method, it makes sense to use existing implementations as a guide. Moreover, when you structure your code, please take inspiration from the other PEFT methods. For example, if your method is similar to LoRA, it makes sense to structure your code similarly or even re-use some functions or classes where appropriate (but don’t overdo it, some code duplication is okay).
|
||||
|
||||
Once you have something that seems to be working, don’t hesitate to create a draft PR, even if it’s not in a mergeable state yet. The maintainers will be happy to give you feedback and guidance along the way.
|
||||
|
||||
## Adding other features
|
||||
|
||||
It is best if you first open an issue on GitHub with a proposal to add the new feature. That way, you can discuss with the maintainers if it makes sense to add the feature before spending too much time on implementing it.
|
||||
|
||||
New features should generally be accompanied by tests and documentation or examples. Without the latter, users will have a hard time discovering your cool new feature.
|
||||
|
||||
Changes to the code should be implemented in a backward-compatible way. For example, existing code should continue to work the same way after the feature is merged.
|
||||
@ -16,7 +16,7 @@ Some fine-tuning techniques, such as prompt tuning, are specific to language mod
|
||||
assumed a 🤗 Transformers model is being used. However, other fine-tuning techniques - like
|
||||
[LoRA](./conceptual_guides/lora) - are not restricted to specific model types.
|
||||
|
||||
In this guide, we will see how LoRA can be applied to a multilayer perception and a computer vision model from the [timm](https://huggingface.co/docs/timm/index) library.
|
||||
In this guide, we will see how LoRA can be applied to a multilayer perceptron and a computer vision model from the [timm](https://huggingface.co/docs/timm/index) library.
|
||||
|
||||
## Multilayer perceptron
|
||||
|
||||
@ -72,7 +72,7 @@ This should print:
|
||||
('seq.5', torch.nn.modules.activation.LogSoftmax)]
|
||||
```
|
||||
|
||||
Let's say we want to apply LoRA to the input layer and to the hidden layer, those are `'seq.0'` and `'seq.1'`. Moreover,
|
||||
Let's say we want to apply LoRA to the input layer and to the hidden layer, those are `'seq.0'` and `'seq.2'`. Moreover,
|
||||
let's assume we want to update the output layer without LoRA, that would be `'seq.4'`. The corresponding config would
|
||||
be:
|
||||
|
||||
@ -91,7 +91,7 @@ With that, we can create our PEFT model and check the fraction of parameters tra
|
||||
from peft import get_peft_model
|
||||
|
||||
model = MLP()
|
||||
peft_model = get_peft_model(module, config)
|
||||
peft_model = get_peft_model(model, config)
|
||||
peft_model.print_trainable_parameters()
|
||||
# prints trainable params: 56,164 || all params: 4,100,164 || trainable%: 1.369798866581922
|
||||
```
|
||||
|
||||
103 docs/source/developer_guides/low_level_api.mdx (Normal file)
@ -0,0 +1,103 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# PEFT as a utility library
|
||||
|
||||
Let's cover in this section how you can leverage PEFT's low level API to inject trainable adapters into any `torch` module.
|
||||
The development of this API has been motivated by the need for super users to not rely on the modeling classes exposed in the PEFT library, while still being able to use adapter methods such as LoRA, IA3 and AdaLoRA.
|
||||
|
||||
## Supported tuner types
|
||||
|
||||
Currently, the supported adapter types are the 'injectable' adapters, meaning adapters where an in-place modification of the model is sufficient to correctly perform the fine-tuning. As such, only [LoRA](./conceptual_guides/lora), AdaLoRA and [IA3](./conceptual_guides/ia3) are currently supported in this API.
|
||||
|
||||
## `inject_adapter_in_model` method
|
||||
|
||||
To perform the adapter injection, simply use the `inject_adapter_in_model` method, which takes 3 arguments: the PEFT config, the model itself, and an optional adapter name. You can also attach multiple adapters to the model by calling `inject_adapter_in_model` multiple times with different adapter names.

Below is a basic example of how to inject LoRA adapters into the submodule `linear` of the module `DummyModel`.
|
||||
```python
|
||||
import torch
|
||||
from peft import inject_adapter_in_model, LoraConfig
|
||||
|
||||
|
||||
class DummyModel(torch.nn.Module):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.embedding = torch.nn.Embedding(10, 10)
|
||||
self.linear = torch.nn.Linear(10, 10)
|
||||
self.lm_head = torch.nn.Linear(10, 10)
|
||||
|
||||
def forward(self, input_ids):
|
||||
x = self.embedding(input_ids)
|
||||
x = self.linear(x)
|
||||
x = self.lm_head(x)
|
||||
return x
|
||||
|
||||
|
||||
lora_config = LoraConfig(
|
||||
lora_alpha=16,
|
||||
lora_dropout=0.1,
|
||||
r=64,
|
||||
bias="none",
|
||||
target_modules=["linear"],
|
||||
)
|
||||
|
||||
model = DummyModel()
|
||||
model = inject_adapter_in_model(lora_config, model)
|
||||
|
||||
dummy_inputs = torch.LongTensor([[0, 1, 2, 3, 4, 5, 6, 7]])
|
||||
dummy_outputs = model(dummy_inputs)
|
||||
```
|
||||
|
||||
If you print the model, you will notice that the adapters have been correctly injected into the model
|
||||
|
||||
```bash
|
||||
DummyModel(
|
||||
(embedding): Embedding(10, 10)
|
||||
(linear): Linear(
|
||||
in_features=10, out_features=10, bias=True
|
||||
(lora_dropout): ModuleDict(
|
||||
(default): Dropout(p=0.1, inplace=False)
|
||||
)
|
||||
(lora_A): ModuleDict(
|
||||
(default): Linear(in_features=10, out_features=64, bias=False)
|
||||
)
|
||||
(lora_B): ModuleDict(
|
||||
(default): Linear(in_features=64, out_features=10, bias=False)
|
||||
)
|
||||
(lora_embedding_A): ParameterDict()
|
||||
(lora_embedding_B): ParameterDict()
|
||||
)
|
||||
(lm_head): Linear(in_features=10, out_features=10, bias=True)
|
||||
)
|
||||
```
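
As mentioned above, you can call `inject_adapter_in_model` again with a different adapter name to attach a second adapter; a short sketch continuing the example (the name `"other"` and its hyperparameters are arbitrary):

```python
# Sketch: attach a second LoRA adapter next to the "default" one created above.
other_lora_config = LoraConfig(
    lora_alpha=8,
    lora_dropout=0.1,
    r=16,
    bias="none",
    target_modules=["linear"],
)

model = inject_adapter_in_model(other_lora_config, model, adapter_name="other")
```
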
Note that it is up to the user to properly take care of saving the adapters (in case they want to save the adapters only), as `model.state_dict()` will return the full state dict of the model.
In case you want to extract the adapters' state dict, you can use the `get_peft_model_state_dict` method:
|
||||
|
||||
```python
|
||||
from peft import get_peft_model_state_dict
|
||||
|
||||
peft_state_dict = get_peft_model_state_dict(model)
|
||||
print(peft_state_dict)
|
||||
```
|
||||
|
||||
## Pros and cons
|
||||
|
||||
When should you use this API, and when shouldn't you? Let's discuss the pros and cons in this section.
|
||||
|
||||
Pros:
|
||||
- The model gets modified in-place, meaning the model will preserve all its original attributes and methods
|
||||
- Works for any torch module, and any modality (vision, text, multi-modal)
|
||||
|
||||
Cons:
|
||||
- You need to manually write the Hugging Face `from_pretrained` and `save_pretrained` utility methods if you want to easily save / load adapters from the Hugging Face Hub (a small sketch follows this list).
- You cannot use any of the utility methods provided by `PeftModel`, such as disabling adapters, merging adapters, etc.
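
Regarding the first con, here is a minimal sketch of what manual saving could look like, reusing `get_peft_model_state_dict` from above; the file name is arbitrary, and `set_peft_model_state_dict` is assumed as the counterpart for loading, not an official recipe:

```python
# Hedged sketch: persist only the adapter weights.
import torch
from peft import get_peft_model_state_dict, set_peft_model_state_dict

peft_state_dict = get_peft_model_state_dict(model)
torch.save(peft_state_dict, "adapter_weights.pt")  # arbitrary file name

# Later, after injecting the same adapter config into a fresh model instance,
# the weights could be restored with the matching setter:
set_peft_model_state_dict(model, torch.load("adapter_weights.pt"))
```
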
79 docs/source/developer_guides/troubleshooting.mdx (Normal file)
@ -0,0 +1,79 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# Troubleshooting
|
||||
|
||||
If you encounter any issue when using PEFT, please check the following list of common issues and their solutions.
|
||||
|
||||
## Examples don't work
|
||||
|
||||
Examples often rely on the most recent package versions, so please ensure they're up-to-date. In particular, check the version of the following packages:
|
||||
|
||||
- `peft`
|
||||
- `transformers`
|
||||
- `accelerate`
|
||||
- `torch`
|
||||
|
||||
In general, you can update the package version by running this command inside your Python environment:
|
||||
|
||||
```bash
|
||||
python -m pip install -U <package_name>
|
||||
```
|
||||
|
||||
Installing PEFT from source is useful for keeping up with the latest developments:
|
||||
|
||||
```bash
|
||||
python -m pip install git+https://github.com/huggingface/peft
|
||||
```
|
||||
|
||||
## Bad results from a loaded PEFT model
|
||||
|
||||
There can be several reasons for getting a poor result from a loaded PEFT model, which are listed below. If you're still unable to troubleshoot the problem, see if anyone else had a similar [issue](https://github.com/huggingface/peft/issues) on GitHub, and if you can't find any, open a new issue.
|
||||
|
||||
When opening an issue, it helps a lot if you provide a minimal code example that reproduces the issue. Also, please report if the loaded model performs at the same level as the model did before fine-tuning, if it performs at a random level, or if it is only slightly worse than expected. This information helps us identify the problem more quickly.
|
||||
|
||||
### Random deviations
|
||||
|
||||
If your model outputs are not exactly the same as previous runs, there could be an issue with random elements. For example:
|
||||
|
||||
1. please ensure the model is in `.eval()` mode, which is important if, for instance, the model uses dropout
|
||||
2. if you use [`~transformers.GenerationMixin.generate`] on a language model, there could be random sampling, so obtaining the same result requires setting a random seed
|
||||
3. if you used quantization and merged the weights, small deviations are expected due to rounding errors
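
For the first two points above, a small sketch of a reproducible evaluation setup (`set_seed` comes from 🤗 Transformers; `inputs` is assumed to be an already tokenized batch and the generation arguments are illustrative):

```python
import torch
from transformers import set_seed

set_seed(0)   # fix the seed used for sampling
model.eval()  # disable dropout and other train-time randomness

with torch.no_grad():
    output = model.generate(**inputs, do_sample=True, max_new_tokens=20)
```
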
### Incorrectly loaded model
|
||||
|
||||
Please ensure that you load the model correctly. A common error is trying to load a _trained_ model with `get_peft_model`, which is incorrect. Instead, the loading code should look like this:
|
||||
|
||||
```python
|
||||
from peft import PeftModel, PeftConfig
|
||||
|
||||
base_model = ... # to load the base model, use the same code as when you trained it
|
||||
config = PeftConfig.from_pretrained(peft_model_id)
|
||||
peft_model = PeftModel.from_pretrained(base_model, peft_model_id)
|
||||
```
|
||||
|
||||
### Randomly initialized layers
|
||||
|
||||
For some tasks, it is important to correctly configure `modules_to_save` in the config to account for randomly initialized layers.
|
||||
|
||||
As an example, this is necessary if you use LoRA to fine-tune a language model for sequence classification because 🤗 Transformers adds a randomly initialized classification head on top of the model. If you do not add this layer to `modules_to_save`, the classification head won't be saved. The next time you load the model, you'll get a _different_ randomly initialized classification head, resulting in completely different results.
|
||||
|
||||
In PEFT, we try to correctly guess the `modules_to_save` if you provide the `task_type` argument in the config. This should work for transformers models that follow the standard naming scheme. It is always a good idea to double check though because we can't guarantee all models follow the naming scheme.
|
||||
|
||||
When you load a transformers model that has randomly initialized layers, you should see a warning along the lines of:
|
||||
|
||||
```
|
||||
Some weights of <MODEL> were not initialized from the model checkpoint at <ID> and are newly initialized: [<LAYER_NAMES>].
|
||||
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
|
||||
```
|
||||
|
||||
The mentioned layers should be added to `modules_to_save` in the config to avoid the described problem.
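
As a concrete illustration, here is a hedged sketch of a LoRA config for sequence classification; the head name (`"classifier"` vs. `"score"`, etc.) depends on the base model and is an assumption here:

```python
from peft import LoraConfig, TaskType

config = LoraConfig(
    task_type=TaskType.SEQ_CLS,  # lets PEFT try to guess modules_to_save
    r=8,
    lora_alpha=16,
    # Listing the randomly initialized head explicitly is the safe option;
    # the attribute name depends on the model architecture.
    modules_to_save=["classifier"],
)
```
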
@ -83,6 +83,7 @@ accelerate launch train_dreambooth.py \
|
||||
--output_dir=$OUTPUT_DIR \
|
||||
--train_text_encoder \
|
||||
--with_prior_preservation --prior_loss_weight=1.0 \
|
||||
--num_dataloader_workers=1 \
|
||||
--instance_prompt="a photo of sks dog" \
|
||||
--class_prompt="a photo of dog" \
|
||||
--resolution=512 \
|
||||
@ -101,6 +102,8 @@ accelerate launch train_dreambooth.py \
|
||||
--max_train_steps=800
|
||||
```
|
||||
|
||||
If you are running this script on Windows, you may need to set `--num_dataloader_workers` to 0.
|
||||
|
||||
## Inference with a single adapter
|
||||
|
||||
To run inference with the fine-tuned model, first specify the base model with which the fine-tuned LoRA weights will be combined:
|
||||
@ -171,7 +174,7 @@ image.save("DESTINATION_PATH_FOR_THE_IMAGE")
|
||||
## Multi-adapter inference
|
||||
|
||||
With PEFT you can combine multiple adapters for inference. In the previous example you have fine-tuned Stable Diffusion on
|
||||
some dog images. The pipeline created based on these weights got a name - `adapter_name="dog`. Now, suppose you also fine-tuned
|
||||
some dog images. The pipeline created based on these weights got a name - `adapter_name="dog"`. Now, suppose you also fine-tuned
|
||||
this base model on images of a crochet toy. Let's see how we can use both adapters.
|
||||
|
||||
First, you'll need to perform all the steps as in the single adapter inference example:
|
||||
|
||||
@ -197,7 +197,7 @@ Once the model has been uploaded to the Hub, anyone can easily use it for infere
|
||||
```py
|
||||
import torch
|
||||
from peft import PeftModel, PeftConfig
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer
|
||||
from transformers import AutoModelForSequenceClassification, AutoTokenizer
|
||||
|
||||
peft_model_id = "smangrul/roberta-large-peft-p-tuning"
|
||||
config = PeftConfig.from_pretrained(peft_model_id)
|
||||
|
||||
@ -24,9 +24,7 @@ Install all the necessary required libraries with:
|
||||
pip install -r requirements.txt
|
||||
```
|
||||
|
||||
## Setup
|
||||
|
||||
Let's start by importing all the necessary libraries you'll need:
|
||||
Next, import all the necessary libraries:
|
||||
|
||||
- 🤗 Transformers for loading the `intfloat/e5-large-v2` model and tokenizer
|
||||
- 🤗 Accelerate for the training loop
|
||||
@ -292,4 +290,4 @@ cosine_sim_score=0.88 product='Mastering Machine Learning with scikit-learn - Se
|
||||
|
||||
Books on deep learning and machine learning are retrieved even though `machine learning` wasn't included in the query. This means the model has learned that these books are semantically relevant to the query based on the purchase behavior of customers on Amazon.
|
||||
|
||||
The next steps would ideally involve using ONNX/TensorRT to optimize the model and using a Triton server to host it. Check out 🤗 [Optimum](https://huggingface.co/docs/optimum/index) for related optimizations for efficient serving!
|
||||
The next steps would ideally involve using ONNX/TensorRT to optimize the model and using a Triton server to host it. Check out 🤗 [Optimum](https://huggingface.co/docs/optimum/index) for related optimizations for efficient serving!
|
||||
|
||||
@ -80,14 +80,14 @@ num_labels = len(id2label)
|
||||
## Prepare datasets for training and evaluation
|
||||
|
||||
Next, load the SegFormer image processor to prepare the images and annotations for the model. This dataset uses the
|
||||
zero-index as the background class, so make sure to set `reduce_labels=True` to subtract one from all labels since the
|
||||
zero-index as the background class, so make sure to set `do_reduce_labels=True` to subtract one from all labels since the
|
||||
background class is not among the 150 classes.
|
||||
|
||||
```python
|
||||
from transformers import AutoImageProcessor
|
||||
|
||||
checkpoint = "nvidia/mit-b0"
|
||||
image_processor = AutoImageProcessor.from_pretrained(checkpoint, reduce_labels=True)
|
||||
image_processor = AutoImageProcessor.from_pretrained(checkpoint, do_reduce_labels=True)
|
||||
```
|
||||
|
||||
Add a function to apply data augmentation to the images, so that the model is more robust against overfitting. Here we use the
|
||||
@ -180,7 +180,7 @@ def compute_metrics(eval_pred):
|
||||
references=labels,
|
||||
num_labels=len(id2label),
|
||||
ignore_index=0,
|
||||
reduce_labels=image_processor.reduce_labels,
|
||||
reduce_labels=image_processor.do_reduce_labels,
|
||||
)
|
||||
|
||||
per_category_accuracy = metrics.pop("per_category_accuracy").tolist()
|
||||
|
||||
@ -124,10 +124,10 @@
|
||||
" inputs = [f\"{text_column} : {x} Label : \" for x in examples[text_column]]\n",
|
||||
" targets = [str(x) for x in examples[label_column]]\n",
|
||||
" model_inputs = tokenizer(inputs)\n",
|
||||
" labels = tokenizer(targets)\n",
|
||||
" labels = tokenizer(targets, add_special_tokens=False) # don't add bos token because we concatenate with inputs\n",
|
||||
" for i in range(batch_size):\n",
|
||||
" sample_input_ids = model_inputs[\"input_ids\"][i]\n",
|
||||
" label_input_ids = labels[\"input_ids\"][i] + [tokenizer.pad_token_id]\n",
|
||||
" label_input_ids = labels[\"input_ids\"][i] + [tokenizer.eos_token_id]\n",
|
||||
" # print(i, sample_input_ids, label_input_ids)\n",
|
||||
" model_inputs[\"input_ids\"][i] = sample_input_ids + label_input_ids\n",
|
||||
" labels[\"input_ids\"][i] = [-100] * len(sample_input_ids) + label_input_ids\n",
|
||||
|
||||
@ -136,10 +136,10 @@ def main():
|
||||
inputs = [f"{text_column} : {x} Label : " for x in examples[text_column]]
|
||||
targets = [str(x) for x in examples[label_column]]
|
||||
model_inputs = tokenizer(inputs)
|
||||
labels = tokenizer(targets)
|
||||
labels = tokenizer(targets, add_special_tokens=False) # don't add bos token because we concatenate with inputs
|
||||
for i in range(batch_size):
|
||||
sample_input_ids = model_inputs["input_ids"][i]
|
||||
label_input_ids = labels["input_ids"][i] + [tokenizer.pad_token_id]
|
||||
label_input_ids = labels["input_ids"][i] + [tokenizer.eos_token_id]
|
||||
model_inputs["input_ids"][i] = sample_input_ids + label_input_ids
|
||||
labels["input_ids"][i] = [-100] * len(sample_input_ids) + label_input_ids
|
||||
model_inputs["attention_mask"][i] = [1] * len(model_inputs["input_ids"][i])
|
||||
|
||||
@ -173,10 +173,10 @@
|
||||
" inputs = [f\"{text_column} : {x} Label : \" for x in examples[text_column]]\n",
|
||||
" targets = [str(x) for x in examples[label_column]]\n",
|
||||
" model_inputs = tokenizer(inputs)\n",
|
||||
" labels = tokenizer(targets)\n",
|
||||
" labels = tokenizer(targets, add_special_tokens=False) # don't add bos token because we concatenate with inputs\n",
|
||||
" for i in range(batch_size):\n",
|
||||
" sample_input_ids = model_inputs[\"input_ids\"][i]\n",
|
||||
" label_input_ids = labels[\"input_ids\"][i] + [tokenizer.pad_token_id]\n",
|
||||
" label_input_ids = labels[\"input_ids\"][i] + [tokenizer.eos_token_id]\n",
|
||||
" # print(i, sample_input_ids, label_input_ids)\n",
|
||||
" model_inputs[\"input_ids\"][i] = sample_input_ids + label_input_ids\n",
|
||||
" labels[\"input_ids\"][i] = [-100] * len(sample_input_ids) + label_input_ids\n",
|
||||
|
||||
@ -83,10 +83,10 @@
|
||||
" inputs = [f\"{text_column} : {x} Label : \" for x in examples[text_column]]\n",
|
||||
" targets = [str(x) for x in examples[label_column]]\n",
|
||||
" model_inputs = tokenizer(inputs)\n",
|
||||
" labels = tokenizer(targets)\n",
|
||||
" labels = tokenizer(targets, add_special_tokens=False) # don't add bos token because we concatenate with inputs\n",
|
||||
" for i in range(batch_size):\n",
|
||||
" sample_input_ids = model_inputs[\"input_ids\"][i]\n",
|
||||
" label_input_ids = labels[\"input_ids\"][i] + [tokenizer.pad_token_id]\n",
|
||||
" label_input_ids = labels[\"input_ids\"][i] + [tokenizer.eos_token_id]\n",
|
||||
" # print(i, sample_input_ids, label_input_ids)\n",
|
||||
" model_inputs[\"input_ids\"][i] = sample_input_ids + label_input_ids\n",
|
||||
" labels[\"input_ids\"][i] = [-100] * len(sample_input_ids) + label_input_ids\n",
|
||||
|
||||
408 examples/conditional_generation/multitask_prompt_tuning.ipynb (Normal file)
@ -0,0 +1,408 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "58ff91ca-ce92-43d0-ae8b-4e9e89e193f6",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from datasets import load_dataset\n",
|
||||
"from transformers import set_seed, AutoModelForSeq2SeqLM, AutoTokenizer\n",
|
||||
"from peft import get_peft_model, MultitaskPromptTuningConfig, TaskType, MultitaskPromptTuningInit\n",
|
||||
"\n",
|
||||
"set_seed(42)\n",
|
||||
"\n",
|
||||
"model_name = \"google/flan-t5-base\"\n",
|
||||
"\n",
|
||||
"peft_config = MultitaskPromptTuningConfig(\n",
|
||||
" tokenizer_name_or_path=model_name,\n",
|
||||
" num_tasks=2,\n",
|
||||
" task_type=TaskType.SEQ_2_SEQ_LM,\n",
|
||||
" prompt_tuning_init=MultitaskPromptTuningInit.TEXT,\n",
|
||||
" num_virtual_tokens=50,\n",
|
||||
" num_transformer_submodules=1,\n",
|
||||
" prompt_tuning_init_text=\"classify the following into either positive or negative, or entailment, neutral or contradiction:\",\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"tokenizer = AutoTokenizer.from_pretrained(model_name)\n",
|
||||
"model = AutoModelForSeq2SeqLM.from_pretrained(model_name)\n",
|
||||
"model = get_peft_model(model, peft_config)\n",
|
||||
"\n",
|
||||
"model = model.cuda()\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"def send_to_device(batch):\n",
|
||||
" for i in batch:\n",
|
||||
" batch[i] = batch[i].cuda()\n",
|
||||
" return batch"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "eb112bc1-ffaf-49fa-a216-0d601ec304ee",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def get_sst2(split: str):\n",
|
||||
" examples = load_dataset(\"sst2\")[split]\n",
|
||||
" result_examples = []\n",
|
||||
" for example in examples:\n",
|
||||
" result_examples.append({})\n",
|
||||
"\n",
|
||||
" result_examples[-1][\"input\"] = example[\"sentence\"].strip() + \"</s>\"\n",
|
||||
" result_examples[-1][\"output\"] = (\n",
|
||||
" f\"positive{tokenizer.eos_token}\" if example[\"label\"] == 1 else f\"negative{tokenizer.eos_token}\"\n",
|
||||
" )\n",
|
||||
" result_examples[-1][\"task_id\"] = 0\n",
|
||||
"\n",
|
||||
" return result_examples\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"def get_mnli(split: str):\n",
|
||||
" examples = load_dataset(\"multi_nli\")[split]\n",
|
||||
" result_examples = []\n",
|
||||
" for example in examples:\n",
|
||||
" result_examples.append({})\n",
|
||||
"\n",
|
||||
" result_examples[-1][\"input\"] = example[\"premise\"].strip() + \" \" + example[\"hypothesis\"].strip() + \"</s>\"\n",
|
||||
"\n",
|
||||
" if example[\"label\"] == 0:\n",
|
||||
" result_examples[-1][\"output\"] = f\"entailment{tokenizer.eos_token}\"\n",
|
||||
" elif example[\"label\"] == 1:\n",
|
||||
" result_examples[-1][\"output\"] = f\"neutral{tokenizer.eos_token}\"\n",
|
||||
" else:\n",
|
||||
" result_examples[-1][\"output\"] = f\"contradiction{tokenizer.eos_token}\"\n",
|
||||
"\n",
|
||||
" result_examples[-1][\"task_id\"] = 1\n",
|
||||
"\n",
|
||||
" return result_examples"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "e5a16ec4-8fef-4ba9-95b6-a661eb51e50c",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from typing import Tuple\n",
|
||||
"from torch.utils.data import Dataset, DataLoader\n",
|
||||
"import torch\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"class MyDataset(Dataset):\n",
|
||||
" def __init__(self, split: str, mode: str = \"source\") -> None:\n",
|
||||
" super().__init__()\n",
|
||||
"\n",
|
||||
" if split == \"train\":\n",
|
||||
" if mode == \"source\":\n",
|
||||
" self.examples = get_sst2(split) + get_mnli(split)\n",
|
||||
" elif mode == \"target\":\n",
|
||||
" self.examples = get_sst2(split)\n",
|
||||
" if split == \"val\":\n",
|
||||
" self.examples = get_sst2(\"validation\")\n",
|
||||
" if split == \"test\":\n",
|
||||
" self.examples = get_sst2(\"validation\")\n",
|
||||
"\n",
|
||||
" def __getitem__(self, index) -> dict:\n",
|
||||
" return self.examples[index]\n",
|
||||
"\n",
|
||||
" def __len__(self) -> int:\n",
|
||||
" return len(self.examples)\n",
|
||||
"\n",
|
||||
" def __getitem__(self, index) -> dict:\n",
|
||||
" return self.examples[index]\n",
|
||||
"\n",
|
||||
" def __len__(self) -> int:\n",
|
||||
" return len(self.examples)\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"def collate_fn(batch: dict) -> Tuple[torch.Tensor, torch.Tensor]:\n",
|
||||
" input = [i[\"input\"] for i in batch]\n",
|
||||
" input = tokenizer(input, add_special_tokens=False, return_tensors=\"pt\", padding=True)\n",
|
||||
"\n",
|
||||
" output = [i[\"output\"] for i in batch]\n",
|
||||
" output = tokenizer(output, add_special_tokens=False, return_tensors=\"pt\", padding=True).input_ids\n",
|
||||
" output[output == tokenizer.pad_token_id] = -100\n",
|
||||
"\n",
|
||||
" task_ids = [i[\"task_id\"] for i in batch]\n",
|
||||
" task_ids = torch.tensor(task_ids)\n",
|
||||
"\n",
|
||||
" return {\n",
|
||||
" \"input_ids\": input.input_ids,\n",
|
||||
" \"attention_mask\": input.attention_mask,\n",
|
||||
" \"labels\": output,\n",
|
||||
" \"task_ids\": task_ids,\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"train = DataLoader(MyDataset(\"train\"), shuffle=True, batch_size=8, collate_fn=collate_fn)\n",
|
||||
"val = DataLoader(MyDataset(\"val\"), shuffle=False, batch_size=8, collate_fn=collate_fn)\n",
|
||||
"test = DataLoader(MyDataset(\"test\"), shuffle=False, batch_size=8, collate_fn=collate_fn)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "fe0aec7b-f61e-4b00-a90e-c1201dc1f84c",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## source training"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "cceecc94-f43a-4f62-8d45-926f2f02f36d",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from torch.optim.adamw import AdamW\n",
|
||||
"from transformers import get_cosine_schedule_with_warmup\n",
|
||||
"from tqdm import tqdm\n",
|
||||
"from sklearn.metrics import f1_score"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "eae5516b-73ab-44a8-a083-4e8de6127f30",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"POSITIVE_TOKEN_ID = tokenizer(\" positive\", add_special_tokens=False)[\"input_ids\"][0]\n",
|
||||
"NEGATIVE_TOKEN_ID = tokenizer(\" negative\", add_special_tokens=False)[\"input_ids\"][0]\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"def classify(batch):\n",
|
||||
" batch = send_to_device(batch)\n",
|
||||
" # we pass labels here since we need to generate and peft doesn't support generation yet.\n",
|
||||
" # No clue how to get around this\n",
|
||||
" scores = model(**batch).logits\n",
|
||||
" preds = []\n",
|
||||
" for i in range(scores.shape[0]):\n",
|
||||
" if scores[i, 0, POSITIVE_TOKEN_ID] > scores[i, 0, NEGATIVE_TOKEN_ID]:\n",
|
||||
" preds.append(POSITIVE_TOKEN_ID)\n",
|
||||
" else:\n",
|
||||
" preds.append(NEGATIVE_TOKEN_ID)\n",
|
||||
" return preds\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"@torch.inference_mode()\n",
|
||||
"def evaluate(model, data):\n",
|
||||
" loss = 0\n",
|
||||
" preds = []\n",
|
||||
" golds = []\n",
|
||||
"\n",
|
||||
" for batch in tqdm(data):\n",
|
||||
" batch = send_to_device(batch)\n",
|
||||
" loss += model(**batch).loss\n",
|
||||
" golds.extend(batch[\"labels\"][:, 0].tolist())\n",
|
||||
" preds.extend(classify(batch))\n",
|
||||
"\n",
|
||||
" return loss / len(val), f1_score(golds, preds, pos_label=POSITIVE_TOKEN_ID)\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"optimizer = AdamW(model.parameters(), lr=1e-4)\n",
|
||||
"scheduler = get_cosine_schedule_with_warmup(optimizer, 200, len(train))\n",
|
||||
"\n",
|
||||
"n = 1000\n",
|
||||
"step = 0\n",
|
||||
"train_ = tqdm(train)\n",
|
||||
"\n",
|
||||
"val_loss, f1 = evaluate(model, val)\n",
|
||||
"print(\n",
|
||||
" f\"\"\"\n",
|
||||
"before source training\n",
|
||||
"val loss = {val_loss}\n",
|
||||
"f1 = {f1}\"\"\"\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"for batch in train_:\n",
|
||||
" if step % n == 0:\n",
|
||||
" val_loss, f1 = evaluate(model, val)\n",
|
||||
" print(\n",
|
||||
" f\"\"\"\n",
|
||||
"step = {step}\n",
|
||||
"val loss = {val_loss}\n",
|
||||
"f1 = {f1}\"\"\"\n",
|
||||
" )\n",
|
||||
" model.save_pretrained(f\"checkpoints_source/{step}\")\n",
|
||||
"\n",
|
||||
" step += 1\n",
|
||||
" batch = send_to_device(batch)\n",
|
||||
" loss = model(**batch).loss\n",
|
||||
" loss.backward()\n",
|
||||
" optimizer.step()\n",
|
||||
" scheduler.step()\n",
|
||||
" train_.set_postfix(train_loss=loss)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "74168ef3-66f3-41a7-a40b-7840b103fbf9",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## target training"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "b09fd456-163e-4dc1-b24d-f2d0d349036c",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"train = DataLoader(MyDataset(\"train\", \"target\"), shuffle=True, batch_size=8, collate_fn=collate_fn)\n",
|
||||
"val = DataLoader(MyDataset(\"val\", \"target\"), shuffle=False, batch_size=8, collate_fn=collate_fn)\n",
|
||||
"test = DataLoader(MyDataset(\"test\", \"target\"), shuffle=False, batch_size=8, collate_fn=collate_fn)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "4a539944-f16c-4c3f-bb4a-7b5d9a6042e2",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"#### create a fresh model"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "5520d904-aa6c-4654-9335-ed4e7d76cba2",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"peft_config = MultitaskPromptTuningConfig(\n",
|
||||
" tokenizer_name_or_path=model_name,\n",
|
||||
" num_tasks=1,\n",
|
||||
" task_type=TaskType.SEQ_2_SEQ_LM,\n",
|
||||
" prompt_tuning_init=MultitaskPromptTuningInit.EXACT_SOURCE_TASK,\n",
|
||||
" prompt_tuning_init_state_dict_path=\"checkpoints_source/50000/adapter_model.bin\",\n",
|
||||
" num_virtual_tokens=50,\n",
|
||||
" num_transformer_submodules=1,\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"tokenizer = AutoTokenizer.from_pretrained(model_name)\n",
|
||||
"model = AutoModelForSeq2SeqLM.from_pretrained(model_name)\n",
|
||||
"model = get_peft_model(model, peft_config)\n",
|
||||
"\n",
|
||||
"model = model.cuda()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "dfa39c2d-d1c5-4ed4-90f8-26e8e324371c",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"optimizer = AdamW(model.parameters(), lr=1e-4)\n",
|
||||
"scheduler = get_cosine_schedule_with_warmup(optimizer, 200, len(train))\n",
|
||||
"\n",
|
||||
"n = 1000\n",
|
||||
"step = 0\n",
|
||||
"train_ = tqdm(train)\n",
|
||||
"\n",
|
||||
"val_loss, f1 = evaluate(model, val)\n",
|
||||
"print(\n",
|
||||
" f\"\"\"\n",
|
||||
"before target training\n",
|
||||
"val loss = {val_loss}\n",
|
||||
"f1 = {f1}\"\"\"\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"for batch in train_:\n",
|
||||
" if step % n == 0:\n",
|
||||
" val_loss, f1 = evaluate(model, val)\n",
|
||||
" print(\n",
|
||||
" f\"\"\"\n",
|
||||
"step = {step}\n",
|
||||
"val loss = {val_loss}\n",
|
||||
"f1 = {f1}\"\"\"\n",
|
||||
" )\n",
|
||||
" model.save_pretrained(f\"checkpoints_target/{step}\")\n",
|
||||
"\n",
|
||||
" step += 1\n",
|
||||
" batch = send_to_device(batch)\n",
|
||||
" loss = model(**batch).loss\n",
|
||||
" loss.backward()\n",
|
||||
" optimizer.step()\n",
|
||||
" scheduler.step()\n",
|
||||
" train_.set_postfix(train_loss=loss)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "b6a6eeda-1e09-49a6-8845-cd96c8573145",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# load last checkpoint for now\n",
|
||||
"from peft import set_peft_model_state_dict\n",
|
||||
"\n",
|
||||
"sd_6000 = torch.load(\"checkpoints_target/6000/adapter_model.bin\")\n",
|
||||
"set_peft_model_state_dict(model, sd_6000)\n",
|
||||
"\n",
|
||||
"# evaluate val\n",
|
||||
"val_loss, f1 = evaluate(model, val)\n",
|
||||
"print(\n",
|
||||
" f\"\"\"\n",
|
||||
"final\n",
|
||||
"val loss = {val_loss}\n",
|
||||
"f1 = {f1}\"\"\"\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"# evaluate test\n",
|
||||
"test_loss, f1 = evaluate(model, test)\n",
|
||||
"print(\n",
|
||||
" f\"\"\"\n",
|
||||
"final\n",
|
||||
"test loss = {test_loss}\n",
|
||||
"f1 = {f1}\"\"\"\n",
|
||||
")"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.9.13"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
@ -207,9 +207,13 @@ def get_loss(cosine_score, labels):
|
||||
|
||||
def main():
|
||||
args = parse_args()
|
||||
accelerator = (
|
||||
Accelerator(log_with=args.report_to, project_dir=args.output_dir) if args.with_tracking else Accelerator()
|
||||
)
|
||||
|
||||
accelerator_kwargs = {"gradient_accumulation_steps": args.gradient_accumulation_steps}
|
||||
if args.with_tracking:
|
||||
accelerator_kwargs["log_with"] = args.report_to
|
||||
accelerator_kwargs["project_dir"] = args.output_dir
|
||||
accelerator = Accelerator(**accelerator_kwargs)
|
||||
|
||||
# Make one log on every process with the configuration for debugging.
|
||||
logging.basicConfig(
|
||||
format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
|
||||
@ -402,7 +406,7 @@ def main():
|
||||
resume_step = int(training_difference.replace("step_", "")) * args.gradient_accumulation_steps
|
||||
starting_epoch = resume_step // len(train_dataloader)
|
||||
resume_step -= starting_epoch * len(train_dataloader)
|
||||
completed_steps = resume_step // args.gradient_accumulation_stepp
|
||||
completed_steps = resume_step // args.gradient_accumulation_steps
|
||||
|
||||
# update the progress_bar if load from checkpoint
|
||||
progress_bar.update(completed_steps)
|
||||
|
||||
@ -422,16 +422,11 @@ def evaluation_loop(model, eval_dataloader, processor, normalizer, metric, force
|
||||
def main():
|
||||
args = parse_args()
|
||||
|
||||
# initialize accelerator
|
||||
accelerator = (
|
||||
Accelerator(
|
||||
log_with=args.report_to,
|
||||
project_dir=args.output_dir,
|
||||
gradient_accumulation_steps=args.gradient_accumulation_steps,
|
||||
)
|
||||
if args.with_tracking
|
||||
else Accelerator(gradient_accumulation_steps=args.gradient_accumulation_steps)
|
||||
)
|
||||
accelerator_kwargs = {"gradient_accumulation_steps": args.gradient_accumulation_steps}
|
||||
if args.with_tracking:
|
||||
accelerator_kwargs["log_with"] = args.report_to
|
||||
accelerator_kwargs["project_dir"] = args.output_dir
|
||||
accelerator = Accelerator(**accelerator_kwargs)
|
||||
|
||||
# Make one log on every process with the configuration for debugging.
|
||||
logging.basicConfig(
|
||||
|
||||
@ -1,11 +1,11 @@
|
||||
import argparse
|
||||
import os
|
||||
import re
|
||||
from typing import Callable, List, Optional, Union
|
||||
from collections import Counter
|
||||
from dataclasses import dataclass
|
||||
from typing import Dict, Optional
|
||||
|
||||
import safetensors
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
from diffusers import UNet2DConditionModel
|
||||
from transformers import CLIPTextModel
|
||||
|
||||
@ -21,44 +21,66 @@ LORA_PREFIX_UNET = "lora_unet"
|
||||
LORA_PREFIX_TEXT_ENCODER = "lora_te"
|
||||
|
||||
|
||||
def get_modules_names(
|
||||
root_module: nn.Module,
|
||||
target_replace_modules_linear: Optional[List[str]] = [],
|
||||
target_replace_modules_conv2d: Optional[List[str]] = [],
|
||||
):
|
||||
# Combine replacement modules
|
||||
target_replace_modules = target_replace_modules_linear + target_replace_modules_conv2d
|
||||
@dataclass
|
||||
class LoRAInfo:
|
||||
kohya_key: str
|
||||
peft_key: str
|
||||
alpha: Optional[float] = None
|
||||
rank: Optional[int] = None
|
||||
lora_A: Optional[torch.Tensor] = None
|
||||
lora_B: Optional[torch.Tensor] = None
|
||||
|
||||
# Store result
|
||||
modules_names = set()
|
||||
# https://github.com/kohya-ss/sd-scripts/blob/c924c47f374ac1b6e33e71f82948eb1853e2243f/networks/lora.py#L720
|
||||
for name, module in root_module.named_modules():
|
||||
if module.__class__.__name__ in target_replace_modules:
|
||||
if len(name) == 0:
|
||||
continue
|
||||
for child_name, child_module in module.named_modules():
|
||||
if len(child_name) == 0:
|
||||
continue
|
||||
is_linear = child_module.__class__.__name__ == "Linear"
|
||||
is_conv2d = child_module.__class__.__name__ == "Conv2d"
|
||||
|
||||
if (is_linear and module.__class__.__name__ in target_replace_modules_linear) or (
|
||||
is_conv2d and module.__class__.__name__ in target_replace_modules_conv2d
|
||||
):
|
||||
modules_names.add(f"{name}.{child_name}")
|
||||
|
||||
return sorted(modules_names)
|
||||
def peft_state_dict(self) -> Dict[str, torch.Tensor]:
|
||||
if self.lora_A is None or self.lora_B is None:
|
||||
raise ValueError("At least one of lora_A or lora_B is None, they must both be provided")
|
||||
return {f"{peft_key}.lora_A.weight": self.lora_A, f"{peft_key}.lora_B.weight": self.lora_A}
|
||||
|
||||
|
||||
def get_rank_alpha(
|
||||
layer_names: List[str],
|
||||
value_getter: Callable[[str], Union[int, float]],
|
||||
filter_string: str,
|
||||
) -> Union[int, float]:
|
||||
values = [value_getter(p) for p in filter(lambda x: bool(re.search(filter_string, x)), layer_names)]
|
||||
value = values[0]
|
||||
assert all(v == value for v in values), f"All LoRA ranks and alphas must be same, found: {values}"
|
||||
return value
|
||||
def construct_peft_loraconfig(info: Dict[str, LoRAInfo]) -> LoraConfig:
|
||||
"""Constructs LoraConfig from data extracted from kohya checkpoint
|
||||
|
||||
Args:
|
||||
info (Dict[str, LoRAInfo]): Information extracted from kohya checkpoint
|
||||
|
||||
Returns:
|
||||
LoraConfig: config for constructing LoRA
|
||||
"""
|
||||
|
||||
# Unpack all ranks and alphas
|
||||
ranks = {x[0]: x[1].rank for x in info.items()}
|
||||
alphas = {x[0]: x[1].alpha or x[1].rank for x in info.items()}
|
||||
|
||||
# Determine which modules needs to be transformed
|
||||
target_modules = list(info.keys())
|
||||
|
||||
# Determine most common rank and alpha
|
||||
r = Counter(ranks.values()).most_common(1)[0]
|
||||
lora_alpha = Counter(alphas.values()).most_common(1)[0]
|
||||
|
||||
# Determine which modules have different rank and alpha
|
||||
rank_pattern = dict(filter(lambda x: x[1] != r, ranks.items()))
|
||||
alpha_pattern = dict(filter(lambda x: x[1] != lora_alpha, alphas.items()))
|
||||
|
||||
config = LoraConfig(
|
||||
r=r,
|
||||
lora_alpha=lora_alpha,
|
||||
target_modules=target_modules,
|
||||
lora_dropout=0.0,
|
||||
bias="none",
|
||||
init_lora_weights=False,
|
||||
rank_pattern=rank_pattern,
|
||||
alpha_pattern=alpha_pattern,
|
||||
)
|
||||
|
||||
return config
|
||||
|
||||
|
||||
def combine_peft_state_dict(info: Dict[str, LoRAInfo]) -> Dict[str, torch.Tensor]:
|
||||
result = {}
|
||||
for key_name, key_info in info.items():
|
||||
result[f"base_model.model.{key_name}.lora_A.weight"] = key_info.lora_A
|
||||
result[f"base_model.model.{key_name}.lora_B.weight"] = key_info.lora_B
|
||||
return result
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
@ -75,93 +97,79 @@ if __name__ == "__main__":
|
||||
parser.add_argument("--half", action="store_true", help="Save weights in half precision.")
|
||||
args = parser.parse_args()
|
||||
|
||||
# Find text encoder modules to add LoRA to
|
||||
# Load all models that we need to add adapter to
|
||||
text_encoder = CLIPTextModel.from_pretrained(args.sd_checkpoint, subfolder="text_encoder")
|
||||
text_encoder_modules_names = get_modules_names(
|
||||
text_encoder, target_replace_modules_linear=TEXT_ENCODER_TARGET_REPLACE_MODULE
|
||||
)
|
||||
|
||||
# Find unet2d modules to add LoRA to
|
||||
unet = UNet2DConditionModel.from_pretrained(args.sd_checkpoint, subfolder="unet")
|
||||
unet_modules_names = get_modules_names(
|
||||
unet,
|
||||
target_replace_modules_linear=UNET_TARGET_REPLACE_MODULE,
|
||||
target_replace_modules_conv2d=UNET_TARGET_REPLACE_MODULE,
|
||||
)
|
||||
|
||||
# Construct possible mapping from kohya keys to peft keys
|
||||
models_keys = {}
|
||||
for model, model_key, model_name in [
|
||||
(text_encoder, LORA_PREFIX_TEXT_ENCODER, "text_encoder"),
|
||||
(unet, LORA_PREFIX_UNET, "unet"),
|
||||
]:
|
||||
models_keys.update(
|
||||
{
|
||||
f"{model_key}.{peft_key}".replace(".", "_"): peft_key
|
||||
for peft_key in (x[0] for x in model.named_modules())
|
||||
}
|
||||
)
|
||||
|
||||
# Store conversion info (model_type -> peft_key -> LoRAInfo)
|
||||
lora_info: Dict[str, Dict[str, LoRAInfo]] = {
|
||||
"text_encoder": {},
|
||||
"unet": {},
|
||||
}
|
||||
|
||||
# Open kohya_ss checkpoint
|
||||
with safetensors.safe_open(args.kohya_lora_path, framework="pt", device="cpu") as f:
|
||||
# Extract information about LoRA structure
|
||||
metadata = f.metadata()
|
||||
if (metadata is not None) and ("ss_network_dim" in metadata) and ("ss_network_alpha" in metadata):
|
||||
# LoRA rank and alpha are in safetensors metadata, just get it
|
||||
lora_r = lora_text_encoder_r = int(metadata["ss_network_dim"])
|
||||
lora_alpha = lora_text_encoder_alpha = float(metadata["ss_network_alpha"])
|
||||
else:
|
||||
# LoRA rank and alpha are not present, so infer them
|
||||
lora_r = get_rank_alpha(
|
||||
f.keys(), lambda n: f.get_tensor(n).size(0), f"^{LORA_PREFIX_UNET}\w+\.lora_down\.weight$"
|
||||
)
|
||||
lora_text_encoder_r = get_rank_alpha(
|
||||
f.keys(), lambda n: f.get_tensor(n).size(0), f"^{LORA_PREFIX_TEXT_ENCODER}\w+\.lora_down\.weight$"
|
||||
)
|
||||
lora_alpha = get_rank_alpha(f.keys(), lambda n: f.get_tensor(n).item(), f"^{LORA_PREFIX_UNET}\w+\.alpha$")
|
||||
lora_text_encoder_alpha = get_rank_alpha(
|
||||
f.keys(), lambda n: f.get_tensor(n).item(), f"^{LORA_PREFIX_TEXT_ENCODER}\w+\.alpha$"
|
||||
)
|
||||
|
||||
# Create LoRA for text encoder
|
||||
text_encoder_config = LoraConfig(
|
||||
r=lora_text_encoder_r,
|
||||
lora_alpha=lora_text_encoder_alpha,
|
||||
target_modules=text_encoder_modules_names,
|
||||
lora_dropout=0.0,
|
||||
bias="none",
|
||||
)
|
||||
text_encoder = get_peft_model(text_encoder, text_encoder_config)
|
||||
text_encoder_lora_state_dict = {x: None for x in get_peft_model_state_dict(text_encoder).keys()}
|
||||
# Iterate through available info and unpack all the values
|
||||
for key in f.keys():
|
||||
kohya_key, kohya_type = key.split(".")[:2]
|
||||
|
||||
# Load text encoder values from kohya_ss LoRA
|
||||
for peft_te_key in text_encoder_lora_state_dict.keys():
|
||||
kohya_ss_te_key = peft_te_key.replace("base_model.model", LORA_PREFIX_TEXT_ENCODER)
|
||||
kohya_ss_te_key = kohya_ss_te_key.replace("lora_A", "lora_down")
|
||||
kohya_ss_te_key = kohya_ss_te_key.replace("lora_B", "lora_up")
|
||||
kohya_ss_te_key = kohya_ss_te_key.replace(".", "_", kohya_ss_te_key.count(".") - 2)
|
||||
text_encoder_lora_state_dict[peft_te_key] = f.get_tensor(kohya_ss_te_key).to(text_encoder.dtype)
|
||||
# Find which model this key belongs to
|
||||
if kohya_key.startswith(LORA_PREFIX_TEXT_ENCODER):
|
||||
model_type = "text_encoder"
|
||||
elif kohya_key.startswith(LORA_PREFIX_UNET):
|
||||
model_type = "unet"
|
||||
else:
|
||||
raise ValueError(f"Cannot determine model for key: {key}")
|
||||
|
||||
# Load converted kohya_ss text encoder LoRA back to PEFT
|
||||
set_peft_model_state_dict(text_encoder, text_encoder_lora_state_dict)
|
||||
# Find corresponding peft key
|
||||
if kohya_key not in models_keys:
|
||||
raise ValueError(f"Cannot find corresponding key for diffusers/transformers model: {kohya_key}")
|
||||
peft_key = models_keys[kohya_key]
|
||||
|
||||
if peft_key not in lora_info[model_type]:
|
||||
lora_info[model_type][peft_key] = LoRAInfo(kohya_key=kohya_key, peft_key=peft_key)
|
||||
|
||||
if kohya_type == "alpha":
|
||||
lora_info[model_type][peft_key].alpha = f.get_tensor(key).item()
|
||||
elif kohya_type == "lora_down":
|
||||
tensor = f.get_tensor(key)
|
||||
lora_info[model_type][peft_key].lora_A = tensor
|
||||
lora_info[model_type][peft_key].rank = tensor.shape[0]
|
||||
elif kohya_type == "lora_up":
|
||||
tensor = f.get_tensor(key)
|
||||
lora_info[model_type][peft_key].lora_B = f.get_tensor(key)
|
||||
lora_info[model_type][peft_key].rank = tensor.shape[1]
|
||||
else:
|
||||
raise ValueError(f"Unknown weight name in key: {key} - {kohya_type}")
|
||||
|
||||
# Process each model
|
||||
for model, model_name in [(text_encoder, "text_encoder"), (unet, "unet")]:
|
||||
config = construct_peft_loraconfig(lora_info[model_name])
|
||||
model = get_peft_model(model, config)
|
||||
|
||||
keys_peft = list(get_peft_model_state_dict(model).keys())
|
||||
keys_new = list(combine_peft_state_dict(lora_info[model_name]).keys())
|
||||
|
||||
set_peft_model_state_dict(model, combine_peft_state_dict(lora_info[model_name]))
|
||||
|
||||
if args.half:
|
||||
text_encoder.to(torch.float16)
|
||||
model.to(torch.float16)
|
||||
|
||||
# Save text encoder result
|
||||
text_encoder.save_pretrained(
|
||||
os.path.join(args.dump_path, "text_encoder"),
|
||||
)
|
||||
|
||||
# Create LoRA for unet2d
|
||||
unet_config = LoraConfig(
|
||||
r=lora_r, lora_alpha=lora_alpha, target_modules=unet_modules_names, lora_dropout=0.0, bias="none"
|
||||
)
|
||||
unet = get_peft_model(unet, unet_config)
|
||||
unet_lora_state_dict = {x: None for x in get_peft_model_state_dict(unet).keys()}
|
||||
|
||||
# Load unet2d values from kohya_ss LoRA
|
||||
for peft_unet_key in unet_lora_state_dict.keys():
|
||||
kohya_ss_unet_key = peft_unet_key.replace("base_model.model", LORA_PREFIX_UNET)
|
||||
kohya_ss_unet_key = kohya_ss_unet_key.replace("lora_A", "lora_down")
|
||||
kohya_ss_unet_key = kohya_ss_unet_key.replace("lora_B", "lora_up")
|
||||
kohya_ss_unet_key = kohya_ss_unet_key.replace(".", "_", kohya_ss_unet_key.count(".") - 2)
|
||||
unet_lora_state_dict[peft_unet_key] = f.get_tensor(kohya_ss_unet_key).to(unet.dtype)
|
||||
|
||||
# Load converted kohya_ss unet LoRA back to PEFT
|
||||
set_peft_model_state_dict(unet, unet_lora_state_dict)
|
||||
|
||||
if args.half:
|
||||
unet.to(torch.float16)
|
||||
|
||||
# Save text encoder result
|
||||
unet.save_pretrained(
|
||||
os.path.join(args.dump_path, "unet"),
|
||||
)
|
||||
# Save model to disk
|
||||
model.save_pretrained(os.path.join(args.dump_path, model_name))
|
||||
|
||||
@ -213,6 +213,10 @@ def parse_args(input_args=None):
|
||||
help="Bias type for Lora. Can be 'none', 'all' or 'lora_only', only used if use_lora and `train_text_encoder` are True",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--num_dataloader_workers", type=int, default=1, help="Num of workers for the training dataloader."
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--train_batch_size", type=int, default=4, help="Batch size (per device) for the training dataloader."
|
||||
)
|
||||
@ -799,7 +803,7 @@ def main(args):
|
||||
batch_size=args.train_batch_size,
|
||||
shuffle=True,
|
||||
collate_fn=lambda examples: collate_fn(examples, args.with_prior_preservation),
|
||||
num_workers=1,
|
||||
num_workers=args.num_dataloader_workers,
|
||||
)
|
||||
|
||||
# Scheduler and math around the number of training steps.
|
||||
|
||||
@ -200,7 +200,7 @@
|
||||
"from transformers import AutoImageProcessor\n",
|
||||
"\n",
|
||||
"checkpoint = \"nvidia/mit-b0\"\n",
|
||||
"image_processor = AutoImageProcessor.from_pretrained(checkpoint, reduce_labels=True)"
|
||||
"image_processor = AutoImageProcessor.from_pretrained(checkpoint, do_reduce_labels=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
@ -309,7 +309,7 @@
|
||||
" references=labels,\n",
|
||||
" num_labels=len(id2label),\n",
|
||||
" ignore_index=0,\n",
|
||||
" reduce_labels=image_processor.reduce_labels,\n",
|
||||
" reduce_labels=image_processor.do_reduce_labels,\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
" # add per category metrics as individual key-value pairs\n",
|
||||
|
||||
506
examples/stable_diffusion/convert_sd_adapter_to_peft.py
Normal file
@ -0,0 +1,506 @@
|
||||
import argparse
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
from collections import Counter
|
||||
from dataclasses import dataclass
|
||||
from operator import attrgetter
|
||||
from typing import Dict, List, Optional, Union
|
||||
|
||||
import safetensors
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
from diffusers import UNet2DConditionModel
|
||||
from transformers import CLIPTextModel
|
||||
|
||||
from peft import LoHaConfig, LoKrConfig, LoraConfig, PeftType, get_peft_model, set_peft_model_state_dict
|
||||
from peft.tuners.lokr.layer import factorization
|
||||
|
||||
|
||||
# Default kohya_ss LoRA replacement modules
|
||||
# https://github.com/kohya-ss/sd-scripts/blob/c924c47f374ac1b6e33e71f82948eb1853e2243f/networks/lora.py#L661
|
||||
UNET_TARGET_REPLACE_MODULE = ["Transformer2DModel", "Attention"]
|
||||
UNET_TARGET_REPLACE_MODULE_CONV2D_3X3 = ["ResnetBlock2D", "Downsample2D", "Upsample2D"]
|
||||
TEXT_ENCODER_TARGET_REPLACE_MODULE = ["CLIPAttention", "CLIPMLP"]
|
||||
PREFIX_UNET = "lora_unet"
|
||||
PREFIX_TEXT_ENCODER = "lora_te"
|
||||
|
||||
|
||||
@dataclass
|
||||
class LoRAInfo:
|
||||
kohya_key: str
|
||||
peft_key: str
|
||||
alpha: Optional[float] = None
|
||||
rank: Optional[int] = None
|
||||
lora_A: Optional[torch.Tensor] = None
|
||||
lora_B: Optional[torch.Tensor] = None
|
||||
|
||||
def peft_state_dict(self) -> Dict[str, torch.Tensor]:
|
||||
if self.lora_A is None or self.lora_B is None:
|
||||
raise ValueError("At least one of lora_A or lora_B is None, they must both be provided")
|
||||
return {
|
||||
f"base_model.model{self.peft_key}.lora_A.weight": self.lora_A,
|
||||
f"base_model.model.{self.peft_key}.lora_B.weight": self.lora_B,
|
||||
}
|
||||
|
||||
|
||||
@dataclass
|
||||
class LoHaInfo:
|
||||
kohya_key: str
|
||||
peft_key: str
|
||||
alpha: Optional[float] = None
|
||||
rank: Optional[int] = None
|
||||
hada_w1_a: Optional[torch.Tensor] = None
|
||||
hada_w1_b: Optional[torch.Tensor] = None
|
||||
hada_w2_a: Optional[torch.Tensor] = None
|
||||
hada_w2_b: Optional[torch.Tensor] = None
|
||||
hada_t1: Optional[torch.Tensor] = None
|
||||
hada_t2: Optional[torch.Tensor] = None
|
||||
|
||||
def peft_state_dict(self) -> Dict[str, torch.Tensor]:
|
||||
if self.hada_w1_a is None or self.hada_w1_b is None or self.hada_w2_a is None or self.hada_w2_b is None:
|
||||
raise ValueError(
|
||||
"At least one of hada_w1_a, hada_w1_b, hada_w2_a, hada_w2_b is missing, they all must be provided"
|
||||
)
|
||||
state_dict = {
|
||||
f"base_model.model.{self.peft_key}.hada_w1_a": self.hada_w1_a,
|
||||
f"base_model.model.{self.peft_key}.hada_w1_b": self.hada_w1_b,
|
||||
f"base_model.model.{self.peft_key}.hada_w2_a": self.hada_w2_a,
|
||||
f"base_model.model.{self.peft_key}.hada_w2_b": self.hada_w2_b,
|
||||
}
|
||||
if not (
|
||||
(self.hada_t1 is None and self.hada_t2 is None) or (self.hada_t1 is not None and self.hada_t2 is not None)
|
||||
):
|
||||
raise ValueError("hada_t1 and hada_t2 must be either both present or not present at the same time")
|
||||
if self.hada_t1 is not None and self.hada_t2 is not None:
|
||||
state_dict[f"base_model.model.{self.peft_key}.hada_t1"] = self.hada_t1
|
||||
state_dict[f"base_model.model.{self.peft_key}.hada_t2"] = self.hada_t2
|
||||
return state_dict
|
||||
|
||||
|
||||
@dataclass
|
||||
class LoKrInfo:
|
||||
kohya_key: str
|
||||
peft_key: str
|
||||
alpha: Optional[float] = None
|
||||
rank: Optional[int] = None
|
||||
lokr_w1: Optional[torch.Tensor] = None
|
||||
lokr_w1_a: Optional[torch.Tensor] = None
|
||||
lokr_w1_b: Optional[torch.Tensor] = None
|
||||
lokr_w2: Optional[torch.Tensor] = None
|
||||
lokr_w2_a: Optional[torch.Tensor] = None
|
||||
lokr_w2_b: Optional[torch.Tensor] = None
|
||||
lokr_t2: Optional[torch.Tensor] = None
|
||||
|
||||
def peft_state_dict(self) -> Dict[str, torch.Tensor]:
|
||||
if (self.lokr_w1 is None) and ((self.lokr_w1_a is None) or (self.lokr_w1_b is None)):
|
||||
raise ValueError("Either lokr_w1 or both lokr_w1_a and lokr_w1_b should be provided")
|
||||
|
||||
if (self.lokr_w2 is None) and ((self.lokr_w2_a is None) or (self.lokr_w2_b is None)):
|
||||
raise ValueError("Either lokr_w2 or both lokr_w2_a and lokr_w2_b should be provided")
|
||||
|
||||
state_dict = {}
|
||||
|
||||
if self.lokr_w1 is not None:
|
||||
state_dict[f"base_model.model.{self.peft_key}.lokr_w1"] = self.lokr_w1
|
||||
elif self.lokr_w1_a is not None:
|
||||
state_dict[f"base_model.model.{self.peft_key}.lokr_w1_a"] = self.lokr_w1_a
|
||||
state_dict[f"base_model.model.{self.peft_key}.lokr_w1_b"] = self.lokr_w1_b
|
||||
|
||||
if self.lokr_w2 is not None:
|
||||
state_dict[f"base_model.model.{self.peft_key}.lokr_w2"] = self.lokr_w2
|
||||
elif self.lokr_w2_a is not None:
|
||||
state_dict[f"base_model.model.{self.peft_key}.lokr_w2_a"] = self.lokr_w2_a
|
||||
state_dict[f"base_model.model.{self.peft_key}.lokr_w2_b"] = self.lokr_w2_b
|
||||
|
||||
if self.lokr_t2 is not None:
|
||||
state_dict[f"base_model.model.{self.peft_key}.lokr_t2"] = self.lokr_t2
|
||||
|
||||
return state_dict
|
||||
|
||||
|
||||
def construct_peft_loraconfig(info: Dict[str, LoRAInfo], **kwargs) -> LoraConfig:
|
||||
"""Constructs LoraConfig from data extracted from adapter checkpoint
|
||||
|
||||
Args:
|
||||
info (Dict[str, LoRAInfo]): Information extracted from adapter checkpoint
|
||||
|
||||
Returns:
|
||||
LoraConfig: config for constructing LoRA
|
||||
"""
|
||||
|
||||
# Unpack all ranks and alphas
|
||||
ranks = {key: val.rank for key, val in info.items()}
|
||||
alphas = {x[0]: x[1].alpha or x[1].rank for x in info.items()}
|
||||
|
||||
# Determine which modules needs to be transformed
|
||||
target_modules = sorted(info.keys())
|
||||
|
||||
# Determine most common rank and alpha
|
||||
r = int(Counter(ranks.values()).most_common(1)[0][0])
|
||||
lora_alpha = Counter(alphas.values()).most_common(1)[0][0]
|
||||
|
||||
# Determine which modules have different rank and alpha
|
||||
rank_pattern = dict(sorted(filter(lambda x: x[1] != r, ranks.items()), key=lambda x: x[0]))
|
||||
alpha_pattern = dict(sorted(filter(lambda x: x[1] != lora_alpha, alphas.items()), key=lambda x: x[0]))
|
||||
|
||||
config = LoraConfig(
|
||||
r=r,
|
||||
lora_alpha=lora_alpha,
|
||||
target_modules=target_modules,
|
||||
lora_dropout=0.0,
|
||||
bias="none",
|
||||
init_lora_weights=False,
|
||||
rank_pattern=rank_pattern,
|
||||
alpha_pattern=alpha_pattern,
|
||||
)
|
||||
|
||||
return config
|
||||
|
||||
|
||||
def construct_peft_lohaconfig(info: Dict[str, LoHaInfo], **kwargs) -> LoHaConfig:
|
||||
"""Constructs LoHaConfig from data extracted from adapter checkpoint
|
||||
|
||||
Args:
|
||||
info (Dict[str, LoHaInfo]): Information extracted from adapter checkpoint
|
||||
|
||||
Returns:
|
||||
LoHaConfig: config for constructing LoHA
|
||||
"""
|
||||
|
||||
# Unpack all ranks and alphas
|
||||
ranks = {x[0]: x[1].rank for x in info.items()}
|
||||
alphas = {x[0]: x[1].alpha or x[1].rank for x in info.items()}
|
||||
|
||||
# Determine which modules needs to be transformed
|
||||
target_modules = sorted(info.keys())
|
||||
|
||||
# Determine most common rank and alpha
|
||||
r = int(Counter(ranks.values()).most_common(1)[0][0])
|
||||
alpha = Counter(alphas.values()).most_common(1)[0][0]
|
||||
|
||||
# Determine which modules have different rank and alpha
|
||||
rank_pattern = dict(sorted(filter(lambda x: x[1] != r, ranks.items()), key=lambda x: x[0]))
|
||||
alpha_pattern = dict(sorted(filter(lambda x: x[1] != alpha, alphas.items()), key=lambda x: x[0]))
|
||||
|
||||
# Determine whether any of modules have effective conv2d decomposition
|
||||
use_effective_conv2d = any(((val.hada_t1 is not None) or (val.hada_t2 is not None) for val in info.values()))
|
||||
|
||||
config = LoHaConfig(
|
||||
r=r,
|
||||
alpha=alpha,
|
||||
target_modules=target_modules,
|
||||
rank_dropout=0.0,
|
||||
module_dropout=0.0,
|
||||
init_weights=False,
|
||||
rank_pattern=rank_pattern,
|
||||
alpha_pattern=alpha_pattern,
|
||||
use_effective_conv2d=use_effective_conv2d,
|
||||
)
|
||||
|
||||
return config
|
||||
|
||||
|
||||
def construct_peft_lokrconfig(info: Dict[str, LoKrInfo], decompose_factor: int = -1, **kwargs) -> LoKrConfig:
|
||||
"""Constructs LoKrConfig from data extracted from adapter checkpoint
|
||||
|
||||
Args:
|
||||
info (Dict[str, LoKrInfo]): Information extracted from adapter checkpoint
|
||||
|
||||
Returns:
|
||||
LoKrConfig: config for constructing LoKr
|
||||
"""
|
||||
|
||||
# Unpack all ranks and alphas
|
||||
ranks = {x[0]: x[1].rank for x in info.items()}
|
||||
alphas = {x[0]: x[1].alpha or x[1].rank for x in info.items()}
|
||||
|
||||
# Determine which modules needs to be transformed
|
||||
target_modules = sorted(info.keys())
|
||||
|
||||
# Determine most common rank and alpha
|
||||
r = int(Counter(ranks.values()).most_common(1)[0][0])
|
||||
alpha = Counter(alphas.values()).most_common(1)[0][0]
|
||||
|
||||
# Determine which modules have different rank and alpha
|
||||
rank_pattern = dict(sorted(filter(lambda x: x[1] != r, ranks.items()), key=lambda x: x[0]))
|
||||
alpha_pattern = dict(sorted(filter(lambda x: x[1] != alpha, alphas.items()), key=lambda x: x[0]))
|
||||
|
||||
# Determine whether any of modules have effective conv2d decomposition
|
||||
use_effective_conv2d = any(((val.lokr_t2 is not None) for val in info.values()))
|
||||
|
||||
# decompose_both should be enabled if any w1 matrix in any layer is decomposed into 2
|
||||
decompose_both = any((val.lokr_w1_a is not None and val.lokr_w1_b is not None) for val in info.values())
|
||||
|
||||
# Determining decompose factor is a bit tricky (but it is most often -1)
|
||||
# Check that decompose_factor is equal to provided
|
||||
for val in info.values():
|
||||
# Determine shape of first matrix
|
||||
if val.lokr_w1 is not None:
|
||||
w1_shape = tuple(val.lokr_w1.shape)
|
||||
else:
|
||||
w1_shape = (val.lokr_w1_a.shape[0], val.lokr_w1_b.shape[1])
|
||||
|
||||
# Determine shape of second matrix
|
||||
if val.lokr_w2 is not None:
|
||||
w2_shape = tuple(val.lokr_w2.shape[:2])
|
||||
elif val.lokr_t2 is not None:
|
||||
w2_shape = (val.lokr_w2_a.shape[1], val.lokr_w2_b.shape[1])
|
||||
else:
|
||||
# We may iterate over Conv2d layer, for which second item in shape is multiplied by ksize^2
|
||||
w2_shape = (val.lokr_w2_a.shape[0], val.lokr_w2_b.shape[1])
|
||||
|
||||
# We need to check, whether decompose_factor is really -1 or not
|
||||
shape = (w1_shape[0], w2_shape[0])
|
||||
if factorization(shape[0] * shape[1], factor=-1) != shape:
|
||||
raise ValueError("Cannot infer decompose_factor, probably it is not equal to -1")
|
||||
|
||||
config = LoKrConfig(
|
||||
r=r,
|
||||
alpha=alpha,
|
||||
target_modules=target_modules,
|
||||
rank_dropout=0.0,
|
||||
module_dropout=0.0,
|
||||
init_weights=False,
|
||||
rank_pattern=rank_pattern,
|
||||
alpha_pattern=alpha_pattern,
|
||||
use_effective_conv2d=use_effective_conv2d,
|
||||
decompose_both=decompose_both,
|
||||
decompose_factor=decompose_factor,
|
||||
)
|
||||
|
||||
return config
|
||||
|
||||
|
||||
def combine_peft_state_dict(info: Dict[str, Union[LoRAInfo, LoHaInfo]]) -> Dict[str, torch.Tensor]:
|
||||
result = {}
|
||||
for key_info in info.values():
|
||||
result.update(key_info.peft_state_dict())
|
||||
return result
|
||||
|
||||
|
||||
def detect_adapter_type(keys: List[str]) -> PeftType:
|
||||
# Detect type of adapter by keys
|
||||
# Inspired by this:
|
||||
# https://github.com/bmaltais/kohya_ss/blob/ed4e3b0239a40506de9a17e550e6cf2d0b867a4f/tools/lycoris_utils.py#L312
|
||||
for key in keys:
|
||||
if "alpha" in key:
|
||||
continue
|
||||
elif any(x in key for x in ["lora_down", "lora_up"]):
|
||||
# LoRA
|
||||
return PeftType.LORA
|
||||
elif any(x in key for x in ["hada_w1", "hada_w2", "hada_t1", "hada_t2"]):
|
||||
# LoHa may have the following keys:
|
||||
# hada_w1_a, hada_w1_b, hada_w2_a, hada_w2_b, hada_t1, hada_t2
|
||||
return PeftType.LOHA
|
||||
elif any(x in key for x in ["lokr_w1", "lokr_w2", "lokr_t1", "lokr_t2"]):
|
||||
# LoKr may have the following keys:
|
||||
# lokr_w1, lokr_w2, lokr_w1_a, lokr_w1_b, lokr_w2_a, lokr_w2_b, lokr_t1, lokr_t2
|
||||
return PeftType.LOKR
|
||||
elif "diff" in key:
|
||||
raise ValueError("Currently full diff adapters are not implemented")
|
||||
else:
|
||||
raise ValueError("Unkown adapter type, probably not implemented")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser()
|
||||
|
||||
parser.add_argument("--sd_checkpoint", default=None, type=str, required=True, help="SD checkpoint to use")
|
||||
|
||||
parser.add_argument(
|
||||
"--adapter_path",
|
||||
default=None,
|
||||
type=str,
|
||||
required=True,
|
||||
help="Path to downloaded adapter to convert",
|
||||
)
|
||||
|
||||
parser.add_argument("--dump_path", default=None, type=str, required=True, help="Path to the output peft adapter.")
|
||||
|
||||
parser.add_argument("--half", action="store_true", help="Save weights in half precision.")
|
||||
parser.add_argument(
|
||||
"--loha_conv2d_weights_fix",
|
||||
action="store_true",
|
||||
help="""LoHa checkpoints trained with lycoris-lora<=1.9.0 contain a bug described in this PR https://github.com/KohakuBlueleaf/LyCORIS/pull/115.
|
||||
This option fixes this bug during weight conversion (replaces hada_t2 with hada_t1 for Conv2d 3x3 layers).
|
||||
The output results may differ from webui, but in general, they should be better in terms of quality.
|
||||
This option should be set to True in case the provided checkpoint has been trained with lycoris-lora version for which the mentioned PR wasn't merged.
|
||||
This option should be set to False in case the provided checkpoint has been trained with lycoris-lora version for which the mentioned PR is merged or full compatibility with webui outputs is required.""",
|
||||
)
|
||||
args = parser.parse_args()
|
||||
|
||||
# Load all models that we need to add adapter to
|
||||
text_encoder = CLIPTextModel.from_pretrained(args.sd_checkpoint, subfolder="text_encoder")
|
||||
unet = UNet2DConditionModel.from_pretrained(args.sd_checkpoint, subfolder="unet")
|
||||
|
||||
# Construct possible mapping from kohya keys to peft keys
|
||||
models_keys = {}
|
||||
for model, model_key, model_name in [
|
||||
(text_encoder, PREFIX_TEXT_ENCODER, "text_encoder"),
|
||||
(unet, PREFIX_UNET, "unet"),
|
||||
]:
|
||||
models_keys.update(
|
||||
{
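                # kohya_ss stores module names with "_" instead of ".", so build a reverse mapping
                # from the mangled kohya-style names back to the peft/transformers module names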
|
||||
f"{model_key}.{peft_key}".replace(".", "_"): peft_key
|
||||
for peft_key in (x[0] for x in model.named_modules())
|
||||
}
|
||||
)
|
||||
|
||||
# Store conversion info (model_type -> peft_key -> LoRAInfo | LoHaInfo | LoKrInfo)
|
||||
adapter_info: Dict[str, Dict[str, Union[LoRAInfo, LoHaInfo, LoKrInfo]]] = {
|
||||
"text_encoder": {},
|
||||
"unet": {},
|
||||
}
|
||||
|
||||
# Store decompose_factor for LoKr
|
||||
decompose_factor = -1
|
||||
|
||||
# Open adapter checkpoint
|
||||
with safetensors.safe_open(args.adapter_path, framework="pt", device="cpu") as f:
|
||||
# Extract information about adapter structure
|
||||
metadata = f.metadata()
|
||||
|
||||
# It may be difficult to determine rank for LoKr adapters
|
||||
# If checkpoint was trained with large rank it may not be utilized during weights creation at all
|
||||
# So we need to get it from checkpoint metadata (along with decompose_factor)
|
||||
rank, conv_rank = None, None
|
||||
if metadata is not None:
|
||||
rank = metadata.get("ss_network_dim", None)
|
||||
rank = int(rank) if rank else None
|
||||
if "ss_network_args" in metadata:
|
||||
network_args = json.loads(metadata["ss_network_args"])
|
||||
conv_rank = network_args.get("conv_dim", None)
|
||||
conv_rank = int(conv_rank) if conv_rank else rank
|
||||
decompose_factor = network_args.get("factor", -1)
|
||||
decompose_factor = int(decompose_factor)
|
||||
|
||||
# Detect adapter type based on keys
|
||||
adapter_type = detect_adapter_type(f.keys())
|
||||
adapter_info_cls = {
|
||||
PeftType.LORA: LoRAInfo,
|
||||
PeftType.LOHA: LoHaInfo,
|
||||
PeftType.LOKR: LoKrInfo,
|
||||
}[adapter_type]
|
||||
|
||||
# Iterate through available info and unpack all the values
|
||||
for key in f.keys():
|
||||
kohya_key, kohya_type = key.split(".")[:2]
|
||||
|
||||
# Find which model this key belongs to
|
||||
if kohya_key.startswith(PREFIX_TEXT_ENCODER):
|
||||
model_type, model = "text_encoder", text_encoder
|
||||
elif kohya_key.startswith(PREFIX_UNET):
|
||||
model_type, model = "unet", unet
|
||||
else:
|
||||
raise ValueError(f"Cannot determine model for key: {key}")
|
||||
|
||||
# Find corresponding peft key
|
||||
if kohya_key not in models_keys:
|
||||
raise ValueError(f"Cannot find corresponding key for diffusers/transformers model: {kohya_key}")
|
||||
peft_key = models_keys[kohya_key]
|
||||
|
||||
# Retrieve corresponding layer of model
|
||||
layer = attrgetter(peft_key)(model)
|
||||
|
||||
# Create a corresponding adapter info
|
||||
if peft_key not in adapter_info[model_type]:
|
||||
adapter_info[model_type][peft_key] = adapter_info_cls(kohya_key=kohya_key, peft_key=peft_key)
|
||||
|
||||
tensor = f.get_tensor(key)
|
||||
if kohya_type == "alpha":
|
||||
adapter_info[model_type][peft_key].alpha = tensor.item()
|
||||
elif kohya_type == "lora_down":
|
||||
adapter_info[model_type][peft_key].lora_A = tensor
|
||||
adapter_info[model_type][peft_key].rank = tensor.shape[0]
|
||||
elif kohya_type == "lora_up":
|
||||
adapter_info[model_type][peft_key].lora_B = tensor
|
||||
adapter_info[model_type][peft_key].rank = tensor.shape[1]
|
||||
elif kohya_type == "hada_w1_a":
|
||||
adapter_info[model_type][peft_key].hada_w1_a = tensor
|
||||
elif kohya_type == "hada_w1_b":
|
||||
adapter_info[model_type][peft_key].hada_w1_b = tensor
|
||||
adapter_info[model_type][peft_key].rank = tensor.shape[0]
|
||||
elif kohya_type == "hada_w2_a":
|
||||
adapter_info[model_type][peft_key].hada_w2_a = tensor
|
||||
elif kohya_type == "hada_w2_b":
|
||||
adapter_info[model_type][peft_key].hada_w2_b = tensor
|
||||
adapter_info[model_type][peft_key].rank = tensor.shape[0]
|
||||
elif kohya_type in {"hada_t1", "hada_t2"}:
|
||||
if args.loha_conv2d_weights_fix:
|
||||
if kohya_type == "hada_t1":
|
||||
# This code block fixes a bug that exists for some LoHa checkpoints
|
||||
# that resulted in accidentally using hada_t1 weight instead of hada_t2, see
|
||||
# https://github.com/KohakuBlueleaf/LyCORIS/pull/115
|
||||
adapter_info[model_type][peft_key].hada_t1 = tensor
|
||||
adapter_info[model_type][peft_key].hada_t2 = tensor
|
||||
adapter_info[model_type][peft_key].rank = tensor.shape[0]
|
||||
else:
|
||||
if kohya_type == "hada_t1":
|
||||
adapter_info[model_type][peft_key].hada_t1 = tensor
|
||||
adapter_info[model_type][peft_key].rank = tensor.shape[0]
|
||||
elif kohya_type == "hada_t2":
|
||||
adapter_info[model_type][peft_key].hada_t2 = tensor
|
||||
adapter_info[model_type][peft_key].rank = tensor.shape[0]
|
||||
elif kohya_type == "lokr_t2":
|
||||
adapter_info[model_type][peft_key].lokr_t2 = tensor
|
||||
adapter_info[model_type][peft_key].rank = tensor.shape[0]
|
||||
elif kohya_type == "lokr_w1":
|
||||
adapter_info[model_type][peft_key].lokr_w1 = tensor
|
||||
if isinstance(layer, nn.Linear) or (
|
||||
isinstance(layer, nn.Conv2d) and tuple(layer.weight.shape[2:]) == (1, 1)
|
||||
):
|
||||
adapter_info[model_type][peft_key].rank = rank
|
||||
elif isinstance(layer, nn.Conv2d):
|
||||
adapter_info[model_type][peft_key].rank = conv_rank
|
||||
elif kohya_type == "lokr_w2":
|
||||
adapter_info[model_type][peft_key].lokr_w2 = tensor
|
||||
if isinstance(layer, nn.Linear) or (
|
||||
isinstance(layer, nn.Conv2d) and tuple(layer.weight.shape[2:]) == (1, 1)
|
||||
):
|
||||
adapter_info[model_type][peft_key].rank = rank
|
||||
elif isinstance(layer, nn.Conv2d):
|
||||
adapter_info[model_type][peft_key].rank = conv_rank
|
||||
elif kohya_type == "lokr_w1_a":
|
||||
adapter_info[model_type][peft_key].lokr_w1_a = tensor
|
||||
adapter_info[model_type][peft_key].rank = tensor.shape[1]
|
||||
elif kohya_type == "lokr_w1_b":
|
||||
adapter_info[model_type][peft_key].lokr_w1_b = tensor
|
||||
adapter_info[model_type][peft_key].rank = tensor.shape[0]
|
||||
elif kohya_type == "lokr_w2_a":
|
||||
adapter_info[model_type][peft_key].lokr_w2_a = tensor
|
||||
elif kohya_type == "lokr_w2_b":
|
||||
adapter_info[model_type][peft_key].lokr_w2_b = tensor
|
||||
else:
|
||||
raise ValueError(f"Unknown weight name in key: {key} - {kohya_type}")
|
||||
|
||||
# Get function which will create adapter config based on extracted info
|
||||
construct_config_fn = {
|
||||
PeftType.LORA: construct_peft_loraconfig,
|
||||
PeftType.LOHA: construct_peft_lohaconfig,
|
||||
PeftType.LOKR: construct_peft_lokrconfig,
|
||||
}[adapter_type]
|
||||
|
||||
# Process each model sequentially
|
||||
for model, model_name in [(text_encoder, "text_encoder"), (unet, "unet")]:
|
||||
config = construct_config_fn(adapter_info[model_name], decompose_factor=decompose_factor)
|
||||
|
||||
# Output warning for LoHa with use_effective_conv2d
|
||||
if (
|
||||
isinstance(config, LoHaConfig)
|
||||
and getattr(config, "use_effective_conv2d", False)
|
||||
and args.loha_conv2d_weights_fix is False
|
||||
):
|
||||
logging.warning(
|
||||
'lycoris-lora<=1.9.0 LoHa implementation contains a bug, which can be fixed with "--loha_conv2d_weights_fix".\n'
|
||||
"For more info, please refer to https://github.com/huggingface/peft/pull/1021 and https://github.com/KohakuBlueleaf/LyCORIS/pull/115"
|
||||
)
|
||||
|
||||
model = get_peft_model(model, config)
|
||||
set_peft_model_state_dict(model, combine_peft_state_dict(adapter_info[model_name]))
|
||||
|
||||
if args.half:
|
||||
model.to(torch.float16)
|
||||
|
||||
# Save model to disk
|
||||
model.save_pretrained(os.path.join(args.dump_path, model_name))
|
||||
1266
examples/stable_diffusion/train_dreambooth.py
Normal file
File diff suppressed because it is too large
@ -10,12 +10,16 @@ group_info = []
|
||||
|
||||
total_num_failed = 0
|
||||
empty_file = False or len(list(Path().glob("*.log"))) == 0
|
||||
|
||||
total_empty_files = []
|
||||
|
||||
for log in Path().glob("*.log"):
|
||||
section_num_failed = 0
|
||||
i = 0
|
||||
with open(log, "r") as f:
|
||||
nb_lines = sum(1 for _ in f)
|
||||
for i, line in f:
|
||||
for line in f:
|
||||
line = json.loads(line)
|
||||
i += 1
|
||||
if line.get("nodeid", "") != "":
|
||||
test = line["nodeid"]
|
||||
if line.get("duration", None) is not None:
|
||||
@ -26,16 +30,16 @@ for log in Path().glob("*.log"):
|
||||
total_num_failed += 1
|
||||
else:
|
||||
passed.append([test, duration, log.name.split('_')[0]])
|
||||
if nb_lines == 0:
|
||||
empty_file = True
|
||||
empty_file = i == 0
|
||||
group_info.append([str(log), section_num_failed, failed])
|
||||
total_empty_files.append(empty_file)
|
||||
os.remove(log)
|
||||
failed = []
|
||||
no_error_payload = {
|
||||
"type": "section",
|
||||
"text": {
|
||||
"type": "plain_text",
|
||||
"text": "🌞 There were no failures!" if not empty_file else "Something went wrong - please check GH action results.",
|
||||
"text": "🌞 There were no failures!" if not any(total_empty_files) else "Something went wrong there is at least one empty file - please check GH action results.",
|
||||
"emoji": True
|
||||
}
|
||||
}
|
||||
@ -51,7 +55,7 @@ payload = [
|
||||
},
|
||||
]
|
||||
if total_num_failed > 0:
|
||||
for name, num_failed, failed_tests in group_info:
|
||||
for i, (name, num_failed, failed_tests) in enumerate(group_info):
|
||||
if num_failed > 0:
|
||||
if num_failed == 1:
|
||||
message += f"*{name}: {num_failed} failed test*\n"
|
||||
@ -62,10 +66,12 @@ if total_num_failed > 0:
|
||||
failed_table.append(test[0].split("::"))
|
||||
failed_table = tabulate(failed_table, headers=["Test Location", "Test Case", "Test Name"], showindex="always", tablefmt="grid", maxcolwidths=[12, 12, 12])
|
||||
message += '\n```\n' +failed_table + '\n```'
|
||||
|
||||
if total_empty_files[i]:
|
||||
message += f"\n*{name}: Warning! Empty file - please check the GitHub action job *\n"
|
||||
print(f'### {message}')
|
||||
else:
|
||||
payload.append(no_error_payload)
|
||||
|
||||
|
||||
if os.environ.get("TEST_TYPE", "") != "":
|
||||
from slack_sdk import WebClient
|
||||
|
||||
@ -15,8 +15,9 @@
|
||||
Script to close stale issue. Taken in part from the AllenNLP repository.
|
||||
https://github.com/allenai/allennlp.
|
||||
"""
|
||||
from datetime import datetime as dt
|
||||
import os
|
||||
from datetime import datetime as dt
|
||||
from datetime import timezone
|
||||
|
||||
from github import Github
|
||||
|
||||
@ -42,14 +43,14 @@ def main():
|
||||
last_comment = comments[0] if len(comments) > 0 else None
|
||||
if (
|
||||
last_comment is not None and last_comment.user.login == "github-actions[bot]"
|
||||
and (dt.utcnow() - issue.updated_at).days > 7
|
||||
and (dt.utcnow() - issue.created_at).days >= 30
|
||||
and (dt.now(timezone.utc) - issue.updated_at).days > 7
|
||||
and (dt.now(timezone.utc) - issue.created_at).days >= 30
|
||||
and not any(label.name.lower() in LABELS_TO_EXEMPT for label in issue.get_labels())
|
||||
):
|
||||
issue.edit(state="closed")
|
||||
elif (
|
||||
(dt.utcnow() - issue.updated_at).days > 23
|
||||
and (dt.utcnow() - issue.created_at).days >= 30
|
||||
(dt.now(timezone.utc) - issue.updated_at).days > 23
|
||||
and (dt.now(timezone.utc) - issue.created_at).days >= 30
|
||||
and not any(label.name.lower() in LABELS_TO_EXEMPT for label in issue.get_labels())
|
||||
):
|
||||
issue.create_comment(
|
||||
|
||||
32
setup.py
@ -18,11 +18,11 @@ extras = {}
|
||||
extras["quality"] = ["black ~= 22.0", "ruff>=0.0.241", "urllib3<=2.0.0"]
|
||||
extras["docs_specific"] = ["hf-doc-builder"]
|
||||
extras["dev"] = extras["quality"] + extras["docs_specific"]
|
||||
extras["test"] = extras["dev"] + ["pytest", "pytest-cov", "pytest-xdist", "parameterized", "datasets", "diffusers"]
|
||||
extras["test"] = extras["dev"] + ["pytest", "pytest-cov", "pytest-xdist", "parameterized", "datasets", "diffusers<0.21.0"]
|
||||
|
||||
setup(
|
||||
name="peft",
|
||||
version="0.5.0.dev0",
|
||||
version="0.6.2",
|
||||
description="Parameter-Efficient Fine-Tuning (PEFT)",
|
||||
license_files=["LICENSE"],
|
||||
long_description=open("README.md", "r", encoding="utf-8").read(),
|
||||
@ -45,7 +45,7 @@ setup(
|
||||
"torch>=1.13.0",
|
||||
"transformers",
|
||||
"tqdm",
|
||||
"accelerate",
|
||||
"accelerate>=0.21.0",
|
||||
"safetensors",
|
||||
],
|
||||
extras_require=extras,
|
||||
@ -63,19 +63,23 @@ setup(
|
||||
)
|
||||
|
||||
# Release checklist
|
||||
# 1. Change the version in __init__.py and setup.py.
|
||||
# 2. Commit these changes with the message: "Release: VERSION"
|
||||
# 3. Add a tag in git to mark the release: "git tag VERSION -m 'Adds tag VERSION for pypi' "
|
||||
# Push the tag to git: git push --tags origin main
|
||||
# 4. Run the following commands in the top-level directory:
|
||||
# 1. Change the version in __init__.py and setup.py to the release version, e.g. from "0.6.0.dev0" to "0.6.0"
|
||||
# 2. Check if there are any deprecations that need to be addressed for this release by searching for "# TODO" in the code
|
||||
# 3. Commit these changes with the message: "Release: VERSION", create a PR and merge it.
|
||||
# 4. Add a tag in git to mark the release: "git tag -a VERSION -m 'Adds tag VERSION for pypi' "
|
||||
# Push the tag to git:
|
||||
# git push --tags origin main
|
||||
# It is necessary to work on the original repository, not on a fork.
|
||||
# 5. Run the following commands in the top-level directory:
|
||||
# python setup.py bdist_wheel
|
||||
# python setup.py sdist
|
||||
# 5. Upload the package to the pypi test server first:
|
||||
# Ensure that you are on the clean and up-to-date main branch (git status --untracked-files=no should not list any
|
||||
# files and show the main branch)
|
||||
# 6. Upload the package to the pypi test server first:
|
||||
# twine upload dist/* -r pypitest
|
||||
# twine upload dist/* -r pypitest --repository-url=https://test.pypi.org/legacy/
|
||||
# 6. Check that you can install it in a virtualenv by running:
|
||||
# 7. Check that you can install it in a virtualenv by running:
|
||||
# pip install -i https://testpypi.python.org/pypi peft
|
||||
# 7. Upload the final version to actual pypi:
|
||||
# 8. Upload the final version to actual pypi:
|
||||
# twine upload dist/* -r pypi
|
||||
# 8. Add release notes to the tag in github once everything is looking hunky-dory.
|
||||
# 9. Update the version in __init__.py, setup.py to the new version "-dev" and push to master
|
||||
# 9. Add release notes to the tag on https://github.com/huggingface/peft/releases once everything is looking hunky-dory.
|
||||
# 10. Update the version in __init__.py, setup.py to the bumped minor version + ".dev0" (e.g. from "0.6.0" to "0.7.0.dev0")
|
||||
|
||||
@ -17,7 +17,7 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
__version__ = "0.5.0.dev0"
|
||||
__version__ = "0.6.2"
|
||||
|
||||
from .auto import (
|
||||
AutoPeftModel,
|
||||
@ -49,6 +49,10 @@ from .tuners import (
|
||||
AdaptionPromptModel,
|
||||
LoraConfig,
|
||||
LoraModel,
|
||||
LoHaConfig,
|
||||
LoHaModel,
|
||||
LoKrConfig,
|
||||
LoKrModel,
|
||||
IA3Config,
|
||||
IA3Model,
|
||||
AdaLoraConfig,
|
||||
@ -61,6 +65,8 @@ from .tuners import (
|
||||
PromptEncoderReparameterizationType,
|
||||
PromptTuningConfig,
|
||||
PromptTuningInit,
|
||||
MultitaskPromptTuningConfig,
|
||||
MultitaskPromptTuningInit,
|
||||
)
|
||||
from .utils import (
|
||||
TRANSFORMERS_MODELS_TO_PREFIX_TUNING_POSTPROCESS_MAPPING,
|
||||
|
||||
@ -16,7 +16,7 @@ import inspect
|
||||
import json
|
||||
import os
|
||||
from dataclasses import asdict, dataclass, field
|
||||
from typing import Optional, Union
|
||||
from typing import Dict, Optional, Union
|
||||
|
||||
from huggingface_hub import hf_hub_download
|
||||
from transformers.utils import PushToHubMixin
|
||||
@ -40,10 +40,10 @@ class PeftConfigMixin(PushToHubMixin):
|
||||
default=None, metadata={"help": "An auto mapping dict to help retrieve the base model class if needed."}
|
||||
)
|
||||
|
||||
def to_dict(self):
|
||||
def to_dict(self) -> Dict:
|
||||
return asdict(self)
|
||||
|
||||
def save_pretrained(self, save_directory, **kwargs):
|
||||
def save_pretrained(self, save_directory: str, **kwargs) -> None:
|
||||
r"""
|
||||
This method saves the configuration of your adapter model in a directory.
|
||||
|
||||
@ -61,6 +61,11 @@ class PeftConfigMixin(PushToHubMixin):
|
||||
auto_mapping_dict = kwargs.pop("auto_mapping_dict", None)
|
||||
|
||||
output_dict = asdict(self)
|
||||
# converting set type to list
|
||||
for key, value in output_dict.items():
|
||||
if isinstance(value, set):
|
||||
output_dict[key] = list(value)
|
||||
|
||||
output_path = os.path.join(save_directory, CONFIG_NAME)
|
||||
|
||||
# Add auto mapping details for custom models.
|
||||
@ -72,7 +77,7 @@ class PeftConfigMixin(PushToHubMixin):
|
||||
writer.write(json.dumps(output_dict, indent=2, sort_keys=True))
|
||||
|
||||
@classmethod
|
||||
def from_pretrained(cls, pretrained_model_name_or_path, subfolder=None, **kwargs):
|
||||
def from_pretrained(cls, pretrained_model_name_or_path: str, subfolder: Optional[str] = None, **kwargs):
|
||||
r"""
|
||||
This method loads the configuration of your adapter model from a directory.
|
||||
|
||||
@ -125,16 +130,12 @@ class PeftConfigMixin(PushToHubMixin):
|
||||
else:
|
||||
config_cls = cls
|
||||
|
||||
config = config_cls(**class_kwargs)
|
||||
|
||||
for key, value in loaded_attributes.items():
|
||||
if hasattr(config, key):
|
||||
setattr(config, key, value)
|
||||
|
||||
kwargs = {**class_kwargs, **loaded_attributes}
|
||||
config = config_cls(**kwargs)
|
||||
return config
|
||||
|
||||
@classmethod
|
||||
def from_json_file(cls, path_json_file, **kwargs):
|
||||
def from_json_file(cls, path_json_file: str, **kwargs):
|
||||
r"""
|
||||
Loads a configuration file from a json file.
|
||||
|
||||
@ -166,7 +167,7 @@ class PeftConfigMixin(PushToHubMixin):
|
||||
@classmethod
|
||||
def _get_peft_type(
|
||||
cls,
|
||||
model_id,
|
||||
model_id: str,
|
||||
**hf_hub_download_kwargs,
|
||||
):
|
||||
subfolder = hf_hub_download_kwargs.get("subfolder", None)
|
||||
@ -189,7 +190,7 @@ class PeftConfigMixin(PushToHubMixin):
|
||||
return loaded_attributes["peft_type"]
|
||||
|
||||
@property
|
||||
def is_prompt_learning(self):
|
||||
def is_prompt_learning(self) -> bool:
|
||||
r"""
|
||||
Utility method to check if the configuration is for prompt learning.
|
||||
"""
|
||||
@ -244,7 +245,7 @@ class PromptLearningConfig(PeftConfig):
|
||||
num_layers: Optional[int] = field(default=None, metadata={"help": "Number of transformer layers"})
|
||||
|
||||
@property
|
||||
def is_prompt_learning(self):
|
||||
def is_prompt_learning(self) -> bool:
|
||||
r"""
|
||||
Utility method to check if the configuration is for prompt learning.
|
||||
"""
|
||||
|
||||
113
src/peft/helpers.py
Normal file
@ -0,0 +1,113 @@
|
||||
import inspect
|
||||
from copy import deepcopy
|
||||
from functools import update_wrapper
|
||||
from types import MethodType
|
||||
|
||||
from .peft_model import PeftModel
|
||||
|
||||
|
||||
def update_forward_signature(model: PeftModel) -> None:
|
||||
"""
|
||||
Updates the forward signature of the PeftModel to include the signature of the base model's forward method.

Args:
model (`PeftModel`): Peft model whose forward signature should be updated.
|
||||
Example:
|
||||
|
||||
```python
|
||||
>>> from transformers import WhisperForConditionalGeneration
|
||||
>>> from peft import get_peft_model, LoraConfig, update_forward_signature
|
||||
|
||||
>>> model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en")
|
||||
>>> peft_config = LoraConfig(r=8, lora_alpha=32, lora_dropout=0.1, target_modules=["q_proj", "v_proj"])
|
||||
|
||||
>>> peft_model = get_peft_model(model, peft_config)
|
||||
>>> update_forward_signature(peft_model)
|
||||
```
|
||||
"""
|
||||
|
||||
# Only update signature when the current forward signature only has *args and **kwargs
|
||||
current_signature = inspect.signature(model.forward)
|
||||
if (
|
||||
len(current_signature.parameters) == 2
|
||||
and "args" in current_signature.parameters
|
||||
and "kwargs" in current_signature.parameters
|
||||
):
|
||||
forward = deepcopy(model.forward.__func__)
|
||||
update_wrapper(
|
||||
forward, type(model.get_base_model()).forward, assigned=("__doc__", "__name__", "__annotations__")
|
||||
)
|
||||
model.forward = MethodType(forward, model)
|
||||
|
||||
|
||||
def update_generate_signature(model: PeftModel) -> None:
|
||||
"""
|
||||
Args:
|
||||
Updates the generate signature of a PeftModel with overriding generate to include parents class signature
|
||||
model (`PeftModel`): Peft model to update the generate signature
|
||||
Example:
|
||||
|
||||
```python
|
||||
>>> from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
|
||||
>>> from peft import get_peft_model, LoraConfig, TaskType, update_generate_signature
|
||||
|
||||
>>> model_name_or_path = "bigscience/mt0-large"
|
||||
>>> tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
|
||||
>>> model = AutoModelForSeq2SeqLM.from_pretrained(model_name_or_path)
|
||||
|
||||
>>> peft_config = LoraConfig(
|
||||
... task_type=TaskType.SEQ_2_SEQ_LM, inference_mode=False, r=8, lora_alpha=32, lora_dropout=0.1
|
||||
... )
|
||||
>>> peft_model = get_peft_model(model, peft_config)
|
||||
>>> update_generate_signature(peft_model)
|
||||
>>> help(peft_model.generate)
|
||||
```
|
||||
"""
|
||||
if not hasattr(model, "generate"):
|
||||
return
|
||||
current_signature = inspect.signature(model.generate)
|
||||
if (
|
||||
len(current_signature.parameters) == 2
|
||||
and "args" in current_signature.parameters
|
||||
and "kwargs" in current_signature.parameters
|
||||
) or (len(current_signature.parameters) == 1 and "kwargs" in current_signature.parameters):
|
||||
generate = deepcopy(model.generate.__func__)
|
||||
update_wrapper(
|
||||
generate,
|
||||
type(model.get_base_model()).generate,
|
||||
assigned=("__doc__", "__name__", "__annotations__"),
|
||||
)
|
||||
model.generate = MethodType(generate, model)
|
||||
|
||||
|
||||
def update_signature(model: PeftModel, method: str = "all") -> None:
|
||||
"""
|
||||
Args:
|
||||
Updates the signature of a PeftModel include parents class signature for forward or generate method
|
||||
model (`PeftModel`): Peft model to update generate or forward signature method (`str`): method to update
|
||||
signature choose one of "forward", "generate", "all"
|
||||
Example:
|
||||
```python
|
||||
>>> from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
|
||||
>>> from peft import get_peft_model, LoraConfig, TaskType, update_signature
|
||||
|
||||
>>> model_name_or_path = "bigscience/mt0-large"
|
||||
>>> tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
|
||||
>>> model = AutoModelForSeq2SeqLM.from_pretrained(model_name_or_path)
|
||||
|
||||
>>> peft_config = LoraConfig(
|
||||
... task_type=TaskType.SEQ_2_SEQ_LM, inference_mode=False, r=8, lora_alpha=32, lora_dropout=0.1
|
||||
... )
|
||||
>>> peft_model = get_peft_model(model, peft_config)
|
||||
>>> update_signature(peft_model)
|
||||
>>> help(peft_model.generate)
|
||||
```
|
||||
"""
|
||||
if method == "forward":
|
||||
update_forward_signature(model)
|
||||
elif method == "generate":
|
||||
update_generate_signature(model)
|
||||
elif method == "all":
|
||||
update_forward_signature(model)
|
||||
update_generate_signature(model)
|
||||
else:
|
||||
raise ValueError(f"method {method} is not supported please choose one of ['forward', 'generate', 'all']")
|
||||
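All three helpers rely on the same standard-library mechanism: `functools.update_wrapper` copies `__doc__`, `__name__` and `__annotations__` from the base model's method onto the generic `*args/**kwargs` wrapper, and `types.MethodType` rebinds the result to the instance. A standalone sketch of that mechanism with made-up classes (not the PEFT implementation itself):

```python
from copy import deepcopy
from functools import update_wrapper
from types import MethodType


class Base:
    def forward(self, input_ids: int, attention_mask: int = 0) -> int:
        """Base forward with a meaningful signature."""
        return input_ids + attention_mask


class Wrapper:
    def __init__(self, base):
        self.base = base

    def forward(self, *args, **kwargs):
        return self.base.forward(*args, **kwargs)


wrapper = Wrapper(Base())
forward = deepcopy(wrapper.forward.__func__)
update_wrapper(forward, Base.forward, assigned=("__doc__", "__name__", "__annotations__"))
wrapper.forward = MethodType(forward, wrapper)
print(wrapper.forward.__doc__)  # now shows "Base forward with a meaningful signature."
```

After this, `help(wrapper.forward)` and `inspect.signature` report the base class's documented signature instead of `*args, **kwargs`, which is exactly what the PEFT helpers aim for.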
@ -13,16 +13,54 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import importlib
import importlib.metadata as importlib_metadata
from functools import lru_cache

import packaging.version


def is_bnb_available():
def is_bnb_available() -> bool:
return importlib.util.find_spec("bitsandbytes") is not None


def is_bnb_4bit_available():
def is_bnb_4bit_available() -> bool:
if not is_bnb_available():
return False

import bitsandbytes as bnb

return hasattr(bnb.nn, "Linear4bit")


def is_auto_gptq_available():
if importlib.util.find_spec("auto_gptq") is not None:
AUTOGPTQ_MINIMUM_VERSION = packaging.version.parse("0.5.0")
version_autogptq = packaging.version.parse(importlib_metadata.version("auto_gptq"))
if AUTOGPTQ_MINIMUM_VERSION <= version_autogptq:
return True
else:
raise ImportError(
f"Found an incompatible version of auto-gptq. Found version {version_autogptq}, "
f"but only versions above {AUTOGPTQ_MINIMUM_VERSION} are supported"
)


def is_optimum_available() -> bool:
return importlib.util.find_spec("optimum") is not None


@lru_cache()
def is_torch_tpu_available(check_device=True):
"Checks if `torch_xla` is installed and potentially if a TPU is in the environment"
if importlib.util.find_spec("torch_xla") is not None:
if check_device:
# We need to check if `xla_device` can be found, will raise a RuntimeError if not
try:
import torch_xla.core.xla_model as xm

_ = xm.xla_device()
return True
except RuntimeError:
return False
return True
return False

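`is_auto_gptq_available` combines `importlib.util.find_spec` (is the package importable at all?) with `importlib.metadata` plus `packaging` (is the installed version new enough?). The same pattern generalizes to any optional dependency; a hedged sketch with a placeholder package name:

```python
import importlib.metadata as importlib_metadata
import importlib.util

import packaging.version


def is_somepkg_available(min_version: str = "1.2.0") -> bool:
    # "somepkg" and the minimum version are placeholders, not real PEFT dependencies.
    if importlib.util.find_spec("somepkg") is None:
        return False
    installed = packaging.version.parse(importlib_metadata.version("somepkg"))
    return installed >= packaging.version.parse(min_version)
```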
@ -35,8 +35,13 @@ from .tuners import (
AdaptionPromptConfig,
IA3Config,
IA3Model,
LoHaConfig,
LoHaModel,
LoKrConfig,
LoKrModel,
LoraConfig,
LoraModel,
MultitaskPromptTuningConfig,
PrefixTuningConfig,
PromptEncoderConfig,
PromptTuningConfig,
@ -48,7 +53,7 @@ if TYPE_CHECKING:
from transformers import PreTrainedModel


MODEL_TYPE_TO_PEFT_MODEL_MAPPING = {
MODEL_TYPE_TO_PEFT_MODEL_MAPPING: Dict[str, PeftModel] = {
"SEQ_CLS": PeftModelForSequenceClassification,
"SEQ_2_SEQ_LM": PeftModelForSeq2SeqLM,
"CAUSAL_LM": PeftModelForCausalLM,
@ -57,24 +62,29 @@ MODEL_TYPE_TO_PEFT_MODEL_MAPPING = {
"FEATURE_EXTRACTION": PeftModelForFeatureExtraction,
}

PEFT_TYPE_TO_CONFIG_MAPPING = {
PEFT_TYPE_TO_CONFIG_MAPPING: Dict[str, PeftConfig] = {
"ADAPTION_PROMPT": AdaptionPromptConfig,
"PROMPT_TUNING": PromptTuningConfig,
"PREFIX_TUNING": PrefixTuningConfig,
"P_TUNING": PromptEncoderConfig,
"LORA": LoraConfig,
"LOHA": LoHaConfig,
"LOKR": LoKrConfig,
"ADALORA": AdaLoraConfig,
"IA3": IA3Config,
"MULTITASK_PROMPT_TUNING": MultitaskPromptTuningConfig,
}

PEFT_TYPE_TO_TUNER_MAPPING = {
"LORA": LoraModel,
"LOHA": LoHaModel,
"LOKR": LoKrModel,
"ADALORA": AdaLoraModel,
"IA3": IA3Model,
}


def get_peft_config(config_dict: Dict[str, Any]):
def get_peft_config(config_dict: Dict[str, Any]) -> PeftConfig:
"""
Returns a Peft config object from a dictionary.

@ -106,7 +116,9 @@ def get_peft_model(model: PreTrainedModel, peft_config: PeftConfig, adapter_name
return MODEL_TYPE_TO_PEFT_MODEL_MAPPING[peft_config.task_type](model, peft_config, adapter_name=adapter_name)


def inject_adapter_in_model(peft_config: PeftConfig, model: torch.nn.Module, adapter_name: str):
def inject_adapter_in_model(
peft_config: PeftConfig, model: torch.nn.Module, adapter_name: str = "default"
) -> torch.nn.Module:
r"""
A simple API to create and inject adapter in-place into a model. Currently the API does not support prompt learning
methods and adaption prompt. Make sure to have the correct `target_names` set in the `peft_config` object. The API
@ -117,8 +129,8 @@ def inject_adapter_in_model(peft_config: PeftConfig, model: torch.nn.Module, ada
Configuration object containing the parameters of the Peft model.
model (`torch.nn.Module`):
The input model where the adapter will be injected.
adapter_name (`str`):
The name of the adapter to be injected.
adapter_name (`str`, `optional`, defaults to `"default"`):
The name of the adapter to be injected, if not provided, the default adapter name is used ("default").
"""
if peft_config.is_prompt_learning or peft_config.is_adaption_prompt:
raise ValueError("`create_and_replace` does not support prompt learning and adaption prompt yet.")

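A minimal usage sketch for the two entry points touched in this hunk, `get_peft_config` and `inject_adapter_in_model` (the tiny module and the `linear` target name are illustrative only):

```python
import torch
from peft import get_peft_config, inject_adapter_in_model

# Rebuild a config object from a plain dict (e.g. a parsed adapter_config.json).
config = get_peft_config({"peft_type": "LORA", "r": 4, "lora_alpha": 8, "target_modules": ["linear"]})


class TinyModel(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.linear = torch.nn.Linear(16, 16)

    def forward(self, x):
        return self.linear(x)


# Inject the adapter in-place; with the new signature the adapter name defaults to "default".
model = inject_adapter_in_model(config, TinyModel())
```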
@ -15,6 +15,7 @@
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import collections
|
||||
import inspect
|
||||
import os
|
||||
import warnings
|
||||
@ -26,7 +27,7 @@ import torch
|
||||
from accelerate import dispatch_model, infer_auto_device_map
|
||||
from accelerate.hooks import AlignDevicesHook, add_hook_to_module, remove_hook_from_submodules
|
||||
from accelerate.utils import get_balanced_memory
|
||||
from huggingface_hub import hf_hub_download
|
||||
from huggingface_hub import ModelCard, ModelCardData, hf_hub_download
|
||||
from safetensors.torch import save_file as safe_save_file
|
||||
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
|
||||
from transformers import PreTrainedModel
|
||||
@ -39,7 +40,10 @@ from .tuners import (
|
||||
AdaLoraModel,
|
||||
AdaptionPromptModel,
|
||||
IA3Model,
|
||||
LoHaModel,
|
||||
LoKrModel,
|
||||
LoraModel,
|
||||
MultitaskPromptEmbedding,
|
||||
PrefixEncoder,
|
||||
PromptEmbedding,
|
||||
PromptEncoder,
|
||||
@ -54,8 +58,8 @@ from .utils import (
|
||||
_prepare_prompt_learning_config,
|
||||
_set_adapter,
|
||||
_set_trainable,
|
||||
add_library_to_model_card,
|
||||
get_peft_model_state_dict,
|
||||
id_tensor_storage,
|
||||
infer_device,
|
||||
load_peft_weights,
|
||||
set_peft_model_state_dict,
|
||||
@ -65,6 +69,8 @@ from .utils import (
|
||||
|
||||
PEFT_TYPE_TO_MODEL_MAPPING = {
|
||||
PeftType.LORA: LoraModel,
|
||||
PeftType.LOHA: LoHaModel,
|
||||
PeftType.LOKR: LoKrModel,
|
||||
PeftType.PROMPT_TUNING: PromptEmbedding,
|
||||
PeftType.P_TUNING: PromptEncoder,
|
||||
PeftType.PREFIX_TUNING: PrefixEncoder,
|
||||
@ -100,21 +106,22 @@ class PeftModel(PushToHubMixin, torch.nn.Module):
|
||||
|
||||
def __init__(self, model: PreTrainedModel, peft_config: PeftConfig, adapter_name: str = "default"):
|
||||
super().__init__()
|
||||
self.base_model = model
|
||||
self.config = getattr(self.base_model, "config", {"model_type": "custom"})
|
||||
self.modules_to_save = None
|
||||
self.peft_config = {}
|
||||
self.active_adapter = adapter_name
|
||||
self.peft_type = peft_config.peft_type
|
||||
if not peft_config.is_prompt_learning:
|
||||
self.peft_config[adapter_name] = peft_config
|
||||
self.base_model = PEFT_TYPE_TO_MODEL_MAPPING[peft_config.peft_type](
|
||||
self.base_model, self.peft_config, adapter_name
|
||||
)
|
||||
self.set_additional_trainable_modules(peft_config, adapter_name)
|
||||
else:
|
||||
self.add_adapter(adapter_name, peft_config)
|
||||
|
||||
self._is_prompt_learning = peft_config.is_prompt_learning
|
||||
if self._is_prompt_learning:
|
||||
self._peft_config = {adapter_name: peft_config}
|
||||
self.base_model = model
|
||||
self.add_adapter(adapter_name, peft_config)
|
||||
else:
|
||||
self._peft_config = None
|
||||
cls = PEFT_TYPE_TO_MODEL_MAPPING[peft_config.peft_type]
|
||||
self.base_model = cls(model, {adapter_name: peft_config}, adapter_name)
|
||||
self.set_additional_trainable_modules(peft_config, adapter_name)
|
||||
|
||||
self.config = getattr(self.base_model, "config", {"model_type": "custom"})
|
||||
if getattr(model, "is_gradient_checkpointing", True):
|
||||
model = self._prepare_model_for_gradient_checkpointing(model)
|
||||
|
||||
@ -124,6 +131,29 @@ class PeftModel(PushToHubMixin, torch.nn.Module):
|
||||
if hasattr(self.base_model, "config") and hasattr(self.base_model.config, "pretraining_tp"):
|
||||
self.base_model.config.pretraining_tp = 1
|
||||
|
||||
@property
|
||||
def peft_config(self) -> Dict[str, PeftConfig]:
|
||||
if self._is_prompt_learning:
|
||||
return self._peft_config
|
||||
return self.base_model.peft_config
|
||||
|
||||
@property
|
||||
def active_adapters(self):
|
||||
try:
|
||||
adapters = self.base_model.active_adapters
|
||||
except AttributeError:
|
||||
adapters = self.active_adapter
|
||||
if isinstance(adapters, str):
|
||||
adapters = [adapters]
|
||||
return adapters
|
||||
|
||||
@peft_config.setter
|
||||
def peft_config(self, value: Dict[str, PeftConfig]):
|
||||
if self._is_prompt_learning:
|
||||
self._peft_config = value
|
||||
else:
|
||||
self.base_model.peft_config = value
|
||||
|
||||
def save_pretrained(
|
||||
self,
|
||||
save_directory: str,
|
||||
@ -133,13 +163,15 @@ class PeftModel(PushToHubMixin, torch.nn.Module):
|
||||
):
|
||||
r"""
|
||||
This function saves the adapter model and the adapter configuration files to a directory, so that it can be
|
||||
reloaded using the [`LoraModel.from_pretrained`] class method, and also used by the [`LoraModel.push_to_hub`]
|
||||
reloaded using the [`PeftModel.from_pretrained`] class method, and also used by the [`PeftModel.push_to_hub`]
|
||||
method.
|
||||
|
||||
Args:
|
||||
save_directory (`str`):
|
||||
Directory where the adapter model and configuration files will be saved (will be created if it does not
|
||||
exist).
|
||||
safe_serialization (`bool`, *optional*):
|
||||
Whether to save the adapter files in safetensors format.
|
||||
kwargs (additional keyword arguments, *optional*):
|
||||
Additional keyword arguments passed along to the `push_to_hub` method.
|
||||
"""
|
||||
@ -171,6 +203,28 @@ class PeftModel(PushToHubMixin, torch.nn.Module):
|
||||
os.makedirs(output_dir, exist_ok=True)
|
||||
|
||||
if safe_serialization:
|
||||
# Section copied from: https://github.com/huggingface/transformers/blob/main/src/transformers/modeling_utils.py#L2111-L2134
|
||||
# Safetensors does not allow tensor aliasing.
|
||||
# We're going to remove aliases before saving
|
||||
ptrs = collections.defaultdict(list)
|
||||
for name, tensor in output_state_dict.items():
|
||||
# Sometimes in the state_dict we have non-tensor objects.
|
||||
# e.g. in bitsandbytes we have some `str` objects in the state_dict
|
||||
if isinstance(tensor, torch.Tensor):
|
||||
ptrs[id_tensor_storage(tensor)].append(name)
|
||||
else:
|
||||
# In the non-tensor case, fall back to the pointer of the object itself
|
||||
ptrs[id(tensor)].append(name)
|
||||
|
||||
# These are all the pointers of shared tensors.
|
||||
shared_ptrs = {ptr: names for ptr, names in ptrs.items() if len(names) > 1}
|
||||
|
||||
for _, names in shared_ptrs.items():
|
||||
# Here we just clone the shared tensors to avoid tensor aliasing which is
|
||||
# not supported in safetensors.
|
||||
for shared_tensor_name in names[1:]:
|
||||
output_state_dict[shared_tensor_name] = output_state_dict[shared_tensor_name].clone()
|
||||
|
||||
safe_save_file(
|
||||
output_state_dict,
|
||||
os.path.join(output_dir, SAFETENSORS_WEIGHTS_NAME),
|
||||
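safetensors refuses to serialize aliased tensors, so the branch above groups state-dict entries by underlying storage and clones every duplicate before saving. A standalone sketch of that de-aliasing step (a simplification: it keys on `Tensor.data_ptr()` rather than PEFT's `id_tensor_storage` helper):

```python
import collections

import torch

state_dict = {"a": torch.zeros(4)}
state_dict["b"] = state_dict["a"]  # aliased entry, e.g. produced by weight tying

ptrs = collections.defaultdict(list)
for name, tensor in state_dict.items():
    key = tensor.data_ptr() if isinstance(tensor, torch.Tensor) else id(tensor)
    ptrs[key].append(name)

for names in (group for group in ptrs.values() if len(group) > 1):
    for shared_name in names[1:]:
        state_dict[shared_name] = state_dict[shared_name].clone()  # break the alias

assert state_dict["a"].data_ptr() != state_dict["b"].data_ptr()
```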
@ -299,12 +353,24 @@ class PeftModel(PushToHubMixin, torch.nn.Module):
|
||||
config.num_transformer_submodules = 2 if config.task_type == TaskType.SEQ_2_SEQ_LM else 1
|
||||
|
||||
for named_param, value in list(transformer_backbone.named_parameters()):
|
||||
if value.shape[0] == self.base_model.config.vocab_size:
|
||||
# for ZeRO-3, the tensor is sharded across accelerators and deepspeed modifies it to a tensor with shape [0]
|
||||
# the actual unsharded shape is stored in "ds_shape" attribute
|
||||
# special handling is needed in case the model is initialized in deepspeed.zero.Init() context or HfDeepSpeedConfig
|
||||
# has been called before
|
||||
# For reference refer to issue: https://github.com/huggingface/peft/issues/996
|
||||
deepspeed_distributed_tensor_shape = getattr(value, "ds_shape", None)
|
||||
|
||||
if value.shape[0] == self.base_model.config.vocab_size or (
|
||||
deepspeed_distributed_tensor_shape is not None
|
||||
and deepspeed_distributed_tensor_shape[0] == self.base_model.config.vocab_size
|
||||
):
|
||||
self.word_embeddings = transformer_backbone.get_submodule(named_param.replace(".weight", ""))
|
||||
break
|
||||
|
||||
if config.peft_type == PeftType.PROMPT_TUNING:
|
||||
prompt_encoder = PromptEmbedding(config, self.word_embeddings)
|
||||
elif config.peft_type == PeftType.MULTITASK_PROMPT_TUNING:
|
||||
prompt_encoder = MultitaskPromptEmbedding(config, self.word_embeddings)
|
||||
elif config.peft_type == PeftType.P_TUNING:
|
||||
prompt_encoder = PromptEncoder(config)
|
||||
elif config.peft_type == PeftType.PREFIX_TUNING:
|
||||
@ -322,7 +388,11 @@ class PeftModel(PushToHubMixin, torch.nn.Module):
|
||||
r"""
|
||||
Prepares the model for gradient checkpointing if necessary
|
||||
"""
|
||||
if not (getattr(model, "is_loaded_in_8bit", False) or getattr(model, "is_loaded_in_4bit", False)):
|
||||
if not (
|
||||
getattr(model, "is_loaded_in_8bit", False)
|
||||
or getattr(model, "is_loaded_in_4bit", False)
|
||||
or getattr(model, "is_quantized", False)
|
||||
):
|
||||
if hasattr(model, "enable_input_require_grads"):
|
||||
model.enable_input_require_grads()
|
||||
elif hasattr(model, "get_input_embeddings"):
|
||||
@ -344,10 +414,15 @@ class PeftModel(PushToHubMixin, torch.nn.Module):
|
||||
)
|
||||
if self.peft_config[adapter_name].peft_type == PeftType.PREFIX_TUNING:
|
||||
prompt_tokens = prompt_tokens[:, : self.peft_config[adapter_name].num_virtual_tokens]
|
||||
prompt_embeddings = prompt_encoder(prompt_tokens)
|
||||
|
||||
if self.peft_config[adapter_name].peft_type == PeftType.MULTITASK_PROMPT_TUNING:
|
||||
prompt_embeddings = super(MultitaskPromptEmbedding, prompt_encoder).forward(prompt_tokens)
|
||||
else:
|
||||
prompt_embeddings = prompt_encoder(prompt_tokens)
|
||||
|
||||
return prompt_embeddings[0].detach().cpu()
|
||||
|
||||
def get_prompt(self, batch_size: int):
|
||||
def get_prompt(self, batch_size: int, task_ids: Optional[torch.Tensor] = None):
|
||||
"""
|
||||
Returns the virtual prompts to use for Peft. Only applicable when `peft_config.peft_type != PeftType.LORA`.
|
||||
"""
|
||||
@ -384,10 +459,13 @@ class PeftModel(PushToHubMixin, torch.nn.Module):
|
||||
past_key_values = post_process_fn(past_key_values)
|
||||
return past_key_values
|
||||
else:
|
||||
if peft_config.inference_mode:
|
||||
prompts = prompt_encoder.embedding.weight.repeat(batch_size, 1, 1)
|
||||
if peft_config.peft_type == PeftType.MULTITASK_PROMPT_TUNING:
|
||||
prompts = prompt_encoder(prompt_tokens, task_ids)
|
||||
else:
|
||||
prompts = prompt_encoder(prompt_tokens)
|
||||
if peft_config.inference_mode:
|
||||
prompts = prompt_encoder.embedding.weight.repeat(batch_size, 1, 1)
|
||||
else:
|
||||
prompts = prompt_encoder(prompt_tokens)
|
||||
return prompts
|
||||
|
||||
def get_nb_trainable_parameters(self):
|
||||
@ -481,10 +559,9 @@ class PeftModel(PushToHubMixin, torch.nn.Module):
|
||||
f"Found {self.peft_type} and {peft_config.peft_type}."
|
||||
)
|
||||
|
||||
self.peft_config[adapter_name] = peft_config
|
||||
|
||||
try:
|
||||
if peft_config.is_prompt_learning:
|
||||
self.peft_config[adapter_name] = peft_config
|
||||
if hasattr(self.config, "to_dict"):
|
||||
dict_config = self.config.to_dict()
|
||||
else:
|
||||
@ -495,9 +572,11 @@ class PeftModel(PushToHubMixin, torch.nn.Module):
|
||||
elif peft_config.is_adaption_prompt:
|
||||
self.base_model.add_adapter(adapter_name, peft_config)
|
||||
else:
|
||||
self.peft_config[adapter_name] = peft_config
|
||||
self.base_model.inject_adapter(self, adapter_name)
|
||||
except Exception: # somthing went wrong, roll back
|
||||
del self.peft_config[adapter_name]
|
||||
if adapter_name in self.peft_config:
|
||||
del self.peft_config[adapter_name]
|
||||
raise
|
||||
|
||||
self.set_additional_trainable_modules(peft_config, adapter_name)
|
||||
@ -620,13 +699,22 @@ class PeftModel(PushToHubMixin, torch.nn.Module):
|
||||
Updates or create model card to include information about peft:
|
||||
1. Adds `peft` library tag
|
||||
2. Adds peft version
|
||||
3. Adds quantization information if it was used
|
||||
3. Adds base model info
|
||||
4. Adds quantization information if it was used
|
||||
"""
|
||||
# Adds `peft` library tag
|
||||
add_library_to_model_card(output_dir)
|
||||
|
||||
with open(os.path.join(output_dir, "README.md"), "r") as f:
|
||||
lines = f.readlines()
|
||||
filename = os.path.join(output_dir, "README.md")
|
||||
|
||||
card = ModelCard.load(filename) if os.path.exists(filename) else ModelCard.from_template(ModelCardData())
|
||||
|
||||
card.data["library_name"] = "peft"
|
||||
model_config = self.config
|
||||
if hasattr(model_config, "to_dict"):
|
||||
model_config = model_config.to_dict()
|
||||
if model_config.get("model_type", "custom") != "custom":
|
||||
card.data["base_model"] = model_config["_name_or_path"]
|
||||
|
||||
lines = card.text.splitlines()
|
||||
|
||||
quantization_config = None
|
||||
if hasattr(self.config, "quantization_config"):
|
||||
@ -651,9 +739,8 @@ class PeftModel(PushToHubMixin, torch.nn.Module):
|
||||
else:
|
||||
lines.append(f"{framework_block_heading}\n\n- PEFT {__version__}\n")
|
||||
|
||||
# write the lines back to README.md
|
||||
with open(os.path.join(output_dir, "README.md"), "w") as f:
|
||||
f.writelines(lines)
|
||||
card.text = "\n".join(lines)
|
||||
card.save(filename)
|
||||
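The block above switches to `huggingface_hub`'s `ModelCard`/`ModelCardData` API instead of hand-editing README lines. A minimal sketch of that flow (the local path and the base model id are made up):

```python
import os

from huggingface_hub import ModelCard, ModelCardData

output_dir = "./my-adapter"
os.makedirs(output_dir, exist_ok=True)
filename = os.path.join(output_dir, "README.md")

card = ModelCard.load(filename) if os.path.exists(filename) else ModelCard.from_template(ModelCardData())
card.data["library_name"] = "peft"               # YAML metadata block of the card
card.data["base_model"] = "facebook/opt-350m"    # illustrative base model id
card.text += "\n## Training procedure\n\n- PEFT 0.6.x\n"
card.save(filename)
```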
|
||||
|
||||
class PeftModelForSequenceClassification(PeftModel):
|
||||
@ -720,6 +807,7 @@ class PeftModelForSequenceClassification(PeftModel):
|
||||
output_attentions=None,
|
||||
output_hidden_states=None,
|
||||
return_dict=None,
|
||||
task_ids=None,
|
||||
**kwargs,
|
||||
):
|
||||
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
||||
@ -767,7 +855,7 @@ class PeftModelForSequenceClassification(PeftModel):
|
||||
).long()
|
||||
if inputs_embeds is None:
|
||||
inputs_embeds = self.word_embeddings(input_ids)
|
||||
prompts = self.get_prompt(batch_size=batch_size)
|
||||
prompts = self.get_prompt(batch_size=batch_size, task_ids=task_ids)
|
||||
prompts = prompts.to(inputs_embeds.dtype)
|
||||
inputs_embeds = torch.cat((prompts, inputs_embeds), dim=1)
|
||||
return self.base_model(inputs_embeds=inputs_embeds, **kwargs)
|
||||
@ -894,6 +982,7 @@ class PeftModelForCausalLM(PeftModel):
|
||||
output_attentions=None,
|
||||
output_hidden_states=None,
|
||||
return_dict=None,
|
||||
task_ids=None,
|
||||
**kwargs,
|
||||
):
|
||||
peft_config = self.active_peft_config
|
||||
@ -956,7 +1045,7 @@ class PeftModelForCausalLM(PeftModel):
|
||||
if labels is not None:
|
||||
prefix_labels = torch.full((batch_size, peft_config.num_virtual_tokens), -100).to(labels.device)
|
||||
kwargs["labels"] = torch.cat((prefix_labels, labels), dim=1)
|
||||
prompts = self.get_prompt(batch_size=batch_size)
|
||||
prompts = self.get_prompt(batch_size=batch_size, task_ids=task_ids)
|
||||
prompts = prompts.to(inputs_embeds.dtype)
|
||||
inputs_embeds = torch.cat((prompts, inputs_embeds), dim=1)
|
||||
return self.base_model(inputs_embeds=inputs_embeds, **kwargs)
|
||||
@ -976,7 +1065,7 @@ class PeftModelForCausalLM(PeftModel):
|
||||
self.base_model.prepare_inputs_for_generation = self.base_model_prepare_inputs_for_generation
|
||||
return outputs
|
||||
|
||||
def prepare_inputs_for_generation(self, *args, **kwargs):
|
||||
def prepare_inputs_for_generation(self, *args, task_ids: torch.Tensor = None, **kwargs):
|
||||
peft_config = self.active_peft_config
|
||||
model_kwargs = self.base_model_prepare_inputs_for_generation(*args, **kwargs)
|
||||
if peft_config.is_prompt_learning:
|
||||
@ -1004,7 +1093,7 @@ class PeftModelForCausalLM(PeftModel):
|
||||
else:
|
||||
if model_kwargs["past_key_values"] is None:
|
||||
inputs_embeds = self.word_embeddings(model_kwargs["input_ids"])
|
||||
prompts = self.get_prompt(batch_size=model_kwargs["input_ids"].shape[0])
|
||||
prompts = self.get_prompt(batch_size=model_kwargs["input_ids"].shape[0], task_ids=task_ids)
|
||||
prompts = prompts.to(inputs_embeds.dtype)
|
||||
model_kwargs["inputs_embeds"] = torch.cat((prompts, inputs_embeds), dim=1)
|
||||
model_kwargs["input_ids"] = None
|
||||
@ -1067,6 +1156,7 @@ class PeftModelForSeq2SeqLM(PeftModel):
|
||||
output_attentions=None,
|
||||
output_hidden_states=None,
|
||||
return_dict=None,
|
||||
task_ids=None,
|
||||
**kwargs,
|
||||
):
|
||||
peft_config = self.active_peft_config
|
||||
@ -1091,7 +1181,8 @@ class PeftModelForSeq2SeqLM(PeftModel):
|
||||
prefix_attention_mask = torch.ones(batch_size, peft_config.num_virtual_tokens).to(
|
||||
decoder_attention_mask.device
|
||||
)
|
||||
decoder_attention_mask = torch.cat((prefix_attention_mask, decoder_attention_mask), dim=1)
|
||||
if peft_config.peft_type not in [PeftType.PROMPT_TUNING, PeftType.P_TUNING]:
|
||||
decoder_attention_mask = torch.cat((prefix_attention_mask, decoder_attention_mask), dim=1)
|
||||
|
||||
if kwargs.get("position_ids", None) is not None:
|
||||
warnings.warn("Position ids are not supported for parameter efficient tuning. Ignoring position ids.")
|
||||
@ -1162,7 +1253,7 @@ class PeftModelForSeq2SeqLM(PeftModel):
|
||||
elif peft_config.num_transformer_submodules == 2:
|
||||
prefix_labels = torch.full((batch_size, peft_config.num_virtual_tokens), -100).to(labels.device)
|
||||
kwargs["labels"] = torch.cat((prefix_labels, labels), dim=1)
|
||||
prompts = self.get_prompt(batch_size=batch_size)
|
||||
prompts = self.get_prompt(batch_size=batch_size, task_ids=task_ids)
|
||||
prompts = prompts.to(inputs_embeds.dtype)
|
||||
inputs_embeds = torch.cat((prompts[:, : peft_config.num_virtual_tokens], inputs_embeds), dim=1)
|
||||
if peft_config.num_transformer_submodules == 1:
|
||||
@ -1200,7 +1291,11 @@ class PeftModelForSeq2SeqLM(PeftModel):
|
||||
|
||||
if peft_config.peft_type == PeftType.PREFIX_TUNING:
|
||||
outputs = self.base_model.generate(**kwargs)
|
||||
elif peft_config.peft_type in [PeftType.PROMPT_TUNING, PeftType.P_TUNING]:
|
||||
elif peft_config.peft_type in [
|
||||
PeftType.PROMPT_TUNING,
|
||||
PeftType.P_TUNING,
|
||||
PeftType.MULTITASK_PROMPT_TUNING,
|
||||
]:
|
||||
kwargs = deepcopy(kwargs)
|
||||
|
||||
if "encoder_outputs" in kwargs:
|
||||
@ -1212,7 +1307,7 @@ class PeftModelForSeq2SeqLM(PeftModel):
|
||||
input_ids = kwargs.pop("input_ids")
|
||||
inputs_embeds = self.word_embeddings(input_ids)
|
||||
batch_size = inputs_embeds.shape[0]
|
||||
prompts = self.get_prompt(batch_size=batch_size)
|
||||
prompts = self.get_prompt(batch_size=batch_size, task_ids=kwargs.pop("task_ids", None))
|
||||
prompts = prompts.to(inputs_embeds.dtype)
|
||||
|
||||
inputs_embeds = torch.cat((prompts[:, : peft_config.num_virtual_tokens], inputs_embeds), dim=1)
|
||||
@ -1315,6 +1410,7 @@ class PeftModelForTokenClassification(PeftModel):
|
||||
output_attentions=None,
|
||||
output_hidden_states=None,
|
||||
return_dict=None,
|
||||
task_ids=None,
|
||||
**kwargs,
|
||||
):
|
||||
peft_config = self.active_peft_config
|
||||
@ -1363,7 +1459,7 @@ class PeftModelForTokenClassification(PeftModel):
|
||||
).long()
|
||||
if inputs_embeds is None:
|
||||
inputs_embeds = self.word_embeddings(input_ids)
|
||||
prompts = self.get_prompt(batch_size=batch_size)
|
||||
prompts = self.get_prompt(batch_size=batch_size, task_ids=task_ids)
|
||||
prompts = prompts.to(inputs_embeds.dtype)
|
||||
inputs_embeds = torch.cat((prompts, inputs_embeds), dim=1)
|
||||
return self.base_model(inputs_embeds=inputs_embeds, **kwargs)
|
||||
|
||||
@ -19,11 +19,14 @@

from .adaption_prompt import AdaptionPromptConfig, AdaptionPromptModel
from .lora import LoraConfig, LoraModel
from .loha import LoHaConfig, LoHaModel
from .lokr import LoKrConfig, LoKrModel
from .ia3 import IA3Config, IA3Model
from .adalora import AdaLoraConfig, AdaLoraModel
from .p_tuning import PromptEncoder, PromptEncoderConfig, PromptEncoderReparameterizationType
from .prefix_tuning import PrefixEncoder, PrefixTuningConfig
from .prompt_tuning import PromptEmbedding, PromptTuningConfig, PromptTuningInit
from .multitask_prompt_tuning import MultitaskPromptEmbedding, MultitaskPromptTuningConfig, MultitaskPromptTuningInit

# Mapping of tuners that support direct plugging
TUNERS_MAPPING = {

@ -1,758 +0,0 @@
|
||||
import warnings
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Optional
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
from transformers.pytorch_utils import Conv1D
|
||||
|
||||
from ..import_utils import is_bnb_4bit_available, is_bnb_available
|
||||
from ..utils import (
|
||||
TRANSFORMERS_MODELS_TO_ADALORA_TARGET_MODULES_MAPPING,
|
||||
PeftType,
|
||||
_freeze_adapter,
|
||||
_get_submodules,
|
||||
transpose,
|
||||
)
|
||||
from .lora import (
|
||||
LoraConfig,
|
||||
LoraLayer,
|
||||
LoraModel,
|
||||
)
|
||||
|
||||
|
||||
if is_bnb_available():
|
||||
import bitsandbytes as bnb
|
||||
|
||||
|
||||
@dataclass
|
||||
class AdaLoraConfig(LoraConfig):
|
||||
"""
|
||||
This is the configuration class to store the configuration of a [`~peft.AdaLora`].
|
||||
|
||||
Args:
|
||||
target_r (`int`): The target average rank of incremental matrix.
|
||||
init_r (`int`): The initial rank for each incremental matrix.
|
||||
tinit (`int`): The steps of initial fine-tuning warmup.
|
||||
tfinal (`int`): The step of final fine-tuning.
|
||||
deltaT (`int`): The time internval between two budget allocations.
|
||||
beta1 (`float`): The hyperparameter of EMA for sensitivity smoothing.
|
||||
beta2 (`float`): The hyperparameter of EMA for undertainty quantification.
|
||||
orth_reg_weight (`float`): The coefficient of orthogonal regularization.
|
||||
total_step (`int`): The total training steps that should be specified before training.
|
||||
rank_pattern (`list`): The allocated rank for each weight matrix by RankAllocator.
|
||||
"""
|
||||
|
||||
target_r: int = field(default=8, metadata={"help": "Target Lora matrix dimension."})
|
||||
init_r: int = field(default=12, metadata={"help": "Intial Lora matrix dimension."})
|
||||
tinit: int = field(default=0, metadata={"help": "The steps of initial warmup."})
|
||||
tfinal: int = field(default=0, metadata={"help": "The steps of final warmup."})
|
||||
deltaT: int = field(default=1, metadata={"help": "Step interval of rank allocation."})
|
||||
beta1: float = field(default=0.85, metadata={"help": "Hyperparameter of EMA."})
|
||||
beta2: float = field(default=0.85, metadata={"help": "Hyperparameter of EMA."})
|
||||
orth_reg_weight: float = field(default=0.5, metadata={"help": "The orthogonal regularization coefficient."})
|
||||
total_step: Optional[int] = field(default=None, metadata={"help": "The total training steps."})
|
||||
rank_pattern: Optional[dict] = field(default=None, metadata={"help": "The saved rank pattern."})
|
||||
|
||||
def __post_init__(self):
|
||||
self.peft_type = PeftType.ADALORA
|
||||
|
||||
|
||||
class AdaLoraLayer(LoraLayer):
|
||||
def __init__(
|
||||
self,
|
||||
in_features: int,
|
||||
out_features: int,
|
||||
):
|
||||
super().__init__(in_features, out_features)
|
||||
self.lora_E = nn.ParameterDict({})
|
||||
self.lora_A = nn.ParameterDict({})
|
||||
self.lora_B = nn.ParameterDict({})
|
||||
self.ranknum = nn.ParameterDict({})
|
||||
|
||||
def update_layer(self, adapter_name, r, lora_alpha, lora_dropout, init_lora_weights):
|
||||
self.r[adapter_name] = r
|
||||
self.lora_alpha[adapter_name] = lora_alpha
|
||||
if lora_dropout > 0.0:
|
||||
lora_dropout_layer = nn.Dropout(p=lora_dropout)
|
||||
else:
|
||||
lora_dropout_layer = nn.Identity()
|
||||
|
||||
self.lora_dropout.update(nn.ModuleDict({adapter_name: lora_dropout_layer}))
|
||||
# Actual trainable parameters
|
||||
# Right singular vectors
|
||||
self.lora_A.update(nn.ParameterDict({adapter_name: nn.Parameter(torch.randn(r, self.in_features))}))
|
||||
# Singular values
|
||||
self.lora_E.update(nn.ParameterDict({adapter_name: nn.Parameter(torch.randn(r, 1))}))
|
||||
# Left singular vectors
|
||||
self.lora_B.update(nn.ParameterDict({adapter_name: nn.Parameter(torch.randn(self.out_features, r))}))
|
||||
# The current rank
|
||||
self.ranknum.update(nn.ParameterDict({adapter_name: nn.Parameter(torch.randn(1), requires_grad=False)}))
|
||||
self.ranknum[adapter_name].data.fill_(float(r))
|
||||
self.ranknum[adapter_name].requires_grad = False
|
||||
self.scaling[adapter_name] = lora_alpha if lora_alpha > 0 else float(r)
|
||||
if init_lora_weights:
|
||||
self.reset_lora_parameters(adapter_name)
|
||||
self.to(self.weight.device)
|
||||
|
||||
def reset_lora_parameters(self, adapter_name):
|
||||
if adapter_name in self.lora_A.keys():
|
||||
nn.init.normal_(self.lora_E[adapter_name], mean=0.0, std=0.02)
|
||||
nn.init.normal_(self.lora_A[adapter_name], mean=0.0, std=0.02)
|
||||
nn.init.normal_(self.lora_B[adapter_name], mean=0.0, std=0.02)
|
||||
|
||||
|
||||
class AdaLoraModel(LoraModel):
|
||||
"""
|
||||
Creates AdaLoRA (Adaptive LoRA) model from a pretrained transformers model. Paper:
|
||||
https://openreview.net/forum?id=lq62uWRJjiY
|
||||
|
||||
Args:
|
||||
model ([`transformers.PreTrainedModel`]): The model to be adapted.
|
||||
config ([`AdaLoraConfig`]): The configuration of the AdaLora model.
|
||||
adapter_name (`str`): The name of the adapter, defaults to `"default"`.
|
||||
|
||||
Returns:
|
||||
`torch.nn.Module`: The AdaLora model.
|
||||
|
||||
Example::
|
||||
|
||||
>>> from transformers import AutoModelForSeq2SeqLM, LoraConfig >>> from peft import AdaLoraModel, AdaLoraConfig
|
||||
>>> config = AdaLoraConfig(
|
||||
peft_type="ADALORA", task_type="SEQ_2_SEQ_LM", r=8, lora_alpha=32, target_modules=["q", "v"],
|
||||
lora_dropout=0.01,
|
||||
)
|
||||
>>> model = AutoModelForSeq2SeqLM.from_pretrained("t5-base") >>> model = AdaLoraModel(model, config, "default")
|
||||
|
||||
**Attributes**:
|
||||
- **model** ([`transformers.PreTrainedModel`]) -- The model to be adapted.
|
||||
- **peft_config** ([`AdaLoraConfig`]): The configuration of the AdaLora model.
|
||||
"""
|
||||
|
||||
def __init__(self, model, config, adapter_name):
|
||||
super().__init__(model, config, adapter_name)
|
||||
|
||||
traininable_mode_counter = 0
|
||||
for config in self.peft_config.values():
|
||||
if not config.inference_mode:
|
||||
traininable_mode_counter += 1
|
||||
|
||||
if traininable_mode_counter > 1:
|
||||
raise ValueError(
|
||||
"AdaLoraModel supports only 1 trainable adapter. "
|
||||
"When using multiple adapters, set inference_mode to True for all adapters except the one you want to train."
|
||||
)
|
||||
|
||||
if self.peft_config[adapter_name].inference_mode:
|
||||
_freeze_adapter(self.model, adapter_name)
|
||||
else:
|
||||
self.trainable_adapter_name = adapter_name
|
||||
self.rankallocator = RankAllocator(self.model, self.peft_config[adapter_name], self.trainable_adapter_name)
|
||||
|
||||
def _check_new_adapter_config(self, config: LoraConfig) -> None:
|
||||
"""
|
||||
A helper method to check the config when a new adapter is being added.
|
||||
|
||||
Raise a ValueError if there is something wrong with the config or if it conflicts with existing adapters.
|
||||
|
||||
"""
|
||||
super()._check_new_adapter_config(config)
|
||||
|
||||
traininable_mode_counter = 0
|
||||
for config_ in self.peft_config.values():
|
||||
if not config_.inference_mode:
|
||||
traininable_mode_counter += 1
|
||||
|
||||
if traininable_mode_counter > 1:
|
||||
raise ValueError(
|
||||
f"{self.__class__.__name__} supports only 1 trainable adapter. "
|
||||
"When using multiple adapters, set inference_mode to True for all adapters except the one "
|
||||
"you want to train."
|
||||
)
|
||||
|
||||
def _create_and_replace(
|
||||
self,
|
||||
lora_config,
|
||||
adapter_name,
|
||||
target,
|
||||
target_name,
|
||||
parent,
|
||||
**optionnal_kwargs,
|
||||
):
|
||||
loaded_in_8bit = optionnal_kwargs.get("loaded_in_8bit", False)
|
||||
loaded_in_4bit = optionnal_kwargs.get("loaded_in_4bit", False)
|
||||
|
||||
if (loaded_in_8bit or loaded_in_4bit) and not is_bnb_available():
|
||||
raise ImportError(
|
||||
"To use Lora with 8-bit quantization, please install the `bitsandbytes` package. "
|
||||
"You can install it with `pip install bitsandbytes`."
|
||||
)
|
||||
kwargs = {
|
||||
"r": lora_config.init_r,
|
||||
"lora_alpha": lora_config.lora_alpha,
|
||||
"lora_dropout": lora_config.lora_dropout,
|
||||
"fan_in_fan_out": lora_config.fan_in_fan_out,
|
||||
"init_lora_weights": lora_config.init_lora_weights,
|
||||
"loaded_in_8bit": loaded_in_8bit,
|
||||
"loaded_in_4bit": loaded_in_4bit,
|
||||
}
|
||||
|
||||
# If it is not a LoraLayer, create a new module, else update it with new adapters
|
||||
if not isinstance(target, AdaLoraLayer):
|
||||
new_module = self._create_new_module(lora_config, adapter_name, target, **kwargs)
|
||||
self._replace_module(parent, target_name, new_module, target)
|
||||
else:
|
||||
target.update_layer(
|
||||
adapter_name,
|
||||
lora_config.init_r,
|
||||
lora_config.lora_alpha,
|
||||
lora_config.lora_dropout,
|
||||
lora_config.init_lora_weights,
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
def _create_new_module(lora_config, adapter_name, target, **kwargs):
|
||||
bias = target.bias is not None
|
||||
loaded_in_8bit = kwargs.pop("loaded_in_8bit", False)
|
||||
loaded_in_4bit = kwargs.pop("loaded_in_4bit", False)
|
||||
|
||||
if loaded_in_8bit and isinstance(target, bnb.nn.Linear8bitLt):
|
||||
kwargs.update(
|
||||
{
|
||||
"has_fp16_weights": target.state.has_fp16_weights,
|
||||
"memory_efficient_backward": target.state.memory_efficient_backward,
|
||||
"threshold": target.state.threshold,
|
||||
"index": target.index,
|
||||
}
|
||||
)
|
||||
new_module = SVDLinear8bitLt(adapter_name, target.in_features, target.out_features, bias=bias, **kwargs)
|
||||
elif loaded_in_4bit and is_bnb_4bit_available() and isinstance(target, bnb.nn.Linear4bit):
|
||||
fourbit_kwargs = kwargs.copy()
|
||||
fourbit_kwargs.update(
|
||||
{
|
||||
"compute_dtype": target.compute_dtype,
|
||||
"compress_statistics": target.weight.compress_statistics,
|
||||
"quant_type": target.weight.quant_type,
|
||||
}
|
||||
)
|
||||
new_module = SVDLinear4bit(
|
||||
adapter_name, target.in_features, target.out_features, bias=bias, **fourbit_kwargs
|
||||
)
|
||||
else:
|
||||
if isinstance(target, torch.nn.Linear):
|
||||
in_features, out_features = target.in_features, target.out_features
|
||||
if kwargs["fan_in_fan_out"]:
|
||||
warnings.warn(
|
||||
"fan_in_fan_out is set to True but the target module is `torch.nn.Linear`. "
|
||||
"Setting fan_in_fan_out to False."
|
||||
)
|
||||
kwargs["fan_in_fan_out"] = lora_config.fan_in_fan_out = False
|
||||
elif isinstance(target, Conv1D):
|
||||
in_features, out_features = (
|
||||
target.weight.ds_shape if hasattr(target.weight, "ds_shape") else target.weight.shape
|
||||
)
|
||||
if not kwargs["fan_in_fan_out"]:
|
||||
warnings.warn(
|
||||
"fan_in_fan_out is set to False but the target module is `Conv1D`. "
|
||||
"Setting fan_in_fan_out to True."
|
||||
)
|
||||
kwargs["fan_in_fan_out"] = lora_config.fan_in_fan_out = True
|
||||
else:
|
||||
raise ValueError(
|
||||
f"Target module {target} is not supported. "
|
||||
f"Currently, only `torch.nn.Linear` and `Conv1D` are supported."
|
||||
)
|
||||
new_module = SVDLinear(adapter_name, in_features, out_features, bias=bias, **kwargs)
|
||||
|
||||
return new_module
|
||||
|
||||
@staticmethod
|
||||
def _prepare_adapter_config(peft_config, model_config):
|
||||
if peft_config.target_modules is None:
|
||||
if model_config["model_type"] not in TRANSFORMERS_MODELS_TO_ADALORA_TARGET_MODULES_MAPPING:
|
||||
raise ValueError("Please specify `target_modules` in `peft_config`")
|
||||
peft_config.target_modules = TRANSFORMERS_MODELS_TO_ADALORA_TARGET_MODULES_MAPPING[
|
||||
model_config["model_type"]
|
||||
]
|
||||
return peft_config
|
||||
|
||||
def __getattr__(self, name: str):
|
||||
"""Forward missing attributes to the wrapped module."""
|
||||
try:
|
||||
return super().__getattr__(name) # defer to nn.Module's logic
|
||||
except AttributeError:
|
||||
return getattr(self.model, name)
|
||||
|
||||
def forward(self, *args, **kwargs):
|
||||
outputs = self.model.forward(*args, **kwargs)
|
||||
|
||||
if getattr(outputs, "loss", None) is not None:
|
||||
# Calculate the orthogonal regularization
|
||||
orth_reg_weight = self.peft_config[self.trainable_adapter_name].orth_reg_weight
|
||||
|
||||
if orth_reg_weight <= 0:
|
||||
raise ValueError("orth_reg_weight should be greater than 0. ")
|
||||
|
||||
regu_loss = 0
|
||||
num_param = 0
|
||||
for n, p in self.model.named_parameters():
|
||||
if ("lora_A" in n or "lora_B" in n) and self.trainable_adapter_name in n:
|
||||
para_cov = p @ p.T if "lora_A" in n else p.T @ p
|
||||
I = torch.eye(*para_cov.size(), out=torch.empty_like(para_cov))
|
||||
I.requires_grad = False
|
||||
num_param += 1
|
||||
regu_loss += torch.norm(para_cov - I, p="fro")
|
||||
if num_param > 0:
|
||||
regu_loss = regu_loss / num_param
|
||||
else:
|
||||
regu_loss = 0
|
||||
outputs.loss += orth_reg_weight * regu_loss
|
||||
return outputs
|
||||
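The loop above implements AdaLoRA's orthogonality penalty on the trainable adapter: for every `lora_A` (shape r x d_in) it penalizes the deviation of A A^T from the identity, for every `lora_B` (shape d_out x r) the deviation of B^T B, averages over the number of such matrices, and scales by `orth_reg_weight`. As a formula (a reading of the code, not a quote from the paper):

$$
\mathcal{L} \;=\; \mathcal{L}_{\text{task}} \;+\; \gamma \cdot \frac{1}{N} \left( \sum_{k} \lVert A_k A_k^{\top} - I \rVert_F \;+\; \sum_{k} \lVert B_k^{\top} B_k - I \rVert_F \right)
$$

with $\gamma$ = `orth_reg_weight` and $N$ = the total count of A and B matrices (`num_param`).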
|
||||
def resize_modules_by_rank_pattern(self, rank_pattern, adapter_name):
|
||||
lora_config = self.peft_config[adapter_name]
|
||||
for name, rank_idx in rank_pattern.items():
|
||||
if isinstance(rank_idx, list):
|
||||
rank = sum(rank_idx)
|
||||
elif isinstance(rank_idx, torch.Tensor):
|
||||
rank_idx = rank_idx.view(-1)
|
||||
rank = rank_idx.sum().item()
|
||||
else:
|
||||
raise ValueError("Unexcepted type of rank_idx")
|
||||
key = ".".join(name.split(".")[0:-2]) if adapter_name in name else ".".join(name.split(".")[0:-1])
|
||||
_, target, _ = _get_submodules(self.model, key)
|
||||
lora_E_weights = target.lora_E[adapter_name][rank_idx]
|
||||
lora_A_weights = target.lora_A[adapter_name][rank_idx]
|
||||
lora_B_weights = target.lora_B[adapter_name][:, rank_idx]
|
||||
ranknum = target.ranknum[adapter_name]
|
||||
target.update_layer(
|
||||
adapter_name,
|
||||
rank,
|
||||
lora_config.lora_alpha,
|
||||
lora_config.lora_dropout,
|
||||
lora_config.init_lora_weights,
|
||||
)
|
||||
with torch.no_grad():
|
||||
if rank > 0:
|
||||
target.lora_E[adapter_name].copy_(lora_E_weights)
|
||||
target.lora_A[adapter_name].copy_(lora_A_weights)
|
||||
target.lora_B[adapter_name].copy_(lora_B_weights)
|
||||
# The scaling is exactly as the previous
|
||||
target.ranknum[adapter_name].copy_(ranknum)
|
||||
|
||||
def resize_state_dict_by_rank_pattern(self, rank_pattern, state_dict, adapter_name):
|
||||
for name, rank_idx in rank_pattern.items():
|
||||
rank = sum(rank_idx)
|
||||
prefix = ".".join(name.split(".")[0:-2]) if adapter_name in name else ".".join(name.split(".")[0:-1])
|
||||
for layer in ["lora_E", "lora_A", "lora_B"]:
|
||||
key = f"base_model.model.{prefix}.{layer}.{adapter_name}"
|
||||
if layer != "lora_B":
|
||||
state_dict[key] = (
|
||||
state_dict[key][rank_idx] if rank != state_dict[key].shape[0] else state_dict[key]
|
||||
)
|
||||
else:
|
||||
state_dict[key] = (
|
||||
state_dict[key][:, rank_idx] if rank != state_dict[key].shape[1] else state_dict[key]
|
||||
)
|
||||
return state_dict
|
||||
|
||||
def update_and_allocate(self, global_step):
|
||||
lora_config = self.peft_config[self.trainable_adapter_name]
|
||||
# Update the importance score and allocate the budget
|
||||
if global_step < lora_config.total_step - lora_config.tfinal:
|
||||
_, rank_pattern = self.rankallocator.update_and_allocate(self.model, global_step)
|
||||
if rank_pattern:
|
||||
lora_config.rank_pattern = rank_pattern
|
||||
# Finalize the budget allocation
|
||||
elif global_step == lora_config.total_step - lora_config.tfinal:
|
||||
_, rank_pattern = self.rankallocator.update_and_allocate(self.model, global_step, force_mask=True)
|
||||
# for some reason, this freezes the trainable parameters and nothing gets updates
|
||||
# self.resize_modules_by_rank_pattern(rank_pattern, self.trainable_adapter_name)
|
||||
lora_config.rank_pattern = rank_pattern
|
||||
self.rankallocator.reset_ipt()
|
||||
# Currently using inefficient way to mask the unimportant weights using the rank pattern
|
||||
# due to problem mentioned above
|
||||
elif global_step > lora_config.total_step - lora_config.tfinal:
|
||||
self.rankallocator.mask_using_rank_pattern(self.model, lora_config.rank_pattern)
|
||||
# Pass the function and do forward propagation
|
||||
else:
|
||||
return None
|
||||
|
||||
|
||||
class SVDLinear(nn.Linear, AdaLoraLayer):
|
||||
# SVD-based adaptation by a dense layer
|
||||
def __init__(
|
||||
self,
|
||||
adapter_name: str,
|
||||
in_features: int,
|
||||
out_features: int,
|
||||
r: int = 0,
|
||||
lora_alpha: int = 1,
|
||||
lora_dropout: float = 0.0,
|
||||
fan_in_fan_out: bool = False,
|
||||
**kwargs,
|
||||
):
|
||||
init_lora_weights = kwargs.pop("init_lora_weights", True)
|
||||
nn.Linear.__init__(self, in_features, out_features, **kwargs)
|
||||
AdaLoraLayer.__init__(self, in_features=in_features, out_features=out_features)
|
||||
# Freezing the pre-trained weight matrix
|
||||
self.weight.requires_grad = False
|
||||
|
||||
self.fan_in_fan_out = fan_in_fan_out
|
||||
if fan_in_fan_out:
|
||||
self.weight.data = self.weight.data.T
|
||||
|
||||
nn.Linear.reset_parameters(self)
|
||||
self.update_layer(adapter_name, r, lora_alpha, lora_dropout, init_lora_weights)
|
||||
self.active_adapter = adapter_name
|
||||
|
||||
def merge(self):
|
||||
if self.active_adapter not in self.lora_A.keys():
|
||||
return
|
||||
if self.merged:
|
||||
warnings.warn("Already merged. Nothing to do.")
|
||||
return
|
||||
if self.r[self.active_adapter] > 0:
|
||||
self.weight.data += (
|
||||
transpose(
|
||||
self.lora_B[self.active_adapter]
|
||||
@ (self.lora_A[self.active_adapter] * self.lora_E[self.active_adapter]),
|
||||
self.fan_in_fan_out,
|
||||
)
|
||||
* self.scaling[self.active_adapter]
|
||||
/ (self.ranknum[self.active_adapter] + 1e-5)
|
||||
)
|
||||
self.merged = True
|
||||
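In this layer the adapter update follows AdaLoRA's SVD-style factorization: `lora_A` holds the right singular vectors, `lora_B` the left singular vectors, and `lora_E` the singular values. The `merge` above therefore adds (a reading of the code; `transpose(..., fan_in_fan_out)` only handles the Conv1D weight layout):

$$
\Delta W \;=\; \frac{\alpha}{\tilde r + \varepsilon}\; B \,\Lambda\, A, \qquad \Lambda = \operatorname{diag}(E),
$$

where $\alpha$ is `scaling`, $\tilde r$ is `ranknum` and $\varepsilon = 10^{-5}$; `unmerge` subtracts the same quantity, and the un-merged `forward` applies this $\Delta W$ to the dropout-regularized input on top of the frozen linear output.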
|
||||
def unmerge(self):
|
||||
if self.active_adapter not in self.lora_A.keys():
|
||||
return
|
||||
if not self.merged:
|
||||
warnings.warn("Already unmerged. Nothing to do.")
|
||||
return
|
||||
if self.r[self.active_adapter] > 0:
|
||||
self.weight.data -= (
|
||||
transpose(
|
||||
self.lora_B[self.active_adapter]
|
||||
@ (self.lora_A[self.active_adapter] * self.lora_E[self.active_adapter])
|
||||
)
|
||||
* self.scaling[self.active_adapter]
|
||||
/ (self.ranknum[self.active_adapter] + 1e-5)
|
||||
)
|
||||
self.merged = False
|
||||
|
||||
def forward(self, x: torch.Tensor):
|
||||
if self.active_adapter not in self.lora_A.keys():
|
||||
return F.linear(x, transpose(self.weight, self.fan_in_fan_out), bias=self.bias)
|
||||
if self.disable_adapters:
|
||||
if self.r[self.active_adapter] > 0 and self.merged:
|
||||
self.unmerge()
|
||||
result = F.linear(x, transpose(self.weight, self.fan_in_fan_out), bias=self.bias)
|
||||
elif self.r[self.active_adapter] > 0 and not self.merged:
|
||||
result = F.linear(x, transpose(self.weight, self.fan_in_fan_out), bias=self.bias)
|
||||
result += (
|
||||
(
|
||||
self.lora_dropout[self.active_adapter](x)
|
||||
@ (self.lora_A[self.active_adapter] * self.lora_E[self.active_adapter]).T
|
||||
@ self.lora_B[self.active_adapter].T
|
||||
)
|
||||
* self.scaling[self.active_adapter]
|
||||
/ (self.ranknum[self.active_adapter] + 1e-5)
|
||||
)
|
||||
else:
|
||||
result = F.linear(x, transpose(self.weight, self.fan_in_fan_out), bias=self.bias)
|
||||
return result
|
||||
|
||||
|
||||
if is_bnb_available():
|
||||
|
||||
class SVDLinear8bitLt(bnb.nn.Linear8bitLt, AdaLoraLayer):
|
||||
# Low-rank matrix for SVD-based adaptation
|
||||
def __init__(
|
||||
self,
|
||||
adapter_name,
|
||||
in_features,
|
||||
out_features,
|
||||
r: int = 0,
|
||||
lora_alpha: int = 1,
|
||||
lora_dropout: float = 0.0,
|
||||
**kwargs,
|
||||
):
|
||||
bnb.nn.Linear8bitLt.__init__(
|
||||
self,
|
||||
in_features,
|
||||
out_features,
|
||||
bias=kwargs.get("bias", True),
|
||||
has_fp16_weights=kwargs.get("has_fp16_weights", True),
|
||||
memory_efficient_backward=kwargs.get("memory_efficient_backward", False),
|
||||
threshold=kwargs.get("threshold", 0.0),
|
||||
index=kwargs.get("index", None),
|
||||
)
|
||||
AdaLoraLayer.__init__(self, in_features=in_features, out_features=out_features)
|
||||
# Freezing the pre-trained weight matrix
|
||||
self.weight.requires_grad = False
|
||||
|
||||
init_lora_weights = kwargs.pop("init_lora_weights", True)
|
||||
self.update_layer(adapter_name, r, lora_alpha, lora_dropout, init_lora_weights)
|
||||
self.active_adapter = adapter_name
|
||||
|
||||
def forward(self, x: torch.Tensor):
|
||||
result = super().forward(x)
|
||||
|
||||
if self.disable_adapters or self.active_adapter not in self.lora_A.keys():
|
||||
return result
|
||||
elif self.r[self.active_adapter] > 0:
|
||||
if not torch.is_autocast_enabled():
|
||||
expected_dtype = result.dtype
|
||||
|
||||
if x.dtype != torch.float32:
|
||||
x = x.float()
|
||||
output = (
|
||||
(
|
||||
self.lora_dropout[self.active_adapter](x)
|
||||
@ (self.lora_A[self.active_adapter] * self.lora_E[self.active_adapter]).T
|
||||
@ self.lora_B[self.active_adapter].T
|
||||
).to(expected_dtype)
|
||||
* self.scaling[self.active_adapter]
|
||||
/ (self.ranknum[self.active_adapter] + 1e-5)
|
||||
)
|
||||
else:
|
||||
output = (
|
||||
(
|
||||
self.lora_dropout[self.active_adapter](x)
|
||||
@ (self.lora_A[self.active_adapter] * self.lora_E[self.active_adapter]).T
|
||||
@ self.lora_B[self.active_adapter].T
|
||||
)
|
||||
* self.scaling[self.active_adapter]
|
||||
/ (self.ranknum[self.active_adapter] + 1e-5)
|
||||
)
|
||||
result = result + output
|
||||
return result
|
||||
|
||||
|
||||
if is_bnb_4bit_available():
|
||||
|
||||
class SVDLinear4bit(bnb.nn.Linear4bit, AdaLoraLayer):
|
||||
# Low-rank matrix for SVD-based adaptation
|
||||
def __init__(
|
||||
self,
|
||||
adapter_name,
|
||||
in_features,
|
||||
out_features,
|
||||
r: int = 0,
|
||||
lora_alpha: int = 1,
|
||||
lora_dropout: float = 0.0,
|
||||
**kwargs,
|
||||
):
|
||||
bnb.nn.Linear4bit.__init__(
|
||||
self,
|
||||
in_features,
|
||||
out_features,
|
||||
bias=kwargs.get("bias", True),
|
||||
compute_dtype=kwargs.get("compute_dtype", torch.float32),
|
||||
compress_statistics=kwargs.get("compress_statistics", True),
|
||||
quant_type=kwargs.get("quant_type", "nf4"),
|
||||
)
|
||||
AdaLoraLayer.__init__(self, in_features=in_features, out_features=out_features)
|
||||
# Freezing the pre-trained weight matrix
|
||||
self.weight.requires_grad = False
|
||||
|
||||
init_lora_weights = kwargs.pop("init_lora_weights", True)
|
||||
self.update_layer(adapter_name, r, lora_alpha, lora_dropout, init_lora_weights)
|
||||
self.active_adapter = adapter_name
|
||||
|
||||
def forward(self, x: torch.Tensor):
|
||||
result = super().forward(x)
|
||||
|
||||
if self.disable_adapters or self.active_adapter not in self.lora_A.keys():
|
||||
return result
|
||||
elif self.r[self.active_adapter] > 0:
|
||||
if not torch.is_autocast_enabled():
|
||||
expected_dtype = result.dtype
|
||||
|
||||
if x.dtype != torch.float32:
|
||||
x = x.float()
|
||||
output = (
|
||||
(
|
||||
self.lora_dropout[self.active_adapter](x)
|
||||
@ (self.lora_A[self.active_adapter] * self.lora_E[self.active_adapter]).T
|
||||
@ self.lora_B[self.active_adapter].T
|
||||
).to(expected_dtype)
|
||||
* self.scaling[self.active_adapter]
|
||||
/ (self.ranknum[self.active_adapter] + 1e-5)
|
||||
)
|
||||
else:
|
||||
output = (
|
||||
(
|
||||
self.lora_dropout[self.active_adapter](x)
|
||||
@ (self.lora_A[self.active_adapter] * self.lora_E[self.active_adapter]).T
|
||||
@ self.lora_B[self.active_adapter].T
|
||||
)
|
||||
* self.scaling[self.active_adapter]
|
||||
/ (self.ranknum[self.active_adapter] + 1e-5)
|
||||
)
|
||||
result = result + output
|
||||
return result
|
||||
|
||||
|
||||
class RankAllocator(object):
|
||||
"""
|
||||
The RankAllocator for AdaLoraModel. Paper: https://openreview.net/pdf?id=lq62uWRJjiY
|
||||
|
||||
Args:
|
||||
config ([`AdaLoraConfig`]): The configuration of the AdaLora model.
|
||||
model: the model that we apply AdaLoRA to.
|
||||
|
||||
"""
|
||||
|
||||
def __init__(self, model, peft_config, adapter_name):
|
||||
self.peft_config = peft_config
|
||||
self.adapter_name = adapter_name
|
||||
self.beta1 = peft_config.beta1
|
||||
self.beta2 = peft_config.beta2
|
||||
assert self.beta1 > 0 and self.beta1 < 1
|
||||
assert self.beta2 > 0 and self.beta2 < 1
|
||||
|
||||
self.reset_ipt()
|
||||
self._set_budget_scheduler(model)
|
||||
|
||||
def set_total_step(self, total_step):
|
||||
self.peft_config.total_step = total_step
|
||||
|
||||
def reset_ipt(self):
|
||||
self.ipt = {}
|
||||
self.exp_avg_ipt = {}
|
||||
self.exp_avg_unc = {}
|
||||
|
||||
def _set_budget_scheduler(self, model):
|
||||
self.init_bgt = 0
|
||||
self.name_set = set()
|
||||
for n, p in model.named_parameters():
|
||||
if f"lora_A.{self.adapter_name}" in n:
|
||||
self.init_bgt += p.size(0)
|
||||
self.name_set.add(n.replace("lora_A", "%s"))
|
||||
self.name_set = sorted(self.name_set)
|
||||
# The total final rank budget
|
||||
self.target_bgt = self.peft_config.target_r * len(self.name_set)
|
||||
|
||||
def budget_schedule(self, step: int):
|
||||
tinit = self.peft_config.tinit
|
||||
tfinal = self.peft_config.tfinal
|
||||
total_step = self.peft_config.total_step
|
||||
# Initial warmup
|
||||
if step <= tinit:
|
||||
budget = self.init_bgt
|
||||
mask_ind = False
|
||||
# Final fine-tuning
|
||||
elif step > total_step - tfinal:
|
||||
budget = self.target_bgt
|
||||
mask_ind = True
|
||||
else:
|
||||
# Budget decreasing with a cubic scheduler
|
||||
mul_coeff = 1 - (step - tinit) / (total_step - tfinal - tinit)
|
||||
budget = int((self.init_bgt - self.target_bgt) * (mul_coeff**3) + self.target_bgt)
|
||||
mask_ind = True if step % self.peft_config.deltaT == 0 else False
|
||||
return budget, mask_ind
|
||||
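Written out, the schedule implemented in `budget_schedule` above keeps the full budget during warmup, decays it cubically, and then holds the target budget for the final `tfinal` steps (a reading of the code, with $b^{(0)}$ = `init_bgt`, $b^{(T)}$ = `target_bgt`, $t_i$ = `tinit`, $t_f$ = `tfinal`, $T$ = `total_step`):

$$
b(t) =
\begin{cases}
b^{(0)} & t \le t_i \\
b^{(T)} + \left(b^{(0)} - b^{(T)}\right)\left(1 - \dfrac{t - t_i}{T - t_f - t_i}\right)^{3} & t_i < t \le T - t_f \\
b^{(T)} & t > T - t_f
\end{cases}
$$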
|
||||
def update_ipt(self, model):
|
||||
# Update the sensitivity and uncertainty for every weight
|
||||
for n, p in model.named_parameters():
|
||||
if "lora_" in n and self.adapter_name in n:
|
||||
if n not in self.ipt:
|
||||
self.ipt[n] = torch.zeros_like(p)
|
||||
self.exp_avg_ipt[n] = torch.zeros_like(p)
|
||||
self.exp_avg_unc[n] = torch.zeros_like(p)
|
||||
with torch.no_grad():
|
||||
self.ipt[n] = (p * p.grad).abs().detach()
|
||||
# Sensitivity smoothing
|
||||
self.exp_avg_ipt[n] = self.beta1 * self.exp_avg_ipt[n] + (1 - self.beta1) * self.ipt[n]
|
||||
# Uncertainty quantification
|
||||
self.exp_avg_unc[n] = (
|
||||
self.beta2 * self.exp_avg_unc[n] + (1 - self.beta2) * (self.ipt[n] - self.exp_avg_ipt[n]).abs()
|
||||
)
|
||||
|
||||
def _element_score(self, n):
|
||||
return self.exp_avg_ipt[n] * self.exp_avg_unc[n]
|
||||
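`update_ipt` and `_element_score` above implement the sensitivity-based importance score from the AdaLoRA paper, applied entry-wise to every adapter parameter (a reading of the code, with $\beta_1$ = `beta1`, $\beta_2$ = `beta2`):

$$
I^{(t)} = \left| \theta \odot \nabla_{\theta} \mathcal{L} \right|, \qquad
\bar I^{(t)} = \beta_1 \bar I^{(t-1)} + (1 - \beta_1)\, I^{(t)}, \qquad
\bar U^{(t)} = \beta_2 \bar U^{(t-1)} + (1 - \beta_2) \left| I^{(t)} - \bar I^{(t)} \right|,
$$

and the score used to rank triplets is $s^{(t)} = \bar I^{(t)} \odot \bar U^{(t)}$ (elementwise), later reduced per triplet in `mask_to_budget`.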
|
||||
def _combine_ipt(self, ipt_E, ipt_AB):
|
||||
ipt_AB = ipt_AB.sum(dim=1, keepdim=False)
|
||||
sum_ipt = ipt_E.view(-1) + ipt_AB.view(-1)
|
||||
return sum_ipt
|
||||
|
||||
def mask_to_budget(self, model, budget):
|
||||
value_ipt = {}
|
||||
vector_ipt = {}
|
||||
triplet_ipt = {}
|
||||
# Get the importance score for A, E, B
|
||||
for n, p in model.named_parameters():
|
||||
if f"lora_A.{self.adapter_name}" in n:
|
||||
entry_ipt = self._element_score(n)
|
||||
comb_ipt = torch.mean(entry_ipt, dim=1, keepdim=True)
|
||||
name_m = n.replace("lora_A", "%s")
|
||||
if name_m not in vector_ipt:
|
||||
vector_ipt[name_m] = [comb_ipt]
|
||||
else:
|
||||
vector_ipt[name_m].append(comb_ipt)
|
||||
if f"lora_B.{self.adapter_name}" in n:
|
||||
entry_ipt = self._element_score(n)
|
||||
comb_ipt = torch.mean(entry_ipt, dim=0, keepdim=False).view(-1, 1)
|
||||
name_m = n.replace("lora_B", "%s")
|
||||
if name_m not in vector_ipt:
|
||||
vector_ipt[name_m] = [comb_ipt]
|
||||
else:
|
||||
vector_ipt[name_m].append(comb_ipt)
|
||||
if f"lora_E.{self.adapter_name}" in n:
|
||||
entry_ipt = self._element_score(n)
|
||||
name_m = n.replace("lora_E", "%s")
|
||||
value_ipt[name_m] = entry_ipt
|
||||
|
||||
all_score = []
|
||||
# Calculate the score for each triplet
|
||||
for name_m in vector_ipt:
|
||||
ipt_E = value_ipt[name_m]
|
||||
ipt_AB = torch.cat(vector_ipt[name_m], dim=1)
|
||||
sum_ipt = self._combine_ipt(ipt_E, ipt_AB)
|
||||
name_E = name_m % "lora_E"
|
||||
triplet_ipt[name_E] = sum_ipt.view(-1, 1)
|
||||
all_score.append(sum_ipt.view(-1))
|
||||
|
||||
# Get the threshold by ranking ipt
|
||||
mask_threshold = torch.kthvalue(
|
||||
torch.cat(all_score),
|
||||
k=self.init_bgt - budget,
|
||||
)[0].item()
|
||||
|
||||
rank_pattern = {}
|
||||
# Mask the unimportant triplets
|
||||
with torch.no_grad():
|
||||
for n, p in model.named_parameters():
|
||||
if f"lora_E.{self.adapter_name}" in n:
|
||||
p.masked_fill_(triplet_ipt[n] <= mask_threshold, 0.0)
|
||||
rank_pattern[n] = (~(triplet_ipt[n] <= mask_threshold)).view(-1).tolist()
|
||||
return rank_pattern
|
||||
|
||||
def update_and_allocate(self, model, global_step, force_mask=False):
|
||||
# # Update the importance score and allocate the budget
|
||||
if global_step < self.peft_config.total_step - self.peft_config.tfinal:
|
||||
self.update_ipt(model)
|
||||
budget, mask_ind = self.budget_schedule(global_step)
|
||||
# Allocate the budget according to importance scores
|
||||
if mask_ind or force_mask:
|
||||
rank_pattern = self.mask_to_budget(model, budget)
|
||||
else:
|
||||
rank_pattern = None
|
||||
return budget, rank_pattern
|
||||
|
||||
def mask_using_rank_pattern(self, model, rank_pattern):
|
||||
# Mask the unimportant triplets
|
||||
is_adapter_name_truncated = False
|
||||
if self.adapter_name not in next(iter(rank_pattern.keys())):
|
||||
is_adapter_name_truncated = True
|
||||
|
||||
with torch.no_grad():
|
||||
for n, p in model.named_parameters():
|
||||
if f"lora_E.{self.adapter_name}" in n:
|
||||
key = n if not is_adapter_name_truncated else n.replace(f".{self.adapter_name}", "")
|
||||
mask = torch.Tensor(rank_pattern[key]).unsqueeze(-1).to(p.device)
|
||||
p.masked_fill_(~mask.bool(), 0.0)
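
The budget_schedule above keeps the full initial budget for the first tinit steps, decays it cubically, and pins it to the target budget for the last tfinal steps. A minimal self-contained sketch of that schedule follows; the concrete numbers are illustrative assumptions, not values taken from this diff.

def cubic_budget(step, init_bgt=1152, target_bgt=768, tinit=200, tfinal=1000, total_step=10000, deltaT=10):
    # Warmup: keep every rank.
    if step <= tinit:
        return init_bgt, False
    # Final fine-tuning: hold the target budget.
    if step > total_step - tfinal:
        return target_bgt, True
    # Cubic decay from init_bgt towards target_bgt, pruning every deltaT steps.
    mul_coeff = 1 - (step - tinit) / (total_step - tfinal - tinit)
    budget = int((init_bgt - target_bgt) * (mul_coeff**3) + target_bgt)
    return budget, step % deltaT == 0

for step in (0, 200, 2500, 5000, 7500, 9000, 9500):
    print(step, cubic_budget(step))
# The budget stays at 1152 through the warmup, shrinks cubically, and is 768 from step 9000 on.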
src/peft/tuners/adalora/__init__.py (new file, 35 lines)
@@ -0,0 +1,35 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2023-present the HuggingFace Inc. team.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from peft.import_utils import is_bnb_4bit_available, is_bnb_available
|
||||
|
||||
from .config import AdaLoraConfig
|
||||
from .gptq import SVDQuantLinear
|
||||
from .layer import AdaLoraLayer, RankAllocator, SVDLinear
|
||||
from .model import AdaLoraModel
|
||||
|
||||
|
||||
__all__ = ["AdaLoraConfig", "AdaLoraLayer", "AdaLoraModel", "SVDLinear", "RankAllocator", "SVDQuantLinear"]
|
||||
|
||||
|
||||
if is_bnb_available():
|
||||
from .bnb import SVDLinear8bitLt
|
||||
|
||||
__all__ += ["SVDLinear8bitLt"]
|
||||
|
||||
if is_bnb_4bit_available():
|
||||
from .bnb import SVDLinear4bit
|
||||
|
||||
__all__ += ["SVDLinear4bit"]
src/peft/tuners/adalora/bnb.py (new file, 153 lines)
@@ -0,0 +1,153 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2023-present the HuggingFace Inc. team.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import bitsandbytes as bnb
|
||||
import torch
|
||||
|
||||
from peft.import_utils import is_bnb_4bit_available, is_bnb_available
|
||||
|
||||
from .layer import AdaLoraLayer
|
||||
|
||||
|
||||
if is_bnb_available():
|
||||
|
||||
class SVDLinear8bitLt(bnb.nn.Linear8bitLt, AdaLoraLayer):
|
||||
# Low-rank matrix for SVD-based adaptation
|
||||
def __init__(
|
||||
self,
|
||||
adapter_name,
|
||||
in_features,
|
||||
out_features,
|
||||
r: int = 0,
|
||||
lora_alpha: int = 1,
|
||||
lora_dropout: float = 0.0,
|
||||
**kwargs,
|
||||
) -> None:
|
||||
bnb.nn.Linear8bitLt.__init__(
|
||||
self,
|
||||
in_features,
|
||||
out_features,
|
||||
bias=kwargs.get("bias", True),
|
||||
has_fp16_weights=kwargs.get("has_fp16_weights", True),
|
||||
memory_efficient_backward=kwargs.get("memory_efficient_backward", False),
|
||||
threshold=kwargs.get("threshold", 0.0),
|
||||
index=kwargs.get("index", None),
|
||||
)
|
||||
AdaLoraLayer.__init__(self, in_features=in_features, out_features=out_features)
|
||||
# Freezing the pre-trained weight matrix
|
||||
self.weight.requires_grad = False
|
||||
|
||||
init_lora_weights = kwargs.pop("init_lora_weights", True)
|
||||
self.update_layer(adapter_name, r, lora_alpha, lora_dropout, init_lora_weights)
|
||||
self.set_adapter(adapter_name)
|
||||
|
||||
def forward(self, x: torch.Tensor) -> torch.Tensor:
|
||||
result = super().forward(x)
|
||||
|
||||
if self.disable_adapters:
|
||||
return result
|
||||
|
||||
for active_adapter in self.active_adapters:
|
||||
if active_adapter not in self.lora_A.keys():
|
||||
continue
|
||||
requires_conversion = not torch.is_autocast_enabled()
|
||||
if requires_conversion:
|
||||
expected_dtype = result.dtype
|
||||
if x.dtype != torch.float32:
|
||||
x = x.float()
|
||||
|
||||
lora_A = self.lora_A[active_adapter]
|
||||
lora_B = self.lora_B[active_adapter]
|
||||
lora_E = self.lora_E[active_adapter]
|
||||
dropout = self.lora_dropout[active_adapter]
|
||||
scaling = self.scaling[active_adapter]
|
||||
ranknum = self.ranknum[active_adapter] + 1e-5
|
||||
|
||||
output = dropout(x) @ (lora_A * lora_E).T @ lora_B.T
|
||||
if requires_conversion:
|
||||
output = output.to(expected_dtype)
|
||||
output = output * scaling / ranknum
|
||||
result += output
|
||||
return result
|
||||
|
||||
|
||||
if is_bnb_4bit_available():
|
||||
|
||||
class SVDLinear4bit(bnb.nn.Linear4bit, AdaLoraLayer):
|
||||
# Low-rank matrix for SVD-based adaptation
|
||||
def __init__(
|
||||
self,
|
||||
adapter_name,
|
||||
in_features,
|
||||
out_features,
|
||||
r: int = 0,
|
||||
lora_alpha: int = 1,
|
||||
lora_dropout: float = 0.0,
|
||||
**kwargs,
|
||||
) -> None:
|
||||
bnb.nn.Linear4bit.__init__(
|
||||
self,
|
||||
in_features,
|
||||
out_features,
|
||||
bias=kwargs.get("bias", True),
|
||||
compute_dtype=kwargs.get("compute_dtype", torch.float32),
|
||||
compress_statistics=kwargs.get("compress_statistics", True),
|
||||
quant_type=kwargs.get("quant_type", "nf4"),
|
||||
)
|
||||
AdaLoraLayer.__init__(self, in_features=in_features, out_features=out_features)
|
||||
# Freezing the pre-trained weight matrix
|
||||
self.weight.requires_grad = False
|
||||
|
||||
init_lora_weights = kwargs.pop("init_lora_weights", True)
|
||||
self.update_layer(adapter_name, r, lora_alpha, lora_dropout, init_lora_weights)
|
||||
self.set_adapter(adapter_name)
|
||||
|
||||
def forward(self, x: torch.Tensor) -> torch.Tensor:
|
||||
result = super().forward(x)
|
||||
|
||||
if self.disable_adapters:
|
||||
return result
|
||||
|
||||
# As per Tim Dettmers, for 4bit, we need to defensively clone here.
|
||||
# The reason is that in some cases, an error can occur that backprop
|
||||
# does not work on a manipulated view. This issue may be solved with
|
||||
# newer PyTorch versions but this would need extensive testing to be
|
||||
# sure.
|
||||
result = result.clone()
|
||||
|
||||
for active_adapter in self.active_adapters:
|
||||
if active_adapter not in self.lora_A.keys():
|
||||
continue
|
||||
|
||||
lora_A = self.lora_A[active_adapter]
|
||||
lora_B = self.lora_B[active_adapter]
|
||||
lora_E = self.lora_E[active_adapter]
|
||||
dropout = self.lora_dropout[active_adapter]
|
||||
scaling = self.scaling[active_adapter]
|
||||
ranknum = self.ranknum[active_adapter] + 1e-5
|
||||
|
||||
requires_conversion = not torch.is_autocast_enabled()
|
||||
if requires_conversion:
|
||||
expected_dtype = result.dtype
|
||||
compute_dtype = lora_A.weight.dtype
|
||||
if x.dtype != compute_dtype:
|
||||
x = x.to(compute_dtype)
|
||||
|
||||
output = dropout(x) @ (lora_A * lora_E).T @ lora_B.T
|
||||
if requires_conversion:
|
||||
output = output.to(expected_dtype)
|
||||
output = output * scaling / ranknum
|
||||
result += output
|
||||
return result
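
Both quantized variants add the same low-rank AdaLoRA term on top of the frozen base layer's output. A plain-tensor sketch of that term (shapes follow update_layer in layer.py; the sizes here are arbitrary assumptions):

import torch

batch, in_features, out_features, r = 4, 16, 32, 6
x = torch.randn(batch, in_features)
lora_A = torch.randn(r, in_features)        # right singular vectors
lora_E = torch.randn(r, 1)                  # per-rank singular values
lora_B = torch.randn(out_features, r)       # left singular vectors
scaling, ranknum = 32.0, r + 1e-5

# Same expression as in the forward() methods above (dropout omitted).
delta = x @ (lora_A * lora_E).T @ lora_B.T * scaling / ranknum
print(delta.shape)  # torch.Size([4, 32]); this is added to the base layer output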
src/peft/tuners/adalora/config.py (new file, 53 lines)
@@ -0,0 +1,53 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2023-present the HuggingFace Inc. team.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Optional
|
||||
|
||||
from peft.tuners.lora import LoraConfig
|
||||
from peft.utils import PeftType
|
||||
|
||||
|
||||
@dataclass
|
||||
class AdaLoraConfig(LoraConfig):
|
||||
"""
|
||||
This is the configuration class to store the configuration of a [`~peft.AdaLora`].
|
||||
|
||||
Args:
|
||||
target_r (`int`): The target average rank of incremental matrix.
|
||||
init_r (`int`): The initial rank for each incremental matrix.
|
||||
tinit (`int`): The steps of initial fine-tuning warmup.
|
||||
tfinal (`int`): The step of final fine-tuning.
|
||||
deltaT (`int`): The time interval between two budget allocations.
|
||||
beta1 (`float`): The hyperparameter of EMA for sensitivity smoothing.
|
||||
beta2 (`float`): The hyperparameter of EMA for uncertainty quantification.
|
||||
orth_reg_weight (`float`): The coefficient of orthogonal regularization.
|
||||
total_step (`int`): The total training steps that should be specified before training.
|
||||
rank_pattern (`list`): The allocated rank for each weight matrix by RankAllocator.
|
||||
"""
|
||||
|
||||
target_r: int = field(default=8, metadata={"help": "Target Lora matrix dimension."})
|
||||
init_r: int = field(default=12, metadata={"help": "Initial Lora matrix dimension."})
|
||||
tinit: int = field(default=0, metadata={"help": "The steps of initial warmup."})
|
||||
tfinal: int = field(default=0, metadata={"help": "The steps of final warmup."})
|
||||
deltaT: int = field(default=1, metadata={"help": "Step interval of rank allocation."})
|
||||
beta1: float = field(default=0.85, metadata={"help": "Hyperparameter of EMA."})
|
||||
beta2: float = field(default=0.85, metadata={"help": "Hyperparameter of EMA."})
|
||||
orth_reg_weight: float = field(default=0.5, metadata={"help": "The orthogonal regularization coefficient."})
|
||||
total_step: Optional[int] = field(default=None, metadata={"help": "The total training steps."})
|
||||
rank_pattern: Optional[dict] = field(default=None, metadata={"help": "The saved rank pattern."})
|
||||
|
||||
def __post_init__(self):
|
||||
self.peft_type = PeftType.ADALORA
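
A hedged usage sketch: constructing an AdaLoraConfig with the fields defined above plus a few fields inherited from LoraConfig. All values are illustrative, not recommendations.

from peft import AdaLoraConfig

config = AdaLoraConfig(
    init_r=12,                  # starting rank of every incremental matrix
    target_r=8,                 # average rank to end up with
    tinit=200,                  # warmup steps before pruning starts
    tfinal=1000,                # steps at the end with a fixed budget
    deltaT=10,                  # reallocate the budget every 10 steps
    beta1=0.85,
    beta2=0.85,
    orth_reg_weight=0.5,
    total_step=10000,           # must be set before training
    # Inherited LoraConfig fields:
    target_modules=["q", "v"],
    lora_alpha=32,
    lora_dropout=0.01,
    task_type="SEQ_2_SEQ_LM",
)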
src/peft/tuners/adalora/gptq.py (new file, 69 lines)
@@ -0,0 +1,69 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2023-present the HuggingFace Inc. team.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import torch
|
||||
|
||||
from .layer import AdaLoraLayer
|
||||
|
||||
|
||||
class SVDQuantLinear(torch.nn.Module, AdaLoraLayer):
|
||||
def __init__(
|
||||
self,
|
||||
adapter_name,
|
||||
quant_linear_module,
|
||||
r: int = 0,
|
||||
lora_alpha: int = 1,
|
||||
lora_dropout: float = 0.0,
|
||||
**kwargs,
|
||||
) -> None:
|
||||
torch.nn.Module.__init__(self)
|
||||
AdaLoraLayer.__init__(
|
||||
self, in_features=quant_linear_module.infeatures, out_features=quant_linear_module.outfeatures
|
||||
)
|
||||
self.quant_linear_module = quant_linear_module
|
||||
self.weight = quant_linear_module.qweight
|
||||
init_lora_weights = kwargs.pop("init_lora_weights", True)
|
||||
self.update_layer(adapter_name, r, lora_alpha, lora_dropout, init_lora_weights)
|
||||
self.set_adapter(adapter_name)
|
||||
|
||||
def forward(self, x: torch.Tensor) -> torch.Tensor:
|
||||
result = self.quant_linear_module(x)
|
||||
|
||||
if self.disable_adapters:
|
||||
return result
|
||||
|
||||
for active_adapter in self.active_adapters:
|
||||
if active_adapter not in self.lora_A.keys():
|
||||
continue
|
||||
lora_A = self.lora_A[active_adapter]
|
||||
lora_B = self.lora_B[active_adapter]
|
||||
lora_E = self.lora_E[active_adapter]
|
||||
dropout = self.lora_dropout[active_adapter]
|
||||
scaling = self.scaling[active_adapter]
|
||||
ranknum = self.ranknum[active_adapter] + 1e-5
|
||||
|
||||
requires_conversion = not torch.is_autocast_enabled()
|
||||
if requires_conversion:
|
||||
expected_dtype = result.dtype
|
||||
if x.dtype != torch.float32:
|
||||
x = x.float()
|
||||
|
||||
output = (dropout(x) @ (lora_A * lora_E).T @ lora_B.T) * scaling / ranknum
|
||||
# TODO: here, the dtype conversion is applied on the *whole expression*,
|
||||
# not the intermediate result, unlike for SVDLinear8bitLt and
|
||||
# SVDLinear4bit, is that correct?
|
||||
if requires_conversion:
|
||||
output = output.to(expected_dtype)
|
||||
result += output
|
||||
return result
src/peft/tuners/adalora/layer.py (new file, 339 lines)
@@ -0,0 +1,339 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2023-present the HuggingFace Inc. team.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import warnings
|
||||
|
||||
import torch
|
||||
import torch.nn.functional as F
|
||||
from torch import nn
|
||||
|
||||
from peft.tuners.lora import LoraLayer
|
||||
from peft.utils import transpose
|
||||
|
||||
|
||||
class AdaLoraLayer(LoraLayer):
|
||||
# List all names of layers that may contain adapter weights
|
||||
# Note: ranknum doesn't need to be included as it is not an nn.Module
|
||||
adapter_layer_names = ("lora_A", "lora_B", "lora_E", "lora_embedding_A", "lora_embedding_B")
|
||||
# other_param_names is defined in LoraLayer
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
in_features: int,
|
||||
out_features: int,
|
||||
):
|
||||
super().__init__(in_features, out_features)
|
||||
self.lora_E = nn.ParameterDict({})
|
||||
self.lora_A = nn.ParameterDict({})
|
||||
self.lora_B = nn.ParameterDict({})
|
||||
self.ranknum = nn.ParameterDict({})
|
||||
|
||||
def update_layer(self, adapter_name, r, lora_alpha, lora_dropout, init_lora_weights):
|
||||
self.r[adapter_name] = r
|
||||
self.lora_alpha[adapter_name] = lora_alpha
|
||||
if lora_dropout > 0.0:
|
||||
lora_dropout_layer = nn.Dropout(p=lora_dropout)
|
||||
else:
|
||||
lora_dropout_layer = nn.Identity()
|
||||
|
||||
self.lora_dropout[adapter_name] = lora_dropout_layer
|
||||
# Actual trainable parameters
|
||||
# Right singular vectors
|
||||
self.lora_A[adapter_name] = nn.Parameter(torch.randn(r, self.in_features))
|
||||
# Singular values
|
||||
self.lora_E[adapter_name] = nn.Parameter(torch.randn(r, 1))
|
||||
# Left singular vectors
|
||||
self.lora_B[adapter_name] = nn.Parameter(torch.randn(self.out_features, r))
|
||||
# The current rank
|
||||
self.ranknum[adapter_name] = nn.Parameter(torch.randn(1), requires_grad=False)
|
||||
self.ranknum[adapter_name].data.fill_(float(r))
|
||||
self.ranknum[adapter_name].requires_grad = False
|
||||
self.scaling[adapter_name] = lora_alpha if lora_alpha > 0 else float(r)
|
||||
if init_lora_weights:
|
||||
self.reset_lora_parameters(adapter_name)
|
||||
self.to(self.weight.device)
|
||||
self.set_adapter(self.active_adapters)
|
||||
|
||||
def reset_lora_parameters(self, adapter_name):
|
||||
if adapter_name in self.lora_A.keys():
|
||||
nn.init.normal_(self.lora_E[adapter_name], mean=0.0, std=0.02)
|
||||
nn.init.normal_(self.lora_A[adapter_name], mean=0.0, std=0.02)
|
||||
nn.init.normal_(self.lora_B[adapter_name], mean=0.0, std=0.02)
|
||||
|
||||
|
||||
class SVDLinear(nn.Linear, AdaLoraLayer):
|
||||
# SVD-based adaptation by a dense layer
|
||||
def __init__(
|
||||
self,
|
||||
adapter_name: str,
|
||||
in_features: int,
|
||||
out_features: int,
|
||||
r: int = 0,
|
||||
lora_alpha: int = 1,
|
||||
lora_dropout: float = 0.0,
|
||||
fan_in_fan_out: bool = False,
|
||||
**kwargs,
|
||||
) -> None:
|
||||
init_lora_weights = kwargs.pop("init_lora_weights", True)
|
||||
nn.Linear.__init__(self, in_features, out_features, **kwargs)
|
||||
AdaLoraLayer.__init__(self, in_features=in_features, out_features=out_features)
|
||||
# Freezing the pre-trained weight matrix
|
||||
self.weight.requires_grad = False
|
||||
|
||||
self.fan_in_fan_out = fan_in_fan_out
|
||||
if fan_in_fan_out:
|
||||
self.weight.data = self.weight.data.T
|
||||
|
||||
nn.Linear.reset_parameters(self)
|
||||
self.update_layer(adapter_name, r, lora_alpha, lora_dropout, init_lora_weights)
|
||||
self.set_adapter(adapter_name)
|
||||
|
||||
def merge(self, safe_merge: bool = False) -> None:
|
||||
"""
|
||||
Merge the active adapter weights into the base weights
|
||||
|
||||
Args:
|
||||
safe_merge (`bool`, *optional*):
|
||||
If True, the merge operation will be performed in a copy of the original weights and check for NaNs
|
||||
before merging the weights. This is useful if you want to check if the merge operation will produce
|
||||
NaNs. Defaults to `False`.
|
||||
"""
|
||||
if self.merged:
|
||||
warnings.warn(
|
||||
f"Already following adapters were merged {','.join(self.merged_adapters)}. "
|
||||
f"You are now additionally merging {','.join(self.active_adapters)}."
|
||||
)
|
||||
for active_adapter in self.active_adapters:
|
||||
if active_adapter in self.lora_A.keys():
|
||||
if safe_merge:
|
||||
# Note that safe_merge will be slower than the normal merge
|
||||
# because of the copy operation.
|
||||
orig_weights = self.weight.data.clone()
|
||||
orig_weights += self.get_delta_weight(active_adapter)
|
||||
|
||||
if not torch.isfinite(orig_weights).all():
|
||||
raise ValueError(
|
||||
f"NaNs detected in the merged weights. The adapter {active_adapter} seems to be broken"
|
||||
)
|
||||
|
||||
self.weight.data = orig_weights
|
||||
else:
|
||||
self.weight.data += self.get_delta_weight(active_adapter)
|
||||
self.merged_adapters.append(active_adapter)
|
||||
|
||||
def unmerge(self) -> None:
|
||||
if not self.merged:
|
||||
warnings.warn("Already unmerged. Nothing to do.")
|
||||
return
|
||||
while len(self.merged_adapters) > 0:
|
||||
active_adapter = self.merged_adapters.pop()
|
||||
if active_adapter in self.lora_A.keys():
|
||||
self.weight.data -= self.get_delta_weight(active_adapter)
|
||||
|
||||
def get_delta_weight(self, adapter) -> torch.Tensor:
|
||||
return (
|
||||
transpose(self.lora_B[adapter] @ (self.lora_A[adapter] * self.lora_E[adapter]), self.fan_in_fan_out)
|
||||
* self.scaling[adapter]
|
||||
/ (self.ranknum[adapter] + 1e-5)
|
||||
)
|
||||
|
||||
def _linear(self, input: torch.Tensor) -> torch.Tensor:
|
||||
return F.linear(input, transpose(self.weight, self.fan_in_fan_out), bias=self.bias)
|
||||
|
||||
def forward(self, x: torch.Tensor) -> torch.Tensor:
|
||||
# TODO: SVDLinear does not convert dtype, unlike lora linear, is that correct?
|
||||
if self.disable_adapters:
|
||||
if self.merged:
|
||||
self.unmerge()
|
||||
result = self._linear(x)
|
||||
elif self.merged:
|
||||
result = self._linear(x)
|
||||
else:
|
||||
result = self._linear(x)
|
||||
for active_adapter in self.active_adapters:
|
||||
if active_adapter not in self.lora_A.keys():
|
||||
continue
|
||||
lora_A = self.lora_A[active_adapter]
|
||||
lora_B = self.lora_B[active_adapter]
|
||||
lora_E = self.lora_E[active_adapter]
|
||||
dropout = self.lora_dropout[active_adapter]
|
||||
scaling = self.scaling[active_adapter]
|
||||
ranknum = self.ranknum[active_adapter] + 1e-5
|
||||
|
||||
result += (dropout(x) @ (lora_A * lora_E).T @ lora_B.T) * scaling / ranknum
|
||||
|
||||
return result
|
||||
|
||||
|
||||
class RankAllocator(object):
|
||||
"""
|
||||
The RankAllocator for AdaLoraModel. Paper: https://openreview.net/pdf?id=lq62uWRJjiY
|
||||
|
||||
Args:
|
||||
config ([`AdaLoraConfig`]): The configuration of the AdaLora model.
model: The model to which AdaLoRA is applied.
"""
|
||||
|
||||
def __init__(self, model, peft_config, adapter_name):
|
||||
self.peft_config = peft_config
|
||||
self.adapter_name = adapter_name
|
||||
self.beta1 = peft_config.beta1
|
||||
self.beta2 = peft_config.beta2
|
||||
assert self.beta1 > 0 and self.beta1 < 1
|
||||
assert self.beta2 > 0 and self.beta2 < 1
|
||||
|
||||
self.reset_ipt()
|
||||
self._set_budget_scheduler(model)
|
||||
|
||||
def set_total_step(self, total_step):
|
||||
self.peft_config.total_step = total_step
|
||||
|
||||
def reset_ipt(self):
|
||||
self.ipt = {}
|
||||
self.exp_avg_ipt = {}
|
||||
self.exp_avg_unc = {}
|
||||
|
||||
def _set_budget_scheduler(self, model):
|
||||
self.init_bgt = 0
|
||||
self.name_set = set()
|
||||
for n, p in model.named_parameters():
|
||||
if f"lora_A.{self.adapter_name}" in n:
|
||||
self.init_bgt += p.size(0)
|
||||
self.name_set.add(n.replace("lora_A", "%s"))
|
||||
self.name_set = sorted(self.name_set)
|
||||
# The total final rank budget
|
||||
self.target_bgt = self.peft_config.target_r * len(self.name_set)
|
||||
|
||||
def budget_schedule(self, step: int):
|
||||
tinit = self.peft_config.tinit
|
||||
tfinal = self.peft_config.tfinal
|
||||
total_step = self.peft_config.total_step
|
||||
# Initial warmup
|
||||
if step <= tinit:
|
||||
budget = self.init_bgt
|
||||
mask_ind = False
|
||||
# Final fine-tuning
|
||||
elif step > total_step - tfinal:
|
||||
budget = self.target_bgt
|
||||
mask_ind = True
|
||||
else:
|
||||
# Budget decreasing with a cubic scheduler
|
||||
mul_coeff = 1 - (step - tinit) / (total_step - tfinal - tinit)
|
||||
budget = int((self.init_bgt - self.target_bgt) * (mul_coeff**3) + self.target_bgt)
|
||||
mask_ind = True if step % self.peft_config.deltaT == 0 else False
|
||||
return budget, mask_ind
|
||||
|
||||
def update_ipt(self, model):
|
||||
# Update the sensitivity and uncertainty for every weight
|
||||
for n, p in model.named_parameters():
|
||||
if "lora_" in n and self.adapter_name in n:
|
||||
if n not in self.ipt:
|
||||
self.ipt[n] = torch.zeros_like(p)
|
||||
self.exp_avg_ipt[n] = torch.zeros_like(p)
|
||||
self.exp_avg_unc[n] = torch.zeros_like(p)
|
||||
with torch.no_grad():
|
||||
self.ipt[n] = (p * p.grad).abs().detach()
|
||||
# Sensitivity smoothing
|
||||
self.exp_avg_ipt[n] = self.beta1 * self.exp_avg_ipt[n] + (1 - self.beta1) * self.ipt[n]
|
||||
# Uncertainty quantification
|
||||
self.exp_avg_unc[n] = (
|
||||
self.beta2 * self.exp_avg_unc[n] + (1 - self.beta2) * (self.ipt[n] - self.exp_avg_ipt[n]).abs()
|
||||
)
|
||||
|
||||
def _element_score(self, n):
|
||||
return self.exp_avg_ipt[n] * self.exp_avg_unc[n]
|
||||
|
||||
def _combine_ipt(self, ipt_E, ipt_AB):
|
||||
ipt_AB = ipt_AB.sum(dim=1, keepdim=False)
|
||||
sum_ipt = ipt_E.view(-1) + ipt_AB.view(-1)
|
||||
return sum_ipt
|
||||
|
||||
def mask_to_budget(self, model, budget):
|
||||
value_ipt = {}
|
||||
vector_ipt = {}
|
||||
triplet_ipt = {}
|
||||
# Get the importance score for A, E, B
|
||||
for n, p in model.named_parameters():
|
||||
if f"lora_A.{self.adapter_name}" in n:
|
||||
entry_ipt = self._element_score(n)
|
||||
comb_ipt = torch.mean(entry_ipt, dim=1, keepdim=True)
|
||||
name_m = n.replace("lora_A", "%s")
|
||||
if name_m not in vector_ipt:
|
||||
vector_ipt[name_m] = [comb_ipt]
|
||||
else:
|
||||
vector_ipt[name_m].append(comb_ipt)
|
||||
if f"lora_B.{self.adapter_name}" in n:
|
||||
entry_ipt = self._element_score(n)
|
||||
comb_ipt = torch.mean(entry_ipt, dim=0, keepdim=False).view(-1, 1)
|
||||
name_m = n.replace("lora_B", "%s")
|
||||
if name_m not in vector_ipt:
|
||||
vector_ipt[name_m] = [comb_ipt]
|
||||
else:
|
||||
vector_ipt[name_m].append(comb_ipt)
|
||||
if f"lora_E.{self.adapter_name}" in n:
|
||||
entry_ipt = self._element_score(n)
|
||||
name_m = n.replace("lora_E", "%s")
|
||||
value_ipt[name_m] = entry_ipt
|
||||
|
||||
all_score = []
|
||||
# Calculate the score for each triplet
|
||||
for name_m in vector_ipt:
|
||||
ipt_E = value_ipt[name_m]
|
||||
ipt_AB = torch.cat(vector_ipt[name_m], dim=1)
|
||||
sum_ipt = self._combine_ipt(ipt_E, ipt_AB)
|
||||
name_E = name_m % "lora_E"
|
||||
triplet_ipt[name_E] = sum_ipt.view(-1, 1)
|
||||
all_score.append(sum_ipt.view(-1))
|
||||
|
||||
# Get the threshold by ranking ipt
|
||||
mask_threshold = torch.kthvalue(
|
||||
torch.cat(all_score),
|
||||
k=self.init_bgt - budget,
|
||||
)[0].item()
|
||||
|
||||
rank_pattern = {}
|
||||
# Mask the unimportant triplets
|
||||
with torch.no_grad():
|
||||
for n, p in model.named_parameters():
|
||||
if f"lora_E.{self.adapter_name}" in n:
|
||||
p.masked_fill_(triplet_ipt[n] <= mask_threshold, 0.0)
|
||||
rank_pattern[n] = (~(triplet_ipt[n] <= mask_threshold)).view(-1).tolist()
|
||||
return rank_pattern
|
||||
|
||||
def update_and_allocate(self, model, global_step, force_mask=False):
|
||||
# # Update the importance score and allocate the budget
|
||||
if global_step < self.peft_config.total_step - self.peft_config.tfinal:
|
||||
self.update_ipt(model)
|
||||
budget, mask_ind = self.budget_schedule(global_step)
|
||||
# Allocate the budget according to importance scores
|
||||
if mask_ind or force_mask:
|
||||
rank_pattern = self.mask_to_budget(model, budget)
|
||||
else:
|
||||
rank_pattern = None
|
||||
return budget, rank_pattern
|
||||
|
||||
def mask_using_rank_pattern(self, model, rank_pattern):
|
||||
# Mask the unimportant triplets
|
||||
is_adapter_name_truncated = False
|
||||
if self.adapter_name not in next(iter(rank_pattern.keys())):
|
||||
is_adapter_name_truncated = True
|
||||
|
||||
with torch.no_grad():
|
||||
for n, p in model.named_parameters():
|
||||
if f"lora_E.{self.adapter_name}" in n:
|
||||
key = n if not is_adapter_name_truncated else n.replace(f".{self.adapter_name}", "")
|
||||
mask = torch.Tensor(rank_pattern[key]).unsqueeze(-1).to(p.device)
|
||||
p.masked_fill_(~mask.bool(), 0.0)
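
A sanity-check sketch for SVDLinear above: with dropout disabled, adding get_delta_weight() into the base weight (a merge) reproduces the unmerged forward pass. Shapes mirror update_layer; the sizes are arbitrary assumptions.

import torch

in_features, out_features, r = 16, 32, 6
x = torch.randn(4, in_features)
weight = torch.randn(out_features, in_features)        # frozen base weight
lora_A = torch.randn(r, in_features)
lora_E = torch.randn(r, 1)
lora_B = torch.randn(out_features, r)
scaling, ranknum = 32.0, r + 1e-5

# get_delta_weight with fan_in_fan_out=False
delta_w = lora_B @ (lora_A * lora_E) * scaling / ranknum
merged = x @ (weight + delta_w).T
unmerged = x @ weight.T + (x @ (lora_A * lora_E).T @ lora_B.T) * scaling / ranknum
print(torch.allclose(merged, unmerged, atol=1e-5))     # True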
src/peft/tuners/adalora/model.py (new file, 329 lines)
@@ -0,0 +1,329 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2023-present the HuggingFace Inc. team.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import warnings
|
||||
|
||||
import torch
|
||||
from transformers.pytorch_utils import Conv1D
|
||||
|
||||
from peft.import_utils import is_bnb_4bit_available, is_bnb_available
|
||||
from peft.tuners.lora import LoraConfig, LoraModel
|
||||
from peft.utils import (
|
||||
TRANSFORMERS_MODELS_TO_ADALORA_TARGET_MODULES_MAPPING,
|
||||
_freeze_adapter,
|
||||
_get_submodules,
|
||||
get_auto_gptq_quant_linear,
|
||||
get_quantization_config,
|
||||
)
|
||||
|
||||
from .gptq import SVDQuantLinear
|
||||
from .layer import AdaLoraLayer, RankAllocator, SVDLinear
|
||||
|
||||
|
||||
if is_bnb_available():
|
||||
import bitsandbytes as bnb
|
||||
|
||||
from .bnb import SVDLinear8bitLt
|
||||
if is_bnb_4bit_available():
|
||||
from .bnb import SVDLinear4bit
|
||||
|
||||
|
||||
class AdaLoraModel(LoraModel):
|
||||
"""
|
||||
Creates AdaLoRA (Adaptive LoRA) model from a pretrained transformers model. Paper:
|
||||
https://openreview.net/forum?id=lq62uWRJjiY
|
||||
|
||||
Args:
|
||||
model ([`transformers.PreTrainedModel`]): The model to be adapted.
|
||||
config ([`AdaLoraConfig`]): The configuration of the AdaLora model.
|
||||
adapter_name (`str`): The name of the adapter, defaults to `"default"`.
|
||||
|
||||
Returns:
|
||||
`torch.nn.Module`: The AdaLora model.
|
||||
|
||||
Example::
|
||||
|
||||
>>> from transformers import AutoModelForSeq2SeqLM
>>> from peft import AdaLoraModel, AdaLoraConfig
|
||||
>>> config = AdaLoraConfig(
|
||||
peft_type="ADALORA", task_type="SEQ_2_SEQ_LM", r=8, lora_alpha=32, target_modules=["q", "v"],
|
||||
lora_dropout=0.01,
|
||||
)
|
||||
>>> model = AutoModelForSeq2SeqLM.from_pretrained("t5-base")
>>> model = AdaLoraModel(model, config, "default")
|
||||
|
||||
**Attributes**:
|
||||
- **model** ([`transformers.PreTrainedModel`]) -- The model to be adapted.
|
||||
- **peft_config** ([`AdaLoraConfig`]): The configuration of the AdaLora model.
|
||||
"""
|
||||
|
||||
def __init__(self, model, config, adapter_name):
|
||||
super().__init__(model, config, adapter_name)
|
||||
|
||||
traininable_mode_counter = 0
|
||||
for config in self.peft_config.values():
|
||||
if not config.inference_mode:
|
||||
traininable_mode_counter += 1
|
||||
|
||||
if traininable_mode_counter > 1:
|
||||
raise ValueError(
|
||||
"AdaLoraModel supports only 1 trainable adapter. "
|
||||
"When using multiple adapters, set inference_mode to True for all adapters except the one you want to train."
|
||||
)
|
||||
|
||||
if self.peft_config[adapter_name].inference_mode:
|
||||
_freeze_adapter(self.model, adapter_name)
|
||||
else:
|
||||
self.trainable_adapter_name = adapter_name
|
||||
self.rankallocator = RankAllocator(self.model, self.peft_config[adapter_name], self.trainable_adapter_name)
|
||||
|
||||
def _check_new_adapter_config(self, config: LoraConfig) -> None:
|
||||
"""
|
||||
A helper method to check the config when a new adapter is being added.
|
||||
|
||||
Raise a ValueError if there is something wrong with the config or if it conflicts with existing adapters.
|
||||
|
||||
"""
|
||||
super()._check_new_adapter_config(config)
|
||||
|
||||
traininable_mode_counter = 0
|
||||
for config_ in self.peft_config.values():
|
||||
if not config_.inference_mode:
|
||||
traininable_mode_counter += 1
|
||||
|
||||
if traininable_mode_counter > 1:
|
||||
raise ValueError(
|
||||
f"{self.__class__.__name__} supports only 1 trainable adapter. "
|
||||
"When using multiple adapters, set inference_mode to True for all adapters except the one "
|
||||
"you want to train."
|
||||
)
|
||||
|
||||
def _create_and_replace(
|
||||
self,
|
||||
lora_config,
|
||||
adapter_name,
|
||||
target,
|
||||
target_name,
|
||||
parent,
|
||||
**optional_kwargs,
|
||||
):
|
||||
loaded_in_8bit = optional_kwargs.get("loaded_in_8bit", False)
|
||||
loaded_in_4bit = optional_kwargs.get("loaded_in_4bit", False)
|
||||
if (loaded_in_8bit or loaded_in_4bit) and not is_bnb_available():
|
||||
raise ImportError(
|
||||
"To use Lora with 8-bit quantization, please install the `bitsandbytes` package. "
|
||||
"You can install it with `pip install bitsandbytes`."
|
||||
)
|
||||
kwargs = {
|
||||
"r": lora_config.init_r,
|
||||
"lora_alpha": lora_config.lora_alpha,
|
||||
"lora_dropout": lora_config.lora_dropout,
|
||||
"fan_in_fan_out": lora_config.fan_in_fan_out,
|
||||
"init_lora_weights": lora_config.init_lora_weights,
|
||||
"loaded_in_8bit": loaded_in_8bit,
|
||||
"loaded_in_4bit": loaded_in_4bit,
|
||||
}
|
||||
|
||||
quantization_config = get_quantization_config(self.model, method="gptq")
|
||||
if quantization_config is not None:
|
||||
kwargs["gptq_quantization_config"] = quantization_config
|
||||
|
||||
# If it is not a LoraLayer, create a new module, else update it with new adapters
|
||||
if not isinstance(target, AdaLoraLayer):
|
||||
new_module = self._create_new_module(lora_config, adapter_name, target, **kwargs)
|
||||
if adapter_name != self.active_adapter:
|
||||
# adding an additional adapter: it is not automatically trainable
|
||||
new_module.requires_grad_(False)
|
||||
self._replace_module(parent, target_name, new_module, target)
|
||||
else:
|
||||
target.update_layer(
|
||||
adapter_name,
|
||||
lora_config.init_r,
|
||||
lora_config.lora_alpha,
|
||||
lora_config.lora_dropout,
|
||||
lora_config.init_lora_weights,
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
def _create_new_module(lora_config, adapter_name, target, **kwargs):
|
||||
gptq_quantization_config = kwargs.get("gptq_quantization_config", None)
|
||||
AutoGPTQQuantLinear = get_auto_gptq_quant_linear(gptq_quantization_config)
|
||||
|
||||
bias = target.bias is not None
|
||||
loaded_in_8bit = kwargs.pop("loaded_in_8bit", False)
|
||||
loaded_in_4bit = kwargs.pop("loaded_in_4bit", False)
|
||||
|
||||
if loaded_in_8bit and isinstance(target, bnb.nn.Linear8bitLt):
|
||||
kwargs.update(
|
||||
{
|
||||
"has_fp16_weights": target.state.has_fp16_weights,
|
||||
"memory_efficient_backward": target.state.memory_efficient_backward,
|
||||
"threshold": target.state.threshold,
|
||||
"index": target.index,
|
||||
}
|
||||
)
|
||||
new_module = SVDLinear8bitLt(adapter_name, target.in_features, target.out_features, bias=bias, **kwargs)
|
||||
elif loaded_in_4bit and is_bnb_4bit_available() and isinstance(target, bnb.nn.Linear4bit):
|
||||
fourbit_kwargs = kwargs.copy()
|
||||
fourbit_kwargs.update(
|
||||
{
|
||||
"compute_dtype": target.compute_dtype,
|
||||
"compress_statistics": target.weight.compress_statistics,
|
||||
"quant_type": target.weight.quant_type,
|
||||
}
|
||||
)
|
||||
new_module = SVDLinear4bit(
|
||||
adapter_name, target.in_features, target.out_features, bias=bias, **fourbit_kwargs
|
||||
)
|
||||
elif AutoGPTQQuantLinear is not None and isinstance(target, AutoGPTQQuantLinear):
|
||||
new_module = SVDQuantLinear(adapter_name, target, **kwargs)
|
||||
target.weight = target.qweight
|
||||
else:
|
||||
if isinstance(target, torch.nn.Linear):
|
||||
in_features, out_features = target.in_features, target.out_features
|
||||
if kwargs["fan_in_fan_out"]:
|
||||
warnings.warn(
|
||||
"fan_in_fan_out is set to True but the target module is `torch.nn.Linear`. "
|
||||
"Setting fan_in_fan_out to False."
|
||||
)
|
||||
kwargs["fan_in_fan_out"] = lora_config.fan_in_fan_out = False
|
||||
elif isinstance(target, Conv1D):
|
||||
in_features, out_features = (
|
||||
target.weight.ds_shape if hasattr(target.weight, "ds_shape") else target.weight.shape
|
||||
)
|
||||
if not kwargs["fan_in_fan_out"]:
|
||||
warnings.warn(
|
||||
"fan_in_fan_out is set to False but the target module is `Conv1D`. "
|
||||
"Setting fan_in_fan_out to True."
|
||||
)
|
||||
kwargs["fan_in_fan_out"] = lora_config.fan_in_fan_out = True
|
||||
else:
|
||||
raise ValueError(
|
||||
f"Target module {target} is not supported. "
|
||||
f"Currently, only `torch.nn.Linear` and `Conv1D` are supported."
|
||||
)
|
||||
new_module = SVDLinear(adapter_name, in_features, out_features, bias=bias, **kwargs)
|
||||
|
||||
return new_module
|
||||
|
||||
@staticmethod
|
||||
def _prepare_adapter_config(peft_config, model_config):
|
||||
if peft_config.target_modules is None:
|
||||
if model_config["model_type"] not in TRANSFORMERS_MODELS_TO_ADALORA_TARGET_MODULES_MAPPING:
|
||||
raise ValueError("Please specify `target_modules` in `peft_config`")
|
||||
peft_config.target_modules = TRANSFORMERS_MODELS_TO_ADALORA_TARGET_MODULES_MAPPING[
|
||||
model_config["model_type"]
|
||||
]
|
||||
return peft_config
|
||||
|
||||
def __getattr__(self, name: str):
|
||||
"""Forward missing attributes to the wrapped module."""
|
||||
try:
|
||||
return super().__getattr__(name) # defer to nn.Module's logic
|
||||
except AttributeError:
|
||||
return getattr(self.model, name)
|
||||
|
||||
def forward(self, *args, **kwargs):
|
||||
outputs = self.model.forward(*args, **kwargs)
|
||||
|
||||
if getattr(outputs, "loss", None) is not None:
|
||||
# Calculate the orthogonal regularization
|
||||
orth_reg_weight = self.peft_config[self.trainable_adapter_name].orth_reg_weight
|
||||
|
||||
if orth_reg_weight <= 0:
|
||||
raise ValueError("orth_reg_weight should be greater than 0. ")
|
||||
|
||||
regu_loss = 0
|
||||
num_param = 0
|
||||
for n, p in self.model.named_parameters():
|
||||
if ("lora_A" in n or "lora_B" in n) and self.trainable_adapter_name in n:
|
||||
para_cov = p @ p.T if "lora_A" in n else p.T @ p
|
||||
I = torch.eye(*para_cov.size(), out=torch.empty_like(para_cov))
|
||||
I.requires_grad = False
|
||||
num_param += 1
|
||||
regu_loss += torch.norm(para_cov - I, p="fro")
|
||||
if num_param > 0:
|
||||
regu_loss = regu_loss / num_param
|
||||
else:
|
||||
regu_loss = 0
|
||||
outputs.loss += orth_reg_weight * regu_loss
|
||||
return outputs
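
A worked sketch of the orthogonal regularizer computed in forward() above: it pushes P P^T (for lora_A) and P^T P (for lora_B) towards the identity, so the low-rank factors stay close to orthogonal. The sizes are arbitrary assumptions.

import torch

r, in_features = 6, 16
lora_A = torch.randn(r, in_features)
para_cov = lora_A @ lora_A.T                           # (r, r), as in the loop above
I = torch.eye(r)
print(torch.norm(para_cov - I, p="fro"))               # large for a random matrix

# A factor with orthonormal rows gives an (almost) zero penalty.
q, _ = torch.linalg.qr(torch.randn(in_features, r))    # q has orthonormal columns
print(torch.norm(q.T @ q - I, p="fro"))                # close to 0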
|
||||
|
||||
def resize_modules_by_rank_pattern(self, rank_pattern, adapter_name):
|
||||
lora_config = self.peft_config[adapter_name]
|
||||
for name, rank_idx in rank_pattern.items():
|
||||
if isinstance(rank_idx, list):
|
||||
rank = sum(rank_idx)
|
||||
elif isinstance(rank_idx, torch.Tensor):
|
||||
rank_idx = rank_idx.view(-1)
|
||||
rank = rank_idx.sum().item()
|
||||
else:
|
||||
raise ValueError("Unexcepted type of rank_idx")
|
||||
key = ".".join(name.split(".")[0:-2]) if adapter_name in name else ".".join(name.split(".")[0:-1])
|
||||
_, target, _ = _get_submodules(self.model, key)
|
||||
lora_E_weights = target.lora_E[adapter_name][rank_idx]
|
||||
lora_A_weights = target.lora_A[adapter_name][rank_idx]
|
||||
lora_B_weights = target.lora_B[adapter_name][:, rank_idx]
|
||||
ranknum = target.ranknum[adapter_name]
|
||||
target.update_layer(
|
||||
adapter_name,
|
||||
rank,
|
||||
lora_config.lora_alpha,
|
||||
lora_config.lora_dropout,
|
||||
lora_config.init_lora_weights,
|
||||
)
|
||||
with torch.no_grad():
|
||||
if rank > 0:
|
||||
target.lora_E[adapter_name].copy_(lora_E_weights)
|
||||
target.lora_A[adapter_name].copy_(lora_A_weights)
|
||||
target.lora_B[adapter_name].copy_(lora_B_weights)
|
||||
# The scaling is exactly as the previous
|
||||
target.ranknum[adapter_name].copy_(ranknum)
|
||||
|
||||
def resize_state_dict_by_rank_pattern(self, rank_pattern, state_dict, adapter_name):
|
||||
for name, rank_idx in rank_pattern.items():
|
||||
rank = sum(rank_idx)
|
||||
prefix = ".".join(name.split(".")[0:-2]) if adapter_name in name else ".".join(name.split(".")[0:-1])
|
||||
for layer in ["lora_E", "lora_A", "lora_B"]:
|
||||
key = f"base_model.model.{prefix}.{layer}.{adapter_name}"
|
||||
if layer != "lora_B":
|
||||
state_dict[key] = (
|
||||
state_dict[key][rank_idx] if rank != state_dict[key].shape[0] else state_dict[key]
|
||||
)
|
||||
else:
|
||||
state_dict[key] = (
|
||||
state_dict[key][:, rank_idx] if rank != state_dict[key].shape[1] else state_dict[key]
|
||||
)
|
||||
return state_dict
|
||||
|
||||
def update_and_allocate(self, global_step):
|
||||
lora_config = self.peft_config[self.trainable_adapter_name]
|
||||
# Update the importance score and allocate the budget
|
||||
if global_step < lora_config.total_step - lora_config.tfinal:
|
||||
_, rank_pattern = self.rankallocator.update_and_allocate(self.model, global_step)
|
||||
if rank_pattern:
|
||||
lora_config.rank_pattern = rank_pattern
|
||||
# Finalize the budget allocation
|
||||
elif global_step == lora_config.total_step - lora_config.tfinal:
|
||||
_, rank_pattern = self.rankallocator.update_and_allocate(self.model, global_step, force_mask=True)
|
||||
# for some reason, this freezes the trainable parameters and nothing gets updated
|
||||
# self.resize_modules_by_rank_pattern(rank_pattern, self.trainable_adapter_name)
|
||||
lora_config.rank_pattern = rank_pattern
|
||||
self.rankallocator.reset_ipt()
|
||||
# Currently using inefficient way to mask the unimportant weights using the rank pattern
|
||||
# due to problem mentioned above
|
||||
elif global_step > lora_config.total_step - lora_config.tfinal:
|
||||
self.rankallocator.mask_using_rank_pattern(self.model, lora_config.rank_pattern)
|
||||
# Pass the function and do forward propagation
|
||||
else:
|
||||
return None
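
A minimal sketch of how update_and_allocate is typically driven from a training loop. The model is assumed to be an AdaLoraModel; dataloader and optimizer are placeholder names, not objects defined in this diff.

global_step = 0
for batch in dataloader:                      # placeholder dataloader
    loss = model(**batch).loss
    loss.backward()                           # RankAllocator.update_ipt() reads p.grad
    optimizer.step()
    model.update_and_allocate(global_step)    # prune/reallocate ranks on schedule
    optimizer.zero_grad()                     # clear gradients only after the allocation step
    global_step += 1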
@@ -1,373 +0,0 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2023-present the HuggingFace Inc. team.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import math
|
||||
from collections import namedtuple
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Dict, List
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
|
||||
from ..config import PeftConfig
|
||||
from ..utils import PeftType, _freeze_adapter, _get_submodules
|
||||
|
||||
|
||||
def llama_rotate_half(x: torch.Tensor) -> torch.Tensor:
|
||||
"""
|
||||
Rotate half the hidden dims of the input.
|
||||
|
||||
This function was duplicated verbatim from:
|
||||
https://github.com/huggingface/transformers/blob/1de8ce9ee1191ba761a593ac15d9ccbf5851bfc5/src/transformers/models/llama/modeling_llama.py#L126
|
||||
|
||||
This was done to eliminate the Llama transformers implementation as a dependency of this file. Note that some other
|
||||
functions were also adapted from the transformers implementation but were modified.
|
||||
"""
|
||||
x1 = x[..., : x.shape[-1] // 2]
|
||||
x2 = x[..., x.shape[-1] // 2 :]
|
||||
return torch.cat((-x2, x1), dim=-1)
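
A tiny worked example of the rotation helper above (the values are arbitrary): the last dimension is split in half and recombined as (-second_half, first_half).

import torch

x = torch.arange(6.0).reshape(1, 6)           # [[0, 1, 2, 3, 4, 5]]
x1, x2 = x[..., :3], x[..., 3:]
print(torch.cat((-x2, x1), dim=-1))           # [[-3., -4., -5., 0., 1., 2.]]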
|
||||
|
||||
|
||||
def llama_apply_rotary_pos_emb(q, cos, sin, position_ids):
|
||||
"""
|
||||
Apply rotary position embedding to query states in the Llama model.
|
||||
|
||||
This function was adapted from:
|
||||
https://github.com/huggingface/transformers/blob/1de8ce9ee1191ba761a593ac15d9ccbf5851bfc5/src/transformers/models/llama/modeling_llama.py#L133
|
||||
|
||||
It was modified to remove unnecessary processing of key states.
|
||||
"""
|
||||
gather_indices = position_ids[:, None, :, None] # [bs, 1, seq_len, 1]
|
||||
gather_indices = gather_indices.repeat(1, cos.shape[1], 1, cos.shape[3])
|
||||
cos = torch.gather(cos.repeat(gather_indices.shape[0], 1, 1, 1), 2, gather_indices)
|
||||
sin = torch.gather(sin.repeat(gather_indices.shape[0], 1, 1, 1), 2, gather_indices)
|
||||
q_embed = (q * cos) + (llama_rotate_half(q) * sin)
|
||||
return q_embed
|
||||
|
||||
|
||||
def llama_compute_query_states(model: nn.Module, **kwargs) -> torch.Tensor:
|
||||
"""
|
||||
Compute query states for Llama models specifically.
|
||||
|
||||
They need to be recomputed as the forward() method of the original LlamaModel in the transformers library does not
|
||||
return them. See the related discussion in the PR: https://github.com/huggingface/peft/pull/268
|
||||
"""
|
||||
hidden_states = kwargs.get("hidden_states")
|
||||
position_ids = kwargs.get("position_ids")
|
||||
past_key_value = kwargs.get("past_key_value")
|
||||
bsz, q_len, _ = hidden_states.size()
|
||||
query_states = model.q_proj(hidden_states).view(bsz, q_len, model.num_heads, model.head_dim).transpose(1, 2)
|
||||
value_states = model.v_proj(hidden_states).view(bsz, q_len, model.num_heads, model.head_dim).transpose(1, 2)
|
||||
|
||||
seq_len = q_len
|
||||
if past_key_value is not None:
|
||||
seq_len += past_key_value[0].shape[-2]
|
||||
cos, sin = model.rotary_emb(value_states, seq_len=seq_len)
|
||||
|
||||
return llama_apply_rotary_pos_emb(query_states, cos, sin, position_ids)
|
||||
|
||||
|
||||
# Contains the config that is specific to a transformers model type.
|
||||
ModelTypeConfig = namedtuple(
|
||||
"ModelTypeConfig", ["compute_query_states", "target_modules", "k_proj_layer", "v_proj_layer", "o_proj_layer"]
|
||||
)
|
||||
# Mapping of transformers model types to their specific configuration.
|
||||
TRANSFORMERS_MODEL_CONFIG = {
|
||||
"llama": ModelTypeConfig(
|
||||
compute_query_states=llama_compute_query_states,
|
||||
target_modules="self_attn",
|
||||
k_proj_layer="k_proj",
|
||||
v_proj_layer="v_proj",
|
||||
o_proj_layer="o_proj",
|
||||
),
|
||||
}
|
||||
|
||||
|
||||
def is_adaption_prompt_trainable(params: str) -> bool:
|
||||
"""Return True if module is trainable under adaption prompt fine-tuning."""
|
||||
return params.split(".")[-1].startswith("adaption_")
|
||||
|
||||
|
||||
@dataclass
|
||||
class AdaptionPromptConfig(PeftConfig):
|
||||
"""Stores the configuration of an [`AdaptionPromptModel`]."""
|
||||
|
||||
target_modules: str = field(
|
||||
default=None, metadata={"help": "Name of the attention submodules to insert adaption prompts into."}
|
||||
)
|
||||
adapter_len: int = field(default=None, metadata={"help": "Number of adapter tokens to insert"})
|
||||
adapter_layers: int = field(default=None, metadata={"help": "Number of adapter layers (from the top)"})
|
||||
|
||||
def __post_init__(self):
|
||||
self.peft_type = PeftType.ADAPTION_PROMPT
|
||||
|
||||
@property
|
||||
def is_adaption_prompt(self) -> bool:
|
||||
"""Return True if this is an adaption prompt config."""
|
||||
return True
|
||||
|
||||
|
||||
def prepare_config(
|
||||
peft_config: AdaptionPromptConfig,
|
||||
model,
|
||||
) -> AdaptionPromptConfig:
|
||||
"""Prepare the config based on the llama model type."""
|
||||
if model.config.model_type not in TRANSFORMERS_MODEL_CONFIG:
|
||||
raise ValueError("Unsupported model type for adaption prompt: '{model.config.model_type}'.")
|
||||
|
||||
model_config = TRANSFORMERS_MODEL_CONFIG[model.config.model_type]
|
||||
|
||||
if peft_config.target_modules is None:
|
||||
peft_config.target_modules = model_config.target_modules
|
||||
|
||||
return peft_config
|
||||
|
||||
|
||||
class AdaptionPromptModel(nn.Module):
|
||||
"""
|
||||
Implements adaption prompts as described in https://arxiv.org/pdf/2303.16199.pdf.
|
||||
|
||||
The top L attention modules are replaced with AdaptedAttention modules that wrap the original ones, but insert
|
||||
trainable prompts with gates (for zero init).
|
||||
|
||||
Notes on the multi-adapter pattern:
|
||||
- We store the states of different adapters by keeping a dictionary of AdaptedAttention modules indexed by adapter
|
||||
name.
|
||||
- Every time we switch adapters, we remove the modules of the currently active adapter from the model, store them
|
||||
in the dictionary, and replace them with the modules of the new adapter.
|
||||
- To avoid duplicated and potentially inconsistent state, the currently active adapter is always removed from the
|
||||
dictionary.
|
||||
- Disabling the adapter would also result in the modules being removed from the model.
|
||||
"""
|
||||
|
||||
def __init__(self, model, configs: Dict, adapter_name: str):
|
||||
super().__init__()
|
||||
self.model = model
|
||||
# Store adapter configs by name.
|
||||
self._configs: Dict[str, AdaptionPromptConfig] = {}
|
||||
# Store lists of the parents of the affected attention modules by adapter name.
|
||||
# We keep references to the parents so we can swap the adapters in-and-out of the model.
|
||||
self._parents: Dict[str, List[nn.Module]] = {}
|
||||
# Store lists of cached AdaptedAttention modules by name.
|
||||
self._cached_adapters: Dict[str, List] = {}
|
||||
# The name of the currently active adapter.
|
||||
self._active_adapter = None
|
||||
# Whether the adapter is enabled.
|
||||
self._enabled = True
|
||||
self.forward = self.model.forward
|
||||
self.add_adapter(adapter_name, configs[adapter_name])
|
||||
self._mark_only_adaption_prompts_as_trainable()
|
||||
|
||||
def add_adapter(self, adapter_name: str, config: AdaptionPromptConfig) -> None:
|
||||
"""Add an adapter with the given name and config."""
|
||||
config = prepare_config(config, self.model)
|
||||
if adapter_name in self._configs:
|
||||
raise ValueError(f"Adapter with name '{adapter_name}' already exists.")
|
||||
|
||||
parents = []
|
||||
for name, _ in self.model.named_modules():
|
||||
if name.endswith(config.target_modules):
|
||||
par, _, _ = _get_submodules(self.model, name)
|
||||
parents.append(par)
|
||||
if len(parents) < config.adapter_layers:
|
||||
raise ValueError(
|
||||
f"Config specifies more adapter layers '{config.adapter_layers}'"
|
||||
f" than the model has '{len(parents)}'."
|
||||
)
|
||||
# Note that if the target modules are not in Sequential, ModuleList, or
|
||||
# some other PyTorch ordered container, the behavior is undefined as we
|
||||
# assume here that the order of the modules is the same as the order of
|
||||
# the transformer decoder layers.
|
||||
parents = parents[-config.adapter_layers :]
|
||||
self._parents[adapter_name] = parents
|
||||
|
||||
# It is only None during initialization.
|
||||
# If it is disabled, we don't have to remove the modules.
|
||||
if self._active_adapter is not None and self._enabled:
|
||||
self._remove_adapted_attentions(self._active_adapter)
|
||||
self._active_adapter = adapter_name
|
||||
self._configs[adapter_name] = config
|
||||
self._create_adapted_attentions(config, parents)
|
||||
if not self._enabled:
|
||||
self._remove_adapted_attentions(self._active_adapter)
|
||||
|
||||
if config.inference_mode:
|
||||
_freeze_adapter(self.model, adapter_name)
|
||||
|
||||
def set_adapter(self, adapter_name: str) -> None:
|
||||
"""Set the model to use the adapter with the given name."""
|
||||
if self._active_adapter == adapter_name:
|
||||
return
|
||||
if adapter_name not in self._configs:
|
||||
raise ValueError(f"Adapter with name '{adapter_name}' does not exist.")
|
||||
|
||||
if self._enabled:
|
||||
self._remove_adapted_attentions(self._active_adapter)
|
||||
self._set_adapted_attentions(adapter_name)
|
||||
|
||||
self._active_adapter = adapter_name
|
||||
|
||||
def enable_adapter_layers(self):
|
||||
"""Enable adapter layers by swapping in cached AdaptedAttention modules."""
|
||||
self._enabled = True
|
||||
self._set_adapted_attentions(self._active_adapter)
|
||||
|
||||
def disable_adapter_layers(self):
|
||||
"""Disable adapter layers by swapping out AdaptedAttention modules."""
|
||||
self._enabled = False
|
||||
self._remove_adapted_attentions(self._active_adapter)
|
||||
|
||||
def _create_adapted_attentions(self, config: AdaptionPromptConfig, parents: List[nn.Module]) -> None:
|
||||
"""Wrap LlamaAttention modules with newly created AdaptedAttention modules."""
|
||||
for par in parents:
|
||||
attn = AdaptedAttention(
|
||||
                model_type=self.model.config.model_type,
                adapter_len=config.adapter_len,
                model=getattr(par, config.target_modules),
            )
            setattr(par, config.target_modules, attn)

    def _set_adapted_attentions(self, adapter_name: str) -> None:
        """Replace LlamaAttention modules with cached AdaptedAttention modules."""
        cached = self._cached_adapters[adapter_name]
        del self._cached_adapters[adapter_name]
        config = self._configs[adapter_name]
        for i, par in enumerate(self._parents[adapter_name]):
            setattr(par, config.target_modules, cached[i])

    def _remove_adapted_attentions(self, adapter_name: str) -> None:
        """Remove AdaptedAttention modules from the model and store them in the cache."""
        config = self._configs[adapter_name]
        adapted_attentions = []
        for par in self._parents[adapter_name]:
            attn = getattr(par, config.target_modules)
            adapted_attentions.append(attn)
            setattr(par, config.target_modules, attn.model)
        self._cached_adapters[adapter_name] = adapted_attentions

    def _mark_only_adaption_prompts_as_trainable(self) -> None:
        """Freeze all parameters of the model except the adaption prompts."""
        for n, p in self.model.named_parameters():
            if not is_adaption_prompt_trainable(n):
                p.requires_grad = False

    def __getattr__(self, name: str):
        """Forward missing attributes to the wrapped module."""
        try:
            return super().__getattr__(name)  # defer to nn.Module's logic
        except AttributeError:
            # This is necessary as e.g. causal models have various methods that we
            # don't want to re-implement here.
            return getattr(self.model, name)


class AdaptedAttention(nn.Module):
    """This module wraps a LlamaAttention module and injects adaption prompts."""

    def __init__(self, model_type: str, adapter_len: int, model):
        """
        Initialize object.

        Args:
            model_type: The transformer model type. This is used to retrieve the right method to
                compute query states.
            adapter_len: The length of the adaption prompt to insert.
            model: The original transformer attention module that is being wrapped.
        """
        assert not isinstance(model, AdaptedAttention)
        super().__init__()
        self.model_type = model_type
        self.model = model
        self.adapter_len = adapter_len
        # Assume all parameters of the attention model we are wrapping are on the same device.
        device = next(model.parameters()).device
        # Don't think this was specified in the paper, but we follow the official repo which used an Embedding
        # which initializes the tokens with standard normal values.
        # https://github.com/ZrrSkywalker/LLaMA-Adapter/blob/41c3546fe1997ab8a65809dc8d8f9252b19d9faf/llama/model.py#L234
        # (bsz, adapter_len, hidden_size)
        target_dtype = (
            model.q_proj.weight.dtype if model.q_proj.weight.dtype not in [torch.int8, torch.uint8] else torch.float32
        )
        self.adaption_prompt = nn.Parameter(
            torch.empty(1, adapter_len, self.model.hidden_size, device=device, dtype=target_dtype).normal_()
        )
        # Initialize the gate to 0 as this is "zero-init".
        self.adaption_gate = nn.Parameter(torch.zeros(1, device=device, dtype=target_dtype))

    def forward(self, **kwargs):
        """
        Forward pass for the adapter which wraps the original LlamaAttention module.

        "Official" paper implementation:
        https://github.com/ZrrSkywalker/LLaMA-Adapter/blob/41c3546fe1997ab8a65809dc8d8f9252b19d9faf/llama/model.py#L141

        Args:
            kwargs: See the original LlamaAttention module.
        """
        if kwargs.get("output_attention", False):
            raise NotImplementedError("output_attention is not currently supported.")

        output, _, past_key_value = self.model(**kwargs)
        bsz = output.shape[0]
        q_len = output.shape[1]
        embed_dim = output.shape[2]
        k_proj_layer = TRANSFORMERS_MODEL_CONFIG[self.model_type].k_proj_layer
        v_proj_layer = TRANSFORMERS_MODEL_CONFIG[self.model_type].v_proj_layer
        o_proj_layer = TRANSFORMERS_MODEL_CONFIG[self.model_type].o_proj_layer

        if k_proj_layer == v_proj_layer:
            _, key, value = getattr(self.model, k_proj_layer)(self.adaption_prompt).split(embed_dim, dim=2)
        else:
            key = getattr(self.model, k_proj_layer)(self.adaption_prompt)
            value = getattr(self.model, v_proj_layer)(self.adaption_prompt)
        # (bsz, num_heads, adapter_len, head_dim)
        adapter_k = (
            key.view(1, self.adapter_len, self.model.num_heads, self.model.head_dim)
            .repeat(bsz, 1, 1, 1)
            .transpose(1, 2)
        )
        # (bsz, num_heads, adapter_len, head_dim)
        adapter_v = (
            value.view(1, self.adapter_len, self.model.num_heads, self.model.head_dim)
            .repeat(bsz, 1, 1, 1)
            .transpose(1, 2)
        )

        # Recompute query states.
        compute_query_states = TRANSFORMERS_MODEL_CONFIG[self.model_type].compute_query_states
        # (bsz, num_heads, q_len, head_dim)
        query_states = compute_query_states(model=self.model, **kwargs)

        previous_dtype = query_states.dtype
        # (bsz, num_heads, q_len, adapter_len)
        scores = torch.matmul(query_states, adapter_k.transpose(2, 3).to(previous_dtype)) / math.sqrt(
            self.model.head_dim
        )
        # Upcast attention to fp32
        # (bsz, num_heads, q_len, adapter_len)
        scores = self.adaption_gate * F.softmax(scores, dim=-1, dtype=torch.float32).to(previous_dtype)
        # (bsz, q_len, num_heads * head_dim)
        adapter_output = torch.matmul(scores, adapter_v).transpose(1, 2).reshape(bsz, q_len, -1)
        # (bsz, q_len, hidden_size)
        if o_proj_layer is not None:
            adapter_output = getattr(self.model, o_proj_layer)(adapter_output)

        # Add adaption prompt output to original output.
        output = output + adapter_output

        # Restore original dtype.
        output = output.to(previous_dtype)
        return output, None, past_key_value
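For intuition, here is a small, self-contained sketch (not part of the diff; all shapes are made up) of the "zero-init" gating used in AdaptedAttention.forward above: because adaption_gate starts at zero, the adapter branch contributes nothing, so the wrapped model initially reproduces the base attention output exactly.

import torch
import torch.nn.functional as F

bsz, num_heads, q_len, adapter_len, head_dim = 2, 4, 5, 10, 8
adaption_gate = torch.zeros(1)  # "zero-init", as in AdaptedAttention.__init__
scores = torch.randn(bsz, num_heads, q_len, adapter_len)
adapter_v = torch.randn(bsz, num_heads, adapter_len, head_dim)

gated_scores = adaption_gate * F.softmax(scores, dim=-1)
adapter_output = torch.matmul(gated_scores, adapter_v)
assert torch.count_nonzero(adapter_output) == 0  # the adapter adds nothing at initialization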
20
src/peft/tuners/adaption_prompt/__init__.py
Normal file
@ -0,0 +1,20 @@
# coding=utf-8
# Copyright 2023-present the HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .config import AdaptionPromptConfig
from .layer import AdaptedAttention
from .model import AdaptionPromptModel


__all__ = ["AdaptionPromptConfig", "AdaptedAttention", "AdaptionPromptModel"]
74
src/peft/tuners/adaption_prompt/config.py
Normal file
@ -0,0 +1,74 @@
# coding=utf-8
# Copyright 2023-present the HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from collections import namedtuple
from dataclasses import dataclass, field

from peft.config import PeftConfig
from peft.utils import PeftType

from .utils import llama_compute_query_states


@dataclass
class AdaptionPromptConfig(PeftConfig):
    """Stores the configuration of an [`AdaptionPromptModel`]."""

    target_modules: str = field(
        default=None, metadata={"help": "Name of the attention submodules to insert adaption prompts into."}
    )
    adapter_len: int = field(default=None, metadata={"help": "Number of adapter tokens to insert"})
    adapter_layers: int = field(default=None, metadata={"help": "Number of adapter layers (from the top)"})

    def __post_init__(self):
        self.peft_type = PeftType.ADAPTION_PROMPT

    @property
    def is_adaption_prompt(self) -> bool:
        """Return True if this is an adaption prompt config."""
        return True


# Contains the config that is specific to a transformers model type.
ModelTypeConfig = namedtuple(
    "ModelTypeConfig", ["compute_query_states", "target_modules", "k_proj_layer", "v_proj_layer", "o_proj_layer"]
)

# Mapping of transformers model types to their specific configuration.
TRANSFORMERS_MODEL_CONFIG = {
    "llama": ModelTypeConfig(
        compute_query_states=llama_compute_query_states,
        target_modules="self_attn",
        k_proj_layer="k_proj",
        v_proj_layer="v_proj",
        o_proj_layer="o_proj",
    ),
}


def prepare_config(
    peft_config: AdaptionPromptConfig,
    model,
) -> AdaptionPromptConfig:
    """Prepare the config based on the llama model type."""
    if model.config.model_type not in TRANSFORMERS_MODEL_CONFIG:
        raise ValueError(f"Unsupported model type for adaption prompt: '{model.config.model_type}'.")

    model_config = TRANSFORMERS_MODEL_CONFIG[model.config.model_type]

    if peft_config.target_modules is None:
        peft_config.target_modules = model_config.target_modules

    return peft_config
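A minimal usage sketch of this config (not part of the diff; the checkpoint path is hypothetical): only adapter_len and adapter_layers need to be set, and prepare_config fills in target_modules ("self_attn") for Llama models.

from transformers import AutoModelForCausalLM
from peft import AdaptionPromptConfig, get_peft_model

base = AutoModelForCausalLM.from_pretrained("path/to/llama")  # hypothetical local Llama checkpoint
config = AdaptionPromptConfig(adapter_len=10, adapter_layers=30, task_type="CAUSAL_LM")
model = get_peft_model(base, config)  # target_modules is resolved to "self_attn" by prepare_config
model.print_trainable_parameters()  # only adaption_prompt / adaption_gate parameters remain trainable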
121
src/peft/tuners/adaption_prompt/layer.py
Normal file
@ -0,0 +1,121 @@
# coding=utf-8
# Copyright 2023-present the HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import math

import torch
import torch.nn as nn
import torch.nn.functional as F

from .config import TRANSFORMERS_MODEL_CONFIG


class AdaptedAttention(nn.Module):
    """This module wraps a LlamaAttention module and injects adaption prompts."""

    def __init__(self, model_type: str, adapter_len: int, model):
        """
        Initialize object.

        Args:
            model_type: The transformer model type. This is used to retrieve the right method to
                compute query states.
            adapter_len: The length of the adaption prompt to insert.
            model: The original transformer attention module that is being wrapped.
        """
        assert not isinstance(model, AdaptedAttention)
        super().__init__()
        self.model_type = model_type
        self.model = model
        self.adapter_len = adapter_len
        # Assume all parameters of the attention model we are wrapping are on the same device.
        device = next(model.parameters()).device
        # Don't think this was specified in the paper, but we follow the official repo which used an Embedding
        # which initializes the tokens with standard normal values.
        # https://github.com/ZrrSkywalker/LLaMA-Adapter/blob/41c3546fe1997ab8a65809dc8d8f9252b19d9faf/llama/model.py#L234
        # (bsz, adapter_len, hidden_size)
        target_dtype = (
            model.q_proj.weight.dtype if model.q_proj.weight.dtype not in [torch.int8, torch.uint8] else torch.float32
        )
        self.adaption_prompt = nn.Parameter(
            torch.empty(1, adapter_len, self.model.hidden_size, device=device, dtype=target_dtype).normal_()
        )
        # Initialize the gate to 0 as this is "zero-init".
        self.adaption_gate = nn.Parameter(torch.zeros(1, device=device, dtype=target_dtype))

    def forward(self, **kwargs):
        """
        Forward pass for the adapter which wraps the original LlamaAttention module.

        "Official" paper implementation:
        https://github.com/ZrrSkywalker/LLaMA-Adapter/blob/41c3546fe1997ab8a65809dc8d8f9252b19d9faf/llama/model.py#L141

        Args:
            kwargs: See the original LlamaAttention module.
        """
        if kwargs.get("output_attention", False):
            raise NotImplementedError("output_attention is not currently supported.")

        output, _, past_key_value = self.model(**kwargs)
        bsz = output.shape[0]
        q_len = output.shape[1]
        embed_dim = output.shape[2]
        k_proj_layer = TRANSFORMERS_MODEL_CONFIG[self.model_type].k_proj_layer
        v_proj_layer = TRANSFORMERS_MODEL_CONFIG[self.model_type].v_proj_layer
        o_proj_layer = TRANSFORMERS_MODEL_CONFIG[self.model_type].o_proj_layer

        if k_proj_layer == v_proj_layer:
            _, key, value = getattr(self.model, k_proj_layer)(self.adaption_prompt).split(embed_dim, dim=2)
        else:
            key = getattr(self.model, k_proj_layer)(self.adaption_prompt)
            value = getattr(self.model, v_proj_layer)(self.adaption_prompt)
        # (bsz, num_heads, adapter_len, head_dim)
        adapter_k = (
            key.view(1, self.adapter_len, self.model.num_heads, self.model.head_dim)
            .repeat(bsz, 1, 1, 1)
            .transpose(1, 2)
        )
        # (bsz, num_heads, adapter_len, head_dim)
        adapter_v = (
            value.view(1, self.adapter_len, self.model.num_heads, self.model.head_dim)
            .repeat(bsz, 1, 1, 1)
            .transpose(1, 2)
        )

        # Recompute query states.
        compute_query_states = TRANSFORMERS_MODEL_CONFIG[self.model_type].compute_query_states
        # (bsz, num_heads, q_len, head_dim)
        query_states = compute_query_states(model=self.model, **kwargs)

        previous_dtype = query_states.dtype
        # (bsz, num_heads, q_len, adapter_len)
        scores = torch.matmul(query_states, adapter_k.transpose(2, 3).to(previous_dtype)) / math.sqrt(
            self.model.head_dim
        )
        # Upcast attention to fp32
        # (bsz, num_heads, q_len, adapter_len)
        scores = self.adaption_gate * F.softmax(scores, dim=-1, dtype=torch.float32).to(previous_dtype)
        # (bsz, q_len, num_heads * head_dim)
        adapter_output = torch.matmul(scores, adapter_v).transpose(1, 2).reshape(bsz, q_len, -1)
        # (bsz, q_len, hidden_size)
        if o_proj_layer is not None:
            adapter_output = getattr(self.model, o_proj_layer)(adapter_output)

        # Add adaption prompt output to original output.
        output = output + adapter_output

        # Restore original dtype.
        output = output.to(previous_dtype)
        return output, None, past_key_value
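As a quick shape check for the view/repeat/transpose chain above (illustrative sizes, not part of the diff), the projected adaption prompt is expanded from (1, adapter_len, hidden_size) to the (bsz, num_heads, adapter_len, head_dim) layout expected by the score computation.

import torch

bsz, adapter_len, num_heads, head_dim = 2, 10, 4, 8
key = torch.randn(1, adapter_len, num_heads * head_dim)  # k_proj applied to the adaption prompt
adapter_k = (
    key.view(1, adapter_len, num_heads, head_dim)
    .repeat(bsz, 1, 1, 1)
    .transpose(1, 2)
)
print(adapter_k.shape)  # torch.Size([2, 4, 10, 8]) == (bsz, num_heads, adapter_len, head_dim)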
162
src/peft/tuners/adaption_prompt/model.py
Normal file
@ -0,0 +1,162 @@
# coding=utf-8
# Copyright 2023-present the HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Dict, List

import torch.nn as nn

from peft.utils import _freeze_adapter, _get_submodules

from .config import AdaptionPromptConfig, prepare_config
from .layer import AdaptedAttention
from .utils import is_adaption_prompt_trainable


class AdaptionPromptModel(nn.Module):
    """
    Implements adaption prompts as described in https://arxiv.org/pdf/2303.16199.pdf.

    The top L attention modules are replaced with AdaptedAttention modules that wrap the original ones, but insert
    trainable prompts with gates (for zero init).

    Notes on the multi-adapter pattern:
    - We store the states of different adapters by keeping a dictionary of AdaptedAttention modules indexed by adapter
      name.
    - Every time we switch adapters, we remove the modules of the currently active adapter from the model, store them
      in the dictionary, and replace them with the modules of the new adapter.
    - To avoid duplicated and potentially inconsistent state, the currently active adapter is always removed from the
      dictionary.
    - Disabling the adapter would also result in the modules being removed from the model.
    """

    def __init__(self, model, configs: Dict, adapter_name: str):
        super().__init__()
        self.model = model
        # Store adapter configs by name.
        self.peft_config: Dict[str, AdaptionPromptConfig] = {}
        # Store lists of the parents of the affected attention modules by adapter name.
        # We keep references to the parents so we can swap the adapters in-and-out of the model.
        self._parents: Dict[str, List[nn.Module]] = {}
        # Store lists of cached AdaptedAttention modules by name.
        self._cached_adapters: Dict[str, List] = {}
        # The name of the currently active adapter.
        self._active_adapter = None
        # Whether the adapter is enabled.
        self._enabled = True
        self.forward = self.model.forward
        self.add_adapter(adapter_name, configs[adapter_name])
        self._mark_only_adaption_prompts_as_trainable()

    def add_adapter(self, adapter_name: str, config: AdaptionPromptConfig) -> None:
        """Add an adapter with the given name and config."""
        config = prepare_config(config, self.model)
        if adapter_name in self.peft_config:
            raise ValueError(f"Adapter with name '{adapter_name}' already exists.")

        parents = []
        for name, _ in self.model.named_modules():
            if name.endswith(config.target_modules):
                par, _, _ = _get_submodules(self.model, name)
                parents.append(par)
        if len(parents) < config.adapter_layers:
            raise ValueError(
                f"Config specifies more adapter layers '{config.adapter_layers}'"
                f" than the model has '{len(parents)}'."
            )
        # Note that if the target modules are not in Sequential, ModuleList, or
        # some other PyTorch ordered container, the behavior is undefined as we
        # assume here that the order of the modules is the same as the order of
        # the transformer decoder layers.
        parents = parents[-config.adapter_layers :]
        self._parents[adapter_name] = parents

        # It is only None during initialization.
        # If it is disabled, we don't have to remove the modules.
        if self._active_adapter is not None and self._enabled:
            self._remove_adapted_attentions(self._active_adapter)
        self._active_adapter = adapter_name
        self.peft_config[adapter_name] = config
        self._create_adapted_attentions(config, parents)
        if not self._enabled:
            self._remove_adapted_attentions(self._active_adapter)

        if config.inference_mode:
            _freeze_adapter(self.model, adapter_name)

    def set_adapter(self, adapter_name: str) -> None:
        """Set the model to use the adapter with the given name."""
        if self._active_adapter == adapter_name:
            return
        if adapter_name not in self.peft_config:
            raise ValueError(f"Adapter with name '{adapter_name}' does not exist.")

        if self._enabled:
            self._remove_adapted_attentions(self._active_adapter)
            self._set_adapted_attentions(adapter_name)

        self._active_adapter = adapter_name

    def enable_adapter_layers(self):
        """Enable adapter layers by swapping in cached AdaptedAttention modules."""
        self._enabled = True
        self._set_adapted_attentions(self._active_adapter)

    def disable_adapter_layers(self):
        """Disable adapter layers by swapping out AdaptedAttention modules."""
        self._enabled = False
        self._remove_adapted_attentions(self._active_adapter)

    def _create_adapted_attentions(self, config: AdaptionPromptConfig, parents: List[nn.Module]) -> None:
        """Wrap LlamaAttention modules with newly created AdaptedAttention modules."""
        for par in parents:
            attn = AdaptedAttention(
                model_type=self.model.config.model_type,
                adapter_len=config.adapter_len,
                model=getattr(par, config.target_modules),
            )
            setattr(par, config.target_modules, attn)

    def _set_adapted_attentions(self, adapter_name: str) -> None:
        """Replace LlamaAttention modules with cached AdaptedAttention modules."""
        cached = self._cached_adapters[adapter_name]
        del self._cached_adapters[adapter_name]
        config = self.peft_config[adapter_name]
        for i, par in enumerate(self._parents[adapter_name]):
            setattr(par, config.target_modules, cached[i])

    def _remove_adapted_attentions(self, adapter_name: str) -> None:
        """Remove AdaptedAttention modules from the model and store them in the cache."""
        config = self.peft_config[adapter_name]
        adapted_attentions = []
        for par in self._parents[adapter_name]:
            attn = getattr(par, config.target_modules)
            adapted_attentions.append(attn)
            setattr(par, config.target_modules, attn.model)
        self._cached_adapters[adapter_name] = adapted_attentions

    def _mark_only_adaption_prompts_as_trainable(self) -> None:
        """Freeze all parameters of the model except the adaption prompts."""
        for n, p in self.model.named_parameters():
            if not is_adaption_prompt_trainable(n):
                p.requires_grad = False

    def __getattr__(self, name: str):
        """Forward missing attributes to the wrapped module."""
        try:
            return super().__getattr__(name)  # defer to nn.Module's logic
        except AttributeError:
            # This is necessary as e.g. causal models have various methods that we
            # don't want to re-implement here.
            return getattr(self.model, name)
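A hedged sketch (not part of the diff) of the swap-in/swap-out pattern described in the class docstring; the adapter names, config values and checkpoint path are illustrative, and the enable/disable calls are assumed to be forwarded to this AdaptionPromptModel instance.

from transformers import AutoModelForCausalLM
from peft import AdaptionPromptConfig, get_peft_model

base = AutoModelForCausalLM.from_pretrained("path/to/llama")  # hypothetical local Llama checkpoint
config_a = AdaptionPromptConfig(adapter_len=10, adapter_layers=30, task_type="CAUSAL_LM")
config_b = AdaptionPromptConfig(adapter_len=4, adapter_layers=16, task_type="CAUSAL_LM")

model = get_peft_model(base, config_a, adapter_name="adapter_a")
model.add_adapter("adapter_b", config_b)

model.set_adapter("adapter_b")  # caches adapter_a's AdaptedAttention modules and installs adapter_b's
model.base_model.disable_adapter_layers()  # removes the wrappers; the original attention modules are restored
model.base_model.enable_adapter_layers()  # swaps the active adapter's wrappers back in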
84
src/peft/tuners/adaption_prompt/utils.py
Normal file
@ -0,0 +1,84 @@
# coding=utf-8
# Copyright 2023-present the HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import torch
import torch.nn as nn


def llama_rotate_half(x: torch.Tensor) -> torch.Tensor:
    """
    Rotate half the hidden dims of the input.

    This function was duplicated verbatim from:
    https://github.com/huggingface/transformers/blob/1de8ce9ee1191ba761a593ac15d9ccbf5851bfc5/src/transformers/models/llama/modeling_llama.py#L126

    This was done to eliminate the Llama transformers implementation as a dependency of this file. Note that some
    other functions were also adapted from the transformers implementation but were modified.
    """
    x1 = x[..., : x.shape[-1] // 2]
    x2 = x[..., x.shape[-1] // 2 :]
    return torch.cat((-x2, x1), dim=-1)


def llama_apply_rotary_pos_emb(q, cos, sin, position_ids):
    """
    Apply rotary position embedding to query states in the Llama model.

    This function was adapted from:
    https://github.com/huggingface/transformers/blob/1de8ce9ee1191ba761a593ac15d9ccbf5851bfc5/src/transformers/models/llama/modeling_llama.py#L133

    It was modified to remove unnecessary processing of key states. The method is compatible with transformers <=
    4.34.2 and also with the latest version (>= 4.35).
    """
    # In previous transformers versions, the cached cos/sin tensors were 4D.
    if len(cos.shape) == 4:
        gather_indices = position_ids[:, None, :, None]  # [bs, 1, seq_len, 1]
        gather_indices = gather_indices.repeat(1, cos.shape[1], 1, cos.shape[3])
        cos = torch.gather(cos.repeat(gather_indices.shape[0], 1, 1, 1), 2, gather_indices)
        sin = torch.gather(sin.repeat(gather_indices.shape[0], 1, 1, 1), 2, gather_indices)
    # In the new version, it is 2D so we fall back to the new implementation
    # https://github.com/huggingface/transformers/blame/eef7ea98c31a333bacdc7ae7a2372bde772be8e4/src/transformers/models/llama/modeling_llama.py#L222-L226
    else:
        cos = cos[position_ids].unsqueeze(1)
        sin = sin[position_ids].unsqueeze(1)
    q_embed = (q * cos) + (llama_rotate_half(q) * sin)
    return q_embed


def llama_compute_query_states(model: nn.Module, **kwargs) -> torch.Tensor:
    """
    Compute query states for Llama models specifically.

    They need to be recomputed as the forward() method of the original LlamaModel in the transformers library does not
    return them. See the related discussion in the PR: https://github.com/huggingface/peft/pull/268
    """
    hidden_states = kwargs.get("hidden_states")
    position_ids = kwargs.get("position_ids")
    past_key_value = kwargs.get("past_key_value")
    bsz, q_len, _ = hidden_states.size()
    query_states = model.q_proj(hidden_states).view(bsz, q_len, model.num_heads, model.head_dim).transpose(1, 2)
    value_states = model.v_proj(hidden_states).view(bsz, q_len, model.num_heads, model.head_dim).transpose(1, 2)

    seq_len = q_len
    if past_key_value is not None:
        seq_len += past_key_value[0].shape[-2]
    cos, sin = model.rotary_emb(value_states, seq_len=seq_len)

    return llama_apply_rotary_pos_emb(query_states, cos, sin, position_ids)


def is_adaption_prompt_trainable(params: str) -> bool:
    """Return True if module is trainable under adaption prompt fine-tuning."""
    return params.split(".")[-1].startswith("adaption_")
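A tiny numeric example of llama_rotate_half (illustrative values, not part of the diff): the second half of the last dimension is negated and swapped in front of the first half.

import torch

x = torch.tensor([[1.0, 2.0, 3.0, 4.0]])
x1, x2 = x[..., :2], x[..., 2:]
print(torch.cat((-x2, x1), dim=-1))  # tensor([[-3., -4.,  1.,  2.]])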
@ -1,509 +0,0 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2023-present the HuggingFace Inc. team.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import re
|
||||
import warnings
|
||||
from dataclasses import asdict, dataclass, field
|
||||
from enum import Enum
|
||||
from typing import List, Optional, Union
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
from transformers.pytorch_utils import Conv1D
|
||||
|
||||
from ..config import PeftConfig
|
||||
from ..import_utils import is_bnb_available
|
||||
from ..utils import (
|
||||
TRANSFORMERS_MODELS_TO_IA3_FEEDFORWARD_MODULES_MAPPING,
|
||||
TRANSFORMERS_MODELS_TO_IA3_TARGET_MODULES_MAPPING,
|
||||
ModulesToSaveWrapper,
|
||||
PeftType,
|
||||
_get_submodules,
|
||||
_is_valid_match,
|
||||
transpose,
|
||||
)
|
||||
from .tuners_utils import BaseTuner, BaseTunerLayer
|
||||
|
||||
|
||||
if is_bnb_available():
|
||||
import bitsandbytes as bnb
|
||||
|
||||
|
||||
@dataclass
|
||||
class IA3Config(PeftConfig):
|
||||
"""
|
||||
This is the configuration class to store the configuration of a [`IA3Model`].
|
||||
|
||||
Args:
|
||||
target_modules (`Union[List[str],str]`): The names of the modules to apply (IA)^3 to.
|
||||
feedforward_modules (`Union[List[str],str]`): The names of the modules to be treated as feedforward modules
|
||||
as in the original paper.
|
||||
fan_in_fan_out (`bool`): Set this to True if the layer to replace stores weight like (fan_in, fan_out).
|
||||
For example, gpt-2 uses `Conv1D` which stores weights like (fan_in, fan_out) and hence this should be set to `True`.:
|
||||
modules_to_save (`List[str]`):List of modules apart from (IA)^3 layers to be set as trainable
|
||||
and saved in the final checkpoint.
|
||||
init_ia3_weights (`bool`): Whether to initialize the vectors in the (IA)^3 layers, defaults to `True`.
|
||||
"""
|
||||
|
||||
target_modules: Optional[Union[List[str], str]] = field(
|
||||
default=None,
|
||||
metadata={
|
||||
"help": "List of module names or regex expression of the module names to replace with ia3."
|
||||
"For example, ['q', 'v'] or '.*decoder.*(SelfAttention|EncDecAttention).*(q|v)$' "
|
||||
},
|
||||
)
|
||||
feedforward_modules: Optional[Union[List[str], str]] = field(
|
||||
default=None,
|
||||
metadata={
|
||||
"help": "List of module names or a regex expression of module names which are feedforward"
|
||||
"For example, ['output.dense']"
|
||||
},
|
||||
)
|
||||
fan_in_fan_out: bool = field(
|
||||
default=False,
|
||||
metadata={"help": "Set this to True if the layer to replace stores weight like (fan_in, fan_out)"},
|
||||
)
|
||||
modules_to_save: Optional[List[str]] = field(
|
||||
default=None,
|
||||
metadata={
|
||||
"help": "List of modules apart from (IA)^3 layers to be set as trainable and saved in the final checkpoint. "
|
||||
"For example, in Sequence Classification or Token Classification tasks, "
|
||||
"the final layer `classifier/score` are randomly initialized and as such need to be trainable and saved."
|
||||
},
|
||||
)
|
||||
init_ia3_weights: bool = field(
|
||||
default=True,
|
||||
metadata={"help": "Whether to initialize the vectors in the (IA)^3 layers."},
|
||||
)
|
||||
|
||||
def __post_init__(self):
|
||||
self.peft_type = PeftType.IA3
|
||||
|
||||
|
||||
class IA3Layer(BaseTunerLayer):
|
||||
def __init__(
|
||||
self,
|
||||
in_features: int,
|
||||
out_features: int,
|
||||
is_feedforward: bool,
|
||||
):
|
||||
self.scaling = {}
|
||||
self.ia3_l = nn.ParameterDict({})
|
||||
# Mark the weight as unmerged
|
||||
self.merged = False
|
||||
self.disable_adapters = False
|
||||
self.in_features = in_features
|
||||
self.out_features = out_features
|
||||
self.is_feedforward = is_feedforward
|
||||
|
||||
def update_layer(self, adapter_name, init_ia3_weights):
|
||||
# Actual trainable parameters
|
||||
if self.is_feedforward:
|
||||
weight = torch.randn((1, self.in_features))
|
||||
else:
|
||||
weight = torch.randn((self.out_features, 1))
|
||||
self.ia3_l.update(nn.ParameterDict({adapter_name: nn.Parameter(weight)}))
|
||||
if init_ia3_weights:
|
||||
self.reset_ia3_parameters(adapter_name)
|
||||
self.to(self.weight.device)
|
||||
|
||||
def reset_ia3_parameters(self, adapter_name):
|
||||
if adapter_name in self.ia3_l.keys():
|
||||
# initialize learned vector with torch.ones
|
||||
nn.init.constant_(self.ia3_l[adapter_name], 1.0)
|
||||
|
||||
|
||||
class IA3Model(BaseTuner):
|
||||
"""
|
||||
Creates a Infused Adapter by Inhibiting and Amplifying Inner Activations ((IA)^3) model from a pretrained
|
||||
transformers model. The method is described in detail in https://arxiv.org/abs/2205.05638
|
||||
|
||||
Args:
|
||||
model ([`~transformers.PreTrainedModel`]): The model to be adapted.
|
||||
config ([`IA3Config`]): The configuration of the (IA)^3 model.
|
||||
adapter_name (`str`): The name of the adapter, defaults to `"default"`.
|
||||
|
||||
Returns:
|
||||
`torch.nn.Module`: The (IA)^3 model.
|
||||
|
||||
Example:
|
||||
|
||||
```py
|
||||
>>> from transformers import AutoModelForSeq2SeqLM, ia3Config
|
||||
>>> from peft import IA3Model, IA3Config
|
||||
|
||||
>>> config = IA3Config(
|
||||
... peft_type="IA3",
|
||||
... task_type="SEQ_2_SEQ_LM",
|
||||
... target_modules=["k", "v", "w0"],
|
||||
... feedforward_modules=["w0"],
|
||||
... )
|
||||
|
||||
>>> model = AutoModelForSeq2SeqLM.from_pretrained("t5-base")
|
||||
>>> ia3_model = IA3Model(config, model)
|
||||
```
|
||||
|
||||
**Attributes**:
|
||||
- **model** ([`~transformers.PreTrainedModel`]) -- The model to be adapted.
|
||||
- **peft_config** ([`ia3Config`]): The configuration of the (IA)^3 model.
|
||||
"""
|
||||
|
||||
def __init__(self, model, config, adapter_name):
|
||||
super().__init__(model, config, adapter_name)
|
||||
|
||||
@staticmethod
|
||||
def _create_new_module(ia3_config, adapter_name, target, **kwargs):
|
||||
bias = hasattr(target, "bias") and target.bias is not None
|
||||
loaded_in_8bit = kwargs.pop("loaded_in_8bit", False)
|
||||
is_feedforward = kwargs.pop("is_feedforward", False)
|
||||
|
||||
if loaded_in_8bit and isinstance(target, bnb.nn.Linear8bitLt):
|
||||
eightbit_kwargs = kwargs.copy()
|
||||
eightbit_kwargs.update(
|
||||
{
|
||||
"has_fp16_weights": target.state.has_fp16_weights,
|
||||
"memory_efficient_backward": target.state.memory_efficient_backward,
|
||||
"threshold": target.state.threshold,
|
||||
"index": target.index,
|
||||
}
|
||||
)
|
||||
new_module = Linear8bitLt(
|
||||
adapter_name,
|
||||
target.in_features,
|
||||
target.out_features,
|
||||
is_feedforward,
|
||||
bias=bias,
|
||||
**eightbit_kwargs,
|
||||
)
|
||||
else:
|
||||
# Create a new Linear module with (IA)^3 parameters for torch.nn.Linear
|
||||
# or Conv1D modules
|
||||
if isinstance(target, torch.nn.Linear):
|
||||
in_features, out_features = target.in_features, target.out_features
|
||||
if kwargs["fan_in_fan_out"]:
|
||||
warnings.warn(
|
||||
"fan_in_fan_out is set to True but the target module is `torch.nn.Linear`. "
|
||||
"Setting fan_in_fan_out to False."
|
||||
)
|
||||
kwargs["fan_in_fan_out"] = ia3_config.fan_in_fan_out = False
|
||||
elif isinstance(target, Conv1D):
|
||||
in_features, out_features = (
|
||||
target.weight.ds_shape if hasattr(target.weight, "ds_shape") else target.weight.shape
|
||||
)
|
||||
if not kwargs["fan_in_fan_out"]:
|
||||
warnings.warn(
|
||||
"fan_in_fan_out is set to False but the target module is `Conv1D`. "
|
||||
"Setting fan_in_fan_out to True."
|
||||
)
|
||||
kwargs["fan_in_fan_out"] = ia3_config.fan_in_fan_out = True
|
||||
else:
|
||||
raise ValueError(
|
||||
f"Target module {target} is not supported. "
|
||||
f"Currently, only `torch.nn.Linear` and `Conv1D` are supported."
|
||||
)
|
||||
new_module = Linear(
|
||||
adapter_name, in_features, out_features, is_feedforward=is_feedforward, bias=bias, **kwargs
|
||||
)
|
||||
return new_module
|
||||
|
||||
@staticmethod
|
||||
def _check_target_module_exists(ia3_config, key):
|
||||
if isinstance(ia3_config.target_modules, str):
|
||||
target_module_found = re.fullmatch(ia3_config.target_modules, key)
|
||||
else:
|
||||
target_module_found = any(_is_valid_match(key, target_key) for target_key in ia3_config.target_modules)
|
||||
return target_module_found
|
||||
|
||||
def _mark_only_adapters_as_trainable(self) -> None:
|
||||
for n, p in self.model.named_parameters():
|
||||
if "ia3_" not in n:
|
||||
p.requires_grad = False
|
||||
|
||||
def _create_and_replace(
|
||||
self,
|
||||
ia3_config,
|
||||
adapter_name,
|
||||
target,
|
||||
target_name,
|
||||
parent,
|
||||
**optionnal_kwargs,
|
||||
):
|
||||
loaded_in_8bit = optionnal_kwargs["loaded_in_8bit"]
|
||||
current_key = optionnal_kwargs["current_key"]
|
||||
|
||||
# check if target module is in feedforward_modules
|
||||
if isinstance(ia3_config.feedforward_modules, str):
|
||||
is_feedforward = re.fullmatch(ia3_config.feedforward_modules, current_key)
|
||||
else:
|
||||
is_feedforward = any(current_key.endswith(target_key) for target_key in ia3_config.feedforward_modules)
|
||||
|
||||
kwargs = {
|
||||
"fan_in_fan_out": ia3_config.fan_in_fan_out,
|
||||
"init_ia3_weights": ia3_config.init_ia3_weights,
|
||||
"loaded_in_8bit": loaded_in_8bit,
|
||||
"is_feedforward": is_feedforward,
|
||||
}
|
||||
|
||||
if isinstance(target, IA3Layer):
|
||||
target.update_layer(
|
||||
adapter_name,
|
||||
ia3_config.init_ia3_weights,
|
||||
)
|
||||
else:
|
||||
new_module = self._create_new_module(ia3_config, adapter_name, target, **kwargs)
|
||||
self._replace_module(parent, target_name, new_module, target)
|
||||
|
||||
@staticmethod
|
||||
def _replace_module(parent, child_name, new_module, child):
|
||||
setattr(parent, child_name, new_module)
|
||||
new_module.weight = child.weight
|
||||
if child.bias is not None:
|
||||
new_module.bias = child.bias
|
||||
if getattr(child, "state", None) is not None:
|
||||
new_module.state = child.state
|
||||
new_module.to(child.weight.device)
|
||||
|
||||
# dispatch to correct device
|
||||
for name, module in new_module.named_modules():
|
||||
if "ia3_" in name:
|
||||
module.to(child.weight.device)
|
||||
|
||||
def __getattr__(self, name: str):
|
||||
"""Forward missing attributes to the wrapped module."""
|
||||
try:
|
||||
return super().__getattr__(name) # defer to nn.Module's logic
|
||||
except AttributeError:
|
||||
return getattr(self.model, name)
|
||||
|
||||
def get_peft_config_as_dict(self, inference: bool = False):
|
||||
config_dict = {}
|
||||
for key, value in self.peft_config.items():
|
||||
config = {k: v.value if isinstance(v, Enum) else v for k, v in asdict(value).items()}
|
||||
if inference:
|
||||
config["inference_mode"] = True
|
||||
config_dict[key] = config
|
||||
return config
|
||||
|
||||
def _set_adapter_layers(self, enabled=True):
|
||||
for module in self.model.modules():
|
||||
if isinstance(module, IA3Layer):
|
||||
module.disable_adapters = False if enabled else True
|
||||
elif isinstance(module, ModulesToSaveWrapper):
|
||||
module.disable_adapters = False if enabled else True
|
||||
|
||||
def enable_adapter_layers(self):
|
||||
self._set_adapter_layers(enabled=True)
|
||||
|
||||
def disable_adapter_layers(self):
|
||||
self._set_adapter_layers(enabled=False)
|
||||
|
||||
def set_adapter(self, adapter_name):
|
||||
for module in self.model.modules():
|
||||
if isinstance(module, IA3Layer):
|
||||
if module.merged:
|
||||
warnings.warn("Adapter cannot be set when the model is merged. Unmerging the model first.")
|
||||
module.unmerge()
|
||||
module.active_adapter = adapter_name
|
||||
|
||||
def _prepare_adapter_config(self, peft_config, model_config):
|
||||
if peft_config.target_modules is None:
|
||||
if model_config["model_type"] not in TRANSFORMERS_MODELS_TO_IA3_TARGET_MODULES_MAPPING:
|
||||
raise ValueError("Please specify `target_modules` in `peft_config`")
|
||||
peft_config.target_modules = TRANSFORMERS_MODELS_TO_IA3_TARGET_MODULES_MAPPING[model_config["model_type"]]
|
||||
if peft_config.feedforward_modules is None:
|
||||
if model_config["model_type"] not in TRANSFORMERS_MODELS_TO_IA3_FEEDFORWARD_MODULES_MAPPING:
|
||||
raise ValueError("Please specify `feedforward_modules` in `peft_config`")
|
||||
peft_config.feedforward_modules = TRANSFORMERS_MODELS_TO_IA3_FEEDFORWARD_MODULES_MAPPING[
|
||||
model_config["model_type"]
|
||||
]
|
||||
return peft_config
|
||||
|
||||
def merge_and_unload(self):
|
||||
r"""
|
||||
This method merges the (IA)^3 layers into the base model. This is needed if someone wants to use the base model
|
||||
as a standalone model.
|
||||
"""
|
||||
if getattr(self.config, "model_type", None) == "gpt2":
|
||||
raise ValueError("GPT2 models are not supported for merging ia3 layers")
|
||||
|
||||
if getattr(self.model, "is_loaded_in_8bit", False):
|
||||
raise ValueError("Cannot merge ia3 layers when the model is loaded in 8-bit mode")
|
||||
|
||||
key_list = [key for key, _ in self.model.named_modules() if "ia3" not in key]
|
||||
for key in key_list:
|
||||
try:
|
||||
parent, target, target_name = _get_submodules(self.model, key)
|
||||
except AttributeError:
|
||||
continue
|
||||
if isinstance(target, IA3Layer):
|
||||
bias = target.bias is not None
|
||||
new_module = torch.nn.Linear(target.in_features, target.out_features, bias=bias)
|
||||
target.merge()
|
||||
self._replace_module(parent, target_name, new_module, target)
|
||||
|
||||
# save any additional trainable modules part of `modules_to_save`
|
||||
if isinstance(target, ModulesToSaveWrapper):
|
||||
setattr(parent, target_name, target.modules_to_save[target.active_adapter])
|
||||
|
||||
return self.model
|
||||
|
||||
|
||||
# Below code is based on https://github.com/microsoft/lora/blob/main/loralib/layers.py
|
||||
# and modified to work with PyTorch FSDP
|
||||
|
||||
|
||||
# ------------------------------------------------------------------------------------------
|
||||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License (MIT). See LICENSE in the repo root for license information.
|
||||
# ------------------------------------------------------------------------------------------
|
||||
|
||||
|
||||
class Linear(nn.Linear, IA3Layer):
|
||||
# (IA)^3 implemented in a dense layer
|
||||
def __init__(
|
||||
self,
|
||||
adapter_name: str,
|
||||
in_features: int,
|
||||
out_features: int,
|
||||
fan_in_fan_out: bool = False, # Set this to True if the layer to replace stores weight like (fan_in, fan_out)
|
||||
is_feedforward: bool = False, # Set to True if the layer is treated as a feedforward layer
|
||||
**kwargs,
|
||||
):
|
||||
init_ia3_weights = kwargs.pop("init_ia3_weights", True)
|
||||
|
||||
nn.Linear.__init__(self, in_features, out_features, **kwargs)
|
||||
IA3Layer.__init__(self, in_features=in_features, out_features=out_features, is_feedforward=is_feedforward)
|
||||
# Freezing the pre-trained weight matrix
|
||||
self.weight.requires_grad = False
|
||||
|
||||
self.fan_in_fan_out = fan_in_fan_out
|
||||
if fan_in_fan_out:
|
||||
self.weight.data = self.weight.data.T
|
||||
|
||||
nn.Linear.reset_parameters(self)
|
||||
self.update_layer(adapter_name, init_ia3_weights)
|
||||
self.active_adapter = adapter_name
|
||||
|
||||
self.is_feedforward = is_feedforward
|
||||
|
||||
def merge(self):
|
||||
if self.active_adapter not in self.ia3_l.keys():
|
||||
return
|
||||
if self.merged:
|
||||
warnings.warn("Already merged. Nothing to do.")
|
||||
return
|
||||
|
||||
self.weight = transpose(self.weight, self.fan_in_fan_out)
|
||||
self.weight.data = torch.mul(self.weight.data, self.ia3_l[self.active_adapter].data)
|
||||
self.weight = transpose(self.weight, self.fan_in_fan_out)
|
||||
|
||||
self.merged = True
|
||||
|
||||
def unmerge(self):
|
||||
if self.active_adapter not in self.ia3_l.keys():
|
||||
return
|
||||
if not self.merged:
|
||||
warnings.warn("Already unmerged. Nothing to do.")
|
||||
return
|
||||
|
||||
warnings.warn("Unmerge result can be inaccurate for (IA)^3.")
|
||||
self.weight = transpose(self.weight, self.fan_in_fan_out)
|
||||
# divide by (IA)^3 vector. Add tolerace to avoid division by zero
|
||||
self.weight.data = torch.div(self.weight.data, self.ia3_l[self.active_adapter].data + 1e-8)
|
||||
self.weight = transpose(self.weight, self.fan_in_fan_out)
|
||||
|
||||
self.merged = False
|
||||
|
||||
def forward(self, x: torch.Tensor):
|
||||
previous_dtype = x.dtype
|
||||
|
||||
if self.active_adapter not in self.ia3_l.keys():
|
||||
return F.linear(x, transpose(self.weight, self.fan_in_fan_out), bias=self.bias)
|
||||
|
||||
if self.disable_adapters:
|
||||
if self.merged:
|
||||
self.unmerge()
|
||||
result = F.linear(x, transpose(self.weight, self.fan_in_fan_out), bias=self.bias)
|
||||
elif not self.merged:
|
||||
if self.is_feedforward:
|
||||
x = x.to(self.ia3_l[self.active_adapter].dtype)
|
||||
interm = x * self.ia3_l[self.active_adapter].flatten()
|
||||
result = F.linear(
|
||||
interm.to(self.weight.dtype),
|
||||
transpose(self.weight, self.fan_in_fan_out),
|
||||
bias=self.bias,
|
||||
)
|
||||
else:
|
||||
result = F.linear(x, transpose(self.weight, self.fan_in_fan_out), bias=self.bias)
|
||||
result = result.to(self.ia3_l[self.active_adapter].dtype) * self.ia3_l[self.active_adapter].flatten()
|
||||
else:
|
||||
result = F.linear(x, transpose(self.weight, self.fan_in_fan_out), bias=self.bias)
|
||||
|
||||
result = result.to(previous_dtype)
|
||||
|
||||
return result
|
||||
|
||||
|
||||
if is_bnb_available():
|
||||
|
||||
class Linear8bitLt(bnb.nn.Linear8bitLt, IA3Layer):
|
||||
# (IA)^3 implemented in a dense layer
|
||||
def __init__(
|
||||
self,
|
||||
adapter_name,
|
||||
in_features,
|
||||
out_features,
|
||||
is_feedforward,
|
||||
**kwargs,
|
||||
):
|
||||
bnb.nn.Linear8bitLt.__init__(
|
||||
self,
|
||||
in_features,
|
||||
out_features,
|
||||
bias=kwargs.get("bias", True),
|
||||
has_fp16_weights=kwargs.get("has_fp16_weights", True),
|
||||
memory_efficient_backward=kwargs.get("memory_efficient_backward", False),
|
||||
threshold=kwargs.get("threshold", 0.0),
|
||||
index=kwargs.get("index", None),
|
||||
)
|
||||
IA3Layer.__init__(self, in_features=in_features, out_features=out_features, is_feedforward=is_feedforward)
|
||||
|
||||
# Freezing the pre-trained weight matrix
|
||||
self.weight.requires_grad = False
|
||||
|
||||
init_ia3_weights = kwargs.pop("init_ia3_weights", True)
|
||||
self.update_layer(adapter_name, init_ia3_weights)
|
||||
self.active_adapter = adapter_name
|
||||
self.is_feedforward = is_feedforward
|
||||
|
||||
def forward(self, x: torch.Tensor):
|
||||
if self.disable_adapters or self.active_adapter not in self.ia3_l.keys():
|
||||
return super().forward(x)
|
||||
else:
|
||||
if not torch.is_autocast_enabled():
|
||||
if x.dtype != torch.float32:
|
||||
x = x.float()
|
||||
if self.is_feedforward:
|
||||
result = super().forward(x * self.ia3_l[self.active_adapter].flatten())
|
||||
else:
|
||||
result = super().forward(x)
|
||||
expected_dtype = result.dtype
|
||||
result = (result * self.ia3_l[self.active_adapter].flatten()).to(expected_dtype)
|
||||
else:
|
||||
if self.is_feedforward:
|
||||
result = super().forward(x * self.ia3_l[self.active_adapter].flatten())
|
||||
else:
|
||||
result = result * self.ia3_l[self.active_adapter].flatten()
|
||||
return result
|
||||
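A small sketch (not part of the diff; sizes are illustrative) of why merge() works for a non-feedforward (IA)^3 layer: scaling each output row of the weight by the learned vector is equivalent to scaling the layer's output at runtime.

import torch
import torch.nn.functional as F

W = torch.randn(3, 6)  # (out_features, in_features)
l = torch.rand(3, 1) + 0.5  # learned (IA)^3 vector for a non-feedforward layer
x = torch.randn(2, 6)

merged = F.linear(x, W * l)  # what merge() bakes into the base weight
unmerged = F.linear(x, W) * l.flatten()  # what the adapter computes at runtime
assert torch.allclose(merged, unmerged, atol=1e-6)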
34
src/peft/tuners/ia3/__init__.py
Normal file
@ -0,0 +1,34 @@
# coding=utf-8
# Copyright 2023-present the HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from peft.import_utils import is_bnb_4bit_available, is_bnb_available

from .config import IA3Config
from .layer import Conv2d, IA3Layer, Linear
from .model import IA3Model


__all__ = ["Conv2d", "IA3Config", "IA3Layer", "IA3Model", "Linear"]


if is_bnb_available():
    from .bnb import Linear8bitLt

    __all__ += ["Linear8bitLt"]

if is_bnb_4bit_available():
    from .bnb import Linear4bit

    __all__ += ["Linear4bit"]
142
src/peft/tuners/ia3/bnb.py
Normal file
142
src/peft/tuners/ia3/bnb.py
Normal file
@ -0,0 +1,142 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2023-present the HuggingFace Inc. team.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import bitsandbytes as bnb
|
||||
import torch
|
||||
|
||||
from peft.import_utils import is_bnb_4bit_available, is_bnb_available
|
||||
|
||||
from .layer import IA3Layer
|
||||
|
||||
|
||||
if is_bnb_available():
|
||||
|
||||
class Linear8bitLt(bnb.nn.Linear8bitLt, IA3Layer):
|
||||
# (IA)^3 implemented in a dense layer
|
||||
def __init__(
|
||||
self,
|
||||
adapter_name,
|
||||
in_features,
|
||||
out_features,
|
||||
is_feedforward,
|
||||
**kwargs,
|
||||
) -> None:
|
||||
bnb.nn.Linear8bitLt.__init__(
|
||||
self,
|
||||
in_features,
|
||||
out_features,
|
||||
bias=kwargs.get("bias", True),
|
||||
has_fp16_weights=kwargs.get("has_fp16_weights", True),
|
||||
memory_efficient_backward=kwargs.get("memory_efficient_backward", False),
|
||||
threshold=kwargs.get("threshold", 0.0),
|
||||
index=kwargs.get("index", None),
|
||||
)
|
||||
IA3Layer.__init__(self, in_features=in_features, out_features=out_features, is_feedforward=is_feedforward)
|
||||
self.is_feedforward = is_feedforward
|
||||
|
||||
# Freezing the pre-trained weight matrix
|
||||
self.weight.requires_grad = False
|
||||
|
||||
init_ia3_weights = kwargs.pop("init_ia3_weights", True)
|
||||
self.update_layer(adapter_name, init_ia3_weights)
|
||||
self.set_adapter(adapter_name)
|
||||
|
||||
def forward(self, x: torch.Tensor) -> torch.Tensor:
|
||||
if self.disable_adapters:
|
||||
return super().forward(x)
|
||||
|
||||
ia3_scaling = 1
|
||||
for active_adapter in self.active_adapters:
|
||||
if active_adapter not in self.ia3_l.keys():
|
||||
continue
|
||||
ia3_scaling *= self.ia3_l[active_adapter].flatten()
|
||||
|
||||
requires_conversion = (not torch.is_autocast_enabled()) and (x.dtype != torch.float32)
|
||||
if requires_conversion:
|
||||
x = x.float()
|
||||
if self.is_feedforward:
|
||||
result = super().forward(x * ia3_scaling)
|
||||
expected_dtype = result.dtype
|
||||
else:
|
||||
result = super().forward(x)
|
||||
expected_dtype = result.dtype
|
||||
result = result * ia3_scaling
|
||||
|
||||
if requires_conversion:
|
||||
result = result.to(expected_dtype)
|
||||
|
||||
return result
|
||||
|
||||
|
||||
if is_bnb_4bit_available():
|
||||
|
||||
class Linear4bit(bnb.nn.Linear4bit, IA3Layer):
|
||||
# IA3 implemented in a dense layer
|
||||
def __init__(
|
||||
self,
|
||||
adapter_name,
|
||||
in_features,
|
||||
out_features,
|
||||
is_feedforward,
|
||||
**kwargs,
|
||||
) -> None:
|
||||
bnb.nn.Linear4bit.__init__(
|
||||
self,
|
||||
in_features,
|
||||
out_features,
|
||||
bias=kwargs.get("bias", True),
|
||||
compute_dtype=kwargs.get("compute_dtype", torch.float32),
|
||||
compress_statistics=kwargs.get("compress_statistics", True),
|
||||
quant_type=kwargs.get("quant_type", "nf4"),
|
||||
)
|
||||
IA3Layer.__init__(self, in_features=in_features, out_features=out_features, is_feedforward=is_feedforward)
|
||||
self.is_feedforward = is_feedforward
|
||||
|
||||
# Freezing the pre-trained weight matrix
|
||||
self.weight.requires_grad = False
|
||||
|
||||
init_ia3_weights = kwargs.pop("init_ia3_weights", True)
|
||||
self.update_layer(adapter_name, init_ia3_weights)
|
||||
self.set_adapter(adapter_name)
|
||||
|
||||
def forward(self, x: torch.Tensor) -> torch.Tensor:
|
||||
if self.disable_adapters:
|
||||
return super().forward(x)
|
||||
|
||||
ia3_scaling = 1
|
||||
for active_adapter in self.active_adapters:
|
||||
if active_adapter not in self.ia3_l.keys():
|
||||
continue
|
||||
ia3_scaling *= self.ia3_l[active_adapter].flatten()
|
||||
|
||||
requires_conversion = (not torch.is_autocast_enabled()) and (x.dtype != torch.float32)
|
||||
if requires_conversion:
|
||||
x = x.float()
|
||||
if self.is_feedforward:
|
||||
result = super().forward(x * ia3_scaling)
|
||||
expected_dtype = result.dtype
|
||||
else:
|
||||
result = super().forward(x)
|
||||
expected_dtype = result.dtype
|
||||
result = result * ia3_scaling
|
||||
|
||||
result = result.clone()
|
||||
# adalora.py and lora.py both suggest that this is necessary for 4-bit training on older versions of Pytorch.
|
||||
# This has been duplicated here.
|
||||
|
||||
if requires_conversion:
|
||||
result = result.to(expected_dtype)
|
||||
|
||||
return result
|
||||
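For intuition, an illustrative sketch (not part of the diff; shapes are made up) of the core (IA)^3 operation these quantized layers implement: a learned vector rescales activations elementwise, and with the default all-ones initialization the base layer's output is unchanged.

import torch
import torch.nn.functional as F

batch, in_features, out_features = 2, 6, 3
x = torch.randn(batch, in_features)
weight = torch.randn(out_features, in_features)
ia3_l = torch.ones(1, in_features)  # feedforward case: the learned vector scales the layer *input*

scaled = F.linear(x * ia3_l.flatten(), weight)
base = F.linear(x, weight)
assert torch.allclose(scaled, base)  # all-ones init leaves the base model's behavior untouched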
87
src/peft/tuners/ia3/config.py
Normal file
@ -0,0 +1,87 @@
# coding=utf-8
# Copyright 2023-present the HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from dataclasses import dataclass, field
from typing import List, Optional, Union

from peft.config import PeftConfig
from peft.utils import PeftType


@dataclass
class IA3Config(PeftConfig):
    """
    This is the configuration class to store the configuration of a [`IA3Model`].

    Args:
        target_modules (`Union[List[str],str]`):
            The names of the modules to apply (IA)^3 to.
        feedforward_modules (`Union[List[str],str]`):
            The names of the modules to be treated as feedforward modules, as in the original paper. These modules
            will have (IA)^3 vectors multiplied to the input, instead of the output. feedforward_modules must be a
            name or a subset of names present in target_modules.
        fan_in_fan_out (`bool`):
            Set this to True if the layer to replace stores weight like (fan_in, fan_out). For example, gpt-2 uses
            `Conv1D` which stores weights like (fan_in, fan_out) and hence this should be set to `True`.
        modules_to_save (`List[str]`):
            List of modules apart from (IA)^3 layers to be set as trainable and saved in the final checkpoint.
        init_ia3_weights (`bool`):
            Whether to initialize the vectors in the (IA)^3 layers, defaults to `True`.
    """

    target_modules: Optional[Union[List[str], str]] = field(
        default=None,
        metadata={
            "help": "List of module names or regex expression of the module names to replace with ia3."
            "For example, ['q', 'v'] or '.*decoder.*(SelfAttention|EncDecAttention).*(q|v)$' "
        },
    )
    feedforward_modules: Optional[Union[List[str], str]] = field(
        default=None,
        metadata={
            "help": "List of module names or a regex expression of module names which are feedforward"
            "For example, ['output.dense']"
        },
    )
    fan_in_fan_out: bool = field(
        default=False,
        metadata={"help": "Set this to True if the layer to replace stores weight like (fan_in, fan_out)"},
    )
    modules_to_save: Optional[List[str]] = field(
        default=None,
        metadata={
            "help": "List of modules apart from (IA)^3 layers to be set as trainable and saved in the final checkpoint. "
            "For example, in Sequence Classification or Token Classification tasks, "
            "the final layer `classifier/score` are randomly initialized and as such need to be trainable and saved."
        },
    )
    init_ia3_weights: bool = field(
        default=True,
        metadata={"help": "Whether to initialize the vectors in the (IA)^3 layers."},
    )

    def __post_init__(self):
        self.peft_type = PeftType.IA3
        self.target_modules = (
            set(self.target_modules) if isinstance(self.target_modules, list) else self.target_modules
        )
        self.feedforward_modules = (
            set(self.feedforward_modules) if isinstance(self.feedforward_modules, list) else self.feedforward_modules
        )

        # check if feedforward_modules is a subset of target_modules. run the check only if both are sets
        if isinstance(self.feedforward_modules, set) and isinstance(self.target_modules, set):
            if not self.feedforward_modules.issubset(self.target_modules):
                raise ValueError("`feedforward_modules` should be a subset of `target_modules`")
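A short sketch (not part of the diff; module names are illustrative) of the subset check performed in __post_init__ above: feedforward_modules must name a subset of target_modules, otherwise a ValueError is raised.

from peft import IA3Config

# Valid: every feedforward module is also a target module.
config = IA3Config(target_modules=["k", "v", "wo"], feedforward_modules=["wo"], task_type="SEQ_2_SEQ_LM")

# Invalid: "wo" is not among the target modules, so __post_init__ raises.
try:
    IA3Config(target_modules=["k", "v"], feedforward_modules=["wo"])
except ValueError as err:
    print(err)  # `feedforward_modules` should be a subset of `target_modules`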
338
src/peft/tuners/ia3/layer.py
Normal file
338
src/peft/tuners/ia3/layer.py
Normal file
@ -0,0 +1,338 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2023-present the HuggingFace Inc. team.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import warnings
|
||||
from typing import Tuple, Union
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
|
||||
from peft.tuners.tuners_utils import BaseTunerLayer
|
||||
from peft.utils import transpose
|
||||
|
||||
|
||||
class IA3Layer(BaseTunerLayer):
|
||||
# All names of layers that may contain adapter weights
|
||||
adapter_layer_names = ("ia3_l",)
|
||||
# All names of other parameters that may contain adapter-related parameters
|
||||
other_layer_names = ("scaling",)
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
in_features: int,
|
||||
out_features: int,
|
||||
is_feedforward: bool,
|
||||
):
|
||||
self.scaling = {}
|
||||
self.ia3_l = nn.ParameterDict({})
|
||||
# Mark the weight as unmerged
|
||||
self._disable_adapters = False
|
||||
self.merged_adapters = []
|
||||
self.in_features = in_features
|
||||
self.out_features = out_features
|
||||
self.is_feedforward = is_feedforward
|
||||
|
||||
def update_layer(self, adapter_name, init_ia3_weights):
|
||||
# Actual trainable parameters
|
||||
if self.is_feedforward:
|
||||
weight = torch.randn((1, self.in_features))
|
||||
else:
|
||||
weight = torch.randn((self.out_features, 1))
|
||||
self.ia3_l[adapter_name] = nn.Parameter(weight)
|
||||
if init_ia3_weights:
|
||||
self.reset_ia3_parameters(adapter_name)
|
||||
self.to(self.weight.device)
|
||||
self.set_adapter(self.active_adapters)
|
||||
|
||||
def reset_ia3_parameters(self, adapter_name):
|
||||
if adapter_name in self.ia3_l.keys():
|
||||
# initialize learned vector with torch.ones
|
||||
nn.init.constant_(self.ia3_l[adapter_name], 1.0)
|
||||
|
||||
|
||||
class Linear(nn.Linear, IA3Layer):
|
||||
# (IA)^3 implemented in a dense layer
|
||||
def __init__(
|
||||
self,
|
||||
adapter_name: str,
|
||||
in_features: int,
|
||||
out_features: int,
|
||||
fan_in_fan_out: bool = False, # Set this to True if the layer to replace stores weight like (fan_in, fan_out)
|
||||
is_feedforward: bool = False, # Set to True if the layer is treated as a feedforward layer
|
||||
is_target_conv_1d_layer: bool = False, # whether target module is a conv1d layer. useful while unloading later
|
||||
**kwargs,
|
||||
) -> None:
|
||||
init_ia3_weights = kwargs.pop("init_ia3_weights", True)
|
||||
|
||||
nn.Linear.__init__(self, in_features, out_features, **kwargs)
|
||||
IA3Layer.__init__(self, in_features=in_features, out_features=out_features, is_feedforward=is_feedforward)
|
||||
self.is_feedforward = is_feedforward
|
||||
# Freezing the pre-trained weight matrix
|
||||
self.weight.requires_grad = False
|
||||
|
||||
self.fan_in_fan_out = fan_in_fan_out
|
||||
if fan_in_fan_out:
|
||||
self.weight.data = self.weight.data.T
|
||||
|
||||
self.is_target_conv_1d_layer = is_target_conv_1d_layer
|
||||
|
||||
nn.Linear.reset_parameters(self)
|
||||
self.update_layer(adapter_name, init_ia3_weights)
|
||||
self.set_adapter(adapter_name)
|
||||
|
||||
def update_layer(self, adapter_name, init_ia3_weights):
|
||||
# Actual trainable parameters
|
||||
if self.is_feedforward:
|
||||
weight = torch.randn((1, self.in_features))
|
||||
else:
|
||||
weight = torch.randn((self.out_features, 1))
|
||||
self.ia3_l[adapter_name] = nn.Parameter(weight)
|
||||
if init_ia3_weights:
|
||||
self.reset_ia3_parameters(adapter_name)
|
||||
self.to(self.weight.device)
|
||||
self.set_adapter(self.active_adapters)
|
||||
|
||||
def merge(self, safe_merge: bool = False) -> None:
|
||||
"""
|
||||
Merge the active adapter weights into the base weights
|
||||
|
||||
Args:
|
||||
safe_merge (`bool`, *optional*):
|
||||
If True, the merge operation will be performed in a copy of the original weights and check for NaNs
|
||||
before merging the weights. This is useful if you want to check if the merge operation will produce
|
||||
NaNs. Defaults to `False`.
|
||||
"""
|
||||
if self.merged:
|
||||
warnings.warn(
|
||||
f"Already following adapters were merged {','.join(self.merged_adapters)}. "
|
||||
f"You are now additionally merging {','.join(self.active_adapters)}."
|
||||
)
|
||||
|
||||
for active_adapter in self.active_adapters:
|
||||
if active_adapter in self.ia3_l.keys():
|
||||
if safe_merge:
|
||||
orig_weights = transpose(self.weight, self.fan_in_fan_out).clone()
|
||||
orig_weights = torch.mul(orig_weights.data, self.ia3_l[active_adapter].data)
|
||||
|
||||
if not torch.isfinite(orig_weights).all():
|
||||
raise ValueError(
|
||||
f"NaNs detected in the merged weights. The adapter {active_adapter} seems to be broken"
|
||||
)
|
||||
self.weight.data = orig_weights
|
||||
self.weight = transpose(self.weight, self.fan_in_fan_out)
|
||||
else:
|
||||
self.weight = transpose(self.weight, self.fan_in_fan_out)
|
||||
self.weight.data = torch.mul(self.weight.data, self.ia3_l[active_adapter].data)
|
||||
self.weight = transpose(self.weight, self.fan_in_fan_out)
|
||||
|
||||
if not self.is_feedforward and (self.bias is not None):
|
||||
scaling = self.ia3_l[active_adapter].reshape(self.bias.shape)
|
||||
self.bias.data = torch.mul(self.bias.data, scaling.data)
|
||||
|
||||
self.merged_adapters.append(active_adapter)
|
||||
|
||||
def unmerge(self) -> None:
|
||||
if not self.merged:
|
||||
warnings.warn("Already unmerged. Nothing to do.")
|
||||
return
|
||||
|
||||
warnings.warn("Unmerge result can be inaccurate for (IA)^3.")
|
||||
while len(self.merged_adapters) > 0:
|
||||
active_adapter = self.merged_adapters.pop()
|
||||
if active_adapter in self.ia3_l.keys():
|
||||
self.weight = transpose(self.weight, self.fan_in_fan_out)
|
||||
# divide by (IA)^3 vector. Add tolerance to avoid division by zero
|
||||
self.weight.data = torch.div(self.weight.data, self.ia3_l[active_adapter].data + 1e-8)
|
||||
self.weight = transpose(self.weight, self.fan_in_fan_out)
|
||||
|
||||
if not self.is_feedforward and (self.bias is not None):
|
||||
scaling = self.ia3_l[active_adapter].reshape(self.bias.shape)
|
||||
self.bias.data = torch.div(self.bias.data, scaling.data + 1e-8)
|
||||
|
||||
def _linear(self, input: torch.Tensor) -> torch.Tensor:
|
||||
return F.linear(input, transpose(self.weight, self.fan_in_fan_out), bias=self.bias)
|
||||
|
||||
def forward(self, x: torch.Tensor) -> torch.Tensor:
|
||||
dtype = previous_dtype = x.dtype
|
||||
|
||||
if self.disable_adapters:
|
||||
if self.merged:
|
||||
self.unmerge()
|
||||
result = self._linear(x)
|
||||
elif self.merged:
|
||||
result = self._linear(x)
|
||||
else:
|
||||
ia3_scaling = 1
|
||||
for active_adapter in self.active_adapters:
|
||||
if active_adapter not in self.ia3_l.keys():
|
||||
continue
|
||||
dtype = self.ia3_l[active_adapter].dtype
|
||||
ia3_scaling *= self.ia3_l[active_adapter].flatten()
|
||||
|
||||
if self.is_feedforward:
|
||||
x = x.to(dtype)
|
||||
# TODO: self.weight.dtype can be != self.ia3_l[self.active_adapters].dtype
|
||||
# e.g. bf16 vs fp32. Is that okay?
|
||||
interm = (x * ia3_scaling).to(self.weight.dtype)
|
||||
result = self._linear(interm)
|
||||
else:
|
||||
result = self._linear(x)
|
||||
result = result.to(dtype) * ia3_scaling
|
||||
|
||||
result = result.to(previous_dtype)
|
||||
return result
|
||||
|
||||
|
||||
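Why `merge` above is exact for `Linear`: multiplying the layer output (or, for feedforward modules, the input) by the learned vector is the same as folding that vector into the rows (or columns) of the frozen weight. A quick numerical sketch of that identity in plain torch, independent of the classes in this file:

import torch

out_f, in_f = 4, 3
W = torch.randn(out_f, in_f)
x = torch.randn(5, in_f)

# non-feedforward: ia3_l has shape (out_features, 1) and rescales the output
l_out = torch.randn(out_f, 1)
assert torch.allclose((x @ W.T) * l_out.flatten(), x @ (W * l_out).T, atol=1e-6)

# feedforward: ia3_l has shape (1, in_features) and rescales the input
l_in = torch.randn(1, in_f)
assert torch.allclose((x * l_in) @ W.T, x @ (W * l_in).T, atol=1e-6)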
class Conv2d(nn.Conv2d, IA3Layer):
|
||||
def __init__(
|
||||
self,
|
||||
adapter_name: str,
|
||||
in_channels: int,
|
||||
out_channels: int,
|
||||
kernel_size: Union[int, Tuple[int]],
|
||||
stride: Union[int, Tuple[int]] = 1,
|
||||
padding: Union[int, Tuple[int]] = 0,
|
||||
fan_in_fan_out: bool = False, # Set this to True if the layer to replace stores weight like (fan_in, fan_out)
|
||||
is_feedforward: bool = False, # Set to True if the layer is treated as a feedforward layer
|
||||
**kwargs,
|
||||
) -> None:
|
||||
init_ia3_weights = kwargs.pop("init_ia3_weights", True)
|
||||
|
||||
nn.Conv2d.__init__(self, in_channels, out_channels, kernel_size=kernel_size, stride=stride, padding=padding)
|
||||
IA3Layer.__init__(self, in_features=in_channels, out_features=out_channels, is_feedforward=is_feedforward)
|
||||
self.is_feedforward = is_feedforward
|
||||
# Freezing the pre-trained weight matrix
|
||||
self.weight.requires_grad = False
|
||||
|
||||
self.fan_in_fan_out = fan_in_fan_out
|
||||
if fan_in_fan_out:
|
||||
self.weight.data = self.weight.data.T
|
||||
|
||||
nn.Conv2d.reset_parameters(self)
|
||||
self.update_layer(adapter_name, init_ia3_weights)
|
||||
self.set_adapter(adapter_name)
|
||||
|
||||
def update_layer(self, adapter_name, init_ia3_weights):
|
||||
# Actual trainable parameters
|
||||
if self.is_feedforward:
|
||||
weight = torch.randn((1, self.in_features, 1, 1))
|
||||
else:
|
||||
weight = torch.randn((1, self.out_features, 1, 1))
|
||||
self.ia3_l[adapter_name] = nn.Parameter(weight)
|
||||
if init_ia3_weights:
|
||||
self.reset_ia3_parameters(adapter_name)
|
||||
self.to(self.weight.device)
|
||||
self.set_adapter(self.active_adapters)
|
||||
|
||||
def merge(self, safe_merge: bool = False) -> None:
|
||||
"""
|
||||
Merge the active adapter weights into the base weights
|
||||
|
||||
Args:
|
||||
safe_merge (`bool`, *optional*):
|
||||
If True, the merge operation will be performed in a copy of the original weights and check for NaNs
|
||||
before merging the weights. This is useful if you want to check if the merge operation will produce
|
||||
NaNs. Defaults to `False`.
|
||||
"""
|
||||
if self.merged:
|
||||
warnings.warn(
|
||||
f"Already following adapters were merged {','.join(self.merged_adapters)}. "
|
||||
f"You are now additionally merging {','.join(self.active_adapters)}."
|
||||
)
|
||||
|
||||
for active_adapter in self.active_adapters:
|
||||
if active_adapter in self.ia3_l.keys():
|
||||
ia3_scaling = self.ia3_l[active_adapter].data
|
||||
if not self.is_feedforward:
|
||||
ia3_scaling = ia3_scaling.permute(1, 0, 2, 3)
|
||||
|
||||
if safe_merge:
|
||||
output_weight = torch.mul(self.weight.data, ia3_scaling).clone()
|
||||
|
||||
if not torch.isfinite(output_weight).all():
|
||||
raise ValueError(
|
||||
f"NaNs detected in the merged weights. The adapter {active_adapter} seems to be broken"
|
||||
)
|
||||
|
||||
self.weight.data = output_weight
|
||||
else:
|
||||
self.weight.data = torch.mul(self.weight.data, ia3_scaling)
|
||||
|
||||
if not self.is_feedforward and (self.bias is not None):
|
||||
scaling = self.ia3_l[active_adapter].reshape(self.bias.shape)
|
||||
self.bias.data = torch.mul(self.bias.data, scaling.data)
|
||||
|
||||
self.merged_adapters.append(active_adapter)
|
||||
|
||||
def unmerge(self) -> None:
|
||||
if not self.merged:
|
||||
warnings.warn("Already unmerged. Nothing to do.")
|
||||
return
|
||||
|
||||
warnings.warn("Unmerge result can be inaccurate for (IA)^3.")
|
||||
while len(self.merged_adapters) > 0:
|
||||
active_adapter = self.merged_adapters.pop()
|
||||
if active_adapter in self.ia3_l.keys():
|
||||
# divide by (IA)^3 vector. Add tolerance to avoid division by zero
|
||||
ia3_scaling = self.ia3_l[active_adapter].data
|
||||
if not self.is_feedforward:
|
||||
ia3_scaling = ia3_scaling.permute(1, 0, 2, 3)
|
||||
self.weight.data = torch.div(self.weight.data, ia3_scaling + 1e-8)
|
||||
|
||||
if not self.is_feedforward and (self.bias is not None):
|
||||
scaling = self.ia3_l[active_adapter].reshape(self.bias.shape)
|
||||
# undo the bias scaling applied in merge(); add a tolerance to avoid division by zero
self.bias.data = torch.div(self.bias.data, scaling.data + 1e-8)
|
||||
|
||||
def _conv2d(self, input: torch.Tensor) -> torch.Tensor:
|
||||
return F.conv2d(
|
||||
input,
|
||||
self.weight,
|
||||
bias=self.bias,
|
||||
stride=self.stride,
|
||||
padding=self.padding,
|
||||
dilation=self.dilation,
|
||||
groups=self.groups,
|
||||
)
|
||||
|
||||
def forward(self, x: torch.Tensor) -> torch.Tensor:
|
||||
previous_dtype = x.dtype
|
||||
|
||||
if self.disable_adapters:
|
||||
if self.merged:
|
||||
self.unmerge()
|
||||
result = self._conv2d(x)
|
||||
elif self.merged:
|
||||
result = self._conv2d(x)
|
||||
else:
|
||||
ia3_scaling = 1
|
||||
for active_adapter in self.active_adapters:
|
||||
if active_adapter not in self.ia3_l.keys():
|
||||
continue
|
||||
dtype = self.ia3_l[active_adapter].dtype
|
||||
ia3_scaling *= self.ia3_l[active_adapter]
|
||||
|
||||
if self.is_feedforward:
|
||||
x = x.to(dtype)
|
||||
# TODO: self.weight.dtype can be != self.ia3_l[self.active_adapters].dtype
|
||||
# e.g. bf16 vs fp32. Is that okay?
|
||||
interm = (x * ia3_scaling).to(self.weight.dtype)
|
||||
result = self._conv2d(interm)
|
||||
else:
|
||||
result = self._conv2d(x)
|
||||
result = result.to(dtype) * ia3_scaling
|
||||
|
||||
result = result.to(previous_dtype)
|
||||
return result
|
||||
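The same identity holds for `Conv2d`: scaling the output channels after the convolution equals scaling the corresponding filters, which is why `merge` permutes the `(1, out_channels, 1, 1)` vector to `(out_channels, 1, 1, 1)` before multiplying it into the kernel. A small sketch in plain torch:

import torch
import torch.nn.functional as F

x = torch.randn(2, 3, 8, 8)
W = torch.randn(5, 3, 3, 3)                      # (out_channels, in_channels, kh, kw)
l = torch.randn(1, 5, 1, 1)                      # non-feedforward (IA)^3 vector

unmerged = F.conv2d(x, W) * l                    # rescale the output channels
merged = F.conv2d(x, W * l.permute(1, 0, 2, 3))  # same vector folded into the filters
assert torch.allclose(unmerged, merged, atol=1e-5)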
351 src/peft/tuners/ia3/model.py Normal file
@@ -0,0 +1,351 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2023-present the HuggingFace Inc. team.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import re
|
||||
import warnings
|
||||
from dataclasses import asdict
|
||||
from enum import Enum
|
||||
|
||||
import torch
|
||||
from transformers.pytorch_utils import Conv1D
|
||||
|
||||
from peft.import_utils import is_bnb_4bit_available, is_bnb_available
|
||||
from peft.tuners.tuners_utils import BaseTuner, check_target_module_exists
|
||||
from peft.utils import (
|
||||
TRANSFORMERS_MODELS_TO_IA3_FEEDFORWARD_MODULES_MAPPING,
|
||||
TRANSFORMERS_MODELS_TO_IA3_TARGET_MODULES_MAPPING,
|
||||
ModulesToSaveWrapper,
|
||||
_get_submodules,
|
||||
)
|
||||
|
||||
from .layer import Conv2d, IA3Layer, Linear
|
||||
|
||||
|
||||
if is_bnb_available():
|
||||
import bitsandbytes as bnb
|
||||
|
||||
from .bnb import Linear8bitLt
|
||||
|
||||
if is_bnb_4bit_available():
|
||||
from .bnb import Linear4bit
|
||||
|
||||
|
||||
class IA3Model(BaseTuner):
|
||||
"""
|
||||
Creates an Infused Adapter by Inhibiting and Amplifying Inner Activations ((IA)^3) model from a pretrained
|
||||
transformers model. The method is described in detail in https://arxiv.org/abs/2205.05638
|
||||
|
||||
Args:
|
||||
model ([`~transformers.PreTrainedModel`]): The model to be adapted.
|
||||
config ([`IA3Config`]): The configuration of the (IA)^3 model.
|
||||
adapter_name (`str`): The name of the adapter, defaults to `"default"`.
|
||||
|
||||
Returns:
|
||||
`torch.nn.Module`: The (IA)^3 model.
|
||||
|
||||
Example:
|
||||
|
||||
```py
|
||||
>>> from transformers import AutoModelForSeq2SeqLM
|
||||
>>> from peft import IA3Model, IA3Config
|
||||
|
||||
>>> config = IA3Config(
|
||||
... peft_type="IA3",
|
||||
... task_type="SEQ_2_SEQ_LM",
|
||||
... target_modules=["k", "v", "w0"],
|
||||
... feedforward_modules=["w0"],
|
||||
... )
|
||||
|
||||
>>> model = AutoModelForSeq2SeqLM.from_pretrained("t5-base")
|
||||
>>> ia3_model = IA3Model(model, config, "default")
|
||||
```
|
||||
|
||||
**Attributes**:
|
||||
- **model** ([`~transformers.PreTrainedModel`]) -- The model to be adapted.
|
||||
- **peft_config** ([`IA3Config`]): The configuration of the (IA)^3 model.
|
||||
"""
|
||||
|
||||
def __init__(self, model, config, adapter_name):
|
||||
super().__init__(model, config, adapter_name)
|
||||
|
||||
@staticmethod
|
||||
def _create_new_module(ia3_config, adapter_name, target, **kwargs):
|
||||
bias = hasattr(target, "bias") and target.bias is not None
|
||||
loaded_in_8bit = kwargs.pop("loaded_in_8bit", False)
|
||||
loaded_in_4bit = kwargs.pop("loaded_in_4bit", False)
|
||||
is_feedforward = kwargs.pop("is_feedforward", False)
|
||||
|
||||
if loaded_in_8bit and isinstance(target, bnb.nn.Linear8bitLt):
|
||||
eightbit_kwargs = kwargs.copy()
|
||||
eightbit_kwargs.update(
|
||||
{
|
||||
"has_fp16_weights": target.state.has_fp16_weights,
|
||||
"memory_efficient_backward": target.state.memory_efficient_backward,
|
||||
"threshold": target.state.threshold,
|
||||
"index": target.index,
|
||||
}
|
||||
)
|
||||
new_module = Linear8bitLt(
|
||||
adapter_name,
|
||||
target.in_features,
|
||||
target.out_features,
|
||||
is_feedforward,
|
||||
bias=bias,
|
||||
**eightbit_kwargs,
|
||||
)
|
||||
elif loaded_in_4bit and isinstance(target, bnb.nn.Linear4bit):
|
||||
fourbit_kwargs = kwargs.copy()
|
||||
fourbit_kwargs.update(
|
||||
{
|
||||
"compute_dtype": target.compute_dtype,
|
||||
"compress_statistics": target.weight.compress_statistics,
|
||||
"quant_type": target.weight.quant_type,
|
||||
}
|
||||
)
|
||||
new_module = Linear4bit(
|
||||
adapter_name,
|
||||
target.in_features,
|
||||
target.out_features,
|
||||
is_feedforward,
|
||||
bias=bias,
|
||||
**fourbit_kwargs,
|
||||
)
|
||||
elif isinstance(target, torch.nn.Conv2d):
|
||||
out_channels, in_channels = target.weight.size()[:2]
|
||||
kernel_size = target.weight.size()[2:]
|
||||
stride = target.stride
|
||||
padding = target.padding
|
||||
new_module = Conv2d(
|
||||
adapter_name=adapter_name,
|
||||
in_channels=in_channels,
|
||||
out_channels=out_channels,
|
||||
kernel_size=kernel_size,
|
||||
stride=stride,
|
||||
padding=padding,
|
||||
is_feedforward=is_feedforward,
|
||||
**kwargs,
|
||||
)
|
||||
else:
|
||||
if isinstance(target, torch.nn.Linear):
|
||||
in_features, out_features = target.in_features, target.out_features
|
||||
if kwargs["fan_in_fan_out"]:
|
||||
warnings.warn(
|
||||
"fan_in_fan_out is set to True but the target module is `torch.nn.Linear`. "
|
||||
"Setting fan_in_fan_out to False."
|
||||
)
|
||||
kwargs["fan_in_fan_out"] = ia3_config.fan_in_fan_out = False
|
||||
elif isinstance(target, Conv1D):
|
||||
in_features, out_features = (
|
||||
target.weight.ds_shape if hasattr(target.weight, "ds_shape") else target.weight.shape
|
||||
)
|
||||
kwargs["is_target_conv_1d_layer"] = True # useful for unloading later
|
||||
if not kwargs["fan_in_fan_out"]:
|
||||
warnings.warn(
|
||||
"fan_in_fan_out is set to False but the target module is `Conv1D`. "
|
||||
"Setting fan_in_fan_out to True."
|
||||
)
|
||||
kwargs["fan_in_fan_out"] = ia3_config.fan_in_fan_out = True
|
||||
else:
|
||||
raise ValueError(
|
||||
f"Target module {target} is not supported. "
|
||||
f"Currently, only `torch.nn.Linear`, `torch.nn.Conv2d`, and `Conv1D` are supported."
|
||||
)
|
||||
new_module = Linear(
|
||||
adapter_name, in_features, out_features, is_feedforward=is_feedforward, bias=bias, **kwargs
|
||||
)
|
||||
return new_module
|
||||
|
||||
@staticmethod
|
||||
def _check_target_module_exists(ia3_config, key):
|
||||
return check_target_module_exists(ia3_config, key)
|
||||
|
||||
def _mark_only_adapters_as_trainable(self) -> None:
|
||||
for n, p in self.model.named_parameters():
|
||||
if "ia3_" not in n:
|
||||
p.requires_grad = False
|
||||
|
||||
def _create_and_replace(
|
||||
self,
|
||||
ia3_config,
|
||||
adapter_name,
|
||||
target,
|
||||
target_name,
|
||||
parent,
|
||||
**optional_kwargs,
|
||||
):
|
||||
loaded_in_8bit = optional_kwargs["loaded_in_8bit"]
|
||||
loaded_in_4bit = optional_kwargs["loaded_in_4bit"]
|
||||
current_key = optional_kwargs["current_key"]
|
||||
|
||||
# check if target module is in feedforward_modules
|
||||
is_feedforward = self._check_target_module_feedforward(ia3_config, current_key)
|
||||
|
||||
kwargs = {
|
||||
"fan_in_fan_out": ia3_config.fan_in_fan_out,
|
||||
"init_ia3_weights": ia3_config.init_ia3_weights,
|
||||
"loaded_in_8bit": loaded_in_8bit,
|
||||
"loaded_in_4bit": loaded_in_4bit,
|
||||
"is_feedforward": is_feedforward,
|
||||
}
|
||||
|
||||
if isinstance(target, IA3Layer):
|
||||
if target.is_feedforward != is_feedforward:
|
||||
raise ValueError(
|
||||
"New adapter should have the same value for `is_feedforward` as previously added adapter."
|
||||
)
|
||||
if isinstance(target, torch.nn.Conv2d):
|
||||
target.update_layer(
|
||||
adapter_name,
|
||||
ia3_config.init_ia3_weights,
|
||||
)
|
||||
else: # Linear
|
||||
target.update_layer(
|
||||
adapter_name,
|
||||
ia3_config.init_ia3_weights,
|
||||
)
|
||||
else:
|
||||
new_module = self._create_new_module(ia3_config, adapter_name, target, **kwargs)
|
||||
if adapter_name != self.active_adapter:
|
||||
# adding an additional adapter: it is not automatically trainable
|
||||
new_module.requires_grad_(False)
|
||||
self._replace_module(parent, target_name, new_module, target)
|
||||
|
||||
@staticmethod
|
||||
def _check_target_module_feedforward(ia3_config, key) -> bool:
|
||||
"""
|
||||
A helper private method that checks if the target module `key` matches with a feedforward module specified in
|
||||
`ia3_config`
|
||||
"""
|
||||
if isinstance(ia3_config.feedforward_modules, str):
|
||||
is_feedforward = bool(re.fullmatch(ia3_config.feedforward_modules, key))
|
||||
else:
|
||||
is_feedforward = any(key.endswith(target_key) for target_key in ia3_config.feedforward_modules)
|
||||
return is_feedforward
|
||||
|
||||
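`_check_target_module_feedforward` treats a string config as a full-path regex and a list config as a suffix match. A small sketch of the difference, using a T5-style module path purely as an assumed example:

import re

key = "encoder.block.0.layer.1.DenseReluDense.wo"

# string form: the full module path must match the regex
print(bool(re.fullmatch(r".*DenseReluDense\.wo", key)))        # True

# list form: each entry is compared as a suffix of the module path
print(any(key.endswith(name) for name in ["wo", "wi_1"]))      # True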
@staticmethod
|
||||
def _replace_module(parent, child_name, new_module, child):
|
||||
setattr(parent, child_name, new_module)
|
||||
new_module.weight = child.weight
|
||||
if child.bias is not None:
|
||||
new_module.bias = child.bias
|
||||
if getattr(child, "state", None) is not None:
|
||||
new_module.state = child.state
|
||||
new_module.to(child.weight.device)
|
||||
|
||||
# dispatch to correct device
|
||||
for name, module in new_module.named_modules():
|
||||
if "ia3_" in name:
|
||||
module.to(child.weight.device)
|
||||
|
||||
def __getattr__(self, name: str):
|
||||
"""Forward missing attributes to the wrapped module."""
|
||||
try:
|
||||
return super().__getattr__(name) # defer to nn.Module's logic
|
||||
except AttributeError:
|
||||
return getattr(self.model, name)
|
||||
|
||||
def get_peft_config_as_dict(self, inference: bool = False):
|
||||
config_dict = {}
|
||||
for key, value in self.peft_config.items():
|
||||
config = {k: v.value if isinstance(v, Enum) else v for k, v in asdict(value).items()}
|
||||
if inference:
|
||||
config["inference_mode"] = True
|
||||
config_dict[key] = config
|
||||
return config_dict
|
||||
|
||||
def _set_adapter_layers(self, enabled=True):
|
||||
for module in self.model.modules():
|
||||
if isinstance(module, (IA3Layer, ModulesToSaveWrapper)):
|
||||
module.enable_adapters(enabled)
|
||||
|
||||
def enable_adapter_layers(self):
|
||||
self._set_adapter_layers(enabled=True)
|
||||
|
||||
def disable_adapter_layers(self):
|
||||
self._set_adapter_layers(enabled=False)
|
||||
|
||||
def set_adapter(self, adapter_name):
|
||||
for module in self.model.modules():
|
||||
if isinstance(module, IA3Layer):
|
||||
if module.merged:
|
||||
warnings.warn("Adapter cannot be set when the model is merged. Unmerging the model first.")
|
||||
module.unmerge()
|
||||
module.set_adapter(adapter_name)
|
||||
|
||||
def _prepare_adapter_config(self, peft_config, model_config):
|
||||
if peft_config.target_modules is None:
|
||||
if model_config["model_type"] not in TRANSFORMERS_MODELS_TO_IA3_TARGET_MODULES_MAPPING:
|
||||
raise ValueError("Please specify `target_modules` in `peft_config`")
|
||||
peft_config.target_modules = TRANSFORMERS_MODELS_TO_IA3_TARGET_MODULES_MAPPING[model_config["model_type"]]
|
||||
if peft_config.feedforward_modules is None:
|
||||
if model_config["model_type"] not in TRANSFORMERS_MODELS_TO_IA3_FEEDFORWARD_MODULES_MAPPING:
|
||||
raise ValueError("Please specify `feedforward_modules` in `peft_config`")
|
||||
peft_config.feedforward_modules = TRANSFORMERS_MODELS_TO_IA3_FEEDFORWARD_MODULES_MAPPING[
|
||||
model_config["model_type"]
|
||||
]
|
||||
return peft_config
|
||||
|
||||
def merge_and_unload(self, safe_merge: bool = False):
|
||||
r"""
|
||||
This method merges the (IA)^3 layers into the base model. This is needed if someone wants to use the base model
|
||||
as a standalone model.
|
||||
|
||||
Args:
|
||||
safe_merge (`bool`, `optional`, defaults to `False`):
|
||||
If True, the merge operation will be performed in a copy of the original weights and check for NaNs
|
||||
before merging the weights. This is useful if you want to check if the merge operation will produce
|
||||
NaNs. Defaults to `False`.
|
||||
"""
|
||||
if getattr(self.model, "is_loaded_in_8bit", False):
|
||||
raise ValueError("Cannot merge ia3 layers when the model is loaded in 8-bit mode")
|
||||
|
||||
if getattr(self.model, "is_loaded_in_4bit", False):
|
||||
raise ValueError("Cannot merge ia3 layers when the model is loaded in 4-bit mode")
|
||||
|
||||
key_list = [key for key, _ in self.model.named_modules() if "ia3" not in key]
|
||||
for key in key_list:
|
||||
try:
|
||||
parent, target, target_name = _get_submodules(self.model, key)
|
||||
except AttributeError:
|
||||
continue
|
||||
|
||||
# save any additional trainable modules part of `modules_to_save`
|
||||
if isinstance(target, ModulesToSaveWrapper):
|
||||
setattr(parent, target_name, target.modules_to_save[target.active_adapter])
|
||||
continue
|
||||
|
||||
if not isinstance(target, IA3Layer):
|
||||
continue
|
||||
|
||||
if isinstance(target, torch.nn.Conv2d):
|
||||
new_module = torch.nn.Conv2d(
|
||||
target.in_channels,
|
||||
target.out_channels,
|
||||
kernel_size=target.kernel_size,
|
||||
stride=target.stride,
|
||||
padding=target.padding,
|
||||
dilation=target.dilation,
|
||||
)
|
||||
else:
|
||||
bias = target.bias is not None
|
||||
if getattr(target, "is_target_conv_1d_layer", False):
|
||||
new_module = Conv1D(target.out_features, target.in_features)
|
||||
else:
|
||||
new_module = torch.nn.Linear(target.in_features, target.out_features, bias=bias)
|
||||
|
||||
target.merge(safe_merge=safe_merge)
|
||||
self._replace_module(parent, target_name, new_module, target)
|
||||
|
||||
return self.model
|
||||
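A minimal usage sketch for `merge_and_unload`, assuming the standard `get_peft_model` entry point and the T5 default target modules resolved by `_prepare_adapter_config`:

from transformers import AutoModelForSeq2SeqLM
from peft import IA3Config, get_peft_model

base = AutoModelForSeq2SeqLM.from_pretrained("t5-small")
peft_model = get_peft_model(base, IA3Config(task_type="SEQ_2_SEQ_LM"))
# ... train the (IA)^3 vectors ...

# fold the learned vectors back into the frozen weights and drop all IA3Layer wrappers
merged_model = peft_model.base_model.merge_and_unload(safe_merge=True)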
21 src/peft/tuners/loha/__init__.py Normal file
@@ -0,0 +1,21 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2023-present the HuggingFace Inc. team.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from .config import LoHaConfig
|
||||
from .layer import Conv2d, Linear, LoHaLayer
|
||||
from .model import LoHaModel
|
||||
|
||||
|
||||
__all__ = ["LoHaConfig", "LoHaModel", "Conv2d", "Linear", "LoHaLayer"]
|
||||
108 src/peft/tuners/loha/config.py Normal file
@@ -0,0 +1,108 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2023-present the HuggingFace Inc. team.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from dataclasses import dataclass, field
|
||||
from typing import List, Optional, Union
|
||||
|
||||
from peft.tuners.lycoris_utils import LycorisConfig
|
||||
from peft.utils import PeftType
|
||||
|
||||
|
||||
@dataclass
|
||||
class LoHaConfig(LycorisConfig):
|
||||
"""
|
||||
This is the configuration class to store the configuration of a [`LoHaModel`].
|
||||
|
||||
Args:
|
||||
r (`int`): LoHa rank.
|
||||
alpha (`int`): The alpha parameter for LoHa scaling.
|
||||
rank_dropout (`int`): The dropout probability for rank dimension during training.
|
||||
module_dropout (`int`): The dropout probability for disabling LoHa modules during training.
|
||||
use_effective_conv2d (`bool`):
|
||||
Use parameter effective decomposition for Conv2d with ksize > 1 ("Proposition 3" from FedPara paper).
|
||||
target_modules (`Union[List[str],str]`): The names of the modules to apply LoHa to.
|
||||
init_weights (`bool`): Whether to perform initialization of LoHa weights.
|
||||
layers_to_transform (`Union[List[int],int]`):
|
||||
The layer indexes to transform, if this argument is specified, it will apply the LoHa transformations on
|
||||
the layer indexes that are specified in this list. If a single integer is passed, it will apply the LoHa
|
||||
transformations on the layer at this index.
|
||||
layers_pattern (`str`):
|
||||
The layer pattern name, used only if `layers_to_transform` is different from `None` and if the layer
|
||||
pattern is not in the common layers pattern.
|
||||
rank_pattern (`dict`):
|
||||
The mapping from layer names or regexp expression to ranks which are different from the default rank
|
||||
specified by `r`.
|
||||
alpha_pattern (`dict`):
|
||||
The mapping from layer names or regexp expression to alphas which are different from the default alpha
|
||||
specified by `alpha`.
|
||||
modules_to_save (`List[str]`): The names of modules to be set as trainable except LoHa parameters.
|
||||
"""
|
||||
|
||||
r: int = field(default=8, metadata={"help": "LoHa rank"})
|
||||
alpha: int = field(default=8, metadata={"help": "LoHa alpha"})
|
||||
rank_dropout: float = field(
|
||||
default=0.0, metadata={"help": "The dropout probability for rank dimension during training"}
|
||||
)
|
||||
module_dropout: float = field(
|
||||
default=0.0, metadata={"help": "The dropout probability for disabling LoHa modules during training"}
|
||||
)
|
||||
use_effective_conv2d: bool = field(
|
||||
default=False,
|
||||
metadata={
|
||||
"help": 'Use parameter effective decomposition for Conv2d 3x3 with ksize > 1 ("Proposition 3" from FedPara paper)'
|
||||
},
|
||||
)
|
||||
target_modules: Optional[Union[List[str], str]] = field(
|
||||
default=None,
|
||||
metadata={
|
||||
"help": "List of module names or regex expression of the module names to replace with LoHa."
|
||||
"For example, ['q', 'v'] or '.*decoder.*(SelfAttention|EncDecAttention).*(q|v)$' "
|
||||
},
|
||||
)
|
||||
init_weights: bool = field(
|
||||
default=True,
|
||||
metadata={
|
||||
"help": (
|
||||
"Whether to initialize the weights of the LoHa layers with their default initialization. Don't change "
|
||||
"this setting, except if you know exactly what you're doing."
|
||||
),
|
||||
},
|
||||
)
|
||||
layers_to_transform: Optional[Union[List[int], int]] = field(
|
||||
default=None,
|
||||
metadata={
|
||||
"help": "The layer indexes to transform, is this argument is specified, PEFT will transform only the layers indexes that are specified inside this list. If a single integer is passed, PEFT will transform only the layer at this index."
|
||||
},
|
||||
)
|
||||
layers_pattern: Optional[str] = field(
|
||||
default=None,
|
||||
metadata={
|
||||
"help": "The layer pattern name, used only if `layers_to_transform` is different to None and if the layer pattern is not in the common layers pattern."
|
||||
},
|
||||
)
|
||||
modules_to_save: Optional[List[str]] = field(
|
||||
default=None,
|
||||
metadata={
|
||||
"help": "List of modules apart from LoHA layers to be set as trainable and saved in the final checkpoint. "
|
||||
"For example, in Sequence Classification or Token Classification tasks, "
|
||||
"the final layer `classifier/score` are randomly initialized and as such need to be trainable and saved."
|
||||
},
|
||||
)
|
||||
|
||||
def __post_init__(self):
|
||||
self.peft_type = PeftType.LOHA
|
||||
self.target_modules = (
|
||||
set(self.target_modules) if isinstance(self.target_modules, list) else self.target_modules
|
||||
)
|
||||
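A minimal sketch of constructing this config, with module names chosen purely as examples; `rank_pattern` and `alpha_pattern` come from the `LycorisConfig` base class documented above:

from peft import LoHaConfig

config = LoHaConfig(
    r=8,
    alpha=16,
    target_modules=["to_q", "to_k", "to_v"],  # example module names
    rank_dropout=0.0,
    module_dropout=0.0,
    rank_pattern={"to_v": 16},                # override the default rank for matching layers
)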
341 src/peft/tuners/loha/layer.py Normal file
@@ -0,0 +1,341 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2023-present the HuggingFace Inc. team.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import math
|
||||
from typing import Optional, Set, Tuple, Union
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
|
||||
from peft.tuners.lycoris_utils import LycorisLayer
|
||||
|
||||
|
||||
class LoHaLayer(LycorisLayer, nn.Module):
|
||||
# All names of layers that may contain adapter weights
|
||||
adapter_layer_names = ("hada_w1_a", "hada_w1_b", "hada_w2_a", "hada_w2_b", "hada_t1", "hada_t2")
|
||||
# other_param_names is defined on parent class
|
||||
|
||||
def __init__(self):
|
||||
LycorisLayer.__init__(self)
|
||||
super(nn.Module, self).__init__()
|
||||
|
||||
# LoHa info
|
||||
self.hada_w1_a = nn.ParameterDict({})
|
||||
self.hada_w1_b = nn.ParameterDict({})
|
||||
self.hada_w2_a = nn.ParameterDict({})
|
||||
self.hada_w2_b = nn.ParameterDict({})
|
||||
self.hada_t1 = nn.ParameterDict({})
|
||||
self.hada_t2 = nn.ParameterDict({})
|
||||
|
||||
@property
|
||||
def _available_adapters(self) -> Set[str]:
|
||||
return {*self.hada_w1_a, *self.hada_w1_b, *self.hada_w2_a, *self.hada_w2_b, *self.hada_t1, *self.hada_t2}
|
||||
|
||||
def create_adapter_parameters(self, adapter_name: str, r: int, shape: Tuple[int, ...]):
|
||||
# https://github.com/KohakuBlueleaf/LyCORIS/blob/eb460098187f752a5d66406d3affade6f0a07ece/lycoris/modules/loha.py#L130C9-L143C75
|
||||
if len(shape) == 4:
|
||||
self.hada_t1[adapter_name] = nn.Parameter(torch.empty(r, r, shape[2], shape[3]))
|
||||
self.hada_w1_a[adapter_name] = nn.Parameter(torch.empty(r, shape[0])) # out_dim, 1-mode
|
||||
self.hada_w1_b[adapter_name] = nn.Parameter(torch.empty(r, shape[1])) # in_dim , 2-mode
|
||||
|
||||
self.hada_t2[adapter_name] = nn.Parameter(torch.empty(r, r, shape[2], shape[3]))
|
||||
self.hada_w2_a[adapter_name] = nn.Parameter(torch.empty(r, shape[0])) # out_dim, 1-mode
|
||||
self.hada_w2_b[adapter_name] = nn.Parameter(torch.empty(r, shape[1])) # in_dim , 2-mode
|
||||
else:
|
||||
self.hada_w1_a[adapter_name] = nn.Parameter(torch.empty(shape[0], r))
|
||||
self.hada_w1_b[adapter_name] = nn.Parameter(torch.empty(r, shape[1]))
|
||||
|
||||
self.hada_w2_a[adapter_name] = nn.Parameter(torch.empty(shape[0], r))
|
||||
self.hada_w2_b[adapter_name] = nn.Parameter(torch.empty(r, shape[1]))
|
||||
|
||||
def reset_adapter_parameters(self, adapter_name: str):
|
||||
# Original implementation performs initialization with normal distribution
|
||||
# https://github.com/KohakuBlueleaf/LyCORIS/blob/3549fdef8f564761d68b695a08ef88b1122fdedc/lycoris/modules/loha.py#L158
|
||||
|
||||
# FedPara paper proposes to perform He initialization, let's stick with it
|
||||
# It is enough to initialize only a single matrix with zeros to make the adapter a no-op after initialization
|
||||
if adapter_name in self.hada_w1_a.keys():
|
||||
nn.init.kaiming_uniform_(self.hada_w1_a[adapter_name], a=math.sqrt(5))
|
||||
nn.init.kaiming_uniform_(self.hada_w1_b[adapter_name], a=math.sqrt(5))
|
||||
nn.init.kaiming_uniform_(self.hada_w2_a[adapter_name], a=math.sqrt(5))
|
||||
nn.init.zeros_(self.hada_w2_b[adapter_name])
|
||||
if adapter_name in self.hada_t1.keys():
|
||||
nn.init.kaiming_uniform_(self.hada_t1[adapter_name], a=math.sqrt(5))
|
||||
nn.init.kaiming_uniform_(self.hada_t2[adapter_name], a=math.sqrt(5))
|
||||
|
||||
def update_layer(
|
||||
self,
|
||||
adapter_name: str,
|
||||
r: int,
|
||||
alpha: float,
|
||||
rank_dropout: float,
|
||||
module_dropout: float,
|
||||
init_weights: bool,
|
||||
use_effective_conv2d: bool = False,
|
||||
**kwargs,
|
||||
) -> None:
|
||||
"""Internal function to create loha adapter
|
||||
|
||||
Args:
|
||||
adapter_name (`str`): Name for the adapter to add.
|
||||
r (`int`): Rank for the added adapter.
|
||||
alpha (`float`): Alpha for the added adapter.
|
||||
rank_dropout (`float`): The dropout probability for rank dimension during training.
|
||||
module_dropout (`float`): The dropout probability for disabling adapter during training.
|
||||
init_weights (`bool`): Whether to initialize weights.
|
||||
use_effective_conv2d (`bool`, *optional*, defaults to `False`):
|
||||
Use parameter effective decomposition for Conv2d with ksize > 1.
|
||||
"""
|
||||
|
||||
self.r[adapter_name] = r
|
||||
self.alpha[adapter_name] = alpha
|
||||
self.scaling[adapter_name] = alpha / r
|
||||
self.rank_dropout[adapter_name] = rank_dropout
|
||||
self.module_dropout[adapter_name] = module_dropout
|
||||
|
||||
# Determine shape of LoHa weights
|
||||
if isinstance(self, nn.Linear):
|
||||
shape = tuple(self.weight.shape)
|
||||
elif isinstance(self, nn.Conv2d):
|
||||
use_effective_conv2d = use_effective_conv2d and self.kernel_size != (1, 1)
|
||||
if use_effective_conv2d:
|
||||
shape = (self.out_channels, self.in_channels, *self.kernel_size)
|
||||
else:
|
||||
shape = (self.out_channels, self.in_channels * self.kernel_size[0] * self.kernel_size[1])
|
||||
else:
|
||||
raise TypeError(f"LoHa is not implemented for {type(self).__name__} layer")
|
||||
|
||||
# Create weights with provided shape
|
||||
self.create_adapter_parameters(adapter_name, r, shape)
|
||||
|
||||
# Initialize weights
|
||||
if init_weights:
|
||||
self.reset_adapter_parameters(adapter_name)
|
||||
|
||||
# Move new weights to device
|
||||
weight = getattr(self, "weight", None)
|
||||
if weight is not None:
|
||||
# the layer is already completely initialized, this is an update
|
||||
if weight.dtype.is_floating_point or weight.dtype.is_complex:
|
||||
self.to(weight.device, dtype=weight.dtype)
|
||||
else:
|
||||
self.to(weight.device)
|
||||
self.set_adapter(self.active_adapters)
|
||||
|
||||
def get_delta_weight(self, adapter_name: str) -> torch.Tensor:
|
||||
# https://github.com/KohakuBlueleaf/LyCORIS/blob/eb460098187f752a5d66406d3affade6f0a07ece/lycoris/modules/loha.py#L178
|
||||
if adapter_name in self.hada_t1.keys():
|
||||
weight = make_weight_cp(
|
||||
self.hada_t1[adapter_name],
|
||||
self.hada_w1_a[adapter_name],
|
||||
self.hada_w1_b[adapter_name],
|
||||
self.hada_t2[adapter_name],
|
||||
self.hada_w2_a[adapter_name],
|
||||
self.hada_w2_b[adapter_name],
|
||||
scale=torch.tensor(self.scaling[adapter_name]),
|
||||
)
|
||||
else:
|
||||
weight = make_weight(
|
||||
self.hada_w1_a[adapter_name],
|
||||
self.hada_w1_b[adapter_name],
|
||||
self.hada_w2_a[adapter_name],
|
||||
self.hada_w2_b[adapter_name],
|
||||
scale=torch.tensor(self.scaling[adapter_name]),
|
||||
)
|
||||
|
||||
weight = weight.reshape(self.weight.shape)
|
||||
|
||||
# Perform rank dropout during training - drop rows of addition weights
|
||||
rank_dropout = self.rank_dropout[adapter_name]
|
||||
if self.training and rank_dropout:
|
||||
drop = (torch.rand(weight.size(0)) > rank_dropout).to(weight.dtype)
|
||||
drop = drop.view(-1, *[1] * len(weight.shape[1:])).to(weight.device)
|
||||
# TODO: Investigate if there should be a scaler like in normal dropout during training
|
||||
# Original implementation doesn't have it
|
||||
# https://github.com/KohakuBlueleaf/LyCORIS/blob/eb460098187f752a5d66406d3affade6f0a07ece/lycoris/modules/loha.py#L193
|
||||
drop /= drop.mean()
|
||||
weight *= drop
|
||||
|
||||
return weight
|
||||
|
||||
|
||||
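The zero initialization of `hada_w2_b` in `reset_adapter_parameters` is what makes a freshly added adapter a no-op: the Hadamard product of the two low-rank factors is identically zero when one factor is zero. A quick sketch:

import torch

r, out_dim, in_dim = 4, 6, 5
w1a, w1b = torch.randn(out_dim, r), torch.randn(r, in_dim)
w2a, w2b = torch.randn(out_dim, r), torch.zeros(r, in_dim)   # w2b zero-initialized

delta_w = (w1a @ w1b) * (w2a @ w2b)        # the LoHa delta weight
assert torch.count_nonzero(delta_w) == 0   # the adapter contributes nothing at initialization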
class Linear(LoHaLayer, nn.Linear):
|
||||
"""LoHa implemented in Linear layer"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
in_features: int,
|
||||
out_features: int,
|
||||
bias: bool = True,
|
||||
device: Optional[Union[str, torch.device]] = None,
|
||||
dtype: Optional[torch.dtype] = None,
|
||||
adapter_name: str = "default",
|
||||
r: int = 0,
|
||||
alpha: float = 0.0,
|
||||
rank_dropout: float = 0.0,
|
||||
module_dropout: float = 0.0,
|
||||
**kwargs,
|
||||
):
|
||||
init_weights = kwargs.pop("init_weights", True)
|
||||
self._init_empty_weights(nn.Linear, in_features, out_features, bias, device=device, dtype=dtype)
|
||||
|
||||
LoHaLayer.__init__(self)
|
||||
|
||||
# Create adapter and set it active
|
||||
self.update_layer(adapter_name, r, alpha, rank_dropout, module_dropout, init_weights, **kwargs)
|
||||
self.set_adapter(adapter_name)
|
||||
|
||||
def _op(self, input: torch.Tensor, weight: torch.Tensor) -> torch.Tensor:
|
||||
return F.linear(input, weight, bias=self.bias)
|
||||
|
||||
|
||||
class Conv2d(LoHaLayer, nn.Conv2d):
|
||||
"""LoHa implemented in Conv2d layer"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
in_channels: int,
|
||||
out_channels: int,
|
||||
kernel_size: Union[int, Tuple[int]],
|
||||
stride: Union[int, Tuple[int]] = 1,
|
||||
padding: Union[int, Tuple[int]] = 0,
|
||||
dilation: int = 1,
|
||||
groups: int = 1,
|
||||
bias: bool = True,
|
||||
padding_mode: str = "zeros",
|
||||
device: Optional[Union[str, torch.device]] = None,
|
||||
dtype: Optional[torch.dtype] = None,
|
||||
adapter_name: str = "default",
|
||||
r: int = 0,
|
||||
alpha: float = 0.0,
|
||||
rank_dropout: float = 0.0,
|
||||
module_dropout: float = 0.0,
|
||||
use_effective_conv2d: bool = False,
|
||||
**kwargs,
|
||||
):
|
||||
init_weights = kwargs.pop("init_weights", True)
|
||||
self._init_empty_weights(
|
||||
nn.Conv2d,
|
||||
in_channels,
|
||||
out_channels,
|
||||
kernel_size,
|
||||
stride=stride,
|
||||
padding=padding,
|
||||
dilation=dilation,
|
||||
groups=groups,
|
||||
bias=bias,
|
||||
padding_mode=padding_mode,
|
||||
device=device,
|
||||
dtype=dtype,
|
||||
)
|
||||
|
||||
LoHaLayer.__init__(self)
|
||||
|
||||
# Create adapter and set it active
|
||||
self.update_layer(
|
||||
adapter_name, r, alpha, rank_dropout, module_dropout, init_weights, use_effective_conv2d, **kwargs
|
||||
)
|
||||
self.set_adapter(adapter_name)
|
||||
|
||||
def _op(self, input: torch.Tensor, weight: torch.Tensor) -> torch.Tensor:
|
||||
return F.conv2d(
|
||||
input,
|
||||
weight,
|
||||
bias=self.bias,
|
||||
stride=self.stride,
|
||||
padding=self.padding,
|
||||
dilation=self.dilation,
|
||||
groups=self.groups,
|
||||
)
|
||||
|
||||
|
||||
# Below code is a direct copy from https://github.com/KohakuBlueleaf/LyCORIS/blob/eb460098187f752a5d66406d3affade6f0a07ece/lycoris/modules/loha.py#L9
|
||||
|
||||
|
||||
class HadaWeight(torch.autograd.Function):
|
||||
@staticmethod
|
||||
def forward(ctx, w1a, w1b, w2a, w2b, scale=torch.tensor(1)):
|
||||
ctx.save_for_backward(w1a, w1b, w2a, w2b, scale)
|
||||
diff_weight = ((w1a @ w1b) * (w2a @ w2b)) * scale
|
||||
return diff_weight
|
||||
|
||||
@staticmethod
|
||||
def backward(ctx, grad_out):
|
||||
(w1a, w1b, w2a, w2b, scale) = ctx.saved_tensors
|
||||
grad_out = grad_out * scale
|
||||
temp = grad_out * (w2a @ w2b)
|
||||
grad_w1a = temp @ w1b.T
|
||||
grad_w1b = w1a.T @ temp
|
||||
|
||||
temp = grad_out * (w1a @ w1b)
|
||||
grad_w2a = temp @ w2b.T
|
||||
grad_w2b = w2a.T @ temp
|
||||
|
||||
del temp
|
||||
return grad_w1a, grad_w1b, grad_w2a, grad_w2b, None
|
||||
|
||||
|
||||
class HadaWeightCP(torch.autograd.Function):
|
||||
@staticmethod
|
||||
def forward(ctx, t1, w1a, w1b, t2, w2a, w2b, scale=torch.tensor(1)):
|
||||
ctx.save_for_backward(t1, w1a, w1b, t2, w2a, w2b, scale)
|
||||
|
||||
rebuild1 = torch.einsum("i j k l, j r, i p -> p r k l", t1, w1b, w1a)
|
||||
rebuild2 = torch.einsum("i j k l, j r, i p -> p r k l", t2, w2b, w2a)
|
||||
|
||||
return rebuild1 * rebuild2 * scale
|
||||
|
||||
@staticmethod
|
||||
def backward(ctx, grad_out):
|
||||
(t1, w1a, w1b, t2, w2a, w2b, scale) = ctx.saved_tensors
|
||||
grad_out = grad_out * scale
|
||||
|
||||
temp = torch.einsum("i j k l, j r -> i r k l", t2, w2b)
|
||||
rebuild = torch.einsum("i j k l, i r -> r j k l", temp, w2a)
|
||||
|
||||
grad_w = rebuild * grad_out
|
||||
del rebuild
|
||||
|
||||
grad_w1a = torch.einsum("r j k l, i j k l -> r i", temp, grad_w)
|
||||
grad_temp = torch.einsum("i j k l, i r -> r j k l", grad_w, w1a.T)
|
||||
del grad_w, temp
|
||||
|
||||
grad_w1b = torch.einsum("i r k l, i j k l -> r j", t1, grad_temp)
|
||||
grad_t1 = torch.einsum("i j k l, j r -> i r k l", grad_temp, w1b.T)
|
||||
del grad_temp
|
||||
|
||||
temp = torch.einsum("i j k l, j r -> i r k l", t1, w1b)
|
||||
rebuild = torch.einsum("i j k l, i r -> r j k l", temp, w1a)
|
||||
|
||||
grad_w = rebuild * grad_out
|
||||
del rebuild
|
||||
|
||||
grad_w2a = torch.einsum("r j k l, i j k l -> r i", temp, grad_w)
|
||||
grad_temp = torch.einsum("i j k l, i r -> r j k l", grad_w, w2a.T)
|
||||
del grad_w, temp
|
||||
|
||||
grad_w2b = torch.einsum("i r k l, i j k l -> r j", t2, grad_temp)
|
||||
grad_t2 = torch.einsum("i j k l, j r -> i r k l", grad_temp, w2b.T)
|
||||
del grad_temp
|
||||
return grad_t1, grad_w1a, grad_w1b, grad_t2, grad_w2a, grad_w2b, None
|
||||
|
||||
|
||||
def make_weight(w1a, w1b, w2a, w2b, scale):
|
||||
return HadaWeight.apply(w1a, w1b, w2a, w2b, scale)
|
||||
|
||||
|
||||
def make_weight_cp(t1, w1a, w1b, t2, w2a, w2b, scale):
|
||||
return HadaWeightCP.apply(t1, w1a, w1b, t2, w2a, w2b, scale)
|
||||
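`make_weight` wraps the custom `HadaWeight` autograd function, which recomputes the two low-rank products in `backward` instead of saving the Hadamard intermediate. Its forward result matches the plain formula; a quick check (the import path is an assumption based on this file's location):

import torch
from peft.tuners.loha.layer import make_weight

w1a = torch.randn(6, 4, requires_grad=True)
w1b = torch.randn(4, 5, requires_grad=True)
w2a = torch.randn(6, 4, requires_grad=True)
w2b = torch.randn(4, 5, requires_grad=True)

delta = make_weight(w1a, w1b, w2a, w2b, scale=torch.tensor(2.0))
assert torch.allclose(delta, ((w1a @ w1b) * (w2a @ w2b)) * 2.0)
delta.sum().backward()   # gradients come from HadaWeight.backward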
84 src/peft/tuners/loha/model.py Normal file
@@ -0,0 +1,84 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2023-present the HuggingFace Inc. team.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from typing import Dict, Type
|
||||
|
||||
import torch
|
||||
|
||||
from ..lycoris_utils import LycorisTuner
|
||||
from .layer import Conv2d, Linear, LoHaLayer
|
||||
|
||||
|
||||
class LoHaModel(LycorisTuner):
|
||||
"""
|
||||
Creates a Low-Rank Hadamard Product model from a pretrained model. The method is partially described in
|
||||
https://arxiv.org/abs/2108.06098. The current implementation heavily borrows from
|
||||
https://github.com/KohakuBlueleaf/LyCORIS/blob/eb460098187f752a5d66406d3affade6f0a07ece/lycoris/modules/loha.py
|
||||
|
||||
Args:
|
||||
model (`torch.nn.Module`): The model to which the adapter tuner layers will be attached.
|
||||
config ([`LoHaConfig`]): The configuration of the LoHa model.
|
||||
adapter_name (`str`): The name of the adapter, defaults to `"default"`.
|
||||
|
||||
Returns:
|
||||
`torch.nn.Module`: The LoHa model.
|
||||
|
||||
Example:
|
||||
```py
|
||||
>>> from diffusers import StableDiffusionPipeline
|
||||
>>> from peft import LoHaModel, LoHaConfig
|
||||
|
||||
>>> config_te = LoHaConfig(
|
||||
... r=8,
|
||||
... alpha=32,
|
||||
... target_modules=["k_proj", "q_proj", "v_proj", "out_proj", "fc1", "fc2"],
|
||||
... rank_dropout=0.0,
|
||||
... module_dropout=0.0,
|
||||
... init_weights=True,
|
||||
... )
|
||||
>>> config_unet = LoHaConfig(
|
||||
... r=8,
|
||||
... alpha=32,
|
||||
... target_modules=[
|
||||
... "proj_in",
|
||||
... "proj_out",
|
||||
... "to_k",
|
||||
... "to_q",
|
||||
... "to_v",
|
||||
... "to_out.0",
|
||||
... "ff.net.0.proj",
|
||||
... "ff.net.2",
|
||||
... ],
|
||||
... rank_dropout=0.0,
|
||||
... module_dropout=0.0,
|
||||
... init_weights=True,
|
||||
... use_effective_conv2d=True,
|
||||
... )
|
||||
|
||||
>>> model = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
|
||||
>>> model.text_encoder = LoHaModel(model.text_encoder, config_te, "default")
|
||||
>>> model.unet = LoHaModel(model.unet, config_unet, "default")
|
||||
```
|
||||
|
||||
**Attributes**:
|
||||
- **model** ([`~torch.nn.Module`]) -- The model to be adapted.
|
||||
- **peft_config** ([`LoHaConfig`]): The configuration of the LoHa model.
|
||||
"""
|
||||
|
||||
prefix: str = "hada_"
|
||||
layers_mapping: Dict[Type[torch.nn.Module], Type[LoHaLayer]] = {
|
||||
torch.nn.Conv2d: Conv2d,
|
||||
torch.nn.Linear: Linear,
|
||||
}
|
||||
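Besides wrapping a model directly as in the docstring above, the config can also go through `get_peft_model`; a minimal sketch, with the checkpoint name and module names chosen purely as examples:

from transformers import AutoModelForImageClassification
from peft import LoHaConfig, get_peft_model

base = AutoModelForImageClassification.from_pretrained("google/vit-base-patch16-224")
config = LoHaConfig(r=8, alpha=16, target_modules=["query", "value"], modules_to_save=["classifier"])
peft_model = get_peft_model(base, config)
peft_model.print_trainable_parameters()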
21 src/peft/tuners/lokr/__init__.py Normal file
@@ -0,0 +1,21 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2023-present the HuggingFace Inc. team.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from .config import LoKrConfig
|
||||
from .layer import Conv2d, Linear, LoKrLayer
|
||||
from .model import LoKrModel
|
||||
|
||||
|
||||
__all__ = ["LoKrConfig", "LoKrModel", "Conv2d", "Linear", "LoKrLayer"]
|
||||
112 src/peft/tuners/lokr/config.py Normal file
@@ -0,0 +1,112 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2023-present the HuggingFace Inc. team.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from dataclasses import dataclass, field
|
||||
from typing import List, Optional, Union
|
||||
|
||||
from peft.tuners.lycoris_utils import LycorisConfig
|
||||
from peft.utils import PeftType
|
||||
|
||||
|
||||
@dataclass
|
||||
class LoKrConfig(LycorisConfig):
|
||||
"""
|
||||
Configuration class of [`LoKrModel`].
|
||||
|
||||
Args:
|
||||
r (`int`): LoKr rank.
|
||||
alpha (`int`): The alpha parameter for LoKr scaling.
|
||||
rank_dropout (`int`): The dropout probability for rank dimension during training.
|
||||
module_dropout (`int`): The dropout probability for disabling LoKr modules during training.
|
||||
use_effective_conv2d (`bool`):
|
||||
Use parameter effective decomposition for Conv2d with ksize > 1 ("Proposition 3" from FedPara paper).
|
||||
decompose_both (`bool`): Perform rank decomposition of left kronecker product matrix.
|
||||
decompose_factor (`int`): Kronecker product decomposition factor.
|
||||
target_modules (`Union[List[str],str]`): The names of the modules to apply LoKr to.
|
||||
init_weights (`bool`): Whether to perform initialization of LoKr weights.
|
||||
layers_to_transform (`Union[List[int],int]`):
|
||||
The layer indexes to transform, if this argument is specified, it will apply the LoKr transformations on
|
||||
the layer indexes that are specified in this list. If a single integer is passed, it will apply the LoKr
|
||||
transformations on the layer at this index.
|
||||
layers_pattern (`str`):
|
||||
The layer pattern name, used only if `layers_to_transform` is different from `None` and if the layer
|
||||
pattern is not in the common layers pattern.
|
||||
rank_pattern (`dict`):
|
||||
The mapping from layer names or regexp expression to ranks which are different from the default rank
|
||||
specified by `r`.
|
||||
alpha_pattern (`dict`):
|
||||
The mapping from layer names or regexp expression to alphas which are different from the default alpha
|
||||
specified by `alpha`.
|
||||
modules_to_save (`List[str]`): The names of modules to be set as trainable except LoKr parameters.
|
||||
"""
|
||||
|
||||
r: int = field(default=8, metadata={"help": "LoKr rank"})
|
||||
alpha: int = field(default=8, metadata={"help": "LoKr alpha"})
|
||||
rank_dropout: float = field(
|
||||
default=0.0, metadata={"help": "The dropout probability for rank dimension during training"}
|
||||
)
|
||||
module_dropout: float = field(
|
||||
default=0.0, metadata={"help": "The dropout probability for disabling LoKr modules during training"}
|
||||
)
|
||||
use_effective_conv2d: bool = field(
|
||||
default=False,
|
||||
metadata={
|
||||
"help": 'Use parameter effective decomposition for Conv2d 3x3 with ksize > 1 ("Proposition 3" from FedPara paper)'
|
||||
},
|
||||
)
|
||||
decompose_both: bool = field(
|
||||
default=False,
|
||||
metadata={"help": "Perform rank decomposition of left kronecker product matrix."},
|
||||
)
|
||||
decompose_factor: int = field(default=-1, metadata={"help": "Kronecker product decomposition factor."})
|
||||
target_modules: Optional[Union[List[str], str]] = field(
|
||||
default=None,
|
||||
metadata={
|
||||
"help": "List of module names or regex expression of the module names to replace with LoKr."
|
||||
"For example, ['q', 'v'] or '.*decoder.*(SelfAttention|EncDecAttention).*(q|v)$' "
|
||||
},
|
||||
)
|
||||
init_weights: bool = field(
|
||||
default=True,
|
||||
metadata={
|
||||
"help": (
|
||||
"Whether to initialize the weights of the LoKr layers with their default initialization. Don't change "
|
||||
"this setting, except if you know exactly what you're doing."
|
||||
),
|
||||
},
|
||||
)
|
||||
layers_to_transform: Optional[Union[List[int], int]] = field(
|
||||
default=None,
|
||||
metadata={
|
||||
"help": "The layer indexes to transform, is this argument is specified, PEFT will transform only the layers indexes that are specified inside this list. If a single integer is passed, PEFT will transform only the layer at this index."
|
||||
},
|
||||
)
|
||||
layers_pattern: Optional[str] = field(
|
||||
default=None,
|
||||
metadata={
|
||||
"help": "The layer pattern name, used only if `layers_to_transform` is different to None and if the layer pattern is not in the common layers pattern."
|
||||
},
|
||||
)
|
||||
modules_to_save: Optional[List[str]] = field(
|
||||
default=None,
|
||||
metadata={
|
||||
"help": "List of modules apart from LoKr layers to be set as trainable and saved in the final checkpoint. "
|
||||
"For example, in Sequence Classification or Token Classification tasks, "
|
||||
"the final layer `classifier/score` are randomly initialized and as such need to be trainable and saved."
|
||||
},
|
||||
)
|
||||
|
||||
def __post_init__(self):
|
||||
self.peft_type = PeftType.LOKR
|
||||
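A minimal sketch of constructing this config; the module names are examples, and `decompose_factor=-1` is simply the documented default for the Kronecker decomposition factor:

from peft import LoKrConfig

config = LoKrConfig(
    r=8,
    alpha=16,
    target_modules=["q_proj", "v_proj"],  # example module names
    decompose_both=False,                 # do not additionally rank-decompose the left Kronecker factor
    decompose_factor=-1,                  # Kronecker product decomposition factor (default)
)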
374 src/peft/tuners/lokr/layer.py Normal file
@@ -0,0 +1,374 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2023-present the HuggingFace Inc. team.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import math
|
||||
from typing import Optional, Set, Tuple, Union
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
|
||||
from peft.tuners.lycoris_utils import LycorisLayer
|
||||
|
||||
|
||||
class LoKrLayer(LycorisLayer, nn.Module):
|
||||
# All names of layers that may contain adapter weights
|
||||
adapter_layer_names = (
|
||||
"lokr_w1",
|
||||
"lokr_w1_a",
|
||||
"lokr_w1_b",
|
||||
"lokr_w2",
|
||||
"lokr_w2_a",
|
||||
"lokr_w2_b",
|
||||
"lokr_t2",
|
||||
)
|
||||
# other_param_names is defined on parent class
|
||||
|
||||
def __init__(self):
|
||||
LycorisLayer.__init__(self)
|
||||
super(nn.Module, self).__init__()
|
||||
|
||||
# LoKr info
|
||||
self.lokr_w1 = nn.ParameterDict({})
|
||||
self.lokr_w1_a = nn.ParameterDict({})
|
||||
self.lokr_w1_b = nn.ParameterDict({})
|
||||
self.lokr_w2 = nn.ParameterDict({})
|
||||
self.lokr_w2_a = nn.ParameterDict({})
|
||||
self.lokr_w2_b = nn.ParameterDict({})
|
||||
self.lokr_t2 = nn.ParameterDict({})
|
||||
|
||||
@property
|
||||
def _available_adapters(self) -> Set[str]:
|
||||
return {
|
||||
*self.lokr_w1,
|
||||
*self.lokr_w1_a,
|
||||
*self.lokr_w1_b,
|
||||
*self.lokr_w2,
|
||||
*self.lokr_w2_a,
|
||||
*self.lokr_w2_b,
|
||||
*self.lokr_t2,
|
||||
}
|
||||
|
||||
def create_adapter_parameters(
|
||||
self,
|
||||
adapter_name: str,
|
||||
r: int,
|
||||
shape,
|
||||
use_w1: bool,
|
||||
use_w2: bool,
|
||||
use_effective_conv2d: bool,
|
||||
):
|
||||
if use_w1:
|
||||
self.lokr_w1[adapter_name] = nn.Parameter(torch.empty(shape[0][0], shape[1][0]))
|
||||
else:
|
||||
self.lokr_w1_a[adapter_name] = nn.Parameter(torch.empty(shape[0][0], r))
|
||||
self.lokr_w1_b[adapter_name] = nn.Parameter(torch.empty(r, shape[1][0]))
|
||||
|
||||
if len(shape) == 4:
|
||||
# Conv2d
|
||||
if use_w2:
|
||||
self.lokr_w2[adapter_name] = nn.Parameter(torch.empty(shape[0][1], shape[1][1], *shape[2:]))
|
||||
elif use_effective_conv2d:
|
||||
self.lokr_t2[adapter_name] = nn.Parameter(torch.empty(r, r, shape[2], shape[3]))
|
||||
self.lokr_w2_a[adapter_name] = nn.Parameter(torch.empty(r, shape[0][1])) # b, 1-mode
|
||||
self.lokr_w2_b[adapter_name] = nn.Parameter(torch.empty(r, shape[1][1])) # d, 2-mode
|
||||
else:
|
||||
self.lokr_w2_a[adapter_name] = nn.Parameter(torch.empty(shape[0][1], r))
|
||||
self.lokr_w2_b[adapter_name] = nn.Parameter(torch.empty(r, shape[1][1] * shape[2] * shape[3]))
|
||||
else:
|
||||
# Linear
|
||||
if use_w2:
|
||||
self.lokr_w2[adapter_name] = nn.Parameter(torch.empty(shape[0][1], shape[1][1]))
|
||||
else:
|
||||
self.lokr_w2_a[adapter_name] = nn.Parameter(torch.empty(shape[0][1], r))
|
||||
self.lokr_w2_b[adapter_name] = nn.Parameter(torch.empty(r, shape[1][1]))
|
||||
|
||||
def reset_adapter_parameters(self, adapter_name: str):
|
||||
if adapter_name in self.lokr_w1:
|
||||
nn.init.zeros_(self.lokr_w1[adapter_name])
|
||||
else:
|
||||
nn.init.zeros_(self.lokr_w1_a[adapter_name])
|
||||
nn.init.kaiming_uniform_(self.lokr_w1_b[adapter_name], a=math.sqrt(5))
|
||||
|
||||
if adapter_name in self.lokr_w2:
|
||||
nn.init.kaiming_uniform_(self.lokr_w2[adapter_name], a=math.sqrt(5))
|
||||
else:
|
||||
nn.init.kaiming_uniform_(self.lokr_w2_a[adapter_name], a=math.sqrt(5))
|
||||
nn.init.kaiming_uniform_(self.lokr_w2_b[adapter_name], a=math.sqrt(5))
|
||||
|
||||
if adapter_name in self.lokr_t2:
|
||||
nn.init.kaiming_uniform_(self.lokr_t2[adapter_name], a=math.sqrt(5))
|
||||
|
||||
def update_layer(
|
||||
self,
|
||||
adapter_name: str,
|
||||
r: int,
|
||||
alpha: float,
|
||||
rank_dropout: float,
|
||||
module_dropout: float,
|
||||
init_weights: bool,
|
||||
use_effective_conv2d: bool,
|
||||
decompose_both: bool,
|
||||
decompose_factor: int,
|
||||
**kwargs,
|
||||
) -> None:
|
||||
"""Internal function to create lokr adapter
|
||||
|
||||
Args:
|
||||
adapter_name (`str`): Name for the adapter to add.
|
||||
r (`int`): Rank for the added adapter.
|
||||
alpha (`float`): Alpha for the added adapter.
|
||||
rank_dropout (`float`): The dropout probability for rank dimension during training
|
||||
module_dropout (`float`): The dropout probability for disabling adapter during training.
|
||||
init_weights (`bool`): Whether to initialize adapter weights.
|
||||
use_effective_conv2d (`bool`): Use parameter effective decomposition for Conv2d with ksize > 1.
|
||||
decompose_both (`bool`): Perform rank decomposition of the left Kronecker product matrix.
|
||||
decompose_factor (`int`): Kronecker product decomposition factor.
|
||||
"""
|
||||
|
||||
self.r[adapter_name] = r
|
||||
self.alpha[adapter_name] = alpha
|
||||
self.scaling[adapter_name] = alpha / r
|
||||
self.rank_dropout[adapter_name] = rank_dropout
|
||||
self.module_dropout[adapter_name] = module_dropout
|
||||
|
||||
# Determine shape of LoKr weights
|
||||
if isinstance(self, nn.Linear):
|
||||
in_dim, out_dim = self.in_features, self.out_features
|
||||
|
||||
in_m, in_n = factorization(in_dim, decompose_factor)
|
||||
out_l, out_k = factorization(out_dim, decompose_factor)
|
||||
shape = ((out_l, out_k), (in_m, in_n))  # ((a, b), (c, d)); out_dim = a*b, in_dim = c*d
|
||||
|
||||
use_w1 = not (decompose_both and r < max(shape[0][0], shape[1][0]) / 2)
|
||||
use_w2 = not (r < max(shape[0][1], shape[1][1]) / 2)
|
||||
use_effective_conv2d = False
|
||||
elif isinstance(self, nn.Conv2d):
|
||||
in_dim, out_dim = self.in_channels, self.out_channels
|
||||
k_size = self.kernel_size
|
||||
|
||||
in_m, in_n = factorization(in_dim, decompose_factor)
|
||||
out_l, out_k = factorization(out_dim, decompose_factor)
|
||||
shape = ((out_l, out_k), (in_m, in_n), *k_size) # ((a, b), (c, d), *k_size)
|
||||
|
||||
use_w1 = not (decompose_both and r < max(shape[0][0], shape[1][0]) / 2)
|
||||
use_w2 = r >= max(shape[0][1], shape[1][1]) / 2
|
||||
use_effective_conv2d = use_effective_conv2d and self.kernel_size != (1, 1)
|
||||
else:
|
||||
raise TypeError(f"LoKr is not implemented for {type(self).__name__} layer")
|
||||
|
||||
# Create weights with provided shape
|
||||
self.create_adapter_parameters(adapter_name, r, shape, use_w1, use_w2, use_effective_conv2d)
|
||||
|
||||
# Initialize weights
|
||||
if init_weights:
|
||||
self.reset_adapter_parameters(adapter_name)
|
||||
|
||||
# Move new weights to device
|
||||
weight = getattr(self, "weight", None)
|
||||
if weight is not None:
|
||||
# the layer is already completely initialized, this is an update
|
||||
if weight.dtype.is_floating_point or weight.dtype.is_complex:
|
||||
self.to(weight.device, dtype=weight.dtype)
|
||||
else:
|
||||
self.to(weight.device)
|
||||
self.set_adapter(self.active_adapters)
|
||||
|
||||
def get_delta_weight(self, adapter_name: str) -> torch.Tensor:
|
||||
# https://github.com/KohakuBlueleaf/LyCORIS/blob/e4259b870d3354a9615a96be61cb5d07455c58ea/lycoris/modules/lokr.py#L224
|
||||
if adapter_name in self.lokr_w1:
|
||||
w1 = self.lokr_w1[adapter_name]
|
||||
else:
|
||||
w1 = self.lokr_w1_a[adapter_name] @ self.lokr_w1_b[adapter_name]
|
||||
|
||||
if adapter_name in self.lokr_w2:
|
||||
w2 = self.lokr_w2[adapter_name]
|
||||
elif adapter_name in self.lokr_t2:
|
||||
w2 = make_weight_cp(self.lokr_t2[adapter_name], self.lokr_w2_a[adapter_name], self.lokr_w2_b[adapter_name])
|
||||
else:
|
||||
w2 = self.lokr_w2_a[adapter_name] @ self.lokr_w2_b[adapter_name]
|
||||
|
||||
# Make weights with Kronecker product
|
||||
weight = make_kron(w1, w2)
|
||||
weight = weight.reshape(self.weight.shape)
|
||||
|
||||
# Perform rank dropout during training - drop rows of addition weights
|
||||
rank_dropout = self.rank_dropout[adapter_name]
|
||||
if self.training and rank_dropout:
|
||||
drop = (torch.rand(weight.size(0)) > rank_dropout).float()
|
||||
drop = drop.view(-1, *[1] * len(weight.shape[1:])).to(weight.device)
|
||||
drop /= drop.mean()
|
||||
weight *= drop
|
||||
|
||||
return weight
|
||||
|
||||
|
||||
class Linear(LoKrLayer, nn.Linear):
|
||||
"""LoKr implemented in Linear layer"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
in_features: int,
|
||||
out_features: int,
|
||||
bias: bool = True,
|
||||
device: Optional[Union[str, torch.device]] = None,
|
||||
dtype: Optional[torch.dtype] = None,
|
||||
adapter_name: str = "default",
|
||||
r: int = 0,
|
||||
alpha: float = 0.0,
|
||||
rank_dropout: float = 0.0,
|
||||
module_dropout: float = 0.0,
|
||||
**kwargs,
|
||||
):
|
||||
init_weights = kwargs.pop("init_weights", True)
|
||||
self._init_empty_weights(nn.Linear, in_features, out_features, bias, device=device, dtype=dtype)
|
||||
|
||||
LoKrLayer.__init__(self)
|
||||
|
||||
# Create adapter and set it active
|
||||
self.update_layer(adapter_name, r, alpha, rank_dropout, module_dropout, init_weights, **kwargs)
|
||||
self.set_adapter(adapter_name)
|
||||
|
||||
def _op(self, input: torch.Tensor, weight: torch.Tensor) -> torch.Tensor:
|
||||
return F.linear(input, weight, bias=self.bias)
|
||||
|
||||
|
||||
class Conv2d(LoKrLayer, nn.Conv2d):
|
||||
"""LoKr implemented in Conv2d layer"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
in_channels: int,
|
||||
out_channels: int,
|
||||
kernel_size: Union[int, Tuple[int]],
|
||||
stride: Union[int, Tuple[int]] = 1,
|
||||
padding: Union[int, Tuple[int]] = 0,
|
||||
dilation: int = 1,
|
||||
groups: int = 1,
|
||||
bias: bool = True,
|
||||
padding_mode: str = "zeros",
|
||||
device: Optional[Union[str, torch.device]] = None,
|
||||
dtype: Optional[torch.dtype] = None,
|
||||
adapter_name: str = "default",
|
||||
r: int = 0,
|
||||
alpha: float = 0.0,
|
||||
rank_dropout: float = 0.0,
|
||||
module_dropout: float = 0.0,
|
||||
use_effective_conv2d: bool = False,
|
||||
**kwargs,
|
||||
):
|
||||
init_weights = kwargs.pop("init_weights", True)
|
||||
self._init_empty_weights(
|
||||
nn.Conv2d,
|
||||
in_channels,
|
||||
out_channels,
|
||||
kernel_size,
|
||||
stride=stride,
|
||||
padding=padding,
|
||||
dilation=dilation,
|
||||
groups=groups,
|
||||
bias=bias,
|
||||
padding_mode=padding_mode,
|
||||
device=device,
|
||||
dtype=dtype,
|
||||
)
|
||||
|
||||
LoKrLayer.__init__(self)
|
||||
|
||||
# Create adapter and set it active
|
||||
self.update_layer(
|
||||
adapter_name, r, alpha, rank_dropout, module_dropout, init_weights, use_effective_conv2d, **kwargs
|
||||
)
|
||||
self.set_adapter(adapter_name)
|
||||
|
||||
def _op(self, input: torch.Tensor, weight: torch.Tensor) -> torch.Tensor:
|
||||
return F.conv2d(
|
||||
input,
|
||||
weight,
|
||||
bias=self.bias,
|
||||
stride=self.stride,
|
||||
padding=self.padding,
|
||||
dilation=self.dilation,
|
||||
groups=self.groups,
|
||||
)
|
||||
|
||||
|
||||
# Below code is a direct copy from https://github.com/KohakuBlueleaf/LyCORIS/blob/eb460098187f752a5d66406d3affade6f0a07ece/lycoris/modules/lokr.py#L11
|
||||
|
||||
|
||||
def factorization(dimension: int, factor: int = -1) -> Tuple[int, int]:
|
||||
"""Factorizes the provided number into the product of two numbers
|
||||
|
||||
Args:
|
||||
dimension (`int`): The number that needs to be factorized.
|
||||
factor (`int`, optional):
|
||||
Factorization divider. The algorithm will try to output two numbers, one of which will be as close to the
|
||||
factor as possible. If -1 is provided, the algorithm will search for divisors near the
|
||||
square root of the dimension. Defaults to -1.
|
||||
|
||||
Returns:
|
||||
Tuple[`int`, `int`]: A tuple of two numbers, whose product is equal to the provided number. The first number is
|
||||
always less than or equal to the second.
|
||||
|
||||
Example:
|
||||
```py
|
||||
>>> factorization(256, factor=-1)
|
||||
(16, 16)
|
||||
|
||||
>>> factorization(128, factor=-1)
|
||||
(8, 16)
|
||||
|
||||
>>> factorization(127, factor=-1)
|
||||
(1, 127)
|
||||
|
||||
>>> factorization(128, factor=4)
|
||||
(4, 32)
|
||||
```
|
||||
"""
|
||||
|
||||
if factor > 0 and (dimension % factor) == 0:
|
||||
m = factor
|
||||
n = dimension // factor
|
||||
return m, n
|
||||
if factor == -1:
|
||||
factor = dimension
|
||||
m, n = 1, dimension
|
||||
length = m + n
|
||||
while m < n:
|
||||
new_m = m + 1
|
||||
while dimension % new_m != 0:
|
||||
new_m += 1
|
||||
new_n = dimension // new_m
|
||||
if new_m + new_n > length or new_m > factor:
|
||||
break
|
||||
else:
|
||||
m, n = new_m, new_n
|
||||
if m > n:
|
||||
n, m = m, n
|
||||
return m, n
|
||||
|
||||
|
||||
def make_weight_cp(t, wa, wb):
|
||||
rebuild2 = torch.einsum("i j k l, i p, j r -> p r k l", t, wa, wb) # [c, d, k1, k2]
|
||||
return rebuild2
|
||||
|
||||
|
||||
def make_kron(w1, w2, scale=1.0):
|
||||
if len(w2.shape) == 4:
|
||||
w1 = w1.unsqueeze(2).unsqueeze(2)
|
||||
w2 = w2.contiguous()
|
||||
rebuild = torch.kron(w1, w2)
|
||||
|
||||
return rebuild * scale
|
||||
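The shape bookkeeping used by `update_layer` and `get_delta_weight` can be checked in isolation with the two helpers defined above: `factorization` splits each dimension into two factors, and `make_kron` rebuilds a full-size delta weight from the two small blocks. A minimal sketch (the dimensions are illustrative):

```python
import torch

from peft.tuners.lokr.layer import factorization, make_kron

in_dim, out_dim, r = 768, 768, 8

out_l, out_k = factorization(out_dim, factor=-1)  # out_dim == out_l * out_k
in_m, in_n = factorization(in_dim, factor=-1)     # in_dim == in_m * in_n

# w1 is the (out_l x in_m) block; w2 is the (out_k x in_n) block, here built from the
# low-rank pair w2_a @ w2_b as in create_adapter_parameters for the Linear case.
w1 = torch.zeros(out_l, in_m)                      # zero-initialized, as in reset_adapter_parameters
w2 = torch.randn(out_k, r) @ torch.randn(r, in_n)

delta = make_kron(w1, w2)                          # Kronecker product of the two blocks
assert delta.shape == (out_l * out_k, in_m * in_n) == (out_dim, in_dim)
```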
85
src/peft/tuners/lokr/model.py
Normal file
@ -0,0 +1,85 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2023-present the HuggingFace Inc. team.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from typing import Dict, Type
|
||||
|
||||
import torch
|
||||
|
||||
from ..lycoris_utils import LycorisTuner
|
||||
from .layer import Conv2d, Linear, LoKrLayer
|
||||
|
||||
|
||||
class LoKrModel(LycorisTuner):
|
||||
"""
|
||||
Creates Low-Rank Kronecker Product model from a pretrained model. The original method is partially described in
|
||||
https://arxiv.org/abs/2108.06098 and in https://arxiv.org/abs/2309.14859. The current implementation heavily borrows
|
||||
from
|
||||
https://github.com/KohakuBlueleaf/LyCORIS/blob/eb460098187f752a5d66406d3affade6f0a07ece/lycoris/modules/lokr.py
|
||||
|
||||
Args:
|
||||
model (`torch.nn.Module`): The model to which the adapter tuner layers will be attached.
|
||||
config ([`LoKrConfig`]): The configuration of the LoKr model.
|
||||
adapter_name (`str`): The name of the adapter, defaults to `"default"`.
|
||||
|
||||
Returns:
|
||||
`torch.nn.Module`: The LoKr model.
|
||||
|
||||
Example:
|
||||
```py
|
||||
>>> from diffusers import StableDiffusionPipeline
|
||||
>>> from peft import LoKrModel, LoKrConfig
|
||||
|
||||
>>> config_te = LoKrConfig(
|
||||
... r=8,
|
||||
...     alpha=32,
|
||||
... target_modules=["k_proj", "q_proj", "v_proj", "out_proj", "fc1", "fc2"],
|
||||
... rank_dropout=0.0,
|
||||
... module_dropout=0.0,
|
||||
... init_weights=True,
|
||||
... )
|
||||
>>> config_unet = LoKrConfig(
|
||||
... r=8,
|
||||
...     alpha=32,
|
||||
... target_modules=[
|
||||
... "proj_in",
|
||||
... "proj_out",
|
||||
... "to_k",
|
||||
... "to_q",
|
||||
... "to_v",
|
||||
... "to_out.0",
|
||||
... "ff.net.0.proj",
|
||||
... "ff.net.2",
|
||||
... ],
|
||||
... rank_dropout=0.0,
|
||||
... module_dropout=0.0,
|
||||
... init_weights=True,
|
||||
... use_effective_conv2d=True,
|
||||
... )
|
||||
|
||||
>>> model = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
|
||||
>>> model.text_encoder = LoKrModel(model.text_encoder, config_te, "default")
|
||||
>>> model.unet = LoKrModel(model.unet, config_unet, "default")
|
||||
```
|
||||
|
||||
**Attributes**:
|
||||
- **model** ([`~torch.nn.Module`]) -- The model to be adapted.
|
||||
- **peft_config** ([`LoKrConfig`]): The configuration of the LoKr model.
|
||||
"""
|
||||
|
||||
prefix: str = "lokr_"
|
||||
layers_mapping: Dict[Type[torch.nn.Module], Type[LoKrLayer]] = {
|
||||
torch.nn.Conv2d: Conv2d,
|
||||
torch.nn.Linear: Linear,
|
||||
}
|
||||
File diff suppressed because it is too large
35
src/peft/tuners/lora/__init__.py
Normal file
@ -0,0 +1,35 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2023-present the HuggingFace Inc. team.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from peft.import_utils import is_bnb_4bit_available, is_bnb_available
|
||||
|
||||
from .config import LoraConfig
|
||||
from .gptq import QuantLinear
|
||||
from .layer import Conv2d, Embedding, Linear, LoraLayer
|
||||
from .model import LoraModel
|
||||
|
||||
|
||||
__all__ = ["LoraConfig", "Conv2d", "Embedding", "LoraLayer", "Linear", "LoraModel", "QuantLinear"]
|
||||
|
||||
|
||||
if is_bnb_available():
|
||||
from .bnb import Linear8bitLt
|
||||
|
||||
__all__ += ["Linear8bitLt"]
|
||||
|
||||
if is_bnb_4bit_available():
|
||||
from .bnb import Linear4bit
|
||||
|
||||
__all__ += ["Linear4bit"]
|
||||
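A short usage note: downstream code should guard optional imports on the same availability checks used above, since the quantized classes are only exported when bitsandbytes is installed. A minimal sketch:

```python
from peft.import_utils import is_bnb_4bit_available, is_bnb_available

if is_bnb_available():
    # Present only when bitsandbytes is installed, mirroring the conditional export above.
    from peft.tuners.lora import Linear8bitLt  # noqa: F401

if is_bnb_4bit_available():
    from peft.tuners.lora import Linear4bit  # noqa: F401
```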
296
src/peft/tuners/lora/bnb.py
Normal file
@ -0,0 +1,296 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2023-present the HuggingFace Inc. team.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import warnings
|
||||
|
||||
import bitsandbytes as bnb
|
||||
import torch
|
||||
|
||||
from peft.import_utils import is_bnb_4bit_available, is_bnb_available
|
||||
from peft.utils.other import transpose
|
||||
|
||||
from .layer import LoraLayer
|
||||
|
||||
|
||||
if is_bnb_available():
|
||||
|
||||
class Linear8bitLt(torch.nn.Module, LoraLayer):
|
||||
# Lora implemented in a dense layer
|
||||
def __init__(
|
||||
self,
|
||||
adapter_name,
|
||||
base_layer,
|
||||
r: int = 0,
|
||||
lora_alpha: int = 1,
|
||||
lora_dropout: float = 0.0,
|
||||
**kwargs,
|
||||
) -> None:
|
||||
super().__init__()
|
||||
LoraLayer.__init__(self, in_features=base_layer.in_features, out_features=base_layer.out_features)
|
||||
self.base_layer = base_layer
|
||||
|
||||
init_lora_weights = kwargs.pop("init_lora_weights", True)
|
||||
self.update_layer(adapter_name, r, lora_alpha, lora_dropout, init_lora_weights)
|
||||
self.set_adapter(adapter_name)
|
||||
|
||||
def merge(self, safe_merge: bool = False):
|
||||
"""
|
||||
Merge the active adapter weights into the base weights
|
||||
|
||||
Args:
|
||||
safe_merge (`bool`, *optional*):
|
||||
If True, the merge operation will be performed in a copy of the original weights and check for NaNs
|
||||
before merging the weights. This is useful if you want to check if the merge operation will produce
|
||||
NaNs. Defaults to `False`.
|
||||
"""
|
||||
if self.merged:
|
||||
warnings.warn(
|
||||
f"Already following adapters were merged {','.join(self.merged_adapters)}. "
|
||||
f"You are now additionally merging {','.join(self.active_adapters)}."
|
||||
)
|
||||
|
||||
for active_adapter in self.active_adapters:
|
||||
if active_adapter not in self.lora_A.keys():
|
||||
continue
|
||||
warnings.warn(
|
||||
"Merge lora module to 8-bit linear may get different generations due to rounding errors."
|
||||
)
|
||||
lora_data = self.get_delta_weight(active_adapter)
|
||||
|
||||
weight = self.base_layer.weight
|
||||
state = self.base_layer.state
|
||||
if state.SCB is None:
|
||||
state.SCB = weight.SCB
|
||||
|
||||
# Dequantize the result of identity matrix and int8 weight because bitsandbytes does not support int8
|
||||
# dequantization directly
|
||||
im = torch.eye(weight.data.shape[-1]).contiguous().half().to(weight.device)
|
||||
im, imt, SCim, SCimt, coo_tensorim = bnb.functional.double_quant(im)
|
||||
im, Sim = bnb.functional.transform(im, "col32")
|
||||
if state.CxB is None:
|
||||
state.CxB, state.SB = bnb.functional.transform(weight.data, to_order=state.formatB)
|
||||
out32, Sout32 = bnb.functional.igemmlt(im, state.CxB, Sim, state.SB)
|
||||
output = bnb.functional.mm_dequant(out32, Sout32, SCim, state.SCB, bias=None).t()
|
||||
|
||||
w_data = output.to(lora_data.dtype).to(lora_data.device) + lora_data
|
||||
if safe_merge and not torch.isfinite(w_data).all():
|
||||
raise ValueError(
|
||||
f"NaNs detected in the merged weights. The adapter {active_adapter} seems to be broken"
|
||||
)
|
||||
|
||||
self.base_layer.weight = bnb.nn.Int8Params(
|
||||
w_data.to("cpu"), requires_grad=False, has_fp16_weights=weight.has_fp16_weights
|
||||
).to(weight.device)
|
||||
state.reset_grads()
|
||||
self.merged_adapters.append(active_adapter)
|
||||
|
||||
def unmerge(self):
|
||||
if not self.merged:
|
||||
warnings.warn("Already unmerged. Nothing to do.")
|
||||
return
|
||||
|
||||
while len(self.merged_adapters) > 0:
|
||||
active_adapter = self.merged_adapters.pop()
|
||||
if active_adapter not in self.lora_A.keys():
|
||||
continue
|
||||
warnings.warn(
|
||||
"Unmerge lora module to 8-bit linear may get different generations due to rounding errors."
|
||||
)
|
||||
lora_data = self.get_delta_weight(active_adapter)
|
||||
|
||||
weight = self.base_layer.weight
|
||||
state = self.base_layer.state
|
||||
if state.SCB is None:
|
||||
state.SCB = weight.SCB
|
||||
im = torch.eye(weight.data.shape[-1]).contiguous().half().to(weight.device)
|
||||
im, imt, SCim, SCimt, coo_tensorim = bnb.functional.double_quant(im)
|
||||
im, Sim = bnb.functional.transform(im, "col32")
|
||||
|
||||
if state.CxB is None:
|
||||
state.CxB, state.SB = bnb.functional.transform(weight.data, to_order=state.formatB)
|
||||
out32, Sout32 = bnb.functional.igemmlt(im, state.CxB, Sim, state.SB)
|
||||
output = bnb.functional.mm_dequant(out32, Sout32, SCim, state.SCB, bias=None).t()
|
||||
|
||||
w_data = output.to(lora_data.dtype).to(lora_data.device) - lora_data
|
||||
self.base_layer.weight = bnb.nn.Int8Params(
|
||||
w_data.to("cpu"), requires_grad=False, has_fp16_weights=weight.has_fp16_weights
|
||||
).to(weight.device)
|
||||
state.reset_grads()
|
||||
|
||||
def get_delta_weight(self, adapter):
|
||||
return (
|
||||
transpose(
|
||||
self.lora_B[adapter].weight @ self.lora_A[adapter].weight,
|
||||
False,
|
||||
)
|
||||
* self.scaling[adapter]
|
||||
)
|
||||
|
||||
def forward(self, x: torch.Tensor, *args, **kwargs) -> torch.Tensor:
|
||||
if self.disable_adapters:
|
||||
if self.merged:
|
||||
self.unmerge()
|
||||
result = self.base_layer(x, *args, **kwargs)
|
||||
elif self.merged:
|
||||
result = self.base_layer(x, *args, **kwargs)
|
||||
else:
|
||||
result = self.base_layer(x, *args, **kwargs)
|
||||
for active_adapter in self.active_adapters:
|
||||
if active_adapter not in self.lora_A.keys():
|
||||
continue
|
||||
lora_A = self.lora_A[active_adapter]
|
||||
lora_B = self.lora_B[active_adapter]
|
||||
dropout = self.lora_dropout[active_adapter]
|
||||
scaling = self.scaling[active_adapter]
|
||||
|
||||
requires_conversion = not torch.is_autocast_enabled()
|
||||
if requires_conversion:
|
||||
expected_dtype = result.dtype
|
||||
compute_dtype = lora_A.weight.dtype
|
||||
if x.dtype != compute_dtype:
|
||||
x = x.to(compute_dtype)
|
||||
output = lora_B(lora_A(dropout(x)))
|
||||
if requires_conversion:
|
||||
output = output.to(expected_dtype)
|
||||
output = output * scaling
|
||||
result += output
|
||||
|
||||
return result
|
||||
|
||||
|
||||
if is_bnb_4bit_available():
|
||||
|
||||
class Linear4bit(torch.nn.Module, LoraLayer):
|
||||
# Lora implemented in a dense layer
|
||||
def __init__(
|
||||
self,
|
||||
adapter_name,
|
||||
base_layer,
|
||||
r: int = 0,
|
||||
lora_alpha: int = 1,
|
||||
lora_dropout: float = 0.0,
|
||||
**kwargs,
|
||||
) -> None:
|
||||
super().__init__()
|
||||
LoraLayer.__init__(self, in_features=base_layer.in_features, out_features=base_layer.out_features)
|
||||
self.base_layer = base_layer
|
||||
|
||||
init_lora_weights = kwargs.pop("init_lora_weights", True)
|
||||
self.update_layer(adapter_name, r, lora_alpha, lora_dropout, init_lora_weights)
|
||||
self.set_adapter(adapter_name)
|
||||
|
||||
def merge(self, safe_merge: bool = False):
|
||||
"""
|
||||
Merge the active adapter weights into the base weights
|
||||
|
||||
Args:
|
||||
safe_merge (`bool`, *optional*):
|
||||
If True, the merge operation will be performed in a copy of the original weights and check for NaNs
|
||||
before merging the weights. This is useful if you want to check if the merge operation will produce
|
||||
NaNs. Defaults to `False`.
|
||||
"""
|
||||
if self.merged:
|
||||
warnings.warn(
|
||||
f"Already following adapters were merged {','.join(self.merged_adapters)}. "
|
||||
f"You are now additionally merging {','.join(self.active_adapters)}."
|
||||
)
|
||||
|
||||
for active_adapter in self.active_adapters:
|
||||
if active_adapter not in self.lora_A.keys():
|
||||
continue
|
||||
warnings.warn(
|
||||
"Merge lora module to 4-bit linear may get different generations due to rounding errors."
|
||||
)
|
||||
# Refer to https://gist.github.com/ChrisHayduk/1a53463331f52dca205e55982baf9930
|
||||
weight = self.base_layer.weight
|
||||
kwargs = weight.__dict__
|
||||
lora_data = self.get_delta_weight(active_adapter)
|
||||
|
||||
w_data = bnb.functional.dequantize_4bit(weight.data, weight.quant_state) + lora_data
|
||||
if safe_merge and not torch.isfinite(w_data).all():
|
||||
raise ValueError(
|
||||
f"NaNs detected in the merged weights. The adapter {active_adapter} seems to be broken"
|
||||
)
|
||||
|
||||
self.base_layer.weight = bnb.nn.Params4bit(w_data.to("cpu"), requires_grad=False, **kwargs).to(
|
||||
weight.device
|
||||
)
|
||||
self.merged_adapters.append(active_adapter)
|
||||
|
||||
def unmerge(self):
|
||||
if not self.merged:
|
||||
warnings.warn("Already unmerged. Nothing to do.")
|
||||
return
|
||||
|
||||
while len(self.merged_adapters) > 0:
|
||||
active_adapter = self.merged_adapters.pop()
|
||||
if active_adapter not in self.lora_A.keys():
|
||||
continue
|
||||
warnings.warn(
|
||||
"Unmerge lora module to 4-bit linear may get different generations due to rounding errors."
|
||||
)
|
||||
weight = self.base_layer.weight
|
||||
kwargs = weight.__dict__
|
||||
lora_data = self.get_delta_weight(active_adapter)
|
||||
w_data = bnb.functional.dequantize_4bit(weight.data, weight.quant_state) - lora_data
|
||||
self.base_layer.weight = bnb.nn.Params4bit(w_data.to("cpu"), requires_grad=False, **kwargs).to(
|
||||
weight.device
|
||||
)
|
||||
|
||||
def get_delta_weight(self, adapter):
|
||||
return (
|
||||
transpose(
|
||||
self.lora_B[adapter].weight @ self.lora_A[adapter].weight,
|
||||
False,
|
||||
)
|
||||
* self.scaling[adapter]
|
||||
)
|
||||
|
||||
def forward(self, x: torch.Tensor, *args, **kwargs) -> torch.Tensor:
|
||||
if self.disable_adapters:
|
||||
if self.merged:
|
||||
self.unmerge()
|
||||
result = self.base_layer.forward(x, *args, **kwargs)
|
||||
elif self.merged:
|
||||
result = self.base_layer.forward(x, *args, **kwargs)
|
||||
else:
|
||||
result = self.base_layer.forward(x, *args, **kwargs)
|
||||
# As per Tim Dettmers, for 4bit, we need to defensively clone here.
|
||||
# The reason is that in some cases, an error can occur that backprop
|
||||
# does not work on a manipulated view. This issue may be solved with
|
||||
# newer PyTorch versions but this would need extensive testing to be
|
||||
# sure.
|
||||
result = result.clone()
|
||||
|
||||
for active_adapter in self.active_adapters:
|
||||
if active_adapter not in self.lora_A.keys():
|
||||
continue
|
||||
lora_A = self.lora_A[active_adapter]
|
||||
lora_B = self.lora_B[active_adapter]
|
||||
dropout = self.lora_dropout[active_adapter]
|
||||
scaling = self.scaling[active_adapter]
|
||||
|
||||
requires_conversion = not torch.is_autocast_enabled()
|
||||
if requires_conversion:
|
||||
expected_dtype = result.dtype
|
||||
x = x.to(lora_A.weight.dtype)
|
||||
|
||||
output = lora_B(lora_A(dropout(x)))
|
||||
if requires_conversion:
|
||||
output = output.to(expected_dtype)
|
||||
output = output * scaling
|
||||
result += output
|
||||
|
||||
return result
|
||||
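Both quantized variants above follow the same merge contract as the float LoRA layers: the merged weight is the (dequantized) base weight plus `get_delta_weight`, i.e. `W + (lora_B.weight @ lora_A.weight) * scaling`. The sketch below checks that identity with plain float tensors only; the bitsandbytes quantize/dequantize round-trip is deliberately left out, so this is a reference for the math, not for the 8-bit/4-bit code paths themselves.

```python
import torch

torch.manual_seed(0)
in_features, out_features, r, scaling = 16, 32, 4, 2.0

base_weight = torch.randn(out_features, in_features)
lora_A = torch.randn(r, in_features)   # plays the role of lora_A.weight
lora_B = torch.randn(out_features, r)  # plays the role of lora_B.weight

delta = (lora_B @ lora_A) * scaling    # matches get_delta_weight (transpose(..., False) is a no-op)
merged_weight = base_weight + delta

x = torch.randn(8, in_features)
unmerged_out = x @ base_weight.T + (x @ lora_A.T @ lora_B.T) * scaling  # adapter-on forward
merged_out = x @ merged_weight.T                                        # forward after merge()
assert torch.allclose(unmerged_out, merged_out, atol=1e-4)
```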
132
src/peft/tuners/lora/config.py
Normal file
@ -0,0 +1,132 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2023-present the HuggingFace Inc. team.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from dataclasses import dataclass, field
|
||||
from typing import List, Optional, Union
|
||||
|
||||
from peft.config import PeftConfig
|
||||
from peft.utils import PeftType
|
||||
|
||||
|
||||
@dataclass
|
||||
class LoraConfig(PeftConfig):
|
||||
"""
|
||||
This is the configuration class to store the configuration of a [`LoraModel`].
|
||||
|
||||
Args:
|
||||
r (`int`): Lora attention dimension.
|
||||
target_modules (`Union[List[str],str]`): The names of the modules to apply Lora to.
|
||||
lora_alpha (`int`): The alpha parameter for Lora scaling.
|
||||
lora_dropout (`float`): The dropout probability for Lora layers.
|
||||
fan_in_fan_out (`bool`): Set this to True if the layer to replace stores weight like (fan_in, fan_out).
|
||||
For example, gpt-2 uses `Conv1D` which stores weights like (fan_in, fan_out) and hence this should be set
|
||||
to `True`.
|
||||
bias (`str`): Bias type for Lora. Can be 'none', 'all' or 'lora_only'. If 'all' or 'lora_only', the
|
||||
corresponding biases will be updated during training. Be aware that this means that, even when disabling
|
||||
the adapters, the model will not produce the same output as the base model would have without adaptation.
|
||||
modules_to_save (`List[str]`): List of modules apart from LoRA layers to be set as trainable
|
||||
and saved in the final checkpoint.
|
||||
layers_to_transform (`Union[List[int],int]`):
|
||||
The layer indexes to transform, if this argument is specified, it will apply the LoRA transformations on
|
||||
the layer indexes that are specified in this list. If a single integer is passed, it will apply the LoRA
|
||||
transformations on the layer at this index.
|
||||
layers_pattern (`str`):
|
||||
The layer pattern name, used only if `layers_to_transform` is different from `None` and if the layer
|
||||
pattern is not in the common layers pattern.
|
||||
rank_pattern (`dict`):
|
||||
The mapping from layer names or regexp expression to ranks which are different from the default rank
|
||||
specified by `r`.
|
||||
alpha_pattern (`dict`):
|
||||
The mapping from layer names or regexp expression to alphas which are different from the default alpha
|
||||
specified by `lora_alpha`.
|
||||
"""
|
||||
|
||||
r: int = field(default=8, metadata={"help": "Lora attention dimension"})
|
||||
target_modules: Optional[Union[List[str], str]] = field(
|
||||
default=None,
|
||||
metadata={
|
||||
"help": "List of module names or regex expression of the module names to replace with Lora."
|
||||
"For example, ['q', 'v'] or '.*decoder.*(SelfAttention|EncDecAttention).*(q|v)$' "
|
||||
},
|
||||
)
|
||||
lora_alpha: int = field(default=8, metadata={"help": "Lora alpha"})
|
||||
lora_dropout: float = field(default=0.0, metadata={"help": "Lora dropout"})
|
||||
fan_in_fan_out: bool = field(
|
||||
default=False,
|
||||
metadata={"help": "Set this to True if the layer to replace stores weight like (fan_in, fan_out)"},
|
||||
)
|
||||
bias: str = field(default="none", metadata={"help": "Bias type for Lora. Can be 'none', 'all' or 'lora_only'"})
|
||||
modules_to_save: Optional[List[str]] = field(
|
||||
default=None,
|
||||
metadata={
|
||||
"help": "List of modules apart from LoRA layers to be set as trainable and saved in the final checkpoint. "
|
||||
"For example, in Sequence Classification or Token Classification tasks, "
|
||||
"the final layer `classifier/score` are randomly initialized and as such need to be trainable and saved."
|
||||
},
|
||||
)
|
||||
init_lora_weights: bool = field(
|
||||
default=True,
|
||||
metadata={
|
||||
"help": (
|
||||
"Whether to initialize the weights of the Lora layers with their default initialization. Don't change "
|
||||
"this setting, except if you know exactly what you're doing."
|
||||
),
|
||||
},
|
||||
)
|
||||
layers_to_transform: Optional[Union[List[int], int]] = field(
|
||||
default=None,
|
||||
metadata={
|
||||
"help": "The layer indexes to transform, is this argument is specified, PEFT will transform only the layers indexes that are specified inside this list. If a single integer is passed, PEFT will transform only the layer at this index. "
|
||||
"This only works when target_modules is a list of str."
|
||||
},
|
||||
)
|
||||
layers_pattern: Optional[Union[List[str], str]] = field(
|
||||
default=None,
|
||||
metadata={
|
||||
"help": "The layer pattern name, used only if `layers_to_transform` is different to None and if the layer pattern is not in the common layers pattern."
|
||||
"This only works when target_modules is a list of str."
|
||||
},
|
||||
)
|
||||
rank_pattern: Optional[dict] = field(
|
||||
default_factory=dict,
|
||||
metadata={
|
||||
"help": (
|
||||
"The mapping from layer names or regexp expression to ranks which are different from the default rank specified by `r`. "
|
||||
"For example, `{model.decoder.layers.0.encoder_attn.k_proj: 8`}"
|
||||
)
|
||||
},
|
||||
)
|
||||
alpha_pattern: Optional[dict] = field(
|
||||
default_factory=dict,
|
||||
metadata={
|
||||
"help": (
|
||||
"The mapping from layer names or regexp expression to alphas which are different from the default alpha specified by `lora_alpha`. "
|
||||
"For example, `{model.decoder.layers.0.encoder_attn.k_proj: 32`}"
|
||||
)
|
||||
},
|
||||
)
|
||||
|
||||
def __post_init__(self):
|
||||
self.peft_type = PeftType.LORA
|
||||
self.target_modules = (
|
||||
set(self.target_modules) if isinstance(self.target_modules, list) else self.target_modules
|
||||
)
|
||||
# if target_modules is a regex expression, then layers_to_transform should be None
|
||||
if isinstance(self.target_modules, str) and self.layers_to_transform is not None:
|
||||
raise ValueError("`layers_to_transform` cannot be used when `target_modules` is a str.")
|
||||
|
||||
# if target_modules is a regex expression, then layers_pattern should be None
|
||||
if isinstance(self.target_modules, str) and self.layers_pattern is not None:
|
||||
raise ValueError("`layers_pattern` cannot be used when `target_modules` is a str.")
|
||||
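A short, hedged usage example of the options documented above (all values are illustrative; the module names reuse the `model.decoder.layers...` key from the help strings):

```python
from peft import LoraConfig

# rank_pattern / alpha_pattern override the default r / lora_alpha for modules whose
# names match the given keys or regexes.
config = LoraConfig(
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    target_modules=["k_proj", "q_proj", "v_proj"],
    rank_pattern={"model.decoder.layers.0.encoder_attn.k_proj": 16},
    alpha_pattern={"model.decoder.layers.0.encoder_attn.k_proj": 32},
)

# layers_to_transform is only allowed together with a list of target modules; combining
# it with a regex string triggers the check in __post_init__ above.
try:
    LoraConfig(target_modules=r".*(q|v)_proj$", layers_to_transform=[0, 1])
except ValueError as err:
    print(err)  # "`layers_to_transform` cannot be used when `target_modules` is a str."
```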
72
src/peft/tuners/lora/gptq.py
Normal file
@ -0,0 +1,72 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2023-present the HuggingFace Inc. team.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import torch
|
||||
|
||||
from peft.tuners.lora.layer import LoraLayer
|
||||
|
||||
|
||||
class QuantLinear(torch.nn.Module, LoraLayer):
|
||||
def __init__(
|
||||
self,
|
||||
adapter_name,
|
||||
quant_linear_module,
|
||||
r: int = 0,
|
||||
lora_alpha: int = 1,
|
||||
lora_dropout: float = 0.0,
|
||||
**kwargs,
|
||||
):
|
||||
torch.nn.Module.__init__(self)
|
||||
LoraLayer.__init__(
|
||||
self, in_features=quant_linear_module.infeatures, out_features=quant_linear_module.outfeatures
|
||||
)
|
||||
self.quant_linear_module = quant_linear_module
|
||||
self.weight = quant_linear_module.qweight
|
||||
init_lora_weights = kwargs.pop("init_lora_weights", True)
|
||||
self.update_layer(adapter_name, r, lora_alpha, lora_dropout, init_lora_weights)
|
||||
self.set_adapter(adapter_name)
|
||||
|
||||
def forward(self, x: torch.Tensor):
|
||||
# note: logic differs from default Linear because merging is not supported
|
||||
result = self.quant_linear_module(x)
|
||||
|
||||
if self.disable_adapters:
|
||||
return result
|
||||
|
||||
for active_adapter in self.active_adapters:
|
||||
if active_adapter not in self.lora_A.keys():
|
||||
continue
|
||||
lora_A = self.lora_A[active_adapter]
|
||||
lora_B = self.lora_B[active_adapter]
|
||||
dropout = self.lora_dropout[active_adapter]
|
||||
scaling = self.scaling[active_adapter]
|
||||
|
||||
requires_conversion = not torch.is_autocast_enabled()
|
||||
if requires_conversion:
|
||||
expected_dtype = result.dtype
|
||||
x = x.to(lora_A.weight.dtype)
|
||||
|
||||
output = lora_B(lora_A(dropout(x)))
|
||||
if requires_conversion:
|
||||
output = output.to(expected_dtype)
|
||||
output = output * scaling
|
||||
result += output
|
||||
return result
|
||||
|
||||
# TODO: Check if it is better as suggested by users https://github.com/PanQiWei/AutoGPTQ/pull/102
|
||||
# def reset_lora_parameters(self, adapter_name):
|
||||
# if adapter_name in self.lora_A.keys():
|
||||
# torch.nn.init.xavier_uniform_(self.lora_A[adapter_name].weight)
|
||||
# torch.nn.init.zeros_(self.lora_B[adapter_name].weight)
|
||||
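Because merging is not supported for the GPTQ path, the output is always the frozen quantized output plus the scaled low-rank term, `result + lora_B(lora_A(dropout(x))) * scaling`. A plain-`torch` sketch of that composition, with an ordinary `nn.Linear` standing in for the quantized module (a stand-in assumption; the dtype/autocast handling above is omitted):

```python
import torch
import torch.nn as nn

torch.manual_seed(0)
base = nn.Linear(64, 64, bias=False)    # stand-in for the frozen quant_linear_module
lora_A = nn.Linear(64, 8, bias=False)   # r = 8
lora_B = nn.Linear(8, 64, bias=False)
dropout = nn.Identity()                 # lora_dropout == 0.0
scaling = 16 / 8                        # lora_alpha / r

nn.init.zeros_(lora_B.weight)           # default LoRA init: B starts at zero

x = torch.randn(4, 64)
result = base(x)                        # quant_linear_module(x) in the real layer
result = result + lora_B(lora_A(dropout(x))) * scaling
assert torch.allclose(result, base(x))  # zero B => the adapter is initially a no-op
```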
605
src/peft/tuners/lora/layer.py
Normal file
@ -0,0 +1,605 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2023-present the HuggingFace Inc. team.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import math
|
||||
import warnings
|
||||
from typing import Optional, Tuple, Union
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
|
||||
from peft.tuners.tuners_utils import BaseTunerLayer
|
||||
from peft.utils.other import transpose
|
||||
|
||||
|
||||
class LoraLayer(BaseTunerLayer):
|
||||
# All names of layers that may contain (trainable) adapter weights
|
||||
adapter_layer_names = ("lora_A", "lora_B", "lora_embedding_A", "lora_embedding_B")
|
||||
# All names of other parameters that may contain adapter-related parameters
|
||||
other_param_names = ("r", "lora_alpha", "scaling", "lora_dropout")
|
||||
|
||||
def __init__(self, in_features: int, out_features: int, **kwargs):
|
||||
self.r = {}
|
||||
self.lora_alpha = {}
|
||||
self.scaling = {}
|
||||
self.lora_dropout = nn.ModuleDict({})
|
||||
self.lora_A = nn.ModuleDict({})
|
||||
self.lora_B = nn.ModuleDict({})
|
||||
# For Embedding layer
|
||||
self.lora_embedding_A = nn.ParameterDict({})
|
||||
self.lora_embedding_B = nn.ParameterDict({})
|
||||
# Mark the weight as unmerged
|
||||
self._disable_adapters = False
|
||||
self.merged_adapters = []
|
||||
self.in_features = in_features
|
||||
self.out_features = out_features
|
||||
self.kwargs = kwargs
|
||||
|
||||
def _init_empty_weights(self, cls, *args, **kwargs) -> None:
|
||||
# A helper method that allows initializing the layer of the given class without spending time initializing the
|
||||
# model weights. The implementation is inspired by
|
||||
# https://pytorch.org/docs/stable/generated/torch.nn.utils.skip_init.html but this function cannot be used
|
||||
# directly.
|
||||
# Instead of this approach, it would be possible to bypass the __init__ of the class but that runs the risk of
|
||||
# omitting important logic inside that __init__.
|
||||
kwargs = kwargs.copy()
|
||||
final_device = kwargs.pop("device", "cpu")
|
||||
cls.__init__(self, *args, device="meta", **kwargs)
|
||||
self.to_empty(device=final_device)
|
||||
|
||||
def update_layer(self, adapter_name, r, lora_alpha, lora_dropout, init_lora_weights):
|
||||
if r <= 0:
|
||||
raise ValueError(f"`r` should be a positive integer value but the value passed is {r}")
|
||||
self.r[adapter_name] = r
|
||||
self.lora_alpha[adapter_name] = lora_alpha
|
||||
if lora_dropout > 0.0:
|
||||
lora_dropout_layer = nn.Dropout(p=lora_dropout)
|
||||
else:
|
||||
lora_dropout_layer = nn.Identity()
|
||||
|
||||
self.lora_dropout.update(nn.ModuleDict({adapter_name: lora_dropout_layer}))
|
||||
# Actual trainable parameters
|
||||
if r > 0:
|
||||
self.lora_A[adapter_name] = nn.Linear(self.in_features, r, bias=False)
|
||||
self.lora_B[adapter_name] = nn.Linear(r, self.out_features, bias=False)
|
||||
self.scaling[adapter_name] = lora_alpha / r
|
||||
if init_lora_weights:
|
||||
self.reset_lora_parameters(adapter_name)
|
||||
|
||||
weight = getattr(self, "weight", None)
|
||||
if weight is not None:
|
||||
# the layer is already completely initialized, this is an update
|
||||
if weight.dtype.is_floating_point or weight.dtype.is_complex:
|
||||
self.to(weight.device, dtype=weight.dtype)
|
||||
else:
|
||||
self.to(weight.device)
|
||||
self.set_adapter(self.active_adapters)
|
||||
|
||||
def update_layer_conv2d(self, adapter_name, r, lora_alpha, lora_dropout, init_lora_weights):
|
||||
if r <= 0:
|
||||
raise ValueError(f"`r` should be a positive integer value but the value passed is {r}")
|
||||
self.r[adapter_name] = r
|
||||
self.lora_alpha[adapter_name] = lora_alpha
|
||||
if lora_dropout > 0.0:
|
||||
lora_dropout_layer = nn.Dropout(p=lora_dropout)
|
||||
else:
|
||||
lora_dropout_layer = nn.Identity()
|
||||
|
||||
self.lora_dropout[adapter_name] = lora_dropout_layer
|
||||
# Actual trainable parameters
|
||||
if r > 0:
|
||||
kernel_size = self.kwargs["kernel_size"]
|
||||
stride = self.kwargs["stride"]
|
||||
padding = self.kwargs["padding"]
|
||||
self.lora_A[adapter_name] = nn.Conv2d(self.in_features, r, kernel_size, stride, padding, bias=False)
|
||||
self.lora_B[adapter_name] = nn.Conv2d(r, self.out_features, (1, 1), (1, 1), bias=False)
|
||||
self.scaling[adapter_name] = lora_alpha / r
|
||||
if init_lora_weights:
|
||||
self.reset_lora_parameters(adapter_name)
|
||||
|
||||
weight = getattr(self, "weight", None)
|
||||
if weight is not None:
|
||||
# the layer is already completely initialized, this is an update
|
||||
self.to(self.weight.device, dtype=weight.dtype)
|
||||
|
||||
def update_layer_embedding(self, adapter_name, r, lora_alpha, lora_dropout, init_lora_weights):
|
||||
if r <= 0:
|
||||
raise ValueError(f"`r` should be a positive integer value but the value passed is {r}")
|
||||
self.r[adapter_name] = r
|
||||
self.lora_alpha[adapter_name] = lora_alpha
|
||||
if lora_dropout > 0.0:
|
||||
lora_dropout_layer = nn.Dropout(p=lora_dropout)
|
||||
else:
|
||||
lora_dropout_layer = nn.Identity()
|
||||
|
||||
self.lora_dropout[adapter_name] = lora_dropout_layer
|
||||
# Actual trainable parameters
|
||||
if r > 0:
|
||||
weight_A = torch.randn((r, self.in_features))
|
||||
weight_B = torch.randn((self.out_features, r))
|
||||
self.lora_embedding_A[adapter_name] = nn.Parameter(weight_A)
|
||||
self.lora_embedding_B[adapter_name] = nn.Parameter(weight_B)
|
||||
self.scaling[adapter_name] = lora_alpha / r
|
||||
if init_lora_weights:
|
||||
self.reset_lora_parameters(adapter_name)
|
||||
|
||||
weight = getattr(self, "weight", None)
|
||||
if weight is not None:
|
||||
# the layer is already completely initialized, this is an update
|
||||
self.to(self.weight.device, dtype=weight.dtype)
|
||||
|
||||
def reset_lora_parameters(self, adapter_name):
|
||||
if adapter_name in self.lora_A.keys():
|
||||
# initialize A the same way as the default for nn.Linear and B to zero
|
||||
nn.init.kaiming_uniform_(self.lora_A[adapter_name].weight, a=math.sqrt(5))
|
||||
nn.init.zeros_(self.lora_B[adapter_name].weight)
|
||||
if adapter_name in self.lora_embedding_A.keys():
|
||||
# initialize A to zeros and B with a normal distribution (note: reversed compared to the Linear case)
|
||||
nn.init.zeros_(self.lora_embedding_A[adapter_name])
|
||||
nn.init.normal_(self.lora_embedding_B[adapter_name])
|
||||
|
||||
def set_scale(self, adapter, scale):
|
||||
if adapter not in self.scaling:
|
||||
# Ignore the case where the adapter is not in the layer
|
||||
return
|
||||
self.scaling[adapter] = scale * self.lora_alpha[adapter] / self.r[adapter]
|
||||
|
||||
def scale_layer(self, scale: float) -> None:
|
||||
if scale == 1:
|
||||
return
|
||||
|
||||
for active_adapter in self.active_adapters:
|
||||
if active_adapter not in self.lora_A.keys():
|
||||
continue
|
||||
|
||||
self.scaling[active_adapter] *= scale
|
||||
|
||||
def unscale_layer(self, scale=None) -> None:
|
||||
for active_adapter in self.active_adapters:
|
||||
if active_adapter not in self.lora_A.keys():
|
||||
continue
|
||||
|
||||
if scale is None:
|
||||
self.scaling[active_adapter] = self.lora_alpha[active_adapter] / self.r[active_adapter]
|
||||
else:
|
||||
self.scaling[active_adapter] /= scale
|
||||
|
||||
|
||||
# Below code is based on https://github.com/microsoft/LoRA/blob/main/loralib/layers.py
|
||||
# and modified to work with PyTorch FSDP
|
||||
|
||||
|
||||
# ------------------------------------------------------------------------------------------
|
||||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License (MIT). See LICENSE in the repo root for license information.
|
||||
# ------------------------------------------------------------------------------------------
|
||||
|
||||
|
||||
class Linear(nn.Linear, LoraLayer):
|
||||
# Lora implemented in a dense layer
|
||||
def __init__(
|
||||
self,
|
||||
adapter_name: str,
|
||||
in_features: int,
|
||||
out_features: int,
|
||||
r: int = 0,
|
||||
lora_alpha: int = 1,
|
||||
lora_dropout: float = 0.0,
|
||||
fan_in_fan_out: bool = False, # Set this to True if the layer to replace stores weight like (fan_in, fan_out)
|
||||
is_target_conv_1d_layer: bool = False,
|
||||
**kwargs,
|
||||
) -> None:
|
||||
init_lora_weights = kwargs.pop("init_lora_weights", True)
|
||||
# this gets the init from nn.Linear's super perspective, i.e.
|
||||
# nn.Module.__init__, which should always be called
|
||||
super(nn.Linear, self).__init__()
|
||||
# Note that we don't use self._init_empty_weights() for Linear because it is a bit slower and the benefit of
|
||||
# added robustness is not big enough for Linear.
|
||||
|
||||
LoraLayer.__init__(self, in_features=in_features, out_features=out_features)
|
||||
# Freezing the pre-trained weight matrix
|
||||
|
||||
self.fan_in_fan_out = fan_in_fan_out
|
||||
|
||||
self.update_layer(adapter_name, r, lora_alpha, lora_dropout, init_lora_weights)
|
||||
self.is_target_conv_1d_layer = is_target_conv_1d_layer
|
||||
self.set_adapter(adapter_name)
|
||||
|
||||
def merge(self, safe_merge: bool = False) -> None:
|
||||
"""
|
||||
Merge the active adapter weights into the base weights
|
||||
|
||||
Args:
|
||||
safe_merge (`bool`, *optional*):
|
||||
If True, the merge operation will be performed in a copy of the original weights and check for NaNs
|
||||
before merging the weights. This is useful if you want to check if the merge operation will produce
|
||||
NaNs. Defaults to `False`.
|
||||
"""
|
||||
if self.merged:
|
||||
warnings.warn(
|
||||
f"Already following adapters were merged {','.join(self.merged_adapters)}. "
|
||||
f"You are now additionally merging {','.join(self.active_adapters)}."
|
||||
)
|
||||
for active_adapter in self.active_adapters:
|
||||
if active_adapter in self.lora_A.keys():
|
||||
if safe_merge:
|
||||
# Note that safe_merge will be slower than the normal merge
|
||||
# because of the copy operation.
|
||||
orig_weights = self.weight.data.clone()
|
||||
orig_weights += self.get_delta_weight(active_adapter)
|
||||
|
||||
if not torch.isfinite(orig_weights).all():
|
||||
raise ValueError(
|
||||
f"NaNs detected in the merged weights. The adapter {active_adapter} seems to be broken"
|
||||
)
|
||||
|
||||
self.weight.data = orig_weights
|
||||
else:
|
||||
self.weight.data += self.get_delta_weight(active_adapter)
|
||||
self.merged_adapters.append(active_adapter)
|
||||
|
||||
def unmerge(self) -> None:
|
||||
if not self.merged:
|
||||
warnings.warn("Already unmerged. Nothing to do.")
|
||||
return
|
||||
while len(self.merged_adapters) > 0:
|
||||
active_adapter = self.merged_adapters.pop()
|
||||
if active_adapter in self.lora_A.keys():
|
||||
self.weight.data -= self.get_delta_weight(active_adapter)
|
||||
|
||||
def get_delta_weight(self, adapter) -> torch.Tensor:
|
||||
"""
|
||||
Compute the delta weight for the given adapter.
|
||||
|
||||
Args:
|
||||
adapter (str):
|
||||
The name of the adapter for which the delta weight should be computed.
|
||||
"""
|
||||
device = self.lora_B[adapter].weight.device
|
||||
dtype = self.lora_B[adapter].weight.dtype
|
||||
|
||||
# In case users want to merge the adapter weights that are in
|
||||
# float16 while being on CPU, we need to cast the weights to float32, perform the merge and then cast back to
|
||||
# float16 because the `@` and matmul operation in general is not supported in torch + cpu + fp16.
|
||||
cast_to_fp32 = device.type == "cpu" and dtype == torch.float16
|
||||
|
||||
weight_A = self.lora_A[adapter].weight
|
||||
weight_B = self.lora_B[adapter].weight
|
||||
|
||||
if cast_to_fp32:
|
||||
weight_A = weight_A.float()
|
||||
weight_B = weight_B.float()
|
||||
|
||||
output_tensor = transpose(weight_B @ weight_A, self.fan_in_fan_out) * self.scaling[adapter]
|
||||
|
||||
if cast_to_fp32:
|
||||
output_tensor = output_tensor.to(dtype=dtype)
|
||||
|
||||
# cast back the weights
|
||||
self.lora_A[adapter].weight.data = weight_A.to(dtype)
|
||||
self.lora_B[adapter].weight.data = weight_B.to(dtype)
|
||||
|
||||
return output_tensor
|
||||
|
||||
def _linear(self, input: torch.Tensor) -> torch.Tensor:
|
||||
return F.linear(input, transpose(self.weight, self.fan_in_fan_out), bias=self.bias)
|
||||
|
||||
def forward(self, x: torch.Tensor) -> torch.Tensor:
|
||||
previous_dtype = x.dtype
|
||||
|
||||
if self.disable_adapters:
|
||||
if self.merged:
|
||||
self.unmerge()
|
||||
result = self._linear(x)
|
||||
elif self.merged:
|
||||
result = self._linear(x)
|
||||
else:
|
||||
result = self._linear(x)
|
||||
for active_adapter in self.active_adapters:
|
||||
if active_adapter not in self.lora_A.keys():
|
||||
continue
|
||||
lora_A = self.lora_A[active_adapter]
|
||||
lora_B = self.lora_B[active_adapter]
|
||||
dropout = self.lora_dropout[active_adapter]
|
||||
scaling = self.scaling[active_adapter]
|
||||
x = x.to(lora_A.weight.dtype)
|
||||
result += lora_B(lora_A(dropout(x))) * scaling
|
||||
|
||||
result = result.to(previous_dtype)
|
||||
return result
|
||||
|
||||
|
||||
class Embedding(nn.Embedding, LoraLayer):
|
||||
# LoRA implemented in a Embedding layer
|
||||
def __init__(
|
||||
self,
|
||||
adapter_name: str,
|
||||
num_embeddings: int,
|
||||
embedding_dim: int,
|
||||
r: int = 0,
|
||||
lora_alpha: int = 1,
|
||||
lora_dropout: float = 0.0,
|
||||
**kwargs,
|
||||
) -> None:
|
||||
init_lora_weights = kwargs.pop("init_lora_weights", True)
|
||||
self._init_empty_weights(nn.Embedding, num_embeddings, embedding_dim, **kwargs)
|
||||
LoraLayer.__init__(self, in_features=num_embeddings, out_features=embedding_dim)
|
||||
self.update_layer_embedding(adapter_name, r, lora_alpha, lora_dropout, init_lora_weights)
|
||||
self.set_adapter(adapter_name)
|
||||
|
||||
def merge(self, safe_merge: bool = False) -> None:
|
||||
"""
|
||||
Merge the active adapter weights into the base weights
|
||||
|
||||
Args:
|
||||
safe_merge (`bool`, *optional*):
|
||||
If True, the merge operation will be performed in a copy of the original weights and check for NaNs
|
||||
before merging the weights. This is useful if you want to check if the merge operation will produce
|
||||
NaNs. Defaults to `False`.
|
||||
"""
|
||||
if self.merged:
|
||||
warnings.warn(
|
||||
f"Already following adapters were merged {','.join(self.merged_adapters)}. "
|
||||
f"You are now additionally merging {','.join(self.active_adapters)}."
|
||||
)
|
||||
for active_adapter in self.active_adapters:
|
||||
if active_adapter in self.lora_embedding_A.keys():
|
||||
if safe_merge:
|
||||
# Note that safe_merge will be slower than the normal merge
|
||||
# because of the copy operation.
|
||||
orig_weights = self.weight.data.clone()
|
||||
orig_weights += self.get_delta_weight(active_adapter)
|
||||
|
||||
if not torch.isfinite(orig_weights).all():
|
||||
raise ValueError(
|
||||
f"NaNs detected in the merged weights. The adapter {active_adapter} seems to be broken"
|
||||
)
|
||||
|
||||
self.weight.data = orig_weights
|
||||
else:
|
||||
self.weight.data += self.get_delta_weight(active_adapter)
|
||||
self.merged_adapters.append(active_adapter)
|
||||
|
||||
def unmerge(self) -> None:
|
||||
if not self.merged:
|
||||
warnings.warn("Already unmerged. Nothing to do.")
|
||||
return
|
||||
while len(self.merged_adapters) > 0:
|
||||
active_adapter = self.merged_adapters.pop()
|
||||
if active_adapter in self.lora_embedding_A.keys():
|
||||
self.weight.data -= self.get_delta_weight(active_adapter)
|
||||
|
||||
def get_delta_weight(self, adapter) -> torch.Tensor:
|
||||
"""
|
||||
Compute the delta weight for the given adapter.
|
||||
|
||||
Args:
|
||||
adapter (str):
|
||||
The name of the adapter for which the delta weight should be computed.
|
||||
"""
|
||||
device = self.lora_embedding_B[adapter].device
|
||||
dtype = self.lora_embedding_A[adapter].dtype
|
||||
|
||||
# In case users want to merge the adapter weights that are in
|
||||
# float16 while being on CPU, we need to cast the weights to float32, perform the merge and then cast back to
|
||||
# float16 because the `@` and matmul operation in general is not supported in torch + cpu + fp16.
|
||||
cast_to_fp32 = device.type == "cpu" and dtype == torch.float16
|
||||
|
||||
weight_A = self.lora_embedding_A[adapter]
|
||||
weight_B = self.lora_embedding_B[adapter]
|
||||
|
||||
if cast_to_fp32:
|
||||
weight_A = weight_A.float()
|
||||
weight_B = weight_B.float()
|
||||
|
||||
output_tensor = transpose(weight_B @ weight_A, True) * self.scaling[adapter]
|
||||
|
||||
if cast_to_fp32:
|
||||
output_tensor = output_tensor.to(dtype=dtype)
|
||||
|
||||
# cast back the weights
|
||||
self.lora_embedding_A[adapter] = weight_A.to(dtype)
|
||||
self.lora_embedding_B[adapter] = weight_B.to(dtype)
|
||||
|
||||
return output_tensor
|
||||
|
||||
def _embed(self, input: torch.Tensor, weight: Optional[torch.Tensor] = None) -> torch.Tensor:
|
||||
weight = self.weight if weight is None else weight
|
||||
return F.embedding(
|
||||
input,
|
||||
weight,
|
||||
padding_idx=self.padding_idx,
|
||||
max_norm=self.max_norm,
|
||||
norm_type=self.norm_type,
|
||||
scale_grad_by_freq=self.scale_grad_by_freq,
|
||||
sparse=self.sparse,
|
||||
)
|
||||
|
||||
def forward(self, x: torch.Tensor) -> torch.Tensor:
|
||||
# TODO: no dtype conversion here, unlike in Linear, is that correct?
|
||||
if self.disable_adapters:
|
||||
if self.merged:
|
||||
self.unmerge()
|
||||
result = self._embed(x)
|
||||
elif self.merged:
|
||||
result = self._embed(x)
|
||||
else:
|
||||
result = self._embed(x)
|
||||
for active_adapter in self.active_adapters:
|
||||
if active_adapter not in self.lora_embedding_A:
|
||||
continue
|
||||
embedding_A = self.lora_embedding_A[active_adapter].T
|
||||
embedding_B = self.lora_embedding_B[active_adapter].T
|
||||
scaling = self.scaling[active_adapter]
|
||||
after_A = self._embed(x, embedding_A)
|
||||
result += (after_A @ embedding_B) * scaling
|
||||
|
||||
return result
|
||||
|
||||
|
||||
class Conv2d(nn.Conv2d, LoraLayer):
|
||||
# Lora implemented in a conv2d layer
|
||||
def __init__(
|
||||
self,
|
||||
adapter_name: str,
|
||||
in_channels: int,
|
||||
out_channels: int,
|
||||
kernel_size: Union[int, Tuple[int]],
|
||||
stride: Union[int, Tuple[int]] = 1,
|
||||
padding: Union[int, Tuple[int]] = 0,
|
||||
r: int = 0,
|
||||
lora_alpha: int = 1,
|
||||
lora_dropout: float = 0.0,
|
||||
**kwargs,
|
||||
) -> None:
|
||||
init_lora_weights = kwargs.pop("init_lora_weights", True)
|
||||
self._init_empty_weights(nn.Conv2d, in_channels, out_channels, kernel_size, stride=stride, padding=padding)
|
||||
|
||||
LoraLayer.__init__(
|
||||
self,
|
||||
in_features=in_channels,
|
||||
out_features=out_channels,
|
||||
kernel_size=kernel_size,
|
||||
stride=stride,
|
||||
padding=padding,
|
||||
)
|
||||
|
||||
self.update_layer_conv2d(adapter_name, r, lora_alpha, lora_dropout, init_lora_weights)
|
||||
self.set_adapter(adapter_name)
|
||||
|
||||
def merge(self, safe_merge: bool = False) -> None:
|
||||
"""
|
||||
Merge the active adapter weights inside the base weights
|
||||
|
||||
Args:
|
||||
safe_merge (`bool`, *optional*):
|
||||
If True, the merge operation will be performed in a copy of the original weights and check for NaNs
|
||||
before merging the weights. This is useful if you want to check if the merge operation will produce
|
||||
NaNs. Defaults to `False`.
|
||||
"""
|
||||
if self.merged:
|
||||
warnings.warn(
|
||||
f"Already following adapters were merged {','.join(self.merged_adapters)}. "
|
||||
f"You are now additionally merging {','.join(self.active_adapters)}."
|
||||
)
|
||||
for active_adapter in self.active_adapters:
|
||||
if active_adapter in self.lora_A.keys():
|
||||
if safe_merge:
|
||||
# Note that safe_merge will be slower than the normal merge
|
||||
# because of the copy operation.
|
||||
orig_weights = self.weight.data.clone()
|
||||
orig_weights += self.get_delta_weight(active_adapter)
|
||||
|
||||
if not torch.isfinite(orig_weights).all():
|
||||
raise ValueError(
|
||||
f"NaNs detected in the merged weights. The adapter {active_adapter} seems to be broken"
|
||||
)
|
||||
self.weight.data = orig_weights
|
||||
else:
|
||||
self.weight.data += self.get_delta_weight(active_adapter)
|
||||
self.merged_adapters.append(active_adapter)
|
||||
|
||||
def unmerge(self) -> None:
|
||||
if not self.merged:
|
||||
warnings.warn("Already unmerged. Nothing to do.")
|
||||
return
|
||||
while len(self.merged_adapters) > 0:
|
||||
active_adapter = self.merged_adapters.pop()
|
||||
if active_adapter in self.lora_A.keys():
|
||||
self.weight.data -= self.get_delta_weight(active_adapter)
|
||||
|
||||
def get_delta_weight(self, adapter) -> torch.Tensor:
|
||||
"""
|
||||
Compute the delta weight for the given adapter.
|
||||
|
||||
Args:
|
||||
adapter (str):
|
||||
The name of the adapter for which the delta weight should be computed.
|
||||
"""
|
||||
device = self.lora_B[adapter].weight.device
|
||||
dtype = self.lora_A[adapter].weight.dtype
|
||||
|
||||
# In case users want to merge the adapter weights that are in
|
||||
# float16 while being on CPU, we need to cast the weights to float32, perform the merge and then cast back to
|
||||
# float16 because the `@` operator and matmul in general are not supported in torch + cpu + fp16.
|
||||
cast_to_fp32 = device.type == "cpu" and dtype == torch.float16
|
||||
|
||||
weight_A = self.lora_A[adapter].weight
|
||||
weight_B = self.lora_B[adapter].weight
|
||||
|
||||
if cast_to_fp32:
|
||||
weight_A = weight_A.float()
|
||||
weight_B = weight_B.float()
|
||||
|
||||
# https://github.com/bmaltais/kohya_ss/blob/feb6728762a8f463d15ba936d189d4c3abfaa1ab/networks/lora.py#L117
|
||||
if self.weight.size()[2:4] == (1, 1):
|
||||
# conv2d 1x1
|
||||
output_tensor = (weight_B.squeeze(3).squeeze(2) @ weight_A.squeeze(3).squeeze(2)).unsqueeze(2).unsqueeze(
|
||||
3
|
||||
) * self.scaling[adapter]
|
||||
else:
|
||||
# conv2d 3x3
|
||||
output_tensor = (
|
||||
F.conv2d(
|
||||
weight_A.permute(1, 0, 2, 3),
|
||||
weight_B,
|
||||
).permute(1, 0, 2, 3)
|
||||
* self.scaling[adapter]
|
||||
)
|
||||
|
||||
if cast_to_fp32:
|
||||
output_tensor = output_tensor.to(dtype=dtype)
|
||||
|
||||
# cast back the weights
|
||||
self.lora_A[adapter].weight.data = weight_A.to(dtype)
|
||||
self.lora_B[adapter].weight.data = weight_B.to(dtype)
|
||||
|
||||
return output_tensor
|
||||
|
||||
def _conv2d(self, input: torch.Tensor) -> torch.Tensor:
|
||||
return F.conv2d(
|
||||
input,
|
||||
self.weight,
|
||||
bias=self.bias,
|
||||
stride=self.stride,
|
||||
padding=self.padding,
|
||||
dilation=self.dilation,
|
||||
groups=self.groups,
|
||||
)
|
||||
|
||||
def forward(self, x: torch.Tensor) -> torch.Tensor:
|
||||
previous_dtype = x.dtype
|
||||
|
||||
if self.disable_adapters:
|
||||
if self.merged:
|
||||
self.unmerge()
|
||||
result = self._conv2d(x)
|
||||
elif self.merged:
|
||||
result = self._conv2d(x)
|
||||
else:
|
||||
result = self._conv2d(x)
|
||||
for active_adapter in self.active_adapters:
|
||||
if active_adapter not in self.lora_A.keys():
|
||||
continue
|
||||
lora_A = self.lora_A[active_adapter]
|
||||
lora_B = self.lora_B[active_adapter]
|
||||
dropout = self.lora_dropout[active_adapter]
|
||||
scaling = self.scaling[active_adapter]
|
||||
x = x.to(lora_A.weight.dtype)
|
||||
result += lora_B(lora_A(dropout(x))) * scaling
|
||||
|
||||
result = result.to(previous_dtype)
|
||||
return result
|
||||
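The `merge`, `unmerge`, and `get_delta_weight` methods above all rely on the same identity: the adapter's contribution can be folded into the base weight as `delta = B @ A * scaling` and removed again by subtracting that same delta. The following standalone sketch (illustrative only, not part of the diff; all shapes and values are invented) checks this identity for a plain linear layer:

```py
import torch

torch.manual_seed(0)
in_features, out_features, r, lora_alpha = 8, 4, 2, 16
scaling = lora_alpha / r

base = torch.randn(out_features, in_features)   # frozen base weight
lora_A = 0.01 * torch.randn(r, in_features)     # down-projection
lora_B = 0.01 * torch.randn(out_features, r)    # up-projection

delta = (lora_B @ lora_A) * scaling             # what get_delta_weight computes

x = torch.randn(3, in_features)
unmerged = x @ base.T + ((x @ lora_A.T) @ lora_B.T) * scaling  # adapter applied on the fly
merged = x @ (base + delta).T                                  # adapter folded into the weight
assert torch.allclose(unmerged, merged, atol=1e-5)

# unmerge: subtracting the same delta restores the original base weight
assert torch.allclose((base + delta) - delta, base, atol=1e-6)
```

The Embedding variant above only differs by the extra `transpose(..., True)`, and the Conv2d variant by expressing the same low-rank product as a convolution between the two adapter kernels.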
705
src/peft/tuners/lora/model.py
Normal file
705
src/peft/tuners/lora/model.py
Normal file
@ -0,0 +1,705 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2023-present the HuggingFace Inc. team.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import operator
|
||||
import re
|
||||
import warnings
|
||||
from dataclasses import asdict, replace
|
||||
from enum import Enum
|
||||
from functools import reduce
|
||||
from itertools import chain
|
||||
|
||||
import torch
|
||||
from torch import nn
|
||||
from tqdm import tqdm
|
||||
from transformers.pytorch_utils import Conv1D
|
||||
|
||||
from peft.import_utils import is_bnb_4bit_available, is_bnb_available
|
||||
from peft.tuners.tuners_utils import BaseTuner, BaseTunerLayer, check_target_module_exists
|
||||
from peft.utils import (
|
||||
TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING,
|
||||
ModulesToSaveWrapper,
|
||||
_freeze_adapter,
|
||||
_get_submodules,
|
||||
get_auto_gptq_quant_linear,
|
||||
get_quantization_config,
|
||||
)
|
||||
|
||||
from .config import LoraConfig
|
||||
from .gptq import QuantLinear
|
||||
from .layer import Conv2d, Embedding, Linear, LoraLayer
|
||||
|
||||
|
||||
if is_bnb_available():
|
||||
import bitsandbytes as bnb
|
||||
|
||||
from .bnb import Linear8bitLt
|
||||
|
||||
if is_bnb_4bit_available():
|
||||
from .bnb import Linear4bit
|
||||
|
||||
|
||||
class LoraModel(BaseTuner):
|
||||
"""
|
||||
Creates Low Rank Adapter (Lora) model from a pretrained transformers model.
|
||||
|
||||
Args:
|
||||
model ([`~transformers.PreTrainedModel`]): The model to be adapted.
|
||||
config ([`LoraConfig`]): The configuration of the Lora model.
|
||||
adapter_name (`str`): The name of the adapter, defaults to `"default"`.
|
||||
|
||||
Returns:
|
||||
`torch.nn.Module`: The Lora model.
|
||||
|
||||
Example:
|
||||
|
||||
```py
|
||||
>>> from transformers import AutoModelForSeq2SeqLM
|
||||
>>> from peft import LoraModel, LoraConfig
|
||||
|
||||
>>> config = LoraConfig(
|
||||
... task_type="SEQ_2_SEQ_LM",
|
||||
... r=8,
|
||||
... lora_alpha=32,
|
||||
... target_modules=["q", "v"],
|
||||
... lora_dropout=0.01,
|
||||
... )
|
||||
|
||||
>>> model = AutoModelForSeq2SeqLM.from_pretrained("t5-base")
|
||||
>>> lora_model = LoraModel(model, config, "default")
|
||||
```
|
||||
|
||||
```py
|
||||
>>> import transformers
|
||||
>>> from peft import LoraConfig, PeftModel, get_peft_model, prepare_model_for_int8_training
|
||||
|
||||
>>> target_modules = ["q_proj", "k_proj", "v_proj", "out_proj", "fc_in", "fc_out", "wte"]
|
||||
>>> config = LoraConfig(
|
||||
... r=4, lora_alpha=16, target_modules=target_modules, lora_dropout=0.1, bias="none", task_type="CAUSAL_LM"
|
||||
... )
|
||||
|
||||
>>> model = transformers.GPTJForCausalLM.from_pretrained(
|
||||
... "kakaobrain/kogpt",
|
||||
... revision="KoGPT6B-ryan1.5b-float16", # or float32 version: revision=KoGPT6B-ryan1.5b
|
||||
... pad_token_id=tokenizer.eos_token_id,
|
||||
... use_cache=False,
|
||||
... device_map={"": rank},
|
||||
... torch_dtype=torch.float16,
|
||||
... load_in_8bit=True,
|
||||
... )
|
||||
>>> model = prepare_model_for_int8_training(model)
|
||||
>>> lora_model = get_peft_model(model, config)
|
||||
```
|
||||
|
||||
**Attributes**:
|
||||
- **model** ([`~transformers.PreTrainedModel`]) -- The model to be adapted.
|
||||
- **peft_config** ([`LoraConfig`]): The configuration of the Lora model.
|
||||
"""
|
||||
|
||||
def __init__(self, model, config, adapter_name) -> None:
|
||||
super().__init__(model, config, adapter_name)
|
||||
|
||||
def _check_new_adapter_config(self, config: LoraConfig) -> None:
|
||||
"""
|
||||
A helper method to check the config when a new adapter is being added.
|
||||
|
||||
Raise a ValueError if there is something wrong with the config or if it conflicts with existing adapters.
|
||||
|
||||
"""
|
||||
# TODO: there should be a check if any of the existing adapters actually has bias != "none", or else the check
|
||||
# does not fully correspond to the error message.
|
||||
if (len(self.peft_config) > 1) and (config.bias != "none"):
|
||||
raise ValueError(
|
||||
f"{self.__class__.__name__} supports only 1 adapter with bias. When using multiple adapters, "
|
||||
"set bias to 'none' for all adapters."
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
def _check_target_module_exists(lora_config, key):
|
||||
return check_target_module_exists(lora_config, key)
|
||||
|
||||
def _create_and_replace(
|
||||
self,
|
||||
lora_config,
|
||||
adapter_name,
|
||||
target,
|
||||
target_name,
|
||||
parent,
|
||||
current_key,
|
||||
**optional_kwargs,
|
||||
):
|
||||
if current_key is None:
|
||||
raise ValueError("Current Key shouldn't be `None`")
|
||||
# Regexp matching - Find key which matches current target_name in patterns provided
|
||||
pattern_keys = list(chain(lora_config.rank_pattern.keys(), lora_config.alpha_pattern.keys()))
|
||||
target_name_key = next(filter(lambda key: re.match(f".*\.{key}$", current_key), pattern_keys), current_key)
|
||||
|
||||
r = lora_config.rank_pattern.get(target_name_key, lora_config.r)
|
||||
alpha = lora_config.alpha_pattern.get(target_name_key, lora_config.lora_alpha)
|
||||
bias = hasattr(target, "bias") and target.bias is not None
|
||||
kwargs = {
|
||||
"r": r,
|
||||
"lora_alpha": alpha,
|
||||
"lora_dropout": lora_config.lora_dropout,
|
||||
"fan_in_fan_out": lora_config.fan_in_fan_out,
|
||||
"init_lora_weights": lora_config.init_lora_weights,
|
||||
}
|
||||
kwargs["loaded_in_8bit"] = optional_kwargs.pop("loaded_in_8bit", False)
|
||||
kwargs["loaded_in_4bit"] = optional_kwargs.pop("loaded_in_4bit", False)
|
||||
kwargs["bias"] = bias
|
||||
|
||||
quantization_config = get_quantization_config(self.model, method="gptq")
|
||||
if quantization_config is not None:
|
||||
kwargs["gptq_quantization_config"] = quantization_config
|
||||
|
||||
# TODO: better deal with that
|
||||
if isinstance(target, LoraLayer) and isinstance(target, torch.nn.Conv2d):
|
||||
target.update_layer_conv2d(
|
||||
adapter_name,
|
||||
r,
|
||||
alpha,
|
||||
lora_config.lora_dropout,
|
||||
lora_config.init_lora_weights,
|
||||
)
|
||||
elif isinstance(target, LoraLayer) and isinstance(target, torch.nn.Embedding):
|
||||
target.update_layer_embedding(
|
||||
adapter_name,
|
||||
r,
|
||||
alpha,
|
||||
lora_config.lora_dropout,
|
||||
lora_config.init_lora_weights,
|
||||
)
|
||||
|
||||
elif isinstance(target, LoraLayer):
|
||||
target.update_layer(
|
||||
adapter_name,
|
||||
r,
|
||||
alpha,
|
||||
lora_config.lora_dropout,
|
||||
lora_config.init_lora_weights,
|
||||
)
|
||||
else:
|
||||
new_module = self._create_new_module(lora_config, adapter_name, target, **kwargs)
|
||||
if adapter_name != self.active_adapter:
|
||||
# adding an additional adapter: it is not automatically trainable
|
||||
new_module.requires_grad_(False)
|
||||
self._replace_module(parent, target_name, new_module, target)
|
||||
|
||||
@staticmethod
|
||||
def _replace_module(parent, child_name, new_module, child):
|
||||
setattr(parent, child_name, new_module)
|
||||
# It's not necessary to set requires_grad here, as that is handled by
|
||||
# _mark_only_adapters_as_trainable
|
||||
|
||||
# child layer wraps the original module, unpack it
|
||||
if hasattr(child, "base_layer"):
|
||||
child = child.base_layer
|
||||
elif hasattr(child, "quant_linear_module"):
|
||||
child = child.quant_linear_module
|
||||
|
||||
# TODO: layers with base_layer don't need the weight to be copied, as they have a reference already
|
||||
if not hasattr(new_module, "base_layer"):
|
||||
new_module.weight = child.weight
|
||||
if hasattr(child, "bias"):
|
||||
new_module.bias = child.bias
|
||||
|
||||
if getattr(child, "state", None) is not None:
|
||||
if hasattr(new_module, "base_layer"):
|
||||
new_module.base_layer.state = child.state
|
||||
else:
|
||||
new_module.state = child.state
|
||||
new_module.to(child.weight.device)
|
||||
|
||||
# dispatch to correct device
|
||||
for name, module in new_module.named_modules():
|
||||
if "lora_" in name:
|
||||
module.to(child.weight.device)
|
||||
if "ranknum" in name:
|
||||
module.to(child.weight.device)
|
||||
|
||||
def _mark_only_adapters_as_trainable(self) -> None:
|
||||
for n, p in self.model.named_parameters():
|
||||
if "lora_" not in n:
|
||||
p.requires_grad = False
|
||||
|
||||
for active_adapter in self.active_adapters:
|
||||
bias = self.peft_config[active_adapter].bias
|
||||
if bias == "none":
|
||||
continue
|
||||
|
||||
if bias == "all":
|
||||
for n, p in self.model.named_parameters():
|
||||
if "bias" in n:
|
||||
p.requires_grad = True
|
||||
elif bias == "lora_only":
|
||||
for m in self.model.modules():
|
||||
if isinstance(m, LoraLayer) and hasattr(m, "bias") and m.bias is not None:
|
||||
m.bias.requires_grad = True
|
||||
else:
|
||||
raise NotImplementedError(f"Requested bias: {bias}, is not implemented.")
|
||||
|
||||
@staticmethod
|
||||
def _create_new_module(lora_config, adapter_name, target, **kwargs):
|
||||
gptq_quantization_config = kwargs.get("gptq_quantization_config", None)
|
||||
AutoGPTQQuantLinear = get_auto_gptq_quant_linear(gptq_quantization_config)
|
||||
|
||||
loaded_in_8bit = kwargs.pop("loaded_in_8bit", False)
|
||||
loaded_in_4bit = kwargs.pop("loaded_in_4bit", False)
|
||||
bias = kwargs.pop("bias", False)
|
||||
|
||||
if loaded_in_8bit and isinstance(target, bnb.nn.Linear8bitLt):
|
||||
eightbit_kwargs = kwargs.copy()
|
||||
eightbit_kwargs.update(
|
||||
{
|
||||
"has_fp16_weights": target.state.has_fp16_weights,
|
||||
"memory_efficient_backward": target.state.memory_efficient_backward,
|
||||
"threshold": target.state.threshold,
|
||||
"index": target.index,
|
||||
}
|
||||
)
|
||||
new_module = Linear8bitLt(adapter_name, target, **eightbit_kwargs)
|
||||
elif loaded_in_4bit and is_bnb_4bit_available() and isinstance(target, bnb.nn.Linear4bit):
|
||||
fourbit_kwargs = kwargs.copy()
|
||||
fourbit_kwargs.update(
|
||||
{
|
||||
"compute_dtype": target.compute_dtype,
|
||||
"compress_statistics": target.weight.compress_statistics,
|
||||
"quant_type": target.weight.quant_type,
|
||||
}
|
||||
)
|
||||
new_module = Linear4bit(adapter_name, target, **fourbit_kwargs)
|
||||
elif AutoGPTQQuantLinear is not None and isinstance(target, AutoGPTQQuantLinear):
|
||||
new_module = QuantLinear(adapter_name, target, **kwargs)
|
||||
target.weight = target.qweight
|
||||
elif isinstance(target, torch.nn.Embedding):
|
||||
embedding_kwargs = kwargs.copy()
|
||||
embedding_kwargs.pop("fan_in_fan_out", None)
|
||||
in_features, out_features = target.num_embeddings, target.embedding_dim
|
||||
new_module = Embedding(adapter_name, in_features, out_features, **embedding_kwargs)
|
||||
elif isinstance(target, torch.nn.Conv2d):
|
||||
out_channels, in_channels = target.weight.size()[:2]
|
||||
kernel_size = target.weight.size()[2:]
|
||||
stride = target.stride
|
||||
padding = target.padding
|
||||
new_module = Conv2d(adapter_name, in_channels, out_channels, kernel_size, stride, padding, **kwargs)
|
||||
else:
|
||||
if isinstance(target, torch.nn.Linear):
|
||||
in_features, out_features = target.in_features, target.out_features
|
||||
if kwargs["fan_in_fan_out"]:
|
||||
warnings.warn(
|
||||
"fan_in_fan_out is set to True but the target module is `torch.nn.Linear`. "
|
||||
"Setting fan_in_fan_out to False."
|
||||
)
|
||||
kwargs["fan_in_fan_out"] = lora_config.fan_in_fan_out = False
|
||||
elif isinstance(target, Conv1D):
|
||||
in_features, out_features = (
|
||||
target.weight.ds_shape if hasattr(target.weight, "ds_shape") else target.weight.shape
|
||||
)
|
||||
kwargs["is_target_conv_1d_layer"] = True
|
||||
if not kwargs["fan_in_fan_out"]:
|
||||
warnings.warn(
|
||||
"fan_in_fan_out is set to False but the target module is `Conv1D`. "
|
||||
"Setting fan_in_fan_out to True."
|
||||
)
|
||||
kwargs["fan_in_fan_out"] = lora_config.fan_in_fan_out = True
|
||||
else:
|
||||
raise ValueError(
|
||||
f"Target module {target} is not supported. Currently, only the following modules are supported: "
|
||||
"`torch.nn.Linear`, `torch.nn.Embedding`, `torch.nn.Conv2d`, `transformers.pytorch_utils.Conv1D`."
|
||||
)
|
||||
new_module = Linear(adapter_name, in_features, out_features, bias=bias, **kwargs)
|
||||
|
||||
return new_module
|
||||
|
||||
def __getattr__(self, name: str):
|
||||
"""Forward missing attributes to the wrapped module."""
|
||||
try:
|
||||
return super().__getattr__(name) # defer to nn.Module's logic
|
||||
except AttributeError:
|
||||
return getattr(self.model, name)
|
||||
|
||||
def get_peft_config_as_dict(self, inference: bool = False):
|
||||
config_dict = {}
|
||||
for key, value in self.peft_config.items():
|
||||
config = {k: v.value if isinstance(v, Enum) else v for k, v in asdict(value).items()}
|
||||
if inference:
|
||||
config["inference_mode"] = True
|
||||
config_dict[key] = config
|
||||
return config_dict
|
||||
|
||||
def _set_adapter_layers(self, enabled=True):
|
||||
for module in self.model.modules():
|
||||
if isinstance(module, (BaseTunerLayer, ModulesToSaveWrapper)):
|
||||
module.enable_adapters(enabled)
|
||||
|
||||
def enable_adapter_layers(self):
|
||||
self._set_adapter_layers(enabled=True)
|
||||
|
||||
def disable_adapter_layers(self):
|
||||
for active_adapter in self.active_adapters:
|
||||
val = self.peft_config[active_adapter].bias
|
||||
if val != "none":
|
||||
msg = (
|
||||
f"Careful, disabling adapter layers with bias configured to be '{val}' does not produce the same "
|
||||
"output as the the base model would without adaption."
|
||||
)
|
||||
warnings.warn(msg)
|
||||
self._set_adapter_layers(enabled=False)
|
||||
|
||||
def set_adapter(self, adapter_name):
|
||||
for module in self.model.modules():
|
||||
if isinstance(module, LoraLayer):
|
||||
if module.merged:
|
||||
warnings.warn("Adapter cannot be set when the model is merged. Unmerging the model first.")
|
||||
module.unmerge()
|
||||
module.set_adapter(adapter_name)
|
||||
self.active_adapter = adapter_name
|
||||
|
||||
@staticmethod
|
||||
def _prepare_adapter_config(peft_config, model_config):
|
||||
if peft_config.target_modules is None:
|
||||
if model_config["model_type"] not in TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING:
|
||||
raise ValueError("Please specify `target_modules` in `peft_config`")
|
||||
peft_config.target_modules = set(
|
||||
TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING[model_config["model_type"]]
|
||||
)
|
||||
return peft_config
|
||||
|
||||
def _unload_and_optionally_merge(self, merge=True, progressbar: bool = False, safe_merge: bool = False):
|
||||
if merge:
|
||||
if getattr(self.model, "quantization_method", None) == "gptq":
|
||||
raise ValueError("Cannot merge LORA layers when the model is gptq quantized")
|
||||
|
||||
key_list = [key for key, _ in self.model.named_modules() if "lora" not in key]
|
||||
desc = "Unloading " + ("and merging " if merge else "") + "model"
|
||||
for key in tqdm(key_list, disable=not progressbar, desc=desc):
|
||||
try:
|
||||
parent, target, target_name = _get_submodules(self.model, key)
|
||||
except AttributeError:
|
||||
continue
|
||||
if isinstance(target, LoraLayer):
|
||||
if isinstance(target, nn.Embedding):
|
||||
new_module = torch.nn.Embedding(target.in_features, target.out_features)
|
||||
elif isinstance(target, nn.Conv2d):
|
||||
new_module = torch.nn.Conv2d(
|
||||
target.in_channels,
|
||||
target.out_channels,
|
||||
kernel_size=target.kernel_size,
|
||||
stride=target.stride,
|
||||
padding=target.padding,
|
||||
dilation=target.dilation,
|
||||
)
|
||||
elif is_bnb_available() and isinstance(target, Linear8bitLt):
|
||||
bias = target.base_layer.bias is not None
|
||||
new_module = bnb.nn.Linear8bitLt(
|
||||
target.in_features,
|
||||
target.out_features,
|
||||
bias=bias,
|
||||
has_fp16_weights=target.base_layer.state.has_fp16_weights,
|
||||
memory_efficient_backward=target.base_layer.state.memory_efficient_backward,
|
||||
threshold=target.base_layer.state.threshold,
|
||||
index=target.base_layer.index,
|
||||
device=target.base_layer.weight.device,
|
||||
)
|
||||
elif is_bnb_4bit_available() and isinstance(target, Linear4bit):
|
||||
bias = target.base_layer.bias is not None
|
||||
new_module = bnb.nn.Linear4bit(
|
||||
target.in_features,
|
||||
target.out_features,
|
||||
bias=bias,
|
||||
compute_dtype=target.base_layer.compute_dtype,
|
||||
compress_statistics=target.base_layer.weight.compress_statistics,
|
||||
quant_type=target.base_layer.weight.quant_type,
|
||||
device=target.base_layer.weight.device,
|
||||
)
|
||||
else:
|
||||
bias = target.bias is not None
|
||||
if getattr(target, "is_target_conv_1d_layer", False):
|
||||
new_module = Conv1D(target.out_features, target.in_features)
|
||||
else:
|
||||
new_module = torch.nn.Linear(target.in_features, target.out_features, bias=bias)
|
||||
if merge:
|
||||
target.merge(safe_merge=safe_merge)
|
||||
self._replace_module(parent, target_name, new_module, target)
|
||||
|
||||
# save any additional trainable modules part of `modules_to_save`
|
||||
if isinstance(target, ModulesToSaveWrapper):
|
||||
setattr(parent, target_name, target.modules_to_save[target.active_adapter])
|
||||
|
||||
return self.model
|
||||
|
||||
def add_weighted_adapter(
|
||||
self,
|
||||
adapters,
|
||||
weights,
|
||||
adapter_name,
|
||||
combination_type="svd",
|
||||
svd_rank=None,
|
||||
svd_clamp=None,
|
||||
svd_full_matrices=True,
|
||||
svd_driver=None,
|
||||
):
|
||||
"""
|
||||
This method adds a new adapter by merging the given adapters with the given weights.
|
||||
|
||||
When using the `cat` combination_type, you should be aware that the rank of the resulting adapter will be equal to
|
||||
the sum of all the adapters' ranks. So it's possible that the mixed adapter may become too big and result in OOM
|
||||
errors.
|
||||
|
||||
Args:
|
||||
adapters (`list`):
|
||||
List of adapter names to be merged.
|
||||
weights (`list`):
|
||||
List of weights for each adapter.
|
||||
adapter_name (`str`):
|
||||
Name of the new adapter.
|
||||
combination_type (`str`):
|
||||
Type of merging. Can be one of [`svd`, `linear`, `cat`]. When using the `cat` combination_type you
|
||||
should be aware that the rank of the resulting adapter will be equal to the sum of all the adapters' ranks. So
|
||||
it's possible that the mixed adapter may become too big and result in OOM errors.
|
||||
svd_rank (`int`, *optional*):
|
||||
Rank of the output adapter when using svd. If None is provided, the maximum rank of the merged adapters is used.
|
||||
svd_clamp (`float`, *optional*):
|
||||
A quantile threshold for clamping SVD decomposition output. If None is provided, do not perform
|
||||
clamping. Defaults to None.
|
||||
svd_full_matrices (`bool`, *optional*):
|
||||
Controls whether to compute the full or reduced SVD, and consequently, the shape of the returned
|
||||
tensors U and Vh. Defaults to True.
|
||||
svd_driver (`str`, *optional*):
|
||||
Name of the cuSOLVER method to be used. This keyword argument only works when merging on CUDA. Can be
|
||||
one of [None, `gesvd`, `gesvdj`, `gesvda`]. For more info please refer to `torch.linalg.svd`
|
||||
documentation. Defaults to None.
|
||||
"""
|
||||
|
||||
if adapter_name in list(self.peft_config.keys()):
|
||||
return
|
||||
for adapter in adapters:
|
||||
if adapter not in list(self.peft_config.keys()):
|
||||
raise ValueError(f"Adapter {adapter} does not exist")
|
||||
|
||||
# if there is only one adapter, we can only use linear merging
|
||||
combination_type = "linear" if len(adapters) == 1 else combination_type
|
||||
|
||||
adapters_ranks = [self.peft_config[adapter].r for adapter in adapters]
|
||||
if combination_type == "linear":
|
||||
# all adapter ranks should be the same; the new rank is just this value
|
||||
if len(set(adapters_ranks)) != 1:
|
||||
raise ValueError("All adapters must have the same r value when using `linear` combination_type")
|
||||
new_rank = adapters_ranks[0]
|
||||
elif combination_type == "cat":
|
||||
# adapter ranks may differ; the new rank is the sum of all ranks
|
||||
# be careful, because output adapter rank may be really big if mixing a lot of adapters
|
||||
new_rank = sum(adapters_ranks)
|
||||
elif combination_type == "svd":
|
||||
# new rank is the max of all ranks of the adapters if not provided
|
||||
new_rank = svd_rank or max(adapters_ranks)
|
||||
else:
|
||||
raise ValueError(f"Invalid combination_type: {combination_type}")
|
||||
|
||||
target_module_types = [type(self.peft_config[adapter].target_modules) for adapter in adapters]
|
||||
if not target_module_types:
|
||||
raise ValueError(f"Found no adapter matching the names in {adapters}")
|
||||
if len(set(target_module_types)) > 1:
|
||||
raise ValueError(
|
||||
"all adapter configs should follow the same target modules type. "
|
||||
"Combining adapters with `target_modules` type being a mix of list/set and string is not supported."
|
||||
)
|
||||
|
||||
if target_module_types[0] == str:
|
||||
new_target_modules = "|".join(f"({self.peft_config[adapter].target_modules})" for adapter in adapters)
|
||||
elif target_module_types[0] == set:
|
||||
new_target_modules = reduce(
|
||||
operator.or_, (self.peft_config[adapter].target_modules for adapter in adapters)
|
||||
)
|
||||
else:
|
||||
raise TypeError(f"Invalid type {target_module_types[0]} found in target_modules")
|
||||
|
||||
self.peft_config[adapter_name] = replace(
|
||||
self.peft_config[adapters[0]],
|
||||
r=new_rank,
|
||||
lora_alpha=new_rank,
|
||||
target_modules=new_target_modules,
|
||||
)
|
||||
self.inject_adapter(self.model, adapter_name)
|
||||
|
||||
# Do we really need that?
|
||||
_freeze_adapter(self.model, adapter_name)
|
||||
|
||||
key_list = [key for key, _ in self.model.named_modules() if "lora" not in key]
|
||||
for key in key_list:
|
||||
_, target, _ = _get_submodules(self.model, key)
|
||||
if isinstance(target, LoraLayer):
|
||||
if adapter_name in target.lora_A:
|
||||
target_lora_A = target.lora_A[adapter_name].weight
|
||||
target_lora_B = target.lora_B[adapter_name].weight
|
||||
elif adapter_name in target.lora_embedding_A:
|
||||
target_lora_A = target.lora_embedding_A[adapter_name]
|
||||
target_lora_B = target.lora_embedding_B[adapter_name]
|
||||
else:
|
||||
continue
|
||||
|
||||
target_lora_A.data = target_lora_A.data * 0.0
|
||||
target_lora_B.data = target_lora_B.data * 0.0
|
||||
if combination_type == "linear":
|
||||
for adapter, weight in zip(adapters, weights):
|
||||
if adapter in target.lora_A:
|
||||
current_adapter_lora_A = target.lora_A[adapter].weight
|
||||
current_adapter_lora_B = target.lora_B[adapter].weight
|
||||
elif adapter in target.lora_embedding_A:
|
||||
current_adapter_lora_A = target.lora_embedding_A[adapter]
|
||||
current_adapter_lora_B = target.lora_embedding_B[adapter]
|
||||
else:
|
||||
continue
|
||||
target_lora_A.data += current_adapter_lora_A.data * weight * target.scaling[adapter]
|
||||
target_lora_B.data += current_adapter_lora_B.data
|
||||
elif combination_type == "cat":
|
||||
loras_A, loras_B = [], []
|
||||
for adapter, weight in zip(adapters, weights):
|
||||
if adapter in target.lora_A:
|
||||
current_adapter_lora_A = target.lora_A[adapter].weight
|
||||
current_adapter_lora_B = target.lora_B[adapter].weight
|
||||
elif adapter in target.lora_embedding_A:
|
||||
current_adapter_lora_A = target.lora_embedding_A[adapter]
|
||||
current_adapter_lora_B = target.lora_embedding_B[adapter]
|
||||
else:
|
||||
continue
|
||||
loras_A.append(current_adapter_lora_A.data * weight * target.scaling[adapter])
|
||||
loras_B.append(current_adapter_lora_B.data)
|
||||
|
||||
if len(loras_A) == 0:
|
||||
raise ValueError("No matching LoRAs found. Please raise an issue on Github.")
|
||||
loras_A = torch.cat(loras_A, dim=0)
|
||||
loras_B = torch.cat(loras_B, dim=1)
|
||||
target_lora_A.data[: loras_A.shape[0], :] = loras_A
|
||||
target_lora_B.data[:, : loras_B.shape[1]] = loras_B
|
||||
elif combination_type == "svd":
|
||||
target_lora_A.data, target_lora_B.data = self._svd_weighted_adapter(
|
||||
adapters,
|
||||
weights,
|
||||
new_rank,
|
||||
target,
|
||||
target_lora_A,
|
||||
target_lora_B,
|
||||
svd_clamp,
|
||||
full_matrices=svd_full_matrices,
|
||||
driver=svd_driver,
|
||||
)
|
||||
|
||||
def _svd_weighted_adapter(
|
||||
self,
|
||||
adapters,
|
||||
weights,
|
||||
new_rank,
|
||||
target,
|
||||
target_lora_A,
|
||||
target_lora_B,
|
||||
clamp=None,
|
||||
full_matrices=True,
|
||||
driver=None,
|
||||
):
|
||||
valid_adapters = []
|
||||
valid_weights = []
|
||||
for adapter, weight in zip(adapters, weights):
|
||||
if adapter in target.lora_A or adapter in target.lora_embedding_A:
|
||||
valid_adapters.append(adapter)
|
||||
valid_weights.append(weight)
|
||||
|
||||
# if no valid adapter, nothing to do
|
||||
if len(valid_adapters) == 0:
|
||||
raise ValueError("No matching LoRAs found. Please raise an issue on Github.")
|
||||
|
||||
delta_weight = valid_weights[0] * target.get_delta_weight(valid_adapters[0])
|
||||
for adapter, weight in zip(valid_adapters[1:], valid_weights[1:]):
|
||||
delta_weight += weight * target.get_delta_weight(adapter)
|
||||
conv2d = isinstance(target, Conv2d)
|
||||
if conv2d:
|
||||
conv2d_1x1 = target.weight.size()[2:4] == (1, 1)
|
||||
if not conv2d_1x1:
|
||||
delta_weight = delta_weight.flatten(start_dim=1)
|
||||
else:
|
||||
delta_weight = delta_weight.squeeze()
|
||||
if hasattr(target, "fan_in_fan_out") and target.fan_in_fan_out:
|
||||
delta_weight = delta_weight.T
|
||||
|
||||
# based on https://github.com/kohya-ss/sd-scripts/blob/main/networks/svd_merge_lora.py#L114-L131
|
||||
U, S, Vh = torch.linalg.svd(delta_weight, full_matrices=full_matrices, driver=driver)
|
||||
U = U[:, :new_rank]
|
||||
S = S[:new_rank]
|
||||
U = U @ torch.diag(S)
|
||||
Vh = Vh[:new_rank, :]
|
||||
if clamp is not None:
|
||||
dist = torch.cat([U.flatten(), Vh.flatten()])
|
||||
hi_val = torch.quantile(dist, clamp)
|
||||
low_val = -hi_val
|
||||
U = U.clamp(low_val, hi_val)
|
||||
Vh = Vh.clamp(low_val, hi_val)
|
||||
if conv2d:
|
||||
U = U.reshape(target_lora_B.data.shape)
|
||||
Vh = Vh.reshape(target_lora_A.data.shape)
|
||||
return Vh, U
|
||||
|
||||
def delete_adapter(self, adapter_name: str):
|
||||
"""
|
||||
Deletes an existing adapter.
|
||||
|
||||
Args:
|
||||
adapter_name (str): Name of the adapter to be deleted.
|
||||
"""
|
||||
if adapter_name not in list(self.peft_config.keys()):
|
||||
raise ValueError(f"Adapter {adapter_name} does not exist")
|
||||
del self.peft_config[adapter_name]
|
||||
|
||||
key_list = [key for key, _ in self.model.named_modules() if "lora" not in key]
|
||||
new_adapter = None
|
||||
for key in key_list:
|
||||
_, target, _ = _get_submodules(self.model, key)
|
||||
if isinstance(target, LoraLayer):
|
||||
target.delete_adapter(adapter_name)
|
||||
if new_adapter is None:
|
||||
new_adapter = target.active_adapters[:]
|
||||
|
||||
self.active_adapter = new_adapter or []
|
||||
|
||||
def merge_and_unload(self, progressbar: bool = False, safe_merge: bool = False):
|
||||
r"""
|
||||
This method merges the LoRA layers into the base model. This is needed if someone wants to use the base model
|
||||
as a standalone model.
|
||||
|
||||
Args:
|
||||
progressbar (`bool`):
|
||||
whether to show a progressbar indicating the unload and merge process
|
||||
safe_merge (`bool`):
|
||||
whether to activate the safe merging check to detect if there are any potential NaNs in the adapter
|
||||
weights
|
||||
|
||||
Example:
|
||||
|
||||
```py
|
||||
>>> from transformers import AutoModelForCausalLM
|
||||
>>> from peft import PeftModel
|
||||
|
||||
>>> base_model = AutoModelForCausalLM.from_pretrained("tiiuae/falcon-40b")
|
||||
>>> peft_model_id = "smangrul/falcon-40B-int4-peft-lora-sfttrainer-sample"
|
||||
>>> model = PeftModel.from_pretrained(base_model, peft_model_id)
|
||||
>>> merged_model = model.merge_and_unload()
|
||||
```
|
||||
"""
|
||||
return self._unload_and_optionally_merge(progressbar=progressbar, safe_merge=safe_merge)
|
||||
|
||||
def unload(self):
|
||||
"""
|
||||
Gets back the base model by removing all the lora modules without merging. This gives back the original base
|
||||
model.
|
||||
"""
|
||||
return self._unload_and_optionally_merge(merge=False)
|
||||
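For context, the sketch below shows how the `LoraModel` machinery defined in this file is typically driven through the public PEFT API. It is an illustrative sketch, not part of the diff: the base model id, adapter names, and hyperparameters are placeholders.

```py
from transformers import AutoModelForCausalLM
from peft import LoraConfig, get_peft_model

base = AutoModelForCausalLM.from_pretrained("facebook/opt-125m")  # placeholder base model
config = LoraConfig(r=8, lora_alpha=16, target_modules=["q_proj", "v_proj"], task_type="CAUSAL_LM")
model = get_peft_model(base, config)  # wraps the base model in a LoraModel under the hood

# add a second adapter, then combine both into a new one via add_weighted_adapter
model.add_adapter("other", config)
model.add_weighted_adapter(
    adapters=["default", "other"],
    weights=[0.7, 0.3],
    adapter_name="combined",
    combination_type="svd",
)
model.set_adapter("combined")

# fold the active adapter into the base weights and drop all LoRA modules
merged_model = model.merge_and_unload()
```

Calls such as `add_weighted_adapter`, `set_adapter`, and `merge_and_unload` on the `PeftModel` wrapper end up in the `LoraModel` methods shown above.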
404
src/peft/tuners/lycoris_utils.py
Normal file
404
src/peft/tuners/lycoris_utils.py
Normal file
@ -0,0 +1,404 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2023-present the HuggingFace Inc. team.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import re
|
||||
import warnings
|
||||
from abc import abstractmethod
|
||||
from dataclasses import dataclass, field
|
||||
from itertools import chain
|
||||
from typing import Dict, Optional, Set, Type, Union
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
from tqdm import tqdm
|
||||
|
||||
from peft.config import PeftConfig
|
||||
from peft.utils import (
|
||||
ModulesToSaveWrapper,
|
||||
_get_submodules,
|
||||
)
|
||||
|
||||
from .tuners_utils import BaseTuner, BaseTunerLayer, check_target_module_exists
|
||||
|
||||
|
||||
@dataclass
|
||||
class LycorisConfig(PeftConfig):
|
||||
r"""
|
||||
A base config for LyCORIS like adapters
|
||||
"""
|
||||
rank_pattern: Optional[dict] = field(
|
||||
default_factory=dict,
|
||||
metadata={
|
||||
"help": (
|
||||
"The mapping from layer names or regexp expression to ranks which are different from the default rank specified by `r`. "
|
||||
"For example, `{model.decoder.layers.0.encoder_attn.k_proj: 8`}"
|
||||
)
|
||||
},
|
||||
)
|
||||
alpha_pattern: Optional[dict] = field(
|
||||
default_factory=dict,
|
||||
metadata={
|
||||
"help": (
|
||||
"The mapping from layer names or regexp expression to alphas which are different from the default alpha specified by `alpha`. "
|
||||
"For example, `{model.decoder.layers.0.encoder_attn.k_proj: 32`}"
|
||||
)
|
||||
},
|
||||
)
|
||||
|
||||
|
||||
class LycorisLayer(BaseTunerLayer, nn.Module):
|
||||
r"""
|
||||
A base layer for LyCORIS like adapters
|
||||
"""
|
||||
# adapter_layer_names needs to be defined on the child class
|
||||
other_param_names = ("r", "alpha", "scaling", "rank_dropout", "module_dropout")
|
||||
|
||||
def __init__(self):
|
||||
self.r = {}
|
||||
self.alpha = {}
|
||||
self.scaling = {}
|
||||
self.rank_dropout = {}
|
||||
self.module_dropout = {}
|
||||
|
||||
# Tuner info
|
||||
self._disable_adapters = False
|
||||
self.merged_adapters = []
|
||||
|
||||
@property
|
||||
@abstractmethod
|
||||
def _available_adapters(self) -> Set[str]:
|
||||
...
|
||||
|
||||
def _init_empty_weights(self, cls, *args, **kwargs) -> None:
|
||||
# A helper method that allows initializing the layer of the given class without spending time initializing the
|
||||
# model weights. The implementation is inspired by
|
||||
# https://pytorch.org/docs/stable/generated/torch.nn.utils.skip_init.html but this function cannot be used
|
||||
# directly.
|
||||
# Instead of this approach, it would be possible to bypass the __init__ of the class but that runs the risk of
|
||||
# omitting important logic inside that __init__.
|
||||
kwargs = kwargs.copy()
|
||||
final_device = kwargs.pop("device", "cpu")
|
||||
cls.__init__(self, *args, device="meta", **kwargs)
|
||||
self.to_empty(device=final_device)
|
||||
|
||||
def _op(self, x: torch.Tensor, weight: torch.Tensor) -> torch.Tensor:
|
||||
raise NotImplementedError
|
||||
|
||||
@abstractmethod
|
||||
def create_adapter_parameters(self, adapter_name: str, r: int, **kwargs):
|
||||
...
|
||||
|
||||
def forward(self, x: torch.Tensor) -> torch.Tensor:
|
||||
previous_dtype = x.dtype
|
||||
|
||||
if self.disable_adapters:
|
||||
if self.merged:
|
||||
self.unmerge()
|
||||
result = self._op(x, self.weight)
|
||||
elif self.merged:
|
||||
result = self._op(x, self.weight)
|
||||
else:
|
||||
# Get base weights
|
||||
weight = self.weight.data
|
||||
|
||||
# Execute all the adapters
|
||||
for active_adapter in self.active_adapters:
|
||||
if active_adapter not in self._available_adapters:
|
||||
continue
|
||||
|
||||
module_dropout = self.module_dropout[active_adapter]
|
||||
|
||||
# Modify current execution weights
|
||||
if (not self.training) or (self.training and torch.rand(1) > module_dropout):
|
||||
weight = weight + self.get_delta_weight(active_adapter)
|
||||
|
||||
# Perform actual operation
|
||||
result = self._op(x, weight)
|
||||
|
||||
result = result.to(previous_dtype)
|
||||
return result
|
||||
|
||||
@abstractmethod
|
||||
def get_delta_weight(self, adapter_name: str) -> torch.Tensor:
|
||||
...
|
||||
|
||||
def merge(self) -> None:
|
||||
if self.merged:
|
||||
warnings.warn(
|
||||
f"Already following adapters were merged {','.join(self.merged_adapters)}. "
|
||||
f"You are now additionally merging {','.join(self.active_adapters)}."
|
||||
)
|
||||
for active_adapter in self.active_adapters:
|
||||
if active_adapter in self._available_adapters:
|
||||
self.weight.data += self.get_delta_weight(active_adapter)
|
||||
self.merged_adapters.append(active_adapter)
|
||||
|
||||
@abstractmethod
|
||||
def reset_adapter_parameters(self, adapter_name: str):
|
||||
...
|
||||
|
||||
def set_scale(self, adapter, scale):
|
||||
if adapter not in self._available_adapters:
|
||||
# Ignore the case where the adapter is not in the layer
|
||||
return
|
||||
self.scaling[adapter] = scale * self.alpha[adapter] / self.r[adapter]
|
||||
|
||||
def scale_layer(self, scale: float) -> None:
|
||||
if scale == 1:
|
||||
return
|
||||
|
||||
for active_adapter in self.active_adapters:
|
||||
if active_adapter not in self._available_adapters:
|
||||
continue
|
||||
|
||||
self.scaling[active_adapter] *= scale
|
||||
|
||||
def unmerge(self) -> None:
|
||||
if not self.merged:
|
||||
warnings.warn("Already unmerged. Nothing to do.")
|
||||
return
|
||||
while len(self.merged_adapters) > 0:
|
||||
active_adapter = self.merged_adapters.pop()
|
||||
if active_adapter in self._available_adapters:
|
||||
self.weight.data -= self.get_delta_weight(active_adapter)
|
||||
|
||||
def unscale_layer(self, scale=None) -> None:
|
||||
for active_adapter in self.active_adapters:
|
||||
if active_adapter not in self._available_adapters:
|
||||
continue
|
||||
|
||||
if scale is None:
|
||||
self.scaling[active_adapter] = self.alpha[active_adapter] / self.r[active_adapter]
|
||||
else:
|
||||
self.scaling[active_adapter] /= scale
|
||||
|
||||
@abstractmethod
|
||||
def update_layer(self, adapter_name: str, r: int, alpha: float, **kwargs):
|
||||
...
|
||||
|
||||
|
||||
class LycorisTuner(BaseTuner):
|
||||
r"""
|
||||
A base tuner for LyCORIS like adapters
|
||||
"""
|
||||
|
||||
prefix: str
|
||||
layers_mapping: Dict[Type[torch.nn.Module], Type[LycorisLayer]]
|
||||
|
||||
def __init__(self, model, config, adapter_name):
|
||||
super().__init__(model, config, adapter_name)
|
||||
|
||||
def __getattr__(self, name: str):
|
||||
"""Forward missing attributes to the wrapped module."""
|
||||
try:
|
||||
return super().__getattr__(name) # defer to nn.Module's logic
|
||||
except AttributeError:
|
||||
return getattr(self.model, name)
|
||||
|
||||
@staticmethod
|
||||
def _check_target_module_exists(config, key):
|
||||
return check_target_module_exists(config, key)
|
||||
|
||||
def _create_and_replace(
|
||||
self,
|
||||
config: LycorisConfig,
|
||||
adapter_name: str,
|
||||
target: Union[LycorisLayer, nn.Module],
|
||||
target_name,
|
||||
parent,
|
||||
current_key,
|
||||
**optional_kwargs,
|
||||
):
|
||||
"""
|
||||
A private method to create and replace the target module with the adapter module.
|
||||
"""
|
||||
|
||||
# Regexp matching - Find key which matches current target_name in patterns provided
|
||||
pattern_keys = list(chain(config.rank_pattern.keys(), config.alpha_pattern.keys()))
|
||||
target_name_key = next(filter(lambda key: re.match(f"(.*\.)?{key}$", current_key), pattern_keys), target_name)
|
||||
|
||||
kwargs = config.to_dict()
|
||||
kwargs["r"] = config.rank_pattern.get(target_name_key, config.r)
|
||||
kwargs["alpha"] = config.alpha_pattern.get(target_name_key, config.alpha)
|
||||
|
||||
if isinstance(target, LycorisLayer):
|
||||
target.update_layer(adapter_name, **kwargs)
|
||||
else:
|
||||
new_module = self._create_new_module(config, adapter_name, target, **kwargs)
|
||||
self._replace_module(parent, target_name, new_module, target)
|
||||
|
||||
@classmethod
|
||||
def _create_new_module(cls, config: LycorisConfig, adapter_name: str, target: nn.Module, **kwargs) -> LycorisLayer:
|
||||
# Find corresponding subtype of provided target module
|
||||
new_module_cls = None
|
||||
for subtype, target_cls in cls.layers_mapping.items():
|
||||
if isinstance(target, subtype):
|
||||
new_module_cls = target_cls
|
||||
break
|
||||
|
||||
# We didn't find a corresponding type, so adapters for this layer are not supported
|
||||
if new_module_cls is None:
|
||||
raise ValueError(
|
||||
f"Target module not found, currently only adapters for {', '.join([x.__name__ for x in cls.modules_mapping.keys()])} are supported"
|
||||
)
|
||||
|
||||
if isinstance(target, torch.nn.Conv2d):
|
||||
new_module = new_module_cls(
|
||||
target.in_channels,
|
||||
target.out_channels,
|
||||
target.weight.size()[2:],
|
||||
stride=target.stride,
|
||||
padding=target.padding,
|
||||
dilation=target.dilation,
|
||||
groups=target.groups,
|
||||
bias=target.bias is not None,
|
||||
padding_mode=target.padding_mode,
|
||||
device=target.weight.device,
|
||||
dtype=target.weight.dtype,
|
||||
adapter_name=adapter_name,
|
||||
**kwargs,
|
||||
)
|
||||
elif isinstance(target, torch.nn.Linear):
|
||||
new_module = new_module_cls(
|
||||
target.in_features,
|
||||
target.out_features,
|
||||
bias=target.bias is not None,
|
||||
device=target.weight.device,
|
||||
dtype=target.weight.dtype,
|
||||
adapter_name=adapter_name,
|
||||
**kwargs,
|
||||
)
|
||||
else:
|
||||
raise ValueError(
|
||||
"Target module not found, currently only adapters for nn.Linear and nn.Conv2d are supported"
|
||||
)
|
||||
|
||||
return new_module
|
||||
|
||||
def _mark_only_adapters_as_trainable(self) -> None:
|
||||
for n, p in self.model.named_parameters():
|
||||
if self.prefix not in n:
|
||||
p.requires_grad = False
|
||||
|
||||
@staticmethod
|
||||
def _prepare_adapter_config(peft_config, model_config):
|
||||
if peft_config.target_modules is None:
|
||||
raise ValueError("Please specify `target_modules` in `peft_config`")
|
||||
return peft_config
|
||||
|
||||
def _replace_module(self, parent, child_name, new_module, child):
|
||||
setattr(parent, child_name, new_module)
|
||||
# It's not necessary to set requires_grad here, as that is handled by
|
||||
# _mark_only_adapters_as_trainable
|
||||
new_module.weight = child.weight
|
||||
if hasattr(child, "bias"):
|
||||
new_module.bias = child.bias
|
||||
|
||||
if getattr(child, "state", None) is not None:
|
||||
new_module.state = child.state
|
||||
new_module.to(child.weight.device)
|
||||
|
||||
# dispatch to correct device
|
||||
for name, module in new_module.named_modules():
|
||||
if self.prefix in name:
|
||||
module.to(child.weight.device)
|
||||
|
||||
def _set_adapter_layers(self, enabled=True):
|
||||
for module in self.model.modules():
|
||||
if isinstance(module, (BaseTunerLayer, ModulesToSaveWrapper)):
|
||||
module.enable_adapters(enabled)
|
||||
|
||||
def _unload_and_optionally_merge(self, merge=True, progressbar: bool = False):
|
||||
if merge:
|
||||
if getattr(self.model, "quantization_method", None) == "gptq":
|
||||
raise ValueError("Cannot merge LOHA layers when the model is gptq quantized")
|
||||
|
||||
key_list = [key for key, _ in self.model.named_modules() if "hada" not in key]
|
||||
desc = "Unloading " + ("and merging " if merge else "") + "model"
|
||||
for key in tqdm(key_list, disable=not progressbar, desc=desc):
|
||||
try:
|
||||
parent, target, target_name = _get_submodules(self.model, key)
|
||||
except AttributeError:
|
||||
continue
|
||||
if isinstance(target, LycorisLayer):
|
||||
if isinstance(target, nn.Conv2d):
|
||||
new_module = torch.nn.Conv2d(
|
||||
target.in_channels,
|
||||
target.out_channels,
|
||||
kernel_size=target.kernel_size,
|
||||
stride=target.stride,
|
||||
padding=target.padding,
|
||||
dilation=target.dilation,
|
||||
)
|
||||
elif isinstance(target, nn.Linear):
|
||||
bias = target.bias is not None
|
||||
new_module = torch.nn.Linear(
|
||||
target.in_features,
|
||||
target.out_features,
|
||||
bias=bias,
|
||||
device=target.weight.device,
|
||||
)
|
||||
else:
|
||||
raise ValueError(
|
||||
"Cannot convert current module to torch module, currently only adapters for nn.Linear and nn.Conv2d are supported"
|
||||
)
|
||||
if merge:
|
||||
target.merge()
|
||||
self._replace_module(parent, target_name, new_module, target)
|
||||
|
||||
# save any additional trainable modules part of `modules_to_save`
|
||||
if isinstance(target, ModulesToSaveWrapper):
|
||||
setattr(parent, target_name, target.modules_to_save[target.active_adapter])
|
||||
|
||||
return self.model
|
||||
|
||||
def enable_adapter_layers(self):
|
||||
self._set_adapter_layers(enabled=True)
|
||||
|
||||
def disable_adapter_layers(self):
|
||||
self._set_adapter_layers(enabled=False)
|
||||
|
||||
def merge_and_unload(self, progressbar: bool = False):
|
||||
return self._unload_and_optionally_merge(progressbar=progressbar)
|
||||
|
||||
def set_adapter(self, adapter_name):
|
||||
for module in self.model.modules():
|
||||
if isinstance(module, LycorisLayer):
|
||||
if module.merged:
|
||||
warnings.warn("Adapter cannot be set when the model is merged. Unmerging the model first.")
|
||||
module.unmerge()
|
||||
module.set_adapter(adapter_name)
|
||||
|
||||
def delete_adapter(self, adapter_name: str):
|
||||
"""
|
||||
Deletes an existing adapter.
|
||||
|
||||
Args:
|
||||
adapter_name (`str`): Name of the adapter to be deleted.
|
||||
"""
|
||||
if adapter_name not in list(self.peft_config.keys()):
|
||||
raise ValueError(f"Adapter {adapter_name} does not exist")
|
||||
del self.peft_config[adapter_name]
|
||||
|
||||
key_list = [key for key, _ in self.model.named_modules() if self.prefix not in key]
|
||||
new_adapter = None
|
||||
for key in key_list:
|
||||
_, target, _ = _get_submodules(self.model, key)
|
||||
if isinstance(target, LycorisLayer):
|
||||
target.delete_adapter(adapter_name)
|
||||
if new_adapter is None:
|
||||
new_adapter = target.active_adapters[:]
|
||||
|
||||
self.active_adapter = new_adapter or []
|
||||
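As a rough illustration of the control flow in `LycorisLayer.forward` above: during training the entire adapter delta is skipped with probability `module_dropout`, otherwise it is added to the base weight before the underlying op runs. A minimal standalone sketch (names, shapes, and the stand-in op are invented for illustration):

```py
import torch

def lycoris_style_forward(x, base_weight, delta, module_dropout, training):
    # with probability module_dropout during training, drop the whole adapter delta
    weight = base_weight
    if (not training) or (torch.rand(1).item() > module_dropout):
        weight = weight + delta
    return x @ weight.T  # stands in for self._op(x, weight)

x = torch.randn(2, 4)
base_weight = torch.randn(3, 4)
delta = 0.01 * torch.randn(3, 4)  # would come from get_delta_weight(adapter)

y_eval = lycoris_style_forward(x, base_weight, delta, module_dropout=0.5, training=False)
y_train = lycoris_style_forward(x, base_weight, delta, module_dropout=0.5, training=True)
```

At evaluation time the adapter is always applied; the dropout only affects training, which is why `merge`/`unmerge` can still fold a single deterministic delta into the weight.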
20
src/peft/tuners/multitask_prompt_tuning/__init__.py
Normal file
20
src/peft/tuners/multitask_prompt_tuning/__init__.py
Normal file
@ -0,0 +1,20 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2023-present the HuggingFace Inc. team.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from .config import MultitaskPromptTuningConfig, MultitaskPromptTuningInit
|
||||
from .model import MultitaskPromptEmbedding
|
||||
|
||||
|
||||
__all__ = ["MultitaskPromptTuningConfig", "MultitaskPromptTuningInit", "MultitaskPromptEmbedding"]
|
||||
62
src/peft/tuners/multitask_prompt_tuning/config.py
Normal file
62
src/peft/tuners/multitask_prompt_tuning/config.py
Normal file
@ -0,0 +1,62 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2023-present the HuggingFace Inc. team.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import enum
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Optional, Union
|
||||
|
||||
from peft.tuners.prompt_tuning import PromptTuningConfig
|
||||
from peft.utils import PeftType
|
||||
|
||||
|
||||
class MultitaskPromptTuningInit(str, enum.Enum):
|
||||
# initialize prompt with text
|
||||
TEXT = "TEXT"
|
||||
# initialize prompt with random matrix
|
||||
RANDOM = "RANDOM"
|
||||
# average the prefix and column matrices obtained during source training
|
||||
AVERAGE_SOURCE_TASKS = "AVERAGE_SOURCE_TASKS"
|
||||
# pick prefix and column matrices for a particular task obtained during source training
|
||||
EXACT_SOURCE_TASK = "EXACT_SOURCE_TASK"
|
||||
# only use the prompt embeddings trained during source training
|
||||
ONLY_SOURCE_SHARED = "ONLY_SOURCE_SHARED"
|
||||
|
||||
|
||||
@dataclass
|
||||
class MultitaskPromptTuningConfig(PromptTuningConfig):
|
||||
prompt_tuning_init: Union[MultitaskPromptTuningInit, str] = field(
|
||||
default=MultitaskPromptTuningInit.RANDOM,
|
||||
metadata={
|
||||
"help": (
|
||||
"How to initialize the prompt tuning parameters. Can be one of TEXT, RANDOM, AVERAGE_SOURCE_TASKS, "
|
||||
"EXACT_SOURCE_TASK, ONLY_SOURCE_SHARED."
|
||||
),
|
||||
},
|
||||
)
|
||||
prompt_tuning_init_state_dict_path: Optional[str] = field(
|
||||
default=None,
|
||||
metadata={
|
||||
"help": (
|
||||
"The path of source state dict. This is required when training the downstream target prompt from "
|
||||
"the pretrained source prompt"
|
||||
),
|
||||
},
|
||||
)
|
||||
prompt_tuning_init_task: Optional[int] = field(default=0, metadata={"help": "source task id for initialization"})
|
||||
num_ranks: Optional[int] = field(default=1, metadata={"help": "number of ranks"})
|
||||
num_tasks: Optional[int] = field(default=1, metadata={"help": "number of tasks"})
|
||||
|
||||
def __post_init__(self):
|
||||
self.peft_type = PeftType.MULTITASK_PROMPT_TUNING
|
||||
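A hedged example of how the config above might be instantiated for the two-stage (source, then target) training that the init options describe; all field values and the state-dict path are placeholders, not values from the diff:

```py
from peft import MultitaskPromptTuningConfig, MultitaskPromptTuningInit

# source stage: train a shared prompt plus per-task low-rank factors
source_config = MultitaskPromptTuningConfig(
    task_type="CAUSAL_LM",
    num_virtual_tokens=20,
    num_tasks=4,       # number of source tasks
    num_ranks=1,       # rank of the per-task (cols @ rows) factorization
    prompt_tuning_init=MultitaskPromptTuningInit.RANDOM,
)

# target stage: initialize from the averaged source factors
target_config = MultitaskPromptTuningConfig(
    task_type="CAUSAL_LM",
    num_virtual_tokens=20,
    num_tasks=1,
    prompt_tuning_init=MultitaskPromptTuningInit.AVERAGE_SOURCE_TASKS,
    prompt_tuning_init_state_dict_path="path/to/source_adapter.bin",  # placeholder path
)
```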
115
src/peft/tuners/multitask_prompt_tuning/model.py
Normal file
115
src/peft/tuners/multitask_prompt_tuning/model.py
Normal file
@ -0,0 +1,115 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2023-present the HuggingFace Inc. team.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import torch
|
||||
|
||||
from peft.tuners.prompt_tuning import PromptEmbedding
|
||||
from peft.utils import TaskType
|
||||
|
||||
from .config import MultitaskPromptTuningConfig, MultitaskPromptTuningInit
|
||||
|
||||
|
||||
# This code is adapted from the paper: https://arxiv.org/abs/2303.02861 and
|
||||
# constitutes the work done at MIT-IBM Watson Research Lab.
|
||||
|
||||
|
||||
class MultitaskPromptEmbedding(PromptEmbedding):
|
||||
def __init__(self, config: MultitaskPromptTuningConfig, word_embeddings):
|
||||
super().__init__(config, word_embeddings)
|
||||
|
||||
self.num_tasks = config.num_tasks
|
||||
self.num_ranks = config.num_ranks
|
||||
self.num_virtual_tokens = config.num_virtual_tokens
|
||||
|
||||
self.num_transformer_submodules = config.num_transformer_submodules
|
||||
if self.num_transformer_submodules is None:
|
||||
self.num_transformer_submodules = 2 if config.task_type == TaskType.SEQ_2_SEQ_LM else 1
|
||||
|
||||
self.token_dim = config.token_dim
|
||||
|
||||
total_virtual_tokens = self.num_virtual_tokens * self.num_transformer_submodules
|
||||
|
||||
self.prefix_task_cols = torch.nn.Parameter(
|
||||
torch.normal(
|
||||
mean=0,
|
||||
std=0.02,
|
||||
size=(self.num_tasks, total_virtual_tokens, self.num_ranks),
|
||||
)
|
||||
)
|
||||
self.prefix_task_rows = torch.nn.Parameter(
|
||||
torch.normal(
|
||||
mean=0,
|
||||
std=0.02,
|
||||
size=(self.num_tasks, self.num_ranks, self.token_dim),
|
||||
)
|
||||
)
|
||||
|
||||
if config.prompt_tuning_init in [
|
||||
MultitaskPromptTuningInit.AVERAGE_SOURCE_TASKS,
|
||||
MultitaskPromptTuningInit.EXACT_SOURCE_TASK,
|
||||
MultitaskPromptTuningInit.ONLY_SOURCE_SHARED,
|
||||
]:
|
||||
if config.prompt_tuning_init_state_dict_path is None:
|
||||
raise ValueError(
|
||||
f"prompt_tuning_init_state_dict_path needs to be specified with {config.prompt_tuning_init} "
|
||||
"init method"
|
||||
)
|
||||
|
||||
state_dict: dict = torch.load(
|
||||
config.prompt_tuning_init_state_dict_path,
|
||||
map_location=word_embeddings.device,
|
||||
)
|
||||
|
||||
if config.prompt_tuning_init in [
|
||||
MultitaskPromptTuningInit.AVERAGE_SOURCE_TASKS,
|
||||
MultitaskPromptTuningInit.EXACT_SOURCE_TASK,
|
||||
]:
|
||||
prefix_task_cols_: torch.Tensor = state_dict["prefix_task_cols"]
|
||||
prefix_task_rows_: torch.Tensor = state_dict["prefix_task_rows"]
|
||||
|
||||
if config.prompt_tuning_init == MultitaskPromptTuningInit.AVERAGE_SOURCE_TASKS:
|
||||
prefix_task_cols_ = prefix_task_cols_.mean(0, keepdim=True)
|
||||
prefix_task_rows_ = prefix_task_rows_.mean(0, keepdim=True)
|
||||
elif config.prompt_tuning_init == MultitaskPromptTuningInit.EXACT_SOURCE_TASK:
|
||||
prefix_task_cols_ = prefix_task_cols_[config.prompt_tuning_init_task, ...].unsqueeze(0)
|
||||
prefix_task_rows_ = prefix_task_rows_[config.prompt_tuning_init_task, ...].unsqueeze(0)
|
||||
|
||||
state_dict = {
|
||||
"embedding.weight": state_dict["prompt_embeddings"],
|
||||
"prefix_task_cols": prefix_task_cols_,
|
||||
"prefix_task_rows": prefix_task_rows_,
|
||||
}
|
||||
|
||||
self.load_state_dict(state_dict, strict=True)
|
||||
elif config.prompt_tuning_init == MultitaskPromptTuningInit.ONLY_SOURCE_SHARED:
|
||||
state_dict = {
|
||||
"embedding.weight": state_dict["prompt_embeddings"],
|
||||
}
|
||||
|
||||
self.load_state_dict(state_dict, strict=False)
|
||||
|
||||
def forward(self, indices, task_ids):
|
||||
if task_ids is None:
|
||||
raise ValueError("task_ids cannot be None")
|
||||
|
||||
prompt_embeddings = self.embedding(indices)
|
||||
|
||||
task_cols = torch.index_select(self.prefix_task_cols, 0, task_ids)
|
||||
task_rows = torch.index_select(self.prefix_task_rows, 0, task_ids)
|
||||
task_prompts = torch.matmul(task_cols, task_rows)
|
||||
|
||||
prompt_embeddings *= task_prompts
|
||||
|
||||
return prompt_embeddings
|
||||
20
src/peft/tuners/p_tuning/__init__.py
Normal file
20
src/peft/tuners/p_tuning/__init__.py
Normal file
@ -0,0 +1,20 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2023-present the HuggingFace Inc. team.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from .config import PromptEncoderConfig, PromptEncoderReparameterizationType
|
||||
from .model import PromptEncoder
|
||||
|
||||
|
||||
__all__ = ["PromptEncoder", "PromptEncoderConfig", "PromptEncoderReparameterizationType"]
|
||||
60
src/peft/tuners/p_tuning/config.py
Normal file
60
src/peft/tuners/p_tuning/config.py
Normal file
@ -0,0 +1,60 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2023-present the HuggingFace Inc. team.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import enum
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Union
|
||||
|
||||
from peft.config import PromptLearningConfig
|
||||
from peft.utils import PeftType
|
||||
|
||||
|
||||
class PromptEncoderReparameterizationType(str, enum.Enum):
|
||||
MLP = "MLP"
|
||||
LSTM = "LSTM"
|
||||
|
||||
|
||||
@dataclass
|
||||
class PromptEncoderConfig(PromptLearningConfig):
|
||||
"""
|
||||
This is the configuration class to store the configuration of a [`PromptEncoder`].
|
||||
|
||||
Args:
|
||||
encoder_reparameterization_type (Union[[`PromptEncoderReparameterizationType`], `str`]):
|
||||
The type of reparameterization to use.
|
||||
encoder_hidden_size (`int`): The hidden size of the prompt encoder.
|
||||
encoder_num_layers (`int`): The number of layers of the prompt encoder.
|
||||
encoder_dropout (`float`): The dropout probability of the prompt encoder.
|
||||
"""
|
||||
|
||||
encoder_reparameterization_type: Union[str, PromptEncoderReparameterizationType] = field(
|
||||
default=PromptEncoderReparameterizationType.MLP,
|
||||
metadata={"help": "How to reparameterize the prompt encoder"},
|
||||
)
|
||||
encoder_hidden_size: int = field(
|
||||
default=None,
|
||||
metadata={"help": "The hidden size of the prompt encoder"},
|
||||
)
|
||||
encoder_num_layers: int = field(
|
||||
default=2,
|
||||
metadata={"help": "The number of layers of the prompt encoder"},
|
||||
)
|
||||
encoder_dropout: float = field(
|
||||
default=0.0,
|
||||
metadata={"help": "The dropout of the prompt encoder"},
|
||||
)
|
||||
|
||||
def __post_init__(self):
|
||||
self.peft_type = PeftType.P_TUNING
|
||||
@ -13,58 +13,15 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import enum
|
||||
# Based on https://github.com/NVIDIA/NeMo/blob/main/nemo/collections/nlp/modules/common/prompt_encoder.py
|
||||
# with some refactor
|
||||
import warnings
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Union
|
||||
|
||||
import torch
|
||||
|
||||
from ..config import PromptLearningConfig
|
||||
from ..utils import PeftType
|
||||
from .config import PromptEncoderConfig, PromptEncoderReparameterizationType
|
||||
|
||||
|
||||
class PromptEncoderReparameterizationType(str, enum.Enum):
|
||||
MLP = "MLP"
|
||||
LSTM = "LSTM"
|
||||
|
||||
|
||||
@dataclass
|
||||
class PromptEncoderConfig(PromptLearningConfig):
|
||||
"""
|
||||
This is the configuration class to store the configuration of a [`PromptEncoder`].
|
||||
|
||||
Args:
|
||||
encoder_reparameterization_type (Union[[`PromptEncoderReparameterizationType`], `str`]):
|
||||
The type of reparameterization to use.
|
||||
encoder_hidden_size (`int`): The hidden size of the prompt encoder.
|
||||
encoder_num_layers (`int`): The number of layers of the prompt encoder.
|
||||
encoder_dropout (`float`): The dropout probability of the prompt encoder.
|
||||
"""
|
||||
|
||||
encoder_reparameterization_type: Union[str, PromptEncoderReparameterizationType] = field(
|
||||
default=PromptEncoderReparameterizationType.MLP,
|
||||
metadata={"help": "How to reparameterize the prompt encoder"},
|
||||
)
|
||||
encoder_hidden_size: int = field(
|
||||
default=None,
|
||||
metadata={"help": "The hidden size of the prompt encoder"},
|
||||
)
|
||||
encoder_num_layers: int = field(
|
||||
default=2,
|
||||
metadata={"help": "The number of layers of the prompt encoder"},
|
||||
)
|
||||
encoder_dropout: float = field(
|
||||
default=0.0,
|
||||
metadata={"help": "The dropout of the prompt encoder"},
|
||||
)
|
||||
|
||||
def __post_init__(self):
|
||||
self.peft_type = PeftType.P_TUNING
|
||||
|
||||
|
||||
# Based on https://github.com/NVIDIA/NeMo/blob/main/nemo/collections/nlp/modules/common/prompt_encoder.py
|
||||
# with some refactor
|
||||
class PromptEncoder(torch.nn.Module):
|
||||
"""
|
||||
The prompt encoder network that is used to generate the virtual token embeddings for p-tuning.
|
||||
20
src/peft/tuners/prefix_tuning/__init__.py
Normal file
20
src/peft/tuners/prefix_tuning/__init__.py
Normal file
@ -0,0 +1,20 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2023-present the HuggingFace Inc. team.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from .config import PrefixTuningConfig
|
||||
from .model import PrefixEncoder
|
||||
|
||||
|
||||
__all__ = ["PrefixTuningConfig", "PrefixEncoder"]
|
||||
42
src/peft/tuners/prefix_tuning/config.py
Normal file
42
src/peft/tuners/prefix_tuning/config.py
Normal file
@ -0,0 +1,42 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2023-present the HuggingFace Inc. team.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from dataclasses import dataclass, field
|
||||
|
||||
from peft.config import PromptLearningConfig
|
||||
from peft.utils import PeftType
|
||||
|
||||
|
||||
@dataclass
|
||||
class PrefixTuningConfig(PromptLearningConfig):
|
||||
"""
|
||||
This is the configuration class to store the configuration of a [`PrefixEncoder`].
|
||||
|
||||
Args:
|
||||
encoder_hidden_size (`int`): The hidden size of the prompt encoder.
|
||||
prefix_projection (`bool`): Whether to project the prefix embeddings.
|
||||
"""
|
||||
|
||||
encoder_hidden_size: int = field(
|
||||
default=None,
|
||||
metadata={"help": "The hidden size of the encoder"},
|
||||
)
|
||||
prefix_projection: bool = field(
|
||||
default=False,
|
||||
metadata={"help": "Whether to project the prefix tokens"},
|
||||
)
|
||||
|
||||
def __post_init__(self):
|
||||
self.peft_type = PeftType.PREFIX_TUNING
|
||||
@ -13,40 +13,11 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
|
||||
from dataclasses import dataclass, field
|
||||
|
||||
import torch
|
||||
|
||||
from ..config import PromptLearningConfig
|
||||
from ..utils import PeftType
|
||||
|
||||
|
||||
@dataclass
|
||||
class PrefixTuningConfig(PromptLearningConfig):
|
||||
"""
|
||||
This is the configuration class to store the configuration of a [`PrefixEncoder`].
|
||||
|
||||
Args:
|
||||
encoder_hidden_size (`int`): The hidden size of the prompt encoder.
|
||||
prefix_projection (`bool`): Whether to project the prefix embeddings.
|
||||
"""
|
||||
|
||||
encoder_hidden_size: int = field(
|
||||
default=None,
|
||||
metadata={"help": "The hidden size of the encoder"},
|
||||
)
|
||||
prefix_projection: bool = field(
|
||||
default=False,
|
||||
metadata={"help": "Whether to project the prefix tokens"},
|
||||
)
|
||||
|
||||
def __post_init__(self):
|
||||
self.peft_type = PeftType.PREFIX_TUNING
|
||||
|
||||
|
||||
# Based on https://github.com/THUDM/P-tuning-v2/blob/main/model/prefix_encoder.py
|
||||
# with some refactor
|
||||
import torch
|
||||
|
||||
|
||||
class PrefixEncoder(torch.nn.Module):
|
||||
r"""
|
||||
The `torch.nn` model to encode the prefix.
|
||||
20
src/peft/tuners/prompt_tuning/__init__.py
Normal file
20
src/peft/tuners/prompt_tuning/__init__.py
Normal file
@ -0,0 +1,20 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2023-present the HuggingFace Inc. team.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from .config import PromptTuningConfig, PromptTuningInit
|
||||
from .model import PromptEmbedding
|
||||
|
||||
|
||||
__all__ = ["PromptTuningConfig", "PromptEmbedding", "PromptTuningInit"]
|
||||
60
src/peft/tuners/prompt_tuning/config.py
Normal file
60
src/peft/tuners/prompt_tuning/config.py
Normal file
@ -0,0 +1,60 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2023-present the HuggingFace Inc. team.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import enum
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Optional, Union
|
||||
|
||||
from peft.config import PromptLearningConfig
|
||||
from peft.utils import PeftType
|
||||
|
||||
|
||||
class PromptTuningInit(str, enum.Enum):
|
||||
TEXT = "TEXT"
|
||||
RANDOM = "RANDOM"
|
||||
|
||||
|
||||
@dataclass
|
||||
class PromptTuningConfig(PromptLearningConfig):
|
||||
"""
|
||||
This is the configuration class to store the configuration of a [`PromptEmbedding`].
|
||||
|
||||
Args:
|
||||
prompt_tuning_init (Union[[`PromptTuningInit`], `str`]): The initialization of the prompt embedding.
|
||||
prompt_tuning_init_text (`str`, *optional*):
|
||||
The text to initialize the prompt embedding. Only used if `prompt_tuning_init` is `TEXT`.
|
||||
tokenizer_name_or_path (`str`, *optional*):
|
||||
The name or path of the tokenizer. Only used if `prompt_tuning_init` is `TEXT`.
|
||||
"""
|
||||
|
||||
prompt_tuning_init: Union[PromptTuningInit, str] = field(
|
||||
default=PromptTuningInit.RANDOM,
|
||||
metadata={"help": "How to initialize the prompt tuning parameters"},
|
||||
)
|
||||
prompt_tuning_init_text: Optional[str] = field(
|
||||
default=None,
|
||||
metadata={
|
||||
"help": "The text to use for prompt tuning initialization. Only used if prompt_tuning_init is `TEXT`"
|
||||
},
|
||||
)
|
||||
tokenizer_name_or_path: Optional[str] = field(
|
||||
default=None,
|
||||
metadata={
|
||||
"help": "The tokenizer to use for prompt tuning initialization. Only used if prompt_tuning_init is `TEXT`"
|
||||
},
|
||||
)
|
||||
|
||||
def __post_init__(self):
|
||||
self.peft_type = PeftType.PROMPT_TUNING
|
||||
@ -13,54 +13,11 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import enum
|
||||
import math
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Optional, Union
|
||||
|
||||
import torch
|
||||
|
||||
from ..config import PromptLearningConfig
|
||||
from ..utils import PeftType
|
||||
|
||||
|
||||
class PromptTuningInit(str, enum.Enum):
|
||||
TEXT = "TEXT"
|
||||
RANDOM = "RANDOM"
|
||||
|
||||
|
||||
@dataclass
|
||||
class PromptTuningConfig(PromptLearningConfig):
|
||||
"""
|
||||
This is the configuration class to store the configuration of a [`PromptEmbedding`].
|
||||
|
||||
Args:
|
||||
prompt_tuning_init (Union[[`PromptTuningInit`], `str`]): The initialization of the prompt embedding.
|
||||
prompt_tuning_init_text (`str`, *optional*):
|
||||
The text to initialize the prompt embedding. Only used if `prompt_tuning_init` is `TEXT`.
|
||||
tokenizer_name_or_path (`str`, *optional*):
|
||||
The name or path of the tokenizer. Only used if `prompt_tuning_init` is `TEXT`.
|
||||
"""
|
||||
|
||||
prompt_tuning_init: Union[PromptTuningInit, str] = field(
|
||||
default=PromptTuningInit.RANDOM,
|
||||
metadata={"help": "How to initialize the prompt tuning parameters"},
|
||||
)
|
||||
prompt_tuning_init_text: Optional[str] = field(
|
||||
default=None,
|
||||
metadata={
|
||||
"help": "The text to use for prompt tuning initialization. Only used if prompt_tuning_init is `TEXT`"
|
||||
},
|
||||
)
|
||||
tokenizer_name_or_path: Optional[str] = field(
|
||||
default=None,
|
||||
metadata={
|
||||
"help": "The tokenizer to use for prompt tuning initialization. Only used if prompt_tuning_init is `TEXT`"
|
||||
},
|
||||
)
|
||||
|
||||
def __post_init__(self):
|
||||
self.peft_type = PeftType.PROMPT_TUNING
|
||||
from .config import PromptTuningInit
|
||||
|
||||
|
||||
class PromptEmbedding(torch.nn.Module):
|
||||
@ -15,13 +15,17 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import re
|
||||
import warnings
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import Any, Union
|
||||
|
||||
from torch import nn
|
||||
|
||||
from peft.utils import COMMON_LAYERS_PATTERN
|
||||
|
||||
from ..config import PeftConfig
|
||||
from ..utils import _get_submodules
|
||||
from ..utils import ModulesToSaveWrapper, _get_submodules
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
@ -81,6 +85,8 @@ class BaseTuner(nn.Module, ABC):
|
||||
# user is adding a dict of PeftConfigs
|
||||
self.peft_config.update(peft_config)
|
||||
|
||||
self.active_adapter = adapter_name
|
||||
|
||||
# transformers models have a .config attribute, whose presence is assumed later on
|
||||
if not hasattr(self, "config"):
|
||||
self.config = {"model_type": "custom"}
|
||||
@ -90,6 +96,13 @@ class BaseTuner(nn.Module, ABC):
|
||||
# Copy the peft_config in the injected model.
|
||||
self.model.peft_config = self.peft_config
|
||||
|
||||
@property
|
||||
def active_adapters(self) -> list[str]:
|
||||
if isinstance(self.active_adapter, str):
|
||||
return [self.active_adapter]
|
||||
# is already a list of str
|
||||
return self.active_adapter
|
||||
|
||||
def forward(self, *args: Any, **kwargs: Any):
|
||||
return self.model.forward(*args, **kwargs)
|
||||
|
||||
@ -133,7 +146,7 @@ class BaseTuner(nn.Module, ABC):
|
||||
target: nn.Module,
|
||||
target_name: str,
|
||||
parent: nn.Module,
|
||||
**optionnal_kwargs: Any,
|
||||
**optional_kwargs: Any,
|
||||
) -> None:
|
||||
r"""
|
||||
Inplace replacement of the target module with the adapter layer. This method needs to be overriden by all the
|
||||
@ -152,7 +165,7 @@ class BaseTuner(nn.Module, ABC):
|
||||
The target module's name.
|
||||
parent (`nn.Module`):
|
||||
The parent module.
|
||||
**optionnal_kwargs (`dict`):
|
||||
**optional_kwargs (`dict`):
|
||||
The optional keyword arguments to pass to deal with particular cases (e.g. 8bit, 4bit quantization)
|
||||
"""
|
||||
...
|
||||
@ -198,6 +211,9 @@ class BaseTuner(nn.Module, ABC):
|
||||
is_target_modules_in_base_model = False
|
||||
key_list = [key for key, _ in model.named_modules()]
|
||||
|
||||
_check_for_modules_to_save = getattr(peft_config, "modules_to_save", None) is not None
|
||||
_has_modules_to_save = False
|
||||
|
||||
model_config = getattr(model, "config", {"model_type": "custom"})
|
||||
if hasattr(model_config, "to_dict"):
|
||||
model_config = model_config.to_dict()
|
||||
@ -205,18 +221,34 @@ class BaseTuner(nn.Module, ABC):
|
||||
peft_config = self._prepare_adapter_config(peft_config, model_config)
|
||||
|
||||
for key in key_list:
|
||||
# Check for modules_to_save in case
|
||||
if _check_for_modules_to_save and any(
|
||||
key.endswith(f"{module_to_save}") for module_to_save in peft_config.modules_to_save
|
||||
):
|
||||
# Optionally set the modules to save
|
||||
parent, target, target_name = _get_submodules(model, key)
|
||||
|
||||
if not isinstance(target, ModulesToSaveWrapper):
|
||||
new_module = ModulesToSaveWrapper(target, adapter_name)
|
||||
setattr(parent, target_name, new_module)
|
||||
else:
|
||||
target.update(adapter_name)
|
||||
|
||||
_has_modules_to_save = True
|
||||
continue
|
||||
|
||||
if not self._check_target_module_exists(peft_config, key):
|
||||
continue
|
||||
|
||||
is_target_modules_in_base_model = True
|
||||
parent, target, target_name = _get_submodules(model, key)
|
||||
|
||||
optionnal_kwargs = {
|
||||
optional_kwargs = {
|
||||
"loaded_in_8bit": getattr(model, "is_loaded_in_8bit", False),
|
||||
"loaded_in_4bit": getattr(model, "is_loaded_in_4bit", False),
|
||||
"current_key": key,
|
||||
}
|
||||
self._create_and_replace(peft_config, adapter_name, target, target_name, parent, **optionnal_kwargs)
|
||||
self._create_and_replace(peft_config, adapter_name, target, target_name, parent, **optional_kwargs)
|
||||
|
||||
if not is_target_modules_in_base_model:
|
||||
raise ValueError(
|
||||
@ -231,6 +263,12 @@ class BaseTuner(nn.Module, ABC):
|
||||
if adapter_name in n:
|
||||
p.requires_grad = False
|
||||
|
||||
if _has_modules_to_save:
|
||||
if not hasattr(model, "modules_to_save"):
|
||||
model.modules_to_save = set(peft_config.modules_to_save)
|
||||
else:
|
||||
model.modules_to_save.update(set(peft_config.modules_to_save))
|
||||
|
||||
def merge_adapter(self):
|
||||
"""
|
||||
This method merges the LoRa layers into the base model.
|
||||
@ -255,13 +293,190 @@ class BaseTunerLayer(ABC):
|
||||
Args:
|
||||
is_plugable (`bool`, *optional*):
|
||||
Whether the adapter layer can be plugged to any pytorch module
|
||||
active_adapter (`str`, *optional*):
|
||||
active_adapters (Union[List[`str`], `str`], *optional*):
|
||||
The name of the active adapter.
|
||||
"""
|
||||
active_adapter = None
|
||||
|
||||
def merge(self):
|
||||
# All names of layers that may contain adapter (trainable) weights
|
||||
adapter_layer_names: tuple[str] = ()
|
||||
# All names of other parameters that may contain adapter-related parameters
|
||||
other_param_names: tuple[str] = ()
|
||||
|
||||
# indicates whether all adapters should be disabled
|
||||
_disable_adapters: bool = False
|
||||
|
||||
# the currently active adapter(s)
|
||||
_active_adapter: str | list[str] = "default"
|
||||
|
||||
# List all merged adapters
|
||||
merged_adapters: list[str] = []
|
||||
|
||||
def merge(self, *args) -> None:
|
||||
raise NotImplementedError
|
||||
|
||||
def unmerge(self):
|
||||
def unmerge(self, *args) -> None:
|
||||
raise NotImplementedError
|
||||
|
||||
@property
|
||||
def merged(self) -> bool:
|
||||
return bool(self.merged_adapters)
|
||||
|
||||
@property
|
||||
def disable_adapters(self) -> bool:
|
||||
# use a property to ensure that disable_adapters is not set directly, instead use the enable_adapters method
|
||||
return self._disable_adapters
|
||||
|
||||
@property
|
||||
def active_adapter(self) -> str:
|
||||
# use a property to ensure that active_adapter is not set directly, instead use the set_adapter method
|
||||
return self._active_adapter
|
||||
|
||||
@property
|
||||
def active_adapters(self):
|
||||
if isinstance(self.active_adapter, str):
|
||||
return [self.active_adapter]
|
||||
# is already a list of str
|
||||
return self.active_adapter
|
||||
|
||||
def enable_adapters(self, enabled: bool):
|
||||
"""Toggle the enabling and disabling of adapters
|
||||
|
||||
Takes care of setting the requires_grad flag for the adapter weights.
|
||||
|
||||
Args:
|
||||
enabled (bool): True to enable adapters, False to disable adapters
|
||||
"""
|
||||
if enabled:
|
||||
self.set_adapter(self.active_adapters)
|
||||
self._disable_adapters = False
|
||||
else:
|
||||
# disable grads on all adapter layers
|
||||
for layer_name in self.adapter_layer_names:
|
||||
layer = getattr(self, layer_name)
|
||||
layer.requires_grad_(False)
|
||||
self._disable_adapters = True
|
||||
|
||||
def set_adapter(self, adapter_names: str | list[str]):
|
||||
"""Set the active adapter
|
||||
|
||||
Args:
|
||||
adapter_name (str): The name of the adapter to set as active
|
||||
"""
|
||||
if isinstance(adapter_names, str):
|
||||
adapter_names = [adapter_names]
|
||||
|
||||
# Deactivate grads on the inactive adapter and activate grads on the active adapter
|
||||
for layer_name in self.adapter_layer_names:
|
||||
module_dict = getattr(self, layer_name)
|
||||
for key, layer in module_dict.items():
|
||||
if key in adapter_names:
|
||||
# Note: It is possible that not a single layer is called with requires_grad_(True) here. This may
|
||||
# happen if a completely different adapter layer is being activated.
|
||||
layer.requires_grad_(True)
|
||||
else:
|
||||
layer.requires_grad_(False)
|
||||
|
||||
self._active_adapter = adapter_names
|
||||
|
||||
def _all_available_adapter_names(self) -> list[str]:
|
||||
"""Return a sorted list of all available adapter names"""
|
||||
adapter_names = set()
|
||||
for name in self.adapter_layer_names + self.other_param_names:
|
||||
# we check each possible attribute and if it's a dict or ModuleDict, we assume that the keys are the adapter
|
||||
# names
|
||||
attr = getattr(self, name)
|
||||
if hasattr(attr, "keys"):
|
||||
adapter_names.update(attr.keys())
|
||||
return sorted(adapter_names)
|
||||
|
||||
def delete_adapter(self, adapter_name: str) -> None:
|
||||
"""
|
||||
Delete an adapter from the layer
|
||||
|
||||
This should be called on all adapter layers, or else we will get an inconsistent state.
|
||||
|
||||
This method will also set a new active adapter if the deleted adapter was an active adapter. It is important
|
||||
that the new adapter is chosen in a deterministic way, so that the same adapter is chosen on all layers.
|
||||
|
||||
Args:
|
||||
adapter_name (`str`): The name of the adapter to delete
|
||||
|
||||
"""
|
||||
for attr in self.adapter_layer_names + self.other_param_names:
|
||||
if adapter_name in getattr(self, attr):
|
||||
del getattr(self, attr)[adapter_name]
|
||||
|
||||
if adapter_name in self.active_adapters:
|
||||
# choose a new active adapter
|
||||
active_adapters = self.active_adapters[:]
|
||||
active_adapters.remove(adapter_name)
|
||||
if active_adapters:
|
||||
self.set_adapter(active_adapters)
|
||||
else:
|
||||
# no active adapters left, set a new default adapter
|
||||
# here we get the list of all adapters existing adapter names and choose the first one
|
||||
remaining_adapters = self._all_available_adapter_names()
|
||||
if not remaining_adapters:
|
||||
self.set_adapter([])
|
||||
else:
|
||||
new_active_adapter = remaining_adapters[0]
|
||||
warnings.warn(
|
||||
f"Adapter {adapter_name} was active which is now deleted. Setting active adapter to "
|
||||
f"{new_active_adapter}."
|
||||
)
|
||||
self.set_adapter(remaining_adapters[0])
|
||||
|
||||
|
||||
def check_target_module_exists(config, key: str) -> bool | re.Match[str] | None:
|
||||
"""A helper method to check if the passed module's key name matches any of the target modules in the adapter_config.
|
||||
|
||||
Args:
|
||||
config (`LoraConfig` | `LycorisConfig`): A config to match target modules from
|
||||
key (`str`): A key to search any matches in config
|
||||
|
||||
Returns:
|
||||
`bool` | `re.Match[str]` | `None`: True of match object if key matches any target modules from config, False or
|
||||
None if no match found
|
||||
"""
|
||||
if isinstance(config.target_modules, str):
|
||||
target_module_found = re.fullmatch(config.target_modules, key)
|
||||
else:
|
||||
target_module_found = key in config.target_modules or any(
|
||||
key.endswith(f".{target_key}") for target_key in config.target_modules
|
||||
)
|
||||
is_using_layer_indexes = getattr(config, "layers_to_transform", None) is not None
|
||||
layer_indexing_pattern = getattr(config, "layers_pattern", None)
|
||||
|
||||
if is_using_layer_indexes and target_module_found:
|
||||
layers_pattern = COMMON_LAYERS_PATTERN if layer_indexing_pattern is None else layer_indexing_pattern
|
||||
layers_pattern = [layers_pattern] if isinstance(layers_pattern, str) else layers_pattern
|
||||
|
||||
for pattern in layers_pattern:
|
||||
layer_index = re.match(f".*.{pattern}\.(\d+)\.*", key)
|
||||
if layer_index is not None:
|
||||
layer_index = int(layer_index.group(1))
|
||||
if isinstance(config.layers_to_transform, int):
|
||||
target_module_found = layer_index == config.layers_to_transform
|
||||
else:
|
||||
target_module_found = layer_index in config.layers_to_transform
|
||||
|
||||
break
|
||||
else:
|
||||
target_module_found = False
|
||||
return target_module_found
|
||||
|
||||
|
||||
def inspect_matched_modules(tuner: BaseTuner, adapter_name: str = "default") -> dict:
|
||||
"""
|
||||
A helper function to inspect the set of matched and unmatched modules for a PEFT model and the given adapter.
|
||||
"""
|
||||
config = tuner.peft_config[adapter_name]
|
||||
key_list = [key for key, _ in tuner.model.named_modules()]
|
||||
module_dict = {"matched": [], "unmatched": []}
|
||||
for key in key_list:
|
||||
if tuner._check_target_module_exists(config, key):
|
||||
module_dict["matched"].append(key)
|
||||
else:
|
||||
module_dict["unmatched"].append(key)
|
||||
return module_dict
|
||||
|
||||
@ -29,9 +29,7 @@ from .other import (
|
||||
CONFIG_NAME,
|
||||
WEIGHTS_NAME,
|
||||
SAFETENSORS_WEIGHTS_NAME,
|
||||
CLAMP_QUANTILE,
|
||||
_set_trainable,
|
||||
add_library_to_model_card,
|
||||
bloom_model_postprocess_past_key_value,
|
||||
prepare_model_for_int8_training,
|
||||
prepare_model_for_kbit_training,
|
||||
@ -45,6 +43,9 @@ from .other import (
|
||||
_prepare_prompt_learning_config,
|
||||
_is_valid_match,
|
||||
infer_device,
|
||||
get_auto_gptq_quant_linear,
|
||||
get_quantization_config,
|
||||
id_tensor_storage,
|
||||
)
|
||||
from .hub_utils import hub_file_exists
|
||||
from .save_and_load import get_peft_model_state_dict, set_peft_model_state_dict, load_peft_weights
|
||||
|
||||
@ -14,14 +14,16 @@
|
||||
# limitations under the License.
|
||||
import copy
|
||||
import inspect
|
||||
import os
|
||||
import warnings
|
||||
from typing import Optional
|
||||
from typing import Optional, Tuple
|
||||
|
||||
import accelerate
|
||||
import torch
|
||||
from accelerate.hooks import add_hook_to_module, remove_hook_from_module
|
||||
from accelerate.utils import is_npu_available, is_xpu_available
|
||||
from safetensors.torch import storage_ptr, storage_size
|
||||
|
||||
from ..import_utils import is_auto_gptq_available, is_torch_tpu_available
|
||||
|
||||
|
||||
# Get current device name based on available devices
|
||||
@ -37,31 +39,6 @@ def infer_device():
|
||||
return torch_device
|
||||
|
||||
|
||||
# Add or edit model card to have `library_name: peft`
|
||||
def add_library_to_model_card(output_dir):
|
||||
if os.path.exists(os.path.join(output_dir, "README.md")):
|
||||
with open(os.path.join(output_dir, "README.md"), "r") as f:
|
||||
lines = f.readlines()
|
||||
# check if the first line is `---`
|
||||
if len(lines) > 0 and lines[0].startswith("---"):
|
||||
for i, line in enumerate(lines[1:]):
|
||||
# check if line starts with `library_name`, if yes, update it
|
||||
if line.startswith("library_name"):
|
||||
lines[i + 1] = "library_name: peft\n"
|
||||
break
|
||||
elif line.startswith("---"):
|
||||
# insert `library_name: peft` before the last `---`
|
||||
lines.insert(i + 1, "library_name: peft\n")
|
||||
break
|
||||
else:
|
||||
lines = ["---\n", "library_name: peft\n", "---\n"] + lines
|
||||
else:
|
||||
lines = ["---\n", "library_name: peft\n", "---\n"]
|
||||
# write the lines back to README.md
|
||||
with open(os.path.join(output_dir, "README.md"), "w") as f:
|
||||
f.writelines(lines)
|
||||
|
||||
|
||||
# needed for prefix-tuning of bloom model
|
||||
def bloom_model_postprocess_past_key_value(past_key_values):
|
||||
past_key_values = torch.cat(past_key_values)
|
||||
@ -76,41 +53,81 @@ def bloom_model_postprocess_past_key_value(past_key_values):
|
||||
return tuple(zip(keys, values))
|
||||
|
||||
|
||||
def prepare_model_for_kbit_training(model, use_gradient_checkpointing=True):
|
||||
# needed for prefix-tuning of StarCoder models
|
||||
def starcoder_model_postprocess_past_key_value(past_key_values):
|
||||
result = []
|
||||
for k in past_key_values:
|
||||
k = k[:, :, 0]
|
||||
k = k.permute([1, 2, 0, 3])
|
||||
k = k.reshape(*k.shape[:-2], -1)
|
||||
result.append(k)
|
||||
return tuple(result)
|
||||
|
||||
|
||||
def prepare_model_for_kbit_training(model, use_gradient_checkpointing=True, gradient_checkpointing_kwargs=None):
|
||||
r"""
|
||||
Note this method only works for `transformers` models.
|
||||
|
||||
This method wraps the entire protocol for preparing a model before running a training. This includes:
|
||||
1- Cast the layernorm in fp32 2- making output embedding layer require grads 3- Add the upcasting of the lm
|
||||
head to fp32
|
||||
|
||||
Args:
|
||||
model, (`transformers.PreTrainedModel`):
|
||||
model (`transformers.PreTrainedModel`):
|
||||
The loaded model from `transformers`
|
||||
use_gradient_checkpointing (`bool`, *optional*, defaults to `True`):
|
||||
If True, use gradient checkpointing to save memory at the expense of slower backward pass.
|
||||
gradient_checkpointing_kwargs (`dict`, *optional*, defaults to `None`):
|
||||
Keyword arguments to pass to the gradient checkpointing function, please refer to the documentation of
|
||||
`torch.utils.checkpoint.checkpoint` for more details about the arguments that you can pass to that method.
|
||||
Note this is only available in the latest transformers versions (> 4.34.1).
|
||||
"""
|
||||
loaded_in_kbit = getattr(model, "is_loaded_in_8bit", False) or getattr(model, "is_loaded_in_4bit", False)
|
||||
is_gptq_quantized = getattr(model, "quantization_method", None) == "gptq"
|
||||
if gradient_checkpointing_kwargs is None:
|
||||
gradient_checkpointing_kwargs = {}
|
||||
|
||||
for name, param in model.named_parameters():
|
||||
# freeze base model's layers
|
||||
param.requires_grad = False
|
||||
|
||||
# cast all non INT8 parameters to fp32
|
||||
for param in model.parameters():
|
||||
if (param.dtype == torch.float16) or (param.dtype == torch.bfloat16):
|
||||
param.data = param.data.to(torch.float32)
|
||||
if not is_gptq_quantized:
|
||||
# cast all non INT8 parameters to fp32
|
||||
for param in model.parameters():
|
||||
if (param.dtype == torch.float16) or (param.dtype == torch.bfloat16):
|
||||
param.data = param.data.to(torch.float32)
|
||||
|
||||
if loaded_in_kbit and use_gradient_checkpointing:
|
||||
# For backward compatibility
|
||||
if hasattr(model, "enable_input_require_grads"):
|
||||
model.enable_input_require_grads()
|
||||
else:
|
||||
if (loaded_in_kbit or is_gptq_quantized) and use_gradient_checkpointing:
|
||||
# When having `use_reentrant=False` + gradient_checkpointing, there is no need for this hack
|
||||
if "use_reentrant" not in gradient_checkpointing_kwargs or gradient_checkpointing_kwargs["use_reentrant"]:
|
||||
# For backward compatibility
|
||||
if hasattr(model, "enable_input_require_grads"):
|
||||
model.enable_input_require_grads()
|
||||
else:
|
||||
|
||||
def make_inputs_require_grad(module, input, output):
|
||||
output.requires_grad_(True)
|
||||
def make_inputs_require_grad(module, input, output):
|
||||
output.requires_grad_(True)
|
||||
|
||||
model.get_input_embeddings().register_forward_hook(make_inputs_require_grad)
|
||||
model.get_input_embeddings().register_forward_hook(make_inputs_require_grad)
|
||||
|
||||
# To support older transformers versions, check if the model supports gradient_checkpointing_kwargs
|
||||
_supports_gc_kwargs = "gradient_checkpointing_kwargs" in list(
|
||||
inspect.signature(model.gradient_checkpointing_enable).parameters
|
||||
)
|
||||
|
||||
if not _supports_gc_kwargs and len(gradient_checkpointing_kwargs) > 0:
|
||||
warnings.warn(
|
||||
"gradient_checkpointing_kwargs is not supported in this version of transformers. The passed kwargs will be ignored."
|
||||
" if you want to use that feature, please upgrade to the latest version of transformers.",
|
||||
FutureWarning,
|
||||
)
|
||||
|
||||
gc_enable_kwargs = (
|
||||
{} if not _supports_gc_kwargs else {"gradient_checkpointing_kwargs": gradient_checkpointing_kwargs}
|
||||
)
|
||||
|
||||
# enable gradient checkpointing for memory efficiency
|
||||
model.gradient_checkpointing_enable()
|
||||
|
||||
model.gradient_checkpointing_enable(**gc_enable_kwargs)
|
||||
return model
|
||||
|
||||
|
||||
@ -150,9 +167,19 @@ class ModulesToSaveWrapper(torch.nn.Module):
|
||||
super().__init__()
|
||||
self.original_module = module_to_save
|
||||
self.modules_to_save = torch.nn.ModuleDict({})
|
||||
self._active_adapter = adapter_name
|
||||
self._disable_adapters = False
|
||||
self.update(adapter_name)
|
||||
self.active_adapter = adapter_name
|
||||
self.disable_adapters = False
|
||||
|
||||
@property
|
||||
def disable_adapters(self) -> bool:
|
||||
# use a property to ensure that disable_adapters is not set directly, instead use the enable_adapters method
|
||||
return self._disable_adapters
|
||||
|
||||
@property
|
||||
def active_adapter(self) -> str:
|
||||
# use a property to ensure that active_adapter is not set directly, instead use the set_adapter method
|
||||
return self._active_adapter
|
||||
|
||||
def update(self, adapter_name):
|
||||
self.modules_to_save.update(torch.nn.ModuleDict({adapter_name: copy.deepcopy(self.original_module)}))
|
||||
@ -163,6 +190,10 @@ class ModulesToSaveWrapper(torch.nn.Module):
|
||||
remove_hook_from_module(self.modules_to_save[adapter_name])
|
||||
add_hook_to_module(self.modules_to_save[adapter_name], new_hook)
|
||||
|
||||
self.original_module.requires_grad_(False)
|
||||
if adapter_name == self.active_adapter:
|
||||
self.modules_to_save[adapter_name].requires_grad_(True)
|
||||
|
||||
def _create_new_hook(self, old_hook):
|
||||
r"""
|
||||
Creates a new hook based on the old hook. Use it only if you know what you are doing !
|
||||
@ -182,6 +213,40 @@ class ModulesToSaveWrapper(torch.nn.Module):
|
||||
return self.original_module(*args, **kwargs)
|
||||
return self.modules_to_save[self.active_adapter](*args, **kwargs)
|
||||
|
||||
def enable_adapters(self, enabled: bool):
|
||||
"""Toggle the enabling and disabling of adapters
|
||||
|
||||
Takes care of setting the requires_grad flag for the adapter weights.
|
||||
|
||||
Args:
|
||||
enabled (bool): True to enable adapters, False to disable adapters
|
||||
"""
|
||||
if self._disable_adapters is not enabled:
|
||||
# already in the desired state, do nothing
|
||||
return
|
||||
|
||||
if enabled:
|
||||
self.original_module.requires_grad_(False)
|
||||
self.modules_to_save[self.active_adapter].requires_grad_(True)
|
||||
self._disable_adapters = False
|
||||
else:
|
||||
self.original_module.requires_grad_(True)
|
||||
self.modules_to_save.requires_grad_(False)
|
||||
self._disable_adapters = True
|
||||
|
||||
def set_adapter(self, adapter_name: str):
|
||||
"""Set the active adapter
|
||||
|
||||
Args:
|
||||
adapter_name (str): The name of the adapter to set as active
|
||||
"""
|
||||
if adapter_name not in self.modules_to_save:
|
||||
raise ValueError(f"Adapter {adapter_name} not found in {self.modules_to_save.keys()}")
|
||||
|
||||
self.modules_to_save[self.active_adapter].requires_grad_(False)
|
||||
self.modules_to_save[adapter_name].requires_grad_(True)
|
||||
self._active_adapter = adapter_name
|
||||
|
||||
|
||||
def _get_submodules(model, key):
|
||||
parent = model.get_submodule(".".join(key.split(".")[:-1]))
|
||||
@ -204,16 +269,17 @@ def _set_trainable(model, adapter_name):
|
||||
parent, target, target_name = _get_submodules(model, key)
|
||||
if isinstance(target, ModulesToSaveWrapper):
|
||||
target.update(adapter_name)
|
||||
target.set_adapter(target.active_adapter)
|
||||
else:
|
||||
for param in target.parameters():
|
||||
param.requires_grad = True
|
||||
setattr(parent, target_name, ModulesToSaveWrapper(target, adapter_name))
|
||||
new_module = ModulesToSaveWrapper(target, adapter_name)
|
||||
new_module.set_adapter(adapter_name)
|
||||
setattr(parent, target_name, new_module)
|
||||
|
||||
|
||||
def _set_adapter(model, adapter_name):
|
||||
for module in model.modules():
|
||||
if isinstance(module, ModulesToSaveWrapper):
|
||||
module.active_adapter = adapter_name
|
||||
module.set_adapter(adapter_name)
|
||||
|
||||
|
||||
def _prepare_prompt_learning_config(peft_config, model_config):
|
||||
@ -294,7 +360,12 @@ def fsdp_auto_wrap_policy(model):
|
||||
|
||||
|
||||
def transpose(weight, fan_in_fan_out):
|
||||
return weight.T if fan_in_fan_out else weight
|
||||
if not fan_in_fan_out:
|
||||
return weight
|
||||
|
||||
if isinstance(weight, torch.nn.Parameter):
|
||||
return torch.nn.Parameter(weight.T)
|
||||
return weight.T
|
||||
|
||||
|
||||
def _is_valid_match(key: str, target_key: str):
|
||||
@ -325,6 +396,74 @@ def _get_batch_size(input_ids: Optional[torch.Tensor], inputs_embeds: Optional[t
|
||||
return batch_size
|
||||
|
||||
|
||||
def get_quantization_config(model: torch.nn.Module, method: str):
|
||||
"""
|
||||
Get the quantization config of the related quantization method
|
||||
"""
|
||||
if (
|
||||
hasattr(model, "config")
|
||||
and hasattr(model.config, "quantization_config")
|
||||
and (getattr(model, "quantization_method", None) == method)
|
||||
):
|
||||
return model.config.quantization_config
|
||||
return None
|
||||
|
||||
|
||||
def get_auto_gptq_quant_linear(gptq_quantization_config):
|
||||
"""
|
||||
Get the right AutoGPTQQuantLinear class based on the quantization config file
|
||||
"""
|
||||
if gptq_quantization_config is not None and is_auto_gptq_available():
|
||||
from auto_gptq.utils.import_utils import dynamically_import_QuantLinear
|
||||
|
||||
desc_act = gptq_quantization_config.desc_act
|
||||
group_size = gptq_quantization_config.group_size
|
||||
bits = gptq_quantization_config.bits
|
||||
if hasattr(gptq_quantization_config, "use_exllama"):
|
||||
use_exllama = gptq_quantization_config.use_exllama
|
||||
else:
|
||||
use_exllama = not gptq_quantization_config.disable_exllama
|
||||
if hasattr(gptq_quantization_config, "exllama_config"):
|
||||
exllama_version = gptq_quantization_config.exllama_config["version"]
|
||||
else:
|
||||
exllama_version = 1
|
||||
AutoGPTQQuantLinear = dynamically_import_QuantLinear(
|
||||
use_triton=False,
|
||||
desc_act=desc_act,
|
||||
group_size=group_size,
|
||||
bits=bits,
|
||||
disable_exllama=not (use_exllama and exllama_version == 1),
|
||||
disable_exllamav2=not (use_exllama and exllama_version == 2),
|
||||
)
|
||||
return AutoGPTQQuantLinear
|
||||
return None
|
||||
|
||||
|
||||
def id_tensor_storage(tensor: torch.Tensor) -> Tuple[torch.device, int, int]:
|
||||
"""
|
||||
Unique identifier to a tensor storage. Multiple different tensors can share the same underlying storage. For
|
||||
example, "meta" tensors all share the same storage, and thus their identifier will all be equal. This identifier is
|
||||
guaranteed to be unique and constant for this tensor's storage during its lifetime. Two tensor storages with
|
||||
non-overlapping lifetimes may have the same id.
|
||||
|
||||
This method is the exact same copy of
|
||||
https://github.com/huggingface/transformers/blob/main/src/transformers/pytorch_utils.py#L282C1-L300C58 but we added
|
||||
it here manually to avoid import issue with old versions of transformers.
|
||||
"""
|
||||
if tensor.device.type == "xla" and is_torch_tpu_available():
|
||||
# NOTE: xla tensors dont have storage
|
||||
# use some other unique id to distinguish.
|
||||
# this is a XLA tensor, it must be created using torch_xla's
|
||||
# device. So the following import is safe:
|
||||
import torch_xla
|
||||
|
||||
unique_id = torch_xla._XLAC._xla_get_tensor_id(tensor)
|
||||
else:
|
||||
unique_id = storage_ptr(tensor)
|
||||
|
||||
return tensor.device, unique_id, storage_size(tensor)
|
||||
|
||||
|
||||
TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING = {
|
||||
"t5": ["q", "v"],
|
||||
"mt5": ["q", "v"],
|
||||
@ -352,6 +491,8 @@ TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING = {
|
||||
"falcon": ["query_key_value"],
|
||||
"btlm": ["c_proj", "c_attn"],
|
||||
"codegen": ["qkv_proj"],
|
||||
"mistral": ["q_proj", "v_proj"],
|
||||
"stablelm": ["q_proj", "v_proj"],
|
||||
}
|
||||
|
||||
TRANSFORMERS_MODELS_TO_IA3_TARGET_MODULES_MAPPING = {
|
||||
@ -370,9 +511,9 @@ TRANSFORMERS_MODELS_TO_IA3_TARGET_MODULES_MAPPING = {
|
||||
"bert": ["key", "value", "output.dense"],
|
||||
"deberta-v2": ["key_proj", "value_proj", "output.dense"],
|
||||
"deberta": ["in_proj", "output.dense"],
|
||||
"RefinedWebModel": ["query_key_value"],
|
||||
"RefinedWeb": ["query_key_value"],
|
||||
"falcon": ["query_key_value"],
|
||||
"RefinedWebModel": ["query_key_value", "dense_4h_to_h"],
|
||||
"RefinedWeb": ["query_key_value", "dense_4h_to_h"],
|
||||
"falcon": ["query_key_value", "dense_4h_to_h"],
|
||||
}
|
||||
|
||||
TRANSFORMERS_MODELS_TO_IA3_FEEDFORWARD_MODULES_MAPPING = {
|
||||
@ -391,9 +532,9 @@ TRANSFORMERS_MODELS_TO_IA3_FEEDFORWARD_MODULES_MAPPING = {
|
||||
"bert": ["output.dense"],
|
||||
"deberta-v2": ["output.dense"],
|
||||
"deberta": ["output.dense"],
|
||||
"RefinedWeb": ["query_key_value"],
|
||||
"RefinedWebModel": ["query_key_value"],
|
||||
"falcon": ["query_key_value"],
|
||||
"RefinedWeb": ["dense_4h_to_h"],
|
||||
"RefinedWebModel": ["dense_4h_to_h"],
|
||||
"falcon": ["dense_4h_to_h"],
|
||||
}
|
||||
|
||||
COMMON_LAYERS_PATTERN = ["layers", "h", "block", "blocks", "layer"]
|
||||
@ -421,9 +562,9 @@ TRANSFORMERS_MODELS_TO_ADALORA_TARGET_MODULES_MAPPING = {
|
||||
|
||||
TRANSFORMERS_MODELS_TO_PREFIX_TUNING_POSTPROCESS_MAPPING = {
|
||||
"bloom": bloom_model_postprocess_past_key_value,
|
||||
"gpt_bigcode": starcoder_model_postprocess_past_key_value,
|
||||
}
|
||||
|
||||
WEIGHTS_NAME = "adapter_model.bin"
|
||||
SAFETENSORS_WEIGHTS_NAME = "adapter_model.safetensors"
|
||||
CONFIG_NAME = "adapter_config.json"
|
||||
CLAMP_QUANTILE = 0.99
|
||||
|
||||
@ -21,12 +21,15 @@ import enum
|
||||
|
||||
class PeftType(str, enum.Enum):
|
||||
PROMPT_TUNING = "PROMPT_TUNING"
|
||||
MULTITASK_PROMPT_TUNING = "MULTITASK_PROMPT_TUNING"
|
||||
P_TUNING = "P_TUNING"
|
||||
PREFIX_TUNING = "PREFIX_TUNING"
|
||||
LORA = "LORA"
|
||||
ADALORA = "ADALORA"
|
||||
ADAPTION_PROMPT = "ADAPTION_PROMPT"
|
||||
IA3 = "IA3"
|
||||
LOHA = "LOHA"
|
||||
LOKR = "LOKR"
|
||||
|
||||
|
||||
class TaskType(str, enum.Enum):
|
||||
|
||||
@ -25,17 +25,23 @@ from .other import SAFETENSORS_WEIGHTS_NAME, WEIGHTS_NAME, infer_device
|
||||
from .peft_types import PeftType
|
||||
|
||||
|
||||
def get_peft_model_state_dict(model, state_dict=None, adapter_name="default"):
|
||||
def get_peft_model_state_dict(model, state_dict=None, adapter_name="default", unwrap_compiled=False):
|
||||
"""
|
||||
Get the state dict of the Peft model.
|
||||
|
||||
Args:
|
||||
model ([`PeftModel`]): The Peft model. When using torch.nn.DistributedDataParallel, DeepSpeed or FSDP,
|
||||
the model should be the underlying model/unwrapped model (i.e. model.module).
|
||||
the model should be the underlying model/unwrapped model (i.e. model.module).
|
||||
state_dict (`dict`, *optional*, defaults to `None`):
|
||||
The state dict of the model. If not provided, the state dict of the model
|
||||
will be used.
|
||||
The state dict of the model. If not provided, the state dict of the passed model will be used.
|
||||
adapter_name (`str`, *optional*, defaults to `"default"`):
|
||||
The name of the adapter whose state dict should be returned.
|
||||
unwrap_compiled (`bool`, *optional*, defaults to `False`):
|
||||
Whether to unwrap the model if torch.compile was used.
|
||||
"""
|
||||
if unwrap_compiled:
|
||||
model = getattr(model, "_orig_mod", model)
|
||||
|
||||
config = model.peft_config[adapter_name]
|
||||
if state_dict is None:
|
||||
state_dict = model.state_dict()
|
||||
@ -66,14 +72,25 @@ def get_peft_model_state_dict(model, state_dict=None, adapter_name="default"):
|
||||
config.rank_pattern = rank_pattern
|
||||
to_return = model.resize_state_dict_by_rank_pattern(rank_pattern, to_return, adapter_name)
|
||||
|
||||
elif config.peft_type == PeftType.LOHA:
|
||||
to_return = {k: state_dict[k] for k in state_dict if "hada_" in k}
|
||||
|
||||
elif config.peft_type == PeftType.LOKR:
|
||||
to_return = {k: state_dict[k] for k in state_dict if "lokr_" in k}
|
||||
|
||||
elif config.peft_type == PeftType.ADAPTION_PROMPT:
|
||||
to_return = {k: state_dict[k] for k in state_dict if k.split(".")[-1].startswith("adaption_")}
|
||||
elif config.is_prompt_learning:
|
||||
to_return = {}
|
||||
if config.inference_mode:
|
||||
if config.peft_type == PeftType.MULTITASK_PROMPT_TUNING:
|
||||
to_return["prefix_task_cols"] = model.prompt_encoder[adapter_name].prefix_task_cols
|
||||
to_return["prefix_task_rows"] = model.prompt_encoder[adapter_name].prefix_task_rows
|
||||
prompt_embeddings = model.prompt_encoder[adapter_name].embedding.weight
|
||||
else:
|
||||
prompt_embeddings = model.get_prompt_embedding_to_save(adapter_name)
|
||||
if config.inference_mode:
|
||||
prompt_embeddings = model.prompt_encoder[adapter_name].embedding.weight
|
||||
else:
|
||||
prompt_embeddings = model.get_prompt_embedding_to_save(adapter_name)
|
||||
to_return["prompt_embeddings"] = prompt_embeddings
|
||||
elif config.peft_type == PeftType.IA3:
|
||||
to_return = {k: state_dict[k] for k in state_dict if "ia3_" in k}
|
||||
@ -109,9 +126,15 @@ def set_peft_model_state_dict(model, peft_model_state_dict, adapter_name="defaul
|
||||
else:
|
||||
state_dict = peft_model_state_dict
|
||||
|
||||
if config.peft_type in (PeftType.LORA, PeftType.ADALORA, PeftType.IA3):
|
||||
if config.peft_type in (PeftType.LORA, PeftType.LOHA, PeftType.LOKR, PeftType.ADALORA, PeftType.IA3):
|
||||
peft_model_state_dict = {}
|
||||
parameter_prefix = "ia3_" if config.peft_type == PeftType.IA3 else "lora_"
|
||||
parameter_prefix = {
|
||||
PeftType.IA3: "ia3_",
|
||||
PeftType.LORA: "lora_",
|
||||
PeftType.ADALORA: "lora_",
|
||||
PeftType.LOHA: "hada_",
|
||||
PeftType.LOKR: "lokr_",
|
||||
}[config.peft_type]
|
||||
for k, v in state_dict.items():
|
||||
if parameter_prefix in k:
|
||||
suffix = k.split(parameter_prefix)[1]
|
||||
@ -137,6 +160,9 @@ def set_peft_model_state_dict(model, peft_model_state_dict, adapter_name="defaul
|
||||
model.prompt_encoder[adapter_name].embedding.load_state_dict(
|
||||
{"weight": peft_model_state_dict["prompt_embeddings"]}, strict=True
|
||||
)
|
||||
|
||||
if config.peft_type == PeftType.MULTITASK_PROMPT_TUNING:
|
||||
model.prompt_encoder[adapter_name].load_state_dict(peft_model_state_dict, strict=False)
|
||||
return load_result
|
||||
|
||||
|
||||
|
||||
@ -0,0 +1,19 @@
|
||||
import os
|
||||
|
||||
|
||||
if os.environ.get("PEFT_DEBUG_WITH_TORCH_COMPILE") == "1":
|
||||
# This is a hack purely for debugging purposes. If the environment variable PEFT_DEBUG_WITH_TORCH_COMPILE is set to
|
||||
# 1, get_peft_model() will return a compiled model. This way, all unit tests that use peft.get_peft_model() will
|
||||
# use a compiled model. See .github/workflows/torch_compile_tests.yml.
|
||||
import torch
|
||||
|
||||
import peft
|
||||
from peft.mapping import get_peft_model as get_peft_model_original
|
||||
|
||||
def get_peft_model_new(*args, **kwargs):
|
||||
"""Make get_peft_model() return a compiled model."""
|
||||
peft_model = get_peft_model_original(*args, **kwargs)
|
||||
peft_model = torch.compile(peft_model)
|
||||
return peft_model
|
||||
|
||||
peft.get_peft_model = get_peft_model_new
|
||||
|
||||
@ -53,7 +53,7 @@ class AdaptionPromptTester(TestCase, PeftCommonTester):
|
||||
"""
|
||||
|
||||
def setUp(self):
|
||||
"""Check that llama is available in transformers package before running each test."""
|
||||
# Check that llama is available in transformers package before running each test.
|
||||
if not is_llama_available():
|
||||
self.skipTest("Llama not available in transformers. Skipping test.")
|
||||
|
||||
|
||||
@ -13,31 +13,46 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import gc
|
||||
import tempfile
|
||||
import unittest
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
import torch.nn.functional as F
|
||||
from transformers import (
|
||||
AutoModelForCausalLM,
|
||||
AutoModelForSeq2SeqLM,
|
||||
AutoModelForSequenceClassification,
|
||||
AutoModelForTokenClassification,
|
||||
AutoTokenizer,
|
||||
BitsAndBytesConfig,
|
||||
LlamaForCausalLM,
|
||||
WhisperForConditionalGeneration,
|
||||
)
|
||||
|
||||
from peft import AdaptionPromptConfig, LoraConfig, PeftModel, get_peft_model, prepare_model_for_kbit_training
|
||||
from peft import (
|
||||
AdaptionPromptConfig,
|
||||
IA3Config,
|
||||
LoraConfig,
|
||||
PeftModel,
|
||||
TaskType,
|
||||
get_peft_model,
|
||||
prepare_model_for_kbit_training,
|
||||
)
|
||||
from peft.import_utils import is_bnb_4bit_available, is_bnb_available
|
||||
|
||||
from .testing_utils import require_bitsandbytes, require_torch_gpu, require_torch_multi_gpu
|
||||
|
||||
|
||||
if is_bnb_available():
|
||||
from peft.tuners.lora import Linear8bitLt
|
||||
import bitsandbytes as bnb
|
||||
|
||||
from peft.tuners.ia3 import Linear8bitLt as IA3Linear8bitLt
|
||||
from peft.tuners.lora import Linear8bitLt as LoraLinear8bitLt
|
||||
|
||||
if is_bnb_4bit_available():
|
||||
from peft.tuners.lora import Linear4bit
|
||||
from peft.tuners.ia3 import Linear4bit as IA3Linear4bit
|
||||
from peft.tuners.lora import Linear4bit as LoraLinear4bit
|
||||
|
||||
|
||||
@require_torch_gpu
|
||||
@ -104,14 +119,68 @@ class PeftGPUCommonTests(unittest.TestCase):
|
||||
config = LoraConfig(r=32, lora_alpha=64, target_modules=["q_proj", "v_proj"], lora_dropout=0.05, bias="none")
|
||||
|
||||
flan_8bit = get_peft_model(flan_8bit, flan_lora_config)
|
||||
self.assertTrue(isinstance(flan_8bit.base_model.model.encoder.block[0].layer[0].SelfAttention.q, Linear8bitLt))
|
||||
self.assertTrue(
|
||||
isinstance(flan_8bit.base_model.model.encoder.block[0].layer[0].SelfAttention.q, LoraLinear8bitLt)
|
||||
)
|
||||
|
||||
opt_8bit = get_peft_model(opt_8bit, opt_lora_config)
|
||||
self.assertTrue(isinstance(opt_8bit.base_model.model.model.decoder.layers[0].self_attn.v_proj, Linear8bitLt))
|
||||
self.assertTrue(
|
||||
isinstance(opt_8bit.base_model.model.model.decoder.layers[0].self_attn.v_proj, LoraLinear8bitLt)
|
||||
)
|
||||
|
||||
whisper_8bit = get_peft_model(whisper_8bit, config)
|
||||
self.assertTrue(
|
||||
isinstance(whisper_8bit.base_model.model.model.decoder.layers[0].self_attn.v_proj, Linear8bitLt)
|
||||
isinstance(whisper_8bit.base_model.model.model.decoder.layers[0].self_attn.v_proj, LoraLinear8bitLt)
|
||||
)
|
||||
|
||||
@require_bitsandbytes
|
||||
@pytest.mark.multi_gpu_tests
|
||||
@pytest.mark.single_gpu_tests
|
||||
def test_ia3_bnb_8bit_quantization(self):
|
||||
r"""
|
||||
Test that tests if the 8bit quantization using IA3 works as expected
|
||||
"""
|
||||
whisper_8bit = WhisperForConditionalGeneration.from_pretrained(
|
||||
self.audio_model_id,
|
||||
device_map="auto",
|
||||
load_in_8bit=True,
|
||||
)
|
||||
|
||||
opt_8bit = AutoModelForCausalLM.from_pretrained(
|
||||
self.causal_lm_model_id,
|
||||
device_map="auto",
|
||||
load_in_8bit=True,
|
||||
)
|
||||
|
||||
flan_8bit = AutoModelForSeq2SeqLM.from_pretrained(
|
||||
self.seq2seq_model_id,
|
||||
device_map="auto",
|
||||
load_in_8bit=True,
|
||||
)
|
||||
|
||||
flan_ia3_config = IA3Config(target_modules=["q", "v"], task_type="SEQ_2_SEQ_LM")
|
||||
|
||||
opt_ia3_config = IA3Config(
|
||||
target_modules=["q_proj", "v_proj", "fc2"],
|
||||
feedforward_modules=["fc2"],
|
||||
task_type="CAUSAL_LM",
|
||||
)
|
||||
|
||||
config = IA3Config(target_modules=["q_proj", "v_proj", "fc2"], feedforward_modules=["fc2"])
|
||||
|
||||
flan_8bit = get_peft_model(flan_8bit, flan_ia3_config)
|
||||
self.assertTrue(
|
||||
isinstance(flan_8bit.base_model.model.encoder.block[0].layer[0].SelfAttention.q, IA3Linear8bitLt)
|
||||
)
|
||||
|
||||
opt_8bit = get_peft_model(opt_8bit, opt_ia3_config)
|
||||
self.assertTrue(
|
||||
isinstance(opt_8bit.base_model.model.model.decoder.layers[0].self_attn.v_proj, IA3Linear8bitLt)
|
||||
)
|
||||
|
||||
whisper_8bit = get_peft_model(whisper_8bit, config)
|
||||
self.assertTrue(
|
||||
isinstance(whisper_8bit.base_model.model.model.decoder.layers[0].self_attn.v_proj, IA3Linear8bitLt)
|
||||
)
|
||||
|
||||
@require_bitsandbytes
@@ -170,13 +239,65 @@ class PeftGPUCommonTests(unittest.TestCase):
config = LoraConfig(r=32, lora_alpha=64, target_modules=["q_proj", "v_proj"], lora_dropout=0.05, bias="none")

flan_4bit = get_peft_model(flan_4bit, flan_lora_config)
self.assertTrue(isinstance(flan_4bit.base_model.model.encoder.block[0].layer[0].SelfAttention.q, Linear4bit))
self.assertTrue(
isinstance(flan_4bit.base_model.model.encoder.block[0].layer[0].SelfAttention.q, LoraLinear4bit)
)

opt_4bit = get_peft_model(opt_4bit, opt_lora_config)
self.assertTrue(isinstance(opt_4bit.base_model.model.model.decoder.layers[0].self_attn.v_proj, Linear4bit))
self.assertTrue(isinstance(opt_4bit.base_model.model.model.decoder.layers[0].self_attn.v_proj, LoraLinear4bit))

whisper_4bit = get_peft_model(whisper_4bit, config)
self.assertTrue(isinstance(whisper_4bit.base_model.model.model.decoder.layers[0].self_attn.v_proj, Linear4bit))
self.assertTrue(
isinstance(whisper_4bit.base_model.model.model.decoder.layers[0].self_attn.v_proj, LoraLinear4bit)
)

@require_bitsandbytes
@pytest.mark.multi_gpu_tests
@pytest.mark.single_gpu_tests
def test_ia3_bnb_4bit_quantization(self):
r"""
Test that tests if the 4bit quantization using IA3 works as expected
"""
whisper_4bit = WhisperForConditionalGeneration.from_pretrained(
self.audio_model_id,
device_map="auto",
load_in_4bit=True,
)

opt_4bit = AutoModelForCausalLM.from_pretrained(
self.causal_lm_model_id,
device_map="auto",
load_in_4bit=True,
)

flan_4bit = AutoModelForSeq2SeqLM.from_pretrained(
self.seq2seq_model_id,
device_map="auto",
load_in_4bit=True,
)

flan_ia3_config = IA3Config(target_modules=["q", "v"], task_type="SEQ_2_SEQ_LM")

opt_ia3_config = IA3Config(
target_modules=["q_proj", "v_proj", "fc2"],
feedforward_modules=["fc2"],
task_type="CAUSAL_LM",
)

config = IA3Config(target_modules=["q_proj", "v_proj", "fc2"], feedforward_modules=["fc2"])

flan_4bit = get_peft_model(flan_4bit, flan_ia3_config)
self.assertTrue(
isinstance(flan_4bit.base_model.model.encoder.block[0].layer[0].SelfAttention.q, IA3Linear4bit)
)

opt_4bit = get_peft_model(opt_4bit, opt_ia3_config)
self.assertTrue(isinstance(opt_4bit.base_model.model.model.decoder.layers[0].self_attn.v_proj, IA3Linear4bit))

whisper_4bit = get_peft_model(whisper_4bit, config)
self.assertTrue(
isinstance(whisper_4bit.base_model.model.model.decoder.layers[0].self_attn.v_proj, IA3Linear4bit)
)

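The 4-bit variant follows the same pattern but goes through `BitsAndBytesConfig`. Below is a hedged sketch rather than code from this diff; `facebook/opt-125m` is a stand-in and the config values are illustrative (assumes bitsandbytes and a CUDA device).

```python
# Illustrative sketch only: loading in 4 bit via BitsAndBytesConfig, then attaching LoRA.
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=False,
    bnb_4bit_compute_dtype=torch.float32,
)
model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m", quantization_config=bnb_config)
model = get_peft_model(model, LoraConfig(r=8, target_modules=["q_proj", "v_proj"], task_type="CAUSAL_LM"))
```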
@pytest.mark.multi_gpu_tests
@require_torch_multi_gpu
@@ -225,7 +346,7 @@ class PeftGPUCommonTests(unittest.TestCase):

model = get_peft_model(model, lora_config)
self.assertTrue(isinstance(model, PeftModel))
self.assertTrue(isinstance(model.base_model.model.encoder.block[0].layer[0].SelfAttention.q, Linear8bitLt))
self.assertTrue(isinstance(model.base_model.model.encoder.block[0].layer[0].SelfAttention.q, LoraLinear8bitLt))

dummy_input = "This is a dummy input:"
input_ids = tokenizer(dummy_input, return_tensors="pt").input_ids.to(self.device)
@@ -356,3 +477,173 @@ class PeftGPUCommonTests(unittest.TestCase):
self.assertTrue(modules_to_save.weight.requires_grad is True)
self.assertTrue(original_module.weight.grad is None)
self.assertTrue(modules_to_save.weight.grad is not None)

@require_torch_gpu
@pytest.mark.single_gpu_tests
@require_bitsandbytes
def test_8bit_merge_lora(self):
torch.manual_seed(1000)
model = AutoModelForCausalLM.from_pretrained(
"facebook/opt-125m",
load_in_8bit=True,
)
random_input = torch.LongTensor([[1, 0, 1, 0, 1, 0]]).to(model.device)
out_base = F.softmax(model(random_input).logits, dim=-1)

config = LoraConfig(
r=8,
init_lora_weights=False,
)
model = get_peft_model(model, config)

with torch.inference_mode():
out_before_merge = F.softmax(model(random_input).logits, dim=-1)

model.merge_and_unload()
with torch.inference_mode():
out_after_merge = F.softmax(model(random_input).logits, dim=-1)

atol = 0.01
rtol = 10
self.assertFalse(torch.allclose(out_base, out_before_merge, atol=atol, rtol=rtol))
self.assertTrue(torch.allclose(out_before_merge, out_after_merge, atol=atol, rtol=rtol))
self.assertTrue(isinstance(model, PeftModel))
self.assertTrue(
isinstance(model.base_model.model.model.decoder.layers[0].self_attn.q_proj, bnb.nn.Linear8bitLt)
)
self.assertTrue(
isinstance(model.base_model.model.model.decoder.layers[0].self_attn.v_proj, bnb.nn.Linear8bitLt)
)

@require_torch_gpu
@pytest.mark.single_gpu_tests
@require_bitsandbytes
def test_8bit_merge_and_disable_lora(self):
torch.manual_seed(1000)
model = AutoModelForCausalLM.from_pretrained(
"facebook/opt-125m",
load_in_8bit=True,
)
random_input = torch.LongTensor([[1, 0, 1, 0, 1, 0]]).to(model.device)
# compare outputs in probability space, because logits can have outliers
# and token ids are not precise enough
out_base = F.softmax(model(random_input).logits, dim=-1)

config = LoraConfig(
r=8,
init_lora_weights=False,
)
model = get_peft_model(model, config)

with torch.inference_mode():
out_before = F.softmax(model(random_input).logits, dim=-1)

model.merge_adapter()
with model.disable_adapter():
with torch.inference_mode():
out_after = F.softmax(model(random_input).logits, dim=-1)

atol = 0.01
rtol = 10
self.assertFalse(torch.allclose(out_base, out_before, atol=atol, rtol=rtol))
self.assertTrue(torch.allclose(out_base, out_after, atol=atol, rtol=rtol))
self.assertTrue(isinstance(model, PeftModel))
self.assertTrue(isinstance(model.base_model.model.model.decoder.layers[0].self_attn.q_proj, LoraLinear8bitLt))
self.assertTrue(isinstance(model.base_model.model.model.decoder.layers[0].self_attn.v_proj, LoraLinear8bitLt))

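The merge tests above all follow one pattern: compare softmax probabilities of the base model, the adapted model, and the merged model, with generous tolerances because quantization adds noise. A self-contained sketch of the merge-then-disable flow (assumes bitsandbytes and a CUDA device; `unmerge_adapter()` is not called in the test and is shown here only to illustrate restoring the unmerged state):

```python
# Self-contained sketch, not code from the diff.
import torch
import torch.nn.functional as F
from transformers import AutoModelForCausalLM
from peft import LoraConfig, get_peft_model

model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m", load_in_8bit=True)
model = get_peft_model(model, LoraConfig(r=8, init_lora_weights=False))
batch = torch.LongTensor([[1, 0, 1, 0, 1, 0]]).to(model.device)

model.merge_adapter()                 # fold the LoRA deltas into the base weights
with model.disable_adapter():         # temporarily behave like the base model again
    with torch.inference_mode():
        probs = F.softmax(model(batch).logits, dim=-1)
model.unmerge_adapter()
```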
@require_torch_gpu
@pytest.mark.single_gpu_tests
@require_bitsandbytes
def test_4bit_merge_lora(self):
torch.manual_seed(3000)
bnb_config = BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_use_double_quant=False,
bnb_4bit_compute_type=torch.float32,
)
model = AutoModelForCausalLM.from_pretrained(
"facebook/opt-125m",
quantization_config=bnb_config,
torch_dtype=torch.float32,
)
random_input = torch.LongTensor([[1, 0, 1, 0, 1, 0]]).to(model.device)
# compare outputs in probability space, because logits can have outliers
# and token ids are not precise enough
out_base = F.softmax(model(random_input).logits, dim=-1)

config = LoraConfig(
r=8,
init_lora_weights=False,
)
model = get_peft_model(model, config)

with torch.inference_mode():
out_before_merge = F.softmax(model(random_input).logits, dim=-1)

model.merge_and_unload()
with torch.inference_mode():
out_after_merge = F.softmax(model(random_input).logits, dim=-1)

# tolerances are pretty high because some deviations are expected with quantization
atol = 0.01
rtol = 10
self.assertFalse(torch.allclose(out_base, out_before_merge, atol=atol, rtol=rtol))
self.assertTrue(torch.allclose(out_before_merge, out_after_merge, atol=atol, rtol=rtol))
self.assertTrue(isinstance(model, PeftModel))
self.assertTrue(isinstance(model.base_model.model.model.decoder.layers[0].self_attn.q_proj, bnb.nn.Linear4bit))
self.assertTrue(isinstance(model.base_model.model.model.decoder.layers[0].self_attn.v_proj, bnb.nn.Linear4bit))

@require_torch_gpu
@pytest.mark.single_gpu_tests
@require_bitsandbytes
def test_4bit_merge_and_disable_lora(self):
torch.manual_seed(3000)
bnb_config = BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_use_double_quant=False,
bnb_4bit_compute_type=torch.float32,
)
model = AutoModelForCausalLM.from_pretrained(
"facebook/opt-125m",
quantization_config=bnb_config,
torch_dtype=torch.float32,
)
random_input = torch.LongTensor([[1, 0, 1, 0, 1, 0]]).to(model.device)
# compare outputs in probability space, because logits can have outliers
# and token ids are not precise enough
out_base = F.softmax(model(random_input).logits, dim=-1)

config = LoraConfig(
r=8,
init_lora_weights=False,
)
model = get_peft_model(model, config)

with torch.inference_mode():
out_before = F.softmax(model(random_input).logits, dim=-1)

model.merge_adapter()
with model.disable_adapter():
with torch.inference_mode():
out_after = F.softmax(model(random_input).logits, dim=-1)

atol = 0.01
rtol = 10
self.assertFalse(torch.allclose(out_base, out_before, atol=atol, rtol=rtol))
self.assertTrue(torch.allclose(out_base, out_after, atol=atol, rtol=rtol))
self.assertTrue(isinstance(model, PeftModel))
self.assertTrue(isinstance(model.base_model.model.model.decoder.layers[0].self_attn.q_proj, LoraLinear4bit))
self.assertTrue(isinstance(model.base_model.model.model.decoder.layers[0].self_attn.v_proj, LoraLinear4bit))

@require_torch_gpu
@pytest.mark.single_gpu_tests
def test_serialization_shared_tensors(self):
model_checkpoint = "roberta-base"
peft_config = LoraConfig(
task_type=TaskType.TOKEN_CLS, inference_mode=False, r=16, lora_alpha=16, lora_dropout=0.1, bias="all"
)
model = AutoModelForTokenClassification.from_pretrained(model_checkpoint, num_labels=11).to("cuda")
model = get_peft_model(model, peft_config)

with tempfile.TemporaryDirectory() as tmp_dir:
model.save_pretrained(tmp_dir, safe_serialization=True)

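The serialization test above guards against a regression where saving with safetensors failed on models whose state dict contains shared tensor storage. A CPU-only sketch of the same check (assumes the safetensors package is installed; the tiny Roberta checkpoint is the one used by `test_save_shared_tensors` further down in this diff):

```python
# Illustrative sketch only; the assertion is simply that save_pretrained does not raise.
import tempfile
from transformers import AutoModelForTokenClassification
from peft import LoraConfig, TaskType, get_peft_model

model = AutoModelForTokenClassification.from_pretrained(
    "hf-internal-testing/tiny-random-RobertaModel", num_labels=11
)
model = get_peft_model(
    model, LoraConfig(task_type=TaskType.TOKEN_CLS, r=16, lora_alpha=16, bias="all")
)
with tempfile.TemporaryDirectory() as tmp_dir:
    model.save_pretrained(tmp_dir, safe_serialization=True)
```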
@@ -20,11 +20,16 @@ import unittest
import warnings

import pytest
from parameterized import parameterized

from peft import (
AdaLoraConfig,
# TODO: uncomment once PEFT works again with transformers
AdaptionPromptConfig,
IA3Config,
LoHaConfig,
LoraConfig,
MultitaskPromptTuningConfig,
PeftConfig,
PrefixTuningConfig,
PromptEncoder,
@@ -35,20 +40,23 @@ from peft import (

PEFT_MODELS_TO_TEST = [("lewtun/tiny-random-OPTForCausalLM-delta", "v1")]


class PeftConfigTestMixin:
all_config_classes = (
LoraConfig,
PromptEncoderConfig,
PrefixTuningConfig,
PromptTuningConfig,
AdaptionPromptConfig,
IA3Config,
)
ALL_CONFIG_CLASSES = (
# TODO: uncomment once PEFT works again with transformers
AdaptionPromptConfig,
AdaLoraConfig,
IA3Config,
LoHaConfig,
LoraConfig,
MultitaskPromptTuningConfig,
PrefixTuningConfig,
PromptEncoderConfig,
PromptTuningConfig,
)


class PeftConfigTester(unittest.TestCase, PeftConfigTestMixin):
def test_methods(self):
class PeftConfigTester(unittest.TestCase):
@parameterized.expand(ALL_CONFIG_CLASSES)
def test_methods(self, config_class):
r"""
Test if all configs have the expected methods. Here we test
- to_dict
@@ -57,109 +65,107 @@ class PeftConfigTester(unittest.TestCase, PeftConfigTestMixin):
- from_json_file
"""
# test if all configs have the expected methods
for config_class in self.all_config_classes:
config = config_class()
self.assertTrue(hasattr(config, "to_dict"))
self.assertTrue(hasattr(config, "save_pretrained"))
self.assertTrue(hasattr(config, "from_pretrained"))
self.assertTrue(hasattr(config, "from_json_file"))
config = config_class()
self.assertTrue(hasattr(config, "to_dict"))
self.assertTrue(hasattr(config, "save_pretrained"))
self.assertTrue(hasattr(config, "from_pretrained"))
self.assertTrue(hasattr(config, "from_json_file"))

def test_task_type(self):
for config_class in self.all_config_classes:
# assert this will not fail
_ = config_class(task_type="test")
@parameterized.expand(ALL_CONFIG_CLASSES)
def test_task_type(self, config_class):
config_class(task_type="test")

def test_from_pretrained(self):
@parameterized.expand(ALL_CONFIG_CLASSES)
def test_from_pretrained(self, config_class):
r"""
Test if the config is correctly loaded using:
- from_pretrained
"""
for config_class in self.all_config_classes:
for model_name, revision in PEFT_MODELS_TO_TEST:
# Test we can load config from delta
_ = config_class.from_pretrained(model_name, revision=revision)
for model_name, revision in PEFT_MODELS_TO_TEST:
# Test we can load config from delta
config_class.from_pretrained(model_name, revision=revision)

def test_save_pretrained(self):
@parameterized.expand(ALL_CONFIG_CLASSES)
def test_save_pretrained(self, config_class):
r"""
Test if the config is correctly saved and loaded using
- save_pretrained
"""
for config_class in self.all_config_classes:
config = config_class()
with tempfile.TemporaryDirectory() as tmp_dirname:
config.save_pretrained(tmp_dirname)
config = config_class()
with tempfile.TemporaryDirectory() as tmp_dirname:
config.save_pretrained(tmp_dirname)

config_from_pretrained = config_class.from_pretrained(tmp_dirname)
self.assertEqual(config.to_dict(), config_from_pretrained.to_dict())
config_from_pretrained = config_class.from_pretrained(tmp_dirname)
self.assertEqual(config.to_dict(), config_from_pretrained.to_dict())

def test_from_json_file(self):
for config_class in self.all_config_classes:
config = config_class()
with tempfile.TemporaryDirectory() as tmp_dirname:
config.save_pretrained(tmp_dirname)
@parameterized.expand(ALL_CONFIG_CLASSES)
def test_from_json_file(self, config_class):
config = config_class()
with tempfile.TemporaryDirectory() as tmp_dirname:
config.save_pretrained(tmp_dirname)

config_from_json = config_class.from_json_file(os.path.join(tmp_dirname, "adapter_config.json"))
self.assertEqual(config.to_dict(), config_from_json)
config_from_json = config_class.from_json_file(os.path.join(tmp_dirname, "adapter_config.json"))
self.assertEqual(config.to_dict(), config_from_json)

def test_to_dict(self):
@parameterized.expand(ALL_CONFIG_CLASSES)
def test_to_dict(self, config_class):
r"""
Test if the config can be correctly converted to a dict using:
- to_dict
"""
for config_class in self.all_config_classes:
config = config_class()
self.assertTrue(isinstance(config.to_dict(), dict))
config = config_class()
self.assertTrue(isinstance(config.to_dict(), dict))

def test_from_pretrained_cache_dir(self):
@parameterized.expand(ALL_CONFIG_CLASSES)
def test_from_pretrained_cache_dir(self, config_class):
r"""
Test if the config is correctly loaded with extra kwargs
"""
with tempfile.TemporaryDirectory() as tmp_dirname:
for config_class in self.all_config_classes:
for model_name, revision in PEFT_MODELS_TO_TEST:
# Test we can load config from delta
_ = config_class.from_pretrained(model_name, revision=revision, cache_dir=tmp_dirname)
for model_name, revision in PEFT_MODELS_TO_TEST:
# Test we can load config from delta
config_class.from_pretrained(model_name, revision=revision, cache_dir=tmp_dirname)

def test_from_pretrained_cache_dir_remote(self):
r"""
Test if the config is correctly loaded with a checkpoint from the hub
"""
with tempfile.TemporaryDirectory() as tmp_dirname:
_ = PeftConfig.from_pretrained("ybelkada/test-st-lora", cache_dir=tmp_dirname)
PeftConfig.from_pretrained("ybelkada/test-st-lora", cache_dir=tmp_dirname)
self.assertTrue("models--ybelkada--test-st-lora" in os.listdir(tmp_dirname))

def test_set_attributes(self):
@parameterized.expand(ALL_CONFIG_CLASSES)
def test_set_attributes(self, config_class):
# manually set attributes and check if they are correctly written
for config_class in self.all_config_classes:
config = config_class(peft_type="test")
config = config_class(peft_type="test")

# save pretrained
with tempfile.TemporaryDirectory() as tmp_dirname:
config.save_pretrained(tmp_dirname)
# save pretrained
with tempfile.TemporaryDirectory() as tmp_dirname:
config.save_pretrained(tmp_dirname)

config_from_pretrained = config_class.from_pretrained(tmp_dirname)
self.assertEqual(config.to_dict(), config_from_pretrained.to_dict())
config_from_pretrained = config_class.from_pretrained(tmp_dirname)
self.assertEqual(config.to_dict(), config_from_pretrained.to_dict())

def test_config_copy(self):
@parameterized.expand(ALL_CONFIG_CLASSES)
def test_config_copy(self, config_class):
# see https://github.com/huggingface/peft/issues/424
for config_class in self.all_config_classes:
config = config_class()
copied = copy.copy(config)
self.assertEqual(config.to_dict(), copied.to_dict())
config = config_class()
copied = copy.copy(config)
self.assertEqual(config.to_dict(), copied.to_dict())

def test_config_deepcopy(self):
@parameterized.expand(ALL_CONFIG_CLASSES)
def test_config_deepcopy(self, config_class):
# see https://github.com/huggingface/peft/issues/424
for config_class in self.all_config_classes:
config = config_class()
copied = copy.deepcopy(config)
self.assertEqual(config.to_dict(), copied.to_dict())
config = config_class()
copied = copy.deepcopy(config)
self.assertEqual(config.to_dict(), copied.to_dict())

def test_config_pickle_roundtrip(self):
@parameterized.expand(ALL_CONFIG_CLASSES)
def test_config_pickle_roundtrip(self, config_class):
# see https://github.com/huggingface/peft/issues/424
for config_class in self.all_config_classes:
config = config_class()
copied = pickle.loads(pickle.dumps(config))
self.assertEqual(config.to_dict(), copied.to_dict())
config = config_class()
copied = pickle.loads(pickle.dumps(config))
self.assertEqual(config.to_dict(), copied.to_dict())

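The copy, deepcopy, and pickle tests above all assert the same invariant, now parameterized over every config class. A compact sketch of that invariant, with `LoraConfig` standing in for any entry of `ALL_CONFIG_CLASSES`:

```python
# Illustrative sketch of the roundtrip invariant checked above.
import copy
import pickle

from peft import LoraConfig

config = LoraConfig(r=16, lora_alpha=32)
assert copy.copy(config).to_dict() == config.to_dict()
assert copy.deepcopy(config).to_dict() == config.to_dict()
assert pickle.loads(pickle.dumps(config)).to_dict() == config.to_dict()
```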
def test_prompt_encoder_warning_num_layers(self):
# This test checks that if a prompt encoder config is created with an argument that is ignored, there should be
@@ -182,3 +188,66 @@ class PeftConfigTester(unittest.TestCase, PeftConfigTestMixin):
PromptEncoder(config)
expected_msg = "for MLP, the argument `encoder_num_layers` is ignored. Exactly 2 MLP layers are used."
assert str(record.list[0].message) == expected_msg

@parameterized.expand([LoHaConfig, LoraConfig, IA3Config])
def test_save_pretrained_with_target_modules(self, config_class):
# See #1041, #1045
config = config_class(target_modules=["a", "list"])
with tempfile.TemporaryDirectory() as tmp_dirname:
config.save_pretrained(tmp_dirname)

config_from_pretrained = config_class.from_pretrained(tmp_dirname)
self.assertEqual(config.to_dict(), config_from_pretrained.to_dict())
# explicit test that target_modules should be converted to set
self.assertTrue(isinstance(config_from_pretrained.target_modules, set))

def test_regex_with_layer_indexing_lora(self):
# This test checks that an error is raised if `target_modules` is a regex expression and `layers_to_transform` or
# `layers_pattern` are not None

invalid_config1 = {"target_modules": ".*foo", "layers_to_transform": [0]}
invalid_config2 = {"target_modules": ".*foo", "layers_pattern": ["bar"]}

valid_config = {"target_modules": ["foo"], "layers_pattern": ["bar"], "layers_to_transform": [0]}

with self.assertRaisesRegex(
ValueError,
expected_regex="`layers_to_transform` cannot be used when `target_modules` is a str.",
):
LoraConfig(**invalid_config1)

with self.assertRaisesRegex(
ValueError, expected_regex="`layers_pattern` cannot be used when `target_modules` is a str."
):
LoraConfig(**invalid_config2)

# should run without errors
LoraConfig(**valid_config)

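A short sketch of the rule exercised above: the layer-indexing options are only accepted together with a list-style `target_modules`, never with a regex string. The module names below are illustrative, not taken from the test.

```python
# Illustrative sketch of the layers_to_transform / layers_pattern constraint.
from peft import LoraConfig

# valid: target_modules is a list, so layer indexing is allowed
LoraConfig(target_modules=["q_proj", "v_proj"], layers_to_transform=[0, 1], layers_pattern=["layers"])

# invalid: target_modules is a regex string
try:
    LoraConfig(target_modules=".*q_proj", layers_to_transform=[0])
except ValueError as exc:
    print(exc)  # "`layers_to_transform` cannot be used when `target_modules` is a str."
```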
def test_ia3_is_feedforward_subset_invalid_config(self):
# This test checks that the IA3 config raises a value error if the feedforward_modules argument
# is not a subset of the target_modules argument

# an example invalid config
invalid_config = {"target_modules": ["k", "v"], "feedforward_modules": ["q"]}

with self.assertRaisesRegex(
ValueError, expected_regex="^`feedforward_modules` should be a subset of `target_modules`$"
):
IA3Config(**invalid_config)

def test_ia3_is_feedforward_subset_valid_config(self):
# This test checks that the IA3 config is created without errors with valid arguments.
# feedforward_modules should be a subset of target_modules if both are lists

# an example valid config with regex expressions.
valid_config_regex_exp = {
"target_modules": ".*.(SelfAttention|EncDecAttention|DenseReluDense).*(q|v|wo)$",
"feedforward_modules": ".*.DenseReluDense.wo$",
}
# an example valid config with module lists.
valid_config_list = {"target_modules": ["k", "v", "wo"], "feedforward_modules": ["wo"]}

# should run without errors
IA3Config(**valid_config_regex_exp)
IA3Config(**valid_config_list)

File diff suppressed because it is too large
@@ -40,13 +40,6 @@ FULL_GRID = {
}


def skip_non_pt_mqa(test_list):
r"""
Skip tests that are prefix tuning for MQA models (not supported yet)
"""
return [test for test in test_list if not ("prefix_tuning" in test[0] and "GPTBigCodeForCausalLM" in test[0])]


def skip_adalora_and_gpt2(test_list):
return [test for test in test_list if not (("GPT2LMHeadModel" in test[1]) and (test[2] == AdaLoraConfig))]

@@ -108,15 +101,32 @@ class PeftDecoderModelTester(unittest.TestCase, PeftCommonTester):
def test_merge_layers(self, test_name, model_id, config_cls, config_kwargs):
self._test_merge_layers(model_id, config_cls, config_kwargs)

@parameterized.expand(PeftTestConfigManager.get_grid_parameters(FULL_GRID, filter_params_func=skip_non_pt_mqa))
@parameterized.expand(
PeftTestConfigManager.get_grid_parameters(
{
"model_ids": PEFT_DECODER_MODELS_TO_TEST,
"lora_kwargs": {"init_lora_weights": [False]},
"ia3_kwargs": {"init_ia3_weights": [False]},
"task_type": "CAUSAL_LM",
},
)
)
def test_merge_layers_nan(self, test_name, model_id, config_cls, config_kwargs):
self._test_merge_layers_nan(model_id, config_cls, config_kwargs)

@parameterized.expand(PeftTestConfigManager.get_grid_parameters(FULL_GRID))
def test_generate(self, test_name, model_id, config_cls, config_kwargs):
self._test_generate(model_id, config_cls, config_kwargs)

@parameterized.expand(PeftTestConfigManager.get_grid_parameters(FULL_GRID, filter_params_func=skip_non_pt_mqa))
@parameterized.expand(PeftTestConfigManager.get_grid_parameters(FULL_GRID))
def test_merge_layers_fp16(self, test_name, model_id, config_cls, config_kwargs):
self._test_merge_layers_fp16(model_id, config_cls, config_kwargs)

@parameterized.expand(PeftTestConfigManager.get_grid_parameters(FULL_GRID))
def test_generate_half_prec(self, test_name, model_id, config_cls, config_kwargs):
self._test_generate_half_prec(model_id, config_cls, config_kwargs)

@parameterized.expand(PeftTestConfigManager.get_grid_parameters(FULL_GRID, filter_params_func=skip_non_pt_mqa))
@parameterized.expand(PeftTestConfigManager.get_grid_parameters(FULL_GRID))
def test_prefix_tuning_half_prec_conversion(self, test_name, model_id, config_cls, config_kwargs):
self._test_prefix_tuning_half_prec_conversion(model_id, config_cls, config_kwargs)

@@ -144,6 +154,10 @@ class PeftDecoderModelTester(unittest.TestCase, PeftCommonTester):
def test_delete_adapter(self, test_name, model_id, config_cls, config_kwargs):
self._test_delete_adapter(model_id, config_cls, config_kwargs)

@parameterized.expand(PeftTestConfigManager.get_grid_parameters(FULL_GRID))
def test_delete_inactive_adapter(self, test_name, model_id, config_cls, config_kwargs):
self._test_delete_inactive_adapter(model_id, config_cls, config_kwargs)

@parameterized.expand(PeftTestConfigManager.get_grid_parameters(FULL_GRID))
def test_adding_multiple_adapters_with_bias_raises(self, test_name, model_id, config_cls, config_kwargs):
self._test_adding_multiple_adapters_with_bias_raises(model_id, config_cls, config_kwargs)
@@ -174,7 +188,7 @@ class PeftDecoderModelTester(unittest.TestCase, PeftCommonTester):
def test_weighted_combination_of_adapters(self, test_name, model_id, config_cls, config_kwargs):
self._test_weighted_combination_of_adapters(model_id, config_cls, config_kwargs)

@parameterized.expand(PeftTestConfigManager.get_grid_parameters(FULL_GRID, filter_params_func=skip_non_pt_mqa))
@parameterized.expand(PeftTestConfigManager.get_grid_parameters(FULL_GRID))
def test_training_prompt_learning_tasks(self, test_name, model_id, config_cls, config_kwargs):
self._test_training_prompt_learning_tasks(model_id, config_cls, config_kwargs)

@@ -187,7 +201,6 @@ class PeftDecoderModelTester(unittest.TestCase, PeftCommonTester):
"adalora_kwargs": {"init_lora_weights": [False]},
"task_type": "CAUSAL_LM",
},
filter_params_func=skip_non_pt_mqa,
)
)
def test_disable_adapter(self, test_name, model_id, config_cls, config_kwargs):
@@ -203,6 +216,6 @@ class PeftDecoderModelTester(unittest.TestCase, PeftCommonTester):
}
self._test_generate(model_id, AdaLoraConfig, config_kwargs)

@parameterized.expand(PeftTestConfigManager.get_grid_parameters(FULL_GRID, filter_params_func=skip_non_pt_mqa))
@parameterized.expand(PeftTestConfigManager.get_grid_parameters(FULL_GRID))
def test_passing_input_embeds_works(self, test_name, model_id, config_cls, config_kwargs):
self._test_passing_input_embeds_works(test_name, model_id, config_cls, config_kwargs)

@@ -12,11 +12,14 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import tempfile
import unittest

import torch
from parameterized import parameterized
from transformers import AutoModelForSeq2SeqLM
from transformers import AutoModelForSeq2SeqLM, AutoModelForTokenClassification

from peft import LoraConfig, TaskType, get_peft_model

from .testing_common import PeftCommonTester, PeftTestConfigManager

@@ -125,6 +128,10 @@ class PeftEncoderDecoderModelTester(unittest.TestCase, PeftCommonTester):
def test_delete_adapter(self, test_name, model_id, config_cls, config_kwargs):
self._test_delete_adapter(model_id, config_cls, config_kwargs)

@parameterized.expand(PeftTestConfigManager.get_grid_parameters(FULL_GRID))
def test_delete_inactive_adapter(self, test_name, model_id, config_cls, config_kwargs):
self._test_delete_inactive_adapter(model_id, config_cls, config_kwargs)

@parameterized.expand(PeftTestConfigManager.get_grid_parameters(FULL_GRID))
def test_adding_multiple_adapters_with_bias_raises(self, test_name, model_id, config_cls, config_kwargs):
self._test_adding_multiple_adapters_with_bias_raises(model_id, config_cls, config_kwargs)
@@ -172,3 +179,20 @@ class PeftEncoderDecoderModelTester(unittest.TestCase, PeftCommonTester):
)
def test_disable_adapter(self, test_name, model_id, config_cls, config_kwargs):
self._test_disable_adapter(model_id, config_cls, config_kwargs)


class PeftEncoderDecoderCustomModelTester(unittest.TestCase):
"""
A custom class to write any custom test related with Enc-Dec models
"""

def test_save_shared_tensors(self):
model_id = "hf-internal-testing/tiny-random-RobertaModel"
peft_config = LoraConfig(
task_type=TaskType.TOKEN_CLS, inference_mode=False, r=16, lora_alpha=16, lora_dropout=0.1, bias="all"
)
model = AutoModelForTokenClassification.from_pretrained(model_id, num_labels=11)
model = get_peft_model(model, peft_config)
with tempfile.TemporaryDirectory() as tmp_dir:
# This should work fine
model.save_pretrained(tmp_dir, safe_serialization=True)

@@ -146,6 +146,10 @@ class PeftFeatureExtractionModelTester(unittest.TestCase, PeftCommonTester):
def test_delete_adapter(self, test_name, model_id, config_cls, config_kwargs):
self._test_delete_adapter(model_id, config_cls, config_kwargs)

@parameterized.expand(PeftTestConfigManager.get_grid_parameters(FULL_GRID))
def test_delete_inactive_adapter(self, test_name, model_id, config_cls, config_kwargs):
self._test_delete_inactive_adapter(model_id, config_cls, config_kwargs)

@parameterized.expand(
PeftTestConfigManager.get_grid_parameters(
{

@@ -45,7 +45,13 @@ from peft import (
prepare_model_for_kbit_training,
)

from .testing_utils import require_bitsandbytes, require_torch_gpu, require_torch_multi_gpu
from .testing_utils import (
require_auto_gptq,
require_bitsandbytes,
require_optimum,
require_torch_gpu,
require_torch_multi_gpu,
)


# A full testing suite that tests all the necessary features on GPU. The tests should
@@ -105,6 +111,7 @@ class PeftBnbGPUExampleTests(unittest.TestCase):
def setUp(self):
self.seq2seq_model_id = "google/flan-t5-base"
self.causal_lm_model_id = "facebook/opt-6.7b"
self.tokenizer = AutoTokenizer.from_pretrained(self.causal_lm_model_id)
self.audio_model_id = "openai/whisper-large"

def tearDown(self):
@@ -175,6 +182,125 @@ class PeftBnbGPUExampleTests(unittest.TestCase):
# assert loss is not None
self.assertIsNotNone(trainer.state.log_history[-1]["train_loss"])

@pytest.mark.single_gpu_tests
def test_causal_lm_training_4bit(self):
r"""
Test the CausalLM training on a single GPU device. This test is a converted version of
https://github.com/huggingface/peft/blob/main/examples/int8_training/Finetune_opt_bnb_peft.ipynb where we train
`opt-6.7b` on `english_quotes` dataset in few steps using 4bit base model. The test would simply fail if the
adapters are not set correctly.
"""
with tempfile.TemporaryDirectory() as tmp_dir:
model = AutoModelForCausalLM.from_pretrained(
self.causal_lm_model_id,
load_in_4bit=True,
device_map="auto",
)

tokenizer = AutoTokenizer.from_pretrained(self.causal_lm_model_id)
model = prepare_model_for_kbit_training(model)

config = LoraConfig(
r=16,
lora_alpha=32,
target_modules=["q_proj", "v_proj"],
lora_dropout=0.05,
bias="none",
task_type="CAUSAL_LM",
)

model = get_peft_model(model, config)

data = load_dataset("ybelkada/english_quotes_copy")
data = data.map(lambda samples: tokenizer(samples["quote"]), batched=True)

trainer = Trainer(
model=model,
train_dataset=data["train"],
args=TrainingArguments(
per_device_train_batch_size=4,
gradient_accumulation_steps=4,
warmup_steps=2,
max_steps=3,
learning_rate=2e-4,
fp16=True,
logging_steps=1,
output_dir=tmp_dir,
),
data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
)
model.config.use_cache = False
trainer.train()

model.cpu().save_pretrained(tmp_dir)

self.assertTrue("adapter_config.json" in os.listdir(tmp_dir))
self.assertTrue("adapter_model.bin" in os.listdir(tmp_dir))

# assert loss is not None
self.assertIsNotNone(trainer.state.log_history[-1]["train_loss"])

@pytest.mark.multi_gpu_tests
def test_causal_lm_training_mutli_gpu_4bit(self):
r"""
Test the CausalLM training on a multi-GPU device with 4bit base model. The test would simply fail if the
adapters are not set correctly.
"""

with tempfile.TemporaryDirectory() as tmp_dir:
model = AutoModelForCausalLM.from_pretrained(
self.causal_lm_model_id,
device_map="auto",
load_in_4bit=True,
)

self.assertEqual(set(model.hf_device_map.values()), {0, 1})

model = prepare_model_for_kbit_training(model)

setattr(model, "model_parallel", True)
setattr(model, "is_parallelizable", True)

config = LoraConfig(
r=16,
lora_alpha=32,
target_modules=["q_proj", "v_proj"],
lora_dropout=0.05,
bias="none",
task_type="CAUSAL_LM",
)

model = get_peft_model(model, config)

data = load_dataset("Abirate/english_quotes")
data = data.map(lambda samples: self.tokenizer(samples["quote"]), batched=True)

trainer = Trainer(
model=model,
train_dataset=data["train"],
args=TrainingArguments(
per_device_train_batch_size=4,
gradient_accumulation_steps=4,
warmup_steps=2,
max_steps=3,
learning_rate=2e-4,
fp16=True,
logging_steps=1,
output_dir=tmp_dir,
),
data_collator=DataCollatorForLanguageModeling(self.tokenizer, mlm=False),
)
model.config.use_cache = False
trainer.train()

model.cpu().save_pretrained(tmp_dir)

self.assertTrue("adapter_config.json" in os.listdir(tmp_dir))
self.assertTrue("adapter_model.bin" in os.listdir(tmp_dir))

# assert loss is not None
self.assertIsNotNone(trainer.state.log_history[-1]["train_loss"])

@pytest.mark.single_gpu_tests
@require_torch_gpu
def test_4bit_adalora_causalLM(self):
@@ -518,3 +644,207 @@ class PeftBnbGPUExampleTests(unittest.TestCase):

# assert loss is not None
self.assertIsNotNone(trainer.state.log_history[-1]["train_loss"])


@require_torch_gpu
@require_auto_gptq
@require_optimum
class PeftGPTQGPUTests(unittest.TestCase):
r"""
GPTQ + peft tests
"""

def setUp(self):
from transformers import GPTQConfig

self.causal_lm_model_id = "marcsun13/opt-350m-gptq-4bit"
# TODO : check if it works for Exllamav2 kernels
self.quantization_config = GPTQConfig(bits=4, use_exllama=False)
self.tokenizer = AutoTokenizer.from_pretrained(self.causal_lm_model_id)

def tearDown(self):
r"""
Efficient mechanism to free GPU memory after each test. Based on
https://github.com/huggingface/transformers/issues/21094
"""
gc.collect()
torch.cuda.empty_cache()

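For orientation, a minimal sketch of the GPTQ + LoRA setup this test class drives. It mirrors the `setUp()` values above rather than introducing new ones, and assumes `auto-gptq`, `optimum`, and a CUDA device are available.

```python
# Illustrative sketch mirroring setUp() above, not code from the diff.
import torch
from transformers import AutoModelForCausalLM, GPTQConfig
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

model = AutoModelForCausalLM.from_pretrained(
    "marcsun13/opt-350m-gptq-4bit",
    torch_dtype=torch.float16,
    device_map="auto",
    quantization_config=GPTQConfig(bits=4, use_exllama=False),
)
model = prepare_model_for_kbit_training(model)
model = get_peft_model(
    model,
    LoraConfig(r=16, lora_alpha=32, target_modules=["q_proj", "v_proj"], task_type="CAUSAL_LM"),
)
```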
@pytest.mark.single_gpu_tests
def test_causal_lm_training(self):
r"""
Test the CausalLM training on a single GPU device. The test would simply fail if the adapters are not set
correctly.
"""
with tempfile.TemporaryDirectory() as tmp_dir:
model = AutoModelForCausalLM.from_pretrained(
self.causal_lm_model_id,
torch_dtype=torch.float16,
device_map="auto",
quantization_config=self.quantization_config,
)

model = prepare_model_for_kbit_training(model)
config = LoraConfig(
r=16,
lora_alpha=32,
target_modules=["q_proj", "v_proj"],
lora_dropout=0.05,
bias="none",
task_type="CAUSAL_LM",
)
model = get_peft_model(model, config)

data = load_dataset("ybelkada/english_quotes_copy")
data = data.map(lambda samples: self.tokenizer(samples["quote"]), batched=True)

trainer = Trainer(
model=model,
train_dataset=data["train"],
args=TrainingArguments(
per_device_train_batch_size=4,
gradient_accumulation_steps=4,
warmup_steps=2,
max_steps=3,
learning_rate=2e-4,
fp16=True,
logging_steps=1,
output_dir=tmp_dir,
),
data_collator=DataCollatorForLanguageModeling(self.tokenizer, mlm=False),
)
model.config.use_cache = False
trainer.train()

model.cpu().save_pretrained(tmp_dir)

self.assertTrue("adapter_config.json" in os.listdir(tmp_dir))
self.assertTrue("adapter_model.bin" in os.listdir(tmp_dir))

# assert loss is not None
self.assertIsNotNone(trainer.state.log_history[-1]["train_loss"])

@pytest.mark.single_gpu_tests
def test_adalora_causalLM(self):
r"""
Tests the gptq training with adalora
"""

model = AutoModelForCausalLM.from_pretrained(
self.causal_lm_model_id,
torch_dtype=torch.float16,
device_map="auto",
quantization_config=self.quantization_config,
)

model = prepare_model_for_kbit_training(model)

peft_config = AdaLoraConfig(
init_r=6,
target_r=4,
tinit=50,
tfinal=100,
deltaT=5,
beta1=0.3,
beta2=0.3,
orth_reg_weight=0.2,
lora_alpha=32,
lora_dropout=0.05,
bias="none",
task_type="CAUSAL_LM",
)

model = get_peft_model(model, peft_config)

data = load_dataset("ybelkada/english_quotes_copy")
data = data.map(lambda samples: self.tokenizer(samples["quote"]), batched=True)

with tempfile.TemporaryDirectory() as tmp_dir:
trainer = Trainer(
model=model,
train_dataset=data["train"],
args=TrainingArguments(
per_device_train_batch_size=4,
gradient_accumulation_steps=4,
warmup_steps=2,
max_steps=3,
learning_rate=2e-4,
fp16=True,
logging_steps=1,
output_dir=tmp_dir,
),
data_collator=DataCollatorForLanguageModeling(self.tokenizer, mlm=False),
)
model.config.use_cache = False
trainer.train()

model.cpu().save_pretrained(tmp_dir)

self.assertTrue("adapter_config.json" in os.listdir(tmp_dir))
self.assertTrue("adapter_model.bin" in os.listdir(tmp_dir))

# assert loss is not None
self.assertIsNotNone(trainer.state.log_history[-1]["train_loss"])

@pytest.mark.multi_gpu_tests
@require_torch_multi_gpu
def test_causal_lm_training_mutli_gpu(self):
r"""
Test the CausalLM training on a multi-GPU device. The test would simply fail if the adapters are not set
correctly.
"""

with tempfile.TemporaryDirectory() as tmp_dir:
model = AutoModelForCausalLM.from_pretrained(
self.causal_lm_model_id,
torch_dtype=torch.float16,
device_map="auto",
quantization_config=self.quantization_config,
)

self.assertEqual(set(model.hf_device_map.values()), {0, 1})

model = prepare_model_for_kbit_training(model)

setattr(model, "model_parallel", True)
setattr(model, "is_parallelizable", True)

config = LoraConfig(
r=16,
lora_alpha=32,
target_modules=["q_proj", "v_proj"],
lora_dropout=0.05,
bias="none",
task_type="CAUSAL_LM",
)

model = get_peft_model(model, config)

data = load_dataset("Abirate/english_quotes")
data = data.map(lambda samples: self.tokenizer(samples["quote"]), batched=True)

trainer = Trainer(
model=model,
train_dataset=data["train"],
args=TrainingArguments(
per_device_train_batch_size=4,
gradient_accumulation_steps=4,
warmup_steps=2,
max_steps=3,
learning_rate=2e-4,
fp16=True,
logging_steps=1,
output_dir=tmp_dir,
),
data_collator=DataCollatorForLanguageModeling(self.tokenizer, mlm=False),
)
model.config.use_cache = False
trainer.train()

model.cpu().save_pretrained(tmp_dir)

self.assertTrue("adapter_config.json" in os.listdir(tmp_dir))
self.assertTrue("adapter_model.bin" in os.listdir(tmp_dir))

# assert loss is not None
self.assertIsNotNone(trainer.state.log_history[-1]["train_loss"])

Some files were not shown because too many files have changed in this diff