Compare commits

...

102 Commits

Author SHA1 Message Date
e6cd24c907 Release v0.12.0 (#1946)
Also fixes a small error in the docs that mentioned the wrong version
2024-07-24 13:13:40 +02:00
05f57e94ef PiSSA, OLoRA: Delete initial adapter after conversion instead of the active adapter (#1933)
Resolves #1860

As discussed in that issue, it's not user-friendly to delete the default
adapter of a PiSSA/OLoRA model after calling save_pretrained with weight
conversion. It is much more intuitive to delete the initial adapter
instead, since it is loaded inside the method rather than by the user,
so it's really an implementation detail.

Apart from this, I made the following related changes:

- Put everything in a try ... finally to ensure that the initial adapter
  does not hang around if there is an error (thus not hogging memory).
- Renamed initial_adapter to initial_adapter_name, to make it clear that
  this is the name and not the adapter itself.
2024-07-24 12:55:56 +02:00
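
To make the conversion flow concrete, here is a minimal sketch of the PiSSA save-with-conversion workflow this commit touches (the model id, directory names, and target modules are hypothetical, and the argument name assumes the v0.12-era API; see the PEFT PiSSA docs for the exact recipe):

```python
from transformers import AutoModelForCausalLM
from peft import LoraConfig, get_peft_model

base = AutoModelForCausalLM.from_pretrained("some-tiny-llama")  # placeholder id
config = LoraConfig(init_lora_weights="pissa", r=8, lora_alpha=8,
                    target_modules=["q_proj", "v_proj"])
model = get_peft_model(base, config)
model.save_pretrained("pissa-init")  # snapshot of the initial adapter

# ... training happens here ...

# The conversion loads the initial adapter internally; per this commit,
# that internally loaded adapter is the one deleted afterwards, not the
# user's "default" adapter.
model.save_pretrained("pissa-converted",
                      path_initial_model_for_weight_conversion="pissa-init")
```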
2ce83e05c1 FIX Decrease memory overhead of merging (#1944) 2024-07-23 20:24:05 +02:00
ebcd0792b8 [WIP] ENH Add support for Qwen2 (#1906)
* [WIP] ENH Add support for Qwen2

Add Qwen2 to default target modules, use tiny Qwen2 in tests.

* Add target_modules for FourierFT

* Skip Qwen2 + weighted combination test

It fails when SVD is involved. See:
https://github.com/huggingface/peft/pull/1901#issuecomment-2235731685

---------

Co-authored-by: BenjaminBossan <b.bossan@gmail.com>
2024-07-23 15:04:13 +05:30
ba75bb14d1 FIX: More VeRA tests, fix tests, more checks (#1900)
* FIX More VeRA tests, fix tests, more checks

- Fixes incorrect config for VeRA in a test
- Add VeRA to multi-adapter tests
- Add more checks on the VeRA A/B shapes

The latter becomes necessary when we add more than one VeRA adapter. The
shapes for VeRA A and B are only determined once, when the first VeRA
adapter is created. After that, they are fixed. However, users may add a
second VeRA adapter. As long as that adapter targets the same layers and
has the same rank, we're good. But if it targets other, bigger layers,
or if it has increased rank, the shapes of VeRA A and/or VeRA B will be
too small, resulting in an error during the forward pass. To prevent
this, we already check the shapes during initialization of the new
adapter and raise an error right away.

* Reviewer feedback: wording, better error message

* Reviewer feedback: Clarify tests

---------

Co-authored-by: BenjaminBossan <b.bossan@gmail.com>
2024-07-22 19:12:15 +05:30
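
A sketch of the new shape check in practice (config values and module names are hypothetical): VeRA shares a single pair of projection matrices, vera_A and vera_B, whose shapes are fixed by the first adapter, so a mismatched second adapter now fails fast at add time instead of erroring later in the forward pass.

```python
from peft import VeraConfig, get_peft_model

def add_adapters(base_model):
    cfg = VeraConfig(r=16, target_modules=["q_proj", "v_proj"])
    model = get_peft_model(base_model, cfg)
    # fine: same targets and rank, so the shared vera_A/vera_B shapes fit
    model.add_adapter("same_shape", VeraConfig(r=16, target_modules=["q_proj", "v_proj"]))
    # raises right away now: rank exceeds the shared projection shapes
    model.add_adapter("bigger", VeraConfig(r=256, target_modules=["q_proj", "v_proj"]))
    return model
```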
6472061a76 FIX Prefix tuning Grouped-Query Attention (#1901)
Fix prefix tuning when GQA is being used.
2024-07-22 11:46:24 +02:00
e02b938e02 FIX PiSSA & OLoRA with rank/alpha pattern, rslora (#1930)
* FIX PiSSA & OLoRA with rank/alpha pattern, rslora

See https://github.com/huggingface/peft/issues/1929#issuecomment-2230780802

At the moment, when using PiSSA or OLoRA with weight conversion to
restore the original base weights, there is an error when either of
rank_pattern, alpha_pattern, or rslora is being used. This PR fixes
this.

The issue is that we need to double the rank of the LoRA adapter. Right
now, this is done by simply doubling r and alpha. But if rank_pattern
and alpha_pattern are being used, those need to be doubled too.

Furthermore, when using rslora, the scaling is again different, namely
alpha / sqrt(r). This also needs to be adjusted.

Unfortunately, when using rslora with rank_pattern and alpha_pattern,
this gets way more complicated. Since we don't store the scaling in the
state_dict, we would have to resolve all the patterns here to determine
the correct scaling, i.e. reimplement the whole matching and init logic.
This is a lot of work for a very edgy edge case.

Therefore, I opted to raise an error instead. This is not super nice, as
the error is only raised when trying to save the model, i.e. a lot of
time may already have been spent training the model. But we cannot know
this earlier, so not much can be done.

Overall, this fix is ugly because it further couples unrelated code. For
instance, if we add new init methods that affect the scaling, we need to
remember to change the saving logic accordingly. If anyone has a better
idea, LMK.

* Make style

* Also warn during init if there is a potential for saving not to work

* Ensure that GPU tests are run for PiSSA+OLoRA

* Use renamed argument name

* Make style

* Reviewer feedback: Better document the change

* Add clarifying comments to tests
2024-07-19 14:53:38 +05:30
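
For orientation, this is the configuration combination the fix (and the new error) is about, sketched with hypothetical module names. Recall that default LoRA scaling is lora_alpha / r, while rslora uses lora_alpha / sqrt(r), which is why doubling r for the conversion must treat the two cases differently:

```python
from peft import LoraConfig

config = LoraConfig(
    init_lora_weights="pissa",
    r=8,
    lora_alpha=8,
    rank_pattern={"q_proj": 16},   # now also doubled during weight conversion
    alpha_pattern={"q_proj": 32},  # likewise
    use_rslora=True,               # scaling = alpha / sqrt(r); combined with
                                   # the patterns above, saving with weight
                                   # conversion raises instead
)
```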
5268495213 FEAT Add HRA: Householder Reflection Adaptation (#1864)
Implements method from https://arxiv.org/abs/2405.17484.
2024-07-16 14:37:32 +02:00
2aaf9cedbb ENH Sync LoRA tp_layer methods with vanilla LoRA (#1919) 2024-07-16 10:39:36 +02:00
a019f8690d FIX sft script print_trainable_parameters attr lookup (#1928) 2024-07-15 17:09:14 +02:00
2a6402f4b2 DOC Fix typo of encoder_reparameterization_type (#1926) 2024-07-15 12:06:12 +02:00
e72a96f7cf FEAT Add FourierFT Support (#1838)
Add Parameter-Efficient Fine-Tuning with Discrete Fourier Transform

https://arxiv.org/abs/2405.03003

---------

Co-authored-by: zqgao22 <zgaoat@connect.ust.hk>
Co-authored-by: Chaos96 <wangqch7@mail2.sysu.edu.cn>
Co-authored-by: DSAILatHKUST <dsailathkust@163.com>
2024-07-09 12:20:01 +02:00
48e136d9bd FIX: Flaky multitask prompt tuning test fixed by setting the seed (#1908)
Set the seed for the tests test_generate_text_with_other_init and
test_generate_text_with_random_init, because otherwise they are
flaky and fail with ~5% probability. Explanation in comment.
2024-07-09 10:05:10 +02:00
58afb34ea0 FEAT Integrate X-LoRA (#1491)
Implements X-LoRA: Mixture of Low-Rank Adapter Experts
Paper: https://arxiv.org/abs/2402.07148
2024-07-05 12:38:18 +02:00
01f1b992eb Example: DNA Language Model. (#1873) 2024-07-05 11:55:26 +02:00
09358aad30 Chore: Docs markdown formatting (#1899) 2024-07-03 18:12:53 +02:00
31c0d85755 FIX DeepSpeed recursion error (#1892)
Happened when accessing attribute before init.
2024-07-03 18:07:31 +02:00
018a1f49c4 FIX TEST Even higher tolerance for AdaLoRA in test (#1898)
See #1897 for more context. The test is still flaky, increasing
tolerance further.
2024-07-02 12:36:03 +02:00
1e2258d7f7 ENH Ephemeral GPU offload support for DoRA (#1857)
Adds the concept of ephemeral GPU offloading: data used in
compute-intensive operations is copied onto the GPU before the operation
is performed, after which the result is moved back to CPU memory.

This PR adds support in the DoRA initialization code, but the approach
can be applied in a number of places: when an operation is heavily
time-dominated by running on CPU relative to the size of the data
involved, using ephemeral transfers has a fairly small VRAM overhead
(depending on the size of the model/adapter) while giving orders of
magnitude speed-ups in certain operations.

For example, a Llama3-8B DoRA adapter with r=64 would put an overhead of
2 x (64 x 4096 x 2 + 4096 x 4096) bytes (assuming fp16), i.e. 33 MB or
so. A Llama3-70B adapter with r=32 would have 2 x (32 x 8192 x 2 + 8192
x 8192) bytes = 130 MB.

By making use of ephemeral GPU offloading, more efficient juggling of
data between GPU and CPU may become possible: instead of always loading
as much as we can onto the GPU and then enduring the CPU slowness for
whatever happens not to fit there, we intentionally leave a (modest)
chunk of VRAM for optimizations like these, and the end result is a much
(MUCH) faster experience.
2024-07-02 12:17:45 +02:00
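
The data-movement pattern itself is simple; here is a generic sketch (not PEFT's internal code, and it assumes a CUDA device is available):

```python
import torch

def ephemeral_gpu_op(op, *cpu_tensors, device="cuda"):
    # copy the operands over, compute on the GPU, move the result right
    # back; VRAM is only occupied for the duration of the call
    gpu_tensors = [t.to(device) for t in cpu_tensors]
    return op(*gpu_tensors).cpu()

# e.g. the DoRA init weight-norm computation is dominated by a matmul:
w = torch.randn(4096, 4096)
b, a = torch.randn(4096, 64), torch.randn(64, 4096)
norm = ephemeral_gpu_op(lambda w, b, a: (w + b @ a).norm(p=2, dim=1), w, b, a)
```

In PEFT itself the behavior is gated behind a config option (ephemeral_gpu_offload, if I read the PR correctly) rather than exposed as a free function; the sketch only shows the data movement.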
1e5227ff90 TST Bump absolute tolerance for test (#1891)
The allclose check in test_4bit_lora_mixed_adapter_batches_lora can fail
on some systems, even though it passes on others (like CI). Increase the
tolerance slightly to get rid of this.
2024-07-02 11:37:43 +02:00
62122b5add FIX TEST Higher tolerance for AdaLoRA in test (#1897)
The test is flaky on CI, so this PR increases the tolerance to hopefully
fix the flakiness. I cannot reproduce the error locally (neither on GPU
nor CPU), so I'm not 100% sure if this tolerance is enough to make the
test reliable.
2024-07-01 15:42:10 +02:00
9dc53b8fd5 CI Don't fail fast in test matrix (#1896)
Currently, we have fail-fast enabled (the default). Although this is
generally reasonable -- if a test fails in one setting, we probably get
the same failure in other settings -- it is currently an impediment.
This is because we get occasional timeouts when loading models from the
Hub. With fail-fast enabled, if a single setting fails because of
timeouts, all other runs are cancelled, even if they would have passed.
Then we need to retrigger all of them again, creating even more pressure
on the Hub. With fail-fast disabled, we give those other runs a chance
to pass successfully.
2024-07-01 15:04:02 +02:00
db8b76fdb5 DOC DoRA example script & notebook (#1885) 2024-06-28 12:05:53 +02:00
7ffa43b16e FIX Avoid early import of torch extension by BOFT (#1879) 2024-06-26 17:25:26 +02:00
27bc3054a3 FIX sft script: only print trainable params if peft (#1888) 2024-06-26 12:02:35 +02:00
184beaf1d6 FIX Make special LoRA inits DeepSpeed compatible (#1887)
Resolves https://github.com/huggingface/accelerate/issues/2886

Possibly resolves
https://github.com/huggingface/peft/issues/896#issuecomment-2172455458

Some LoRA init methods need to access the base layer weight. Getting
this access can fail or stall in distributed settings. For DeepSpeed,
the weight is now gathered before trying to access it.

Note: Without DeepSpeed, this is a no-op and should thus not have any
disadvantage. We don't have DS in our CI, so this is not tested.

I also made some small changes to OLoRA init to use
self.get_base_layer() instead of self.base_layer.
2024-06-26 11:25:54 +02:00
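
The gather-before-access pattern for ZeRO-3 looks roughly like this (a sketch; the exact call site in PEFT may differ):

```python
import deepspeed

def read_weight_for_init(module):
    # under ZeRO-3 the weight is sharded across ranks; GatheredParameters
    # temporarily materializes the full tensor so init code can read it
    with deepspeed.zero.GatheredParameters(module.weight, modifier_rank=None):
        return module.weight.detach().clone()
```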
c9b19bb8f3 FIX Init AdaLoRA to be identity transform (#1884)
Resolves #1836

There was an accidental change in a previous PR that initialized lora_E
with a normal distribution when it should be zeros.
2024-06-25 13:33:28 +02:00
ef23712b13 ENH: LoRA support for dynamically dispatching to custom layers (#1875)
Description

This is an experimental feature with a private API for now. If this
feature finds adoption, I will work on adding an official API.

With this PR, we allow users to register their own LoRA layer types.
This way, they can add their own support for hitherto unsupported layer
types, say nn.Conv3d or nn.LSTM. Without this PR, they can only do that
by creating a PR on PEFT with support for this new type and getting it
merged.

The custom dispatch mechanism also allows users to override existing
layer type mapping. This way, they can, for instance, provide their own
lora.Linear layer type, instead of using the one from PEFT, to adapt
nn.Linear layers.

Implementation

The implementation required only a few changes because we already
have a mechanism for dynamic dispatching for LoRA. It is currently used,
for instance, to dynamically add quantized target layers in case the
right quantization library is installed.

This existing mechanism is now extended to include user-provided LoRA
layers if any were passed. These are checked first, before checking the
default PEFT-supported layers.

What's missing for this to become an official API?

Right now, the main reason why this cannot be an official API is the
question of how to persist the config. In the current implementation, we
add an attribute that is a mapping from target layer type to LoRA layer
type:

config._custom_modules == {CustomBaseLayer: CustomLoraLayer}

The entries of this dict are Python classes. Therefore, they cannot be
JSON-serialized. We could think of possible solutions for how to
serialize and deserialize custom Python objects, but this is not trivial
and potentially a security risk. Thus, I would only really start working
on this if the demand is sufficiently high. At that point, I would also
add a public API instead of requiring the use of a private API.

As is, users can still save and load PEFT models with custom LoRA
layers; they only need to add two lines of code to their scripts, as
documented.

---------

Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com>
2024-06-25 11:02:43 +02:00
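
Based on the attribute named above, usage looks roughly like this (MyLoraLSTM is hypothetical and would have to implement the usual LoRA layer interface; this is the private API the message warns about):

```python
import torch.nn as nn
from peft import LoraConfig

class MyLoraLSTM(nn.Module):
    ...  # LoRA wrapper for nn.LSTM; must implement the lora layer protocol

config = LoraConfig(target_modules=["lstm"])
config._custom_modules = {nn.LSTM: MyLoraLSTM}  # target type -> LoRA layer type
```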
d716adf31c Update bug-report.yml (#1882) 2024-06-21 16:45:44 +02:00
d37dde61e1 FIX Error when using VeRA with fp16 or bf16 (#1874)
The issue was that we didn't consider BufferDict when auto-casting the
adapter weights to float32 in PR #1706. This has now been addressed.

As #1706 was merged after the latest release, this bug should only
affect users who install from main, so a patch release should not be
needed.

As part of this PR, I also moved buffer_dict.py up from
peft/tuners/vera to peft/tuners/ (with a leading underscore to make it
clear that it is not for public usage). This is because we need to use
it several times on a higher level than VeRA.
2024-06-19 13:21:17 +02:00
5364351446 CI Downgrade numpy to <2.0 for Mac and Windows (#1871) 2024-06-18 13:47:29 +02:00
717db6e1c2 CI testing BNB: remove single GPU tests (#1866)
CI testing BNB: remove single GPU tests

In #1859, we tried removing the import checks, but the single-GPU BNB
multi-backend branch is still stuck. Therefore, try commenting out the
next step instead.

Also, add a timeout of 60 minutes. Successful jobs currently take ~30
minutes; the default timeout is 360 minutes.
2024-06-18 10:34:24 +02:00
5194aef509 Attempt to fix the red messages (#1868) 2024-06-17 15:34:31 +02:00
25c0fe9a55 FIX fix multitask prompt tuning paper link (#1862) 2024-06-17 10:57:34 +02:00
e0e8204bc3 Update lora_based_methods.md (#1861)
fixed typo in instructions for peft inference
2024-06-17 10:57:27 +02:00
076561bbd3 CI Testing: Remove bnb import check (#1859) 2024-06-14 18:02:27 +02:00
efda766f51 DOC Move helpers section to dev developer guide (#1856)
It was in the "Adapters" section, which doesn't really fit.
2024-06-13 12:44:25 +02:00
d608f8329a DOC FIX Comment about init of LoRA Embedding (#1855)
Fixes #1728
2024-06-13 11:58:26 +02:00
19461353aa Update nightly-bnb.yml (#1854) 2024-06-13 11:40:40 +02:00
3831e06ab5 FIX: Adalora ranknum loaded on wrong device (#1852)
Locally, multiple AdaLoRA-related tests are failing. We did not catch
this in the nightly run because the tests were missing the necessary
pytest marker.

The issue is related to the change in #1742, which made it possible to
load different adapters on different devices.

Although that PR itself was sound, the issue is that for AdaLoRA, one of
its parameters, ranknum, was not listed in the other_param_names and was
thus not moved to the correct device. This oversight is now fixed and
the GPU tests are now passing locally for me.

This PR also adds the missing pytest marker to the test class that was
missing it, so that these errors should be caught by our nightly CI in
the future.
2024-06-13 10:47:49 +02:00
2f5360a7da FEAT Add OLoRA initialization strategy to LoRA (#1828) 2024-06-12 17:46:43 +02:00
8843a767da MNT Upgrade ruff version to ~0.4.8 (#1851)
We currently use ruff v0.2.2, which is quite far behind the latest
version. This has the disadvantage that new contributors will often
install the latest version of ruff and then get CI errors, even though
they ran `make style`.

Here is the full list of changes:

- bump ruff version to ~0.4.8
- update the ruff commands in Makefile (ruff foo/ -> ruff check foo/)
- update coding style of two files that changed with the new ruff
  version
2024-06-12 15:01:45 +02:00
b6af7feb34 DOC Fix PeftMixedModel docstring example #1824 (#1850) 2024-06-12 14:27:14 +02:00
47b3d7422a CI Activate env to prevent bnb import error (#1845)
All bitsandbytes nightly CI runs are currently failing with:

Run python3 -m bitsandbytes
/opt/conda/bin/python3: No module named bitsandbytes

This fix should hopefully solve this, but it's untested.
2024-06-11 10:59:32 +02:00
7b1c08d2b5 ENH Support different layer shapes for VeRA (#1817) 2024-06-10 17:10:56 +02:00
a8286a7bff DOC Describe torch_device in from_pretrained docs (#1843) 2024-06-10 16:01:00 +02:00
683db0fa2c feat(ci): add trufflehog secrets detection (#1841)
* feat(ci): add trufflehog secrets detection

* fix(ci): remove unnecessary permissions
2024-06-10 11:40:36 +02:00
0f89d34d82 Fix broken messages (#1842) 2024-06-10 11:21:48 +02:00
0b40d1a304 Workflow / Bnb: Add a mechanism to inform us if the import fails (#1830)
* Update nightly-bnb.yml

* Update nightly-bnb.yml

* Update .github/workflows/nightly-bnb.yml

* Update .github/workflows/nightly-bnb.yml
2024-06-07 16:38:10 +02:00
03798a9143 FIX Failing Llama tests due to new kv cache (#1832)
The issue is that past_key_values can now be an instance of
DynamicCache. Therefore, just indexing into it won't work anymore. The
solution is to check the type and if it's not a tuple/list, use the methods
on the cache object instead.
2024-06-06 15:49:59 +02:00
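
The check described above, as a small helper (a sketch; the helper name is mine, not the commit's):

```python
def past_length(past_key_values):
    if isinstance(past_key_values, (tuple, list)):
        return past_key_values[0][0].shape[2]   # legacy tuple-of-tuples format
    return past_key_values.get_seq_length()     # transformers Cache API
```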
d33c1f118e fix doc typo (#1833) 2024-06-06 15:34:10 +02:00
63a536b18e TST Make tests pass on Cambricon MLUs (#1747)
Small adjustments to tests to make them pass on Cambricon MLUs (mostly
tolerances). Note that we have no MLU test runners for PEFT, so we have
to
rely on others to run these tests.
2024-06-06 10:44:03 +02:00
ad8f7cb59e Update build_docker_images.yml (#1823) 2024-06-04 13:34:37 +02:00
3538e8ac7d FIX CI: Install pytest-reportlog package (#1822) 2024-06-04 13:09:09 +02:00
b213ea5fb9 Update tests-main.yml (#1821) 2024-06-04 12:31:31 +02:00
7ed94f3269 FIX CI: Remove potentially problematic git command (#1820)
See if this fixes the error in the workflow.

> fatal: detected dubious ownership in repository at '/__w/peft/peft'
2024-06-04 12:18:37 +02:00
a0788a3f92 Refactor to make DoRA and QDoRA work with FSDP (#1806)
This PR moves all the DoRA functionality into a separate module class.
Essentially, this is necessary because otherwise, the DoRA parameter
lives on the lora.Linear layer as a parameter, not a separate module.
Since the FSDP auto wrap policy operates on the level of modules, not
parameters, there is no way to modify the auto wrap policy to wrap the
DoRA parameter; it must be its own module.

If not for this reason, #1797 would be preferable, since the number of
code changes is smaller overall. In this PR, the changes are more
numerous, but the majority only involve moving code around, not any
actual code changes.

Since we introduce a new submodule, extra steps are required to
ensure that old DoRA state dicts can still be loaded correctly. This
involves a fairly trivial extra remapping step in
set_peft_model_state_dict. The test for this is performed via the new
DoRA regression tests introduced in #1792.

Similarly, there is a remapping step involved in
get_peft_model_state_dict to ensure that when new state dicts with DoRA
are saved, they still conform to the old format.

An additional required change was to make a defensive copy of the base
layer before dequantizing its weight in order to calculate the weight
norm for DoRA. Without this defensive copy, some side-effect is
triggered in FSDP that results in

> ValueError: Cannot flatten integer dtype tensors

even though the compute dtype of bnb is correctly set to float.

Creating a fully functioning deepcopy currently does not work with 8-bit
BNB, but there is a fix. Once the next BNB release is out, 8-bit BNB
will be tested and enabled.

While working on this, I also noticed a small bug where dropout was not
correctly applied when using QDoRA. This is now also fixed.

This PR was tested successfully with FSDP and (Q)DoRA using the scripts
in examples/sft/ with a modification to enable DoRA.
2024-05-31 16:56:21 +02:00
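
A toy illustration of the core point (not PEFT's actual class): FSDP auto-wrap policies match nn.Module instances, so the DoRA magnitude vector must live in its own module rather than as a bare nn.Parameter.

```python
import torch
import torch.nn as nn

class DoraMagnitudeSketch(nn.Module):
    def __init__(self, out_features: int):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(out_features))  # now wrappable

# before: layer.lora_magnitude_vector[name] = nn.Parameter(...)  # invisible to auto-wrap
# after:  layer.lora_magnitude_vector[name] = DoraMagnitudeSketch(out_features)
```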
cb0bf07774 MNT Remove deprecated use of load_in_8bit (#1811)
Don't pass load_in_8bit to AutoModel.from_pretrained, instead use
BitsAndBytesConfig.

There was already a PR to clean this up (#1552) but a slightly later
PR (#1518) re-added this usage.

---------

Co-authored-by: Younes Belkada <49240599+younesbelkada@users.noreply.github.com>
2024-05-30 15:39:26 +02:00
8cd2cb613b CI Make torch compile tests run on GPU (#1808)
Many of these tests require a GPU to run, so use custom runners.

Code was mostly copied from existing workflows.

---------

Co-authored-by: Younes Belkada <49240599+younesbelkada@users.noreply.github.com>
2024-05-30 12:37:18 +02:00
e7b75070c7 TST: Add simple BNB regression tests (#1602)
These are very basic and simplistic regression tests for bnb. Their
purpose is to ensure that there is no unnoticed change in bnb that leads
to different outputs. There is no check for "correctness", just that the
results haven't changed.

Eventually, this workflow should be improved and moved to the bnb repo.

---------

Co-authored-by: Younes Belkada <49240599+younesbelkada@users.noreply.github.com>
2024-05-28 11:36:38 +02:00
1b262167f3 Docs / LoRA: Add more information on merge_and_unload docs (#1805)
* put back lora merging diagram

* push

* Update docs/source/developer_guides/lora.md

Co-authored-by: Benjamin Bossan <BenjaminBossan@users.noreply.github.com>

---------

Co-authored-by: Benjamin Bossan <BenjaminBossan@users.noreply.github.com>
2024-05-28 11:13:44 +02:00
39c60ffca9 TST Add regression test for DoRA, VeRA, BOFT, LNT (#1792)
These new methods were added but the regression tests were not extended
yet. This PR adds regression tests for these methods. The regression
artifacts have been pushed based on PEFT v0.11.1. The new tests pass
locally.
2024-05-27 12:00:47 +02:00
8304017a9a FIX BOFT device error after PR 1742 (#1799)
PR #1742 introduced the feature that adapters of the same layer can be
on different devices. A new method was introduced that is responsible
for moving the parameters related to a specific adapter in a consistent
way.

In BOFT, however, one parameter was overlooked, boft_P. This parameter
is not stored inside a ParameterDict or ModuleDict, hence it was not
moved. The reason is (presumably) that this parameter is shared between
all BOFT adapters, as it's always identical. However, this clashes with
having different adapters on different devices.

To solve this, the parameter is now moved on the fly to the correct
device during the forward pass.
2024-05-27 10:12:22 +02:00
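
Schematically, the fix amounts to this (a toy stand-in, not the real BOFT layer):

```python
import torch
import torch.nn as nn

class BoftLayerSketch(nn.Module):
    def __init__(self):
        super().__init__()
        self.register_buffer("boft_P", torch.eye(4))  # shared across adapters

    def forward(self, x):
        # move the shared parameter to the input's device on the fly, so
        # adapters living on different devices all work
        boft_P = self.boft_P.to(x.device)
        return boft_P @ x
```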
b2922565c4 TST Install bitsandbytes for compile tests (#1796)
Also, remove outdated comment.
2024-05-23 16:12:57 +02:00
3cf5359f11 FIX Allow same layer adapters on different devices (#1742)
The issue is that so far, we made the assumption in PEFT that all
adapter weights of a specific layer are on the same device. There can be
cases where it is useful to have adapters on different devices. E.g.
when a user loads a lot of LoRA adapters and wants to offload those not
currently in use to CPU, they would not currently be able to do so.

With this PR, we add this possibility. To achieve this, when we update
an adapter layer with a new adapter, we only move that specific adapter
to the device of the base layer, while not touching the other loaded
adapters.

While working on this, I discovered a small bug in VeRA when adding
multiple adapters, which is now also fixed.
2024-05-23 10:54:40 +02:00
cb7aedd9ba fix docs (#1793) 2024-05-23 11:37:30 +05:30
47745d57c2 FIX Use correct attribute name for HQQ in merge (#1791)
Without this fix, test_hqq_lora_model_outputs currently fails.
2024-05-22 16:35:27 +02:00
1fec23152a DOC TST Reproducibility of models using batch norm (#1734)
Fixes #1732

After loading a model that was trained with PEFT on a base model with
some kind of batch norm layer, the loaded model should produce the same
output. Right now, this does not happen.

The reason is that during training, buffers for running mean etc. are
updated, but they are not saved when calling save_pretrained on the
PeftModel instance. Normally in PEFT, we assume that during training,
the base model parameters are kept constant, which is not the case with
batch norm. We only save the PEFT parameters and assume that when the
user loads the base model, all parameters are restored exactly. As a
result, the information in the buffers is lost completely.

The fix is to add the batch norm layers to modules_to_save. This fix is
now documented and tested.
2024-05-22 10:43:29 +02:00
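
The documented fix, sketched (the module names "conv" and "batch_norm" are hypothetical; use whatever your base model calls the adapted and batch norm layers):

```python
from peft import LoraConfig

config = LoraConfig(
    target_modules=["conv"],         # the layers adapted with LoRA
    modules_to_save=["batch_norm"],  # also persist updated running stats
)
```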
bc6a99906c FIX Warning about config.json when the base model is local. (#1668)
Fix an incorrect warning when loading a local model.
2024-05-21 15:45:06 +02:00
691bc22ea6 ENH Layer/model status shows devices now (#1743)
For each adapter, show all the devices of this adapter's parameters.

Also, while working on this, I found a very minor bug in VeRA, as its
linear layer didn't implement its own __repr__.
2024-05-21 15:35:51 +02:00
fb7f2796e5 Add add_weighted_adapter to IA3 adapters (#1701)
* Add add_weighted_adapter to IA3 adapters

* Refactor to simplify code

* refactor test

* Add IA3 merging docs

* Update docs/source/developer_guides/model_merging.md

Co-authored-by: Benjamin Bossan <BenjaminBossan@users.noreply.github.com>

* Update docs/source/developer_guides/model_merging.md

Co-authored-by: Benjamin Bossan <BenjaminBossan@users.noreply.github.com>

* address PR feedback

---------

Co-authored-by: Benjamin Bossan <BenjaminBossan@users.noreply.github.com>
2024-05-17 22:29:22 +05:30
4e32679f37 TST: torch compile tests (#1725)
Right now, we don't have specific tests for torch.compile. Instead, we
have a "hack" that allows to run _all_ tests with torch.compile if we
set the environment variable PEFT_DEBUG_WITH_TORCH_COMPILE=1.

This is not very practical because it takes a lot of time to run all
these tests with compilation enabled. Also, currently hundreds of tests
are failing, which makes it impossible to understand more closely what
does or does not work.

This PR removes the aforementioned "hack" and instead replaces it with a
list of explicit torch.compile tests. Currently, these tests cover
training/inference with a bunch of different tuner types, as well as
more advanced features with LoRA (e.g. quantization, multiple adapters,
etc.).

Some of these tests pass and some of them fail. This is documented now,
so that users can quickly look up if their use case would be compatible
with torch.compile. This is very useful to have, because sometimes
torch.compile may appear to work but actually returns the wrong result.
For users, it's not immediately obvious when this happens.

The test suite is not exhaustive; there are many combinations of
features that could be added. However, it should be a good starting
point and can be extended later.

The test suite does _not_ cover whether torch.compile actually
accelerates the code. This may not be the case even if it works
correctly (e.g. because of graph breaks). Testing this would require
bigger models and more data, which is prohibitively slow to test.

---------

Co-authored-by: Younes Belkada <49240599+younesbelkada@users.noreply.github.com>
Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com>
2024-05-17 18:03:27 +02:00
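
What the new tests exercise, in a nutshell (a sketch; the target module name is a placeholder):

```python
import torch
from peft import LoraConfig, get_peft_model

def compile_peft_model(base_model):
    model = get_peft_model(base_model, LoraConfig(target_modules=["q_proj"]))
    return torch.compile(model)  # some PEFT features work compiled, some don't
```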
3f7aacd601 Bump version to 0.11.2.dev0 (#1741)
After patch release of 0.11.1.
2024-05-17 15:37:30 +02:00
e3eeabfad2 FIX BOFT setting env vars breaks C++ compilation (#1739)
Resolves #1738
2024-05-17 12:43:03 +02:00
ae1ae20b76 Autocast adapter weights if fp16/bf16 (#1706)
As discussed internally, we want to automatically cast the weights of
the adapter to float32 when using float16. Float16 is not conducive to
stable training and raises errors when used with AMP.

Previously, we had to recommend that users manually cast the weights
if they loaded the base model in float16, because PEFT would choose the
same dtype for the adapter as for the base weights. Forgetting this is a
common source of errors, so we choose to automate this.

If this causes trouble, users can prevent the behavior by passing
autocast_adapter_dtype=False to get_peft_model,
PeftModel.from_pretrained, or PeftModel.load_adapter.

This PR should be reviewed carefully, as it has the potential to break
existing code if something important was missed. We also need to add a
note for the upcoming release text about this change in behavior.
2024-05-16 17:11:36 +02:00
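
The opt-out named in the message, sketched (the target module name is a placeholder):

```python
from peft import LoraConfig, get_peft_model

def wrap_keeping_fp16_adapter(base_model_fp16):
    # skip the new float32 upcast; the adapter keeps the base model's dtype
    return get_peft_model(
        base_model_fp16,
        LoraConfig(target_modules=["q_proj"]),
        autocast_adapter_dtype=False,
    )
```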
2535036c24 ENH Save and load base model with revision (#1658) 2024-05-16 16:27:53 +02:00
e003ae7850 Bump version to 0.11.1.dev0 (#1736) 2024-05-16 12:34:29 +02:00
0649947396 Release: v0.11.0 (#1733) 2024-05-16 11:41:41 +02:00
b5acf5d6be Add PiSSA as an initialization method of LoRA (#1626)
Implements https://huggingface.co/papers/2404.02948.
2024-05-15 11:35:39 +02:00
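
Selecting PiSSA is done via the LoRA init option (a sketch; the "pissa_niter_N" variant exists for a faster approximate SVD, if I recall the naming correctly):

```python
from peft import LoraConfig

config = LoraConfig(init_lora_weights="pissa", r=16, lora_alpha=16)
# or: init_lora_weights="pissa_niter_4" for a faster, approximate SVD init
```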
748f7968f3 FIX Allow DoRA init on CPU when using BNB (#1724)
Resolves #1674

For some users, it is necessary to initialize the model on CPU, even
when using BitsAndBytes, which eventually requires a GPU. Since DoRA
requires dequantizing the BNB weights at initialization, we need to
temporarily move the corresponding weights to the GPU. After
dequantization, the weights are moved back to CPU.
2024-05-14 17:10:23 +02:00
47b3712898 DOC Document the PEFT checkpoint format (#1717)
Description of the PEFT checkpoint format and what it takes to convert
to it.

---------

Co-authored-by: Stas Bekman <stas00@users.noreply.github.com>
Co-authored-by: Younes Belkada <49240599+younesbelkada@users.noreply.github.com>
Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com>
2024-05-14 11:38:02 +02:00
2558dd872d Workflow: Add slack messages workflow (#1723)
* add slack messages workflow

* Update .github/workflows/build_docker_images.yml

* Update .github/workflows/build_docker_images.yml

* fix
2024-05-13 17:27:01 +02:00
6f41990da4 FIX Trailing ws in revise run_peft_multigpu.sh (#1722) 2024-05-10 11:47:52 +02:00
d8fec400c7 DOC Fix incorrect method name (#1719) 2024-05-09 12:19:50 +02:00
32f3878870 DOC Some small cleanups in docstrings, copyright note (#1714) 2024-05-07 12:50:19 +02:00
cb08d095a5 support Cambricon MLUs device (#1687)
* support mlu device

* rollback

* up

* add version check for mlu

* better accelerate version check for mlu device

* fix error with make style
2024-05-07 12:40:46 +02:00
86d086ec37 FEAT Helper to check if a model is a PEFT model (#1713) 2024-05-07 11:06:03 +02:00
02ae6bcb37 Add LoRA support to HQQ Quantization (#1618)
* Add HQQ Lora

* fix error weight load

* Remove unused

* Add quantized lora

* fix make HQQLinear

* Fix dtype

* Revert back quantize lora

* Add prepare training for hqq quantization

* Forget revert hqq

* Remove warnings

* Other ways to check hqq quantization

* Add unit test for training

* change bfloat16 to float16

* Fix load weight when applied dora

* Move import hqq inside if clause

* Naming using CamelCase

* Remove unused function and fix naming convention

* Pop offload_meta

* Add use_dora params

* Remove confusing comments

* Additional test for checking output from HQQ

* Add license notice

* Add parameter decorator

* Redundant calling get_base_layer

* do make style

* Remove unused comments

* Move dispatch_hqq out of if clause

* make style all scripts

* Add comment for explanation

* Mention HQQ to docs

* Add HQQ to Dockerfile

* Fix styling

* Styling scripts

* Comply with transformers HQQ integration

* Test fully using transformers

* Add comments handling HQQ

* Fix naming problem
2024-05-03 15:43:26 +02:00
77b7238b90 fix the fsdp peft autowrap policy (#1694)
* fix the fsdp peft autowrap policy

* address comment wrt backwards compatibility
2024-05-01 09:08:55 +05:30
3edcebf713 Set experimental dynamo config for compile tests (#1698)
Right now, a lot of tests fail when applying torch.compile to PEFT
models. One of the main reasons is that attribute checks (self.foo) on
nn.Modules are not correctly considered.

This PR sets an experimental flag that should fix this. However, this is
not public PyTorch API (yet) and incurs a performance penalty. Still,
it's interesting to see how this affects our tests.

More context:
https://github.com/pytorch/pytorch/issues/124717#issuecomment-2083235776
2024-04-30 14:32:20 +02:00
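
The commit does not name the flag, but judging from the linked issue it is presumably dynamo's nn.Module guard setting, something like:

```python
# assumption: this is the experimental, non-public flag referred to above
import torch._dynamo

torch._dynamo.config.guard_nn_modules = True
```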
e0cb15e2ee FIX Use different doc builder docker image (#1697)
Same as in:

c712d05aa8/.github/workflows/build_documentation.yml (L19)
2024-04-30 13:30:07 +02:00
3ec55f4ac4 FEAT Add LayerNorm tuning (#1301)
LN tuning based on: https://arxiv.org/abs/2312.11420
2024-04-30 12:21:38 +02:00
608a90ded9 TST: Skipping AWQ tests for now .. (#1690)
* Update test_gpu_examples.py

* Update tests/test_gpu_examples.py
2024-04-29 18:27:13 +02:00
e19f7bf424 FIX Doc error prompt tuning seq len calc (#1686)
Signed-off-by: Jiri Podivin <jpodivin@gmail.com>
2024-04-29 16:23:46 +02:00
250b7eb85f FEAT Show adapter layer and model status (#1663)
This PR adds a new feature to PEFT models that allows you to better
understand the status of adapter(s) on the model. Quoting from the doc
entry that I added:

Sometimes, the PEFT model can end up in a bad state, especially when
handling multiple adapters. There can be some confusion around what
adapters exist, which one is active, which one is merged, etc. To help
investigate this issue, you can call the
get_layer_status and the
get_model_status methods. The first one gives you a
detailed overview of the adapters for each targeted layer. The latter
one gives you a high-level overview of the model status.

---------

Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com>
Co-authored-by: Younes Belkada <49240599+younesbelkada@users.noreply.github.com>
2024-04-29 13:31:23 +02:00
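
The two inspection methods quoted above, called on a PeftModel (a sketch):

```python
def inspect(peft_model):
    layer_status = peft_model.get_layer_status()  # per-layer adapter details
    model_status = peft_model.get_model_status()  # high-level model summary
    return layer_status, model_status
```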
f5f7b67d60 FIX Issues with AdaLora initialization (#1652)
Resolves #1647

- AdaLoraConfig now converts target_modules to set, same as LoRA
- AdaLoraConfig now raises when used with DoRA
- AdaLoraConfig now raises when used with LoftQ
- AdaLoraModel now raises when trying to call add_weighted_adapter
- Add tests for those in test_initialization.py
- Small clean ups in test_initialization.py
2024-04-29 13:09:34 +02:00
7a22b7daf0 FIX bf16 dtype issue for IA3 (#1634)
Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>
2024-04-29 11:50:42 +02:00
e7b47ac01d FIX Init DoRA weights in float32 if float16 used (#1653)
When DoRA weights are initialized in float16 on CPU and an older
PyTorch version is being used (<2.2), there is an error because the
operation is not supported for float16 on CPU. This commit temporarily
converts the LoRA weights to float32 beforehand if they're in float16.

Of course, when the user tries to train or predict with this model on
CPU, they will still encounter errors. However, in certain situations,
only the initialization might be on CPU and later it is moved to GPU.
This could be some framework code that the user has no control over, as
in #1597. Therefore, it's good to have this safety hatch.

Note that since our CI uses the latest PyTorch version, we cannot run a
test for this, as the latest PyTorch runs no matter what.
2024-04-29 11:35:47 +02:00
8bc3c0861d Update Dockerfile (#1684) 2024-04-26 15:49:02 +02:00
383e1fab0e Update build_docker_images.yml (#1682) 2024-04-26 10:48:05 +02:00
d0fa70aeb6 FEAT: Add EETQ support in PEFT (#1675)
* v1

* fix tests'

* fix unneeded change

* fix unneeded change

* fix unneeded change

* fix

* fix CI

* fix docker image

* fix docker image

* add docs

* lazy import

* raise when merge

* raise when merge

* Update eetq.py

* merge

* style

* add unmerge

* indent

* Update docs/source/developer_guides/quantization.md

Co-authored-by: Benjamin Bossan <BenjaminBossan@users.noreply.github.com>

* add details about transformers

---------

Co-authored-by: Benjamin Bossan <BenjaminBossan@users.noreply.github.com>
2024-04-26 10:20:18 +02:00
b1d6c77108 FIX Don't eagerly import bnb for LoftQ (#1683)
We accidentally added code in loftq_utils.py that eagerly imports bnb,
which we want to avoid to prevent CUDA from being initialized too early.
2024-04-25 20:35:16 +02:00
143 changed files with 28060 additions and 752 deletions

View File

@@ -23,7 +23,7 @@ body:
Please tag fewer than 3 people.
Library: @pacman100 @younesbelkada @benjaminbossan @sayakpaul
Library: @benjaminbossan @sayakpaul
Documentation: @stevhliu

View File

@@ -45,30 +45,14 @@ jobs:
push: true
tags: huggingface/peft-cpu
- name: Post to a Slack channel
id: slack
#uses: slackapi/slack-github-action@v1.25.0
uses: slackapi/slack-github-action@6c661ce58804a1a20f6dc5fbee7f0381b469e001
- name: Post to Slack
if: always()
uses: huggingface/hf-workflows/.github/actions/post-slack@main
with:
# Slack channel id, channel name, or user id to post message.
# See also: https://api.slack.com/methods/chat.postMessage#channels
channel-id: ${{ env.CI_SLACK_CHANNEL }}
# For posting a rich message using Block Kit
payload: |
{
"text": "peft-cpu Docker Image build result: ${{ job.status }}\n${{ github.event.pull_request.html_url || github.event.head_commit.url }}",
"blocks": [
{
"type": "section",
"text": {
"type": "mrkdwn",
"text": "peft-cpu Docker Image build result: ${{ job.status }}\n${{ github.event.pull_request.html_url || github.event.head_commit.url }}"
}
}
]
}
env:
SLACK_BOT_TOKEN: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
slack_channel: ${{ env.CI_SLACK_CHANNEL }}
title: 🤗 Results of the PEFT-CPU docker build
status: ${{ job.status }}
slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
latest-cuda:
name: "Latest Peft GPU [dev]"
@@ -100,31 +84,15 @@ jobs:
context: ./docker/peft-gpu
push: true
tags: huggingface/peft-gpu
- name: Post to a Slack channel
id: slack
#uses: slackapi/slack-github-action@v1.25.0
uses: slackapi/slack-github-action@6c661ce58804a1a20f6dc5fbee7f0381b469e001
- name: Post to Slack
if: always()
uses: huggingface/hf-workflows/.github/actions/post-slack@main
with:
# Slack channel id, channel name, or user id to post message.
# See also: https://api.slack.com/methods/chat.postMessage#channels
channel-id: ${{ env.CI_SLACK_CHANNEL }}
# For posting a rich message using Block Kit
payload: |
{
"text": "peft-gpu Docker Image build result: ${{ job.status }}\n${{ github.event.pull_request.html_url || github.event.head_commit.url }}",
"blocks": [
{
"type": "section",
"text": {
"type": "mrkdwn",
"text": "peft-gpu Docker Image build result: ${{ job.status }}\n${{ github.event.pull_request.html_url || github.event.head_commit.url }}"
}
}
]
}
env:
SLACK_BOT_TOKEN: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
slack_channel: ${{ env.CI_SLACK_CHANNEL }}
title: 🤗 Results of the PEFT-GPU docker build
status: ${{ job.status }}
slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
latest-cuda-bnb-source:
name: "Latest Peft GPU + bnb source [dev]"
@@ -156,33 +124,15 @@ jobs:
context: ./docker/peft-gpu-bnb-source
push: true
tags: huggingface/peft-gpu-bnb-source
- name: Post to a Slack channel
id: slack
#uses: slackapi/slack-github-action@v1.25.0
uses: slackapi/slack-github-action@6c661ce58804a1a20f6dc5fbee7f0381b469e001
- name: Post to Slack
if: always()
uses: huggingface/hf-workflows/.github/actions/post-slack@main
with:
# Slack channel id, channel name, or user id to post message.
# See also: https://api.slack.com/methods/chat.postMessage#channels
channel-id: ${{ env.CI_SLACK_CHANNEL }}
# For posting a rich message using Block Kit
payload: |
{
"text": "peft-gpu + bnb-source (source) Docker Image build result: ${{ job.status }}\n${{ github.event.pull_request.html_url || github.event.head_commit.url }}",
"blocks": [
{
"type": "section",
"text": {
"type": "mrkdwn",
"text": "peft-gpu + bnb-source (source) Docker Image build result: ${{ job.status }}\n${{ github.event.pull_request.html_url || github.event.head_commit.url }}"
}
}
]
}
env:
SLACK_BOT_TOKEN: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
slack_channel: ${{ env.CI_SLACK_CHANNEL }}
title: 🤗 Results of the PEFT-GPU (bnb source / HF latest) docker build
status: ${{ job.status }}
slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
latest-cuda-bnb-source-latest:
name: "Latest Peft GPU + bnb source [accelerate / peft / transformers latest]"
@@ -214,31 +164,15 @@ jobs:
context: ./docker/peft-gpu-bnb-latest
push: true
tags: huggingface/peft-gpu-bnb-latest
- name: Post to a Slack channel
id: slack
#uses: slackapi/slack-github-action@v1.25.0
uses: slackapi/slack-github-action@6c661ce58804a1a20f6dc5fbee7f0381b469e001
- name: Post to Slack
if: always()
uses: huggingface/hf-workflows/.github/actions/post-slack@main
with:
# Slack channel id, channel name, or user id to post message.
# See also: https://api.slack.com/methods/chat.postMessage#channels
channel-id: ${{ env.CI_SLACK_CHANNEL }}
# For posting a rich message using Block Kit
payload: |
{
"text": "peft-gpu + bnb-source (latest) Docker Image build result: ${{ job.status }}\n${{ github.event.pull_request.html_url || github.event.head_commit.url }}",
"blocks": [
{
"type": "section",
"text": {
"type": "mrkdwn",
"text": "peft-gpu + bnb-source (latest) Docker Image build result: ${{ job.status }}\n${{ github.event.pull_request.html_url || github.event.head_commit.url }}"
}
}
]
}
env:
SLACK_BOT_TOKEN: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
slack_channel: ${{ env.CI_SLACK_CHANNEL }}
title: 🤗 Results of the PEFT-GPU (bnb source / HF source) docker build
status: ${{ job.status }}
slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
latest-cuda-bnb-source-multi:
name: "Latest Peft GPU + bnb (multi-backend) source [accelerate / peft / transformers source]"
@@ -270,28 +204,14 @@ jobs:
context: ./docker/peft-gpu-bnb-multi-source
push: true
tags: huggingface/peft-gpu-bnb-multi-source
- name: Post to a Slack channel
id: slack
#uses: slackapi/slack-github-action@v1.25.0
uses: slackapi/slack-github-action@6c661ce58804a1a20f6dc5fbee7f0381b469e001
- name: Post to Slack
if: always()
uses: huggingface/hf-workflows/.github/actions/post-slack@main
with:
# Slack channel id, channel name, or user id to post message.
# See also: https://api.slack.com/methods/chat.postMessage#channels
channel-id: ${{ env.CI_SLACK_CHANNEL }}
# For posting a rich message using Block Kit
payload: |
{
"text": "peft-gpu + bnb-source (latest) Docker Image build result: ${{ job.status }}\n${{ github.event.pull_request.html_url || github.event.head_commit.url }}",
"blocks": [
{
"type": "section",
"text": {
"type": "mrkdwn",
"text": "peft-gpu + bnb-source (latest) Docker Image build result: ${{ job.status }}\n${{ github.event.pull_request.html_url || github.event.head_commit.url }}"
}
}
]
}
env:
SLACK_BOT_TOKEN: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
slack_channel: ${{ env.CI_SLACK_CHANNEL }}
title: 🤗 Results of the PEFT-GPU (bnb source multi-backend / HF latest) docker build
status: ${{ job.status }}
slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}

View File

@@ -14,6 +14,7 @@ jobs:
commit_sha: ${{ github.sha }}
package: peft
notebook_folder: peft_docs
custom_container: huggingface/transformers-doc-builder
secrets:
token: ${{ secrets.HUGGINGFACE_PUSH }}
hf_token: ${{ secrets.HF_DOC_BUILD_PUSH }}

View File

@@ -14,3 +14,4 @@ jobs:
commit_sha: ${{ github.event.pull_request.head.sha }}
pr_number: ${{ github.event.number }}
package: peft
custom_container: huggingface/transformers-doc-builder

View File

@@ -15,6 +15,7 @@ env:
jobs:
run_all_tests_single_gpu:
timeout-minutes: 60
strategy:
fail-fast: false
matrix:
@@ -46,23 +47,88 @@ jobs:
git checkout tags/v$transformers_version
cd ..
fi
- name: Run examples on single GPU
- name: Test bnb import
id: import
if: always()
run: |
source activate peft
make tests_examples_single_gpu_bnb
python3 -m bitsandbytes
python3 -c "import bitsandbytes as bnb"
- name: Post to Slack
if: always()
uses: huggingface/hf-workflows/.github/actions/post-slack@main
with:
slack_channel: ${{ secrets.BNB_SLACK_CHANNEL_ID }}
title: 🤗 Results of bitsandbytes import
status: ${{ steps.examples_tests.outcome }}
slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
# TODO: uncomment this block if error is solved or bnb multi backend branch is merged
# - name: Run examples on single GPU
# id: examples_tests
# if: always()
# run: |
# source activate peft
# make tests_examples_single_gpu_bnb
# - name: Post to Slack
# if: always()
# uses: huggingface/hf-workflows/.github/actions/post-slack@main
# with:
# slack_channel: ${{ secrets.BNB_SLACK_CHANNEL_ID }}
# title: 🤗 Results of bitsandbytes examples tests - single GPU
# status: ${{ steps.examples_tests.outcome }}
# slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
- name: Run core tests on single GPU
id: core_tests
if: always()
run: |
source activate peft
make tests_core_single_gpu_bnb
- name: Post to Slack
if: always()
uses: huggingface/hf-workflows/.github/actions/post-slack@main
with:
slack_channel: ${{ secrets.BNB_SLACK_CHANNEL_ID }}
title: 🤗 Results of bitsandbytes core tests - single GPU
status: ${{ steps.core_tests.outcome }}
slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
- name: Run BNB regression tests on single GPU
id: regression_tests
if: always()
run: |
source activate peft
make tests_gpu_bnb_regression
- name: Post to Slack
if: always()
uses: huggingface/hf-workflows/.github/actions/post-slack@main
with:
slack_channel: ${{ secrets.BNB_SLACK_CHANNEL_ID }}
title: 🤗 Results of bitsandbytes regression tests - single GPU
status: ${{ steps.regression_tests.outcome }}
slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
- name: Run transformers tests on single GPU
id: transformers_tests
if: always()
run: |
source activate peft
make transformers_tests
- name: Post to Slack
if: always()
uses: huggingface/hf-workflows/.github/actions/post-slack@main
with:
slack_channel: ${{ secrets.BNB_SLACK_CHANNEL_ID }}
title: 🤗 Results of bitsandbytes transformers tests - single GPU
status: ${{ steps.transformers_tests.outcome }}
slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
- name: Generate Report
if: always()
@@ -71,6 +137,7 @@ jobs:
python scripts/log_reports.py --slack_channel_name bnb-daily-ci-collab >> $GITHUB_STEP_SUMMARY
run_all_tests_multi_gpu:
timeout-minutes: 60
strategy:
fail-fast: false
matrix:
@@ -103,28 +170,75 @@ jobs:
cd ..
fi
- name: Test bnb import
id: import
if: always()
run: |
source activate peft
python3 -m bitsandbytes
python3 -c "import bitsandbytes as bnb"
- name: Post to Slack
if: always()
uses: huggingface/hf-workflows/.github/actions/post-slack@main
with:
slack_channel: ${{ secrets.BNB_SLACK_CHANNEL_ID }}
title: 🤗 Results of bitsandbytes import
status: ${{ steps.import.outcome }}
slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
- name: Run core GPU tests on multi-gpu
if: always()
run: |
source activate peft
- name: Run examples on multi GPU
id: examples_tests
if: always()
run: |
source activate peft
make tests_examples_multi_gpu_bnb
- name: Post to Slack
if: always()
uses: huggingface/hf-workflows/.github/actions/post-slack@main
with:
slack_channel: ${{ secrets.BNB_SLACK_CHANNEL_ID }}
title: 🤗 Results of bitsandbytes examples tests - multi GPU
status: ${{ steps.examples_tests.outcome }}
slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
- name: Run core tests on multi GPU
id: core_tests
if: always()
run: |
source activate peft
make tests_core_multi_gpu_bnb
- name: Post to Slack
if: always()
uses: huggingface/hf-workflows/.github/actions/post-slack@main
with:
slack_channel: ${{ secrets.BNB_SLACK_CHANNEL_ID }}
title: 🤗 Results of bitsandbytes core tests - multi GPU
status: ${{ steps.core_tests.outcome }}
slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
- name: Run transformers tests on multi GPU
id: transformers_tests
if: always()
run: |
source activate peft
make transformers_tests
- name: Post to Slack
if: always()
uses: huggingface/hf-workflows/.github/actions/post-slack@main
with:
slack_channel: ${{ secrets.BNB_SLACK_CHANNEL_ID }}
title: 🤗 Results of bitsandbytes transformers tests - multi GPU
status: ${{ steps.transformers_tests.outcome }}
slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
- name: Generate Report
if: always()

View File

@@ -26,3 +26,11 @@ jobs:
- name: Test with pytest
run: |
make test
- name: Post to Slack
if: always()
uses: huggingface/hf-workflows/.github/actions/post-slack@main
with:
slack_channel: ${{ secrets.SLACK_CHANNEL_ID }}
title: 🤗 Results of transformers main tests
status: ${{ job.status }}
slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}

View File

@@ -31,6 +31,8 @@ jobs:
tests:
needs: check_code_quality
strategy:
# TODO: remove 'fail-fast' line once timeout issue from the Hub is solved
fail-fast: false
matrix:
python-version: ["3.8", "3.9", "3.10", "3.11"]
os: ["ubuntu-latest", "macos-12", "windows-latest"]
@@ -48,6 +50,12 @@ jobs:
python -m pip install --upgrade pip
# cpu version of pytorch
pip install -e .[test]
- name: Downgrade numpy on MacOS and Windows
# TODO: remove numpy downgrade on MacOS & Windows once torch fixes numpy 2.0 issue
shell: bash
if: matrix.os == 'windows-latest' || matrix.os == 'macos-12'
run: |
pip install --force-reinstall -U "numpy<2.0.0"
- name: Test with pytest
run: |
make test

View File

@@ -1,7 +1,5 @@
name: torch compile tests
# see peft/tests/__init__.py
on:
workflow_dispatch:
inputs:
@@ -13,31 +11,41 @@ on:
required: false
default: false
env:
RUN_SLOW: "yes"
IS_GITHUB_CI: "1"
# To be able to run tests on CUDA 12.2
NVIDIA_DISABLE_REQUIRE: "1"
jobs:
run_tests_with_compile:
runs-on: ubuntu-latest
runs-on: [self-hosted, single-gpu, nvidia-gpu, a10, ci]
env:
PEFT_DEBUG_WITH_TORCH_COMPILE: 1
CUDA_VISIBLE_DEVICES: "0"
TEST_TYPE: "single_gpu_huggingface/peft-gpu-bnb-latest:latest"
container:
image: "huggingface/peft-gpu-bnb-latest:latest"
options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
defaults:
run:
shell: bash
steps:
- uses: actions/checkout@v4
with:
ref: ${{ github.event.inputs.branch }}
repository: ${{ github.event.pull_request.head.repo.full_name }}
- name: Set up Python
uses: actions/setup-python@v4
with:
python-version: "3.10"
cache: "pip"
cache-dependency-path: "setup.py"
- name: Install dependencies
- name: Pip install
run: |
python -m pip install --upgrade pip
python -m pip install .[test]
source activate peft
pip install -e . --no-deps
pip install pytest-cov pytest-reportlog parameterized datasets scipy einops
pip install "pytest>=7.2.0,<8.0.0" # see: https://github.com/huggingface/transformers/blob/ce4fff0be7f6464d713f7ac3e0bbaafbc6959ae5/setup.py#L148C6-L148C26
if [ "${{ github.event.inputs.pytorch_nightly }}" = "true" ]; then
python -m pip install --upgrade --pre torch --index-url https://download.pytorch.org/whl/nightly/cpu
fi
- name: Test compile with pytest
run: |
source activate peft
echo "PEFT_DEBUG_WITH_TORCH_COMPILE=$PEFT_DEBUG_WITH_TORCH_COMPILE"
git status
make test
make tests_torch_compile

.github/workflows/trufflehog.yml (new file)
View File

@@ -0,0 +1,15 @@
on:
push:
name: Secret Leaks
jobs:
trufflehog:
runs-on: ubuntu-latest
steps:
- name: Checkout code
uses: actions/checkout@v4
with:
fetch-depth: 0
- name: Secret Scanning
uses: trufflesecurity/trufflehog@main

View File

@@ -6,13 +6,13 @@ check_dirs := src tests examples docs scripts docker
# this target runs checks on all files
quality:
ruff $(check_dirs)
ruff check $(check_dirs)
ruff format --check $(check_dirs)
doc-builder style src/peft tests docs/source --max_len 119 --check_only
# Format source code automatically and check if there are any problems left that need manual fixing
style:
ruff $(check_dirs) --fix
ruff check --fix $(check_dirs)
ruff format $(check_dirs)
doc-builder style src/peft tests docs/source --max_len 119
@@ -47,9 +47,15 @@ tests_core_multi_gpu_bnb:
tests_core_single_gpu_bnb:
python -m pytest -m "single_gpu_tests and bitsandbytes" tests/test_common_gpu.py $(if $(IS_GITHUB_CI),--report-log "core_single_gpu.log",)
tests_gpu_bnb_regression:
python -m pytest tests/bnb/test_bnb_regression.py $(if $(IS_GITHUB_CI),--report-log "bnb_regression_gpu.log",)
# For testing transformers tests for bnb runners
transformers_tests:
RUN_SLOW=1 python -m pytest transformers-clone/tests/quantization/bnb $(if $(IS_GITHUB_CI),--report-log "transformers_tests.log",)
tests_regression:
python -m pytest -s --regression tests/regression/ $(if $(IS_GITHUB_CI),--report-log "regression_tests.log",)
tests_torch_compile:
python -m pytest tests/test_torch_compile.py $(if $(IS_GITHUB_CI),--report-log "compile_tests.log",)

View File

@@ -42,9 +42,9 @@ RUN source activate peft && \
# Add autoawq for quantization testing
RUN source activate peft && \
python3 -m pip install --no-cache-dir https://github.com/casper-hansen/AutoAWQ/releases/download/v0.2.1/autoawq-0.2.1-cp38-cp38-linux_x86_64.whl
python3 -m pip install --no-cache-dir https://github.com/casper-hansen/AutoAWQ/releases/download/v0.2.4/autoawq-0.2.4-cp38-cp38-linux_x86_64.whl
RUN source activate peft && \
python3 -m pip install --no-cache-dir https://github.com/casper-hansen/AutoAWQ_kernels/releases/download/v0.0.4/autoawq_kernels-0.0.4-cp38-cp38-linux_x86_64.whl
python3 -m pip install --no-cache-dir https://github.com/casper-hansen/AutoAWQ_kernels/releases/download/v0.0.6/autoawq_kernels-0.0.6-cp38-cp38-linux_x86_64.whl
# Install apt libs
RUN apt-get update && \
@@ -52,6 +52,10 @@ RUN apt-get update && \
apt-get clean && \
rm -rf /var/lib/apt/lists*
# Add eetq for quantization testing
RUN source activate peft && \
python3 -m pip install git+https://github.com/NetEase-FuXi/EETQ.git
# Activate the conda env and install transformers + accelerate from source
RUN source activate peft && \
python3 -m pip install -U --no-cache-dir \
@@ -66,6 +70,10 @@ RUN source activate peft && \
RUN source activate peft && \
pip install aqlm[gpu]>=1.0.2
# Add HQQ for quantization testing
RUN source activate peft && \
pip install hqq
RUN source activate peft && \
pip freeze | grep transformers

View File

@@ -37,10 +37,16 @@
title: Adapter injection
- local: developer_guides/mixed_models
title: Mixed adapter types
- local: developer_guides/torch_compile
title: torch.compile
- local: developer_guides/contributing
title: Contribute to PEFT
- local: developer_guides/troubleshooting
title: Troubleshooting
- local: developer_guides/checkpoint
title: PEFT checkpoint format
- local: package_reference/helpers
title: Helpers
- title: 🤗 Accelerate integrations
sections:
@@ -102,8 +108,13 @@
title: Prefix tuning
- local: package_reference/prompt_tuning
title: Prompt tuning
- local: package_reference/layernorm_tuning
title: Layernorm tuning
- local: package_reference/vera
title: VeRA
- local: package_reference/fourierft
title: FourierFT
title: Adapters
- sections:
- local: package_reference/merge_utils

View File

@@ -249,7 +249,7 @@ accelerate launch --config_file "configs/fsdp_config_qlora.yaml" train.py \
--bnb_4bit_quant_storage_dtype "bfloat16"
```
Notice the new argument being passed, `bnb_4bit_quant_storage_dtype`, which denotes the data type for packing the 4-bit parameters. For example, when it is set to `bfloat16`, **32/4 = 8** 4-bit params are packed together post quantization. When using mixed precision training with `bfloat16`, `bnb_4bit_quant_storage_dtype` can be either `bfloat16` for pure `bfloat16` finetuning, or `float32` for automatic mixed precision (this consumes more GPU memory). When using mixed precision training with `float16`, `bnb_4bit_quant_storage_dtype` should be set to `float32` for stable automatic mixed precision training.
Notice the new argument being passed, `bnb_4bit_quant_storage_dtype`, which denotes the data type for packing the 4-bit parameters. For example, when it is set to `bfloat16`, **16/4 = 4** 4-bit params are packed together post quantization. When using mixed precision training with `bfloat16`, `bnb_4bit_quant_storage_dtype` can be either `bfloat16` for pure `bfloat16` finetuning, or `float32` for automatic mixed precision (this consumes more GPU memory). When using mixed precision training with `float16`, `bnb_4bit_quant_storage_dtype` should be set to `float32` for stable automatic mixed precision training.
In terms of training code, the important code changes are:
@@ -288,4 +288,5 @@ You can also refer to the [llama-recipes](https://github.com/facebookresearch/llama
1. Merging when using PEFT and FSDP is currently unsupported and will raise an error.
2. Passing the `modules_to_save` config parameter is untested at present.
3. GPU memory saving when using CPU offloading is untested at present.
4. When using FSDP+QLoRA, `paged_adamw_8bit` currently results in an error when saving a checkpoint.
5. DoRA training with FSDP should work (albeit at lower speed than LoRA). If combined with bitsandbytes (QDoRA), 4-bit quantization should also work, but 8-bit quantization has known issues and is not recommended.

View File

@@ -64,9 +64,9 @@ Take a look at [P-tuning for sequence classification](../task_guides/ptuning-seq
<div class="flex justify-center">
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/peft/mpt.png"/>
</div>
<small><a href="https://hf.co/papers/2103.10385">Multitask prompt tuning enables parameter-efficient transfer learning</a>.</small>
<small><a href="https://hf.co/papers/2303.02861">Multitask prompt tuning enables parameter-efficient transfer learning</a>.</small>
[Multitask prompt tuning (MPT)](https://hf.co/papers/2103.10385) learns a single prompt from data for multiple task types that can be shared for different target tasks. Other existing approaches learn a separate soft prompt for each task that need to be retrieved or aggregated for adaptation to target tasks. MPT consists of two stages:
[Multitask prompt tuning (MPT)](https://hf.co/papers/2303.02861) learns a single prompt from data for multiple task types that can be shared for different target tasks. Other existing approaches learn a separate soft prompt for each task that need to be retrieved or aggregated for adaptation to target tasks. MPT consists of two stages:
1. source training - for each task, its soft prompt is decomposed into task-specific vectors. The task-specific vectors are multiplied together to form another matrix W, and the Hadamard product is used between W and a shared prompt matrix P to generate a task-specific prompt matrix. The task-specific prompts are distilled into a single prompt matrix that is shared across all tasks. This prompt is trained with multitask training.
2. target adaptation - to adapt the single prompt for a target task, a target prompt is initialized and expressed as the Hadamard product of the shared prompt matrix and the task-specific low-rank prompt matrix.

View File

@ -0,0 +1,250 @@
<!--Copyright 2024 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.
-->
# PEFT checkpoint format
This document describes how PEFT's checkpoint files are structured and how to convert between the PEFT format and other formats.
## PEFT files
PEFT (parameter-efficient fine-tuning) methods only update a small subset of a model's parameters rather than all of them. This is nice because checkpoint files can generally be much smaller than the original model files and are easier to store and share. However, this also means that to load a PEFT model, you need to have the original model available as well.
When you call [`~PeftModel.save_pretrained`] on a PEFT model, the PEFT model saves three files, described below:
1. `adapter_model.safetensors` or `adapter_model.bin`
By default, the model is saved in the `safetensors` format, a secure alternative to the `bin` format, which is known to be susceptible to [security vulnerabilities](https://huggingface.co/docs/hub/security-pickle) because it uses the pickle utility under the hood. Both formats store the same `state_dict` though, and are interchangeable.
The `state_dict` only contains the parameters of the adapter module, not the base model. To illustrate the difference in size, a normal BERT model requires ~420MB of disk space, whereas an IA³ adapter on top of this BERT model only requires ~260KB.
2. `adapter_config.json`
The `adapter_config.json` file contains the configuration of the adapter module, which is necessary to load the model. Below is an example of an `adapter_config.json` for an IA³ adapter with standard settings applied to a BERT model:
```json
{
"auto_mapping": {
"base_model_class": "BertModel",
"parent_library": "transformers.models.bert.modeling_bert"
},
"base_model_name_or_path": "bert-base-uncased",
"fan_in_fan_out": false,
"feedforward_modules": [
"output.dense"
],
"inference_mode": true,
"init_ia3_weights": true,
"modules_to_save": null,
"peft_type": "IA3",
"revision": null,
"target_modules": [
"key",
"value",
"output.dense"
],
"task_type": null
}
```
The configuration file contains:
- the adapter module type stored, `"peft_type": "IA3"`
- information about the base model like `"base_model_name_or_path": "bert-base-uncased"`
- the revision of the model (if any), `"revision": null`
If the base model is not a pretrained Transformers model, the latter two entries will be `null`. Other than that, the settings are all related to the specific IA³ adapter that was used to fine-tune the model.
3. `README.md`
The generated `README.md` is the model card of a PEFT model and contains a few pre-filled entries. The intent of this is to make it easier to share the model with others and to provide some basic information about the model. This file is not needed to load the model.
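To get a feel for these files, here is a minimal sketch of inspecting a saved checkpoint directory (the directory name is a placeholder):
```python
# a sketch: inspect a saved PEFT checkpoint directory
import json
from safetensors.torch import load_file

state_dict = load_file("my_adapter/adapter_model.safetensors")
print(len(state_dict), "adapter tensors")  # only adapter weights, no base weights

with open("my_adapter/adapter_config.json") as f:
    config = json.load(f)
print(config["peft_type"])
```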
## Convert to PEFT format
When converting from another format to the PEFT format, we require both the `adapter_model.safetensors` (or `adapter_model.bin`) file and the `adapter_config.json` file.
### adapter_model
For the model weights, it is important to use the correct mapping from parameter name to value for PEFT to load the file. Getting this mapping right is an exercise in checking the implementation details, as there is no generally agreed upon format for PEFT adapters.
Fortunately, figuring out this mapping is not overly complicated for common base cases. Let's look at a concrete example, the [`LoraLayer`](https://github.com/huggingface/peft/blob/main/src/peft/tuners/lora/layer.py):
```python
# showing only part of the code
class LoraLayer(BaseTunerLayer):
# All names of layers that may contain (trainable) adapter weights
adapter_layer_names = ("lora_A", "lora_B", "lora_embedding_A", "lora_embedding_B")
# All names of other parameters that may contain adapter-related parameters
other_param_names = ("r", "lora_alpha", "scaling", "lora_dropout")
def __init__(self, base_layer: nn.Module, **kwargs) -> None:
self.base_layer = base_layer
self.r = {}
self.lora_alpha = {}
self.scaling = {}
self.lora_dropout = nn.ModuleDict({})
self.lora_A = nn.ModuleDict({})
self.lora_B = nn.ModuleDict({})
# For Embedding layer
self.lora_embedding_A = nn.ParameterDict({})
self.lora_embedding_B = nn.ParameterDict({})
# Mark the weight as unmerged
self._disable_adapters = False
self.merged_adapters = []
self.use_dora: dict[str, bool] = {}
self.lora_magnitude_vector: Optional[torch.nn.ParameterDict] = None # for DoRA
self._caches: dict[str, Any] = {}
self.kwargs = kwargs
```
In the `__init__` code used by all `LoraLayer` classes in PEFT, there are a bunch of parameters used to initialize the model, but only a few are relevant for the checkpoint file: `lora_A`, `lora_B`, `lora_embedding_A`, and `lora_embedding_B`. These parameters are listed in the class attribute `adapter_layer_names` and contain the learnable parameters, so they must be included in the checkpoint file. All the other parameters, like the rank `r`, are derived from the `adapter_config.json` and must be included there (unless the default value is used).
Let's check the `state_dict` of a PEFT LoRA model applied to BERT. When printing the first five keys using the default LoRA settings (the remaining keys are the same, just with different layer numbers), we get:
- `base_model.model.encoder.layer.0.attention.self.query.lora_A.weight`
- `base_model.model.encoder.layer.0.attention.self.query.lora_B.weight`
- `base_model.model.encoder.layer.0.attention.self.value.lora_A.weight`
- `base_model.model.encoder.layer.0.attention.self.value.lora_B.weight`
- `base_model.model.encoder.layer.1.attention.self.query.lora_A.weight`
- etc.
Let's break this down:
- By default, for BERT models, LoRA is applied to the `query` and `value` layers of the attention module. This is why you see `attention.self.query` and `attention.self.value` in the key names for each layer.
- LoRA decomposes the weights into two low-rank matrices, `lora_A` and `lora_B`. This is where `lora_A` and `lora_B` come from in the key names.
- These LoRA matrices are implemented as `nn.Linear` layers, so the parameters are stored in the `.weight` attribute (`lora_A.weight`, `lora_B.weight`).
- By default, LoRA isn't applied to BERT's embedding layer, so there are _no entries_ for `lora_embedding_A` and `lora_embedding_B`.
- The keys of the `state_dict` always start with `"base_model.model."`. The reason is that, in PEFT, we wrap the base model inside a tuner-specific model (`LoraModel` in this case), which itself is wrapped in a general PEFT model (`PeftModel`). For this reason, these two prefixes are added to the keys. When converting to the PEFT format, it is required to add these prefixes.
<Tip>
This last point is not true for prompt learning techniques such as prompt tuning. There, the extra embeddings are directly stored in the `state_dict` without any prefixes added to the keys.
</Tip>
When inspecting the parameter names in the loaded model, you might be surprised to find that they look a bit different, e.g. `base_model.model.encoder.layer.0.attention.self.query.lora_A.default.weight`. The difference is the *`.default`* part in the second to last segment. This part exists because PEFT generally allows the addition of multiple adapters at once (using an `nn.ModuleDict` or `nn.ParameterDict` to store them). For example, if you add another adapter called "other", the key for that adapter would be `base_model.model.encoder.layer.0.attention.self.query.lora_A.other.weight`.
When you call [`~PeftModel.save_pretrained`], the adapter name is stripped from the keys. The reason is that the adapter name is not an important part of the model architecture; it is just an arbitrary name. When loading the adapter, you could choose a totally different name, and the model would still work the same way. This is why the adapter name is not stored in the checkpoint file.
<Tip>
If you call `save_pretrained("some/path")` and the adapter name is not `"default"`, the adapter is stored in a sub-directory with the same name as the adapter. So if the name is "other", it would be stored inside of `some/path/other`.
</Tip>
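Putting these naming rules together, here is a minimal sketch of converting a hypothetical external LoRA `state_dict` to the PEFT format (the source key names and shapes are invented for illustration, assuming a BERT-sized model with rank 8):
```python
import torch
from safetensors.torch import save_file

# hypothetical source checkpoint: keys without the PEFT prefixes
external_sd = {
    "encoder.layer.0.attention.self.query.lora_A": torch.randn(8, 768),
    "encoder.layer.0.attention.self.query.lora_B": torch.randn(768, 8),
}

peft_sd = {}
for key, value in external_sd.items():
    # PEFT expects the "base_model.model." prefix and a ".weight" suffix;
    # the adapter name (e.g. "default") is not stored in the file
    peft_sd[f"base_model.model.{key}.weight"] = value

save_file(peft_sd, "adapter_model.safetensors")
```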
In some circumstances, deciding which values to add to the checkpoint file can become a bit more complicated. For example, in PEFT, DoRA is implemented as a special case of LoRA. If you want to convert a DoRA model to PEFT, you should create a LoRA checkpoint with extra entries for DoRA. You can see this in the `__init__` of the previous `LoraLayer` code:
```python
self.lora_magnitude_vector: Optional[torch.nn.ParameterDict] = None # for DoRA
```
This indicates that there is an optional extra parameter per layer for DoRA.
### adapter_config
All the other information needed to load a PEFT model is contained in the `adapter_config.json` file. Let's check this file for a LoRA model applied to BERT:
```json
{
"alpha_pattern": {},
"auto_mapping": {
"base_model_class": "BertModel",
"parent_library": "transformers.models.bert.modeling_bert"
},
"base_model_name_or_path": "bert-base-uncased",
"bias": "none",
"fan_in_fan_out": false,
"inference_mode": true,
"init_lora_weights": true,
"layer_replication": null,
"layers_pattern": null,
"layers_to_transform": null,
"loftq_config": {},
"lora_alpha": 8,
"lora_dropout": 0.0,
"megatron_config": null,
"megatron_core": "megatron.core",
"modules_to_save": null,
"peft_type": "LORA",
"r": 8,
"rank_pattern": {},
"revision": null,
"target_modules": [
"query",
"value"
],
"task_type": null,
"use_dora": false,
"use_rslora": false
}
```
This contains a lot of entries, and at first glance, it could feel overwhelming to figure out all the right values to put in there. However, most of the entries are not necessary to load the model, either because they use the default values and don't need to be added, or because they only affect the initialization of the LoRA weights, which is irrelevant when it comes to loading the model. If you don't know what a specific parameter does, e.g. `"use_rslora"`, don't add it, and you should be fine. Also note that as more options are added, this file will get more entries in the future, but it should remain backward compatible.
At the minimum, you should include the following entries:
```json
{
"target_modules": ["query", "value"],
"peft_type": "LORA"
}
```
However, it is recommended to add as many entries as possible, like the rank `r` or the `base_model_name_or_path` (if it's a Transformers model). This information can help others understand the model better and share it more easily. To check which keys and values are expected, check out the [config.py](https://github.com/huggingface/peft/blob/main/src/peft/tuners/lora/config.py) file (as an example, this is the config file for LoRA) in the PEFT source code.
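Rather than writing the JSON by hand, you can also let PEFT generate a complete config file for you, a minimal sketch:
```python
from peft import LoraConfig

config = LoraConfig(r=8, lora_alpha=8, target_modules=["query", "value"])
# writes a complete adapter_config.json with all expected keys filled in
config.save_pretrained("my-converted-adapter")
```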
## Model storage
In some circumstances, you might want to store the whole PEFT model, including the base weights. This can be necessary if, for instance, the base model is not available to the users trying to load the PEFT model. To do so, you can either merge the weights first or convert the PEFT model into a Transformers model.
### Merge the weights
The most straightforward way to store the whole PEFT model is to merge the adapter weights into the base weights:
```python
merged_model = model.merge_and_unload()
merged_model.save_pretrained(...)
```
There are some disadvantages to this approach, though:
- Once [`~LoraModel.merge_and_unload`] is called, you get a basic model without any PEFT-specific functionality. This means you can't use any of the PEFT-specific methods anymore.
- You cannot unmerge the weights, load multiple adapters at once, disable the adapter, etc.
- Not all PEFT methods support merging weights.
- Some PEFT methods may generally allow merging, but not with specific settings (e.g. when using certain quantization techniques).
- The whole model will be much larger than the PEFT model, as it will contain all the base weights as well.
But inference with a merged model should be a bit faster.
### Convert to a Transformers model
Another way to save the whole model, assuming the base model is a Transformers model, is to use this hacky approach to directly insert the PEFT weights into the base model and save it. This only works if you "trick" Transformers into believing the PEFT model is not a PEFT model, and it only works with LoRA because other adapters are not implemented in Transformers.
```python
from transformers import AutoModel

model = ...  # the PEFT model
...
# after you finish training the model, save it in a temporary location
model.save_pretrained(<temp_location>)
# now load this model directly into a transformers model, without the PEFT wrapper
# the PEFT weights are directly injected into the base model
model_loaded = AutoModel.from_pretrained(<temp_location>)
# now make the loaded model believe that it is _not_ a PEFT model
model_loaded._hf_peft_config_loaded = False
# now when we save it, it will save the whole model
model_loaded.save_pretrained(<final_location>)
# or upload to Hugging Face Hub
model_loaded.push_to_hub(<final_location>)
```

View File

@ -238,3 +238,73 @@ peft_model.print_trainable_parameters()
```python
print(peft_model.targeted_module_names)
```
## Unsupported module types
Methods like LoRA only work if the target modules are supported by PEFT. For example, it's possible to apply LoRA to `nn.Linear` and `nn.Conv2d` layers, but not, for instance, to `nn.LSTM`. If a layer class you want to apply PEFT to is not supported, you can:
- define a custom mapping to dynamically dispatch custom modules in LoRA
- open an [issue](https://github.com/huggingface/peft/issues) and request the feature; if demand for this module type is sufficiently high, the maintainers will implement it or guide you on how to implement it yourself
### Experimental support for dynamic dispatch of custom modules in LoRA
> [!WARNING]
> This feature is experimental and subject to change, depending on its reception by the community. We will introduce a public and stable API if there is significant demand for it.
PEFT supports an experimental API for custom module types for LoRA. Let's assume you have a LoRA implementation for LSTMs. Normally, you would not be able to tell PEFT to use it, even if it would theoretically work with PEFT. However, this is possible with dynamic dispatch of custom layers.
The experimental API currently looks like this:
```python
class MyLoraLSTMLayer:
...
base_model = ... # load the base model that uses LSTMs
# add the LSTM layer names to target_modules
config = LoraConfig(..., target_modules=["lstm"])
# define a mapping from base layer type to LoRA layer type
custom_module_mapping = {nn.LSTM: MyLoraLSTMLayer}
# register the new mapping
config._register_custom_module(custom_module_mapping)
# after registration, create the PEFT model
peft_model = get_peft_model(base_model, config)
# do training
```
<Tip>
When you call [`get_peft_model`], you will see a warning because PEFT does not recognize the targeted module type. In this case, you can ignore this warning.
</Tip>
By supplying a custom mapping, PEFT first checks the base model's layers against the custom mapping and dispatches to the custom LoRA layer type if there is a match. If there is no match, PEFT checks the built-in LoRA layer types for a match.
Therefore, this feature can also be used to override existing dispatch logic, e.g. if you want to use your own LoRA layer for `nn.Linear` instead of using the one provided by PEFT.
When creating your custom LoRA module, please follow the same rules as the [existing LoRA modules](https://github.com/huggingface/peft/blob/main/src/peft/tuners/lora/layer.py). Some important constraints to consider:
- The custom module should inherit from `nn.Module` and `peft.tuners.lora.layer.LoraLayer`.
- The `__init__` method of the custom module should have the positional arguments `base_layer` and `adapter_name`. After this, there are additional `**kwargs` that you are free to use or ignore.
- The learnable parameters should be stored in an `nn.ModuleDict` or `nn.ParameterDict`, where the key corresponds to the name of the specific adapter (remember that a model can have more than one adapter at a time).
- The name of these learnable parameter attributes should start with `"lora_"`, e.g. `self.lora_new_param = ...`.
- Some methods are optional, e.g. you only need to implement `merge` and `unmerge` if you want to support weight merging.
Currently, the information about the custom module does not persist when you save the model. When loading the model, you have to register the custom modules again.
```python
# saving works as always and includes the parameters of the custom modules
peft_model.save_pretrained(<model-path>)
# loading the model later:
base_model = ...
# load the LoRA config that you saved earlier
config = LoraConfig.from_pretrained(<model-path>)
# register the custom module again, the same way as the first time
custom_module_mapping = {nn.LSTM: MyLoraLSTMLayer}
config._register_custom_module(custom_module_mapping)
# pass the config instance to from_pretrained:
peft_model = PeftModel.from_pretrained(base_model, <model-path>, config=config)
```
If you use this feature and find it useful, or if you encounter problems, let us know by creating an issue or a discussion on GitHub. This allows us to estimate the demand for this feature and add a public API if it is sufficiently high.

View File

@ -40,6 +40,29 @@ from peft import LoraConfig
config = LoraConfig(init_lora_weights=False, ...)
```
### PiSSA
[PiSSA](https://arxiv.org/abs/2404.02948) initializes the LoRA adapter using the principal singular values and singular vectors. This straightforward modification allows PiSSA to converge more rapidly than LoRA and ultimately attain superior performance. Moreover, PiSSA reduces the quantization error compared to QLoRA, leading to further enhancements.
Configure the initialization method to "pissa", which may take several minutes to execute SVD on the pre-trained model:
```python
from peft import LoraConfig
config = LoraConfig(init_lora_weights="pissa", ...)
```
Alternatively, execute fast SVD, which takes only a few seconds. The number of iterations determines the trade-off between the error and computation time:
```python
lora_config = LoraConfig(init_lora_weights="pissa_niter_[number of iters]", ...)
```
For detailed instructions on using PiSSA, please refer to [this example](https://github.com/fxmeng/peft/tree/main/examples/pissa_finetuning).
### OLoRA
[OLoRA](https://arxiv.org/abs/2406.01775) utilizes QR decomposition to initialize the LoRA adapters. OLoRA translates the base weights of the model by a factor of their QR decompositions, i.e., it mutates the weights before performing any training on them. This approach significantly improves stability, accelerates convergence speed, and ultimately achieves superior performance.
You just need to pass a single additional option to use OLoRA:
```python
from peft import LoraConfig
config = LoraConfig(init_lora_weights="olora", ...)
```
For more advanced usage, please refer to our [documentation](https://github.com/huggingface/peft/tree/main/examples/olora_finetuning).
### LoftQ
#### Standard approach
@ -99,6 +122,22 @@ from peft import LoraConfig
config = LoraConfig(use_dora=True, ...)
```
If parts of the model or the DoRA adapter are offloaded to CPU, you can get a significant speedup at the cost of some temporary (ephemeral) VRAM overhead by setting `ephemeral_gpu_offload=True` in `config.runtime_config`.
```py
from peft import LoraConfig, LoraRuntimeConfig
config = LoraConfig(use_dora=True, runtime_config=LoraRuntimeConfig(ephemeral_gpu_offload=True), ...)
```
A `PeftModel` with a DoRA adapter can also be loaded with the `ephemeral_gpu_offload=True` flag using the `from_pretrained` method as well as the `load_adapter` method.
```py
from peft import PeftModel
model = PeftModel.from_pretrained(base_model, peft_model_id, ephemeral_gpu_offload=True)
```
#### Caveats
- DoRA only supports linear and Conv2d layers at the moment.
@ -126,10 +165,18 @@ Assuming the original model had 5 layers `[0, 1, 2 ,3, 4]`, this would create a
[Fewshot-Metamath-OrcaVicuna-Mistral-10B](https://huggingface.co/abacusai/Fewshot-Metamath-OrcaVicuna-Mistral-10B) is an example of a model trained using this method on Mistral-7B expanded to 10B. The
[adapter_config.json](https://huggingface.co/abacusai/Fewshot-Metamath-OrcaVicuna-Mistral-10B/blob/main/adapter_config.json) shows a sample LoRA adapter config applying this method for fine-tuning.
## Merge LoRA weights into the base model
While LoRA is significantly smaller and faster to train, you may encounter latency issues during inference due to separately loading the base model and the LoRA adapter. To eliminate latency, use the [`~LoraModel.merge_and_unload`] function to merge the adapter weights with the base model. This allows you to use the newly merged model as a standalone model. The [`~LoraModel.merge_and_unload`] function doesn't keep the adapter weights in memory.
Below is a diagram that explains the intuition of LoRA adapter merging:
<div class="flex justify-center">
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/peft/lora_diagram.png"/>
</div>
We show in the snippets below how to run that using PEFT.
```py
from transformers import AutoModelForCausalLM
from peft import PeftModel

View File

@ -138,3 +138,20 @@ print(tokenizer.decode(outputs[0]))
</hfoption>
</hfoptions>
## Merging (IA)³ Models
The (IA)³ models facilitate linear merging of adapters. To merge adapters in an (IA)³ model, utilize the `add_weighted_adapter` method from the `IA3Model` class. This method is analogous to the `add_weighted_adapter` method used in `LoraModel`, with the key difference being the absence of the `combination_type` parameter. For example, to merge three (IA)³ adapters into a PEFT model, you would proceed as follows:
```py
adapters = ["adapter1", "adapter2", "adapter3"]
weights = [0.4, 0.3, 0.3]
adapter_name = "merge"
model.add_weighted_adapter(adapters, weights, adapter_name)
```
It is recommended that the weights sum to 1.0 to preserve the scale of the model. The merged model can then be set as the active model using the `set_adapter` method:
```py
model.set_adapter("merge")
```

View File

@ -128,6 +128,70 @@ quantized_model = get_peft_model(quantized_model, peft_config)
You can refer to the [Google Colab](https://colab.research.google.com/drive/12GTp1FCj5_0SnnNQH18h_2XFh9vS_guX?usp=sharing) example for an overview of AQLM+LoRA finetuning.
## EETQ quantization
You can also perform LoRA fine-tuning on EETQ quantized models. The [EETQ](https://github.com/NetEase-FuXi/EETQ) package offers a simple and efficient way to perform 8-bit quantization, which is claimed to be faster than the `LLM.int8()` algorithm. First, make sure that you have a Transformers version that is compatible with EETQ (e.g. by installing it from the latest PyPI release or from source).
```py
import torch
from transformers import EetqConfig
config = EetqConfig("int8")
```
Pass the `config` to the [`~transformers.AutoModelForCausalLM.from_pretrained`] method.
```py
from transformers import AutoModelForCausalLM
model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1", quantization_config=config)
```
and create a `LoraConfig` and pass it to `get_peft_model`:
```py
from peft import LoraConfig, get_peft_model
config = LoraConfig(
r=16,
lora_alpha=8,
target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
lora_dropout=0.05,
bias="none",
task_type="CAUSAL_LM"
)
model = get_peft_model(model, config)
```
## HQQ quantization
Models quantized using Half-Quadratic Quantization of Large Machine Learning Models ([HQQ](https://mobiusml.github.io/hqq_blog/)) support LoRA adapter tuning. To tune the quantized model, you'll need to install the `hqq` library with: `pip install hqq`.
```py
from hqq.engine.hf import HQQModelForCausalLM
from peft import LoraConfig, get_peft_model

quantized_model = HQQModelForCausalLM.from_quantized(save_dir_or_hfhub, device="cuda")
peft_config = LoraConfig(...)
quantized_model = get_peft_model(quantized_model, peft_config)
```
Alternatively, use a Transformers version that is compatible with HQQ (e.g. by installing it from the latest PyPI release or from source):
```python
from transformers import HqqConfig, AutoModelForCausalLM
from peft import LoraConfig, get_peft_model

quant_config = HqqConfig(nbits=4, group_size=64)
quantized_model = AutoModelForCausalLM.from_pretrained(save_dir_or_hfhub, device_map="cuda", quantization_config=quant_config)
peft_config = LoraConfig(...)
quantized_model = get_peft_model(quantized_model, peft_config)
```
## Next steps
If you're interested in learning more about quantization, the following may be helpful:

View File

@ -0,0 +1,76 @@
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.
-->
# torch.compile
In PEFT, [torch.compile](https://pytorch.org/tutorials/intermediate/torch_compile_tutorial.html) works for some but not all features. The reason it won't always work is that PEFT is highly dynamic in certain places (loading and switching between multiple adapters, for instance), which can cause trouble for `torch.compile`. In other places, `torch.compile` may work, but won't be as fast as expected because of graph breaks.
If you don't see an error, it doesn't necessarily mean that `torch.compile` worked correctly. It might give you an output, but the output could be incorrect. This guide describes what works with `torch.compile` and what doesn't.
> [!TIP]
> Unless indicated otherwise, the default `torch.compile` settings were used.
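For reference, compiling a PEFT model requires no PEFT-specific handling. Below is a minimal sketch (the model choice is arbitrary):
```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import LoraConfig, get_peft_model

model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m")
model = get_peft_model(model, LoraConfig(task_type="CAUSAL_LM"))
model = torch.compile(model)  # default settings

tokenizer = AutoTokenizer.from_pretrained("facebook/opt-125m")
inputs = tokenizer("Hello, world", return_tensors="pt")
outputs = model(**inputs)  # the first call triggers compilation
```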
## Training and inference with `torch.compile`
These features **work** with `torch.compile`. Everything listed below was tested with a causal LM:
- Training with `Trainer` from 🤗 transformers
- Training with a custom PyTorch loop
- Inference
- Generation
The following adapters were tested successfully:
- AdaLoRA
- BOFT
- IA³
- Layer Norm Tuning
- LoHa
- LoRA
- LoRA + DoRA
- OFT
- VeRA
- HRA
The following adapters **don't work** correctly for training or inference when using `torch.compile`:
- LoKr
- LoRA targeting embedding layers
## Advanced PEFT features with `torch.compile`
Below are some of the more advanced PEFT features that **work**. They were all tested with LoRA.
- `modules_to_save` (i.e. `config = LoraConfig(..., modules_to_save=...)`)
- Merging adapters (one or multiple)
- Merging multiple adapters into one adapter (i.e. calling `model.add_weighted_adapter(...)`)
Generally, we can expect that if a feature works correctly with LoRA and is also supported by other adapter types, it should also work for that adapter type.
The more advanced PEFT features below **don't work** in conjunction with `torch.compile`. Tests were run with LoRA:
- Using PEFT adapters with quantization (bitsandbytes)
- Inference with multiple adapters
- Unloading (i.e. calling `model.merge_and_unload()`)
- Disabling adapters (i.e. using `with model.disable_adapter()`)
- Mixed adapter batches (i.e. calling `model(batch, adapter_names=["__base__", "default", "other", ...])`)
## Test cases
All the use cases listed above are tested inside of [`peft/tests/test_torch_compile.py`](https://github.com/huggingface/peft/blob/main/tests/test_torch_compile.py). If you want to check in more detail how we tested a certain feature, please go to that file and check the test that corresponds to your use case.
> [!TIP]
> If you have another use case where you know that `torch.compile` does or does not work with PEFT, please contribute by letting us know or by opening a PR to add this use case to the covered test cases.

View File

@ -69,6 +69,12 @@ trainer = Trainer(model=peft_model, fp16=True, ...)
trainer.train()
```
<Tip>
Starting from PEFT version v0.12.0, PEFT automatically promotes the dtype of adapter weights from `torch.float16` and `torch.bfloat16` to `torch.float32` where appropriate. To _prevent_ this behavior, you can pass `autocast_adapter_dtype=False` to [`~get_peft_model`], to [`~PeftModel.from_pretrained`], and to [`~PeftModel.load_adapter`].
</Tip>
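For example, here is a minimal sketch of opting out of the dtype promotion (assuming an existing `base_model` and LoRA config):
```python
from peft import LoraConfig, get_peft_model

# keep the adapter weights in the same (half-precision) dtype as the base model
lora_config = LoraConfig(task_type="CAUSAL_LM")
peft_model = get_peft_model(base_model, lora_config, autocast_adapter_dtype=False)
```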
## Bad results from a loaded PEFT model
There can be several reasons for getting a poor result from a loaded PEFT model which are listed below. If you're still unable to troubleshoot the problem, see if anyone else had a similar [issue](https://github.com/huggingface/peft/issues) on GitHub, and if you can't find any, open a new issue.
@ -129,9 +135,139 @@ If the model's embedding layer doesn't follow the Transformer's naming scheme, y
```python
model = get_peft_model(...)
# train the model
model.save_pretrained("my_adapter", save_embedding_layers=True)
```
For inference, load the base model first and resize it the same way you did before you trained the model. After you've resized the base model, you can load the PEFT checkpoint.
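A minimal sketch of this loading procedure (the base model id is a placeholder, and it assumes the tokenizer was saved alongside the adapter):
```python
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

base_model = AutoModelForCausalLM.from_pretrained("<base-model-id>")  # placeholder
tokenizer = AutoTokenizer.from_pretrained("my_adapter")  # assumes the tokenizer was saved with the adapter
base_model.resize_token_embeddings(len(tokenizer))
peft_model = PeftModel.from_pretrained(base_model, "my_adapter")
```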
For a complete example, please check out [this notebook](https://github.com/huggingface/peft/blob/main/examples/causal_language_modeling/peft_lora_clm_with_additional_tokens.ipynb).
### Check layer and model status
Sometimes a PEFT model can end up in a bad state, especially when handling multiple adapters. There can be some confusion around what adapters exist, which one is active, which one is merged, etc. To help investigate this issue, call the [`~peft.PeftModel.get_layer_status`] and the [`~peft.PeftModel.get_model_status`] methods.
The [`~peft.PeftModel.get_layer_status`] method gives you a detailed overview of each targeted layer's active, merged, and available adapters.
```python
>>> from transformers import AutoModel
>>> from peft import get_peft_model, LoraConfig
>>> model_id = "google/flan-t5-small"
>>> model = AutoModel.from_pretrained(model_id)
>>> model = get_peft_model(model, LoraConfig())
>>> model.get_layer_status()
[TunerLayerStatus(name='model.encoder.block.0.layer.0.SelfAttention.q',
module_type='lora.Linear',
enabled=True,
active_adapters=['default'],
merged_adapters=[],
requires_grad={'default': True},
available_adapters=['default']),
TunerLayerStatus(name='model.encoder.block.0.layer.0.SelfAttention.v',
module_type='lora.Linear',
enabled=True,
active_adapters=['default'],
merged_adapters=[],
requires_grad={'default': True},
available_adapters=['default']),
...]
>>> model.get_model_status()
TunerModelStatus(
base_model_type='T5Model',
adapter_model_type='LoraModel',
peft_types={'default': 'LORA'},
trainable_params=344064,
total_params=60855680,
num_adapter_layers=48,
enabled=True,
active_adapters=['default'],
merged_adapters=[],
requires_grad={'default': True},
available_adapters=['default'],
)
```
In the model state output, you should look out for entries that say `"irregular"`. This means PEFT detected an inconsistent state in the model. For instance, if `merged_adapters="irregular"`, it means that for at least one adapter, it was merged on some target modules but not on others. The inference results will most likely be incorrect as a result.
The best way to resolve this issue is to reload the whole model and adapter checkpoint(s). Ensure that you don't perform any incorrect operations on the model, e.g. manually merging adapters on some modules but not others.
Convert the layer status into a pandas `DataFrame` for an easier visual inspection.
```python
from dataclasses import asdict
import pandas as pd
df = pd.DataFrame(asdict(layer) for layer in model.get_layer_status())
```
It is possible to get this information for non-PEFT models if they are using PEFT layers under the hood, but some information like the `base_model_type` or the `peft_types` cannot be determined in that case. As an example, you can call this on a [diffusers](https://huggingface.co/docs/diffusers/index) model like so:
```python
>>> import torch
>>> from diffusers import StableDiffusionPipeline
>>> from peft import get_model_status, get_layer_status
>>> path = "runwayml/stable-diffusion-v1-5"
>>> lora_id = "takuma104/lora-test-text-encoder-lora-target"
>>> pipe = StableDiffusionPipeline.from_pretrained(path, torch_dtype=torch.float16)
>>> pipe.load_lora_weights(lora_id, adapter_name="adapter-1")
>>> pipe.load_lora_weights(lora_id, adapter_name="adapter-2")
>>> pipe.set_lora_device(["adapter-2"], "cuda")
>>> get_layer_status(pipe.text_encoder)
[TunerLayerStatus(name='text_model.encoder.layers.0.self_attn.k_proj',
module_type='lora.Linear',
enabled=True,
active_adapters=['adapter-2'],
merged_adapters=[],
requires_grad={'adapter-1': False, 'adapter-2': True},
available_adapters=['adapter-1', 'adapter-2'],
devices={'adapter-1': ['cpu'], 'adapter-2': ['cuda']}),
TunerLayerStatus(name='text_model.encoder.layers.0.self_attn.v_proj',
module_type='lora.Linear',
enabled=True,
active_adapters=['adapter-2'],
merged_adapters=[],
requires_grad={'adapter-1': False, 'adapter-2': True},
available_adapters=['adapter-1', 'adapter-2'],
devices={'adapter-1': ['cpu'], 'adapter-2': ['cuda']}),
...]
>>> get_model_status(pipe.unet)
TunerModelStatus(
base_model_type='other',
adapter_model_type='None',
peft_types={},
trainable_params=797184,
total_params=861115332,
num_adapter_layers=128,
enabled=True,
active_adapters=['adapter-2'],
merged_adapters=[],
requires_grad={'adapter-1': False, 'adapter-2': True},
available_adapters=['adapter-1', 'adapter-2'],
devices={'adapter-1': ['cpu'], 'adapter-2': ['cuda']},
)
```
## Reproducibility
### Models using batch norm
When loading a trained PEFT model where the base model uses batch norm (e.g. `torch.nn.BatchNorm1d` or `torch.nn.BatchNorm2d`), you may find that you cannot reproduce the exact same outputs. This is because the batch norm layers keep track of running stats during training, but these stats are not part of the PEFT checkpoint. Therefore, when you load the PEFT model, the running stats of the base model will be used (i.e. from before training with PEFT).
Depending on your use case, this may not be a big deal. If, however, you need your outputs to be 100% reproducible, you can achieve this by adding the batch norm layers to `modules_to_save`. Below is an example of this using resnet and LoRA. Notice that we set `modules_to_save=["classifier", "normalization"]`. We need the `"classifier"` argument because our task is image classification, and we add the `"normalization"` argument to ensure that the batch norm layers are saved in the PEFT checkpoint.
```python
from transformers import AutoModelForImageClassification
from peft import LoraConfig, get_peft_model
model_id = "microsoft/resnet-18"
base_model = AutoModelForImageClassification.from_pretrained(model_id)
config = LoraConfig(
    target_modules=["convolution"],
    modules_to_save=["classifier", "normalization"],
)
model = get_peft_model(base_model, config)
```
Depending on the type of model you use, the batch norm layers could have different names than `"normalization"`, so please ensure that the name matches your model architecture.

View File

@ -0,0 +1,38 @@
<!--Copyright 2024 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.
-->
# FourierFT: Discrete Fourier Transformation Fine-Tuning
[FourierFT](https://huggingface.co/papers/2405.03003) is a parameter-efficient fine-tuning technique that leverages the Discrete Fourier Transform to compress the model's tunable weights. This method outperforms LoRA on the GLUE benchmark and common ViT classification tasks while using far fewer parameters.
FourierFT currently has the following constraints:
- Only `nn.Linear` layers are supported.
- Quantized layers are not supported.
If these constraints don't work for your use case, consider other methods instead.
The abstract from the paper is:
> Low-rank adaptation (LoRA) has recently gained much interest in fine-tuning foundation models. It effectively reduces the number of trainable parameters by incorporating low-rank matrices A and B to represent the weight change, i.e., Delta W=BA. Despite LoRA's progress, it faces storage challenges when handling extensive customization adaptations or larger base models. In this work, we aim to further compress trainable parameters by enjoying the powerful expressiveness of the Fourier transform. Specifically, we introduce FourierFT, which treats Delta W as a matrix in the spatial domain and learns only a small fraction of its spectral coefficients. With the trained spectral coefficients, we implement the inverse discrete Fourier transform to recover Delta W. Empirically, our FourierFT method shows comparable or better performance with fewer parameters than LoRA on various tasks, including natural language understanding, natural language generation, instruction tuning, and image classification. For example, when performing instruction tuning on the LLaMA2-7B model, FourierFT surpasses LoRA with only 0.064M trainable parameters, compared to LoRA's 33.5M.
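A minimal usage sketch (assuming an existing `base_model`; the target module names depend on the architecture):
```python
from peft import FourierFTConfig, get_peft_model

# n_frequency controls how many spectral coefficients are trained per layer
config = FourierFTConfig(target_modules=["query", "value"], n_frequency=1000)
peft_model = get_peft_model(base_model, config)
```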
## FourierFTConfig
[[autodoc]] tuners.fourierft.config.FourierFTConfig
## FourierFTModel
[[autodoc]] tuners.fourierft.model.FourierFTModel

View File

@ -0,0 +1,12 @@
<!--⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.
-->
# Helpers
A collection of helper functions for PEFT.
## Checking if a model is a PEFT model
[[autodoc]] helpers.check_if_peft_model
- all
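A minimal usage sketch (the repository id is an arbitrary example):
```python
from peft.helpers import check_if_peft_model

# returns True if the repo or local directory contains a valid PEFT adapter config
print(check_if_peft_model("ybelkada/opt-350m-lora"))
```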

View File

@ -0,0 +1,34 @@
<!--Copyright 2024 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.
-->
# LayerNorm Tuning
LayerNorm Tuning ([LN Tuning](https://huggingface.co/papers/2312.11420)) is a PEFT method that only fine-tunes the parameters of the LayerNorm layers in a model.
The paper tested this method on large language models and showed that it can achieve strong performance with a significant reduction in the number of trainable parameters and GPU memory usage.
However, the method is not limited to language models and can be applied to any model that uses LayerNorm layers.
In this implementation, all LayerNorm layers in a model are fine-tuned by default, but the method can also be used to target other layer types such as `MLP` or `Attention` layers by specifying `target_modules` in the `LNTuningConfig`, as shown in the sketch below.
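For example, a minimal sketch (the model choice is arbitrary):
```python
from transformers import AutoModelForCausalLM
from peft import LNTuningConfig, get_peft_model

base_model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m")  # arbitrary example model
config = LNTuningConfig(task_type="CAUSAL_LM")  # targets all LayerNorm layers by default
peft_model = get_peft_model(base_model, config)
peft_model.print_trainable_parameters()
```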
The abstract from the paper is:
*This paper introduces an efficient strategy to transform Large Language Models (LLMs) into Multi-Modal Large Language Models (MLLMs). By conceptualizing this transformation as a domain adaptation process, i.e., transitioning from text understanding to embracing multiple modalities, we intriguingly note that, within each attention block, tuning LayerNorm suffices to yield strong performance. Moreover, when benchmarked against other tuning approaches like full parameter finetuning or LoRA, its benefits on efficiency are substantial. For example, when compared to LoRA on a 13B model scale, performance can be enhanced by an average of over 20% across five multi-modal tasks, and meanwhile, results in a significant reduction of trainable parameters by 41.9% and a decrease in GPU memory usage by 17.6%. On top of this LayerNorm strategy, we showcase that selectively tuning only with conversational data can improve efficiency further. Beyond these empirical outcomes, we provide a comprehensive analysis to explore the role of LayerNorm in adapting LLMs to the multi-modal domain and improving the expressive power of the model.*
## LNTuningConfig
[[autodoc]] tuners.ln_tuning.config.LNTuningConfig
## LNTuningModel
[[autodoc]] tuners.ln_tuning.model.LNTuningModel

View File

@ -71,3 +71,7 @@ A `PeftModel` for mixing different adapter types (e.g. LoRA and LoHa).
[[autodoc]] utils.get_peft_model_state_dict
[[autodoc]] utils.prepare_model_for_kbit_training
[[autodoc]] get_layer_status
[[autodoc]] get_model_status

View File

@ -20,9 +20,10 @@ rendered properly in your Markdown viewer.
When saving the adapter parameters, it's possible to eschew storing the low rank matrices by setting `save_projection=False` on the `VeraConfig`. In that case, these matrices will be restored based on the fixed random seed from the `projection_prng_key` argument. This cuts down on the size of the checkpoint, but we cannot guarantee reproducibility on all devices and for all future versions of PyTorch. If you want to ensure reproducibility, set `save_projection=True` (which is the default).
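A minimal configuration sketch (assuming an existing `base_model`; the target module names depend on the architecture):
```python
from peft import VeraConfig, get_peft_model

# save_projection=True (the default) stores the shared A/B matrices in the checkpoint
config = VeraConfig(r=256, target_modules=["query", "value"], save_projection=True)
vera_model = get_peft_model(base_model, config)
```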
To handle different shapes of adapted layers, VeRA initializes shared A and B matrices with the largest required size for each dimension. During the forward pass, submatrices A and B for a given layer are sliced out from these shared matrices and used as described in the paper. For example, adapting two linear layers of shapes (100, 20) and (80, 50) will create A and B matrices of shapes (rank, 50) and (100, rank) respectively. Then, to adapt a layer of shape (100, 20), submatrices A and B of shapes (rank, 20) and (100, rank) will be extracted.
VeRA currently has the following constraints:
- Only `nn.Linear` layers are supported.
- Quantized layers are not supported.

View File

@ -307,7 +307,7 @@ Let's load the model from the Hub and test it out on a food image.
```py
from peft import PeftConfig, PeftModel
from transformers import AutoImageProcessor
from PIL import Image
import requests

View File

@ -90,7 +90,7 @@ def preprocess_function(examples, text_column="Tweet text", label_column="text_l
model_inputs["attention_mask"][i] = [0] * (max_length - len(sample_input_ids)) + model_inputs[
"attention_mask"
][i]
labels["input_ids"][i] = [-100] * (max_length - len(sample_input_ids)) + label_input_ids
labels["input_ids"][i] = [-100] * (max_length - len(label_input_ids)) + label_input_ids
model_inputs["input_ids"][i] = torch.tensor(model_inputs["input_ids"][i][:max_length])
model_inputs["attention_mask"][i] = torch.tensor(model_inputs["attention_mask"][i][:max_length])
labels["input_ids"][i] = torch.tensor(labels["input_ids"][i][:max_length])

View File

@ -99,7 +99,7 @@ You can create your own configuration for training by initializing a [`PromptEnc
from peft import PromptEncoderConfig, TaskType
p_tuning_config = PromptEncoderConfig(
encoder_reparameterization_type="MLP",
encoder_hidden_size=128,
num_attention_heads=16,
num_layers=24,

View File

@ -37,7 +37,7 @@ from utils.unet_2d_condition import UNet2DConditionNewModel
sys.path.append("../../src")
from peft import PeftModel # noqa: E402
# Will error if the minimal version of diffusers is not installed. Remove at your own risks.

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,106 @@
# DoRA: Weight-Decomposed Low-Rank Adaptation
![dora](https://i.ytimg.com/vi/m7KQdGSr0Dg/maxresdefault.jpg)
## Introduction
[DoRA](https://arxiv.org/abs/2402.09353) is a novel approach that leverages low-rank adaptation through weight decomposition analysis to investigate the inherent differences between full fine-tuning and LoRA. DoRA initially decomposes the pretrained weight into its magnitude and directional components and finetunes both of them. Because the directional component is large in terms of parameter count, it is further decomposed with LoRA for efficient finetuning. This enhances both the learning capacity and training stability of LoRA while avoiding any additional inference overhead.
## Quick start
```python
import torch
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer
# note: we use trl's SFTTrainer here, since transformers.Trainer does not
# accept the dataset_text_field/max_seq_length arguments
from trl import SFTTrainer
from peft import LoraConfig, get_peft_model

model = AutoModelForCausalLM.from_pretrained("huggyllama/llama-7b", device_map="cuda")
tokenizer = AutoTokenizer.from_pretrained("huggyllama/llama-7b")
dataset = load_dataset("timdettmers/openassistant-guanaco", split="train")
lora_config = LoraConfig(
    use_dora=True
)
peft_model = get_peft_model(model, lora_config)
trainer = SFTTrainer(
    model=peft_model,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=2048,
    tokenizer=tokenizer,
)
trainer.train()
peft_model.save_pretrained("dora-llama-7b")
```
There is no additional change needed to your standard LoRA procedure, except for specifying `use_dora=True` in your LoRA configuration.
Run the finetuning script with:
```bash
python examples/dora_finetuning/dora_finetuning.py --base_model meta-llama/Meta-Llama-3-8B --data_path timdettmers/openassistant-guanaco
```
By default, this 👆🏻 loads the model in a PEFT setup with a plain LoRA config. To quickly compare it with DoRA, all you need to do is add `--use_dora` to the command line. The same example then becomes 👇🏻:
```bash
python examples/dora_finetuning/dora_finetuning.py --base_model meta-llama/Meta-Llama-3-8B --data_path timdettmers/openassistant-guanaco --use_dora
```
DoRA also supports quantization. To use 4-bit quantization try:
```bash
python examples/dora_finetuning/dora_finetuning.py --base_model meta-llama/Meta-Llama-3-8B --quantize
```
Similarly, by default the LoRA layers are the attention and MLP layers of the Llama model. If you want to choose a different set of layers for LoRA to be applied to, you can define them using:
```bash
python examples/dora_finetuning/dora_finetuning.py --lora_target_modules "q_proj,k_proj,v_proj,o_proj"
```
### Full example of the script
```bash
python dora_finetuning.py \
--base_model "PATH_TO_MODEL" \
--data_path "PATH_TO_DATASET" \
--output_dir "PATH_TO_OUTPUT_DIR" \
--batch_size 1 \
--num_epochs 3 \
--learning_rate 3e-4 \
--cutoff_len 512 \
--val_set_size 500 \
--use_dora \
--quantize \
--eval_step 10 \
--save_step 100 \
--device "cuda:0" \
--lora_r 16 \
--lora_alpha 32 \
--lora_dropout 0.05 \
--lora_target_modules "q_proj,k_proj,v_proj,o_proj" \
--hub_model_id "YOUR_HF_REPO" \
--push_to_hub
```
## Use the model on 🤗
You can load and use the model like any other 🤗 Transformers model.
```python
from transformers import AutoModel
model = AutoModel.from_pretrained("ShirinYamani/huggyllama-llama-7b-finetuned")
```
## DoRA vs. LoRA
In general, DoRA finetuning on diffusion models is still experimental and is likely to require different hyperparameter values to perform best compared to LoRA.
Specifically, two differences have been observed that you should take into account in your training:
1. LoRA seems to converge faster than DoRA (so a set of hyperparameters that leads to overfitting when training a LoRA may work well for a DoRA).
2. DoRA quality is superior to LoRA, especially at lower ranks: the difference in quality between DoRA and LoRA at rank 8 appears to be more significant than at ranks 32 or 64, for example.
## Citation
```
@article{liu2024dora,
title={DoRA: Weight-Decomposed Low-Rank Adaptation},
author={Liu, Shih-Yang and Wang, Chien-Yi and Yin, Hongxu and Molchanov, Pavlo and Wang, Yu-Chiang Frank and Cheng, Kwang-Ting and Chen, Min-Hung},
journal={arXiv preprint arXiv:2402.09353},
year={2024}
}
```

View File

@ -0,0 +1,200 @@
import os
import torch
from datasets import load_dataset
from transformers import (
AutoModelForCausalLM,
AutoTokenizer,
BitsAndBytesConfig,
DataCollatorWithPadding,
Trainer,
TrainingArguments,
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
def train_model(
base_model: str,
data_path: str,
output_dir: str,
batch_size: int,
num_epochs: int,
learning_rate: float,
cutoff_len: int,
val_set_size: int,
use_dora: bool,
quantize: bool,
eval_step: int,
save_step: int,
device: str,
lora_r: int,
lora_alpha: int,
lora_dropout: float,
lora_target_modules: str,
hub_model_id: str,
push_to_hub: bool,
):
os.environ["TOKENIZERS_PARALLELISM"] = "false"
hf_token = os.getenv("HF_TOKEN")
# Setup device
device = torch.device(device)
print(f"Using device: {device}")
# load tokenizer
tokenizer = AutoTokenizer.from_pretrained(base_model, token=hf_token)
# QDoRA (quantized DoRA): optionally quantize the base model
if quantize:
model = AutoModelForCausalLM.from_pretrained(
base_model,
token=hf_token,
quantization_config=BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_compute_dtype=(
torch.bfloat16 if torch.cuda.is_available() and torch.cuda.is_bf16_supported() else torch.float16
),
bnb_4bit_use_double_quant=True,
bnb_4bit_quant_type="nf4",
),
)
# setup for quantized training
model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=True)
else:
model = AutoModelForCausalLM.from_pretrained(base_model, token=hf_token)
# LoRa config for the PEFT model
lora_config = LoraConfig(
use_dora=use_dora,  # set via --use_dora; omit the flag to train plain LoRA for comparison
r=lora_r, # Rank of matrix
lora_alpha=lora_alpha,
target_modules=(
lora_target_modules.split(",")
if lora_target_modules
else ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
),
lora_dropout=lora_dropout,
bias="none",
)
# get the peft model with LoRa config
model = get_peft_model(model, lora_config)
if not quantize:
    model.to(device)  # quantized models are already placed on the correct device by bitsandbytes
tokenizer.pad_token = tokenizer.eos_token
# Load the dataset
dataset = load_dataset(data_path)
def tokenize_function(examples):
inputs = tokenizer(examples["text"], padding="max_length", truncation=True, max_length=cutoff_len)
inputs["labels"] = inputs["input_ids"].copy() # setting labels for a language modeling task
return inputs
# Tokenize the dataset and prepare for training
tokenized_datasets = dataset.map(tokenize_function, batched=True, remove_columns=dataset["train"].column_names)
# Data collator to dynamically pad the batched examples
data_collator = DataCollatorWithPadding(tokenizer)
# Define training arguments
training_args = TrainingArguments(
output_dir=output_dir,
num_train_epochs=num_epochs,
per_device_train_batch_size=batch_size,
per_device_eval_batch_size=batch_size,
warmup_steps=100,
weight_decay=0.01,
logging_dir="./logs",
logging_steps=eval_step,
save_steps=save_step,
save_total_limit=2,
push_to_hub=push_to_hub,
hub_model_id=hub_model_id,
gradient_accumulation_steps=16,
fp16=True,
learning_rate=learning_rate,
hub_token=hf_token,
)
# Clear CUDA cache to free memory
torch.cuda.empty_cache()
# Initialize the Trainer
trainer = Trainer(
model=model,
args=training_args,
train_dataset=tokenized_datasets["train"],
eval_dataset=tokenized_datasets["test"],
data_collator=data_collator,
)
# Start model training
trainer.train()
# Save and push the trained model and tokenizer
if push_to_hub:
# Push the main model to the hub
trainer.push_to_hub(commit_message="Fine-tuned model")
# Save the model and tokenizer locally
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)
if __name__ == "__main__":
import argparse
parser = argparse.ArgumentParser(description="Fine-tune LLaMA with DoRA and PEFT")
parser.add_argument("--base_model", type=str, default="huggyllama/llama-7b", help="Base model path or name")
parser.add_argument(
"--data_path", type=str, default="timdettmers/openassistant-guanaco", help="Dataset path or name"
)
parser.add_argument(
"--output_dir", type=str, default="path/to/output", help="Output directory for the fine-tuned model"
)
parser.add_argument("--batch_size", type=int, default=1, help="Batch size")
parser.add_argument("--num_epochs", type=int, default=1, help="Number of training epochs")
parser.add_argument("--learning_rate", type=float, default=3e-4, help="Learning rate")
parser.add_argument("--cutoff_len", type=int, default=512, help="Cutoff length for tokenization")
parser.add_argument("--val_set_size", type=int, default=500, help="Validation set size")
parser.add_argument("--use_dora", action="store_true", help="Apply Dora")
parser.add_argument("--quantize", action="store_true", help="Use quantization")
parser.add_argument("--eval_step", type=int, default=10, help="Evaluation step interval")
parser.add_argument("--save_step", type=int, default=100, help="Save step interval")
parser.add_argument("--device", type=str, default="cuda:0", help="Device to use for training")
parser.add_argument("--lora_r", type=int, default=8, help="LoRA rank")
parser.add_argument("--lora_alpha", type=int, default=16, help="LoRA alpha")
parser.add_argument("--lora_dropout", type=float, default=0.05, help="LoRA dropout rate")
parser.add_argument(
"--lora_target_modules", type=str, default=None, help="Comma-separated list of target modules for LoRA"
)
parser.add_argument(
"--hub_model_id",
type=str,
default="path/to/repo",
help="Repository name to push the model on the Hugging Face Hub",
)
parser.add_argument("--push_to_hub", action="store_true", help="Whether to push the model to Hugging Face Hub")
args = parser.parse_args()
train_model(
base_model=args.base_model,
data_path=args.data_path,
output_dir=args.output_dir,
batch_size=args.batch_size,
num_epochs=args.num_epochs,
learning_rate=args.learning_rate,
cutoff_len=args.cutoff_len,
val_set_size=args.val_set_size,
use_dora=args.use_dora,
quantize=args.quantize,
eval_step=args.eval_step,
save_step=args.save_step,
device=args.device,
lora_r=args.lora_r,
lora_alpha=args.lora_alpha,
lora_dropout=args.lora_dropout,
lora_target_modules=args.lora_target_modules,
hub_model_id=args.hub_model_id,
push_to_hub=args.push_to_hub,
)

View File

@ -0,0 +1,103 @@
# Copyright 2024-present the HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Example script demonstrating the time difference when loading a model with a DoRA adapter using ephemeral GPU offloading vs. doing it purely on the CPU.
Example outputs:
$ python load_with_dora.py
--- Loading model ---
Loading checkpoint shards: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:04<00:00, 1.03s/it]
--- Loading PeftModel ---
--- Done ---
Model loading time: 4.83s
PeftModel loading time: 28.14s
Use ephemeral GPU offloading: False
(Note: if this was the first time you ran the script, or if your cache was cleared, the times shown above are invalid, due to the time taken to download the model and DoRA files. Just re-run the script in this case.)
$ python load_with_dora.py --ephemeral_gpu_offload
--- Loading model ---
Loading checkpoint shards: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:03<00:00, 1.11it/s]
--- Loading PeftModel ---
--- Done ---
Model loading time: 4.28s
PeftModel loading time: 16.59s
Use ephemeral GPU offloading: True
(Note: if this was the first time you ran the script, or if your cache was cleared, the times shown above are invalid, due to the time taken to download the model and DoRA files. Just re-run the script in this case.)
"""
import argparse
import time
from huggingface_hub import snapshot_download
from transformers import AutoModelForCausalLM
from peft import PeftModel
def main():
parser = argparse.ArgumentParser(description="Load a model with DoRA using ephemeral GPU offloading")
parser.add_argument("--model", type=str, default="NousResearch/Hermes-2-Pro-Mistral-7B", help="Model to load")
parser.add_argument(
"--dora",
type=str,
default="peft-internal-testing/DoRA-Hermes-2-Pro-Mistral-7B",
help="DoRA to use",
)
parser.add_argument("--ephemeral_gpu_offload", action="store_true", help="Use ephemeral GPU offloading")
parser.add_argument(
"--merge_model_path", type="str", help="Merge the model with the DoRA model and save to the given path"
)
args = parser.parse_args()
peft_model_kwargs = {
"ephemeral_gpu_offload": args.ephemeral_gpu_offload,
"max_memory": {"cpu": "256GiB"},
"device_map": {"": "cpu"},
}
# Predownload
try:
snapshot_download(repo_id=args.model)
except Exception as e:
print(f"Failed to download model: {e}")
# We continue anyway as this might be e.g. a local directory or something
try:
snapshot_download(repo_id=args.dora)
except Exception as e:
print(f"Failed to download DoRA: {e}")
# We continue anyway as this might be e.g. a local directory or something
start = time.perf_counter()
print("--- Loading model ---")
model = AutoModelForCausalLM.from_pretrained(args.model)
model_time = time.perf_counter() - start
print("--- Loading PeftModel ---")
peft_model = PeftModel.from_pretrained(model, args.dora, **peft_model_kwargs)
print("--- Done ---")
peft_model_time = time.perf_counter() - start
print(f"Model loading time: {model_time:.2f}s")
print(f"PeftModel loading time: {peft_model_time:.2f}s")
print(f"Use ephemeral GPU offloading: {args.ephemeral_gpu_offload}")
if args.merge_model_path is not None:
merged_model = peft_model.merge_and_unload(progressbar=True)
merged_model.save_pretrained(args.merge_model_path)
if __name__ == "__main__":
main()


@ -194,6 +194,8 @@ class AutoModelForSentenceEmbedding(nn.Module):
try:
return super().__getattr__(name) # defer to nn.Module's logic
except AttributeError:
if name == "model": # see #1892: prevent infinite recursion if class is not initialized
raise
return getattr(self.model, name)


@ -0,0 +1,98 @@
<!--Copyright 2024 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.
-->
# DreamBooth fine-tuning with HRA
This guide demonstrates how to use the Householder reflection adaptation (HRA) method to fine-tune DreamBooth with the `stabilityai/stable-diffusion-2-1` model.
HRA provides a new perspective connecting LoRA to OFT and achieves encouraging performance in various downstream tasks.
HRA adapts a pre-trained model by multiplying each frozen weight matrix with a chain of r learnable Householder reflections (HRs).
HRA can be interpreted as either an OFT adapter or an adaptive LoRA.
Consequently, it harnesses the advantages of both strategies, reducing parameters and computation costs while limiting the loss of pre-training knowledge.
For further details on HRA, please consult the [original HRA paper](https://arxiv.org/abs/2405.17484).
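To make the core idea concrete, here is a minimal sketch (not PEFT's actual implementation) of applying a chain of r Householder reflections to a frozen weight matrix; each reflection is parameterized by a single learnable vector:
```python
import torch

def householder_chain(weight: torch.Tensor, us: torch.Tensor) -> torch.Tensor:
    # weight: frozen (out, in) matrix; us: (r, in) learnable Householder vectors
    adapted = weight
    for u in us:
        u = u / (u.norm() + 1e-8)  # unit vector defining the reflection plane
        # H = I - 2 u u^T; apply weight @ H without materializing H
        adapted = adapted - 2.0 * (adapted @ u).unsqueeze(1) * u.unsqueeze(0)
    return adapted

W = torch.randn(16, 8)
us = torch.randn(4, 8)  # r = 4 reflections
W_adapted = householder_chain(W, us)
```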
In this guide we provide a Dreambooth fine-tuning script that is available in [PEFT's GitHub repo examples](https://github.com/huggingface/peft/tree/main/examples/hra_dreambooth). This implementation is adapted from [peft's boft_dreambooth](https://github.com/huggingface/peft/tree/main/examples/boft_dreambooth).
You can try it out and fine-tune on your custom images.
## Set up your environment
Start by cloning the PEFT repository:
```bash
git clone --recursive https://github.com/huggingface/peft
```
Navigate to the directory containing the training scripts for fine-tuning Dreambooth with HRA:
```bash
cd peft/examples/hra_dreambooth
```
Set up your environment by installing PEFT and all the required libraries. At the time of writing this guide, we recommend installing PEFT from source. The following environment setup should work on A100 and H100 GPUs:
```bash
conda create --name peft python=3.10
conda activate peft
conda install pytorch==2.1.2 torchvision==0.16.2 torchaudio==2.1.2 pytorch-cuda=11.8 -c pytorch -c nvidia
conda install xformers -c xformers
pip install -r requirements.txt
pip install git+https://github.com/huggingface/peft
```
## Download the data
The [dreambooth](https://github.com/google/dreambooth) dataset is automatically cloned into the following structure when you run the training script:
```
hra_dreambooth
├── data
│ └── dreambooth
│ └── dataset
│ ├── backpack
│ └── backpack_dog
│ ...
```
You can also put your custom images into `hra_dreambooth/data/dreambooth/dataset`.
## Fine-tune DreamBooth with HRA
```bash
class_idx=0
bash ./train_dreambooth.sh $class_idx
```
where `$class_idx` selects one of the subjects, ranging from 0 to 29.
Launch the training script with `accelerate`, passing hyperparameters as well as HRA-specific arguments such as the following (a sketch of how these map to PEFT's `HRAConfig` follows this list):
- `use_hra`: Enables HRA in the training script.
- `hra_r`: the number of HRs (i.e., r) across different layers, expressed as an `int`.
As r increases, the number of trainable parameters increases, which generally leads to improved performance,
but also to higher memory consumption and longer computation times.
Therefore, r is usually set to 8.
**Note**: set r to an even number to avoid potential issues during initialization.
- `hra_apply_GS`: Applies Gram-Schmidt orthogonalization. Defaults to `false`.
- `hra_bias`: specifies whether the `bias` parameters should be trained. Can be `none`, `all` or `hra_only`.
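Here is a minimal sketch of how these flags map onto PEFT's `HRAConfig` (the `target_modules` list below is an illustrative subset, not the script's exact list):
```python
from peft import HRAConfig, get_peft_model

config = HRAConfig(
    r=8,  # --hra_r: number of Householder reflections per layer
    apply_GS=False,  # --hra_apply_GS: Gram-Schmidt orthogonalization
    target_modules=["to_q", "to_k", "to_v"],  # illustrative subset of the UNet attention modules
    bias="hra_only",  # --hra_bias
)
unet = get_peft_model(unet, config)  # `unet` is a loaded UNet2DConditionModel
```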
If you are running this script on Windows, you may need to set `--num_dataloader_workers` to 0.
To learn more about DreamBooth fine-tuning with prior-preserving loss, check out the [Diffusers documentation](https://huggingface.co/docs/diffusers/training/dreambooth#finetuning-with-priorpreserving-loss).
## Generate images with the fine-tuned model
To generate images with the fine-tuned model, run the Jupyter notebook `dreambooth_inference.ipynb` under `./examples/hra_dreambooth` (for example with `jupyter notebook`).
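If you prefer a script over the notebook, the following is a minimal sketch of the inference steps (the adapter path and checkpoint step are hypothetical; when the adapter was saved under a run name, the path also includes that subfolder):
```python
from diffusers import DiffusionPipeline
from peft import PeftModel

# Hypothetical paths: adjust output_dir, checkpoint step, and adapter subfolder to your run.
pipe = DiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2-1", safety_checker=None)
pipe.unet = PeftModel.from_pretrained(pipe.unet, "./data/output/hra/unet/510")
pipe = pipe.to("cuda")
image = pipe("a photo of qwe backpack on the beach", num_inference_steps=50).images[0]
image.save("qwe_backpack_beach.png")
```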




@ -0,0 +1,13 @@
transformers==4.36.2
accelerate==0.25.0
evaluate
tqdm
datasets==2.16.1
diffusers==0.17.1
Pillow
huggingface_hub
safetensors
nb_conda_kernels
ipykernel
ipywidgets
wandb==0.16.1


@ -0,0 +1,609 @@
#!/usr/bin/env python
# Copyright 2024-present the HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# The implementation is based on "Bridging The Gap between Low-rank and Orthogonal
# Adaptation via Householder Reflection Adaptation" (https://arxiv.org/abs/2405.17484).
import hashlib
import itertools
import logging
import math
import os
from contextlib import nullcontext
from pathlib import Path
import datasets
import diffusers
import numpy as np
import torch
import torch.nn.functional as F
import torch.utils.checkpoint
import transformers
from accelerate import Accelerator
from accelerate.logging import get_logger
from accelerate.utils import ProjectConfiguration, set_seed
from diffusers import (
AutoencoderKL,
DDIMScheduler,
DiffusionPipeline,
DPMSolverMultistepScheduler,
UNet2DConditionModel,
)
from diffusers.optimization import get_scheduler
from diffusers.utils import check_min_version
from diffusers.utils.import_utils import is_xformers_available
from huggingface_hub import Repository
from tqdm.auto import tqdm
from transformers import AutoTokenizer
from utils.args_loader import (
get_full_repo_name,
import_model_class_from_model_name_or_path,
parse_args,
)
from utils.dataset import DreamBoothDataset, PromptDataset, collate_fn
from utils.tracemalloc import TorchTracemalloc, b2mb
from peft import HRAConfig, get_peft_model
# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
check_min_version("0.16.0.dev0")
logger = get_logger(__name__)
UNET_TARGET_MODULES = ["to_q", "to_v", "to_k", "query", "value", "key", "to_out.0", "add_k_proj", "add_v_proj"]
TEXT_ENCODER_TARGET_MODULES = ["q_proj", "v_proj"]
def save_adaptor(accelerator, step, unet, text_encoder, args):
unwrapped_unet = accelerator.unwrap_model(unet)
unwrapped_unet.save_pretrained(
os.path.join(args.output_dir, f"unet/{step}"), state_dict=accelerator.get_state_dict(unet)
)
if args.train_text_encoder:
unwrapped_text_encoder = accelerator.unwrap_model(text_encoder)
unwrapped_text_encoder.save_pretrained(
os.path.join(args.output_dir, f"text_encoder/{step}"),
state_dict=accelerator.get_state_dict(text_encoder),
)
def main(args):
validation_prompts = list(filter(None, args.validation_prompt[0].split(".")))
logging_dir = Path(args.output_dir, args.logging_dir)
accelerator_project_config = ProjectConfiguration(project_dir=args.output_dir, logging_dir=logging_dir)
accelerator = Accelerator(
gradient_accumulation_steps=args.gradient_accumulation_steps,
mixed_precision=args.mixed_precision,
log_with=args.report_to if args.report_to != "none" else None,
project_dir=accelerator_project_config,
)
if args.report_to == "wandb":
import wandb
args.wandb_project_name = args.project_name
args.wandb_run_name = args.run_name
wandb_init = {
"wandb": {
"name": args.wandb_run_name,
"mode": "online",
}
}
# Currently, it's not possible to do gradient accumulation when training two models with accelerate.accumulate
# This will be enabled soon in accelerate. For now, we don't allow gradient accumulation when training two models.
# TODO (patil-suraj): Remove this check when gradient accumulation with two models is enabled in accelerate.
if args.train_text_encoder and args.gradient_accumulation_steps > 1 and accelerator.num_processes > 1:
raise ValueError(
"Gradient accumulation is not supported when training the text encoder in distributed training. "
"Please set gradient_accumulation_steps to 1. This feature will be supported in the future."
)
# Make one log on every process with the configuration for debugging.
logging.basicConfig(
format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
datefmt="%m/%d/%Y %H:%M:%S",
level=logging.INFO,
)
logger.info(accelerator.state, main_process_only=False)
if accelerator.is_local_main_process:
datasets.utils.logging.set_verbosity_warning()
transformers.utils.logging.set_verbosity_warning()
diffusers.utils.logging.set_verbosity_info()
else:
datasets.utils.logging.set_verbosity_error()
transformers.utils.logging.set_verbosity_error()
diffusers.utils.logging.set_verbosity_error()
# Set the training seed now, derived deterministically from the run name.
global_seed = hash(args.run_name) % (2**32)
set_seed(global_seed)
# Generate class images if prior preservation is enabled.
if args.with_prior_preservation:
class_images_dir = Path(args.class_data_dir)
if not class_images_dir.exists():
class_images_dir.mkdir(parents=True)
cur_class_images = len(list(class_images_dir.iterdir()))
if cur_class_images < args.num_class_images:
torch_dtype = torch.float16 if accelerator.device.type == "cuda" else torch.float32
if args.prior_generation_precision == "fp32":
torch_dtype = torch.float32
elif args.prior_generation_precision == "fp16":
torch_dtype = torch.float16
elif args.prior_generation_precision == "bf16":
torch_dtype = torch.bfloat16
pipeline = DiffusionPipeline.from_pretrained(
args.pretrained_model_name_or_path,
torch_dtype=torch_dtype,
safety_checker=None,
revision=args.revision,
)
pipeline.set_progress_bar_config(disable=True)
num_new_images = args.num_class_images - cur_class_images
logger.info(f"Number of class images to sample: {num_new_images}.")
sample_dataset = PromptDataset(args.class_prompt, num_new_images)
sample_dataloader = torch.utils.data.DataLoader(sample_dataset, batch_size=args.sample_batch_size)
sample_dataloader = accelerator.prepare(sample_dataloader)
pipeline.to(accelerator.device)
for example in tqdm(
sample_dataloader, desc="Generating class images", disable=not accelerator.is_local_main_process
):
images = pipeline(example["prompt"]).images
for i, image in enumerate(images):
hash_image = hashlib.sha1(image.tobytes()).hexdigest()
image_filename = class_images_dir / f"{example['index'][i] + cur_class_images}-{hash_image}.jpg"
image.save(image_filename)
del pipeline
if torch.cuda.is_available():
torch.cuda.empty_cache()
# Handle the repository creation
if accelerator.is_main_process:
if args.push_to_hub:
if args.hub_model_id is None:
repo_name = get_full_repo_name(Path(args.output_dir).name, token=args.hub_token)
else:
repo_name = args.hub_model_id
repo = Repository(args.output_dir, clone_from=repo_name) # noqa: F841
with open(os.path.join(args.output_dir, ".gitignore"), "w+") as gitignore:
if "step_*" not in gitignore:
gitignore.write("step_*\n")
if "epoch_*" not in gitignore:
gitignore.write("epoch_*\n")
elif args.output_dir is not None:
os.makedirs(args.output_dir, exist_ok=True)
# Load the tokenizer
if args.tokenizer_name:
tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name, revision=args.revision, use_fast=False)
elif args.pretrained_model_name_or_path:
tokenizer = AutoTokenizer.from_pretrained(
args.pretrained_model_name_or_path,
subfolder="tokenizer",
revision=args.revision,
use_fast=False,
)
# import correct text encoder class
text_encoder_cls = import_model_class_from_model_name_or_path(args.pretrained_model_name_or_path, args.revision)
# Load scheduler and models
noise_scheduler = DDIMScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler")
text_encoder = text_encoder_cls.from_pretrained(
args.pretrained_model_name_or_path, subfolder="text_encoder", revision=args.revision
)
vae = AutoencoderKL.from_pretrained(args.pretrained_model_name_or_path, subfolder="vae", revision=args.revision)
unet = UNet2DConditionModel.from_pretrained(
args.pretrained_model_name_or_path, subfolder="unet", revision=args.revision
)
if args.use_hra:
config = HRAConfig(
r=args.hra_r,
apply_GS=args.hra_apply_GS,
target_modules=UNET_TARGET_MODULES,
bias=args.hra_bias,
)
unet = get_peft_model(unet, config, adapter_name=args.run_name)
unet.print_trainable_parameters()
vae.requires_grad_(False)
unet.train()
if args.train_text_encoder and args.use_hra:
config = HRAConfig(
r=args.hra_r,
apply_GS=args.hra_apply_GS,
target_modules=TEXT_ENCODER_TARGET_MODULES,
bias=args.hra_bias,
)
text_encoder = get_peft_model(text_encoder, config, adapter_name=args.run_name)
text_encoder.print_trainable_parameters()
text_encoder.train()
else:
text_encoder.requires_grad_(False)
# For mixed precision training we cast the text_encoder and vae weights to half-precision
# as these models are only used for inference, keeping weights in full precision is not required.
weight_dtype = torch.float32
if accelerator.mixed_precision == "fp16":
weight_dtype = torch.float16
elif accelerator.mixed_precision == "bf16":
weight_dtype = torch.bfloat16
# Move unet, vae and text_encoder to device and cast to weight_dtype
unet.to(accelerator.device, dtype=weight_dtype)
vae.to(accelerator.device, dtype=weight_dtype)
text_encoder.to(accelerator.device, dtype=weight_dtype)
if args.enable_xformers_memory_efficient_attention:
if is_xformers_available():
unet.enable_xformers_memory_efficient_attention()
else:
raise ValueError("xformers is not available. Make sure it is installed correctly")
if args.gradient_checkpointing:
unet.enable_gradient_checkpointing()
# gradient checkpointing for the text encoder fails when using HRA, so skip it in that case
if args.train_text_encoder and not args.use_hra:
text_encoder.gradient_checkpointing_enable()
# Enable TF32 for faster training on Ampere GPUs,
# cf https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices
if args.allow_tf32:
torch.backends.cuda.matmul.allow_tf32 = True
if args.scale_lr:
args.learning_rate = (
args.learning_rate * args.gradient_accumulation_steps * args.train_batch_size * accelerator.num_processes
)
# Use 8-bit Adam for lower memory usage or to fine-tune the model in 16GB GPUs
if args.use_8bit_adam:
try:
import bitsandbytes as bnb
except ImportError:
raise ImportError(
"To use 8-bit Adam, please install the bitsandbytes library: `pip install bitsandbytes`."
)
optimizer_class = bnb.optim.AdamW8bit
else:
optimizer_class = torch.optim.AdamW
# Optimizer creation
params_to_optimize = [param for param in unet.parameters() if param.requires_grad]
if args.train_text_encoder:
params_to_optimize += [param for param in text_encoder.parameters() if param.requires_grad]
optimizer = optimizer_class(
params_to_optimize,
lr=args.learning_rate,
betas=(args.adam_beta1, args.adam_beta2),
weight_decay=args.adam_weight_decay,
eps=args.adam_epsilon,
)
# Download the official dreambooth dataset from the official repository: https://github.com/google/dreambooth.git
data_path = os.path.join(os.getcwd(), "data", "dreambooth")
if not os.path.exists(data_path):
os.makedirs(os.path.join(os.getcwd(), "data"), exist_ok=True)
os.system(f"git clone https://github.com/google/dreambooth.git '{data_path}'")
# Dataset and DataLoaders creation:
train_dataset = DreamBoothDataset(
instance_data_root=args.instance_data_dir,
instance_prompt=args.instance_prompt,
class_data_root=args.class_data_dir if args.with_prior_preservation else None,
class_prompt=args.class_prompt,
tokenizer=tokenizer,
size=args.resolution,
center_crop=args.center_crop,
)
train_dataloader = torch.utils.data.DataLoader(
train_dataset,
batch_size=args.train_batch_size,
shuffle=True,
collate_fn=lambda examples: collate_fn(examples, args.with_prior_preservation),
num_workers=args.num_dataloader_workers,
)
# Scheduler and math around the number of training steps.
overrode_max_train_steps = False
num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
if args.max_train_steps is None:
args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
overrode_max_train_steps = True
lr_scheduler = get_scheduler(
args.lr_scheduler,
optimizer=optimizer,
num_warmup_steps=args.lr_warmup_steps * args.gradient_accumulation_steps,
num_training_steps=args.max_train_steps * args.gradient_accumulation_steps,
num_cycles=args.lr_num_cycles,
power=args.lr_power,
)
# Prepare everything with our `accelerator`.
if args.train_text_encoder:
unet, text_encoder, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
unet, text_encoder, optimizer, train_dataloader, lr_scheduler
)
else:
unet, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
unet, optimizer, train_dataloader, lr_scheduler
)
# For mixed precision training we cast the text_encoder and vae weights to half-precision
# as these models are only used for inference, keeping weights in full precision is not required.
weight_dtype = torch.float32
if accelerator.mixed_precision == "fp16":
weight_dtype = torch.float16
elif accelerator.mixed_precision == "bf16":
weight_dtype = torch.bfloat16
# Move vae and text_encoder to device and cast to weight_dtype
vae.to(accelerator.device, dtype=weight_dtype)
if not args.train_text_encoder:
text_encoder.to(accelerator.device, dtype=weight_dtype)
# We need to recalculate our total training steps as the size of the training dataloader may have changed.
num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
if overrode_max_train_steps:
args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
# Afterwards we recalculate our number of training epochs
args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
# We need to initialize the trackers we use, and also store our configuration.
# The trackers initializes automatically on the main process.
if accelerator.is_main_process:
if args.report_to == "wandb":
accelerator.init_trackers(args.wandb_project_name, config=vars(args), init_kwargs=wandb_init)
else:
accelerator.init_trackers(args.project_name, config=vars(args))
# Train!
total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps
logger.info("***** Running training *****")
logger.info(f" Num examples = {len(train_dataset)}")
logger.info(f" Num batches each epoch = {len(train_dataloader)}")
logger.info(f" Num Epochs = {args.num_train_epochs}")
logger.info(f" Instantaneous batch size per device = {args.train_batch_size}")
logger.info(f" Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
logger.info(f" Gradient Accumulation steps = {args.gradient_accumulation_steps}")
logger.info(f" Total optimization steps = {args.max_train_steps}")
global_step = 0
first_epoch = 0
# Potentially load in the weights and states from a previous save
if args.resume_from_checkpoint:
if args.resume_from_checkpoint != "latest":
path = os.path.basename(args.resume_from_checkpoint)
else:
# Get the most recent checkpoint
dirs = os.listdir(args.output_dir)
dirs = [d for d in dirs if d.startswith("checkpoint")]
dirs = sorted(dirs, key=lambda x: int(x.split("-")[1]))
path = dirs[-1] if len(dirs) > 0 else None
accelerator.print(f"Resuming from checkpoint {path}")
accelerator.load_state(os.path.join(args.output_dir, path))
global_step = int(path.split("-")[1])
resume_global_step = global_step * args.gradient_accumulation_steps
first_epoch = resume_global_step // num_update_steps_per_epoch
resume_step = resume_global_step % num_update_steps_per_epoch
# Only show the progress bar once on each machine.
progress_bar = tqdm(range(global_step, args.max_train_steps), disable=not accelerator.is_local_main_process)
progress_bar.set_description("Steps")
if args.train_text_encoder:
text_encoder.train()
for epoch in range(first_epoch, args.num_train_epochs):
unet.train()
with TorchTracemalloc() if not args.no_tracemalloc else nullcontext() as tracemalloc:
for step, batch in enumerate(train_dataloader):
# Skip steps until we reach the resumed step
if args.resume_from_checkpoint and epoch == first_epoch and step < resume_step:
if step % args.gradient_accumulation_steps == 0:
progress_bar.update(1)
if args.report_to == "wandb":
accelerator.print(progress_bar)
continue
with accelerator.accumulate(unet):
# Convert images to latent space
latents = vae.encode(batch["pixel_values"].to(dtype=weight_dtype)).latent_dist.sample()
latents = latents * vae.config.scaling_factor
# Sample noise that we'll add to the latents
noise = torch.randn_like(latents)
bsz = latents.shape[0]
# Sample a random timestep for each image
timesteps = torch.randint(
0, noise_scheduler.config.num_train_timesteps, (bsz,), device=latents.device
)
timesteps = timesteps.long()
# Add noise to the latents according to the noise magnitude at each timestep
# (this is the forward diffusion process)
noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps)
# Get the text embedding for conditioning
encoder_hidden_states = text_encoder(batch["input_ids"])[0]
# Predict the noise residual
model_pred = unet(noisy_latents, timesteps, encoder_hidden_states).sample
# Get the target for loss depending on the prediction type
if noise_scheduler.config.prediction_type == "epsilon":
target = noise
elif noise_scheduler.config.prediction_type == "v_prediction":
target = noise_scheduler.get_velocity(latents, noise, timesteps)
else:
raise ValueError(f"Unknown prediction type {noise_scheduler.config.prediction_type}")
if args.with_prior_preservation:
# Chunk the noise and model_pred into two parts and compute the loss on each part separately.
model_pred, model_pred_prior = torch.chunk(model_pred, 2, dim=0)
target, target_prior = torch.chunk(target, 2, dim=0)
# Compute instance loss
loss = F.mse_loss(model_pred.float(), target.float(), reduction="mean")
# Compute prior loss
prior_loss = F.mse_loss(model_pred_prior.float(), target_prior.float(), reduction="mean")
# Add the prior loss to the instance loss.
loss = loss + args.prior_loss_weight * prior_loss
else:
loss = F.mse_loss(model_pred.float(), target.float(), reduction="mean")
accelerator.backward(loss)
if accelerator.sync_gradients:
params_to_clip = (
itertools.chain(unet.parameters(), text_encoder.parameters())
if args.train_text_encoder
else unet.parameters()
)
accelerator.clip_grad_norm_(params_to_clip, args.max_grad_norm)
optimizer.step()
lr_scheduler.step()
optimizer.zero_grad()
# Checks if the accelerator has performed an optimization step behind the scenes
if accelerator.sync_gradients:
progress_bar.update(1)
if args.report_to == "wandb":
accelerator.print(progress_bar)
global_step += 1
if global_step % args.checkpointing_steps == 0 and global_step != 0:
if accelerator.is_main_process:
save_adaptor(accelerator, global_step, unet, text_encoder, args)
logs = {"loss": loss.detach().item(), "lr": lr_scheduler.get_last_lr()[0]}
progress_bar.set_postfix(**logs)
accelerator.log(logs, step=global_step)
if (
args.validation_prompt is not None
and (step + num_update_steps_per_epoch * epoch) % args.validation_steps == 0
and global_step > 10
):
unet.eval()
logger.info(
f"Running validation... \n Generating {len(validation_prompts)} images with prompt:"
f" {validation_prompts[0]}, ......"
)
# create pipeline
pipeline = DiffusionPipeline.from_pretrained(
args.pretrained_model_name_or_path,
safety_checker=None,
revision=args.revision,
)
# set `keep_fp32_wrapper` to True because we do not want to remove
# mixed precision hooks while we are still training
pipeline.unet = accelerator.unwrap_model(unet, keep_fp32_wrapper=True)
pipeline.text_encoder = accelerator.unwrap_model(text_encoder, keep_fp32_wrapper=True)
pipeline.scheduler = DPMSolverMultistepScheduler.from_config(pipeline.scheduler.config)
pipeline = pipeline.to(accelerator.device)
pipeline.set_progress_bar_config(disable=True)
# run inference
if args.seed is not None:
generator = torch.Generator(device=accelerator.device).manual_seed(args.seed)
else:
generator = None
images = []
val_img_dir = os.path.join(
args.output_dir,
f"validation/{global_step}",
args.run_name,
)
os.makedirs(val_img_dir, exist_ok=True)
for val_prompt in validation_prompts:
image = pipeline(val_prompt, num_inference_steps=50, generator=generator).images[0]
image.save(os.path.join(val_img_dir, f"{'_'.join(val_prompt.split(' '))}.png"[1:]))
images.append(image)
for tracker in accelerator.trackers:
if tracker.name == "tensorboard":
np_images = np.stack([np.asarray(img) for img in images])
tracker.writer.add_images("validation", np_images, epoch, dataformats="NHWC")
if tracker.name == "wandb":
import wandb
tracker.log(
{
"validation": [
wandb.Image(image, caption=f"{i}: {validation_prompts[i]}")
for i, image in enumerate(images)
]
}
)
del pipeline
torch.cuda.empty_cache()
if global_step >= args.max_train_steps:
break
# Printing the GPU memory usage details such as allocated memory, peak memory, and total memory usage
if not args.no_tracemalloc:
accelerator.print(f"GPU Memory before entering the train : {b2mb(tracemalloc.begin)}")
accelerator.print(f"GPU Memory consumed at the end of the train (end-begin): {tracemalloc.used}")
accelerator.print(f"GPU Peak Memory consumed during the train (max-begin): {tracemalloc.peaked}")
accelerator.print(
f"GPU Total Peak Memory consumed during the train (max): {tracemalloc.peaked + b2mb(tracemalloc.begin)}"
)
accelerator.print(f"CPU Memory before entering the train : {b2mb(tracemalloc.cpu_begin)}")
accelerator.print(f"CPU Memory consumed at the end of the train (end-begin): {tracemalloc.cpu_used}")
accelerator.print(f"CPU Peak Memory consumed during the train (max-begin): {tracemalloc.cpu_peaked}")
accelerator.print(
f"CPU Total Peak Memory consumed during the train (max): {tracemalloc.cpu_peaked + b2mb(tracemalloc.cpu_begin)}"
)
if args.push_to_hub:
repo.push_to_hub(commit_message="End of training", blocking=False, auto_lfs_prune=True)
accelerator.end_training()
if __name__ == "__main__":
args = parse_args()
main(args)


@ -0,0 +1,185 @@
CLASS_IDX=$1
# Define the UNIQUE_TOKEN, CLASS_TOKENs, and SUBJECT_NAMES
UNIQUE_TOKEN="qwe"
SUBJECT_NAMES=(
"backpack" "backpack_dog" "bear_plushie" "berry_bowl" "can"
"candle" "cat" "cat2" "clock" "colorful_sneaker"
"dog" "dog2" "dog3" "dog5" "dog6"
"dog7" "dog8" "duck_toy" "fancy_boot" "grey_sloth_plushie"
"monster_toy" "pink_sunglasses" "poop_emoji" "rc_car" "red_cartoon"
"robot_toy" "shiny_sneaker" "teapot" "vase" "wolf_plushie"
)
CLASS_TOKENs=(
"backpack" "backpack" "stuffed animal" "bowl" "can"
"candle" "cat" "cat" "clock" "sneaker"
"dog" "dog" "dog" "dog" "dog"
"dog" "dog" "toy" "boot" "stuffed animal"
"toy" "glasses" "toy" "toy" "cartoon"
"toy" "sneaker" "teapot" "vase" "stuffed animal"
)
CLASS_TOKEN=${CLASS_TOKENs[$CLASS_IDX]}
SELECTED_SUBJECT=${SUBJECT_NAMES[$CLASS_IDX]}
if [[ $CLASS_IDX =~ ^(0|1|2|3|4|5|8|9|17|18|19|20|21|22|23|24|25|26|27|28|29)$ ]]; then
PROMPT_LIST=(
"a ${UNIQUE_TOKEN} ${CLASS_TOKEN} in the jungle."
"a ${UNIQUE_TOKEN} ${CLASS_TOKEN} in the snow."
"a ${UNIQUE_TOKEN} ${CLASS_TOKEN} on the beach."
"a ${UNIQUE_TOKEN} ${CLASS_TOKEN} on a cobblestone street."
"a ${UNIQUE_TOKEN} ${CLASS_TOKEN} on top of pink fabric."
"a ${UNIQUE_TOKEN} ${CLASS_TOKEN} on top of a wooden floor."
"a ${UNIQUE_TOKEN} ${CLASS_TOKEN} with a city in the background."
"a ${UNIQUE_TOKEN} ${CLASS_TOKEN} with a mountain in the background."
"a ${UNIQUE_TOKEN} ${CLASS_TOKEN} with a blue house in the background."
"a ${UNIQUE_TOKEN} ${CLASS_TOKEN} on top of a purple rug in a forest."
"a ${UNIQUE_TOKEN} ${CLASS_TOKEN} with a wheat field in the background."
"a ${UNIQUE_TOKEN} ${CLASS_TOKEN} with a tree and autumn leaves in the background."
"a ${UNIQUE_TOKEN} ${CLASS_TOKEN} with the Eiffel Tower in the background."
"a ${UNIQUE_TOKEN} ${CLASS_TOKEN} floating on top of water."
"a ${UNIQUE_TOKEN} ${CLASS_TOKEN} floating in an ocean of milk."
"a ${UNIQUE_TOKEN} ${CLASS_TOKEN} on top of green grass with sunflowers around it."
"a ${UNIQUE_TOKEN} ${CLASS_TOKEN} on top of a mirror."
"a ${UNIQUE_TOKEN} ${CLASS_TOKEN} on top of the sidewalk in a crowded street."
"a ${UNIQUE_TOKEN} ${CLASS_TOKEN} on top of a dirt road."
"a ${UNIQUE_TOKEN} ${CLASS_TOKEN} on top of a white rug."
"a red ${UNIQUE_TOKEN} ${CLASS_TOKEN}."
"a purple ${UNIQUE_TOKEN} ${CLASS_TOKEN}."
"a shiny ${UNIQUE_TOKEN} ${CLASS_TOKEN}."
"a wet ${UNIQUE_TOKEN} ${CLASS_TOKEN}."
"a cube shaped ${UNIQUE_TOKEN} ${CLASS_TOKEN}."
)
prompt_test_list=(
"a ${CLASS_TOKEN} in the jungle"
"a ${CLASS_TOKEN} in the snow"
"a ${CLASS_TOKEN} on the beach"
"a ${CLASS_TOKEN} on a cobblestone street"
"a ${CLASS_TOKEN} on top of pink fabric"
"a ${CLASS_TOKEN} on top of a wooden floor"
"a ${CLASS_TOKEN} with a city in the background"
"a ${CLASS_TOKEN} with a mountain in the background"
"a ${CLASS_TOKEN} with a blue house in the background"
"a ${CLASS_TOKEN} on top of a purple rug in a forest"
"a ${CLASS_TOKEN} with a wheat field in the background"
"a ${CLASS_TOKEN} with a tree and autumn leaves in the background"
"a ${CLASS_TOKEN} with the Eiffel Tower in the background"
"a ${CLASS_TOKEN} floating on top of water"
"a ${CLASS_TOKEN} floating in an ocean of milk"
"a ${CLASS_TOKEN} on top of green grass with sunflowers around it"
"a ${CLASS_TOKEN} on top of a mirror"
"a ${CLASS_TOKEN} on top of the sidewalk in a crowded street"
"a ${CLASS_TOKEN} on top of a dirt road"
"a ${CLASS_TOKEN} on top of a white rug"
"a red ${CLASS_TOKEN}"
"a purple ${CLASS_TOKEN}"
"a shiny ${CLASS_TOKEN}"
"a wet ${CLASS_TOKEN}"
"a cube shaped ${CLASS_TOKEN}"
)
else
PROMPT_LIST=(
"a ${UNIQUE_TOKEN} ${CLASS_TOKEN} in the jungle."
"a ${UNIQUE_TOKEN} ${CLASS_TOKEN} in the snow."
"a ${UNIQUE_TOKEN} ${CLASS_TOKEN} on the beach."
"a ${UNIQUE_TOKEN} ${CLASS_TOKEN} on a cobblestone street."
"a ${UNIQUE_TOKEN} ${CLASS_TOKEN} on top of pink fabric."
"a ${UNIQUE_TOKEN} ${CLASS_TOKEN} on top of a wooden floor."
"a ${UNIQUE_TOKEN} ${CLASS_TOKEN} with a city in the background."
"a ${UNIQUE_TOKEN} ${CLASS_TOKEN} with a mountain in the background."
"a ${UNIQUE_TOKEN} ${CLASS_TOKEN} with a blue house in the background."
"a ${UNIQUE_TOKEN} ${CLASS_TOKEN} on top of a purple rug in a forest."
"a ${UNIQUE_TOKEN} ${CLASS_TOKEN} wearing a red hat."
"a ${UNIQUE_TOKEN} ${CLASS_TOKEN} wearing a santa hat."
"a ${UNIQUE_TOKEN} ${CLASS_TOKEN} wearing a rainbow scarf."
"a ${UNIQUE_TOKEN} ${CLASS_TOKEN} wearing a black top hat and a monocle."
"a ${UNIQUE_TOKEN} ${CLASS_TOKEN} in a chef outfit."
"a ${UNIQUE_TOKEN} ${CLASS_TOKEN} in a firefighter outfit."
"a ${UNIQUE_TOKEN} ${CLASS_TOKEN} in a police outfit."
"a ${UNIQUE_TOKEN} ${CLASS_TOKEN} wearing pink glasses."
"a ${UNIQUE_TOKEN} ${CLASS_TOKEN} wearing a yellow shirt."
"a ${UNIQUE_TOKEN} ${CLASS_TOKEN} in a purple wizard outfit."
"a red ${UNIQUE_TOKEN} ${CLASS_TOKEN}."
"a purple ${UNIQUE_TOKEN} ${CLASS_TOKEN}."
"a shiny ${UNIQUE_TOKEN} ${CLASS_TOKEN}."
"a wet ${UNIQUE_TOKEN} ${CLASS_TOKEN}."
"a cube shaped ${UNIQUE_TOKEN} ${CLASS_TOKEN}."
)
prompt_test_list=(
"a ${CLASS_TOKEN} in the jungle"
"a ${CLASS_TOKEN} in the snow"
"a ${CLASS_TOKEN} on the beach"
"a ${CLASS_TOKEN} on a cobblestone street"
"a ${CLASS_TOKEN} on top of pink fabric"
"a ${CLASS_TOKEN} on top of a wooden floor"
"a ${CLASS_TOKEN} with a city in the background"
"a ${CLASS_TOKEN} with a mountain in the background"
"a ${CLASS_TOKEN} with a blue house in the background"
"a ${CLASS_TOKEN} on top of a purple rug in a forest"
"a ${CLASS_TOKEN} wearing a red hat"
"a ${CLASS_TOKEN} wearing a santa hat"
"a ${CLASS_TOKEN} wearing a rainbow scarf"
"a ${CLASS_TOKEN} wearing a black top hat and a monocle"
"a ${CLASS_TOKEN} in a chef outfit"
"a ${CLASS_TOKEN} in a firefighter outfit"
"a ${CLASS_TOKEN} in a police outfit"
"a ${CLASS_TOKEN} wearing pink glasses"
"a ${CLASS_TOKEN} wearing a yellow shirt"
"a ${CLASS_TOKEN} in a purple wizard outfit"
"a red ${CLASS_TOKEN}"
"a purple ${CLASS_TOKEN}"
"a shiny ${CLASS_TOKEN}"
"a wet ${CLASS_TOKEN}"
"a cube shaped ${CLASS_TOKEN}"
)
fi
VALIDATION_PROMPT=${PROMPT_LIST[@]}
INSTANCE_PROMPT="a photo of ${UNIQUE_TOKEN} ${CLASS_TOKEN}"
CLASS_PROMPT="a photo of ${CLASS_TOKEN}"
export MODEL_NAME="stabilityai/stable-diffusion-2-1"
PEFT_TYPE="hra"
HRA_R=8
export PROJECT_NAME="dreambooth_${PEFT_TYPE}"
export RUN_NAME="${SELECTED_SUBJECT}_${PEFT_TYPE}_${HRA_R}"
export INSTANCE_DIR="./data/dreambooth/dataset/${SELECTED_SUBJECT}"
export CLASS_DIR="./data/class_data/${CLASS_TOKEN}"
export OUTPUT_DIR="./data/output/${PEFT_TYPE}"
accelerate launch train_dreambooth.py \
--pretrained_model_name_or_path=$MODEL_NAME \
--instance_data_dir=$INSTANCE_DIR \
--class_data_dir="$CLASS_DIR" \
--output_dir=$OUTPUT_DIR \
--project_name=$PROJECT_NAME \
--run_name=$RUN_NAME \
--with_prior_preservation \
--prior_loss_weight=1.0 \
--instance_prompt="$INSTANCE_PROMPT" \
--validation_prompt="$VALIDATION_PROMPT" \
--class_prompt="$CLASS_PROMPT" \
--resolution=512 \
--train_batch_size=1 \
--num_dataloader_workers=2 \
--lr_scheduler="constant" \
--lr_warmup_steps=0 \
--num_class_images=200 \
--use_hra \
--hra_r=$HRA_R \
--hra_bias="hra_only" \
--learning_rate=5e-3 \
--max_train_steps=510 \
--checkpointing_steps=200 \
--validation_steps=200 \
--enable_xformers_memory_efficient_attention \
--report_to="none" \


@ -0,0 +1,377 @@
# adapted from [peft's boft_dreambooth](https://github.com/huggingface/peft/tree/main/examples/boft_dreambooth)
import argparse
import os
import warnings
from typing import Optional
from huggingface_hub import HfFolder, whoami
from transformers import PretrainedConfig
def import_model_class_from_model_name_or_path(pretrained_model_name_or_path: str, revision: str):
text_encoder_config = PretrainedConfig.from_pretrained(
pretrained_model_name_or_path,
subfolder="text_encoder",
revision=revision,
)
model_class = text_encoder_config.architectures[0]
if model_class == "CLIPTextModel":
from transformers import CLIPTextModel
return CLIPTextModel
elif model_class == "RobertaSeriesModelWithTransformation":
from diffusers.pipelines.alt_diffusion.modeling_roberta_series import RobertaSeriesModelWithTransformation
return RobertaSeriesModelWithTransformation
else:
raise ValueError(f"{model_class} is not supported.")
def get_full_repo_name(model_id: str, organization: Optional[str] = None, token: Optional[str] = None):
if token is None:
token = HfFolder.get_token()
if organization is None:
username = whoami(token)["name"]
return f"{username}/{model_id}"
else:
return f"{organization}/{model_id}"
def parse_args(input_args=None):
parser = argparse.ArgumentParser(description="Simple example of a Dreambooth training script.")
parser.add_argument(
"--pretrained_model_name_or_path",
type=str,
default=None,
required=True,
help="Path to pretrained model or model identifier from huggingface.co/models.",
)
parser.add_argument(
"--revision",
type=str,
default=None,
required=False,
help="Revision of pretrained model identifier from huggingface.co/models.",
)
parser.add_argument(
"--tokenizer_name",
type=str,
default=None,
help="Pretrained tokenizer name or path if not the same as model_name",
)
parser.add_argument(
"--instance_data_dir",
type=str,
default=None,
required=True,
help="A folder containing the training data of instance images.",
)
parser.add_argument(
"--class_data_dir",
type=str,
default=None,
required=False,
help="A folder containing the training data of class images.",
)
parser.add_argument(
"--instance_prompt",
type=str,
default=None,
required=True,
help="The prompt with identifier specifying the instance",
)
parser.add_argument(
"--class_prompt",
type=str,
default=None,
help="The prompt to specify images in the same class as provided instance images.",
)
parser.add_argument(
"--with_prior_preservation",
default=False,
action="store_true",
help="Flag to add prior preservation loss.",
)
parser.add_argument("--prior_loss_weight", type=float, default=1.0, help="The weight of prior preservation loss.")
parser.add_argument(
"--num_class_images",
type=int,
default=100,
help=(
"Minimal class images for prior preservation loss. If there are not enough images already present in"
" class_data_dir, additional images will be sampled with class_prompt."
),
)
parser.add_argument(
"--validation_prompt",
nargs="+",
help="A prompt that is used during validation to verify that the model is learning.",
)
parser.add_argument(
"--num_validation_images",
type=int,
default=4,
help="Number of images that should be generated during validation with `validation_prompt`.",
)
parser.add_argument(
"--validation_steps",
type=int,
default=500,
help=(
"Run dreambooth validation every X steps. Dreambooth validation consists of running the prompt"
" `args.validation_prompt` multiple times: `args.num_validation_images`."
),
)
parser.add_argument(
"--output_dir",
type=str,
default="text-inversion-model",
help="The output directory where the model predictions and checkpoints will be written.",
)
parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.")
parser.add_argument(
"--resolution",
type=int,
default=512,
help=(
"The resolution for input images, all the images in the train/validation dataset will be resized to this"
" resolution"
),
)
parser.add_argument(
"--center_crop", action="store_true", help="Whether to center crop images before resizing to resolution"
)
parser.add_argument("--train_text_encoder", action="store_true", help="Whether to train the text encoder")
parser.add_argument(
"--set_grads_to_none",
action="store_true",
help=(
"Save more memory by using setting grads to None instead of zero. Be aware, that this changes certain"
" behaviors, so disable this argument if it causes any problems. More info:"
" https://pytorch.org/docs/stable/generated/torch.optim.Optimizer.zero_grad.html"
),
)
# hra args
parser.add_argument("--use_hra", action="store_true", help="Whether to use HRA for parameter efficient tuning.")
parser.add_argument("--hra_r", type=int, default=8, help="The rank of HRA across different layers.")
parser.add_argument(
"--hra_apply_GS", default=False, action="store_true", help="Whether to apply Gram-Schmidt orthogonalization."
)
parser.add_argument(
"--hra_bias",
type=str,
default="none",
help="Bias type for HRA. Can be 'none', 'all' or 'hra_only', only used if use_hra is True.",
)
parser.add_argument(
"--num_dataloader_workers", type=int, default=1, help="Num of workers for the training dataloader."
)
parser.add_argument(
"--no_tracemalloc",
default=False,
action="store_true",
help="Flag to stop memory allocation tracing during training. This could speed up training on Windows.",
)
parser.add_argument(
"--train_batch_size", type=int, default=4, help="Batch size (per device) for the training dataloader."
)
parser.add_argument(
"--sample_batch_size", type=int, default=4, help="Batch size (per device) for sampling images."
)
parser.add_argument("--num_train_epochs", type=int, default=1)
parser.add_argument(
"--max_train_steps",
type=int,
default=None,
help="Total number of training steps to perform. If provided, overrides num_train_epochs.",
)
parser.add_argument(
"--checkpointing_steps",
type=int,
default=500,
help=(
"Save a checkpoint of the training state every X updates. These checkpoints can be used both as final"
" checkpoints in case they are better than the last checkpoint, and are also suitable for resuming"
" training using `--resume_from_checkpoint`."
),
)
parser.add_argument(
"--resume_from_checkpoint",
type=str,
default=None,
help=(
"Whether training should be resumed from a previous checkpoint. Use a path saved by"
' `--checkpointing_steps`, or `"latest"` to automatically select the last available checkpoint.'
),
)
parser.add_argument(
"--gradient_accumulation_steps",
type=int,
default=1,
help="Number of updates steps to accumulate before performing a backward/update pass.",
)
parser.add_argument(
"--gradient_checkpointing",
action="store_true",
help="Whether or not to use gradient checkpointing to save memory at the expense of slower backward pass.",
)
parser.add_argument(
"--learning_rate",
type=float,
default=5e-6,
help="Initial learning rate (after the potential warmup period) to use.",
)
parser.add_argument(
"--scale_lr",
action="store_true",
default=False,
help="Scale the learning rate by the number of GPUs, gradient accumulation steps, and batch size.",
)
parser.add_argument(
"--lr_scheduler",
type=str,
default="constant",
help=(
'The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",'
' "constant", "constant_with_warmup"]'
),
)
parser.add_argument(
"--lr_warmup_steps", type=int, default=500, help="Number of steps for the warmup in the lr scheduler."
)
parser.add_argument(
"--lr_num_cycles",
type=int,
default=1,
help="Number of hard resets of the lr in cosine_with_restarts scheduler.",
)
parser.add_argument("--lr_power", type=float, default=1.0, help="Power factor of the polynomial scheduler.")
parser.add_argument(
"--use_8bit_adam", action="store_true", help="Whether or not to use 8-bit Adam from bitsandbytes."
)
parser.add_argument("--adam_beta1", type=float, default=0.9, help="The beta1 parameter for the Adam optimizer.")
parser.add_argument("--adam_beta2", type=float, default=0.999, help="The beta2 parameter for the Adam optimizer.")
parser.add_argument("--adam_weight_decay", type=float, default=1e-2, help="Weight decay to use.")
parser.add_argument("--adam_epsilon", type=float, default=1e-08, help="Epsilon value for the Adam optimizer")
parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
parser.add_argument("--push_to_hub", action="store_true", help="Whether or not to push the model to the Hub.")
parser.add_argument("--hub_token", type=str, default=None, help="The token to use to push to the Model Hub.")
parser.add_argument(
"--hub_model_id",
type=str,
default=None,
help="The name of the repository to keep in sync with the local `output_dir`.",
)
parser.add_argument(
"--logging_dir",
type=str,
default="logs",
help=(
"[TensorBoard](https://www.tensorflow.org/tensorboard) log directory. Will default to"
" *output_dir/runs/**CURRENT_DATETIME_HOSTNAME***."
),
)
parser.add_argument(
"--allow_tf32",
action="store_true",
help=(
"Whether or not to allow TF32 on Ampere GPUs. Can be used to speed up training. For more information, see"
" https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices"
),
)
parser.add_argument(
"--project_name",
type=str,
default=None,
help=("The project name for log tracking"),
)
parser.add_argument(
"--run_name",
type=str,
default=None,
help=("The run name for log tracking"),
)
parser.add_argument(
"--report_to",
type=str,
default="wandb",
help=(
'The integration to report the results and logs to. Supported platforms are `"wandb"`'
' (default), `"tensorboard"` and `"comet_ml"`. Use `"all"` to report to all integrations.'
),
)
parser.add_argument(
"--wandb_key",
type=str,
default=None,
help=("If `report_to` is set to wandb, the API key used to log in to wandb."),
)
parser.add_argument(
"--wandb_project_name",
type=str,
default=None,
help=("If `report_to` is set to wandb, the wandb project name used for log tracking."),
)
parser.add_argument(
"--wandb_run_name",
type=str,
default=None,
help=("If `report_to` is set to wandb, the wandb run name used for log tracking."),
)
parser.add_argument(
"--mixed_precision",
type=str,
default=None,
choices=["no", "fp16", "bf16"],
help=(
"Whether to use mixed precision. Choose between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >="
" 1.10.and an Nvidia Ampere GPU. Default to the value of accelerate config of the current system or the"
" flag passed with the `accelerate.launch` command. Use this argument to override the accelerate config."
),
)
parser.add_argument(
"--prior_generation_precision",
type=str,
default=None,
choices=["no", "fp32", "fp16", "bf16"],
help=(
"Choose prior generation precision between fp32, fp16 and bf16 (bfloat16). Bf16 requires PyTorch >="
" 1.10.and an Nvidia Ampere GPU. Default to fp16 if a GPU is available else fp32."
),
)
parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
parser.add_argument(
"--enable_xformers_memory_efficient_attention", action="store_true", help="Whether or not to use xformers."
)
if input_args is not None:
args = parser.parse_args(input_args)
else:
args = parser.parse_args()
env_local_rank = int(os.environ.get("LOCAL_RANK", -1))
if env_local_rank != -1 and env_local_rank != args.local_rank:
args.local_rank = env_local_rank
# Sanity checks
# if args.dataset_name is None and args.train_data_dir is None:
# raise ValueError("Need either a dataset name or a training folder.")
if args.with_prior_preservation:
if args.class_data_dir is None:
raise ValueError("You must specify a data directory for class images.")
if args.class_prompt is None:
raise ValueError("You must specify prompt for class images.")
else:
# logger is not available yet
if args.class_data_dir is not None:
warnings.warn("You need not use --class_data_dir without --with_prior_preservation.")
if args.class_prompt is not None:
warnings.warn("You need not use --class_prompt without --with_prior_preservation.")
return args


@ -0,0 +1,128 @@
# adapted from [peft's boft_dreambooth](https://github.com/huggingface/peft/tree/main/examples/boft_dreambooth)
from pathlib import Path
import torch
from PIL import Image
from torch.utils.data import Dataset
from torchvision import transforms
class DreamBoothDataset(Dataset):
"""
A dataset to prepare the instance and class images with the prompts for fine-tuning the model.
It pre-processes the images and tokenizes the prompts.
"""
def __init__(
self,
instance_data_root,
instance_prompt,
tokenizer,
class_data_root=None,
class_prompt=None,
size=512,
center_crop=False,
):
self.size = size
self.center_crop = center_crop
self.tokenizer = tokenizer
self.instance_data_root = Path(instance_data_root)
if not self.instance_data_root.exists():
raise ValueError("Instance images root doesn't exists.")
self.instance_images_path = list(Path(instance_data_root).iterdir())
self.num_instance_images = len(self.instance_images_path)
self.instance_prompt = instance_prompt
self._length = self.num_instance_images
if class_data_root is not None:
self.class_data_root = Path(class_data_root)
self.class_data_root.mkdir(parents=True, exist_ok=True)
self.class_images_path = list(self.class_data_root.iterdir())
self.num_class_images = len(self.class_images_path)
self._length = max(self.num_class_images, self.num_instance_images)
self.class_prompt = class_prompt
else:
self.class_data_root = None
self.image_transforms = transforms.Compose(
[
transforms.Resize(size, interpolation=transforms.InterpolationMode.BILINEAR),
transforms.CenterCrop(size) if center_crop else transforms.RandomCrop(size),
transforms.ToTensor(),
transforms.Normalize([0.5], [0.5]),
]
)
def __len__(self):
return self._length
def __getitem__(self, index):
example = {}
instance_image = Image.open(self.instance_images_path[index % self.num_instance_images])
if not instance_image.mode == "RGB":
instance_image = instance_image.convert("RGB")
example["instance_images"] = self.image_transforms(instance_image)
example["instance_prompt_ids"] = self.tokenizer(
self.instance_prompt,
truncation=True,
padding="max_length",
max_length=self.tokenizer.model_max_length,
return_tensors="pt",
).input_ids
if self.class_data_root:
class_image = Image.open(self.class_images_path[index % self.num_class_images])
if not class_image.mode == "RGB":
class_image = class_image.convert("RGB")
example["class_images"] = self.image_transforms(class_image)
example["class_prompt_ids"] = self.tokenizer(
self.class_prompt,
truncation=True,
padding="max_length",
max_length=self.tokenizer.model_max_length,
return_tensors="pt",
).input_ids
return example
def collate_fn(examples, with_prior_preservation=False):
input_ids = [example["instance_prompt_ids"] for example in examples]
pixel_values = [example["instance_images"] for example in examples]
# Concat class and instance examples for prior preservation.
# We do this to avoid doing two forward passes.
if with_prior_preservation:
input_ids += [example["class_prompt_ids"] for example in examples]
pixel_values += [example["class_images"] for example in examples]
pixel_values = torch.stack(pixel_values)
pixel_values = pixel_values.to(memory_format=torch.contiguous_format).float()
input_ids = torch.cat(input_ids, dim=0)
batch = {
"input_ids": input_ids,
"pixel_values": pixel_values,
}
return batch
class PromptDataset(Dataset):
"A simple dataset to prepare the prompts to generate class images on multiple GPUs."
def __init__(self, prompt, num_samples):
self.prompt = prompt
self.num_samples = num_samples
def __len__(self):
return self.num_samples
def __getitem__(self, index):
example = {}
example["prompt"] = self.prompt
example["index"] = index
return example


@ -0,0 +1,60 @@
# adapted from [peft's boft_dreambooth](https://github.com/huggingface/peft/tree/main/examples/boft_dreambooth)
import gc
import threading
import psutil
import torch
# Converting Bytes to Megabytes
def b2mb(x):
return int(x / 2**20)
# This context manager is used to track the peak memory usage of the process
class TorchTracemalloc:
def __enter__(self):
gc.collect()
torch.cuda.empty_cache()
torch.cuda.reset_max_memory_allocated() # reset the peak gauge to zero
self.begin = torch.cuda.memory_allocated()
self.process = psutil.Process()
self.cpu_begin = self.cpu_mem_used()
self.peak_monitoring = True
peak_monitor_thread = threading.Thread(target=self.peak_monitor_func)
peak_monitor_thread.daemon = True
peak_monitor_thread.start()
return self
def cpu_mem_used(self):
"""get resident set size memory for the current process"""
return self.process.memory_info().rss
def peak_monitor_func(self):
self.cpu_peak = -1
while True:
self.cpu_peak = max(self.cpu_mem_used(), self.cpu_peak)
# can't sleep or will not catch the peak right (this comment is here on purpose)
# time.sleep(0.001) # 1msec
if not self.peak_monitoring:
break
def __exit__(self, *exc):
self.peak_monitoring = False
gc.collect()
torch.cuda.empty_cache()
self.end = torch.cuda.memory_allocated()
self.peak = torch.cuda.max_memory_allocated()
self.used = b2mb(self.end - self.begin)
self.peaked = b2mb(self.peak - self.begin)
self.cpu_end = self.cpu_mem_used()
self.cpu_used = b2mb(self.cpu_end - self.cpu_begin)
self.cpu_peaked = b2mb(self.cpu_peak - self.cpu_begin)
# print(f"delta used/peak {self.used:4d}/{self.peaked:4d}")


@ -0,0 +1,84 @@
# OLoRA: Orthonormal Low Rank Adaptation of Large Language Models
## Introduction
[OLoRA](https://arxiv.org/abs/2406.01775) is a novel approach that leverages orthonormal low-rank adaptation through QR decomposition. Unlike the default LoRA implementation, OLoRA decomposes the original weights into their $\mathbf{Q}$ and $\mathbf{R}$ parts, and then uses the first `rank` rows of $\mathbf{R}$ and the first `rank` columns of $\mathbf{Q}$ to initialize $\mathbf{A}$ and $\mathbf{B}$, respectively. The authors report significantly faster convergence, more stable training, and superior performance.
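The following is a minimal sketch of the initialization idea, assuming a frozen weight of shape `(out, in)`; it is not the library's exact code, which also handles scaling and dtype:
```python
import torch

def olora_init(weight: torch.Tensor, r: int):
    # QR-decompose the frozen weight and take the leading factors
    Q, R = torch.linalg.qr(weight)  # reduced QR: Q is (out, k), R is (k, in)
    B = Q[:, :r].clone()  # initializes lora_B with the first r columns of Q
    A = R[:r, :].clone()  # initializes lora_A with the first r rows of R
    new_weight = weight - B @ A  # mutate the base weight so that new_weight + B @ A == weight
    return new_weight, A, B

W = torch.randn(32, 16)
W_new, A, B = olora_init(W, r=8)
assert torch.allclose(W_new + B @ A, W, atol=1e-6)
```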
## Quick start
```python
import torch
from peft import LoraConfig, get_peft_model
from transformers import AutoTokenizer, AutoModelForCausalLM
from trl import SFTTrainer
from datasets import load_dataset
model = AutoModelForCausalLM.from_pretrained("facebook/opt-350m", torch_dtype=torch.bfloat16, device_map="auto")
tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m")
dataset = load_dataset("imdb", split="train[:1%]")
lora_config = LoraConfig(
init_lora_weights="olora"
)
peft_model = get_peft_model(model, lora_config)
trainer = SFTTrainer(
model=peft_model,
train_dataset=dataset,
dataset_text_field="text",
max_seq_length=512,
tokenizer=tokenizer,
)
trainer.train()
peft_model.save_pretrained("olora-opt-350m")
```
No change to your standard LoRA procedure is needed, other than specifying the `init_lora_weights="olora"` option in your LoRA configuration.
Additionally, you can refer to the OLoRA fine-tuning script.
Run it with:
```bash
python3 examples/olora_finetuning/olora_finetuning.py --base_model facebook/opt-350m
```
OLoRA also supports quantization. To use 4-bit quantization try:
```bash
python3 examples/olora_finetuning/olora_finetuning.py --base_model facebook/opt-350m --quantize
```
## Use the model
You can load and use the model as any other 🤗 PEFT model
```python
from peft import PeftModel
model = AutoModelForCausalLM.from_pretrained("facebook/opt-350m")
tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m")
olora_model = PeftModel.from_pretrained(model, "olora-opt-350m")
```
## OLoRA and LoRA
OLoRA differs from LoRA in that it mutates the original weights. To utilize multiple adapters simultaneously, you can leverage the `path_initial_model_for_weight_conversion` option. Below is a simple template illustrating how to convert OLoRA to conventional LoRA:
```python
base_model = AutoModel.from_pretrained("facebook/opt-350m")
olora_config = LoraConfig(
...
init_lora_weights = "olora" # Initialize the model with OLoRA
)
olora_model = get_peft_model(base_model, olora_config)
init_path = <path-to-untrained-olora-model>
olora_model.save_pretrained(init_path) # Save the model *before* performing any training
# Train the model
train(olora_model) # Your training loop
# Save the model after training
olora_model.save_pretrained(output_dir, path_initial_model_for_weight_conversion=init_path)
```
After training completes, you can save and convert your OLoRA model to a conventional LoRA model by setting `path_initial_model_for_weight_conversion` to `init_path`, i.e. the path of your untrained OLoRA model saved above. This conversion makes it possible to use multiple adapters on top of the unmodified base model, as shown below. Note that this conversion is not supported if `rslora` is used in combination with `rank_pattern` or `alpha_pattern`.
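For illustration, a converted adapter can then be combined with others on the unmodified base model (the adapter paths below are hypothetical):
```python
from transformers import AutoModelForCausalLM
from peft import PeftModel

base_model = AutoModelForCausalLM.from_pretrained("facebook/opt-350m")
# Converted adapters behave like regular LoRA adapters, so several can be
# loaded side by side and switched freely:
model = PeftModel.from_pretrained(base_model, "path/to/converted-olora-1", adapter_name="adapter1")
model.load_adapter("path/to/converted-olora-2", adapter_name="adapter2")
model.set_adapter("adapter2")
```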
## Citation
```
@misc{büyükakyüz2024olora,
title={OLoRA: Orthonormal Low-Rank Adaptation of Large Language Models},
author={Kerim Büyükakyüz},
year={2024},
eprint={2406.01775},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
```

View File

@ -0,0 +1,184 @@
# Copyright 2024-present the HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import List, Optional
import torch
import transformers
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import (
LoraConfig,
get_peft_model,
)
def train(
base_model: str = "path/to/model",
data_path: str = "yahma/alpaca-cleaned",
output_dir: str = "olora",
batch_size: int = 16,
num_epochs: int = 1,
learning_rate: float = 3e-4,
cutoff_len: int = 256,
val_set_size: int = 16,
quantize: bool = False,
eval_step: int = 100,
save_step: int = 100,
device_map: str = "auto",
lora_r: int = 32,
lora_alpha: int = 16,
lora_dropout: float = 0.05,
lora_target_modules: Optional[List[str]] = None,
init_lora_weights="olora",
):
model = AutoModelForCausalLM.from_pretrained(
base_model,
device_map=device_map,
quantization_config=BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_compute_dtype=torch.bfloat16,
bnb_4bit_use_double_quant=True,
bnb_4bit_quant_type="nf4",
)
if quantize
else None,
torch_dtype=torch.float16,
)
tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)
def tokenize(prompt, add_eos_token=True):
result = tokenizer(
prompt,
truncation=True,
max_length=cutoff_len,
padding=False,
return_tensors=None,
)
if (
result["input_ids"][-1] != tokenizer.eos_token_id
and len(result["input_ids"]) < cutoff_len
and add_eos_token
):
result["input_ids"].append(tokenizer.eos_token_id)
result["attention_mask"].append(1)
result["labels"] = result["input_ids"].copy()
return result
def generate_and_tokenize_prompt(example):
full_prompt = generate_prompt(example)
tokenized_full_prompt = tokenize(full_prompt)
return tokenized_full_prompt
config = LoraConfig(
r=lora_r,
lora_alpha=lora_alpha,
target_modules=lora_target_modules,
lora_dropout=lora_dropout,
bias="none",
task_type="CAUSAL_LM",
init_lora_weights=init_lora_weights,
)
model = get_peft_model(model, config)
data = load_dataset(data_path)
train_val = data["train"].train_test_split(test_size=val_set_size, shuffle=True, seed=42)
train_data = train_val["train"].shuffle().map(generate_and_tokenize_prompt)
val_data = train_val["test"].shuffle().map(generate_and_tokenize_prompt)
trainer = transformers.Trainer(
model=model,
train_dataset=train_data,
eval_dataset=val_data,
args=transformers.TrainingArguments(
per_device_train_batch_size=batch_size,
warmup_steps=100,
num_train_epochs=num_epochs,
learning_rate=learning_rate,
fp16=True,
logging_steps=100,
optim="adamw_torch",
evaluation_strategy="steps",
save_strategy="steps",
eval_steps=eval_step,
save_steps=save_step,
output_dir=output_dir,
save_total_limit=3,
load_best_model_at_end=True,
),
data_collator=transformers.DataCollatorForSeq2Seq(
tokenizer, pad_to_multiple_of=8, return_tensors="pt", padding=True
),
)
trainer.train()
model.save_pretrained(output_dir)
def generate_prompt(example):
return f"""Below is an instruction that describes a task. Write a response that appropriately completes the request.
### Instruction:
{example["instruction"]}
### Response:
{example["output"]}"""
if __name__ == "__main__":
import argparse
parser = argparse.ArgumentParser()
parser.add_argument("--base_model", type=str, default="path/to/model")
parser.add_argument("--data_path", type=str, default="yahma/alpaca-cleaned")
parser.add_argument("--output_dir", type=str, default="olora")
parser.add_argument("--batch_size", type=int, default=16)
parser.add_argument("--num_epochs", type=int, default=1)
parser.add_argument("--learning_rate", type=float, default=3e-4)
parser.add_argument("--cutoff_len", type=int, default=256)
parser.add_argument("--val_set_size", type=int, default=16)
parser.add_argument("--quantize", action="store_true")
parser.add_argument("--eval_step", type=int, default=100)
parser.add_argument("--save_step", type=int, default=100)
parser.add_argument("--device_map", type=str, default="auto")
parser.add_argument("--lora_r", type=int, default=32)
parser.add_argument("--lora_alpha", type=int, default=16)
parser.add_argument("--lora_dropout", type=float, default=0.05)
parser.add_argument("--lora_target_modules", type=str, default=None)
parser.add_argument("--init_lora_weights", type=str, default="olora")
args = parser.parse_args()
train(
base_model=args.base_model,
data_path=args.data_path,
output_dir=args.output_dir,
batch_size=args.batch_size,
num_epochs=args.num_epochs,
learning_rate=args.learning_rate,
cutoff_len=args.cutoff_len,
val_set_size=args.val_set_size,
quantize=args.quantize,
eval_step=args.eval_step,
save_step=args.save_step,
device_map=args.device_map,
lora_r=args.lora_r,
lora_alpha=args.lora_alpha,
lora_dropout=args.lora_dropout,
lora_target_modules=args.lora_target_modules.split(",") if args.lora_target_modules else None,  # comma-separated, e.g. "q_proj,v_proj"
init_lora_weights=args.init_lora_weights,
)

View File

@ -0,0 +1,131 @@
# PiSSA: Principal Singular values and Singular vectors Adaptation
## Introduction ([Paper](https://arxiv.org/abs/2404.02948), [code](https://github.com/GraphPKU/PiSSA))
PiSSA represents a matrix $W\in\mathbb{R}^{m\times n}$ within the model by the product of two trainable matrices $A \in \mathbb{R}^{m\times r}$ and $B \in \mathbb{R}^{r\times n}$, where $r \ll \min(m, n)$, plus a residual matrix $W^{res}\in\mathbb{R}^{m\times n}$ for error correction. Singular value decomposition (SVD) is employed to factorize $W$, and the principal singular values and vectors of $W$ are used to initialize $A$ and $B$. The residual singular values and vectors initialize the residual matrix $W^{res}$, which remains frozen during fine-tuning. This straightforward modification allows PiSSA to converge more rapidly than LoRA and ultimately attain superior performance. Moreover, PiSSA reduces the quantization error compared to QLoRA, leading to further enhancements.
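To make the initialization concrete, here is a minimal sketch of the idea (illustrative only, not the exact PEFT implementation; the weight `W` and rank `r` below are placeholders):
```python
import torch

W = torch.randn(512, 512)                  # a pretrained weight matrix (illustrative)
r = 16                                     # adaptation rank
U, S, Vh = torch.linalg.svd(W, full_matrices=False)
A = U[:, :r] * S[:r].sqrt()                # principal singular vectors/values -> A (m x r)
B = S[:r].sqrt().unsqueeze(1) * Vh[:r, :]  # principal singular vectors/values -> B (r x n)
W_res = W - A @ B                          # residual part, frozen during fine-tuning
# Before any training, the adapted layer still computes W exactly:
assert torch.allclose(W_res + A @ B, W, atol=1e-4)
```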
## Quick Start
```python
import torch
from peft import LoraConfig, get_peft_model
from transformers import AutoTokenizer, AutoModelForCausalLM
from trl import SFTTrainer
from datasets import load_dataset
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf", torch_dtype=torch.bfloat16, device_map="auto")
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
tokenizer.pad_token_id = tokenizer.eos_token_id
lora_config = LoraConfig(
# init_lora_weights="pissa", # Configure the initialization method to "pissa", which may take several minutes to execute SVD on the pre-trained model.
init_lora_weights="pissa_niter_4", # Initialize the PiSSA with fast SVD, which completes in just a few seconds.
)
peft_model = get_peft_model(model, lora_config)
peft_model.print_trainable_parameters()
dataset = load_dataset("imdb", split="train[:1%]")
trainer = SFTTrainer(
model=peft_model,
train_dataset=dataset,
dataset_text_field="text",
max_seq_length=128,
tokenizer=tokenizer,
)
trainer.train()
peft_model.save_pretrained("pissa-llama-2-7b")
```
When utilizing fast SVD, reducing the rank and the number of iterations decreases the time required, but it leads to higher errors in the computed matrices $A$ and $B$. To preserve the model's initial capabilities, we calculate the residual matrix by $W^{res} = W - AB$. Even with potential errors in $A$ and $B$, the sum of $W^{res}$ and $AB$ still equals $W$ exactly.
To use the fine-tuned PiSSA modules, simply run the following:
```python
import torch
from peft import PeftModel
from transformers import AutoModelForCausalLM
model = AutoModelForCausalLM.from_pretrained(
"meta-llama/Llama-2-7b-hf", torch_dtype=torch.bfloat16, device_map="auto"
)
# Performs SVD again to initialize the residual model and loads the state_dict of the fine-tuned PiSSA modules.
peft_model = PeftModel.from_pretrained(model, "pissa-llama-2-7b")
```
## Advanced Usage
### Access the preprocessed models
We recommend downloading decomposed models directly from the [Hugging Face Collections](https://huggingface.co/collections/fxmeng/pissa-661ce700721235e542a5d7a8) instead of performing SVD every time.
If the existing models do not meet your needs, apply PiSSA initialization to a pre-trained model and store the decomposed model locally:
```bash
python preprocess.py \
--base_model_name_or_path meta-llama/Llama-2-7b-hf \
--init_lora_weights pissa \
--output_dir pissa-llama-2-7b-r32-alpha-32 \
--lora_r 32 \
--lora_alpha 32 \
--lora_dropout 0 \
--bits bf16
```
### Convert PiSSA to LoRA
The main advantage of PiSSA lies in the training phase. Once a PiSSA adapter is trained, we recommend converting it to an equivalent LoRA adapter for use and sharing.
```python
# The fine-tuned matrices $A$ and $B$ in the PiSSA adapter are saved and must be combined with the residual model.
peft_model.save_pretrained(output_dir)
# Given the untrained PiSSA-initialized matrices $A_0$ and $B_0$ and the trained matrices $A$ and $B$,
# we can convert these to LoRA via $\Delta W = AB - A_0 B_0 = [A \mid A_0] \begin{bmatrix} B \\ -B_0 \end{bmatrix} = A'B'$.
peft_model.save_pretrained(output_dir, convert_pissa_to_lora="pissa_init")
```
This conversion enables the loading of LoRA on top of a standard base model:
```python
import torch
from peft import PeftModel
from transformers import AutoModelForCausalLM
model = AutoModelForCausalLM.from_pretrained(
"meta-llama/Llama-2-7b-hf", torch_dtype=torch.bfloat16, device_map="auto"
)
# No SVD is performed during this step, and the base model remains unaltered.
peft_model = PeftModel.from_pretrained(model, "pissa-llama-2-7b-lora")
```
Utilizing the converted LoRA does not require modifying the parameters of the base model. When multiple converted LoRAs are needed simultaneously, each adapter operates independently without interference, allowing adapters to be freely added or deleted.
Note that this conversion is not supported if `rslora` is used in combination with `rank_pattern` or `alpha_pattern`.
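As a sanity check, the identity behind the conversion can be verified numerically (a small illustrative sketch with random factors):
```python
import torch

m, n, r = 64, 32, 4
A0, B0 = torch.randn(m, r), torch.randn(r, n)  # PiSSA-initialized, untrained factors
A, B = torch.randn(m, r), torch.randn(r, n)    # factors after fine-tuning
delta_w = A @ B - A0 @ B0                      # the weight update learned by PiSSA
# Stack into a rank-2r LoRA adapter: A' = [A | A0], B' = [B ; -B0]
A_prime = torch.cat([A, A0], dim=1)
B_prime = torch.cat([B, -B0], dim=0)
assert torch.allclose(A_prime @ B_prime, delta_w, atol=1e-4)
```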
### Fine-tune in 4-bit or 8-bit
If quantization fine-tuning is desired, it is necessary to first decompose the original model at full precision and then reload the residual model in either 4-bit or 8-bit configurations.
```shell
python pissa_finetuning.py \
--residual_model_name_or_path fxmeng/pissa-llama-2-7b-r16-alpha-16 \
--output_dir output/pissa-llama-2-7b-r16-alpha-16-metamath-10k \
--bits nf4 \
--data_path meta-math/MetaMathQA \
--dataset_split train[:100000] \
--dataset_field query response \
--bf16 True \
--num_train_epochs 1 \
--per_device_train_batch_size 32 \
--gradient_accumulation_steps 4 \
--save_strategy "steps" \
--save_steps 1000 \
--save_total_limit 1 \
--logging_steps 1 \
--learning_rate 2e-5 \
--weight_decay 0. \
--warmup_ratio 0.03 \
--tf32 True \
--report_to none \
--convert_pissa_to_lora
```
This approach ensures the preservation of high-frequency, out-of-distribution parameters in the low-rank PiSSA modules, resulting in reduced quantization errors during the quantization of the residual model.
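For reference, the corresponding loading logic in `pissa_finetuning.py` roughly follows this pattern (a condensed sketch using one of the preprocessed models mentioned above):
```python
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel, prepare_model_for_kbit_training

# Quantize only the residual model; the PiSSA modules are loaded in full precision.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)
res_model = AutoModelForCausalLM.from_pretrained(
    "fxmeng/pissa-llama-2-7b-r16-alpha-16", quantization_config=quantization_config
)
res_model = prepare_model_for_kbit_training(res_model)
peft_model = PeftModel.from_pretrained(
    res_model, "fxmeng/pissa-llama-2-7b-r16-alpha-16", subfolder="pissa_init", is_trainable=True
)
```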
## Citation
```
@article{meng2024pissa,
title={PiSSA: Principal Singular Values and Singular Vectors Adaptation of Large Language Models},
author={Meng, Fanxu and Wang, Zhaohui and Zhang, Muhan},
journal={arXiv preprint arXiv:2404.02948},
year={2024}
}
```

View File

@ -0,0 +1,156 @@
# Copyright 2023-present the HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
from dataclasses import dataclass, field
from typing import List, Optional
import torch
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, HfArgumentParser, TrainingArguments
from trl import SFTTrainer
from peft import LoraConfig, PeftModel, get_peft_model, prepare_model_for_kbit_training
@dataclass
class TrainingArguments(TrainingArguments):
# model configs
base_model_name_or_path: Optional[str] = field(
default=None, metadata={"help": "The name or path of the fp32/16 base model."}
)
residual_model_name_or_path: Optional[str] = field(
default=None,
metadata={
"help": "The name or path of the fp32/16 residual model. (`['fxmeng/pissa-llama-2-7b-r16-alpha-16']`)"
},
)
bits: str = field(default="fp32", metadata={"help": "(`['fp4', 'nf4', 'int8', 'bf16', 'fp16', 'fp32']`)"})
init_lora_weights: str = field(default="pissa", metadata={"help": "(`['gaussian', 'pissa', 'pissa_niter_4']`)"})
lora_r: int = field(default=16)
lora_alpha: int = field(default=16)
lora_dropout: float = field(default=0)
convert_pissa_to_lora: bool = field(default=False)
merge_and_save: bool = field(default=False)
# dataset configs
data_path: str = field(default="imdb", metadata={"help": "Path to the training data."})
dataset_split: str = field(default="train[:1%]", metadata={"help": "(`['train', 'test', 'eval']`):"})
dataset_field: List[str] = field(default=None, metadata={"help": "Fields of dataset input and output."})
max_seq_length: int = field(
default=512,
metadata={"help": "Maximum sequence length. Sequences will be right padded (and possibly truncated)."},
)
parser = HfArgumentParser(TrainingArguments)
script_args = parser.parse_args_into_dataclasses()[0]
print(script_args)
print(f"Load pre-processed residual model in {script_args.bits} bits.")
if script_args.bits in ["nf4", "fp4", "int8"]:
quantization_config = BitsAndBytesConfig(
load_in_4bit=(script_args.bits == "nf4" or script_args.bits == "fp4"),
load_in_8bit=script_args.bits == "int8",
bnb_4bit_quant_type=script_args.bits,
bnb_4bit_use_double_quant=True,
bnb_4bit_compute_dtype=torch.bfloat16,
)
res_model = AutoModelForCausalLM.from_pretrained(
script_args.residual_model_name_or_path, quantization_config=quantization_config, low_cpu_mem_usage=True
)
res_model = prepare_model_for_kbit_training(res_model)
print("Wrapping the residual model with PiSSA.")
peft_model = PeftModel.from_pretrained(
res_model, script_args.residual_model_name_or_path, subfolder="pissa_init", is_trainable=True
)
tokenizer = AutoTokenizer.from_pretrained(script_args.residual_model_name_or_path)
elif script_args.residual_model_name_or_path is not None:
res_model = AutoModelForCausalLM.from_pretrained(
script_args.residual_model_name_or_path,
torch_dtype=(
torch.float16
if script_args.bits == "fp16"
else (torch.bfloat16 if script_args.bits == "bf16" else torch.float32)
),
device_map="auto",
)
print("Wrapping the residual model with PiSSA.")
peft_model = PeftModel.from_pretrained(
res_model, script_args.residual_model_name_or_path, subfolder="pissa_init", is_trainable=True
)
tokenizer = AutoTokenizer.from_pretrained(script_args.residual_model_name_or_path)
elif script_args.base_model_name_or_path is not None:
print(
f"No available pre-processed model, manually initialize a PiSSA using {script_args.base_model_name_or_path}."
)
model = AutoModelForCausalLM.from_pretrained(
script_args.base_model_name_or_path,
torch_dtype=(
torch.float16
if script_args.bits == "fp16"
else (torch.bfloat16 if script_args.bits == "bf16" else torch.float32)
),
device_map="auto",
)
tokenizer = AutoTokenizer.from_pretrained(script_args.base_model_name_or_path)
tokenizer.pad_token_id = tokenizer.eos_token_id
lora_config = LoraConfig(
r=script_args.lora_r,
lora_alpha=script_args.lora_alpha,
init_lora_weights=script_args.init_lora_weights,
lora_dropout=script_args.lora_dropout,
target_modules=["q_proj", "o_proj", "k_proj", "v_proj", "gate_proj", "up_proj", "down_proj"],
bias="none",
task_type="CAUSAL_LM",
)
peft_model = get_peft_model(model, lora_config)
print(peft_model)
peft_model.print_trainable_parameters()
print(f"Training PiSSA with trl on the {script_args.data_path}[{script_args.dataset_split}] dataset.")
dataset = load_dataset(script_args.data_path, split=script_args.dataset_split)
dataset = dataset.map(
lambda example: {
"text": f"### USER: {example[script_args.dataset_field[0]]}\n### ASSISTANT: {example[script_args.dataset_field[1]]}"
}
)
trainer = SFTTrainer(
model=peft_model,
args=script_args,
train_dataset=dataset,
dataset_text_field="text",
max_seq_length=script_args.max_seq_length,
tokenizer=tokenizer,
)
trainer.train()
trainer.save_state()
############################## Upon training completion, convert and save PiSSA in LoRA format ##############################
if script_args.convert_pissa_to_lora:
peft_model.save_pretrained(
os.path.join(script_args.output_dir, "pissa_lora"),
convert_pissa_to_lora=os.path.join(script_args.residual_model_name_or_path, "pissa_init"),
)
else:
peft_model.save_pretrained(
os.path.join(script_args.output_dir, "pissa_ft"),
)
if script_args.merge_and_save:
model = peft_model.merge_and_unload()
model.save_pretrained(os.path.join(script_args.output_dir, "pissa_merged"))
tokenizer.save_pretrained(os.path.join(script_args.output_dir, "pissa_merged"))

View File

@ -0,0 +1,67 @@
# Copyright 2023-present the HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import os
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import LoraConfig, get_peft_model
parser = argparse.ArgumentParser(description="Decompose a base model with PiSSA and save the residual model.")
parser.add_argument(
    "--base_model_name_or_path", type=str, default=None, help="The name or path of the fp32/16 base model."
)
# `--output_dir` is referenced by the save calls below; the default here is a placeholder.
parser.add_argument("--output_dir", type=str, default="pissa-model", help="Directory for the decomposed model.")
parser.add_argument("--bits", type=str, default="bf16", choices=["bf16", "fp16", "fp32"])
parser.add_argument(
"--init_lora_weights", type=str, default="pissa", help="(`['pissa', 'pissa_niter_[number of iters]']`)"
)
parser.add_argument("--lora_r", type=int, default=128)
parser.add_argument("--lora_alpha", type=int, default=128)
parser.add_argument("--lora_dropout", type=int, default=0)
script_args = parser.parse_args()
print(script_args)
model = AutoModelForCausalLM.from_pretrained(
script_args.base_model_name_or_path,
torch_dtype=(
torch.float16
if script_args.bits == "fp16"
else (torch.bfloat16 if script_args.bits == "bf16" else torch.float32)
),
device_map="auto",
)
tokenizer = AutoTokenizer.from_pretrained(script_args.base_model_name_or_path)
tokenizer.pad_token_id = tokenizer.eos_token_id
lora_config = LoraConfig(
r=script_args.lora_r,
lora_alpha=script_args.lora_alpha,
init_lora_weights=script_args.init_lora_weights,
lora_dropout=script_args.lora_dropout,
target_modules=["q_proj", "o_proj", "k_proj", "v_proj", "gate_proj", "up_proj", "down_proj"],
bias="none",
task_type="CAUSAL_LM",
)
peft_model = get_peft_model(model, lora_config)
# Save PiSSA modules:
peft_model.peft_config["default"].init_lora_weights = True
peft_model.save_pretrained(os.path.join(script_args.output_dir, "pissa_init"))
# Save residual model:
peft_model = peft_model.unload()
peft_model.save_pretrained(script_args.output_dir)
# Save the tokenizer:
tokenizer.save_pretrained(script_args.output_dir)
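# A minimal sketch of how the saved artifacts are consumed later (mirrors
# pissa_finetuning.py; paths are relative to `script_args.output_dir` above):
#
#     from transformers import AutoModelForCausalLM
#     from peft import PeftModel
#
#     res_model = AutoModelForCausalLM.from_pretrained(script_args.output_dir)
#     peft_model = PeftModel.from_pretrained(
#         res_model, script_args.output_dir, subfolder="pissa_init", is_trainable=True
#     )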

View File

@ -0,0 +1,556 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "d36e1e93-ae93-4a4e-93c6-68fd868d2882",
"metadata": {},
"source": [
"# Using FourierFT for sequence classification"
]
},
{
"cell_type": "markdown",
"id": "ddfc0610-55f6-4343-a950-125ccf0f45ac",
"metadata": {},
"source": [
"In this example, we fine-tune Roberta (base) on a sequence classification task using FourierFT."
]
},
{
"cell_type": "markdown",
"id": "45addd81-d4f3-4dfd-960d-3920d347f0a6",
"metadata": {},
"source": [
"## Imports"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "a9935ae2",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/zgaoat/anaconda3/envs/pr2/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
" from .autonotebook import tqdm as notebook_tqdm\n"
]
}
],
"source": [
"# To run this notebook, please run `pip install evaluate` to install additional dependencies not covered by PEFT.\n",
"import torch\n",
"from torch.optim import AdamW\n",
"from torch.utils.data import DataLoader\n",
"from peft import (\n",
" get_peft_model,\n",
" FourierFTConfig,\n",
" PeftType,\n",
")\n",
"\n",
"import evaluate\n",
"from datasets import load_dataset\n",
"from transformers import AutoModelForSequenceClassification, AutoTokenizer, get_linear_schedule_with_warmup, set_seed, AutoConfig\n",
"from tqdm import tqdm"
]
},
{
"cell_type": "markdown",
"id": "62c959bf-7cc2-49e0-b97e-4c10ec3b9bf3",
"metadata": {},
"source": [
"## Parameters"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "e3b13308",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<torch._C.Generator at 0x78e2a49744b0>"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"batch_size = 32\n",
"model_name_or_path = \"roberta-base\"\n",
"task = \"mrpc\"\n",
"peft_type = PeftType.FOURIERFT\n",
"device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
"num_epochs = 5 # for better results, increase this number\n",
"n_frequency = 1000 # for better results, increase this number\n",
"scaling = 150.0\n",
"max_length = 512\n",
"torch.manual_seed(0)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "0526f571",
"metadata": {},
"outputs": [],
"source": [
"peft_config = FourierFTConfig(\n",
" task_type=\"SEQ_CLS\", \n",
" n_frequency=n_frequency,\n",
" target_modules=[\"query\", \"value\"],\n",
" scaling = scaling,\n",
")\n",
"head_lr = 6e-3 # the learning rate for the classification head for NLU tasks\n",
"fft_lr = 6e-2 # the learning rate for the parameters other than the classification head (q,v in this case)"
]
},
{
"cell_type": "markdown",
"id": "c075c5d2-a457-4f37-a7f1-94fd0d277972",
"metadata": {},
"source": [
"## Loading data"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "7bb52cb4-d1c3-4b04-8bf0-f39ca88af139",
"metadata": {},
"outputs": [],
"source": [
"if any(k in model_name_or_path for k in (\"gpt\", \"opt\", \"bloom\")):\n",
" padding_side = \"left\"\n",
"else:\n",
" padding_side = \"right\"\n",
"\n",
"tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, padding_side=padding_side)\n",
"if getattr(tokenizer, \"pad_token_id\") is None:\n",
" tokenizer.pad_token_id = tokenizer.eos_token_id"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "e69c5e1f-d27b-4264-a41e-fc9b99d025e6",
"metadata": {},
"outputs": [],
"source": [
"datasets = load_dataset(\"glue\", task)\n",
"metric = evaluate.load(\"glue\", task)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "0209f778-c93b-40eb-a4e0-24c25db03980",
"metadata": {},
"outputs": [],
"source": [
"def tokenize_function(examples):\n",
" # max_length=None => use the model max length (it's actually the default)\n",
" outputs = tokenizer(examples[\"sentence1\"], examples[\"sentence2\"], truncation=True, max_length=max_length)\n",
" return outputs\n",
"\n",
"\n",
"tokenized_datasets = datasets.map(\n",
" tokenize_function,\n",
" batched=True,\n",
" remove_columns=[\"idx\", \"sentence1\", \"sentence2\"],\n",
")\n",
"\n",
"# We also rename the 'label' column to 'labels' which is the expected name for labels by the models of the\n",
"# transformers library\n",
"tokenized_datasets = tokenized_datasets.rename_column(\"label\", \"labels\")"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "7453954e-982c-46f0-b09c-589776e6d6cb",
"metadata": {},
"outputs": [],
"source": [
"def collate_fn(examples):\n",
" return tokenizer.pad(examples, padding=\"longest\", return_tensors=\"pt\")\n",
"\n",
"\n",
"# Instantiate dataloaders.\n",
"train_dataloader = DataLoader(tokenized_datasets[\"train\"], shuffle=True, collate_fn=collate_fn, batch_size=batch_size)\n",
"eval_dataloader = DataLoader(\n",
" tokenized_datasets[\"validation\"], shuffle=False, collate_fn=collate_fn, batch_size=batch_size\n",
")"
]
},
{
"cell_type": "markdown",
"id": "f3b9b2e8-f415-4d0f-9fb4-436f1a3585ea",
"metadata": {},
"source": [
"## Preparing the FourierFT model"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "2ed5ac74",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']\n",
"You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"trainable params: 616,130 || all params: 125,263,300 || trainable%: 0.4919\n"
]
}
],
"source": [
"model = AutoModelForSequenceClassification.from_pretrained(model_name_or_path, return_dict=True, max_length=None)\n",
"model = get_peft_model(model, peft_config)\n",
"model.print_trainable_parameters()"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "0d2d0381",
"metadata": {},
"outputs": [],
"source": [
"head_param = list(map(id, model.classifier.parameters()))\n",
"\n",
"others_param = filter(lambda p: id(p) not in head_param, model.parameters()) \n",
"\n",
"optimizer = AdamW([\n",
" {\"params\": model.classifier.parameters(), \"lr\": head_lr},\n",
" {\"params\": others_param, \"lr\": fft_lr}\n",
"],weight_decay=0.)\n",
"\n",
"\n",
"# Instantiate scheduler\n",
"lr_scheduler = get_linear_schedule_with_warmup(\n",
" optimizer=optimizer,\n",
" num_warmup_steps=0.06 * (len(train_dataloader) * num_epochs),\n",
" num_training_steps=(len(train_dataloader) * num_epochs),\n",
")"
]
},
{
"cell_type": "markdown",
"id": "c0dd5aa8-977b-4ac0-8b96-884b17bcdd00",
"metadata": {},
"source": [
"## Training"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "fa0e73be",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
" 0%| | 0/115 [00:00<?, ?it/s]You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.\n",
"100%|██████████| 115/115 [00:06<00:00, 19.03it/s]\n",
"100%|██████████| 13/13 [00:00<00:00, 41.72it/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"epoch 0: {'accuracy': 0.8161764705882353, 'f1': 0.8709122203098106}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 115/115 [00:05<00:00, 20.61it/s]\n",
"100%|██████████| 13/13 [00:00<00:00, 42.91it/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"epoch 1: {'accuracy': 0.8480392156862745, 'f1': 0.8966666666666666}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 115/115 [00:05<00:00, 20.63it/s]\n",
"100%|██████████| 13/13 [00:00<00:00, 42.65it/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"epoch 2: {'accuracy': 0.8676470588235294, 'f1': 0.9075342465753424}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 115/115 [00:05<00:00, 20.56it/s]\n",
"100%|██████████| 13/13 [00:00<00:00, 42.11it/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"epoch 3: {'accuracy': 0.8504901960784313, 'f1': 0.8988391376451078}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 115/115 [00:05<00:00, 20.50it/s]\n",
"100%|██████████| 13/13 [00:00<00:00, 43.15it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"epoch 4: {'accuracy': 0.8725490196078431, 'f1': 0.9103448275862069}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n"
]
}
],
"source": [
"model.to(device)\n",
"for epoch in range(num_epochs):\n",
" model.train()\n",
" for step, batch in enumerate(tqdm(train_dataloader)):\n",
" batch.to(device)\n",
" outputs = model(**batch)\n",
" loss = outputs.loss\n",
" loss.backward()\n",
" optimizer.step()\n",
" lr_scheduler.step()\n",
" optimizer.zero_grad()\n",
"\n",
" model.eval()\n",
" for step, batch in enumerate(tqdm(eval_dataloader)):\n",
" batch.to(device)\n",
" with torch.no_grad():\n",
" outputs = model(**batch)\n",
" predictions = outputs.logits.argmax(dim=-1)\n",
" predictions, references = predictions, batch[\"labels\"]\n",
" metric.add_batch(\n",
" predictions=predictions,\n",
" references=references,\n",
" )\n",
"\n",
" eval_metric = metric.compute()\n",
" print(f\"epoch {epoch}:\", eval_metric)"
]
},
{
"cell_type": "markdown",
"id": "f2b2caca",
"metadata": {},
"source": [
"## Share adapters on the 🤗 Hub"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "7b23af6f-cf6e-486f-9d10-0eada95b631f",
"metadata": {},
"outputs": [],
"source": [
"account_id = ... # your Hugging Face Hub account ID"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "990b3c93",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/zgaoat/anaconda3/envs/pr2/lib/python3.11/site-packages/huggingface_hub/file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n",
" warnings.warn(\n"
]
},
{
"data": {
"text/plain": [
"CommitInfo(commit_url='https://huggingface.co/zgaoat/roberta-base-mrpc-peft-fourierft/commit/064eb35cbb7a1073b4d8fafbeccee43a0a4e37c9', commit_message='Upload model', commit_description='', oid='064eb35cbb7a1073b4d8fafbeccee43a0a4e37c9', pr_url=None, pr_revision=None, pr_num=None)"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"model.push_to_hub(f\"{account_id}/roberta-base-mrpc-peft-fourierft\")"
]
},
{
"cell_type": "markdown",
"id": "9d140b26",
"metadata": {},
"source": [
"## Load adapters from the Hub\n",
"\n",
"You can also directly load adapters from the Hub using the commands below:"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "c283e028-b349-46b0-a20e-cde0ee5fbd7b",
"metadata": {},
"outputs": [],
"source": [
"import torch\n",
"from peft import PeftModel, PeftConfig\n",
"from transformers import AutoTokenizer"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "320b10a0-4ea8-4786-9f3c-4670019c6b18",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']\n",
"You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
]
}
],
"source": [
"peft_model_id = f\"{account_id}/roberta-base-mrpc-peft-fourierft\"\n",
"config = PeftConfig.from_pretrained(peft_model_id)\n",
"inference_model = AutoModelForSequenceClassification.from_pretrained(config.base_model_name_or_path)\n",
"tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "b3a94049-bc01-4f2e-8cf9-66daf24a4402",
"metadata": {},
"outputs": [],
"source": [
"# Load the FourierFT model\n",
"inference_model = PeftModel.from_pretrained(inference_model, peft_model_id, config=config)"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "bd919fef-4e9a-4dc5-a957-7b879cfc5d38",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
" 0%| | 0/13 [00:00<?, ?it/s]You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.\n",
"100%|██████████| 13/13 [00:00<00:00, 43.06it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'accuracy': 0.8725490196078431, 'f1': 0.9103448275862069}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n"
]
}
],
"source": [
"inference_model.to(device)\n",
"inference_model.eval()\n",
"for step, batch in enumerate(tqdm(eval_dataloader)):\n",
" batch.to(device)\n",
" with torch.no_grad():\n",
" outputs = inference_model(**batch)\n",
" predictions = outputs.logits.argmax(dim=-1)\n",
" predictions, references = predictions, batch[\"labels\"]\n",
" metric.add_batch(\n",
" predictions=predictions,\n",
" references=references,\n",
" )\n",
"\n",
"eval_metric = metric.compute()\n",
"print(eval_metric)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.9"
},
"vscode": {
"interpreter": {
"hash": "aee8b7b246df8f9039afb4144a1f6fd8d2ca17a180786b69acc140d282b71a49"
}
}
},
"nbformat": 4,
"nbformat_minor": 5
}

View File

@ -94,7 +94,7 @@
" task_type=\"SEQ_CLS\", \n",
" r=rank,\n",
" d_initial=0.1,\n",
" target_modules=[\"query\", \"value\"],\n",
" target_modules=[\"query\", \"value\", \"intermediate.dense\"],\n",
" save_projection=True,\n",
")\n",
"head_lr = 1e-2\n",
@ -205,7 +205,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
"trainable params: 610,754 || all params: 125,257,924 || trainable%: 0.48759709605278145\n"
"trainable params: 647,714 || all params: 125,294,884 || trainable%: 0.5170\n"
]
}
],
@ -255,76 +255,76 @@
"name": "stderr",
"output_type": "stream",
"text": [
" 0%| | 0/29 [00:00<?, ?it/s]You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.\n",
"100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 29/29 [00:23<00:00, 1.24it/s]\n",
"100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:01<00:00, 2.33it/s]\n"
" 0%| | 0/29 [00:00<?, ?it/s]You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.\n",
"100%|██████████| 29/29 [00:18<00:00, 1.58it/s]\n",
"100%|██████████| 4/4 [00:01<00:00, 3.52it/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"epoch 0: {'accuracy': 0.7132352941176471, 'f1': 0.823529411764706}\n"
"epoch 0: {'accuracy': 0.7475490196078431, 'f1': 0.8367670364500792}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 29/29 [00:23<00:00, 1.26it/s]\n",
"100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:01<00:00, 2.30it/s]\n"
"100%|██████████| 29/29 [00:17<00:00, 1.68it/s]\n",
"100%|██████████| 4/4 [00:01<00:00, 3.37it/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"epoch 1: {'accuracy': 0.7671568627450981, 'f1': 0.8484848484848485}\n"
"epoch 1: {'accuracy': 0.7671568627450981, 'f1': 0.8536209553158706}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 29/29 [00:23<00:00, 1.24it/s]\n",
"100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:01<00:00, 2.30it/s]\n"
"100%|██████████| 29/29 [00:17<00:00, 1.66it/s]\n",
"100%|██████████| 4/4 [00:01<00:00, 3.33it/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"epoch 2: {'accuracy': 0.8259803921568627, 'f1': 0.8738898756660745}\n"
"epoch 2: {'accuracy': 0.8553921568627451, 'f1': 0.8959435626102292}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 29/29 [00:23<00:00, 1.25it/s]\n",
"100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:01<00:00, 2.41it/s]\n"
"100%|██████████| 29/29 [00:17<00:00, 1.64it/s]\n",
"100%|██████████| 4/4 [00:01<00:00, 3.35it/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"epoch 3: {'accuracy': 0.8431372549019608, 'f1': 0.891156462585034}\n"
"epoch 3: {'accuracy': 0.8823529411764706, 'f1': 0.9133574007220215}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 29/29 [00:23<00:00, 1.25it/s]\n",
"100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:01<00:00, 2.35it/s]"
"100%|██████████| 29/29 [00:17<00:00, 1.63it/s]\n",
"100%|██████████| 4/4 [00:01<00:00, 3.17it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"epoch 4: {'accuracy': 0.8480392156862745, 'f1': 0.8938356164383561}\n"
"epoch 4: {'accuracy': 0.8897058823529411, 'f1': 0.9183303085299456}\n"
]
},
{
@ -520,18 +520,6 @@
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.11"
},
"vscode": {
"interpreter": {
"hash": "aee8b7b246df8f9039afb4144a1f6fd8d2ca17a180786b69acc140d282b71a49"

View File

@ -28,7 +28,7 @@ torchrun --nproc_per_node 8 --nnodes 1 train.py \
--per_device_eval_batch_size 8 \
--gradient_accumulation_steps 8 \
--gradient_checkpointing True \
--use_reentrant False \
--use_reentrant False \
--dataset_text_field "content" \
--use_peft_lora True \
--lora_r 8 \

View File

@ -137,7 +137,8 @@ def main(model_args, data_args, training_args):
max_seq_length=data_args.max_seq_length,
)
trainer.accelerator.print(f"{trainer.model}")
trainer.model.print_trainable_parameters()
if hasattr(trainer.model, "print_trainable_parameters"):
trainer.model.print_trainable_parameters()
# train
checkpoint = None

View File

@ -15,13 +15,13 @@
from setuptools import find_packages, setup
VERSION = "0.10.1.dev0"
VERSION = "0.12.0"
extras = {}
extras["quality"] = [
"black", # doc-builder has an implicit dependency on Black, see huggingface/doc-builder#434
"hf-doc-builder",
"ruff~=0.2.1",
"ruff~=0.4.8",
]
extras["docs_specific"] = [
"black", # doc-builder has an implicit dependency on Black, see huggingface/doc-builder#434

View File

@ -17,7 +17,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
__version__ = "0.10.1.dev0"
__version__ = "0.12.0"
from .auto import (
AutoPeftModel,
@ -44,11 +44,14 @@ from .peft_model import (
PeftModelForTokenClassification,
PeftModelForQuestionAnswering,
PeftModelForFeatureExtraction,
get_layer_status,
get_model_status,
)
from .tuners import (
AdaptionPromptConfig,
AdaptionPromptModel,
LoraConfig,
LoraRuntimeConfig,
LoftQConfig,
LoraModel,
LoHaConfig,
@ -75,8 +78,16 @@ from .tuners import (
OFTModel,
PolyConfig,
PolyModel,
LNTuningConfig,
LNTuningModel,
VeraConfig,
VeraModel,
FourierFTConfig,
FourierFTModel,
XLoraConfig,
XLoraModel,
HRAConfig,
HRAModel,
)
from .utils import (
TRANSFORMERS_MODELS_TO_PREFIX_TUNING_POSTPROCESS_MAPPING,

View File

@ -62,6 +62,7 @@ class _BaseAutoPeftModel:
adapter_name: str = "default",
is_trainable: bool = False,
config: Optional[PeftConfig] = None,
revision: Optional[str] = None,
**kwargs,
):
r"""
@ -69,8 +70,9 @@ class _BaseAutoPeftModel:
are passed along to `PeftConfig` that automatically takes care of filtering the kwargs of the Hub methods and
the config object init.
"""
peft_config = PeftConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
peft_config = PeftConfig.from_pretrained(pretrained_model_name_or_path, revision=revision, **kwargs)
base_model_path = peft_config.base_model_name_or_path
base_model_revision = peft_config.revision
task_type = getattr(peft_config, "task_type", None)
@ -101,7 +103,7 @@ class _BaseAutoPeftModel:
"Cannot infer the auto class from the config, please make sure that you are loading the correct model for your task type."
)
base_model = target_class.from_pretrained(base_model_path, **kwargs)
base_model = target_class.from_pretrained(base_model_path, revision=base_model_revision, **kwargs)
tokenizer_exists = False
if os.path.exists(os.path.join(pretrained_model_name_or_path, TOKENIZER_CONFIG_NAME)):
@ -114,7 +116,7 @@ class _BaseAutoPeftModel:
tokenizer_exists = check_file_exists_on_hf_hub(
repo_id=pretrained_model_name_or_path,
filename=TOKENIZER_CONFIG_NAME,
revision=kwargs.get("revision", None),
revision=revision,
repo_type=kwargs.get("repo_type", None),
token=token,
)

View File

@ -14,6 +14,7 @@
import inspect
import json
import os
import warnings
from dataclasses import asdict, dataclass, field
from typing import Dict, Optional, Union
@ -63,7 +64,7 @@ class PeftConfigMixin(PushToHubMixin):
os.makedirs(save_directory, exist_ok=True)
auto_mapping_dict = kwargs.pop("auto_mapping_dict", None)
output_dict = asdict(self)
output_dict = self.to_dict()
# converting set type to list
for key, value in output_dict.items():
if isinstance(value, set):
@ -97,7 +98,7 @@ class PeftConfigMixin(PushToHubMixin):
# TODO: this hack is needed to fix the following issue (on commit 702f937):
# if someone saves a default config and loads it back with `PeftConfig` class it yields to
# not loading the correct config class.
#
# from peft import AdaLoraConfig, PeftConfig
# peft_config = AdaLoraConfig()
# print(peft_config)
@ -162,6 +163,13 @@ class PeftConfigMixin(PushToHubMixin):
with open(path_json_file) as file:
json_object = json.load(file)
# Sanity check that config does not contain a runtime_config
if "runtime_config" in json_object:
warnings.warn(
"The configuration file contains a `runtime_config` key. This is ignored. Runtime configurations are only valid at runtime."
)
del json_object["runtime_config"]
return json_object
@classmethod
@ -232,7 +240,7 @@ class PeftConfig(PeftConfigMixin):
base_model_name_or_path: Optional[str] = field(
default=None, metadata={"help": "The name of the base model to use."}
)
revision: Optional[str] = field(default=None, metadata={"help": "The specific model version to use."})
revision: Optional[str] = field(default=None, metadata={"help": "The specific base model version to use."})
peft_type: Optional[Union[str, PeftType]] = field(default=None, metadata={"help": "Peft type"})
task_type: Optional[Union[str, TaskType]] = field(default=None, metadata={"help": "Task type"})
inference_mode: bool = field(default=False, metadata={"help": "Whether to use inference mode"})

View File

@ -1,16 +1,30 @@
# Copyright 2023-present the HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import inspect
from copy import deepcopy
from functools import update_wrapper
from types import MethodType
from .peft_model import PeftModel
from .peft_model import PeftConfig, PeftModel
def update_forward_signature(model: PeftModel) -> None:
"""
Args:
Updates the forward signature of the PeftModel to include parents class signature
model (`PeftModel`): Peft model to update the forward signature
Example:
```python
@ -41,9 +55,9 @@ def update_forward_signature(model: PeftModel) -> None:
def update_generate_signature(model: PeftModel) -> None:
"""
Args:
Updates the generate signature of a PeftModel with overriding generate to include parents class signature
model (`PeftModel`): Peft model to update the generate signature
Example:
```python
@ -81,12 +95,12 @@ def update_generate_signature(model: PeftModel) -> None:
def update_signature(model: PeftModel, method: str = "all") -> None:
"""
Args:
Updates the signature of a PeftModel include parents class signature for forward or generate method
model (`PeftModel`): Peft model to update generate or forward signature method (`str`): method to update
signature choose one of "forward", "generate", "all"
Example:
```python
```python
>>> from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
>>> from peft import get_peft_model, LoraConfig, TaskType, update_signature
@ -111,3 +125,24 @@ def update_signature(model: PeftModel, method: str = "all") -> None:
update_generate_signature(model)
else:
raise ValueError(f"method {method} is not supported please choose one of ['forward', 'generate', 'all']")
def check_if_peft_model(model_name_or_path: str) -> bool:
"""
Check if the model is a PEFT model.
Args:
model_name_or_path (`str`):
Model id to check, can be local or on the Hugging Face Hub.
Returns:
`bool`: True if the model is a PEFT model, False otherwise.
"""
is_peft_model = True
try:
PeftConfig.from_pretrained(model_name_or_path)
except Exception:
# allow broad exceptions so that this works even if new exceptions are added on HF Hub side
is_peft_model = False
return is_peft_model

View File

@ -77,3 +77,13 @@ def is_aqlm_available():
@lru_cache
def is_auto_awq_available():
return importlib.util.find_spec("awq") is not None
@lru_cache
def is_eetq_available():
return importlib.util.find_spec("eetq") is not None
@lru_cache
def is_hqq_available():
return importlib.util.find_spec("hqq") is not None

View File

@ -14,10 +14,13 @@
from __future__ import annotations
from typing import TYPE_CHECKING, Any
import warnings
from typing import TYPE_CHECKING, Any, Optional
import torch
from peft.tuners.xlora.model import XLoraModel
from .config import PeftConfig
from .mixed_model import PeftMixedModel
from .peft_model import (
@ -35,8 +38,14 @@ from .tuners import (
AdaptionPromptConfig,
BOFTConfig,
BOFTModel,
FourierFTConfig,
FourierFTModel,
HRAConfig,
HRAModel,
IA3Config,
IA3Model,
LNTuningConfig,
LNTuningModel,
LoHaConfig,
LoHaModel,
LoKrConfig,
@ -53,6 +62,7 @@ from .tuners import (
PromptTuningConfig,
VeraConfig,
VeraModel,
XLoraConfig,
)
from .tuners.tuners_utils import BaseTuner as _BaseTuner
from .utils import _prepare_prompt_learning_config
@ -85,7 +95,11 @@ PEFT_TYPE_TO_CONFIG_MAPPING: dict[str, type[PeftConfig]] = {
"MULTITASK_PROMPT_TUNING": MultitaskPromptTuningConfig,
"OFT": OFTConfig,
"POLY": PolyConfig,
"LN_TUNING": LNTuningConfig,
"VERA": VeraConfig,
"FOURIERFT": FourierFTConfig,
"XLORA": XLoraConfig,
"HRA": HRAConfig,
}
PEFT_TYPE_TO_TUNER_MAPPING: dict[str, type[_BaseTuner]] = {
@ -97,7 +111,11 @@ PEFT_TYPE_TO_TUNER_MAPPING: dict[str, type[_BaseTuner]] = {
"IA3": IA3Model,
"OFT": OFTModel,
"POLY": PolyModel,
"LN_TUNING": LNTuningModel,
"VERA": VeraModel,
"FOURIERFT": FourierFTModel,
"XLORA": XLoraModel,
"HRA": HRAModel,
}
@ -113,7 +131,12 @@ def get_peft_config(config_dict: dict[str, Any]) -> PeftConfig:
def get_peft_model(
model: PreTrainedModel, peft_config: PeftConfig, adapter_name: str = "default", mixed: bool = False
model: PreTrainedModel,
peft_config: PeftConfig,
adapter_name: str = "default",
mixed: bool = False,
autocast_adapter_dtype: bool = True,
revision: Optional[str] = None,
) -> PeftModel | PeftMixedModel:
"""
Returns a Peft model object from a model and a config.
@ -127,6 +150,13 @@ def get_peft_model(
The name of the adapter to be injected, if not provided, the default adapter name is used ("default").
mixed (`bool`, `optional`, defaults to `False`):
Whether to allow mixing different (compatible) adapter types.
autocast_adapter_dtype (`bool`, *optional*):
Whether to autocast the adapter dtype. Defaults to `True`. Right now, this will only cast adapter weights
using float16 or bfloat16 to float32, as this is typically required for stable training, and only affect
select PEFT tuners.
revision (`str`, `optional`, defaults to `main`):
The revision of the base model. If this isn't set, the saved peft model will load the `main` revision for
the base model
"""
model_config = getattr(model, "config", {"model_type": "custom"})
if hasattr(model_config, "to_dict"):
@ -134,15 +164,25 @@ def get_peft_model(
peft_config.base_model_name_or_path = model.__dict__.get("name_or_path", None)
if revision is not None:
if peft_config.revision is not None and peft_config.revision != revision:
warnings.warn(
f"peft config has already set base model revision to {peft_config.revision}, overwriting with revision {revision}"
)
peft_config.revision = revision
if mixed:
# note: PeftMixedModel does not support autocast_adapter_dtype, so don't pass it
return PeftMixedModel(model, peft_config, adapter_name=adapter_name)
if peft_config.task_type not in MODEL_TYPE_TO_PEFT_MODEL_MAPPING.keys() and not peft_config.is_prompt_learning:
return PeftModel(model, peft_config, adapter_name=adapter_name)
return PeftModel(model, peft_config, adapter_name=adapter_name, autocast_adapter_dtype=autocast_adapter_dtype)
if peft_config.is_prompt_learning:
peft_config = _prepare_prompt_learning_config(peft_config, model_config)
return MODEL_TYPE_TO_PEFT_MODEL_MAPPING[peft_config.task_type](model, peft_config, adapter_name=adapter_name)
return MODEL_TYPE_TO_PEFT_MODEL_MAPPING[peft_config.task_type](
model, peft_config, adapter_name=adapter_name, autocast_adapter_dtype=autocast_adapter_dtype
)
def inject_adapter_in_model(

View File

@ -23,8 +23,6 @@ from accelerate.hooks import remove_hook_from_submodules
from torch import nn
from transformers.utils import PushToHubMixin
from peft.tuners.mixed import COMPATIBLE_TUNER_TYPES
from .config import PeftConfig
from .peft_model import PeftModel
from .tuners import (
@ -36,6 +34,7 @@ from .tuners import (
MixedModel,
OFTModel,
)
from .tuners.mixed import COMPATIBLE_TUNER_TYPES
from .utils import PeftType, _set_adapter, _set_trainable
@ -97,8 +96,6 @@ class PeftMixedModel(PushToHubMixin, torch.nn.Module):
Example:
```py
>>> from peft import get_peft_model
>>> base_model = ... # load the base model, e.g. from transformers
>>> peft_model = PeftMixedModel.from_pretrained(base_model, path_to_adapter1, "adapter1").eval()
>>> peft_model.load_adapter(path_to_adapter2, "adapter2")
@ -193,6 +190,8 @@ class PeftMixedModel(PushToHubMixin, torch.nn.Module):
try:
return super().__getattr__(name) # defer to nn.Module's logic
except AttributeError:
if name == "base_model": # see #1892: prevent infinite recursion if class is not initialized
raise
return getattr(self.base_model, name)
def forward(self, *args: Any, **kwargs: Any):
@ -311,6 +310,12 @@ class PeftMixedModel(PushToHubMixin, torch.nn.Module):
"""
return self.base_model.unload(*args, **kwargs)
def get_layer_status(self):
raise TypeError(f"get_layer_status is not supported for {self.__class__.__name__}.")
def get_model_status(self):
raise TypeError(f"get_model_status is not supported for {self.__class__.__name__}.")
@classmethod
def _split_kwargs(cls, kwargs: dict[str, Any]):
return PeftModel._split_kwargs(kwargs)

View File

@ -20,7 +20,8 @@ import os
import warnings
from contextlib import contextmanager
from copy import deepcopy
from typing import Any, Optional, Union
from dataclasses import dataclass
from typing import Any, Literal, Optional, Union
import packaging.version
import torch
@ -28,7 +29,7 @@ import transformers
from accelerate import dispatch_model, infer_auto_device_map
from accelerate.hooks import AlignDevicesHook, add_hook_to_module, remove_hook_from_submodules
from accelerate.utils import get_balanced_memory, named_module_tensors
from huggingface_hub import ModelCard, ModelCardData, hf_hub_download
from huggingface_hub import HfFileSystem, ModelCard, ModelCardData, hf_hub_download
from safetensors import safe_open
from safetensors.torch import save_file as safe_save_file
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
@ -42,7 +43,10 @@ from .tuners import (
AdaLoraModel,
AdaptionPromptModel,
BOFTModel,
FourierFTModel,
HRAModel,
IA3Model,
LNTuningModel,
LoHaModel,
LoKrModel,
LoraModel,
@ -53,8 +57,10 @@ from .tuners import (
PromptEmbedding,
PromptEncoder,
VeraModel,
XLoraConfig,
XLoraModel,
)
from .tuners.tuners_utils import BaseTunerLayer
from .tuners.tuners_utils import BaseTuner, BaseTunerLayer
from .utils import (
SAFETENSORS_WEIGHTS_NAME,
TRANSFORMERS_MODELS_TO_PREFIX_TUNING_POSTPROCESS_MAPPING,
@ -87,7 +93,11 @@ PEFT_TYPE_TO_MODEL_MAPPING = {
PeftType.IA3: IA3Model,
PeftType.OFT: OFTModel,
PeftType.POLY: PolyModel,
PeftType.LN_TUNING: LNTuningModel,
PeftType.VERA: VeraModel,
PeftType.FOURIERFT: FourierFTModel,
PeftType.XLORA: XLoraModel,
PeftType.HRA: HRAModel,
}
@ -99,6 +109,10 @@ class PeftModel(PushToHubMixin, torch.nn.Module):
model ([`~transformers.PreTrainedModel`]): The base transformer model used for Peft.
peft_config ([`PeftConfig`]): The configuration of the Peft model.
adapter_name (`str`, *optional*): The name of the adapter, defaults to `"default"`.
autocast_adapter_dtype (`bool`, *optional*):
Whether to autocast the adapter dtype. Defaults to `True`. Right now, this will only cast adapter weights
from float16 and bfloat16 to float32, as this is typically required for stable training, and it only
affects select PEFT tuners.
**Attributes**:
- **base_model** ([`torch.nn.Module`]) -- The base transformer model used for Peft.
@ -115,7 +129,13 @@ class PeftModel(PushToHubMixin, torch.nn.Module):
in the base model if using [`PromptLearningConfig`].
"""
def __init__(self, model: PreTrainedModel, peft_config: PeftConfig, adapter_name: str = "default") -> None:
def __init__(
self,
model: PreTrainedModel,
peft_config: PeftConfig,
adapter_name: str = "default",
autocast_adapter_dtype: bool = True,
) -> None:
super().__init__()
self.modules_to_save = None
self.active_adapter = adapter_name
@ -135,6 +155,11 @@ class PeftModel(PushToHubMixin, torch.nn.Module):
self.base_model = cls(model, {adapter_name: peft_config}, adapter_name)
self.set_additional_trainable_modules(peft_config, adapter_name)
if hasattr(self.base_model, "_cast_adapter_dtype"):
self.base_model._cast_adapter_dtype(
adapter_name=adapter_name, autocast_adapter_dtype=autocast_adapter_dtype
)
if getattr(model, "is_gradient_checkpointing", True):
model = self._prepare_model_for_gradient_checkpointing(model)
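The `_cast_adapter_dtype` hook called above is what `autocast_adapter_dtype=True` ultimately triggers. In spirit it does something like the following sketch (illustrative only, not the actual PEFT implementation):

```python
import torch

def upcast_adapter_params(adapter_params):
    # illustrative: upcast half-precision adapter weights to float32,
    # which is typically required for numerically stable training
    for param in adapter_params:
        if param.dtype in (torch.float16, torch.bfloat16):
            param.data = param.data.float()
```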
@ -174,6 +199,8 @@ class PeftModel(PushToHubMixin, torch.nn.Module):
selected_adapters: Optional[list[str]] = None,
save_embedding_layers: Union[str, bool] = "auto",
is_main_process: bool = True,
convert_pissa_to_lora: Optional[str] = None,
path_initial_model_for_weight_conversion: Optional[str] = None,
**kwargs: Any,
) -> None:
r"""
@ -196,8 +223,19 @@ class PeftModel(PushToHubMixin, torch.nn.Module):
is_main_process (`bool`, *optional*):
Whether the process calling this is the main process or not. Will default to `True`. Will not save the
checkpoint if not on the main process, which is important for multi device setups (e.g. DDP).
convert_pissa_to_lora (`str`, *optional*):
Deprecated. Use `path_initial_model_for_weight_conversion` instead.
path_initial_model_for_weight_conversion (`str`, *optional*):
The path to the initialized adapter, which is obtained after initializing the model with PiSSA or OLoRA
and before performing any training. When `path_initial_model_for_weight_conversion` is not None, the
difference in adapter before and after fine-tuning is calculated. This difference can be represented as
the parameters of a standard LoRA adapter. Using this converted adapter does not require changes to the
base model, thus conveniently allowing the use of multiple PiSSA or OLoRA adapters with LoRA adapters,
and the activation or deactivation of any adapters. Note that this conversion is not supported if
`rslora` is used in combination with `rank_pattern` or `alpha_pattern`.
kwargs (additional keyword arguments, *optional*):
Additional keyword arguments passed along to the `push_to_hub` method.
"""
if os.path.isfile(save_directory):
raise ValueError(f"Provided path ({save_directory}) should be a directory, not a file")
@ -213,6 +251,50 @@ class PeftModel(PushToHubMixin, torch.nn.Module):
f"You passed an invalid `selected_adapters` arguments, current supported adapter names are"
f" {list(self.peft_config.keys())} - got {selected_adapters}."
)
# TODO: remove deprecated parameter in PEFT v0.14.0
if convert_pissa_to_lora is not None:
warnings.warn(
"`convert_pissa_to_lora` is deprecated and will be removed in a future version. "
"Use `path_initial_model_for_weight_conversion` instead."
)
path_initial_model_for_weight_conversion = convert_pissa_to_lora
def save_mutated_as_lora(peft_config, path_initial_model_for_weight_conversion, output_state_dict, kwargs):
if peft_config.use_rslora and (peft_config.rank_pattern or peft_config.alpha_pattern):
msg = (
"Passing `path_initial_model_for_weight_conversion` to `save_pretrained` is not supported when "
"using `rank_pattern` or `alpha_pattern` at the same time as `use_rslora=True`."
)
raise ValueError(msg)
if not any(
str(peft_config.init_lora_weights).lower().startswith(prefix) for prefix in ["pissa", "olora", "true"]
):
warnings.warn(
"`path_initial_model_for_weight_conversion` only works for converting a PiSSA or OLoRA adapter to "
"a LoRA adapter"
)
initial_adapter_name = os.path.basename(path_initial_model_for_weight_conversion)
try:
self.load_adapter(
os.path.dirname(path_initial_model_for_weight_conversion),
subfolder=initial_adapter_name,
adapter_name=initial_adapter_name,
)
is_pissa = str(self.peft_config[initial_adapter_name].init_lora_weights).lower().startswith("pissa")
is_olora = str(self.peft_config[initial_adapter_name].init_lora_weights).lower() == "olora"
if is_pissa or is_olora:
raise ValueError(
"The `init_lora_weights` parameter of the initial adapter should be set to `True`. "
"Otherwise, `self.load_adapter` will subtract the decomposed values again based on the "
"residual model."
)
output_state_dict = self.base_model.subtract_mutated_init(
output_state_dict, initial_adapter_name, kwargs
)
finally:
self.delete_adapter(initial_adapter_name)
return output_state_dict
if is_main_process:
os.makedirs(save_directory, exist_ok=True)
@ -252,13 +334,24 @@ class PeftModel(PushToHubMixin, torch.nn.Module):
# not supported in safetensors.
for shared_tensor_name in names[1:]:
output_state_dict[shared_tensor_name] = output_state_dict[shared_tensor_name].clone()
if path_initial_model_for_weight_conversion is not None:
peft_config.init_lora_weights = True
peft_config.save_pretrained(path_initial_model_for_weight_conversion)
output_state_dict = save_mutated_as_lora(
peft_config, path_initial_model_for_weight_conversion, output_state_dict, kwargs
)
safe_save_file(
output_state_dict,
os.path.join(output_dir, SAFETENSORS_WEIGHTS_NAME),
metadata={"format": "pt"},
)
elif is_main_process:
if path_initial_model_for_weight_conversion is not None:
peft_config.init_lora_weights = True
peft_config.save_pretrained(path_initial_model_for_weight_conversion)
output_state_dict = save_mutated_as_lora(
peft_config, path_initial_model_for_weight_conversion, output_state_dict, kwargs
)
torch.save(output_state_dict, os.path.join(output_dir, WEIGHTS_NAME))
# save the config and change the inference mode to `True`
@ -286,6 +379,20 @@ class PeftModel(PushToHubMixin, torch.nn.Module):
auto_mapping_dict = None
if is_main_process:
if path_initial_model_for_weight_conversion is not None:
peft_config.init_lora_weights = True
peft_config.r *= 2
if not peft_config.use_rslora:
peft_config.lora_alpha *= 2
else:
# with rslora, scaling = alpha / sqrt(r), so we multiply alpha by sqrt(2) to keep the same scaling when r doubles
peft_config.lora_alpha *= 2**0.5
if peft_config.rank_pattern:
peft_config.rank_pattern = {key: 2 * val for key, val in peft_config.rank_pattern.items()}
if peft_config.alpha_pattern:
peft_config.alpha_pattern = {key: 2 * val for key, val in peft_config.alpha_pattern.items()}
peft_config.save_pretrained(output_dir, auto_mapping_dict=auto_mapping_dict)
peft_config.inference_mode = inference_mode
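The doubling above preserves the effective scaling: the converted adapter represents the weight difference with twice the original rank, so `r` doubles and `alpha` must be adjusted so that `alpha / r` (or `alpha / sqrt(r)` with rslora) stays unchanged. A quick sanity check of that arithmetic:

```python
import math

r, alpha = 8, 16
# vanilla LoRA: scaling = alpha / r, so doubling both leaves it unchanged
assert (2 * alpha) / (2 * r) == alpha / r
# rslora: scaling = alpha / sqrt(r), so alpha must grow by sqrt(2) when r doubles
assert math.isclose((alpha * 2**0.5) / math.sqrt(2 * r), alpha / math.sqrt(r))
```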
@ -297,6 +404,8 @@ class PeftModel(PushToHubMixin, torch.nn.Module):
adapter_name: str = "default",
is_trainable: bool = False,
config: Optional[PeftConfig] = None,
autocast_adapter_dtype: bool = True,
ephemeral_gpu_offload: bool = False,
**kwargs: Any,
) -> PeftModel:
r"""
@ -323,6 +432,16 @@ class PeftModel(PushToHubMixin, torch.nn.Module):
The configuration object to use instead of an automatically loaded configuration. This configuration
object is mutually exclusive with `model_id` and `kwargs`. This is useful when configuration is already
loaded before calling `from_pretrained`.
autocast_adapter_dtype (`bool`, *optional*):
Whether to autocast the adapter dtype. Defaults to `True`. Only relevant for specific adapter types.
ephemeral_gpu_offload (`bool`, *optional*):
Whether to use ephemeral GPU offloading for partially loaded modules. Defaults to `False`. This is
useful when parts of the model and/or components (such as adapters) are kept in CPU memory until they
are needed. Rather than performing expensive operations on small data on the CPU, the data is transferred
to the GPU on demand, the operation(s) are performed, and the results are moved back to CPU memory. This
brings a slight momentary VRAM overhead but gives orders of magnitude speedup in certain cases.
torch_device (`str`, *optional*, defaults to None):
The device to load the adapter on. If `None`, the device will be inferred.
kwargs: (`optional`):
Additional keyword arguments passed along to the specific PEFT configuration class.
"""
@ -345,6 +464,13 @@ class PeftModel(PushToHubMixin, torch.nn.Module):
else:
raise ValueError(f"The input config must be a PeftConfig, got {config.__class__}")
# Runtime configuration, if supported
if hasattr(config, "runtime_config"):
config.runtime_config.ephemeral_gpu_offload = ephemeral_gpu_offload
else:
if ephemeral_gpu_offload:
warnings.warn("Ephemeral GPU offloading is not supported for this model. Ignoring.")
if hasattr(model, "hf_device_map"):
weight_map = dict(named_module_tensors(model, recurse=True))
@ -384,12 +510,42 @@ class PeftModel(PushToHubMixin, torch.nn.Module):
raise ValueError("Cannot set a prompt learning adapter to trainable when loading pretrained adapter.")
else:
config.inference_mode = not is_trainable
if isinstance(getattr(model, "base_model", None), XLoraModel):
if not isinstance(config, XLoraConfig):
raise TypeError(f"Expected 'XLoraConfig', got '{type(config)}' instead.")
if "adapters" in kwargs:
config.adapters = kwargs["adapters"]
else:
# If the path is on HF hub, then we get the adapter names to create a subfolders list which tells
# `load_adapter` where the adapters are.
if not os.path.exists(model_id):
s = HfFileSystem()
# The names of the adapters which must be in folders
adapter_names = [
file["name"][len(model_id) + 1 :] for file in s.ls(model_id) if file["type"] == "directory"
]
# Prepare a dict of adapter paths, which really just point to the hf id; we will use the subfolders
adapter_paths = {}
for adapter_name in adapter_names:
adapter_paths[adapter_name] = os.path.join(model_id, model_id)
config.adapters = adapter_paths
config._subfolders = adapter_names
else:
if "adapters" not in kwargs:
raise ValueError("If model_id is a local path, then `adapters` must be passed in kwargs.")
if config.task_type not in MODEL_TYPE_TO_PEFT_MODEL_MAPPING.keys():
model = cls(model, config, adapter_name)
model = cls(model, config, adapter_name, autocast_adapter_dtype=autocast_adapter_dtype)
else:
model = MODEL_TYPE_TO_PEFT_MODEL_MAPPING[config.task_type](model, config, adapter_name)
model.load_adapter(model_id, adapter_name, is_trainable=is_trainable, **kwargs)
model = MODEL_TYPE_TO_PEFT_MODEL_MAPPING[config.task_type](
model, config, adapter_name, autocast_adapter_dtype=autocast_adapter_dtype
)
model.load_adapter(
model_id, adapter_name, is_trainable=is_trainable, autocast_adapter_dtype=autocast_adapter_dtype, **kwargs
)
return model
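The hub-listing logic above relies on `HfFileSystem` returning fsspec-style entries; a standalone sketch of the same directory discovery (repo id hypothetical):

```python
from huggingface_hub import HfFileSystem

fs = HfFileSystem()
model_id = "user/xlora-model"  # hypothetical repo id
subfolders = [
    entry["name"][len(model_id) + 1 :]  # strip the "<repo_id>/" prefix to get the folder name
    for entry in fs.ls(model_id)
    if entry["type"] == "directory"
]
```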
def _setup_prompt_encoder(self, adapter_name: str):
@ -580,6 +736,8 @@ class PeftModel(PushToHubMixin, torch.nn.Module):
try:
return super().__getattr__(name) # defer to nn.Module's logic
except AttributeError:
if name == "base_model": # see #1892: prevent infinite recursion if class is not initialized
raise
return getattr(self.base_model, name)
@contextmanager
@ -628,24 +786,42 @@ class PeftModel(PushToHubMixin, torch.nn.Module):
... model(inputs)
```
"""
try:
if self.peft_config[self.active_adapter].is_prompt_learning:
if self.peft_config[self.active_adapter].is_prompt_learning:
try:
# TODO: consider replacing this patching of methods with a more robust mechanism: setting a flag and
# letting the underlying methods deal with it, same as how LoRA does it.
old_forward = self.forward
self.forward = self.base_model.forward
old_prepare_inputs_for_generation = self.prepare_inputs_for_generation
self.prepare_inputs_for_generation = self.base_model.prepare_inputs_for_generation
else:
self.base_model.disable_adapter_layers()
yield
finally:
if self.peft_config[self.active_adapter].is_prompt_learning:
yield
finally:
self.forward = old_forward
self.prepare_inputs_for_generation = old_prepare_inputs_for_generation
else:
elif self.peft_config[self.active_adapter].is_adaption_prompt:
try:
self.base_model.disable_adapter_layers()
yield
finally:
self.base_model.enable_adapter_layers()
else: # LoRA, LoHa, etc.
model_status = self.get_model_status()
if model_status.enabled == "irregular":
warnings.warn(
"The model contains some adapter layers that are enabled and others that are disabled. "
"This is most likely unintentional. After exiting the disable_adapter context, all adapters "
"will be enabled"
)
try:
self.base_model.disable_adapter_layers()
yield
finally:
if model_status.enabled is not False:
# model_status.enabled is `True` or `"irregular"`
self.base_model.enable_adapter_layers()
def get_base_model(self) -> torch.nn.Module:
"""
Returns the base model.
@ -709,6 +885,76 @@ class PeftModel(PushToHubMixin, torch.nn.Module):
self.modules_to_save.update(peft_config.modules_to_save)
_set_trainable(self, adapter_name) # this may add a new ModulesToSaveWrapper
def get_layer_status(self) -> list[TunerLayerStatus]:
"""Get the status of each adapter layer in the model.
This method returns a list of `TunerLayerStatus` dataclass instances, each of which contains the following
attributes:
- `name` (`str`):
The name of the adapter layer, e.g. `model.encoder.block.0.layer.0.SelfAttention.q`.
- `module_type` (`str`):
The type of the adapter layer, e.g. `lora.Linear`.
- `enabled` (`bool`):
Whether the adapter layer is enabled.
- `active_adapters` (`list[str]`):
The names of the active adapters, if any, e.g. `["default"]`.
- `merged_adapters` (`list[str]`):
The names of the merged adapters, if any, e.g. `["default"]`.
- `available_adapters` (`list[str]`):
The names of the available adapters, e.g. `["default"]`.
Args:
model ([`~PeftModel`]):
The model to get the adapter layer status from.
Returns:
list[`peft.peft_model.TunerLayerStatus`]:
A list of dataclasses, each containing the status of the corresponding adapter layer.
"""
return get_layer_status(self)
def get_model_status(self) -> TunerModelStatus:
"""Get the status of tuners of the model.
This method returns a `TunerModelStatus` dataclass instance, which contains the following attributes:
- `base_model_type` (`str`):
The type of the base model, e.g. `T5Model`.
- `adapter_model_type` (`str`):
The type of the adapter model, e.g. `LoraModel`.
- `peft_types` (`dict[str, str]`):
The mapping of adapter name to adapter type, e.g. `{"default": "LORA"}`.
- `trainable_params` (`int`):
The number of trainable parameters in the model.
- `total_params` (`int`):
The total number of parameters in the model.
- `num_adapter_layers` (`int`):
The number of adapter layers in the model.
- `enabled` (`bool`, `Literal["irregular"]`):
Whether all adapter layers are enabled. If some are enabled and some are not, this will be `"irregular"`.
This means that your model is in an inconsistent state and might not work as expected.
- `active_adapters` (`list[str]`, `Literal["irregular"]`):
The names of the active adapters. If the active adapters are not consistent across all layers, this will be
`"irregular"`, which means that your model is in an inconsistent state and might not work as expected.
- `merged_adapters` (`list[str]`, `Literal["irregular"]`):
The names of the merged adapters. If the merged adapters are not consistent across all layers, this will be
`"irregular"`, which means that your model is in an inconsistent state and might not work as expected.
- `available_adapters` (`list[str]`):
The names of the available adapters, e.g. `["default"]`.
Args:
model ([`~PeftModel`]):
The model to get the adapter layer status from.
Returns:
`peft.peft_model.TunerModelStatus`:
A dataclass containing the status of the model.
"""
return get_model_status(self)
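For example, a quick consistency check before inference might look like this sketch (the `peft_model` variable is assumed):

```python
# hedged sketch: bail out if adapter layers disagree about their state
status = peft_model.get_model_status()
if status.enabled == "irregular" or status.merged_adapters == "irregular":
    raise RuntimeError("adapter layers are in an inconsistent state")
print(status.peft_types, status.trainable_params, status.total_params)
```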
@classmethod
def _split_kwargs(cls, kwargs: dict[str, Any]):
_kwargs_not_in_hf_hub_download_signature = ("use_auth_token",)
@ -809,6 +1055,8 @@ class PeftModel(PushToHubMixin, torch.nn.Module):
adapter_name: str,
is_trainable: bool = False,
torch_device: Optional[str] = None,
autocast_adapter_dtype: bool = True,
ephemeral_gpu_offload: bool = False,
**kwargs: Any,
):
"""
@ -829,6 +1077,12 @@ class PeftModel(PushToHubMixin, torch.nn.Module):
used for inference.
torch_device (`str`, *optional*, defaults to None):
The device to load the adapter on. If `None`, the device will be inferred.
autocast_adapter_dtype (`bool`, *optional*, defaults to `True`):
Whether to autocast the adapter dtype. Defaults to `True`. Right now, this will only cast adapter
weights from float16 and bfloat16 to float32, as this is typically required for stable training, and it
only affects select PEFT tuners.
ephemeral_gpu_offload (`bool`, *optional*, defaults to `False`):
Whether to use ephemeral GPU offloading for partially loaded modules. Defaults to `False`.
kwargs: (`optional`):
Additional arguments to modify the way the adapter is loaded, e.g. the token for Hugging Face Hub.
"""
@ -847,6 +1101,7 @@ class PeftModel(PushToHubMixin, torch.nn.Module):
)
].from_pretrained(
model_id,
ephemeral_gpu_offload=ephemeral_gpu_offload,
**hf_hub_download_kwargs,
)
if peft_config.is_prompt_learning and is_trainable:
@ -908,6 +1163,11 @@ class PeftModel(PushToHubMixin, torch.nn.Module):
remove_hook_from_submodules(self.prompt_encoder)
add_hook_to_module(self.get_base_model(), hook)
if hasattr(self.base_model, "_cast_adapter_dtype"):
self.base_model._cast_adapter_dtype(
adapter_name=adapter_name, autocast_adapter_dtype=autocast_adapter_dtype
)
# Set model in evaluation mode to deactivate Dropout modules by default
if not is_trainable:
self.eval()
@ -1007,6 +1267,11 @@ class PeftModelForSequenceClassification(PeftModel):
Args:
model ([`~transformers.PreTrainedModel`]): Base transformer model.
peft_config ([`PeftConfig`]): Peft config.
adapter_name (`str`, *optional*): The name of the adapter, defaults to `"default"`.
autocast_adapter_dtype (`bool`, *optional*):
Whether to autocast the adapter dtype. Defaults to `True`. Right now, this will only cast adapter weights
from float16 and bfloat16 to float32, as this is typically required for stable training, and it only
affects select PEFT tuners.
**Attributes**:
- **config** ([`~transformers.PretrainedConfig`]) -- The configuration object of the base model.
@ -1040,8 +1305,10 @@ class PeftModelForSequenceClassification(PeftModel):
```
"""
def __init__(self, model: torch.nn.Module, peft_config: PeftConfig, adapter_name: str = "default") -> None:
super().__init__(model, peft_config, adapter_name)
def __init__(
self, model: torch.nn.Module, peft_config: PeftConfig, adapter_name: str = "default", **kwargs
) -> None:
super().__init__(model, peft_config, adapter_name, **kwargs)
classifier_module_names = ["classifier", "score"]
if self.modules_to_save is None:
@ -1235,7 +1502,11 @@ class PeftModelForCausalLM(PeftModel):
Args:
model ([`~transformers.PreTrainedModel`]): Base transformer model.
peft_config ([`PeftConfig`]): Peft config.
adapter_name (`str`, *optional*): The name of the adapter, defaults to `"default"`.
autocast_adapter_dtype (`bool`, *optional*):
Whether to autocast the adapter dtype. Defaults to `True`. Right now, this will only cast adapter weights
from float16 and bfloat16 to float32, as this is typically required for stable training, and it only
affects select PEFT tuners.
Example:
@ -1265,8 +1536,10 @@ class PeftModelForCausalLM(PeftModel):
```
"""
def __init__(self, model: torch.nn.Module, peft_config: PeftConfig, adapter_name: str = "default") -> None:
super().__init__(model, peft_config, adapter_name)
def __init__(
self, model: torch.nn.Module, peft_config: PeftConfig, adapter_name: str = "default", **kwargs
) -> None:
super().__init__(model, peft_config, adapter_name, **kwargs)
self.base_model_prepare_inputs_for_generation = self.base_model.prepare_inputs_for_generation
def forward(
@ -1393,7 +1666,12 @@ class PeftModelForCausalLM(PeftModel):
# change in the logic of `prepare_inputs_for_generation` makes the below code necessary
# In prompt learning methods, past key values are longer when compared to the `input_ids`.
# As such only consider the last input ids in the autoregressive generation phase.
if model_kwargs["past_key_values"][0][0].shape[-2] >= model_kwargs["input_ids"].shape[1]:
past_key_values = model_kwargs["past_key_values"]
if isinstance(past_key_values, (tuple, list)):
seq_len = past_key_values[0][0].shape[-2]
else: # using transformers kv cache
seq_len = past_key_values.get_seq_length()
if seq_len >= model_kwargs["input_ids"].shape[1]:
model_kwargs["input_ids"] = model_kwargs["input_ids"][:, -1:]
if model_kwargs.get("attention_mask", None) is not None:
@ -1440,7 +1718,11 @@ class PeftModelForSeq2SeqLM(PeftModel):
Args:
model ([`~transformers.PreTrainedModel`]): Base transformer model.
peft_config ([`PeftConfig`]): Peft config.
adapter_name (`str`, *optional*): The name of the adapter, defaults to `"default"`.
autocast_adapter_dtype (`bool`, *optional*):
Whether to autocast the adapter dtype. Defaults to `True`. Right now, this will only cast adapter weights
from float16 and bfloat16 to float32, as this is typically required for stable training, and it only
affects select PEFT tuners.
Example:
@ -1469,8 +1751,10 @@ class PeftModelForSeq2SeqLM(PeftModel):
```
"""
def __init__(self, model: torch.nn.Module, peft_config: PeftConfig, adapter_name: str = "default") -> None:
super().__init__(model, peft_config, adapter_name)
def __init__(
self, model: torch.nn.Module, peft_config: PeftConfig, adapter_name: str = "default", **kwargs
) -> None:
super().__init__(model, peft_config, adapter_name, **kwargs)
self.base_model_prepare_inputs_for_generation = self.base_model.prepare_inputs_for_generation
self.base_model_prepare_encoder_decoder_kwargs_for_generation = (
self.base_model._prepare_encoder_decoder_kwargs_for_generation
@ -1694,6 +1978,11 @@ class PeftModelForTokenClassification(PeftModel):
Args:
model ([`~transformers.PreTrainedModel`]): Base transformer model.
peft_config ([`PeftConfig`]): Peft config.
adapter_name (`str`, *optional*): The name of the adapter, defaults to `"default"`.
autocast_adapter_dtype (`bool`, *optional*):
Whether to autocast the adapter dtype. Defaults to `True`. Right now, this will only cast adapter weights
from float16 and bfloat16 to float32, as this is typically required for stable training, and it only
affects select PEFT tuners.
**Attributes**:
- **config** ([`~transformers.PretrainedConfig`]) -- The configuration object of the base model.
@ -1727,8 +2016,10 @@ class PeftModelForTokenClassification(PeftModel):
```
"""
def __init__(self, model: torch.nn.Module, peft_config: PeftConfig = None, adapter_name: str = "default") -> None:
super().__init__(model, peft_config, adapter_name)
def __init__(
self, model: torch.nn.Module, peft_config: PeftConfig = None, adapter_name: str = "default", **kwargs
) -> None:
super().__init__(model, peft_config, adapter_name, **kwargs)
classifier_module_names = ["classifier", "score"]
if self.modules_to_save is None:
@ -1906,6 +2197,11 @@ class PeftModelForQuestionAnswering(PeftModel):
Args:
model ([`~transformers.PreTrainedModel`]): Base transformer model.
peft_config ([`PeftConfig`]): Peft config.
adapter_name (`str`, *optional*): The name of the adapter, defaults to `"default"`.
autocast_adapter_dtype (`bool`, *optional*):
Whether to autocast the adapter dtype. Defaults to `True`. Right now, this will only cast adapter weights
from float16 and bfloat16 to float32, as this is typically required for stable training, and it only
affects select PEFT tuners.
**Attributes**:
- **config** ([`~transformers.PretrainedConfig`]) -- The configuration object of the base model.
@ -1937,8 +2233,10 @@ class PeftModelForQuestionAnswering(PeftModel):
```
"""
def __init__(self, model: torch.nn.Module, peft_config: PeftConfig, adapter_name: str = "default") -> None:
super().__init__(model, peft_config, adapter_name)
def __init__(
self, model: torch.nn.Module, peft_config: PeftConfig, adapter_name: str = "default", **kwargs
) -> None:
super().__init__(model, peft_config, adapter_name, **kwargs)
qa_module_names = ["qa_outputs"]
if self.modules_to_save is None:
@ -2139,6 +2437,11 @@ class PeftModelForFeatureExtraction(PeftModel):
Args:
model ([`~transformers.PreTrainedModel`]): Base transformer model.
peft_config ([`PeftConfig`]): Peft config.
adapter_name (`str`, *optional*): The name of the adapter, defaults to `"default"`.
autocast_adapter_dtype (`bool`, *optional*):
Whether to autocast the adapter dtype. Defaults to `True`. Right now, this will only cast adapter weights
from float16 and bfloat16 to float32, as this is typically required for stable training, and it only
affects select PEFT tuners.
**Attributes**:
- **config** ([`~transformers.PretrainedConfig`]) -- The configuration object of the base model.
@ -2167,8 +2470,8 @@ class PeftModelForFeatureExtraction(PeftModel):
```
"""
def __init__(self, model: torch.nn.Module, peft_config: PeftConfig, adapter_name: str = "default"):
super().__init__(model, peft_config, adapter_name)
def __init__(self, model: torch.nn.Module, peft_config: PeftConfig, adapter_name: str = "default", **kwargs):
super().__init__(model, peft_config, adapter_name, **kwargs)
def forward(
self,
@ -2229,3 +2532,287 @@ class PeftModelForFeatureExtraction(PeftModel):
prompts = prompts.to(inputs_embeds.dtype)
inputs_embeds = torch.cat((prompts, inputs_embeds), dim=1)
return self.base_model(inputs_embeds=inputs_embeds, **kwargs)
@dataclass
class TunerLayerStatus:
name: str
module_type: str
enabled: bool
active_adapters: list[str]
merged_adapters: list[str]
requires_grad: dict[str, bool | Literal["irregular"]]
available_adapters: list[str]
devices: dict[str, list[str]]
def get_layer_status(model: torch.nn.Module) -> list[TunerLayerStatus]:
"""Get the status of each adapter layer in the model.
This function returns a list of `TunerLayerStatus` dataclass instances, each of which contains the following
attributes:
- `name` (`str`):
The name of the adapter layer, e.g. `model.encoder.block.0.layer.0.SelfAttention.q`.
- `module_type` (`str`):
The type of the adapter layer, e.g. `lora.Linear`.
- `enabled` (`bool`):
Whether the adapter layer is enabled.
- `active_adapters` (`list[str]`):
The names of the active adapters, if any, e.g. `["default"]`.
- `merged_adapters` (`list[str]`):
The names of the merged adapters, if any, e.g. `["default"]`.
- `requires_grad` (`dict[str, bool | Literal["irregular"]]`):
The requires_grad status of the parameters for each adapter module. Ideally, it should be either `True` or
`False`. If the requires_grad status is not consistent across all parameters, the value will be set to
`"irregular"`.
- `available_adapters` (`list[str]`):
The names of the available adapters, e.g. `["default"]`.
- `devices` (`dict[str, list[str]]`):
The devices where the parameters of the given adapter are stored, e.g. `["cuda"]`.
Args:
model ([Union[`~PeftModel`, `~transformers.PreTrainedModel`, `nn.Module`]]):
The model to get the adapter layer status from.
Returns:
list[`peft.peft_model.TunerLayerStatus`]:
A list of dataclasses, each containing the status of the corresponding adapter layer.
"""
if isinstance(model, PeftModel):
base_model = model.base_model
if not isinstance(base_model, BaseTuner):
raise TypeError(
"get_layer_status() got an invalid PeftModel instance; prefix tuning and adaption prompt are not "
"supported."
)
else:
base_model = model
layer_status: list[TunerLayerStatus] = []
for name, module in base_model.named_modules():
if not isinstance(module, BaseTunerLayer):
continue
# determine if all submodules/parameters of this module require grad or not
mapping_requires_grad_list: dict[str, list[bool]] = collections.defaultdict(list)
for adapter_module_name in module.adapter_layer_names:
adapter_module = getattr(module, adapter_module_name)
if isinstance(adapter_module, torch.nn.ModuleDict):
for key, submodule in adapter_module.items():
for param in submodule.parameters():
mapping_requires_grad_list[key].append(param.requires_grad)
elif isinstance(adapter_module, torch.nn.ParameterDict):
for key, param in adapter_module.items():
mapping_requires_grad_list[key].append(param.requires_grad)
else:
# strange, we don't know how to handle this, ignore for now
pass
def check_irregular(vals: list[bool]) -> bool | Literal["irregular"]:
if all(vals):
return True
if not any(vals):
return False
return "irregular"
requires_grad = {key: check_irregular(vals) for key, vals in mapping_requires_grad_list.items()}
devices_dd = collections.defaultdict(list)
for adapter_module_name in module.adapter_layer_names + module.other_param_names:
adapter_module = getattr(module, adapter_module_name)
if isinstance(adapter_module, torch.nn.ModuleDict):
for key, submodule in adapter_module.items():
devices_dd[key].extend([param.device.type for param in submodule.parameters()])
elif (
isinstance(adapter_module, torch.nn.ParameterDict)
or (adapter_module.__class__.__name__ == "BufferDict") # VeRA
):
for key, param in adapter_module.items():
devices_dd[key].append(param.device.type)
devices = {key: sorted(set(val)) for key, val in devices_dd.items()}
status = TunerLayerStatus(
name=name,
module_type=repr(module).partition("(")[0],
enabled=not module.disable_adapters,
active_adapters=module.active_adapters,
merged_adapters=module.merged_adapters,
requires_grad=requires_grad,
available_adapters=sorted(module._get_available_adapters()),
devices=devices,
)
layer_status.append(status)
if not layer_status:
raise ValueError(
"No adapter layers found in the model, please ensure that it's a PEFT model or that you have PEFT adapters "
"injected in the model."
)
return layer_status
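The module-level function can be called directly on a `PeftModel` (or any model with injected PEFT layers); a small inspection sketch, assuming `peft_model` exists:

```python
from peft.peft_model import get_layer_status

for status in get_layer_status(peft_model):
    # e.g. "model.layers.0.self_attn.q_proj lora.Linear True {'default': True}"
    print(status.name, status.module_type, status.enabled, status.requires_grad)
```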
@dataclass
class TunerModelStatus:
base_model_type: str
adapter_model_type: str
peft_types: dict[str, str]
trainable_params: int
total_params: int
num_adapter_layers: int
enabled: bool | Literal["irregular"]
active_adapters: list[str] | Literal["irregular"]
merged_adapters: list[str] | Literal["irregular"]
requires_grad: dict[str, bool | Literal["irregular"]]
available_adapters: list[str]
devices: dict[str, list[str]]
def get_model_status(model: torch.nn.Module) -> TunerModelStatus:
"""Get the status of tuners of the model.
This function returns a `TunerModelStatus` dataclass instance, which contains the following attributes:
- `base_model_type` (`str`):
The type of the base model, e.g. `T5Model`.
- `adapter_model_type` (`str`):
The type of the adapter model, e.g. `LoraModel`.
- `peft_types` (`dict[str, str]`):
The mapping of adapter name to adapter type, e.g. `{"default": "LORA"}`.
- `trainable_params` (`int`):
The number of trainable parameters in the model.
- `total_params` (`int`):
The total number of parameters in the model.
- `num_adapter_layers` (`int`):
The number of adapter layers in the model.
- `enabled` (`bool`, `Literal["irregular"]`):
Whether all adapter layers are enabled. If some are enabled and some are not, this will be `"irregular"`. This
means that your model is in an inconsistent state and might not work as expected.
- `active_adapters` (`list[str]`, `Literal["irregular"]`):
The names of the active adapters. If the active adapters are not consistent across all layers, this will be
`"irregular"`, which means that your model is in an inconsistent state and might not work as expected.
- `merged_adapters` (`list[str]`, `Literal["irregular"]`):
The names of the merged adapters. If the merged adapters are not consistent across all layers, this will be
`"irregular"`, which means that your model is in an inconsistent state and might not work as expected.
- `requires_grad` (`dict[str, bool | Literal["irregular"]]`):
Whether for the given adapter, all adapter layers have `requires_grad` set to `True` or `False`. If there is a
mix, this will be set to `"irregular"`, which means that your model is in an inconsistent state and might not
work as expected.
- `available_adapters` (`list[str]`):
The names of the available adapters, e.g. `["default"]`.
- `devices` (`dict[str, list[str]]`):
The devices where the parameters of the given adapter are stored, e.g. `["cuda"]`.
Args:
model ([Union[`~PeftModel`, `~transformers.PreTrainedModel`, `nn.Module`]]):
The model to get the adapter layer status from.
Returns:
`peft.peft_model.TunerModelStatus`:
A dataclass containing the status of the model.
"""
if isinstance(model, PeftModel):
if not isinstance(model.base_model, BaseTuner):
raise TypeError(
"get_model_status() got an invalid PeftModel instance; prefix tuning and adaption prompt are not "
"supported."
)
base_model_type = model.get_base_model().__class__.__name__
trainable_params, total_params = model.get_nb_trainable_parameters()
base_model = model.base_model
peft_types = {key: str(config.peft_type).partition(".")[-1] for key, config in base_model.peft_config.items()}
adapter_model_type = base_model.__class__.__name__
elif isinstance(model, PreTrainedModel):
base_model_type = model.__class__.__name__
trainable_params, total_params = PeftModel.get_nb_trainable_parameters(model)
base_model = model
peft_types = {}
adapter_model_type = "None"
else:
base_model_type = "other"
trainable_params, total_params = PeftModel.get_nb_trainable_parameters(model)
base_model = model
peft_types = {}
adapter_model_type = "None"
layer_status = get_layer_status(model)
num_adapter_layers = len(layer_status)
enabled_set: set[bool] = {status.enabled for status in layer_status} # must be {True}, {False}, or {True, False}
enabled: bool | Literal["irregular"]
if len(enabled_set) == 1:
enabled = enabled_set.pop()
else:
enabled = "irregular"
available_adapters: list[str] = sorted(set().union(*(status.available_adapters for status in layer_status)))
# ideally, active adapters should be consistent across all layers of the model, but we cannot guarantee it
all_active_adapters: set[tuple[str, ...]] = {tuple(status.active_adapters) for status in layer_status}
active_adapters: list[str] | Literal["irregular"]
if not all_active_adapters:
active_adapters = []
elif len(all_active_adapters) == 1:
active_adapters = list(all_active_adapters.pop())
else:
active_adapters = "irregular"
# Here we determine what adapters are merged. This is not trivial because multiple adapters can be merged or not at
# the same time. Some layers may only have adapter A, some only adapter B, so it's not as easy as just checking
# which adapters are merged on each layer.
# First, determine all adapters that are merged on at least one module.
merged_all: set[str] = set()
for status in layer_status:
merged_all.update(status.merged_adapters)
# Next, check if on any layer, one of these adapters is not merged.
merged_adapters: list[str] | Literal["irregular"] = sorted(merged_all)
for status in layer_status:
unmerged = set(status.available_adapters) - set(status.merged_adapters)
if unmerged & merged_all:
# there is overlap between unmerged adapters and adapters that should be merged
merged_adapters = "irregular"
break
# check status of requires_grad
# first, merge the values for all layers
requires_grad_all: dict[str, list[bool | Literal["irregular"]]] = collections.defaultdict(list)
for status in layer_status:
for key, val in status.requires_grad.items():
requires_grad_all[key].append(val)
# then, check if the values are consistent
def check_irregular(vals: list[bool | Literal["irregular"]]) -> bool | Literal["irregular"]:
if all(val is True for val in vals):
return True
if all(val is False for val in vals):
return False
return "irregular"
requires_grad = {key: check_irregular(vals) for key, vals in requires_grad_all.items()}
devices_dd = collections.defaultdict(list)
for status in layer_status:
for key, val in status.devices.items():
devices_dd[key].extend(val)
devices = {key: sorted(set(val)) for key, val in devices_dd.items()}
adapter_model_status = TunerModelStatus(
base_model_type=base_model_type,
adapter_model_type=adapter_model_type,
peft_types=peft_types,
trainable_params=trainable_params,
total_params=total_params,
num_adapter_layers=num_adapter_layers,
enabled=enabled,
active_adapters=active_adapters,
merged_adapters=merged_adapters,
requires_grad=requires_grad,
available_adapters=available_adapters,
devices=devices,
)
return adapter_model_status

View File

@ -18,7 +18,7 @@
# limitations under the License.
from .adaption_prompt import AdaptionPromptConfig, AdaptionPromptModel
from .lora import LoraConfig, LoraModel, LoftQConfig
from .lora import LoraConfig, LoraModel, LoftQConfig, LoraRuntimeConfig
from .loha import LoHaConfig, LoHaModel
from .lokr import LoKrConfig, LoKrModel
from .ia3 import IA3Config, IA3Model
@ -31,4 +31,8 @@ from .multitask_prompt_tuning import MultitaskPromptEmbedding, MultitaskPromptTu
from .oft import OFTConfig, OFTModel
from .mixed import MixedModel
from .poly import PolyConfig, PolyModel
from .ln_tuning import LNTuningConfig, LNTuningModel
from .vera import VeraConfig, VeraModel
from .fourierft import FourierFTConfig, FourierFTModel
from .xlora import XLoraConfig, XLoraModel
from .hra import HRAConfig, HRAModel

View File

@ -50,3 +50,20 @@ class AdaLoraConfig(LoraConfig):
def __post_init__(self):
self.peft_type = PeftType.ADALORA
if self.use_dora:
raise ValueError(f"{self.peft_type} does not support DoRA.")
if self.loftq_config:
raise ValueError(f"{self.peft_type} does not support LOFTQ.")
self.target_modules = (
set(self.target_modules) if isinstance(self.target_modules, list) else self.target_modules
)
# if target_modules is a regex expression, then layers_to_transform should be None
if isinstance(self.target_modules, str) and self.layers_to_transform is not None:
raise ValueError("`layers_to_transform` cannot be used when `target_modules` is a str.")
# if target_modules is a regex expression, then layers_pattern should be None
if isinstance(self.target_modules, str) and self.layers_pattern is not None:
raise ValueError("`layers_pattern` cannot be used when `target_modules` is a str.")

View File

@ -35,7 +35,8 @@ class AdaLoraLayer(LoraLayer):
# List all names of layers that may contain adapter weights
# Note: ranknum doesn't need to be included as it is not an nn.Module
adapter_layer_names = ("lora_A", "lora_B", "lora_E", "lora_embedding_A", "lora_embedding_B")
# other_param_names is defined in LoraLayer
# All names of other parameters that may contain adapter-related parameters
other_param_names = ("r", "lora_alpha", "scaling", "lora_dropout", "ranknum")
def __init__(self, base_layer: nn.Module) -> None:
super().__init__(base_layer)
@ -72,16 +73,12 @@ class AdaLoraLayer(LoraLayer):
if init_lora_weights:
self.reset_lora_parameters(adapter_name)
if hasattr(self.get_base_layer(), "qweight"):
# QuantLinear
self.to(self.get_base_layer().qweight.device)
else:
self.to(self.get_base_layer().weight.device)
self._move_adapter_to_device_of_base_layer(adapter_name)
self.set_adapter(self.active_adapters)
def reset_lora_parameters(self, adapter_name):
if adapter_name in self.lora_A.keys():
nn.init.normal_(self.lora_E[adapter_name], mean=0.0, std=0.02)
nn.init.zeros_(self.lora_E[adapter_name])
nn.init.normal_(self.lora_A[adapter_name], mean=0.0, std=0.02)
nn.init.normal_(self.lora_B[adapter_name], mean=0.0, std=0.02)

View File

@ -229,6 +229,8 @@ class AdaLoraModel(LoraModel):
try:
return super().__getattr__(name) # defer to nn.Module's logic
except AttributeError:
if name == "model": # see #1892: prevent infinite recursion if class is not initialized
raise
return getattr(self.model, name)
def forward(self, *args, **kwargs):
@ -349,3 +351,7 @@ class AdaLoraModel(LoraModel):
# Pass the function and do forward propagation
else:
return None
def add_weighted_adapter(self, *args, **kwargs):
"""This method is not supported for AdaLoRA, use LoRA instead."""
raise TypeError(f"{self.__class__.__name__} does not support add_weighted_adapter method.")

View File

@ -158,4 +158,6 @@ class AdaptionPromptModel(nn.Module):
except AttributeError:
# This is necessary as e.g. causal models have various methods that we
# don't want to re-implement here.
if name == "model": # see #1892: prevent infinite recursion if class is not initialized
raise
return getattr(self.model, name)

View File

@ -20,41 +20,78 @@ from __future__ import annotations
import math
import os
import warnings
from contextlib import contextmanager
from typing import Any, Optional, Union
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Function
from torch.utils.cpp_extension import load
from peft.tuners.tuners_utils import BaseTunerLayer, check_adapters_to_merge
os.environ["CC"] = "gcc"
os.environ["CXX"] = "gcc"
curr_dir = os.path.dirname(__file__)
_FBD_CUDA = None
# this function is a 1:1 copy from accelerate
@contextmanager
def patch_environment(**kwargs):
"""
A context manager that will add each keyword argument passed to `os.environ` and remove them when exiting.
Will convert the values in `kwargs` to strings and upper-case all the keys.
Example:
```python
>>> import os
>>> from accelerate.utils import patch_environment
>>> with patch_environment(FOO="bar"):
... print(os.environ["FOO"]) # prints "bar"
>>> print(os.environ["FOO"]) # raises KeyError
```
"""
existing_vars = {}
for key, value in kwargs.items():
key = key.upper()
if key in os.environ:
existing_vars[key] = os.environ[key]
os.environ[key] = str(value)
yield
for key in kwargs:
key = key.upper()
if key in existing_vars:
# restore previous value
os.environ[key] = existing_vars[key]
else:
os.environ.pop(key, None)
def get_fbd_cuda():
global _FBD_CUDA
if _FBD_CUDA is not None:
return _FBD_CUDA
# This import initializes cuda context and should thus be local, see issue 1877
from torch.utils.cpp_extension import load
curr_dir = os.path.dirname(__file__)
# need ninja to build the extension
try:
fbd_cuda = load(
name="fbd_cuda",
sources=[f"{curr_dir}/fbd/fbd_cuda.cpp", f"{curr_dir}/fbd/fbd_cuda_kernel.cu"],
verbose=True,
# build_directory='/tmp/' # for debugging
)
# extra_cuda_cflags = ['-std=c++14', '-ccbin=$$(which gcc-7)']) # cuda10.2 is not compatible with gcc9. Specify gcc 7
import fbd_cuda
with patch_environment(CC="gcc", CXX="gcc"):
fbd_cuda = load(
name="fbd_cuda",
sources=[f"{curr_dir}/fbd/fbd_cuda.cpp", f"{curr_dir}/fbd/fbd_cuda_kernel.cu"],
verbose=True,
# build_directory='/tmp/' # for debugging
)
# extra_cuda_cflags = ['-std=c++14', '-ccbin=$$(which gcc-7)']) # cuda10.2 is not compatible with gcc9. Specify gcc 7
import fbd_cuda
except Exception as e:
warnings.warn(f"Failed to load the CUDA extension: {e}, check if ninja is available.")
warnings.warn("Setting boft_n_butterfly_factor to 1 to speed up the finetuning process.")
@ -310,18 +347,11 @@ class BOFTLayer(BaseTunerLayer):
self.reset_boft_parameters(adapter_name, init_weights)
weight = getattr(self, "weight", None)
if weight is not None:
# the layer is already completely initialized, this is an update
if weight.dtype.is_floating_point or weight.dtype.is_complex:
self.to(weight.device, dtype=weight.dtype)
else:
self.to(weight.device)
# set the boft block size and number
self.boft_block_size[adapter_name] = boft_block_size
self.boft_block_num[adapter_name] = boft_block_num
self._move_adapter_to_device_of_base_layer(adapter_name)
self.set_adapter(self.active_adapters)
def reset_boft_parameters(self, adapter_name, init_weights):
@ -544,8 +574,9 @@ class Linear(nn.Module, BOFTLayer):
block_diagonal_butterfly = torch.block_diag(*torch.unbind(orth_rotate_butterfly))
block_diagonal_butterfly = block_diagonal_butterfly.unsqueeze(0)
butterfly_oft_mat_batch = torch.bmm(block_diagonal_butterfly, self.boft_P.permute(0, 2, 1))
butterfly_oft_mat_batch = torch.bmm(self.boft_P, butterfly_oft_mat_batch)
boft_P = self.boft_P.to(block_diagonal_butterfly.device)
butterfly_oft_mat_batch = torch.bmm(block_diagonal_butterfly, boft_P.permute(0, 2, 1))
butterfly_oft_mat_batch = torch.bmm(boft_P, butterfly_oft_mat_batch)
butterfly_oft_mat = butterfly_oft_mat_batch[0]
for i in range(1, butterfly_oft_mat_batch.shape[0]):
@ -585,8 +616,9 @@ class Linear(nn.Module, BOFTLayer):
block_diagonal_butterfly = torch.block_diag(*torch.unbind(orth_rotate_butterfly))
block_diagonal_butterfly = block_diagonal_butterfly.unsqueeze(0)
butterfly_oft_mat_batch = torch.bmm(block_diagonal_butterfly, self.boft_P.permute(0, 2, 1))
butterfly_oft_mat_batch = torch.bmm(self.boft_P, butterfly_oft_mat_batch)
boft_P = self.boft_P.to(block_diagonal_butterfly.device)
butterfly_oft_mat_batch = torch.bmm(block_diagonal_butterfly, boft_P.permute(0, 2, 1))
butterfly_oft_mat_batch = torch.bmm(boft_P, butterfly_oft_mat_batch)
butterfly_oft_mat = butterfly_oft_mat_batch[0]
for i in range(1, butterfly_oft_mat_batch.shape[0]):
@ -742,19 +774,13 @@ class Conv2d(nn.Module, BOFTLayer):
self.reset_boft_parameters(adapter_name, init_weights)
weight = getattr(self, "weight", None)
if weight is not None:
# the layer is already completely initialized, this is an update
if weight.dtype.is_floating_point or weight.dtype.is_complex:
self.to(weight.device, dtype=weight.dtype)
else:
self.to(weight.device)
self.set_adapter(self.active_adapters)
# set the boft block size and number
self.boft_block_size[adapter_name] = boft_block_size
self.boft_block_num[adapter_name] = boft_block_num
self._move_adapter_to_device_of_base_layer(adapter_name)
self.set_adapter(self.active_adapters)
def merge(self, safe_merge: bool = False, adapter_names: Optional[list[str]] = None) -> None:
"""
Merge the active adapter weights into the base weights
@ -860,8 +886,9 @@ class Conv2d(nn.Module, BOFTLayer):
block_diagonal_butterfly = torch.block_diag(*torch.unbind(orth_rotate_butterfly))
block_diagonal_butterfly = block_diagonal_butterfly.unsqueeze(0)
butterfly_oft_mat_batch = torch.bmm(block_diagonal_butterfly, self.boft_P.permute(0, 2, 1))
butterfly_oft_mat_batch = torch.bmm(self.boft_P, butterfly_oft_mat_batch)
boft_P = self.boft_P.to(block_diagonal_butterfly.device)
butterfly_oft_mat_batch = torch.bmm(block_diagonal_butterfly, boft_P.permute(0, 2, 1))
butterfly_oft_mat_batch = torch.bmm(boft_P, butterfly_oft_mat_batch)
butterfly_oft_mat = butterfly_oft_mat_batch[0]
for i in range(1, butterfly_oft_mat_batch.shape[0]):
@ -903,8 +930,9 @@ class Conv2d(nn.Module, BOFTLayer):
block_diagonal_butterfly = torch.block_diag(*torch.unbind(orth_rotate_butterfly))
block_diagonal_butterfly = block_diagonal_butterfly.unsqueeze(0)
butterfly_oft_mat_batch = torch.bmm(block_diagonal_butterfly, self.boft_P.permute(0, 2, 1))
butterfly_oft_mat_batch = torch.bmm(self.boft_P, butterfly_oft_mat_batch)
boft_P = self.boft_P.to(block_diagonal_butterfly.device)
butterfly_oft_mat_batch = torch.bmm(block_diagonal_butterfly, boft_P.permute(0, 2, 1))
butterfly_oft_mat_batch = torch.bmm(boft_P, butterfly_oft_mat_batch)
butterfly_oft_mat = butterfly_oft_mat_batch[0]
for i in range(1, butterfly_oft_mat_batch.shape[0]):

View File

@ -207,6 +207,8 @@ class BOFTModel(BaseTuner):
try:
return super().__getattr__(name) # defer to nn.Module's logic
except AttributeError:
if name == "model": # see #1892: prevent infinite recursion if class is not initialized
raise
return getattr(self.model, name)
def get_peft_config_as_dict(self, inference: bool = False):

View File

@ -0,0 +1,20 @@
# Copyright 2024-present the HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .config import FourierFTConfig
from .layer import FourierFTLayer, FourierFTLinear
from .model import FourierFTModel
__all__ = ["FourierFTConfig", "FourierFTLayer", "FourierFTLinear", "FourierFTModel"]

View File

@ -0,0 +1,188 @@
# Copyright 2024-present the HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import annotations
from dataclasses import dataclass, field
from typing import Optional, Union
from peft.config import PeftConfig
from peft.utils import PeftType
@dataclass
class FourierFTConfig(PeftConfig):
"""
This is the configuration class to store the configuration of a [`FourierFTModel`].
Args:
n_frequency (`int`):
Num of learnable frequencies for the Discrete Fourier Transform. 'n_frequency' is an integer that is
greater than 0 and less than or equal to d^2 (assuming the weight W has dimensions of d by d).
Additionally, it is the number of trainable parameters required to update each delta W weight.
'n_frequency' affects the performance and efficiency of PEFT. Specifically, it has little impact on
training speed, but higher values (typically) result in larger GPU memory costs and better accuracy.
With the same `target_modules`, the number of parameters of LoRA is (2*d*r/n_frequency) times that of
FourierFT. The following examples of settings regarding 'n_frequency' can be used as reference for users.
For NLU tasks with the RoBERTa-large model, adopting 'n_frequency': 1000 can almost achieve similar results
as 'r': 8 in LoRA. At this time, the number of parameters of LoRA is about 16 times that of FourierFT. For
image classification tasks with Vit-large models, adopting 'n_frequency': 3000 can almost achieve similar
results as 'r': 16 in LoRA, where the number of parameters of LoRA is about 11 times that of FourierFT.
scaling (`float`):
The scaling value for the delta W matrix. This is an important hyperparameter used for scaling, similar to
the 'lora_alpha' parameter in the LoRA method. 'scaling' can be determined during the hyperparameter search
process. However, if users want to skip this process, one can refer to the settings in the following
scenarios. This parameter can be set to 100.0 or 150.0 for both RoBERTa-base and RoBERTa-large models
across all NLU (GLUE) tasks. This parameter can be set to 300.0 for both LLaMA family models for all
instruction tuning. This parameter can be set to 300.0 for both ViT-base and ViT-large models across all
image classification tasks.
random_loc_seed (`int`):
Seed for the random location of the frequencies, i.e., the spectral entry matrix.
target_modules (`Union[list[str],str]`):
List of module names or regex expression of the module names to replace with FourierFT. For example, ['q',
'v'] or '.*decoder.*(SelfAttention|EncDecAttention).*(q|v)$'. Only linear layers are supported.
fan_in_fan_out (`bool`):
Set this to True if the layer to replace stores weight like (fan_in, fan_out).
bias (`str`):
Bias type for FourierFT. Can be 'none', 'all' or 'fourier_only'.
modules_to_save (`list[str]`):
List of modules apart from FourierFT layers to be set as trainable and saved in the final checkpoint. For
example, in Sequence Classification or Token Classification tasks, the final layer `classifier/score` are
randomly initialized and as such need to be trainable and saved.
layers_to_transform (`Union[list[int],int]`):
The layer indexes to transform. If this argument is specified, PEFT will transform only the layer indexes
that are specified inside this list. If a single integer is passed, PEFT will transform only the layer at
this index.
layers_pattern (`str`):
The layer pattern name, used only if `layers_to_transform` is different from None and if the layer pattern is
not in the common layers pattern.
n_frequency_pattern (`dict`):
The mapping from layer names or regexp expression to n_frequency which are different from the default
specified. For example, `{model.decoder.layers.0.encoder_attn.k_proj: 1000}`.
init_weights (`bool`):
The initialization of the Fourier weights. Set this to False if the spectrum should be initialized to a
standard normal distribution. Set this to True if the spectrum should be initialized to zeros.
"""
n_frequency: int = field(
default=1000,
metadata={
"help": (
"Num of learnable frequencies for the Discrete Fourier Transform. 'n_frequency' is an integer that is"
"greater than 0 and less than or equal to d^2 (assuming the weight W has dimensions of d by d)."
"Additionally, it is the number of trainable parameters required to update each delta W weight."
"'n_frequency' will affect the performance and efficiency for PEFT. Specifically, it has little impact on"
"training speed, but higher values of it (typically) result in larger GPU memory costs and better accuracy."
"With the same `target_modules`, the number of parameters of LoRA is (2*d*r/n_frequency) times that of FourierFT."
"The following examples of settings regarding 'n_frequency' can be used as reference for users. For NLU"
"tasks with the RoBERTa-large model, adopting 'n_frequency': 1000 can almost achieve similar results as"
"'r': 8 in LoRA. At this time, the number of parameters of LoRA is about 16 times that of FourierFT."
"For image classification tasks with Vit-large models, adopting 'n_frequency': 3000 can almost achieve"
"similar results as 'r': 16 in LoRA, where the number of parameters of LoRA is about 11 times that of FourierFT."
)
},
)
scaling: float = field(
default=150.0,
metadata={
"help": (
"The scaling value for the delta W matrix. This is an important hyperparameter used for scaling, similar to the"
"'lora_alpha' parameter in the LoRA method. 'scaling' can be determined during the hyperparameter search process."
"However, if users want to skip this process, one can refer to the settings in the following scenarios."
"This parameter can be set to 100.0 or 150.0 for both RoBERTa-base and RoBERTa-large models across all NLU (GLUE) tasks."
"This parameter can be set to 300.0 for both LLaMA family models for all instruction tuning."
"This parameter can be set to 300.0 for both ViT-base and ViT-large models across all image classification tasks."
)
},
)
random_loc_seed: Optional[int] = field(
default=777, metadata={"help": "Seed for the random location of the frequencies."}
)
fan_in_fan_out: bool = field(
default=False,
metadata={"help": "Set this to True if the layer to replace stores weight like (fan_in, fan_out)"},
)
target_modules: Optional[Union[list[str], str]] = field(
default=None,
metadata={
"help": (
"List of module names or regex expression of the module names to replace with FourierFT."
"For example, ['q', 'v'] or '.*decoder.*(SelfAttention|EncDecAttention).*(q|v)$'. "
"Only linear layers are supported."
)
},
)
bias: str = field(
default="none", metadata={"help": "Bias type for FourierFT. Can be 'none', 'all' or 'fourier_only'."}
)
modules_to_save: Optional[list[str]] = field(
default=None,
metadata={
"help": (
"List of modules apart from FourierFT layers to be set as trainable and saved in the final checkpoint. For"
" example, in Sequence Classification or Token Classification tasks, the final layer"
" `classifier/score` are randomly initialized and as such need to be trainable and saved."
)
},
)
layers_to_transform: Optional[Union[list[int], int]] = field(
default=None,
metadata={
"help": (
"The layer indexes to transform, is this argument is specified, PEFT will transform only the layers"
" indexes that are specified inside this list. If a single integer is passed, PEFT will transform only"
" the layer at this index."
)
},
)
layers_pattern: Optional[str] = field(
default=None,
metadata={
"help": (
"The layer pattern name, used only if `layers_to_transform` is different to None and if the layer"
" pattern is not in the common layers pattern."
)
},
)
n_frequency_pattern: Optional[dict] = field(
default_factory=dict,
metadata={
"help": (
"The mapping from layer names or regexp expression to n_frequency which are different from the default specified."
"For example, `{model.decoder.layers.0.encoder_attn.k_proj: 500`}."
)
},
)
init_weights: bool = field(
default=False,
metadata={
"help": (
"The initialization of the Fourier weights. Set this to False if the spectrum should be initialized to a standard normal distribution."
"Set this to True if the spectrum should be initialized to zeros."
)
},
)
def __post_init__(self):
self.peft_type = PeftType.FOURIERFT
self.target_modules = (
set(self.target_modules) if isinstance(self.target_modules, list) else self.target_modules
)
# if target_modules is a regex expression, then layers_to_transform should be None
if isinstance(self.target_modules, str) and self.layers_to_transform is not None:
raise ValueError("`layers_to_transform` cannot be used when `target_modules` is a str.")
# if target_modules is a regex expression, then layers_pattern should be None
if isinstance(self.target_modules, str) and self.layers_pattern is not None:
raise ValueError("`layers_pattern` cannot be used when `target_modules` is a str.")

View File

@ -0,0 +1,190 @@
# Copyright 2024-present the HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import warnings
from typing import Any, List, Optional, Union
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers.pytorch_utils import Conv1D
from peft.tuners.tuners_utils import BaseTunerLayer, check_adapters_to_merge
class FourierFTLayer(BaseTunerLayer):
# All names of layers that may contain (trainable) adapter weights
adapter_layer_names = ("fourierft_spectrum",)
# All names of other parameters that may contain adapter-related parameters
other_param_names = ("fourierft_n_frequency", "fourierft_scaling", "fourierft_random_loc_seed")
def __init__(self, base_layer: nn.Module, **kwargs) -> None:
self.base_layer = base_layer
self.fourierft_n_frequency = {}
self.fourierft_scaling = {}
self.fourierft_spectrum = nn.ParameterDict({})
self.indices = {}
self.fourierft_random_loc_seed = {}
# Mark the weight as unmerged
self._disable_adapters = False
self.merged_adapters = []
self.kwargs = kwargs
base_layer = self.get_base_layer()
if isinstance(base_layer, nn.Linear):
self.in_features, self.out_features = base_layer.in_features, base_layer.out_features
elif isinstance(base_layer, Conv1D):
self.in_features, self.out_features = (
base_layer.weight.ds_shape if hasattr(base_layer.weight, "ds_shape") else base_layer.weight.shape
)
else:
raise ValueError(f"Unsupported layer type {type(base_layer)}")
def update_layer(self, adapter_name, n_frequency, scaling, init_weights, random_loc_seed):
if n_frequency <= 0:
raise ValueError(f"`n_frequency` should be a positive integer value but the value passed is {n_frequency}")
if n_frequency > self.in_features * self.out_features:
raise ValueError(
f"`n_frequency` should be less than or equal to the product of the input and output dimensions "
f"but the value passed is {n_frequency} and the product is {self.in_features * self.out_features}"
)
self.fourierft_n_frequency[adapter_name] = n_frequency
self.fourierft_random_loc_seed[adapter_name] = random_loc_seed
self.indices[adapter_name] = torch.randperm(
self.out_features * self.in_features,
generator=torch.Generator().manual_seed(self.fourierft_random_loc_seed[adapter_name]),
)[:n_frequency]
self.indices[adapter_name] = torch.stack(
[self.indices[adapter_name] // self.in_features, self.indices[adapter_name] % self.in_features], dim=0
)
self.fourierft_scaling[adapter_name] = scaling
# Actual trainable parameters
self.fourierft_spectrum[adapter_name] = nn.Parameter(torch.randn(n_frequency), requires_grad=True)
if init_weights:
self.reset_fourier_parameters(adapter_name)
self._move_adapter_to_device_of_base_layer(adapter_name)
self.set_adapter(self.active_adapters)
@torch.no_grad()
def reset_fourier_parameters(self, adapter_name):
if adapter_name in self.fourierft_spectrum.keys():
nn.init.zeros_(self.fourierft_spectrum[adapter_name])
def get_delta_weight(self, adapter) -> torch.Tensor:
spectrum = self.fourierft_spectrum[adapter]
indices = self.indices[adapter].to(spectrum.device)
dense_spectrum = torch.zeros(self.out_features, self.in_features, device=spectrum.device, dtype=spectrum.dtype)
dense_spectrum[indices[0, :], indices[1, :]] = spectrum
delta_weight = torch.fft.ifft2(dense_spectrum).real * self.fourierft_scaling[adapter]
return delta_weight
class FourierFTLinear(nn.Module, FourierFTLayer):
# FourierFT implemented in a dense layer
def __init__(
self,
base_layer,
adapter_name: str,
n_frequency: int = 1000,
scaling: float = 150.0,
fan_in_fan_out: bool = False, # Set this to True if the layer to replace stores weight like (fan_in, fan_out)
init_weights: Union[bool, str] = False,
random_loc_seed: int = 777,
**kwargs,
) -> None:
super().__init__()
FourierFTLayer.__init__(self, base_layer, **kwargs)
self.fan_in_fan_out = fan_in_fan_out
self._active_adapter = adapter_name
self.update_layer(adapter_name, n_frequency, scaling, init_weights, random_loc_seed)
def merge(self, safe_merge: bool = False, adapter_names: Optional[List[str]] = None) -> None:
"""
Merge the active adapter weights into the base weights
Args:
safe_merge (`bool`, *optional*):
If True, the merge operation will be performed in a copy of the original weights and check for NaNs
before merging the weights. This is useful if you want to check if the merge operation will produce
NaNs. Defaults to `False`.
adapter_names (`List[str]`, *optional*):
The list of adapter names that should be merged. If None, all active adapters will be merged. Defaults
to `None`.
"""
adapter_names = check_adapters_to_merge(self, adapter_names)
if not adapter_names:
# no adapter to merge
return
for active_adapter in adapter_names:
if active_adapter in self.fourierft_spectrum.keys():
base_layer = self.get_base_layer()
if safe_merge:
# Note that safe_merge will be slower than the normal merge
# because of the copy operation.
orig_weights = base_layer.weight.data.clone()
orig_weights += self.get_delta_weight(active_adapter)
if not torch.isfinite(orig_weights).all():
raise ValueError(
f"NaNs detected in the merged weights. The adapter {active_adapter} seems to be broken"
)
base_layer.weight.data = orig_weights
else:
base_layer.weight.data += self.get_delta_weight(active_adapter)
self.merged_adapters.append(active_adapter)
def unmerge(self) -> None:
"""
This method unmerges all merged adapter layers from the base weights.
"""
if not self.merged:
warnings.warn("Already unmerged. Nothing to do.")
return
while len(self.merged_adapters) > 0:
active_adapter = self.merged_adapters.pop()
if active_adapter in self.fourierft_spectrum.keys():
self.get_base_layer().weight.data -= self.get_delta_weight(active_adapter)
def get_delta_weight(self, adapter) -> torch.Tensor:
return super().get_delta_weight(adapter)
def forward(self, x: torch.Tensor, *args: Any, **kwargs: Any) -> torch.Tensor:
previous_dtype = x.dtype
if self.disable_adapters:
if self.merged:
self.unmerge()
result = self.base_layer(x, *args, **kwargs)
elif self.merged:
result = self.base_layer(x, *args, **kwargs)
else:
result = self.base_layer(x, *args, **kwargs)
for active_adapter in self.active_adapters:
if active_adapter not in self.fourierft_spectrum.keys():
continue
delta_w = self.get_delta_weight(active_adapter)
x = x.to(delta_w.dtype)
result = result + F.linear(x, delta_w)
result = result.to(previous_dtype)
return result
def __repr__(self) -> str:
rep = super().__repr__()
return "fourierft." + rep

View File

@ -0,0 +1,346 @@
# Copyright 2024-present the HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import annotations
import re
import warnings
from dataclasses import asdict
from enum import Enum
from itertools import chain
from typing import Optional
import torch
from tqdm import tqdm
from transformers.pytorch_utils import Conv1D
from peft.tuners.tuners_utils import BaseTuner, BaseTunerLayer, check_target_module_exists
from peft.utils import (
TRANSFORMERS_MODELS_TO_FOURIERFT_TARGET_MODULES_MAPPING,
ModulesToSaveWrapper,
_get_submodules,
)
from .config import FourierFTConfig
from .layer import FourierFTLayer, FourierFTLinear
class FourierFTModel(BaseTuner):
"""
Creates FourierFT model from a pretrained transformers model.
The method is described in detail in https://arxiv.org/abs/2405.03003.
Args:
model ([`torch.nn.Module`]): The model to be adapted.
config ([`FourierFTConfig`]): The configuration of the FourierFT model.
adapter_name (`str`): The name of the adapter, defaults to `"default"`.
Returns:
`torch.nn.Module`: The FourierFT model.
**Attributes**:
- **model** ([`~transformers.PreTrainedModel`]) -- The model to be adapted.
- **peft_config** ([`FourierFTConfig`]): The configuration of the Fourier model.
"""
prefix: str = "fourierft_"
def __init__(self, model, config, adapter_name) -> None:
super().__init__(model, config, adapter_name)
def _check_new_adapter_config(self, config: FourierFTConfig) -> None:
"""
A helper method to check the config when a new adapter is being added.
Raise a ValueError if there is something wrong with the config or if it conflicts with existing adapters.
"""
# TODO: there should be a check if any of the existing adapters actually has bias != "none", or else the check
# does not fully correspond to the error message.
if (len(self.peft_config) > 1) and (config.bias != "none"):
raise ValueError(
f"{self.__class__.__name__} supports only 1 adapter with bias. When using multiple adapters, "
"set bias to 'none' for all adapters."
)
@staticmethod
def _check_target_module_exists(fourierft_config, key):
return check_target_module_exists(fourierft_config, key)
def _create_and_replace(
self,
fourierft_config,
adapter_name,
target,
target_name,
parent,
current_key,
**optional_kwargs,
):
if current_key is None:
raise ValueError("Current Key shouldn't be `None`")
# Regexp matching - Find key which matches current target_name in patterns provided
pattern_keys = list(chain(fourierft_config.n_frequency_pattern.keys()))
target_name_key = next(filter(lambda key: re.match(rf".*\.{key}$", current_key), pattern_keys), current_key)
n_frequency = fourierft_config.n_frequency_pattern.get(target_name_key, fourierft_config.n_frequency)
scaling = fourierft_config.scaling
random_loc_seed = fourierft_config.random_loc_seed
bias = hasattr(target, "bias") and target.bias is not None
kwargs = {
"n_frequency": n_frequency,
"scaling": scaling,
"fan_in_fan_out": fourierft_config.fan_in_fan_out,
"init_weights": fourierft_config.init_weights,
"random_loc_seed": fourierft_config.random_loc_seed,
}
kwargs["bias"] = bias
if isinstance(target, FourierFTLayer):
target.update_layer(
adapter_name,
n_frequency,
scaling,
fourierft_config.init_weights,
random_loc_seed,
)
else:
new_module = self._create_new_module(fourierft_config, adapter_name, target, **kwargs)
if adapter_name != self.active_adapter:
# adding an additional adapter: it is not automatically trainable
new_module.requires_grad_(False)
self._replace_module(parent, target_name, new_module, target)
def _replace_module(self, parent, child_name, new_module, child):
setattr(parent, child_name, new_module)
# It's not necessary to set requires_grad here, as that is handled by
# _mark_only_adapters_as_trainable
# child layer wraps the original module, unpack it
if hasattr(child, "base_layer"):
child = child.base_layer
if not hasattr(new_module, "base_layer"):
new_module.weight = child.weight
if hasattr(child, "bias"):
new_module.bias = child.bias
if getattr(child, "state", None) is not None:
if hasattr(new_module, "base_layer"):
new_module.base_layer.state = child.state
else:
new_module.state = child.state
new_module.to(child.weight.device)
# dispatch to correct device
for name, module in new_module.named_modules():
if "fourierft_" in name:
module.to(child.weight.device)
def _mark_only_adapters_as_trainable(self, model: torch.nn.Module) -> None:
for n, p in model.named_parameters():
if self.prefix not in n:
p.requires_grad = False
for active_adapter in self.active_adapters:
bias = self.peft_config[active_adapter].bias
if bias == "none":
continue
if bias == "all":
for n, p in model.named_parameters():
if "bias" in n:
p.requires_grad = True
elif bias == "fourier_only":
for m in model.modules():
if isinstance(m, FourierFTLayer) and hasattr(m, "bias") and m.bias is not None:
m.bias.requires_grad = True
else:
raise NotImplementedError(f"Requested bias: {bias}, is not implemented.")
@staticmethod
def _create_new_module(fourierft_config, adapter_name, target, **kwargs):
if isinstance(target, BaseTunerLayer):
target_base_layer = target.get_base_layer()
else:
target_base_layer = target
if isinstance(target_base_layer, torch.nn.Linear):
if kwargs["fan_in_fan_out"]:
warnings.warn(
"fan_in_fan_out is set to True but the target module is `torch.nn.Linear`. "
"Setting fan_in_fan_out to False."
)
kwargs["fan_in_fan_out"] = fourierft_config.fan_in_fan_out = False
elif isinstance(target_base_layer, Conv1D):
kwargs["is_target_conv_1d_layer"] = True
if not kwargs["fan_in_fan_out"]:
warnings.warn(
"fan_in_fan_out is set to False but the target module is `Conv1D`. "
"Setting fan_in_fan_out to True."
)
kwargs["fan_in_fan_out"] = fourierft_config.fan_in_fan_out = True
else:
raise ValueError(
f"Target module {target} is not supported. Currently, only the following modules are supported: "
"`torch.nn.Linear`."
)
new_module = FourierFTLinear(target, adapter_name, **kwargs)
return new_module
def __getattr__(self, name: str):
"""Forward missing attributes to the wrapped module."""
try:
return super().__getattr__(name) # defer to nn.Module's logic
except AttributeError:
if name == "model":
raise
return getattr(self.model, name)
def get_peft_config_as_dict(self, inference: bool = False):
config_dict = {}
for key, value in self.peft_config.items():
config = {k: v.value if isinstance(v, Enum) else v for k, v in asdict(value).items()}
if inference:
config["inference_mode"] = True
config_dict[key] = config
return config_dict
def _set_adapter_layers(self, enabled: bool = True) -> None:
for module in self.model.modules():
if isinstance(module, (BaseTunerLayer, ModulesToSaveWrapper)):
module.enable_adapters(enabled)
def enable_adapter_layers(self) -> None:
"""Enable all adapters.
Call this if you have previously disabled all adapters and want to re-enable them.
"""
self._set_adapter_layers(enabled=True)
def disable_adapter_layers(self) -> None:
"""Disable all adapters.
When disabling all adapters, the model output corresponds to the output of the base model.
"""
for active_adapter in self.active_adapters:
val = self.peft_config[active_adapter].bias
if val != "none":
msg = (
f"Careful, disabling adapter layers with bias configured to be '{val}' does not produce the same "
"output as the the base model would without adaption."
)
warnings.warn(msg)
self._set_adapter_layers(enabled=False)
def set_adapter(self, adapter_name: str | list[str]) -> None:
"""Set the active adapter(s).
Args:
adapter_name (`str` or `list[str]`): Name of the adapter(s) to be activated.
"""
for module in self.model.modules():
if isinstance(module, FourierFTLayer):
if module.merged:
warnings.warn("Adapter cannot be set when the model is merged. Unmerging the model first.")
module.unmerge()
module.set_adapter(adapter_name)
self.active_adapter = adapter_name
@staticmethod
def _prepare_adapter_config(peft_config, model_config):
if peft_config.target_modules is None:
if model_config["model_type"] not in TRANSFORMERS_MODELS_TO_FOURIERFT_TARGET_MODULES_MAPPING:
raise ValueError("Please specify `target_modules` in `peft_config`")
peft_config.target_modules = set(
TRANSFORMERS_MODELS_TO_FOURIERFT_TARGET_MODULES_MAPPING[model_config["model_type"]]
)
return peft_config
def _unload_and_optionally_merge(
self,
merge=True,
progressbar: bool = False,
safe_merge: bool = False,
adapter_names: Optional[list[str]] = None,
):
key_list = [key for key, _ in self.model.named_modules() if self.prefix not in key]
desc = "Unloading " + ("and merging " if merge else "") + "model"
for key in tqdm(key_list, disable=not progressbar, desc=desc):
try:
parent, target, target_name = _get_submodules(self.model, key)
except AttributeError:
continue
if hasattr(target, "base_layer"):
if merge:
target.merge(safe_merge=safe_merge, adapter_names=adapter_names)
self._replace_module(parent, target_name, target.get_base_layer(), target)
elif isinstance(target, ModulesToSaveWrapper):
# save any additional trainable modules part of `modules_to_save`
setattr(parent, target_name, target.modules_to_save[target.active_adapter])
return self.model
def delete_adapter(self, adapter_name: str):
"""
Deletes an existing adapter.
Args:
adapter_name (str): Name of the adapter to be deleted.
"""
if adapter_name not in list(self.peft_config.keys()):
raise ValueError(f"Adapter {adapter_name} does not exist")
del self.peft_config[adapter_name]
# we cannot use self.prefix as we want to include non-trainable fourierft parameters
key_list = [key for key, _ in self.model.named_modules() if "fourierft" not in key]
new_adapter = None
for key in key_list:
_, target, _ = _get_submodules(self.model, key)
if isinstance(target, FourierFTLayer):
target.delete_adapter(adapter_name)
if new_adapter is None:
new_adapter = target.active_adapters[:]
self.active_adapter = new_adapter or []
def merge_and_unload(
self, progressbar: bool = False, safe_merge: bool = False, adapter_names: Optional[list[str]] = None
) -> torch.nn.Module:
r"""
This method merges the Fourier layers into the base model. This is needed if someone wants to use the base
model as a standalone model.
Args:
progressbar (`bool`):
whether to show a progressbar indicating the unload and merge process
safe_merge (`bool`):
whether to activate the safe merging check to check if there is any potential Nan in the adapter
weights
adapter_names (`List[str]`, *optional*):
The list of adapter names that should be merged. If None, all active adapters will be merged. Defaults
to `None`.
"""
return self._unload_and_optionally_merge(
progressbar=progressbar, safe_merge=safe_merge, adapter_names=adapter_names
)
def unload(self) -> torch.nn.Module:
"""
Gets back the base model by removing all the Fourier modules without merging. This gives back the original base
model.
"""
return self._unload_and_optionally_merge(merge=False)
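Assuming a `peft_model` built as in the FourierFT config sketch earlier, the merge workflow exposed above could be exercised like this (illustrative, not part of the diff):

# fold every delta W into the base weights and drop the adapter wrappers
merged = peft_model.merge_and_unload(safe_merge=True, progressbar=True)

# or recover the unmodified base model without merging:
# base = peft_model.unload()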

View File

@ -0,0 +1,20 @@
# Copyright 2024-present the HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .config import HRAConfig
from .layer import HRAConv2d, HRALayer, HRALinear
from .model import HRAModel
__all__ = ["HRAConfig", "HRAModel", "HRAConv2d", "HRALinear", "HRALayer"]

View File

@ -0,0 +1,116 @@
# Copyright 2024-present the HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from dataclasses import dataclass, field
from typing import List, Optional, Union
from peft.config import PeftConfig
from peft.utils import PeftType
@dataclass
class HRAConfig(PeftConfig):
"""
This is the configuration class to store the configuration of a [`HRAModel`].
Args:
r (`int`):
The rank of HRA across different layers. It is best to set 'r' to an even number; otherwise, the default
initialization method will not work.
apply_GS (`bool`):
Whether to apply Gram-Schmidt orthogonalization.
target_modules (`Optional[Union[List[str], str]]`):
The names of the modules to apply the adapter to. If this is specified, only the modules with the specified
names will be replaced. When passing a string, a regex match will be performed. When passing a list of
strings, either an exact match will be performed or it is checked if the name of the module ends with any
of the passed strings. If this is specified as 'all-linear', then all linear modules are chosen, excluding
the output layer. If this is not specified, modules will be chosen according to the model architecture. If
the architecture is not known, an error will be raised -- in this case, you should specify the target
modules manually.
init_weights (`bool`):
Whether to perform initialization of HRA weights.
layers_to_transform (`Union[List[int], int]`):
The layer indices to transform. If a list of ints is passed, it will apply the adapter to the layer indices
that are specified in this list. If a single integer is passed, it will apply the transformations on the
layer at this index.
layers_pattern (`str`):
The layer pattern name, used only if `layers_to_transform` is different from `None`.
rank_pattern (`dict`):
The mapping from layer names or regexp expression to ranks which are different from the default rank
specified by `r`.
modules_to_save (`List[str]`):
List of modules apart from adapter layers to be set as trainable and saved in the final checkpoint.
"""
r: int = field(
default=8,
metadata={
"help": "The rank of HRA across different layers.",
"note": "It is best to set 'r' to an even number; otherwise, the default initialization method will not work.",
},
)
apply_GS: bool = field(
default=False,
metadata={"help": "Whether to apply Gram-Schmidt orthogonalization or not."},
)
target_modules: Optional[Union[List[str], str]] = field(
default=None,
metadata={
"help": "List of module names or regex expression of the module names to replace with HRA.",
"example": "For example, ['q', 'v'] or '.*decoder.*(SelfAttention|EncDecAttention).*(q|v)$' ",
},
)
init_weights: bool = field(
default=True,
metadata={
"help": (
"Whether to initialize the weights of the HRA layers with their default initialization. Don't change "
"this setting, except if you know exactly what you're doing."
),
},
)
layers_to_transform: Optional[Union[List[int], int]] = field(
default=None,
metadata={
"help": "The layer indexes to transform, is this argument is specified, PEFT will transform only the layers indexes that are specified inside this list. If a single integer is passed, PEFT will transform only the layer at this index."
},
)
layers_pattern: Optional[str] = field(
default=None,
metadata={
"help": "The layer pattern name, used only if `layers_to_transform` is different to None and if the layer pattern is not in the common layers pattern."
},
)
bias: str = field(default="none", metadata={"help": "Bias type for HRA. Can be 'none', 'all' or 'hra_only'"})
modules_to_save: Optional[List[str]] = field(
default=None,
metadata={
"help": "List of modules apart from HRA layers to be set as trainable and saved in the final checkpoint. "
"For example, in Sequence Classification or Token Classification tasks, "
"the final layer `classifier/score` are randomly initialized and as such need to be trainable and saved."
},
)
def __post_init__(self):
self.peft_type = PeftType.HRA
self.target_modules = (
set(self.target_modules) if isinstance(self.target_modules, list) else self.target_modules
)
# if target_modules is a regex expression, then layers_to_transform should be None
if isinstance(self.target_modules, str) and self.layers_to_transform is not None:
raise ValueError("`layers_to_transform` cannot be used when `target_modules` is a str.")
# if target_modules is a regex expression, then layers_pattern should be None
if isinstance(self.target_modules, str) and self.layers_pattern is not None:
raise ValueError("`layers_pattern` cannot be used when `target_modules` is a str.")
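A minimal usage sketch for the config above (not part of the diff; the model name and target modules are illustrative). Note that HRA only supports `torch.nn.Linear` and `torch.nn.Conv2d` targets, and `r` should be even so the default symmetric initialization works:

from transformers import AutoModelForCausalLM
from peft import HRAConfig, get_peft_model

base_model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m")
config = HRAConfig(
    task_type="CAUSAL_LM",
    r=8,                                  # even rank, as recommended above
    apply_GS=False,
    target_modules=["q_proj", "v_proj"],  # nn.Linear modules in OPT attention
)
peft_model = get_peft_model(base_model, config)
peft_model.print_trainable_parameters()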

View File

@ -0,0 +1,435 @@
# Copyright 2024-present the HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
import warnings
from typing import Any, List, Optional, Union
import torch
import torch.nn as nn
import torch.nn.functional as F
from peft.tuners.tuners_utils import BaseTunerLayer, check_adapters_to_merge
class HRALayer(BaseTunerLayer):
# All names of layers that may contain (trainable) adapter weights
adapter_layer_names = ("hra_u",)
# All names of other parameters that may contain adapter-related parameters
other_param_names = ("hra_r", "hra_apply_GS")
def __init__(self, base_layer: nn.Module, **kwargs) -> None:
self.base_layer = base_layer
self.hra_r = {}
self.hra_apply_GS = {}
self.hra_u = nn.ParameterDict({})
# Mark the weight as unmerged
self._disable_adapters = False
self.merged_adapters = []
self.kwargs = kwargs
base_layer = self.get_base_layer()
if isinstance(base_layer, nn.Linear):
self.in_features, self.out_features = base_layer.in_features, base_layer.out_features
elif isinstance(base_layer, nn.Conv2d):
self.in_features, self.out_features = base_layer.in_channels, base_layer.out_channels
else:
raise ValueError(f"Unsupported layer type {type(base_layer)}")
def update_layer(
self,
adapter_name: str,
r: int,
apply_GS: bool,
init_weights: bool,
**kwargs,
) -> None:
"""Internal function to create hra adapter
Args:
adapter_name (`str`): Name for the adapter to add.
r (`int`): Rank for the added adapter.
init_weights (`bool`): Whether to initialize weights.
apply_GS (`bool`): Whether to apply Gram-Schmidt orthogonalization or not.
"""
if r <= 0:
raise ValueError(f"`r` should be a positive integer value but the value passed is {r}")
self.hra_r[adapter_name] = r
self.hra_apply_GS[adapter_name] = apply_GS
# Determine shape of HRA weights
base_layer = self.get_base_layer()
if isinstance(base_layer, nn.Linear):
self.hra_u[adapter_name] = nn.Parameter(torch.empty(self.in_features, r), requires_grad=True)
elif isinstance(base_layer, nn.Conv2d):
self.hra_u[adapter_name] = nn.Parameter(
torch.empty(self.in_features * base_layer.kernel_size[0] * base_layer.kernel_size[0], r),
requires_grad=True,
)
else:
raise TypeError(f"HRA is not implemented for base layers of type {type(base_layer).__name__}")
# Initialize weights
if init_weights:
self.reset_hra_parameters(adapter_name)
else:
self.reset_hra_parameters_random(adapter_name)
# Move new weights to device
self._move_adapter_to_device_of_base_layer(adapter_name)
self.set_adapter(self.active_adapters)
def reset_hra_parameters(self, adapter_name: str):
if self.hra_r[adapter_name] % 2 != 0:
warnings.warn("The symmetric initialization can NOT be performed when r is odd!")
nn.init.kaiming_uniform_(self.hra_u[adapter_name], a=math.sqrt(5))
else:
shape = self.hra_u[adapter_name].shape
half_u = torch.zeros(shape[0], shape[1] // 2)
nn.init.kaiming_uniform_(half_u, a=math.sqrt(5))
self.hra_u[adapter_name] = nn.Parameter(torch.repeat_interleave(half_u, 2, dim=1))
def reset_hra_parameters_random(self, adapter_name: str):
nn.init.kaiming_uniform_(self.hra_u[adapter_name], a=math.sqrt(5))
def scale_layer(self, scale: float) -> None:
if scale == 1:
return
for active_adapter in self.active_adapters:
if active_adapter not in self.hra_u.keys():
continue
warnings.warn("Scaling operation for HRA not supported! Automatically set scale to 1.")
def unscale_layer(self, scale=None) -> None:
for active_adapter in self.active_adapters:
if active_adapter not in self.hra_u.keys():
continue
warnings.warn("Unscaling operation for HRA not supported! Keeping scale at 1.")
class HRALinear(nn.Module, HRALayer):
"""
HRA implemented in a dense layer.
"""
def __init__(
self,
base_layer,
adapter_name: str,
r: int = 0,
apply_GS: bool = False,
init_weights: Union[bool, str] = True,
**kwargs,
) -> None:
super().__init__()
HRALayer.__init__(self, base_layer, **kwargs)
self._active_adapter = adapter_name
self.update_layer(adapter_name, r, apply_GS, init_weights, **kwargs)
def merge(self, safe_merge: bool = False, adapter_names: Optional[List[str]] = None) -> None:
"""
Merge the active adapter weights into the base weights
Args:
safe_merge (`bool`, *optional*):
If `True`, the merge operation will be performed in a copy of the original weights and check for NaNs
before merging the weights. This is useful if you want to check if the merge operation will produce
NaNs. Defaults to `False`.
adapter_names (`List[str]`, *optional*):
The list of adapter names that should be merged. If `None`, all active adapters will be merged.
Defaults to `None`.
"""
adapter_names = check_adapters_to_merge(self, adapter_names)
if not adapter_names:
# no adapter to merge
return
for active_adapter in adapter_names:
if active_adapter in self.hra_u.keys():
base_layer = self.get_base_layer()
if safe_merge:
# Note that safe_merge will be slower than the normal merge
# because of the copy operation.
orig_weight = base_layer.weight.data.clone()
delta_weight = self.get_delta_weight(active_adapter)
orig_weight = torch.mm(orig_weight, delta_weight)
if not torch.isfinite(orig_weight).all():
raise ValueError(
f"NaNs detected in the merged weights. The adapter {active_adapter} seems to be broken"
)
self.base_layer.weight.data = orig_weight
else:
delta_weight = self.get_delta_weight(active_adapter)
self.base_layer.weight.data = torch.mm(self.base_layer.weight.data, delta_weight)
self.merged_adapters.append(active_adapter)
def unmerge(self) -> None:
"""
This method unmerges all merged adapter layers from the base weights.
"""
if not self.merged:
warnings.warn("Already unmerged. Nothing to do.")
return
while len(self.merged_adapters) > 0:
active_adapter = self.merged_adapters.pop()
if active_adapter in self.hra_u.keys():
orig_weight = self.get_base_layer().weight.data.clone()
delta_weight = self.get_delta_weight(active_adapter, reverse=True)
self.get_base_layer().weight.data = torch.mm(orig_weight, delta_weight)
def get_delta_weight(self, adapter_name: str, reverse: bool = False) -> torch.Tensor:
rank = self.hra_r[adapter_name]
apply_GS = self.hra_apply_GS[adapter_name]
opt_u = self.hra_u[adapter_name]
shape = opt_u.shape
if apply_GS:
weight = [(opt_u[:, 0] / opt_u[:, 0].norm()).view(-1, 1)]
for i in range(1, rank):
ui = opt_u[:, i].view(-1, 1)
for j in range(i):
ui = ui - (weight[j].t() @ ui) * weight[j]
weight.append((ui / ui.norm()).view(-1, 1))
weight = torch.cat(weight, dim=1)
weight = torch.eye(shape[0], device=opt_u.device, dtype=opt_u.dtype) - 2 * weight @ weight.t()
else:
opt_u = opt_u / opt_u.norm(dim=0)
weight = torch.eye(shape[0], device=opt_u.device, dtype=opt_u.dtype)
if reverse:
indices = range(rank - 1, -1, -1)
else:
indices = range(rank)
for i in indices:
ui = opt_u[:, i].view(-1, 1)
weight = weight @ (torch.eye(shape[0], device=opt_u.device, dtype=opt_u.dtype) - 2 * ui @ ui.t())
return weight
def forward(self, x: torch.Tensor, *args: Any, **kwargs: Any) -> torch.Tensor:
previous_dtype = x.dtype
if self.disable_adapters:
if self.merged:
self.unmerge()
result = self.base_layer(x, *args, **kwargs)
elif self.merged:
result = self.base_layer(x, *args, **kwargs)
else:
new_weight = torch.eye(self.in_features, device=x.device)
for active_adapter in self.active_adapters:
if active_adapter not in self.hra_u.keys():
continue
delta_weight = self.get_delta_weight(active_adapter)
new_weight = torch.mm(new_weight, delta_weight)
x = x.to(self.get_base_layer().weight.data.dtype)
orig_weight = self.get_base_layer().weight.data
new_weight = torch.mm(orig_weight, new_weight)
result = F.linear(input=x, weight=new_weight, bias=self.base_layer.bias)
result = result.to(previous_dtype)
return result
def __repr__(self) -> str:
rep = super().__repr__()
return "hra." + rep
class HRAConv2d(nn.Module, HRALayer):
"""HRA implemented in Conv2d layer"""
def __init__(
self,
base_layer,
adapter_name: str,
r: int = 0,
apply_GS: bool = False,
init_weights: Union[bool, str] = True,
**kwargs,
):
super().__init__()
HRALayer.__init__(self, base_layer)
self._active_adapter = adapter_name
self.update_layer(adapter_name, r, apply_GS, init_weights, **kwargs)
def merge(self, safe_merge: bool = False, adapter_names: Optional[List[str]] = None) -> None:
"""
Merge the active adapter weights into the base weights
Args:
safe_merge (`bool`, *optional*):
If `True`, the merge operation will be performed in a copy of the original weights and check for NaNs
before merging the weights. This is useful if you want to check if the merge operation will produce
NaNs. Defaults to `False`.
adapter_names (`List[str]`, *optional*):
The list of adapter names that should be merged. If `None`, all active adapters will be merged.
Defaults to `None`.
"""
adapter_names = check_adapters_to_merge(self, adapter_names)
if not adapter_names:
# no adapter to merge
return
for active_adapter in adapter_names:
if active_adapter in self.hra_u.keys():
base_layer = self.get_base_layer()
if safe_merge:
# Note that safe_merge will be slower than the normal merge
# because of the copy operation.
orig_weight = base_layer.weight.data.clone()
orig_weight = orig_weight.view(
self.out_features,
self.in_features * self.base_layer.kernel_size[0] * self.base_layer.kernel_size[0],
)
delta_weight = self.get_delta_weight(active_adapter)
orig_weight = torch.mm(orig_weight, delta_weight)
orig_weight = orig_weight.view(
self.out_features,
self.in_features,
self.base_layer.kernel_size[0],
self.base_layer.kernel_size[0],
)
if not torch.isfinite(orig_weight).all():
raise ValueError(
f"NaNs detected in the merged weights. The adapter {active_adapter} seems to be broken"
)
self.base_layer.weight.data = orig_weight
else:
orig_weight = base_layer.weight.data
orig_weight = orig_weight.view(
self.out_features,
self.in_features * self.base_layer.kernel_size[0] * self.base_layer.kernel_size[0],
)
delta_weight = self.get_delta_weight(active_adapter)
orig_weight = torch.mm(orig_weight, delta_weight)
orig_weight = orig_weight.view(
self.out_features,
self.in_features,
self.base_layer.kernel_size[0],
self.base_layer.kernel_size[0],
)
self.base_layer.weight.data = orig_weight
self.merged_adapters.append(active_adapter)
def unmerge(self) -> None:
"""
This method unmerges all merged adapter layers from the base weights.
"""
if not self.merged:
warnings.warn("Already unmerged. Nothing to do.")
return
while len(self.merged_adapters) > 0:
active_adapter = self.merged_adapters.pop()
if active_adapter in self.hra_u.keys():
orig_weight = self.get_base_layer().weight.data.clone()
orig_weight = orig_weight.view(
self.out_features,
self.in_features * self.base_layer.kernel_size[0] * self.base_layer.kernel_size[0],
)
delta_weight = self.get_delta_weight(active_adapter, reverse=True)
orig_weight = torch.mm(orig_weight, delta_weight)
orig_weight = orig_weight.view(
self.out_features, self.in_features, self.base_layer.kernel_size[0], self.base_layer.kernel_size[0]
)
self.get_base_layer().weight.data = orig_weight
def get_delta_weight(self, adapter_name: str, reverse: bool = False) -> torch.Tensor:
rank = self.hra_r[adapter_name]
apply_GS = self.hra_apply_GS[adapter_name]
opt_u = self.hra_u[adapter_name]
shape = opt_u.shape
if apply_GS:
weight = [(opt_u[:, 0] / opt_u[:, 0].norm()).view(-1, 1)]
for i in range(1, rank):
ui = opt_u[:, i].view(-1, 1)
for j in range(i):
ui = ui - (weight[j].t() @ ui) * weight[j]
weight.append((ui / ui.norm()).view(-1, 1))
weight = torch.cat(weight, dim=1)
weight = torch.eye(shape[0], device=opt_u.device, dtype=opt_u.dtype) - 2 * weight @ weight.t()
else:
opt_u = opt_u / opt_u.norm(dim=0)
weight = torch.eye(shape[0], device=opt_u.device, dtype=opt_u.dtype)
if reverse:
indices = range(rank - 1, -1, -1)
else:
indices = range(rank)
for i in indices:
ui = opt_u[:, i].view(-1, 1)
weight = weight @ (torch.eye(shape[0], device=opt_u.device, dtype=opt_u.dtype) - 2 * ui @ ui.t())
return weight
def forward(self, x: torch.Tensor, *args: Any, **kwargs: Any) -> torch.Tensor:
previous_dtype = x.dtype
if self.disable_adapters:
if self.merged:
self.unmerge()
result = self.base_layer(x, *args, **kwargs)
elif self.merged:
result = self.base_layer(x, *args, **kwargs)
else:
new_weight = torch.eye(
self.in_features * self.base_layer.kernel_size[0] * self.base_layer.kernel_size[0], device=x.device
)
for active_adapter in self.active_adapters:
if active_adapter not in self.hra_u.keys():
continue
delta_weight = self.get_delta_weight(active_adapter)
new_weight = torch.mm(new_weight, delta_weight)
x = x.to(self.base_layer.weight.data.dtype)
orig_weight = self.base_layer.weight.data
orig_weight = orig_weight.view(
self.out_features,
self.in_features * self.base_layer.kernel_size[0] * self.base_layer.kernel_size[0],
)
new_weight = torch.mm(orig_weight, new_weight)
new_weight = new_weight.view(
self.out_features, self.in_features, self.base_layer.kernel_size[0], self.base_layer.kernel_size[0]
)
result = F.conv2d(
input=x,
weight=new_weight,
bias=self.base_layer.bias,
padding=self.base_layer.padding[0],
stride=self.base_layer.stride[0],
)
result = result.to(previous_dtype)
return result
def __repr__(self) -> str:
rep = super().__repr__()
return "hra." + rep

View File

@ -0,0 +1,337 @@
# Copyright 2024-present the HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import warnings
from dataclasses import asdict
from enum import Enum
from typing import List, Optional
import torch
from torch import nn
from tqdm import tqdm
from peft.tuners.tuners_utils import BaseTuner, BaseTunerLayer, check_target_module_exists
from peft.utils import (
TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING,
ModulesToSaveWrapper,
_get_submodules,
)
from .config import HRAConfig
from .layer import HRAConv2d, HRALayer, HRALinear
class HRAModel(BaseTuner):
"""
Creates Householder reflection adaptation (HRA) model from a pretrained model. The method is described in
https://arxiv.org/abs/2405.17484
Args:
model (`torch.nn.Module`): The model to which the adapter tuner layers will be attached.
config ([`HRAConfig`]): The configuration of the HRA model.
adapter_name (`str`): The name of the adapter, defaults to `"default"`.
Returns:
`torch.nn.Module`: The HRA model.
Example:
```py
>>> from diffusers import StableDiffusionPipeline
>>> from peft import HRAModel, HRAConfig
>>> config_te = HRAConfig(
... r=8,
... target_modules=["k_proj", "q_proj", "v_proj", "out_proj", "fc1", "fc2"],
... init_weights=True,
... )
>>> config_unet = HRAConfig(
... r=8,
... target_modules=[
... "proj_in",
... "proj_out",
... "to_k",
... "to_q",
... "to_v",
... "to_out.0",
... "ff.net.0.proj",
... "ff.net.2",
... ],
... init_weights=True,
... )
>>> model = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
>>> model.text_encoder = HRAModel(model.text_encoder, config_te, "default")
>>> model.unet = HRAModel(model.unet, config_unet, "default")
```
**Attributes**:
- **model** ([`~torch.nn.Module`]) -- The model to be adapted.
- **peft_config** ([`HRAConfig`]): The configuration of the HRA model.
"""
prefix: str = "hra_"
def _check_new_adapter_config(self, config: HRAConfig) -> None:
"""
A helper method to check the config when a new adapter is being added.
Raise a ValueError if there is something wrong with the config or if it conflicts with existing adapters.
"""
# TODO: there should be a check if any of the existing adapters actually has bias != "none", or else the check
# does not fully correspond to the error message.
if (len(self.peft_config) > 1) and (config.bias != "none"):
raise ValueError(
f"{self.__class__.__name__} supports only 1 adapter with bias. When using multiple adapters, "
"set bias to 'none' for all adapters."
)
@staticmethod
def _check_target_module_exists(hra_config, key):
return check_target_module_exists(hra_config, key)
def _create_and_replace(
self,
hra_config,
adapter_name,
target,
target_name,
parent,
current_key,
**optional_kwargs,
):
if current_key is None:
raise ValueError("Current Key shouldn't be `None`")
bias = hasattr(target, "bias") and target.bias is not None
kwargs = {
"r": hra_config.r,
"apply_GS": hra_config.apply_GS,
"init_weights": hra_config.init_weights,
}
kwargs["bias"] = bias
# If it is not an HRALayer, create a new module; otherwise update it with the new adapter
if not isinstance(target, HRALayer):
new_module = self._create_new_module(hra_config, adapter_name, target, **kwargs)
if adapter_name not in self.active_adapters:
# adding an additional adapter: it is not automatically trainable
new_module.requires_grad_(False)
self._replace_module(parent, target_name, new_module, target)
else:
target.update_layer(
adapter_name,
r=hra_config.r,
apply_GS=hra_config.apply_GS,
init_weights=hra_config.init_weights,
)
def _replace_module(self, parent, child_name, new_module, child):
setattr(parent, child_name, new_module)
# It's not necessary to set requires_grad here, as that is handled by
# _mark_only_adapters_as_trainable
# child layer wraps the original module, unpack it
if hasattr(child, "base_layer"):
child = child.base_layer
if not hasattr(new_module, "base_layer"):
new_module.weight = child.weight
if hasattr(child, "bias"):
new_module.bias = child.bias
if getattr(child, "state", None) is not None:
if hasattr(new_module, "base_layer"):
new_module.base_layer.state = child.state
else:
new_module.state = child.state
new_module.to(child.weight.device)
# dispatch to correct device
for name, module in new_module.named_modules():
if self.prefix in name:
module.to(child.weight.device)
def _mark_only_adapters_as_trainable(self, model: nn.Module) -> None:
for n, p in model.named_parameters():
if self.prefix not in n:
p.requires_grad = False
for active_adapter in self.active_adapters:
bias = self.peft_config[active_adapter].bias
if bias == "none":
continue
if bias == "all":
for n, p in model.named_parameters():
if "bias" in n:
p.requires_grad = True
elif bias == "hra_only":
for name, m in model.named_modules():
if isinstance(m, HRALayer) and hasattr(m, "bias") and m.bias is not None:
m.bias.requires_grad = True
else:
raise NotImplementedError(f"Requested bias: {bias}, is not implemented.")
@staticmethod
def _create_new_module(hra_config, adapter_name, target, **kwargs):
if isinstance(target, BaseTunerLayer):
target_base_layer = target.get_base_layer()
else:
target_base_layer = target
if isinstance(target_base_layer, torch.nn.Linear):
new_module = HRALinear(target, adapter_name, **kwargs)
elif isinstance(target_base_layer, torch.nn.Conv2d):
new_module = HRAConv2d(target, adapter_name, **kwargs)
else:
raise ValueError(
f"Target module {target} is not supported. "
"Currently, only `torch.nn.Linear` and `torch.nn.Conv2d` are supported."
)
return new_module
def __getattr__(self, name: str):
"""Forward missing attributes to the wrapped module."""
try:
return super().__getattr__(name) # defer to nn.Module's logic
except AttributeError:
if name == "base_model":
raise
return getattr(self.model, name)
def get_peft_config_as_dict(self, inference: bool = False):
config_dict = {}
for key, value in self.peft_config.items():
config = {k: v.value if isinstance(v, Enum) else v for k, v in asdict(value).items()}
if inference:
config["inference_mode"] = True
config_dict[key] = config
return config_dict
def _set_adapter_layers(self, enabled=True):
for module in self.model.modules():
if isinstance(module, (BaseTunerLayer, ModulesToSaveWrapper)):
module.enable_adapters(enabled)
def enable_adapter_layers(self):
self._set_adapter_layers(enabled=True)
def disable_adapter_layers(self):
for active_adapter in self.active_adapters:
val = self.peft_config[active_adapter].bias
if val != "none":
msg = (
f"Careful, disabling adapter layers with bias configured to be '{val}' does not produce the same "
"output as the the base model would without adaption."
)
warnings.warn(msg)
self._set_adapter_layers(enabled=False)
def set_adapter(self, adapter_name):
for module in self.model.modules():
if isinstance(module, HRALayer):
if module.merged:
warnings.warn("Adapter cannot be set when the model is merged. Unmerging the model first.")
module.unmerge()
module.set_adapter(adapter_name)
self.active_adapter = adapter_name
@staticmethod
def _prepare_adapter_config(peft_config, model_config):
if peft_config.target_modules is None:
if model_config["model_type"] not in TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING:
raise ValueError("Please specify `target_modules` in `peft_config`")
peft_config.target_modules = set(
TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING[model_config["model_type"]]
)
return peft_config
def _unload_and_optionally_merge(
self,
merge=True,
progressbar: bool = False,
safe_merge: bool = False,
adapter_names: Optional[List[str]] = None,
):
self._unloading_checks(adapter_names)
key_list = [key for key, _ in self.model.named_modules() if self.prefix not in key]
desc = "Unloading " + ("and merging " if merge else "") + "model"
for key in tqdm(key_list, disable=not progressbar, desc=desc):
try:
parent, target, target_name = _get_submodules(self.model, key)
except AttributeError:
continue
if hasattr(target, "base_layer"):
if merge:
target.merge(safe_merge=safe_merge, adapter_names=adapter_names)
self._replace_module(parent, target_name, target.get_base_layer(), target)
elif isinstance(target, ModulesToSaveWrapper):
# save any additional trainable modules part of `modules_to_save`
setattr(parent, target_name, target.modules_to_save[target.active_adapter])
return self.model
def delete_adapter(self, adapter_name: str) -> None:
"""
Deletes an existing adapter.
Args:
adapter_name (str): Name of the adapter to be deleted.
"""
if adapter_name not in list(self.peft_config.keys()):
raise ValueError(f"Adapter {adapter_name} does not exist")
del self.peft_config[adapter_name]
key_list = [key for key, _ in self.model.named_modules() if self.prefix not in key]
new_adapter = None
for key in key_list:
_, target, _ = _get_submodules(self.model, key)
if isinstance(target, HRALayer):
target.delete_adapter(adapter_name)
if new_adapter is None:
new_adapter = target.active_adapters[:]
self.active_adapter = new_adapter or []
def merge_and_unload(
self, progressbar: bool = False, safe_merge: bool = False, adapter_names: Optional[List[str]] = None
) -> torch.nn.Module:
r"""
This method merges the HRA layers into the base model. This is needed if someone wants to use the base model as
a standalone model.
Args:
progressbar (`bool`):
whether to show a progressbar indicating the unload and merge process
safe_merge (`bool`):
whether to activate the safe merging check to check if there is any potential Nan in the adapter
weights
adapter_names (`List[str]`, *optional*):
The list of adapter names that should be merged. If None, all active adapters will be merged. Defaults
to `None`.
"""
return self._unload_and_optionally_merge(
progressbar=progressbar, safe_merge=safe_merge, adapter_names=adapter_names
)
def unload(self) -> torch.nn.Module:
"""
Gets back the base model by removing all the hra modules without merging. This gives back the original base
model.
"""
return self._unload_and_optionally_merge(merge=False)
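Because the update is a product of involutory reflections, merging and then unmerging should round-trip the base weights up to floating-point error. A sketch of that check (illustrative; assumes the `peft_model` from the HRAConfig sketch earlier, and the attribute path is specific to OPT-style models):

import torch

layer = peft_model.base_model.model.model.decoder.layers[0].self_attn.q_proj
before = layer.base_layer.weight.data.clone()
layer.merge()
layer.unmerge()
print(torch.allclose(before, layer.base_layer.weight.data, atol=1e-4))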

View File

@ -61,7 +61,7 @@ class IA3Layer(BaseTunerLayer):
self.ia3_l[adapter_name] = nn.Parameter(weight)
if init_ia3_weights:
self.reset_ia3_parameters(adapter_name)
self.to(self.get_base_layer().weight.device)
self._move_adapter_to_device_of_base_layer(adapter_name)
self.set_adapter(self.active_adapters)
def reset_ia3_parameters(self, adapter_name):
@ -111,6 +111,7 @@ class Linear(nn.Module, IA3Layer):
if active_adapter in self.ia3_l.keys():
base_layer = self.get_base_layer()
ia3_l = transpose(self.ia3_l[active_adapter].data, self.fan_in_fan_out)
orig_dtype = base_layer.weight.data.dtype
if safe_merge:
orig_weights = base_layer.weight.data
orig_weights = torch.mul(orig_weights, ia3_l)
@ -119,13 +120,14 @@ class Linear(nn.Module, IA3Layer):
raise ValueError(
f"NaNs detected in the merged weights. The adapter {active_adapter} seems to be broken"
)
base_layer.weight.data = orig_weights
base_layer.weight.data = orig_weights.to(orig_dtype)
else:
base_layer.weight.data = torch.mul(base_layer.weight.data, ia3_l)
base_layer.weight.data = torch.mul(base_layer.weight.data, ia3_l).to(orig_dtype)
if not self.is_feedforward and (base_layer.bias is not None):
scaling = self.ia3_l[active_adapter].reshape(base_layer.bias.shape)
base_layer.bias.data = torch.mul(base_layer.bias.data, scaling.data)
orig_dtype = base_layer.bias.data.dtype
base_layer.bias.data = torch.mul(base_layer.bias.data, scaling.data).to(orig_dtype)
self.merged_adapters.append(active_adapter)
@ -144,15 +146,16 @@ class Linear(nn.Module, IA3Layer):
base_layer = self.get_base_layer()
# Add tolerance to avoid division by zero
ia3_l = transpose(self.ia3_l[active_adapter].data, self.fan_in_fan_out) + 1e-8
base_layer.weight.data = torch.div(base_layer.weight.data, ia3_l)
orig_dtype = base_layer.weight.data.dtype
base_layer.weight.data = torch.div(base_layer.weight.data, ia3_l).to(orig_dtype)
if not self.is_feedforward and (base_layer.bias is not None):
scaling = self.ia3_l[active_adapter].reshape(base_layer.bias.shape)
base_layer.bias.data = torch.div(base_layer.bias.data, scaling.data + 1e-8)
orig_dtype = base_layer.bias.data.dtype
base_layer.bias.data = torch.div(base_layer.bias.data, scaling.data + 1e-8).to(orig_dtype)
def forward(self, x: torch.Tensor, *args: Any, **kwargs: Any) -> torch.Tensor:
dtype = previous_dtype = x.dtype
if self.disable_adapters:
if self.merged:
self.unmerge()
@ -171,13 +174,13 @@ class Linear(nn.Module, IA3Layer):
x = x.to(dtype)
# TODO: weight.dtype can be != self.ia3_l[self.active_adapters].dtype
# e.g. bf16 vs fp32. Is that okay?
interm = (x * ia3_scaling).to(self.get_base_layer().weight.dtype)
interm = (x * ia3_scaling).to(previous_dtype)
result = self.base_layer(interm, *args, **kwargs)
else:
result = self.base_layer(x, *args, **kwargs)
result = result.to(dtype) * ia3_scaling
result_dtype = result.dtype
result = (result * ia3_scaling).to(result_dtype)
result = result.to(previous_dtype)
return result
@ -207,7 +210,7 @@ class Conv2d(nn.Module, IA3Layer):
self.ia3_l[adapter_name] = nn.Parameter(weight)
if init_ia3_weights:
self.reset_ia3_parameters(adapter_name)
self.to(self.get_base_layer().weight.device)
self._move_adapter_to_device_of_base_layer(adapter_name)
self.set_adapter(self.active_adapters)
def merge(self, safe_merge: bool = False, adapter_names: Optional[List[str]] = None) -> None:
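The dtype-preservation pattern this diff introduces, shown in isolation (illustrative, not part of the diff): multiplying a bf16 base weight by an fp32 (IA)^3 vector promotes the result to fp32, so the merged weight is cast back to the original dtype:

import torch

weight = torch.randn(4, 4, dtype=torch.bfloat16)  # base layer weight
ia3_l = torch.rand(1, 4, dtype=torch.float32)     # learned (IA)^3 scaling vector

orig_dtype = weight.dtype
weight = torch.mul(weight, ia3_l).to(orig_dtype)  # cast back, as in the diff
print(weight.dtype)  # torch.bfloat16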

View File

@ -15,7 +15,7 @@ from __future__ import annotations
import re
import warnings
from dataclasses import asdict
from dataclasses import asdict, replace
from enum import Enum
from typing import Optional
@ -29,6 +29,7 @@ from peft.utils import (
TRANSFORMERS_MODELS_TO_IA3_FEEDFORWARD_MODULES_MAPPING,
TRANSFORMERS_MODELS_TO_IA3_TARGET_MODULES_MAPPING,
ModulesToSaveWrapper,
_freeze_adapter,
_get_submodules,
)
@ -226,6 +227,8 @@ class IA3Model(BaseTuner):
try:
return super().__getattr__(name) # defer to nn.Module's logic
except AttributeError:
if name == "model": # see #1892: prevent infinite recursion if class is not initialized
raise
return getattr(self.model, name)
def get_peft_config_as_dict(self, inference: bool = False):
@ -279,17 +282,20 @@ class IA3Model(BaseTuner):
module.set_adapter(adapter_name)
self.active_adapter = adapter_name
def _prepare_adapter_config(self, peft_config, model_config):
@staticmethod
def _prepare_adapter_config(peft_config, model_config):
if peft_config.target_modules is None:
if model_config["model_type"] not in TRANSFORMERS_MODELS_TO_IA3_TARGET_MODULES_MAPPING:
raise ValueError("Please specify `target_modules` in `peft_config`")
peft_config.target_modules = TRANSFORMERS_MODELS_TO_IA3_TARGET_MODULES_MAPPING[model_config["model_type"]]
peft_config.target_modules = set(
TRANSFORMERS_MODELS_TO_IA3_TARGET_MODULES_MAPPING[model_config["model_type"]]
)
if peft_config.feedforward_modules is None:
if model_config["model_type"] not in TRANSFORMERS_MODELS_TO_IA3_FEEDFORWARD_MODULES_MAPPING:
raise ValueError("Please specify `feedforward_modules` in `peft_config`")
peft_config.feedforward_modules = TRANSFORMERS_MODELS_TO_IA3_FEEDFORWARD_MODULES_MAPPING[
model_config["model_type"]
]
peft_config.feedforward_modules = set(
TRANSFORMERS_MODELS_TO_IA3_FEEDFORWARD_MODULES_MAPPING[model_config["model_type"]]
)
return peft_config
def _unload_and_optionally_merge(
@ -393,3 +399,94 @@ class IA3Model(BaseTuner):
new_adapter = target.active_adapters[:]
self.active_adapter = new_adapter or []
def _check_add_weighted_adapter(self, adapters: list[str]) -> tuple[str, str]:
"""
Helper function to check if the arguments to add_weighted_adapter are valid and compatible with the underlying
model.
"""
# Validate existence of adapters
for adapter in adapters:
if adapter not in self.peft_config:
raise ValueError(f"Adapter {adapter} does not exist")
# Check for conflicting modules_to_save
modules_to_save_wrappers = [module for module in self.modules() if isinstance(module, ModulesToSaveWrapper)]
if any(
sum(adapter in wrapper.modules_to_save for adapter in adapters) > 1 for wrapper in modules_to_save_wrappers
):
raise ValueError("Cannot add weighted adapters targeting the same module with modules_to_save.")
# Ensure all adapters have compatible target and feedforward module types
target_module_types = {type(self.peft_config[adapter].target_modules) for adapter in adapters}
feedforward_module_types = {type(self.peft_config[adapter].feedforward_modules) for adapter in adapters}
if len(target_module_types) > 1 or len(feedforward_module_types) > 1:
raise ValueError("All adapter configs should have the same type for target and feedforward modules.")
# Combine target and feedforward modules
if str in target_module_types:
new_target_modules = "|".join(f"({self.peft_config[adapter].target_modules})" for adapter in adapters)
else:
new_target_modules = set.union(*(self.peft_config[adapter].target_modules for adapter in adapters))
if str in feedforward_module_types:
new_feedforward_modules = "|".join(
f"({self.peft_config[adapter].feedforward_modules})" for adapter in adapters
)
else:
new_feedforward_modules = set.union(
*(self.peft_config[adapter].feedforward_modules for adapter in adapters)
)
return new_target_modules, new_feedforward_modules
def add_weighted_adapter(
self,
adapters: list[str],
weights: list[float],
adapter_name: str,
) -> None:
"""
This method adds a new adapter by merging the given adapters with the given weights.
Args:
adapters (`list`):
List of adapter names to be merged.
weights (`list`):
List of weights for each adapter.
adapter_name (`str`):
Name of the new adapter.
"""
if adapter_name in list(self.peft_config.keys()):
return
new_target_modules, new_feedforward_modules = self._check_add_weighted_adapter(
adapters=adapters,
)
self.peft_config[adapter_name] = replace(
self.peft_config[adapters[0]],
target_modules=new_target_modules,
feedforward_modules=new_feedforward_modules,
)
self.inject_adapter(self.model, adapter_name)
# Do we really need that?
_freeze_adapter(self.model, adapter_name)
key_list = [key for key, _ in self.model.named_modules() if self.prefix not in key]
for key in key_list:
_, target, _ = _get_submodules(self.model, key)
if isinstance(target, IA3Layer):
if adapter_name in target.ia3_l:
target_ia3_l = target.ia3_l[adapter_name]
else:
continue
target_ia3_l.data = target_ia3_l.data.zero_()
for adapter, weight in zip(adapters, weights):
if adapter in target.ia3_l:
current_adapter_ia3_l = target.ia3_l[adapter]
else:
continue
target_ia3_l.data += current_adapter_ia3_l.data * weight
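For illustration, a minimal sketch of calling the method above, assuming `model` is a PeftModel backed by an IA3Model with two adapters already loaded (the adapter names and weights are illustrative):

# Sketch: combine two loaded IA3 adapters into a new adapter that averages
# their (IA)^3 scaling vectors, then activate it.
model.add_weighted_adapter(
    adapters=["sentiment", "topic"],
    weights=[0.5, 0.5],
    adapter_name="sentiment_topic_mix",
)
model.set_adapter("sentiment_topic_mix")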


@ -0,0 +1,19 @@
# Copyright 2024-present the HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .config import LNTuningConfig
from .model import LNTuningModel
__all__ = ["LNTuningConfig", "LNTuningModel"]


@ -0,0 +1,61 @@
# Copyright 2024-present the HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import annotations
from dataclasses import dataclass, field
from typing import Optional, Union
from peft.config import PeftConfig
from peft.utils import PeftType
@dataclass
class LNTuningConfig(PeftConfig):
"""
This is the configuration class to store the configuration of a :class:`~peft.tuners.LNTuningModel`.
Args:
target_modules (`Optional[Union[List[str], str]]`):
List of module names or regex expression of the module names to replace with LNTuning. For example,
'.*decoder.*' or '.*encoder.*'. If this is not specified, modules will be chosen according to the model
architecture. If the architecture is not known, an error will be raised -- in this case, you should specify
the target modules manually.
modules_to_save (`Optional[Union[List[str], str]]`):
List of modules to be set as trainable and saved in the final checkpoint. For example, in Sequence
            Classification or Token Classification tasks, the final layer `classifier/score` is randomly initialized
            and as such needs to be trainable and saved.
"""
target_modules: Optional[Union[list[str], str]] = field(
default=None,
metadata={
"help": (
"List of module names or regex expression of the module names to replace with LNTuning."
"For example, '.*decoder.*' or '.*encoder.*'. "
"If not specified, modules will be chosen according to the model architecture, If the architecture is "
"not known, an error will be raised -- in this case, you shoud specify the target modules manually."
),
},
)
modules_to_save: Optional[Union[list[str], str]] = field(
default=None,
metadata={
"help": "List of modules to be set as trainable and saved in the final checkpoint. "
"For example, in Sequence Classification or Token Classification tasks, "
"the final layer `classifier/score` are randomly initialized and as such need to be trainable and saved."
},
)
def __post_init__(self):
self.peft_type = PeftType.LN_TUNING
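A hedged usage sketch for this config when the base architecture is not covered by the built-in mapping; the regex below is an assumption and must be adapted to the real module names of the model:

from peft import LNTuningConfig, get_peft_model

config = LNTuningConfig(task_type="CAUSAL_LM", target_modules=".*layernorm.*")
peft_model = get_peft_model(base_model, config)  # `base_model` assumed loaded elsewhere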


@ -0,0 +1,117 @@
# Copyright 2024-present the HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import warnings
from copy import deepcopy
from typing import List, Optional
import torch
import torch.nn as nn
from peft.tuners.tuners_utils import BaseTunerLayer, check_adapters_to_merge
class LNTuningLayer(nn.Module, BaseTunerLayer):
"""
    Stores a trainable copy of the wrapped layer per adapter and swaps it in when that adapter is active.
"""
adapter_layer_names = ("ln_tuning_layers",)
def __init__(self, base_layer: nn.Module, adapter_name: str):
super().__init__()
self.base_layer = base_layer
self.ln_tuning_layers = nn.ModuleDict({})
self.update_layer(self.base_layer, adapter_name)
self._active_adapter = adapter_name
self.merged_adapters = []
def update_layer(self, layer: nn.Module, adapter_name: str):
self.ln_tuning_layers[adapter_name] = deepcopy(layer)
def enable_adapters(self, enabled: bool) -> None:
"""Toggle the enabling and disabling of adapters
Takes care of setting the requires_grad flag for the adapter weights.
Args:
enabled (bool): True to enable adapters, False to disable adapters
"""
if enabled:
self.set_adapter(self.active_adapters)
self._disable_adapters = False
else:
if self.merged:
self.unmerge()
# disable grads on all adapter layers
for layer_name in self.adapter_layer_names:
layer = getattr(self, layer_name)
layer.requires_grad_(False)
self._disable_adapters = True
def merge(self, adapter_names: Optional[List[str]] = None):
adapter_names = check_adapters_to_merge(self, adapter_names)
if not adapter_names:
# no adapter to merge
return
if len(adapter_names) > 1:
raise ValueError(
f"Trying to merge {len(adapter_names)} adapters, but LN "
f"tuning does not allow merging more than one adapter at a time"
)
merged_adapters = set(self.merged_adapters)
if merged_adapters:
warnings.warn(f"Already merged with {merged_adapters}. Unmerging first.")
self.unmerge()
self.base_layer, self.ln_tuning_layers[adapter_names[0]] = (
self.ln_tuning_layers[adapter_names[0]],
self.base_layer,
)
self.merged_adapters.append(adapter_names[0])
def unmerge(self):
if not self.merged:
warnings.warn("Already unmerged. Nothing to do.")
return
# popping one element is sufficient because LN
# tuning does not allow merging more than one adapter at a time.
merged_name = self.merged_adapters.pop()
self.base_layer, self.ln_tuning_layers[merged_name] = (
self.ln_tuning_layers[merged_name],
self.base_layer,
)
def forward(self, x: torch.Tensor, *args, **kwargs) -> torch.Tensor:
if self.disable_adapters:
if self.merged:
self.unmerge()
result = self.base_layer(x, *args, **kwargs)
elif self.merged:
result = self.base_layer(x, *args, **kwargs)
else:
if len(self.active_adapters) != 1:
raise ValueError(
f"Trying to run forward with {len(self.active_adapters)} active "
f"adapters, but LN tuning does not allow inference with more than one adapter at a time"
)
active_adapter = self.active_adapters[0]
result = self.ln_tuning_layers[active_adapter](x, *args, **kwargs)
return result
def __repr__(self) -> str:
rep = super().__repr__()
return "ln_tuning." + rep


@ -0,0 +1,203 @@
# Copyright 2024-present the HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import annotations
import warnings
from typing import Optional
from torch import nn
from torch.nn.modules import Module
from tqdm import tqdm
from peft.config import PeftConfig
from peft.tuners.tuners_utils import BaseTuner, _get_submodules, check_target_module_exists
from peft.utils import TRANSFORMERS_MODELS_TO_LNTUNING_TARGET_MODULES_MAPPING, ModulesToSaveWrapper
from .layer import LNTuningLayer
class LNTuningModel(BaseTuner):
"""
Creates LayerNorm tuning from a pretrained transformer model.
The method is described in detail in https://arxiv.org/abs/2312.11420.
Args:
model ([`torch.nn.Module`]): The model to be adapted.
        config ([`LNTuningConfig`]): The configuration of the LNTuning model.
adapter_name (`str`): The name of the adapter, defaults to `"default"`.
Returns:
        `torch.nn.Module`: The adapted model with tuned LayerNorm layers.
Example:
```py
>>> from transformers import AutoModelForCausalLM
>>> from peft import get_peft_model, TaskType, LNTuningConfig
>>> peft_config = LNTuningConfig(
... task_type=TaskType.CAUSAL_LM,
... )
>>> model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf")
>>> model = get_peft_model(model, peft_config)
>>> model.print_trainable_parameters()
```
**Attributes**:
- **model** ([`~transformers.PreTrainedModel`]) -- The model to be adapted.
    - **peft_config** ([`LNTuningConfig`]): The configuration of the LNTuning model.
"""
prefix: str = "ln_tuning_"
def __init__(self, model, config, adapter_name) -> None:
# self.adapter_name = adapter_name
super().__init__(model, config, adapter_name)
def __getattr__(self, name: str):
"""Forward missing attributes to the wrapped module."""
try:
return super().__getattr__(name) # defer to nn.Module's logic
except AttributeError:
if name == "model": # see #1892: prevent infinite recursion if class is not initialized
raise
return getattr(self.model, name)
    # TODO: modules_to_save needs to be handled here rather than target_modules
@staticmethod
def _prepare_adapter_config(peft_config: PeftConfig, model_config: dict) -> PeftConfig:
if peft_config.target_modules is None:
if model_config["model_type"] not in TRANSFORMERS_MODELS_TO_LNTUNING_TARGET_MODULES_MAPPING:
raise ValueError("Please specify `target_modules` in `peft_config`")
peft_config.target_modules = set(
TRANSFORMERS_MODELS_TO_LNTUNING_TARGET_MODULES_MAPPING[model_config["model_type"]]
)
return peft_config
def _create_and_replace(
self,
peft_config: PeftConfig,
adapter_name: str,
target: Module,
target_name: str,
parent: Module,
current_key: str,
) -> None:
        # replace the original module with a new module of the same kind
new_module = self._create_new_module(peft_config, target, adapter_name)
if adapter_name != self.active_adapter:
new_module.requires_grad_(False)
self._replace_module(parent, target_name, new_module, target)
def _create_new_module(
self,
peft_config: PeftConfig,
target: Module,
adapter_name: str,
) -> Module:
if not isinstance(target, LNTuningLayer):
new_module = LNTuningLayer(target, adapter_name)
else:
new_module = target
new_module.update_layer(target.base_layer, adapter_name)
return new_module
def _replace_module(self, parent: Module, child_name: str, new_module: Module, child: Module) -> None:
setattr(parent, child_name, new_module)
if hasattr(child, "base_layer"):
child = child.base_layer
if getattr(child, "state", None) is not None:
if hasattr(new_module, "base_layer"):
new_module.base_layer.state = child.state
else:
new_module.state = child.state
new_module.to(child.weight.device)
for name, module in new_module.named_modules():
weight = child.qweight if hasattr(child, "qweight") else child.weight
module.to(weight.device)
def _mark_only_adapters_as_trainable(self, model: Module):
for n, p in model.named_parameters():
if self.prefix not in n:
p.requires_grad = False
else:
p.requires_grad = True
def _check_target_module_exists(self, peft_config: PeftConfig, key: str) -> bool:
return check_target_module_exists(peft_config, key)
def _set_adapter_layers(self, enabled: bool) -> None:
for module in self.model.modules():
if isinstance(module, (LNTuningLayer, ModulesToSaveWrapper)):
module.enable_adapters(enabled)
def enable_adapter_layers(self) -> None:
"""Enable all adapters.
Call this if you have previously disabled all adapters and want to re-enable them.
"""
self._set_adapter_layers(enabled=True)
def disable_adapter_layers(self) -> None:
"""Disable all adapters.
When disabling all adapters, the model output corresponds to the output of the base model.
"""
self._set_adapter_layers(enabled=False)
def set_adapter(self, adapter_name: str) -> None:
for module in self.model.modules():
if isinstance(module, LNTuningLayer):
if module.merged:
warnings.warn("Adapter cannot be set when the model is merged. Unmerging the model first.")
module.unmerge()
module.set_adapter(adapter_name)
self.active_adapter = adapter_name
def _unload_and_optionally_merge(
self,
merge=True,
progressbar: bool = False,
safe_merge: bool = False,
adapter_names: Optional[list[str]] = None,
):
self._unloading_checks(adapter_names)
key_list = [key for key, _ in self.model.named_modules() if self.prefix not in key]
desc = "Unloading adapters " + ("and merging " if merge else "") + "model"
for key in tqdm(key_list, disable=not progressbar, desc=desc):
try:
parent, target, target_name = _get_submodules(self.model, key)
except AttributeError:
continue
if hasattr(target, "base_layer"):
if merge:
target.merge(adapter_names)
self._replace_module(parent, target_name, target.get_base_layer(), target)
return self.model
def unload(self):
return self._unload_and_optionally_merge(merge=False)
    def merge_and_unload(
        self, progressbar: bool = False, safe_merge: bool = False, adapter_names: Optional[list[str]] = None
    ) -> nn.Module:
        return self._unload_and_optionally_merge(
            merge=True, progressbar=progressbar, safe_merge=safe_merge, adapter_names=adapter_names
        )
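A short sketch of the intended end-to-end flow, continuing the docstring example above (the save path is illustrative); merging permanently swaps the tuned LayerNorm modules into the base model:

merged_model = model.merge_and_unload()
merged_model.save_pretrained("llama-ln-tuned")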


@ -148,13 +148,7 @@ class LoHaLayer(nn.Module, LycorisLayer):
self.reset_adapter_parameters_random(adapter_name)
# Move new weights to device
        self._move_adapter_to_device_of_base_layer(adapter_name)
self.set_adapter(self.active_adapters)
def get_delta_weight(self, adapter_name: str) -> torch.Tensor:


@ -197,13 +197,7 @@ class LoKrLayer(nn.Module, LycorisLayer):
self.reset_adapter_parameters_random(adapter_name)
# Move new weights to device
        self._move_adapter_to_device_of_base_layer(adapter_name)
self.set_adapter(self.active_adapters)
def get_delta_weight(self, adapter_name: str) -> torch.Tensor:


@ -12,15 +12,25 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from peft.import_utils import is_bnb_4bit_available, is_bnb_available, is_eetq_available
from .config import LoftQConfig, LoraConfig, LoraRuntimeConfig
from .gptq import QuantLinear
from .layer import Conv2d, Embedding, Linear, LoraLayer
from .model import LoraModel
__all__ = ["LoraConfig", "LoftQConfig", "Conv2d", "Embedding", "LoraLayer", "Linear", "LoraModel", "QuantLinear"]
__all__ = [
"LoraConfig",
"LoraRuntimeConfig",
"LoftQConfig",
"Conv2d",
"Embedding",
"LoraLayer",
"Linear",
"LoraModel",
"QuantLinear",
]
def __getattr__(name):
@ -34,4 +44,9 @@ def __getattr__(name):
return Linear4bit
if (name == "EetqLoraLinear") and is_eetq_available():
from .eetq import EetqLoraLinear
return EetqLoraLinear
raise AttributeError(f"module {__name__} has no attribute {name}")


@ -98,12 +98,16 @@ if is_bnb_available():
else:
# handle dora
# since output already includes scaling, set it to 1 here
                    weight_norm = (
                        self.lora_magnitude_vector[active_adapter]
                        .get_weight_norm(output, lora_data, scaling=1)
                        .detach()
                    )
# We need to cache weight_norm because it has to be based on the original weights. We
                    # cannot calculate it on the fly based on the merged weights when unmerging because it's a
# different value
self._cache_store(f"{active_adapter}-weight_norm", weight_norm)
                    dora_factor = self.lora_magnitude_vector[active_adapter].weight / weight_norm
w_data = dora_factor.view(-1, 1) * (output + lora_data)
if safe_merge and not torch.isfinite(w_data).all():
@ -144,7 +148,7 @@ if is_bnb_available():
w_data = output.to(lora_data.dtype).to(lora_data.device) - lora_data
else:
weight_norm = self._cache_pop(f"{active_adapter}-weight_norm")
                dora_factor = self.lora_magnitude_vector[active_adapter].weight / weight_norm
w_data = output.data / dora_factor.view(-1, 1) - lora_data
self.get_base_layer().weight = bnb.nn.Int8Params(
@ -233,7 +237,14 @@ if is_bnb_available():
if not self.use_dora[active_adapter]:
output = lora_B(lora_A(dropout(x))) * scaling
else:
                    x = dropout(x)
                    output = self.lora_magnitude_vector[active_adapter](
                        x,
                        lora_A=lora_A,
                        lora_B=lora_B,
                        scaling=scaling,
                        base_layer=self.get_base_layer(),
                    )
if requires_conversion:
output = output.to(expected_dtype)
@ -336,12 +347,16 @@ if is_bnb_4bit_available():
else:
# handle dora
# since output already includes scaling, set it to 1 here
                    weight_norm = (
                        self.lora_magnitude_vector[active_adapter]
                        .get_weight_norm(output, lora_data, scaling=1)
                        .detach()
                    )
# We need to cache weight_norm because it has to be based on the original weights. We
                    # cannot calculate it on the fly based on the merged weights when unmerging because it's a
# different value
self._cache_store(f"{active_adapter}-weight_norm", weight_norm)
                    dora_factor = self.lora_magnitude_vector[active_adapter].weight / weight_norm
w_data = dora_factor.view(-1, 1) * (output + lora_data)
if safe_merge and not torch.isfinite(w_data).all():
@ -350,9 +365,9 @@ if is_bnb_4bit_available():
)
if "bnb_quantized" in kwargs:
kwargs["bnb_quantized"] = False
                kwargs["requires_grad"] = False
                kwargs.pop("data", None)
                self.get_base_layer().weight = bnb.nn.Params4bit(w_data.to("cpu"), **kwargs).to(weight.device)
self.merged_adapters.append(active_adapter)
def unmerge(self) -> None:
@ -380,14 +395,14 @@ if is_bnb_4bit_available():
w_data = output - lora_data
else:
weight_norm = self._cache_pop(f"{active_adapter}-weight_norm")
                dora_factor = self.lora_magnitude_vector[active_adapter].weight / weight_norm
w_data = output.data / dora_factor.view(-1, 1) - lora_data
if "bnb_quantized" in kwargs:
kwargs["bnb_quantized"] = False
            kwargs["requires_grad"] = False
            kwargs.pop("data", None)
            self.get_base_layer().weight = bnb.nn.Params4bit(w_data.to("cpu"), **kwargs).to(weight.device)
def get_delta_weight(self, adapter):
return (
@ -473,7 +488,14 @@ if is_bnb_4bit_available():
if not self.use_dora[active_adapter]:
output = lora_B(lora_A(dropout(x))) * scaling
else:
                    x = dropout(x)
                    output = self.lora_magnitude_vector[active_adapter](
                        x,
                        lora_A=lora_A,
                        lora_B=lora_B,
                        scaling=scaling,
                        base_layer=self.get_base_layer(),
                    )
if requires_conversion:
output = output.to(expected_dtype)


@ -14,13 +14,42 @@
from __future__ import annotations
import warnings
from dataclasses import dataclass, field
from typing import Literal, Optional, Union
from torch import nn
from peft.config import PeftConfig
from peft.utils import PeftType
@dataclass
class LoraRuntimeConfig:
"""
This is the sub-configuration class to store the runtime configurations for the model.
Args:
ephemeral_gpu_offload (`bool`):
Whether to use ephemeral GPU offloading for models partially kept in CPU memory.
"""
ephemeral_gpu_offload: bool = field(
default=False,
metadata={
"help": (
"Whether to use ephemeral GPU offloading for models partially kept in CPU memory. Ephemeral GPU offloading result in "
"the data involved in intense operations being momentarily copied over to the GPU, and the results copied "
"back to CPU. There is a momentary VRAM overhead, but operations are generally orders of magnitude faster "
"compared to performing them on the CPU. This is useful when parts of the model and/or components (such "
"as adapters) are kept in CPU memory until they are needed. Rather than perform expensive operations on "
"small data, the data is transferred to the GPU on-demand, the operation(s) performed, and the results "
"moved back to CPU memory. Currently only affects DoRA initialization."
)
},
)
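    # A hedged usage sketch for the option above (runtime configs are not persisted,
    # see `LoraConfig.to_dict` further down): enable ephemeral GPU offload for a
    # DoRA setup whose adapter weights live in CPU memory, e.g.
    #     LoraConfig(use_dora=True, runtime_config=LoraRuntimeConfig(ephemeral_gpu_offload=True))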
@dataclass
class LoftQConfig:
"""
@ -73,11 +102,19 @@ class LoraConfig(PeftConfig):
Otherwise, it will use the original default value of `lora_alpha/r`.
modules_to_save (`List[str]`):
List of modules apart from adapter layers to be set as trainable and saved in the final checkpoint.
        init_lora_weights (`bool` | `Literal["gaussian", "olora", "pissa", "pissa_niter_[number of iters]", "loftq"]`):
How to initialize the weights of the adapter layers. Passing True (default) results in the default
initialization from the reference implementation from Microsoft. Passing 'gaussian' results in Gaussian
            initialization scaled by the LoRA rank for linear layers. Setting the initialization to False leads to
            completely random initialization and is discouraged. Pass `'loftq'` to use LoftQ initialization. Pass
`'olora'` to use OLoRA initialization. Passing `'pissa'` results in the initialization of <a
href='https://arxiv.org/abs/2404.02948'>Principal Singular values and Singular vectors Adaptation
(PiSSA)</a>, which converges more rapidly than LoRA and ultimately achieves superior performance. Moreover,
PiSSA reduces the quantization error compared to QLoRA, leading to further enhancements. Passing
`'pissa_niter_[number of iters]'` initiates Fast-SVD-based PiSSA initialization, where `[number of iters]`
indicates the number of subspace iterations to perform FSVD, and must be a nonnegative integer. When
`[number of iters]` is set to 16, it can complete the initialization of a 7B model within seconds, and the
training effect is approximately equivalent to using SVD.
layers_to_transform (`Union[List[int], int]`):
The layer indices to transform. If a list of ints is passed, it will apply the adapter to the layer indices
that are specified in this list. If a single integer is passed, it will apply the transformations on the
@ -108,10 +145,12 @@ class LoraConfig(PeftConfig):
ranks. Right now, DoRA only supports linear and Conv2D layers. DoRA introduces a bigger overhead than pure
LoRA, so it is recommended to merge weights for inference. For more information, see
https://arxiv.org/abs/2402.09353.
        layer_replication (`List[Tuple[int, int]]`):
Build a new stack of layers by stacking the original model layers according to the ranges specified. This
allows expanding (or shrinking) the model without duplicating the base model weights. The new layers will
all have separate LoRA adapters attached to them.
runtime_config (`LoraRuntimeConfig`):
Runtime configurations (which are not saved or restored).
"""
r: int = field(default=8, metadata={"help": "Lora attention dimension"})
@ -140,7 +179,7 @@ class LoraConfig(PeftConfig):
default=False,
metadata={
"help": (
"When set to True, uses Rank-Stabilized LoRA doi.org/10.48550/arXiv.2312.03732"
"When set to True, uses <a href='https://doi.org/10.48550/arXiv.2312.03732'>Rank-Stabilized LoRA</a>"
" which sets the adapter scaling factor to `lora_alpha/math.sqrt(r)`, since it"
" was proven to work better. Otherwise, it will use the original default"
" value of `lora_alpha/r`."
@ -155,14 +194,18 @@ class LoraConfig(PeftConfig):
"the final layer `classifier/score` are randomly initialized and as such need to be trainable and saved."
},
)
    init_lora_weights: bool | Literal["gaussian", "olora", "pissa", "pissa_niter_[number of iters]", "loftq"] = field(
default=True,
metadata={
"help": (
"How to initialize the weights of the LoRA layers. Passing True (default) results in the default "
"initialization from the reference implementation from Microsoft. Passing 'gaussian' results "
"How to initialize the weights of the LoRA layers. Passing `'True'` (default) results in the default "
"initialization from the reference implementation from Microsoft. Passing `'gaussian'` results "
"in Gaussian initialization scaled by the LoRA rank for linear and layers. Setting the initialization "
"to False leads to completely random initialization and is discouraged."
"to `'False'` leads to completely random initialization and *is discouraged.*"
"Passing `'olora'` results in OLoRA initialization."
"Passing `'pissa'` results in PiSSA initialization."
"Passing `'pissa_niter_[number of iters]'` initiates Fast-SVD-based PiSSA initialization, "
"where [number of iters] indicates the number of subspace iterations to perform fsvd, and must be a nonnegative integer."
"Pass `'loftq'` to use LoftQ initialization"
),
},
@ -239,12 +282,11 @@ class LoraConfig(PeftConfig):
default=False,
metadata={
"help": (
"Enable 'Weight-Decomposed Low-Rank Adaptation' (DoRA). This technique decomposes the updates of the "
"Enable <a href='https://arxiv.org/abs/2402.09353'>'Weight-Decomposed Low-Rank Adaptation' (DoRA)</a>. This technique decomposes the updates of the "
"weights into two parts, magnitude and direction. Direction is handled by normal LoRA, whereas the "
"magnitude is handled by a separate learnable parameter. This can improve the performance of LoRA, "
"especially at low ranks. Right now, DoRA only supports linear and Conv2D layers. DoRA introduces a bigger"
"overhead than pure LoRA, so it is recommended to merge weights for inference. For more information, "
"see https://arxiv.org/abs/2402.09353."
"overhead than pure LoRA, so it is recommended to merge weights for inference."
)
},
)
@ -268,6 +310,17 @@ class LoraConfig(PeftConfig):
)
},
)
runtime_config: LoraRuntimeConfig = field(
default_factory=LoraRuntimeConfig, metadata={"help": "Runtime configurations"}
)
def to_dict(self):
"""
Returns the configuration for your adapter model as a dictionary. Removes runtime configurations.
"""
rv = super().to_dict()
rv.pop("runtime_config")
return rv
def __post_init__(self):
self.peft_type = PeftType.LORA
@ -294,6 +347,47 @@ class LoraConfig(PeftConfig):
if self.loftq_config is None:
raise ValueError("`loftq_config` must be specified when `init_lora_weights` is 'loftq'.")
# Using post training conversion of modified base weights to restore their initial values (PiSSA, OLoRA) cannot
# be correctly done when using rslora + rank_pattern/alpha_pattern. We can't really know if the user intends
# this when they'll eventually call save_pretrained (i.e. if they'll pass
        # path_initial_model_for_weight_conversion). Therefore, we only warn but don't raise an error here.
if (
self.use_rslora
and (self.rank_pattern or self.alpha_pattern)
and (
(isinstance(self.init_lora_weights, str) and (self.init_lora_weights.startswith("pissa")))
or (self.init_lora_weights == "olora")
)
):
msg = (
"Using Rank-Stabilized LoRA with rank_pattern/alpha_pattern and post-training conversion of modified "
"base weights (PiSSA, OLoRA) means that you won't be able to pass "
"`path_initial_model_for_weight_conversion` to `save_pretrained` to restore the initial values of the "
"base weights; if you intend to do this, please ensure not to use rslora or rank_pattern/alpha_pattern."
)
warnings.warn(msg)
# convert loftq_config to dict
if self.loftq_config and not isinstance(self.loftq_config, dict):
self.loftq_config = vars(self.loftq_config)
        self._custom_modules: Optional[dict[type[nn.Module], type[nn.Module]]] = None

    def _register_custom_module(self, mapping: dict[type[nn.Module], type[nn.Module]]) -> None:
"""
Experimental API to support providing custom LoRA layers.
This API is subject to change, you should carefully read the docs before deciding to use it:
https://huggingface.co/docs/peft/developer_guides/custom_models
To register custom LoRA module types, call this method with a `mapping` argument that is a dict that maps from
the target layer type to the custom LoRA layer type. The dict can contain multiple items if you wish to target
multiple layer types. The target layer type can be any nn.Module that we currently don't support in PEFT,
whether that is an official PyTorch layer type or a custom layer type. The custom LoRA module class has to be
implemented by the user and follow the PEFT conventions for LoRA layers.
"""
if self._custom_modules is None:
self._custom_modules = {}
self._custom_modules.update(mapping)
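A hedged sketch of this experimental API; `MyCustomLayer` and `MyCustomLoraLayer` are hypothetical placeholders for a user-defined base layer and a matching LoRA wrapper that follows the PEFT conventions mentioned above:

config = LoraConfig(target_modules=["custom_proj"])  # module name is illustrative
config._register_custom_module({MyCustomLayer: MyCustomLoraLayer})
peft_model = get_peft_model(base_model, config)  # `base_model` assumed loaded elsewhere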


@ -0,0 +1,152 @@
# Copyright 2024-present the HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from copy import deepcopy
import torch
import torch.nn.functional as F
from torch import nn
from peft.utils.integrations import dequantize_module_weight, gather_params_ctx
from peft.utils.other import transpose
class DoraLinearLayer(nn.Module):
def __init__(self, fan_in_fan_out):
super().__init__()
self.fan_in_fan_out = fan_in_fan_out
def get_weight_norm(self, weight, lora_weight, scaling) -> torch.Tensor:
# calculate L2 norm of weight matrix, column-wise
weight = transpose(weight, self.fan_in_fan_out)
weight = weight + scaling * lora_weight
weight_norm = torch.linalg.norm(weight, dim=1).to(weight.dtype)
return weight_norm
def update_layer(self, *, base_layer, lora_A, lora_B, scaling, place_on_cpu=False) -> None:
# temporarily convert fp16 to fp32, as fp16 can cause trouble on CPU with PyTorch < 2.2
dtype_is_fp16 = lora_A.dtype == torch.float16
if dtype_is_fp16:
lora_A = lora_A.float()
lora_B = lora_B.float()
with gather_params_ctx(base_layer.parameters()):
if base_layer.__class__.__name__ == "Linear4bit":
# We have to create a copy of the base layer, otherwise, FSDP will throw an error. 8bit does not work
# yet because Int8Params cannot be correctly deep-copied (attributes vanish)
base_layer = deepcopy(base_layer)
weight = dequantize_module_weight(base_layer)
if weight.data.ndim == 4: # For handling LoRAs applied to Conv2Ds.
lora_weight = torch.mm(lora_B.flatten(start_dim=1), lora_A.flatten(start_dim=1))
lora_weight = lora_weight.reshape(weight.shape)
else:
lora_weight = lora_B @ lora_A
if dtype_is_fp16:
lora_weight = lora_weight.half()
weight_norm = self.get_weight_norm(weight.to(lora_A.device), lora_weight, scaling)
if place_on_cpu:
weight_norm = weight_norm.to("cpu")
self.weight = nn.Parameter(weight_norm, requires_grad=True)
def forward(self, x, *, lora_A, lora_B, scaling, base_layer):
"""
For DoRA, calculate the extra output from LoRA with DoRA applied. This should be added on top of the base layer
output.
"""
lora_result = lora_B(lora_A(x))
# Don't use `lora_weight = lora_B.weight @ lora_A.weight` because this causes errors with FSDP. Instead,
# calculate the same but using forward.
x_eye = torch.eye(lora_A.weight.shape[1], device=lora_A.weight.device, dtype=x.dtype)
lora_weight = lora_B(lora_A(x_eye)).T
magnitude = self.weight
weight = dequantize_module_weight(base_layer)
weight = weight.to(x.dtype)
weight_norm = self.get_weight_norm(weight, lora_weight.detach(), scaling)
# see section 4.3 of DoRA (https://arxiv.org/abs/2402.09353)
# "[...] we suggest treating ||V +∆V ||_c in
# Eq. (5) as a constant, thereby detaching it from the gradient
# graph. This means that while ||V + ∆V ||_c dynamically
        # reflects the updates of ∆V, it won't receive any gradient
# during backpropagation"
weight_norm = weight_norm.detach()
mag_norm_scale = (magnitude / weight_norm).view(1, -1)
result_dora = (mag_norm_scale - 1) * (
F.linear(x, transpose(weight, self.fan_in_fan_out))
) + mag_norm_scale * lora_result * scaling
# Note: Computation could potentially be accelerated by using the code below instead of calculating X@W again.
# This is only correct if dropout=0, otherwise results will differ:
# https://github.com/huggingface/peft/pull/1474#issuecomment-1964682771
# bias = self.get_base_layer().bias
# if bias is not None:
# result = result - bias
# result = mag_norm_scale * result + mag_norm_scale * lora_B(lora_A(x)) * scaling
# if bias is not None:
# result = result + bias
return result_dora
def __repr__(self) -> str:
rep = super().__repr__()
return "lora.dora." + rep
class DoraConv2dLayer(DoraLinearLayer):
def get_weight_norm(self, weight, lora_weight, scaling) -> torch.Tensor:
# calculate L2 norm of weight matrix, column-wise
weight = weight + scaling * lora_weight
# the following is needed to have compatibility with the 4D weight tensors of Conv2D
weight_norm = weight.norm(p=2, dim=(1, 2, 3), keepdim=True).transpose(1, 0)
return weight_norm
def forward(self, x, *, lora_A, lora_B, scaling, base_layer):
"""
For DoRA, calculate the extra output from LoRA with DoRA applied. This should be added on top of the base layer
output.
"""
weight = base_layer.weight
lora_weight = torch.mm(lora_B.weight.flatten(start_dim=1), lora_A.weight.flatten(start_dim=1))
lora_weight = lora_weight.reshape(weight.shape)
magnitude = self.weight
weight_norm = self.get_weight_norm(weight, lora_weight.detach(), scaling)
# see section 4.3 of DoRA (https://arxiv.org/abs/2402.09353)
# "[...] we suggest treating ||V +∆V ||_c in
# Eq. (5) as a constant, thereby detaching it from the gradient
# graph. This means that while ||V + ∆V ||_c dynamically
        # reflects the updates of ∆V, it won't receive any gradient
# during backpropagation"
weight_norm = weight_norm.detach()
mag_norm_scale = magnitude / weight_norm
result_dora = (mag_norm_scale - 1) * (
F.conv2d(
x,
weight,
bias=None,
stride=base_layer.stride,
padding=base_layer.padding,
dilation=base_layer.dilation,
groups=base_layer.groups,
)
) + mag_norm_scale * lora_B(lora_A(x)) * scaling
return result_dora
def __repr__(self) -> str:
rep = super().__repr__()
return "lora.dora." + rep


@ -0,0 +1,104 @@
# Copyright 2024-present the HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Any, List, Optional
import torch
from peft.import_utils import is_eetq_available
from peft.tuners.lora.layer import LoraLayer
from peft.tuners.tuners_utils import BaseTunerLayer
if is_eetq_available():
from eetq import EetqLinear
class EetqLoraLinear(torch.nn.Module, LoraLayer):
def __init__(
self,
base_layer,
adapter_name,
r: int = 0,
lora_alpha: int = 1,
lora_dropout: float = 0.0,
init_lora_weights: bool = True,
use_rslora: bool = False,
**kwargs,
):
super().__init__()
LoraLayer.__init__(self, base_layer)
# self.base_layer and self.quant_linear_module are the same; we need the former for consistency and the latter
# for backwards compatibility
self.quant_linear_module = base_layer
self._active_adapter = adapter_name
self.update_layer(adapter_name, r, lora_alpha, lora_dropout, init_lora_weights, use_rslora)
def forward(self, x: torch.Tensor):
result = self.quant_linear_module(x)
if self.disable_adapters:
return result
for active_adapter in self.active_adapters:
if active_adapter not in self.lora_A.keys():
continue
lora_A = self.lora_A[active_adapter]
lora_B = self.lora_B[active_adapter]
dropout = self.lora_dropout[active_adapter]
scaling = self.scaling[active_adapter]
requires_conversion = not torch.is_autocast_enabled()
if requires_conversion:
expected_dtype = result.dtype
x = x.to(lora_A.weight.dtype)
output = lora_B(lora_A(dropout(x)))
if requires_conversion:
output = output.to(expected_dtype)
output = output * scaling
result = result + output
return result
def merge(self, safe_merge: bool = False, adapter_names: Optional[List[str]] = None) -> None:
raise AttributeError("Merging LoRA layers is not supported for Eetq layers.")
def unmerge(self) -> None:
raise AttributeError("Unmerging LoRA layers is not supported for Eetq layers.")
def __repr__(self) -> str:
rep = super().__repr__()
return "lora." + rep
def dispatch_eetq(
target: torch.nn.Module,
adapter_name: str,
**kwargs: Any,
) -> Optional[torch.nn.Module]:
new_module = None
if isinstance(target, BaseTunerLayer):
target_base_layer = target.get_base_layer()
else:
target_base_layer = target
if is_eetq_available() and isinstance(target_base_layer, EetqLinear):
new_module = EetqLoraLinear(target, adapter_name, **kwargs)
target.weight = target_base_layer.weight
if hasattr(target, "bias"):
target.bias = target_base_layer.bias
return new_module
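`dispatch_eetq` follows the dispatcher convention shared by the LoRA backends: it returns a wrapped module only when it recognizes the base layer, and `None` otherwise, so the caller can try backends in order. An abbreviated, illustrative sketch of that loop (the real dispatcher list and its arguments live in the LoRA model code):

for dispatcher in (dispatch_eetq, dispatch_default):  # real list also covers bnb, gptq, awq, hqq, ...
    new_module = dispatcher(target, adapter_name, lora_config=lora_config, **kwargs)
    if new_module is not None:
        break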

src/peft/tuners/lora/hqq.py

@ -0,0 +1,247 @@
# Copyright 2024-present the HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import annotations
import copy
import warnings
from typing import Any, Optional
import torch
from peft.import_utils import is_hqq_available
from peft.tuners.tuners_utils import BaseTunerLayer, check_adapters_to_merge
from peft.utils.other import transpose
from .layer import LoraLayer
if is_hqq_available():
from hqq.core.quantize import HQQLinear
class HqqLoraLinear(torch.nn.Module, LoraLayer):
# Lora implemented in a dense layer
def __init__(
self,
base_layer: torch.nn.Module,
adapter_name: str,
r: int = 0,
lora_alpha: int = 1,
lora_dropout: float = 0.0,
init_lora_weights: bool = True,
use_rslora: bool = False,
use_dora: bool = False,
**kwargs,
) -> None:
super().__init__()
LoraLayer.__init__(self, base_layer)
self.fan_in_fan_out = False
self._active_adapter = adapter_name
self.update_layer(
adapter_name,
r,
lora_alpha=lora_alpha,
lora_dropout=lora_dropout,
init_lora_weights=init_lora_weights,
use_rslora=use_rslora,
use_dora=use_dora,
)
def merge(self, safe_merge: bool = False, adapter_names: Optional[list[str]] = None) -> None:
"""
Merge the active adapter weights into the base weights
Args:
safe_merge (`bool`, *optional*):
If True, the merge operation will be performed in a copy of the original weights and check for NaNs
before merging the weights. This is useful if you want to check if the merge operation will produce
NaNs. Defaults to `False`.
adapter_names (`list[str]`, *optional*):
The list of adapter names that should be merged. If None, all active adapters will be merged.
Defaults to `None`.
"""
adapter_names = check_adapters_to_merge(self, adapter_names)
if not adapter_names:
# no adapter to merge
return
for active_adapter in adapter_names:
if active_adapter not in self.lora_A.keys():
continue
layer = self.get_base_layer()
quant_config = {**copy.deepcopy(layer.quant_config), "offload_meta": layer.offload_meta}
lora_data = self.get_delta_weight(active_adapter)
output = layer.dequantize()
if not self.use_dora[active_adapter]:
w_data = output + lora_data
else:
# handle dora
# since output already includes scaling, set it to 1 here
weight_norm = self._get_weight_norm(output, lora_data, scaling=1).detach()
# We need to cache weight_norm because it has to be based on the original weights. We
                # cannot calculate it on the fly based on the merged weights when unmerging because it's a
# different value
self._cache_store(f"{active_adapter}-weight_norm", weight_norm)
dora_factor = self.lora_magnitude_vector[active_adapter] / weight_norm
w_data = dora_factor.view(-1, 1) * (output + lora_data)
if safe_merge and not torch.isfinite(w_data).all():
raise ValueError(
f"NaNs detected in the merged weights. The adapter {active_adapter} seems to be broken"
)
new_hqq_layer = HQQLinear(None, quant_config, compute_dtype=layer.compute_dtype, device=layer.device)
quant_config.pop("offload_meta", None)
new_hqq_layer.quantize(w_data, **quant_config)
self.base_layer = new_hqq_layer
self.merged_adapters.append(active_adapter)
def unmerge(self) -> None:
"""
This method unmerges all merged adapter layers from the base weights.
"""
if not self.merged:
warnings.warn("Already unmerged. Nothing to do.")
return
while len(self.merged_adapters) > 0:
active_adapter = self.merged_adapters.pop()
if active_adapter not in self.lora_A.keys():
continue
lora_data = self.get_delta_weight(active_adapter)
layer = self.get_base_layer()
quant_config = {**copy.deepcopy(layer.quant_config), "offload_meta": layer.offload_meta}
output = layer.dequantize()
if not self.use_dora[active_adapter]:
w_data = output - lora_data
else:
weight_norm = self._cache_pop(f"{active_adapter}-weight_norm")
dora_factor = self.lora_magnitude_vector[active_adapter] / weight_norm
w_data = output.data / dora_factor.view(-1, 1) - lora_data
new_hqq_layer = HQQLinear(None, quant_config, compute_dtype=layer.compute_dtype, device=layer.device)
quant_config.pop("offload_meta", None)
new_hqq_layer.quantize(w_data, **quant_config)
self.base_layer = new_hqq_layer
def get_delta_weight(self, adapter):
return (
transpose(
self.lora_B[adapter].weight @ self.lora_A[adapter].weight,
False,
)
* self.scaling[adapter]
)
def _mixed_batch_forward(
self, x: torch.Tensor, *args: Any, adapter_names: list[str], **kwargs: Any
) -> torch.Tensor:
# This is a special method that handles the case when users pass the argument `adapter_names`. This is an
# extra argument that allows mixing different adapters in the same batch at inference time.
result = self.base_layer(x, *args, **kwargs)
unique_adapters = set(adapter_names)
sub_batch_indices_list = []
for adapter in unique_adapters:
sub_batch_indices_list.append([index for index, item in enumerate(adapter_names) if item == adapter])
for i, active_adapter in enumerate(unique_adapters):
if active_adapter == "__base__":
continue
if active_adapter not in self.lora_A.keys():
continue
lora_A = self.lora_A[active_adapter]
lora_B = self.lora_B[active_adapter]
dropout = self.lora_dropout[active_adapter]
scaling = self.scaling[active_adapter]
requires_conversion = not torch.is_autocast_enabled()
if requires_conversion:
expected_dtype = result.dtype
compute_dtype = lora_A.weight.dtype
if x.dtype != compute_dtype:
x = x.to(compute_dtype)
# getting the sub-batch, passing it to LoRA layers and updating the corresponding indices of the linear
# layer output
sub_batch = x[sub_batch_indices_list[i]]
output = lora_B(lora_A(dropout(sub_batch))) * scaling
if requires_conversion:
output = output.to(expected_dtype)
result[sub_batch_indices_list[i]] += output
return result
def forward(self, x: torch.Tensor, *args, **kwargs) -> torch.Tensor:
self._check_forward_args(x, *args, **kwargs)
adapter_names = kwargs.pop("adapter_names", None)
if self.disable_adapters:
if self.merged:
self.unmerge()
result = self.base_layer(x, *args, **kwargs)
elif adapter_names is not None:
result = self._mixed_batch_forward(x, *args, adapter_names=adapter_names, **kwargs)
elif self.merged:
result = self.base_layer(x, *args, **kwargs)
else:
result = self.base_layer(x, *args, **kwargs)
for active_adapter in self.active_adapters:
if active_adapter not in self.lora_A.keys():
continue
lora_A = self.lora_A[active_adapter]
lora_B = self.lora_B[active_adapter]
dropout = self.lora_dropout[active_adapter]
scaling = self.scaling[active_adapter]
requires_conversion = not torch.is_autocast_enabled()
if requires_conversion:
expected_dtype = result.dtype
compute_dtype = lora_A.weight.dtype
if x.dtype != compute_dtype:
x = x.to(compute_dtype)
if not self.use_dora[active_adapter]:
output = lora_B(lora_A(dropout(x))) * scaling
else:
output = self._apply_dora(x, lora_A, lora_B, scaling, active_adapter)
if requires_conversion:
output = output.to(expected_dtype)
result = result + output
return result
def __repr__(self) -> str:
rep = super().__repr__()
return "lora." + rep
def dispatch_hqq(target: torch.nn.Module, adapter_name: str, **kwargs):
new_module = None
if isinstance(target, BaseTunerLayer):
target_base_layer = target.get_base_layer()
else:
target_base_layer = target
if is_hqq_available() and isinstance(target_base_layer, HQQLinear):
new_module = HqqLoraLinear(target_base_layer, adapter_name, **kwargs)
return new_module
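`_mixed_batch_forward` above backs the public `adapter_names` forward argument: a single batch can route each sample through a different adapter, with `"__base__"` selecting the unadapted base layer. A hedged sketch (adapter names and inputs are illustrative):

outputs = peft_model(
    input_ids,  # batch of size 3, assumed prepared elsewhere
    adapter_names=["french", "__base__", "german"],
)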


@ -20,13 +20,15 @@ from typing import Any, Optional, Union
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import svd_lowrank
from transformers.pytorch_utils import Conv1D
from peft.tuners.tuners_utils import BaseTunerLayer, check_adapters_to_merge
from peft.utils.integrations import dequantize_module_weight, gather_params_ctx
from peft.utils.other import transpose
from .config import LoraConfig
from .dora import DoraConv2dLayer, DoraLinearLayer
class LoraLayer(BaseTunerLayer):
@ -35,7 +37,7 @@ class LoraLayer(BaseTunerLayer):
# All names of other parameters that may contain adapter-related parameters
other_param_names = ("r", "lora_alpha", "scaling", "lora_dropout")
    def __init__(self, base_layer: nn.Module, ephemeral_gpu_offload: bool = False, **kwargs) -> None:
self.base_layer = base_layer
self.r = {}
self.lora_alpha = {}
@ -50,8 +52,9 @@ class LoraLayer(BaseTunerLayer):
self._disable_adapters = False
self.merged_adapters = []
self.use_dora: dict[str, bool] = {}
        self.lora_magnitude_vector = torch.nn.ModuleDict()  # for DoRA
self._caches: dict[str, Any] = {}
self.ephemeral_gpu_offload: bool = ephemeral_gpu_offload
self.kwargs = kwargs
base_layer = self.get_base_layer()
@ -77,8 +80,21 @@ class LoraLayer(BaseTunerLayer):
elif hasattr(base_layer, "w_bit") and base_layer.__class__.__name__ == "WQLinear_GEMM":
# Awq layers
in_features, out_features = base_layer.in_features, base_layer.out_features
elif base_layer.__class__.__name__ == "EetqLinear":
# Eetq layers
in_features, out_features = base_layer.in_features, base_layer.out_features
elif hasattr(base_layer, "W_q") and base_layer.__class__.__name__ == "HQQLinear":
# HQQ layers
in_features, out_features = base_layer.in_features, base_layer.out_features
else:
raise ValueError(f"Unsupported layer type {type(base_layer)}")
# possibly support user provided custom layer types using dynamic dispatch
if hasattr(base_layer, "in_features") and hasattr(base_layer, "out_features"):
in_features, out_features = base_layer.in_features, base_layer.out_features
else:
in_features, out_features = None, None
warnings.warn(
f"Unsupported layer type '{type(base_layer)}' encountered, proceed at your own risk.", UserWarning
)
self.in_features = in_features
self.out_features = out_features
@ -106,21 +122,20 @@ class LoraLayer(BaseTunerLayer):
else:
self.scaling[adapter_name] = lora_alpha / r
if init_lora_weights == "loftq":
self.loftq_init(adapter_name)
# for inits that require access to the base weight, use gather_param_ctx so that the weight is gathered when using DeepSpeed
if isinstance(init_lora_weights, str) and init_lora_weights.startswith("pissa"):
with gather_params_ctx(self.get_base_layer().weight):
self.pissa_init(adapter_name, init_lora_weights)
elif isinstance(init_lora_weights, str) and init_lora_weights.lower() == "olora":
with gather_params_ctx(self.get_base_layer().weight):
self.olora_init(adapter_name)
elif init_lora_weights == "loftq":
with gather_params_ctx(self.get_base_layer().weight):
self.loftq_init(adapter_name)
elif init_lora_weights:
self.reset_lora_parameters(adapter_name, init_lora_weights)
        # call this before dora_init
        self._move_adapter_to_device_of_base_layer(adapter_name)
if use_dora:
self.dora_init(adapter_name)
@ -145,10 +160,69 @@ class LoraLayer(BaseTunerLayer):
raise ValueError(f"Unknown initialization {init_lora_weights=}")
nn.init.zeros_(self.lora_B[adapter_name].weight)
if adapter_name in self.lora_embedding_A.keys():
            # Initialize A to zeros and B the same way as the default for nn.Embedding, see:
            # https://github.com/microsoft/LoRA/blob/4c0333854cb905966f8cc4e9a74068c1e507c7b7/loralib/layers.py#L59-L60
nn.init.zeros_(self.lora_embedding_A[adapter_name])
nn.init.normal_(self.lora_embedding_B[adapter_name])
def olora_init(self, adapter_name):
dtype = self.get_base_layer().weight.dtype
if dtype in [torch.int8, torch.uint8]:
weight_tensor = dequantize_module_weight(self.get_base_layer())
elif dtype in [torch.float32, torch.float16, torch.bfloat16]:
weight_tensor = self.get_base_layer().weight
else:
raise TypeError(f"Unsupported data type for the base layer. Got {dtype}.")
scale_factor = self.scaling[adapter_name]
r = self.r[adapter_name]
weight_tensor = weight_tensor.to(torch.float32)
Q, R = torch.linalg.qr(weight_tensor.data)
Qr, Rr = Q[:, :r], R[:r]
self.lora_A[adapter_name].weight.data = Rr.contiguous()
self.lora_B[adapter_name].weight.data = Qr.contiguous()
weight_tensor.data -= scale_factor * self.lora_B[adapter_name].weight @ self.lora_A[adapter_name].weight
weight_tensor = weight_tensor.to(dtype)
self.get_base_layer().weight.data = weight_tensor
def pissa_init(self, adapter_name, init_lora_weights):
weight = self.get_base_layer().weight
dtype = weight.dtype
if dtype not in [torch.float32, torch.float16, torch.bfloat16]:
raise TypeError(
"Please initialize PiSSA under float32, float16, or bfloat16. "
"Subsequently, re-quantize the residual model to help minimize quantization errors."
)
weight = weight.to(torch.float32)
if init_lora_weights == "pissa":
# USV^T = W <-> VSU^T = W^T, where W^T = weight.data in R^{out_channel, in_channel},
V, S, Uh = torch.linalg.svd(weight.data, full_matrices=False)
Vr = V[:, : self.r[adapter_name]]
Sr = S[: self.r[adapter_name]]
Sr /= self.scaling[adapter_name]
Uhr = Uh[: self.r[adapter_name]]
elif len(init_lora_weights.split("_niter_")) == 2:
Vr, Sr, Ur = svd_lowrank(
weight.data, self.r[adapter_name], niter=int(init_lora_weights.split("_niter_")[-1])
)
Sr /= self.scaling[adapter_name]
Uhr = Ur.t()
else:
raise ValueError(
f"init_lora_weights should be 'pissa' or 'pissa_niter_[number of iters]', got {init_lora_weights} instead."
)
lora_A = torch.diag(torch.sqrt(Sr)) @ Uhr
lora_B = Vr @ torch.diag(torch.sqrt(Sr))
self.lora_A[adapter_name].weight.data = lora_A
self.lora_B[adapter_name].weight.data = lora_B
weight = weight.data - self.scaling[adapter_name] * lora_B @ lora_A
weight = weight.to(dtype)
self.get_base_layer().weight.data = weight
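    # A hedged sketch of how pissa_init is reached from the public API (target
    # module names are illustrative): LoraConfig(init_lora_weights="pissa_niter_16",
    # target_modules=["q_proj", "v_proj"]) selects the svd_lowrank path above with
    # niter=16, while plain "pissa" runs the full torch.linalg.svd.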
def loftq_init(self, adapter_name):
from peft.utils.loftq_utils import loftq_init
@ -170,31 +244,27 @@ class LoraLayer(BaseTunerLayer):
self.lora_embedding_B[adapter_name].weight.data = lora_B
self.get_base_layer().weight.data = qweight
    def dora_init(self, adapter_name: str) -> None:
        if not self.lora_magnitude_vector:
            # first dora layer being added, add lora_magnitude_vector to the list of learnable parameters
            self.adapter_layer_names = self.adapter_layer_names[:] + ("lora_magnitude_vector",)
        dora_layer = DoraLinearLayer(fan_in_fan_out=getattr(self, "fan_in_fan_out", False))
        lora_A = self.lora_A[adapter_name].weight
        lora_B = self.lora_B[adapter_name].weight
        place_on_cpu = self.ephemeral_gpu_offload and (lora_A.device.type == "cpu" or lora_B.device.type == "cpu")
        if self.ephemeral_gpu_offload:
            if lora_A.device.type == "cuda":
                lora_B = lora_B.to(lora_A.device)
            else:
                if lora_B.device.type != "cuda":
                    lora_B = lora_B.to("cuda")
                lora_A = lora_A.to(lora_B.device)
        scaling = self.scaling[adapter_name]
        dora_layer.update_layer(
            base_layer=self.get_base_layer(), lora_A=lora_A, lora_B=lora_B, scaling=scaling, place_on_cpu=place_on_cpu
        )
        self.lora_magnitude_vector[adapter_name] = dora_layer
def _cache_store(self, key: str, value: Any) -> None:
self._caches[key] = value
@ -203,42 +273,6 @@ class LoraLayer(BaseTunerLayer):
value = self._caches.pop(key)
return value
def _apply_dora(self, x, lora_A, lora_B, scaling, active_adapter):
"""
For DoRA, calculate the extra output from LoRA with DoRA applied. This should be added on top of the base layer
output.
"""
lora_weight = lora_B.weight @ lora_A.weight
magnitude = self.lora_magnitude_vector[active_adapter]
weight = self.get_base_layer().weight
quant_state = getattr(self.get_base_layer(), "state", None)
weight = dequantize_bnb_weight(weight, state=quant_state) # no-op if not bnb
weight = weight.to(x.dtype)
weight_norm = self._get_weight_norm(weight, lora_weight, scaling)
# see section 4.3 of DoRA (https://arxiv.org/abs/2402.09353)
# "[...] we suggest treating ||V +∆V ||_c in
# Eq. (5) as a constant, thereby detaching it from the gradient
# graph. This means that while ||V + ∆V ||_c dynamically
        # reflects the updates of ∆V, it won't receive any gradient
# during backpropagation"
weight_norm = weight_norm.detach()
mag_norm_scale = (magnitude / weight_norm).view(1, -1)
result_dora = (mag_norm_scale - 1) * (
F.linear(x, transpose(weight, self.fan_in_fan_out))
) + mag_norm_scale * lora_B(lora_A(x)) * scaling
# Note: Computation could potentially be accelerated by using the code below instead of calculating X@W again.
# This is only correct if dropout=0, otherwise results will differ:
# https://github.com/huggingface/peft/pull/1474#issuecomment-1964682771
# bias = self.get_base_layer().bias
# if bias is not None:
# result = result - bias
# result = mag_norm_scale * result + mag_norm_scale * lora_B(lora_A(x)) * scaling
# if bias is not None:
# result = result + bias
return result_dora
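For readers who want to check the algebra, the following is a minimal, self-contained sketch of the same computation with plain tensors and made-up shapes (no PEFT classes involved); it verifies that this "extra output" formulation equals the full DoRA output minus the base layer output:

import torch
import torch.nn.functional as F

torch.manual_seed(0)
bsz, in_f, out_f, r = 3, 8, 4, 2
x = torch.randn(bsz, in_f)
weight = torch.randn(out_f, in_f)        # frozen base weight W
lora_A = 0.1 * torch.randn(r, in_f)      # LoRA A
lora_B = 0.1 * torch.randn(out_f, r)     # LoRA B
scaling = 1.0
magnitude = torch.ones(out_f)            # learnable magnitude vector m

lora_weight = lora_B @ lora_A
# per-output-channel L2 norm of W + scaling * delta_W, detached as in DoRA section 4.3
weight_norm = (weight + scaling * lora_weight).norm(p=2, dim=1).detach()
mag_norm_scale = (magnitude / weight_norm).view(1, -1)

base_out = F.linear(x, weight)
lora_out = F.linear(F.linear(x, lora_A), lora_B) * scaling
result_dora = (mag_norm_scale - 1) * base_out + mag_norm_scale * lora_out

# full DoRA output: m / ||W + delta_W||_c * (x @ (W + delta_W)^T); subtracting the base
# output must give the same "extra output" computed above
full = mag_norm_scale * F.linear(x, weight + scaling * lora_weight) - base_out
assert torch.allclose(result_dora, full, atol=1e-5)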
def set_scale(self, adapter, scale):
if adapter not in self.scaling:
# Ignore the case where the adapter is not in the layer
@ -392,18 +426,20 @@ class Linear(nn.Module, LoraLayer):
orig_weights = base_layer.weight.data.clone()
delta_weight = self.get_delta_weight(active_adapter)
if not self.use_dora[active_adapter]:
orig_weights = orig_weights + delta_weight
orig_weights += delta_weight
else:
# handle dora
# since delta_weight already includes scaling, set it to 1 here
weight_norm = self._get_weight_norm(
orig_weights, transpose(delta_weight, self.fan_in_fan_out), scaling=1
).detach()
weight_norm = (
self.lora_magnitude_vector[active_adapter]
.get_weight_norm(orig_weights, transpose(delta_weight, self.fan_in_fan_out), scaling=1)
.detach()
)
# We need to cache weight_norm because it has to be based on the original weights. We
# cannot calculate it on the fly based on the merged weights when unmerging because it's a
# different value
self._cache_store(f"{active_adapter}-weight_norm", weight_norm)
dora_factor = self.lora_magnitude_vector[active_adapter] / weight_norm
dora_factor = self.lora_magnitude_vector[active_adapter].weight / weight_norm
dora_factor = transpose(dora_factor.view(-1, 1), self.fan_in_fan_out)
orig_weights = dora_factor * (orig_weights + delta_weight)
@ -416,18 +452,22 @@ class Linear(nn.Module, LoraLayer):
else:
delta_weight = self.get_delta_weight(active_adapter)
if not self.use_dora[active_adapter]:
base_layer.weight.data = base_layer.weight.data + delta_weight
base_layer.weight.data += delta_weight
else:
# handle dora
# since delta_weight already includes scaling, set it to 1 here
weight_norm = self._get_weight_norm(
base_layer.weight, transpose(delta_weight, self.fan_in_fan_out), scaling=1
).detach()
weight_norm = (
self.lora_magnitude_vector[active_adapter]
.get_weight_norm(
base_layer.weight, transpose(delta_weight, self.fan_in_fan_out), scaling=1
)
.detach()
)
# We need to cache weight_norm because it has to be based on the original weights. We
# cannot calculate it on the fly based on the merged weights when unmerging because it's a
# different value
self._cache_store(f"{active_adapter}-weight_norm", weight_norm)
dora_factor = self.lora_magnitude_vector[active_adapter] / weight_norm
dora_factor = self.lora_magnitude_vector[active_adapter].weight / weight_norm
dora_factor = transpose(dora_factor.view(-1, 1), self.fan_in_fan_out)
new_weight = dora_factor * (base_layer.weight.data + delta_weight)
base_layer.weight.data = new_weight
@ -450,7 +490,7 @@ class Linear(nn.Module, LoraLayer):
weight.data -= delta_weight
else:
weight_norm = self._cache_pop(f"{active_adapter}-weight_norm")
dora_factor = self.lora_magnitude_vector[active_adapter] / weight_norm
dora_factor = self.lora_magnitude_vector[active_adapter].weight / weight_norm
weight_orig = weight.data / dora_factor.view(-1, 1) - delta_weight
weight.data = weight_orig
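As a standalone illustration (plain tensors, invented shapes) of why weight_norm has to be cached at merge time: the cached norm lets unmerge invert the row-wise DoRA scaling exactly, which would not be possible if the norm were recomputed from the already merged weights:

import torch

torch.manual_seed(0)
out_f, in_f = 4, 8
W = torch.randn(out_f, in_f)           # original base weight
dW = 0.05 * torch.randn(out_f, in_f)   # LoRA delta (already includes scaling)
m = torch.rand(out_f) + 0.5            # DoRA magnitude vector

# merge: scale each row of (W + dW) by m / ||W + dW||, computed from the original W
weight_norm = (W + dW).norm(p=2, dim=1)
dora_factor = (m / weight_norm).view(-1, 1)
merged = dora_factor * (W + dW)

# unmerge: divide by the cached factor and subtract the delta to recover W
restored = merged / dora_factor - dW
assert torch.allclose(restored, W, atol=1e-6)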
@ -516,7 +556,13 @@ class Linear(nn.Module, LoraLayer):
result = result + lora_B(lora_A(dropout(x))) * scaling
else:
x = dropout(x)
result = result + self._apply_dora(x, lora_A, lora_B, scaling, active_adapter)
result = result + self.lora_magnitude_vector[active_adapter](
x,
lora_A=lora_A,
lora_B=lora_B,
scaling=scaling,
base_layer=self.get_base_layer(),
)
result = result.to(torch_result_dtype)
@ -585,12 +631,7 @@ class Embedding(nn.Module, LoraLayer):
elif init_lora_weights:
self.reset_lora_parameters(adapter_name, init_lora_weights)
base_layer = self.get_base_layer()
weight = getattr(base_layer, "weight", None)
if weight is not None:
# the layer is already completely initialized, this is an update
self.to(base_layer.weight.device, dtype=weight.dtype)
self._move_adapter_to_device_of_base_layer(adapter_name)
self.set_adapter(self.active_adapters)
def merge(self, safe_merge: bool = False, adapter_names: Optional[list[str]] = None) -> None:
@ -618,7 +659,7 @@ class Embedding(nn.Module, LoraLayer):
# Note that safe_merge will be slower than the normal merge
# because of the copy operation.
orig_weights = base_layer.weight.data.clone()
orig_weights = orig_weights + self.get_delta_weight(active_adapter)
orig_weights += self.get_delta_weight(active_adapter)
if not torch.isfinite(orig_weights).all():
raise ValueError(
@ -627,7 +668,7 @@ class Embedding(nn.Module, LoraLayer):
base_layer.weight.data = orig_weights
else:
base_layer.weight.data = base_layer.weight.data + self.get_delta_weight(active_adapter)
base_layer.weight.data += self.get_delta_weight(active_adapter)
self.merged_adapters.append(active_adapter)
def unmerge(self) -> None:
@ -808,10 +849,8 @@ class Conv2d(nn.Module, LoraLayer):
elif init_lora_weights:
self.reset_lora_parameters(adapter_name, init_lora_weights)
weight = getattr(base_layer, "weight", None)
if weight is not None:
# the layer is already completely initialized, this is an update
self.to(base_layer.weight.device, dtype=weight.dtype)
# call this before dora_init
self._move_adapter_to_device_of_base_layer(adapter_name)
if use_dora:
self.dora_init(adapter_name)
@ -821,6 +860,18 @@ class Conv2d(nn.Module, LoraLayer):
self.set_adapter(self.active_adapters)
def dora_init(self, adapter_name: str) -> None:
if self.lora_magnitude_vector is None:
# first dora layer being added, add lora_magnitude_vector to the list of learnable parameters
self.adapter_layer_names = self.adapter_layer_names[:] + ("lora_magnitude_vector",)
dora_layer = DoraConv2dLayer(fan_in_fan_out=False)
lora_A = self.lora_A[adapter_name].weight
lora_B = self.lora_B[adapter_name].weight
scaling = self.scaling[adapter_name]
dora_layer.update_layer(base_layer=self.get_base_layer(), lora_A=lora_A, lora_B=lora_B, scaling=scaling)
self.lora_magnitude_vector[adapter_name] = dora_layer
def merge(self, safe_merge: bool = False, adapter_names: Optional[list[str]] = None) -> None:
"""
Merge the active adapter weights inside the base weights
@ -849,16 +900,20 @@ class Conv2d(nn.Module, LoraLayer):
delta_weight = self.get_delta_weight(active_adapter)
if not self.use_dora[active_adapter]:
orig_weights = orig_weights + delta_weight
orig_weights += delta_weight
else:
# handle dora
# since delta_weight already includes scaling, set it to 1 here
weight_norm = self._get_weight_norm(orig_weights, delta_weight, scaling=1).detach()
weight_norm = (
self.lora_magnitude_vector[active_adapter]
.get_weight_norm(orig_weights, delta_weight, scaling=1)
.detach()
)
# We need to cache weight_norm because it has to be based on the original weights. We
# cannot calculate it on the fly based on the merged weights when unmerging because it's a
# different value
self._cache_store(f"{active_adapter}-weight_norm", weight_norm)
dora_factor = self.lora_magnitude_vector[active_adapter] / weight_norm
dora_factor = self.lora_magnitude_vector[active_adapter].weight / weight_norm
orig_weights = dora_factor.view(-1, 1, 1, 1) * (orig_weights + delta_weight)
if not torch.isfinite(orig_weights).all():
@ -869,16 +924,20 @@ class Conv2d(nn.Module, LoraLayer):
else:
delta_weight = self.get_delta_weight(active_adapter)
if not self.use_dora[active_adapter]:
base_layer.weight.data = base_layer.weight.data + delta_weight
base_layer.weight.data += delta_weight
else:
# handle dora
# since delta_weight already includes scaling, set it to 1 here
weight_norm = self._get_weight_norm(base_layer.weight, delta_weight, scaling=1).detach()
weight_norm = (
self.lora_magnitude_vector[active_adapter]
.get_weight_norm(base_layer.weight, delta_weight, scaling=1)
.detach()
)
# We need to cache weight_norm because it has to be based on the original weights. We
# cannot calculate it on the fly based on the merged weights when unmerging because it's a
# different value
self._cache_store(f"{active_adapter}-weight_norm", weight_norm)
dora_factor = self.lora_magnitude_vector[active_adapter] / weight_norm
dora_factor = self.lora_magnitude_vector[active_adapter].weight / weight_norm
new_weight = dora_factor.view(-1, 1, 1, 1) * (base_layer.weight.data + delta_weight)
base_layer.weight.data = new_weight
@ -900,7 +959,7 @@ class Conv2d(nn.Module, LoraLayer):
weight.data -= delta_weight
else:
weight_norm = self._cache_pop(f"{active_adapter}-weight_norm")
dora_factor = self.lora_magnitude_vector[active_adapter] / weight_norm
dora_factor = self.lora_magnitude_vector[active_adapter].weight / weight_norm
weight_orig = weight.data / dora_factor.view(-1, 1, 1, 1) - delta_weight
weight.data = weight_orig
@ -952,46 +1011,6 @@ class Conv2d(nn.Module, LoraLayer):
return output_tensor
def _get_weight_norm(self, weight, lora_weight, scaling) -> torch.Tensor:
# calculate L2 norm of weight matrix, channel-wise
weight = weight + scaling * lora_weight
# the following is needed to have compatibility with the 4D weight tensors of Conv2D
weight_norm = weight.norm(p=2, dim=(1, 2, 3), keepdim=True).transpose(1, 0)
return weight_norm
def _apply_dora(self, x, lora_A, lora_B, scaling, active_adapter):
"""
For DoRA, calculate the extra output from LoRA with DoRA applied. This should be added on top of the base layer
output.
"""
base_layer = self.get_base_layer()
weight = base_layer.weight
lora_weight = torch.mm(lora_B.weight.flatten(start_dim=1), lora_A.weight.flatten(start_dim=1))
lora_weight = lora_weight.reshape(weight.shape)
magnitude = self.lora_magnitude_vector[active_adapter]
weight_norm = self._get_weight_norm(weight, lora_weight, scaling)
# see section 4.3 of DoRA (https://arxiv.org/abs/2402.09353)
# "[...] we suggest treating ||V +∆V ||_c in
# Eq. (5) as a constant, thereby detaching it from the gradient
# graph. This means that while ||V + ∆V ||_c dynamically
# reflects the updates of ∆V , it won't receive any gradient
# during backpropagation"
weight_norm = weight_norm.detach()
mag_norm_scale = magnitude / weight_norm
result_dora = (mag_norm_scale - 1) * (
F.conv2d(
x,
weight,
bias=None,
stride=base_layer.stride,
padding=base_layer.padding,
dilation=base_layer.dilation,
groups=base_layer.groups,
)
) + mag_norm_scale * lora_B(lora_A(x)) * scaling
return result_dora
def forward(self, x: torch.Tensor, *args, **kwargs) -> torch.Tensor:
self._check_forward_args(x, *args, **kwargs)
adapter_names = kwargs.pop("adapter_names", None)
@ -1021,7 +1040,13 @@ class Conv2d(nn.Module, LoraLayer):
result = result + lora_B(lora_A(dropout(x))) * scaling
else:
x = dropout(x)
result = result + self._apply_dora(x, lora_A, lora_B, scaling, active_adapter)
result = result + self.lora_magnitude_vector[active_adapter](
x,
lora_A=lora_A,
lora_B=lora_B,
scaling=scaling,
base_layer=self.get_base_layer(),
)
result = result.to(torch_result_dtype)
return result

View File

@ -41,6 +41,7 @@ from peft.utils import (
ModulesToSaveWrapper,
_freeze_adapter,
_get_submodules,
get_peft_model_state_dict,
get_quantization_config,
)
from peft.utils.merge_utils import dare_linear, dare_ties, magnitude_prune, task_arithmetic, ties
@ -48,7 +49,9 @@ from peft.utils.merge_utils import dare_linear, dare_ties, magnitude_prune, task
from .aqlm import dispatch_aqlm
from .awq import dispatch_awq
from .config import LoraConfig
from .eetq import dispatch_eetq
from .gptq import dispatch_gptq
from .hqq import dispatch_hqq
from .layer import Conv2d, LoraLayer, dispatch_default
from .tp_layer import dispatch_megatron
@ -193,6 +196,7 @@ class LoraModel(BaseTuner):
"init_lora_weights": lora_config.init_lora_weights,
"use_rslora": lora_config.use_rslora,
"use_dora": lora_config.use_dora,
"ephemeral_gpu_offload": lora_config.runtime_config.ephemeral_gpu_offload,
"loaded_in_8bit": getattr(self.model, "is_loaded_in_8bit", False),
"loaded_in_4bit": getattr(self.model, "is_loaded_in_4bit", False),
}
@ -233,7 +237,10 @@ class LoraModel(BaseTuner):
child = child.base_layer
if not hasattr(new_module, "base_layer"):
new_module.weight = child.weight
if hasattr(new_module, "W_q"): # HQQ
new_module.W_q = child.W_q
else:
new_module.weight = child.weight
if hasattr(child, "bias"):
new_module.bias = child.bias
@ -247,7 +254,15 @@ class LoraModel(BaseTuner):
# dispatch to correct device
for name, module in new_module.named_modules():
if (self.prefix in name) or ("ranknum" in name):
weight = child.qweight if hasattr(child, "qweight") else child.weight
weight = (
child.qweight
if hasattr(child, "qweight")
else child.W_q
if hasattr(child, "W_q")
else child.weight
if hasattr(child, "weight")
else next(child.parameters())
)
module.to(weight.device)
def _mark_only_adapters_as_trainable(self, model: nn.Module) -> None:
@ -277,6 +292,26 @@ class LoraModel(BaseTuner):
# because the first match is always used. Therefore, the default layers should be checked last.
dispatchers = []
if lora_config._custom_modules:
# Experimental custom LoRA module support. Allows users to pass a custom mapping for unsupported layer
# types by implementing their own LoRA layers.
def dynamic_dispatch_func(target, adapter_name, lora_config, **kwargs):
new_module = None
if isinstance(target, BaseTunerLayer):
target_base_layer = target.get_base_layer()
else:
target_base_layer = target
for key, custom_cls in lora_config._custom_modules.items():
if isinstance(target_base_layer, key):
new_module = custom_cls(target, adapter_name, **kwargs)
break
return new_module
dispatchers.append(dynamic_dispatch_func)
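As a rough sketch of the dispatch pattern used above, with hypothetical MyLayer / MyLoraLayer classes (illustrative names only, not part of PEFT), the first matching entry in the custom mapping wins:

import torch
import torch.nn as nn

class MyLayer(nn.Module):                       # hypothetical unsupported layer type
    def __init__(self, dim):
        super().__init__()
        self.weight = nn.Parameter(torch.zeros(dim, dim))

class MyLoraLayer(nn.Module):                   # hypothetical user-provided LoRA wrapper
    def __init__(self, base_layer, adapter_name, **kwargs):
        super().__init__()
        self.base_layer = base_layer
        self.adapter_name = adapter_name

custom_modules = {MyLayer: MyLoraLayer}         # stands in for lora_config._custom_modules

def dispatch_custom(target, adapter_name, **kwargs):
    # mirrors dynamic_dispatch_func: return the first custom class whose key matches the target
    for base_cls, custom_cls in custom_modules.items():
        if isinstance(target, base_cls):
            return custom_cls(target, adapter_name, **kwargs)
    return None

new_module = dispatch_custom(MyLayer(4), "default")
assert isinstance(new_module, MyLoraLayer)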
# avoid eager bnb import
if is_bnb_available():
from .bnb import dispatch_bnb_8bit
@ -288,7 +323,17 @@ class LoraModel(BaseTuner):
dispatchers.append(dispatch_bnb_4bit)
dispatchers.extend([dispatch_aqlm, dispatch_awq, dispatch_gptq, dispatch_megatron, dispatch_default])
dispatchers.extend(
[
dispatch_eetq,
dispatch_aqlm,
dispatch_awq,
dispatch_gptq,
dispatch_hqq,
dispatch_megatron,
dispatch_default,
]
)
new_module = None
for dispatcher in dispatchers:
@ -310,6 +355,8 @@ class LoraModel(BaseTuner):
try:
return super().__getattr__(name) # defer to nn.Module's logic
except AttributeError:
if name == "model": # see #1892: prevent infinite recursion if class is not initialized
raise
return getattr(self.model, name)
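A toy example (not PEFT code) of the recursion that this early raise prevents: when "model" was never assigned, deferring the lookup to self.model would re-enter __getattr__ indefinitely, so an AttributeError is raised instead:

class Wrapper:
    def __init__(self, model=None):
        if model is not None:
            self.model = model

    def __getattr__(self, name):
        # __getattr__ is only called when normal lookup fails; without this guard a
        # missing "model" attribute would trigger infinite recursion
        if name == "model":
            raise AttributeError(name)
        return getattr(self.model, name)

w = Wrapper()          # "model" was never set
try:
    w.some_attribute   # raises AttributeError instead of RecursionError
except AttributeError:
    pass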
def get_peft_config_as_dict(self, inference: bool = False):
@ -572,9 +619,6 @@ class LoraModel(BaseTuner):
if adapter_name in list(self.peft_config.keys()):
return
for adapter in adapters:
if adapter not in list(self.peft_config.keys()):
raise ValueError(f"Adapter {adapter} does not exist")
combination_type, new_rank, new_target_modules = self._check_add_weighted_adapter(
adapters=adapters,
@ -826,3 +870,41 @@ class LoraModel(BaseTuner):
model.
"""
return self._unload_and_optionally_merge(merge=False)
def subtract_mutated_init(self, output_state_dict: dict[str, torch.Tensor], adapter_name: str, kwargs=None):
"""
This function calculates the update of the [PiSSA | OLoRA] adapter by comparing its parameters in
`output_state_dict` with the initial values of [PiSSA | OLoRA] stored under `adapter_name`, thus
converting [PiSSA | OLoRA] to LoRA.
"""
for name, param in self.model.named_parameters():
if (
param.data.dtype != torch.float32
and param.data.dtype != torch.float16
and param.data.dtype != torch.bfloat16
) and adapter_name.startswith("pissa"):
warnings.warn(
r"Note that Quant(W_res) + AB != Quant(W) + \Delta(AB); "
"the converted LoRA, when combined with W or Quant(W), may introduce a certain gap in the fine-tuned model. "
"Therefore, we recommend directly using the Quant(W_res) in conjunction with the PiSSA adapter. "
)
mutated_init_state_dict = get_peft_model_state_dict(
self,
state_dict=kwargs.get("state_dict", None),
adapter_name=adapter_name,
)
tensors_lora = {}
for name in output_state_dict.keys():
## W = W^{res} + A_0 \times B_0,
## W + \Delta W = W^{res} + A \times B,
## \Delta W = A \times B - A_0 \times B_0 = [A | A_0] \times [B | -B_0]^T = A'B'.
if "lora_A" in name:
tensors_lora[name] = torch.cat(
[output_state_dict[name], mutated_init_state_dict[".".join(name.split(".")[1:])]], dim=0
)
elif "lora_B" in name:
tensors_lora[name] = torch.cat(
[output_state_dict[name], -mutated_init_state_dict[".".join(name.split(".")[1:])]], dim=1
)
return tensors_lora
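A small numerical check of the identity sketched in the comments above, with plain random tensors standing in for the adapter weights: concatenating A with A_0 along dim=0 and B with -B_0 along dim=1 produces a rank-2r adapter whose delta equals B A - B_0 A_0:

import torch

torch.manual_seed(0)
out_f, in_f, r = 6, 8, 2
A0, B0 = torch.randn(r, in_f), torch.randn(out_f, r)   # initial (mutated) adapter
A, B = torch.randn(r, in_f), torch.randn(out_f, r)     # fine-tuned adapter

delta_direct = B @ A - B0 @ A0                          # \Delta W as defined above

A_prime = torch.cat([A, A0], dim=0)                     # stacked A, shape (2r, in)
B_prime = torch.cat([B, -B0], dim=1)                    # [B | -B_0], shape (out, 2r)
delta_converted = B_prime @ A_prime

assert torch.allclose(delta_direct, delta_converted, atol=1e-5)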

View File

@ -11,16 +11,20 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import annotations
import importlib
import math
import warnings
from typing import Any, Optional
from typing import Any, Optional, Union
import torch
import torch.nn as nn
import torch.nn.init as init
from peft.tuners.tuners_utils import BaseTunerLayer
from peft.tuners.tuners_utils import BaseTunerLayer, check_adapters_to_merge
from peft.utils import transpose
from peft.utils.integrations import gather_params_ctx
from .layer import LoraLayer
@ -42,13 +46,14 @@ class LoraParallelLinear(nn.Module, LoraLayer):
lora_alpha: int = 1,
lora_dropout: float = 0.0,
fan_in_fan_out: bool = False,
init_lora_weights: bool = True,
is_target_conv_1d_layer: bool = False,
init_lora_weights: Union[bool, str] = True,
use_rslora: bool = False,
use_dora: bool = False,
**kwargs,
):
super().__init__()
LoraLayer.__init__(self, base_layer=base_layer)
LoraLayer.__init__(self, base_layer=base_layer, **kwargs)
if use_dora:
raise ValueError(f"{self.__class__.__name__} does not support DoRA yet, please set it to False")
@ -83,6 +88,10 @@ class LoraParallelLinear(nn.Module, LoraLayer):
**parallel_linear_kwargs,
)
if is_target_conv_1d_layer:
raise ValueError(
f"{self.__class__.__name__} does not support target_conv_1d_layer yet, please set it to False"
)
self.is_target_conv_1d_layer = False
def update_layer(
@ -137,23 +146,37 @@ class LoraParallelLinear(nn.Module, LoraLayer):
self.lora_A[adapter_name] = lora_a
self.lora_B[adapter_name] = lora_b
if use_rslora:
self.scaling[adapter_name] = lora_alpha / (r**0.5)
self.scaling[adapter_name] = lora_alpha / math.sqrt(r)
else:
self.scaling[adapter_name] = lora_alpha / r
if init_lora_weights:
# for inits that require access to the base weight, use gather_params_ctx so that the weight is gathered when using DeepSpeed
if isinstance(init_lora_weights, str) and init_lora_weights.startswith("pissa"):
with gather_params_ctx(self.get_base_layer().weight):
self.pissa_init(adapter_name, init_lora_weights)
elif isinstance(init_lora_weights, str) and init_lora_weights.lower() == "olora":
with gather_params_ctx(self.get_base_layer().weight):
self.olora_init(adapter_name)
elif init_lora_weights == "loftq":
with gather_params_ctx(self.get_base_layer().weight):
self.loftq_init(adapter_name)
elif init_lora_weights:
self.reset_lora_parameters(adapter_name, init_lora_weights)
weight = getattr(self.get_base_layer(), "weight", None)
if weight is not None:
# the layer is already completely initialized, this is an update
if weight.dtype.is_floating_point or weight.dtype.is_complex:
self.to(weight.device, dtype=weight.dtype)
else:
self.to(weight.device)
# call this before dora_init
self._move_adapter_to_device_of_base_layer(adapter_name)
if use_dora:
self.dora_init(adapter_name)
self.use_dora[adapter_name] = True
else:
self.use_dora[adapter_name] = False
self.set_adapter(self.active_adapters)
def forward(self, x: torch.Tensor, *args: Any, **kwargs: Any):
previous_dtype = x.dtype
self._check_forward_args(x, *args, **kwargs)
adapter_names = kwargs.pop("adapter_names", None)
# If weight is used for matrix multiplication here, the final aggregation operation of the original
# parallel_linear layer will be missing, so we need to directly call its forward function to obtain the
# output of the original parallel_linear layer.
@ -161,10 +184,13 @@ class LoraParallelLinear(nn.Module, LoraLayer):
if self.merged:
self.unmerge()
result, bias = self.base_layer(x, *args, **kwargs)
elif adapter_names is not None:
raise ValueError(f"{self.__class__.__name__} does not support mixed_batch_forward yet.")
elif self.merged:
result, bias = self.base_layer(x, *args, **kwargs)
else:
result, bias = self.base_layer(x, *args, **kwargs)
torch_result_dtype = result.dtype
for active_adapter in self.active_adapters:
if active_adapter not in self.lora_A.keys():
continue
@ -174,19 +200,162 @@ class LoraParallelLinear(nn.Module, LoraLayer):
scaling = self.scaling[active_adapter]
x = x.to(lora_A.weight.dtype)
lora_result = lora_A(dropout(x))
if isinstance(lora_result, tuple):
lora_result = lora_result[0]
lora_result = lora_B(lora_result)
if isinstance(lora_result, tuple):
lora_result = lora_result[0]
lora_result = lora_result * scaling
if not self.use_dora[active_adapter]:
lora_result = lora_A(dropout(x))
if isinstance(lora_result, tuple):
lora_result = lora_result[0]
lora_result = lora_B(lora_result)
if isinstance(lora_result, tuple):
lora_result = lora_result[0]
lora_result = lora_result * scaling
result = result + lora_result
result = result + lora_result
else:
x = dropout(x)
result = result + self.lora_magnitude_vector[active_adapter](
x,
lora_A=lora_A,
lora_B=lora_B,
scaling=scaling,
base_layer=self.get_base_layer(),
)
result = result.to(previous_dtype)
result = result.to(torch_result_dtype)
return result, bias
def merge(self, safe_merge: bool = False, adapter_names: Optional[list[str]] = None) -> None:
"""
Merge the active adapter weights into the base weights
Args:
safe_merge (`bool`, *optional*):
If True, the merge operation will be performed in a copy of the original weights and check for NaNs
before merging the weights. This is useful if you want to check if the merge operation will produce
NaNs. Defaults to `False`.
adapter_names (`list[str]`, *optional*):
The list of adapter names that should be merged. If None, all active adapters will be merged. Defaults
to `None`.
"""
adapter_names = check_adapters_to_merge(self, adapter_names)
if not adapter_names:
# no adapter to merge
return
for active_adapter in adapter_names:
if active_adapter in self.lora_A.keys():
base_layer = self.get_base_layer()
if safe_merge:
# Note that safe_merge will be slower than the normal merge
# because of the copy operation.
orig_weights = base_layer.weight.data.clone()
delta_weight = self.get_delta_weight(active_adapter)
if not self.use_dora[active_adapter]:
orig_weights = orig_weights + delta_weight
else:
# handle dora
# since delta_weight already includes scaling, set it to 1 here
weight_norm = (
self.lora_magnitude_vector[active_adapter]
.get_weight_norm(orig_weights, transpose(delta_weight, self.fan_in_fan_out), scaling=1)
.detach()
)
# We need to cache weight_norm because it has to be based on the original weights. We
# cannot calculate it on the fly based on the merged weights when unmerging because it's a
# different value
self._cache_store(f"{active_adapter}-weight_norm", weight_norm)
dora_factor = self.lora_magnitude_vector[active_adapter].weight / weight_norm
dora_factor = transpose(dora_factor.view(-1, 1), self.fan_in_fan_out)
orig_weights = dora_factor * (orig_weights + delta_weight)
if not torch.isfinite(orig_weights).all():
raise ValueError(
f"NaNs detected in the merged weights. The adapter {active_adapter} seems to be broken"
)
base_layer.weight.data = orig_weights
else:
delta_weight = self.get_delta_weight(active_adapter)
if not self.use_dora[active_adapter]:
base_layer.weight.data = base_layer.weight.data + delta_weight
else:
# handle dora
# since delta_weight already includes scaling, set it to 1 here
weight_norm = (
self.lora_magnitude_vector[active_adapter]
.get_weight_norm(
base_layer.weight, transpose(delta_weight, self.fan_in_fan_out), scaling=1
)
.detach()
)
# We need to cache weight_norm because it has to be based on the original weights. We
# cannot calculate it on the fly based on the merged weights when unmerging because it's a
# different value
self._cache_store(f"{active_adapter}-weight_norm", weight_norm)
dora_factor = self.lora_magnitude_vector[active_adapter].weight / weight_norm
dora_factor = transpose(dora_factor.view(-1, 1), self.fan_in_fan_out)
new_weight = dora_factor * (base_layer.weight.data + delta_weight)
base_layer.weight.data = new_weight
self.merged_adapters.append(active_adapter)
def unmerge(self) -> None:
"""
This method unmerges all merged adapter layers from the base weights.
"""
if not self.merged:
warnings.warn("Already unmerged. Nothing to do.")
return
while len(self.merged_adapters) > 0:
active_adapter = self.merged_adapters.pop()
if active_adapter in self.lora_A.keys():
weight = self.get_base_layer().weight
delta_weight = self.get_delta_weight(active_adapter)
if not self.use_dora[active_adapter]:
weight.data -= delta_weight
else:
weight_norm = self._cache_pop(f"{active_adapter}-weight_norm")
dora_factor = self.lora_magnitude_vector[active_adapter].weight / weight_norm
weight_orig = weight.data / dora_factor.view(-1, 1) - delta_weight
weight.data = weight_orig
def get_delta_weight(self, adapter) -> torch.Tensor:
"""
Compute the delta weight for the given adapter.
Args:
adapter (str):
The name of the adapter for which the delta weight should be computed.
"""
device = self.lora_B[adapter].weight.device
dtype = self.lora_B[adapter].weight.dtype
# In case users want to merge the adapter weights that are in
# float16 while being on CPU, we need to cast the weights to float32, perform the merge and then cast back to
# float16 because the `@` and matmul operation in general is not supported in torch + cpu + fp16.
cast_to_fp32 = device.type == "cpu" and dtype == torch.float16
weight_A = self.lora_A[adapter].weight
weight_B = self.lora_B[adapter].weight
if cast_to_fp32:
weight_A = weight_A.float()
weight_B = weight_B.float()
output_tensor = transpose(weight_B @ weight_A, self.fan_in_fan_out) * self.scaling[adapter]
if cast_to_fp32:
output_tensor = output_tensor.to(dtype=dtype)
# cast back the weights
self.lora_A[adapter].weight.data = weight_A.to(dtype)
self.lora_B[adapter].weight.data = weight_B.to(dtype)
return output_tensor
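A self-contained sketch of the cast-and-restore pattern used above (the helper name and shapes are illustrative; the fan_in_fan_out transpose is omitted): on torch builds where fp16 matmul is not implemented on CPU, upcasting to float32 keeps the delta-weight computation working:

import torch

def delta_weight(lora_A: torch.Tensor, lora_B: torch.Tensor, scaling: float) -> torch.Tensor:
    # upcast only when on CPU with float16, then cast the result back
    cast_to_fp32 = lora_A.device.type == "cpu" and lora_A.dtype == torch.float16
    A = lora_A.float() if cast_to_fp32 else lora_A
    B = lora_B.float() if cast_to_fp32 else lora_B
    out = (B @ A) * scaling
    return out.to(lora_A.dtype) if cast_to_fp32 else out

A = torch.randn(2, 8, dtype=torch.float16)
B = torch.randn(4, 2, dtype=torch.float16)
print(delta_weight(A, B, scaling=0.5).shape, delta_weight(A, B, scaling=0.5).dtype)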
def __repr__(self) -> str:
rep = super().__repr__()
return "lora." + rep
def dispatch_megatron(
target: torch.nn.Module,

View File

@ -79,8 +79,7 @@ class LycorisLayer(BaseTunerLayer):
@property
@abstractmethod
def _available_adapters(self) -> set[str]:
...
def _available_adapters(self) -> set[str]: ...
def _init_empty_weights(self, cls, *args, **kwargs) -> None:
# A helper method that allows to initialize the layer of the given class without spending time to initialize the
@ -95,8 +94,7 @@ class LycorisLayer(BaseTunerLayer):
self.to_empty(device=final_device)
@abstractmethod
def create_adapter_parameters(self, adapter_name: str, r: int, **kwargs):
...
def create_adapter_parameters(self, adapter_name: str, r: int, **kwargs): ...
# TODO: refactor LoRA to use the same approach
@abstractmethod
@ -104,8 +102,7 @@ class LycorisLayer(BaseTunerLayer):
"""Activations added on top of the base layer output (i.e. after the base layer forward pass)"""
@abstractmethod
def get_delta_weight(self, adapter_name: str) -> torch.Tensor:
...
def get_delta_weight(self, adapter_name: str) -> torch.Tensor: ...
def merge(self, safe_merge: bool = False, adapter_names: Optional[list[str]] = None) -> None:
"""
@ -143,8 +140,7 @@ class LycorisLayer(BaseTunerLayer):
self.merged_adapters.append(active_adapter)
@abstractmethod
def reset_adapter_parameters(self, adapter_name: str):
...
def reset_adapter_parameters(self, adapter_name: str): ...
def set_scale(self, adapter, scale):
if adapter not in self._available_adapters:
@ -185,8 +181,7 @@ class LycorisLayer(BaseTunerLayer):
self.scaling[active_adapter] /= scale
@abstractmethod
def update_layer(self, adapter_name: str, r: int, alpha: float, **kwargs):
...
def update_layer(self, adapter_name: str, r: int, alpha: float, **kwargs): ...
class LycorisTuner(BaseTuner):
@ -205,6 +200,8 @@ class LycorisTuner(BaseTuner):
try:
return super().__getattr__(name) # defer to nn.Module's logic
except AttributeError:
if name == "model": # see #1892: prevent infinite recursion if class is not initialized
raise
return getattr(self.model, name)
@staticmethod
@ -220,8 +217,7 @@ class LycorisTuner(BaseTuner):
target_name,
parent,
current_key,
):
...
): ...
@classmethod
def _create_new_module(cls, config: LycorisConfig, adapter_name: str, target: nn.Module, **kwargs) -> LycorisLayer:

View File

@ -183,6 +183,8 @@ class MixedModel(BaseTuner):
try:
return super().__getattr__(name) # defer to nn.Module's logic
except AttributeError:
if name == "model": # see #1892: prevent infinite recursion if class is not initialized
raise
return getattr(self.model, name)
def _set_adapter_layers(self, enabled=True):

Some files were not shown because too many files have changed in this diff