Mirror of https://github.com/huggingface/peft.git (synced 2025-10-20 15:33:48 +08:00)

Compare commits: v0.11.0...2ea5377ee3 (478 commits)
.github/ISSUE_TEMPLATE/bug-report.yml (22 changed lines)
@@ -23,30 +23,14 @@ body:

        Please tag fewer than 3 people.

        Library: @pacman100 @younesbelkada @benjaminbossan @sayakpaul
        Library: @benjaminbossan @githubnemo

        diffusers integration: @benjaminbossan @sayakpaul

        Documentation: @stevhliu

      placeholder: "@Username ..."

  - type: checkboxes
    id: information-scripts-examples
    attributes:
      label: Information
      description: 'The problem arises when using:'
      options:
        - label: "The official example scripts"
        - label: "My own modified scripts"

  - type: checkboxes
    id: information-tasks
    attributes:
      label: Tasks
      description: "The tasks I am working on are:"
      options:
        - label: "An officially supported task in the `examples` folder"
        - label: "My own task or dataset (give details below)"

  - type: textarea
    id: reproduction
    validations:
.github/ISSUE_TEMPLATE/feature-request.yml (11 changed lines)
@@ -11,15 +11,6 @@ body:
      description: |
        A clear and concise description of the feature proposal. Please provide a link to the paper and code in case they exist.

  - type: textarea
    id: motivation
    validations:
      required: true
    attributes:
      label: Motivation
      description: |
        Please outline the motivation for the proposal. Is your feature request related to a problem?

  - type: textarea
    id: contribution
    validations:
@@ -27,4 +18,4 @@ body:
    attributes:
      label: Your contribution
      description: |
        Is there any way that you could help, e.g. by submitting a PR?
        Is there any way that you could help, e.g. by submitting a PR?
.github/workflows/build_docker_images.yml (157 changed lines)
@ -10,36 +10,31 @@ concurrency:
|
||||
group: docker-image-builds
|
||||
cancel-in-progress: false
|
||||
|
||||
permissions: {}
|
||||
|
||||
env:
|
||||
CI_SLACK_CHANNEL: ${{ secrets.CI_DOCKER_CHANNEL }}
|
||||
|
||||
jobs:
|
||||
latest-cpu:
|
||||
name: "Latest Peft CPU [dev]"
|
||||
runs-on: ubuntu-latest
|
||||
runs-on:
|
||||
group: aws-general-8-plus
|
||||
steps:
|
||||
- name: Cleanup disk
|
||||
run: |
|
||||
sudo ls -l /usr/local/lib/
|
||||
sudo ls -l /usr/share/
|
||||
sudo du -sh /usr/local/lib/
|
||||
sudo du -sh /usr/share/
|
||||
sudo rm -rf /usr/local/lib/android
|
||||
sudo rm -rf /usr/share/dotnet
|
||||
sudo du -sh /usr/local/lib/
|
||||
sudo du -sh /usr/share/
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@v1
|
||||
uses: docker/setup-buildx-action@b5ca514318bd6ebac0fb2aedd5d36ec1b5c232a2 # v3.10.0
|
||||
- name: Check out code
|
||||
uses: actions/checkout@v3
|
||||
with:
|
||||
persist-credentials: false
|
||||
- name: Login to DockerHub
|
||||
uses: docker/login-action@v2
|
||||
uses: docker/login-action@74a5d142397b4f367a81961eba4e8cd7edddf772 # v3.4.0
|
||||
with:
|
||||
username: ${{ secrets.DOCKERHUB_USERNAME }}
|
||||
password: ${{ secrets.DOCKERHUB_PASSWORD }}
|
||||
|
||||
- name: Build and Push CPU
|
||||
uses: docker/build-push-action@v4
|
||||
uses: docker/build-push-action@14487ce63c7a62a4a324b0bfb37086795e31c6c1 # v6.16.0
|
||||
with:
|
||||
context: ./docker/peft-cpu
|
||||
push: true
|
||||
@ -47,171 +42,109 @@ jobs:
|
||||
|
||||
- name: Post to Slack
|
||||
if: always()
|
||||
uses: huggingface/hf-workflows/.github/actions/post-slack@main
|
||||
uses: huggingface/hf-workflows/.github/actions/post-slack@3f88d63d3761558a32e8e46fc2a8536e04bb2aea # main from Feb 2025-02-24
|
||||
with:
|
||||
slack_channel: "C06LKJB31RU"
|
||||
title: 🤗 Results of the PEFT-CPU docker build
|
||||
slack_channel: ${{ env.CI_SLACK_CHANNEL }}
|
||||
title: 🤗 Results of the PEFT-CPU docker build
|
||||
status: ${{ job.status }}
|
||||
slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
|
||||
|
||||
latest-cuda:
|
||||
name: "Latest Peft GPU [dev]"
|
||||
runs-on: ubuntu-latest
|
||||
runs-on:
|
||||
group: aws-general-8-plus
|
||||
steps:
|
||||
- name: Cleanup disk
|
||||
run: |
|
||||
sudo ls -l /usr/local/lib/
|
||||
sudo ls -l /usr/share/
|
||||
sudo du -sh /usr/local/lib/
|
||||
sudo du -sh /usr/share/
|
||||
sudo rm -rf /usr/local/lib/android
|
||||
sudo rm -rf /usr/share/dotnet
|
||||
sudo du -sh /usr/local/lib/
|
||||
sudo du -sh /usr/share/
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@v1
|
||||
uses: docker/setup-buildx-action@b5ca514318bd6ebac0fb2aedd5d36ec1b5c232a2 # v3.10.0
|
||||
- name: Check out code
|
||||
uses: actions/checkout@v3
|
||||
with:
|
||||
persist-credentials: false
|
||||
- name: Login to DockerHub
|
||||
uses: docker/login-action@v1
|
||||
uses: docker/login-action@74a5d142397b4f367a81961eba4e8cd7edddf772 # v3.4.0
|
||||
with:
|
||||
username: ${{ secrets.DOCKERHUB_USERNAME }}
|
||||
password: ${{ secrets.DOCKERHUB_PASSWORD }}
|
||||
|
||||
- name: Build and Push GPU
|
||||
uses: docker/build-push-action@v4
|
||||
uses: docker/build-push-action@14487ce63c7a62a4a324b0bfb37086795e31c6c1 # v6.16.0
|
||||
with:
|
||||
context: ./docker/peft-gpu
|
||||
push: true
|
||||
tags: huggingface/peft-gpu
|
||||
|
||||
|
||||
- name: Post to Slack
|
||||
if: always()
|
||||
uses: huggingface/hf-workflows/.github/actions/post-slack@main
|
||||
uses: huggingface/hf-workflows/.github/actions/post-slack@3f88d63d3761558a32e8e46fc2a8536e04bb2aea # main from Feb 2025-02-24
|
||||
with:
|
||||
slack_channel: "C06LKJB31RU"
|
||||
title: 🤗 Results of the PEFT-GPU docker build
|
||||
slack_channel: ${{ env.CI_SLACK_CHANNEL }}
|
||||
title: 🤗 Results of the PEFT-GPU docker build
|
||||
status: ${{ job.status }}
|
||||
slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
|
||||
|
||||
latest-cuda-bnb-source:
|
||||
name: "Latest Peft GPU + bnb source [dev]"
|
||||
runs-on: ubuntu-latest
|
||||
runs-on:
|
||||
group: aws-general-8-plus
|
||||
steps:
|
||||
- name: Cleanup disk
|
||||
run: |
|
||||
sudo ls -l /usr/local/lib/
|
||||
sudo ls -l /usr/share/
|
||||
sudo du -sh /usr/local/lib/
|
||||
sudo du -sh /usr/share/
|
||||
sudo rm -rf /usr/local/lib/android
|
||||
sudo rm -rf /usr/share/dotnet
|
||||
sudo du -sh /usr/local/lib/
|
||||
sudo du -sh /usr/share/
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@v1
|
||||
uses: docker/setup-buildx-action@b5ca514318bd6ebac0fb2aedd5d36ec1b5c232a2 # v3.10.0
|
||||
- name: Check out code
|
||||
uses: actions/checkout@v3
|
||||
with:
|
||||
persist-credentials: false
|
||||
- name: Login to DockerHub
|
||||
uses: docker/login-action@v1
|
||||
uses: docker/login-action@74a5d142397b4f367a81961eba4e8cd7edddf772 # v3.4.0
|
||||
with:
|
||||
username: ${{ secrets.DOCKERHUB_USERNAME }}
|
||||
password: ${{ secrets.DOCKERHUB_PASSWORD }}
|
||||
|
||||
- name: Build and Push GPU
|
||||
uses: docker/build-push-action@v4
|
||||
uses: docker/build-push-action@14487ce63c7a62a4a324b0bfb37086795e31c6c1 # v6.16.0
|
||||
with:
|
||||
context: ./docker/peft-gpu-bnb-source
|
||||
push: true
|
||||
tags: huggingface/peft-gpu-bnb-source
|
||||
|
||||
|
||||
- name: Post to Slack
|
||||
if: always()
|
||||
uses: huggingface/hf-workflows/.github/actions/post-slack@main
|
||||
uses: huggingface/hf-workflows/.github/actions/post-slack@3f88d63d3761558a32e8e46fc2a8536e04bb2aea # main from Feb 2025-02-24
|
||||
with:
|
||||
slack_channel: "C06LKJB31RU"
|
||||
title: 🤗 Results of the PEFT-GPU (bnb source / HF latest) docker build
|
||||
slack_channel: ${{ env.CI_SLACK_CHANNEL }}
|
||||
title: 🤗 Results of the PEFT-GPU (bnb source / HF latest) docker build
|
||||
status: ${{ job.status }}
|
||||
slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
|
||||
|
||||
latest-cuda-bnb-source-latest:
|
||||
name: "Latest Peft GPU + bnb source [accelerate / peft / transformers latest]"
|
||||
runs-on: ubuntu-latest
|
||||
runs-on:
|
||||
group: aws-general-8-plus
|
||||
steps:
|
||||
- name: Cleanup disk
|
||||
run: |
|
||||
sudo ls -l /usr/local/lib/
|
||||
sudo ls -l /usr/share/
|
||||
sudo du -sh /usr/local/lib/
|
||||
sudo du -sh /usr/share/
|
||||
sudo rm -rf /usr/local/lib/android
|
||||
sudo rm -rf /usr/share/dotnet
|
||||
sudo du -sh /usr/local/lib/
|
||||
sudo du -sh /usr/share/
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@v1
|
||||
uses: docker/setup-buildx-action@b5ca514318bd6ebac0fb2aedd5d36ec1b5c232a2 # v3.10.0
|
||||
- name: Check out code
|
||||
uses: actions/checkout@v3
|
||||
with:
|
||||
persist-credentials: false
|
||||
- name: Login to DockerHub
|
||||
uses: docker/login-action@v1
|
||||
uses: docker/login-action@74a5d142397b4f367a81961eba4e8cd7edddf772 # v3.4.0
|
||||
with:
|
||||
username: ${{ secrets.DOCKERHUB_USERNAME }}
|
||||
password: ${{ secrets.DOCKERHUB_PASSWORD }}
|
||||
|
||||
- name: Build and Push GPU
|
||||
uses: docker/build-push-action@v4
|
||||
uses: docker/build-push-action@14487ce63c7a62a4a324b0bfb37086795e31c6c1 # v6.16.0
|
||||
with:
|
||||
context: ./docker/peft-gpu-bnb-latest
|
||||
push: true
|
||||
tags: huggingface/peft-gpu-bnb-latest
|
||||
|
||||
|
||||
- name: Post to Slack
|
||||
if: always()
|
||||
uses: huggingface/hf-workflows/.github/actions/post-slack@main
|
||||
uses: huggingface/hf-workflows/.github/actions/post-slack@3f88d63d3761558a32e8e46fc2a8536e04bb2aea # main from Feb 2025-02-24
|
||||
with:
|
||||
slack_channel: "C06LKJB31RU"
|
||||
title: 🤗 Results of the PEFT-GPU (bnb source / HF source) docker build
|
||||
slack_channel: ${{ env.CI_SLACK_CHANNEL }}
|
||||
title: 🤗 Results of the PEFT-GPU (bnb source / HF source) docker build
|
||||
status: ${{ job.status }}
|
||||
slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
|
||||
|
||||
latest-cuda-bnb-source-multi:
|
||||
name: "Latest Peft GPU + bnb (multi-backend) source [accelerate / peft / transformers source]"
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Cleanup disk
|
||||
run: |
|
||||
sudo ls -l /usr/local/lib/
|
||||
sudo ls -l /usr/share/
|
||||
sudo du -sh /usr/local/lib/
|
||||
sudo du -sh /usr/share/
|
||||
sudo rm -rf /usr/local/lib/android
|
||||
sudo rm -rf /usr/share/dotnet
|
||||
sudo du -sh /usr/local/lib/
|
||||
sudo du -sh /usr/share/
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@v1
|
||||
- name: Check out code
|
||||
uses: actions/checkout@v3
|
||||
- name: Login to DockerHub
|
||||
uses: docker/login-action@v1
|
||||
with:
|
||||
username: ${{ secrets.DOCKERHUB_USERNAME }}
|
||||
password: ${{ secrets.DOCKERHUB_PASSWORD }}
|
||||
|
||||
- name: Build and Push GPU
|
||||
uses: docker/build-push-action@v4
|
||||
with:
|
||||
context: ./docker/peft-gpu-bnb-multi-source
|
||||
push: true
|
||||
tags: huggingface/peft-gpu-bnb-multi-source
|
||||
|
||||
- name: Post to Slack
|
||||
if: always()
|
||||
uses: huggingface/hf-workflows/.github/actions/post-slack@main
|
||||
with:
|
||||
slack_channel: "C06LKJB31RU"
|
||||
title: 🤗 Results of the PEFT-GPU (bnb source multi-backend / HF latest) docker build
|
||||
status: ${{ job.status }}
|
||||
slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
|
||||
|
||||
|
||||
|
.github/workflows/build_documentation.yml (2 changed lines)
@@ -7,6 +7,8 @@ on:
      - doc-builder*
      - v*-release

permissions: {}

jobs:
  build:
    uses: huggingface/doc-builder/.github/workflows/build_main_documentation.yml@main
.github/workflows/build_pr_documentation.yml (2 changed lines)
@@ -7,6 +7,8 @@ concurrency:
  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
  cancel-in-progress: true

permissions: {}

jobs:
  build:
    uses: huggingface/doc-builder/.github/workflows/build_pr_documentation.yml@main
.github/workflows/deploy_method_comparison_app.yml (new file, 41 lines)
@@ -0,0 +1,41 @@
name: Deploy "method_comparison" Gradio to Spaces

on:
  push:
    branches: [ main ]
    paths:
      - "method_comparison/**"
  workflow_dispatch:

permissions: {}

jobs:
  deploy:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout code
        uses: actions/checkout@v4
        with:
          fetch-depth: 0  # full history needed for subtree
          persist-credentials: false

      - name: Authenticate via ~/.netrc
        env:
          HF_TOKEN: ${{ secrets.PEFT_INTERNAL_REPO_READ_WRITE }}
        run: |
          # netrc needs BOTH login and password entries
          printf "machine huggingface.co\nlogin hf\npassword ${HF_TOKEN}\n" >> ~/.netrc
          chmod 600 ~/.netrc

      - name: Deploy method_comparison app to HF Spaces
        run: |
          cd method_comparison
          git init
          # Spaces expect requirements.txt
          mv requirements-app.txt requirements.txt
          git config user.name "github-actions[bot]"
          git config user.email "github-actions[bot]@users.noreply.github.com"
          git remote add gradio-app https://huggingface.co/spaces/peft-internal-testing/PEFT-method-comparison
          git add .
          git commit -m "🚀 Deploy method comparison app from GH action"
          git push -f gradio-app HEAD:main
.github/workflows/integrations_tests.yml (14 changed lines)
@ -7,6 +7,8 @@ on:
|
||||
description: 'Branch to test on'
|
||||
required: true
|
||||
|
||||
permissions: {}
|
||||
|
||||
jobs:
|
||||
run_transformers_integration_tests:
|
||||
strategy:
|
||||
@ -19,6 +21,7 @@ jobs:
|
||||
with:
|
||||
ref: ${{ github.event.inputs.branch }}
|
||||
repository: ${{ github.event.pull_request.head.repo.full_name }}
|
||||
persist-credentials: false
|
||||
- name: Set up Python
|
||||
uses: actions/setup-python@v4
|
||||
with:
|
||||
@ -27,8 +30,8 @@ jobs:
|
||||
cache-dependency-path: "setup.py"
|
||||
- name: print environment variables
|
||||
run: |
|
||||
echo "env.CI_BRANCH = ${{ env.CI_BRANCH }}"
|
||||
echo "env.CI_SHA = ${{ env.CI_SHA }}"
|
||||
echo "env.CI_BRANCH = ${CI_BRANCH}"
|
||||
echo "env.CI_SHA = ${CI_SHA}"
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
python -m pip install --upgrade pip
|
||||
@ -55,6 +58,7 @@ jobs:
|
||||
with:
|
||||
ref: ${{ github.event.inputs.branch }}
|
||||
repository: ${{ github.event.pull_request.head.repo.full_name }}
|
||||
persist-credentials: false
|
||||
- name: Set up Python
|
||||
uses: actions/setup-python@v4
|
||||
with:
|
||||
@ -63,13 +67,13 @@ jobs:
|
||||
cache-dependency-path: "setup.py"
|
||||
- name: print environment variables
|
||||
run: |
|
||||
echo "env.CI_BRANCH = ${{ env.CI_BRANCH }}"
|
||||
echo "env.CI_SHA = ${{ env.CI_SHA }}"
|
||||
echo "env.CI_BRANCH = ${CI_BRANCH}"
|
||||
echo "env.CI_SHA = ${CI_SHA}"
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
python -m pip install --upgrade pip
|
||||
python -m pip install .[test]
|
||||
|
||||
|
||||
if [ "${{ matrix.diffusers-version }}" == "main" ]; then
|
||||
pip install -U git+https://github.com/huggingface/diffusers.git
|
||||
else
|
||||
|
.github/workflows/nightly-bnb.yml (142 changed lines)
@ -10,16 +10,19 @@ env:
|
||||
IS_GITHUB_CI: "1"
|
||||
# To be able to run tests on CUDA 12.2
|
||||
NVIDIA_DISABLE_REQUIRE: "1"
|
||||
SLACK_API_TOKEN: ${{ secrets.SLACK_API_TOKEN }}
|
||||
SLACK_API_TOKEN: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
|
||||
|
||||
permissions: {}
|
||||
|
||||
jobs:
|
||||
run_all_tests_single_gpu:
|
||||
timeout-minutes: 60
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
docker-image-name: ["huggingface/peft-gpu-bnb-source:latest", "huggingface/peft-gpu-bnb-latest:latest", "huggingface/peft-gpu-bnb-multi-source:latest"]
|
||||
runs-on: [self-hosted, single-gpu, nvidia-gpu, t4, ci]
|
||||
docker-image-name: ["huggingface/peft-gpu-bnb-source:latest", "huggingface/peft-gpu-bnb-latest:latest"]
|
||||
runs-on:
|
||||
group: aws-g6-4xlarge-plus
|
||||
env:
|
||||
CUDA_VISIBLE_DEVICES: "0"
|
||||
TEST_TYPE: "single_gpu_${{ matrix.docker-image-name }}"
|
||||
@ -31,6 +34,8 @@ jobs:
|
||||
shell: bash
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
with:
|
||||
persist-credentials: false
|
||||
- name: Pip install
|
||||
run: |
|
||||
source activate peft
|
||||
@ -44,26 +49,91 @@ jobs:
|
||||
echo "Checking out tag for Transformers version: v$transformers_version"
|
||||
git fetch --tags
|
||||
git checkout tags/v$transformers_version
|
||||
cd ..
|
||||
cd ..
|
||||
fi
|
||||
|
||||
- name: Test bnb import
|
||||
id: import
|
||||
if: always()
|
||||
run: |
|
||||
source activate peft
|
||||
python3 -m bitsandbytes
|
||||
python3 -c "import bitsandbytes as bnb"
|
||||
|
||||
- name: Post to Slack
|
||||
if: always()
|
||||
uses: huggingface/hf-workflows/.github/actions/post-slack@3f88d63d3761558a32e8e46fc2a8536e04bb2aea # main from Feb 2025-02-24
|
||||
with:
|
||||
slack_channel: ${{ secrets.BNB_SLACK_CHANNEL_ID }}
|
||||
title: 🤗 Results of bitsandbytes import
|
||||
status: ${{ steps.import.outcome }}
|
||||
slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
|
||||
|
||||
- name: Run examples on single GPU
|
||||
id: examples_tests
|
||||
if: always()
|
||||
run: |
|
||||
source activate peft
|
||||
make tests_examples_single_gpu_bnb
|
||||
|
||||
|
||||
- name: Post to Slack
|
||||
if: always()
|
||||
uses: huggingface/hf-workflows/.github/actions/post-slack@3f88d63d3761558a32e8e46fc2a8536e04bb2aea # main from Feb 2025-02-24
|
||||
with:
|
||||
slack_channel: ${{ secrets.BNB_SLACK_CHANNEL_ID }}
|
||||
title: 🤗 Results of bitsandbytes examples tests - single GPU
|
||||
status: ${{ steps.examples_tests.outcome }}
|
||||
slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
|
||||
|
||||
- name: Run core tests on single GPU
|
||||
id: core_tests
|
||||
if: always()
|
||||
run: |
|
||||
source activate peft
|
||||
make tests_core_single_gpu_bnb
|
||||
|
||||
- name: Post to Slack
|
||||
if: always()
|
||||
uses: huggingface/hf-workflows/.github/actions/post-slack@3f88d63d3761558a32e8e46fc2a8536e04bb2aea # main from Feb 2025-02-24
|
||||
with:
|
||||
slack_channel: ${{ secrets.BNB_SLACK_CHANNEL_ID }}
|
||||
title: 🤗 Results of bitsandbytes core tests - single GPU
|
||||
status: ${{ steps.core_tests.outcome }}
|
||||
slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
|
||||
|
||||
# TODO: this is a test to see if BNB multi-backend single-GPU tests succeed w/o regression tests
|
||||
# - name: Run BNB regression tests on single GPU
|
||||
# id: regression_tests
|
||||
# if: always()
|
||||
# run: |
|
||||
# source activate peft
|
||||
# make tests_gpu_bnb_regression
|
||||
|
||||
# - name: Post to Slack
|
||||
# if: always()
|
||||
# uses: huggingface/hf-workflows/.github/actions/post-slack@3f88d63d3761558a32e8e46fc2a8536e04bb2aea # main from Feb 2025-02-24
|
||||
# with:
|
||||
# slack_channel: ${{ secrets.BNB_SLACK_CHANNEL_ID }}
|
||||
# title: 🤗 Results of bitsandbytes regression tests - single GPU
|
||||
# status: ${{ steps.regression_tests.outcome }}
|
||||
# slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
|
||||
|
||||
- name: Run transformers tests on single GPU
|
||||
id: transformers_tests
|
||||
if: always()
|
||||
run: |
|
||||
source activate peft
|
||||
make transformers_tests
|
||||
|
||||
|
||||
- name: Post to Slack
|
||||
if: always()
|
||||
uses: huggingface/hf-workflows/.github/actions/post-slack@3f88d63d3761558a32e8e46fc2a8536e04bb2aea # main from Feb 2025-02-24
|
||||
with:
|
||||
slack_channel: ${{ secrets.BNB_SLACK_CHANNEL_ID }}
|
||||
title: 🤗 Results of bitsandbytes transformers tests - single GPU
|
||||
status: ${{ steps.transformers_tests.outcome }}
|
||||
slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
|
||||
|
||||
- name: Generate Report
|
||||
if: always()
|
||||
run: |
|
||||
@ -71,11 +141,13 @@ jobs:
|
||||
python scripts/log_reports.py --slack_channel_name bnb-daily-ci-collab >> $GITHUB_STEP_SUMMARY
|
||||
|
||||
run_all_tests_multi_gpu:
|
||||
timeout-minutes: 60
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
docker-image-name: ["huggingface/peft-gpu-bnb-source:latest", "huggingface/peft-gpu-bnb-latest:latest", "huggingface/peft-gpu-bnb-multi-source:latest"]
|
||||
runs-on: [self-hosted, multi-gpu, nvidia-gpu, t4, ci]
|
||||
docker-image-name: ["huggingface/peft-gpu-bnb-source:latest", "huggingface/peft-gpu-bnb-latest:latest"]
|
||||
runs-on:
|
||||
group: aws-g6-12xlarge-plus
|
||||
env:
|
||||
CUDA_VISIBLE_DEVICES: "0,1"
|
||||
TEST_TYPE: "multi_gpu_${{ matrix.docker-image-name }}"
|
||||
@ -87,6 +159,8 @@ jobs:
|
||||
shell: bash
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
with:
|
||||
persist-credentials: false
|
||||
- name: Pip install
|
||||
run: |
|
||||
source activate peft
|
||||
@ -101,31 +175,73 @@ jobs:
|
||||
git fetch --tags
|
||||
git checkout tags/v$transformers_version
|
||||
cd ..
|
||||
fi
|
||||
fi
|
||||
|
||||
- name: Run core GPU tests on multi-gpu
|
||||
- name: Test bnb import
|
||||
id: import
|
||||
if: always()
|
||||
run: |
|
||||
source activate peft
|
||||
|
||||
python3 -m bitsandbytes
|
||||
python3 -c "import bitsandbytes as bnb"
|
||||
|
||||
- name: Post to Slack
|
||||
if: always()
|
||||
uses: huggingface/hf-workflows/.github/actions/post-slack@3f88d63d3761558a32e8e46fc2a8536e04bb2aea # main from Feb 2025-02-24
|
||||
with:
|
||||
slack_channel: ${{ secrets.BNB_SLACK_CHANNEL_ID }}
|
||||
title: 🤗 Results of bitsandbytes import
|
||||
status: ${{ steps.import.outcome }}
|
||||
slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
|
||||
|
||||
- name: Run examples on multi GPU
|
||||
id: examples_tests
|
||||
if: always()
|
||||
run: |
|
||||
source activate peft
|
||||
make tests_examples_multi_gpu_bnb
|
||||
|
||||
|
||||
- name: Post to Slack
|
||||
if: always()
|
||||
uses: huggingface/hf-workflows/.github/actions/post-slack@3f88d63d3761558a32e8e46fc2a8536e04bb2aea # main from Feb 2025-02-24
|
||||
with:
|
||||
slack_channel: ${{ secrets.BNB_SLACK_CHANNEL_ID }}
|
||||
title: 🤗 Results of bitsandbytes examples tests - multi GPU
|
||||
status: ${{ steps.examples_tests.outcome }}
|
||||
slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
|
||||
|
||||
- name: Run core tests on multi GPU
|
||||
id: core_tests
|
||||
if: always()
|
||||
run: |
|
||||
source activate peft
|
||||
make tests_core_multi_gpu_bnb
|
||||
|
||||
- name: Post to Slack
|
||||
if: always()
|
||||
uses: huggingface/hf-workflows/.github/actions/post-slack@3f88d63d3761558a32e8e46fc2a8536e04bb2aea # main from Feb 2025-02-24
|
||||
with:
|
||||
slack_channel: ${{ secrets.BNB_SLACK_CHANNEL_ID }}
|
||||
title: 🤗 Results of bitsandbytes core tests - multi GPU
|
||||
status: ${{ steps.core_tests.outcome }}
|
||||
slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
|
||||
|
||||
- name: Run transformers tests on multi GPU
|
||||
id: transformers_tests
|
||||
if: always()
|
||||
run: |
|
||||
source activate peft
|
||||
make transformers_tests
|
||||
|
||||
|
||||
- name: Post to Slack
|
||||
if: always()
|
||||
uses: huggingface/hf-workflows/.github/actions/post-slack@3f88d63d3761558a32e8e46fc2a8536e04bb2aea # main from Feb 2025-02-24
|
||||
with:
|
||||
slack_channel: ${{ secrets.BNB_SLACK_CHANNEL_ID }}
|
||||
title: 🤗 Results of bitsandbytes transformers tests - multi GPU
|
||||
status: ${{ steps.transformers_tests.outcome }}
|
||||
slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
|
||||
|
||||
- name: Generate Report
|
||||
if: always()
|
||||
run: |
|
||||
|
.github/workflows/nightly.yml (27 changed lines)
@ -10,14 +10,16 @@ env:
|
||||
IS_GITHUB_CI: "1"
|
||||
# To be able to run tests on CUDA 12.2
|
||||
NVIDIA_DISABLE_REQUIRE: "1"
|
||||
SLACK_API_TOKEN: ${{ secrets.SLACK_API_TOKEN }}
|
||||
SLACK_API_TOKEN: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
|
||||
|
||||
permissions: {}
|
||||
|
||||
jobs:
|
||||
run_all_tests_single_gpu:
|
||||
strategy:
|
||||
fail-fast: false
|
||||
runs-on: [self-hosted, single-gpu, nvidia-gpu, t4, ci]
|
||||
runs-on:
|
||||
group: aws-g6-4xlarge-plus
|
||||
env:
|
||||
CUDA_VISIBLE_DEVICES: "0"
|
||||
TEST_TYPE: "single_gpu"
|
||||
@ -29,12 +31,14 @@ jobs:
|
||||
shell: bash
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
with:
|
||||
persist-credentials: false
|
||||
- name: Pip install
|
||||
run: |
|
||||
source activate peft
|
||||
pip install -e . --no-deps
|
||||
pip install pytest-reportlog
|
||||
|
||||
|
||||
- name: Run common tests on single GPU
|
||||
run: |
|
||||
source activate peft
|
||||
@ -44,7 +48,7 @@ jobs:
|
||||
run: |
|
||||
source activate peft
|
||||
make tests_examples_single_gpu
|
||||
|
||||
|
||||
- name: Run core tests on single GPU
|
||||
run: |
|
||||
source activate peft
|
||||
@ -54,7 +58,7 @@ jobs:
|
||||
run: |
|
||||
source activate peft
|
||||
make tests_regression
|
||||
|
||||
|
||||
- name: Generate Report
|
||||
if: always()
|
||||
run: |
|
||||
@ -64,7 +68,8 @@ jobs:
|
||||
run_all_tests_multi_gpu:
|
||||
strategy:
|
||||
fail-fast: false
|
||||
runs-on: [self-hosted, multi-gpu, nvidia-gpu, t4, ci]
|
||||
runs-on:
|
||||
group: aws-g6-12xlarge-plus
|
||||
env:
|
||||
CUDA_VISIBLE_DEVICES: "0,1"
|
||||
TEST_TYPE: "multi_gpu"
|
||||
@ -76,6 +81,8 @@ jobs:
|
||||
shell: bash
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
with:
|
||||
persist-credentials: false
|
||||
- name: Pip install
|
||||
run: |
|
||||
source activate peft
|
||||
@ -85,22 +92,22 @@ jobs:
|
||||
- name: Run core GPU tests on multi-gpu
|
||||
run: |
|
||||
source activate peft
|
||||
|
||||
|
||||
- name: Run common tests on multi GPU
|
||||
run: |
|
||||
source activate peft
|
||||
make tests_common_gpu
|
||||
|
||||
|
||||
- name: Run examples on multi GPU
|
||||
run: |
|
||||
source activate peft
|
||||
make tests_examples_multi_gpu
|
||||
|
||||
|
||||
- name: Run core tests on multi GPU
|
||||
run: |
|
||||
source activate peft
|
||||
make tests_core_multi_gpu
|
||||
|
||||
|
||||
- name: Generate Report
|
||||
if: always()
|
||||
run: |
|
||||
|
.github/workflows/stale.yml (11 changed lines)
@@ -4,24 +4,31 @@ on:
  schedule:
    - cron: "0 15 * * *"

permissions: {}

jobs:
  close_stale_issues:
    name: Close Stale Issues
    if: github.repository == 'huggingface/peft'
    runs-on: ubuntu-latest
    permissions:
      issues: write
      pull-requests: write
    env:
      GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
    steps:
      - uses: actions/checkout@v3
        with:
          persist-credentials: false

      - name: Setup Python
        uses: actions/setup-python@v4
        with:
          python-version: 3.8
          python-version: 3.11

      - name: Install requirements
        run: |
          pip install PyGithub
      - name: Close stale issues
        run: |
          python scripts/stale.py
          python scripts/stale.py
.github/workflows/test-docker-build.yml (19 changed lines)
@ -4,7 +4,10 @@ on:
|
||||
pull_request:
|
||||
paths:
|
||||
# Run only when DockerFile files are modified
|
||||
- "docker/**"
|
||||
- "docker/*/Dockerfile"
|
||||
|
||||
permissions: {}
|
||||
|
||||
jobs:
|
||||
get_changed_files:
|
||||
name: "Build all modified docker images"
|
||||
@ -14,11 +17,13 @@ jobs:
|
||||
steps:
|
||||
- name: Check out code
|
||||
uses: actions/checkout@v3
|
||||
with:
|
||||
persist-credentials: false
|
||||
- name: Get changed files
|
||||
id: changed-files
|
||||
uses: tj-actions/changed-files@1c8e6069583811afb28f97afeaf8e7da80c6be5c #v42
|
||||
with:
|
||||
files: docker/**
|
||||
files: docker/*/Dockerfile
|
||||
json: "true"
|
||||
- name: Run step if only the files listed above change
|
||||
if: steps.changed-files.outputs.any_changed == 'true'
|
||||
@ -26,12 +31,12 @@ jobs:
|
||||
env:
|
||||
ALL_CHANGED_FILES: ${{ steps.changed-files.outputs.all_changed_files }}
|
||||
run: |
|
||||
echo "matrix=${{ steps.changed-files.outputs.all_changed_files}}" >> $GITHUB_OUTPUT
|
||||
echo "matrix=${ALL_CHANGED_FILES}" >> $GITHUB_OUTPUT
|
||||
build_modified_files:
|
||||
needs: get_changed_files
|
||||
name: Build Docker images on modified files
|
||||
runs-on: ubuntu-latest
|
||||
if: ${{ needs.get_changed_files.outputs.matrix }} != ''
|
||||
if: ${{ needs.get_changed_files.outputs.matrix != '[]' }}
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
@ -48,11 +53,13 @@ jobs:
|
||||
sudo du -sh /usr/local/lib/
|
||||
sudo du -sh /usr/share/
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@v1
|
||||
uses: docker/setup-buildx-action@b5ca514318bd6ebac0fb2aedd5d36ec1b5c232a2 # v3.10.0
|
||||
- name: Check out code
|
||||
uses: actions/checkout@v3
|
||||
with:
|
||||
persist-credentials: false
|
||||
- name: Build Docker image
|
||||
uses: docker/build-push-action@v4
|
||||
uses: docker/build-push-action@14487ce63c7a62a4a324b0bfb37086795e31c6c1 # v6.16.0
|
||||
with:
|
||||
file: ${{ matrix.docker-file }}
|
||||
context: .
|
||||
|
.github/workflows/tests-main.yml (15 changed lines)
@@ -6,11 +6,18 @@ on:
    paths-ignore:
      - 'docs/**'

env:
  TRANSFORMERS_IS_CI: 1

permissions: {}

jobs:
  tests:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3
        with:
          persist-credentials: false
      - name: Set up Python 3.11
        uses: actions/setup-python@v4
        with:
@@ -26,3 +33,11 @@ jobs:
      - name: Test with pytest
        run: |
          make test
      - name: Post to Slack
        if: always()
        uses: huggingface/hf-workflows/.github/actions/post-slack@3f88d63d3761558a32e8e46fc2a8536e04bb2aea # main from Feb 2025-02-24
        with:
          slack_channel: ${{ secrets.SLACK_CHANNEL_ID }}
          title: 🤗 Results of transformers main tests
          status: ${{ job.status }}
          slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
.github/workflows/tests.yml (89 changed lines)
@ -9,15 +9,23 @@ on:
|
||||
paths-ignore:
|
||||
- 'docs/**'
|
||||
|
||||
env:
|
||||
HF_HOME: .cache/huggingface
|
||||
TRANSFORMERS_IS_CI: 1
|
||||
|
||||
permissions: {}
|
||||
|
||||
jobs:
|
||||
check_code_quality:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
with:
|
||||
persist-credentials: false
|
||||
- name: Set up Python
|
||||
uses: actions/setup-python@v4
|
||||
with:
|
||||
python-version: "3.8"
|
||||
python-version: "3.11"
|
||||
cache: "pip"
|
||||
cache-dependency-path: "setup.py"
|
||||
- name: Install dependencies
|
||||
@ -31,12 +39,34 @@ jobs:
|
||||
tests:
|
||||
needs: check_code_quality
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
python-version: ["3.8", "3.9", "3.10", "3.11"]
|
||||
os: ["ubuntu-latest", "macos-12", "windows-latest"]
|
||||
python-version: ["3.9", "3.10", "3.11", "3.12"]
|
||||
os: ["ubuntu-latest", "macos-13", "windows-latest"]
|
||||
runs-on: ${{ matrix.os }}
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
with:
|
||||
persist-credentials: false
|
||||
- name: Model cache
|
||||
uses: actions/cache/restore@v4
|
||||
with:
|
||||
# Avoid caching HF_HOME/modules and Python cache files to prevent interoperability
# issues and potential cache poisoning. We also avoid lock files to prevent runs
# from skipping re-download because they see a lock file.
|
||||
path: |
|
||||
${{ env.HF_HOME }}/hub/**
|
||||
!${{ env.HF_HOME }}/**/*.pyc
|
||||
key: model-cache-${{ github.run_id }}
|
||||
restore-keys: model-cache-
|
||||
enableCrossOsArchive: true
|
||||
- name: Dump cache content
|
||||
# TODO: remove this step after 2025-02-15
|
||||
if: matrix.os != 'windows-latest'
|
||||
run: |
|
||||
SHASUM=sha256sum
|
||||
[ -f "$(which shasum)" ] && SHASUM=shasum
|
||||
find "${{ env.HF_HOME }}/hub" -type f -exec "$SHASUM" {} \; > cache_content_initial || true
|
||||
- name: Set up Python ${{ matrix.python-version }}
|
||||
uses: actions/setup-python@v4
|
||||
with:
|
||||
@ -46,8 +76,59 @@ jobs:
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
python -m pip install --upgrade pip
|
||||
pip install setuptools
|
||||
# cpu version of pytorch
|
||||
pip install -e .[test]
|
||||
- name: Test with pytest
|
||||
- name: Downgrade numpy on MacOS and Windows
|
||||
# TODO: remove numpy downgrade on MacOS & Windows once torch fixes numpy 2.0 issue
|
||||
shell: bash
|
||||
if: matrix.os == 'windows-latest' || matrix.os == 'macos-13'
|
||||
run: |
|
||||
pip install --force-reinstall -U "numpy<2.0.0"
|
||||
- name: Test with pytest
|
||||
# MacOS tests are currently too flaky and will fail almost each time. Thus, continue (green checkmark) even if
|
||||
# they fail, but add a notice so that the failure is not completely silent
|
||||
continue-on-error: ${{ matrix.os == 'macos-13' }}
|
||||
shell: bash
|
||||
run: |
|
||||
set +e
|
||||
make test
|
||||
status=$?
|
||||
# Post a notice only if this is macOS AND tests failed
|
||||
if [ "$status" -ne 0 ] && [ "${{ matrix.os }}" = "macos-13" ]; then
|
||||
{
|
||||
echo "## ⚠️ macOS tests failed"
|
||||
echo ""
|
||||
echo "- OS: ${{ matrix.os }}"
|
||||
echo "- Python: ${{ matrix.python-version }}"
|
||||
echo ""
|
||||
echo "Check the logs from this step for details."
|
||||
} >> "$GITHUB_STEP_SUMMARY"
|
||||
fi
|
||||
# Return the real status. On macOS this won't fail the job because of continue-on-error.
|
||||
exit $status
|
||||
- name: Dump cache content and diff
|
||||
# This is just debug info so that we can monitor if the model cache diverges substantially
|
||||
# over time and what the diverging model is.
|
||||
# TODO: remove after 2025-02-15
|
||||
if: matrix.os != 'windows-latest'
|
||||
run: |
|
||||
SHASUM=sha256sum
|
||||
[ -f "$(which shasum)" ] && SHASUM=shasum
|
||||
find "${{ env.HF_HOME }}/hub" -type f -exec "$SHASUM" {} \; > cache_content_after || true
|
||||
diff -udp cache_content_initial cache_content_after || true
|
||||
- name: Delete old model cache entries
|
||||
run: |
|
||||
# make sure that cache cleaning doesn't break the pipeline
|
||||
python scripts/ci_clean_cache.py -d || true
|
||||
- name: Update model cache
|
||||
uses: actions/cache/save@v4
|
||||
# Only let one runner (preferably the one that covers most tests) update the model cache
|
||||
# after *every* run. This way we make sure that our cache is never outdated and we don't
|
||||
# have to keep track of hashes.
|
||||
if: always() && matrix.os == 'ubuntu-latest' && matrix.python-version == '3.10'
|
||||
with:
|
||||
path: |
|
||||
${{ env.HF_HOME }}/hub/**
|
||||
!${{ env.HF_HOME }}/**/*.pyc
|
||||
key: model-cache-${{ github.run_id }}
|
||||
|
.github/workflows/torch_compile_tests.yml (43 changed lines)
@ -1,7 +1,5 @@
|
||||
name: torch compile tests
|
||||
|
||||
# see peft/tests/__init__.py
|
||||
|
||||
on:
|
||||
workflow_dispatch:
|
||||
inputs:
|
||||
@ -13,31 +11,46 @@ on:
|
||||
required: false
|
||||
default: false
|
||||
|
||||
env:
|
||||
RUN_SLOW: "yes"
|
||||
IS_GITHUB_CI: "1"
|
||||
# To be able to run tests on CUDA 12.2
|
||||
NVIDIA_DISABLE_REQUIRE: "1"
|
||||
|
||||
permissions: {}
|
||||
|
||||
jobs:
|
||||
run_tests_with_compile:
|
||||
runs-on: ubuntu-latest
|
||||
runs-on:
|
||||
group: aws-g6-4xlarge-plus
|
||||
env:
|
||||
PEFT_DEBUG_WITH_TORCH_COMPILE: 1
|
||||
CUDA_VISIBLE_DEVICES: "0"
|
||||
TEST_TYPE: "single_gpu_huggingface/peft-gpu-bnb-latest:latest"
|
||||
USE_PYTORCH_NIGHTLY: "${{ github.event.inputs.pytorch_nightly }}"
|
||||
container:
|
||||
image: "huggingface/peft-gpu-bnb-latest:latest"
|
||||
options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
|
||||
defaults:
|
||||
run:
|
||||
shell: bash
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
with:
|
||||
ref: ${{ github.event.inputs.branch }}
|
||||
repository: ${{ github.event.pull_request.head.repo.full_name }}
|
||||
- name: Set up Python
|
||||
uses: actions/setup-python@v4
|
||||
with:
|
||||
python-version: "3.10"
|
||||
cache: "pip"
|
||||
cache-dependency-path: "setup.py"
|
||||
- name: Install dependencies
|
||||
persist-credentials: false
|
||||
- name: Pip install
|
||||
run: |
|
||||
python -m pip install --upgrade pip
|
||||
python -m pip install .[test]
|
||||
if [ "${{ github.event.inputs.pytorch_nightly }}" = "true" ]; then
|
||||
source activate peft
|
||||
pip install -e . --no-deps
|
||||
pip install pytest-cov pytest-reportlog parameterized datasets scipy einops
|
||||
pip install "pytest>=7.2.0,<8.0.0" # see: https://github.com/huggingface/transformers/blob/ce4fff0be7f6464d713f7ac3e0bbaafbc6959ae5/setup.py#L148C6-L148C26
|
||||
if [ "${USE_PYTORCH_NIGHTLY}" = "true" ]; then
|
||||
python -m pip install --upgrade --pre torch --index-url https://download.pytorch.org/whl/nightly/cpu
|
||||
fi
|
||||
- name: Test compile with pytest
|
||||
run: |
|
||||
source activate peft
|
||||
echo "PEFT_DEBUG_WITH_TORCH_COMPILE=$PEFT_DEBUG_WITH_TORCH_COMPILE"
|
||||
git status
|
||||
make test
|
||||
make tests_torch_compile
|
||||
|
.github/workflows/trufflehog.yml (new file, 18 lines)
@@ -0,0 +1,18 @@
on:
  push:

name: Secret Leaks

permissions: {}

jobs:
  trufflehog:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout code
        uses: actions/checkout@v4
        with:
          fetch-depth: 0
          persist-credentials: false
      - name: Secret Scanning
        uses: trufflesecurity/trufflehog@d722a7e50645c42123e31fe97761a88ade988db8 # v3.88.25
@@ -6,6 +6,8 @@ on:
    types:
      - completed

permissions: {}

jobs:
  build:
    uses: huggingface/doc-builder/.github/workflows/upload_pr_documentation.yml@main
@@ -13,4 +15,4 @@ jobs:
      package_name: peft
    secrets:
      hf_token: ${{ secrets.HF_DOC_BUILD_PUSH }}
      comment_bot_token: ${{ secrets.COMMENT_BOT_TOKEN }}
      comment_bot_token: ${{ secrets.COMMENT_BOT_TOKEN }}
.github/workflows/zizmor.yaml (new file, 28 lines)
@@ -0,0 +1,28 @@
name: CI security linting

on:
  push:
    branches: ["main"]
  pull_request:
    branches: ["*"]
    paths:
      - '.github/**'

permissions: {}

jobs:
  zizmor:
    name: zizmor latest via Cargo
    runs-on: ubuntu-latest
    permissions:
      contents: read
      security-events: write
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
        with:
          persist-credentials: false
      - name: Install zizmor
        run: cargo install --locked zizmor
      - name: Run zizmor
        run: zizmor .github/workflows
.github/zizmor.yml (new file, 24 lines)
@@ -0,0 +1,24 @@
rules:
  dangerous-triggers:
    ignore:
      # this workflow is only triggered after maintainer approval
      - upload_pr_documentation.yml:3:1
  cache-poisoning:
    ignore:
      # the docker buildx binary is cached and zizmor warns about a cache poisoning attack.
      # OTOH this cache would make us more resilient against an intrusion on docker-buildx' side.
      # There is no obvious benefit so we leave it as it is.
      - build_docker_images.yml:37:9
      - build_docker_images.yml:70:9
      - build_docker_images.yml:103:9
      - build_docker_images.yml:136:9
      - build_docker_images.yml:169:9
  unpinned-images:
    ignore:
      # We want to test these images with the latest version and we're not using them
      # to deploy anything so we deem it safe to use those, even if they are unpinned.
      - nightly-bnb.yml:30:7
      - nightly-bnb.yml:155:7
      - nightly.yml:27:7
      - nightly.yml:77:7
      - torch_compile_tests.yml:32:7
.gitignore (4 changed lines)
@@ -139,3 +139,7 @@ dmypy.json

# More test things
wandb

# method_comparison logs
method_comparison/MetaMathQA/cancelled_results/
method_comparison/MetaMathQA/temporary_results/
@@ -1,13 +1,13 @@
repos:
  - repo: https://github.com/astral-sh/ruff-pre-commit
    rev: v0.2.1
    rev: v0.12.8
    hooks:
      - id: ruff
        args:
          - --fix
      - id: ruff-format
  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v4.5.0
    rev: v4.6.0
    hooks:
      - id: check-merge-conflict
      - id: check-yaml
Makefile (17 changed lines)
@@ -6,13 +6,13 @@ check_dirs := src tests examples docs scripts docker

# this target runs checks on all files
quality:
    ruff $(check_dirs)
    ruff check $(check_dirs)
    ruff format --check $(check_dirs)
    doc-builder style src/peft tests docs/source --max_len 119 --check_only

# Format source code automatically and check if there are any problems left that need manual fixing
style:
    ruff $(check_dirs) --fix
    ruff check --fix $(check_dirs)
    ruff format $(check_dirs)
    doc-builder style src/peft tests docs/source --max_len 119

@@ -31,9 +31,14 @@ tests_core_multi_gpu:
tests_core_single_gpu:
    python -m pytest -m single_gpu_tests tests/test_common_gpu.py $(if $(IS_GITHUB_CI),--report-log "core_single_gpu.log",)

# exclude gemma tests, as generation fails with torch.compile, these failures
# trigger side effects that make other tests fail with 'RuntimeError: Offset
# increment outside graph capture encountered unexpectedly.'
# TODO re-enable gemma once/if it is fixed
tests_common_gpu:
    python -m pytest tests/test_decoder_models.py $(if $(IS_GITHUB_CI),--report-log "common_decoder.log",)
    python -m pytest tests/test_decoder_models.py -k "not gemma" $(if $(IS_GITHUB_CI),--report-log "common_decoder.log",)
    python -m pytest tests/test_encoder_decoder_models.py $(if $(IS_GITHUB_CI),--report-log "common_encoder_decoder.log",)
    python -m pytest tests/test_gptqmodel.py $(if $(IS_GITHUB_CI),--report-log "gptqmodel_gpu.log",)

tests_examples_multi_gpu_bnb:
    python -m pytest -m "multi_gpu_tests and bitsandbytes" tests/test_gpu_examples.py $(if $(IS_GITHUB_CI),--report-log "multi_gpu_examples.log",)

@@ -47,9 +52,15 @@ tests_core_multi_gpu_bnb:
tests_core_single_gpu_bnb:
    python -m pytest -m "single_gpu_tests and bitsandbytes" tests/test_common_gpu.py $(if $(IS_GITHUB_CI),--report-log "core_single_gpu.log",)

tests_gpu_bnb_regression:
    python -m pytest tests/bnb/test_bnb_regression.py $(if $(IS_GITHUB_CI),--report-log "bnb_regression_gpu.log",)

# For testing transformers tests for bnb runners
transformers_tests:
    RUN_SLOW=1 python -m pytest transformers-clone/tests/quantization/bnb $(if $(IS_GITHUB_CI),--report-log "transformers_tests.log",)

tests_regression:
    python -m pytest -s --regression tests/regression/ $(if $(IS_GITHUB_CI),--report-log "regression_tests.log",)

tests_torch_compile:
    python -m pytest tests/test_torch_compile.py $(if $(IS_GITHUB_CI),--report-log "compile_tests.log",)
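The `-m` filters in these Makefile targets select tests by pytest marker (`single_gpu_tests`, `multi_gpu_tests`, `bitsandbytes`). As a minimal sketch of how a test opts into such a suite, assuming the marker names from the targets above but with a made-up test body:

```python
# Illustrative only: a test selected by `pytest -m single_gpu_tests`.
# The marker name comes from the Makefile targets above; the test itself is hypothetical.
import pytest
import torch


@pytest.mark.single_gpu_tests
def test_lora_forward_on_gpu():
    if not torch.cuda.is_available():
        pytest.skip("requires a CUDA device")
    x = torch.randn(2, 4, device="cuda")
    assert x.shape == (2, 4)
```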
README.md (73 changed lines)
@@ -39,38 +39,43 @@ pip install peft
Prepare a model for training with a PEFT method such as LoRA by wrapping the base model and PEFT configuration with `get_peft_model`. For the bigscience/mt0-large model, you're only training 0.19% of the parameters!

```python
from transformers import AutoModelForSeq2SeqLM
from peft import get_peft_config, get_peft_model, LoraConfig, TaskType
model_name_or_path = "bigscience/mt0-large"
tokenizer_name_or_path = "bigscience/mt0-large"
from transformers import AutoModelForCausalLM
from peft import LoraConfig, TaskType, get_peft_model

device = torch.accelerator.current_accelerator().type if hasattr(torch, "accelerator") else "cuda"
model_id = "Qwen/Qwen2.5-3B-Instruct"
model = AutoModelForCausalLM.from_pretrained(model_id, device_map=device)
peft_config = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM, inference_mode=False, r=8, lora_alpha=32, lora_dropout=0.1
    r=16,
    lora_alpha=32,
    task_type=TaskType.CAUSAL_LM,
    # target_modules=["q_proj", "v_proj", ...] # optionally indicate target modules
)

model = AutoModelForSeq2SeqLM.from_pretrained(model_name_or_path)
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()
"trainable params: 2359296 || all params: 1231940608 || trainable%: 0.19151053100118282"
# prints: trainable params: 3,686,400 || all params: 3,089,625,088 || trainable%: 0.1193

# now perform training on your dataset, e.g. using transformers Trainer, then save the model
model.save_pretrained("qwen2.5-3b-lora")
```
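For the training step referenced in the comment above, here is a minimal sketch using the transformers `Trainer`; the dataset choice, tokenization, and hyperparameters are illustrative placeholders, not part of the official example:

```python
# Rough sketch of "now perform training", assuming `model` is the get_peft_model(...) output above.
# Dataset and hyperparameters below are placeholder assumptions.
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorForLanguageModeling, Trainer, TrainingArguments

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-3B-Instruct")
dataset = load_dataset("imdb", split="train[:1%]")  # placeholder dataset
dataset = dataset.map(lambda x: tokenizer(x["text"], truncation=True, max_length=512), batched=True)

trainer = Trainer(
    model=model,  # the PEFT-wrapped model from the snippet above
    args=TrainingArguments(output_dir="qwen2.5-3b-lora", per_device_train_batch_size=1, num_train_epochs=1),
    train_dataset=dataset,
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),  # causal LM labels from input_ids
)
trainer.train()
model.save_pretrained("qwen2.5-3b-lora")
```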
To load a PEFT model for inference:

```py
from peft import AutoPeftModelForCausalLM
from transformers import AutoTokenizer
import torch
```python
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

model = AutoPeftModelForCausalLM.from_pretrained("ybelkada/opt-350m-lora").to("cuda")
tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m")
device = torch.accelerator.current_accelerator().type if hasattr(torch, "accelerator") else "cuda"
model_id = "Qwen/Qwen2.5-3B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, device_map=device)
model = PeftModel.from_pretrained(model, "qwen2.5-3b-lora")

model.eval()
inputs = tokenizer("Preheat the oven to 350 degrees and place the cookie dough", return_tensors="pt")
outputs = model.generate(**inputs.to(device), max_new_tokens=50)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

outputs = model.generate(input_ids=inputs["input_ids"].to("cuda"), max_new_tokens=50)
print(tokenizer.batch_decode(outputs, skip_special_tokens=True)[0])

"Preheat the oven to 350 degrees and place the cookie dough in the center of the oven. In a large bowl, combine the flour, baking powder, baking soda, salt, and cinnamon. In a separate bowl, combine the egg yolks, sugar, and vanilla."
# prints something like: Preheat the oven to 350 degrees and place the cookie dough in a baking dish [...]
```

## Why you should use PEFT
@@ -124,6 +129,32 @@ The iterative diffusion process consumes a lot of memory which can make it difficult
> [!TIP]
> Take a look at the [examples/lora_dreambooth/train_dreambooth.py](examples/lora_dreambooth/train_dreambooth.py) training script to try training your own Stable Diffusion model with LoRA, and play around with the [smangrul/peft-lora-sd-dreambooth](https://huggingface.co/spaces/smangrul/peft-lora-sd-dreambooth) Space which is running on a T4 instance. Learn more about the PEFT integration in Diffusers in this [tutorial](https://huggingface.co/docs/peft/main/en/tutorial/peft_integrations#diffusers).
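To give a sense of the Diffusers side, here is a hedged sketch of loading a PEFT-trained LoRA into a text-to-image pipeline; the base checkpoint and adapter path below are placeholder assumptions:

```python
# Sketch of using a LoRA adapter with Diffusers; model and adapter IDs are placeholders.
import torch
from diffusers import AutoPipelineForText2Image

pipe = AutoPipelineForText2Image.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16
).to("cuda")
pipe.load_lora_weights("path/to/lora-adapter")  # PEFT-trained LoRA weights
image = pipe("a photo of a sks dog in a bucket").images[0]
image.save("out.png")
```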
|
||||
### Transformers
|
||||
|
||||
PEFT is directly integrated with [Transformers](https://huggingface.co/docs/transformers/main/en/peft). After loading a model, call `add_adapter` to add a new PEFT adapter to the model:
|
||||
|
||||
```python
|
||||
from peft import LoraConfig
|
||||
model = ... # transformers model
|
||||
peft_config = LoraConfig(...)
|
||||
model.add_adapter(lora_config, adapter_name="lora_1")
|
||||
```
|
||||
|
||||
To load a trained PEFT adapter, call `load_adapter`:
|
||||
|
||||
```python
|
||||
model = ... # transformers model
|
||||
model.load_adapter(<path-to-adapter>, adapter_name="lora_1")
|
||||
```
|
||||
|
||||
And to switch between different adapters, call `set_adapter`:
|
||||
|
||||
```python
|
||||
model.set_adapter("lora_2")
|
||||
```
|
||||
|
||||
The Transformers integration doesn't include all the functionalities offered in PEFT, such as methods for merging the adapter into the base model.
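For merging, you can instead load the adapter with PEFT directly. A minimal sketch, assuming the LoRA adapter saved as `qwen2.5-3b-lora` in the quickstart above (the output directory name is just an example):

```python
from transformers import AutoModelForCausalLM
from peft import PeftModel

base_model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-3B-Instruct")
peft_model = PeftModel.from_pretrained(base_model, "qwen2.5-3b-lora")

# fold the adapter weights into the base weights and return a plain transformers model
merged_model = peft_model.merge_and_unload()
merged_model.save_pretrained("qwen2.5-3b-merged")
```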

### Accelerate

[Accelerate](https://huggingface.co/docs/accelerate/index) is a library for distributed training and inference on various training setups and hardware (GPUs, TPUs, Apple Silicon, etc.). PEFT models work with Accelerate out of the box, making it really convenient to train really large models or use them for inference on consumer hardware with limited resources.
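As a rough sketch (assuming `model` is a `PeftModel` and the optimizer and dataloader are already set up), the training loop looks the same as for any other PyTorch model:

```python
from accelerate import Accelerator

accelerator = Accelerator()
model, optimizer, train_dataloader = accelerator.prepare(model, optimizer, train_dataloader)

for batch in train_dataloader:
    outputs = model(**batch)
    accelerator.backward(outputs.loss)
    optimizer.step()
    optimizer.zero_grad()
```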

@@ -150,9 +181,9 @@ To use 🤗 PEFT in your publication, please cite it by using the following BibT

```bibtex
@Misc{peft,
  title = {PEFT: State-of-the-art Parameter-Efficient Fine-Tuning methods},
  title = {{PEFT}: State-of-the-art Parameter-Efficient Fine-Tuning methods},
  author = {Sourab Mangrulkar and Sylvain Gugger and Lysandre Debut and Younes Belkada and Sayak Paul and Benjamin Bossan},
  howpublished = {\url{https://github.com/huggingface/peft}},
  year = {2022}
}
```

@@ -1,11 +1,8 @@
# PEFT Docker images

Here we store all PEFT Docker images used in our testing infrastructure. We use python 3.8 for now on all our images.
Here we store all PEFT Docker images used in our testing infrastructure. We use python 3.11 for now on all our images.

- `peft-cpu`: PEFT compiled on CPU with all other HF libraries installed on main branch
- `peft-gpu`: PEFT complied for NVIDIA GPUs wih all other HF libraries installed on main branch
- `peft-gpu`: PEFT compiled for NVIDIA GPUs with all other HF libraries installed on main branch
- `peft-gpu-bnb-source`: PEFT compiled for NVIDIA GPUs with `bitsandbytes` and all other HF libraries installed from main branch
- `peft-gpu-bnb-latest`: PEFT compiled for NVIDIA GPUs with `bitsandbytes` compiled from main and all other HF libraries installed from the latest PyPI
- `peft-gpu-bnb-multi-source`: PEFT compiled for NVIDIA GPUs with `bitsandbytes` compiled from the `multi-backend` branch and all other HF libraries installed from main branch

`peft-gpu-bnb-source` and `peft-gpu-bnb-multi-source` are essentially the same, with the only difference being `bitsandbytes` compiled on another branch. Make sure to propagate the changes you applied on one file to the other!
@@ -4,7 +4,7 @@
# Use base conda image to reduce time
FROM continuumio/miniconda3:latest AS compile-image
# Specify py version
ENV PYTHON_VERSION=3.8
ENV PYTHON_VERSION=3.11
# Install apt libs - copied from https://github.com/huggingface/accelerate/blob/main/docker/accelerate-gpu/Dockerfile
RUN apt-get update && \
    apt-get install -y curl git wget software-properties-common git-lfs && \

@@ -4,7 +4,7 @@
# Use base conda image to reduce time
FROM continuumio/miniconda3:latest AS compile-image
# Specify py version
ENV PYTHON_VERSION=3.8
ENV PYTHON_VERSION=3.11
# Install apt libs - copied from https://github.com/huggingface/accelerate/blob/main/docker/accelerate-gpu/Dockerfile
RUN apt-get update && \
    apt-get install -y curl git wget software-properties-common git-lfs && \

@@ -31,7 +31,7 @@ RUN chsh -s /bin/bash
SHELL ["/bin/bash", "-c"]

# Stage 2
FROM nvidia/cuda:12.1.0-devel-ubuntu22.04 AS build-image
FROM nvidia/cuda:12.6.3-devel-ubuntu22.04 AS build-image
COPY --from=compile-image /opt/conda /opt/conda
ENV PATH /opt/conda/bin:$PATH

@@ -56,7 +56,7 @@ RUN source activate peft && \
    peft \
    optimum \
    auto-gptq && \
    git clone https://github.com/TimDettmers/bitsandbytes && cd bitsandbytes && \
    git clone https://github.com/bitsandbytes-foundation/bitsandbytes && cd bitsandbytes && \
    cmake -B . -DCOMPUTE_BACKEND=cuda -S . && \
    cmake --build . && \
    pip install -e . && \
@@ -1,68 +0,0 @@
# Builds GPU docker image of PyTorch
# Uses multi-staged approach to reduce size

# Stage 1
# Use base conda image to reduce time
FROM continuumio/miniconda3:latest AS compile-image
# Specify py version
ENV PYTHON_VERSION=3.8
# Install apt libs - copied from https://github.com/huggingface/accelerate/blob/main/docker/accelerate-gpu/Dockerfile
RUN apt-get update && \
    apt-get install -y curl git wget software-properties-common git-lfs && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists*

# Install audio-related libraries
RUN apt-get update && \
    apt install -y ffmpeg

RUN apt install -y libsndfile1-dev
RUN git lfs install

# Create our conda env - copied from https://github.com/huggingface/accelerate/blob/main/docker/accelerate-gpu/Dockerfile
RUN conda create --name peft python=${PYTHON_VERSION} ipython jupyter pip
RUN python3 -m pip install --no-cache-dir --upgrade pip

# Below is copied from https://github.com/huggingface/accelerate/blob/main/docker/accelerate-gpu/Dockerfile
# We don't install pytorch here yet since CUDA isn't available
# instead we use the direct torch wheel
ENV PATH /opt/conda/envs/peft/bin:$PATH
# Activate our bash shell
RUN chsh -s /bin/bash
SHELL ["/bin/bash", "-c"]

# Stage 2
FROM nvidia/cuda:12.1.0-devel-ubuntu22.04 AS build-image
COPY --from=compile-image /opt/conda /opt/conda
ENV PATH /opt/conda/bin:$PATH

RUN chsh -s /bin/bash
SHELL ["/bin/bash", "-c"]

# Install apt libs
RUN apt-get update && \
    apt-get install -y curl git wget cmake && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists*

# Activate the conda env and install transformers + accelerate from source
# Also clone BNB and build it from source.
RUN source activate peft && \
    python3 -m pip install -U --no-cache-dir \
    librosa \
    "soundfile>=0.12.1" \
    scipy \
    git+https://github.com/huggingface/transformers \
    git+https://github.com/huggingface/accelerate \
    peft[test]@git+https://github.com/huggingface/peft \
    optimum \
    auto-gptq && \
    git clone https://github.com/TimDettmers/bitsandbytes && cd bitsandbytes && git checkout multi-backend-refactor && \
    cmake -B . -DCOMPUTE_BACKEND=cuda -S . && \
    cmake --build . && \
    pip install -e . && \
    pip freeze | grep bitsandbytes

RUN echo "source activate peft" >> ~/.profile

# Activate the virtualenv
CMD ["/bin/bash"]
@@ -4,7 +4,7 @@
# Use base conda image to reduce time
FROM continuumio/miniconda3:latest AS compile-image
# Specify py version
ENV PYTHON_VERSION=3.8
ENV PYTHON_VERSION=3.11
# Install apt libs - copied from https://github.com/huggingface/accelerate/blob/main/docker/accelerate-gpu/Dockerfile
RUN apt-get update && \
    apt-get install -y curl git wget software-properties-common git-lfs && \

@@ -31,7 +31,7 @@ RUN chsh -s /bin/bash
SHELL ["/bin/bash", "-c"]

# Stage 2
FROM nvidia/cuda:12.1.0-devel-ubuntu22.04 AS build-image
FROM nvidia/cuda:12.6.3-devel-ubuntu22.04 AS build-image
COPY --from=compile-image /opt/conda /opt/conda
ENV PATH /opt/conda/bin:$PATH

@@ -56,7 +56,7 @@ RUN source activate peft && \
    peft[test]@git+https://github.com/huggingface/peft \
    optimum \
    auto-gptq && \
    git clone https://github.com/TimDettmers/bitsandbytes && cd bitsandbytes && \
    git clone https://github.com/bitsandbytes-foundation/bitsandbytes && cd bitsandbytes && \
    cmake -B . -DCOMPUTE_BACKEND=cuda -S . && \
    cmake --build . && \
    pip install -e . && \

@@ -4,23 +4,18 @@
# Use base conda image to reduce time
FROM continuumio/miniconda3:latest AS compile-image
# Specify py version
ENV PYTHON_VERSION=3.8
ENV PYTHON_VERSION=3.11
# Install apt libs - copied from https://github.com/huggingface/accelerate/blob/main/docker/accelerate-gpu/Dockerfile
# Install audio-related libraries
RUN apt-get update && \
    apt-get install -y curl git wget software-properties-common git-lfs && \
    apt-get install -y curl git wget software-properties-common git-lfs ffmpeg libsndfile1-dev && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists*

# Install audio-related libraries
RUN apt-get update && \
    apt install -y ffmpeg

RUN apt install -y libsndfile1-dev
RUN git lfs install

# Create our conda env - copied from https://github.com/huggingface/accelerate/blob/main/docker/accelerate-gpu/Dockerfile
RUN conda create --name peft python=${PYTHON_VERSION} ipython jupyter pip
RUN python3 -m pip install --no-cache-dir --upgrade pip

# Below is copied from https://github.com/huggingface/accelerate/blob/main/docker/accelerate-gpu/Dockerfile
# We don't install pytorch here yet since CUDA isn't available
@@ -31,29 +26,24 @@ RUN chsh -s /bin/bash
SHELL ["/bin/bash", "-c"]

# Stage 2
FROM nvidia/cuda:12.2.2-devel-ubuntu22.04 AS build-image
FROM nvidia/cuda:12.4.1-devel-ubuntu22.04 AS build-image
COPY --from=compile-image /opt/conda /opt/conda
ENV PATH /opt/conda/bin:$PATH

RUN chsh -s /bin/bash
SHELL ["/bin/bash", "-c"]
RUN source activate peft && \
    python3 -m pip install --no-cache-dir bitsandbytes optimum auto-gptq

# Add autoawq for quantization testing
RUN source activate peft && \
    python3 -m pip install --no-cache-dir https://github.com/casper-hansen/AutoAWQ/releases/download/v0.2.4/autoawq-0.2.4-cp38-cp38-linux_x86_64.whl
RUN source activate peft && \
    python3 -m pip install --no-cache-dir https://github.com/casper-hansen/AutoAWQ_kernels/releases/download/v0.0.6/autoawq_kernels-0.0.6-cp38-cp38-linux_x86_64.whl

# Install apt libs
RUN apt-get update && \
    apt-get install -y curl git wget && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists*

# Add eetq for quantization testing
RUN source activate peft && \
RUN chsh -s /bin/bash
SHELL ["/bin/bash", "-c"]
RUN source activate peft && \
    python3 -m pip install --no-cache-dir bitsandbytes optimum auto-gptq && \
    # Add autoawq for quantization testing
    python3 -m pip install --no-cache-dir https://github.com/casper-hansen/AutoAWQ/releases/download/v0.2.7.post2/autoawq-0.2.7.post2-py3-none-any.whl && \
    python3 -m pip install --no-cache-dir https://github.com/casper-hansen/AutoAWQ_kernels/releases/download/v0.0.9/autoawq_kernels-0.0.9-cp311-cp311-linux_x86_64.whl && \
    # Add eetq for quantization testing
    python3 -m pip install git+https://github.com/NetEase-FuXi/EETQ.git

# Activate the conda env and install transformers + accelerate from source
@@ -62,19 +52,16 @@ RUN source activate peft && \
    librosa \
    "soundfile>=0.12.1" \
    scipy \
    torchao \
    git+https://github.com/huggingface/transformers \
    git+https://github.com/huggingface/accelerate \
    peft[test]@git+https://github.com/huggingface/peft
    peft[test]@git+https://github.com/huggingface/peft \
    # Add aqlm for quantization testing
    aqlm[gpu]>=1.0.2 \
    # Add HQQ for quantization testing
    hqq

# Add aqlm for quantization testing
RUN source activate peft && \
    pip install aqlm[gpu]>=1.0.2

# Add HQQ for quantization testing
RUN source activate peft && \
    pip install hqq

RUN source activate peft && \
    pip freeze | grep transformers

RUN echo "source activate peft" >> ~/.profile
@@ -37,6 +37,8 @@
    title: Adapter injection
  - local: developer_guides/mixed_models
    title: Mixed adapter types
  - local: developer_guides/torch_compile
    title: torch.compile
  - local: developer_guides/contributing
    title: Contribute to PEFT
  - local: developer_guides/troubleshooting

@@ -88,6 +90,8 @@
    title: LoKr
  - local: package_reference/lora
    title: LoRA
  - local: package_reference/xlora
    title: X-LoRA
  - local: package_reference/adapter_utils
    title: LyCORIS
  - local: package_reference/multitask_prompt_tuning

@@ -108,12 +112,36 @@
    title: Layernorm tuning
  - local: package_reference/vera
    title: VeRA
  - local: package_reference/helpers
    title: Helpers
  - local: package_reference/fourierft
    title: FourierFT
  - local: package_reference/vblora
    title: VB-LoRA
  - local: package_reference/hra
    title: HRA
  - local: package_reference/cpt
    title: CPT
  - local: package_reference/bone
    title: Bone
  - local: package_reference/trainable_tokens
    title: Trainable Tokens
  - local: package_reference/randlora
    title: RandLora
  - local: package_reference/shira
    title: SHiRA
  - local: package_reference/c3a
    title: C3A
  - local: package_reference/miss
    title: MiSS
  - local: package_reference/road
    title: RoAd
  title: Adapters
- sections:
  - local: package_reference/merge_utils
    title: Model merge
  - local: package_reference/helpers
    title: Helpers
  - local: package_reference/hotswap
    title: Hotswapping adapters
  title: Utilities
  title: API reference
@@ -94,7 +94,7 @@ accelerate launch --config_file "configs/deepspeed_config.yaml" train.py \
--logging_steps 5 \
--log_level "info" \
--logging_strategy "steps" \
--evaluation_strategy "epoch" \
--eval_strategy "epoch" \
--save_strategy "epoch" \
--push_to_hub \
--hub_private_repo True \

@@ -128,24 +128,17 @@ Notice that we are using LoRA with rank=8, alpha=16 and targeting all linear la

Let's dive a little deeper into the script so you can see what's going on, and understand how it works.

The first thing to know is that the script uses DeepSpeed for distributed training as the DeepSpeed config has been passed. The `SFTTrainer` class handles all the heavy lifting of creating the PEFT model using the peft config that is passed. After that, when you call `trainer.train()`, `SFTTrainer` internally uses 🤗 Accelerate to prepare the model, optimizer and trainer using the DeepSpeed config to create DeepSpeed engine which is then trained. The main code snippet is below:
The first thing to know is that the script uses DeepSpeed for distributed training as the DeepSpeed config has been passed. The [`~trl.SFTTrainer`] class handles all the heavy lifting of creating the PEFT model using the peft config that is passed. After that, when you call `trainer.train()`, [`~trl.SFTTrainer`] internally uses 🤗 Accelerate to prepare the model, optimizer and trainer using the DeepSpeed config to create DeepSpeed engine which is then trained. The main code snippet is below:

```python
# trainer
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    processing_class=tokenizer,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    peft_config=peft_config,
    packing=data_args.packing,
    dataset_kwargs={
        "append_concat_token": data_args.append_concat_token,
        "add_special_tokens": data_args.add_special_tokens,
    },
    dataset_text_field=data_args.dataset_text_field,
    max_seq_length=data_args.max_seq_length,
)
trainer.accelerator.print(f"{trainer.model}")

@@ -175,7 +168,7 @@ You can also refer this blog post [Falcon 180B Finetuning using 🤗 PEFT and De
# Use PEFT QLoRA and DeepSpeed with ZeRO3 for finetuning large models on multiple GPUs

In this section, we will look at how to use QLoRA and DeepSpeed Stage-3 for finetuning a 70B Llama model on 2X40GB GPUs.
For this, we first need `bitsandbytes>=0.43.0`, `accelerate>=0.28.0`, `transformers>4.38.2`, `trl>0.7.11` and `peft>0.9.0`. We need to set `zero3_init_flag` to true when using Accelerate config. Below is the config which can be found at [deepspeed_config_z3_qlora.yaml](https://github.com/huggingface/peft/blob/main/examples/sft/configs/deepspeed_config_z3_qlora.yaml):
For this, we first need `bitsandbytes>=0.43.3`, `accelerate>=1.0.1`, `transformers>4.44.2`, `trl>0.11.4` and `peft>0.13.0`. We need to set `zero3_init_flag` to true when using Accelerate config. Below is the config which can be found at [deepspeed_config_z3_qlora.yaml](https://github.com/huggingface/peft/blob/main/examples/sft/configs/deepspeed_config_z3_qlora.yaml):

```yml
compute_environment: LOCAL_MACHINE

@@ -202,7 +195,7 @@ tpu_use_sudo: false
use_cpu: false
```

Launch command is given below which is available at [run_peft_qlora_deepspeed_stage3.sh](https://github.com/huggingface/peft/blob/main/examples/sft/run_peft_deepspeed.sh):
Launch command is given below which is available at [run_peft_qlora_deepspeed_stage3.sh](https://github.com/huggingface/peft/blob/main/examples/sft/run_peft_qlora_deepspeed_stage3.sh):
```
accelerate launch --config_file "configs/deepspeed_config_z3_qlora.yaml" train.py \
--seed 100 \

@@ -217,7 +210,7 @@ accelerate launch --config_file "configs/deepspeed_config_z3_qlora.yaml" train.
--logging_steps 5 \
--log_level "info" \
--logging_strategy "steps" \
--evaluation_strategy "epoch" \
--eval_strategy "epoch" \
--save_strategy "epoch" \
--push_to_hub \
--hub_private_repo True \
@@ -445,3 +438,21 @@ dataset['train'][label_column][:10]=['no complaint', 'no complaint', 'complaint'
1. Merging when using PEFT and DeepSpeed is currently unsupported and will raise an error.
2. When using CPU offloading, the major gains from using PEFT to shrink the optimizer states and gradients to that of the adapter weights would be realized on CPU RAM and there won't be savings with respect to GPU memory.
3. DeepSpeed Stage 3 and QLoRA, when used with CPU offloading, lead to more GPU memory usage when compared to disabling CPU offloading.

<Tip>

💡 When you have code that requires merging (and unmerging) of weights, try to manually collect the parameters with DeepSpeed Zero-3 beforehand:

```python
import deepspeed

is_ds_zero_3 = ...  # check if Zero-3

with deepspeed.zero.GatheredParameters(list(model.parameters()), enabled=is_ds_zero_3):
    model.merge_adapter()
    # do whatever is needed, then unmerge in the same context if unmerging is required
    ...
    model.unmerge_adapter()
```

</Tip>

@@ -74,7 +74,7 @@ accelerate launch --config_file "configs/fsdp_config.yaml" train.py \
--logging_steps 5 \
--log_level "info" \
--logging_strategy "steps" \
--evaluation_strategy "epoch" \
--eval_strategy "epoch" \
--save_strategy "epoch" \
--push_to_hub \
--hub_private_repo True \

@@ -108,24 +108,17 @@ Notice that we are using LoRA with rank=8, alpha=16 and targeting all linear la

Let's dive a little deeper into the script so you can see what's going on, and understand how it works.

The first thing to know is that the script uses FSDP for distributed training as the FSDP config has been passed. The `SFTTrainer` class handles all the heavy lifting of creating PEFT model using the peft config that is passed. After that when you call `trainer.train()`, Trainer internally uses 🤗 Accelerate to prepare model, optimizer and trainer using the FSDP config to create FSDP wrapped model which is then trained. The main code snippet is below:
The first thing to know is that the script uses FSDP for distributed training as the FSDP config has been passed. The [`~trl.SFTTrainer`] class handles all the heavy lifting of creating PEFT model using the peft config that is passed. After that when you call `trainer.train()`, Trainer internally uses 🤗 Accelerate to prepare model, optimizer and trainer using the FSDP config to create FSDP wrapped model which is then trained. The main code snippet is below:

```python
# trainer
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    processing_class=tokenizer,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    peft_config=peft_config,
    packing=data_args.packing,
    dataset_kwargs={
        "append_concat_token": data_args.append_concat_token,
        "add_special_tokens": data_args.add_special_tokens,
    },
    dataset_text_field=data_args.dataset_text_field,
    max_seq_length=data_args.max_seq_length,
)
trainer.accelerator.print(f"{trainer.model}")
if model_args.use_peft_lora:

@@ -173,7 +166,7 @@ In the above example, the memory consumed per GPU is 72-80 GB (90-98%) as seen

In this section, we will look at how to use QLoRA and FSDP for finetuning a 70B Llama model on 2X24GB GPUs. [Answer.AI](https://www.answer.ai/) in collaboration with bitsandbytes and Hugging Face 🤗 open sourced code enabling the usage of FSDP+QLoRA and explained the whole process in their insightful blogpost [You can now train a 70b language model at home](https://www.answer.ai/posts/2024-03-06-fsdp-qlora.html). This is now integrated in the Hugging Face ecosystem.

For this, we first need `bitsandbytes>=0.43.0`, `accelerate>=0.28.0`, `transformers>4.38.2`, `trl>0.7.11` and `peft>0.9.0`. We need to set `fsdp_cpu_ram_efficient_loading=true`, `fsdp_use_orig_params=false` and `fsdp_offload_params=true`(cpu offloading) when using Accelerate config. When not using accelerate launcher, you can alternately set the environment variable `export FSDP_CPU_RAM_EFFICIENT_LOADING=true`. Here, we will be using accelerate config and below is the config which can be found at [fsdp_config_qlora.yaml](https://github.com/huggingface/peft/blob/main/examples/sft/configs/fsdp_config_qlora.yaml):
For this, we first need `bitsandbytes>=0.43.3`, `accelerate>=1.0.1`, `transformers>4.44.2`, `trl>0.11.4` and `peft>0.13.0`. We need to set `fsdp_cpu_ram_efficient_loading=true`, `fsdp_use_orig_params=false` and `fsdp_offload_params=true`(cpu offloading) when using Accelerate config. When not using accelerate launcher, you can alternately set the environment variable `export FSDP_CPU_RAM_EFFICIENT_LOADING=true`. Here, we will be using accelerate config and below is the config which can be found at [fsdp_config_qlora.yaml](https://github.com/huggingface/peft/blob/main/examples/sft/configs/fsdp_config_qlora.yaml):

```yml
compute_environment: LOCAL_MACHINE

@@ -218,7 +211,7 @@ accelerate launch --config_file "configs/fsdp_config_qlora.yaml" train.py \
--logging_steps 5 \
--log_level "info" \
--logging_strategy "steps" \
--evaluation_strategy "epoch" \
--eval_strategy "epoch" \
--save_strategy "epoch" \
--push_to_hub \
--hub_private_repo True \

@@ -249,7 +242,7 @@ accelerate launch --config_file "configs/fsdp_config_qlora.yaml" train.py \
--bnb_4bit_quant_storage_dtype "bfloat16"
```

Notice the new argument being passed, `bnb_4bit_quant_storage_dtype`, which denotes the data type for packing the 4-bit parameters. For example, when it is set to `bfloat16`, **32/4 = 8** 4-bit params are packed together post quantization. When using mixed precision training with `bfloat16`, `bnb_4bit_quant_storage_dtype` can be either `bfloat16` for pure `bfloat16` finetuning, or `float32` for automatic mixed precision (this consumes more GPU memory). When using mixed precision training with `float16`, `bnb_4bit_quant_storage_dtype` should be set to `float32` for stable automatic mixed precision training.
Notice the new argument being passed, `bnb_4bit_quant_storage_dtype`, which denotes the data type for packing the 4-bit parameters. For example, when it is set to `bfloat16`, **16/4 = 4** 4-bit params are packed together post quantization. When using mixed precision training with `bfloat16`, `bnb_4bit_quant_storage_dtype` can be either `bfloat16` for pure `bfloat16` finetuning, or `float32` for automatic mixed precision (this consumes more GPU memory). When using mixed precision training with `float16`, `bnb_4bit_quant_storage_dtype` should be set to `float32` for stable automatic mixed precision training.
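As a rough sketch of how this argument maps onto the quantization config (the model id is a stand-in; the training script wires this up through its own CLI arguments):

```python
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_storage=torch.bfloat16,  # pack 4-bit params in bfloat16 so FSDP can shard them
)

model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-2-70b-hf",  # stand-in model id
    quantization_config=bnb_config,
    torch_dtype=torch.bfloat16,
)
```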
In terms of training code, the important code changes are:

@@ -288,4 +281,5 @@ You can also refer the [llama-recipes](https://github.com/facebookresearch/llama
1. Merging when using PEFT and FSDP is currently unsupported and will raise an error.
2. Passing the `modules_to_save` config parameter is untested at present.
3. GPU memory saving when using CPU offloading is untested at present.
4. When using FSDP+QLoRA, `paged_adamw_8bit` currently results in an error when saving a checkpoint.
5. DoRA training with FSDP should work (albeit at lower speed than LoRA). If combined with bitsandbytes (QDoRA), 4-bit quantization should also work, but 8-bit quantization has known issues and is not recommended.

@@ -50,6 +50,18 @@ In principle, LoRA can be applied to any subset of weight matrices in a neural n
</div>
<small><a href="https://hf.co/papers/2103.10385">Navigating Text-To-Image Customization: From LyCORIS Fine-Tuning to Model Evaluation</a></small>

## Mixture of LoRA Experts (X-LoRA)

[X-LoRA](https://huggingface.co/papers/2402.07148) is a mixture of experts method for LoRA which works by using dense or sparse gating to dynamically activate LoRA experts. The LoRA experts as well as the base model are frozen during training, resulting in a low parameter count as only the gating layers must be trained. In particular, the gating layers output scalings which (depending on config) are granular on the layer and token level. Additionally, during inference, X-LoRA dynamically activates LoRA adapters to recall knowledge and effectively mix them:

The below graphic demonstrates how the scalings change for different prompts for each token. This highlights the activation of different adapters as the generation progresses and the sequence creates new context.



For each step, X-LoRA requires the base model to be run twice: first, to get hidden states without any LoRA adapters, and secondly, the hidden states are used to calculate scalings which are applied to the LoRA adapters and the model is run a second time. The output of the second run is the result of the model step.
Ultimately, X-LoRA allows the model to reflect upon its knowledge because of the dual forward pass scheme, and dynamically reconfigure the architecture.
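To make the mixing step concrete, here is a conceptual sketch (not the actual PEFT implementation) of how the learned scalings weight the LoRA contributions of a single linear layer:

```python
import torch

def xlora_mixed_output(x, base_linear, lora_adapters, scalings):
    # base_linear: the frozen nn.Linear of the base model
    # lora_adapters: list of frozen (A, B) low-rank pairs, A: (r, in), B: (out, r)
    # scalings: one scalar per adapter, produced by the trained gating layers
    out = base_linear(x)
    for (A, B), s in zip(lora_adapters, scalings):
        out = out + s * (x @ A.T @ B.T)
    return out
```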

## Low-Rank Hadamard Product (LoHa)

Low-rank decomposition can impact performance because the weight updates are limited to the low-rank space, which can constrain a model's expressiveness. However, you don't necessarily want to use a larger rank because it increases the number of trainable parameters. To address this, [LoHa](https://huggingface.co/papers/2108.06098) (a method originally developed for computer vision) was applied to diffusion models where the ability to generate diverse images is an important consideration. LoHa should also work with general model types, but the embedding layers aren't currently implemented in PEFT.
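In PEFT, a LoHa adapter is configured much like LoRA. A minimal sketch (the target module names are placeholders that depend on the base architecture):

```python
from peft import LoHaConfig, get_peft_model

config = LoHaConfig(
    r=8,
    alpha=16,
    target_modules=["q_proj", "v_proj"],  # placeholder names, adjust to your model
)
model = get_peft_model(base_model, config)  # base_model: a pretrained transformers model
model.print_trainable_parameters()
```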

@@ -73,19 +85,23 @@ OFT preserves the hyperspherical energy by learning an orthogonal transformation

## Orthogonal Butterfly (BOFT)

[BOFT](https://hf.co/papers/2311.06243) is a method that primarily focuses on preserving a pretrained model's generative performance in the finetuned model. It tries to maintain the same cosine similarity (hyperspherical energy) between all pairwise neurons in a layer because this better captures the semantic information among neurons. This means OFT is more capable at preserving the subject and it is better for controllable generation (similar to [ControlNet](https://huggingface.co/docs/diffusers/using-diffusers/controlnet)).
[BOFT](https://hf.co/papers/2311.06243) is an improved orthogonal finetuning method that focuses on preserving a pretrained model's generative capabilities while being significantly more parameter-efficient than standard OFT. Like OFT, BOFT maintains the same cosine similarity (hyperspherical energy) between all pairwise neurons in a layer by applying an orthogonal transformation to the pretrained weight matrix, ensuring the semantic relationships among neurons are preserved.

OFT preserves the hyperspherical energy by learning an orthogonal transformation for neurons to keep the cosine similarity between them unchanged. In practice, this means taking the matrix product of an orthogonal matrix with the pretrained weight matrix. However, to be parameter-efficient, the orthogonal matrix is represented as a block-diagonal matrix with rank `r` blocks. Whereas LoRA reduces the number of trainable parameters with low-rank structures, OFT reduces the number of trainable parameters with a sparse block-diagonal matrix structure.
Instead of using a block-diagonal orthogonal matrix, BOFT factorizes the orthogonal transformation into a product of **sparse butterfly matrices** (originally introduced in the [Cooley–Tukey FFT](https://en.wikipedia.org/wiki/Cooley%E2%80%93Tukey_FFT_algorithm)). Unlike OFT's block-diagonal rotations, which only mix inputs within each block, the butterfly structure guarantees that every input can influence every output, producing a **dense connectivity** with just `O(d log d)` parameters. This factorization preserves expressivity while drastically reducing the parameter count compared to OFT (at the expense of computation time).

In practice, BOFT multiplies each pretrained weight matrix by a sequence of butterfly-structured orthogonal factors, enabling efficient and expressive neuron rotations. This makes BOFT well-suited for controllable generation and tasks where maintaining the pretrained model's subject representation is critical, while also scaling to larger models with lower memory and compute overhead.

## Adaptive Low-Rank Adaptation (AdaLoRA)

[AdaLoRA](https://hf.co/papers/2303.10512) manages the parameter budget introduced from LoRA by allocating more parameters - in other words, a higher rank `r` - for important weight matrices that are better adapted for a task and pruning less important ones. The rank is controlled by a method similar to singular value decomposition (SVD). The ∆W is parameterized with two orthogonal matrices and a diagonal matrix which contains singular values. This parametrization method avoids iteratively applying SVD which is computationally expensive. Based on this method, the rank of ∆W is adjusted according to an importance score. ∆W is divided into triplets and each triplet is scored according to its contribution to model performance. Triplets with low importance scores are pruned and triplets with high importance scores are kept for finetuning.
Training with AdaLoRA has three phases: the init phase, the budgeting phase and the final phase. In the initial phase, no budgeting is applied, therefore the ranks are not touched. During the budgeting phase the process described above is applied and the rank is redistributed according to a budget, aiming to give more important adapters more rank and less important layers less. When reaching the final phase, budgeting has ended, the ranks are redistributed but we may continue training for a while with the redistributed ranks to further improve performance.
|

## Llama-Adapter

[Llama-Adapter](https://hf.co/papers/2303.16199) is a method for adapting Llama into a instruction-following model. To help adapt the model for instruction-following, the adapter is trained with a 52K instruction-output dataset.
[Llama-Adapter](https://hf.co/papers/2303.16199) is a method for adapting Llama into an instruction-following model. To help adapt the model for instruction-following, the adapter is trained with a 52K instruction-output dataset.

A set of of learnable adaption prompts are prefixed to the input instruction tokens. These are inserted into the upper layers of the model because it is better to learn with the higher-level semantics of the pretrained model. The instruction-output tokens prefixed to the input guide the adaption prompt to generate a contextual response.
A set of learnable adaption prompts are prefixed to the input instruction tokens. These are inserted into the upper layers of the model because it is better to learn with the higher-level semantics of the pretrained model. The instruction-output tokens prefixed to the input guide the adaption prompt to generate a contextual response.

<div class="flex justify-center">
  <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/peft/llama-adapter.png"/>

@@ -93,3 +109,31 @@ A set of of learnable adaption prompts are prefixed to the input instruction tok
<small><a href="https://hf.co/papers/2303.16199">LLaMA-Adapter: Efficient Fine-tuning of Language Models with Zero-init Attention</a></small>
To avoid adding noise to the tokens, the adapter uses zero-initialized attention. On top of this, the adapter adds a learnable gating factor (initialized with zeros) to progressively add information to the model during training. This prevents overwhelming the model's pretrained knowledge with the newly learned instructions.
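A conceptual sketch of the zero-initialized gate (not the PEFT implementation):

```python
import torch
import torch.nn as nn

class ZeroInitGate(nn.Module):
    def __init__(self):
        super().__init__()
        # the gate starts at zero, so the adaption prompts contribute nothing at first
        self.gate = nn.Parameter(torch.zeros(1))

    def forward(self, base_attn_out, adaption_attn_out):
        # information from the adaption prompts is blended in progressively as the gate is learned
        return base_attn_out + self.gate.tanh() * adaption_attn_out
```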

## Householder Reflection Adaptation (HRA)

[HRA](https://huggingface.co/papers/2405.17484) provides a new perspective connecting LoRA to OFT, which means it can harness the advantages of both strategies, reduce parameters and computation costs while penalizing the loss of pre-training knowledge.

<div class="flex justify-center">
  <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/peft/hra.png"/>
</div>
<small><a href="https://huggingface.co/papers/2405.17484">Bridging The Gap between Low-rank and Orthogonal Adaptation via Householder Reflection Adaptation</a></small>

HRA constructs a chain of `r` trainable Householder reflections (HRs). Because the Householder reflection matrix is an orthogonal matrix and the product of orthogonal matrices is also an orthogonal matrix, HRA satisfies the theoretical guarantee of Orthogonal Finetuning (OFT). Meanwhile, HRA can also be viewed as a low-rank fine-tuning adapter by rewriting its formula.
The higher `r`, the more trainable parameters, resulting in a larger model capacity and better performance. Besides, due to the chain structure, the orthogonality of HR planes impacts the capacity and regularity of HRA. To achieve a trade-off between the model capacity and regularity, an orthogonality regularizer of the HR planes is added to the loss function. The weight \\(\lambda\\) can control the strength of the regularizer.
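A conceptual sketch of the chained reflections (not the PEFT implementation); each reflection is orthogonal, and applying `r` of them to the frozen weight is what gives HRA both its orthogonality guarantee and its low-rank interpretation:

```python
import torch

def householder(u):
    # H = I - 2 u u^T / (u^T u) is always an orthogonal matrix
    u = u / u.norm()
    return torch.eye(u.numel()) - 2.0 * torch.outer(u, u)

def hra_adapted_weight(W, us):
    # W: frozen weight of shape (out_features, in_features)
    # us: list of r trainable vectors of size in_features
    H = torch.eye(W.shape[1])
    for u in us:
        H = H @ householder(u)
    return W @ H
```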

## Bone
Bone has been superseded by [MiSS](https://huggingface.co/papers/2409.15371), the new version of the paper (MiSS: Balancing LoRA Performance and Efficiency with Simple Shard Sharing).
If you already have a Bone checkpoint, you can use `/scripts/convert-bone-to-miss.py` to convert it into a MiSS checkpoint and proceed with training using MiSS.

## MiSS
[MiSS](https://huggingface.co/papers/2409.15371) (Matrix Shard Sharing) is a novel Parameter-Efficient Fine-Tuning (PEFT) method designed to address the trade-off between adaptability and efficiency in Large Language Models. The core approach of MiSS involves a simple shard-sharing mechanism. It achieves low-rank adaptation by decomposing a weight matrix into multiple fragments and then utilizing a shared, trainable "common fragment." The final low-rank update matrix is constructed by replicating these shared, partitioned shards. (MiSS is a novel PEFT method that adopts a low-rank structure, requires only a single trainable matrix, and introduces a new update mechanism distinct from LoRA, achieving an excellent balance between performance and efficiency.)

<small><a href="https://huggingface.co/papers/2409.15371">MiSS: Balancing LoRA Performance and Efficiency with Simple Shard Sharing</a></small>

Intuitively, the shape of the single trainable matrix in MiSS is consistent with `lora_B`, so for the same `r`, MiSS has `in_features * r` fewer trainable parameters than LoRA.

Note: Bat's r (b) is special and requires that weight W satisfies the conditions `in_features % r == 0` and `out_features % r == 0`. Additionally, when `in_features == out_features` and MiSS-r equals LoRA-r, MiSS's number of trainable parameters is only half that of LoRA.

Although the nonlinear updates of Bat bring some performance improvements, they also increase computational overhead. Its main purpose is to provide researchers with a direction for improvement. Therefore, we recommend fine-tuning with the standard MiSS method instead.
@@ -16,7 +16,7 @@ rendered properly in your Markdown viewer.

# IA3

This conceptual guide gives a brief overview of [IA3](https://arxiv.org/abs/2205.05638), a parameter-efficient fine tuning technique that is
This conceptual guide gives a brief overview of [IA3](https://huggingface.co/papers/2205.05638), a parameter-efficient fine tuning technique that is
intended to improve over [LoRA](./lora).

To make fine-tuning more efficient, IA3 (Infused Adapter by Inhibiting and Amplifying Inner Activations)

@@ -16,9 +16,9 @@ rendered properly in your Markdown viewer.

# Orthogonal Finetuning (OFT and BOFT)

This conceptual guide gives a brief overview of [OFT](https://arxiv.org/abs/2306.07280) and [BOFT](https://arxiv.org/abs/2311.06243), a parameter-efficient fine-tuning technique that utilizes orthogonal matrix to multiplicatively transform the pretrained weight matrices.
This conceptual guide gives a brief overview of [OFT](https://huggingface.co/papers/2306.07280), [OFTv2](https://www.arxiv.org/abs/2506.19847) and [BOFT](https://huggingface.co/papers/2311.06243), parameter-efficient fine-tuning techniques that utilize an orthogonal matrix to multiplicatively transform the pretrained weight matrices.

To achieve efficient fine-tuning, OFT represents the weight updates with an orthogonal transformation. The orthogonal transformation is parameterized by an orthogonal matrix multiplied to the pretrained weight matrix. These new matrices can be trained to adapt to the new data while keeping the overall number of changes low. The original weight matrix remains frozen and doesn’t receive any further adjustments. To produce the final results, both the original and the adapted weights are multiplied togethor.
To achieve efficient fine-tuning, OFT represents the weight updates with an orthogonal transformation. The orthogonal transformation is parameterized by an orthogonal matrix multiplied to the pretrained weight matrix. These new matrices can be trained to adapt to the new data while keeping the overall number of changes low. The original weight matrix remains frozen and doesn't receive any further adjustments. To produce the final results, both the original and the adapted weights are multiplied together.

Orthogonal Butterfly (BOFT) generalizes OFT with Butterfly factorization and further improves its parameter efficiency and finetuning flexibility. In short, OFT can be viewed as a special case of BOFT. Different from LoRA that uses additive low-rank weight updates, BOFT uses multiplicative orthogonal weight updates. The comparison is shown below.

@@ -30,7 +30,7 @@ Orthogonal Butterfly (BOFT) generalizes OFT with Butterfly factorization and fur
BOFT has some advantages compared to LoRA:

* BOFT proposes a simple yet generic way to finetune pretrained models to downstream tasks, yielding a better preservation of pretraining knowledge and a better parameter efficiency.
* Through the orthogonality, BOFT introduces a structural constraint, i.e., keeping the [hyperspherical energy](https://arxiv.org/abs/1805.09298) unchanged during finetuning. This can effectively reduce the forgetting of pretraining knowledge.
* Through the orthogonality, BOFT introduces a structural constraint, i.e., keeping the [hyperspherical energy](https://huggingface.co/papers/1805.09298) unchanged during finetuning. This can effectively reduce the forgetting of pretraining knowledge.
* BOFT uses the butterfly factorization to efficiently parameterize the orthogonal matrix, which yields a compact yet expressive learning space (i.e., hypothesis class).
* The sparse matrix decomposition in BOFT brings in additional inductive biases that are beneficial to generalization.
@@ -58,13 +58,25 @@ As with other methods supported by PEFT, to fine-tune a model using OFT or BOFT,
4. Train the `PeftModel` as you normally would train the base model.

### BOFT-specific paramters
### OFT-specific parameters

`BOFTConfig` allows you to control how OFT/BOFT is applied to the base model through the following parameters:
`OFTConfig` allows you to control how OFT is applied to the base model through the following parameters:

- `boft_block_size`: the BOFT matrix block size across different layers, expressed in `int`. Smaller block size results in sparser update matrices with fewer trainable paramters. **Note**, please choose `boft_block_size` to be divisible by most layer's input dimension (`in_features`), e.g., 4, 8, 16. Also, please only
- `r`: OFT rank, the number of OFT blocks per injected layer. A **bigger** `r` results in sparser update matrices with **fewer** trainable parameters. **Note**: You can only specify either `r` or `oft_block_size`, but not both simultaneously, because `r` × `oft_block_size` = layer dimension. For simplicity, we let the user specify either `r` or `oft_block_size` and infer the other one. Defaults to `r = 0`; the user is advised to set `oft_block_size` instead for better clarity.
- `oft_block_size`: the OFT block size across different layers. A **bigger** `oft_block_size` results in denser update matrices with **more** trainable parameters. **Note**: Please choose `oft_block_size` to be divisible by the layer's input dimension (`in_features`), e.g., 4, 8, 16. You can only specify either `r` or `oft_block_size`, but not both simultaneously, because `r` × `oft_block_size` = layer dimension. For simplicity, we let the user specify either `r` or `oft_block_size` and infer the other one. Defaults to `oft_block_size = 32`.
- `use_cayley_neumann`: Specifies whether to use the Cayley-Neumann parameterization (efficient but approximate) or the vanilla Cayley parameterization (exact but computationally expensive because of the matrix inverse). We recommend setting it to `True` for better efficiency, but performance may be slightly worse because of the approximation error. Please test both settings (`True` and `False`) depending on your needs. Default is `False`.
- `module_dropout`: The multiplicative dropout probability, by setting OFT blocks to identity during training, similar to the dropout layer in LoRA.
- `bias`: specify if the `bias` parameters should be trained. Can be `"none"`, `"all"` or `"oft_only"`.
- `target_modules`: The modules (for example, attention blocks) to inject the OFT matrices.
- `modules_to_save`: List of modules apart from OFT matrices to be set as trainable and saved in the final checkpoint. These typically include the model's custom head that is randomly initialized for the fine-tuning task.

### BOFT-specific parameters

`BOFTConfig` allows you to control how BOFT is applied to the base model through the following parameters:

- `boft_block_size`: the BOFT matrix block size across different layers, expressed in `int`. A **bigger** `boft_block_size` results in denser update matrices with **more** trainable parameters. **Note**, please choose `boft_block_size` to be divisible by most layers' input dimension (`in_features`), e.g., 4, 8, 16. Also, please only
specify either `boft_block_size` or `boft_block_num`, but not both simultaneously or leaving both to 0, because `boft_block_size` x `boft_block_num` must equal the layer's input dimension.
- `boft_block_num`: the number of BOFT matrix blocks across different layers, expressed in `int`. Fewer blocks result in sparser update matrices with fewer trainable paramters. **Note**, please choose `boft_block_num` to be divisible by most layer's input dimension (`in_features`), e.g., 4, 8, 16. Also, please only
- `boft_block_num`: the number of BOFT matrix blocks across different layers, expressed in `int`. A **bigger** `boft_block_num` results in sparser update matrices with **fewer** trainable parameters. **Note**, please choose `boft_block_num` to be divisible by most layers' input dimension (`in_features`), e.g., 4, 8, 16. Also, please only
specify either `boft_block_size` or `boft_block_num`, but not both simultaneously or leaving both to 0, because `boft_block_size` x `boft_block_num` must equal the layer's input dimension.
- `boft_n_butterfly_factor`: the number of butterfly factors. **Note**, for `boft_n_butterfly_factor=1`, BOFT is the same as vanilla OFT, for `boft_n_butterfly_factor=2`, the effective block size of OFT becomes twice as big and the number of blocks becomes half.
- `bias`: specify if the `bias` parameters should be trained. Can be `"none"`, `"all"` or `"boft_only"`.
@@ -74,13 +86,59 @@ specify either `boft_block_size` or `boft_block_num`, but not both simultaneousl

## OFT Example Usage

For using OFT for quantized finetuning with [TRL](https://github.com/huggingface/trl) for `SFT`, `PPO`, or `DPO` fine-tuning, follow this outline:

```py
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from trl import SFTTrainer
from peft import OFTConfig

if use_quantization:
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_storage=torch.bfloat16,
    )

model = AutoModelForCausalLM.from_pretrained(
    "model_name",
    quantization_config=bnb_config
)
tokenizer = AutoTokenizer.from_pretrained("model_name")

# Configure OFT
peft_config = OFTConfig(
    oft_block_size=32,
    use_cayley_neumann=True,
    target_modules="all-linear",
    bias="none",
    task_type="CAUSAL_LM"
)

trainer = SFTTrainer(
    model=model,
    train_dataset=ds['train'],
    peft_config=peft_config,
    processing_class=tokenizer,
    args=training_arguments,
    data_collator=collator,
)

trainer.train()
```

## BOFT Example Usage

For an example of the BOFT method application to various downstream tasks, please refer to the following guides:

Take a look at the following step-by-step guides on how to finetune a model with BOFT:
- [Dreambooth finetuning with BOFT](../task_guides/boft_dreambooth)
- [Controllable generation finetuning with BOFT (ControlNet)](../task_guides/boft_controlnet)
- [Dreambooth finetuning with BOFT](https://github.com/huggingface/peft/blob/main/examples/boft_dreambooth/boft_dreambooth.md)
- [Controllable generation finetuning with BOFT (ControlNet)](https://github.com/huggingface/peft/blob/main/examples/boft_controlnet/boft_controlnet.md)

For the task of image classification, one can initialize the BOFT config for a DinoV2 model as follows:

@@ -64,9 +64,9 @@ Take a look at [P-tuning for sequence classification](../task_guides/ptuning-seq
<div class="flex justify-center">
  <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/peft/mpt.png"/>
</div>
<small><a href="https://hf.co/papers/2103.10385">Multitask prompt tuning enables parameter-efficient transfer learning</a>.</small>
<small><a href="https://hf.co/papers/2303.02861">Multitask prompt tuning enables parameter-efficient transfer learning</a>.</small>

[Multitask prompt tuning (MPT)](https://hf.co/papers/2103.10385) learns a single prompt from data for multiple task types that can be shared for different target tasks. Other existing approaches learn a separate soft prompt for each task that need to be retrieved or aggregated for adaptation to target tasks. MPT consists of two stages:
[Multitask prompt tuning (MPT)](https://hf.co/papers/2303.02861) learns a single prompt from data for multiple task types that can be shared for different target tasks. Other existing approaches learn a separate soft prompt for each task that needs to be retrieved or aggregated for adaptation to target tasks. MPT consists of two stages:

1. source training - for each task, its soft prompt is decomposed into task-specific vectors. The task-specific vectors are multiplied together to form another matrix W, and the Hadamard product is used between W and a shared prompt matrix P to generate a task-specific prompt matrix. The task-specific prompts are distilled into a single prompt matrix that is shared across all tasks. This prompt is trained with multitask training.
|

@@ -75,3 +75,19 @@ Take a look at [P-tuning for sequence classification](../task_guides/ptuning-seq
  <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/peft/mpt-decomposition.png"/>
</div>
<small><a href="https://hf.co/papers/2103.10385">Prompt decomposition</a>.</small>


## Context-Aware Prompt Tuning (CPT)

<div class="flex justify-center">
  <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/peft/cpt.png"/>
</div>
<small>CPT optimizing only specific token embeddings while keeping the rest of the model frozen <a href="https://huggingface.co/papers/2410.17222">(image source)</a>.</small>

[Context-Aware Prompt Tuning (CPT)](https://huggingface.co/papers/2410.17222) is designed to enhance few-shot classification by refining only context embeddings.
This approach combines ideas from In-Context Learning (ICL), Prompt Tuning (PT), and adversarial optimization, focusing on making model adaptation both parameter-efficient and effective.
In CPT, only specific context token embeddings are optimized, while the rest of the model remains frozen.
To prevent overfitting and maintain stability, CPT uses controlled perturbations to limit the allowed changes to context embeddings within a defined range.
Additionally, to address the phenomenon of recency bias—where examples near the end of the context tend to be prioritized over earlier ones—CPT applies a decay loss factor.

Take a look at [Example](https://github.com/huggingface/peft/blob/main/examples/cpt_finetuning/README.md) for a step-by-step guide on how to train a model with CPT.
@@ -49,17 +49,17 @@ $ pip install pre-commit
$ pre-commit install
```

Running all the tests can take a couple of minutes, so during development it can be more efficient to only run tests specific to your change:
Running all the tests can take a while, so during development it can be more efficient to only [run tests specific to your change](https://docs.pytest.org/en/6.2.x/usage.html#specifying-tests-selecting-tests), e.g. via:

```sh
pytest tests/ -k <name-of-test>
pytest tests/<test-file-name> -k <name-of-test>
```

This should finish much quicker and allow for faster iteration. However, you should still run the whole test suite before creating a PR because your change can inadvertently break tests that at first glance are unrelated.
This should finish much quicker and allow for faster iteration.

If your change is specific to a hardware setting (e.g., it requires CUDA), take a look at [tests/test_gpu_examples.py](https://github.com/huggingface/peft/blob/1c1c7fdaa6e6abaa53939b865dee1eded82ad032/tests/test_gpu_examples.py) and [tests/test_common_gpu.py](https://github.com/huggingface/peft/blob/1c1c7fdaa6e6abaa53939b865dee1eded82ad032/tests/test_common_gpu.py) to see if it makes sense to add tests there. If your change could have an effect on saving and loading models, please run the tests with the `--regression` flag to trigger regression tests.

It can happen that while you’re working on your PR, the underlying code base changes due to other changes being merged. If that happens – especially when there is a merge conflict – please update your branch with the latest changes. This can be a merge or a rebase, and we'll squash and merge the PR once it’s ready.
It can happen that while you’re working on your PR, the underlying code base changes due to other changes being merged. If that happens – especially when there is a merge conflict – please update your branch with the latest changes. This can be a merge or a rebase, and we'll squash and merge the PR once it’s ready. If possible, avoid force pushes to make reviews easier.

## PR description

@@ -77,10 +77,14 @@ Ideally when a bugfix is provided, it should be accompanied by a test for the bu

New parameter-efficient fine-tuning methods are developed all the time. If you would like to add a new and promising method to PEFT, please follow these steps.

1. Before you start to implement the new method, please open a GitHub issue with your proposal. This way, the maintainers can give you some early feedback.
2. Please add a link to the source (usually a paper) of the method. Some evidence should be provided there is general interest in using the method. We will not add new methods that are freshly published, but there is no evidence of demand for it.
1. Before you start to implement the new method, please open a [GitHub issue](https://github.com/huggingface/peft/issues) with your proposal. This way, the maintainers can give you some early feedback.
2. Please add a link to the source (usually a paper) of the method. The paper should be in a final state to avoid changing requirements during development (e.g. due to reviewer feedback).
3. When implementing the method, it makes sense to look for existing implementations that already exist as a guide. Moreover, when you structure your code, please take inspiration from the other PEFT methods. For example, if your method is similar to LoRA, it makes sense to structure your code similarly or even reuse some functions or classes where it makes sense (some code duplication is okay, but don’t overdo it).
4. Ideally, in addition to the implementation of the new method, there should also be examples (notebooks, scripts), documentation, and an extensive test suite that proves the method works with a variety of tasks. However, this can be more challenging so it is acceptable to only provide the implementation and at least one working example. Documentation and tests can be added in follow up PRs.
4. Ideally, in addition to the implementation of the new method, there should also be
   - [examples](https://github.com/huggingface/peft/tree/main/examples) (notebooks, scripts)
   - [documentation](https://github.com/huggingface/peft/tree/main/docs/source)
   - [extensive test suite](https://github.com/huggingface/peft/tree/main/tests) that proves the method correctly integrates with PEFT
   - [experimental setup](https://github.com/huggingface/peft/tree/main/method_comparison#creating-new-experiments) to run benchmarks
5. Once you have something that seems to be working, don’t hesitate to create a draft PR even if it’s not in a mergeable state yet. The maintainers are happy to give you feedback and guidance along the way.

## Add other features

@@ -204,7 +204,7 @@ For a complete example, check out [this notebook](https://github.com/huggingface

When new popular transformers architectures are released, we do our best to quickly add them to PEFT. If you come across a transformers model that is not supported out of the box, don't worry, it will most likely still work if the config is set correctly. Specifically, you have to identify the layers that should be adapted and set them correctly when initializing the corresponding config class, e.g. `LoraConfig`. Here are some tips to help with this.

As a first step, it is a good idea is to check the existing models for inspiration. You can find them inside of [constants.py](https://github.com/huggingface/peft/blob/main/src/peft/utils/constants.py) in the PEFT repository. Often, you'll find a similar architecture that uses the same names. For example, if the new model architecture is a variation of the "mistral" model and you want to apply LoRA, you can see that the entry for "mistral" in `TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING` contains `["q_proj", "v_proj"]`. This tells you that for "mistral" models, the `target_modules` for LoRA should be `["q_proj", "v_proj"]`:
As a first step, it is a good idea to check the existing models for inspiration. You can find them inside of [constants.py](https://github.com/huggingface/peft/blob/main/src/peft/utils/constants.py) in the PEFT repository. Often, you'll find a similar architecture that uses the same names. For example, if the new model architecture is a variation of the "mistral" model and you want to apply LoRA, you can see that the entry for "mistral" in `TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING` contains `["q_proj", "v_proj"]`. This tells you that for "mistral" models, the `target_modules` for LoRA should be `["q_proj", "v_proj"]`:

```python
from peft import LoraConfig, get_peft_model

@@ -219,7 +219,7 @@ peft_model = get_peft_model(my_mistral_model, config)

If that doesn't help, check the existing modules in your model architecture with the `named_modules` method and try to identify the attention layers, especially the key, query, and value layers. Those will often have names such as `c_attn`, `query`, `q_proj`, etc. The key layer is not always adapted, and ideally, you should check whether including it results in better performance.
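As a rough sketch (here `model` stands for whatever transformers model you have loaded), you can list the leaf modules and their types to spot likely candidates:

```python
model = ...  # your loaded transformers model

# print the name and type of every leaf module; attention projections often have
# names like "q_proj", "k_proj", "v_proj", "query", "c_attn", ...
for name, module in model.named_modules():
    if not list(module.children()):  # leaf modules only
        print(name, type(module).__name__)
```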
Additionally, linear layers are common targets to be adapted (e.g. in [QLoRA paper](https://arxiv.org/abs/2305.14314), authors suggest to adapt them as well). Their names will often contain the strings `fc` or `dense`.
|
||||
Additionally, linear layers are common targets to be adapted (e.g. in [QLoRA paper](https://huggingface.co/papers/2305.14314), authors suggest to adapt them as well). Their names will often contain the strings `fc` or `dense`.
|
||||
|
||||
If you want to add a new model to PEFT, please create an entry in [constants.py](https://github.com/huggingface/peft/blob/main/src/peft/utils/constants.py) and open a pull request on the [repository](https://github.com/huggingface/peft/pulls). Don't forget to update the [README](https://github.com/huggingface/peft#models-support-matrix) as well.
|
||||
|
||||
@ -238,3 +238,73 @@ peft_model.print_trainable_parameters()
|
||||
```python
|
||||
print(peft_model.targeted_module_names)
|
||||
```
|
||||
|
||||
## Unsupported module types
|
||||
|
||||
Methods like LoRA only work if the target modules are supported by PEFT. For example, it's possible to apply LoRA to `nn.Linear` and `nn.Conv2d` layers, but not, for instance, to `nn.LSTM`. If you find a layer class you want to apply PEFT to is not supported, you can:
|
||||
|
||||
- define a custom mapping to dynamically dispatch custom modules in LoRA
|
||||
- open an [issue](https://github.com/huggingface/peft/issues) and request the feature; if demand for this module type is sufficiently high, the maintainers will implement it or guide you on how to implement it yourself
|
||||
|
||||
### Experimental support for dynamic dispatch of custom modules in LoRA
|
||||
|
||||
> [!WARNING]
|
||||
> This feature is experimental and subject to change, depending on its reception by the community. We will introduce a public and stable API if there is significant demand for it.
|
||||
|
||||
PEFT supports an experimental API for custom module types for LoRA. Let's assume you have a LoRA implementation for LSTMs. Normally, you would not be able to tell PEFT to use it, even if it would theoretically work with PEFT. However, this is possible with dynamic dispatch of custom layers.
|
||||
|
||||
The experimental API currently looks like this:
|
||||
|
||||
```python
|
||||
class MyLoraLSTMLayer:
    ...
|
||||
|
||||
base_model = ... # load the base model that uses LSTMs
|
||||
|
||||
# add the LSTM layer names to target_modules
|
||||
config = LoraConfig(..., target_modules=["lstm"])
|
||||
# define a mapping from base layer type to LoRA layer type
|
||||
custom_module_mapping = {nn.LSTM: MyLoraLSTMLayer}
|
||||
# register the new mapping
|
||||
config._register_custom_module(custom_module_mapping)
|
||||
# after registration, create the PEFT model
|
||||
peft_model = get_peft_model(base_model, config)
|
||||
# do training
|
||||
```
|
||||
|
||||
<Tip>
|
||||
|
||||
When you call [`get_peft_model`], you will see a warning because PEFT does not recognize the targeted module type. In this case, you can ignore this warning.
|
||||
|
||||
</Tip>
|
||||
|
||||
By supplying a custom mapping, PEFT first checks the base model's layers against the custom mapping and dispatches to the custom LoRA layer type if there is a match. If there is no match, PEFT checks the built-in LoRA layer types for a match.
|
||||
|
||||
Therefore, this feature can also be used to override existing dispatch logic, e.g. if you want to use your own LoRA layer for `nn.Linear` instead of using the one provided by PEFT.
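For instance, assuming you have written your own custom LoRA layer for linear layers (the class name `MyLoraLinear` below is purely hypothetical), a sketch of such an override could look like this:

```python
import torch.nn as nn

# `MyLoraLinear` is a hypothetical custom LoRA layer for nn.Linear
custom_module_mapping = {nn.Linear: MyLoraLinear}
config._register_custom_module(custom_module_mapping)
# the custom mapping takes precedence over PEFT's built-in nn.Linear dispatch
peft_model = get_peft_model(base_model, config)
```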
When creating your custom LoRA module, please follow the same rules as the [existing LoRA modules](https://github.com/huggingface/peft/blob/main/src/peft/tuners/lora/layer.py). Some important constraints to consider:
|
||||
|
||||
- The custom module should inherit from `nn.Module` and `peft.tuners.lora.layer.LoraLayer`.
|
||||
- The `__init__` method of the custom module should have the positional arguments `base_layer` and `adapter_name`. After this, there are additional `**kwargs` that you are free to use or ignore.
|
||||
- The learnable parameters should be stored in an `nn.ModuleDict` or `nn.ParameterDict`, where the key corresponds to the name of the specific adapter (remember that a model can have more than one adapter at a time).
|
||||
- The name of these learnable parameter attributes should start with `"lora_"`, e.g. `self.lora_new_param = ...`.
|
||||
- Some methods are optional, e.g. you only need to implement `merge` and `unmerge` if you want to support weight merging.
|
||||
|
||||
Currently, the information about the custom module does not persist when you save the model. When loading the model, you have to register the custom modules again.
|
||||
|
||||
```python
|
||||
# saving works as always and includes the parameters of the custom modules
|
||||
peft_model.save_pretrained(<model-path>)
|
||||
|
||||
# loading the model later:
|
||||
base_model = ...
|
||||
# load the LoRA config that you saved earlier
|
||||
config = LoraConfig.from_pretrained(<model-path>)
|
||||
# register the custom module again, the same way as the first time
|
||||
custom_module_mapping = {nn.LSTM: MyLoraLSTMLayer}
|
||||
config._register_custom_module(custom_module_mapping)
|
||||
# pass the config instance to from_pretrained:
|
||||
peft_model = PeftModel.from_pretrained(base_model, <model-path>, config=config)
|
||||
```
|
||||
|
||||
If you use this feature and find it useful, or if you encounter problems, let us know by creating an issue or a discussion on GitHub. This allows us to estimate the demand for this feature and add a public API if it is sufficiently high.
|
||||
|
@ -9,7 +9,7 @@ Unless required by applicable law or agreed to in writing, software distributed
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
|
||||
⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
|
||||
⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
|
||||
rendered properly in your Markdown viewer.
|
||||
|
||||
-->
|
||||
@ -41,7 +41,7 @@ config = LoraConfig(init_lora_weights=False, ...)
|
||||
```
|
||||
|
||||
### PiSSA
|
||||
[PiSSA](https://arxiv.org/abs/2404.02948) initializes the LoRA adapter using the principal singular values and singular vectors. This straightforward modification allows PiSSA to converge more rapidly than LoRA and ultimately attain superior performance. Moreover, PiSSA reduces the quantization error compared to QLoRA, leading to further enhancements.
|
||||
[PiSSA](https://huggingface.co/papers/2404.02948) initializes the LoRA adapter using the principal singular values and singular vectors. This straightforward modification allows PiSSA to converge more rapidly than LoRA and ultimately attain superior performance. Moreover, PiSSA reduces the quantization error compared to QLoRA, leading to further enhancements.
|
||||
|
||||
Configure the initialization method to "pissa", which may take several minutes to execute SVD on the pre-trained model:
|
||||
```python
|
||||
@ -50,19 +50,90 @@ config = LoraConfig(init_lora_weights="pissa", ...)
|
||||
```
|
||||
Alternatively, execute fast SVD, which takes only a few seconds. The number of iterations determines the trade-off between the error and computation time:
|
||||
```python
|
||||
lora_config = LoraConfig(init_lora_weights="pissa_niter_[number of iters]", ...)
|
||||
lora_config = LoraConfig(init_lora_weights="pissa_niter_[number of iters]", ...)
|
||||
```
|
||||
For detailed instruction on using PiSSA, please follow [these instructions](https://github.com/fxmeng/peft/tree/main/examples/pissa_finetuning).
|
||||
For detailed instruction on using PiSSA, please follow [these instructions](https://github.com/huggingface/peft/tree/main/examples/pissa_finetuning).
|
||||
|
||||
### CorDA
|
||||
|
||||
[CorDA](https://huggingface.co/papers/2406.05223) builds task-aware LoRA adapters from weight decomposition oriented by the context of downstream task to learn (instruction-previewed mode, IPM) or world knowledge to maintain (knowledge-preserved mode, KPM).
|
||||
The KPM not only achieves better performance than LoRA on fine-tuning tasks, but also mitigates the catastrophic forgetting of pre-trained world knowledge.
|
||||
When preserving pre-trained knowledge is not a concern,
|
||||
the IPM is favored because it can further accelerate convergence and enhance the fine-tuning performance.
|
||||
|
||||
You need to configure the initialization method to "corda", and specify the mode (IPM or KPM) and the dataset used to collect the covariance matrices.
|
||||
|
||||
```py
|
||||
@torch.no_grad()
def run_model():
    # Assume `model` and `dataset` are in context...
    model.eval()
    for batch in dataset:
        model(**batch)


corda_config = CordaConfig(
    corda_method="kpm",
)
lora_config = LoraConfig(
    init_lora_weights="corda",
    corda_config=corda_config,
)
preprocess_corda(model, lora_config, run_model=run_model)
peft_model = get_peft_model(model, lora_config)
|
||||
```
|
||||
|
||||
For detailed instruction on using CorDA, please follow [these instructions](https://github.com/huggingface/peft/tree/main/examples/corda_finetuning).
|
||||
|
||||
### OLoRA
|
||||
[OLoRA](https://huggingface.co/papers/2406.01775) utilizes QR decomposition to initialize the LoRA adapters. OLoRA translates the base weights of the model by a factor of their QR decompositions, i.e., it mutates the weights before performing any training on them. This approach significantly improves stability, accelerates convergence speed, and ultimately achieves superior performance.
|
||||
|
||||
You just need to pass a single additional option to use OLoRA:
|
||||
```python
|
||||
from peft import LoraConfig
|
||||
config = LoraConfig(init_lora_weights="olora", ...)
|
||||
```
|
||||
For more advanced usage, please refer to our [documentation](https://github.com/huggingface/peft/tree/main/examples/olora_finetuning).
|
||||
|
||||
### EVA
|
||||
[EVA](https://huggingface.co/papers/2410.07170) performs SVD on the input activations of each layer and uses the right-singular vectors to initialize LoRA weights. It is therefore a data-driven initialization scheme. Furthermore, EVA adaptively allocates ranks across layers based on their "explained variance ratio" - a metric derived from the SVD analysis.
|
||||
|
||||
You can use EVA by setting `init_lora_weights="eva"` and defining [`EvaConfig`] in [`LoraConfig`]:
|
||||
```python
|
||||
from peft import LoraConfig, EvaConfig
|
||||
peft_config = LoraConfig(
    init_lora_weights="eva",
    eva_config=EvaConfig(rho=2.0),
    ...
)
|
||||
```
|
||||
The parameter `rho` (≥ 1.0) determines how much redistribution is allowed. When `rho=1.0` and `r=16`, LoRA adapters are limited to exactly 16 ranks, preventing any redistribution from occurring. A recommended value for EVA with redistribution is 2.0, meaning the maximum rank allowed for a layer is 2r.
|
||||
|
||||
It is recommended to perform EVA initialization on an accelerator (e.g. a CUDA GPU or Intel XPU) as it is much faster. To optimize the amount of available memory for EVA, you can use the `low_cpu_mem_usage` flag in [`get_peft_model`]:
|
||||
```python
|
||||
peft_model = get_peft_model(model, peft_config, low_cpu_mem_usage=True)
|
||||
```
|
||||
Then, call [`initialize_lora_eva_weights`] to initialize the EVA weights (in most cases the dataloader used for EVA initialization can be the same as the one used for fine-tuning):
|
||||
```python
|
||||
initialize_lora_eva_weights(peft_model, dataloader)
|
||||
```
|
||||
EVA works out of the box with bitsandbytes. Simply initialize the model with `quantization_config` and call [`initialize_lora_eva_weights`] as usual.
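A rough sketch of this setup could look as follows (the model id is a placeholder and `dataloader` stands for your own tokenized data loader):

```python
from transformers import AutoModelForCausalLM, BitsAndBytesConfig
from peft import EvaConfig, LoraConfig, get_peft_model, initialize_lora_eva_weights

# 4-bit quantization via bitsandbytes
bnb_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_quant_type="nf4")
model = AutoModelForCausalLM.from_pretrained(
    "mistralai/Mistral-7B-v0.1",  # placeholder model id
    quantization_config=bnb_config,
    device_map="auto",
)

peft_config = LoraConfig(init_lora_weights="eva", eva_config=EvaConfig(rho=2.0))
peft_model = get_peft_model(model, peft_config, low_cpu_mem_usage=True)

dataloader = ...  # your tokenized dataloader, e.g. the one used for fine-tuning
initialize_lora_eva_weights(peft_model, dataloader)
```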
<Tip>
|
||||
|
||||
For further instructions on using EVA, please refer to our [documentation](https://github.com/huggingface/peft/tree/main/examples/eva_finetuning).
|
||||
|
||||
</Tip>
|
||||
|
||||
### LoftQ
|
||||
|
||||
#### Standard approach
|
||||
|
||||
When quantizing the base model for QLoRA training, consider using the [LoftQ initialization](https://arxiv.org/abs/2310.08659), which has been shown to improve performance when training quantized models. The idea is that the LoRA weights are initialized such that the quantization error is minimized. To use LoftQ, follow [these instructions](https://github.com/huggingface/peft/tree/main/examples/loftq_finetuning).
|
||||
When quantizing the base model for QLoRA training, consider using the [LoftQ initialization](https://huggingface.co/papers/2310.08659), which has been shown to improve performance when training quantized models. The idea is that the LoRA weights are initialized such that the quantization error is minimized. To use LoftQ, follow [these instructions](https://github.com/huggingface/peft/tree/main/examples/loftq_finetuning).
|
||||
|
||||
In general, for LoftQ to work best, it is recommended to target as many layers with LoRA as possible, since those not targeted cannot have LoftQ applied. This means that passing `LoraConfig(..., target_modules="all-linear")` will most likely give the best results. Also, you should use `nf4` as quant type in your quantization config when using 4bit quantization, i.e. `BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_quant_type="nf4")`.
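Putting these recommendations together, the quantization and LoRA config could be set up roughly like the sketch below (the model id is a placeholder; the LoftQ initialization itself then follows the linked instructions or the convenience function described next):

```python
from transformers import AutoModelForCausalLM, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model

# 4-bit nf4 quantization, as recommended above
bnb_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_quant_type="nf4")
base_model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-2-7b-hf",  # placeholder model id
    quantization_config=bnb_config,
    device_map="auto",
)

# target as many layers as possible so that LoftQ can be applied to them
lora_config = LoraConfig(target_modules="all-linear")
peft_model = get_peft_model(base_model, lora_config)
```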
#### A more convienient way
|
||||
#### A more convenient way
|
||||
|
||||
An easier but more limited way to apply LoftQ initialization is to use the convenience function `replace_lora_weights_loftq`. This takes the quantized PEFT model as input and replaces the LoRA weights in-place with their LoftQ-initialized counterparts.
|
||||
|
||||
@ -80,7 +151,7 @@ replace_lora_weights_loftq(peft_model)
|
||||
|
||||
`replace_lora_weights_loftq` also allows you to pass a `callback` argument to give you more control over which layers should be modified or not, which empirically can improve the results quite a lot. To see a more elaborate example of this, check out [this notebook](https://github.com/huggingface/peft/blob/main/examples/loftq_finetuning/LoftQ_weight_replacement.ipynb).
|
||||
|
||||
`replace_lora_weights_loftq` implements only one iteration step of LoftQ. This means that only the LoRA weights are updated, instead of iteratevily updating LoRA weights and quantized base model weights. This may lead to lower performance but has the advantage that we can use the original quantized weights derived from the base model, instead of having to keep an extra copy of modified quantized weights. Whether this tradeoff is worthwhile depends on the use case.
|
||||
`replace_lora_weights_loftq` implements only one iteration step of LoftQ. This means that only the LoRA weights are updated, instead of iteratively updating LoRA weights and quantized base model weights. This may lead to lower performance but has the advantage that we can use the original quantized weights derived from the base model, instead of having to keep an extra copy of modified quantized weights. Whether this tradeoff is worthwhile depends on the use case.
|
||||
|
||||
At the moment, `replace_lora_weights_loftq` has these additional limitations:
|
||||
|
||||
@ -102,10 +173,115 @@ from peft import LoraConfig
|
||||
|
||||
config = LoraConfig(use_rslora=True, ...)
|
||||
```
|
||||
### Activated LoRA (aLoRA)
|
||||
|
||||
Activated LoRA (aLoRA) is a low rank adapter architecture for Causal LMs that allows for reusing existing base model KV cache for more efficient inference. This approach is best suited for inference pipelines which rely on the base model for most tasks/generations, but use aLoRA adapter(s) to perform specialized task(s) within the chain. For example, checking or correcting generated outputs of the base model. In these settings, inference times can be sped up by an order of magnitude or more. For more information on aLoRA and many example use cases, see https://huggingface.co/papers/2504.12397.
|
||||
|
||||
This technique scans for the last occurrence of an invocation sequence (`alora_invocation_tokens`) in each input (this can be as short as 1 token), and activates the adapter weights on tokens starting with the beginning of the invocation sequence (any inputs after the invocation sequence are also adapted, and all generated tokens will use the adapted weights). Weights on prior tokens are left un-adapted -- making the cache for those tokens interchangeable with base model cache due to the causal attention mask in Causal LMs. Usage is very similar to standard LoRA, with the key difference that this invocation sequence must be specified when the adapter is created:
|
||||
|
||||
```py
|
||||
from peft import LoraConfig
|
||||
|
||||
config = LoraConfig(alora_invocation_tokens=alora_invocation_tokens, task_type="CAUSAL_LM", ...)
|
||||
```
|
||||
|
||||
where `alora_invocation_tokens` is a list of integer token ids. Given a desired invocation string, this can be obtained as
|
||||
```
|
||||
invocation_string = "placeholder"
|
||||
alora_invocation_tokens = tokenizer.encode(invocation_string, add_special_tokens=False)
|
||||
```
|
||||
where the tokenizer is the tokenizer for the base model. Note that we pass `add_special_tokens=False` to avoid adding BOS/EOS tokens to the search string (which would most likely cause the invocation sequence not to be found).
|
||||
|
||||
**Notes**
|
||||
* aLoRA is only supported for `task_type=CAUSAL_LM` tasks due to its focus on cache reuse.
|
||||
* Since the weights are adapted on fewer tokens, often (though not always) aLoRA requires a higher rank (`r`) than LoRA. `r=32` can be a good starting point.
|
||||
* aLoRA weights cannot be merged into the base model by definition, since the adapter weights are selectively applied to a subset of tokens. Attempts to merge will throw errors.
|
||||
* Beam search is not yet supported.
|
||||
* It is generally not recommended to add new tokens to the tokenizer that are not present in the base model, as this can complicate the target use case of both the base model and adapter model operating on overlapping context. That said, there is a possible workaround by first efficiently adding [trainable tokens](https://huggingface.co/docs/peft/en/package_reference/trainable_tokens) to the base model prior to training the adapter.
|
||||
|
||||
#### Choice of invocation sequence and SFT design
|
||||
|
||||
Each input must contain the `alora_invocation_tokens` sequence; it is not added automatically. To maximize model performance without compromising cache reuse, it is recommended to have the adapter weights activated early, i.e. at the start of any adapter-specific prompting, but after any long inputs such as prior generations or documents. As with any model,
|
||||
formatting should be consistent between train and test.
|
||||
|
||||
Consider the following example, where the base model has a chat template,
|
||||
and the goal is to train the adapter to generate a desired output.
|
||||
|
||||
* Option 1: If there is no task-specific prompt, i.e. the input is a chat history with the `assistant` prompt, then the chat template's `assistant` prompt (e.g. `<|start_of_role|>assistant<|end_of_role|>`) is a natural choice for the invocation string. See the model's chat template to find the prompt for the model.
|
||||
* Option 2: If there is a task-specific prompt for the adapter that describes the task the adapter is learning, and that prompt is put as a `user` turn immediately prior to the generation, then the chat template's `user` prompt (e.g. `<|start_of_role|>user<|end_of_role|>`) is a natural choice for the invocation string.
|
||||
|
||||
Once deciding on an invocation string, get the model tokenizer and obtain `alora_invocation_tokens` as
|
||||
```
|
||||
alora_invocation_tokens = tokenizer.encode(invocation_string, add_special_tokens=False)
|
||||
```
|
||||
|
||||
An example inference setup is at [alora finetuning](https://github.com/huggingface/peft/blob/main/examples/alora_finetuning/alora_finetuning.py).
|
||||
|
||||
**Note** If using custom strings for the invocation string, make sure that the start and end of the string are special tokens to avoid issues with tokenization at the boundaries.
|
||||
|
||||
To see why, imagine that 'a', 'b', 'c', and 'ab' are tokens in your tokenizer (with ids 1, 2, 3, and 4, respectively). Suppose that your `alora_invocation_tokens = [2, 3]`. Now imagine your input string is "abc". Because "ab" is a token, this will get tokenized as [4, 3]. So the `alora_invocation_tokens` will fail to be found, despite the string "bc" being present in the input. If the start and end of the invocation string are special tokens, however, this failure case will never happen, since special tokens are never merged with other characters during tokenization.
|
||||
|
||||
#### Using (and reusing) cache for generation
|
||||
The main purpose of Activated LoRA is to make the KV cache interchangeable between the base model and aLoRA adapter models **prior to the invocation sequence**, since base and adapted KV values are not compatible. Specifically, keys and values stored during one model generation can be used in subsequent generations to avoid expensive prefill operations for context tokens. When sharing cache between the base model and aLoRA adapters, there are two main patterns:
|
||||
1. The base model has generated something, and an aLoRA adapter is then called to do a followup generation. Example: the base model answers a question, and an aLoRA trained to detect hallucinations checks the base model response.
|
||||
2. An aLoRA adapter has generated something, and the base model or a different aLoRA adapter is called to do a followup generation where there is partial context overlap with the original aLoRA. Example: The user provides a query, and an aLoRA rewrites the query to be more self-contained and improve retrieval in a RAG system. Then, documents are retrieved and loaded into context, an aLoRA checks if these documents are indeed relevant to the question, and then the base model generates an answer.
|
||||
|
||||
|
||||
To demonstrate the above behaviors when using caching, we're using [DynamicCache](https://huggingface.co/docs/transformers/en/kv_cache) from `transformers`. Care must be taken to ensure that adapted cache values are not mixed with base cache values. In particular, an extra step is required for sharing the cache when there is partial context overlap (pattern 2).
|
||||
|
||||
**Pattern 1: Base model followed by aLoRA** Here, the entire input to and generation from the base model are fed into the aLoRA adapter, along with the invocation sequence:
|
||||
```
|
||||
from transformers import DynamicCache
|
||||
...
|
||||
cache = DynamicCache()
|
||||
inputs_base = tokenizer(prompt_base, return_tensors="pt")
|
||||
# Generate from base model and save cache
|
||||
with model_alora.disable_adapter():
    output = model_alora.generate(
        inputs_base["input_ids"].to(device),
        attention_mask=inputs_base["attention_mask"].to(device),
        past_key_values=cache,
        return_dict_in_generate=True,
    )
output_text_base = tokenizer.decode(output.sequences[0])
cache = output.past_key_values

# Generate with aLoRA adapter from cache
prompt_alora = output_text_base + INVOCATION_STRING
|
||||
inputs_alora = tokenizer(prompt_alora, return_tensors="pt").to(device)
|
||||
output = model_alora.generate(**inputs_alora, past_key_values=cache)
|
||||
output_text_alora = tokenizer.decode(output[0])
|
||||
|
||||
# Note: cache is now tainted with adapter values and cannot be used in base model from here on!
|
||||
```
|
||||
|
||||
**Pattern 2: aLoRA generation followed by base model (or another aLoRA) with partial context overlap** Here, we prefill the shared context using the base model, and then generate.
|
||||
|
||||
```
|
||||
from transformers import DynamicCache
|
||||
import copy
|
||||
...
|
||||
cache = DynamicCache()
|
||||
inputs_shared = tokenizer(prompt_shared, return_tensors="pt").to(device)
|
||||
|
||||
# Prefill from base model and save cache
|
||||
with model_alora.disable_adapter():
    with torch.no_grad():
        model_alora(**inputs_shared, past_key_values=cache)
cache_copy = copy.deepcopy(cache)
|
||||
|
||||
# Generate from aLoRA using prefilled cache
|
||||
prompt_alora = prompt_shared + INVOCATION_STRING
|
||||
inputs_alora = tokenizer(prompt_alora, return_tensors="pt").to(device)
|
||||
output = model_alora.generate(**inputs_alora, past_key_values=cache)
|
||||
output_text_alora = tokenizer.decode(output[0])
|
||||
|
||||
# Generate from base model using saved cache not tainted by aLoRA KV values
|
||||
prompt_base = prompt_shared
|
||||
inputs_base = tokenizer(prompt_base, return_tensors="pt").to(device)
|
||||
with model_alora.disable_adapter():
    output = model_alora.generate(**inputs_base, past_key_values=cache_copy)
output_text_base = tokenizer.decode(output[0])
|
||||
```
|
||||
|
||||
### Weight-Decomposed Low-Rank Adaptation (DoRA)
|
||||
|
||||
This technique decomposes the updates of the weights into two parts, magnitude and direction. Direction is handled by normal LoRA, whereas the magnitude is handled by a separate learnable parameter. This can improve the performance of LoRA, especially at low ranks. For more information on DoRA, see https://arxiv.org/abs/2402.09353.
|
||||
This technique decomposes the updates of the weights into two parts, magnitude and direction. Direction is handled by normal LoRA, whereas the magnitude is handled by a separate learnable parameter. This can improve the performance of LoRA, especially at low ranks. For more information on DoRA, see https://huggingface.co/papers/2402.09353.
|
||||
|
||||
```py
|
||||
from peft import LoraConfig
|
||||
@ -113,10 +289,38 @@ from peft import LoraConfig
|
||||
config = LoraConfig(use_dora=True, ...)
|
||||
```
|
||||
|
||||
If parts of the model or the DoRA adapter are offloaded to CPU you can get a significant speedup at the cost of some temporary (ephemeral) VRAM overhead by using `ephemeral_gpu_offload=True` in `config.runtime_config`.
|
||||
|
||||
```py
|
||||
from peft import LoraConfig, LoraRuntimeConfig
|
||||
|
||||
config = LoraConfig(use_dora=True, runtime_config=LoraRuntimeConfig(ephemeral_gpu_offload=True), ...)
|
||||
```
|
||||
|
||||
A `PeftModel` with a DoRA adapter can also be loaded with `ephemeral_gpu_offload=True` flag using the `from_pretrained` method as well as the `load_adapter` method.
|
||||
|
||||
```py
|
||||
from peft import PeftModel
|
||||
|
||||
model = PeftModel.from_pretrained(base_model, peft_model_id, ephemeral_gpu_offload=True)
|
||||
```
|
||||
|
||||
DoRA is optimized (it computes faster and takes less memory) for models in evaluation mode, or when dropout is set to 0. We reuse the base result in those cases to get the speedup.
|
||||
Running [dora finetuning](https://github.com/huggingface/peft/blob/main/examples/dora_finetuning/dora_finetuning.py)
|
||||
with `CUDA_VISIBLE_DEVICES=0 ZE_AFFINITY_MASK=0 time python examples/dora_finetuning/dora_finetuning.py --quantize --lora_dropout 0 --batch_size 16 --eval_step 2 --use_dora`
|
||||
on a 4090 with gradient accumulation set to 2 and max steps set to 20 resulted in the following observations:
|
||||
|
||||
| | Without Optimization | With Optimization |
|
||||
| :--: | :--: | :--: |
|
||||
| train_runtime | 359.7298 | **279.2676** |
|
||||
| train_samples_per_second | 1.779 | **2.292** |
|
||||
| train_steps_per_second | 0.056 | **0.072** |
|
||||
|
||||
#### Caveats
|
||||
|
||||
- DoRA only supports linear and Conv2d layers at the momement.
|
||||
- DoRA introduces a bigger overhead than pure LoRA, so it is recommended to merge weights for inference, see [`LoraModel.merge_and_unload`].
|
||||
- DoRA only supports embedding, linear, and Conv2d layers at the moment.
|
||||
- DoRA introduces a bigger overhead than pure LoRA, so it is recommended to merge weights for inference, see [`LoraModel.merge_and_unload`].
|
||||
- DoRA should work with weights quantized with bitsandbytes ("QDoRA"). However, issues have been reported when using QDoRA with DeepSpeed Zero2.
|
||||
|
||||
### QLoRA-style training
|
||||
@ -129,21 +333,183 @@ config = LoraConfig(target_modules="all-linear", ...)
|
||||
|
||||
### Memory efficient Layer Replication with LoRA
|
||||
|
||||
An approach used to improve the performance of models is to expand a model by duplicating layers in the model to build a larger model from a pretrained model of a given size. For example increasing a 7B model to a 10B model as described in the [SOLAR](https://arxiv.org/abs/2312.15166) paper. PEFT LoRA supports this kind of expansion in a memory efficient manner that supports further fine-tuning using LoRA adapters attached to the layers post replication of the layers. The replicated layers do not take additional memory as they share the underlying weights so the only additional memory required is the memory for the adapter weights. To use this feature you would create a config with the `layer_replication` argument.
|
||||
An approach used to improve the performance of models is to expand a model by duplicating layers in the model to build a larger model from a pretrained model of a given size. For example increasing a 7B model to a 10B model as described in the [SOLAR](https://huggingface.co/papers/2312.15166) paper. PEFT LoRA supports this kind of expansion in a memory efficient manner that supports further fine-tuning using LoRA adapters attached to the layers post replication of the layers. The replicated layers do not take additional memory as they share the underlying weights so the only additional memory required is the memory for the adapter weights. To use this feature you would create a config with the `layer_replication` argument.
|
||||
|
||||
```py
|
||||
config = LoraConfig(layer_replication=[[0,4], [2,5]], ...)
|
||||
```
|
||||
|
||||
Assuming the original model had 5 layers `[0, 1, 2 ,3, 4]`, this would create a model with 7 layers arranged as `[0, 1, 2, 3, 2, 3, 4]`. This follows the [mergekit](https://github.com/arcee-ai/mergekit) pass through merge convention where sequences of layers specified as start inclusive and end exclusive tuples are stacked to build the final model. Each layer in the final model gets its own distinct set of LoRA adpaters.
|
||||
Assuming the original model had 5 layers `[0, 1, 2 ,3, 4]`, this would create a model with 7 layers arranged as `[0, 1, 2, 3, 2, 3, 4]`. This follows the [mergekit](https://github.com/arcee-ai/mergekit) pass through merge convention where sequences of layers specified as start inclusive and end exclusive tuples are stacked to build the final model. Each layer in the final model gets its own distinct set of LoRA adapters.
|
||||
|
||||
[Fewshot-Metamath-OrcaVicuna-Mistral-10B](https://huggingface.co/abacusai/Fewshot-Metamath-OrcaVicuna-Mistral-10B) is an example of a model trained using this method on Mistral-7B expanded to 10B. The
|
||||
[adapter_config.json](https://huggingface.co/abacusai/Fewshot-Metamath-OrcaVicuna-Mistral-10B/blob/main/adapter_config.json) shows a sample LoRA adapter config applying this method for fine-tuning.
|
||||
|
||||
## Merge adapters
|
||||
### Fine grained control over ranks and alpha (scaling)
|
||||
|
||||
By default, all layers targeted with LoRA will have the same rank `r` and the same `lora_alpha` (which determines the LoRA scaling), depending on what was specified in the [`LoraConfig`]. In some cases, however, you may want to indicate different values for different layers. This is possible by passing the `rank_pattern` and `alpha_pattern` arguments to [`LoraConfig`]. These arguments should be dictionaries with the key being the layer name and the value being the rank/alpha value. The keys can be [regular expressions](https://docs.python.org/3/library/re.html) (regex). All LoRA layers that are not explicitly mentioned in `rank_pattern` and `alpha_pattern` will take the default `r` and `lora_alpha` values.
|
||||
|
||||
To give an example, let's assume that we have a model with the following structure:
|
||||
|
||||
```python
|
||||
>>> print(model)
|
||||
Outer(
  (foo): Linear(...)
  (module): Middle(
    (foo): Linear(...)
    (foobar): Linear(...)
    (module): Inner(
      (foo): Linear(...)
      (barfoo): Linear(...)
    )
  )
)
|
||||
```
|
||||
|
||||
- `rank_pattern={"foo": 42}` will match all 3 `foo` layers. Neither `foobar` nor `barfoo` are matched.
|
||||
- `rank_pattern={"^foo": 42}` will only match the `foo` layer of the model, but neither `module.foo` nor `module.module.foo`. This is because the `^` means "start of string" when using regular expressions, and only `foo` starts with `"foo"`, the other layer names have prefixes.
|
||||
- `rank_pattern={"^module.foo": 42}` matches only `module.foo`, but not `module.module.foo`, for the same reason.
|
||||
- `rank_pattern={"module.foo": 42}` matches both `module.foo` and `module.module.foo`, but not `foo`.
|
||||
- `rank_pattern={"^foo": 42, "^module.module.foo": 55}` matches `foo` and `module.module.foo`, respectively, but not `module.foo`.
|
||||
- There is no need to indicate `$` to mark the end of the match, as this is added automatically by PEFT.
|
||||
|
||||
The same logic applies to `alpha_pattern`. If you're in doubt, don't try to get fancy with regular expressions -- just pass the full name for each module with a different rank/alpha, preceded by the `^` prefix, and you should be good.
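For the example model above, a config that follows this advice could look like the following sketch (the rank and alpha values are chosen arbitrarily):

```python
from peft import LoraConfig

config = LoraConfig(
    r=8,            # default rank for all other targeted layers
    lora_alpha=16,  # default scaling for all other targeted layers
    rank_pattern={"^foo": 42, "^module.module.foo": 55},
    alpha_pattern={"^module.foo": 32},
)
```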
### Targeting `nn.Parameter` directly
|
||||
|
||||
> [!WARNING]
|
||||
> This feature is experimental and subject to change.
|
||||
|
||||
Generally, you should use `target_modules` to target the module (e.g. `nn.Linear`). However, in some circumstances, this is not possible. E.g., in many mixture of expert (MoE) layers in HF Transformers, instead of using `nn.Linear`, an `nn.Parameter` is used. PEFT normally overwrites the `forward` method for LoRA, but for `nn.Parameter`, there is none. Therefore, to apply LoRA to that parameter, it needs to be targeted with `target_parameters`. As an example, for [Llama4](https://huggingface.co/collections/meta-llama/llama-4-67f0c30d9fe03840bc9d0164), you can pass: `target_parameters=['feed_forward.experts.gate_up_proj', 'feed_forward.experts.down_proj']`.
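Based on the Llama4 example above, a minimal config sketch would be:

```python
from peft import LoraConfig

config = LoraConfig(
    target_parameters=[
        "feed_forward.experts.gate_up_proj",
        "feed_forward.experts.down_proj",
    ],
)
```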
#### Caveats
|
||||
|
||||
- At the moment, this argument allows targeting 2-dim or 3-dim `nn.Parameter`s. It is assumed that in the case of a 3-dim parameter, the 0th dimension is the expert dimension.
|
||||
- It is currently not possible to add multiple LoRA adapters (via `model.add_adapter` or `model.load_adapter`) that use `target_parameters` at the same time.
|
||||
|
||||
## Optimizers
|
||||
|
||||
LoRA training can optionally include special purpose optimizers. Currently PEFT supports LoRA-FA and LoRA+.
|
||||
|
||||
### LoRA-FA Optimizer
|
||||
|
||||
LoRA training can be more effective and efficient using LoRA-FA, as described in [LoRA-FA](https://huggingface.co/papers/2308.03303). LoRA-FA reduces activation memory consumption by fixing the matrix A and only tuning the matrix B. During training, the gradient of B is optimized to approximate the full parameter fine-tuning gradient. Moreover, the memory consumption of LoRA-FA is not sensitive to the rank (since it erases the activation of $A$), therefore it can improve performance by enlarging the LoRA rank without increasing memory consumption.
|
||||
|
||||
```py
|
||||
from peft import LoraConfig, get_peft_model
|
||||
from peft.optimizers import create_lorafa_optimizer
|
||||
from transformers import AutoModelForCausalLM, Trainer, get_cosine_schedule_with_warmup
|
||||
|
||||
base_model = AutoModelForCausalLM.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct")
|
||||
|
||||
config = LoraConfig(...)
|
||||
model = get_peft_model(base_model, config)
|
||||
|
||||
optimizer = create_lorafa_optimizer(
    model=model,
    r=128,
    lora_alpha=32,
    lr=7e-5,
)
|
||||
|
||||
scheduler = get_cosine_schedule_with_warmup(
|
||||
optimizer,
|
||||
num_warmup_steps=100,
|
||||
num_training_steps=1000,
|
||||
)
|
||||
|
||||
trainer = Trainer(
|
||||
...,
|
||||
optimizers=(optimizer, scheduler),
|
||||
)
|
||||
```
|
||||
|
||||
### LoRA+ optimized LoRA
|
||||
|
||||
LoRA training can be optimized using [LoRA+](https://huggingface.co/papers/2402.12354), which uses different learning rates for the adapter matrices A and B, shown to increase finetuning speed by up to 2x and performance by 1-2%.
|
||||
|
||||
```py
|
||||
from peft import LoraConfig, get_peft_model
|
||||
from peft.optimizers import create_loraplus_optimizer
|
||||
from transformers import Trainer
|
||||
import bitsandbytes as bnb
|
||||
|
||||
base_model = ...
|
||||
config = LoraConfig(...)
|
||||
model = get_peft_model(base_model, config)
|
||||
|
||||
optimizer = create_loraplus_optimizer(
    model=model,
    optimizer_cls=bnb.optim.Adam8bit,
    lr=5e-5,
    loraplus_lr_ratio=16,
)
|
||||
scheduler = None
|
||||
|
||||
...
|
||||
trainer = Trainer(
|
||||
...,
|
||||
optimizers=(optimizer, scheduler),
|
||||
)
|
||||
```
|
||||
|
||||
## Efficiently train tokens alongside LoRA
|
||||
|
||||
Sometimes it is necessary to not only change some layer's weights but to add new tokens as well. With larger models this can be a memory-costly endeavour. PEFT LoRA adapters support the `trainable_token_indices` parameter which allows tuning of other tokens alongside fine-tuning of specific layers with LoRA. This method only trains the tokens you specify and leaves all other tokens untouched. This saves memory and doesn't throw away learned context of existing token embeddings in contrast to when training the whole embedding matrix. Under the hood this method uses the layer of [`TrainableTokensModel`].
|
||||
|
||||
```py
|
||||
# pass a plain list to target the 'embed_tokens' layer
|
||||
config = LoraConfig(trainable_token_indices=[idx_1, idx_2, ...], ...)
|
||||
|
||||
# specific embedding layer
|
||||
config = LoraConfig(trainable_token_indices={'emb_tokens': [idx_1, idx_2, ...]}, ...)
|
||||
```
|
||||
|
||||
In the snippet below we show how to add new tokens to the model and how to train it alongside the other layers in the model.
|
||||
|
||||
```py
|
||||
from transformers import AutoTokenizer, AutoModelForCausalLM
|
||||
from peft import get_peft_model, LoraConfig
|
||||
|
||||
base_model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1")
|
||||
tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1")
|
||||
|
||||
# we define our new tokens and add them to the tokenizer as special tokens
|
||||
special_tokens = ['<|start_think|>', '<|stop_think|>']
|
||||
tokenizer.add_special_tokens({'additional_special_tokens': special_tokens})
|
||||
|
||||
# make room for new tokens in the embedding matrix if it isn't big enough already
|
||||
base_model.resize_token_embeddings(max(len(tokenizer), base_model.model.embed_tokens.num_embeddings))
|
||||
|
||||
# typical LoRA config with `trainable_token_indices` targeting embedding layer `embed_tokens`
|
||||
# and specifically our new tokens we just added
|
||||
lora_config = LoraConfig(
|
||||
target_modules='all-linear',
|
||||
trainable_token_indices={'embed_tokens': tokenizer.convert_tokens_to_ids(special_tokens)},
|
||||
)
|
||||
peft_model = get_peft_model(base_model, lora_config)
|
||||
|
||||
# proceed to train the model like normal
|
||||
[...]
|
||||
```
|
||||
|
||||
The token weights are part of your adapter state dict and saved alongside the LoRA weights.
|
||||
Had we used full fine-tuning with `modules_to_save=['embed_tokens']`, we would have stored the full embedding matrix in the checkpoint, leading to a much bigger file.
|
||||
|
||||
To give a bit of an indication how much VRAM can be saved, a rudimentary comparison of the above example was made between training the embedding matrix fully (`modules_to_save=["embed_tokens"]`), using a LoRA for the embedding matrix (`target_modules=[..., "embed_tokens"]`, rank 32) and trainable tokens (`trainable_token_indices=[...]`, 6 tokens). Trainable tokens used about as much VRAM (15,562MB vs. 15,581MB) as LoRA while being specific to the tokens and saved ~1GB of VRAM over fully training the embedding matrix.
|
||||
|
||||
|
||||
## Merge LoRA weights into the base model
|
||||
|
||||
While LoRA is significantly smaller and faster to train, you may encounter latency issues during inference due to separately loading the base model and the LoRA adapter. To eliminate latency, use the [`~LoraModel.merge_and_unload`] function to merge the adapter weights with the base model. This allows you to use the newly merged model as a standalone model. The [`~LoraModel.merge_and_unload`] function doesn't keep the adapter weights in memory.
|
||||
|
||||
Below is a diagram that explains the intuition of LoRA adapter merging:
|
||||
|
||||
<div class="flex justify-center">
|
||||
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/peft/lora_diagram.png"/>
|
||||
</div>
|
||||
|
||||
We show in the snippets below how to run that using PEFT.
|
||||
|
||||
```py
|
||||
from transformers import AutoModelForCausalLM
|
||||
from peft import PeftModel
|
||||
@ -183,7 +549,7 @@ base_model = AutoModelForCausalLM.from_pretrained(
|
||||
)
|
||||
```
|
||||
|
||||
Then we load the first adapter:
|
||||
Then we load the first adapter:
|
||||
|
||||
```python
|
||||
peft_model_id = "alignment-handbook/zephyr-7b-sft-lora"
|
||||
@ -213,11 +579,13 @@ There are several supported methods for `combination_type`. Refer to the [docume
|
||||
Now, perform inference:
|
||||
|
||||
```python
|
||||
device = torch.accelerator.current_accelerator().type if hasattr(torch, "accelerator") else "cuda"
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1")
|
||||
|
||||
prompt = "Hey, are you conscious? Can you talk to me?"
|
||||
inputs = tokenizer(prompt, return_tensors="pt")
|
||||
inputs = {k: v.to("cuda") for k, v in inputs.items()}
|
||||
inputs = {k: v.to(device) for k, v in inputs.items()}
|
||||
|
||||
with torch.no_grad():
|
||||
generate_ids = model.generate(**inputs, max_length=30)
|
||||
@ -258,7 +626,7 @@ model.delete_adapter("dpo")
|
||||
|
||||
Normally, each inference batch has to use the same adapter(s) in PEFT. This can sometimes be annoying, because we may have batches that contain samples intended to be used with different LoRA adapters. For example, we could have a base model that works well in English and two more LoRA adapters, one for French and one for German. Usually, we would have to split our batches such that each batch only contains samples of one of the languages; we could not combine different languages in the same batch.
|
||||
|
||||
Thankfully, it is possible to mix different LoRA adapters in the same batch using the `adapter_name` argument. Below, we show an examle of how this works in practice. First, let's load the base model, English, and the two adapters, French and German, like this:
|
||||
Thankfully, it is possible to mix different LoRA adapters in the same batch using the `adapter_name` argument. Below, we show an example of how this works in practice. First, let's load the base model, English, and the two adapters, French and German, like this:
|
||||
|
||||
```python
|
||||
from transformers import AutoTokenizer, AutoModelForCausalLM
|
||||
@ -303,16 +671,19 @@ output = peft_model.generate(**inputs, adapter_names=adapter_names, max_new_toke
|
||||
|
||||
Note that the order does not matter here, i.e. the samples in the batch don't need to be grouped by adapter as in the example above. We just need to ensure that the `adapter_names` argument is aligned correctly with the samples.
|
||||
|
||||
Additionally, the same approach also works with the `modules_to_save` feature, which allows for saving and reusing specific neural network layers, such as custom heads for classification tasks, across different LoRA adapters.
|
||||
|
||||
### Caveats
|
||||
|
||||
Using this features has some drawbacks, namely:
|
||||
Using this feature has some drawbacks, namely:
|
||||
|
||||
- It only works for inference, not for training.
|
||||
- Disabling adapters using the `with model.disable_adapter()` context takes precedence over `adapter_names`.
|
||||
- You cannot pass `adapter_names` when some adapter weights where merged with base weight using the `merge_adapter` method. Please unmerge all adapters first by calling `model.unmerge_adapter()`.
|
||||
- You cannot pass `adapter_names` when some adapter weights were merged with base weight using the `merge_adapter` method. Please unmerge all adapters first by calling `model.unmerge_adapter()`.
|
||||
- For obvious reasons, this cannot be used after calling `merge_and_unload()`, since all the LoRA adapters will be merged into the base weights in this case.
|
||||
- This feature does not currently work with DoRA, so set `use_dora=False` in your `LoraConfig` if you want to use it.
|
||||
- The `modules_to_save` feature is currently only supported for the layers of types `Linear`, `Embedding`, `Conv2d` and `Conv1d`.
|
||||
- There is an expected overhead for inference with `adapter_names`, especially if the amount of different adapters in the batch is high. This is because the batch size is effectively reduced to the number of samples per adapter. If runtime performance is your top priority, try the following:
|
||||
- Increase the batch size.
|
||||
- Try to avoid having a large number of different adapters in the same batch, prefer homogeneous batches. This can be achieved by buffering samples with the same adapter and only perform inference with a small handfull of different adapters.
|
||||
- Try to avoid having a large number of different adapters in the same batch, prefer homogeneous batches. This can be achieved by buffering samples with the same adapter and only perform inference with a small handful of different adapters.
|
||||
- Take a look at alternative implementations such as [LoRAX](https://github.com/predibase/lorax), [punica](https://github.com/punica-ai/punica), or [S-LoRA](https://github.com/S-LoRA/S-LoRA), which are specialized to work with a large number of different adapters.
|
||||
|
@ -16,7 +16,7 @@ rendered properly in your Markdown viewer.
|
||||
|
||||
# Adapter injection
|
||||
|
||||
With PEFT, you can inject trainable adapters into any `torch` module which allows you to use adapter methods without relying on the modeling classes in PEFT. Currently, PEFT supports injecting [LoRA](../conceptual_guides/adapter#low-rank-adaptation-lora), [AdaLoRA](../conceptual_guides/adapter#adaptive-low-rank-adaptation-adalora), and [IA3](../conceptual_guides/ia3) into models because for these adapters, inplace modification of the model is sufficient for finetuning it.
|
||||
With PEFT, you can inject trainable adapters into any `torch` module which allows you to use adapter methods without relying on the modeling classes in PEFT. This works for all adapters except for those based on prompt learning (e.g. prefix tuning or p-tuning).
|
||||
|
||||
Check the table below to see when you should inject adapters.
|
||||
|
||||
@ -25,6 +25,8 @@ Check the table below to see when you should inject adapters.
|
||||
| the model is modified inplace, keeping all the original attributes and methods | manually write the `from_pretrained` and `save_pretrained` utility functions from Hugging Face to save and load adapters |
|
||||
| works for any `torch` module and modality | doesn't work with any of the utility methods provided by `PeftModel` such as disabling and merging adapters |
|
||||
|
||||
## Creating a new PEFT model
|
||||
|
||||
To perform the adapter injection, use the [`inject_adapter_in_model`] method. This method takes 3 arguments, the PEFT config, the model, and an optional adapter name. You can also attach multiple adapters to the model if you call [`inject_adapter_in_model`] multiple times with different adapter names.
|
||||
|
||||
For example, to inject LoRA adapters into the `linear` submodule of the `DummyModel` module:
|
||||
@ -85,6 +87,30 @@ DummyModel(
|
||||
)
|
||||
```
|
||||
|
||||
### Injection based on a `state_dict`
|
||||
|
||||
Sometimes, it is possible that there is a PEFT adapter checkpoint but the corresponding PEFT config is not known for whatever reason. To inject the PEFT layers for this checkpoint, you would usually have to reverse-engineer the corresponding PEFT config, most notably the `target_modules` argument, based on the `state_dict` from the checkpoint. This can be cumbersome and error prone. To avoid this, it is also possible to call [`inject_adapter_in_model`] and pass the loaded `state_dict` as an argument:
|
||||
|
||||
```python
|
||||
from safetensors.torch import load_file
|
||||
|
||||
model = ...
|
||||
state_dict = load_file(<path-to-safetensors-file>)
|
||||
lora_config = LoraConfig(...)
|
||||
model = inject_adapter_in_model(lora_config, model, state_dict=state_dict)
|
||||
```
|
||||
|
||||
In this case, PEFT will use the `state_dict` as reference for which layers to target instead of using the PEFT config. As a user, you don't have to set the exact `target_modules` of the PEFT config for this to work. However, you should still pass a PEFT config of the right type (in this example `LoraConfig`); you can leave `target_modules` as `None`.
|
||||
|
||||
Be aware that this still only creates the uninitialized PEFT layers, the values from the `state_dict` are not used to populate the model weights. To populate the weights, proceed with calling [`set_peft_model_state_dict`] as described below.
|
||||
|
||||
⚠️ Note that if there is a mismatch between what is configured in the PEFT config and what is found in the `state_dict`, PEFT will warn you about this. You can ignore the warning if you know that the PEFT config is not correctly specified.
|
||||
|
||||
> [!WARNING]
|
||||
> If the original PEFT adapter was using `target_parameters` instead of `target_modules`, injecting from a `state_dict` will not work correctly. In this case, it is mandatory to use the correct PEFT config for injection.
|
||||
|
||||
## Saving the model
|
||||
|
||||
To only save the adapter, use the [`get_peft_model_state_dict`] function:
|
||||
|
||||
```python
|
||||
@ -95,3 +121,28 @@ print(peft_state_dict)
|
||||
```
|
||||
|
||||
Otherwise, `model.state_dict()` returns the full state dict of the model.
|
||||
|
||||
## Loading the model
|
||||
|
||||
After loading the saved `state_dict`, it can be applied using the [`set_peft_model_state_dict`] function:
|
||||
|
||||
```python
|
||||
from peft import set_peft_model_state_dict
|
||||
|
||||
model = DummyModel()
|
||||
model = inject_adapter_in_model(lora_config, model)
|
||||
outcome = set_peft_model_state_dict(model, peft_state_dict)
|
||||
# check that there were no wrong keys
|
||||
print(outcome.unexpected_keys)
|
||||
```
|
||||
|
||||
If injecting the adapter is slow or you need to load a large number of adapters, you may use an optimization that creates an "empty" adapter on the meta device and only fills in the real weights when [`set_peft_model_state_dict`] is called. To do this, pass `low_cpu_mem_usage=True` to both [`inject_adapter_in_model`] and [`set_peft_model_state_dict`].
|
||||
|
||||
```python
|
||||
model = DummyModel()
|
||||
model = inject_adapter_in_model(lora_config, model, low_cpu_mem_usage=True)
|
||||
|
||||
print(model.linear.lora_A["default"].weight.device.type == "meta") # should be True
|
||||
set_peft_model_state_dict(model, peft_state_dict, low_cpu_mem_usage=True)
|
||||
print(model.linear.lora_A["default"].weight.device.type == "cpu") # should be True
|
||||
```
|
||||
|
@ -50,6 +50,9 @@ config = PeftConfig.from_pretrained("smangrul/tinyllama_lora_norobots")
|
||||
model = AutoModelForCausalLM.from_pretrained(config.base_model_name_or_path, load_in_4bit=True, device_map="auto").eval()
|
||||
tokenizer = AutoTokenizer.from_pretrained("smangrul/tinyllama_lora_norobots")
|
||||
|
||||
model.config.vocab_size = 32005
|
||||
model.resize_token_embeddings(32005)
|
||||
|
||||
model = PeftModel.from_pretrained(model, "smangrul/tinyllama_lora_norobots", adapter_name="norobots")
|
||||
_ = model.load_adapter("smangrul/tinyllama_lora_sql", adapter_name="sql")
|
||||
_ = model.load_adapter("smangrul/tinyllama_lora_adcopy", adapter_name="adcopy")
|
||||
@ -96,12 +99,13 @@ Now you can use the merged model as an instruction-tuned model to write ad copy
|
||||
<hfoption id="instruct">
|
||||
|
||||
```py
|
||||
device = torch.accelerator.current_accelerator().type if hasattr(torch, "accelerator") else "cuda"
|
||||
messages = [
|
||||
{"role": "user", "content": "Write an essay about Generative AI."},
|
||||
]
|
||||
text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
|
||||
inputs = tokenizer(text, return_tensors="pt")
|
||||
inputs = {k: v.to("cuda") for k, v in inputs.items()}
|
||||
inputs = {k: v.to(device) for k, v in inputs.items()}
|
||||
outputs = model.generate(**inputs, max_new_tokens=256, do_sample=True, top_p=0.95, temperature=0.2, repetition_penalty=1.2, eos_token_id=tokenizer.eos_token_id)
|
||||
print(tokenizer.decode(outputs[0]))
|
||||
```
|
||||
@ -110,13 +114,14 @@ print(tokenizer.decode(outputs[0]))
|
||||
<hfoption id="ad copy">
|
||||
|
||||
```py
|
||||
device = torch.accelerator.current_accelerator().type if hasattr(torch, "accelerator") else "cuda"
|
||||
messages = [
|
||||
{"role": "system", "content": "Create a text ad given the following product and description."},
|
||||
{"role": "user", "content": "Product: Sony PS5 PlayStation Console\nDescription: The PS5 console unleashes new gaming possibilities that you never anticipated."},
|
||||
]
|
||||
text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
|
||||
inputs = tokenizer(text, return_tensors="pt")
|
||||
inputs = {k: v.to("cuda") for k, v in inputs.items()}
|
||||
inputs = {k: v.to(device) for k, v in inputs.items()}
|
||||
outputs = model.generate(**inputs, max_new_tokens=128, do_sample=True, top_p=0.95, temperature=0.2, repetition_penalty=1.2, eos_token_id=tokenizer.eos_token_id)
|
||||
print(tokenizer.decode(outputs[0]))
|
||||
```
|
||||
@ -125,16 +130,35 @@ print(tokenizer.decode(outputs[0]))
|
||||
<hfoption id="SQL">
|
||||
|
||||
```py
|
||||
device = torch.accelerator.current_accelerator().type if hasattr(torch, "accelerator") else "cuda"
|
||||
|
||||
text = """Table: 2-11365528-2
|
||||
Columns: ['Team', 'Head Coach', 'President', 'Home Ground', 'Location']
|
||||
Natural Query: Who is the Head Coach of the team whose President is Mario Volarevic?
|
||||
SQL Query:"""
|
||||
|
||||
inputs = tokenizer(text, return_tensors="pt")
|
||||
inputs = {k: v.to("cuda") for k, v in inputs.items()}
|
||||
inputs = {k: v.to(device) for k, v in inputs.items()}
|
||||
outputs = model.generate(**inputs, max_new_tokens=64, repetition_penalty=1.1, eos_token_id=tokenizer("</s>").input_ids[-1])
|
||||
print(tokenizer.decode(outputs[0]))
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
</hfoptions>
|
||||
|
||||
|
||||
## Merging (IA)³ Models
|
||||
The (IA)³ models facilitate linear merging of adapters. To merge adapters in an (IA)³ model, utilize the `add_weighted_adapter` method from the `IA3Model` class. This method is analogous to the `add_weighted_adapter` method used in `LoraModel`, with the key difference being the absence of the `combination_type` parameter. For example, to merge three (IA)³ adapters into a PEFT model, you would proceed as follows:
|
||||
|
||||
```py
|
||||
adapters = ["adapter1", "adapter2", "adapter3"]
|
||||
weights = [0.4, 0.3, 0.3]
|
||||
adapter_name = "merge"
|
||||
model.add_weighted_adapter(adapters, weights, adapter_name)
|
||||
```
|
||||
|
||||
It is recommended that the weights sum to 1.0 to preserve the scale of the model. The merged model can then be set as the active model using the `set_adapter` method:
|
||||
|
||||
```py
|
||||
model.set_adapter("merge")
|
||||
```
|
||||
|
@ -21,7 +21,7 @@ Quantization represents data with fewer bits, making it a useful technique for r
|
||||
* optimizing which model weights are quantized with the [AWQ](https://hf.co/papers/2306.00978) algorithm
|
||||
* independently quantizing each row of a weight matrix with the [GPTQ](https://hf.co/papers/2210.17323) algorithm
|
||||
* quantizing to 8-bit and 4-bit precision with the [bitsandbytes](https://github.com/TimDettmers/bitsandbytes) library
|
||||
* quantizing to as low as 2-bit precision with the [AQLM](https://huggingface.co/papers/2401.06118) algorithm
|
||||
|
||||
However, after a model is quantized it isn't typically further trained for downstream tasks because training can be unstable due to the lower precision of the weights and activations. But since PEFT methods only add *extra* trainable parameters, this allows you to train a quantized model with a PEFT adapter on top! Combining quantization with PEFT can be a good strategy for training even the largest models on a single GPU. For example, [QLoRA](https://hf.co/papers/2305.14314) is a method that quantizes a model to 4-bits and then trains it with LoRA. This method allows you to finetune a 65B parameter model on a single 48GB GPU!
|
||||
|
||||
@ -107,11 +107,37 @@ QLoRA adds trainable weights to all the linear layers in the transformer archite
|
||||
config = LoraConfig(target_modules="all-linear", ...)
|
||||
```
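For illustration, a sketch of a QLoRA-style setup that combines a 4-bit bitsandbytes base model with LoRA on all linear layers; the model id and hyperparameters are placeholders, not recommendations.

```python
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)
base_model = AutoModelForCausalLM.from_pretrained(
    "facebook/opt-125m",  # placeholder model id
    quantization_config=bnb_config,
    device_map="auto",
)
config = LoraConfig(target_modules="all-linear", task_type="CAUSAL_LM")
model = get_peft_model(base_model, config)
model.print_trainable_parameters()
```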
|
||||
|
||||
## GPTQ quantization
|
||||
|
||||
You can learn more about GPTQ-based 2-, 3-, 4-, and 8-bit quantization at [GPTQModel](https://github.com/ModelCloud/GPTQModel) and in the Transformers [GPTQ](https://huggingface.co/docs/transformers/quantization/gptq) docs. For post-quantization training, PEFT can use both the [GPTQModel](https://github.com/ModelCloud/GPTQModel) and [AutoGPTQ](https://github.com/autogptq/autogptq) libraries, but we recommend GPTQModel because AutoGPTQ will be deprecated in a future release.
|
||||
|
||||
```bash
|
||||
# gptqmodel install
|
||||
pip install gptqmodel --no-build-isolation
|
||||
```
|
||||
|
||||
```py
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer, GPTQConfig
|
||||
|
||||
model_id = "facebook/opt-125m"
|
||||
tokenizer = AutoTokenizer.from_pretrained(model_id)
|
||||
|
||||
gptq_config = GPTQConfig(bits=4, group_size=128, dataset="wikitext2", tokenizer=tokenizer)
|
||||
|
||||
quantized_model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", quantization_config=gptq_config)
|
||||
|
||||
# save quantized model
|
||||
quantized_model.save_pretrained("./opt-125m-gptq")
|
||||
tokenizer.save_pretrained("./opt-125m-gptq")
|
||||
```
|
||||
|
||||
Once quantized, you can post-train GPTQ models with PEFT APIs.
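As a sketch, attaching a LoRA adapter to the GPTQ-quantized model from the snippet above could look like this; the rank and target modules are illustrative values, not tuned recommendations.

```python
from peft import LoraConfig, get_peft_model

peft_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q_proj", "v_proj"],
    task_type="CAUSAL_LM",
)
model = get_peft_model(quantized_model, peft_config)
model.print_trainable_parameters()
```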
|
||||
|
||||
## AQLM quantization
|
||||
|
||||
Additive Quantization of Language Models ([AQLM](https://huggingface.co/papers/2401.06118)) is a Large Language Models compression method. It quantizes multiple weights together and takes advantage of interdependencies between them. AQLM represents groups of 8-16 weights as a sum of multiple vector codes. This allows it to compress models down to as low as 2-bit with considerably low accuracy losses.
|
||||
|
||||
Since the AQLM quantization process is computationally expensive, the use of prequantized models is recommended. A partial list of available models can be found in the official aqlm [repository](https://github.com/Vahe1994/AQLM).
|
||||
|
||||
The models support LoRA adapter tuning. To tune the quantized model, you'll need to install the `aqlm` inference library: `pip install aqlm>=1.0.2`. Finetuned LoRA adapters must be saved separately, as merging them with the AQLM-quantized weights is not possible.
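A minimal sketch of attaching LoRA to a prequantized AQLM checkpoint; the model id is a placeholder for one of the checkpoints listed in the repository above, and the target modules are illustrative.

```python
from transformers import AutoModelForCausalLM
from peft import LoraConfig, get_peft_model

quantized_model = AutoModelForCausalLM.from_pretrained(
    "<prequantized-aqlm-model-id>",  # placeholder
    device_map="auto",
)
config = LoraConfig(target_modules=["q_proj", "v_proj"], task_type="CAUSAL_LM")
model = get_peft_model(quantized_model, config)
```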
|
||||
|
||||
@ -166,15 +192,15 @@ model = get_peft_model(model, config)
|
||||
|
||||
## HQQ quantization
|
||||
|
||||
The models that are quantized using Half-Quadratic Quantization of Large Machine Learning Models ([HQQ](https://mobiusml.github.io/hqq_blog/)) support LoRA adapter tuning. To tune the quantized model, you'll need to install the `hqq` library with: `pip install hqq`.
|
||||
|
||||
```python
import torch

from hqq.engine.hf import HQQModelForCausalLM
from peft import LoraConfig, get_peft_model

device = torch.accelerator.current_accelerator().type if hasattr(torch, "accelerator") else "cuda"

quantized_model = HQQModelForCausalLM.from_quantized(save_dir_or_hfhub, device=device)
peft_config = LoraConfig(...)

quantized_model = get_peft_model(quantized_model, peft_config)
```
|
||||
|
||||
@ -184,17 +210,85 @@ Or using transformers version that is compatible with HQQ (e.g. by installing it
|
||||
from transformers import HqqConfig, AutoModelForCausalLM
|
||||
|
||||
quant_config = HqqConfig(nbits=4, group_size=64)
|
||||
|
||||
quantized_model = AutoModelForCausalLM.from_pretrained(save_dir_or_hfhub, device_map=device_map, quantization_config=quant_config)
|
||||
peft_config = LoraConfig(...)
|
||||
|
||||
quantized_model = get_peft_model(quantized_model, peft_config)
|
||||
```
|
||||
|
||||
## torchao (PyTorch Architecture Optimization)
|
||||
|
||||
PEFT supports models quantized with [torchao](https://github.com/pytorch/ao) ("ao") for int8 quantization.
|
||||
|
||||
```python
|
||||
from peft import LoraConfig, get_peft_model
|
||||
from transformers import AutoModelForCausalLM, TorchAoConfig
|
||||
|
||||
model_id = ...
|
||||
quantization_config = TorchAoConfig(quant_type="int8_weight_only")
|
||||
base_model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=quantization_config)
|
||||
peft_config = LoraConfig(...)
|
||||
model = get_peft_model(base_model, peft_config)
|
||||
```
|
||||
|
||||
### Caveats:
|
||||
|
||||
- Use the most recent versions of torchao (>= v0.4.0) and transformers (> 4.42).
|
||||
- Only linear layers are currently supported.
|
||||
- `quant_type = "int4_weight_only"` is currently not supported.
|
||||
- `NF4` is not implemented in transformers as of yet and is thus also not supported.
|
||||
- DoRA only works with `quant_type = "int8_weight_only"` at the moment.
|
||||
- There is explicit support for torchao when used with LoRA. However, when torchao quantizes a layer, its class does not change, only the type of the underlying tensor. For this reason, PEFT methods other than LoRA will generally also work with torchao, even if not explicitly supported. Be aware, however, that **merging only works correctly with LoRA and with `quant_type = "int8_weight_only"`**. If you use a different PEFT method or dtype, merging will likely result in an error, and even if it doesn't, the results will still be incorrect.
|
||||
|
||||
## INC quantization
|
||||
|
||||
Intel Neural Compressor ([INC](https://github.com/intel/neural-compressor)) enables model quantization for various devices,
|
||||
including Intel Gaudi accelerators (also known as HPU devices). You can perform LoRA fine-tuning on models that have been
|
||||
quantized using INC. To use INC with PyTorch models, install the library with: `pip install neural-compressor[pt]`.
|
||||
Quantizing a model to FP8 precision for HPU devices can be done with the following single-step quantization workflow:
|
||||
|
||||
```python
|
||||
import torch
|
||||
from neural_compressor.torch.quantization import FP8Config, convert, finalize_calibration, prepare
|
||||
quant_configs = {
|
||||
...
|
||||
}
|
||||
config = FP8Config(**quant_configs)
|
||||
```
|
||||
|
||||
Pass the config to the `prepare` method, run inference to gather calibration stats, and call `finalize_calibration`
|
||||
and `convert` methods to quantize model to FP8 precision:
|
||||
|
||||
```python
|
||||
model = prepare(model, config)
|
||||
# Run inference to collect calibration statistics
|
||||
...
|
||||
# Finalize calibration and convert the model to FP8 precision
|
||||
finalize_calibration(model)
|
||||
model = convert(model)
|
||||
# Load PEFT LoRA adapter as usual
|
||||
...
|
||||
```
|
||||
|
||||
An example demonstrating how to load a PEFT LoRA adapter into an INC-quantized FLUX text-to-image model for HPU
|
||||
devices is provided [here](https://github.com/huggingface/peft/blob/main/examples/stable_diffusion/inc_flux_lora_hpu.py).
|
||||
|
||||
|
||||
### Caveats:
|
||||
|
||||
- `merge()` and `unmerge()` methods are currently not supported for INC-quantized models.
|
||||
- Currently, only **Linear** INC-quantized layers are supported when loading PEFT adapters.
|
||||
|
||||
## Other Supported PEFT Methods
|
||||
|
||||
Besides LoRA, the following PEFT methods also support quantization:
|
||||
|
||||
- **VeRA** (supports bitsandbytes quantization)
|
||||
- **AdaLoRA** (supports both bitsandbytes and GPTQ quantization)
|
||||
- **(IA)³** (supports bitsandbytes quantization)
|
||||
|
||||
## Next steps
|
||||
|
||||
If you're interested in learning more about quantization, the following may be helpful:
|
||||
|
||||
* Learn more details about QLoRA and check out some benchmarks on its impact in the [Making LLMs even more accessible with bitsandbytes, 4-bit quantization and QLoRA](https://huggingface.co/blog/4bit-transformers-bitsandbytes) blog post.
|
||||
* Read more about different quantization schemes in the Transformers [Quantization](https://hf.co/docs/transformers/main/quantization) guide.
|
||||
|
docs/source/developer_guides/torch_compile.md (new file, 71 lines)
@ -0,0 +1,71 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
|
||||
⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
|
||||
rendered properly in your Markdown viewer.
|
||||
|
||||
-->
|
||||
|
||||
# torch.compile
|
||||
|
||||
In PEFT, [torch.compile](https://pytorch.org/tutorials/intermediate/torch_compile_tutorial.html) works for some but not all features. The reason it won't always work is that PEFT is highly dynamic in certain places (loading and switching between multiple adapters, for instance), which can cause trouble for `torch.compile`. In other places, `torch.compile` may work, but won't be as fast as expected because of graph breaks.
|
||||
|
||||
If you don't see an error, it doesn't necessarily mean that `torch.compile` worked correctly. It might give you an output, but the output is incorrect. This guide describes what works with `torch.compile` and what doesn't. For your own testing, we recommend using the latest PyTorch version, as `torch.compile` is constantly being improved.
|
||||
|
||||
> [!TIP]
|
||||
> Unless indicated otherwise, the default `torch.compile` settings were used.
|
||||
|
||||
## Training and inference with `torch.compile`
|
||||
|
||||
These features **work** with `torch.compile`. Everything listed below was tested with a causal LM:
|
||||
|
||||
- Training with `Trainer` from 🤗 transformers
|
||||
- Training with a custom PyTorch loop
|
||||
- Inference
|
||||
- Generation
|
||||
|
||||
The following adapters were tested successfully:
|
||||
|
||||
- AdaLoRA
|
||||
- BOFT
|
||||
- Bone
|
||||
- IA³
|
||||
- Layer Norm Tuning
|
||||
- LoHa
|
||||
- LoKr
|
||||
- LoRA
|
||||
- LoRA + DoRA
|
||||
- LoRA applied to embedding layers
|
||||
- OFT
|
||||
- VeRA
|
||||
- HRA
|
||||
|
||||
## Advanced PEFT features with `torch.compile`
|
||||
|
||||
Below are some of the more advanced PEFT features that **work**. They were all tested with LoRA.
|
||||
|
||||
- `modules_to_save` (i.e. `config = LoraConfig(..., modules_to_save=...)`)
|
||||
- Merging adapters (one or multiple)
|
||||
- Merging multiple adapters into one adapter (i.e. calling `model.add_weighted_adapter(...)`)
|
||||
- Using PEFT adapters with quantization (bitsandbytes)
|
||||
- Disabling adapters (i.e. using `with model.disable_adapter()`)
|
||||
- Unloading (i.e. calling `model.merge_and_unload()`)
|
||||
- Mixed adapter batches (i.e. calling `model(batch, adapter_names=["__base__", "default", "other", ...])`)
|
||||
- Inference with multiple adapters (i.e. using `model.add_adapter` or `model.load_adapter` to load more than 1 adapter); for this, only call `torch.compile` _after_ loading all adapters
|
||||
|
||||
Generally, we can expect that if a feature works correctly with LoRA and is also supported by other adapter types, it should also work for that adapter type.
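A minimal sketch of compiling a LoRA model for inference; the model id and config values are illustrative. Note that `torch.compile` is called only after the adapter setup is complete.

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import LoraConfig, get_peft_model

model_id = "facebook/opt-125m"  # placeholder model id
tokenizer = AutoTokenizer.from_pretrained(model_id)
base_model = AutoModelForCausalLM.from_pretrained(model_id)

peft_model = get_peft_model(base_model, LoraConfig(task_type="CAUSAL_LM"))
compiled_model = torch.compile(peft_model)  # compile after all adapters are loaded

inputs = tokenizer("torch.compile and PEFT", return_tensors="pt")
with torch.inference_mode():
    outputs = compiled_model(**inputs)
print(outputs.logits.shape)
```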
|
||||
|
||||
## Test cases
|
||||
|
||||
All the use cases listed above are tested inside of [`peft/tests/test_torch_compile.py`](https://github.com/huggingface/peft/blob/main/tests/test_torch_compile.py). If you want to check in more detail how we tested a certain feature, please go to that file and check the test that corresponds to your use case.
|
||||
|
||||
> [!TIP]
|
||||
> If you have another use case where you know that `torch.compile` does or does not work with PEFT, please contribute by letting us know or by opening a PR to add this use case to the covered test cases.
|
@ -39,7 +39,9 @@ Installing PEFT from source is useful for keeping up with the latest development
|
||||
python -m pip install git+https://github.com/huggingface/peft
|
||||
```
|
||||
|
||||
## Dtype-related issues
|
||||
|
||||
### ValueError: Attempting to unscale FP16 gradients
|
||||
|
||||
This error probably occurred because the model was loaded with `torch_dtype=torch.float16` and then used in an automatic mixed precision (AMP) context, e.g. by setting `fp16=True` in the [`~transformers.Trainer`] class from 🤗 Transformers. The reason is that when using AMP, trainable weights should never use fp16. To make this work without loading the whole model in fp32, add the following to your code:
|
||||
|
||||
@ -69,6 +71,29 @@ trainer = Trainer(model=peft_model, fp16=True, ...)
|
||||
trainer.train()
|
||||
```
|
||||
|
||||
<Tip>
|
||||
|
||||
Starting from PEFT version v0.12.0, PEFT automatically promotes the dtype of adapter weights from `torch.float16` and `torch.bfloat16` to `torch.float32` where appropriate. To _prevent_ this behavior, you can pass `autocast_adapter_dtype=False` to [`~get_peft_model`], to [`~PeftModel.from_pretrained`], and to [`~PeftModel.load_adapter`].
|
||||
|
||||
</Tip>
|
||||
|
||||
### Selecting the dtype of the adapter
|
||||
|
||||
Most PEFT methods, like LoRA, work by adding trainable adapter weights. By default, those weights are stored in float32 dtype (fp32), i.e. at a relatively high precision. Therefore, even if the base model is loaded in float16 (fp16) or bfloat16 (bf16), the adapter weights are float32. When the adapter results are calculated during the forward pass, the input will typically be in the dtype of the base model, thus it will be upcast to float32 if necessary, then cast back to the original dtype.
|
||||
|
||||
If you prefer to have the adapter weights in the lower precision of the base model, i.e. in float16 or bfloat16, you can pass `autocast_adapter_dtype=False` when creating the model ([`~get_peft_model`]) or loading the model ([`~PeftModel.from_pretrained`]). There are some advantages and disadvantages to this:
|
||||
|
||||
Advantages of half precision adapter:
|
||||
- computation slightly faster
|
||||
- slightly less memory
|
||||
- smaller file size of checkpoint (half the size)
|
||||
|
||||
Disadvantages of half precision adapter:
|
||||
- slightly worse loss
|
||||
- higher risk of overflow or underflow
|
||||
|
||||
Note that for most use cases, overall runtime and memory cost will be determined by the size of the base model and by the dataset, while the dtype of the PEFT adapter will only have a small impact.
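A sketch of keeping the adapter weights in the base model's bfloat16 instead of float32; the model id is a placeholder.

```python
import torch
from transformers import AutoModelForCausalLM
from peft import LoraConfig, get_peft_model

base_model = AutoModelForCausalLM.from_pretrained("<model-id>", torch_dtype=torch.bfloat16)
lora_config = LoraConfig(task_type="CAUSAL_LM")
# autocast_adapter_dtype=False keeps the LoRA weights in bf16 instead of promoting them to fp32
model = get_peft_model(base_model, lora_config, autocast_adapter_dtype=False)
```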
|
||||
|
||||
## Bad results from a loaded PEFT model
|
||||
|
||||
There can be several reasons for getting a poor result from a loaded PEFT model which are listed below. If you're still unable to troubleshoot the problem, see if anyone else had a similar [issue](https://github.com/huggingface/peft/issues) on GitHub, and if you can't find any, open a new issue.
|
||||
@ -112,11 +137,45 @@ You should probably TRAIN this model on a down-stream task to be able to use it
|
||||
|
||||
The mentioned layers should be added to `modules_to_save` in the config to avoid the described problem.
|
||||
|
||||
<Tip>
|
||||
|
||||
As an example, when loading a model that is using the DeBERTa architecture for sequence classification, you'll see a warning that the following weights are newly initialized: `['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']`. From this, it follows that the `classifier` and `pooler` layers should be added to: `modules_to_save=["classifier", "pooler"]`.
|
||||
|
||||
</Tip>
|
||||
|
||||
### Extending the vocabulary
|
||||
|
||||
For many language fine-tuning tasks, extending the model's vocabulary is necessary since new tokens are being introduced. This requires extending the embedding layer to account for the new tokens and, depending on the fine-tuning method, also storing the embedding layer in addition to the adapter weights when saving the adapter. There are a few ways of achieving this, ordered by parameter efficiency:
|
||||
|
||||
- [trainable tokens](../package_reference/trainable_tokens), train only the specified tokens, optionally store only the updated values
|
||||
- training an adapter on the embedding matrix, optionally store only the updated values
|
||||
- full-finetuning of the embedding layer
|
||||
|
||||
#### Using trainable tokens
|
||||
|
||||
Let's start with trainable tokens, in this case its [LoRA integration](../developer_guides/lora#efficiently-train-tokens-alongside-lora). If you're interested in only training the new embeddings and nothing else, refer to the [standalone documentation](../package_reference/trainable_tokens).
|
||||
|
||||
To enable selective token training of the embedding layer, you'll need to supply the token ids of your newly added tokens via the `trainable_token_indices` parameter. Optionally you can specify which layer to target if there is more than one embedding layer. For a Mistral model this could look like this:
|
||||
|
||||
```python
|
||||
new_tokens = ['<think>', '</think>']
|
||||
tokenizer.add_tokens(new_tokens)
|
||||
base_model.resize_token_embeddings(len(tokenizer))
|
||||
|
||||
lora_config = LoraConfig(
|
||||
...,
|
||||
trainable_token_indices={'embed_tokens': tokenizer.convert_tokens_to_ids(new_tokens)},
|
||||
)
|
||||
```
|
||||
|
||||
If your model uses tied weights (such as the `lm_head`), trainable tokens will try to resolve those and keep them updated as well, so in that case there should be no need for adding `modules_to_save=["lm_head"]`. This only works if the model uses the Transformers convention for tying weights.
|
||||
|
||||
Saving the model with `model.save_pretrained` may save the full embedding matrix instead of only the difference as a precaution because the embedding matrix was resized. To save space, you can disable this behavior by setting `save_embedding_layers=False` when calling `save_pretrained`. This is safe to do as long as you don't modify the embedding matrix through other means as well, as such changes will not be tracked by trainable tokens.
|
||||
|
||||
#### Using an adapter, e.g. LoRA
|
||||
|
||||
Prepare the embedding layer by adding it to the `target_modules` of your adapter config. For example, the Mistral config could look like this:
|
||||
|
||||
```python
|
||||
config = LoraConfig(..., target_modules=["embed_tokens", "lm_head", "q_proj", "v_proj"])
|
||||
@ -124,7 +183,7 @@ config = LoraConfig(..., target_modules=["embed_tokens", "lm_head", "q_proj", "v
|
||||
|
||||
Once added to `target_modules`, PEFT automatically stores the embedding layer when saving the adapter if the model has the [`~transformers.PreTrainedModel.get_input_embeddings`] and [`~transformers.PreTrainedModel.get_output_embeddings`]. This is generally the case for Transformers models.
|
||||
|
||||
If the model's embedding layer doesn't follow the Transformer's naming scheme but nevertheless implements `get_input_embeddings`, you can still save it by manually passing `save_embedding_layers=True` when saving the adapter:
|
||||
|
||||
```python
|
||||
model = get_peft_model(...)
|
||||
@ -136,6 +195,42 @@ For inference, load the base model first and resize it the same way you did befo
|
||||
|
||||
For a complete example, please check out [this notebook](https://github.com/huggingface/peft/blob/main/examples/causal_language_modeling/peft_lora_clm_with_additional_tokens.ipynb).
|
||||
|
||||
#### Full fine-tuning
|
||||
|
||||
Full fine-tuning is more costly in terms of VRAM or storage space but if all else fails, you can fall back to this and see if it works for you. Achieve it by adding the name of the embedding layer to `modules_to_save`. Note that you need to add tied layers as well, e.g. `lm_head`. Example for a Mistral model with LoRA:
|
||||
|
||||
```python
|
||||
config = LoraConfig(..., modules_to_save=["embed_tokens", "lm_head"], target_modules=["q_proj", "v_proj"])
|
||||
```
|
||||
|
||||
### Getting a warning about "weights not being initialized from the model checkpoint"
|
||||
|
||||
When you load your PEFT model which has been trained on a task (for example, classification), you may get a warning like:
|
||||
|
||||
> Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']. You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
|
||||
|
||||
Although this looks scary, it is most likely nothing to worry about. This warning comes from Transformers, and it isn't a PEFT specific warning. It lets you know that a randomly initialized classification head (`score`) is attached to the base model, and the head must be trained to produce sensible predictions.
|
||||
|
||||
When you get this warning _before_ training the model, PEFT automatically takes care of making the classification head trainable if you correctly passed the `task_type` argument to the PEFT config.
|
||||
|
||||
```python
|
||||
from peft import LoraConfig, TaskType
|
||||
|
||||
lora_config = LoraConfig(..., task_type=TaskType.SEQ_CLS)
|
||||
```
|
||||
|
||||
If your classification head does not follow the usual naming conventions from Transformers (which is rare), you have to explicitly tell PEFT the name of the head in `modules_to_save`.
|
||||
|
||||
```python
|
||||
lora_config = LoraConfig(..., modules_to_save=["name-of-classification-head"])
|
||||
```
|
||||
|
||||
To check the name of the classification head, print the model and it should be the last module.
|
||||
|
||||
If you get this warning from your inference code, i.e. _after_ training the model, when you load the PEFT model, you always have to load the Transformers model first. Since Transformers does not know that you will load PEFT weights afterwards, it still gives the warning.
|
||||
|
||||
As always, it is best practice to ensure the model works correctly for inference by running some validation on it.
|
||||
|
||||
### Check layer and model status
|
||||
|
||||
Sometimes a PEFT model can end up in a bad state, especially when handling multiple adapters. There can be some confusion around what adapters exist, which one is active, which one is merged, etc. To help investigate this issue, call the [`~peft.PeftModel.get_layer_status`] and the [`~peft.PeftModel.get_model_status`] methods.
|
||||
@ -208,6 +303,7 @@ It is possible to get this information for non-PEFT models if they are using PEF
|
||||
>>> pipe = StableDiffusionPipeline.from_pretrained(path, torch_dtype=torch.float16)
|
||||
>>> pipe.load_lora_weights(lora_id, adapter_name="adapter-1")
|
||||
>>> pipe.load_lora_weights(lora_id, adapter_name="adapter-2")
|
||||
>>> pipe.set_lora_device(["adapter-2"], "cuda")
|
||||
>>> get_layer_status(pipe.text_encoder)
|
||||
[TunerLayerStatus(name='text_model.encoder.layers.0.self_attn.k_proj',
|
||||
module_type='lora.Linear',
|
||||
@ -215,14 +311,15 @@ It is possible to get this information for non-PEFT models if they are using PEF
|
||||
active_adapters=['adapter-2'],
|
||||
merged_adapters=[],
|
||||
requires_grad={'adapter-1': False, 'adapter-2': True},
|
||||
available_adapters=['adapter-1', 'adapter-2'],
|
||||
devices={'adapter-1': ['cpu'], 'adapter-2': ['cuda']}),
|
||||
TunerLayerStatus(name='text_model.encoder.layers.0.self_attn.v_proj',
|
||||
module_type='lora.Linear',
|
||||
enabled=True,
|
||||
active_adapters=['adapter-2'],
|
||||
merged_adapters=[],
|
||||
requires_grad={'adapter-1': False, 'adapter-2': True},
|
||||
available_adapters=['adapter-1', 'adapter-2'],
|
||||
devices={'adapter-1': ['cpu'], 'adapter-2': ['cuda']}),
|
||||
...]
|
||||
|
||||
>>> get_model_status(pipe.unet)
|
||||
@ -238,5 +335,69 @@ TunerModelStatus(
|
||||
merged_adapters=[],
|
||||
requires_grad={'adapter-1': False, 'adapter-2': True},
|
||||
available_adapters=['adapter-1', 'adapter-2'],
|
||||
devices={'adapter-1': ['cpu'], 'adapter-2': ['cuda']},
|
||||
)
|
||||
```
|
||||
|
||||
## Speed
|
||||
|
||||
### Loading adapter weights is slow
|
||||
|
||||
Loading adapters like LoRA weights should generally be fast compared to loading the base model. However, there can be use cases where the adapter weights are quite large or where users need to load a large number of adapters -- the loading time can add up in this case. The reason for this is that the adapter weights are first initialized and then overridden by the loaded weights, which is wasteful. To speed up the loading time, you can pass the `low_cpu_mem_usage=True` argument to [`~PeftModel.from_pretrained`] and [`~PeftModel.load_adapter`].
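A sketch of what this looks like when loading several adapters; the ids and paths are placeholders.

```python
from transformers import AutoModelForCausalLM
from peft import PeftModel

base_model = AutoModelForCausalLM.from_pretrained("<base-model-id>")
model = PeftModel.from_pretrained(base_model, "<path-to-adapter-0>", low_cpu_mem_usage=True)
model.load_adapter("<path-to-adapter-1>", adapter_name="other", low_cpu_mem_usage=True)
```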
|
||||
|
||||
<Tip>
|
||||
|
||||
If this option works well across different use cases, it may become the default for adapter loading in the future.
|
||||
|
||||
</Tip>
|
||||
|
||||
|
||||
## Reproducibility
|
||||
|
||||
### Models using batch norm
|
||||
|
||||
When loading a trained PEFT model where the base model uses batch norm (e.g. `torch.nn.BatchNorm1d` or `torch.nn.BatchNorm2d`), you may find that you cannot reproduce the exact same outputs. This is because the batch norm layers keep track of running stats during training, but these stats are not part of the PEFT checkpoint. Therefore, when you load the PEFT model, the running stats of the base model will be used (i.e. from before training with PEFT).
|
||||
|
||||
Depending on your use case, this may not be a big deal. If, however, you need your outputs to be 100% reproducible, you can achieve this by adding the batch norm layers to `modules_to_save`. Below is an example of this using resnet and LoRA. Notice that we set `modules_to_save=["classifier", "normalization"]`. We need the `"classifier"` argument because our task is image classification, and we add the `"normalization"` argument to ensure that the batch norm layers are saved in the PEFT checkpoint.
|
||||
|
||||
```python
|
||||
from transformers import AutoModelForImageClassification
|
||||
from peft import LoraConfig, get_peft_model
|
||||
|
||||
model_id = "microsoft/resnet-18"
|
||||
base_model = AutoModelForImageClassification.from_pretrained(model_id)
|
||||
config = LoraConfig(
|
||||
target_modules=["convolution"],
|
||||
modules_to_save=["classifier", "normalization"],
|
||||
)
model = get_peft_model(base_model, config)
|
||||
```
|
||||
|
||||
Depending on the type of model you use, the batch norm layers could have different names than `"normalization"`, so please ensure that the name matches your model architecture.
|
||||
|
||||
## Version mismatch
|
||||
|
||||
### Error while loading the config because of an unexpected keyword argument
|
||||
|
||||
When you encounter an error like the one shown below, it means the adapter you're trying to load was trained with a more recent version of PEFT than the version you have installed on your system.
|
||||
|
||||
```
|
||||
TypeError: LoraConfig.__init__() got an unexpected keyword argument <argument-name>
|
||||
```
|
||||
|
||||
The best way to resolve this issue is to install the latest PEFT version:
|
||||
|
||||
```sh
|
||||
python -m pip install -U peft
|
||||
```
|
||||
|
||||
If the adapter was trained from a source install of PEFT (an unreleased version of PEFT), then you also need to install PEFT from source.
|
||||
|
||||
```sh
|
||||
python -m pip install -U git+https://github.com/huggingface/peft.git
|
||||
```
|
||||
|
||||
If it is not possible for you to upgrade PEFT, there is a workaround you can try.
|
||||
|
||||
Assume the error message says that the unknown keyword argument is named `foobar`. Search inside the `adapter_config.json` of this PEFT adapter for the `foobar` entry and delete it from the file. Then save the file and try loading the model again.
|
||||
|
||||
This solution works most of the time. As long as it is the default value for `foobar`, it can be ignored. However, when it is set to some other value, you will get incorrect results. Upgrading PEFT is the recommended solution.
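A small sketch of that workaround, where `foobar` stands in for whatever argument the error message names and the path is a placeholder:

```python
import json

path = "<path-to-adapter>/adapter_config.json"  # placeholder
with open(path) as f:
    config = json.load(f)
config.pop("foobar", None)  # drop the entry the older PEFT version doesn't know
with open(path, "w") as f:
    json.dump(config, f, indent=2)
```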
|
||||
|
@ -23,14 +23,14 @@ PEFT is integrated with the Transformers, Diffusers, and Accelerate libraries to
|
||||
<div class="mt-10">
|
||||
<div class="w-full flex flex-col space-y-4 md:space-y-0 md:grid md:grid-cols-2 md:gap-y-4 md:gap-x-5">
|
||||
<a class="!no-underline border dark:border-gray-700 p-5 rounded-lg shadow hover:shadow-lg" href="quicktour"
|
||||
><div class="w-full text-center bg-gradient-to-br from-blue-400 to-blue-500 rounded-lg py-1.5 font-semibold mb-5 text-white text-lg leading-relaxed">Quicktour</div>
|
||||
<p class="text-gray-700">Start here if you're new to 🤗 PEFT to get an overview of the library's main features, and how to train a model with a PEFT method.</p>
|
||||
</a>
|
||||
<a class="!no-underline border dark:border-gray-700 p-5 rounded-lg shadow hover:shadow-lg" href="./task_guides/prompt_based_methods"
|
||||
><div class="w-full text-center bg-gradient-to-br from-indigo-400 to-indigo-500 rounded-lg py-1.5 font-semibold mb-5 text-white text-lg leading-relaxed">How-to guides</div>
|
||||
<p class="text-gray-700">Practical guides demonstrating how to apply various PEFT methods across different types of tasks like image classification, causal language modeling, automatic speech recognition, and more. Learn how to use 🤗 PEFT with the DeepSpeed and Fully Sharded Data Parallel scripts.</p>
|
||||
</a>
|
||||
<a class="!no-underline border dark:border-gray-700 p-5 rounded-lg shadow hover:shadow-lg" href="./conceptual_guides/adapter"
|
||||
><div class="w-full text-center bg-gradient-to-br from-pink-400 to-pink-500 rounded-lg py-1.5 font-semibold mb-5 text-white text-lg leading-relaxed">Conceptual guides</div>
|
||||
<p class="text-gray-700">Get a better theoretical understanding of how LoRA and various soft prompting methods help reduce the number of trainable parameters to make training more efficient.</p>
|
||||
</a>
|
||||
|
@ -16,7 +16,7 @@ rendered properly in your Markdown viewer.
|
||||
|
||||
# Installation
|
||||
|
||||
Before you start, you will need to set up your environment, install the appropriate packages, and configure 🤗 PEFT. 🤗 PEFT is tested on **Python 3.9+**.
|
||||
|
||||
🤗 PEFT is available on PyPI, as well as GitHub:
|
||||
|
||||
@ -43,5 +43,5 @@ repository:
|
||||
```bash
|
||||
git clone https://github.com/huggingface/peft
|
||||
cd peft
|
||||
pip install -e .[test]
|
||||
```
|
||||
|
@ -16,7 +16,7 @@ rendered properly in your Markdown viewer.
|
||||
|
||||
# BOFT
|
||||
|
||||
[Orthogonal Butterfly (BOFT)](https://hf.co/papers/2311.06243) is a generic method designed for finetuning foundation models. It improves the parameter efficiency of the finetuning paradigm -- Orthogonal Finetuning (OFT), by taking inspiration from Cooley-Tukey fast Fourier transform, showing favorable results across finetuning different foundation models, including large vision transformers, large language models and text-to-image diffusion models.
|
||||
|
||||
The abstract from the paper is:
|
||||
|
||||
|
docs/source/package_reference/bone.md (new file, 33 lines)
@ -0,0 +1,33 @@
|
||||
<!--Copyright 2024 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
|
||||
⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
|
||||
rendered properly in your Markdown viewer.
|
||||
|
||||
-->
|
||||
|
||||
# Bone
|
||||
|
||||
Dimension-Sharding Adaptation ([DiSHA](https://huggingface.co/papers/2409.15371)) expands the PEFT design space to unlock lower intrinsic ranks and faster convergence by default. Building on DiSHA, the authors propose an efficient structure called Block-Affine Adaptation (Bone) and a non-linear update method called Block Affine Transformation Adaptation (BAT).
|
||||
|
||||
|
||||
The abstract from the paper is:
|
||||
|
||||
Low-Rank Adaptation (LoRA) leverages the low intrinsic rank of weight updates in Large Language Models (LLMs), establishing a Parameter-Efficient Fine-Tuning (PEFT) paradigm. However, LoRA suffers from slow convergence. We introduce Dimension-Sharding Adaptation (DiSHA), which expands the PEFT design space to unlock lower intrinsic ranks and faster convergence by default. Within DiSHA's design space, we propose Block Affine Adaptation (Bone), a computationally efficient structure that delivers both high performance and efficiency. While certain DiSHA configurations may result in colinear updates to weight shards, we address this with Block Affine Transformation Adaptation (BAT), a nonlinear variant of DiSHA. BAT introduces nonlinearity by combining trainable matrices with original weight shards in a nonlinear manner, inducing nonlinearity in matrix updates without introducing additional parameters. Empirical results show that Bone, under the DiSHA framework, consistently outperforms LoRA variants in both NLG and NLU tasks, with significantly improved computational efficiency. Further analysis demonstrates that BAT enhances model capabilities by leveraging its nonlinear design.
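A minimal usage sketch, assuming a causal LM; the model id, `r` (the Bone block size), and target modules are illustrative choices.

```python
from transformers import AutoModelForCausalLM
from peft import BoneConfig, get_peft_model

base_model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m")  # placeholder model id
config = BoneConfig(r=64, target_modules=["q_proj", "v_proj"])
model = get_peft_model(base_model, config)
model.print_trainable_parameters()
```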
|
||||
|
||||
|
||||
## BoneConfig
|
||||
|
||||
[[autodoc]] tuners.bone.config.BoneConfig
|
||||
|
||||
## BoneModel
|
||||
|
||||
[[autodoc]] tuners.bone.model.BoneModel
|
docs/source/package_reference/c3a.md (new file, 43 lines)
@ -0,0 +1,43 @@
|
||||
<!--Copyright 2025 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
|
||||
⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
|
||||
rendered properly in your Markdown viewer.
|
||||
|
||||
-->
|
||||
|
||||
# C3A: Parameter-Efficient Fine-Tuning via Circular Convolution
|
||||
|
||||
[C3A](https://huggingface.co/papers/2407.19342) is a parameter-efficient fine-tuning technique that leverages Circular Convolution to achieve high rank adaptation within reasonable resource limits.
|
||||
|
||||
Note that you should use a much larger learning rate (LR) for C3A than for other methods. For example, an LR of 1e-1 for C3A is a good starting point. In addition, a much smaller weight decay should be used. You can refer to the `method_comparison` folder for more details.
|
||||
|
||||
The `block_size` affects the number of tunable parameters and the performance. To start with, you can choose a common divisor of $d_1$ and $d_2$ near $\frac{\sqrt{d_1\times d_2}}{r}$, where $r$ is the rank of the LoRA you would use for this task.
|
||||
|
||||
C3A currently has the following constraints:
|
||||
|
||||
- Only `nn.Linear` layers are supported.
|
||||
- Quantized layers are not supported.
|
||||
- The block size should be a common divisor of both the input and output sizes of target layers.
|
||||
|
||||
If these constraints don't work for your use case, consider other methods instead.
|
||||
|
||||
The abstract from the paper is:
|
||||
|
||||
> Low-Rank Adaptation (LoRA) has gained popularity for fine-tuning large foundation models, leveraging low-rank matrices $\mathbf{A}$ and $\mathbf{B}$ to represent weight changes (i.e., $\Delta \mathbf{W} = \mathbf{B} \mathbf{A}$). This method reduces trainable parameters and mitigates heavy memory consumption associated with full delta matrices by sequentially multiplying $\mathbf{A}$ and $\mathbf{B}$ with the activation. Despite its success, the intrinsic low-rank characteristic may limit its performance. Although several variants have been proposed to address this issue, they often overlook the crucial computational and memory efficiency brought by LoRA. In this paper, we propose Circular Convolution Adaptation (C3A), which not only achieves high-rank adaptation with enhanced performance but also excels in both computational power and memory utilization. Extensive experiments demonstrate that C3A consistently outperforms LoRA and its variants across various fine-tuning tasks.
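A minimal usage sketch; the model id, `block_size`, and target modules are illustrative (remember to pick a block size that divides both the input and output dimensions of the target layers, and to use a larger LR such as 1e-1).

```python
from transformers import AutoModelForCausalLM
from peft import C3AConfig, get_peft_model

base_model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m")  # placeholder model id
config = C3AConfig(block_size=64, target_modules=["q_proj", "v_proj"])
model = get_peft_model(base_model, config)
model.print_trainable_parameters()
```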
|
||||
|
||||
## C3AConfig
|
||||
|
||||
[[autodoc]] tuners.c3a.config.C3AConfig
|
||||
|
||||
## C3AModel
|
||||
|
||||
[[autodoc]] tuners.c3a.model.C3AModel
|
docs/source/package_reference/cpt.md (new file, 34 lines)
@ -0,0 +1,34 @@
|
||||
<!-- Copyright 2024 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
|
||||
⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
|
||||
rendered properly in your Markdown viewer.
|
||||
-->
|
||||
|
||||
# Context-aware Prompt Tuning: Advancing In-Context Learning with Adversarial Methods
|
||||
|
||||
[CPT](https://huggingface.co/papers/2410.17222) combines In-Context Learning (ICL), Prompt Tuning (PT), and adversarial optimization to improve few-shot learning by refining context embeddings. CPT updates the context tokens by optimizing both the context and the training examples, encapsulating them into a novel loss design that minimizes overfitting, enables more effective optimization, and drives significant improvements in classification tasks.
|
||||
|
||||
[//]: # ([CPT](https://huggingface.co/papers/2410.17222) for the paper)
|
||||
|
||||
The abstract from the paper is:
|
||||
|
||||
> Large Language Models (LLMs) can perform few-shot learning using either optimization-based approaches or In-Context Learning (ICL). Optimization-based methods often suffer from overfitting, as they require updating a large number of parameters with limited data. In contrast, ICL avoids overfitting but typically underperforms compared to optimization-based methods and is highly sensitive to the selection, order, and format of demonstration examples. To overcome these challenges, we introduce Context-aware Prompt Tuning (CPT), a method inspired by ICL, Prompt Tuning (PT), and adversarial attacks. CPT builds on the ICL strategy of concatenating examples before the input, extending it by incorporating PT-like learning to refine the context embedding through iterative optimization, extracting deeper insights from the training examples. Our approach carefully modifies specific context tokens, considering the unique structure of the examples within the context. In addition to updating the context with PT-like optimization, CPT draws inspiration from adversarial attacks, adjusting the input based on the labels present in the context while preserving the inherent value of the user-provided data. To ensure robustness and stability during optimization, we employ a projected gradient descent algorithm, constraining token embeddings to remain close to their original values and safeguarding the quality of the context. Our method has demonstrated superior accuracy across multiple classification tasks using various LLM models, outperforming existing baselines and effectively addressing the overfitting challenge in few-shot learning.
|
||||
|
||||
|
||||
Take a look at [Example](https://github.com/huggingface/peft/blob/main/examples/cpt_finetuning/README.md) for a step-by-step guide on how to train a model with CPT.
|
||||
|
||||
|
||||
## CPTConfig
|
||||
|
||||
[[autodoc]] tuners.cpt.config.CPTConfig
|
||||
|
||||
## CPTEmbedding
|
||||
|
||||
[[autodoc]] tuners.cpt.model.CPTEmbedding
|
||||
|
docs/source/package_reference/fourierft.md (new file, 38 lines)
@ -0,0 +1,38 @@
|
||||
<!--Copyright 2024 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
|
||||
⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
|
||||
rendered properly in your Markdown viewer.
|
||||
|
||||
-->
|
||||
|
||||
# FourierFT: Discrete Fourier Transformation Fine-Tuning
|
||||
|
||||
[FourierFT](https://huggingface.co/papers/2405.03003) is a parameter-efficient fine-tuning technique that leverages the Discrete Fourier Transform to compress the model's tunable weights. This method outperforms LoRA on the GLUE benchmark and common ViT classification tasks while using far fewer parameters.
|
||||
|
||||
FourierFT currently has the following constraints:
|
||||
|
||||
- Only `nn.Linear` layers are supported.
|
||||
- Quantized layers are not supported.
|
||||
|
||||
If these constraints don't work for your use case, consider other methods instead.
|
||||
|
||||
The abstract from the paper is:
|
||||
|
||||
> Low-rank adaptation (LoRA) has recently gained much interest in fine-tuning foundation models. It effectively reduces the number of trainable parameters by incorporating low-rank matrices A and B to represent the weight change, i.e., Delta W=BA. Despite LoRA's progress, it faces storage challenges when handling extensive customization adaptations or larger base models. In this work, we aim to further compress trainable parameters by enjoying the powerful expressiveness of the Fourier transform. Specifically, we introduce FourierFT, which treats Delta W as a matrix in the spatial domain and learns only a small fraction of its spectral coefficients. With the trained spectral coefficients, we implement the inverse discrete Fourier transform to recover Delta W. Empirically, our FourierFT method shows comparable or better performance with fewer parameters than LoRA on various tasks, including natural language understanding, natural language generation, instruction tuning, and image classification. For example, when performing instruction tuning on the LLaMA2-7B model, FourierFT surpasses LoRA with only 0.064M trainable parameters, compared to LoRA's 33.5M.
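A minimal usage sketch; the model id, `n_frequency` (the number of trained spectral coefficients per layer), and target modules are illustrative values.

```python
from transformers import AutoModelForCausalLM
from peft import FourierFTConfig, get_peft_model

base_model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m")  # placeholder model id
config = FourierFTConfig(n_frequency=1000, target_modules=["q_proj", "v_proj"])
model = get_peft_model(base_model, config)
model.print_trainable_parameters()
```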
|
||||
|
||||
## FourierFTConfig
|
||||
|
||||
[[autodoc]] tuners.fourierft.config.FourierFTConfig
|
||||
|
||||
## FourierFTModel
|
||||
|
||||
[[autodoc]] tuners.fourierft.model.FourierFTModel
|
@ -2,7 +2,7 @@
|
||||
rendered properly in your Markdown viewer.
|
||||
-->
|
||||
|
||||
# Helper methods
|
||||
|
||||
A collection of helper functions for PEFT.
|
||||
|
||||
@ -10,3 +10,13 @@ A collection of helper functions for PEFT.
|
||||
|
||||
[[autodoc]] helpers.check_if_peft_model
|
||||
- all
|
||||
|
||||
## Temporarily Rescaling Adapter Scale in LoraLayer Modules
|
||||
|
||||
[[autodoc]] helpers.rescale_adapter_scale
|
||||
- all
|
||||
|
||||
## Context manager to disable input dtype casting in the `forward` method of LoRA layers
|
||||
|
||||
[[autodoc]] helpers.disable_input_dtype_casting
|
||||
- all
|
||||
|
docs/source/package_reference/hotswap.md (new file, 76 lines)
@ -0,0 +1,76 @@
|
||||
<!--⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
|
||||
rendered properly in your Markdown viewer.
|
||||
-->
|
||||
|
||||
# Hotswapping adapters
|
||||
|
||||
The idea of hotswapping an adapter is the following: we can already load multiple adapters, e.g. two LoRAs, at the same time. But sometimes, we want to load one LoRA and then replace its weights in-place with the LoRA weights of another adapter. This is now possible with the `hotswap_adapter` function.
|
||||
|
||||
In general, this should be faster than deleting one adapter and loading a new adapter in its place, which would be how to achieve the same final outcome without hotswapping. Another advantage of hotswapping is that it prevents re-compilation in case the PEFT model is already compiled using `torch.compile`. This can save quite a lot of time.
|
||||
|
||||
## Example without `torch.compile`
|
||||
|
||||
```python
|
||||
import torch
|
||||
from transformers import AutoModelForCausalLM
|
||||
from peft import PeftModel
|
||||
from peft.utils.hotswap import hotswap_adapter
|
||||
|
||||
model_id = ...
|
||||
inputs = ...
|
||||
device = ...
|
||||
model = AutoModelForCausalLM.from_pretrained(model_id).to(device)
|
||||
|
||||
# load lora 0
|
||||
model = PeftModel.from_pretrained(model, <path-adapter-0>)
|
||||
with torch.inference_mode():
|
||||
output_adapter_0 = model(inputs)
|
||||
|
||||
# replace the "default" lora adapter with the new one
|
||||
hotswap_adapter(model, <path-adapter-1>, adapter_name="default", torch_device=device)
|
||||
with torch.inference_mode():
|
||||
output_adapter_1 = model(inputs).logits
|
||||
```
|
||||
|
||||
## Example with `torch.compile`
|
||||
|
||||
```python
|
||||
import torch
|
||||
from transformers import AutoModelForCausalLM
|
||||
from peft import PeftModel
|
||||
from peft.utils.hotswap import hotswap_adapter, prepare_model_for_compiled_hotswap
|
||||
|
||||
model_id = ...
|
||||
inputs = ...
|
||||
device = ...
|
||||
max_rank = ... # maximum rank among all LoRA adapters that will be used
|
||||
model = AutoModelForCausalLM.from_pretrained(model_id).to(device)
|
||||
|
||||
# load lora 0
|
||||
model = PeftModel.from_pretrained(model, <path-adapter-0>)
|
||||
# Prepare the model to allow hotswapping even if ranks/scalings of 2nd adapter differ.
|
||||
# You can skip this step if all ranks and scalings are identical.
|
||||
prepare_model_for_compiled_hotswap(model, target_rank=max_rank)
|
||||
model = torch.compile(model)
|
||||
with torch.inference_mode():
|
||||
output_adapter_0 = model(inputs)
|
||||
|
||||
# replace the "default" lora adapter with the new one
|
||||
hotswap_adapter(model, <path-adapter-1>, adapter_name="default", torch_device=device)
|
||||
with torch.inference_mode():
|
||||
output_adapter_1 = model(inputs).logits
|
||||
```
|
||||
|
||||
## Caveats
|
||||
|
||||
Hotswapping works with transformers models and diffusers models. However, there are some caveats:
|
||||
|
||||
- Right now, only LoRA is properly supported.
|
||||
- It only works for the same PEFT method, so no swapping LoRA and LoHa, for example.
|
||||
- The adapter that is being swapped in must target the same layers as the previous adapter or a subset of those layers. It cannot target new layers. Therefore, if possible, start with the adapter that targets most layers.
|
||||
|
||||
[[autodoc]] utils.hotswap.hotswap_adapter
|
||||
- all
|
||||
|
||||
[[autodoc]] utils.hotswap.hotswap_adapter_from_state_dict
|
||||
- all
|
docs/source/package_reference/hra.md (new file, 32 lines)
@ -0,0 +1,32 @@
|
||||
<!--Copyright 2024 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
|
||||
⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
|
||||
rendered properly in your Markdown viewer.
|
||||
|
||||
-->
|
||||
|
||||
# Bridging The Gap between Low-rank and Orthogonal Adaptation via Householder Reflection Adaptation (HRA)
|
||||
|
||||
[HRA](https://huggingface.co/papers/2405.17484) is a simple but effective adapter-based fine-tuning method that leverages Householder reflections. This method harnesses the advantages of both strategies, reducing parameters and computation costs while penalizing the loss of pre-training knowledge. It consistently achieves better performance with fewer trainable parameters and outperforms state-of-the-art adapters across different models, including large language models (LLMs) and conditional image generators.
|
||||
|
||||
|
||||
The abstract from the paper is:
|
||||
|
||||
> While following different technical routes, both low-rank and orthogonal adaptation techniques can efficiently adapt large-scale pre-training models in specific tasks or domains based on a small piece of trainable parameters. In this study, we bridge the gap between these two techniques, proposing a simple but effective adaptation method based on Householder reflections. Given a pre-trained model, our method fine-tunes its layers by multiplying each frozen weight matrix with an orthogonal matrix constructed by a chain of learnable Householder reflections (HRs). This HR-based orthogonal fine-tuning is equivalent to an adaptive low-rank adaptation. Moreover, we show that the orthogonality of the reflection planes corresponding to the HRs impacts the model capacity and regularity. The analysis motivates us to regularize the orthogonality of the HRs, leading to different implementations of the proposed Householder reflection adaptation (HRA) method. Compared with state-of-the-art methods, HRA achieves superior performance with fewer learnable parameters when adapting large language models and conditional image generators. The code is available at [peft](https://github.com/huggingface/peft/tree/main/src/peft/tuners/hra) and [HRA](https://github.com/DaShenZi721/HRA).
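
The snippet below is a minimal usage sketch (not taken from the paper or the official examples): it wraps a small causal LM with `HRAConfig`, where `r` controls the number of Householder reflections per adapted weight matrix; the model name and target modules are placeholder choices.

```python
from transformers import AutoModelForCausalLM
from peft import HRAConfig, get_peft_model

model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m")  # placeholder base model
config = HRAConfig(
    r=8,                                  # number of Householder reflections per adapted matrix
    target_modules=["q_proj", "v_proj"],  # placeholder target modules
)
peft_model = get_peft_model(model, config)
peft_model.print_trainable_parameters()
```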
|
||||
|
||||
## HRAConfig
|
||||
|
||||
[[autodoc]] tuners.hra.config.HRAConfig
|
||||
|
||||
## HRAModel
|
||||
|
||||
[[autodoc]] tuners.hra.model.HRAModel
|
@ -32,4 +32,20 @@ The abstract from the paper is:
|
||||
|
||||
## Utility
|
||||
|
||||
### LoftQ
|
||||
|
||||
[[autodoc]] utils.loftq_utils.replace_lora_weights_loftq
|
||||
|
||||
### Eva
|
||||
|
||||
#### EvaConfig
|
||||
|
||||
[[autodoc]] tuners.lora.config.EvaConfig
|
||||
|
||||
#### initialize_lora_eva_weights
|
||||
|
||||
[[autodoc]] tuners.lora.eva.initialize_lora_eva_weights
|
||||
|
||||
#### get_eva_state_dict
|
||||
|
||||
[[autodoc]] tuners.lora.eva.get_eva_state_dict
|
||||
|
32
docs/source/package_reference/miss.md
Normal file
@ -0,0 +1,32 @@
|
||||
<!--Copyright 2025 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
|
||||
⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
|
||||
rendered properly in your Markdown viewer.
|
||||
|
||||
-->
|
||||
|
||||
# MiSS
|
||||
|
||||
MiSS: Balancing LoRA Performance and Efficiency with Simple Shard Sharing ([MiSS](https://huggingface.co/papers/2409.15371)) is a novel PEFT method that adopts a low-rank structure, requires only a single trainable matrix, and introduces a new update mechanism distinct from LoRA, achieving an excellent balance between performance and efficiency.
|
||||
|
||||
The abstract from the paper is:
|
||||
|
||||
*Parameter-Efficient Fine-Tuning (PEFT) methods, particularly Low-Rank Adaptation (LoRA), effectively reduce the number of trainable parameters in Large Language Models (LLMs). However, as model scales continue to grow, the demand for computational resources remains a significant challenge. Existing LoRA variants often struggle to strike an optimal balance between adaptability (model performance and convergence speed) and efficiency (computational overhead, memory usage, and initialization time). This paper introduces MiSS(Matrix Shard Sharing ), a novel PEFT approach that addresses this trade-off through a simple shard-sharing mechanism. MiSS leverages the insight that a low-rank adaptation can be achieved by decomposing the weight matrix into multiple fragment matrices and utilizing a shared, trainable common fragment. This method constructs the low-rank update matrix through the replication of these shared, partitioned shards. We also propose a hardware-efficient and broadly applicable implementation for MiSS. Extensive experiments conducted on a range of tasks, alongside a systematic analysis of computational performance, demonstrate MiSS's superiority. The results show that MiSS significantly outperforms standard LoRA and its prominent variants in both model performance metrics and computational efficiency, including initialization speed and training throughput. By effectively balancing expressive power and resource utilization, MiSS offers a compelling solution for efficiently adapting large-scale models*.
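
As a rough usage sketch (assumptions: `MissConfig` accepts an `r`-like shard-size argument and the usual `target_modules`; check the config reference below for the exact fields), MiSS is applied like any other PEFT tuner:

```python
from transformers import AutoModelForCausalLM
from peft import MissConfig, get_peft_model

model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m")  # placeholder base model
# `r` and the target modules below are illustrative assumptions, not recommended values
config = MissConfig(r=64, target_modules=["q_proj", "v_proj"])
peft_model = get_peft_model(model, config)
peft_model.print_trainable_parameters()
```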
|
||||
|
||||
|
||||
## MissConfig
|
||||
|
||||
[[autodoc]] tuners.miss.config.MissConfig
|
||||
|
||||
## MissModel
|
||||
|
||||
[[autodoc]] tuners.miss.model.MissModel
|
@ -16,7 +16,7 @@ rendered properly in your Markdown viewer.
|
||||
|
||||
# OFT
|
||||
|
||||
[Orthogonal Finetuning (OFT)](https://hf.co/papers/2306.07280) is a method developed for adapting text-to-image diffusion models. It works by reparameterizing the pretrained weight matrices with it's orthogonal matrix to preserve information in the pretrained model. To reduce the number of parameters, OFT introduces a block-diagonal structure in the orthogonal matrix.
|
||||
[Orthogonal Finetuning (OFT)](https://hf.co/papers/2306.07280) is a method developed for adapting text-to-image diffusion models. It works by reparameterizing the pretrained weight matrices with its orthogonal matrix to preserve information in the pretrained model. To reduce the number of parameters, OFT introduces a block-diagonal structure in the orthogonal matrix.
|
||||
|
||||
The abstract from the paper is:
|
||||
|
||||
|
@ -16,7 +16,7 @@ rendered properly in your Markdown viewer.
|
||||
|
||||
# Polytropon
|
||||
|
||||
[Polytropon](https://hf.co/papers/2202.13914) is a multitask model with a number of different LoRA adapters in it's "inventory". The model learns the correct combination of adapters from the inventory with a routing function to choose the best subset of modules for a specific task. PEFT also supports [Multi-Head Adapter Routing (MHR)](https://hf.co/papers/2211.03831) for Polytropon which builds on and improves the routing function by combining the adapter heads more granularly. The adapter heads are separated into disjoint blocks and a different routing function is learned for each one, allowing for more expressivity.
|
||||
[Polytropon](https://hf.co/papers/2202.13914) is a multitask model with a number of different LoRA adapters in its "inventory". The model learns the correct combination of adapters from the inventory with a routing function to choose the best subset of modules for a specific task. PEFT also supports [Multi-Head Adapter Routing (MHR)](https://hf.co/papers/2211.03831) for Polytropon which builds on and improves the routing function by combining the adapter heads more granularly. The adapter heads are separated into disjoint blocks and a different routing function is learned for each one, allowing for more expressivity.
|
||||
|
||||
<hfoptions id="paper">
|
||||
<hfoption id="Combining Modular Skills in Multitask Learning">
|
||||
|
45
docs/source/package_reference/randlora.md
Normal file
@ -0,0 +1,45 @@
|
||||
<!--Copyright 2025 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
|
||||
⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
|
||||
rendered properly in your Markdown viewer.
|
||||
|
||||
-->
|
||||
|
||||
# RandLora: Full-rank parameter-efficient fine-tuning of large models
|
||||
[RandLora](https://huggingface.co/papers/2502.00987) is a parameter-efficient fine-tuning technique that is similar to [LoRA](https://huggingface.co/papers/2106.09685) and [VeRA](https://huggingface.co/papers/2310.11454) but performs full-rank updates to improve performance. RandLora can be particularly useful when adapting large models to hard tasks that require complex updates while preserving the parameter efficiency of LoRA. The full-rank update of RandLora is achieved by linearly scaling random bases. The random bases are a collection of multiple low-rank matrices such that the summation of their ranks is greater than or equal to the full rank of the parameter matrices. The trainable parameters of RandLora are two diagonal matrices (vectors) that get multiplied with the right-hand low-rank random bases, in a similar way to VeRA's update. To maintain low memory usage, RandLora uses a custom function that prevents storing unnecessary bases in memory for backpropagation.

A noteworthy difference is that, contrary to other LoRA-like PEFT algorithms, increasing the rank of RandLora's random bases increases the number of trainable parameters. Because the number of bases times the base rank is constant in RandLora, reducing the rank will increase the number of random bases, and hence the number of base-specific trainable diagonal matrices.

Because reducing the rank of RandLora's random bases will increase their number, RandLora can become slower to train than LoRA for very small ranks; typically, ranks below 4 will result in a large increase in training time. This does not affect inference, though, as the RandLora adapters can be merged into the pretrained weight matrices.

RandLora additionally supports training with sparse, ternary random bases (only containing -1, 0 and 1). These bases are as described in [Bingham et al.](https://cs-people.bu.edu/evimaria/cs565/kdd-rp.pdf) and [Ping et al.](https://hastie.su.domains/Papers/Ping/KDD06_rp.pdf) and could theoretically be used to reduce compute needs by performing aggregations instead of matrix multiplications to create the weight update. This is not currently supported. Although it does not currently reduce compute, using sparse random bases in RandLora can reduce overfitting in some cases. For users interested in using sparse ternary bases, the `sparse` option is recommended over the `very_sparse` one, which can reduce performance.

Similarly to VeRA, when saving RandLora's parameters, it's possible to eschew storing the low-rank matrices by setting `save_projection=False` on the `RandLoraConfig`. In that case, these matrices will be restored based on the fixed random seed from the `projection_prng_key` argument. This cuts down on the size of the checkpoint, but we cannot guarantee reproducibility on all devices and for all future versions of PyTorch. If you want to ensure reproducibility, set `save_projection=True` (which is the default).

As in VeRA, and to handle different shapes of adapted layers, RandLora initializes shared A and B matrices with the largest required size for each dimension. During the forward pass, submatrices A and B for a given layer are sliced out from these shared matrices and used as described in the paper. For example, adapting two linear layers of shapes (100, 20) and (80, 50) will create A and B matrices of shapes (rank, 50) and (100, rank) respectively. Then, to adapt a layer of shape (100, 20), submatrices A and B of shapes (rank, 20) and (100, rank) will be extracted.

RandLora currently has the following constraint:

- Only `nn.Linear` layers are supported.
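
A minimal configuration sketch follows; the option names mirror the ones discussed above (`sparse`, `save_projection`), while the model name, rank, and target modules are placeholder assumptions:

```python
from transformers import AutoModelForCausalLM
from peft import RandLoraConfig, get_peft_model

model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m")  # placeholder base model
config = RandLoraConfig(
    r=32,                                 # rank of each random base; see the trade-off discussed above
    target_modules=["q_proj", "v_proj"],  # only nn.Linear layers are supported
    sparse=True,                          # optional: ternary sparse random bases
    save_projection=True,                 # store the random bases in the checkpoint (default)
)
peft_model = get_peft_model(model, config)
peft_model.print_trainable_parameters()
```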
|
||||
|
||||
The abstract from the paper is:
|
||||
|
||||
> Low-Rank Adaptation (LoRA) and its variants have shown impressive results in reducing the number of trainable parameters and memory requirements of large transformer networks while maintaining fine-tuning performance. The low-rank nature of the weight update inherently limits the representation power of fine-tuned models, however, thus potentially compromising performance on complex tasks. This raises a critical question: when a performance gap between LoRA and standard fine-tuning is observed, is it due to the reduced number of trainable parameters or the rank deficiency?
|
||||
This paper aims to answer this question by introducing RandLora, a parameter-efficient method that performs full-rank updates using a learned linear combinations of low-rank, non-trainable random matrices. Our method limits the number of trainable parameters by restricting optimization to diagonal scaling matrices applied to the fixed random matrices. This allows us to effectively overcome the low-rank limitations while maintaining parameter and memory efficiency during training. Through extensive experimentation across vision, language, and vision-language benchmarks, we systematically evaluate the limitations of LoRA and existing random basis methods. Our findings reveal that full-rank updates are beneficial across vision and language tasks individually, and even more so for vision-language tasks, where RandLora significantly reduces---and sometimes eliminates---the performance gap between standard fine-tuning and LoRA, demonstrating its efficacy.
|
||||
|
||||
## RandLoraConfig
|
||||
|
||||
[[autodoc]] tuners.randlora.config.RandLoraConfig
|
||||
|
||||
## RandLoraModel
|
||||
|
||||
[[autodoc]] tuners.randlora.model.RandLoraModel
|
31
docs/source/package_reference/road.md
Normal file
@ -0,0 +1,31 @@
|
||||
<!--Copyright 2025 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
|
||||
⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
|
||||
rendered properly in your Markdown viewer.
|
||||
|
||||
-->
|
||||
|
||||
# RoAd
|
||||
|
||||
[RoAd](https://arxiv.org/pdf/2409.00119) is a parameter‑efficient fine‑tuning technique that adapts large language models by learning a small set of 2×2 rotation matrices (and optional scaling factors) applied to pairs of hidden dimensions. RoAd achieves competitive or superior performance compared to other PEFT methods with under 0.1% trainable parameters. Unlike LoRA’s batched low‑rank updates, RoAd’s sparse rotations reformulate to simple element‑wise operations, yielding significantly higher serving throughput when handling heterogeneous requests in the same batch, i.e. serving multiple adapters simultaneously. Moreover, RoAd integrates seamlessly into a distributed interchange intervention framework, interpreting its sparse 2D rotations as task-specific interventions within learned subspaces of hidden representations. These orthogonal subspaces can be composed to merge multiple task-specific behaviors—like multilingual capabilities or instruction following—without additional fine-tuning, enabling modular, interpretable adaptations in LLMs.

Finetuning with RoAd typically requires a higher learning rate than LoRA or similar methods, around 1e-3. Currently, RoAd only supports linear layers, and it can be used on models quantized with bitsandbytes (4-bit or 8-bit).
|
||||
|
||||
For running inference with different RoAd adapters in the same batch see [Inference with different LoRA adapters in the same batch](../developer_guides/lora#inference-with-different-lora-adapters-in-the-same-batch).
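
As a minimal sketch (the base model, target modules, and default `RoadConfig` fields are assumptions; the 1e-3 learning rate follows the recommendation above):

```python
import torch
from transformers import AutoModelForCausalLM
from peft import RoadConfig, get_peft_model

model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m")  # placeholder base model
config = RoadConfig(target_modules=["q_proj", "v_proj"])  # only linear layers are supported
peft_model = get_peft_model(model, config)

# RoAd typically benefits from a higher learning rate than LoRA, around 1e-3
optimizer = torch.optim.AdamW(peft_model.parameters(), lr=1e-3)
```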
|
||||
|
||||
## RoadConfig
|
||||
|
||||
[[autodoc]] tuners.road.config.RoadConfig
|
||||
|
||||
## RoadModel
|
||||
|
||||
[[autodoc]] tuners.road.model.RoadModel
|
35
docs/source/package_reference/shira.md
Normal file
@ -0,0 +1,35 @@
|
||||
<!--Copyright 2025 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
|
||||
⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
|
||||
rendered properly in your Markdown viewer.
|
||||
|
||||
-->
|
||||
|
||||
# Sparse High Rank Adapters
|
||||
|
||||
Sparse High Rank Adapters, or [SHiRA](https://arxiv.org/abs/2406.13175), is an alternate type of adapter that has been found to have significant advantages over low-rank adapters. Specifically, SHiRA achieves better accuracy than LoRA on a variety of vision and language tasks. It also offers simpler and higher-quality multi-adapter fusion by significantly reducing concept loss, a common problem faced by low-rank adapters. SHiRA directly fine-tunes a small number of the base model's parameters for the adaptation task at hand.
|
||||
|
||||
SHiRA currently has the following constraint:
|
||||
|
||||
- Only `nn.Linear` layers are supported.
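
A minimal usage sketch, assuming `ShiraConfig` exposes an `r` parameter that controls the sparsity budget (see the config reference below for the exact semantics); the base model and target modules are placeholders:

```python
from transformers import AutoModelForCausalLM
from peft import ShiraConfig, get_peft_model

model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m")  # placeholder base model
config = ShiraConfig(r=32, target_modules=["q_proj", "v_proj"])  # only nn.Linear layers are supported
peft_model = get_peft_model(model, config)
peft_model.print_trainable_parameters()
```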
|
||||
|
||||
The abstract from the paper is:
|
||||
|
||||
> Low Rank Adaptation (LoRA) has gained massive attention in the recent generative AI research. One of the main advantages of LoRA is its ability to be fused with pretrained models, adding no overhead during inference. However, from a mobile deployment standpoint, we can either avoid inference overhead in the fused mode but lose the ability to switch adapters rapidly, or suffer significant (up to 30% higher) inference latency while enabling rapid switching in the unfused mode. LoRA also exhibits concept-loss when multiple adapters are used concurrently. In this paper, we propose Sparse High Rank Adapters (SHiRA), a new paradigm which incurs no inference overhead, enables rapid switching, and significantly reduces concept-loss. Specifically, SHiRA can be trained by directly tuning only 1-2% of the base model weights while leaving others unchanged. This results in a highly sparse adapter which can be switched directly in the fused mode. We further provide theoretical and empirical insights on how high sparsity in SHiRA can aid multi-adapter fusion by reducing concept loss. Our extensive experiments on LVMs and LLMs demonstrate that finetuning only a small fraction of the parameters in the base model significantly outperforms LoRA while enabling both rapid switching and multi-adapter fusion. Finally, we provide a latency- and memory-efficient SHiRA implementation based on Parameter-Efficient Finetuning (PEFT) Library which trains at nearly the same speed as LoRA while consuming up to 16% lower peak GPU memory, thus making SHiRA easy to adopt for practical use cases. To demonstrate rapid switching benefits during inference, we show that loading SHiRA on a base model can be 5x-16x faster than LoRA fusion on a CPU.
|
||||
|
||||
## ShiraConfig
|
||||
|
||||
[[autodoc]] tuners.shira.config.ShiraConfig
|
||||
|
||||
## ShiraModel
|
||||
|
||||
[[autodoc]] tuners.shira.model.ShiraModel
|
50
docs/source/package_reference/trainable_tokens.md
Normal file
@ -0,0 +1,50 @@
|
||||
<!--Copyright 2025 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
|
||||
⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
|
||||
rendered properly in your Markdown viewer.
|
||||
|
||||
-->
|
||||
|
||||
# Trainable Tokens
|
||||
|
||||
The Trainable Tokens method provides a way to target specific token embeddings for fine-tuning without resorting to
|
||||
training the full embedding matrix or using an adapter on the embedding matrix. It is based on the initial implementation from
|
||||
[here](https://github.com/huggingface/peft/pull/1541).
|
||||
|
||||
The method only targets specific tokens and selectively trains the token indices you specify. Consequently, the required RAM will be lower, and the on-disk storage is also significantly smaller than storing the full fine-tuned embedding matrix.
|
||||
|
||||
Some preliminary benchmarks acquired with [this script](https://github.com/huggingface/peft/blob/main/scripts/train_memory.py)
|
||||
suggest that for `gemma-2-2b` (which has a rather large embedding matrix) you can save ~4 GiB VRAM with Trainable Tokens
|
||||
over fully fine-tuning the embedding matrix. While LoRA will use comparable amounts of VRAM it might also target
|
||||
tokens you don't want to be changed. Note that these are just indications and varying embedding matrix sizes might skew
|
||||
these numbers a bit.
|
||||
|
||||
Note that this method does not add tokens for you; you have to add tokens to the tokenizer yourself and resize the embedding matrix of the model accordingly. This method will only re-train the embeddings for the tokens you specify. It can also be used in conjunction with LoRA layers! See [the LoRA developer guide](../developer_guides/lora#efficiently-train-tokens-alongside-lora).
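
A minimal sketch of the workflow described above, assuming `token_indices` is the `TrainableTokensConfig` field used to select which embeddings to train (the added tokens are placeholders):

```python
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import TrainableTokensConfig, get_peft_model

model_id = "facebook/opt-125m"  # placeholder base model
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id)

# add the new tokens yourself and resize the embedding matrix accordingly
new_tokens = ["<think>", "</think>"]
tokenizer.add_tokens(new_tokens)
model.resize_token_embeddings(len(tokenizer))

config = TrainableTokensConfig(token_indices=tokenizer.convert_tokens_to_ids(new_tokens))
peft_model = get_peft_model(model, config)
peft_model.print_trainable_parameters()  # only the selected token embeddings are trainable
```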
|
||||
|
||||
> [!TIP]
|
||||
> Saving the model with [`~PeftModel.save_pretrained`] or retrieving the state dict using
|
||||
> [`get_peft_model_state_dict`] when adding new tokens may save the full embedding matrix instead of only the difference
|
||||
> as a precaution because the embedding matrix was resized. To save space you can disable this behavior by setting
|
||||
> `save_embedding_layers=False` when calling `save_pretrained`. This is safe to do as long as you don't modify the
|
||||
> embedding matrix through other means as well, as such changes will be not tracked by trainable tokens.
|
||||
|
||||
## TrainableTokensConfig
|
||||
|
||||
[[autodoc]] tuners.trainable_tokens.config.TrainableTokensConfig
|
||||
|
||||
## TrainableTokensModel
|
||||
|
||||
[[autodoc]] tuners.trainable_tokens.model.TrainableTokensModel
|
||||
|
40
docs/source/package_reference/vblora.md
Normal file
@ -0,0 +1,40 @@
|
||||
<!--Copyright 2024 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
|
||||
⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
|
||||
rendered properly in your Markdown viewer.
|
||||
|
||||
-->
|
||||
|
||||
# VB-LoRA: Extreme Parameter Efficient Fine-Tuning with Vector Banks
|
||||
|
||||
## Overview
|
||||
|
||||
[VB-LoRA](https://huggingface.co/papers/2405.15179) is a parameter-efficient fine-tuning technique that extends LoRA by learning a fine-grained parameter-sharing scheme at the sub-vector level, achieving significantly higher parameter efficiency. This makes VB-LoRA especially useful in scenarios where storage and transmission costs are critical. It works by decomposing low-rank matrices—from different layers and modules such as K, Q, V, and FFN—into sub-vectors, which are then globally shared through a vector bank.
|
||||
|
||||
The abstract from the paper is:
|
||||
|
||||
*As the adoption of large language models increases and the need for per-user or per-task model customization grows, the parameter-efficient fine-tuning (PEFT) methods, such as low-rank adaptation (LoRA) and its variants, incur substantial storage and transmission costs. To further reduce stored parameters, we introduce a "divide-and-share" paradigm that breaks the barriers of low-rank decomposition across matrix dimensions, modules and layers by sharing parameters globally via a vector bank. As an instantiation of the paradigm to LoRA, our proposed VB-LoRA composites all the low-rank matrices of LoRA from a shared vector bank with a differentiable top-k admixture module. VB-LoRA achieves extreme parameter efficiency while maintaining comparable or better performance compared to state-of-the-art PEFT methods. Extensive experiments demonstrate the effectiveness of VB-LoRA on natural language understanding, natural language generation, and instruction tuning tasks. When fine-tuning the Llama2-13B model, VB-LoRA only uses 0.4% of LoRA's stored parameters, yet achieves superior results.*
|
||||
|
||||
## Usage Tips
|
||||
|
||||
- VB-LoRA utilizes a sparse top-k module to learn the sharing mechanism. When saving adapter parameters, you can either save only the top-k weights and their indices by setting `save_only_topk_weights = True` in `VBLoRAConfig`, or save all the trainable logits by setting it to `False`. Enabling `save_only_topk_weights = True` significantly reduces storage space; for instance, in Llama2-7B, the storage file size decreases from 308MB to 2.5MB. Note that models saved with `save_only_topk_weights = True` are intended for merging or inference only and cannot be used to resume training.
|
||||
|
||||
- VB-LoRA has two sets of training parameters: vector bank parameters and logit parameters. In practice, we found that logit parameters require a higher learning rate, while vector bank parameters require a lower learning rate. When using the AdamW optimizer, typical learning rates are 0.01 for logits and 0.001 for vector bank parameters. A sketch of this two-group optimizer setup is shown below.
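
The following sketch illustrates the two-group setup; the parameter-name filters (`"logits"`, `"vector_bank"`) are assumptions about how VB-LoRA names its trainable parameters, so inspect `peft_model.named_parameters()` to confirm them for your model:

```python
import torch
from transformers import AutoModelForCausalLM
from peft import VBLoRAConfig, get_peft_model

model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m")  # placeholder base model
config = VBLoRAConfig(target_modules=["q_proj", "v_proj"], save_only_topk_weights=True)
peft_model = get_peft_model(model, config)

# separate the trainable parameters into logit and vector bank groups
logit_params = [p for n, p in peft_model.named_parameters() if p.requires_grad and "logits" in n]
bank_params = [p for n, p in peft_model.named_parameters() if p.requires_grad and "vector_bank" in n]
optimizer = torch.optim.AdamW([
    {"params": logit_params, "lr": 1e-2},  # higher learning rate for logits
    {"params": bank_params, "lr": 1e-3},   # lower learning rate for the vector bank
])
```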
|
||||
|
||||
## VBLoRAConfig
|
||||
|
||||
[[autodoc]] tuners.vblora.config.VBLoRAConfig
|
||||
|
||||
## VBLoRAModel
|
||||
|
||||
[[autodoc]] tuners.vblora.model.VBLoRAModel
|
||||
|
@ -20,13 +20,11 @@ rendered properly in your Markdown viewer.
|
||||
|
||||
When saving the adapter parameters, it's possible to eschew storing the low rank matrices by setting `save_projection=False` on the `VeraConfig`. In that case, these matrices will be restored based on the fixed random seed from the `projection_prng_key` argument. This cuts down on the size of the checkpoint, but we cannot guarantee reproducibility on all devices and for all future versions of PyTorch. If you want to ensure reproducibility, set `save_projection=True` (which is the default).
|
||||
|
||||
VeRA currently has the following constraints:
|
||||
To handle different shapes of adapted layers, VeRA initializes shared A and B matrices with the largest required size for each dimension. During the forward pass, submatrices A and B for a given layer are sliced out from these shared matrices and used as described in the paper. For example, adapting two linear layers of shapes (100, 20) and (80, 50) will create A and B matrices of shapes (rank, 50) and (100, rank) respectively. Then, to adapt a layer of shape (100, 20), submatrices A and B of shapes (rank, 20) and (100, rank) will be extracted.
|
||||
|
||||
VeRA currently has the following constraint:
|
||||
|
||||
- All targeted parameters must have the same shape.
|
||||
- Only `nn.Linear` layers are supported.
|
||||
- Quantized layers are not supported.
|
||||
|
||||
If these constraints don't work for your use case, use LoRA instead.
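
A minimal usage sketch with the options discussed above (`projection_prng_key`, `save_projection`); the base model, rank, and target modules are placeholder choices:

```python
from transformers import AutoModelForCausalLM
from peft import VeraConfig, get_peft_model

model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m")  # placeholder base model
config = VeraConfig(
    r=256,                                # rank of the shared, frozen A/B matrices
    target_modules=["q_proj", "v_proj"],  # only nn.Linear layers are supported
    projection_prng_key=0,                # seed used to regenerate A/B if they are not saved
    save_projection=True,                 # store A/B in the checkpoint for guaranteed reproducibility
)
peft_model = get_peft_model(model, config)
peft_model.print_trainable_parameters()
```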
|
||||
|
||||
The abstract from the paper is:
|
||||
|
||||
|
56
docs/source/package_reference/xlora.md
Normal file
@ -0,0 +1,56 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
|
||||
⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
|
||||
rendered properly in your Markdown viewer.
|
||||
|
||||
-->
|
||||
|
||||
# X-LoRA
|
||||
|
||||
Mixture of LoRA Experts ([X-LoRA](https://huggingface.co/papers/2402.07148)) is a PEFT method enabling sparse or dense mixture of LoRA experts based on a high granularity (token, layer, sequence) scalings matrix. This leverages frozen LoRA adapters and a frozen base model to drastically reduce the number of parameters that need to be fine-tuned.
|
||||
|
||||
A unique aspect of X-LoRA is its versatility: it can be applied to any `transformers` base model with LoRA adapters. This means that, despite the mixture of experts strategy, no changes to the model code must be made.
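
As a rough sketch of this workflow (the adapter paths are placeholders, and `hidden_size`/`adapters` are assumed to be the relevant `XLoraConfig` fields; see the config reference below):

```python
from transformers import AutoModelForCausalLM
from peft import XLoraConfig, get_peft_model

model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-Instruct-v0.3")
config = XLoraConfig(
    task_type="CAUSAL_LM",
    hidden_size=model.config.hidden_size,
    adapters={  # names and paths of already trained LoRA adapters (placeholders)
        "adapter_math": "path/to/math-lora",
        "adapter_code": "path/to/code-lora",
    },
)
xlora_model = get_peft_model(model, config)
```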
|
||||
|
||||
The below graphic demonstrates how the scalings change for different prompts for each token. This highlights the activation of different adapters as the generation progresses and the sequence creates new context.
|
||||
|
||||

|
||||
|
||||
The abstract from the paper is:
|
||||
|
||||
*We report a mixture of expert strategy to create fine-tuned large language models using a deep layer-wise token-level approach based on low-rank adaptation (LoRA). Starting with a set of pre-trained LoRA adapters, our gating strategy uses the hidden states to dynamically mix adapted layers, allowing the resulting X-LoRA model to draw upon different capabilities and create never-before-used deep layer-wise combinations to solve tasks. The design is inspired by the biological principles of universality and diversity, where neural network building blocks are reused in different hierarchical manifestations. Hence, the X-LoRA model can be easily implemented for any existing large language model (LLM) without a need for modifications of the underlying structure. We develop a tailored X-LoRA model that offers scientific capabilities including forward/inverse analysis tasks and enhanced reasoning capability, focused on biomaterial analysis, protein mechanics and design. The impact of this work include access to readily expandable and adaptable models with strong domain knowledge and the capability to integrate across areas of knowledge. Featuring experts in biology, mathematics, reasoning, bio-inspired materials, mechanics and materials, chemistry, protein biophysics, mechanics and quantum-mechanics based molecular properties, we conduct a series of physics-focused case studies. We examine knowledge recall, protein mechanics forward/inverse tasks, protein design, adversarial agentic modeling including ontological knowledge graph construction, as well as molecular design. The model is capable not only of making quantitative predictions of nanomechanical properties of proteins or quantum mechanical molecular properties, but also reasons over the results and correctly predicts likely mechanisms that explain distinct molecular behaviors.*.
|
||||
|
||||
Please cite X-LoRA as:
|
||||
```bibtex
|
||||
@article{10.1063/5.0203126,
|
||||
author = {Buehler, Eric L. and Buehler, Markus J.},
|
||||
title = "{X-LoRA: Mixture of low-rank adapter experts, a flexible framework for large language models with applications in protein mechanics and molecular design}",
|
||||
journal = {APL Machine Learning},
|
||||
volume = {2},
|
||||
number = {2},
|
||||
pages = {026119},
|
||||
year = {2024},
|
||||
month = {05},
|
||||
abstract = "{We report a mixture of expert strategy to create fine-tuned large language models using a deep layer-wise token-level approach based on low-rank adaptation (LoRA). Starting with a set of pre-trained LoRA adapters, our gating strategy uses the hidden states to dynamically mix adapted layers, allowing the resulting X-LoRA model to draw upon different capabilities and create never-before-used deep layer-wise combinations to solve tasks. The design is inspired by the biological principles of universality and diversity, where neural network building blocks are reused in different hierarchical manifestations. Hence, the X-LoRA model can be easily implemented for any existing large language model without a need for modifications of the underlying structure. We develop a tailored X-LoRA model that offers scientific capabilities, including forward/inverse analysis tasks and enhanced reasoning capability, focused on biomaterial analysis, protein mechanics, and design. The impact of this work includes access to readily expandable and adaptable models with strong domain knowledge and the capability to integrate across areas of knowledge. Featuring experts in biology, mathematics, reasoning, bio-inspired materials, mechanics and materials, chemistry, protein biophysics, mechanics, and quantum-mechanics based molecular properties, we conduct a series of physics-focused case studies. We examine knowledge recall, protein mechanics forward/inverse tasks, protein design, adversarial agentic modeling including ontological knowledge graph construction, and molecular design. The model is capable not only of making quantitative predictions of nanomechanical properties of proteins or quantum mechanical molecular properties but also reasoning over the results and correctly predicting likely mechanisms that explain distinct molecular behaviors.}",
|
||||
issn = {2770-9019},
|
||||
doi = {10.1063/5.0203126},
|
||||
url = {https://doi.org/10.1063/5.0203126},
|
||||
eprint = {https://pubs.aip.org/aip/aml/article-pdf/doi/10.1063/5.0203126/19964043/026119\_1\_5.0203126.pdf},
|
||||
}
|
||||
```
|
||||
|
||||
## XLoraConfig
|
||||
|
||||
[[autodoc]] tuners.xlora.config.XLoraConfig
|
||||
|
||||
## XLoraModel
|
||||
|
||||
[[autodoc]] tuners.xlora.model.XLoraModel
|
@ -76,7 +76,7 @@ training_args = TrainingArguments(
|
||||
per_device_eval_batch_size=32,
|
||||
num_train_epochs=2,
|
||||
weight_decay=0.01,
|
||||
evaluation_strategy="epoch",
|
||||
eval_strategy="epoch",
|
||||
save_strategy="epoch",
|
||||
load_best_model_at_end=True,
|
||||
)
|
||||
@ -90,7 +90,7 @@ trainer = Trainer(
|
||||
args=training_args,
|
||||
train_dataset=tokenized_datasets["train"],
|
||||
eval_dataset=tokenized_datasets["test"],
|
||||
tokenizer=tokenizer,
|
||||
processing_class=tokenizer,
|
||||
data_collator=data_collator,
|
||||
compute_metrics=compute_metrics,
|
||||
)
|
||||
|
@ -92,7 +92,7 @@ processed_ds = ds.map(
|
||||
)
|
||||
```
|
||||
|
||||
Create a training and evaluation [`DataLoader`](https://pytorch.org/docs/stable/data.html#torch.utils.data.DataLoader), and set `pin_memory=True` to speed up data transfer to the GPU during training if your dataset samples are on a CPU.
|
||||
Create a training and evaluation [`DataLoader`](https://pytorch.org/docs/stable/data.html#torch.utils.data.DataLoader), and set `pin_memory=True` to speed up data transfer to the accelerator during training if your dataset samples are on a CPU.
|
||||
|
||||
```py
|
||||
from torch.utils.data import DataLoader
|
||||
@ -159,12 +159,12 @@ lr_scheduler = get_linear_schedule_with_warmup(
|
||||
)
|
||||
```
|
||||
|
||||
Move the model to the GPU and create a training loop that reports the loss and perplexity for each epoch.
|
||||
Move the model to the accelerator and create a training loop that reports the loss and perplexity for each epoch.
|
||||
|
||||
```py
|
||||
from tqdm import tqdm
|
||||
|
||||
device = "cuda"
|
||||
device = torch.accelerator.current_accelerator().type if hasattr(torch, "accelerator") else "cuda"
|
||||
model = model.to(device)
|
||||
|
||||
for epoch in range(num_epochs):
|
||||
@ -219,7 +219,9 @@ To load the model for inference, use the [`~AutoPeftModelForSeq2SeqLM.from_pretr
|
||||
```py
|
||||
from peft import AutoPeftModelForSeq2SeqLM
|
||||
|
||||
model = AutoPeftModelForSeq2SeqLM.from_pretrained("<your-hf-account-name>/mt0-large-ia3").to("cuda")
|
||||
device = torch.accelerator.current_accelerator().type if hasattr(torch, "accelerator") else "cuda"
|
||||
|
||||
model = AutoPeftModelForSeq2SeqLM.from_pretrained("<your-hf-account-name>/mt0-large-ia3").to(device)
|
||||
tokenizer = AutoTokenizer.from_pretrained("bigscience/mt0-large")
|
||||
|
||||
i = 15
|
||||
|
@ -20,6 +20,8 @@ A popular way to efficiently train large models is to insert (typically in the a
|
||||
|
||||
There are several different ways to express the weight matrix as a low-rank decomposition, but [Low-Rank Adaptation (LoRA)](../conceptual_guides/adapter#low-rank-adaptation-lora) is the most common method. The PEFT library supports several other LoRA variants, such as [Low-Rank Hadamard Product (LoHa)](../conceptual_guides/adapter#low-rank-hadamard-product-loha), [Low-Rank Kronecker Product (LoKr)](../conceptual_guides/adapter#low-rank-kronecker-product-lokr), and [Adaptive Low-Rank Adaptation (AdaLoRA)](../conceptual_guides/adapter#adaptive-low-rank-adaptation-adalora). You can learn more about how these methods work conceptually in the [Adapters](../conceptual_guides/adapter) guide. If you're interested in applying these methods to other tasks and use cases like semantic segmentation, token classification, take a look at our [notebook collection](https://huggingface.co/collections/PEFT/notebooks-6573b28b33e5a4bf5b157fc1)!
|
||||
|
||||
Additionally, PEFT supports the [X-LoRA](../conceptual_guides/adapter#mixture-of-lora-experts-x-lora) Mixture of LoRA Experts method.
|
||||
|
||||
This guide will show you how to quickly train an image classification model - with a low-rank decomposition method - to identify the class of food shown in an image.
|
||||
|
||||
<Tip>
|
||||
@ -257,7 +259,7 @@ batch_size = 128
|
||||
args = TrainingArguments(
|
||||
peft_model_id,
|
||||
remove_unused_columns=False,
|
||||
evaluation_strategy="epoch",
|
||||
eval_strategy="epoch",
|
||||
save_strategy="epoch",
|
||||
learning_rate=5e-3,
|
||||
per_device_train_batch_size=batch_size,
|
||||
@ -279,7 +281,7 @@ trainer = Trainer(
|
||||
args,
|
||||
train_dataset=train_ds,
|
||||
eval_dataset=val_ds,
|
||||
tokenizer=image_processor,
|
||||
processing_class=image_processor,
|
||||
data_collator=collate_fn,
|
||||
)
|
||||
trainer.train()
|
||||
@ -307,7 +309,7 @@ Let's load the model from the Hub and test it out on a food image.
|
||||
|
||||
```py
|
||||
from peft import PeftConfig, PeftModel
|
||||
from transfomers import AutoImageProcessor
|
||||
from transformers import AutoImageProcessor
|
||||
from PIL import Image
|
||||
import requests
|
||||
|
||||
|
@ -43,7 +43,13 @@ Use the [`~datasets.load_dataset`] function to load the dataset and create a new
|
||||
```py
|
||||
from datasets import load_dataset
|
||||
|
||||
ds = load_dataset("ought/raft", "twitter_complaints")
|
||||
ds = load_dataset(
|
||||
"parquet",
|
||||
data_files={
|
||||
"train": "hf://datasets/ought/raft@refs/convert/parquet/twitter_complaints/train/0000.parquet",
|
||||
"test": "hf://datasets/ought/raft@refs/convert/parquet/twitter_complaints/test/0000.parquet"
|
||||
}
|
||||
)
|
||||
|
||||
classes = [k.replace("_", " ") for k in ds["train"].features["Label"].names]
|
||||
ds = ds.map(
|
||||
|
@ -16,7 +16,7 @@ rendered properly in your Markdown viewer.
|
||||
|
||||
# PEFT configurations and models
|
||||
|
||||
The sheer size of today's large pretrained models - which commonly have billions of parameters - present a significant training challenge because they require more storage space and more computational power to crunch all those calculations. You'll need access to powerful GPUs or TPUs to train these large pretrained models which is expensive, not widely accessible to everyone, not environmentally friendly, and not very practical. PEFT methods address many of these challenges. There are several types of PEFT methods (soft prompting, matrix decomposition, adapters), but they all focus on the same thing, reduce the number of trainable parameters. This makes it more accessible to train and store large models on consumer hardware.
|
||||
The sheer size of today's large pretrained models - which commonly have billions of parameters - presents a significant training challenge because they require more storage space and more computational power to crunch all those calculations. You'll need access to powerful GPUs or TPUs to train these large pretrained models which is expensive, not widely accessible to everyone, not environmentally friendly, and not very practical. PEFT methods address many of these challenges. There are several types of PEFT methods (soft prompting, matrix decomposition, adapters), but they all focus on the same thing, reduce the number of trainable parameters. This makes it more accessible to train and store large models on consumer hardware.
|
||||
|
||||
The PEFT library is designed to help you quickly train large models on free or low-cost GPUs, and in this tutorial, you'll learn how to setup a configuration to apply a PEFT method to a pretrained base model for training. Once the PEFT configuration is setup, you can use any training framework you like (Transformer's [`~transformers.Trainer`] class, [Accelerate](https://hf.co/docs/accelerate), a custom PyTorch training loop).
|
||||
|
||||
@ -99,7 +99,7 @@ You can create your own configuration for training by initializing a [`PromptEnc
|
||||
from peft import PromptEncoderConfig, TaskType
|
||||
|
||||
p_tuning_config = PromptEncoderConfig(
|
||||
encoder_reprameterization_type="MLP",
|
||||
encoder_reparameterization_type="MLP",
|
||||
encoder_hidden_size=128,
|
||||
num_attention_heads=16,
|
||||
num_layers=24,
|
||||
@ -135,6 +135,9 @@ lora_model.print_trainable_parameters()
|
||||
"trainable params: 1,572,864 || all params: 332,769,280 || trainable%: 0.472659014678278"
|
||||
```
|
||||
|
||||
> [!WARNING]
|
||||
> When calling [`get_peft_model`], the base model will be modified *in-place*. That means, when calling [`get_peft_model`] on a model that was already modified in the same way before, this model will be further mutated. Therefore, if you would like to modify your PEFT configuration after having called [`get_peft_model()`] before, you would first have to unload the model with [`~LoraModel.unload`] and then call [`get_peft_model()`] with your new configuration. Alternatively, you can re-initialize the model to ensure a fresh, unmodified state before applying a new PEFT configuration.
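
For illustration, a minimal sketch of that workflow, assuming a LoRA configuration (the model name and ranks are placeholders):

```py
from transformers import AutoModelForCausalLM
from peft import LoraConfig, get_peft_model

base_model = AutoModelForCausalLM.from_pretrained("facebook/opt-350m")
peft_model = get_peft_model(base_model, LoraConfig(r=8))

# to try a different configuration, first restore the unmodified base model ...
base_model = peft_model.unload()
# ... then apply the new configuration to the freshly unloaded model
peft_model = get_peft_model(base_model, LoraConfig(r=16, lora_alpha=32))
```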
|
||||
|
||||
Now you can train the [`PeftModel`] with your preferred training framework! After training, you can save your model locally with [`~PeftModel.save_pretrained`] or upload it to the Hub with the [`~transformers.PreTrainedModel.push_to_hub`] method.
|
||||
|
||||
```py
|
||||
|
76
examples/alora_finetuning/README.md
Normal file
@ -0,0 +1,76 @@
|
||||
# Activated LoRA (aLoRA)
|
||||
|
||||
## Introduction
|
||||
Activated LoRA (aLoRA) is an adapter that selectively activates its weights only after a given invocation sequence, ensuring that hidden states match the base model prior to this point. This allows reusing the base model KVs (stored in the KV cache) for tokens before the invocation,
|
||||
enabling much faster real-world inference (e.g. vLLM) when switching between generation with the base model and generation with adapters.
|
||||
See the [paper](https://huggingface.co/papers/2504.12397) for more details.
|
||||
|
||||
## Quick start (shown for Mistral 7B)
|
||||
```python
|
||||
import torch
|
||||
from peft import LoraConfig, get_peft_model
|
||||
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, DataCollatorForLanguageModeling
|
||||
from datasets import load_dataset
|
||||
|
||||
model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-Instruct-v0.3", device_map="cuda")
|
||||
tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.3")
|
||||
dataset = load_dataset("Lots-of-LoRAs/task1660_super_glue_question_generation", split="train")
|
||||
|
||||
invocation_string = "[/INST]" # End of user turn in Mistral chat template
|
||||
invocation_tokens = tokenizer.encode(invocation_string, add_special_tokens=False)
|
||||
|
||||
lora_config = LoraConfig(
|
||||
task_type="CAUSAL_LM",
|
||||
alora_invocation_tokens=invocation_tokens,
|
||||
r=32,
|
||||
target_modules=["q_proj", "k_proj", "v_proj"],
|
||||
)
|
||||
|
||||
peft_model = get_peft_model(model, lora_config)
|
||||
tokenizer.pad_token = tokenizer.eos_token  # Mistral's tokenizer has no pad token by default

# Tokenize the chat-formatted examples; `Trainer` itself does not accept
# `dataset_text_field`/`max_seq_length`, so the dataset is prepared up front.
def tokenize(example):
    text = tokenizer.apply_chat_template(
        [{"role": "user", "content": example["input"]}, {"role": "assistant", "content": example["output"]}],
        tokenize=False,
    )
    return tokenizer(text, truncation=True, max_length=2048)

dataset = dataset.map(tokenize, remove_columns=dataset.column_names)

data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)
trainer = Trainer(
    model=peft_model,
    train_dataset=dataset,
    processing_class=tokenizer,
    data_collator=data_collator,
)
|
||||
trainer.train()
|
||||
peft_model.save_pretrained("alora-mistral-7b")
|
||||
```
|
||||
|
||||
### Use the training example script directly
|
||||
Pass the invocation string with `--invocation_string` when running the training example
|
||||
script. For Mistral 7B, do:
|
||||
```bash
|
||||
python examples/alora_finetuning/alora_finetuning.py --base_model mistralai/Mistral-7B-Instruct-v0.3 --data_path Lots-of-LoRAs/task1660_super_glue_question_generation --invocation_string "[/INST]"
|
||||
```
|
||||
and similarly for Llama-3.2-3B-Instruct:
|
||||
```bash
|
||||
python examples/alora_finetuning/alora_finetuning.py --base_model meta-llama/Llama-3.2-3B-Instruct --data_path Lots-of-LoRAs/task1660_super_glue_question_generation --invocation_string "<|start_header_id|>assistant<|end_header_id|>"
|
||||
```
|
||||
|
||||
### Full example of the script
|
||||
```bash
|
||||
python alora_finetuning.py \
|
||||
--base_model "PATH_TO_MODEL" \
|
||||
--data_path "PATH_TO_DATASET" \
|
||||
--output_dir "PATH_TO_OUTPUT_DIR" \
|
||||
--batch_size 1 \
|
||||
--num_epochs 3 \
|
||||
--learning_rate 3e-4 \
|
||||
--cutoff_len 512 \
|
||||
--val_set_size 500 \
|
||||
--invocation_string "[/INST]" \
|
||||
--quantize \
|
||||
--eval_step 10 \
|
||||
--save_step 100 \
|
||||
--device "cuda:0" \
|
||||
--lora_r 32 \
|
||||
--lora_alpha 32 \
|
||||
--lora_dropout 0.05 \
|
||||
--lora_target_modules "q_proj,k_proj,v_proj,o_proj,gate_proj,up_proj,down_proj" \
|
||||
--hub_model_id "YOUR_HF_REPO" \
|
||||
--push_to_hub
|
||||
```
|
251
examples/alora_finetuning/alora_finetuning.py
Normal file
@ -0,0 +1,251 @@
|
||||
import os
|
||||
|
||||
import torch
|
||||
from datasets import load_dataset
|
||||
from transformers import (
|
||||
AutoModelForCausalLM,
|
||||
AutoTokenizer,
|
||||
BitsAndBytesConfig,
|
||||
DataCollatorForLanguageModeling,
|
||||
Trainer,
|
||||
TrainingArguments,
|
||||
)
|
||||
|
||||
from peft import LoraConfig, PeftModel, get_peft_model, prepare_model_for_kbit_training
|
||||
|
||||
|
||||
def train_model(
|
||||
base_model: str,
|
||||
data_path: str,
|
||||
output_dir: str,
|
||||
batch_size: int,
|
||||
num_epochs: int,
|
||||
learning_rate: float,
|
||||
cutoff_len: int,
|
||||
val_set_size: int,
|
||||
invocation_string: str,
|
||||
quantize: bool,
|
||||
eval_step: int,
|
||||
save_step: int,
|
||||
device: str,
|
||||
lora_r: int,
|
||||
lora_alpha: int,
|
||||
lora_dropout: float,
|
||||
lora_target_modules: str,
|
||||
hub_model_id: str,
|
||||
push_to_hub: bool,
|
||||
):
|
||||
os.environ["TOKENIZERS_PARALLELISM"] = "false"
|
||||
hf_token = os.getenv("HF_TOKEN")
|
||||
|
||||
device = torch.device(device)
|
||||
print(f"Using device: {device}")
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained(base_model, token=hf_token)
|
||||
tokenizer.pad_token = tokenizer.unk_token
|
||||
invocation_tokens = tokenizer.encode(invocation_string, add_special_tokens=False)
|
||||
|
||||
if quantize:
|
||||
model = AutoModelForCausalLM.from_pretrained(
|
||||
base_model,
|
||||
token=hf_token,
|
||||
quantization_config=BitsAndBytesConfig(
|
||||
load_in_4bit=True,
|
||||
bnb_4bit_compute_dtype=(
|
||||
torch.bfloat16 if torch.cuda.is_available() and torch.cuda.is_bf16_supported() else torch.float16
|
||||
),
|
||||
bnb_4bit_use_double_quant=True,
|
||||
bnb_4bit_quant_type="nf4",
|
||||
),
|
||||
)
|
||||
model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=True)
|
||||
else:
|
||||
model = AutoModelForCausalLM.from_pretrained(base_model, token=hf_token)
|
||||
|
||||
lora_config = LoraConfig(
|
||||
task_type="CAUSAL_LM",
|
||||
alora_invocation_tokens=invocation_tokens,
|
||||
r=lora_r,
|
||||
lora_alpha=lora_alpha,
|
||||
target_modules=(lora_target_modules.split(",") if lora_target_modules else ["q_proj", "k_proj", "v_proj"]),
|
||||
lora_dropout=lora_dropout,
|
||||
bias="none",
|
||||
)
|
||||
|
||||
model = get_peft_model(model, lora_config)
|
||||
|
||||
model.to(device)
|
||||
tokenizer.pad_token = tokenizer.eos_token
|
||||
|
||||
dataset = load_dataset(data_path)
|
||||
|
||||
def tokenize_function(examples):
|
||||
formatted_texts = [
|
||||
tokenizer.apply_chat_template(
|
||||
[
|
||||
{"role": "user", "content": user_msg},
|
||||
{"role": "assistant", "content": assistant_msg},
|
||||
],
|
||||
tokenize=False, # get plain text first
|
||||
add_generation_prompt=False,
|
||||
)
|
||||
for user_msg, assistant_msg in zip(examples["input"], examples["output"])
|
||||
]
|
||||
|
||||
# 2) Tokenize those texts
|
||||
model_inputs = tokenizer(
|
||||
formatted_texts,
|
||||
padding="max_length",
|
||||
truncation=True,
|
||||
max_length=cutoff_len,
|
||||
)
|
||||
|
||||
labels = []
|
||||
for ids in model_inputs["input_ids"]:
|
||||
labels.append([(token_id if token_id != tokenizer.pad_token_id else -100) for token_id in ids])
|
||||
model_inputs["labels"] = labels
|
||||
|
||||
return model_inputs
|
||||
|
||||
# Tokenize the dataset and prepare for training
|
||||
tokenized_datasets = dataset.map(tokenize_function, batched=True, remove_columns=dataset["train"].column_names)
|
||||
|
||||
# Data collator to dynamically pad the batched examples
|
||||
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)
|
||||
|
||||
training_args = TrainingArguments(
|
||||
output_dir=output_dir,
|
||||
num_train_epochs=num_epochs,
|
||||
per_device_train_batch_size=batch_size,
|
||||
per_device_eval_batch_size=batch_size,
|
||||
warmup_steps=100,
|
||||
weight_decay=0.01,
|
||||
logging_dir="./logs",
|
||||
logging_steps=eval_step,
|
||||
save_steps=save_step,
|
||||
save_total_limit=2,
|
||||
push_to_hub=push_to_hub,
|
||||
hub_model_id=hub_model_id,
|
||||
gradient_accumulation_steps=16,
|
||||
fp16=True,
|
||||
learning_rate=learning_rate,
|
||||
hub_token=hf_token,
|
||||
)
|
||||
|
||||
torch.cuda.empty_cache()
|
||||
|
||||
trainer = Trainer(
|
||||
model=model,
|
||||
args=training_args,
|
||||
train_dataset=tokenized_datasets["train"],
|
||||
eval_dataset=tokenized_datasets["test"],
|
||||
data_collator=data_collator,
|
||||
)
|
||||
|
||||
trainer.train()
|
||||
|
||||
if push_to_hub:
|
||||
trainer.push_to_hub(commit_message="Fine-tuned model")
|
||||
|
||||
model.save_pretrained(output_dir)
|
||||
tokenizer.save_pretrained(output_dir)
|
||||
|
||||
|
||||
def model_inference(model_path: str, adapter_path: str, prompt: str = None, data_path: str = None):
|
||||
"""
|
||||
Simple inference with the tuned aLoRA adapter. Optionally (reuse_cache = True) demonstrates
|
||||
that the aLoRA adapter can (but does not need to) use KV cache created by the base model,
|
||||
perhaps during a prior generation turn.
|
||||
|
||||
Purely for demonstration purposes. See the [paper](https://huggingface.co/papers/2504.12397)
|
||||
for realistic multiturn cache reuse examples.
|
||||
"""
|
||||
if prompt is None:
|
||||
# Use first row of test data
|
||||
dataset = load_dataset(data_path)
|
||||
prompt = dataset["test"][0]["input"]
|
||||
tokenizer = AutoTokenizer.from_pretrained(model_path)
|
||||
base_model = AutoModelForCausalLM.from_pretrained(model_path)
|
||||
alora_model = PeftModel.from_pretrained(base_model, adapter_path)
|
||||
chat = [{"role": "user", "content": prompt}]
|
||||
text = tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
|
||||
inputs = tokenizer(text, return_tensors="pt").to(base_model.device)
|
||||
|
||||
# Generate answer with adapter
|
||||
output_dict = alora_model.generate(**inputs, return_dict_in_generate=True, max_new_tokens=20)
|
||||
alora_outputs = output_dict.sequences
|
||||
|
||||
# Print results
|
||||
print(f"Prompt: {text}")
|
||||
response = tokenizer.decode(alora_outputs[0][inputs["input_ids"].shape[1] :], skip_special_tokens=True)
|
||||
print(f"Trained adapter response: {response}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
import argparse
|
||||
|
||||
parser = argparse.ArgumentParser(description="Fine-tune Mistral with Activated LoRA")
|
||||
parser.add_argument(
|
||||
"--base_model", type=str, default="mistralai/Mistral-7B-Instruct-v0.3", help="Base model path or name"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--data_path",
|
||||
type=str,
|
||||
default="Lots-of-LoRAs/task1660_super_glue_question_generation",
|
||||
help="Dataset path or name",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--output_dir", type=str, default="path/to/output", help="Output directory for the fine-tuned model"
|
||||
)
|
||||
parser.add_argument("--batch_size", type=int, default=2, help="Batch size")
|
||||
parser.add_argument("--num_epochs", type=int, default=1, help="Number of training epochs")
|
||||
parser.add_argument("--learning_rate", type=float, default=1e-4, help="Learning rate")
|
||||
parser.add_argument("--cutoff_len", type=int, default=2048, help="Cutoff length for tokenization")
|
||||
parser.add_argument("--val_set_size", type=int, default=500, help="Validation set size")
|
||||
parser.add_argument(
|
||||
"--invocation_string",
|
||||
type=str,
|
||||
default="[/INST]",
|
||||
help="String that activates the aLoRA adapter. Model dependent.",
|
||||
)
|
||||
parser.add_argument("--quantize", action="store_true", help="Use quantization")
|
||||
parser.add_argument("--eval_step", type=int, default=10, help="Evaluation step interval")
|
||||
parser.add_argument("--save_step", type=int, default=100, help="Save step interval")
|
||||
parser.add_argument("--device", type=str, default="cuda:0", help="Device to use for training")
|
||||
parser.add_argument("--lora_r", type=int, default=32, help="LoRA rank")
|
||||
parser.add_argument("--lora_alpha", type=int, default=32, help="LoRA alpha")
|
||||
parser.add_argument("--lora_dropout", type=float, default=0.05, help="LoRA dropout rate")
|
||||
parser.add_argument(
|
||||
"--lora_target_modules", type=str, default=None, help="Comma-separated list of target modules for LoRA"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--hub_model_id",
|
||||
type=str,
|
||||
default="path/to/repo",
|
||||
help="Repository name to push the model on the Hugging Face Hub",
|
||||
)
|
||||
parser.add_argument("--push_to_hub", action="store_true", help="Whether to push the model to Hugging Face Hub")
|
||||
args = parser.parse_args()
|
||||
train_model(
|
||||
base_model=args.base_model,
|
||||
data_path=args.data_path,
|
||||
output_dir=args.output_dir,
|
||||
batch_size=args.batch_size,
|
||||
num_epochs=args.num_epochs,
|
||||
learning_rate=args.learning_rate,
|
||||
cutoff_len=args.cutoff_len,
|
||||
val_set_size=args.val_set_size,
|
||||
invocation_string=args.invocation_string,
|
||||
quantize=args.quantize,
|
||||
eval_step=args.eval_step,
|
||||
save_step=args.save_step,
|
||||
device=args.device,
|
||||
lora_r=args.lora_r,
|
||||
lora_alpha=args.lora_alpha,
|
||||
lora_dropout=args.lora_dropout,
|
||||
lora_target_modules=args.lora_target_modules,
|
||||
hub_model_id=args.hub_model_id,
|
||||
push_to_hub=args.push_to_hub,
|
||||
)
|
||||
print("Model trained. Running test inference.")
|
||||
model_inference(model_path=args.base_model, adapter_path=args.output_dir, data_path=args.data_path)
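# Example invocation (sketch): the script filename below is an assumption; the flag
# values simply restate the argparse defaults defined above.
#
#   python alora_finetuning.py \
#       --base_model mistralai/Mistral-7B-Instruct-v0.3 \
#       --data_path Lots-of-LoRAs/task1660_super_glue_question_generation \
#       --output_dir path/to/output \
#       --invocation_string "[/INST]"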
|
@ -19,9 +19,9 @@ rendered properly in your Markdown viewer.
|
||||
|
||||
This guide demonstrates how to use BOFT, an orthogonal fine-tuning method, to fine-tune Stable Diffusion with either `stabilityai/stable-diffusion-2-1` or `runwayml/stable-diffusion-v1-5` model for controllable generation.
|
||||
|
||||
By using BOFT from 🤗 PEFT, we can significantly reduce the number of trainable parameters while still achieving impressive results in various fine-tuning tasks across different foundation models. BOFT enhances model efficiency by integrating full-rank orthogonal matrices with a butterfly structure into specific model blocks, such as attention blocks, mirroring the approach used in LoRA. During fine-tuning, only these inserted matrices are trained, leaving the original model parameters untouched. During inference, the trainable BOFT paramteres can be merged into the original model, eliminating any additional computational costs.
|
||||
By using BOFT from 🤗 PEFT, we can significantly reduce the number of trainable parameters while still achieving impressive results in various fine-tuning tasks across different foundation models. BOFT enhances model efficiency by integrating full-rank orthogonal matrices with a butterfly structure into specific model blocks, such as attention blocks, mirroring the approach used in LoRA. During fine-tuning, only these inserted matrices are trained, leaving the original model parameters untouched. During inference, the trainable BOFT parameters can be merged into the original model, eliminating any additional computational costs.
|
||||
|
||||
As a member of the **orthogonal finetuning** class, BOFT presents a systematic and principled method for fine-tuning. It possesses several unique properties and has demonstrated superior performance compared to LoRA in a variety of scenarios. For further details on BOFT, please consult the [PEFT's GitHub repo's concept guide OFT](https://https://huggingface.co/docs/peft/index), the [original BOFT paper](https://arxiv.org/abs/2311.06243) and the [original OFT paper](https://arxiv.org/abs/2306.07280).
|
||||
As a member of the **orthogonal finetuning** class, BOFT presents a systematic and principled method for fine-tuning. It possesses several unique properties and has demonstrated superior performance compared to LoRA in a variety of scenarios. For further details on BOFT, please consult the [concept guide on OFT in the PEFT docs](https://huggingface.co/docs/peft/index), the [original BOFT paper](https://huggingface.co/papers/2311.06243) and the [original OFT paper](https://huggingface.co/papers/2306.07280).
|
||||
|
||||
In this guide we provide a controllable generation (ControlNet) fine-tuning script that is available in [PEFT's GitHub repo examples](https://github.com/huggingface/peft/tree/main/examples/boft_controlnet). This implementation is adapted from [diffusers's ControlNet](https://github.com/huggingface/diffusers/tree/main/examples/controlnet) and [Hecong Wu's ControlLoRA](https://github.com/HighCWu/ControlLoRA). You can try it out and finetune on your custom images.
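Before running it, a minimal, hedged sketch of what a BOFT configuration looks like in 🤗 PEFT may help to orient you; the target modules and block settings below are illustrative assumptions rather than the exact values wired into the training script:

```python
import torch
from diffusers import UNet2DConditionModel
from peft import BOFTConfig, get_peft_model

# Load only the UNet of Stable Diffusion 2.1 (the component this guide fine-tunes).
unet = UNet2DConditionModel.from_pretrained(
    "stabilityai/stable-diffusion-2-1", subfolder="unet", torch_dtype=torch.float32
)

boft_config = BOFTConfig(
    boft_block_num=8,            # number of blocks per BOFT matrix (illustrative)
    boft_block_size=0,           # set either boft_block_num or boft_block_size, not both
    boft_n_butterfly_factor=1,   # 1 corresponds to plain block-diagonal OFT
    target_modules=["to_q", "to_k", "to_v", "to_out.0"],  # assumed attention projections
    boft_dropout=0.1,
    bias="boft_only",
)
unet = get_peft_model(unet, boft_config)
unet.print_trainable_parameters()  # only the inserted orthogonal matrices are trainable
```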
|
||||
|
||||
@ -58,7 +58,7 @@ export DATASET_NAME="oftverse/control-celeba-hq"
|
||||
|
||||
## Train controllable generation (ControlNet) with BOFT
|
||||
|
||||
Start with setting some hyperparamters for BOFT:
|
||||
Start with setting some hyperparameters for BOFT:
|
||||
```bash
|
||||
PEFT_TYPE="boft"
|
||||
BLOCK_NUM=8
|
||||
@ -174,4 +174,4 @@ accelerate launch eval.py \
|
||||
--output_dir=$OUTPUT_DIR \
|
||||
--dataset_name=$DATASET_NAME \
|
||||
--vis_overlays \
|
||||
```
|
||||
```
|
||||
|
@ -13,7 +13,7 @@
|
||||
# limitations under the License.
|
||||
|
||||
# The implementation is based on "Parameter-Efficient Orthogonal Finetuning
|
||||
# via Butterfly Factorization" (https://arxiv.org/abs/2311.06243) in ICLR 2024.
|
||||
# via Butterfly Factorization" (https://huggingface.co/papers/2311.06243) in ICLR 2024.
|
||||
|
||||
import glob
|
||||
import os
|
||||
@ -32,8 +32,14 @@ from utils.args_loader import parse_args
|
||||
from utils.dataset import make_dataset
|
||||
|
||||
|
||||
detect_model = face_alignment.FaceAlignment(face_alignment.LandmarksType.TWO_D, device="cuda:0", flip_input=False)
|
||||
# Determine the best available device
|
||||
if torch.cuda.is_available():
|
||||
device = "cuda:0"
|
||||
else:
|
||||
# TODO: xpu support in facealignment will be ready after this PR is merged:https://github.com/1adrianb/face-alignment/pull/371
|
||||
device = "cpu"
|
||||
|
||||
detect_model = face_alignment.FaceAlignment(face_alignment.LandmarksType.TWO_D, device=device, flip_input=False)
|
||||
# with open('./data/celebhq-text/prompt_val_blip_full.json', 'rt') as f: # fill50k, COCO
|
||||
# for line in f:
|
||||
# val_data = json.loads(line)
|
||||
|
@ -1,8 +1,10 @@
|
||||
datasets==2.16.1
|
||||
diffusers==0.17.1
|
||||
transformers==4.36.2
|
||||
accelerate==0.25.0
|
||||
diffusers==0.34.0
|
||||
transformers==4.54.0
|
||||
accelerate==1.9.0
|
||||
wandb==0.16.1
|
||||
scikit-image==0.22.0
|
||||
opencv-python==4.9.0.80
|
||||
face-alignment==1.4.1
|
||||
git+https://github.com/1adrianb/face-alignment.git
|
||||
huggingface_hub==0.34.3
|
||||
numpy<2.0.0
|
@ -13,7 +13,7 @@
|
||||
# limitations under the License.
|
||||
|
||||
# The implementation is based on "Parameter-Efficient Orthogonal Finetuning
|
||||
# via Butterfly Factorization" (https://arxiv.org/abs/2311.06243) in ICLR 2024.
|
||||
# via Butterfly Factorization" (https://huggingface.co/papers/2311.06243) in ICLR 2024.
|
||||
|
||||
import os
|
||||
import sys
|
||||
@ -37,12 +37,17 @@ from utils.unet_2d_condition import UNet2DConditionNewModel
|
||||
|
||||
|
||||
sys.path.append("../../src")
|
||||
from peft import PeftModel
|
||||
from peft import PeftModel # noqa: E402
|
||||
|
||||
|
||||
# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
|
||||
check_min_version("0.10.0.dev0")
|
||||
device = torch.device("cuda:0")
|
||||
if torch.xpu.is_available():
|
||||
device = "xpu:0"
|
||||
elif torch.cuda.is_available():
|
||||
device = "cuda:0"
|
||||
else:
|
||||
device = "cpu"
|
||||
|
||||
|
||||
def main(args):
|
||||
|
@ -13,7 +13,7 @@ export DATASET_NAME="oftverse/control-celeba-hq"
|
||||
export CKPT_NAME="checkpoint-${ITER_NUM}"
|
||||
export OUTPUT_DIR="./output/${DATASET_NAME}/${RUN_NAME}/${CKPT_NAME}"
|
||||
export CONTROLNET_PATH="${OUTPUT_DIR}/controlnet/model.safetensors"
|
||||
export UNET_PATH="${OUTPUT_DIR}/unet/${RUN_NAME}"
|
||||
export UNET_PATH="${OUTPUT_DIR}/unet"
|
||||
export RESULTS_PATH="${OUTPUT_DIR}/results"
|
||||
|
||||
|
||||
|
@ -14,7 +14,7 @@
|
||||
# limitations under the License.
|
||||
|
||||
# The implementation is based on "Parameter-Efficient Orthogonal Finetuning
|
||||
# via Butterfly Factorization" (https://arxiv.org/abs/2311.06243) in ICLR 2024.
|
||||
# via Butterfly Factorization" (https://huggingface.co/papers/2311.06243) in ICLR 2024.
|
||||
|
||||
import itertools
|
||||
import logging
|
||||
@ -215,7 +215,9 @@ def main(args):
|
||||
text_encoder.to(accelerator.device, dtype=weight_dtype)
|
||||
|
||||
if args.enable_xformers_memory_efficient_attention:
|
||||
if is_xformers_available():
|
||||
if accelerator.device.type == "xpu":
|
||||
logger.warning("XPU doesn't support xformers yet, xformers is not applied.")
|
||||
elif is_xformers_available():
|
||||
import xformers
|
||||
|
||||
xformers_version = version.parse(xformers.__version__)
|
||||
@ -513,11 +515,17 @@ def main(args):
|
||||
break
|
||||
|
||||
# Printing the GPU memory usage details such as allocated memory, peak memory, and total memory usage
|
||||
accelerator.print(f"GPU Memory before entering the train : {b2mb(tracemalloc.begin)}")
|
||||
accelerator.print(f"GPU Memory consumed at the end of the train (end-begin): {tracemalloc.used}")
|
||||
accelerator.print(f"GPU Peak Memory consumed during the train (max-begin): {tracemalloc.peaked}")
|
||||
accelerator.print(
|
||||
f"GPU Total Peak Memory consumed during the train (max): {tracemalloc.peaked + b2mb(tracemalloc.begin)}"
|
||||
f"{accelerator.device.type.upper()} Memory before entering the train : {b2mb(tracemalloc.begin)}"
|
||||
)
|
||||
accelerator.print(
|
||||
f"{accelerator.device.type.upper()} Memory consumed at the end of the train (end-begin): {tracemalloc.used}"
|
||||
)
|
||||
accelerator.print(
|
||||
f"{accelerator.device.type.upper()} Peak Memory consumed during the train (max-begin): {tracemalloc.peaked}"
|
||||
)
|
||||
accelerator.print(
|
||||
f"{accelerator.device.type.upper()} Total Peak Memory consumed during the train (max): {tracemalloc.peaked + b2mb(tracemalloc.begin)}"
|
||||
)
|
||||
|
||||
accelerator.print(f"CPU Memory before entering the train : {b2mb(tracemalloc.cpu_begin)}")
|
||||
|
@ -14,13 +14,13 @@
|
||||
|
||||
|
||||
from dataclasses import dataclass
|
||||
from typing import Dict, List, Optional, Tuple, Union
|
||||
from typing import Optional, Union
|
||||
|
||||
import torch
|
||||
from diffusers.configuration_utils import ConfigMixin, register_to_config
|
||||
from diffusers.models.attention_processor import AttentionProcessor, AttnProcessor
|
||||
from diffusers.models.modeling_utils import ModelMixin
|
||||
from diffusers.models.unet_2d_blocks import (
|
||||
from diffusers.models.unets.unet_2d_blocks import (
|
||||
CrossAttnDownBlock2D,
|
||||
DownBlock2D,
|
||||
)
|
||||
@ -34,13 +34,13 @@ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
|
||||
|
||||
@dataclass
|
||||
class ControlNetOutput(BaseOutput):
|
||||
down_block_res_samples: Tuple[torch.Tensor]
|
||||
down_block_res_samples: tuple[torch.Tensor]
|
||||
mid_block_res_sample: torch.Tensor
|
||||
|
||||
|
||||
class ControlNetConditioningEmbedding(nn.Module):
|
||||
"""
|
||||
Quoting from https://arxiv.org/abs/2302.05543: "Stable Diffusion uses a pre-processing method similar to VQ-GAN
|
||||
Quoting from https://huggingface.co/papers/2302.05543: "Stable Diffusion uses a pre-processing method similar to VQ-GAN
|
||||
[11] to convert the entire dataset of 512 × 512 images into smaller 64 × 64 “latent images” for stabilized
|
||||
training. This requires ControlNets to convert image-based conditions to 64 × 64 feature space to match the
|
||||
convolution size. We use a tiny network E(·) of four convolution layers with 4 × 4 kernels and 2 × 2 strides
|
||||
@ -52,7 +52,7 @@ class ControlNetConditioningEmbedding(nn.Module):
|
||||
self,
|
||||
conditioning_embedding_channels: int,
|
||||
conditioning_channels: int = 3,
|
||||
block_out_channels: Tuple[int] = (16, 32, 96, 256),
|
||||
block_out_channels: tuple[int] = (16, 32, 96, 256),
|
||||
):
|
||||
super().__init__()
|
||||
|
||||
@ -92,7 +92,7 @@ class ControlNetModel(ModelMixin, ConfigMixin):
|
||||
in_channels: int = 4,
|
||||
out_channels: int = 320,
|
||||
controlnet_conditioning_channel_order: str = "rgb",
|
||||
conditioning_embedding_out_channels: Optional[Tuple[int]] = (16, 32, 96, 256),
|
||||
conditioning_embedding_out_channels: Optional[tuple[int]] = (16, 32, 96, 256),
|
||||
):
|
||||
super().__init__()
|
||||
|
||||
@ -104,7 +104,7 @@ class ControlNetModel(ModelMixin, ConfigMixin):
|
||||
|
||||
@property
|
||||
# Copied from diffusers.models.unet_2d_condition.UNet2DConditionModel.attn_processors
|
||||
def attn_processors(self) -> Dict[str, AttentionProcessor]:
|
||||
def attn_processors(self) -> dict[str, AttentionProcessor]:
|
||||
r"""
|
||||
Returns:
|
||||
`dict` of attention processors: A dictionary containing all attention processors used in the model with
|
||||
@ -113,7 +113,7 @@ class ControlNetModel(ModelMixin, ConfigMixin):
|
||||
# set recursively
|
||||
processors = {}
|
||||
|
||||
def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: Dict[str, AttentionProcessor]):
|
||||
def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: dict[str, AttentionProcessor]):
|
||||
if hasattr(module, "set_processor"):
|
||||
processors[f"{name}.processor"] = module.processor
|
||||
|
||||
@ -128,7 +128,7 @@ class ControlNetModel(ModelMixin, ConfigMixin):
|
||||
return processors
|
||||
|
||||
# Copied from diffusers.models.unet_2d_condition.UNet2DConditionModel.set_attn_processor
|
||||
def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]):
|
||||
def set_attn_processor(self, processor: Union[AttentionProcessor, dict[str, AttentionProcessor]]):
|
||||
r"""
|
||||
Parameters:
|
||||
`processor (`dict` of `AttentionProcessor` or `AttentionProcessor`):
|
||||
@ -220,7 +220,7 @@ class ControlNetModel(ModelMixin, ConfigMixin):
|
||||
# Recursively walk through all the children.
|
||||
# Any children which exposes the set_attention_slice method
|
||||
# gets the message
|
||||
def fn_recursive_set_attention_slice(module: torch.nn.Module, slice_size: List[int]):
|
||||
def fn_recursive_set_attention_slice(module: torch.nn.Module, slice_size: list[int]):
|
||||
if hasattr(module, "set_attention_slice"):
|
||||
module.set_attention_slice(slice_size.pop())
|
||||
|
||||
@ -238,7 +238,7 @@ class ControlNetModel(ModelMixin, ConfigMixin):
|
||||
def forward(
|
||||
self,
|
||||
controlnet_cond: torch.FloatTensor,
|
||||
) -> Union[ControlNetOutput, Tuple]:
|
||||
) -> Union[ControlNetOutput, tuple]:
|
||||
# check channel order
|
||||
channel_order = self.config.controlnet_conditioning_channel_order
|
||||
|
||||
|
@ -13,14 +13,14 @@
|
||||
# limitations under the License.
|
||||
|
||||
from dataclasses import dataclass
|
||||
from typing import Any, Callable, Dict, List, Optional, Union
|
||||
from typing import Any, Callable, Optional, Union
|
||||
|
||||
import numpy as np
|
||||
import PIL.Image
|
||||
import torch
|
||||
from diffusers.pipelines.controlnet.multicontrolnet import MultiControlNetModel
|
||||
from diffusers.pipelines.controlnet.pipeline_controlnet import StableDiffusionControlNetPipeline
|
||||
from diffusers.utils import BaseOutput, is_compiled_module, logging
|
||||
from diffusers.utils import BaseOutput, logging
|
||||
from torch.nn import functional as F
|
||||
from utils.light_controlnet import ControlNetModel
|
||||
|
||||
@ -42,8 +42,8 @@ class LightControlNetPipelineOutput(BaseOutput):
|
||||
(nsfw) content, or `None` if safety checking could not be performed.
|
||||
"""
|
||||
|
||||
images: Union[List[PIL.Image.Image], np.ndarray]
|
||||
nsfw_content_detected: Optional[List[bool]]
|
||||
images: Union[list[PIL.Image.Image], np.ndarray]
|
||||
nsfw_content_detected: Optional[list[bool]]
|
||||
|
||||
|
||||
class LightControlNetPipeline(StableDiffusionControlNetPipeline):
|
||||
@ -164,23 +164,23 @@ class LightControlNetPipeline(StableDiffusionControlNetPipeline):
|
||||
@torch.no_grad()
|
||||
def __call__(
|
||||
self,
|
||||
prompt: Union[str, List[str]] = None,
|
||||
prompt: Union[str, list[str]] = None,
|
||||
image: Union[
|
||||
torch.FloatTensor,
|
||||
PIL.Image.Image,
|
||||
np.ndarray,
|
||||
List[torch.FloatTensor],
|
||||
List[PIL.Image.Image],
|
||||
List[np.ndarray],
|
||||
list[torch.FloatTensor],
|
||||
list[PIL.Image.Image],
|
||||
list[np.ndarray],
|
||||
] = None,
|
||||
height: Optional[int] = None,
|
||||
width: Optional[int] = None,
|
||||
num_inference_steps: int = 50,
|
||||
guidance_scale: float = 7.5,
|
||||
negative_prompt: Optional[Union[str, List[str]]] = None,
|
||||
negative_prompt: Optional[Union[str, list[str]]] = None,
|
||||
num_images_per_prompt: Optional[int] = 1,
|
||||
eta: float = 0.0,
|
||||
generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
|
||||
generator: Optional[Union[torch.Generator, list[torch.Generator]]] = None,
|
||||
latents: Optional[torch.FloatTensor] = None,
|
||||
prompt_embeds: Optional[torch.FloatTensor] = None,
|
||||
negative_prompt_embeds: Optional[torch.FloatTensor] = None,
|
||||
@ -188,8 +188,8 @@ class LightControlNetPipeline(StableDiffusionControlNetPipeline):
|
||||
return_dict: bool = True,
|
||||
callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
|
||||
callback_steps: int = 1,
|
||||
cross_attention_kwargs: Optional[Dict[str, Any]] = None,
|
||||
controlnet_conditioning_scale: Union[float, List[float]] = 1.0,
|
||||
cross_attention_kwargs: Optional[dict[str, Any]] = None,
|
||||
controlnet_conditioning_scale: Union[float, list[float]] = 1.0,
|
||||
guess_mode: bool = False,
|
||||
):
|
||||
r"""
|
||||
@ -215,9 +215,9 @@ class LightControlNetPipeline(StableDiffusionControlNetPipeline):
|
||||
The number of denoising steps. More denoising steps usually lead to a higher quality image at the
|
||||
expense of slower inference.
|
||||
guidance_scale (`float`, *optional*, defaults to 7.5):
|
||||
Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
|
||||
Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://huggingface.co/papers/2207.12598).
|
||||
`guidance_scale` is defined as `w` of equation 2. of [Imagen
|
||||
Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
|
||||
Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting `guidance_scale >
|
||||
1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
|
||||
usually at the expense of lower image quality.
|
||||
negative_prompt (`str` or `List[str]`, *optional*):
|
||||
@ -227,7 +227,7 @@ class LightControlNetPipeline(StableDiffusionControlNetPipeline):
|
||||
num_images_per_prompt (`int`, *optional*, defaults to 1):
|
||||
The number of images to generate per prompt.
|
||||
eta (`float`, *optional*, defaults to 0.0):
|
||||
Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
|
||||
Corresponds to parameter eta (η) in the DDIM paper: https://huggingface.co/papers/2010.02502. Only applies to
|
||||
[`schedulers.DDIMScheduler`], will be ignored for others.
|
||||
generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
|
||||
One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
|
||||
@ -298,11 +298,11 @@ class LightControlNetPipeline(StableDiffusionControlNetPipeline):
|
||||
|
||||
device = self._execution_device
|
||||
# here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
|
||||
# of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
|
||||
# of the Imagen paper: https://huggingface.co/papers/2205.11487 . `guidance_scale = 1`
|
||||
# corresponds to doing no classifier free guidance.
|
||||
do_classifier_free_guidance = guidance_scale > 1.0
|
||||
|
||||
controlnet = self.controlnet._orig_mod if is_compiled_module(self.controlnet) else self.controlnet
|
||||
controlnet = self.controlnet._orig_mod if hasattr(self.controlnet, "_orig_mod") else self.controlnet
|
||||
|
||||
if isinstance(controlnet, MultiControlNetModel) and isinstance(controlnet_conditioning_scale, float):
|
||||
controlnet_conditioning_scale = [controlnet_conditioning_scale] * len(controlnet.nets)
|
||||
@ -426,7 +426,10 @@ class LightControlNetPipeline(StableDiffusionControlNetPipeline):
|
||||
if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
|
||||
self.unet.to("cpu")
|
||||
self.controlnet.to("cpu")
|
||||
torch.cuda.empty_cache()
|
||||
if torch.cuda.is_available():
|
||||
torch.cuda.empty_cache()
|
||||
elif torch.xpu.is_available():
|
||||
torch.xpu.empty_cache()
|
||||
|
||||
if not output_type == "latent":
|
||||
image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
|
||||
|
@ -13,10 +13,12 @@ def b2mb(x):
|
||||
# This context manager is used to track the peak memory usage of the process
|
||||
class TorchTracemalloc:
|
||||
def __enter__(self):
|
||||
self.device_type = torch.accelerator.current_accelerator().type if hasattr(torch, "accelerator") else "cuda"
|
||||
self.device_module = getattr(torch, self.device_type, torch.cuda)
|
||||
gc.collect()
|
||||
torch.cuda.empty_cache()
|
||||
torch.cuda.reset_max_memory_allocated() # reset the peak gauge to zero
|
||||
self.begin = torch.cuda.memory_allocated()
|
||||
self.device_module.empty_cache()
|
||||
self.device_module.reset_peak_memory_stats() # reset the peak gauge to zero
|
||||
self.begin = self.device_module.memory_allocated()
|
||||
self.process = psutil.Process()
|
||||
|
||||
self.cpu_begin = self.cpu_mem_used()
|
||||
@ -46,9 +48,9 @@ class TorchTracemalloc:
|
||||
self.peak_monitoring = False
|
||||
|
||||
gc.collect()
|
||||
torch.cuda.empty_cache()
|
||||
self.end = torch.cuda.memory_allocated()
|
||||
self.peak = torch.cuda.max_memory_allocated()
|
||||
self.device_module.empty_cache()
|
||||
self.end = self.device_module.memory_allocated()
|
||||
self.peak = self.device_module.max_memory_allocated()
|
||||
self.used = b2mb(self.end - self.begin)
|
||||
self.peaked = b2mb(self.peak - self.begin)
|
||||
|
||||
|
@ -13,7 +13,7 @@
|
||||
# limitations under the License.
|
||||
|
||||
from dataclasses import dataclass
|
||||
from typing import Any, Dict, Optional, Tuple, Union
|
||||
from typing import Any, Optional, Union
|
||||
|
||||
import torch
|
||||
from diffusers.models import UNet2DConditionModel
|
||||
@ -44,13 +44,13 @@ class UNet2DConditionNewModel(UNet2DConditionModel):
|
||||
class_labels: Optional[torch.Tensor] = None,
|
||||
timestep_cond: Optional[torch.Tensor] = None,
|
||||
attention_mask: Optional[torch.Tensor] = None,
|
||||
cross_attention_kwargs: Optional[Dict[str, Any]] = None,
|
||||
added_cond_kwargs: Optional[Dict[str, torch.Tensor]] = None,
|
||||
down_block_additional_residuals: Optional[Tuple[torch.Tensor]] = None,
|
||||
cross_attention_kwargs: Optional[dict[str, Any]] = None,
|
||||
added_cond_kwargs: Optional[dict[str, torch.Tensor]] = None,
|
||||
down_block_additional_residuals: Optional[tuple[torch.Tensor]] = None,
|
||||
mid_block_additional_residual: Optional[torch.Tensor] = None,
|
||||
encoder_attention_mask: Optional[torch.Tensor] = None,
|
||||
return_dict: bool = True,
|
||||
) -> Union[UNet2DConditionOutput, Tuple]:
|
||||
) -> Union[UNet2DConditionOutput, tuple]:
|
||||
r"""
|
||||
Args:
|
||||
sample (`torch.FloatTensor`): (batch, channel, height, width) noisy inputs tensor
|
||||
|
@ -18,9 +18,9 @@ rendered properly in your Markdown viewer.
|
||||
|
||||
This guide demonstrates how to use BOFT, an orthogonal fine-tuning method, to fine-tune Dreambooth with either `stabilityai/stable-diffusion-2-1` or `runwayml/stable-diffusion-v1-5` model.
|
||||
|
||||
By using BOFT from 🤗 PEFT, we can significantly reduce the number of trainable parameters while still achieving impressive results in various fine-tuning tasks across different foundation models. BOFT enhances model efficiency by integrating full-rank orthogonal matrices with a butterfly structure into specific model blocks, such as attention blocks, mirroring the approach used in LoRA. During fine-tuning, only these inserted matrices are trained, leaving the original model parameters untouched. During inference, the trainable BOFT paramteres can be merged into the original model, eliminating any additional computational costs.
|
||||
By using BOFT from 🤗 PEFT, we can significantly reduce the number of trainable parameters while still achieving impressive results in various fine-tuning tasks across different foundation models. BOFT enhances model efficiency by integrating full-rank orthogonal matrices with a butterfly structure into specific model blocks, such as attention blocks, mirroring the approach used in LoRA. During fine-tuning, only these inserted matrices are trained, leaving the original model parameters untouched. During inference, the trainable BOFT parameters can be merged into the original model, eliminating any additional computational costs.
|
||||
|
||||
As a member of the **orthogonal finetuning** class, BOFT presents a systematic and principled method for fine-tuning. It possesses several unique properties and has demonstrated superior performance compared to LoRA in a variety of scenarios. For further details on BOFT, please consult the [PEFT's GitHub repo's concept guide OFT](https://https://huggingface.co/docs/peft/index), the [original BOFT paper](https://arxiv.org/abs/2311.06243) and the [original OFT paper](https://arxiv.org/abs/2306.07280).
|
||||
As a member of the **orthogonal finetuning** class, BOFT presents a systematic and principled method for fine-tuning. It possesses several unique properties and has demonstrated superior performance compared to LoRA in a variety of scenarios. For further details on BOFT, please consult the [concept guide on OFT in the PEFT docs](https://huggingface.co/docs/peft/index), the [original BOFT paper](https://huggingface.co/papers/2311.06243) and the [original OFT paper](https://huggingface.co/papers/2306.07280).
|
||||
|
||||
In this guide we provide a Dreambooth fine-tuning script that is available in [PEFT's GitHub repo examples](https://github.com/huggingface/peft/tree/main/examples/boft_dreambooth). This implementation is adapted from [peft's lora_dreambooth](https://github.com/huggingface/peft/tree/main/examples/lora_dreambooth). You can try it out and finetune on your custom images.
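To make the merging claim above concrete, here is a hedged sketch of how a trained BOFT Dreambooth adapter could be folded back into the UNet at inference time; the adapter path and prompt are placeholders, not outputs produced by this guide:

```python
import torch
from diffusers import StableDiffusionPipeline
from peft import PeftModel

pipe = StableDiffusionPipeline.from_pretrained(
    "stabilityai/stable-diffusion-2-1", torch_dtype=torch.float16
)
# Placeholder path: point this at the UNet adapter directory written by the training script.
pipe.unet = PeftModel.from_pretrained(pipe.unet, "path/to/boft-dreambooth-output/unet")
pipe.unet = pipe.unet.merge_and_unload()  # fold the BOFT matrices into the base weights
pipe = pipe.to("cuda")  # or "xpu"/"cpu", depending on your setup
image = pipe("a photo of sks dog in a bucket").images[0]  # illustrative DreamBooth prompt
```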
|
||||
|
||||
@ -40,6 +40,7 @@ cd peft/examples/boft_dreambooth
|
||||
|
||||
Set up your environment: install PEFT, and all the required libraries. At the time of writing this guide we recommend installing PEFT from source. The following environment setup should work on A100 and H100:
|
||||
|
||||
### CUDA
|
||||
```bash
|
||||
conda create --name peft python=3.10
|
||||
conda activate peft
|
||||
@ -48,6 +49,16 @@ conda install xformers -c xformers
|
||||
pip install -r requirements.txt
|
||||
pip install git+https://github.com/huggingface/peft
|
||||
```
|
||||
The following environment setup is validated to work on Intel XPU:
|
||||
|
||||
### Intel XPU
|
||||
```bash
|
||||
conda create --name peft python=3.10
|
||||
conda activate peft
|
||||
pip install torch==2.8.0.dev20250615+xpu torchvision==0.23.0.dev20250615+xpu torchaudio==2.8.0.dev20250615+xpu --index-url https://download.pytorch.org/whl/nightly/xpu --no-cache-dir
|
||||
pip install -r requirements.txt
|
||||
pip install git+https://github.com/huggingface/peft
|
||||
```
|
||||
|
||||
## Download the data
|
||||
|
||||
@ -92,10 +103,10 @@ To learn more about DreamBooth fine-tuning with prior-preserving loss, check out
|
||||
Launch the training script with `accelerate` and pass hyperparameters, as well as LoRa-specific arguments to it such as:
|
||||
|
||||
- `use_boft`: Enables BOFT in the training script.
|
||||
- `boft_block_size`: the BOFT matrix block size across different layers, expressed in `int`. Smaller block size results in sparser update matrices with fewer trainable paramters. **Note**, please choose it to be dividable to most layer `in_features` dimension, e.g., 4, 8, 16. Also, you can only specify either `boft_block_size` or `boft_block_num`, but not both simultaneously, because `boft_block_size` x `boft_block_num` = layer dimension.
|
||||
- `boft_block_num`: the number of BOFT matrix blocks across different layers, expressed in `int`. Fewer blocks result in sparser update matrices with fewer trainable paramters. **Note**, please choose it to be dividable to most layer `in_features` dimension, e.g., 4, 8, 16. Also, you can only specify either `boft_block_size` or `boft_block_num`, but not both simultaneously, because `boft_block_size` x `boft_block_num` = layer dimension.
|
||||
- `boft_n_butterfly_factor`: the number of butterfly factors. **Note**, for `boft_n_butterfly_factor=1`, BOFT is the same as vanilla OFT, for `boft_n_butterfly_factor=2`, the effective block size of OFT becomes twice as big and the number of blocks become half.
|
||||
- `bias`: specify if the `bias` paramteres should be traind. Can be `none`, `all` or `boft_only`.
|
||||
- `boft_block_size`: the BOFT matrix block size across different layers, expressed in `int`. A smaller block size results in sparser update matrices with fewer trainable parameters. **Note**: please choose a value that divides most layers' `in_features` dimension, e.g., 4, 8, 16. Also, you can only specify either `boft_block_size` or `boft_block_num`, but not both simultaneously, because `boft_block_size` x `boft_block_num` = layer dimension.
|
||||
- `boft_block_num`: the number of BOFT matrix blocks across different layers, expressed in `int`. Fewer blocks result in sparser update matrices with fewer trainable parameters. **Note**: please choose a value that divides most layers' `in_features` dimension, e.g., 4, 8, 16. Also, you can only specify either `boft_block_size` or `boft_block_num`, but not both simultaneously, because `boft_block_size` x `boft_block_num` = layer dimension.
|
||||
- `boft_n_butterfly_factor`: the number of butterfly factors. **Note**, for `boft_n_butterfly_factor=1`, BOFT is the same as vanilla OFT, for `boft_n_butterfly_factor=2`, the effective block size of OFT becomes twice as big and the number of blocks becomes half.
|
||||
- `bias`: specify if the `bias` parameters should be trained. Can be `none`, `all` or `boft_only`.
|
||||
- `boft_dropout`: specify the probability of multiplicative dropout.
|
||||
|
||||
Here's what the full set of script arguments may look like:
|
||||
|
@ -44,8 +44,10 @@
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def get_boft_sd_pipeline(\n",
|
||||
" ckpt_dir, base_model_name_or_path=None, epoch=int, dtype=torch.float32, device=\"cuda\", adapter_name=\"default\"\n",
|
||||
" ckpt_dir, base_model_name_or_path=None, epoch=int, dtype=torch.float32, device=\"auto\", adapter_name=\"default\"\n",
|
||||
"):\n",
|
||||
" if device == \"auto\":\n",
|
||||
" device = torch.accelerator.current_accelerator().type if hasattr(torch, \"accelerator\") else \"cuda\"\n",
|
||||
"\n",
|
||||
" if base_model_name_or_path is None:\n",
|
||||
" raise ValueError(\"Please specify the base model name or path\")\n",
|
||||
@ -152,14 +154,6 @@
|
||||
"image = pipe(prompt, num_inference_steps=50, guidance_scale=7, negative_prompt=negative_prompt).images[0]\n",
|
||||
"image"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "f534eca2-94a4-432b-b092-7149ac44b12f",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
|
@ -1,13 +1,13 @@
|
||||
transformers==4.36.2
|
||||
accelerate==0.25.0
|
||||
transformers==4.54.0
|
||||
accelerate==1.9.0
|
||||
evaluate
|
||||
tqdm
|
||||
datasets==2.16.1
|
||||
diffusers==0.17.1
|
||||
datasets==4.0.0
|
||||
diffusers==0.34.0
|
||||
Pillow
|
||||
huggingface_hub
|
||||
safetensors
|
||||
nb_conda_kernels
|
||||
ipykernel
|
||||
ipywidgets
|
||||
wandb==0.16.1
|
||||
wandb==0.21.0
|
||||
|
@ -14,7 +14,7 @@
|
||||
# limitations under the License.
|
||||
|
||||
# The implementation is based on "Parameter-Efficient Orthogonal Finetuning
|
||||
# via Butterfly Factorization" (https://arxiv.org/abs/2311.06243) in ICLR 2024.
|
||||
# via Butterfly Factorization" (https://huggingface.co/papers/2311.06243) in ICLR 2024.
|
||||
|
||||
import hashlib
|
||||
import itertools
|
||||
@ -139,7 +139,7 @@ def main(args):
|
||||
cur_class_images = len(list(class_images_dir.iterdir()))
|
||||
|
||||
if cur_class_images < args.num_class_images:
|
||||
torch_dtype = torch.float16 if accelerator.device.type == "cuda" else torch.float32
|
||||
torch_dtype = torch.float16 if accelerator.device.type in ["cuda", "xpu"] else torch.float32
|
||||
if args.prior_generation_precision == "fp32":
|
||||
torch_dtype = torch.float32
|
||||
elif args.prior_generation_precision == "fp16":
|
||||
@ -176,6 +176,8 @@ def main(args):
|
||||
del pipeline
|
||||
if torch.cuda.is_available():
|
||||
torch.cuda.empty_cache()
|
||||
elif torch.xpu.is_available():
|
||||
torch.xpu.empty_cache()
|
||||
|
||||
# Handle the repository creation
|
||||
if accelerator.is_main_process:
|
||||
@ -263,7 +265,9 @@ def main(args):
|
||||
text_encoder.to(accelerator.device, dtype=weight_dtype)
|
||||
|
||||
if args.enable_xformers_memory_efficient_attention:
|
||||
if is_xformers_available():
|
||||
if accelerator.device.type == "xpu":
|
||||
logger.warn("XPU hasn't support xformers yet, ignore it.")
|
||||
elif is_xformers_available():
|
||||
unet.enable_xformers_memory_efficient_attention()
|
||||
else:
|
||||
raise ValueError("xformers is not available. Make sure it is installed correctly")
|
||||
@ -276,7 +280,7 @@ def main(args):
|
||||
|
||||
# Enable TF32 for faster training on Ampere GPUs,
|
||||
# cf https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices
|
||||
if args.allow_tf32:
|
||||
if args.allow_tf32 and torch.cuda.is_available():
|
||||
torch.backends.cuda.matmul.allow_tf32 = True
|
||||
|
||||
if args.scale_lr:
|
||||
@ -581,18 +585,27 @@ def main(args):
|
||||
)
|
||||
|
||||
del pipeline
|
||||
torch.cuda.empty_cache()
|
||||
if torch.cuda.is_available():
|
||||
torch.cuda.empty_cache()
|
||||
elif torch.xpu.is_available():
|
||||
torch.xpu.empty_cache()
|
||||
|
||||
if global_step >= args.max_train_steps:
|
||||
break
|
||||
|
||||
# Printing the GPU memory usage details such as allocated memory, peak memory, and total memory usage
|
||||
# Printing the accelerator memory usage details such as allocated memory, peak memory, and total memory usage
|
||||
if not args.no_tracemalloc:
|
||||
accelerator.print(f"GPU Memory before entering the train : {b2mb(tracemalloc.begin)}")
|
||||
accelerator.print(f"GPU Memory consumed at the end of the train (end-begin): {tracemalloc.used}")
|
||||
accelerator.print(f"GPU Peak Memory consumed during the train (max-begin): {tracemalloc.peaked}")
|
||||
accelerator.print(
|
||||
f"GPU Total Peak Memory consumed during the train (max): {tracemalloc.peaked + b2mb(tracemalloc.begin)}"
|
||||
f"{accelerator.device.type.upper()} Memory before entering the train : {b2mb(tracemalloc.begin)}"
|
||||
)
|
||||
accelerator.print(
|
||||
f"{accelerator.device.type.upper()} Memory consumed at the end of the train (end-begin): {tracemalloc.used}"
|
||||
)
|
||||
accelerator.print(
|
||||
f"{accelerator.device.type.upper()} Peak Memory consumed during the train (max-begin): {tracemalloc.peaked}"
|
||||
)
|
||||
accelerator.print(
|
||||
f"{accelerator.device.type.upper()} Total Peak Memory consumed during the train (max): {tracemalloc.peaked + b2mb(tracemalloc.begin)}"
|
||||
)
|
||||
|
||||
accelerator.print(f"CPU Memory before entering the train : {b2mb(tracemalloc.cpu_begin)}")
|
||||
|
@ -13,10 +13,12 @@ def b2mb(x):
|
||||
# This context manager is used to track the peak memory usage of the process
|
||||
class TorchTracemalloc:
|
||||
def __enter__(self):
|
||||
self.device_type = torch.accelerator.current_accelerator().type if hasattr(torch, "accelerator") else "cuda"
|
||||
self.device_module = getattr(torch, self.device_type, torch.cuda)
|
||||
gc.collect()
|
||||
torch.cuda.empty_cache()
|
||||
torch.cuda.reset_max_memory_allocated() # reset the peak gauge to zero
|
||||
self.begin = torch.cuda.memory_allocated()
|
||||
self.device_module.empty_cache()
|
||||
self.device_module.reset_peak_memory_stats() # reset the peak gauge to zero
|
||||
self.begin = self.device_module.memory_allocated()
|
||||
self.process = psutil.Process()
|
||||
|
||||
self.cpu_begin = self.cpu_mem_used()
|
||||
@ -46,9 +48,9 @@ class TorchTracemalloc:
|
||||
self.peak_monitoring = False
|
||||
|
||||
gc.collect()
|
||||
torch.cuda.empty_cache()
|
||||
self.end = torch.cuda.memory_allocated()
|
||||
self.peak = torch.cuda.max_memory_allocated()
|
||||
self.device_module.empty_cache()
|
||||
self.end = self.device_module.memory_allocated()
|
||||
self.peak = self.device_module.max_memory_allocated()
|
||||
self.used = b2mb(self.end - self.begin)
|
||||
self.peaked = b2mb(self.peak - self.begin)
|
||||
|
||||
|
96
examples/bone_finetuning/README.md
Normal file
@ -0,0 +1,96 @@
|
||||
# DiSHA: Dimension-Sharding Adaptation with Fast Convergence and Fast Computation
|
||||
## Introduction ([Paper](https://huggingface.co/papers/2409.15371), [code](https://github.com/JL-er/DiSHA))
|
||||
Low-Rank Adaptation (LoRA) leverages the low intrinsic rank of weight updates in Large Language Models (LLMs), establishing a Parameter-Efficient Fine-Tuning (PEFT) paradigm. However, LoRA suffers from slow convergence. We introduce Dimension-Sharding Adaptation (DiSHA), which expands the PEFT design space to unlock lower intrinsic ranks and faster convergence by default. Within DiSHA's design space, we propose Block Affine Adaptation (Bone), a computationally efficient structure that delivers both high performance and efficiency. While certain DiSHA configurations may result in colinear updates to weight shards, we address this with Block Affine Transformation Adaptation (BAT), a nonlinear variant of DiSHA. BAT introduces nonlinearity by combining trainable matrices with original weight shards in a nonlinear manner, inducing nonlinearity in matrix updates without introducing additional parameters. Empirical results show that Bone, under the DiSHA framework, consistently outperforms LoRA variants in both NLG and NLU tasks, with significantly improved computational efficiency. Further analysis demonstrates that BAT enhances model capabilities by leveraging its nonlinear design.
|
||||
|
||||
|
||||
## Quick Start
|
||||
```python
|
||||
import torch
|
||||
from peft import BoneConfig, get_peft_model
|
||||
from transformers import AutoTokenizer, AutoModelForCausalLM
|
||||
from trl import SFTConfig, SFTTrainer
|
||||
from datasets import load_dataset
|
||||
|
||||
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf", torch_dtype=torch.bfloat16, device_map="auto")
|
||||
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
|
||||
tokenizer.pad_token_id = tokenizer.eos_token_id
|
||||
bone_config = BoneConfig(
|
||||
r = 64
|
||||
)
|
||||
#Bat performs better than Bone, but it uses more memory and is twice as slow. If you want to use the Bat method, you only need to add the parameter init_weights="bat".
|
||||
# bone_config = BoneConfig(
|
||||
# r = 64,
|
||||
# init_weights="bat"
|
||||
# )
|
||||
peft_model = get_peft_model(model, bone_config)
|
||||
|
||||
peft_model.print_trainable_parameters()
|
||||
|
||||
dataset = load_dataset("imdb", split="train[:1%]")
|
||||
|
||||
training_args = SFTConfig(dataset_text_field="text", max_seq_length=128)
|
||||
trainer = SFTTrainer(
|
||||
model=peft_model,
|
||||
args=training_args,
|
||||
train_dataset=dataset,
|
||||
processing_class=tokenizer,
|
||||
)
|
||||
trainer.train()
|
||||
peft_model.save_pretrained("bone-llama-2-7b")
|
||||
```
|
||||
|
||||
|
||||
To utilize the fine-tuned Bone modules, simply run the following command:
|
||||
```python
|
||||
import torch
|
||||
from peft import PeftModel
|
||||
from transformers import AutoModelForCausalLM
|
||||
|
||||
model = AutoModelForCausalLM.from_pretrained(
|
||||
"meta-llama/Llama-2-7b-hf", torch_dtype=torch.bfloat16, device_map="auto"
|
||||
)
|
||||
peft_model = PeftModel.from_pretrained(model, "bone-llama-2-7b")
|
||||
```
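If you prefer a standalone checkpoint without the adapter indirection, the Bone weights can also be merged back into the base model (the training script below exposes the same idea through its `merge_and_save` option); a minimal sketch continuing from the snippet above:

```python
merged_model = peft_model.merge_and_unload()  # fold the Bone updates into the base weights
merged_model.save_pretrained("bone-llama-2-7b-merged")
```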
|
||||
|
||||
## Advanced Usage
|
||||
|
||||
### Fine-tune
|
||||
```shell
|
||||
#Bat performs better than Bone, but it uses more memory and is twice as slow. If you want to use the Bat method, you only need to add the parameter init_weights="bat".
|
||||
python bone_finetuning.py \
|
||||
--base_model_name_or_path meta-llama/Llama-2-7b-hf \
|
||||
--output_dir output/bone-llama-2-7b-metamath-10k \
|
||||
--bone_r 64 \
|
||||
--init_weights True \
|
||||
--bits bf16 \
|
||||
--data_path meta-math/MetaMathQA \
|
||||
--dataset_split train[:100000] \
|
||||
--dataset_field query response \
|
||||
--bf16 True \
|
||||
--num_train_epochs 1 \
|
||||
--per_device_train_batch_size 2 \
|
||||
--gradient_accumulation_steps 8 \
|
||||
--save_strategy "steps" \
|
||||
--save_steps 1000 \
|
||||
--save_total_limit 1 \
|
||||
--logging_steps 1 \
|
||||
--learning_rate 2e-5 \
|
||||
--weight_decay 0. \
|
||||
--warmup_ratio 0.03 \
|
||||
--tf32 True \
|
||||
--report_to none
|
||||
```
|
||||
|
||||
|
||||
|
||||
# Citation
|
||||
```bib
|
||||
@misc{kang2025dishadimensionshardingadaptationlarge,
|
||||
title={DiSHA: Dimension-Sharding Adaptation of Large Language Models with Fast Convergence and Fast Computation},
|
||||
author={Jiale Kang},
|
||||
year={2025},
|
||||
eprint={2409.15371},
|
||||
archivePrefix={arXiv},
|
||||
primaryClass={cs.CL},
|
||||
url={https://huggingface.co/papers/2409.15371},
|
||||
}
|
105
examples/bone_finetuning/bone_finetuning.py
Normal file
@ -0,0 +1,105 @@
|
||||
# Copyright 2023-present the HuggingFace Inc. team.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import os
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Literal, Optional
|
||||
|
||||
import torch
|
||||
from datasets import load_dataset
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer, HfArgumentParser
|
||||
from trl import SFTConfig, SFTTrainer
|
||||
|
||||
from peft import BoneConfig, get_peft_model
|
||||
|
||||
|
||||
@dataclass
|
||||
class ScriptArguments(SFTConfig):
|
||||
# model configs
|
||||
base_model_name_or_path: Optional[str] = field(
|
||||
default=None, metadata={"help": "The name or path of the fp32/16 base model."}
|
||||
)
|
||||
bits: str = field(default="bf16", metadata={"help": "(`['bf16', 'fp16', fp32]`)"})
|
||||
init_weights: Literal[True, "bat"] = field(
|
||||
default=True,
|
||||
metadata={
|
||||
"help": ("True -> Bone; `bat` -> Bat"),
|
||||
},
|
||||
)
|
||||
bone_r: int = field(default=16)
|
||||
merge_and_save: bool = field(default=False)
|
||||
# dataset configs
|
||||
data_path: str = field(default="imdb", metadata={"help": "Path to the training data."})
|
||||
dataset_split: str = field(default="train[:1%]", metadata={"help": "(`['train', 'test', 'eval']`):"})
|
||||
dataset_field: list[str] = field(default=None, metadata={"help": "Fields of dataset input and output."})
|
||||
|
||||
|
||||
parser = HfArgumentParser(ScriptArguments)
|
||||
script_args = parser.parse_args_into_dataclasses()[0]
|
||||
print(script_args)
|
||||
|
||||
print(f"Load pre-processed residual model in {script_args.bits} bits.")
|
||||
if script_args.bits in ["nf4", "fp4", "int8"]:
|
||||
print("Bone currently does not support quantization.")
|
||||
|
||||
elif script_args.base_model_name_or_path is not None:
|
||||
print(f"No available pre-processed model, manually initialize a Bone using {script_args.base_model_name_or_path}.")
|
||||
model = AutoModelForCausalLM.from_pretrained(
|
||||
script_args.base_model_name_or_path,
|
||||
torch_dtype=(
|
||||
torch.float16
|
||||
if script_args.bits == "fp16"
|
||||
else (torch.bfloat16 if script_args.bits == "bf16" else torch.float32)
|
||||
),
|
||||
device_map="auto",
|
||||
)
|
||||
tokenizer = AutoTokenizer.from_pretrained(script_args.base_model_name_or_path)
|
||||
tokenizer.pad_token_id = tokenizer.eos_token_id
|
||||
bone_config = BoneConfig(
|
||||
r=script_args.bone_r,
|
||||
target_modules=["q_proj", "o_proj", "k_proj", "v_proj", "gate_proj", "up_proj", "down_proj"],
|
||||
bias="none",
|
||||
task_type="CAUSAL_LM",
|
||||
init_weights=script_args.init_weights,
|
||||
)
|
||||
peft_model = get_peft_model(model, bone_config)
|
||||
|
||||
print(peft_model)
|
||||
peft_model.print_trainable_parameters()
|
||||
|
||||
print(f"Training Bone with trl on the {script_args.data_path}[{script_args.dataset_split}] dataset.")
|
||||
dataset = load_dataset(script_args.data_path, split=script_args.dataset_split)
|
||||
dataset = dataset.map(
|
||||
lambda example: {
|
||||
"text": f"### USER: {example[script_args.dataset_field[0]]}\n### ASSISTANT: {example[script_args.dataset_field[1]]}"
|
||||
}
|
||||
)
|
||||
|
||||
trainer = SFTTrainer(
|
||||
model=peft_model,
|
||||
args=script_args,
|
||||
train_dataset=dataset,
|
||||
processing_class=tokenizer,
|
||||
)
|
||||
trainer.train()
|
||||
trainer.save_state()
|
||||
|
||||
peft_model.save_pretrained(
|
||||
os.path.join(script_args.output_dir, "bone_ft"),
|
||||
)
|
||||
|
||||
if script_args.merge_and_save:
|
||||
model = peft_model.merge_and_unload()
|
||||
model.save_pretrained(os.path.join(script_args.output_dir, "bone_merged"))
|
||||
tokenizer.save_pretrained(os.path.join(script_args.output_dir, "bone_merged"))
|
@ -2,7 +2,7 @@
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"execution_count": null,
|
||||
"id": "71fbfca2",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
@ -16,10 +16,9 @@
|
||||
"from torch.utils.data import DataLoader\n",
|
||||
"from transformers import default_data_collator, get_linear_schedule_with_warmup\n",
|
||||
"from tqdm import tqdm\n",
|
||||
"from datasets import load_dataset\n",
|
||||
"\n",
|
||||
"# Hyper-parameters\n",
|
||||
"device = \"cuda\"\n",
|
||||
"device = torch.accelerator.current_accelerator().type if hasattr(torch, \"accelerator\") else \"cuda\"\n",
|
||||
"model_name_or_path = \"bigscience/bloomz-560m\"\n",
|
||||
"tokenizer_name_or_path = \"bigscience/bloomz-560m\"\n",
|
||||
"peft_config = LNTuningConfig(\n",
|
||||
@ -48,7 +47,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"execution_count": null,
|
||||
"id": "e1a3648b",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
@ -84,9 +83,13 @@
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from datasets import load_dataset\n",
|
||||
"\n",
|
||||
"dataset = load_dataset(\"ought/raft\", dataset_name)\n",
|
||||
"dataset = load_dataset(\n",
|
||||
" \"parquet\",\n",
|
||||
" data_files={\n",
|
||||
" \"train\": f\"hf://datasets/ought/raft@refs/convert/parquet/{dataset_name}/train/0000.parquet\",\n",
|
||||
" \"test\": f\"hf://datasets/ought/raft@refs/convert/parquet/{dataset_name}/test/0000.parquet\"\n",
|
||||
" }\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"classes = [k.replace(\"_\", \" \") for k in dataset[\"train\"].features[\"Label\"].names]\n",
|
||||
"print(classes)\n",
|
||||
|
@ -1,481 +0,0 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"id": "71fbfca2",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"\n",
|
||||
"===================================BUG REPORT===================================\n",
|
||||
"Welcome to bitsandbytes. For bug reports, please submit your error trace to: https://github.com/TimDettmers/bitsandbytes/issues\n",
|
||||
"For effortless bug reporting copy-paste your error into this form: https://docs.google.com/forms/d/e/1FAIpQLScPB8emS3Thkp66nvqwmjTEgxp8Y9ufuWTzFyr9kJ5AoI47dQ/viewform?usp=sf_link\n",
|
||||
"================================================================================\n",
|
||||
"CUDA SETUP: CUDA runtime path found: /home/sourab/miniconda3/envs/ml/lib/libcudart.so\n",
|
||||
"CUDA SETUP: Highest compute capability among GPUs detected: 7.5\n",
|
||||
"CUDA SETUP: Detected CUDA version 117\n",
|
||||
"CUDA SETUP: Loading binary /home/sourab/miniconda3/envs/ml/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda117.so...\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from transformers import AutoModelForCausalLM\n",
|
||||
"from peft import PeftModel, PeftConfig\n",
|
||||
"import torch\n",
|
||||
"from datasets import load_dataset\n",
|
||||
"import os\n",
|
||||
"from transformers import AutoTokenizer\n",
|
||||
"from torch.utils.data import DataLoader\n",
|
||||
"from transformers import default_data_collator, get_linear_schedule_with_warmup\n",
|
||||
"from tqdm import tqdm\n",
|
||||
"from datasets import load_dataset\n",
|
||||
"\n",
|
||||
"device = \"cuda\"\n",
|
||||
"model_name_or_path = \"bigscience/bloomz-7b1\"\n",
|
||||
"tokenizer_name_or_path = \"bigscience/bloomz-7b1\"\n",
|
||||
"dataset_name = \"twitter_complaints\"\n",
|
||||
"text_column = \"Tweet text\"\n",
|
||||
"label_column = \"text_label\"\n",
|
||||
"max_length = 64\n",
|
||||
"lr = 1e-3\n",
|
||||
"num_epochs = 50\n",
|
||||
"batch_size = 8"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "e1a3648b",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from datasets import load_dataset\n",
|
||||
"\n",
|
||||
"dataset = load_dataset(\"ought/raft\", dataset_name)\n",
|
||||
"\n",
|
||||
"classes = [k.replace(\"_\", \" \") for k in dataset[\"train\"].features[\"Label\"].names]\n",
|
||||
"print(classes)\n",
|
||||
"dataset = dataset.map(\n",
|
||||
" lambda x: {\"text_label\": [classes[label] for label in x[\"Label\"]]},\n",
|
||||
" batched=True,\n",
|
||||
" num_proc=1,\n",
|
||||
")\n",
|
||||
"print(dataset)\n",
|
||||
"dataset[\"train\"][0]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"id": "fe12d4d3",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"3\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"application/vnd.jupyter.widget-view+json": {
|
||||
"model_id": "10cabeec92ab428f9a660ebaecbaf865",
|
||||
"version_major": 2,
|
||||
"version_minor": 0
|
||||
},
|
||||
"text/plain": [
|
||||
"Running tokenizer on dataset: 0%| | 0/1 [00:00<?, ?ba/s]"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"application/vnd.jupyter.widget-view+json": {
|
||||
"model_id": "8a344e989ab34c71b230acee68b477e8",
|
||||
"version_major": 2,
|
||||
"version_minor": 0
|
||||
},
|
||||
"text/plain": [
|
||||
"Running tokenizer on dataset: 0%| | 0/4 [00:00<?, ?ba/s]"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# data preprocessing\n",
|
||||
"tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)\n",
|
||||
"if tokenizer.pad_token_id is None:\n",
|
||||
" tokenizer.pad_token_id = tokenizer.eos_token_id\n",
|
||||
"target_max_length = max([len(tokenizer(class_label)[\"input_ids\"]) for class_label in classes])\n",
|
||||
"print(target_max_length)\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"def preprocess_function(examples):\n",
|
||||
" batch_size = len(examples[text_column])\n",
|
||||
" inputs = [f\"{text_column} : {x} Label : \" for x in examples[text_column]]\n",
|
||||
" targets = [str(x) for x in examples[label_column]]\n",
|
||||
" model_inputs = tokenizer(inputs)\n",
|
||||
" labels = tokenizer(targets, add_special_tokens=False) # don't add bos token because we concatenate with inputs\n",
|
||||
" for i in range(batch_size):\n",
|
||||
" sample_input_ids = model_inputs[\"input_ids\"][i]\n",
|
||||
" label_input_ids = labels[\"input_ids\"][i] + [tokenizer.eos_token_id]\n",
|
||||
" # print(i, sample_input_ids, label_input_ids)\n",
|
||||
" model_inputs[\"input_ids\"][i] = sample_input_ids + label_input_ids\n",
|
||||
" labels[\"input_ids\"][i] = [-100] * len(sample_input_ids) + label_input_ids\n",
|
||||
" model_inputs[\"attention_mask\"][i] = [1] * len(model_inputs[\"input_ids\"][i])\n",
|
||||
" # print(model_inputs)\n",
|
||||
" for i in range(batch_size):\n",
|
||||
" sample_input_ids = model_inputs[\"input_ids\"][i]\n",
|
||||
" label_input_ids = labels[\"input_ids\"][i]\n",
|
||||
" model_inputs[\"input_ids\"][i] = [tokenizer.pad_token_id] * (\n",
|
||||
" max_length - len(sample_input_ids)\n",
|
||||
" ) + sample_input_ids\n",
|
||||
" model_inputs[\"attention_mask\"][i] = [0] * (max_length - len(sample_input_ids)) + model_inputs[\n",
|
||||
" \"attention_mask\"\n",
|
||||
" ][i]\n",
|
||||
" labels[\"input_ids\"][i] = [-100] * (max_length - len(sample_input_ids)) + label_input_ids\n",
|
||||
" model_inputs[\"input_ids\"][i] = torch.tensor(model_inputs[\"input_ids\"][i][:max_length])\n",
|
||||
" model_inputs[\"attention_mask\"][i] = torch.tensor(model_inputs[\"attention_mask\"][i][:max_length])\n",
|
||||
" labels[\"input_ids\"][i] = torch.tensor(labels[\"input_ids\"][i][:max_length])\n",
|
||||
" model_inputs[\"labels\"] = labels[\"input_ids\"]\n",
|
||||
" return model_inputs\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"processed_datasets = dataset.map(\n",
|
||||
" preprocess_function,\n",
|
||||
" batched=True,\n",
|
||||
" num_proc=1,\n",
|
||||
" remove_columns=dataset[\"train\"].column_names,\n",
|
||||
" load_from_cache_file=False,\n",
|
||||
" desc=\"Running tokenizer on dataset\",\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"train_dataset = processed_datasets[\"train\"]\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"train_dataloader = DataLoader(\n",
|
||||
" train_dataset, shuffle=True, collate_fn=default_data_collator, batch_size=batch_size, pin_memory=True\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "2795b9d0",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def test_preprocess_function(examples):\n",
|
||||
" batch_size = len(examples[text_column])\n",
|
||||
" inputs = [f\"{text_column} : {x} Label : \" for x in examples[text_column]]\n",
|
||||
" model_inputs = tokenizer(inputs)\n",
|
||||
" # print(model_inputs)\n",
|
||||
" for i in range(batch_size):\n",
|
||||
" sample_input_ids = model_inputs[\"input_ids\"][i]\n",
|
||||
" model_inputs[\"input_ids\"][i] = [tokenizer.pad_token_id] * (\n",
|
||||
" max_length - len(sample_input_ids)\n",
|
||||
" ) + sample_input_ids\n",
|
||||
" model_inputs[\"attention_mask\"][i] = [0] * (max_length - len(sample_input_ids)) + model_inputs[\n",
|
||||
" \"attention_mask\"\n",
|
||||
" ][i]\n",
|
||||
" model_inputs[\"input_ids\"][i] = torch.tensor(model_inputs[\"input_ids\"][i][:max_length])\n",
|
||||
" model_inputs[\"attention_mask\"][i] = torch.tensor(model_inputs[\"attention_mask\"][i][:max_length])\n",
|
||||
" return model_inputs\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"processed_datasets = dataset.map(\n",
|
||||
" test_preprocess_function,\n",
|
||||
" batched=True,\n",
|
||||
" num_proc=1,\n",
|
||||
" remove_columns=dataset[\"train\"].column_names,\n",
|
||||
" load_from_cache_file=False,\n",
|
||||
" desc=\"Running tokenizer on dataset\",\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"eval_dataset = processed_datasets[\"train\"]\n",
|
||||
"test_dataset = processed_datasets[\"test\"]\n",
|
||||
"\n",
|
||||
"eval_dataloader = DataLoader(eval_dataset, collate_fn=default_data_collator, batch_size=batch_size, pin_memory=True)\n",
|
||||
"test_dataloader = DataLoader(test_dataset, collate_fn=default_data_collator, batch_size=batch_size, pin_memory=True)\n",
|
||||
"print(next(iter(eval_dataloader)))\n",
|
||||
"print(next(iter(test_dataloader)))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "42b14a11",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"You can load model from hub or local\n",
|
||||
"\n",
|
||||
"- Load model from Hugging Face Hub, you can change to your own model id\n",
|
||||
"```python\n",
|
||||
"peft_model_id = \"username/twitter_complaints_bigscience_bloomz-7b1_LORA_CAUSAL_LM\"\n",
|
||||
"```\n",
|
||||
"- Or load model form local\n",
|
||||
"```python\n",
|
||||
"peft_model_id = \"twitter_complaints_bigscience_bloomz-7b1_LORA_CAUSAL_LM\"\n",
|
||||
"```"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"id": "9caac014",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"/home/sourab/pet/src/peft/tuners/lora.py:143: UserWarning: fan_in_fan_out is set to True but the target module is not a Conv1D. Setting fan_in_fan_out to False.\n",
|
||||
" warnings.warn(\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"application/vnd.jupyter.widget-view+json": {
|
||||
"model_id": "bc38030106a14173a1363eb1ee388eda",
|
||||
"version_major": 2,
|
||||
"version_minor": 0
|
||||
},
|
||||
"text/plain": [
|
||||
"Downloading: 0%| | 0.00/15.8M [00:00<?, ?B/s]"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from peft import PeftModel, PeftConfig\n",
|
||||
"\n",
|
||||
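    "# cap per-device memory so that device_map=\"auto\" offloads the remaining transformer blocks to CPU\n",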
"max_memory = {0: \"1GIB\", 1: \"1GIB\", 2: \"2GIB\", 3: \"10GIB\", \"cpu\": \"30GB\"}\n",
|
||||
"peft_model_id = \"smangrul/twitter_complaints_bigscience_bloomz-7b1_LORA_CAUSAL_LM\"\n",
|
||||
"config = PeftConfig.from_pretrained(peft_model_id)\n",
|
||||
"model = AutoModelForCausalLM.from_pretrained(config.base_model_name_or_path, device_map=\"auto\", max_memory=max_memory)\n",
|
||||
"model = PeftModel.from_pretrained(model, peft_model_id, device_map=\"auto\", max_memory=max_memory)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 35,
|
||||
"id": "6fac10b5",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# model"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"id": "2a08ee6d",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"{'base_model.model.transformer.word_embeddings': 3,\n",
|
||||
" 'base_model.model.lm_head': 3,\n",
|
||||
" 'base_model.model.transformer.word_embeddings_layernorm': 3,\n",
|
||||
" 'base_model.model.transformer.h.0': 3,\n",
|
||||
" 'base_model.model.transformer.h.1': 3,\n",
|
||||
" 'base_model.model.transformer.h.2': 3,\n",
|
||||
" 'base_model.model.transformer.h.3': 3,\n",
|
||||
" 'base_model.model.transformer.h.4': 3,\n",
|
||||
" 'base_model.model.transformer.h.5': 3,\n",
|
||||
" 'base_model.model.transformer.h.6': 3,\n",
|
||||
" 'base_model.model.transformer.h.7': 3,\n",
|
||||
" 'base_model.model.transformer.h.8': 'cpu',\n",
|
||||
" 'base_model.model.transformer.h.9': 'cpu',\n",
|
||||
" 'base_model.model.transformer.h.10': 'cpu',\n",
|
||||
" 'base_model.model.transformer.h.11': 'cpu',\n",
|
||||
" 'base_model.model.transformer.h.12': 'cpu',\n",
|
||||
" 'base_model.model.transformer.h.13': 'cpu',\n",
|
||||
" 'base_model.model.transformer.h.14': 'cpu',\n",
|
||||
" 'base_model.model.transformer.h.15': 'cpu',\n",
|
||||
" 'base_model.model.transformer.h.16': 'cpu',\n",
|
||||
" 'base_model.model.transformer.h.17': 'cpu',\n",
|
||||
" 'base_model.model.transformer.h.18': 'cpu',\n",
|
||||
" 'base_model.model.transformer.h.19': 'cpu',\n",
|
||||
" 'base_model.model.transformer.h.20': 'cpu',\n",
|
||||
" 'base_model.model.transformer.h.21': 'cpu',\n",
|
||||
" 'base_model.model.transformer.h.22': 'cpu',\n",
|
||||
" 'base_model.model.transformer.h.23': 'cpu',\n",
|
||||
" 'base_model.model.transformer.h.24': 'cpu',\n",
|
||||
" 'base_model.model.transformer.h.25': 'cpu',\n",
|
||||
" 'base_model.model.transformer.h.26': 'cpu',\n",
|
||||
" 'base_model.model.transformer.h.27': 'cpu',\n",
|
||||
" 'base_model.model.transformer.h.28': 'cpu',\n",
|
||||
" 'base_model.model.transformer.h.29': 'cpu',\n",
|
||||
" 'base_model.model.transformer.ln_f': 'cpu'}"
|
||||
]
|
||||
},
|
||||
"execution_count": 7,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"model.hf_device_map"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 34,
|
||||
"id": "b33be5e6",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"@HondaCustSvc Your customer service has been horrible during the recall process. I will never purchase a Honda again.\n",
|
||||
"{'input_ids': tensor([[227985, 5484, 915, 2566, 216744, 38, 1316, 54, 42705,\n",
|
||||
" 32465, 52166, 9440, 1809, 3784, 88483, 9411, 368, 84342,\n",
|
||||
" 4451, 17, 473, 2152, 11705, 82406, 267, 51591, 5734,\n",
|
||||
" 17, 77658, 915, 210]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
|
||||
" 1, 1, 1, 1, 1, 1, 1]])}\n",
|
||||
"tensor([[227985, 5484, 915, 2566, 216744, 38, 1316, 54, 42705,\n",
|
||||
" 32465, 52166, 9440, 1809, 3784, 88483, 9411, 368, 84342,\n",
|
||||
" 4451, 17, 473, 2152, 11705, 82406, 267, 51591, 5734,\n",
|
||||
" 17, 77658, 915, 210, 16449, 5952, 3, 3, 3,\n",
|
||||
" 3, 3, 3, 3, 3]])\n",
|
||||
"['Tweet text : @HondaCustSvc Your customer service has been horrible during the recall process. I will never purchase a Honda again. Label : complaint']\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"model.eval()\n",
|
||||
"i = 89\n",
|
||||
"inputs = tokenizer(f'{text_column} : {dataset[\"test\"][i][\"Tweet text\"]} Label : ', return_tensors=\"pt\")\n",
|
||||
"print(dataset[\"test\"][i][\"Tweet text\"])\n",
|
||||
"print(inputs)\n",
|
||||
"\n",
|
||||
"with torch.no_grad():\n",
|
||||
" outputs = model.generate(input_ids=inputs[\"input_ids\"], max_new_tokens=10)\n",
|
||||
" print(outputs)\n",
|
||||
" print(tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"id": "b6d6cd5b",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [01:42<00:00, 14.70s/it]\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"model.eval()\n",
|
||||
"eval_preds = []\n",
|
||||
"for _, batch in enumerate(tqdm(eval_dataloader)):\n",
|
||||
" batch = {k: v for k, v in batch.items() if k != \"labels\"}\n",
|
||||
" with torch.no_grad():\n",
|
||||
" outputs = model.generate(**batch, max_new_tokens=10)\n",
|
||||
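    "        # inputs were left-padded to max_length, so tokens from index max_length onward are the generated continuation\n",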
" preds = outputs[:, max_length:].detach().cpu().numpy()\n",
|
||||
" eval_preds.extend(tokenizer.batch_decode(preds, skip_special_tokens=True))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 11,
|
||||
"id": "61264abe",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"accuracy=100.0\n",
|
||||
"eval_preds[:10]=['no complaint', 'no complaint', 'complaint', 'complaint', 'no complaint', 'no complaint', 'no complaint', 'complaint', 'complaint', 'no complaint']\n",
|
||||
"dataset['train'][label_column][:10]=['no complaint', 'no complaint', 'complaint', 'complaint', 'no complaint', 'no complaint', 'no complaint', 'complaint', 'complaint', 'no complaint']\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"correct = 0\n",
|
||||
"total = 0\n",
|
||||
"for pred, true in zip(eval_preds, dataset[\"train\"][label_column]):\n",
|
||||
" if pred.strip() == true.strip():\n",
|
||||
" correct += 1\n",
|
||||
" total += 1\n",
|
||||
"accuracy = correct / total * 100\n",
|
||||
"print(f\"{accuracy=}\")\n",
|
||||
"print(f\"{eval_preds[:10]=}\")\n",
|
||||
"print(f\"{dataset['train'][label_column][:10]=}\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "a70802a3",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"model.eval()\n",
|
||||
"test_preds = []\n",
|
||||
"\n",
|
||||
"for _, batch in enumerate(tqdm(test_dataloader)):\n",
|
||||
" batch = {k: v for k, v in batch.items() if k != \"labels\"}\n",
|
||||
" with torch.no_grad():\n",
|
||||
" outputs = model.generate(**batch, max_new_tokens=10)\n",
|
||||
" preds = outputs[:, max_length:].detach().cpu().numpy()\n",
|
||||
" test_preds.extend(tokenizer.batch_decode(preds, skip_special_tokens=True))\n",
|
||||
" if len(test_preds) > 100:\n",
|
||||
" break\n",
|
||||
"test_preds"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "e1c4ad9c",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.10.4"
|
||||
},
|
||||
"vscode": {
|
||||
"interpreter": {
|
||||
"hash": "aee8b7b246df8f9039afb4144a1f6fd8d2ca17a180786b69acc140d282b71a49"
|
||||
}
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
@ -61,9 +61,11 @@ def b2mb(x):
|
||||
class TorchTracemalloc:
|
||||
def __enter__(self):
|
||||
gc.collect()
|
||||
torch.cuda.empty_cache()
|
||||
torch.cuda.reset_max_memory_allocated() # reset the peak gauge to zero
|
||||
self.begin = torch.cuda.memory_allocated()
|
||||
self.device_type = torch.accelerator.current_accelerator().type if hasattr(torch, "accelerator") else "cuda"
|
||||
self.device_module = getattr(torch, self.device_type, torch.cuda)
|
||||
self.device_module.empty_cache()
|
||||
self.device_module.reset_peak_memory_stats() # reset the peak gauge to zero
|
||||
self.begin = self.device_module.memory_allocated()
|
||||
self.process = psutil.Process()
|
||||
|
||||
self.cpu_begin = self.cpu_mem_used()
|
||||
@ -93,9 +95,9 @@ class TorchTracemalloc:
|
||||
self.peak_monitoring = False
|
||||
|
||||
gc.collect()
|
||||
torch.cuda.empty_cache()
|
||||
self.end = torch.cuda.memory_allocated()
|
||||
self.peak = torch.cuda.max_memory_allocated()
|
||||
self.device_module.empty_cache()
|
||||
self.end = self.device_module.memory_allocated()
|
||||
self.peak = self.device_module.max_memory_allocated()
|
||||
self.used = b2mb(self.end - self.begin)
|
||||
self.peaked = b2mb(self.peak - self.begin)
|
||||
|
||||
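A note on the hunk above: the hard-coded `torch.cuda` calls are replaced by a device module resolved at runtime, so the same memory tracker works on CUDA, XPU, and other accelerators. A minimal standalone sketch of that pattern (assuming a PyTorch build that exposes `torch.accelerator`; older builds fall back to `torch.cuda`):

```python
import gc

import torch

# Resolve the active accelerator ("cuda", "xpu", ...) and its torch submodule at runtime.
device_type = torch.accelerator.current_accelerator().type if hasattr(torch, "accelerator") else "cuda"
device_module = getattr(torch, device_type, torch.cuda)

gc.collect()
device_module.empty_cache()
device_module.reset_peak_memory_stats()  # reset the peak gauge to zero
begin = device_module.memory_allocated()

# ... run a training or evaluation step here ...

used_mb = (device_module.memory_allocated() - begin) >> 20
peak_mb = (device_module.max_memory_allocated() - begin) >> 20
print(f"used={used_mb}MB peak={peak_mb}MB")
```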
@ -120,7 +122,13 @@ def main():
|
||||
do_test = False
|
||||
set_seed(seed)
|
||||
|
||||
dataset = load_dataset("ought/raft", dataset_name)
|
||||
dataset = load_dataset(
|
||||
"parquet",
|
||||
data_files={
|
||||
"train": f"hf://datasets/ought/raft@refs/convert/parquet/{dataset_name}/train/0000.parquet",
|
||||
"test": f"hf://datasets/ought/raft@refs/convert/parquet/{dataset_name}/test/0000.parquet",
|
||||
},
|
||||
)
|
||||
classes = [k.replace("_", " ") for k in dataset["train"].features["Label"].names]
|
||||
dataset = dataset.map(
|
||||
lambda x: {"text_label": [classes[label] for label in x["Label"]]},
|
||||
@ -162,7 +170,6 @@ def main():
|
||||
batch_size = len(examples[text_column])
|
||||
inputs = [f"{text_column} : {x} Label : " for x in examples[text_column]]
|
||||
model_inputs = tokenizer(inputs)
|
||||
# print(model_inputs)
|
||||
for i in range(batch_size):
|
||||
sample_input_ids = model_inputs["input_ids"][i]
|
||||
model_inputs["input_ids"][i] = [tokenizer.pad_token_id] * (
|
||||
@ -248,12 +255,18 @@ def main():
|
||||
optimizer.step()
|
||||
lr_scheduler.step()
|
||||
optimizer.zero_grad()
|
||||
# Printing the GPU memory usage details such as allocated memory, peak memory, and total memory usage
|
||||
accelerator.print(f"GPU Memory before entering the train : {b2mb(tracemalloc.begin)}")
|
||||
accelerator.print(f"GPU Memory consumed at the end of the train (end-begin): {tracemalloc.used}")
|
||||
accelerator.print(f"GPU Peak Memory consumed during the train (max-begin): {tracemalloc.peaked}")
|
||||
# Printing the memory usage details such as allocated memory, peak memory, and total memory usage
|
||||
accelerator.print(
|
||||
f"GPU Total Peak Memory consumed during the train (max): {tracemalloc.peaked + b2mb(tracemalloc.begin)}"
|
||||
f"{accelerator.device.type.upper()} Memory before entering the train : {b2mb(tracemalloc.begin)}"
|
||||
)
|
||||
accelerator.print(
|
||||
f"{accelerator.device.type.upper()} Memory consumed at the end of the train (end-begin): {tracemalloc.used}"
|
||||
)
|
||||
accelerator.print(
|
||||
f"{accelerator.device.type.upper()} Peak Memory consumed during the train (max-begin): {tracemalloc.peaked}"
|
||||
)
|
||||
accelerator.print(
|
||||
f"{accelerator.device.type.upper()} Total Peak Memory consumed during the train (max): {tracemalloc.peaked + b2mb(tracemalloc.begin)}"
|
||||
)
|
||||
|
||||
accelerator.print(f"CPU Memory before entering the train : {b2mb(tracemalloc.cpu_begin)}")
|
||||
@ -280,12 +293,18 @@ def main():
|
||||
preds = preds[:, max_length:].detach().cpu().numpy()
|
||||
eval_preds.extend(tokenizer.batch_decode(preds, skip_special_tokens=True))
|
||||
|
||||
# Printing the GPU memory usage details such as allocated memory, peak memory, and total memory usage
|
||||
accelerator.print(f"GPU Memory before entering the eval : {b2mb(tracemalloc.begin)}")
|
||||
accelerator.print(f"GPU Memory consumed at the end of the eval (end-begin): {tracemalloc.used}")
|
||||
accelerator.print(f"GPU Peak Memory consumed during the eval (max-begin): {tracemalloc.peaked}")
|
||||
# Printing the memory usage details such as allocated memory, peak memory, and total memory usage
|
||||
accelerator.print(
|
||||
f"GPU Total Peak Memory consumed during the eval (max): {tracemalloc.peaked + b2mb(tracemalloc.begin)}"
|
||||
f"{accelerator.device.type.upper()} Memory before entering the eval : {b2mb(tracemalloc.begin)}"
|
||||
)
|
||||
accelerator.print(
|
||||
f"{accelerator.device.type.upper()} Memory consumed at the end of the eval (end-begin): {tracemalloc.used}"
|
||||
)
|
||||
accelerator.print(
|
||||
f"{accelerator.device.type.upper()} Peak Memory consumed during the eval (max-begin): {tracemalloc.peaked}"
|
||||
)
|
||||
accelerator.print(
|
||||
f"{accelerator.device.type.upper()} Total Peak Memory consumed during the eval (max): {tracemalloc.peaked + b2mb(tracemalloc.begin)}"
|
||||
)
|
||||
|
||||
accelerator.print(f"CPU Memory before entering the eval : {b2mb(tracemalloc.cpu_begin)}")
|
||||
@ -297,9 +316,9 @@ def main():
|
||||
|
||||
correct = 0
|
||||
total = 0
|
||||
assert len(eval_preds) == len(
|
||||
dataset["train"][label_column]
|
||||
), f"{len(eval_preds)} != {len(dataset['train'][label_column])}"
|
||||
assert len(eval_preds) == len(dataset["train"][label_column]), (
|
||||
f"{len(eval_preds)} != {len(dataset['train'][label_column])}"
|
||||
)
|
||||
for pred, true in zip(eval_preds, dataset["train"][label_column]):
|
||||
if pred.strip() == true.strip():
|
||||
correct += 1
|
||||
|
@ -26,14 +26,13 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"execution_count": null,
|
||||
"id": "6f864c90",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import os\n",
|
||||
"\n",
|
||||
"os.environ[\"CUDA_VISIBLE_DEVICES\"] = \"3\"\n",
|
||||
"os.environ[\"WANDB_PROJECT\"] = \"PeftExamples\"\n",
|
||||
"import transformers\n",
|
||||
"from peft import (\n",
|
||||
@ -168,7 +167,7 @@
|
||||
"model = AutoModelForCausalLM.from_pretrained(\n",
|
||||
" model_name,\n",
|
||||
" low_cpu_mem_usage=True\n",
|
||||
" # use_flash_attention_2=True, # leading to an error\n",
|
||||
" # attn_implementation =\"flash_attention_2\", # leading to an error\n",
|
||||
")\n",
|
||||
"model.resize_token_embeddings(len(tokenizer))"
|
||||
]
|
||||
@ -740,7 +739,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 11,
|
||||
"execution_count": null,
|
||||
"id": "71851793",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
@ -763,7 +762,8 @@
|
||||
"context = dataset[\"test\"][i][\"context\"]\n",
|
||||
"\n",
|
||||
"batch = tokenizer(context, return_tensors=\"pt\")\n",
|
||||
"batch = {k: v.to(\"cuda\") for k, v in batch.items()}\n",
|
||||
"device = torch.accelerator.current_accelerator().type if hasattr(torch, \"accelerator\") else \"cuda\"\n",
|
||||
"batch = {k: v.to(device) for k, v in batch.items()}\n",
|
||||
"model.eval()\n",
|
||||
"output_tokens = model.generate(\n",
|
||||
" **batch,\n",
|
||||
@ -892,7 +892,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 13,
|
||||
"execution_count": null,
|
||||
"id": "589c46d7-d567-40b4-ab7d-e0a9e1cab40e",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
@ -956,12 +956,12 @@
|
||||
"inference_model = AutoModelForCausalLM.from_pretrained(\n",
|
||||
" model_name,\n",
|
||||
" low_cpu_mem_usage=True,\n",
|
||||
" # use_flash_attention_2=True,\n",
|
||||
" # attn_implementation =\"flash_attention_2\",\n",
|
||||
")\n",
|
||||
"inference_model.resize_token_embeddings(len(tokenizer))\n",
|
||||
"\n",
|
||||
"inference_model = PeftModel.from_pretrained(inference_model, \"smangrul/mistral_lora_clm_with_added_tokens\")\n",
|
||||
"inference_model.to(\"cuda\")\n",
|
||||
"inference_model.to(device)\n",
|
||||
"inference_model.eval()\n",
|
||||
"\n",
|
||||
"output_tokens = inference_model.generate(\n",
|
||||
|
@ -2,7 +2,7 @@
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"execution_count": null,
|
||||
"id": "71fbfca2",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
@ -16,9 +16,8 @@
|
||||
"from torch.utils.data import DataLoader\n",
|
||||
"from transformers import default_data_collator, get_linear_schedule_with_warmup\n",
|
||||
"from tqdm import tqdm\n",
|
||||
"from datasets import load_dataset\n",
|
||||
"\n",
|
||||
"device = \"cuda\"\n",
|
||||
"device = torch.accelerator.current_accelerator().type if hasattr(torch, \"accelerator\") else \"cuda\"\n",
|
||||
"model_name_or_path = \"bigscience/bloomz-560m\"\n",
|
||||
"tokenizer_name_or_path = \"bigscience/bloomz-560m\"\n",
|
||||
"peft_config = PrefixTuningConfig(task_type=TaskType.CAUSAL_LM, num_virtual_tokens=30)\n",
|
||||
@ -37,7 +36,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"execution_count": null,
|
||||
"id": "e1a3648b",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
@ -102,9 +101,14 @@
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from datasets import load_dataset\n",
|
||||
"dataset = load_dataset(\n",
|
||||
" \"parquet\",\n",
|
||||
" data_files={\n",
|
||||
" \"train\": f\"hf://datasets/ought/raft@refs/convert/parquet/{dataset_name}/train/0000.parquet\",\n",
|
||||
" \"test\": f\"hf://datasets/ought/raft@refs/convert/parquet/{dataset_name}/test/0000.parquet\"\n",
|
||||
" }\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"dataset = load_dataset(\"ought/raft\", dataset_name)\n",
|
||||
"\n",
|
||||
"classes = [k.replace(\"_\", \" \") for k in dataset[\"train\"].features[\"Label\"].names]\n",
|
||||
"print(classes)\n",
|
||||
@ -318,24 +322,6 @@
|
||||
"model.print_trainable_parameters()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 10,
|
||||
"id": "bd419634",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"trainable params: 1474560 || all params: 560689152 || trainable%: 0.26299064191632515\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"model.print_trainable_parameters()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
@ -1276,7 +1262,7 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"ckpt = f\"{peft_model_id}/adapter_model.bin\"\n",
|
||||
"ckpt = f\"{peft_model_id}/adapter_model.safetensors\"\n",
|
||||
"!du -h $ckpt"
|
||||
]
|
||||
},
|
||||
|
@ -16,9 +16,8 @@
|
||||
"from torch.utils.data import DataLoader\n",
|
||||
"from transformers import default_data_collator, get_linear_schedule_with_warmup\n",
|
||||
"from tqdm import tqdm\n",
|
||||
"from datasets import load_dataset\n",
|
||||
"\n",
|
||||
"device = \"cuda\"\n",
|
||||
"device = torch.accelerator.current_accelerator().type if hasattr(torch, \"accelerator\") else \"cuda\"\n",
|
||||
"model_name_or_path = \"bigscience/bloomz-560m\"\n",
|
||||
"tokenizer_name_or_path = \"bigscience/bloomz-560m\"\n",
|
||||
"peft_config = PromptTuningConfig(\n",
|
||||
@ -48,9 +47,13 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from datasets import load_dataset\n",
|
||||
"\n",
|
||||
"dataset = load_dataset(\"ought/raft\", dataset_name)\n",
|
||||
"dataset = load_dataset(\n",
|
||||
" \"parquet\",\n",
|
||||
" data_files={\n",
|
||||
" \"train\": f\"hf://datasets/ought/raft@refs/convert/parquet/{dataset_name}/train/0000.parquet\",\n",
|
||||
" \"test\": f\"hf://datasets/ought/raft@refs/convert/parquet/{dataset_name}/test/0000.parquet\"\n",
|
||||
" }\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"classes = [k.replace(\"_\", \" \") for k in dataset[\"train\"].features[\"Label\"].names]\n",
|
||||
"print(classes)\n",
|
||||
@ -1115,24 +1118,12 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 13,
|
||||
"execution_count": null,
|
||||
"id": "4928c7f1",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
|
||||
"To disable this warning, you can either:\n",
|
||||
"\t- Avoid using `tokenizers` before the fork if possible\n",
|
||||
"\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n",
|
||||
"36K\tbigscience/bloomz-560m_PROMPT_TUNING_CAUSAL_LM/adapter_model.bin\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"ckpt = f\"{peft_model_id}/adapter_model.bin\"\n",
|
||||
"ckpt = f\"{peft_model_id}/adapter_model.safetensors\"\n",
|
||||
"!du -h $ckpt"
|
||||
]
|
||||
},
|
||||
|
@ -1,6 +1,7 @@
|
||||
transformers
|
||||
transformers<4.54.0
|
||||
accelerate
|
||||
evaluate
|
||||
deepspeed
|
||||
tqdm
|
||||
datasets
|
||||
dataclass-csv
|
||||
datasets==3.6.0
|
@ -9,12 +9,13 @@
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import torch\n",
|
||||
"from datasets import load_dataset\n",
|
||||
"from transformers import set_seed, AutoModelForSeq2SeqLM, AutoTokenizer\n",
|
||||
"from peft import get_peft_model, MultitaskPromptTuningConfig, TaskType, MultitaskPromptTuningInit\n",
|
||||
"\n",
|
||||
"set_seed(42)\n",
|
||||
"\n",
|
||||
"device = torch.accelerator.current_accelerator().type if hasattr(torch, \"accelerator\") else \"cuda\"\n",
|
||||
"model_name = \"google/flan-t5-base\"\n",
|
||||
"\n",
|
||||
"peft_config = MultitaskPromptTuningConfig(\n",
|
||||
@ -31,18 +32,18 @@
|
||||
"model = AutoModelForSeq2SeqLM.from_pretrained(model_name)\n",
|
||||
"model = get_peft_model(model, peft_config)\n",
|
||||
"\n",
|
||||
"model = model.cuda()\n",
|
||||
"model = model.to(device)\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"def send_to_device(batch):\n",
|
||||
" for i in batch:\n",
|
||||
" batch[i] = batch[i].cuda()\n",
|
||||
" batch[i] = batch[i].to(device)\n",
|
||||
" return batch"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 9,
|
||||
"id": "eb112bc1-ffaf-49fa-a216-0d601ec304ee",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
@ -86,7 +87,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 10,
|
||||
"id": "e5a16ec4-8fef-4ba9-95b6-a661eb51e50c",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
@ -159,7 +160,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 11,
|
||||
"id": "cceecc94-f43a-4f62-8d45-926f2f02f36d",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
@ -293,7 +294,7 @@
|
||||
" num_tasks=1,\n",
|
||||
" task_type=TaskType.SEQ_2_SEQ_LM,\n",
|
||||
" prompt_tuning_init=MultitaskPromptTuningInit.EXACT_SOURCE_TASK,\n",
|
||||
" prompt_tuning_init_state_dict_path=\"checkpoints_source/50000/adapter_model.bin\",\n",
|
||||
" prompt_tuning_init_state_dict_path=\"checkpoints_source/50000/adapter_model.safetensors\",\n",
|
||||
" num_virtual_tokens=50,\n",
|
||||
" num_transformer_submodules=1,\n",
|
||||
")\n",
|
||||
@ -302,7 +303,7 @@
|
||||
"model = AutoModelForSeq2SeqLM.from_pretrained(model_name)\n",
|
||||
"model = get_peft_model(model, peft_config)\n",
|
||||
"\n",
|
||||
"model = model.cuda()"
|
||||
"model = model.to(device)"
|
||||
]
|
||||
},
|
||||
{
|
||||
@ -360,8 +361,9 @@
|
||||
"source": [
|
||||
"# load last checkpoint for now\n",
|
||||
"from peft import set_peft_model_state_dict\n",
|
||||
"from safetensors.torch import load_file\n",
|
||||
"\n",
|
||||
"sd_6000 = torch.load(\"checkpoints_target/6000/adapter_model.bin\")\n",
|
||||
"sd_6000 = load_file(\"checkpoints_target/6000/adapter_model.safetensors\")\n",
|
||||
"set_peft_model_state_dict(model, sd_6000)\n",
|
||||
"\n",
|
||||
"# evaluate val\n",
|
||||
@ -382,6 +384,22 @@
|
||||
"f1 = {f1}\"\"\"\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "1d18325c-9607-4cb5-a5b0-5b44dfee2a75",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "43988e92-af42-45cb-8bca-f19c193ad04f",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
@ -400,7 +418,7 @@
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.9.13"
|
||||
"version": "3.11.13"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
|
@ -11,7 +11,7 @@ from peft import AdaLoraConfig, PeftConfig, PeftModel, TaskType, get_peft_model
|
||||
|
||||
os.environ["TOKENIZERS_PARALLELISM"] = "false"
|
||||
|
||||
device = "cuda"
|
||||
device = torch.accelerator.current_accelerator().type if hasattr(torch, "accelerator") else "cuda"
|
||||
model_name_or_path = "facebook/bart-base"
|
||||
tokenizer_name_or_path = "facebook/bart-base"
|
||||
|
||||
@ -24,6 +24,20 @@ num_epochs = 8
|
||||
batch_size = 8
|
||||
|
||||
|
||||
# loading dataset
|
||||
dataset = load_dataset("financial_phrasebank", "sentences_allagree")
|
||||
dataset = dataset["train"].train_test_split(test_size=0.1)
|
||||
dataset["validation"] = dataset["test"]
|
||||
del dataset["test"]
|
||||
|
||||
classes = dataset["train"].features["label"].names
|
||||
dataset = dataset.map(
|
||||
lambda x: {"text_label": [classes[label] for label in x["label"]]},
|
||||
batched=True,
|
||||
num_proc=1,
|
||||
)
|
||||
|
||||
|
||||
# creating model
|
||||
peft_config = AdaLoraConfig(
|
||||
init_r=12,
|
||||
@ -37,6 +51,7 @@ peft_config = AdaLoraConfig(
|
||||
lora_dropout=0.1,
|
||||
task_type=TaskType.SEQ_2_SEQ_LM,
|
||||
inference_mode=False,
|
||||
total_step=len(dataset["train"]) * num_epochs,
|
||||
)
|
||||
|
||||
model = AutoModelForSeq2SeqLM.from_pretrained(model_name_or_path)
|
||||
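The dataset loading moves ahead of the PEFT config in this diff so that `total_step` is already known when `AdaLoraConfig` is built; AdaLoRA uses the total number of training steps to schedule its rank-budget reduction. A hedged sketch of the idea (the extra parameter values below are illustrative, not taken from this script):

```python
from peft import AdaLoraConfig, TaskType

num_epochs = 8
num_train_samples = len(dataset["train"])  # dataset is loaded before the config now

peft_config = AdaLoraConfig(
    init_r=12,
    target_r=8,    # illustrative
    tinit=200,     # illustrative: steps before rank pruning starts
    tfinal=1000,   # illustrative: steps reserved for the final budget
    task_type=TaskType.SEQ_2_SEQ_LM,
    inference_mode=False,
    # upper bound on the number of optimizer steps over the whole run
    total_step=num_train_samples * num_epochs,
)
```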
@ -44,20 +59,6 @@ model = get_peft_model(model, peft_config)
|
||||
model.print_trainable_parameters()
|
||||
|
||||
|
||||
# loading dataset
|
||||
dataset = load_dataset("financial_phrasebank", "sentences_allagree")
|
||||
dataset = dataset["train"].train_test_split(test_size=0.1)
|
||||
dataset["validation"] = dataset["test"]
|
||||
del dataset["test"]
|
||||
|
||||
classes = dataset["train"].features["label"].names
|
||||
dataset = dataset.map(
|
||||
lambda x: {"text_label": [classes[label] for label in x["label"]]},
|
||||
batched=True,
|
||||
num_proc=1,
|
||||
)
|
||||
|
||||
|
||||
# data preprocessing
|
||||
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
|
||||
|
||||
@ -159,7 +160,7 @@ peft_model_id = f"{model_name_or_path}_{peft_config.peft_type}_{peft_config.task
|
||||
model.save_pretrained(peft_model_id)
|
||||
|
||||
|
||||
ckpt = f"{peft_model_id}/adapter_model.bin"
|
||||
ckpt = f"{peft_model_id}/adapter_model.safetensors"
|
||||
# get_ipython().system('du -h $ckpt')
|
||||
|
||||
|
||||
|
@ -2,7 +2,8 @@
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 12,
|
||||
"execution_count": null,
|
||||
"id": "0c152fc8",
|
||||
"metadata": {
|
||||
"id": "5f93b7d1"
|
||||
},
|
||||
@ -22,7 +23,7 @@
|
||||
"from tqdm import tqdm\n",
|
||||
"from datasets import load_dataset\n",
|
||||
"\n",
|
||||
"device = \"cuda\"\n",
|
||||
"device = torch.accelerator.current_accelerator().type if hasattr(torch, \"accelerator\") else \"cuda\"\n",
|
||||
"model_name_or_path = \"bigscience/mt0-large\"\n",
|
||||
"tokenizer_name_or_path = \"bigscience/mt0-large\"\n",
|
||||
"\n",
|
||||
@ -37,7 +38,8 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 13,
|
||||
"execution_count": 2,
|
||||
"id": "4e23624f",
|
||||
"metadata": {
|
||||
"colab": {
|
||||
"base_uri": "https://localhost:8080/"
|
||||
@ -49,10 +51,10 @@
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"<module 'peft' from '/usr/local/lib/python3.10/dist-packages/peft/__init__.py'>"
|
||||
"<module 'peft' from '/usr/local/lib/python3.11/dist-packages/peft/__init__.py'>"
|
||||
]
|
||||
},
|
||||
"execution_count": 13,
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
@ -65,7 +67,8 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 14,
|
||||
"execution_count": null,
|
||||
"id": "da74b569",
|
||||
"metadata": {
|
||||
"id": "8d0850ac"
|
||||
},
|
||||
@ -79,7 +82,8 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 15,
|
||||
"execution_count": 4,
|
||||
"id": "df33fce2",
|
||||
"metadata": {
|
||||
"colab": {
|
||||
"base_uri": "https://localhost:8080/"
|
||||
@ -233,7 +237,7 @@
|
||||
")"
|
||||
]
|
||||
},
|
||||
"execution_count": 15,
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
@ -244,7 +248,8 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 16,
|
||||
"execution_count": 5,
|
||||
"id": "63d7bc2d",
|
||||
"metadata": {
|
||||
"colab": {
|
||||
"base_uri": "https://localhost:8080/"
|
||||
@ -257,7 +262,7 @@
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"trainable params: 282,624 || all params: 1,229,863,936 || trainable%: 0.022980103060766553\n"
|
||||
"trainable params: 282,624 || all params: 1,229,863,936 || trainable%: 0.0230\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
@ -276,11 +281,11 @@
|
||||
" (SelfAttention): MT5Attention(\n",
|
||||
" (q): Linear(in_features=1024, out_features=1024, bias=False)\n",
|
||||
" (k): Linear(\n",
|
||||
" in_features=1024, out_features=1024, bias=False\n",
|
||||
" (base_layer): Linear(in_features=1024, out_features=1024, bias=False)\n",
|
||||
" (ia3_l): ParameterDict( (default): Parameter containing: [torch.FloatTensor of size 1024x1])\n",
|
||||
" )\n",
|
||||
" (v): Linear(\n",
|
||||
" in_features=1024, out_features=1024, bias=False\n",
|
||||
" (base_layer): Linear(in_features=1024, out_features=1024, bias=False)\n",
|
||||
" (ia3_l): ParameterDict( (default): Parameter containing: [torch.FloatTensor of size 1024x1])\n",
|
||||
" )\n",
|
||||
" (o): Linear(in_features=1024, out_features=1024, bias=False)\n",
|
||||
@ -293,7 +298,7 @@
|
||||
" (DenseReluDense): MT5DenseGatedActDense(\n",
|
||||
" (wi_0): Linear(in_features=1024, out_features=2816, bias=False)\n",
|
||||
" (wi_1): Linear(\n",
|
||||
" in_features=1024, out_features=2816, bias=False\n",
|
||||
" (base_layer): Linear(in_features=1024, out_features=2816, bias=False)\n",
|
||||
" (ia3_l): ParameterDict( (default): Parameter containing: [torch.FloatTensor of size 2816x1])\n",
|
||||
" )\n",
|
||||
" (wo): Linear(in_features=2816, out_features=1024, bias=False)\n",
|
||||
@ -311,11 +316,11 @@
|
||||
" (SelfAttention): MT5Attention(\n",
|
||||
" (q): Linear(in_features=1024, out_features=1024, bias=False)\n",
|
||||
" (k): Linear(\n",
|
||||
" in_features=1024, out_features=1024, bias=False\n",
|
||||
" (base_layer): Linear(in_features=1024, out_features=1024, bias=False)\n",
|
||||
" (ia3_l): ParameterDict( (default): Parameter containing: [torch.FloatTensor of size 1024x1])\n",
|
||||
" )\n",
|
||||
" (v): Linear(\n",
|
||||
" in_features=1024, out_features=1024, bias=False\n",
|
||||
" (base_layer): Linear(in_features=1024, out_features=1024, bias=False)\n",
|
||||
" (ia3_l): ParameterDict( (default): Parameter containing: [torch.FloatTensor of size 1024x1])\n",
|
||||
" )\n",
|
||||
" (o): Linear(in_features=1024, out_features=1024, bias=False)\n",
|
||||
@ -327,7 +332,7 @@
|
||||
" (DenseReluDense): MT5DenseGatedActDense(\n",
|
||||
" (wi_0): Linear(in_features=1024, out_features=2816, bias=False)\n",
|
||||
" (wi_1): Linear(\n",
|
||||
" in_features=1024, out_features=2816, bias=False\n",
|
||||
" (base_layer): Linear(in_features=1024, out_features=2816, bias=False)\n",
|
||||
" (ia3_l): ParameterDict( (default): Parameter containing: [torch.FloatTensor of size 2816x1])\n",
|
||||
" )\n",
|
||||
" (wo): Linear(in_features=2816, out_features=1024, bias=False)\n",
|
||||
@ -352,11 +357,11 @@
|
||||
" (SelfAttention): MT5Attention(\n",
|
||||
" (q): Linear(in_features=1024, out_features=1024, bias=False)\n",
|
||||
" (k): Linear(\n",
|
||||
" in_features=1024, out_features=1024, bias=False\n",
|
||||
" (base_layer): Linear(in_features=1024, out_features=1024, bias=False)\n",
|
||||
" (ia3_l): ParameterDict( (default): Parameter containing: [torch.FloatTensor of size 1024x1])\n",
|
||||
" )\n",
|
||||
" (v): Linear(\n",
|
||||
" in_features=1024, out_features=1024, bias=False\n",
|
||||
" (base_layer): Linear(in_features=1024, out_features=1024, bias=False)\n",
|
||||
" (ia3_l): ParameterDict( (default): Parameter containing: [torch.FloatTensor of size 1024x1])\n",
|
||||
" )\n",
|
||||
" (o): Linear(in_features=1024, out_features=1024, bias=False)\n",
|
||||
@ -369,11 +374,11 @@
|
||||
" (EncDecAttention): MT5Attention(\n",
|
||||
" (q): Linear(in_features=1024, out_features=1024, bias=False)\n",
|
||||
" (k): Linear(\n",
|
||||
" in_features=1024, out_features=1024, bias=False\n",
|
||||
" (base_layer): Linear(in_features=1024, out_features=1024, bias=False)\n",
|
||||
" (ia3_l): ParameterDict( (default): Parameter containing: [torch.FloatTensor of size 1024x1])\n",
|
||||
" )\n",
|
||||
" (v): Linear(\n",
|
||||
" in_features=1024, out_features=1024, bias=False\n",
|
||||
" (base_layer): Linear(in_features=1024, out_features=1024, bias=False)\n",
|
||||
" (ia3_l): ParameterDict( (default): Parameter containing: [torch.FloatTensor of size 1024x1])\n",
|
||||
" )\n",
|
||||
" (o): Linear(in_features=1024, out_features=1024, bias=False)\n",
|
||||
@ -385,7 +390,7 @@
|
||||
" (DenseReluDense): MT5DenseGatedActDense(\n",
|
||||
" (wi_0): Linear(in_features=1024, out_features=2816, bias=False)\n",
|
||||
" (wi_1): Linear(\n",
|
||||
" in_features=1024, out_features=2816, bias=False\n",
|
||||
" (base_layer): Linear(in_features=1024, out_features=2816, bias=False)\n",
|
||||
" (ia3_l): ParameterDict( (default): Parameter containing: [torch.FloatTensor of size 2816x1])\n",
|
||||
" )\n",
|
||||
" (wo): Linear(in_features=2816, out_features=1024, bias=False)\n",
|
||||
@ -403,11 +408,11 @@
|
||||
" (SelfAttention): MT5Attention(\n",
|
||||
" (q): Linear(in_features=1024, out_features=1024, bias=False)\n",
|
||||
" (k): Linear(\n",
|
||||
" in_features=1024, out_features=1024, bias=False\n",
|
||||
" (base_layer): Linear(in_features=1024, out_features=1024, bias=False)\n",
|
||||
" (ia3_l): ParameterDict( (default): Parameter containing: [torch.FloatTensor of size 1024x1])\n",
|
||||
" )\n",
|
||||
" (v): Linear(\n",
|
||||
" in_features=1024, out_features=1024, bias=False\n",
|
||||
" (base_layer): Linear(in_features=1024, out_features=1024, bias=False)\n",
|
||||
" (ia3_l): ParameterDict( (default): Parameter containing: [torch.FloatTensor of size 1024x1])\n",
|
||||
" )\n",
|
||||
" (o): Linear(in_features=1024, out_features=1024, bias=False)\n",
|
||||
@ -419,11 +424,11 @@
|
||||
" (EncDecAttention): MT5Attention(\n",
|
||||
" (q): Linear(in_features=1024, out_features=1024, bias=False)\n",
|
||||
" (k): Linear(\n",
|
||||
" in_features=1024, out_features=1024, bias=False\n",
|
||||
" (base_layer): Linear(in_features=1024, out_features=1024, bias=False)\n",
|
||||
" (ia3_l): ParameterDict( (default): Parameter containing: [torch.FloatTensor of size 1024x1])\n",
|
||||
" )\n",
|
||||
" (v): Linear(\n",
|
||||
" in_features=1024, out_features=1024, bias=False\n",
|
||||
" (base_layer): Linear(in_features=1024, out_features=1024, bias=False)\n",
|
||||
" (ia3_l): ParameterDict( (default): Parameter containing: [torch.FloatTensor of size 1024x1])\n",
|
||||
" )\n",
|
||||
" (o): Linear(in_features=1024, out_features=1024, bias=False)\n",
|
||||
@ -435,7 +440,7 @@
|
||||
" (DenseReluDense): MT5DenseGatedActDense(\n",
|
||||
" (wi_0): Linear(in_features=1024, out_features=2816, bias=False)\n",
|
||||
" (wi_1): Linear(\n",
|
||||
" in_features=1024, out_features=2816, bias=False\n",
|
||||
" (base_layer): Linear(in_features=1024, out_features=2816, bias=False)\n",
|
||||
" (ia3_l): ParameterDict( (default): Parameter containing: [torch.FloatTensor of size 2816x1])\n",
|
||||
" )\n",
|
||||
" (wo): Linear(in_features=2816, out_features=1024, bias=False)\n",
|
||||
@ -457,7 +462,7 @@
|
||||
")"
|
||||
]
|
||||
},
|
||||
"execution_count": 16,
|
||||
"execution_count": 5,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
@ -470,7 +475,8 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 17,
|
||||
"execution_count": 6,
|
||||
"id": "155b8728",
|
||||
"metadata": {
|
||||
"colab": {
|
||||
"base_uri": "https://localhost:8080/",
|
||||
@ -519,27 +525,14 @@
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"WARNING:datasets.builder:Found cached dataset financial_phrasebank (/root/.cache/huggingface/datasets/financial_phrasebank/sentences_allagree/1.0.0/550bde12e6c30e2674da973a55f57edde5181d53f5a5a34c1531c53f93b7e141)\n"
|
||||
"Using the latest cached version of the dataset since financial_phrasebank couldn't be found on the Hugging Face Hub\n",
|
||||
"Found the latest cached dataset configuration 'sentences_allagree' at /root/.cache/huggingface/datasets/financial_phrasebank/sentences_allagree/1.0.0/550bde12e6c30e2674da973a55f57edde5181d53f5a5a34c1531c53f93b7e141 (last modified on Thu Jul 31 03:15:41 2025).\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"application/vnd.jupyter.widget-view+json": {
|
||||
"model_id": "bbfb7533b5ca459194e171df56b79566",
|
||||
"version_major": 2,
|
||||
"version_minor": 0
|
||||
},
|
||||
"text/plain": [
|
||||
" 0%| | 0/1 [00:00<?, ?it/s]"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"application/vnd.jupyter.widget-view+json": {
|
||||
"model_id": "9e12d97af6124a5a8c6627708b300c1e",
|
||||
"model_id": "43b03e9b6de94bf0921228482d7be1e5",
|
||||
"version_major": 2,
|
||||
"version_minor": 0
|
||||
},
|
||||
@ -553,7 +546,7 @@
|
||||
{
|
||||
"data": {
|
||||
"application/vnd.jupyter.widget-view+json": {
|
||||
"model_id": "0c561dab67914ea9b6e1aab803600551",
|
||||
"model_id": "d08de1efca67472781017b806f33870c",
|
||||
"version_major": 2,
|
||||
"version_minor": 0
|
||||
},
|
||||
@ -567,12 +560,12 @@
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"{'sentence': 'It will be operated by Nokia , and supported by its Nokia NetAct network and service management system .',\n",
|
||||
"{'sentence': 'SCOPI Chief Business Excellence Officer , Eng .',\n",
|
||||
" 'label': 1,\n",
|
||||
" 'text_label': 'neutral'}"
|
||||
]
|
||||
},
|
||||
"execution_count": 17,
|
||||
"execution_count": 6,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
@ -596,7 +589,8 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 18,
|
||||
"execution_count": 7,
|
||||
"id": "723fb67d",
|
||||
"metadata": {
|
||||
"colab": {
|
||||
"base_uri": "https://localhost:8080/",
|
||||
@ -633,7 +627,63 @@
|
||||
{
|
||||
"data": {
|
||||
"application/vnd.jupyter.widget-view+json": {
|
||||
"model_id": "e1e80a68a9e7429397cafc96c3c11f80",
|
||||
"model_id": "7e08a312e5454c188f52fc2ca902c463",
|
||||
"version_major": 2,
|
||||
"version_minor": 0
|
||||
},
|
||||
"text/plain": [
|
||||
"tokenizer_config.json: 0%| | 0.00/430 [00:00<?, ?B/s]"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"application/vnd.jupyter.widget-view+json": {
|
||||
"model_id": "25d5de12709748c9959cd011c5c641de",
|
||||
"version_major": 2,
|
||||
"version_minor": 0
|
||||
},
|
||||
"text/plain": [
|
||||
"spiece.model: 0%| | 0.00/4.31M [00:00<?, ?B/s]"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"application/vnd.jupyter.widget-view+json": {
|
||||
"model_id": "5b39c130813843c18e7f9187ffec37df",
|
||||
"version_major": 2,
|
||||
"version_minor": 0
|
||||
},
|
||||
"text/plain": [
|
||||
"tokenizer.json: 0%| | 0.00/16.3M [00:00<?, ?B/s]"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"application/vnd.jupyter.widget-view+json": {
|
||||
"model_id": "de27076e123243fd89dbad1c9e1f0596",
|
||||
"version_major": 2,
|
||||
"version_minor": 0
|
||||
},
|
||||
"text/plain": [
|
||||
"special_tokens_map.json: 0%| | 0.00/74.0 [00:00<?, ?B/s]"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"application/vnd.jupyter.widget-view+json": {
|
||||
"model_id": "1b55669bf13a4e2886f34c12d5f50354",
|
||||
"version_major": 2,
|
||||
"version_minor": 0
|
||||
},
|
||||
@ -647,7 +697,7 @@
|
||||
{
|
||||
"data": {
|
||||
"application/vnd.jupyter.widget-view+json": {
|
||||
"model_id": "21f582e1208a4a38ae3c0cdce87e5c14",
|
||||
"model_id": "f914229f180b4188925d9e804b92475c",
|
||||
"version_major": 2,
|
||||
"version_minor": 0
|
||||
},
|
||||
@ -695,7 +745,8 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 19,
|
||||
"execution_count": 8,
|
||||
"id": "36d56ea7",
|
||||
"metadata": {
|
||||
"id": "f733a3c6"
|
||||
},
|
||||
@ -712,7 +763,8 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 20,
|
||||
"execution_count": 9,
|
||||
"id": "6b0a0536",
|
||||
"metadata": {
|
||||
"colab": {
|
||||
"base_uri": "https://localhost:8080/"
|
||||
@ -725,45 +777,45 @@
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"100%|██████████| 255/255 [02:33<00:00, 1.67it/s]\n",
|
||||
"100%|██████████| 29/29 [00:08<00:00, 3.48it/s]\n"
|
||||
"100%|███████████████████████████████████████████████████████████████████████████████████████████████████| 255/255 [00:52<00:00, 4.86it/s]\n",
|
||||
"100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 29/29 [00:02<00:00, 12.67it/s]\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"epoch=0: train_ppl=tensor(1.4939, device='cuda:0') train_epoch_loss=tensor(0.4014, device='cuda:0') eval_ppl=tensor(1.0514, device='cuda:0') eval_epoch_loss=tensor(0.0501, device='cuda:0')\n"
|
||||
"epoch=0: train_ppl=tensor(1.4686, device='xpu:0') train_epoch_loss=tensor(0.3843, device='xpu:0') eval_ppl=tensor(1.0421, device='xpu:0') eval_epoch_loss=tensor(0.0412, device='xpu:0')\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"100%|██████████| 255/255 [02:32<00:00, 1.67it/s]\n",
|
||||
"100%|██████████| 29/29 [00:08<00:00, 3.43it/s]\n"
|
||||
"100%|███████████████████████████████████████████████████████████████████████████████████████████████████| 255/255 [00:49<00:00, 5.20it/s]\n",
|
||||
"100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 29/29 [00:02<00:00, 13.62it/s]\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"epoch=1: train_ppl=tensor(1.0523, device='cuda:0') train_epoch_loss=tensor(0.0510, device='cuda:0') eval_ppl=tensor(1.0383, device='cuda:0') eval_epoch_loss=tensor(0.0376, device='cuda:0')\n"
|
||||
"epoch=1: train_ppl=tensor(1.0683, device='xpu:0') train_epoch_loss=tensor(0.0661, device='xpu:0') eval_ppl=tensor(1.0264, device='xpu:0') eval_epoch_loss=tensor(0.0261, device='xpu:0')\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"100%|██████████| 255/255 [02:32<00:00, 1.68it/s]\n",
|
||||
"100%|██████████| 29/29 [00:08<00:00, 3.44it/s]"
|
||||
"100%|███████████████████████████████████████████████████████████████████████████████████████████████████| 255/255 [00:49<00:00, 5.20it/s]\n",
|
||||
"100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 29/29 [00:02<00:00, 13.63it/s]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"epoch=2: train_ppl=tensor(1.0397, device='cuda:0') train_epoch_loss=tensor(0.0389, device='cuda:0') eval_ppl=tensor(1.0392, device='cuda:0') eval_epoch_loss=tensor(0.0385, device='cuda:0')\n"
|
||||
"epoch=2: train_ppl=tensor(1.0451, device='xpu:0') train_epoch_loss=tensor(0.0441, device='xpu:0') eval_ppl=tensor(1.0191, device='xpu:0') eval_epoch_loss=tensor(0.0190, device='xpu:0')\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
@ -814,6 +866,7 @@
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 21,
|
||||
"id": "761b90e4",
|
||||
"metadata": {
|
||||
"colab": {
|
||||
"base_uri": "https://localhost:8080/"
|
||||
@ -849,6 +902,7 @@
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 22,
|
||||
"id": "8e0658ac",
|
||||
"metadata": {
|
||||
"id": "a8de6005"
|
||||
},
|
||||
@ -861,7 +915,8 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 23,
|
||||
"execution_count": null,
|
||||
"id": "ef7fbf9c",
|
||||
"metadata": {
|
||||
"colab": {
|
||||
"base_uri": "https://localhost:8080/"
|
||||
@ -874,18 +929,19 @@
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"1.2M\tbigscience/mt0-large_IA3_SEQ_2_SEQ_LM/adapter_model.bin\n"
|
||||
"1.2M\tbigscience/mt0-large_IA3_SEQ_2_SEQ_LM/adapter_model.safetensors\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"ckpt = f\"{peft_model_id}/adapter_model.bin\"\n",
|
||||
"ckpt = f\"{peft_model_id}/adapter_model.safetensors\"\n",
|
||||
"!du -h $ckpt"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 24,
|
||||
"id": "4774d931",
|
||||
"metadata": {
|
||||
"id": "76c2fc29"
|
||||
},
|
||||
@ -903,6 +959,7 @@
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 25,
|
||||
"id": "996ddf0a",
|
||||
"metadata": {
|
||||
"colab": {
|
||||
"base_uri": "https://localhost:8080/"
|
||||
@ -946,6 +1003,7 @@
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "701eda1b",
|
||||
"metadata": {
|
||||
"id": "66c65ea4"
|
||||
},
|
||||
@ -955,6 +1013,7 @@
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "7d7718c5",
|
||||
"metadata": {
|
||||
"id": "65e71f78"
|
||||
},
|
||||
@ -970,7 +1029,7 @@
|
||||
"provenance": []
|
||||
},
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
@ -984,7 +1043,7 @@
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.8.3"
|
||||
"version": "3.11.13"
|
||||
},
|
||||
"vscode": {
|
||||
"interpreter": {
|
||||
|
@ -2,26 +2,10 @@
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"execution_count": null,
|
||||
"id": "5f93b7d1",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"\n",
|
||||
"===================================BUG REPORT===================================\n",
|
||||
"Welcome to bitsandbytes. For bug reports, please submit your error trace to: https://github.com/TimDettmers/bitsandbytes/issues\n",
|
||||
"For effortless bug reporting copy-paste your error into this form: https://docs.google.com/forms/d/e/1FAIpQLScPB8emS3Thkp66nvqwmjTEgxp8Y9ufuWTzFyr9kJ5AoI47dQ/viewform?usp=sf_link\n",
|
||||
"================================================================================\n",
|
||||
"CUDA SETUP: CUDA runtime path found: /home/sourab/miniconda3/envs/ml/lib/libcudart.so\n",
|
||||
"CUDA SETUP: Highest compute capability among GPUs detected: 7.5\n",
|
||||
"CUDA SETUP: Detected CUDA version 117\n",
|
||||
"CUDA SETUP: Loading binary /home/sourab/miniconda3/envs/ml/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda117.so...\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from transformers import AutoModelForSeq2SeqLM\n",
|
||||
"from peft import get_peft_config, get_peft_model, get_peft_model_state_dict, LoraConfig, TaskType\n",
|
||||
@ -36,7 +20,7 @@
|
||||
"from tqdm import tqdm\n",
|
||||
"from datasets import load_dataset\n",
|
||||
"\n",
|
||||
"device = \"cuda\"\n",
|
||||
"device = torch.accelerator.current_accelerator().type if hasattr(torch, \"accelerator\") else \"cuda\"\n",
|
||||
"model_name_or_path = \"bigscience/mt0-large\"\n",
|
||||
"tokenizer_name_or_path = \"bigscience/mt0-large\"\n",
|
||||
"\n",
|
||||
@ -51,7 +35,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 2,
|
||||
"id": "8d0850ac",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
@ -75,18 +59,19 @@
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Found cached dataset financial_phrasebank (/home/sourab/.cache/huggingface/datasets/financial_phrasebank/sentences_allagree/1.0.0/550bde12e6c30e2674da973a55f57edde5181d53f5a5a34c1531c53f93b7e141)\n"
|
||||
"Using the latest cached version of the dataset since financial_phrasebank couldn't be found on the Hugging Face Hub\n",
|
||||
"Found the latest cached dataset configuration 'sentences_allagree' at /root/.cache/huggingface/datasets/financial_phrasebank/sentences_allagree/1.0.0/550bde12e6c30e2674da973a55f57edde5181d53f5a5a34c1531c53f93b7e141 (last modified on Thu Jul 31 05:47:32 2025).\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"application/vnd.jupyter.widget-view+json": {
|
||||
"model_id": "3403bf3d718042018b0531848cc30209",
|
||||
"model_id": "867f7bbb679d4b6eae344812fb797c19",
|
||||
"version_major": 2,
|
||||
"version_minor": 0
|
||||
},
|
||||
"text/plain": [
|
||||
" 0%| | 0/1 [00:00<?, ?it/s]"
|
||||
"Map: 0%| | 0/2037 [00:00<?, ? examples/s]"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
@ -95,26 +80,12 @@
|
||||
{
|
||||
"data": {
|
||||
"application/vnd.jupyter.widget-view+json": {
|
||||
"model_id": "d3d5c45e3776469f9560b6eaa9346f8f",
|
||||
"model_id": "a6964a9de5e64d4e80c1906e2bed9f21",
|
||||
"version_major": 2,
|
||||
"version_minor": 0
|
||||
},
|
||||
"text/plain": [
|
||||
" 0%| | 0/3 [00:00<?, ?ba/s]"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"application/vnd.jupyter.widget-view+json": {
|
||||
"model_id": "e9736f26e9aa450b8d65f95c0b9c81cc",
|
||||
"version_major": 2,
|
||||
"version_minor": 0
|
||||
},
|
||||
"text/plain": [
|
||||
" 0%| | 0/1 [00:00<?, ?ba/s]"
|
||||
"Map: 0%| | 0/227 [00:00<?, ? examples/s]"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
@ -123,7 +94,7 @@
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"{'sentence': \"The 10,000-odd square metre plot that Stockmann has bought for the Nevsky Center shopping center is located on Nevsky Prospect , St Petersburg 's high street , next to the Vosstaniya Square underground station , in the immediate vicinity of Moscow Station .\",\n",
|
||||
"{'sentence': 'The bank VTB24 provides mortgage loans to buy apartments in the complex at 11-13 % per annum in rubles .',\n",
|
||||
" 'label': 1,\n",
|
||||
" 'text_label': 'neutral'}"
|
||||
]
|
||||
@ -159,12 +130,12 @@
|
||||
{
|
||||
"data": {
|
||||
"application/vnd.jupyter.widget-view+json": {
|
||||
"model_id": "c460989d4ab24e3f97d81ef040b1d1b4",
|
||||
"model_id": "a867fe83918c435ab8a52bee2737f4f3",
|
||||
"version_major": 2,
|
||||
"version_minor": 0
|
||||
},
|
||||
"text/plain": [
|
||||
"Running tokenizer on dataset: 0%| | 0/3 [00:00<?, ?ba/s]"
|
||||
"Running tokenizer on dataset: 0%| | 0/2037 [00:00<?, ? examples/s]"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
@ -173,12 +144,12 @@
|
||||
{
|
||||
"data": {
|
||||
"application/vnd.jupyter.widget-view+json": {
|
||||
"model_id": "1acc389b08b94f8a87900b9fbdbccce4",
|
||||
"model_id": "97ceaf1285f348bd8272e2bec54050c6",
|
||||
"version_major": 2,
|
||||
"version_minor": 0
|
||||
},
|
||||
"text/plain": [
|
||||
"Running tokenizer on dataset: 0%| | 0/1 [00:00<?, ?ba/s]"
|
||||
"Running tokenizer on dataset: 0%| | 0/227 [00:00<?, ? examples/s]"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
@@ -237,63 +208,10 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": null,
"id": "6b3a4090",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|████████████████████████████████████████████████████████████████████████████████████████| 255/255 [02:21<00:00, 1.81it/s]\n",
"100%|██████████████████████████████████████████████████████████████████████████████████████████| 29/29 [00:07<00:00, 4.13it/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"epoch=0: train_ppl=tensor(14.6341, device='cuda:0') train_epoch_loss=tensor(2.6834, device='cuda:0') eval_ppl=tensor(1.0057, device='cuda:0') eval_epoch_loss=tensor(0.0057, device='cuda:0')\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|████████████████████████████████████████████████████████████████████████████████████████| 255/255 [02:00<00:00, 2.11it/s]\n",
"100%|██████████████████████████████████████████████████████████████████████████████████████████| 29/29 [00:05<00:00, 5.66it/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"epoch=1: train_ppl=tensor(1.7576, device='cuda:0') train_epoch_loss=tensor(0.5640, device='cuda:0') eval_ppl=tensor(1.0052, device='cuda:0') eval_epoch_loss=tensor(0.0052, device='cuda:0')\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|████████████████████████████████████████████████████████████████████████████████████████| 255/255 [01:33<00:00, 2.74it/s]\n",
"100%|██████████████████████████████████████████████████████████████████████████████████████████| 29/29 [00:04<00:00, 6.23it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"epoch=2: train_ppl=tensor(1.3830, device='cuda:0') train_epoch_loss=tensor(0.3243, device='cuda:0') eval_ppl=tensor(1.0035, device='cuda:0') eval_epoch_loss=tensor(0.0035, device='cuda:0')\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n"
]
}
],
"outputs": [],
"source": [
"# training and evaluation\n",
"model = model.to(device)\n",
@@ -375,7 +293,7 @@
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": null,
"id": "bd20cd4c",
"metadata": {},
"outputs": [
@@ -383,12 +301,12 @@
"name": "stdout",
"output_type": "stream",
"text": [
"9,2M\tbigscience/mt0-large_LORA_SEQ_2_SEQ_LM/adapter_model.bin\r\n"
"9,2M\tbigscience/mt0-large_LORA_SEQ_2_SEQ_LM/adapter_model.safetensors\r\n"
]
}
],
"source": [
"ckpt = f\"{peft_model_id}/adapter_model.bin\"\n",
"ckpt = f\"{peft_model_id}/adapter_model.safetensors\"\n",
"!du -h $ckpt"
]
},
@@ -473,7 +391,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.5"
"version": "3.11.13"
},
"vscode": {
"interpreter": {
@@ -1,253 +0,0 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "71fbfca2",
"metadata": {},
"outputs": [],
"source": [
"from transformers import AutoModelForSeq2SeqLM\n",
"from peft import PeftModel, PeftConfig\n",
"import torch\n",
"from datasets import load_dataset\n",
"import os\n",
"from transformers import AutoTokenizer\n",
"from torch.utils.data import DataLoader\n",
"from transformers import default_data_collator, get_linear_schedule_with_warmup\n",
"from tqdm import tqdm\n",
"from datasets import load_dataset\n",
"\n",
"dataset_name = \"twitter_complaints\"\n",
"text_column = \"Tweet text\"\n",
"label_column = \"text_label\"\n",
"batch_size = 8\n",
"\n",
"peft_model_id = \"smangrul/twitter_complaints_bigscience_T0_3B_LORA_SEQ_2_SEQ_LM\"\n",
"config = PeftConfig.from_pretrained(peft_model_id)"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "cc55820a",
"metadata": {},
"outputs": [],
"source": [
"peft_model_id = \"smangrul/twitter_complaints_bigscience_T0_3B_LORA_SEQ_2_SEQ_LM\"\n",
"max_memory = {0: \"6GIB\", 1: \"0GIB\", 2: \"0GIB\", 3: \"0GIB\", 4: \"0GIB\", \"cpu\": \"30GB\"}\n",
"config = PeftConfig.from_pretrained(peft_model_id)\n",
"model = AutoModelForSeq2SeqLM.from_pretrained(config.base_model_name_or_path, device_map=\"auto\", max_memory=max_memory)\n",
"model = PeftModel.from_pretrained(model, peft_model_id, device_map=\"auto\", max_memory=max_memory)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e1a3648b",
"metadata": {},
"outputs": [],
"source": [
"from datasets import load_dataset\n",
"\n",
"dataset = load_dataset(\"ought/raft\", dataset_name)\n",
"\n",
"classes = [k.replace(\"_\", \" \") for k in dataset[\"train\"].features[\"Label\"].names]\n",
"print(classes)\n",
"dataset = dataset.map(\n",
"    lambda x: {\"text_label\": [classes[label] for label in x[\"Label\"]]},\n",
"    batched=True,\n",
"    num_proc=1,\n",
")\n",
"print(dataset)\n",
"dataset[\"train\"][0]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "fe12d4d3",
"metadata": {},
"outputs": [],
"source": [
"tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)\n",
"target_max_length = max([len(tokenizer(class_label)[\"input_ids\"]) for class_label in classes])\n",
"\n",
"\n",
"def preprocess_function(examples):\n",
"    inputs = examples[text_column]\n",
"    targets = examples[label_column]\n",
"    model_inputs = tokenizer(inputs, truncation=True)\n",
"    labels = tokenizer(\n",
"        targets, max_length=target_max_length, padding=\"max_length\", truncation=True, return_tensors=\"pt\"\n",
"    )\n",
"    labels = labels[\"input_ids\"]\n",
"    labels[labels == tokenizer.pad_token_id] = -100\n",
"    model_inputs[\"labels\"] = labels\n",
"    return model_inputs\n",
"\n",
"\n",
"processed_datasets = dataset.map(\n",
"    preprocess_function,\n",
"    batched=True,\n",
"    num_proc=1,\n",
"    remove_columns=dataset[\"train\"].column_names,\n",
"    load_from_cache_file=True,\n",
"    desc=\"Running tokenizer on dataset\",\n",
")\n",
"\n",
"train_dataset = processed_datasets[\"train\"]\n",
"eval_dataset = processed_datasets[\"train\"]\n",
"test_dataset = processed_datasets[\"test\"]\n",
"\n",
"\n",
"def collate_fn(examples):\n",
"    return tokenizer.pad(examples, padding=\"longest\", return_tensors=\"pt\")\n",
"\n",
"\n",
"train_dataloader = DataLoader(\n",
"    train_dataset, shuffle=True, collate_fn=collate_fn, batch_size=batch_size, pin_memory=True\n",
")\n",
"eval_dataloader = DataLoader(eval_dataset, collate_fn=collate_fn, batch_size=batch_size, pin_memory=True)\n",
"test_dataloader = DataLoader(test_dataset, collate_fn=collate_fn, batch_size=batch_size, pin_memory=True)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "b33be5e6",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"@NYTsupport i have complained a dozen times & yet my papers are still thrown FAR from my door. Why is this so hard to resolve?\n",
"{'input_ids': tensor([[25335,  1499,     3,    10,  3320, 12056,   382, 20390,     3,    23,\n",
"            43, 25932,     3,     9,  9611,   648,     3,   184,  4624,   117,\n",
"           780,    82,  5778,    33,   341,     3, 12618,   377,  4280,    45,\n",
"            82,  1365,     5,  1615,    19,    48,    78,   614,    12,  7785,\n",
"            58, 16229,     3,    10,     3,     1]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
"         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}\n",
"tensor([[    0, 10394,     1]], device='cuda:0')\n",
"['complaint']\n"
]
}
],
"source": [
"model.eval()\n",
"i = 15\n",
"inputs = tokenizer(f'{text_column} : {dataset[\"test\"][i][\"Tweet text\"]} Label : ', return_tensors=\"pt\")\n",
"print(dataset[\"test\"][i][\"Tweet text\"])\n",
"print(inputs)\n",
"\n",
"with torch.no_grad():\n",
"    outputs = model.generate(input_ids=inputs[\"input_ids\"].to(\"cuda\"), max_new_tokens=10)\n",
"    print(outputs)\n",
"    print(tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True))"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "b6d6cd5b",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"  0%|          | 0/7 [00:00<?, ?it/s]You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.\n",
"100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:10<00:00,  1.48s/it]\n"
]
}
],
"source": [
"model.eval()\n",
"eval_preds = []\n",
"for _, batch in enumerate(tqdm(eval_dataloader)):\n",
"    batch = {k: v.to(\"cuda\") for k, v in batch.items() if k != \"labels\"}\n",
"    with torch.no_grad():\n",
"        outputs = model.generate(**batch, max_new_tokens=10)\n",
"    preds = outputs.detach().cpu().numpy()\n",
"    eval_preds.extend(tokenizer.batch_decode(preds, skip_special_tokens=True))"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "61264abe",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"accuracy=100.0\n",
"eval_preds[:10]=['no complaint', 'no complaint', 'complaint', 'complaint', 'no complaint', 'no complaint', 'no complaint', 'complaint', 'complaint', 'no complaint']\n",
"dataset['train'][label_column][:10]=['no complaint', 'no complaint', 'complaint', 'complaint', 'no complaint', 'no complaint', 'no complaint', 'complaint', 'complaint', 'no complaint']\n"
]
}
],
"source": [
"correct = 0\n",
"total = 0\n",
"for pred, true in zip(eval_preds, dataset[\"train\"][label_column]):\n",
"    if pred.strip() == true.strip():\n",
"        correct += 1\n",
"    total += 1\n",
"accuracy = correct / total * 100\n",
"print(f\"{accuracy=}\")\n",
"print(f\"{eval_preds[:10]=}\")\n",
"print(f\"{dataset['train'][label_column][:10]=}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a70802a3",
"metadata": {},
"outputs": [],
"source": [
"model.eval()\n",
"test_preds = []\n",
"\n",
"for _, batch in enumerate(tqdm(test_dataloader)):\n",
"    batch = {k: v for k, v in batch.items() if k != \"labels\"}\n",
"    with torch.no_grad():\n",
"        outputs = model.generate(**batch, max_new_tokens=10)\n",
"    preds = outputs.detach().cpu().numpy()\n",
"    test_preds.extend(tokenizer.batch_decode(preds, skip_special_tokens=True))\n",
"    if len(test_preds) > 100:\n",
"        break\n",
"test_preds"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.5 (v3.10.5:f377153967, Jun  6 2022, 12:36:10) [Clang 13.0.0 (clang-1300.0.29.30)]"
},
"vscode": {
"interpreter": {
"hash": "aee8b7b246df8f9039afb4144a1f6fd8d2ca17a180786b69acc140d282b71a49"
}
}
},
"nbformat": 4,
"nbformat_minor": 5
}