Compare commits

...

155 Commits

Author SHA1 Message Date
c42eb227d9 Prepare 0.15.1 patch (#2459)
This is a patch release containing a fix for #2450,
which might result in a loss of `modules_to_save` when training with
DeepSpeed ZeRO stage 3.
2025-03-27 16:33:26 +01:00
b34d8a2ca1 Release 0.15.0 (#2435)
- bump versions
- remove piece of code required for torch <= 1.12
- Small adjustments to release instructions regarding 
  versions
2025-03-19 15:27:10 +01:00
48e0c5de71 Fix #2422: Modules to save with multiple adapters (#2430)
Using multiple adapters with different `modules_to_save` values leads to a scenario where
it is implicitly assumed that each `ModulesToSaveWrapper` has a module for every loaded adapter.
Since the adapters have different `modules_to_save` values this is not the case and retrieving
the state dict fails with a key lookup error.

In addition to that, after disabling a `ModulesToSaveWrapper`, setting the adapter as active does not
re-enable said adapter.
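
A minimal sketch of the kind of setup that used to fail (the model and module names here are only illustrative, not taken from the issue):

```
import torch.nn as nn
from peft import LoraConfig, get_peft_model

class TinyModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.backbone = nn.Linear(8, 8)
        self.classifier = nn.Linear(8, 2)
        self.score = nn.Linear(8, 1)

    def forward(self, x):
        h = self.backbone(x)
        return self.classifier(h), self.score(h)

# two adapters whose modules_to_save point at *different* extra modules
config_a = LoraConfig(target_modules=["backbone"], modules_to_save=["classifier"])
config_b = LoraConfig(target_modules=["backbone"], modules_to_save=["score"])

model = get_peft_model(TinyModel(), config_a, adapter_name="adapter_a")
model.add_adapter("adapter_b", config_b)

# before this fix, collecting the state dict (e.g. when saving) could fail with a key lookup error
model.save_pretrained("two-adapters")
```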

---------

Co-authored-by: Saeid Ghafouri <s.ghafouri@qub.ac.uk>
Co-authored-by: Benjamin Bossan <BenjaminBossan@users.noreply.github.com>
2025-03-19 10:57:58 +01:00
b2b34fd658 FIX Minimal target module optimization bug w/ IA³ (#2432)
Fixes #2429

During PEFT model initialization, we have an optimization/compression
step where we check the target_modules attribute and, if it's very long,
try to find a minimal subset that targets the same modules. If we find
it, we reduce the target_modules to that minimal set. This is done
mostly to prevent some cases (e.g. in diffusers) that result in hundreds
of target_modules being checked against thousands of module names,
slowing down initialization.

There is an issue with this when using IA³. There, we additionally have
the feedforward_modules attribute, which must be a subset of
target_modules. When target_modules is shrunk, the subset check will
fail. This PR fixes this by simply skipping the compression step for
IA³.

It would be possible to adjust the logic to also shrink
feedforward_modules, but it's not quite as straightforward, since the latter may
not be identical to target_modules, so there would have to be extra
logic to account for that. At the end of the day, this is too much
effort for what's pretty much an edge case, so the simple solution is
implemented.
2025-03-17 16:31:09 +01:00
7320bb94a0 FIX AutoPeftModels never reduce embedding size (#2427)
Resolves #2415

There was a bug in AutoPeftModels where the embedding was always resized
to the vocab size of the tokenizer when the tokenizer was found. This
makes sense if the vocabulary was extended, but some models like Qwen
already start out with "spare" embeddings, i.e. the embedding size is
larger than the vocab size. This could result in the embedding being
shrunk, which in turn resulted in an error when loading the weights.
2025-03-14 14:17:31 +01:00
2f063e6342 ENH: Extend the regex for rank/alpha pattern (#2419)
Supersedes #2382

Right now, the regex used to match the keys passed for rank_pattern and
alpha_pattern requires that either:

1. The module name is identical to the key, or
2. The module name has a prefix and ends with the key.

This is restrictive, since it doesn't allow disambiguating all
cases. E.g. if we have a model with these attributes:

- model.foo
- model.bar.foo

We cannot currently target just model.foo. (We can already target only
model.bar.foo by passing "bar.foo" as a key to the rank_pattern /
alpha_pattern dict).

This PR makes it possible to pass "^foo" as a key. This way,
model.bar.foo is not targeted, since its name does not start with "foo".

As a general rule for users, if they intend to have a full match, they
should pass the full name of the module preceded by a ^. This is the
least ambiguous way.
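
For illustration, a config along the lines of the example above would now look like this (the module names are the hypothetical "foo"/"bar.foo" from the example):

```
from peft import LoraConfig

config = LoraConfig(
    r=8,
    target_modules=["foo"],
    # "^foo" is anchored at the start of the module name, so only "foo" gets
    # rank 16, while "bar.foo" keeps the default r=8
    rank_pattern={"^foo": 16},
)
```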

When running the test case with the old code, all the test cases with ^
will fail, which is fine, since ^ was not working anyway. At the same
time, all test cases not using ^ pass, which means they are backwards
compatible.
2025-03-13 12:53:27 +01:00
37266c1bab FIX Revert optimization for LoRA scaling == 1 (#2416)
The PR #2404 introduced an optimization for LoRA in case that scaling ==
1 (see
https://github.com/huggingface/peft/pull/2404#discussion_r1975145200).
This unfortunately leads to recompilation when the model is compiled, as
witnessed by the failing CI here:

https://github.com/huggingface/peft/actions/runs/13755365121/job/38461837691#step:6:157

For now, let's revert the optimization. If we have concrete numbers that
show that the optimization makes a significant difference, we can
start thinking about how to optimize this code path in a
compile-friendly way.
2025-03-11 17:19:01 +01:00
8edaae9460 TST Add missing .eval() calls to inference tests (#2408) 2025-03-07 16:59:19 +01:00
e1c7e8c8dc FIX Reset the FP32 matmul precision in tests (#2411)
Fixes currently failing hotswap+compile tests that fail because outputs
are not close enough before vs after compilation.

In test_gpu_examples.py, some tests run torchao, which sets the float32
matmul precision to "high". This in turn results in some models
producing different outputs when compiled (but only for some seeds).
Therefore, we need to ensure that the precision is reset to "highest",
which is the default.
2025-03-07 12:45:12 +01:00
24150d0e41 TST Enable BNB tests on XPU (#2396) 2025-03-06 16:18:47 +01:00
461f6426ef Trainable Tokens: Support for Weight Tying (#2399)
This is a follow-up PR of #2376 to add support for weight-tying.

Some models, such as gpt2, tie the weights between the LM head and the input embeddings for various reasons. If we use the trainable tokens adapter, we're changing the result of the forward() of the input embeddings but we do not change the weights (unless we merge()). This means that the changes are not reflected in the tied weights, such as the LM head, leading to wrong results when training.

The current approach is to search for tied layers and put TrainableTokensLayer adapters on them as well, initialized to use the parameters from the embedding layer's TrainableTokensLayer. This is done via the tied_adapter argument of TrainableTokensLayer.__init__().

Notable other changes:

* Implement weight-tying for encoder-decoder models

Notably we are removing the duplication filter of `named_modules` when searching for
the (tied) target modules since tied weights are by definition duplicates.

* Implement embedding name inference

It's now possible to let the adapter decide which is the input embedding layer based on the output
of `model.get_input_embeddings()`. If that fails, the default is still `embed_tokens`.

* Refactor getattr in AuxiliaryTrainingWrapper

Before this change, only the selection of the module that was supposed to have the queried
attribute was given to the wrapper implementation (via `_{has,get}attr_wrapped`). Now the full
`getattr()` call is done by the implementation.

This change is motivated by the need for access to `embedding.weight` at certain times which,
for `ModulesToSaveWrapper` is not a problem - but it is for `TrainableTokensWrapper` since
the original module's weights differ from the current weights, at least potentially.

What we do now is to merge the weights and return those when `embedding.weight` is accessed.
No other attributes are currently forwarded.

* Initialization from buffers was broken since the `persistent` flag was set too late
  (update() is called before setting the flag)

* Update from another BufferDict was broken since it was assumed that BufferDict was
  a mapping collection object. We cannot simply change it to a Mapping since that
  would break PyTorch code which assumes that modules are hashable.

---------

Co-authored-by: Benjamin Bossan <BenjaminBossan@users.noreply.github.com>
2025-03-06 14:09:01 +01:00
0a695d55be Use new slack secret token name (#2409) 2025-03-05 19:37:06 +01:00
dbae76979b ENH Add simple script to estimate train memory (#2378) 2025-03-04 17:16:54 +01:00
d5f5e35cc6 FIX Bug with PeftConfig.from_pretrained (#2397)
In #2038, we added a change to PEFT to make PEFT configs forward
compatible. To recap, when we add a new config value, say foo, for the
LoraConfig, normally users of older PEFT versions would get an error
when trying to load it because LoraConfig would not accept a foo
argument. Now, we remove this unknown arg and just give a warning.

In general, this worked well, but there was a bug when using
PeftConfig.from_pretrained instead of the more specific
LoraConfig.from_pretrained etc. In that case, we would check the known
arguments from the PeftConfig type, which are only a few. This means
that we would ignore parameters like the rank for LoRA.

With this PR, that bug is fixed. As we know the specific PEFT config, we
can use that instead of the PeftConfig super type to determine the
unknown parameters. Therefore, PeftConfig.from_pretrained will work the
same as LoraConfig.from_pretrained.
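
For illustration (the adapter id is a placeholder), both of these should now resolve the full set of LoRA arguments:

```
from peft import LoraConfig, PeftConfig

# "some-user/some-lora-adapter" stands in for any saved LoRA adapter
config_generic = PeftConfig.from_pretrained("some-user/some-lora-adapter")
config_specific = LoraConfig.from_pretrained("some-user/some-lora-adapter")

# with this fix, the generic loader no longer drops LoRA-specific values such as the rank
assert config_generic.r == config_specific.r
```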

Note that when a user uses PeftModel.from_pretrained, under the hood it
will use the more specific PEFT config, i.e. LoraConfig etc. Therefore,
the described bug would not occur there. It is thus very unlikely that
this bug affected many (or any) users in the wild.
2025-03-04 17:16:37 +01:00
1dc1416984 FIX Model with nested all-linear target modules (#2391)
Resolves #2390

There was a bug in PEFT when adding a LoRA adapter with
target_modules='all-linear' (e.g. via add_adapter) to a model that
already had LoRA adapters applied. The resolution of 'all-linear' would
result in, for instance, lora_A and lora_B being targeted, leading to
nested LoRA adapters. With this fix, this is prevented and the correct
layers will be targeted.
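
A sketch of the scenario that used to produce nested adapters (the model id is just an example):

```
from transformers import AutoModelForCausalLM
from peft import LoraConfig, get_peft_model

base_model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m")
model = get_peft_model(base_model, LoraConfig(target_modules=["q_proj"]))

# before this fix, resolving "all-linear" here could also match the lora_A/lora_B
# layers of the first adapter, nesting LoRA inside LoRA
model.add_adapter("second", LoraConfig(target_modules="all-linear"))
```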
2025-03-04 17:16:14 +01:00
8c8bf8f1c8 FIX GPTQModel Lora implementation (#2404)
Requires gptqmodel 2.0+, optimum 1.24.0+
2025-03-04 17:15:56 +01:00
f51203f3e4 Standalone Custom Tokens Tuner and integrated into LoRA (#2376)
This change is based on the nifty addition of @marcusinthesky from #1541.

When adding tokens or fine-tuning the representation of specific tokens we currently have little choice but to retrain the whole embedding matrix which can be huge and adds to the memory footprint (in RAM but also on disk). This method creates a sparse matrix of shape (n, embed_dim) where n is the number of tokens to be customized and only trains these few values.

This change introduces two ways of using it:

```
peft_config = TrainableTokensConfig(target_modules=['embed_tokens'], token_indices=[0, 1, 2])
peft_model = get_peft_model(model, peft_config)
```

and with LoRA

```
peft_config = LoraConfig(
    target_modules='all-linear',
    trainable_token_indices={'embed_tokens': [0, 1, 2]},
)
peft_model = get_peft_model(model, peft_config)
```

Adding this feature to adapters other than LoRA should be relatively easy, mostly adding the `trainable_token_indices` config option and some debugging.

To make this change it was necessary to change the `modules_to_save` infrastructure as combining this feature with LoRA is quite similar. This refactoring entailed moving most of the basic functionality of `ModulesToSave` to the `AuxiliaryTrainingWrapper` class. It also changes the logic of how `modules_to_save` is loaded from and saved to the state dict, so there could still be bugs here.

This implementation does not entail support for weight-tied layers yet. This will follow in a future change.

---

Notable commits in this squash:

* Use unload_and_optionally_merge_module protocol

With `AuxiliaryTrainingWrapper` as abstraction it is probably a good idea to
have support for `unload_and_optionally_merge_module`.

Since the wrapper is more akin to a PEFT layer than a model the name semantics
are fine and it does basically the same job.

* trainable tokens is also trained in certain adapters

Before, the assumption was that modules_to_save was the only thing that
is trained alongside an adapter's parameters. Now there's also the
token_adapter delta tokens via `NewTokensWrapper`.

* Remove old modules_to_save handling

This is now all handled via the `AuxiliaryTrainingWrapper`.

* Fix modules_to_save module overwriting

The state dict implementation of ModulesToSaveWrapper was incorrect in that
it did not include its own parameters, just the parameters it needs to overwrite
in the end. I.e. if layer `lin1` is modules-to-save wrapped,
`lin1.{weight,bias}` is saved and overwritten but `lin1.modules_to_save.<adapter_name>.[...]`
is not saved.

* Introduce a load key map for aux. train wrapper

Before this change it was only possible to remove a key prefix from the wrapper's
state dict (e.g., `modules_to_save.default.weight` -> `weight`); now it is possible
to restore such reduced value by mapping the key back
(i.e., `weight` -> `modules_to_save.default.weight`).

* Replace sparse matrix with dense + index_copy

This change is mostly because sparse matrices are not that beneficial in this case
(at least not from what we can see right now) and they do not solve the problem
of having to change the new tokens in-place to avoid outdated deltas when new token
vectors are initialized randomly after loading the deltas.

* Make peft_config.layers_to_transform optional

Before this change the base tuner class was forcing this attribute
to be present on the config class even though the attribute is not
specified in the base config.

* Implement missing key logic in `_set_trainable`

Before this change it was not checked whether the module targeted by `modules_to_save` or `trainable_token_indices`
exists (when used in conjunction with a PEFT method). Now an error message similar to the `inject_adapter`
error is raised when no module is found.

---------

Co-authored-by: Marcus Gawronsky <marcus.g@myrunway.co.za>
Co-authored-by: Benjamin Bossan <BenjaminBossan@users.noreply.github.com>
2025-02-26 16:51:45 +01:00
3dd26682f4 ENH Make hotswap error on compile optional (#2393)
Users may actually want to call prepare_model_for_compiled_hotswap with
a compiled model, even though that leads to recompilation. To allow
this, we give the option to only warn, or even ignore, this fact when we
detect the model to be compiled. By default, we still error.
2025-02-24 12:06:08 +01:00
bf186edc5b FIX Failing single GPU tests related to hotswap (#2385)
After unblocking single GPU tests with #2380, a couple of tests related
to hotswapping failed. This PR should (hopefully) address this.

1. Wrong error type caught with xfail

I set the wrong error type for the xfailing compiled hotswap diffusers
tests. This was because I hadn't checked out diffusers main when I was
checking locally.

2. Loosen tolerance

Some tests fail because an allclose does not match even though the
numbers in the logs look pretty much identical:

https://github.com/huggingface/peft/actions/runs/13404117333/job/37440752790#step:6:1929

This is most likely a problem with tolerances being too strict.
Unfortunately, I can't reproduce the error locally, so I have to guess
that moving from 1e-5 to 1e-4 will fix the issue.
2025-02-19 16:15:36 +01:00
c118a6e564 SEC Bump transformers version used in examples (#2374)
There were approximately 322 dependabot security advisories for this, so
let's bump the transformers version used in the requirements.txt of a
couple of examples. Note that this is not a real security issue, as that
issue is with another model that's not being used in the examples.
2025-02-19 16:15:14 +01:00
e8babb1063 CI Skip audio test on single GPU CI (#2380)
It appears that the single GPU tests are always failing at this
test ("The operation was canceled"), probably because it is
hanging (after more than 5h). Let's try to debug by skipping this test.

Moreover, remove a superfluous step in the CI workflow.
2025-02-18 17:29:58 +01:00
1793a95310 FIX: Avoid caching in X-LoRA generate (#2384)
X-LoRA tests started failing after this transformers PR:

https://github.com/huggingface/transformers/pull/35724

The solution appears to be to disable caching completely when calling
generate on the X-LoRA model. This also makes some previously xfail-ing
tests pass.

I tested this locally with transformers checked out before and after the
mentioned PR and the tests pass in both circumstances. I also tested
changing the base model from "facebook/opt-125m" to
"trl-internal-testing/tiny-random-LlamaForCausalLM" and the tests passed
with both.

Also, mark X-LoRA save_load_function test as flaky.
It was marked as xfail beforehand, but it is in fact just flaky.
2025-02-18 17:29:40 +01:00
1e2d6b5832 FIX Load checkpoint from custom cache dir (#2373) 2025-02-14 17:52:54 +01:00
94be64dd19 ENH Hotswap preparation raises when no adapter (#2375)
When erroneously calling prepare_model_for_compiled_hotswap before
loading any LoRA adapter, right now, nothing happens. Later, users will
run into an error because the model was not prepared. Therefore, we now
raise an error when the function did not detect any adapter layers and
give an appropriate error message.
2025-02-13 17:21:24 +01:00
6d033600e7 FIX: Small fixes to hotswapping (#2366)
A couple of smaller issues that surfaced when working on the diffusers
integration are now fixed.

- Better detection if model is compiled in
  prepare_model_for_compiled_hotswap
- Fix handling of models that are compiled but where compilation is not
  detected (from "inside" the model)
- Handle device of swapped in adapter weights.
- Wrong adapter name in compiled diffusion model test
- Add hotswap test for different alphas and ranks but model not being
  compiled (linear and conv2d)
- Make _check_hotswap_configs_compatible "public"
- Don't import diffusers in test root
- Add support for compiled Conv2d
2025-02-12 18:20:02 +01:00
363c14e673 ENH DoRA optimization for ConvNd if dropout=0. (#2371) 2025-02-11 15:31:17 +01:00
5e03d058b8 DOC: Explain uninitialized weights warning (#2369)
Users sometimes get confused by the warning from transformers that some
weights are uninitialized and need to be trained when they use models
for classification. A recent example is #2367.

Even though the warning does not come from PEFT, let's add a section to
the docs to explain this warning, as the situation is a bit different
here.
---------

Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com>
2025-02-10 12:00:58 +01:00
40fe166446 DOC Fix links to BOFT in docs (#2365)
Fixes #2364
2025-02-07 11:14:42 +01:00
eaab05e18d Hotswap allow different alpha scalings and ranks (#2177)
Hotswapping of LoRA adapters is already implemented, but when alpha
scalings or ranks differ, this triggers recompilation if the model is
compiled, which is inefficient. Users can now call
prepare_model_for_compiled_hotswap to prevent recompilation in many
cases (see the doc update for caveats).
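
Roughly, the intended flow looks like this (the adapter paths and the target rank are placeholders; see the docs for the exact caveats):

```
import torch
from transformers import AutoModelForCausalLM
from peft import PeftModel
from peft.utils.hotswap import hotswap_adapter, prepare_model_for_compiled_hotswap

base_model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m")
model = PeftModel.from_pretrained(base_model, "path/to/adapter_0")

# pad the LoRA weights/scalings up to the given rank so that later swaps with
# different ranks or alphas don't trigger recompilation
prepare_model_for_compiled_hotswap(model, target_rank=32)
model = torch.compile(model)

# swap in the second adapter without recompiling
hotswap_adapter(model, "path/to/adapter_1", adapter_name="default")
```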
2025-02-05 18:04:06 +01:00
db9dd3f4db ENH Allow disabling input dtype casting for LoRA (#2353)
Provides the disable_input_dtype_casting option to prevent the input dtype from
being cast during the forward call of a PEFT layer.

Normally, the dtype of the weight and input need to match, which is why
the dtype is cast. However, in certain circumstances, this is handled
by forward hooks, e.g. when using layerwise casting in diffusers. In
that case, PEFT casting the dtype interferes with the layerwise casting,
which is why the option to disable it is given.

Right now, this only supports LoRA. LoKr and LoHa don't cast the input
dtype anyway. Therefore, the PEFT methods most relevant for diffusers
are covered.
2025-02-04 17:32:29 +01:00
2825774d2d DOC Rename link to PEFT Quicktour (#2358)
The "Get started" link currently points to the "Quicktour" article,
while "Get started" is also the first title in the TOC, causing
confusion.

Rename the "Get started" link to "Quicktour" to match the article and
ensure consistency.
2025-02-03 17:36:28 +01:00
57126d5bdd DOC Fix links to PEFT guides (#2357) 2025-02-03 12:48:10 +01:00
0facdebf62 Use locked install for zizmor (#2350)
To be on the safe side when installing zizmor and its dependencies
we're using a locked installation, meaning that the dependencies
and their versions are taken from the Cargo.lock file.

This will hopefully reduce the chances of having the pipeline
randomly fail due to updated dependencies down the line.
2025-01-29 15:47:46 +01:00
7af5adec29 TST Use different diffusion model for testing (#2345)
So far, tests are using hf-internal-testing/tiny-stable-diffusion-torch
for testing diffusion models. However, this model has some issues:

- still uses pickle (.bin) instead of safetensors
- there is a FutureWarning because of the config

Now, using hf-internal-testing/tiny-sd-pipe instead which doesn't have
those issues.
2025-01-28 12:31:32 +01:00
6e1a248d50 ENH Improve invalid peft config error message (#2346) 2025-01-28 11:34:14 +01:00
a8e94b69a5 FIX Failing AdaLoRA GPU test (#2349)
PR #2341 added more rigorous checks for AdaLoRA and adjusted the tests
to take that into account. However, one GPU test was missed. This test
is now updated too, fixing the failing nightly CI (I ran it locally on
GPU to verify).

On top of that, I adjusted some numbers on the tests so that each
AdaLoRA phase runs for 2 steps, leading to 6 steps total. This means
that tests run a little bit longer but I think it's acceptable for
better test coverage.
2025-01-27 16:15:57 +01:00
f4176a9e1f ENH Add LoRA implementation for nn.Conv1d (#2333) 2025-01-27 11:48:20 +01:00
53d8115212 DOC Better document init_lora_weights=False option (#2347)
Resolves #2212

The documentation for the LoraConfig option init_lora_weights is
ambiguous. This PR updates the docstring and help to make it clearer
what this option does.

I also changed the order of the options (True -> False -> Gaussian ->
rest, instead of True -> Gaussian -> False -> rest), as that made more
sense to me.

The remaining parts of the docstring were left untouched, except for
changing line breaks (to better adhere to the 120 chars limit) and
adding missing spaces at the end of a few lines.
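
For reference, the option being documented is set like this (the values are just examples):

```
from peft import LoraConfig

# init_lora_weights=False initializes both LoRA matrices randomly, so the adapter
# is *not* a no-op at the start -- mainly useful for debugging and testing
config = LoraConfig(r=8, target_modules=["q_proj", "v_proj"], init_lora_weights=False)
```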
2025-01-27 11:08:56 +01:00
9c25d9411a Documentation & error checking for AdaLoRA timing (#2341)
The documentation about how AdaLoRA works was a bit unclear,
especially the fact that `tfinal` is not a point in time but a duration.

It was also possible to build schedules that never budget and
therefore lead to an exception because the code does not expect
this case (which is OK). We prevent such a scenario now by treating
this configuration as invalid. (Issue #2337)

We also check that `total_step` is not None, since leaving it unset is also a guaranteed error in the code.
2025-01-24 18:54:17 +01:00
6538e56e13 TST: Update torch.compile tests and docs (#2332)
We have tests to check if torch.compile works for various PEFT methods
and "advanced" features (QLoRA, merging, ...). These tests are not run
on a regular basis, but are triggered manually. As such, it was time to
revisit them.

So far, a few of these tests were marked as xfailing. All these tests
are passing now. The reasons for this:

- Presumably: New PyTorch version (I haven't checked older)
- Loosening some tolerances
- Remove a spurious argument added by torch.compile
- Slightly adjust order of when torch.compile is called

The docs have been updated to reflect these new findings.
2025-01-24 15:21:28 +01:00
bbb112841b MNT Update ruff to v0.9.2 (#2343)
We use ruff for linting. The version is pinned because otherwise formatting
changes would creep into random PRs. Thus far, the version
was ~0.6.1 but that's already quite old by now, thus moving to ~v0.9.2.

The ruff changes themselves are all about:

1. Other line breaking logic for asserts with messages
2. More aggressive string normalization

Comment

Making these changes is always a bit annoying since existing PRs might
need to be updated, but there is never a really good time to do it.
2025-01-24 11:28:38 +01:00
6e30991e97 FEAT Add gptqmodel support (#2247)
Add support for gptqmodel quantization. This is a replacement for
auto-gptq.

For now, both packages are supported, but since auto-gptq is no longer
being developed, it will be deprecated and removed at some point in the
future.

---------

Signed-off-by: jiqing-feng <jiqing.feng@intel.com>
Co-authored-by: LRL-ModelCloud <165116337+LRL-ModelCloud@users.noreply.github.com>
Co-authored-by: Qubitium-ModelCloud <qubitium@modelcloud.ai>
Co-authored-by: ZX-ModelCloud <165115237+ZX-ModelCloud@users.noreply.github.com>
Co-authored-by: LRL <lrl@lbx.dev>
Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com>
2025-01-23 14:00:11 +01:00
1b9bcb200b DOC Add entry to solve unknown config argument (#2340)
There have been multiple issues and forum posts in the past asking about
errors like:

TypeError: LoraConfig.__init__() got an unexpected keyword argument ...

This error can occur when the adapter that is being loaded is trained
with a more recent PEFT version than the one currently being used. I
thus added a section to the Troubleshooting part of our docs to describe
the solutions.

Note that we already added changes to PEFT in #2038 to make configs
forward compatible. But since users who encounter this problem have, by
definition, older PEFT versions, they don't benefit from this.

---------

Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com>
2025-01-23 12:41:51 +01:00
ed3c82866a FIX: Avoid needless copy from modules_to_save (#2220)
Resolves #2206

The problem is that we keep a "global" modules_to_save on the model
which contains all possible modules_to_save for each adapter. When the
first adapter targets layer "foo" with modules_to_save and the second
adapter targets "bar", then "foo" will create a copy of the original
module for the second adapter, even though it's not needed.

This does not change the result but is unnecessary and takes up memory.
Thus it should be avoided.
2025-01-23 12:41:29 +01:00
9c11a3e59a Attempt at adding a cache for models (#2327)
This change introduces CI caching for datasets and hub artifacts across runner operating systems with the intended goal to minimize the number of failed test runs because of network faults. As an additional bonus it might make the CI a bit faster.

The following artifacts are cached: ${HF_HOME}/hub/**

Note that we're avoiding .lock files as well as *.pyc files. We're not simply caching $HF_HOME since there is also the datasets and modules where the former was acting up when testing (no details, just dropped, we may explore this later but we're not using that many datasets) and the latter is just code which is probably not a good idea to cache anyway.

There is a post process for the cache action which uploads new data to the cache - only one runner can access the cache for uploading. This is done because github actions is locking cache creation, so if there's a concurrent cache creation, both may fail. This runner is currently set to ubuntu in the python 3.10 run.

If this modification turns out to be ineffective we can move to forbidding access to the hub in general (HF_HUB_OFFLINE=1) and updating the cache once per day but let's first try out if this is already enough to decrease the fail rate.
2025-01-23 10:54:25 +01:00
93d80465a5 First attempt at fixing zizmor warnings (#2338)
Zizmor now supports auditing token permissions for each workflow run and
reports that we almost never remove the default permissions (which seem
relatively permissive). As a precaution it does not hurt to revoke all
token permissions by default and see what breaks on the way.
2025-01-22 16:21:33 +01:00
83028178ec FIX Add missing attributes to MultiheadAttention (#2335)
See initial report here:
https://github.com/huggingface/peft/issues/761#issuecomment-2600936330.

For MHA to work in all circumstances, for instance in eval mode, it
requires us to expose a couple more attributes that we have missed so
far. Those were added now.
2025-01-20 18:28:33 +01:00
da998c8f1e FIX Bug with modules_to_save loading if substring (#2334)
Fixes #2289

This bug was the result of an error in the logic of modifying the
state_dict for modules_to_save in set_peft_model_state_dict. The error
in the logic was that it was checked if an entry from modules_to_save (a
set of strings) is a substring of a key of the state_dict. If it was, a
new name was assigned to that key in the state_dict, which would allow
to load the weight later.

The issue that stems from the substring check occurs if there are
multiple modules_to_save, and one of them has a name that is a substring
of another. So e.g. if one is named "classifier" and the other is named
"classifier2", there could be a false match.

This PR fixes the issue by enclosing the string with ".", i.e. we now
check if ".classifier." is a substring instead, which avoid false
matches.
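
Conceptually, the check changed along these lines (a sketch of the logic, not the exact PEFT code):

```
def matches_module_to_save(key: str, module_name: str) -> bool:
    # enclose both strings in dots so that "classifier" no longer matches
    # keys that belong to "classifier2"
    return f".{module_name}." in f".{key}."

assert matches_module_to_save("base_model.classifier.weight", "classifier")
assert not matches_module_to_save("base_model.classifier2.weight", "classifier")
```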

What made this bug even harder to debug was that modules_to_save is a
set and therefore has no predetermined order. Therefore, the bug would
be flaky. To address this, modules_to_save is now sorted before
iterating over it. That doesn't contribute to resolving the bug, but it
makes the bug non-flaky, allowing future debugging to be easier.
2025-01-20 18:28:15 +01:00
aa3f41f752 FIX: Generating with mixed adapter batches and with beam search enabled (#2287)
See #2283

Right now, using mixed adapter batches with beam search generations does
not work. This is because users need to pass the adapter names
associated with each sample, i.e. the number of adapter names should be
identical to the number of samples in the input.

When applying beam search, transformers internally repeats the samples
once per beam (or so it looks like). Therefore, we have more samples
during generation than samples in the input. Consequently, the adapter
names have to be extended accordingly. This is now taken care of.
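
Conceptually, the expansion now applied to the adapter names looks roughly like this (a sketch, not the exact PEFT code):

```
def expand_adapter_names(adapter_names, num_beams):
    # one adapter name per input sample -> one per (sample, beam) pair, matching
    # how transformers repeats each sample num_beams times during beam search
    return [name for name in adapter_names for _ in range(num_beams)]

assert expand_adapter_names(["adapter_a", "adapter_b"], num_beams=3) == [
    "adapter_a", "adapter_a", "adapter_a",
    "adapter_b", "adapter_b", "adapter_b",
]
```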
2025-01-17 18:17:48 +01:00
f973b28ffa TST Make cuda-only tests device-agnostic (#2323) 2025-01-17 15:09:25 +01:00
9481d2ee80 React on new zizmor version findings (#2331)
Zizmor detected a potential cache poisoning attack via `setup-docker-buildx`.
There is an argument to this (an attacker with a valid github token could
modify the cache, change the buildx binary and tamper with Docker build releases)
but there is also an argument against it: the buildx cache would prevent
general information leaks when a new buildx release is tampered with. Since
there is no obvious benefit from either side, we ignore this hint and deem it
uncritical.

We also change the trigger of zizmor runs to pushes on main, regardless of
whether workflow files are changed or not to catch new audits from more
recent zizmor versions.
2025-01-16 15:08:56 +01:00
63ae263644 FIX: Reduce CorDA memory consumption + docs (#2324) 2025-01-15 12:29:21 +01:00
0ab9711c24 FIX: Reinstate PEFT_TYPE_TO_MODEL_MAPPING variable with deprecation (#2328)
This is for backwards compatibility: In #2282,
PEFT_TYPE_TO_MODEL_MAPPING was removed as it was redundant with
PEFT_TYPE_TO_TUNER_MAPPING. However, third party code could still use
this mapping, e.g.:

6689349625/auto_gptq/utils/peft_utils.py (L8)

Therefore, it is reinstated here, but a DeprecationWarning will be given
if it's used.
2025-01-14 17:48:17 +01:00
3289134524 FIX low_cpu_mem_usage=True with 8bit bitsandbytes (#2325)
There was a bug in PEFT that occurred when trying to use the
low_cpu_mem_usage=True option with 8bit bitsandbytes quantized models.
This bug is fixed now.
2025-01-14 10:45:52 +01:00
1e8bc60492 Refactor: PEFT method registration function (#2282)
Goal

The goal of this refactor is the following: Right now, when a new PEFT
method is added, a new directory is created in src/peft/tuners/<name>
with a config, model, etc. This is fine and self-contained.

However, in addition to that, a couple of other places in the PEFT code
base need to be touched for this new PEFT method to become usable.

As an example, take the recently added Bone method (#2172). Ignoring
tests, docs, and examples, we have the additions to
src/peft/tuners/bone, but also need to:

1. Add an entry to PEFT_TYPE_TO_CONFIG_MAPPING in mapping.py.
2. Add an entry to PEFT_TYPE_TO_TUNER_MAPPING in mapping.py.
3. Add an entry to PEFT_TYPE_TO_MODEL_MAPPING in peft_model.py
4. Add an entry to PEFT_TYPE_TO_PREFIX_MAPPING in utils/constants.py
5. Add some code to get_peft_model_state_dict in utils.save_and_load.py

With the changes in this PR, all these steps can be omitted.

On top of that, we also have the re-imports to peft/__init__.py and
peft/tuners/__init__.py but those are still required (I'm hesitant to
mess with the import system). Furthermore, it's still required to add an
entry to PeftType in utils.peft_types.py. Since this is an enum, it
can't be easily generated automatically. Therefore, adding a new PEFT
method is still not 100% self-contained.

Changes in this PR

With this PR, less book-keeping is required. Instead of the 5 steps
described above, contributors now only need to call

# example for the Bone method

register_peft_method(
    name="bone", config_cls=BoneConfig, model_cls=BoneModel
)

in the __init__.py of their PEFT method. In addition to registering the
method, this also performs a couple of sanity checks (e.g. no duplicate
names, method name and method prefix being identical).

Moreover, since so much bookkeeping is removed, this PR reduces the
number of lines of code overall (at the moment +317, - 343).

Implementation

The real difficulty of this task is that the module structure in PEFT is
really messy, easily resulting in circular imports. This has been an
issue in the past but has been especially painful here. For this reason,
some stuff had to be moved around:

- MODEL_TYPE_TO_PEFT_MODEL_MAPPING is now in auto.py instead of
  mapping.py
- PEFT_TYPE_TO_PREFIX_MAPPING has been moved to mapping.py from
  constants.py
- get_peft_model had to be moved out of mapping.py and is now in its own
  module, func.py (better name suggestions welcome). This should be
  safe, as the function is re-imported to the main PEFT namespace, which
  all examples use.

The PEFT_TYPE_TO_MODEL_MAPPING dict could be completely removed, as it
was basically redundant with PEFT_TYPE_TO_TUNER_MAPPING. The
get_peft_model_state_dict could be simplified, as a lot of code was
almost duplicated.

There were a few instances in peft_model.py like:

        elif config.peft_type == PeftType.P_TUNING:
            prompt_encoder = PromptEncoder(config)

Now, instead of hard-coding the model, I just do model_cls =
PEFT_TYPE_TO_TUNER_MAPPING[config.peft_type].
2025-01-13 15:07:42 +01:00
b345a6e415 FIX Package checks for torchao, EETQ (#2320)
Torchao

Under some unknown circumstances, it can happen that even though
importlib.util.find_spec("torchao") is not None,
importlib_metadata.version("torchao") still fails. This error is now
caught. This error was noticed in the diffusers CI.

EETQ

This is basically a revert of #2226. That PR had to add a check to the
EETQ import as EETQ was broken for some time with latest
transformers (see https://github.com/NetEase-FuXi/EETQ/issues/34 for
context) but that has been fixed.
2025-01-10 16:33:27 +01:00
4cdcaf95fa FIX Adaption prompt error after transformers 35235 (#2314)
The changes in https://github.com/huggingface/transformers/pull/35235
resulted in a couple of adaption prompt tests to fail. This PR fixes
these failures while maintaining compatibility with older transformers
versions.

Required changes:

- hidden_size attribute removed from model, now config.hidden_size
- num_heads attribute removed from model, now config.num_attention_heads
- forward now returns 2 outputs instead of 3, rewritten to be agnostic
  towards the number of outputs
2025-01-10 16:32:51 +01:00
0b0ff9a2e8 FIX Prefix tuning test w/ rotary emb on multi GPU (#2311)
See
https://github.com/huggingface/transformers/pull/35235#issuecomment-2575500996
for context.

There has been a refactor in transformers that resulted in the rotary
embedding of Mistral (and probably others) moving to the model level.
This led to a device map used in one of the tests to being incorrect.
This PR fixes the device map.

Note that this fix doesn't really have anything to do with prefix
tuning, the error occurred even before prefix tuning is used.
2025-01-10 15:27:03 +01:00
af637acc5b DOC In-place modification through get_peft_model (#2313) 2025-01-09 15:05:41 +01:00
8d3039b6cb ENH Add LoRA multihead attention module (#1324)
For now, only works with _qkv_same_embed_dim=True.

---------

Co-authored-by: Wang, Yi <yi.a.wang@intel.com>
Co-authored-by: keakon <keakon@gmail.com>
Co-authored-by: Zach Mueller <muellerzr@gmail.com>
Co-authored-by: Saeid Ghafouri <sdghafouri@gmail.com>
Co-authored-by: Fanli Lin <fanli.lin@intel.com>
Co-authored-by: githubnemo <githubnemo@users.noreply.github.com>
2025-01-08 17:35:43 +01:00
29ba7b85e2 Add zizmor for CI (security) linting (#2288)
To add a bit of a safety net to our CI jobs it might make sense to add a CI security linting tool such as zizmor.

The linting run should be green at the moment since I fixed all reported issues:

- setting persist-credentials: false in all checkout runs
- changing template substitutions to environment variable substitutions

I added an ignore rule for dangerous-triggers to ignore the upload_pr_to_documentation workflow as our actions are configured to only run such steps on approval which should already have seen at least maintainer eyes and the zizmor run.
2025-01-08 17:30:31 +01:00
c207885195 ENH Extend usage for OLoRA finetune script (#2308)
- allow DDP
- make it work on CPU
- set seed and dtype

Related: dequantize_bnb_weight is updated not to move to cuda if not
available.
---------

Signed-off-by: jiqing-feng <jiqing.feng@intel.com>
2025-01-08 17:15:52 +01:00
3d2bf9a8b2 FIX #2295: Warn when user reloads modified model (#2306)
When modifying a model with `get_peft_model` that was already modified
in the same way, even specifying a different config may not change
the trainable parameter count, e.g. when specifying target modules that
are only a subset of the previous target modules.

With this patch a warning will be issued with a hint to `.unload()`
when calling `get_peft_model` on an already modified model.
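
A sketch of the situation that now triggers the warning (the model id is just an example):

```
from transformers import AutoModelForCausalLM
from peft import LoraConfig, get_peft_model

base_model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m")
model = get_peft_model(base_model, LoraConfig(target_modules=["q_proj", "v_proj"]))

# wrapping the already-wrapped model again now warns and points to .unload(),
# since the second config may not change the trainable parameter count
model = get_peft_model(model, LoraConfig(target_modules=["q_proj"]))
```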
2025-01-07 18:10:07 +01:00
d967f6394c FIX Make CorDA example work (#2300) 2025-01-07 16:52:51 +01:00
fdf36d28da DOC FIX Add resize_token_embeddings (#2290) 2025-01-07 12:20:45 +01:00
Nil
ad1ff5c338 DOC Extend prepare_model_for_kbit_training docstring (#2305)
Co-authored-by: NIL <nilbiescas@gmail.com>
2025-01-06 17:06:28 +01:00
f0fd2eabc7 FIX: Typo in lora config.py type annotations (#2297) 2025-01-06 17:04:37 +01:00
6d458b300f FEAT Adding CorDA as an optional initialization method of LoRA (#2231)
Implements the paper "CorDA: Context-Oriented Decomposition Adaptation of Large Language Models for Task-Aware Parameter-Efficient Fine-tuning" (https://arxiv.org/abs/2406.05223)

This initialization method can be used for building task-aware LoRA adapters from weight decomposition oriented by the context of the task using examples from data.

---------

Co-authored-by: 5eqn <491100866@qq.com>
2024-12-19 13:33:37 +01:00
c1fe8105a5 FIX Int8 check for torchao v0.7.0 (#2284)
At one point, we need to perform a check for the quantization dtype.
This used to rely on the layout_tensor attribute, which was renamed to
tensor_impl. The code now checks both attributes.
2024-12-18 12:19:00 +01:00
ae55fdcc5c FIX Adaption prompt: New way to obtain pos emb (#2276)
This PR resolves the failing adaption prompt tests in the CI using
transformers installed from source.

In this transformers PR:

https://github.com/huggingface/transformers/pull/34858/

the module.rotary_emb attribute has been removed, which adaption prompt
so far assumed was present. Instead, the position_embeddings are now
already computed and can be taken directly from the kwargs.
2024-12-13 18:19:08 +01:00
cf0dfe5695 ENH Apply sorting of imports (#2279) 2024-12-13 15:49:40 +01:00
8bafdb1126 MNT apply sorting of exported symbols in __all__ (#2280) 2024-12-13 15:48:41 +01:00
a217507105 ENH FIX Allow "all-linear" to target custom models (#2267)
Description

When the option to specify target_modules="all-linear" was introduced in
PEFT (#1295), the restriction was added to only allow it for instances
of PreTrainedModel. This was because we want to exclude the output layer
from being targeted, even if it is a linear layer, and we can't
determine this layer well except by convention.

This PR lifts the restriction to PreTrainedModels. Thus, users can now
target other models like diffusers models or custom models. The caveat
is to use this "at your own risk", since all linear layers will be
targeted, whether they be output layers or not.

Bugfix

While working on this, I found a potential bug. The logic for updating
target_modules was that only the last part of the linear module's name
was used. So e.g. if the module was named "foo.bar.baz", then "baz" was
added to target_modules. This will lead to problems if there is another
"baz" module that is not a linear layer.

This bug was fixed by adding the full name ("foo.bar.baz" in this
example) to the updated target_modules. This can potentially lead to big
target_modules with a lot of near repetitions, but it's worth it to
avoid targeting the wrong module.
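
In code terms, the difference is roughly the following (a self-contained sketch, not the exact PEFT code):

```
import torch.nn as nn

model = nn.ModuleDict({
    "encoder": nn.ModuleDict({"baz": nn.Linear(4, 4)}),
    "baz": nn.Conv2d(3, 3, 1),  # unrelated module that shares the last name part
})

target_modules = set()
for name, module in model.named_modules():
    if isinstance(module, nn.Linear):
        # old behavior: name.split(".")[-1] -> "baz", which would also match the Conv2d
        # new behavior: keep the full name, so only the linear layer is targeted
        target_modules.add(name)

assert target_modules == {"encoder.baz"}
```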

It is not clear to me why only the last part was added. The PR that
added this to PEFT copied that part from here:

7f4e95a68d/qlora.py (L248)

but it's not clear why that repo did it that way. Maybe it was just to
keep the set size smaller.

The bug was uncovered by the unet test that is already present. Still, I
extended this test, as well as another one, to better cover this
potential issue, by ensuring that the number of target layers is as
expected.

Backwards compatibility

Technically, this change is breaking backwards compatibility. To go back
to the previous example, let's say we have a module that is called
"conv.baz" and that is a Conv2d layer. With the old behavior, since
"baz" is added to the target_modules, we would now also target this
Conv2d layer, which is supported by LoRA. After merging this PR, the
Conv2d layer would no longer be targeted.

I'd argue this is the correct behavior and thus worth changing. Also,
note that since we override target_modules, this is reflected in the
adapter_config.json. Therefore, if a user loads an adapter that had this
"baz" target, it will still work as it did previously.
2024-12-13 11:29:28 +01:00
5cdade973e ENH Warn when adapter name contains prefix (#2254)
Warn when adapter_name contains the tuner_prefix, which can cause
weight reinitialization during model loading.
2024-12-11 15:23:18 +01:00
3c61b3e880 ENH Typing: fix library interface (#2265)
Improve typing (re-export) in __init__.py files.
2024-12-11 15:20:27 +01:00
b516cee509 Bump version to 0.14.1.dev0 (#2263) 2024-12-11 15:19:32 +01:00
ec92cdcc41 FIX: Failing BOFT tests due to device (#2242)
This pull request resolves the above issue regarding BOFT forward/merging with CUDA
by ensuring that all relevant tensors and models are moved to the correct
device. This change is necessary to prevent issues such as zero matrices and
test failures when using CUDA.

Also fixed the fbd_cuda deprecation warning.
2024-12-09 11:56:39 +01:00
de88c70306 Prepare for PEFT release of v0.14.0 (#2258)
- Bump versions
- Remove deprecated convert_pissa_to_lora argument
- Remove a pytest skip for older transformers versions
- Adjust some comments, docstrings
2024-12-06 12:19:42 +01:00
860f7838c8 ENH: Updates for upcoming BNB Int8 release (#2245)
* Updates to prepare for bitsandbytes release
2024-12-05 11:09:56 -05:00
15712db4a0 FIX Prevent CUDA context initialization due to AWQ (#2230)
Importing from AWQ triggers CUDA context initialization, which can be
problematic in some circumstances (see #1877). This PR moves the import
so that it's local, preventing this issue.
2024-12-05 14:00:45 +01:00
f86522e011 FIX Correctly determine word embeddings on Deberta (#2257)
After a recent change in
transformers (https://github.com/huggingface/transformers/pull/22105),
PEFT could no longer determine the word embeddings from Deberta. This PR
provides a very minimal fix that correctly determines the word
embeddings again.

Details

Previously, the word embeddings were determined in the following manner:

1. Find the transformers_backbone by checking the base model's children
for PreTrainedModel instances
2. If not found, the model itself is considered the transformers
backbone.
3. On the backbone, check for modules whose weight has the same size as
the vocab size. This module is now assumed to be the word embeddings.

Before the mentioned transformers PR, 1. did not find anything, so 2.
was applied. After the PR, however, the DebertaEncoder is now an
instance of PreTrainedModel (asked internally, this is intended).
Therefore, the encoder is now considered the transformer backbone. But
the encoder does not have the word embeddings attribute, therefore step
3. fails.

The fix of this PR is to first explicitly check for
model.embeddings.word_embeddings and if this attribute is found, use it
as the word embeddings. Only when it's not found do we use the other
method described above. This way, we can successfully determine the word
embeddings on models like Deberta.
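
A sketch of the lookup order described above (hypothetical helper, not the exact PEFT code):

```
def find_word_embeddings(model, vocab_size):
    # 1. prefer the conventional attribute path if it exists
    embeddings = getattr(getattr(model, "embeddings", None), "word_embeddings", None)
    if embeddings is not None:
        return embeddings
    # 2. otherwise fall back to the old heuristic: a module whose weight has the
    #    same first dimension as the vocab size is assumed to be the word embeddings
    for module in model.modules():
        weight = getattr(module, "weight", None)
        if weight is not None and weight.shape[0] == vocab_size:
            return module
    return None
```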

This whole code is a bit messy and could probably be improved. However,
changing the logic too much could inadvertently break for some existing
models that are not included in the tests. Therefore, I chose this
method which leaves the existing logic mostly intact.
2024-12-04 15:34:45 +01:00
c05758989d FIX Correctly pass low_cpu_mem_usage argument (#2253)
There was a bug that when creating a PEFT model with the task_type
argument, the low_cpu_mem_usage argument was not passed along. This is
now fixed and unit tests for this were added.

This is a very niche bug because there is typically no need to pass
low_cpu_mem_usage=True when calling get_peft_model. Moreover, as the
option for this was only added recently (#2142) and is unreleased, few
if any users should be affected by the bug.
2024-12-03 17:04:09 +01:00
3f9ce553e2 DOC Update CPT docs, add example (#2229)
Update CPT docs and add example notebook.
2024-11-29 12:50:59 +01:00
131efba5d4 FIX TST Small regression in BNB LoRA output (#2238)
Our regression tests reveal that the 8bit LoRA BNB regression test is
failing. To reproduce, run:

pytest tests/regression/test_regression.py -s --regression -k
test_lora_8bit

The regression was introduced in #2122. We didn't notice this earlier
because of other failing tests in the nightly CI.

The cause of the error is subtle. In the original code, we would
calculate the LoRA output, convert the dtype if necessary, then add it
to the base output. After the mentioned PR, we calculate the LoRA
output, add it to the base output, then convert the dtype if necessary.
The difference is very small on a per layer basis, but it can accumulate
over the layers, leading to a significant difference in outputs, as
witnessed by the regression test.

This PR rolls back this specific part of the PR (both for 8bit and 4bit)
while leaving the main change of that PR intact.
2024-11-28 11:25:00 +01:00
943daf1de4 ENH Argument to enable bias for LoRA B (#2237)
This PR adds the argument lora_bias which, if set to True (default:
False), adds a bias term to the LoRA B module.

Typically, this should be disabled. The main use case is when the LoRA
weights were extracted from fully fine-tuned parameters, so the bias of
those parameters can be taken into account.

Merging is supported for this argument when using vanilla LoRA layers or
bitsandbytes LoRA layers. Other types of LoRA layers don't support
merging.

This option is also disabled for non-standard LoRA weight initialization
like LoftQ, as well as for embedding layers (since they use
nn.Parameter).
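
For reference, the new argument is used like this (the values are just examples):

```
from peft import LoraConfig

# lora_bias=True adds a trainable bias term to each LoRA B module; mainly useful
# when the LoRA weights were extracted from fully fine-tuned parameters
config = LoraConfig(r=8, target_modules=["q_proj", "v_proj"], lora_bias=True)
```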
2024-11-27 18:37:10 +01:00
J.L
60978d759b ENH Improvements to Bone method (#2233)
New Bone is more memory efficient and faster, but at the cost of
slightly worse performance. The old Bone implementation can still be
used by passing init_weights="bat" to the config.
2024-11-27 13:00:00 +01:00
34e15be828 Bump version of MacOS from 12 to 13 (#2235)
Version 12 will be deprecated in the coming month and there
are already some problems with it so we might just as well
upgrade.
2024-11-27 12:57:05 +01:00
d13d7a401c TST: Skip test on multi-GPU as DataParallel fails (#2234)
This test fails in multi-GPU setting because transformers.Trainer
switches to DataParallel. As this is not a commonly used parallelization
strategy, it should be okay to just skip this.
2024-11-26 16:40:39 +01:00
ca1b3b1730 TST Update Llava model id in test (#2236)
Currently PEFT tests are failing because 2 trl internal models that we
relied on for testing were moved (one has also been changed). The new
models have now been copied to peft-internal-testing to avoid this in
the future.

I have updated peft-internal-testing/tiny_T5ForSeq2SeqLM-lora to use the
new copied model in peft-internal-testing as the base model (no change
in PEFT code necessary). This model also required the LoRA adapter to be
updated, as the shapes of the base model were changed.

This PR updates the used Llava model id to now use the copy of that
model that is inside of peft-internal-testing.
2024-11-26 15:26:45 +01:00
6a533b783d CI: Fix failing torchao test (#2232)
The test failure is caused by a dangling accelerate environment variable
indirectly set by previous tests through TrainingArguments, which
results in torchao using fp16, which breaks.

The current fix is to delete the env var for torchao. In the future, we
may use an accelerate function to clean up these env vars.

---------

Co-authored-by: githubnemo <githubnemo@users.noreply.github.com>
2024-11-25 16:16:14 +01:00
eaaf03c127 TST: Eva: Speed up consistency tests (#2224)
The EVA consistency tests are currently among the slowest tests overall,
taking roughly 4x 50 sec with an overall test runtime of 15-20 min, so
they make up a significant fraction of that runtime.

With this PR, the number of iterations until convergence is reduced by
passing a lower tau value and by reducing the number of tested seeds.

Overall, this cuts the runtime to ~20 sec or less.

Besides this change, I made some smaller adjustments to EVA:

- break long lines
- hide progress bar in tests
- move an abs call for better consistency in test
2024-11-22 11:39:40 +01:00
029faf6eea ENH: EVA: Deterministic behavior of SVD on multi gpu setups (#2225)
Also: Some improvements to IncrementalPCA.
2024-11-21 16:32:17 +01:00
04437347da ENH Validation for task_type in PEFT config (#2210)
Raises an error when invalid task type is provided.
2024-11-21 16:29:23 +01:00
0155fa814a CI: Skip EETQ tests while broken (#2226)
EETQ tries to import the shard_checkpoint function from transformers but
it has been removed in the latest version. Therefore, trying to use EETQ
currently results in an import error that crashes all the tests. This
fix results in EETQ tests being skipped if there is an import error.

The issue has been reported to EETQ:

https://github.com/NetEase-FuXi/EETQ/issues/34
2024-11-21 14:05:15 +01:00
d9aa0898e4 FIX Correctly set device of input data in bnb test (#2227)
Fixes failing CI bitsandbytes CI tests.

The input data was on CPU but the model is on GPU, resulting in an
error. Note that this mismatch was not an issue previously because
accelerate added an AlignDevicesHook that took care of this. With the
latest accelerate, this is no longer the case, whereas the hook is
present for v1.1.1.

In any case, it's better that we set the device of the data explicitly,
so I think this is a safe change. But it would be good to know what
happened that caused this change.
2024-11-21 10:58:53 +01:00
8874ab5ed8 CI Update AutoAWQ version to fix CI (#2222)
Currently, the CI fails because awq tries to import a non-existing
function from transformers (presumably it was there at one point but no
longer is):

>     from transformers.modeling_utils import shard_checkpoint
> E   ImportError: cannot import name 'shard_checkpoint' from 'transformers.modeling_utils' (/opt/conda/envs/peft/lib/python3.11/site-packages/transformers/modeling_utils.py)

This has been fixed in awq v0.2.7. Therefore, this newer version is now
used in CI.
2024-11-19 16:53:04 +01:00
f8dbeb385a TST Move slow compile tests to nightly CI (#2223)
These tests use torch.compile on semi-realistic models and are thus slow
to execute. In sum, they take ~3 min to finish, with an overall CI
runtime of ~15 min, so it's significant.

As these tests are very unlikely to be affected by most code changes, it
should be fine to move them to nightly CI instead of running them on
each PR. Also, presence of GPUs might speed the tests up.
2024-11-19 16:25:42 +01:00
b297a169ad FIX Checks for loftq_config attribute in LoraConfig (#2215)
Improve logic in LoftQ checks during init.
2024-11-19 13:59:04 +01:00
a1d0fc7e79 FEAT: Add Context-aware Prompt Tuning (#2168)
Adds CPT: "Context-aware Prompt Tuning: Advancing In-Context Learning
with Adversarial Methods" from https://arxiv.org/abs/2410.17222.
2024-11-19 13:57:48 +01:00
3a8afbe2aa [FIX] EVA meta device fix, multi-gpu functionality (#2218)
- important bugfix for meta device check
- add multi gpu functionality and example
- update docs
2024-11-18 16:31:48 +01:00
221965b7e1 FEAT Add EVA initialization method to LoRA
Implements the paper "One Initialization to Rule them All: Fine-tuning
via Explained Variance Adaptation" (https://arxiv.org/abs/2410.07170).

This LoRA initialization results in better initial values for the LoRA
weights and better distribution of LoRA ranks.
2024-11-12 23:24:48 +01:00
162d7e57ee FIX Dataset revision in example (#2207) 2024-11-09 22:15:24 +01:00
b1fd97dc3e FIX Several bugs in LoKr (#2180)
- Added rank_dropout_scale parameter 
- Fix scale related corrections
- Added lycoris weight initialization
2024-11-05 15:31:12 +01:00
J.L
13fb29f0cb FEAT Add Bone method (#2172)
Implements the method: "Block Affine Transformation as Parameter
Efficient Fine-tuning Methods for Large Language Models" described in
https://arxiv.org/abs/2409.15371.
2024-11-05 13:44:42 +01:00
7295b332d9 ENH Add notebook using lm-eval-harness toolkit (#2190) 2024-11-04 15:12:01 +01:00
a4f35971cd FIX Issue with rank_pattern and alpha_pattern (#2195) 2024-11-04 12:58:15 +01:00
4e57aa5b08 FIX Dora finetuning example collate fn (#2197) 2024-11-04 12:57:27 +01:00
b5b902368d FIX Check for prefix tuning + grad checkpointing (#2191)
See #869

Since transformers is moving to the new cache implementation, we had to
change prefix tuning to use this too. However, caching does not work
with gradient checkpointing. Therefore, this currently runs into an
error about size mismatches.

Now, PEFT checks for gradient checkpointing and raises a helpful error.
2024-11-01 10:48:13 +01:00
5cda3a883c FIX: Prefix tuning with model on multiple devices (#2189)
See #2134

After introducing the usage of DynamicCache for prefix tuning, a bug
could now occur if the model is dispatched to different devices. This is
because we need to move the key and value cache for each layer to that
layer's respective device.

The new code mostly consists of code copied from transformers to be
consistent with how transformers solves this.
2024-11-01 10:48:00 +01:00
8eeae0a63f TST: Skip AQLM test that is incompatible with torch 2.5 (#2187)
See: https://github.com/Vahe1994/AQLM/pull/139

It is unclear if/when AQLM will fix this and if there might not be other
issues with torch 2.5.
2024-10-30 14:04:25 +01:00
ff6dd9ed7f ENH: Warn when loading PiSSA/OLoRA together with other adapters (#2186)
Resolves #2184

Since PiSSA/OLoRA modifies the base weights, it should not be combined
with other adapters. We now warn users about that and tell them how to
mitigate this.
2024-10-30 10:16:37 +01:00
214345ee47 ENH Check layers to transforms and layer pattern (#2159) 2024-10-29 15:13:56 +01:00
9c730d7544 DOC: fix broken link in the README of loftq (#2183) 2024-10-29 11:50:28 +01:00
b3176eff49 FIX: Import location of HF hub errors (#2178)
Resolves #2097

Import errors from huggingface_hub.errors

Also set min version to 0.25.0
2024-10-28 11:49:02 +01:00
28a5ba1127 FIX VeRA failure on multiple GPUs (#2163)
The shared buffers vera_A and vera_B could be on the wrong device when
using multiple GPUs, resulting in an error. This PR moves them to
the correct device to fix the error.

Since these buffers are shared, I chose *not* to move the whole buffer
to the device. Instead, when we create the slices from those buffers
during forward, I move the devices only there. This could be inefficient
in terms of runtime, but IIUC, the alternative would be to create new
copies of these buffers per device, using more memory.

The failing tests were introduced in #2076 but the error was already
there beforehand.

I did not discover these failing tests earlier because we had a
concurrent error caused by a transformers issue which looked very
similar and I wrongly assumed that the VeRA error was caused by the same
issue. But now that the issue has been fixed, the error still persists,
prompting me to investigate.
2024-10-25 15:08:17 +02:00
8d545c6c3b DOC: Extend modules_to_save doc with pooler example (#2175)
See #2171

---------

Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com>
2024-10-25 11:09:31 +02:00
004143422f MNT Update docker nvidia base image to 12.4.1 (#2176) 2024-10-24 14:36:50 -04:00
b5db9c9350 MNT: Enable Python 3.12 on CI (#2173)
Python 3.8 was removed recently, adding 3.12 now.
2024-10-24 16:59:47 +02:00
fb6108a78e Fix to prefix tuning to fit transformers (#2096)
See #869, #1962

Fix several issues caused by changes to cache in transformers. In
particular, past_key_values for prefix tuning is now converted to a
transformers Cache instance.

---------

Co-authored-by: Raushan Turganbay <raushan.turganbay@alumni.nu.edu.kz>
2024-10-24 10:10:15 +02:00
cff2a454ad FEAT Add hotswapping functionality (#2120)
The idea of hotswapping an adapter is the following: We can already load
multiple adapters, e.g. two LoRAs, at the same time. But sometimes, we
want to load one LoRA and then replace its weights in-place with the
LoRA weights of another adapter. This is now possible with the
hotswap_adapter function.

In general, this should be faster than deleting one adapter and loading
the adapter in its place, which would be the current way to achieve the
same final outcome. Another advantage of hotswapping is that it prevents
re-compilation in case the PEFT model is already compiled. This can save
quite a lot of time.

There are some caveats for hotswapping:

- It only works for the same PEFT method, so no swapping LoRA and LoHa.
- Right now, only LoRA is properly supported.
- The adapters must be compatible (e.g. same LoRA alpha, same target
  modules).
- To avoid recompilation, ranks must be identical

See also https://github.com/huggingface/diffusers/pull/9453
2024-10-23 13:33:55 +02:00
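As a rough sketch of the hotswapping workflow described above (the import path and argument names are assumptions to be checked against the PEFT docs):

```python
from peft import PeftModel
from peft.utils.hotswap import hotswap_adapter

# base_model is assumed to be an already loaded transformers model; paths are placeholders
model = PeftModel.from_pretrained(base_model, "path/to/lora_one").eval()

# later, replace the LoRA weights in-place with those of a second, compatible adapter;
# this avoids re-compilation if the model has already been compiled
hotswap_adapter(model, "path/to/lora_two", adapter_name="default")
```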
0d5894271b ENH Improve err msg for target modules (#2169) 2024-10-22 17:25:48 +02:00
095e86c036 MNT Remove Python 3.8 since it's end of life (#2135)
The end of life of Python 3.8 has arrived:

https://devguide.python.org/versions/

Therefore, Python 3.8 is removed from CI.

By default, Python 3.11 is now used.

Python 3.12 should be added to the CI matrix now, but that's for a
separate PR.

Also fixed:

The workflow tried to build on top of docker/README.md because globbing
was too broadly defined.

Reduce unnecessary steps to hopefully get disk space usage down, as
GitHub action currently fails with not enough disk space.
2024-10-22 16:43:53 +02:00
7717550a08 FIX fsdp_auto_wrap_policy for some models (#2167)
Some transformers models and custom models would throw an error when
used with PEFT's fsdp_auto_wrap_policy. This is problematic because
Trainer applies the policy automatically when PEFT and FSDP are
detected. Now there is no error.
2024-10-22 11:57:07 +02:00
d5f4e6dfe5 ENH Improve HRA speed and docs (#2160) 2024-10-21 17:12:47 +02:00
e8259ff7bc DOC Improve docs for layers_pattern argument (#2157)
Addresses part of #2155.

Also fix type annotations where appropriate.
2024-10-18 14:23:41 +02:00
57a452ac11 MNT Remove version pin of diffusers (#2162)
The pin was added way back in #936 and then we forgot to ever remove it.

This is now causing trouble, as the old diffusers version still uses
cached_download, which was removed from huggingface_hub:

> ImportError: cannot import name 'cached_download' from 'huggingface_hub'
2024-10-18 12:31:27 +02:00
58a9976284 FIX Missing low_cpu_mem_usage argument (#2156)
The newly introduced low_cpu_mem_usage argument was not propagated to
the add_adapter method of all PeftModel task types. This is now fixed
and tests were added.
2024-10-18 10:43:48 +02:00
338aeff38a ENH Faster DoRA when no dropout or in eval mode (#2122) 2024-10-16 19:18:17 +02:00
62f71e335f FIX: Sft train script FSDP QLoRA embedding mean resizing error (#2151)
Resizing the embedding layer with mean_resizing=True, which has been
introduced in transformers > 4.45, will result in an error. This is
because for FSDP + QLoRA the embedding matrix can be on meta device, in
which case mean resizing fails. Therefore, if these conditions are
detected, the script will set mean_resizing=False.

Also updated the recommended package versions to newer versions that I
have checked to be working.
2024-10-16 17:37:59 +02:00
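A minimal sketch of the workaround applied by the script, assuming transformers' `resize_token_embeddings` with its `mean_resizing` flag (the actual condition in the script may differ):

```python
# skip mean resizing when the embedding may sit on the meta device (FSDP + QLoRA),
# since the mean cannot be computed there
embedding = model.get_input_embeddings()
use_mean = embedding.weight.device.type != "meta"
model.resize_token_embeddings(len(tokenizer), mean_resizing=use_mean)
```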
93ddb1015a FIX Use SFTConfig instead of SFTTrainer keyword args (#2150)
Update training script using trl to fix deprecations in argument usage.
2024-10-15 11:26:42 +02:00
c039b00358 FIX Don't assume past_key_values for encoder models (#2149)
Don't assume that past_key_values is part of the model_kwargs.

This fix is similar to #2140 but for encoder-decoder models. It became
necessary after https://github.com/huggingface/transformers/pull/34048
was merged into transformers.
2024-10-14 12:36:12 +02:00
749b924562 Bump version to 0.13.2.dev0 (#2145)
After the patch release of PEFT v0.13.2, let's bump the dev version of
PEFT to v0.13.3.dev0 so that it stays ahead (the bugfix from the patch
release is already contained in the main branch).
2024-10-12 00:20:05 +05:30
c925d0ae25 FIX Bug in target module optimization if suffix (#2144)
Solves the following bug:

https://github.com/huggingface/diffusers/pull/9622#issuecomment-2404789721

The cause for the bug is as follows: When we have, say, a module called
"bar.0.query" that we want to target and another module called
"foo_bar.0.query" that we don't want to target, there was potential for
an error. This is not caused by _find_minimal_target_modules directly,
but rather the bug was inside of BaseTuner.inject_adapter and how the
names_no_target were chosen. Those used to be chosen based on suffix. In
our example, however, "bar.0.query" is a suffix of "foo_bar.0.query",
therefore "foo_bar.0.query" was *not* added to names_no_target when it
should have. As a consequence, during the optimization, it looks like
"query" is safe to use as target_modules because we don't see that it
wrongly matches "foo_bar.0.query".
2024-10-10 16:43:28 +02:00
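A small, self-contained illustration of the suffix pitfall (not the actual PEFT code):

```python
key = "bar.0.query"
module_names = ["bar.0.query", "foo_bar.0.query"]

# buggy: a plain suffix check also matches "foo_bar.0.query"
matched_buggy = [n for n in module_names if n.endswith(key)]
print(matched_buggy)  # ['bar.0.query', 'foo_bar.0.query']

# fixed: require an exact match or a match on a full dotted component
matched_fixed = [n for n in module_names if n == key or n.endswith("." + key)]
print(matched_fixed)  # ['bar.0.query']
```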
0aa7e3a221 FIX TST NaN issue with HQQ GPU test (#2143)
This test calculates the correlation coefficient of HQQ model outputs.
Although the model outputs are finite, the resulting matrix contains
NaNs. Casting the outputs from 16 to 32 bit precision resolves the
issue.
2024-10-10 14:40:54 +02:00
5758a7eb1c ENH LoRA notebook for NER task (#2126) 2024-10-10 11:04:16 +02:00
1eab9bd10f FIX Prompt learning with latest transformers error (#2140)
The error in PEFT is occurring after this transformers change:

https://github.com/huggingface/transformers/pull/33870

Now, in our tests, some model_kwargs no longer necessarily contain
past_key_values, resulting in a KeyError. We now account for this
possibility. Affected models were opt and gpt2.
2024-10-09 17:21:03 +02:00
8efa0cb735 FIX Raise mixed adapter infer with missing adapter (#2090)
PEFT allows mixed batch adapter inference, i.e. when predicting, the
same batch can use different adapters by passing the adapter_names
argument. However, when users pass an adapter name that does not
correspond to any of the existing adapters, these samples are currently
being ignored (i.e. just the base model output is used). This is
unexpected and can easily lead to errors, e.g. when users mistype the
name of an adapter.

This PR fixes this issue by checking all the existing adapter names
first and comparing them to the adapter_names that the user passed. If
there are unexpected entries, an error is raised.

Due to this fix, an error in the test
test_mixed_adapter_batches_lora_merged_raises was discovered and
promptly fixed.
2024-10-09 15:53:28 +02:00
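A hedged sketch of the mixed-batch API this fix guards (one adapter name per sample in the batch):

```python
# with the fix, a name that does not correspond to a loaded adapter
# (e.g. a typo such as "adaptr_b") raises an error instead of silently
# falling back to the base model output
outputs = peft_model(**inputs, adapter_names=["adapter_a", "adapter_a", "adapter_b"])
```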
85e3202a00 ENH Make PEFT configs forward compatible (#2038)
Right now, loading a PEFT config saved with a more recent PEFT version
than is currently installed will lead to errors when new arguments are
added to the config in the newer PEFT version. The current workaround is
for users to manually edit the adapter_config.json to remove those
entries.

With this PR, PEFT will make an attempt at removing these unknown keys
by inspecting the signature. The user will be warned about these removed
keys. This should generally be a safe measure because we will generally
not introduce new config settings that change the default behavior.
However, if a non-default is used, this could lead to wrong results.
This is mentioned in the warning.

While working on the tests, I also converted the unittest.TestCase to a
normal pytest test in order to be able to use pytest fixtures.

I also plan on adding the PEFT version to the adapter_config.json in the
future. This will allow us to better handle compatibility issues in the
future. As adding that new key to all PEFT configs could cause a lot of
disruption, I want to get this PR in first to ensure forward
compatibility.

Note that this new mechanism will not help anyone using a PEFT version
< 0.14.0, so this will be a slow transition.
2024-10-09 12:37:49 +02:00
3b314cc98b FIX Type annotations in vera/bnb.py (#2139)
The file was missing the from __future__ import annotations part. As
this code is only running nightly with GPU, the normal CI missed this
omission.
2024-10-09 15:46:46 +05:30
a724834ac4 FIX: PiSSA now works with Conv1D layers (#2103) (#2104)
Transpose weight matrix based on fan_in_fan_out condition in PiSSA
initialization.

Co-authored-by: Yang Su <suyang360@gmail.com>
2024-10-08 18:44:22 +02:00
9918977ecf FEAT: Support torchao (#2062)
Supports torch AO quantization. Currently supported:

- int8_weight_only
- int8_dynamic_activation_int8_weight

---------

Co-authored-by: Marc Sun <57196510+SunMarc@users.noreply.github.com>
2024-10-08 18:10:19 +02:00
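A minimal sketch of combining a torchao-quantized base model with a PEFT adapter; the model id is a placeholder and the `TorchAoConfig` usage is an assumption to verify against the transformers docs:

```python
from transformers import AutoModelForCausalLM, TorchAoConfig
from peft import LoraConfig, get_peft_model

quant_config = TorchAoConfig("int8_weight_only")  # one of the supported torchao schemes
base_model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-2-7b-hf",  # placeholder model id
    quantization_config=quant_config,
)
peft_model = get_peft_model(base_model, LoraConfig(target_modules="all-linear"))
```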
5e91b54635 Bump version to 0.13.2.dev0 (#2137) 2024-10-08 16:36:34 +02:00
859fd880e6 FEAT: VeRA quantization using bitsandbytes (#2070) (#2076)
VeRA can now be used with 4bit and 8bit bnb quantization.
2024-10-07 15:00:42 +02:00
e6f927bfec FIX BC breaking change to boft conv2d scaling variable (#2127) 2024-10-07 11:44:38 +02:00
8d9ecbed08 FEAT: Adding exclude modules param (#2044) (#2102)
Allows excluding modules from being targeted.
2024-10-03 13:08:08 +02:00
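A minimal sketch, assuming the new parameter is exposed as `exclude_modules` on `LoraConfig`:

```python
from peft import LoraConfig

# target all linear layers except those matched by exclude_modules
config = LoraConfig(
    target_modules="all-linear",
    exclude_modules=["lm_head"],  # parameter name assumed from the PR description
)
```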
d9d3059e94 ENH: Warn when from_pretrained misses PEFT keys (#2118)
After merging #2084, we now clean up the missing_keys when loading a
PEFT adapter to remove all but the relevant keys (the fact that base
model keys are missing is expected when loading a PEFT adapter).

Since the presence of missing_keys now really means that something might
have gone wrong during loading, we can now warn the user if they call
PeftModel.from_pretrained.

Note that load_adapter still does not warn, as here we return the
load_result and users can already check, but for from_pretrained, they
don't have that possibility.
2024-10-02 18:52:00 +02:00
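For load_adapter, the returned load result can be checked manually, roughly like this (a sketch; attribute names assumed):

```python
load_result = peft_model.load_adapter("path/to/adapter", adapter_name="other")
if load_result.missing_keys:
    print("Missing adapter keys:", load_result.missing_keys)
```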
534d361e7c TST Mark flaky X-LoRA test as xfail (#2114)
Currently, CI is failing constantly because one of the X-LoRA tests has
become flaky lately, most likely caused by the transformers 4.45.0
release. Therefore, this test is now marked as a non-strict xfail.

I cannot reproduce this error locally, neither on CPU nor GPU. It is
thus unclear how to fix this test.
2024-10-02 18:31:01 +02:00
ca8462bb68 FIX low_cpu_mem_usage consolidates devices (#2113)
See: https://github.com/huggingface/diffusers/pull/9510#issuecomment-2378316687

Right now, the low_cpu_mem_usage=True option does not consolidate the
devices. E.g. when the model is on GPU and the state_dict on CPU, the
adapter weight will be on CPU after loading, when it should be on GPU. This
fix ensures that the devices are consolidated.
2024-10-02 17:27:26 +02:00
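A minimal sketch of the option this fix concerns; with the fix, adapter weights loaded from a CPU state_dict end up on the same device as the dispatched base model:

```python
from peft import PeftModel

model = PeftModel.from_pretrained(base_model, "path/to/adapter", low_cpu_mem_usage=True)
```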
ae297f0799 ENH: Improved attribute access for modules_to_save (#2117)
Resolves #2099

So far, if a module was wrapped due to modules_to_save, we handled
access to the weight and bias attribute (albeit incorrectly in case of
disabled adapters!). However, there could be more attributes than those
that could be accessed, in which case we got an error so far.

Instead of special properties, we now implement a generic __getattr__
method that can deal with any attribute. The implementation is a bit
complex to take into account the way that torch.nn.Module handles
__getattr__.
2024-10-02 12:43:05 +02:00
2a807359bd FIX Refactor OFT, small changes to BOFT (#1996)
The previous OFT implementation contained a few errors, which are fixed now.

Unfortunately, this makes previous OFT checkpoints invalid, which is why an
error will be raised. Users are instructed to either retrain the OFT adapter or
switch to an old PEFT version.
2024-10-01 16:51:18 +02:00
aa3bd8fbf6 DOC Update source install instruction (#2110) 2024-09-30 11:03:41 +02:00
c29810bad2 FIX: Change check if past_key_values is empty (#2106)
After transformers merged this PR:

https://github.com/huggingface/transformers/pull/33703

The bool of past_key_values (a Cache instance) would change from False
to True in one of our checks. Use get_seq_length() method instead, which
is consistent before and after that commit.

I checked the tests with the new change for both transformers before and
after that commit and they passed, so this change should be backwards
compatible.

Unrelated change: Mark X-LoRA scaling test as xfail-ing for now.

This should be addressed in a separate PR. Marking it to xfail for now
to get the original fix through CI.
2024-09-27 16:17:39 +02:00
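A small sketch of the changed check, assuming a transformers `DynamicCache` (not the exact PEFT code):

```python
from transformers.cache_utils import DynamicCache

past_key_values = DynamicCache()

# before: relied on bool(past_key_values), whose value changed for Cache instances
# after: use get_seq_length(), which is stable across the transformers change
cache_is_empty = past_key_values is None or past_key_values.get_seq_length() == 0
print(cache_is_empty)  # True for a freshly created cache
```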
ccc350151f FIX Reduce false positive missing keys when loading adapter (#2084)
When loading a PEFT adapter, a lot of missing keys are reported, because the
base model weights are not loaded. However, this is totally fine. Therefore,
those missing keys can be safely ignored.

When using from_pretrained, the missing keys won't be returned to the user,
thus there is no room for confusion. But when using load_adapter, the missing
keys (and unexpected keys) are returned and can cause confusion. With this PR,
the missing keys are filtered to remove keys that are unrelated to the adapter.

A small gap is VB-LoRA which reports missing keys because the vector bank
parameters are actually only loaded once and then shared.
2024-09-25 15:35:16 +02:00
0f9bdad7fa ENH Support Conv3d layer in LoRA and IA3 (#2082) 2024-09-25 15:05:22 +02:00
58ca0ad26f Bump version to 0.13.1.dev0 (#2094) 2024-09-25 18:26:38 +05:30
223 changed files with 27494 additions and 2102 deletions

View File

@ -10,6 +10,8 @@ concurrency:
group: docker-image-builds
cancel-in-progress: false
permissions: {}
env:
CI_SLACK_CHANNEL: ${{ secrets.CI_DOCKER_CHANNEL }}
@ -23,6 +25,8 @@ jobs:
uses: docker/setup-buildx-action@v1
- name: Check out code
uses: actions/checkout@v3
with:
persist-credentials: false
- name: Login to DockerHub
uses: docker/login-action@v2
with:
@ -54,6 +58,8 @@ jobs:
uses: docker/setup-buildx-action@v1
- name: Check out code
uses: actions/checkout@v3
with:
persist-credentials: false
- name: Login to DockerHub
uses: docker/login-action@v1
with:
@ -85,6 +91,8 @@ jobs:
uses: docker/setup-buildx-action@v1
- name: Check out code
uses: actions/checkout@v3
with:
persist-credentials: false
- name: Login to DockerHub
uses: docker/login-action@v1
with:
@ -116,6 +124,8 @@ jobs:
uses: docker/setup-buildx-action@v1
- name: Check out code
uses: actions/checkout@v3
with:
persist-credentials: false
- name: Login to DockerHub
uses: docker/login-action@v1
with:
@ -147,6 +157,8 @@ jobs:
uses: docker/setup-buildx-action@v1
- name: Check out code
uses: actions/checkout@v3
with:
persist-credentials: false
- name: Login to DockerHub
uses: docker/login-action@v1
with:

View File

@ -7,6 +7,8 @@ on:
- doc-builder*
- v*-release
permissions: {}
jobs:
build:
uses: huggingface/doc-builder/.github/workflows/build_main_documentation.yml@main

View File

@ -7,6 +7,8 @@ concurrency:
group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
cancel-in-progress: true
permissions: {}
jobs:
build:
uses: huggingface/doc-builder/.github/workflows/build_pr_documentation.yml@main

View File

@ -7,6 +7,8 @@ on:
description: 'Branch to test on'
required: true
permissions: {}
jobs:
run_transformers_integration_tests:
strategy:
@ -19,6 +21,7 @@ jobs:
with:
ref: ${{ github.event.inputs.branch }}
repository: ${{ github.event.pull_request.head.repo.full_name }}
persist-credentials: false
- name: Set up Python
uses: actions/setup-python@v4
with:
@ -27,8 +30,8 @@ jobs:
cache-dependency-path: "setup.py"
- name: print environment variables
run: |
echo "env.CI_BRANCH = ${{ env.CI_BRANCH }}"
echo "env.CI_SHA = ${{ env.CI_SHA }}"
echo "env.CI_BRANCH = ${CI_BRANCH}"
echo "env.CI_SHA = ${CI_SHA}"
- name: Install dependencies
run: |
python -m pip install --upgrade pip
@ -55,6 +58,7 @@ jobs:
with:
ref: ${{ github.event.inputs.branch }}
repository: ${{ github.event.pull_request.head.repo.full_name }}
persist-credentials: false
- name: Set up Python
uses: actions/setup-python@v4
with:
@ -63,13 +67,13 @@ jobs:
cache-dependency-path: "setup.py"
- name: print environment variables
run: |
echo "env.CI_BRANCH = ${{ env.CI_BRANCH }}"
echo "env.CI_SHA = ${{ env.CI_SHA }}"
echo "env.CI_BRANCH = ${CI_BRANCH}"
echo "env.CI_SHA = ${CI_SHA}"
- name: Install dependencies
run: |
python -m pip install --upgrade pip
python -m pip install .[test]
if [ "${{ matrix.diffusers-version }}" == "main" ]; then
pip install -U git+https://github.com/huggingface/diffusers.git
else

View File

@ -10,8 +10,9 @@ env:
IS_GITHUB_CI: "1"
# To be able to run tests on CUDA 12.2
NVIDIA_DISABLE_REQUIRE: "1"
SLACK_API_TOKEN: ${{ secrets.SLACK_API_TOKEN }}
SLACK_API_TOKEN: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
permissions: {}
jobs:
run_all_tests_single_gpu:
@ -33,6 +34,8 @@ jobs:
shell: bash
steps:
- uses: actions/checkout@v3
with:
persist-credentials: false
- name: Pip install
run: |
source activate peft
@ -156,6 +159,8 @@ jobs:
shell: bash
steps:
- uses: actions/checkout@v3
with:
persist-credentials: false
- name: Pip install
run: |
source activate peft
@ -189,11 +194,6 @@ jobs:
status: ${{ steps.import.outcome }}
slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
- name: Run core GPU tests on multi-gpu
if: always()
run: |
source activate peft
- name: Run examples on multi GPU
id: examples_tests
if: always()

View File

@ -10,8 +10,9 @@ env:
IS_GITHUB_CI: "1"
# To be able to run tests on CUDA 12.2
NVIDIA_DISABLE_REQUIRE: "1"
SLACK_API_TOKEN: ${{ secrets.SLACK_API_TOKEN }}
SLACK_API_TOKEN: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
permissions: {}
jobs:
run_all_tests_single_gpu:
@ -30,6 +31,8 @@ jobs:
shell: bash
steps:
- uses: actions/checkout@v3
with:
persist-credentials: false
- name: Pip install
run: |
source activate peft
@ -78,6 +81,8 @@ jobs:
shell: bash
steps:
- uses: actions/checkout@v3
with:
persist-credentials: false
- name: Pip install
run: |
source activate peft

View File

@ -4,6 +4,8 @@ on:
schedule:
- cron: "0 15 * * *"
permissions: {}
jobs:
close_stale_issues:
name: Close Stale Issues
@ -16,11 +18,13 @@ jobs:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
steps:
- uses: actions/checkout@v3
with:
persist-credentials: false
- name: Setup Python
uses: actions/setup-python@v4
with:
python-version: 3.8
python-version: 3.11
- name: Install requirements
run: |

View File

@ -4,7 +4,10 @@ on:
pull_request:
paths:
# Run only when DockerFile files are modified
- "docker/**"
- "docker/*/Dockerfile"
permissions: {}
jobs:
get_changed_files:
name: "Build all modified docker images"
@ -14,11 +17,13 @@ jobs:
steps:
- name: Check out code
uses: actions/checkout@v3
with:
persist-credentials: false
- name: Get changed files
id: changed-files
uses: tj-actions/changed-files@1c8e6069583811afb28f97afeaf8e7da80c6be5c #v42
with:
files: docker/**
files: docker/*/Dockerfile
json: "true"
- name: Run step if only the files listed above change
if: steps.changed-files.outputs.any_changed == 'true'
@ -26,7 +31,7 @@ jobs:
env:
ALL_CHANGED_FILES: ${{ steps.changed-files.outputs.all_changed_files }}
run: |
echo "matrix=${{ steps.changed-files.outputs.all_changed_files}}" >> $GITHUB_OUTPUT
echo "matrix=${ALL_CHANGED_FILES}" >> $GITHUB_OUTPUT
build_modified_files:
needs: get_changed_files
name: Build Docker images on modified files
@ -51,6 +56,8 @@ jobs:
uses: docker/setup-buildx-action@v1
- name: Check out code
uses: actions/checkout@v3
with:
persist-credentials: false
- name: Build Docker image
uses: docker/build-push-action@v4
with:

View File

@ -6,11 +6,15 @@ on:
paths-ignore:
- 'docs/**'
permissions: {}
jobs:
tests:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
with:
persist-credentials: false
- name: Set up Python 3.11
uses: actions/setup-python@v4
with:

View File

@ -9,15 +9,22 @@ on:
paths-ignore:
- 'docs/**'
env:
HF_HOME: .cache/huggingface
permissions: {}
jobs:
check_code_quality:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
with:
persist-credentials: false
- name: Set up Python
uses: actions/setup-python@v4
with:
python-version: "3.8"
python-version: "3.11"
cache: "pip"
cache-dependency-path: "setup.py"
- name: Install dependencies
@ -34,11 +41,32 @@ jobs:
# TODO: remove 'fail-fast' line once timeout issue from the Hub is solved
fail-fast: false
matrix:
python-version: ["3.8", "3.9", "3.10", "3.11"]
os: ["ubuntu-latest", "macos-12", "windows-latest"]
python-version: ["3.9", "3.10", "3.11", "3.12"]
os: ["ubuntu-latest", "macos-13", "windows-latest"]
runs-on: ${{ matrix.os }}
steps:
- uses: actions/checkout@v3
with:
persist-credentials: false
- name: Model cache
uses: actions/cache/restore@v4
with:
# Avoid caching HF_HOME/modules and Python cache files to prevent interoperability
# issues and potential cache poisoning. We also avoid lock files to prevent runs
# from skipping re-downloads just because they see a lock file.
path: |
${{ env.HF_HOME }}/hub/**
!${{ env.HF_HOME }}/**/*.pyc
key: model-cache-${{ github.run_id }}
restore-keys: model-cache-
enableCrossOsArchive: true
- name: Dump cache content
# TODO: remove this step after 2025-02-15
if: matrix.os != 'windows-latest'
run: |
SHASUM=sha256sum
[ -f "$(which shasum)" ] && SHASUM=shasum
find "${{ env.HF_HOME }}/hub" -type f -exec "$SHASUM" {} \; > cache_content_initial || true
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v4
with:
@ -48,14 +76,40 @@ jobs:
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install setuptools
# cpu version of pytorch
pip install -e .[test]
- name: Downgrade numpy on MacOS and Windows
# TODO: remove numpy downgrade on MacOS & Windows once torch fixes numpy 2.0 issue
shell: bash
if: matrix.os == 'windows-latest' || matrix.os == 'macos-12'
if: matrix.os == 'windows-latest' || matrix.os == 'macos-13'
run: |
pip install --force-reinstall -U "numpy<2.0.0"
- name: Test with pytest
run: |
make test
- name: Dump cache content and diff
# This is just debug info so that we can monitor if the model cache diverges substantially
# over time and what the diverging model is.
# TODO: remove after 2025-02-15
if: matrix.os != 'windows-latest'
run: |
SHASUM=sha256sum
[ -f "$(which shasum)" ] && SHASUM=shasum
find "${{ env.HF_HOME }}/hub" -type f -exec "$SHASUM" {} \; > cache_content_after || true
diff -udp cache_content_initial cache_content_after || true
- name: Delete old model cache entries
run: |
# make sure that cache cleaning doesn't break the pipeline
python scripts/ci_clean_cache.py -d || true
- name: Update model cache
uses: actions/cache/save@v4
# Only let one runner (preferably the one that covers most tests) update the model cache
# after *every* run. This way we make sure that our cache is never outdated and we don't
# have to keep track of hashes.
if: always() && matrix.os == 'ubuntu-latest' && matrix.python-version == '3.10'
with:
path: |
${{ env.HF_HOME }}/hub/**
!${{ env.HF_HOME }}/**/*.pyc
key: model-cache-${{ github.run_id }}

View File

@ -17,6 +17,8 @@ env:
# To be able to run tests on CUDA 12.2
NVIDIA_DISABLE_REQUIRE: "1"
permissions: {}
jobs:
run_tests_with_compile:
runs-on:
@ -25,6 +27,7 @@ jobs:
PEFT_DEBUG_WITH_TORCH_COMPILE: 1
CUDA_VISIBLE_DEVICES: "0"
TEST_TYPE: "single_gpu_huggingface/peft-gpu-bnb-latest:latest"
USE_PYTORCH_NIGHTLY: "${{ github.event.inputs.pytorch_nightly }}"
container:
image: "huggingface/peft-gpu-bnb-latest:latest"
options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
@ -36,13 +39,14 @@ jobs:
with:
ref: ${{ github.event.inputs.branch }}
repository: ${{ github.event.pull_request.head.repo.full_name }}
persist-credentials: false
- name: Pip install
run: |
source activate peft
pip install -e . --no-deps
pip install pytest-cov pytest-reportlog parameterized datasets scipy einops
pip install "pytest>=7.2.0,<8.0.0" # see: https://github.com/huggingface/transformers/blob/ce4fff0be7f6464d713f7ac3e0bbaafbc6959ae5/setup.py#L148C6-L148C26
if [ "${{ github.event.inputs.pytorch_nightly }}" = "true" ]; then
if [ "${USE_PYTORCH_NIGHTLY}" = "true" ]; then
python -m pip install --upgrade --pre torch --index-url https://download.pytorch.org/whl/nightly/cpu
fi
- name: Test compile with pytest

View File

@ -3,6 +3,8 @@ on:
name: Secret Leaks
permissions: {}
jobs:
trufflehog:
runs-on: ubuntu-latest
@ -11,5 +13,6 @@ jobs:
uses: actions/checkout@v4
with:
fetch-depth: 0
persist-credentials: false
- name: Secret Scanning
uses: trufflesecurity/trufflehog@main

View File

@ -6,6 +6,8 @@ on:
types:
- completed
permissions: {}
jobs:
build:
uses: huggingface/doc-builder/.github/workflows/upload_pr_documentation.yml@main
@ -13,4 +15,4 @@ jobs:
package_name: peft
secrets:
hf_token: ${{ secrets.HF_DOC_BUILD_PUSH }}
comment_bot_token: ${{ secrets.COMMENT_BOT_TOKEN }}
comment_bot_token: ${{ secrets.COMMENT_BOT_TOKEN }}

28
.github/workflows/zizmor.yaml vendored Normal file
View File

@ -0,0 +1,28 @@
name: CI security linting
on:
push:
branches: ["main"]
pull_request:
branches: ["*"]
paths:
- '.github/**'
permissions: {}
jobs:
zizmor:
name: zizmor latest via Cargo
runs-on: ubuntu-latest
permissions:
contents: read
security-events: write
steps:
- name: Checkout repository
uses: actions/checkout@v4
with:
persist-credentials: false
- name: Install zizmor
run: cargo install --locked zizmor
- name: Run zizmor
run: zizmor .github/workflows

15
.github/zizmor.yml vendored Normal file
View File

@ -0,0 +1,15 @@
rules:
dangerous-triggers:
ignore:
# this workflow is only triggered after maintainer approval
- upload_pr_documentation.yml:3:1
cache-poisoning:
ignore:
# the docker buildx binary is cached and zizmor warns about a cache poisoning attack.
# OTOH this cache would make us more resilient against an intrusion on docker-buildx' side.
# There is no obvious benefit so we leave it as it is.
- build_docker_images.yml:37:9
- build_docker_images.yml:70:9
- build_docker_images.yml:103:9
- build_docker_images.yml:136:9
- build_docker_images.yml:169:9

View File

@ -1,6 +1,6 @@
repos:
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.6.1
rev: v0.9.2
hooks:
- id: ruff
args:

View File

@ -34,6 +34,7 @@ tests_core_single_gpu:
tests_common_gpu:
python -m pytest tests/test_decoder_models.py $(if $(IS_GITHUB_CI),--report-log "common_decoder.log",)
python -m pytest tests/test_encoder_decoder_models.py $(if $(IS_GITHUB_CI),--report-log "common_encoder_decoder.log",)
python -m pytest tests/test_gptqmodel.py $(if $(IS_GITHUB_CI),--report-log "gptqmodel_gpu.log",)
tests_examples_multi_gpu_bnb:
python -m pytest -m "multi_gpu_tests and bitsandbytes" tests/test_gpu_examples.py $(if $(IS_GITHUB_CI),--report-log "multi_gpu_examples.log",)

View File

@ -1,6 +1,6 @@
# PEFT Docker images
Here we store all PEFT Docker images used in our testing infrastructure. We use python 3.8 for now on all our images.
Here we store all PEFT Docker images used in our testing infrastructure. We use python 3.11 for now on all our images.
- `peft-cpu`: PEFT compiled on CPU with all other HF libraries installed on main branch
- `peft-gpu`: PEFT compiled for NVIDIA GPUs with all other HF libraries installed on main branch

View File

@ -4,7 +4,7 @@
# Use base conda image to reduce time
FROM continuumio/miniconda3:latest AS compile-image
# Specify py version
ENV PYTHON_VERSION=3.8
ENV PYTHON_VERSION=3.11
# Install apt libs - copied from https://github.com/huggingface/accelerate/blob/main/docker/accelerate-gpu/Dockerfile
RUN apt-get update && \
apt-get install -y curl git wget software-properties-common git-lfs && \

View File

@ -4,7 +4,7 @@
# Use base conda image to reduce time
FROM continuumio/miniconda3:latest AS compile-image
# Specify py version
ENV PYTHON_VERSION=3.8
ENV PYTHON_VERSION=3.11
# Install apt libs - copied from https://github.com/huggingface/accelerate/blob/main/docker/accelerate-gpu/Dockerfile
RUN apt-get update && \
apt-get install -y curl git wget software-properties-common git-lfs && \
@ -31,7 +31,7 @@ RUN chsh -s /bin/bash
SHELL ["/bin/bash", "-c"]
# Stage 2
FROM nvidia/cuda:12.1.0-devel-ubuntu22.04 AS build-image
FROM nvidia/cuda:12.4.1-devel-ubuntu22.04 AS build-image
COPY --from=compile-image /opt/conda /opt/conda
ENV PATH /opt/conda/bin:$PATH

View File

@ -4,7 +4,7 @@
# Use base conda image to reduce time
FROM continuumio/miniconda3:latest AS compile-image
# Specify py version
ENV PYTHON_VERSION=3.8
ENV PYTHON_VERSION=3.11
# Install apt libs - copied from https://github.com/huggingface/accelerate/blob/main/docker/accelerate-gpu/Dockerfile
RUN apt-get update && \
apt-get install -y curl git wget software-properties-common git-lfs && \
@ -31,7 +31,7 @@ RUN chsh -s /bin/bash
SHELL ["/bin/bash", "-c"]
# Stage 2
FROM nvidia/cuda:12.1.0-devel-ubuntu22.04 AS build-image
FROM nvidia/cuda:12.4.1-devel-ubuntu22.04 AS build-image
COPY --from=compile-image /opt/conda /opt/conda
ENV PATH /opt/conda/bin:$PATH

View File

@ -4,7 +4,7 @@
# Use base conda image to reduce time
FROM continuumio/miniconda3:latest AS compile-image
# Specify py version
ENV PYTHON_VERSION=3.8
ENV PYTHON_VERSION=3.11
# Install apt libs - copied from https://github.com/huggingface/accelerate/blob/main/docker/accelerate-gpu/Dockerfile
RUN apt-get update && \
apt-get install -y curl git wget software-properties-common git-lfs && \
@ -31,7 +31,7 @@ RUN chsh -s /bin/bash
SHELL ["/bin/bash", "-c"]
# Stage 2
FROM nvidia/cuda:12.1.0-devel-ubuntu22.04 AS build-image
FROM nvidia/cuda:12.4.1-devel-ubuntu22.04 AS build-image
COPY --from=compile-image /opt/conda /opt/conda
ENV PATH /opt/conda/bin:$PATH

View File

@ -4,23 +4,18 @@
# Use base conda image to reduce time
FROM continuumio/miniconda3:latest AS compile-image
# Specify py version
ENV PYTHON_VERSION=3.8
ENV PYTHON_VERSION=3.11
# Install apt libs - copied from https://github.com/huggingface/accelerate/blob/main/docker/accelerate-gpu/Dockerfile
# Install audio-related libraries
RUN apt-get update && \
apt-get install -y curl git wget software-properties-common git-lfs && \
apt-get install -y curl git wget software-properties-common git-lfs ffmpeg libsndfile1-dev && \
apt-get clean && \
rm -rf /var/lib/apt/lists*
# Install audio-related libraries
RUN apt-get update && \
apt install -y ffmpeg
RUN apt install -y libsndfile1-dev
RUN git lfs install
# Create our conda env - copied from https://github.com/huggingface/accelerate/blob/main/docker/accelerate-gpu/Dockerfile
RUN conda create --name peft python=${PYTHON_VERSION} ipython jupyter pip
RUN python3 -m pip install --no-cache-dir --upgrade pip
# Below is copied from https://github.com/huggingface/accelerate/blob/main/docker/accelerate-gpu/Dockerfile
# We don't install pytorch here yet since CUDA isn't available
@ -31,29 +26,24 @@ RUN chsh -s /bin/bash
SHELL ["/bin/bash", "-c"]
# Stage 2
FROM nvidia/cuda:12.2.2-devel-ubuntu22.04 AS build-image
FROM nvidia/cuda:12.4.1-devel-ubuntu22.04 AS build-image
COPY --from=compile-image /opt/conda /opt/conda
ENV PATH /opt/conda/bin:$PATH
RUN chsh -s /bin/bash
SHELL ["/bin/bash", "-c"]
RUN source activate peft && \
python3 -m pip install --no-cache-dir bitsandbytes optimum auto-gptq
# Add autoawq for quantization testing
RUN source activate peft && \
python3 -m pip install --no-cache-dir https://github.com/casper-hansen/AutoAWQ/releases/download/v0.2.4/autoawq-0.2.4-cp38-cp38-linux_x86_64.whl
RUN source activate peft && \
python3 -m pip install --no-cache-dir https://github.com/casper-hansen/AutoAWQ_kernels/releases/download/v0.0.6/autoawq_kernels-0.0.6-cp38-cp38-linux_x86_64.whl
# Install apt libs
RUN apt-get update && \
apt-get install -y curl git wget && \
apt-get clean && \
rm -rf /var/lib/apt/lists*
# Add eetq for quantization testing
RUN source activate peft && \
RUN chsh -s /bin/bash
SHELL ["/bin/bash", "-c"]
RUN source activate peft && \
python3 -m pip install --no-cache-dir bitsandbytes optimum auto-gptq && \
# Add autoawq for quantization testing
python3 -m pip install --no-cache-dir https://github.com/casper-hansen/AutoAWQ/releases/download/v0.2.7.post2/autoawq-0.2.7.post2-py3-none-any.whl && \
python3 -m pip install --no-cache-dir https://github.com/casper-hansen/AutoAWQ_kernels/releases/download/v0.0.9/autoawq_kernels-0.0.9-cp311-cp311-linux_x86_64.whl && \
# Add eetq for quantization testing
python3 -m pip install git+https://github.com/NetEase-FuXi/EETQ.git
# Activate the conda env and install transformers + accelerate from source
@ -62,19 +52,16 @@ RUN source activate peft && \
librosa \
"soundfile>=0.12.1" \
scipy \
torchao \
git+https://github.com/huggingface/transformers \
git+https://github.com/huggingface/accelerate \
peft[test]@git+https://github.com/huggingface/peft
peft[test]@git+https://github.com/huggingface/peft \
# Add aqlm for quantization testing
aqlm[gpu]>=1.0.2 \
# Add HQQ for quantization testing
hqq
# Add aqlm for quantization testing
RUN source activate peft && \
pip install aqlm[gpu]>=1.0.2
# Add HQQ for quantization testing
RUN source activate peft && \
pip install hqq
RUN source activate peft && \
pip freeze | grep transformers
RUN echo "source activate peft" >> ~/.profile

View File

@ -116,6 +116,14 @@
title: FourierFT
- local: package_reference/vblora
title: VB-LoRA
- local: package_reference/hra
title: HRA
- local: package_reference/cpt
title: CPT
- local: package_reference/bone
title: Bone
- local: package_reference/trainable_tokens
title: Trainable Tokens
title: Adapters
- sections:
@ -123,5 +131,7 @@
title: Model merge
- local: package_reference/helpers
title: Helpers
- local: package_reference/hotswap
title: Hotswapping adapters
title: Utilities
title: API reference

View File

@ -128,7 +128,7 @@ Notice that we are using LoRA with rank=8, alpha=16 and targeting all linear la
Let's dive a little deeper into the script so you can see what's going on, and understand how it works.
The first thing to know is that the script uses DeepSpeed for distributed training as the DeepSpeed config has been passed. The `SFTTrainer` class handles all the heavy lifting of creating the PEFT model using the peft config that is passed. After that, when you call `trainer.train()`, `SFTTrainer` internally uses 🤗 Accelerate to prepare the model, optimizer and trainer using the DeepSpeed config to create DeepSpeed engine which is then trained. The main code snippet is below:
The first thing to know is that the script uses DeepSpeed for distributed training as the DeepSpeed config has been passed. The [`~trl.SFTTrainer`] class handles all the heavy lifting of creating the PEFT model using the peft config that is passed. After that, when you call `trainer.train()`, [`~trl.SFTTrainer`] internally uses 🤗 Accelerate to prepare the model, optimizer and trainer using the DeepSpeed config to create DeepSpeed engine which is then trained. The main code snippet is below:
```python
# trainer
@ -139,13 +139,6 @@ trainer = SFTTrainer(
train_dataset=train_dataset,
eval_dataset=eval_dataset,
peft_config=peft_config,
packing=data_args.packing,
dataset_kwargs={
"append_concat_token": data_args.append_concat_token,
"add_special_tokens": data_args.add_special_tokens,
},
dataset_text_field=data_args.dataset_text_field,
max_seq_length=data_args.max_seq_length,
)
trainer.accelerator.print(f"{trainer.model}")
@ -175,7 +168,7 @@ You can also refer this blog post [Falcon 180B Finetuning using 🤗 PEFT and De
# Use PEFT QLoRA and DeepSpeed with ZeRO3 for finetuning large models on multiple GPUs
In this section, we will look at how to use QLoRA and DeepSpeed Stage-3 for finetuning 70B llama model on 2X40GB GPUs.
For this, we first need `bitsandbytes>=0.43.0`, `accelerate>=0.28.0`, `transformers>4.38.2`, `trl>0.7.11` and `peft>0.9.0`. We need to set `zero3_init_flag` to true when using Accelerate config. Below is the config which can be found at [deepspeed_config_z3_qlora.yaml](https://github.com/huggingface/peft/blob/main/examples/sft/configs/deepspeed_config_z3_qlora.yaml):
For this, we first need `bitsandbytes>=0.43.3`, `accelerate>=1.0.1`, `transformers>4.44.2`, `trl>0.11.4` and `peft>0.13.0`. We need to set `zero3_init_flag` to true when using Accelerate config. Below is the config which can be found at [deepspeed_config_z3_qlora.yaml](https://github.com/huggingface/peft/blob/main/examples/sft/configs/deepspeed_config_z3_qlora.yaml):
```yml
compute_environment: LOCAL_MACHINE

View File

@ -108,7 +108,7 @@ Notice that we are using LoRA with rank=8, alpha=16 and targeting all linear la
Let's dive a little deeper into the script so you can see what's going on, and understand how it works.
The first thing to know is that the script uses FSDP for distributed training as the FSDP config has been passed. The `SFTTrainer` class handles all the heavy lifting of creating PEFT model using the peft config that is passed. After that when you call `trainer.train()`, Trainer internally uses 🤗 Accelerate to prepare model, optimizer and trainer using the FSDP config to create FSDP wrapped model which is then trained. The main code snippet is below:
The first thing to know is that the script uses FSDP for distributed training as the FSDP config has been passed. The [`~trl.SFTTrainer`] class handles all the heavy lifting of creating PEFT model using the peft config that is passed. After that when you call `trainer.train()`, Trainer internally uses 🤗 Accelerate to prepare model, optimizer and trainer using the FSDP config to create FSDP wrapped model which is then trained. The main code snippet is below:
```python
# trainer
@ -119,13 +119,6 @@ trainer = SFTTrainer(
train_dataset=train_dataset,
eval_dataset=eval_dataset,
peft_config=peft_config,
packing=data_args.packing,
dataset_kwargs={
"append_concat_token": data_args.append_concat_token,
"add_special_tokens": data_args.add_special_tokens,
},
dataset_text_field=data_args.dataset_text_field,
max_seq_length=data_args.max_seq_length,
)
trainer.accelerator.print(f"{trainer.model}")
if model_args.use_peft_lora:
@ -173,7 +166,7 @@ In the above example, the memory consumed per GPU is 72-80 GB (90-98%) as seen
In this section, we will look at how to use QLoRA and FSDP for finetuning 70B llama model on 2X24GB GPUs. [Answer.AI](https://www.answer.ai/) in collaboration with bitsandbytes and Hugging Face 🤗 open sourced code enabling the usage of FSDP+QLoRA and explained the whole process in their insightful blogpost [You can now train a 70b language model at home](https://www.answer.ai/posts/2024-03-06-fsdp-qlora.html). This is now integrated in Hugging Face ecosystem.
For this, we first need `bitsandbytes>=0.43.0`, `accelerate>=0.28.0`, `transformers>4.38.2`, `trl>0.7.11` and `peft>0.9.0`. We need to set `fsdp_cpu_ram_efficient_loading=true`, `fsdp_use_orig_params=false` and `fsdp_offload_params=true`(cpu offloading) when using Accelerate config. When not using accelerate launcher, you can alternately set the environment variable `export FSDP_CPU_RAM_EFFICIENT_LOADING=true`. Here, we will be using accelerate config and below is the config which can be found at [fsdp_config_qlora.yaml](https://github.com/huggingface/peft/blob/main/examples/sft/configs/fsdp_config_qlora.yaml):
For this, we first need `bitsandbytes>=0.43.3`, `accelerate>=1.0.1`, `transformers>4.44.2`, `trl>0.11.4` and `peft>0.13.0`. We need to set `fsdp_cpu_ram_efficient_loading=true`, `fsdp_use_orig_params=false` and `fsdp_offload_params=true`(cpu offloading) when using Accelerate config. When not using accelerate launcher, you can alternately set the environment variable `export FSDP_CPU_RAM_EFFICIENT_LOADING=true`. Here, we will be using accelerate config and below is the config which can be found at [fsdp_config_qlora.yaml](https://github.com/huggingface/peft/blob/main/examples/sft/configs/fsdp_config_qlora.yaml):
```yml
compute_environment: LOCAL_MACHINE

View File

@ -93,6 +93,8 @@ OFT preserves the hyperspherical energy by learning an orthogonal transformation
[AdaLoRA](https://hf.co/papers/2303.10512) manages the parameter budget introduced from LoRA by allocating more parameters - in other words, a higher rank `r` - for important weight matrices that are better adapted for a task and pruning less important ones. The rank is controlled by a method similar to singular value decomposition (SVD). The ∆W is parameterized with two orthogonal matrices and a diagonal matrix which contains singular values. This parametrization method avoids iteratively applying SVD which is computationally expensive. Based on this method, the rank of ∆W is adjusted according to an importance score. ∆W is divided into triplets and each triplet is scored according to its contribution to model performance. Triplets with low importance scores are pruned and triplets with high importance scores are kept for finetuning.
Training with AdaLoRA has three phases: the init phase, the budgeting phase and the final phase. In the initial phase, no budgeting is applied, therefore the ranks are not touched. During the budgeting phase the process described above is applied and the rank is redistributed according to a budget, aiming to give more important adapters more rank and less important layers less. When reaching the final phase, budgeting has ended, the ranks are redistributed but we may continue training for a while with the redistributed ranks to further improve performance.
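As a rough sketch of how the three phases map onto the configuration (argument names and values are illustrative and should be checked against the [`AdaLoraConfig`] reference):

```python
from peft import AdaLoraConfig, get_peft_model

config = AdaLoraConfig(
    init_r=12,         # starting rank before budgeting
    target_r=4,        # average rank targeted by the budgeting phase
    tinit=200,         # steps of the init phase (ranks untouched)
    tfinal=500,        # steps of the final phase (ranks fixed, training continues)
    total_step=10000,  # total number of training steps
    target_modules=["q_proj", "v_proj"],
)
model = get_peft_model(base_model, config)
```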
## Llama-Adapter
[Llama-Adapter](https://hf.co/papers/2303.16199) is a method for adapting Llama into an instruction-following model. To help adapt the model for instruction-following, the adapter is trained with a 52K instruction-output dataset.
@ -105,3 +107,27 @@ A set of learnable adaption prompts are prefixed to the input instruction tok
<small><a href="https://hf.co/papers/2303.16199">LLaMA-Adapter: Efficient Fine-tuning of Language Models with Zero-init Attention</a></small>
To avoid adding noise to the tokens, the adapter uses zero-initialized attention. On top of this, the adapter adds a learnable gating factor (initialized with zeros) to progressively add information to the model during training. This prevents overwhelming the model's pretrained knowledge with the newly learned instructions.
## Householder Reflection Adaptation (HRA)
[HRA](https://huggingface.co/papers/2405.17484) provides a new perspective connecting LoRA to OFT, which means it can harness the advantages of both strategies, reduce parameters and computation costs while penalizing the loss of pre-training knowledge.
<div class="flex justify-center">
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/peft/hra.png"/>
</div>
<small><a href="https://huggingface.co/papers/2405.17484">Bridging The Gap between Low-rank and Orthogonal Adaptation via Householder Reflection Adaptation</a></small>
HRA constructs a chain of `r` trainable Householder reflections (HRs). Because the Householder reflection matrix is an orthogonal matrix and the product of orthogonal matrices is also an orthogonal matrix, HRA satisfies the theoretical guarantee of Orthogonal Finetuning (OFT). Meanwhile, HRA can also be viewed as a low-rank fine-tuning adapter by rewriting the formula.
The higher `r`, the more trainable parameters, resulting in a larger model capacity and better performance. Besides, due to the chain structure, the orthogonality of HR planes impacts the capacity and regularity of HRA. To achieve a trade-off between the model capacity and regularity, an orthogonality regularizer of the HR planes is added to the loss function. The weight \\(\lambda\\) can control the strength of the regularizer.
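A minimal configuration sketch, assuming the [`HRAConfig`] API where `r` sets the number of chained Householder reflections:

```python
from peft import HRAConfig, get_peft_model

config = HRAConfig(r=8, target_modules=["q_proj", "v_proj"])
model = get_peft_model(base_model, config)
```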
## Bone
[Bone](https://huggingface.co/papers/2409.15371) is a novel PEFT technique distinct from LoRA, called Block-Affine Adaptation (Bone). By dividing the original weights into multiple subspaces that share a single matrix for weight updates, Bone simplifies the process by requiring the trainable matrix to be initialized to zero, eliminating the need for complex initialization as in some LoRA variants. Compared to LoRA, Bone significantly reduces memory usage and achieves faster computation.
<small><a href="https://huggingface.co/papers/2409.15371">Bone: Block-Affine Adaptation of Large Language Models</a></small>
Intuitively, the shape of a single trainable matrix in Bone is consistent with `lora_B`, so the `r` parameter in Bone is less than the `r` in LoRA by (`in_feature * r`).
Bat is a new structure composed of Bone and "Weight Guide." You can use it by setting `init_weights="bat"`. Bat reduces the number of trainable parameters by using a block grouping method, and the block computation of weight W effectively promotes information exchange in the original weights, enhancing data fitting capability during fine-tuning. The experiment mentions controlling the size of trainable parameters through b (block size), similar to r (rank) in LoRA. For consistency within PEFT, we also name b as r. Note: Bat's r (b) is special and requires that weight W satisfies the conditions `in_features % r == 0` and `out_features % r == 0`. Additionally, when `in_features == out_features` and Bone-r equals LoRA-r, Bone's number of trainable parameters is only half that of LoRA.
Bat currently has some issues: it is slower than LoRA and requires checkpointing to address excessive memory usage due to intermediate values, which further reduces training speed. We plan to address this in the future. Contributions are welcome.
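A minimal configuration sketch, assuming the [`BoneConfig`] API (argument names should be checked against the Bone reference):

```python
from peft import BoneConfig, get_peft_model

# plain Bone
config = BoneConfig(r=64, target_modules=["q_proj", "v_proj"])

# Bat variant ("Weight Guide"); remember that in_features % r == 0 and
# out_features % r == 0 must hold for the targeted layers
bat_config = BoneConfig(r=64, target_modules=["q_proj", "v_proj"], init_weights="bat")

model = get_peft_model(base_model, config)
```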

View File

@ -79,8 +79,8 @@ specify either `boft_block_size` or `boft_block_num`, but not both simultaneousl
For an example of the BOFT method application to various downstream tasks, please refer to the following guides:
Take a look at the following step-by-step guides on how to finetune a model with BOFT:
- [Dreambooth finetuning with BOFT](../task_guides/boft_dreambooth)
- [Controllable generation finetuning with BOFT (ControlNet)](../task_guides/boft_controlnet)
- [Dreambooth finetuning with BOFT](https://github.com/huggingface/peft/blob/main/examples/boft_dreambooth/boft_dreambooth.md)
- [Controllable generation finetuning with BOFT (ControlNet)](https://github.com/huggingface/peft/blob/main/examples/boft_controlnet/boft_controlnet.md)
For the task of image classification, one can initialize the BOFT config for a DinoV2 model as follows:

View File

@ -75,3 +75,19 @@ Take a look at [P-tuning for sequence classification](../task_guides/ptuning-seq
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/peft/mpt-decomposition.png"/>
</div>
<small><a href="https://hf.co/papers/2103.10385">Prompt decomposition</a>.</small>
## Context-Aware Prompt Tuning (CPT)
<div class="flex justify-center">
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/peft/cpt.png"/>
</div>
<small>CPT optimizing only specific token embeddings while keeping the rest of the model frozen <a href="https://huggingface.co/papers/2410.17222">(image source)</a>.</small>
[Context-Aware Prompt Tuning (CPT)](https://huggingface.co/papers/2410.17222) is designed to enhance few-shot classification by refining only context embeddings.
This approach combines ideas from In-Context Learning (ICL), Prompt Tuning (PT), and adversarial optimization, focusing on making model adaptation both parameter-efficient and effective.
In CPT, only specific context token embeddings are optimized, while the rest of the model remains frozen.
To prevent overfitting and maintain stability, CPT uses controlled perturbations to limit the allowed changes to context embeddings within a defined range.
Additionally, to address the phenomenon of recency bias—where examples near the end of the context tend to be prioritized over earlier ones—CPT applies a decay loss factor.
Take a look at [Example](https://github.com/huggingface/peft/blob/main/examples/cpt_finetuning/README.md) for a step-by-step guide on how to train a model with CPT.

View File

@ -41,7 +41,7 @@ config = LoraConfig(init_lora_weights=False, ...)
```
### PiSSA
[PiSSA](https://arxiv.org/abs/2404.02948) initializes the LoRA adapter using the principal singular values and singular vectors. This straightforward modification allows PiSSA to converge more rapidly than LoRA and ultimately attain superior performance. Moreover, PiSSA reduces the quantization error compared to QLoRA, leading to further enhancements.
[PiSSA](https://arxiv.org/abs/2404.02948) initializes the LoRA adapter using the principal singular values and singular vectors. This straightforward modification allows PiSSA to converge more rapidly than LoRA and ultimately attain superior performance. Moreover, PiSSA reduces the quantization error compared to QLoRA, leading to further enhancements.
Configure the initialization method to "pissa", which may take several minutes to execute SVD on the pre-trained model:
```python
@ -50,9 +50,40 @@ config = LoraConfig(init_lora_weights="pissa", ...)
```
Alternatively, execute fast SVD, which takes only a few seconds. The number of iterations determines the trade-off between the error and computation time:
```python
lora_config = LoraConfig(init_lora_weights="pissa_niter_[number of iters]", ...)
lora_config = LoraConfig(init_lora_weights="pissa_niter_[number of iters]", ...)
```
For detailed instruction on using PiSSA, please follow [these instructions](https://github.com/fxmeng/peft/tree/main/examples/pissa_finetuning).
For detailed instruction on using PiSSA, please follow [these instructions](https://github.com/huggingface/peft/tree/main/examples/pissa_finetuning).
### CorDA
[CorDA](https://arxiv.org/pdf/2406.05223) builds task-aware LoRA adapters from weight decomposition oriented by the context of downstream task to learn (instruction-previewed mode, IPM) or world knowledge to maintain (knowledge-preserved mode, KPM).
The KPM not only achieves better performance than LoRA on fine-tuning tasks, but also mitigates the catastrophic forgetting of pre-trained world knowledge.
When preserving pre-trained knowledge is not a concern,
the IPM is favored because it can further accelerate convergence and enhance the fine-tuning performance.
You need to configure the initialization method to "corda", and specify the mode of IPM or KPM and the dataset to collect covariance matrices.
```py
@torch.no_grad()
def run_model():
# Assume `model` and `dataset` is in context...
model.eval()
for batch in dataset:
model(**batch)
corda_config = CordaConfig(
corda_method="kpm",
)
lora_config = LoraConfig(
init_lora_weights="corda",
corda_config=corda_config,
)
preprocess_corda(model, lora_config, run_model=run_model)
peft_model = get_peft_model(model, lora_config)
```
For detailed instruction on using CorDA, please follow [these instructions](https://github.com/huggingface/peft/tree/main/examples/corda_finetuning).
### OLoRA
[OLoRA](https://arxiv.org/abs/2406.01775) utilizes QR decomposition to initialize the LoRA adapters. OLoRA translates the base weights of the model by a factor of their QR decompositions, i.e., it mutates the weights before performing any training on them. This approach significantly improves stability, accelerates convergence speed, and ultimately achieves superior performance.
@ -63,6 +94,37 @@ from peft import LoraConfig
config = LoraConfig(init_lora_weights="olora", ...)
```
For more advanced usage, please refer to our [documentation](https://github.com/huggingface/peft/tree/main/examples/olora_finetuning).
### EVA
[EVA](https://arxiv.org/pdf/2410.07170) performs SVD on the input activations of each layer and uses the right-singular vectors to initialize LoRA weights. It is therefore a data-driven initialization scheme. Furthermore EVA adaptively allocates ranks across layers based on their "explained variance ratio" - a metric derived from the SVD analysis.
You can use EVA by setting `init_lora_weights="eva"` and defining [`EvaConfig`] in [`LoraConfig`]:
```python
from peft import LoraConfig, EvaConfig
peft_config = LoraConfig(
init_lora_weights = "eva",
eva_config = EvaConfig(rho = 2.0),
...
)
```
The parameter `rho` (≥ 1.0) determines how much redistribution is allowed. When `rho=1.0` and `r=16`, LoRA adapters are limited to exactly 16 ranks, preventing any redistribution from occurring. A recommended value for EVA with redistribution is 2.0, meaning the maximum rank allowed for a layer is 2r.
It is recommended to perform EVA initialization on a GPU as it is much faster. To optimize the amount of available memory for EVA, you can use the `low_cpu_mem_usage` flag in [`get_peft_model`]:
```python
peft_model = get_peft_model(model, peft_config, low_cpu_mem_usage=True)
```
Then, call [`initialize_lora_eva_weights`] to initialize the EVA weights (in most cases the dataloader used for eva initialization can be the same as the one used for finetuning):
```python
initialize_lora_eva_weights(peft_model, dataloader)
```
EVA works out of the box with bitsandbytes. Simply initialize the model with `quantization_config` and call [`initialize_lora_eva_weights`] as usual.
<Tip>
For further instructions on using EVA, please refer to our [documentation](https://github.com/huggingface/peft/tree/main/examples/eva_finetuning).
</Tip>
### LoftQ
#### Standard approach
@ -138,10 +200,22 @@ from peft import PeftModel
model = PeftModel.from_pretrained(base_model, peft_model_id, ephemeral_gpu_offload=True)
```
DoRA is optimized (it computes faster and takes less memory) for models in evaluation mode, or when dropout is set to 0. We reuse the base result in those cases to get the speedup.
Running [dora finetuning](https://github.com/huggingface/peft/blob/main/examples/dora_finetuning/dora_finetuning.py)
with `CUDA_VISIBLE_DEVICES=0 time python examples/dora_finetuning/dora_finetuning.py --quantize --lora_dropout 0 --batch_size 16 --eval_step 2 --use_dora`
on a 4090 with gradient accumulation set to 2 and max steps set to 20 resulted in the following observations:
| | Without Optimization | With Optimization |
| :--: | :--: | :--: |
| train_runtime | 359.7298 | **279.2676** |
| train_samples_per_second | 1.779 | **2.292** |
| train_steps_per_second | 0.056 | **0.072** |
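To benefit from the optimized path during training, a configuration along these lines can be used (a sketch; dropout must be 0, or the model must be in eval mode):

```python
from peft import LoraConfig

config = LoraConfig(use_dora=True, lora_dropout=0.0, target_modules=["q_proj", "v_proj"])
```

At inference time, calling `model.eval()` likewise lets DoRA reuse the cached base result.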
#### Caveats
- DoRA only supports linear and Conv2d layers at the moment.
- DoRA introduces a bigger overhead than pure LoRA, so it is recommended to merge weights for inference, see [`LoraModel.merge_and_unload`].
- DoRA only supports embedding, linear, and Conv2d layers at the moment.
- DoRA introduces a bigger overhead than pure LoRA, so it is recommended to merge weights for inference, see [`LoraModel.merge_and_unload`].
- DoRA should work with weights quantized with bitsandbytes ("QDoRA"). However, issues have been reported when using QDoRA with DeepSpeed Zero2.
### QLoRA-style training
@ -165,6 +239,36 @@ Assuming the original model had 5 layers `[0, 1, 2 ,3, 4]`, this would create a
[Fewshot-Metamath-OrcaVicuna-Mistral-10B](https://huggingface.co/abacusai/Fewshot-Metamath-OrcaVicuna-Mistral-10B) is an example of a model trained using this method on Mistral-7B expanded to 10B. The
[adapter_config.json](https://huggingface.co/abacusai/Fewshot-Metamath-OrcaVicuna-Mistral-10B/blob/main/adapter_config.json) shows a sample LoRA adapter config applying this method for fine-tuning.
### Fine grained control over ranks and alpha (scaling)
By default, all layers targeted with LoRA will have the same rank `r` and the same `lora_alpha` (which determines the LoRA scaling), depending on what was specified in the [`LoraConfig`]. In some cases, however, you may want to indicate different values for different layers. This is possible by passing the `rank_pattern` and `alpha_pattern` arguments to [`LoraConfig`]. These arguments should be dictionaries with the key being the layer name and the value being the rank/alpha value. The keys can be [regular expressions](https://docs.python.org/3/library/re.html) (regex). All LoRA layers that are not explicitly mentioned in `rank_pattern` and `alpha_pattern` will take the default `r` and `lora_alpha` values.
To give an example, let's assume that we have a model with the following structure:
```python
>>> print(model)
Outer(
(foo): Linear(...)
(module): Middle(
(foo): Linear(...)
(foobar): Linear(...)
(module): Inner(
(foo): Linear(...)
(barfoo): Linear(...)
)
)
)
```
- `rank_pattern={"foo": 42}` will match all 3 `foo` layers. Neither `foobar` nor `barfoo` are matched.
- `rank_pattern={"^foo": 42}` will only match the `foo` layer of the model, but neither `module.foo` nor `module.module.foo`. This is because the `^` means "start of string" when using regular expressions, and only `foo` starts with `"foo"`, the other layer names have prefixes.
- `rank_pattern={"^module.foo": 42}` matches only `module.foo`, but not `module.module.foo`, for the same reason.
- `rank_pattern={"module.foo": 42}` matches both `module.foo` and `module.module.foo`, but not `foo`.
- `rank_pattern={"^foo": 42, "^module.module.foo": 55}` matches `foo` and `module.module.foo`, respectively, but not `module.foo`.
- There is no need to indicate `$` to mark the end of the match, as this is added automatically by PEFT.
The same logic applies to `alpha_pattern`. If you're in doubt, don't try to get fancy with regular expressions -- just pass the full name for each module with a different rank/alpha, preceded by the `^` prefix, and you should be good.
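Putting this together, a config for the example model above might look like the following sketch (the rank and alpha values are arbitrary):
```python
from peft import LoraConfig

config = LoraConfig(
    r=8,            # default rank for all other targeted layers
    lora_alpha=16,  # default alpha for all other targeted layers
    target_modules=["foo", "foobar", "barfoo"],
    rank_pattern={"^foo": 42, "^module.module.foo": 55},
    alpha_pattern={"^foo": 84},
)
```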
## Optimizers
LoRA training can optionally include special purpose optimizers. Currently the only such optimizer is LoRA+.
@ -198,6 +302,52 @@ trainer = Trainer(
)
```
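As a minimal sketch of how such an optimizer is typically created and passed to the [`~transformers.Trainer`], assuming `peft_model`, `training_args`, and `train_dataset` are already defined (the hyperparameters are illustrative):
```python
import torch
from transformers import Trainer
from peft.optimizers import create_loraplus_optimizer

optimizer = create_loraplus_optimizer(
    model=peft_model,
    optimizer_cls=torch.optim.AdamW,
    lr=5e-5,
    loraplus_lr_ratio=16,  # ratio between the learning rates of the B and A matrices
)

# pass the optimizer explicitly; Trainer will create a default scheduler
trainer = Trainer(
    model=peft_model,
    args=training_args,
    train_dataset=train_dataset,
    optimizers=(optimizer, None),
)
```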
## Efficiently train tokens alongside LoRA
Sometimes it is necessary to not only change some layer's weights but to add new tokens as well. With larger models this can be a memory-costly endeavour. PEFT LoRA adapters support the `trainable_token_indices` parameter which allows tuning of other tokens alongside fine-tuning of specific layers with LoRA. This method only trains the tokens you specify and leaves all other tokens untouched. This saves memory and doesn't throw away learned context of existing token embeddings in contrast to when training the whole embedding matrix. Under the hood this method uses the layer of [`TrainableTokensModel`].
```py
# for layer 'embed_tokens'
config = LoraConfig(trainable_token_indices=[idx_1, idx_2, ...], ...)
# for a specific embedding layer, pass its module name as the dict key (here a layer called 'emb_tokens')
config = LoraConfig(trainable_token_indices={'emb_tokens': [idx_1, idx_2, ...]}, ...)
```
In the snippet below we show how to add new tokens to the model and how to train it alongside the other layers in the model.
```py
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import get_peft_model, LoraConfig
base_model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1")
tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1")
# we define our new tokens and add them to the tokenizer as special tokens
special_tokens = ['<|start_think|>', '<|stop_think|>']
tokenizer.add_special_tokens({'additional_special_tokens': special_tokens})
# make room for new tokens in the embedding matrix if it isn't big enough already
base_model.resize_token_embeddings(max(len(tokenizer), base_model.model.embed_tokens.num_embeddings))
# typical LoRA config with `trainable_token_indices` targeting embedding layer `embed_tokens`
# and specifically our new tokens we just added
lora_config = LoraConfig(
target_modules='all-linear',
trainable_token_indices={'embed_tokens': tokenizer.convert_tokens_to_ids(special_tokens)},
)
peft_model = get_peft_model(base_model, lora_config)
# proceed to train the model like normal
[...]
```
The token weights are part of your adapter state dict and saved alongside the LoRA weights.
If we had used full fine-tuning with `modules_to_save=['embed_tokens']`, the full embedding matrix would have been stored in the checkpoint, leading to a much bigger file.
To give a bit of an indication how much VRAM can be saved, a rudimentary comparison of the above example was made between training the embedding matrix fully (`modules_to_save=["embed_tokens"]`), using a LoRA for the embedding matrix (`target_modules=[..., "embed_tokens"]`, rank 32) and trainable tokens (`trainable_token_indices=[...]`, 6 tokens). Trainable tokens used about as much VRAM (15,562MB vs. 15,581MB) as LoRA while being specific to the tokens and saved ~1GB of VRAM over fully training the embedding matrix.
## Merge LoRA weights into the base model
While LoRA is significantly smaller and faster to train, you may encounter latency issues during inference due to separately loading the base model and the LoRA adapter. To eliminate latency, use the [`~LoraModel.merge_and_unload`] function to merge the adapter weights with the base model. This allows you to use the newly merged model as a standalone model. The [`~LoraModel.merge_and_unload`] function doesn't keep the adapter weights in memory.
@ -249,7 +399,7 @@ base_model = AutoModelForCausalLM.from_pretrained(
)
```
Then we load the first adapter:
```python
peft_model_id = "alignment-handbook/zephyr-7b-sft-lora"
@ -369,7 +519,7 @@ output = peft_model.generate(**inputs, adapter_names=adapter_names, max_new_toke
Note that the order does not matter here, i.e. the samples in the batch don't need to be grouped by adapter as in the example above. We just need to ensure that the `adapter_names` argument is aligned correctly with the samples.
Additionally, the same approach also works with the `modules_to_save` feature, which allows for saving and reusing specific neural network layers, such as custom heads for classification tasks, across different LoRA adapters.
### Caveats

View File

@ -50,6 +50,9 @@ config = PeftConfig.from_pretrained("smangrul/tinyllama_lora_norobots")
model = AutoModelForCausalLM.from_pretrained(config.base_model_name_or_path, load_in_4bit=True, device_map="auto").eval()
tokenizer = AutoTokenizer.from_pretrained("smangrul/tinyllama_lora_norobots")
model.config.vocab_size = 32005
model.resize_token_embeddings(32005)
model = PeftModel.from_pretrained(model, "smangrul/tinyllama_lora_norobots", adapter_name="norobots")
_ = model.load_adapter("smangrul/tinyllama_lora_sql", adapter_name="sql")
_ = model.load_adapter("smangrul/tinyllama_lora_adcopy", adapter_name="adcopy")

View File

@ -107,6 +107,32 @@ QLoRA adds trainable weights to all the linear layers in the transformer archite
config = LoraConfig(target_modules="all-linear", ...)
```
## GPTQ quantization
You can learn more about GPTQ-based `[2, 3, 4, 8]`-bit quantization at [GPTQModel](https://github.com/ModelCloud/GPTQModel) and in the Transformers [GPTQ](https://huggingface.co/docs/transformers/quantization/gptq) doc. For post-quantization training, PEFT can use both the [GPTQModel](https://github.com/ModelCloud/GPTQModel) and [AutoGPTQ](https://github.com/autogptq/autogptq) libraries, but we recommend GPTQModel because AutoGPTQ will be deprecated in a future release.
```bash
# gptqmodel install
pip install gptqmodel --no-build-isolation
```
```py
from transformers import AutoModelForCausalLM, AutoTokenizer, GPTQConfig
model_id = "facebook/opt-125m"
tokenizer = AutoTokenizer.from_pretrained(model_id)
gptq_config = GPTQConfig(bits=4, group_size=128, dataset="wikitext2", tokenizer=tokenizer)
quantized_model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", quantization_config=gptq_config)
# save quantized model
quantized_model.save_pretrained("./opt-125m-gptq")
tokenizer.save_pretrained("./opt-125m-gptq")
```
Once quantized, you can post-train GPTQ models with PEFT APIs.
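For instance, a minimal sketch of attaching a LoRA adapter to the quantized OPT model from above (the target modules and hyperparameters are illustrative):
```python
from peft import LoraConfig, get_peft_model

peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],  # attention projections of the OPT architecture
)
peft_model = get_peft_model(quantized_model, peft_config)
peft_model.print_trainable_parameters()
```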
## AQLM quantization
Additive Quantization of Language Models ([AQLM](https://arxiv.org/abs/2401.06118)) is a Large Language Models compression method. It quantizes multiple weights together and takes advantage of interdependencies between them. AQLM represents groups of 8-16 weights as a sum of multiple vector codes. This allows it to compress models down to as low as 2-bit with considerably low accuracy losses.
@ -187,9 +213,41 @@ peft_config = LoraConfig(...)
quantized_model = get_peft_model(quantized_model, peft_config)
```
## torchao (PyTorch Architecture Optimization)
PEFT supports models quantized with [torchao](https://github.com/pytorch/ao) ("ao") for int8 quantization.
```python
from peft import LoraConfig, get_peft_model
from transformers import AutoModelForCausalLM, TorchAoConfig
model_id = ...
quantization_config = TorchAoConfig(quant_type="int8_weight_only")
base_model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=quantization_config)
peft_config = LoraConfig(...)
model = get_peft_model(base_model, peft_config)
```
### Caveats
- Use the most recent versions of torchao (>= v0.4.0) and transformers (> 4.42).
- Only linear layers are currently supported.
- `quant_type = "int4_weight_only"` is currently not supported.
- `NF4` is not implemented in transformers as of yet and is thus also not supported.
- DoRA only works with `quant_type = "int8_weight_only"` at the moment.
- There is explicit support for torchao when used with LoRA. However, when torchao quantizes a layer, its class does not change, only the type of the underlying tensor. For this reason, PEFT methods other than LoRA will generally also work with torchao, even if not explicitly supported. Be aware, however, that **merging only works correctly with LoRA and with `quant_type = "int8_weight_only"`**. If you use a different PEFT method or dtype, merging will likely result in an error, and even if it doesn't, the results will still be incorrect.
## Other Supported PEFT Methods
Besides LoRA, the following PEFT methods also support quantization:
- **VeRA** (supports bitsandbytes quantization)
- **AdaLoRA** (supports both bitsandbytes and GPTQ quantization)
- **(IA)³** (supports bitsandbytes quantization)
## Next steps
If you're interested in learning more about quantization, the following may be helpful:
* Learn more details about QLoRA and check out some benchmarks on its impact in the [Making LLMs even more accessible with bitsandbytes, 4-bit quantization and QLoRA](https://huggingface.co/blog/4bit-transformers-bitsandbytes) blog post.
* Read more about different quantization schemes in the Transformers [Quantization](https://hf.co/docs/transformers/main/quantization) guide.

View File

@ -18,7 +18,7 @@ rendered properly in your Markdown viewer.
In PEFT, [torch.compile](https://pytorch.org/tutorials/intermediate/torch_compile_tutorial.html) works for some but not all features. The reason why it won't always work is because PEFT is highly dynamic in certain places (loading and switching between multiple adapters, for instance), which can cause trouble for `torch.compile`. In other places, `torch.compile` may work, but won't be as fast as expected because of graph breaks.
If you don't see an error, it doesn't necessarily mean that `torch.compile` worked correctly. It might give you an output, but the output is incorrect. This guide describes what works with `torch.compile` and what doesn't. For your own testing, we recommend using the latest PyTorch version, as `torch.compile` is constantly being improved.
> [!TIP]
> Unless indicated otherwise, the default `torch.compile` settings were used.
@ -36,20 +36,18 @@ The following adapters were tested successfully:
- AdaLoRA
- BOFT
- Bone
- IA³
- Layer Norm Tuning
- LoHa
- LoKr
- LoRA
- LoRA + DoRA
- LoRA applied to embedding layers
- OFT
- VeRA
- HRA
## Advanced PEFT features with `torch.compile`
Below are some of the more advanced PEFT features that **work**. They were all tested with LoRA.
@ -57,17 +55,14 @@ Below are some of the more advanced PEFT features that **work**. They were all t
- `modules_to_save` (i.e. `config = LoraConfig(..., modules_to_save=...)`)
- Merging adapters (one or multiple)
- Merging multiple adapters into one adapter (i.e. calling `model.add_weighted_adapter(...)`)
- Using PEFT adapters with quantization (bitsandbytes)
- Disabling adapters (i.e. using `with model.disable_adapter()`)
- Unloading (i.e. calling `model.merge_and_unload()`)
- Mixed adapter batches (i.e. calling `model(batch, adapter_names=["__base__", "default", "other", ...])`)
- Inference with multiple adapters (i.e. using `model.add_adapter` or `model.load_adapter` to load more than 1 adapter); for this, only call `torch.compile` _after_ loading all adapters
Generally, we can expect that if a feature works correctly with LoRA and is also supported by other adapter types, it should also work for that adapter type.
## Test cases
All the use cases listed above are tested inside of [`peft/tests/test_torch_compile.py`](https://github.com/huggingface/peft/blob/main/tests/test_torch_compile.py). If you want to check in more detail how we tested a certain feature, please go to that file and check the test that corresponds to your use case.

View File

@ -118,6 +118,12 @@ You should probably TRAIN this model on a down-stream task to be able to use it
The mentioned layers should be added to `modules_to_save` in the config to avoid the described problem.
<Tip>
As an example, when loading a model that is using the DeBERTa architecture for sequence classification, you'll see a warning that the following weights are newly initialized: `['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']`. From this, it follows that the `classifier` and `pooler` layers should be added to: `modules_to_save=["classifier", "pooler"]`.
</Tip>
### Extending the vocabulary
For many language fine-tuning tasks, extending the model's vocabulary is necessary since new tokens are being introduced. This requires extending the embedding layer to account for the new tokens and also storing the embedding layer in addition to the adapter weights when saving the adapter.
@ -142,6 +148,34 @@ For inference, load the base model first and resize it the same way you did befo
For a complete example, please check out [this notebook](https://github.com/huggingface/peft/blob/main/examples/causal_language_modeling/peft_lora_clm_with_additional_tokens.ipynb).
### Getting a warning about "weights not being initialized from the model checkpoint"
When you load your PEFT model which has been trained on a task (for example, classification), you may get a warning like:
> Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']. You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Although this looks scary, it is most likely nothing to worry about. This warning comes from Transformers, and it isn't a PEFT-specific warning. It lets you know that a randomly initialized classification head (`score`) is attached to the base model, and the head must be trained to produce sensible predictions.
When you get this warning _before_ training the model, PEFT automatically takes care of making the classification head trainable if you correctly passed the `task_type` argument to the PEFT config.
```python
from peft import LoraConfig, TaskType
lora_config = LoraConfig(..., task_type=TaskType.SEQ_CLS)
```
If your classification head does not follow the usual naming conventions from Transformers (which is rare), you have to explicitly tell PEFT the name of the head in `modules_to_save`.
```python
lora_config = LoraConfig(..., modules_to_save=["name-of-classification-head"])
```
To check the name of the classification head, print the model and it should be the last module.
If you get this warning from your inference code, i.e. _after_ training the model, then remember that when you load the PEFT model, you always have to load the Transformers model first. Since Transformers does not know that you will load PEFT weights afterwards, it still gives the warning.
As always, it is best practice to ensure the model works correctly for inference by running some validation on it.
### Check layer and model status
Sometimes a PEFT model can end up in a bad state, especially when handling multiple adapters. There can be some confusion around what adapters exist, which one is active, which one is merged, etc. To help investigate this issue, call the [`~peft.PeftModel.get_layer_status`] and the [`~peft.PeftModel.get_model_status`] methods.
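As a minimal sketch, assuming `peft_model` is an existing PEFT model that may hold several adapters:
```python
layer_status = peft_model.get_layer_status()   # one entry per adapted layer
model_status = peft_model.get_model_status()   # aggregated view over the whole model

print(model_status)      # e.g. which adapters are available, active, or merged
print(layer_status[0])   # per-layer details for the first adapted layer
```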
@ -284,3 +318,31 @@ config = LoraConfig(
```
Depending on the type of model you use, the batch norm layers could have different names than `"normalization"`, so please ensure that the name matches your model architecture.
## Version mismatch
### Error while loading the config because of an unexpected keyword argument
When you encounter an error like the one shown below, it means the adapter you're trying to load was trained with a more recent version of PEFT than the version you have installed on your system.
```
TypeError: LoraConfig.__init__() got an unexpected keyword argument <argument-name>
```
The best way to resolve this issue is to install the latest PEFT version:
```sh
python -m pip install -U peft
```
If the adapter was trained from a source install of PEFT (an unreleased version of PEFT), then you also need to install PEFT from source.
```sh
python -m pip install -U git+https://github.com/huggingface/peft.git
```
If it is not possible for you to upgrade PEFT, there is a workaround you can try.
Assume the error message says that the unknown keyword argument is named `foobar`. Search inside the `adapter_config.json` of this PEFT adapter for the `foobar` entry and delete it from the file. Then save the file and try loading the model again.
This solution works most of the time. As long as `foobar` was left at its default value when the adapter was trained, it can safely be ignored. However, if it was set to some other value, you will get incorrect results. Upgrading PEFT is the recommended solution.
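A sketch of that workaround, assuming the adapter was downloaded to a local directory (the path and the `foobar` key are placeholders):
```python
import json

config_path = "path/to/adapter/adapter_config.json"  # hypothetical local path
with open(config_path) as f:
    config = json.load(f)

config.pop("foobar", None)  # drop the keyword argument your PEFT version doesn't know

with open(config_path, "w") as f:
    json.dump(config, f, indent=2)
```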

View File

@ -23,14 +23,14 @@ PEFT is integrated with the Transformers, Diffusers, and Accelerate libraries to
<div class="mt-10">
<div class="w-full flex flex-col space-y-4 md:space-y-0 md:grid md:grid-cols-2 md:gap-y-4 md:gap-x-5">
<a class="!no-underline border dark:border-gray-700 p-5 rounded-lg shadow hover:shadow-lg" href="quicktour"
><div class="w-full text-center bg-gradient-to-br from-blue-400 to-blue-500 rounded-lg py-1.5 font-semibold mb-5 text-white text-lg leading-relaxed">Get started</div>
><div class="w-full text-center bg-gradient-to-br from-blue-400 to-blue-500 rounded-lg py-1.5 font-semibold mb-5 text-white text-lg leading-relaxed">Quicktour</div>
<p class="text-gray-700">Start here if you're new to 🤗 PEFT to get an overview of the library's main features, and how to train a model with a PEFT method.</p>
</a>
<a class="!no-underline border dark:border-gray-700 p-5 rounded-lg shadow hover:shadow-lg" href="./task_guides/image_classification_lora"
<a class="!no-underline border dark:border-gray-700 p-5 rounded-lg shadow hover:shadow-lg" href="./task_guides/prompt_based_methods"
><div class="w-full text-center bg-gradient-to-br from-indigo-400 to-indigo-500 rounded-lg py-1.5 font-semibold mb-5 text-white text-lg leading-relaxed">How-to guides</div>
<p class="text-gray-700">Practical guides demonstrating how to apply various PEFT methods across different types of tasks like image classification, causal language modeling, automatic speech recognition, and more. Learn how to use 🤗 PEFT with the DeepSpeed and Fully Sharded Data Parallel scripts.</p>
</a>
<a class="!no-underline border dark:border-gray-700 p-5 rounded-lg shadow hover:shadow-lg" href="./conceptual_guides/lora"
<a class="!no-underline border dark:border-gray-700 p-5 rounded-lg shadow hover:shadow-lg" href="./conceptual_guides/adapter"
><div class="w-full text-center bg-gradient-to-br from-pink-400 to-pink-500 rounded-lg py-1.5 font-semibold mb-5 text-white text-lg leading-relaxed">Conceptual guides</div>
<p class="text-gray-700">Get a better theoretical understanding of how LoRA and various soft prompting methods help reduce the number of trainable parameters to make training more efficient.</p>
</a>

View File

@ -16,7 +16,7 @@ rendered properly in your Markdown viewer.
# Installation
Before you start, you will need to setup your environment, install the appropriate packages, and configure 🤗 PEFT. 🤗 PEFT is tested on **Python 3.9+**.
🤗 PEFT is available on PyPI, as well as GitHub:
@ -43,5 +43,5 @@ repository:
```bash
git clone https://github.com/huggingface/peft
cd peft
pip install -e .[test]
```

View File

@ -0,0 +1,31 @@
<!--Copyright 2024 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.
-->
# Bone
Block-Affine Adaptation ([Bone](https://huggingface.co/papers/2409.15371)) is a method that, inspired by GQA and MQA, leverages the sparsity of LLM weights to design the Block-Affine Adaptation (Bone) structure. In Bone, the original weights are divided into multiple sub-spaces, all of which share a single low-rank matrix initialized to zero for updates. Extensive experiments demonstrate that Bone consistently outperforms LoRA and its variants across various tasks, while also offering superior computational efficiency.
The abstract from the paper is:
Low-Rank Adaptation (LoRA) has achieved remarkable training results by freezing the original weights and training only low-rank matrices, establishing itself as the predominant fine-tuning method for LLMs. In pursuit of performance closer to full-parameter training, a series of LoRA variants have emerged, such as LoRA+, PISSA, Olora, and LoRA-GA. This paper introduces a novel PEFT technique distinct from LoRA, called Block-Affine Adaptation (Bone). By dividing the original weights into multiple subspaces that share a single matrix for weight updates, Bone simplifies the process by requiring the trainable matrix to be initialized to zero, eliminating the need for complex initialization as in some LoRA variants. Compared to LoRA, Bone significantly reduces memory usage and achieves faster computation. Evaluation of both NLU and NLG tasks demonstrates that Bone substantially outperforms LoRA and its variants. Inspired by Pissa, we further proposed the 'Weight Guide' theory to better utilize the information from the original weights. By integrating 'Weight Guide' with Bone, we developed a new structure called Block-Affine Transformation (Bat), and ablation experiments confirmed the effectiveness of 'Weight Guide'.
## BoneConfig
[[autodoc]] tuners.bone.config.BoneConfig
## BoneModel
[[autodoc]] tuners.bone.model.BoneModel

View File

@ -0,0 +1,34 @@
<!-- Copyright 2024 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.
-->
# Context-aware Prompt Tuning: Advancing In-Context Learning with Adversarial Methods
[CPT](https://huggingface.co/papers/2410.17222) combines In-Context Learning (ICL), Prompt Tuning (PT), and adversarial optimization to improve few-shot learning by refining context embeddings. CPT updates the context tokens by optimizing both the context and the training examples, encapsulating them into a novel loss design that minimizes overfitting, enables more effective optimization, and drives significant improvements in classification tasks.
The abstract from the paper is:
> Large Language Models (LLMs) can perform few-shot learning using either optimization-based approaches or In-Context Learning (ICL). Optimization-based methods often suffer from overfitting, as they require updating a large number of parameters with limited data. In contrast, ICL avoids overfitting but typically underperforms compared to optimization-based methods and is highly sensitive to the selection, order, and format of demonstration examples. To overcome these challenges, we introduce Context-aware Prompt Tuning (CPT), a method inspired by ICL, Prompt Tuning (PT), and adversarial attacks. CPT builds on the ICL strategy of concatenating examples before the input, extending it by incorporating PT-like learning to refine the context embedding through iterative optimization, extracting deeper insights from the training examples. Our approach carefully modifies specific context tokens, considering the unique structure of the examples within the context. In addition to updating the context with PT-like optimization, CPT draws inspiration from adversarial attacks, adjusting the input based on the labels present in the context while preserving the inherent value of the user-provided data. To ensure robustness and stability during optimization, we employ a projected gradient descent algorithm, constraining token embeddings to remain close to their original values and safeguarding the quality of the context. Our method has demonstrated superior accuracy across multiple classification tasks using various LLM models, outperforming existing baselines and effectively addressing the overfitting challenge in few-shot learning.
Take a look at [Example](https://github.com/huggingface/peft/blob/main/examples/cpt_finetuning/README.md) for a step-by-step guide on how to train a model with CPT.
## CPTConfig
[[autodoc]] tuners.cpt.config.CPTConfig
## CPTEmbedding
[[autodoc]] tuners.cpt.model.CPTEmbedding

View File

@ -14,4 +14,9 @@ A collection of helper functions for PEFT.
## Temporarily Rescaling Adapter Scale in LoraLayer Modules
[[autodoc]] helpers.rescale_adapter_scale
- all
## Context manager to disable input dtype casting in the `forward` method of LoRA layers
[[autodoc]] helpers.disable_input_dtype_casting
- all

View File

@ -0,0 +1,76 @@
<!--⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.
-->
# Hotswapping adapters
The idea of hotswapping an adapter is the following: We can already load multiple adapters, e.g. two LoRAs, at the same time. But sometimes, we want to load one LoRA and then replace its weights in-place with the LoRA weights of another adapter. This is now possible with the `hotswap_adapter` function.
In general, this should be faster than deleting one adapter and loading the adapter in its place, which is how you would otherwise achieve the same final outcome without hotswapping. Another advantage of hotswapping is that it prevents re-compilation in case the PEFT model is already compiled using `torch.compile`. This can save quite a lot of time.
## Example without `torch.compile`
```python
import torch
from transformers import AutoModelForCausalLM
from peft import PeftModel
from peft.utils.hotswap import hotswap_adapter
model_id = ...
inputs = ...
device = ...
model = AutoModelForCausalLM.from_pretrained(model_id).to(device)
# load lora 0
model = PeftModel.from_pretrained(model, <path-adapter-0>)
with torch.inference_mode():
output_adapter_0 = model(inputs)
# replace the "default" lora adapter with the new one
hotswap_adapter(model, <path-adapter-1>, adapter_name="default", torch_device=device)
with torch.inference_mode():
output_adapter_1 = model(inputs).logits
```
## Example with `torch.compile`
```python
import torch
from transformers import AutoModelForCausalLM
from peft import PeftModel
from peft.utils.hotswap import hotswap_adapter, prepare_model_for_compiled_hotswap
model_id = ...
inputs = ...
device = ...
max_rank = ... # maximum rank among all LoRA adapters that will be used
model = AutoModelForCausalLM.from_pretrained(model_id).to(device)
# load lora 0
model = PeftModel.from_pretrained(model, <path-adapter-0>)
# Prepare the model to allow hotswapping even if ranks/scalings of 2nd adapter differ.
# You can skip this step if all ranks and scalings are identical.
prepare_model_for_compiled_hotswap(model, target_rank=max_rank)
model = torch.compile(model)
with torch.inference_mode():
output_adapter_0 = model(inputs)
# replace the "default" lora adapter with the new one
hotswap_adapter(model, <path-adapter-1>, adapter_name="default", torch_device=device)
with torch.inference_mode():
output_adapter_1 = model(inputs).logits
```
## Caveats
Hotswapping works with transformers models and diffusers models. However, there are some caveats:
- Right now, only LoRA is properly supported.
- It only works for the same PEFT method, so no swapping LoRA and LoHa, for example.
- The adapter that is being swapped in must target the same layers as the previous adapter or a subset of those layers. It cannot target new layers. Therefore, if possible, start with the adapter that targets most layers.
[[autodoc]] utils.hotswap.hotswap_adapter
- all
[[autodoc]] utils.hotswap.hotswap_adapter_from_state_dict
- all

View File

@ -0,0 +1,32 @@
<!--Copyright 2024 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.
-->
# Bridging The Gap between Low-rank and Orthogonal Adaptation via Householder Reflection Adaptation (HRA)
[HRA](https://huggingface.co/papers/2405.17484) is a simple but effective adapter-based fine-tuning method by leveraging Householder reflections. This method harnesses the advantages of both strategies, reducing parameters and computation costs while penalizing the loss of pre-training knowledge. It consistently achieves better performance with fewer trainable parameters and outperforms state-of-the-art adapters across different models, including large language models (LLMs) and conditional image generators.
The abstract from the paper is:
> While following different technical routes, both low-rank and orthogonal adaptation techniques can efficiently adapt large-scale pre-training models in specific tasks or domains based on a small piece of trainable parameters. In this study, we bridge the gap between these two techniques, proposing a simple but effective adaptation method based on Householder reflections. Given a pre-trained model, our method fine-tunes its layers by multiplying each frozen weight matrix with an orthogonal matrix constructed by a chain of learnable Householder reflections (HRs). This HR-based orthogonal fine-tuning is equivalent to an adaptive low-rank adaptation. Moreover, we show that the orthogonality of the reflection planes corresponding to the HRs impacts the model capacity and regularity. The analysis motivates us to regularize the orthogonality of the HRs, leading to different implementations of the proposed Householder reflection adaptation (HRA) method. Compared with state-of-the-art methods, HRA achieves superior performance with fewer learnable parameters when adapting large language models and conditional image generators. The code is available at [peft](https://github.com/huggingface/peft/tree/main/src/peft/tuners/hra) and [HRA](https://github.com/DaShenZi721/HRA).
## HRAConfig
[[autodoc]] tuners.hra.config.HRAConfig
## HRAModel
[[autodoc]] tuners.hra.model.HRAModel

View File

@ -32,4 +32,20 @@ The abstract from the paper is:
## Utility
### LoftQ
[[autodoc]] utils.loftq_utils.replace_lora_weights_loftq
### Eva
#### EvaConfig
[[autodoc]] tuners.lora.config.EvaConfig
#### initialize_lora_eva_weights
[[autodoc]] tuners.lora.eva.initialize_lora_eva_weights
#### get_eva_state_dict
[[autodoc]] tuners.lora.eva.get_eva_state_dict

View File

@ -0,0 +1,43 @@
<!--Copyright 2025 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.
-->
# Trainable Tokens
The Trainable Tokens method provides a way to target specific token embeddings for fine-tuning without resorting to
training the full embedding matrix or using an adapter on the embedding matrix. It is based on the initial implementation from
[here](https://github.com/huggingface/peft/pull/1541).
The method only targets specific tokens and selectively trains the token indices you specify. Consequently, the
required RAM will be lower and the disk footprint is also significantly smaller than storing the full fine-tuned embedding matrix.
Some preliminary benchmarks acquired with [this script](https://github.com/huggingface/peft/blob/main/scripts/train_memory.py)
suggest that for `gemma-2-2b` (which has a rather large embedding matrix) you can save ~4 GiB VRAM with Trainable Tokens
over fully fine-tuning the embedding matrix. While LoRA will use comparable amounts of VRAM it might also target
tokens you don't want to be changed. Note that these are just indications and varying embedding matrix sizes might skew
these numbers a bit.
Note that this method does not add tokens for you; you have to add tokens to the tokenizer yourself and resize the
embedding matrix of the model accordingly. This method will only re-train the embeddings for the tokens you specify.
This method can also be used in conjunction with LoRA layers! See [the LoRA developer guide](../developer_guides/lora#efficiently-train-tokens-alongside-lora).
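For standalone use, a minimal sketch, assuming `base_model` is a loaded transformer whose embedding module is named `embed_tokens` (the token ids are placeholders):
```python
from peft import TrainableTokensConfig, get_peft_model

config = TrainableTokensConfig(
    token_indices=[32000, 32001],      # hypothetical ids of the newly added tokens
    target_modules=["embed_tokens"],   # name of the embedding module to adapt
)
peft_model = get_peft_model(base_model, config)
peft_model.print_trainable_parameters()
```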
## TrainableTokensConfig
[[autodoc]] tuners.trainable_tokens.config.TrainableTokensConfig
## TrainableTokensModel
[[autodoc]] tuners.trainable_tokens.model.TrainableTokensModel

View File

@ -22,12 +22,9 @@ When saving the adapter parameters, it's possible to eschew storing the low rank
To handle different shapes of adapted layers, VeRA initializes shared A and B matrices with the largest required size for each dimension. During the forward pass, submatrices A and B for a given layer are sliced out from these shared matrices and used as described in the paper. For example, adapting two linear layers of shapes (100, 20) and (80, 50) will create A and B matrices of shapes (rank, 50) and (100, rank) respectively. Then, to adapt a layer of shape (100, 20), submatrices A and B of shapes (rank, 20) and (100, rank) will be extracted.
VeRA currently has the following constraint:
- Only `nn.Linear` layers are supported.
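To illustrate basic usage, a minimal sketch assuming `base_model` is a loaded transformer whose attention projections are named `q_proj`/`v_proj`:
```python
from peft import VeraConfig, get_peft_model

config = VeraConfig(r=256, target_modules=["q_proj", "v_proj"])
vera_model = get_peft_model(base_model, config)
vera_model.print_trainable_parameters()
```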
The abstract from the paper is:

View File

@ -135,6 +135,9 @@ lora_model.print_trainable_parameters()
"trainable params: 1,572,864 || all params: 332,769,280 || trainable%: 0.472659014678278"
```
> [!WARNING]
> When calling [`get_peft_model`], the base model will be modified *in-place*. That means, when calling [`get_peft_model`] on a model that was already modified in the same way before, this model will be further mutated. Therefore, if you would like to modify your PEFT configuration after having called [`get_peft_model()`] before, you would first have to unload the model with [`~LoraModel.unload`] and then call [`get_peft_model()`] with your new configuration. Alternatively, you can re-initialize the model to ensure a fresh, unmodified state before applying a new PEFT configuration.
Now you can train the [`PeftModel`] with your preferred training framework! After training, you can save your model locally with [`~PeftModel.save_pretrained`] or upload it to the Hub with the [`~transformers.PreTrainedModel.push_to_hub`] method.
```py

View File

@ -1,6 +1,6 @@
datasets==2.16.1
diffusers==0.17.1
transformers>=4.48.0
accelerate==0.25.0
wandb==0.16.1
scikit-image==0.22.0

View File

@ -1,4 +1,4 @@
transformers>=4.48.0
accelerate==0.25.0
evaluate
tqdm

View File

@ -0,0 +1,95 @@
# BONE: BLOCK AFFINE TRANSFORMATION AS PARAMETER EFFICIENT FINE-TUNING METHODS FOR LARGE LANGUAGE MODELS
## Introduction ([Paper](https://arxiv.org/pdf/2409.15371), [code](https://github.com/JL-er/Bone))
Low-Rank Adaptation (LoRA) has achieved remarkable training results by freezing the original weights and training only low-rank matrices, establishing itself as the predominant fine-tuning method for LLMs. In pursuit of performance closer to full-parameter training, a series of LoRA variants have emerged, such as LoRA+, PISSA, Olora, and LoRA-GA. This paper introduces a novel PEFT technique distinct from LoRA, called Block-Affine Adaptation (Bone). By dividing the original weights into multiple subspaces that share a single matrix for weight updates, Bone simplifies the process by requiring the trainable matrix to be initialized to zero, eliminating the need for complex initialization as in some LoRA variants. Compared to LoRA, Bone significantly reduces memory usage and achieves faster computation. Evaluation of both NLU and NLG tasks demonstrates that Bone substantially outperforms LoRA and its variants. Inspired by Pissa, we further proposed the 'Weight Guide' theory to better utilize the information from the original weights. By integrating 'Weight Guide' with Bone, we developed a new structure called Block-Affine Transformation (Bat), and ablation experiments confirmed the effectiveness of 'Weight Guide'.
## Quick Start
```python
import torch
from peft import BoneConfig, get_peft_model
from transformers import AutoTokenizer, AutoModelForCausalLM
from trl import SFTConfig, SFTTrainer
from datasets import load_dataset
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf", torch_dtype=torch.bfloat16, device_map="auto")
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
tokenizer.pad_token_id = tokenizer.eos_token_id
bone_config = BoneConfig(
r = 64
)
#Bat performs better than Bone, but it uses more memory and is twice as slow. If you want to use the Bat method, you only need to add the parameter init_weights="bat".
# bone_config = BoneConfig(
# r = 64,
# init_weights="bat"
# )
peft_model = get_peft_model(model, bone_config)
peft_model.print_trainable_parameters()
dataset = load_dataset("imdb", split="train[:1%]")
training_args = SFTConfig(dataset_text_field="text", max_seq_length=128)
trainer = SFTTrainer(
model=peft_model,
args=training_args,
train_dataset=dataset,
tokenizer=tokenizer,
)
trainer.train()
peft_model.save_pretrained("bone-llama-2-7b")
```
To utilize the fine-tuned Bone modules, simply run the following command:
```python
import torch
from peft import PeftModel
from transformers import AutoModelForCausalLM
model = AutoModelForCausalLM.from_pretrained(
"meta-llama/Llama-2-7b-hf", torch_dtype=torch.bfloat16, device_map="auto"
)
peft_model = PeftModel.from_pretrained(model, "bone-llama-2-7b")
```
## Advanced Usage
### Fine-tune
```shell
#Bat performs better than Bone, but it uses more memory and is twice as slow. If you want to use the Bat method, you only need to add the parameter init_weights="bat".
python bone_finetuning.py \
--base_model_name_or_path meta-llama/Llama-2-7b-hf \
--output_dir output/bone-llama-2-7b-metamath-10k \
--bone_r 64 \
--init_weights True \
--bits bf16 \
--data_path meta-math/MetaMathQA \
--dataset_split train[:100000] \
--dataset_field query response \
--bf16 True \
--num_train_epochs 1 \
--per_device_train_batch_size 2 \
--gradient_accumulation_steps 8 \
--save_strategy "steps" \
--save_steps 1000 \
--save_total_limit 1 \
--logging_steps 1 \
--learning_rate 2e-5 \
--weight_decay 0. \
--warmup_ratio 0.03 \
--tf32 True \
--report_to none
```
## Citation
```bib
@misc{kang2024boneblockaffineadaptationlarge,
title={Bone: Block-Affine Adaptation of Large Language Models},
author={Jiale Kang},
year={2024},
eprint={2409.15371},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2409.15371},
}
```

View File

@ -0,0 +1,105 @@
# Copyright 2023-present the HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
from dataclasses import dataclass, field
from typing import Literal, Optional
import torch
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, HfArgumentParser
from trl import SFTConfig, SFTTrainer
from peft import BoneConfig, get_peft_model
@dataclass
class ScriptArguments(SFTConfig):
# model configs
base_model_name_or_path: Optional[str] = field(
default=None, metadata={"help": "The name or path of the fp32/16 base model."}
)
bits: str = field(default="bf16", metadata={"help": "(`['bf16', 'fp16', fp32]`)"})
init_weights: Literal[True, "bat"] = field(
default=True,
metadata={
"help": ("True -> Bone; `bat` -> Bat"),
},
)
bone_r: int = field(default=16)
merge_and_save: bool = field(default=False)
# dataset configs
data_path: str = field(default="imdb", metadata={"help": "Path to the training data."})
dataset_split: str = field(default="train[:1%]", metadata={"help": "(`['train', 'test', 'eval']`):"})
dataset_field: list[str] = field(default=None, metadata={"help": "Fields of dataset input and output."})
parser = HfArgumentParser(ScriptArguments)
script_args = parser.parse_args_into_dataclasses()[0]
print(script_args)
print(f"Load pre-processed residual model in {script_args.bits} bits.")
if script_args.bits in ["nf4", "fp4", "int8"]:
print("Bone currently does not support quantization.")
elif script_args.base_model_name_or_path is not None:
print(f"No available pre-processed model, manually initialize a Bone using {script_args.base_model_name_or_path}.")
model = AutoModelForCausalLM.from_pretrained(
script_args.base_model_name_or_path,
torch_dtype=(
torch.float16
if script_args.bits == "fp16"
else (torch.bfloat16 if script_args.bits == "bf16" else torch.float32)
),
device_map="auto",
)
tokenizer = AutoTokenizer.from_pretrained(script_args.base_model_name_or_path)
tokenizer.pad_token_id = tokenizer.eos_token_id
bone_config = BoneConfig(
r=script_args.bone_r,
target_modules=["q_proj", "o_proj", "k_proj", "v_proj", "gate_proj", "up_proj", "down_proj"],
bias="none",
task_type="CAUSAL_LM",
init_weights=script_args.init_weights,
)
peft_model = get_peft_model(model, bone_config)
print(peft_model)
peft_model.print_trainable_parameters()
print(f"Training Bone with trl on the {script_args.data_path}[{script_args.dataset_split}] dataset.")
dataset = load_dataset(script_args.data_path, split=script_args.dataset_split)
dataset = dataset.map(
lambda example: {
"text": f"### USER: {example[script_args.dataset_field[0]]}\n### ASSISTANT: {example[script_args.dataset_field[1]]}"
}
)
trainer = SFTTrainer(
model=peft_model,
args=script_args,
train_dataset=dataset,
tokenizer=tokenizer,
)
trainer.train()
trainer.save_state()
peft_model.save_pretrained(
os.path.join(script_args.output_dir, "bone_ft"),
)
if script_args.merge_and_save:
model = peft_model.merge_and_unload()
model.save_pretrained(os.path.join(script_args.output_dir, "bone_merged"))
tokenizer.save_pretrained(os.path.join(script_args.output_dir, "bone_merged"))

View File

@ -297,9 +297,9 @@ def main():
correct = 0
total = 0
assert len(eval_preds) == len(dataset["train"][label_column]), (
f"{len(eval_preds)} != {len(dataset['train'][label_column])}"
)
for pred, true in zip(eval_preds, dataset["train"][label_column]):
if pred.strip() == true.strip():
correct += 1

View File

@ -125,7 +125,7 @@ def main():
)
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
target_max_length = max(len(tokenizer(class_label)["input_ids"]) for class_label in classes)
def preprocess_function(examples):
inputs = examples[text_column]
@ -247,9 +247,9 @@ def main():
correct = 0
total = 0
assert len(eval_preds) == len(dataset["train"][label_column]), (
f"{len(eval_preds)} != {len(dataset['train'][label_column])}"
)
for pred, true in zip(eval_preds, dataset["train"][label_column]):
if pred.strip() == true.strip():
correct += 1

View File

@ -0,0 +1,251 @@
# CorDA: Context-Oriented Decomposition Adaptation of Large Language Models for Task-Aware Parameter-Efficient Fine-tuning
## Introduction
Existing PEFT methods are mostly agnostic of the context of a task of concern, e.g., a downstream task to learn or some pre-trained world knowledge to maintain.
[CorDA](https://openreview.net/pdf?id=Gi00NVru6n) builds task-aware LoRA adapters from weight decomposition oriented by the context of the task concerned.
Concretely, CorDA randomly collects a few (by default 256 in our `preprocess.py`) data samples from a target task, e.g. questions from a QA dataset or instructions to write a code or solve a math problem, and feeds these samples into a pre-trained LLM. We can obtain the covariance matrix of the input activation of each linear layer, i.e., $C=XX^T\in\mathcal{R}^{d_{in}\times d_{in}}$.
We then perform singular value decomposition (SVD) for the weight $W\in \mathcal{R}^{d_{out}\times d_{in}}$ multiplied by the covariance matrix, i.e., $\verb|SVD|(WC) = U\Sigma V^T$. In this way, the context expressed by these representative covariance matrices is able to orientate the decomposition, such that the principal components (the singular vectors with the largest singular values) are most associated with the task of concern (please refer to Fig.2 of our paper for the advantage of our decomposition over the plain SVD). To ensure the same inference result with the pre-trained model at the start of adaptation, we multiply the inverse of these covariance matrices with the decomposed components, i.e., $\hat{W}=U\Sigma V^T C^{-1}$.
Thanks to the task-awareness, you can choose how to utilize the task-specific principal components. For example, if you want to adapt a model to a new task without losing the knowledge of a question-answering dataset, e.g., TriviaQA and NQopen, you can sample questions from this dataset to collect covariance matrices, and keep the principal components frozen because they compact the ability of this dataset, while using the lowest components with the smallest $r$ singular values to initialize the learnable LoRA adapters. This is achieved by the **knowledge-preserved mode (KPM)** of CorDA, which learns new tasks effectively while keeping the world knowledge you are concerned about as sound as possible. Alternatively, when your primary objective is to maximize performance on the finetuning task, disregarding the preservation of world knowledge, the **instruction-previewed mode (IPM)** will be favored. In this mode, CorDA uses the instruction and response from the fine-tuning task (e.g., Math or Code) to produce the covariance matrices. The principal components with the largest $r$ singular values, capturing the characteristics of the finetuning task in advance, can better adapt to the new ability, so they are used to initialize the LoRA adapters, with the remaining components frozen. IPM can further accelerate convergence to enhance the fine-tuning performance on downstream tasks.
The implementations of KPM and IPM are compared as follows:
| Mode | Collect covariance from | LoRA $A$ | LoRA $B$ |
|---|---|---|---|
| KPM | questions from the knowledge benchmark to maintain | $A=\sqrt{\Sigma}\_{[-r:]}(V^T C^{-1})\_{[-r:,:]}$ | $B=U_{[:,-r:]}\sqrt{\Sigma}_{[-r:]}$ |
| IPM | instructions and responses from the downstream task to learn | $A= \sqrt{\Sigma}\_{[:r]} (V^T C^{-1})\_{[:r,:]}$ | $B =U_{[:,:r]} \sqrt{\Sigma}_{[:r]}$ |
### Comparison with alternative methods
The distinction between CorDA with other similar LoRA initialization methods is summarized as follows:
| Method | Initialization for | SVD on | Data-driven | Supports knowledge maintenance |
| - | - | - | - | - |
| PiSSA | $A$ and $B$ | weights | no | no |
| EVA | $A$ | activations | yes | no |
|CorDA | $A$ and $B$ | weights (oriented by covariance) | yes | yes |
"Supports knowledge maintenance" denotes the ability of explicitly associating a knowledge benchmark with some components of the pre-trained weights after decomposition, and keeping these components frozen during fine-tuning.
### Some Results
- Performance with knowledge-preserved mode (sample from NQopen, fine-tune on Math)
| Method | Model | NQ open | GSM8k | Math | Avg. |
|---|---|---|---|---|---|
|Pre-trained|Llama-2-7b| 14.99 | -| - | - |
|LoRA|Llama-2-7b|1.27| 42.68 | 5.88 | 16.61 |
|**CorDA (KPM)** |Llama-2-7b| **8.20** | **46.32** | **7.00** | **20.51** |
|Pre-trained|Llama-2-13b|23.63|-|-|-|
|LoRA|Llama-2-13b| 16.26 | 57.24 | 8.92 | 27.47 |
|**CorDA (KPM)** |Llama-2-13b| **19.86** | **59.29** | **9.62** | **29.59** |
|Pre-trained|Llama-3-8b|13.41|-|-|-|
|LoRA|Llama-3-8b| 8.75 | 72.33 | 24.04| 35.04 |
|**CorDA (KPM)** |Llama-3-8b| **9.61** | **74.68** | **25.34** | **36.54** |
|Pre-trained|Gemma-2-9b|12.85|-|-|-|
|LoRA|Gemma-2-9b| 9.28 | 83.47 | 42.30| 45.02 |
|**CorDA (KPM)** |Gemma-2-9b|**10.17** | **84.08** | **42.64** | **45.63** |
- Performance with instruction-previewed mode (sample from Math, fine-tune on Math)
| Method | Model | GSM8k | Math |
| --- | --- | --- | ---|
|LoRA| Llama-2-7b | 42.68 | 5.88 |
|PiSSA | Llama-2-7b | 51.63 | 7.32 |
| **CorDA (IPM)** | Llama-2-7b | **53.45** | **8.64** |
|LoRA| Llama-2-13b | 57.24 | 8.92 |
|PiSSA | Llama-2-13b |60.88 | 11.08|
| **CorDA (IPM)** | Llama-2-13b | **62.47** |**11.54** |
|LoRA| Gemma-2-9b | 83.47 | 42.30 |
|PiSSA | Gemma-2-9b | 84.23 | 43.52|
| **CorDA (IPM)** | Gemma-2-9b | **84.45** | **43.88** |
## Quick Start
### Knowledge-preserved adaptation mode
```py
import torch
from peft import LoraConfig, get_peft_model
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft.tuners.lora.config import CordaConfig
from peft.tuners.lora.corda import preprocess_corda
from trl import SFTConfig, SFTTrainer
from datasets import load_dataset
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf", torch_dtype=torch.bfloat16, device_map="auto")
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
tokenizer.pad_token_id = tokenizer.eos_token_id
sampled_dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="train[:256]")
dataset = load_dataset("imdb", split="train[:256]")
def run_model():
for batch in sampled_dataset:
input_ids = batch["text"]
input_ids = input_ids.to(model.device)
with torch.no_grad():
model(input_ids)
corda_config = CordaConfig(
corda_method="kpm",
)
lora_config = LoraConfig(
init_lora_weights="corda",
corda_config=corda_config,
)
# Call `preprocess_corda` first to collect covariance matrix and build SVD result for model
# For more details, please refer to documentation of `preprocess_corda`
preprocess_corda(model, lora_config, run_model=run_model)
# Call `get_peft_model` after preprocessing, or else you'll encounter error
peft_model = get_peft_model(model, lora_config)
peft_model.print_trainable_parameters()
training_args = SFTConfig(dataset_text_field="text", max_seq_length=128)
trainer = SFTTrainer(
model=peft_model,
args=training_args,
train_dataset=dataset,
tokenizer=tokenizer,
)
trainer.train()
peft_model.save_pretrained("corda-llama-2-7b")
```
### Instruction-previewed adaptation mode
```py
# Get model and dataset identically as KPM...
# Different from KPM, we run the model on dataset of the downstream task to collect covariance matrices
def run_model():
for batch in dataset:
input_ids = batch["text"]
input_ids = input_ids.to(model.device)
with torch.no_grad():
model(input_ids)
# Different from KPM, we set `corda_method` to `"ipm"`
corda_config = CordaConfig(
corda_method="ipm",
)
# The rest of training process is identical to KPM...
```
## Advanced Usage
### Preprocessing
`preprocess.py`: This script builds CorDA adapters for a model, and saves the adapter's initial weights and the residual model weights to a specified directory. Example usage:
#### Knowledge-preserved adaptation mode
```bash
CUDA_VISIBLE_DEVICES=0 python -u preprocess.py --model_id="meta-llama/Llama-2-7b-hf" \
--r 128 --seed 233 \
--save_model --save_path {path_to_residual_model} \
--calib_dataset "nqopen"
```
Arguments:
- `--model_id` is the pre-trained model for decomposition.
- `--r` is the low rank of LoRA, e.g. 128.
- `--calib_dataset` specifies the dataset to sample data to obtain covariance matrices. KPA mode uses QA datasets such as `"nqopen"`, `"traivia_qa"`, or other choices.
- `--save_model` saves the initialized model in `--save_path`.
#### Instruction-previewed adaptation mode
```bash
CUDA_VISIBLE_DEVICES=0 python -u preprocess.py --model_id="meta-llama/Llama-2-7b-hf" \
--r 128 --seed 233 \
--save_model --save_path {path_to_residual_model} \
--first_eigen --calib_dataset "MetaMATH"
```
Arguments:
- `--first_eigen` uses the largest $r$ singular values and vectors to initialize the learnable adapter for the instruction-previewed adaptation mode.
- `--calib_dataset` specifies the dataset to sample data to obtain covariance matrices. Instruction-previewed mode uses the downstream task dataset you are learning, such as `"MetaMATH"`, `"codefeedback"`, `"WizLMinstruct"`, `"alpaca"`, or other choices.
#### Note about memory consumption
The process of collecting covariance matrices is performed in `torch.float32` by default. If you would like to reduce the memory consumption of preprocessing, you can specify `use_float16_for_covariance=True` in `CordaConfig` to collect covariance matrices in `torch.float16`. In a few cases this may cause numerical instability, so that the initialized model no longer produces exactly the same inference results as the original model. If you choose `torch.float16`, it is therefore suggested to verify the initialization, e.g. by comparing the Wiki/PTB perplexity before and after preprocessing.
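A minimal sketch of that setting, reusing the `CordaConfig` from the quick start above:
```python
corda_config = CordaConfig(
    corda_method="kpm",
    use_float16_for_covariance=True,  # collect covariance matrices in float16 to reduce memory
)
```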
### Fine-tuning
`corda_finetuning.py`: This script fine-tunes the preprocessed model built above on a downstream task.
Example usage:
```bash
python corda_finetuning.py \
--model_name_or_path {path_to_residual_model} \
--output_dir {path_to_output_model} \
--corda_mode True \
--data_path meta-math/MetaMathQA \
--dataset_split "train[:100000]" \
--dataset_field query response \
--num_train_epochs 1 \
--per_device_train_batch_size 1 \
--gradient_accumulation_steps 32 \
--save_strategy "steps" \
--save_steps 100 \
--save_total_limit 1 \
--learning_rate 2e-5 \
--weight_decay 0. \
--warmup_ratio 0.03 \
--lr_scheduler_type "cosine" \
--logging_steps 1 \
--bf16 True \
--tf32 True \
--report_to none
```
### Convert CorDA to LoRA
The main advantage of CorDA lies in the training phase. For a trained CorDA adapter, we recommend converting it into an equivalent LoRA adapter for use and sharing.
```python
# The fine-tuned matrices $A$ and $B$ in the CorDA adapter are saved and should be combined with the residual model.
peft_model.save_pretrained(output_dir)
# Given the matrices $A_0$ and $B_0$, initialized by CorDA and untrained, and the trained matrices $A$ and $B$,
# we can convert these to LoRA by setting $\Delta W = A \times B - A_0 \times B_0 = [A \mid A_0] \times [B \mid -B_0]^T = A'B'$.
peft_model.save_pretrained(output_dir, path_initial_model_for_weight_conversion="corda_init")
```
This conversion enables the loading of LoRA on top of a standard base model:
```python
import torch
from peft import PeftModel
from transformers import AutoModelForCausalLM
model = AutoModelForCausalLM.from_pretrained(
"meta-llama/Llama-2-7b-hf", torch_dtype=torch.bfloat16, device_map="auto"
)
# No SVD is performed during this step, and the base model remains unaltered.
peft_model = PeftModel.from_pretrained(model, "corda-llama-2-7b-lora")
```
Using the converted LoRA does not require modifying the parameters of the base model. When multiple converted LoRAs are needed at the same time, each adapter operates independently without interference, so adapters can be freely added or deleted.
Note that this conversion is not supported if `rslora` is used in combination with `rank_pattern` or `alpha_pattern`.
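As a rough sketch of this workflow (the adapter paths and adapter names below are illustrative assumptions, not artifacts produced by the commands above), multiple converted adapters can be loaded side by side, switched, or removed without ever touching the base weights:
```python
import torch
from peft import PeftModel
from transformers import AutoModelForCausalLM

base = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-2-7b-hf", torch_dtype=torch.bfloat16, device_map="auto"
)
# Load a first converted adapter (placeholder path and name).
peft_model = PeftModel.from_pretrained(base, "corda-llama-2-7b-lora", adapter_name="corda_math")
# Load a second converted adapter next to it (also placeholders).
peft_model.load_adapter("corda-llama-2-7b-code-lora", adapter_name="corda_code")
# Switch the active adapter; the base weights stay unmodified.
peft_model.set_adapter("corda_code")
# Delete an adapter that is no longer needed.
peft_model.delete_adapter("corda_math")
```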
## Citation
```
@inproceedings{yangcorda,
title={CorDA: Context-Oriented Decomposition Adaptation of Large Language Models for Task-Aware Parameter-Efficient Fine-tuning},
author={Yang, Yibo and Li, Xiaojie and Zhou, Zhongzhu and Song, Shuaiwen Leon and Wu, Jianlong and Nie, Liqiang and Ghanem, Bernard},
booktitle={The Thirty-eighth Annual Conference on Neural Information Processing Systems},
year={2024},
}
```

View File

@ -0,0 +1,275 @@
# Copyright 2024-present the HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import copy
import os
from dataclasses import dataclass, field
from typing import Dict, List, Optional, Sequence
import torch
import transformers
from datasets import load_dataset
from transformers import Trainer
from peft import LoraConfig, PeftModel, get_peft_model
IGNORE_INDEX = -100
PROMPT = (
"Below is an instruction that describes a task. "
"Write a response that appropriately completes the request.\n\n"
"### Instruction:\n{instruction}\n\n### Response:"
)
def get_nb_trainable_parameters(model) -> tuple[int, int]:
r"""
Returns the number of trainable parameters and the number of all parameters in the model.
"""
trainable_params = 0
all_param = 0
for _, param in model.named_parameters():
num_params = param.numel()
# if using DS Zero 3 and the weights are initialized empty
if num_params == 0 and hasattr(param, "ds_numel"):
num_params = param.ds_numel
# Due to the design of 4bit linear layers from bitsandbytes
# one needs to multiply the number of parameters by 2 to get
# the correct number of parameters
if param.__class__.__name__ == "Params4bit":
num_bytes = param.quant_storage.itemsize if hasattr(param, "quant_storage") else 1
num_params = num_params * 2 * num_bytes
all_param += num_params
if param.requires_grad:
trainable_params += num_params
return trainable_params, all_param
@dataclass
class TrainingArguments(transformers.TrainingArguments):
model_name_or_path: Optional[str] = field(default="facebook/opt-125m")
data_path: str = field(default=None, metadata={"help": "Path to the training data."})
dataset_split: str = field(default="train[:100000]", metadata={"help": "(`['train', 'test', 'eval']`):"})
dataset_field: List[str] = field(default=None, metadata={"help": "Fields of dataset input and output."})
dataloader_num_proc: int = field(default=16, metadata={"help": "Number of processes to load dataset"})
dataloader_batch_size: int = field(
default=3000,
metadata={
"help": "batch size to load dataset. To set the batch size for training, you should pass --batch_size argument instead."
},
)
optim: str = field(default="adamw_torch")
model_max_length: int = field(
default=512,
metadata={"help": "Maximum sequence length. Sequences will be right padded (and possibly truncated)."},
)
lora_r: int = field(
default=None,
metadata={"help": "The rank of LoRA adapter. When passing `None`, CorDA or full fine-tuning is used."},
)
corda_mode: bool = field(default=True, metadata={"help": "True for CorDA mode"})
def safe_save_model_for_hf_trainer(trainer: transformers.Trainer, output_dir: str):
"""Collects the state dict and dump to disk."""
state_dict = trainer.model.state_dict()
if trainer.args.should_save:
cpu_state_dict = {key: value.cpu() for key, value in state_dict.items()}
del state_dict
trainer._save(output_dir, state_dict=cpu_state_dict) # noqa
def smart_tokenizer_and_embedding_resize(
special_tokens_dict: Dict,
tokenizer: transformers.PreTrainedTokenizer,
model: transformers.PreTrainedModel,
):
"""Resize tokenizer and embedding.
Note: This is the unoptimized version that may make your embedding size not be divisible by 64.
"""
num_new_tokens = tokenizer.add_special_tokens(special_tokens_dict)
model.resize_token_embeddings(len(tokenizer))
if num_new_tokens > 0:
input_embeddings = model.get_input_embeddings().weight.data
output_embeddings = model.get_output_embeddings().weight.data
input_embeddings_avg = input_embeddings[:-num_new_tokens].mean(dim=0, keepdim=True)
output_embeddings_avg = output_embeddings[:-num_new_tokens].mean(dim=0, keepdim=True)
input_embeddings[-num_new_tokens:] = input_embeddings_avg
output_embeddings[-num_new_tokens:] = output_embeddings_avg
def _tokenize_fn(strings: Sequence[str], tokenizer: transformers.PreTrainedTokenizer) -> Dict:
"""Tokenize a list of strings."""
tokenized_list = [
tokenizer(
text,
return_tensors="pt",
padding="longest",
max_length=tokenizer.model_max_length,
truncation=True,
)
for text in strings
]
input_ids = labels = [tokenized.input_ids[0] for tokenized in tokenized_list]
input_ids_lens = labels_lens = [
tokenized.input_ids.ne(tokenizer.pad_token_id).sum().item() for tokenized in tokenized_list
]
return {
"input_ids": input_ids,
"labels": labels,
"input_ids_lens": input_ids_lens,
"labels_lens": labels_lens,
}
def preprocess(
sources: Sequence[str],
targets: Sequence[str],
tokenizer: transformers.PreTrainedTokenizer,
) -> Dict:
"""Preprocess the data by tokenizing."""
examples = [s + t for s, t in zip(sources, targets)]
examples_tokenized, sources_tokenized = (_tokenize_fn(strings, tokenizer) for strings in (examples, sources))
input_ids = examples_tokenized["input_ids"]
labels = copy.deepcopy(input_ids)
for label, source_len in zip(labels, sources_tokenized["input_ids_lens"]):
label[:source_len] = IGNORE_INDEX
return {
"input_ids": input_ids,
"labels": labels,
}
@dataclass
class DataCollatorForSupervisedDataset:
"""Collate examples for supervised fine-tuning."""
tokenizer: transformers.PreTrainedTokenizer
def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]:
input_ids, labels = tuple([instance[key] for instance in instances] for key in ("input_ids", "labels"))
input_ids = [torch.tensor(x) for x in input_ids]
input_ids = torch.nn.utils.rnn.pad_sequence(
input_ids, batch_first=True, padding_value=self.tokenizer.pad_token_id
)
labels = [torch.tensor(x) for x in labels]
labels = torch.nn.utils.rnn.pad_sequence(labels, batch_first=True, padding_value=IGNORE_INDEX)
return {
"input_ids": input_ids,
"labels": labels,
"attention_mask": input_ids.ne(self.tokenizer.pad_token_id),
}
def train_tokenize_function(examples, tokenizer, query, response):
sources = [
PROMPT.format_map(
{
"instruction": instruction,
}
)
for instruction in examples[query]
]
targets = [f"{output}{tokenizer.eos_token}" for output in examples[response]]
data_dict = preprocess(sources, targets, tokenizer)
return data_dict
def train():
parser = transformers.HfArgumentParser(TrainingArguments)
script_args = parser.parse_args_into_dataclasses()[0]
print(script_args)
if script_args.corda_mode:
print("Train in CorDA mode")
res_model = transformers.AutoModelForCausalLM.from_pretrained(
script_args.model_name_or_path,
device_map="auto",
)
model = PeftModel.from_pretrained(
res_model, script_args.model_name_or_path, subfolder="corda_init", is_trainable=True
)
elif script_args.lora_r is not None:
print("Train in LoRA mode")
model = transformers.AutoModelForCausalLM.from_pretrained(
script_args.model_name_or_path,
device_map="auto",
)
lora_config = LoraConfig(
r=script_args.lora_r,
lora_alpha=script_args.lora_r,
init_lora_weights=True, # script_args.init_lora_weights,
target_modules=["q_proj", "o_proj", "k_proj", "v_proj", "gate_proj", "up_proj", "down_proj"],
lora_dropout=0,
bias="none",
task_type="CAUSAL_LM",
)
model = get_peft_model(model, lora_config)
else:
print("Train in Full Finetuning mode")
model = transformers.AutoModelForCausalLM.from_pretrained(
script_args.model_name_or_path,
torch_dtype=torch.bfloat16,
device_map="auto",
)
trainable_params, all_param = get_nb_trainable_parameters(model)
print(
f"trainable params: {trainable_params:,d} || all params: {all_param:,d} || trainable%: {100 * trainable_params / all_param}"
)
tokenizer = transformers.AutoTokenizer.from_pretrained(
script_args.model_name_or_path,
model_max_length=script_args.model_max_length,
padding_side="right",
use_fast=True,
trust_remote_code=True,
)
tokenizer.pad_token_id = tokenizer.eos_token_id
raw_train_datasets = load_dataset(script_args.data_path, split=script_args.dataset_split)
train_dataset = raw_train_datasets.map(
train_tokenize_function,
batched=True,
batch_size=script_args.dataloader_batch_size,
num_proc=script_args.dataloader_num_proc,
remove_columns=raw_train_datasets.column_names,
load_from_cache_file=True,
desc="Running tokenizer on train dataset",
fn_kwargs={
"tokenizer": tokenizer,
"query": script_args.dataset_field[0],
"response": script_args.dataset_field[1],
},
)
data_collator = DataCollatorForSupervisedDataset(tokenizer=tokenizer)
data_module = {
"train_dataset": train_dataset,
"data_collator": data_collator,
}
trainer = Trainer(model=model, tokenizer=tokenizer, args=script_args, **data_module)
trainer.train()
trainer.save_state()
model.save_pretrained(os.path.join(script_args.output_dir, "ft"))
if __name__ == "__main__":
train()

View File

@ -0,0 +1,235 @@
# Copyright 2024-present the HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import random
import numpy as np
import torch
from datasets import load_dataset
"""
doc https://huggingface.co/docs/datasets/loading
doc https://huggingface.co/docs/datasets/process
doc https://huggingface.co/blog/llama2#how-to-prompt-llama-2
"""
def set_seed(seed):
np.random.seed(seed)
torch.random.manual_seed(seed)
def sample_train_loaders(name, tokenizer, nsamples=128, seed=0, seqlen=2048):
set_seed(seed)
if "wikitext2" in name:
traindata = load_dataset(
"wikitext",
"wikitext-2-raw-v1",
split="train",
)
traindata = "\n\n".join(traindata["text"])
elif "c4" in name:
traindata = load_dataset(
"allenai/c4",
"allenai--c4",
data_files={"train": "en/c4-train.00000-of-01024.json.gz"},
split="train",
)
traindata = "\n\n".join(traindata["text"])
else:
raise NotImplementedError
trainloader = []
for _ in range(nsamples):
i = random.randint(0, len(traindata) - seqlen * 2 - 1)
j = i + seqlen * 2
# breakpoint()
trainenc = tokenizer(traindata[i:j], return_tensors="pt")
inp = trainenc.input_ids[:, :seqlen]
trainloader.append(inp)
return trainloader
def get_redpajama_train(tokenizer, percent=10, seed=3, batch_size=128, max_length=2048):
def tokenization(example):
return tokenizer(example["text"], truncation=True, max_length=max_length)
if percent != 100:
split = f"train[:{int(850000 * percent / 100)}]"
else:
split = "train"
dataset = load_dataset("togethercomputer/RedPajama-Data-1T-Sample", split=split)
processed_dataset = dataset.map(tokenization, batched=True, batch_size=batch_size, num_proc=os.cpu_count())
return processed_dataset
def get_english_quote(dataset_name, tokenizer):
data = load_dataset(dataset_name)
data = data.map(lambda samples: tokenizer(samples["quote"]), batched=True)
return data["train"]
def get_qat_dataset(name, tokenizer, data_percent):
if name == "red_pajama":
data = get_redpajama_train(tokenizer, data_percent)
elif name == "Abirate/english_quotes":
data = get_english_quote(name, tokenizer)
else:
raise NotImplementedError
data = data.shuffle()
return data
llama_chat_format = """<s>[INST] <<SYS>>
"Below is an instruction that describes a task. Write a response that appropriately completes the request."
<</SYS>>
{instruction} [/INST] {response} </s>
"""
def get_calib_data(name, tokenizer, model_id, nsamples, seqlen=2048, seed=3):
print(f" get_data_from: {name}, nsamples={nsamples}, seqlen={seqlen}, {seed}")
cache_file = f"cache/{name}_{model_id.replace('/', '_')}_{nsamples}_{seqlen}_{seed}.pt"
traindataset = []
if not os.path.exists("cache"):
os.makedirs("cache")
if os.path.exists(cache_file):
print(f"found data file: {cache_file}")
traindataset = torch.load(cache_file)
print("loaded ...")
return traindataset
if name == "c4":
traindata = load_dataset(
"allenai/c4",
"allenai--c4",
data_files={"train": "en/c4-train.00000-of-01024.json.gz"},
split="train",
)
tot_text = "\n\n".join(traindata["text"])
elif name == "wikitext2":
traindata = load_dataset("wikitext", "wikitext-2-raw-v1", split="train")
tot_text = "\n\n".join(traindata["text"])
elif name == "ptb":
traindata = load_dataset(
"ptb_text_only",
"penn_treebank",
split="train",
)
tot_text = "\n\n".join(traindata["sentence"])
elif name == "traivia_qa":
traindata = load_dataset("trivia_qa", "rc", split="train")
tot_text = "\n\n".join(traindata["question"])
elif name == "nqopen":
traindata = load_dataset("nq_open", split="train")
tot_text = "\n\n".join(traindata["question"])
elif name == "alpaca":
selected_data_dict = load_dataset("iboing/alpaca_data", split="train").shuffle(seed=seed).take(nsamples)
for example in selected_data_dict:
if example.get("input", "") == "":
s = llama_chat_format.format(instruction=example["instruction"], response=example["output"])
trainenc = tokenizer(s, return_tensors="pt")
inp = trainenc.input_ids[:, :seqlen]
attention_mask = torch.ones_like(inp)
traindataset.append({"input_ids": inp, "attention_mask": attention_mask})
print("example instruction:", s)
torch.save(traindataset, cache_file)
return traindataset
elif name == "MetaMATH":
selected_data_dict = load_dataset("iboing/MetaMathQA-395K", split="train").shuffle(seed=seed).take(nsamples)
for example in selected_data_dict:
if example.get("input", "") == "":
s = llama_chat_format.format(instruction=example["query"], response=example["response"])
trainenc = tokenizer(s, return_tensors="pt")
inp = trainenc.input_ids[:, :seqlen]
attention_mask = torch.ones_like(inp)
traindataset.append({"input_ids": inp, "attention_mask": attention_mask})
print("example instruction:", s)
torch.save(traindataset, cache_file)
return traindataset
elif name == "codefeedback":
selected_data_dict = (
load_dataset("iboing/CodeFeedback-Filtered-Instruction", split="train").shuffle(seed=seed).take(nsamples)
)
for example in selected_data_dict:
if example.get("input", "") == "":
s = llama_chat_format.format(instruction=example["query"], response=example["answer"])
trainenc = tokenizer(s, return_tensors="pt")
inp = trainenc.input_ids[:, :seqlen]
attention_mask = torch.ones_like(inp)
traindataset.append({"input_ids": inp, "attention_mask": attention_mask})
print("example instruction:", s)
torch.save(traindataset, cache_file)
return traindataset
elif name == "WizLMinstruct":
selected_data_dict = (
load_dataset("iboing/WizardLM_evol_instruct_V2_143k", split="train").shuffle(seed=seed).take(nsamples)
)
for example in selected_data_dict:
if example.get("input", "") == "":
s = llama_chat_format.format(
instruction=example["conversation"][0]["human"], response=example["conversation"][0]["assistant"]
)
trainenc = tokenizer(s, return_tensors="pt")
inp = trainenc.input_ids[:, :seqlen]
attention_mask = torch.ones_like(inp)
traindataset.append({"input_ids": inp, "attention_mask": attention_mask})
print("example instruction:", s)
torch.save(traindataset, cache_file)
return traindataset
else:
raise NotImplementedError
print(f"tot_text={len(tot_text)}")
for _ in range(nsamples):
i = random.randint(0, len(tot_text) - seqlen - 1)
j = i + seqlen * 10
trainenc = tokenizer(tot_text[i:j], return_tensors="pt")
inp = trainenc.input_ids[:, :seqlen]
attention_mask = torch.ones_like(inp)
traindataset.append({"input_ids": inp, "attention_mask": attention_mask})
torch.save(traindataset, cache_file)
return traindataset
def get_eval_loaders(name, tokenizer):
if "wikitext2" in name:
testdata = load_dataset(
"wikitext",
"wikitext-2-raw-v1",
split="test",
)
testenc = tokenizer("\n\n".join(testdata["text"]), return_tensors="pt")
return testenc
if "ptb" in name:
valdata = load_dataset(
"ptb_text_only",
"penn_treebank",
split="validation",
)
testenc = tokenizer("\n\n".join(valdata["sentence"]), return_tensors="pt")
return testenc
if "c4" in name:
testdata = load_dataset(
"allenai/c4",
"allenai--c4",
data_files={"validation": "en/c4-validation.00000-of-00008.json.gz"},
split="validation",
)
testenc = tokenizer("\n\n".join(testdata["text"]), return_tensors="pt")
return testenc
raise NotImplementedError

View File

@ -0,0 +1,162 @@
# Copyright 2024-present the HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import os
import numpy as np
import torch
from datautils import get_calib_data
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import get_peft_model
from peft.tuners.lora.config import CordaConfig, LoraConfig
from peft.tuners.lora.corda import preprocess_corda
@torch.no_grad()
def run_model(model, calib_loader):
model.eval()
for batch in tqdm(calib_loader):
batch = {k: v.to(model.device) for k, v in batch.items()}
model(**batch)
def main(args):
# Setting random seed of numpy and torch
np.random.seed(args.seed)
torch.manual_seed(args.seed)
torch.cuda.manual_seed_all(args.seed)
torch.backends.cudnn.deterministic = True
# Load model
model_id = args.model_id
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
model_id, device_map="auto", torch_dtype=torch.float16, trust_remote_code=True
)
# Collect data
calib_loader = get_calib_data(args.calib_dataset, tokenizer, model_id, args.calib_loader_size, seed=args.seed)
# Evaluate the original model
print("\n---- model before svd ---\n")
print(model)
# Perform decomposition
corda_config = CordaConfig(
corda_method="ipm" if args.first_eigen else "kpm",
)
lora_config = LoraConfig(
init_lora_weights="corda",
target_modules=["q_proj", "o_proj", "k_proj", "v_proj", "gate_proj", "up_proj", "down_proj"],
r=args.r,
lora_alpha=args.r,
corda_config=corda_config,
)
preprocess_corda(
model,
lora_config,
run_model=lambda: run_model(model, calib_loader),
)
model = get_peft_model(model, lora_config)
# Evaluate again to check if the model is consistent
# Using `model.model` here because `get_peft_model` wraps a layer to the model
print("\n---- model after svd ---\n")
print(model)
# Save as hugging face model
if args.save_model:
assert args.save_path is not None
save_path = args.save_path
# Save CorDA modules
model.peft_config["default"].init_lora_weights = True
model.save_pretrained(os.path.join(save_path, "corda_init"))
# Save residual model
model = model.unload()
model.save_pretrained(save_path)
# Save tokenizer
tokenizer.save_pretrained(save_path)
print(f"Done building CorDA huggingface model in {save_path}")
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--model_id",
type=str,
default="meta-llama/Llama-2-7b-hf",
help="Pretrained model ID",
)
parser.add_argument(
"--calib_loader_size",
type=int,
default=256,
help="number of samples used for covariance matrices",
)
parser.add_argument(
"--calib_dataset",
type=str,
default="wikitext2",
choices=[
"wikitext2",
"c4",
"ptb",
"traivia_qa",
"nqopen",
"MetaMATH",
"codefeedback",
"WizLMinstruct",
"alpaca",
],
help="calibration dataset",
)
parser.add_argument(
"--eval_mmlu",
action="store_true",
help="evaluate mmlu",
)
parser.add_argument(
"--seed",
type=int,
default=233,
help="random seed",
)
parser.add_argument(
"--r",
type=int,
default=None,
)
parser.add_argument(
"--first_eigen",
action="store_true",
)
parser.add_argument(
"--save_model",
action="store_true",
)
parser.add_argument(
"--save_path",
type=str,
default=None,
)
args = parser.parse_args()
main(args)

View File

@ -0,0 +1,64 @@
# Context-aware Prompt Tuning: Advancing In-Context Learning with Adversarial Methods
## Introduction ([Paper](https://arxiv.org/abs/2410.17222), [Code](https://github.com/tsachiblau/Context-aware-Prompt-Tuning-Advancing-In-Context-Learning-with-Adversarial-Methods), [Notebook](cpt_train_and_inference.ipynb), [Colab](https://colab.research.google.com/drive/1UhQDVhZ9bDlSk1551SuJV8tIUmlIayta?usp=sharing))
> Large Language Models (LLMs) can perform few-shot learning using either optimization-based approaches or In-Context Learning (ICL). Optimization-based methods often suffer from overfitting, as they require updating a large number of parameters with limited data. In contrast, ICL avoids overfitting but typically underperforms compared to optimization-based methods and is highly sensitive to the selection, order, and format of demonstration examples. To overcome these challenges, we introduce Context-aware Prompt Tuning (CPT), a method inspired by ICL, Prompt Tuning (PT), and adversarial attacks. CPT builds on the ICL strategy of concatenating examples before the input, extending it by incorporating PT-like learning to refine the context embedding through iterative optimization, extracting deeper insights from the training examples. Our approach carefully modifies specific context tokens, considering the unique structure of the examples within the context. In addition to updating the context with PT-like optimization, CPT draws inspiration from adversarial attacks, adjusting the input based on the labels present in the context while preserving the inherent value of the user-provided data. To ensure robustness and stability during optimization, we employ a projected gradient descent algorithm, constraining token embeddings to remain close to their original values and safeguarding the quality of the context. Our method has demonstrated superior accuracy across multiple classification tasks using various LLM models, outperforming existing baselines and effectively addressing the overfitting challenge in few-shot learning.
<div class="flex justify-center">
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/peft/cpt.png"/>
</div>
<small>CPT optimizing only specific token embeddings while keeping the rest of the model frozen <a href="https://huggingface.co/papers/2410.17222">(image source)</a>.</small>
---
## Dataset Creation and Collation for CPT
This document explains how to prepare datasets for CPT, linking the dataset preparation processes in the code to the methods and principles described in the CPT paper, specifically in **Sections 3.1**, **3.2**, and **3.3**.
---
### Template-Based Tokenization
#### The Role of Templates
Templates define the structure of the input-output pairs, enabling the model to interpret the task within a unified context.
- **Input Templates**:
Templates like `"input: {sentence}"` structure raw input sentences. The `{sentence}` placeholder is replaced with the actual input text.
- **Output Templates**:
Templates such as `"output: {label}"` format the labels (e.g., `positive`, `negative`, etc.).
- **Separator Tokens**:
Separators distinguish different parts of the input, such as the input text and labels, and also separate the examples within the context (see the sketch after this list).
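As a plain-Python illustration of these conventions (the helper below, the separator string, and the example data are assumptions made for this sketch, not part of the PEFT API):
```python
# Hypothetical helper mirroring the template structure described above.
INPUT_TEMPLATE = "input: {sentence}"
OUTPUT_TEMPLATE = "output: {label}"
SEPARATOR = "\n"  # assumed separator between parts and between examples

def build_context(train_examples, query_sentence):
    """Concatenate the formatted demonstrations, followed by the unanswered query."""
    parts = []
    for sentence, label in train_examples:
        parts.append(INPUT_TEMPLATE.format(sentence=sentence))
        parts.append(OUTPUT_TEMPLATE.format(label=label))
    parts.append(INPUT_TEMPLATE.format(sentence=query_sentence))
    parts.append("output:")  # the model is queried to complete the label here
    return SEPARATOR.join(parts)

print(build_context([("great movie!", "positive"), ("boring plot.", "negative")], "loved it"))
```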
#### How CPT Utilizes Context Structure
CPT leverages the context structure, encoded within the `cpt_tokens_type_mask`, to optimize the context effectively. By treating different token types according to their roles, the model updates some tokens while using others solely for optimization:
1. **Refrain from Updating Label Tokens**:
Some context tokens represent label tokens, which contain valuable, unmodifiable information. By excluding these tokens from updates during training, CPT ensures that the labels remain fixed, preserving their integrity.
2. **Apply Type-Specific Projection Norms**:
CPT employs Projected Gradient Descent (PGD) to update the context embeddings, applying tailored norms to different parts of the context. This reduces overfitting while maintaining robustness and generalization by preserving the integrity of the user-provided examples (a sketch of the projection step follows below).
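A minimal sketch of that projection step is shown below; the epsilon values, tensor shapes, and per-type handling are assumptions for illustration, not the exact CPT implementation:
```python
import torch

def project_context_embeddings(updated, original, token_type_mask, eps_by_type):
    """Project each updated context embedding back into an L2 ball around its original value.

    updated, original: (num_context_tokens, hidden_dim) tensors
    token_type_mask:   (num_context_tokens,) integer token roles (e.g. input vs. label tokens)
    eps_by_type:       dict mapping token role -> maximum allowed L2 deviation (0 keeps a token fixed)
    """
    delta = updated - original
    norms = delta.norm(dim=-1, keepdim=True).clamp_min(1e-12)
    eps = updated.new_tensor([eps_by_type[int(t)] for t in token_type_mask]).unsqueeze(-1)
    # Scale down only the deltas that left their epsilon ball.
    scale = torch.clamp(eps / norms, max=1.0)
    return original + delta * scale
```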
#### Limitations
CPT is designed for few-shot scenarios, as concatenating more examples increases memory usage due to the self-attention mechanism and additional loss terms. For larger datasets, users can limit the number of context examples and use the remaining samples solely for optimization to manage memory efficiently.
## Citation
```bib
@article{
blau2025cpt,
title={Context-Aware Prompt Tuning: Advancing In-Context Learning with Adversarial Methods},
author={Tsachi Blau, Moshe Kimhi, Yonatan Belinkov, Alexander Bronstein, Chaim Baskin},
journal={arXiv preprint arXiv:2410.17222},
year={2025}
}
```

File diff suppressed because it is too large

View File

@ -6,7 +6,7 @@ from transformers import (
AutoModelForCausalLM,
AutoTokenizer,
BitsAndBytesConfig,
DataCollatorWithPadding,
DataCollatorForLanguageModeling,
Trainer,
TrainingArguments,
)
@ -95,7 +95,7 @@ def train_model(
tokenized_datasets = dataset.map(tokenize_function, batched=True, remove_columns=dataset["train"].column_names)
# Data collator to dynamically pad the batched examples
data_collator = DataCollatorWithPadding(tokenizer)
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)
# Define training arguments
training_args = TrainingArguments(

View File

@ -0,0 +1,158 @@
# EVA: Explained Variance Adaptation
## Introduction ([Paper](https://arxiv.org/abs/2410.07170), [code](https://github.com/ml-jku/EVA))
Explained Variance Adaptation (EVA) is a novel initialization method for LoRA-style adapters which initializes adapter weights in a data-driven manner and adaptively allocates ranks according to the variance they explain. EVA improves average performance on a multitude of tasks across various domains, such as language generation and understanding, image classification, and decision making.
The abstract from the paper is:
*Foundation models (FMs) are pre-trained on large-scale datasets and then fine-tuned on a downstream task for a specific application. The most successful and most commonly used fine-tuning method is to update the pre-trained weights via a low-rank adaptation (LoRA). LoRA introduces new weight matrices that are usually initialized at random with a uniform rank distribution across model weights. Recent works focus on weight-driven initialization or learning of adaptive ranks during training. Both approaches have only been investigated in isolation, resulting in slow convergence or a uniform rank distribution, in turn leading to sub-optimal performance. We propose to enhance LoRA by initializing the new weights in a data-driven manner by computing singular value decomposition on minibatches of activation vectors. Then, we initialize the LoRA matrices with the obtained right-singular vectors and re-distribute ranks among all weight matrices to explain the maximal amount of variance and continue the standard LoRA fine-tuning procedure. This results in our new method **E**xplained **V**ariance **A**daptation (EVA). We apply EVA to a variety of fine-tuning tasks ranging from language generation and understanding to image classification and reinforcement learning. EVA exhibits faster convergence than competitors and attains the highest average score across a multitude of tasks per domain.*
## Quick Start
Below is an example of how to use EVA with a causal language model. For a more detailed example see [eva_finetuning.py](https://github.com/huggingface/peft/blob/main/examples/eva_finetuning/eva_finetuning.py).
```python
import torch
from datasets import load_dataset
from torch.utils.data import DataLoader
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import EvaConfig, LoraConfig, get_peft_model, initialize_lora_eva_weights
# config
model_name = "meta-llama/Llama-3.1-8B"
max_seq_len = 512
rank = 16
alpha = 1
rho = 2.0
target_modules = ["q_proj", "k_proj", "v_proj", "o_proj"]
svd_batch_size = 4 # can be different from the batch size used in finetuning
# load model and tokenizer
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
# load dataset
dataset = load_dataset("Rowan/hellaswag")
dataset = dataset.map(
lambda x: tokenizer(x["ctx"], padding="max_length", truncation=True, max_length=max_seq_len),
batched=True,
remove_columns=dataset["train"].column_names,
)
dataset.set_format(type="torch")
# create dataloader for SVD
# typically this is the same as the dataloader used for finetuning
dataloader = DataLoader(
dataset["train"],
batch_size=svd_batch_size,
collate_fn=lambda examples: {k: torch.stack([v[k] for v in examples], dim=0) for k in examples[0].keys()},
)
# setup peft config
eva_config = EvaConfig(
rho=rho
)
peft_config = LoraConfig(
r=rank,
lora_alpha=alpha,
target_modules=target_modules,
init_lora_weights="eva",
eva_config=eva_config
)
# move model to GPU
model = model.cuda()
# to optimize memory usage during EVA initialization, set low_cpu_mem_usage=True
peft_model = get_peft_model(model, peft_config, low_cpu_mem_usage=True)
initialize_lora_eva_weights(peft_model, dataloader)
```
`initialize_lora_eva_weights` will compute the SVD and load the components into the model. After this continue with standard LoRA finetuning.
## Using EVA with Bitsandbytes
EVA is fully compatible with bitsandbytes. Simply initialize the pretrained model with a BitsAndBytesConfig and then use the peft model with EVA.
```python
from transformers import BitsAndBytesConfig
from peft import prepare_model_for_kbit_training
model = AutoModelForCausalLM.from_pretrained(
"meta-llama/Llama-3.1-8B",
quantization_config=BitsAndBytesConfig(load_in_4bit=True)
)
model = prepare_model_for_kbit_training(model)
peft_model = get_peft_model(model, peft_config)
initialize_lora_eva_weights(peft_model, dataloader)
```
## Getting the EVA state_dict without loading the adapter weights
In some cases you might just want to get the state_dict after EVA initialization without loading the adapter weights. This can be useful for example if:
- you want to precompute and store the state_dict for different downstream tasks.
- you need to quantize the model for finetuning but want to perform EVA initialization with model weights in full/half precision.
- you do not intend to use a peft model for LoRA finetuning.
- you would like to leverage multiple GPUs for EVA initialization. (At the moment this is not directly supported by `initialize_lora_eva_weights`)
You can do this by calling `get_eva_state_dict` directly (you only need to pass `peft_config` if `model` is not a PeftModel):
```python
from peft import get_eva_state_dict
eva_state_dict = get_eva_state_dict(model, dataloader, peft_config)
```
Later you can load the state_dict into a `PeftModel` by using the `eva_state_dict` argument in `initialize_lora_eva_weights`:
```python
initialize_lora_eva_weights(peft_model, eva_state_dict=eva_state_dict)
```
## Leveraging multiple GPUs
EVA initialization can be parallelized across multiple GPUs. In this case inputs from multiple GPUs are gathered before computing the SVD for the batch. This requires that the model is wrapped in a `torch.nn.DataParallel` or `torch.nn.DistributedDataParallel` class. An example of how to use this can be found in [eva_finetuning_multi_gpu.py](https://github.com/huggingface/peft/blob/main/examples/eva_finetuning/eva_finetuning_multi_gpu.py).
## Customizing EVA
By default, EVA is designed to work with standard transformer language models. However, we integrated three different parameters which can be used to customize EVA for other types of models.
1. `forward_fn`: Defines how the forward pass during EVA initialization should be computed.
2. `prepare_model_inputs_fn`: Can be used if it is necessary to use information contained in the original model_input to prepare the input for SVD in individual layers.
3. `prepare_layer_inputs_fn`: Defines how layer inputs should be prepared for SVD.
All three parameters can be passed to `initialize_lora_eva_weights` and `get_eva_state_dict`.
### forward_fn
`forward_fn` defines how the forward pass during EVA initialization should be computed. `forward_fn` receives two arguments: `model` and `inputs`. By default this is set to `forward_fn_dict` which simply returns `model(**inputs)`.
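For instance, a custom `forward_fn` could look like the following sketch (the argument selection is an assumption for a model that should not receive the full input dict):
```python
def forward_fn_custom(model, inputs):
    # Only pass the tensors this particular model expects instead of forwarding the full dict.
    return model(input_ids=inputs["input_ids"], attention_mask=inputs.get("attention_mask"))
```
It would then be passed as `forward_fn=forward_fn_custom` to `initialize_lora_eva_weights` or `get_eva_state_dict`, since all three customization parameters are accepted by both functions.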
### prepare_model_inputs_fn
`prepare_model_inputs_fn` can be used if it is necessary to use information contained in the original model input to prepare the input for SVD in individual layers. `prepare_model_inputs_fn` receives two arguments: `model_input` and `peft_config`. This component is separate from `prepare_layer_inputs_fn` as its output only needs to be computed once per batch. By default this parameter is set to `prepare_model_inputs_fn_language_modeling`, which is used to get a subset of indices based on the attention mask and label mask, to avoid including padding tokens in the SVD computation. If you would like to not use this component, set `prepare_model_inputs_fn` to `None`. The default logic is:
```python
def prepare_model_inputs_fn_language_modeling(model_input, peft_config: LoraConfig):
mask = model_input.get("attention_mask", torch.ones_like(model_input["input_ids"])).bool()
if peft_config.eva_config.use_label_mask and hasattr(model_input, "labels"):
mask = torch.logical_and(mask, model_input["labels"] != peft_config.eva_config.label_mask_value)
return mask.nonzero()
```
### prepare_layer_inputs_fn
`prepare_layer_inputs_fn` can be used to preprocess the layer inputs before passing them to the SVD algorithm. `prepare_layer_inputs_fn` receives three arguments: `layer_input`, `model_input` and `layer_name`. It can either be a callable or a dictionary where the keys are the layer names and the values are callables. If it is a dictionary, functions are assigned to adapter layers based on the layer names. By default, a language modeling setting is assumed where `model_input` is the output of `prepare_model_inputs_fn_language_modeling`, which is a mask of indices. If this parameter is set to `None`, only two modifications are made to the layer inputs:
- take the first element in case of a tuple or list.
- if the input has more than 2 dimensions, flatten all but the last dimension.
The function must always return a tensor. The default logic is:
```python
def prepare_layer_inputs_fn_default(layer_input, model_input, layer_name) -> torch.Tensor:
if isinstance(layer_input, (tuple, list)):
layer_input = layer_input[0]
return layer_input[model_input.T.unbind()]
```
## Citation
In case you find our work useful, please consider citing it.
```
@article{paischer2024eva,
title={One Initialization to Rule them All: Fine-tuning via Explained Variance Adaptation},
author={Fabian Paischer, Lukas Hauzenberger, Thomas Schmied, Benedikt Alkin, Marc Peter Deisenroth, Sepp Hochreiter},
journal={arXiv preprint arXiv:2410.07170},
year={2024}
}
```

View File

@ -0,0 +1,97 @@
# Copyright 2024-present the HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch
from datasets import load_dataset
from torch.utils.data import DataLoader
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments
from utils import DataCollator, TokenizerMetaMath
from peft import EvaConfig, LoraConfig, get_peft_model, initialize_lora_eva_weights
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# config
model_name = "meta-llama/Llama-3.1-8B"
max_seq_len = 512
rank = 16
alpha = 1
rho = 2.0
target_modules = ["q_proj", "k_proj", "v_proj", "o_proj"]
svd_batch_size = 4 # can be different from the batch size used in finetuning
batch_size = 4
learning_rate = 5e-4
gradient_accumulation_steps = 8
num_epochs = 1
output_dir = "outputs"
bf16 = True
# load model and tokenizer
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
# load dataset
dataset = load_dataset("meta-math/MetaMathQA")
dataset = dataset.map(
TokenizerMetaMath(model_name),
batched=True,
remove_columns=dataset["train"].column_names,
)
dataset.set_format(type="torch")
# data collator
data_collator = DataCollator(tokenizer.eos_token_id, max_length=max_seq_len)
# dataloader
dataloader = DataLoader(
dataset["train"],
batch_size=svd_batch_size,
collate_fn=data_collator,
)
# setup peft config
eva_config = EvaConfig(rho=rho)
peft_config = LoraConfig(
r=rank, lora_alpha=alpha, target_modules=target_modules, init_lora_weights="eva", eva_config=eva_config
)
# move model to GPU
model = model.to(DEVICE)
# to optimize memory usage during eva initialization, set low_cpu_mem_usage=True
peft_model = get_peft_model(model, peft_config, low_cpu_mem_usage=True)
initialize_lora_eva_weights(peft_model, dataloader)
# setup training arguments
training_args = TrainingArguments(
per_device_train_batch_size=batch_size,
learning_rate=learning_rate,
gradient_accumulation_steps=gradient_accumulation_steps,
num_train_epochs=num_epochs,
output_dir=output_dir,
remove_unused_columns=False,
bf16=bf16,
)
# continue with standard finetuning
trainer = Trainer(
model=peft_model,
args=training_args,
train_dataset=dataset["train"],
data_collator=data_collator,
)
trainer.train()

View File

@ -0,0 +1,127 @@
# Copyright 2024-present the HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import torch
import torch.distributed as dist
from datasets import load_dataset
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.utils.data import DataLoader
from torch.utils.data.distributed import DistributedSampler
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments
from utils import DataCollator, TokenizerMetaMath
from peft import EvaConfig, LoraConfig, get_eva_state_dict, get_peft_model, initialize_lora_eva_weights
# run this script e.g. with: torchrun --nproc_per_node=4 eva_finetuning_multi_gpu.py
# config
model_name = "meta-llama/Llama-2-7b-hf"
max_seq_len = 512
rank = 16
alpha = 1
rho = 2.0
target_modules = ["q_proj", "k_proj", "v_proj", "o_proj"]
svd_batch_size = 4 # can be different from the batch size used in finetuning
batch_size = 4
learning_rate = 5e-4
gradient_accumulation_steps = 8
num_epochs = 1
output_dir = "outputs"
bf16 = True
# Initialize distributed environment
if torch.cuda.is_available():
local_rank = int(os.environ.get("LOCAL_RANK", -1))
torch.cuda.set_device(local_rank)
dist.init_process_group("nccl")
world_size = dist.get_world_size()
else:
local_rank = -1
world_size = 1
# load model and tokenizer
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
# load dataset
dataset = load_dataset("meta-math/MetaMathQA")
dataset = dataset.map(
TokenizerMetaMath(model_name),
batched=True,
remove_columns=dataset["train"].column_names,
)
dataset.set_format(type="torch")
# data collator
data_collator = DataCollator(tokenizer.eos_token_id, max_length=max_seq_len)
# Create sampler for distributed training
sampler = DistributedSampler(dataset["train"], num_replicas=world_size, rank=local_rank)
# dataloader
dataloader = DataLoader(
dataset["train"],
batch_size=svd_batch_size,
collate_fn=data_collator,
sampler=sampler,
shuffle=False,
)
sampler.set_epoch(0)
# Wrap model in DDP
model = model.to(local_rank)
model = DDP(model, device_ids=[local_rank], output_device=local_rank)
# setup peft config
eva_config = EvaConfig(rho=rho)
peft_config = LoraConfig(
r=rank, lora_alpha=alpha, target_modules=target_modules, init_lora_weights="eva", eva_config=eva_config
)
# EVA initialization
eva_state_dict = get_eva_state_dict(model, dataloader, peft_config)
eva_state_dict = {".".join(["base_model.model"] + k.split(".")[1:]): v for k, v in eva_state_dict.items()}
# cleanup ddp
model = model.module
# initialize peft model
peft_model = get_peft_model(model, peft_config, low_cpu_mem_usage=True)
initialize_lora_eva_weights(peft_model, eva_state_dict=eva_state_dict)
# setup training arguments
training_args = TrainingArguments(
per_device_train_batch_size=batch_size,
learning_rate=learning_rate,
gradient_accumulation_steps=gradient_accumulation_steps,
num_train_epochs=num_epochs,
output_dir=output_dir,
remove_unused_columns=False,
bf16=bf16,
)
# continue with standard finetuning
trainer = Trainer(
model=peft_model,
args=training_args,
train_dataset=dataset["train"],
data_collator=data_collator,
)
trainer.train()

View File

@ -0,0 +1,76 @@
# Copyright 2024-present the HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch
from transformers import AutoTokenizer
class TokenizerMetaMath:
PROMPT_NO_INPUT = (
"Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n"
"### Instruction:\n{query}\n\n### Response: "
)
PROMPT = (
"Below is an instruction that describes a task, paired with an input that provides further context. "
"Write a response that appropriately completes the request.\n\n"
"### Instruction:\n{query}\n\n### Input:\n{input}\n\n### Response: "
)
def format_prompt(self, query):
query = query.split("\n", 1)
if len(query) == 1 or query[1].strip("\n") == "":
return self.PROMPT_NO_INPUT.format(query=query[0])
else:
return self.PROMPT.format(query=query[0], input=query[1])
def __init__(self, tokenizer_path):
self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
def __call__(self, examples):
prompts = [self.format_prompt(text) for text in examples["query"]]
completions = examples["response"]
return self._tokenize_fn(prompts, completions)
def _tokenize_fn(self, prompts, completions):
prompt_tokens = self.tokenizer(prompts, add_special_tokens=False)["input_ids"]
input_tokens = self.tokenizer([x + y for x, y in zip(prompts, completions)], add_special_tokens=False)[
"input_ids"
]
input_tokens = [[self.tokenizer.bos_token_id] + x + [self.tokenizer.eos_token_id] for x in input_tokens]
prompt_length = [len(x) + 1 for x in prompt_tokens] # +1 for the bos token
input_length = [len(x) for x in input_tokens]
return {"input_ids": input_tokens, "prompt_length": prompt_length, "input_length": input_length}
class DataCollator:
def __init__(self, eos_token_id, max_length=None):
self.eos_token_id = eos_token_id
self.max_length = max_length
def __call__(self, batch):
batch = {k: [item[k] for item in batch] for k in batch[0]}
input_lengths = torch.stack(batch["input_length"])
prompt_lengths = torch.stack(batch["prompt_length"])
input_ids = torch.nn.utils.rnn.pad_sequence(
batch["input_ids"], batch_first=True, padding_value=self.eos_token_id
)
col_indices = torch.arange(input_ids.size(1)).unsqueeze(0)
attention_mask = col_indices < input_lengths.unsqueeze(1)
label_mask = torch.logical_or(col_indices < prompt_lengths.unsqueeze(1), ~attention_mask)
labels = input_ids.masked_fill(label_mask, -100)
if self.max_length is not None:
input_ids = input_ids[:, : self.max_length]
attention_mask = attention_mask[:, : self.max_length]
labels = labels[:, : self.max_length]
return {"input_ids": input_ids, "attention_mask": attention_mask, "labels": labels}

File diff suppressed because it is too large

View File

@ -264,7 +264,7 @@ def main():
dataset = DatasetDict({"train": train_dataset, "validation": val_dataset})
else:
dataset = load_dataset(args.dataset_name)
dataset = load_dataset(args.dataset_name, revision="main")
def preprocess_function(examples):
queries = examples["query"]
@ -440,13 +440,13 @@ def main():
completed_steps += 1
if (step + 1) % 100 == 0:
logger.info(f"Step: {step+1}, Loss: {total_loss/(step+1)}")
logger.info(f"Step: {step + 1}, Loss: {total_loss / (step + 1)}")
if args.with_tracking:
accelerator.log({"train/loss": total_loss / (step + 1)}, step=completed_steps)
if isinstance(checkpointing_steps, int):
if completed_steps % checkpointing_steps == 0:
output_dir = f"step_{completed_steps }"
output_dir = f"step_{completed_steps}"
if args.output_dir is not None:
output_dir = os.path.join(args.output_dir, output_dir)
accelerator.save_state(output_dir)

View File

@ -127,7 +127,7 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": null,
"id": "f190e1ee",
"metadata": {},
"outputs": [
@ -157,7 +157,7 @@
"import pandas as pd\n",
"\n",
"tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)\n",
"dataset = load_dataset(dataset_name)\n",
"dataset = load_dataset(dataset_name, revision=\"main\")\n",
"train_product_dataset = dataset[\"train\"].to_pandas()[[\"product_title\"]]\n",
"val_product_dataset = dataset[\"validation\"].to_pandas()[[\"product_title\"]]\n",
"product_dataset_for_indexing = pd.concat([train_product_dataset, val_product_dataset])\n",

View File

@ -38,7 +38,7 @@ Here let's load the `opt-6.7b` model, its weights in half-precision (float16) ar
free_in_GB = int(torch.cuda.mem_get_info()[0] / 1024**3)
max_memory = f"{free_in_GB-2}GB"
max_memory = f"{free_in_GB - 2}GB"
n_gpus = torch.cuda.device_count()
max_memory = {i: max_memory for i in range(n_gpus)}

View File

@ -1,4 +1,4 @@
transformers==4.36.2
transformers>=4.48.0
accelerate==0.25.0
evaluate
tqdm

View File

@ -494,7 +494,7 @@ def main():
raw_datasets = raw_datasets.cast_column("audio", Audio(sampling_rate=16000))
logger.info("Dataset loaded: %s", raw_datasets)
logger.info(f'{raw_datasets["train"][0]}')
logger.info(f"{raw_datasets['train'][0]}")
vectorized_datasets = raw_datasets.map(
prepare_dataset,

View File

@ -10,7 +10,7 @@ Steps:
1. Apply LoftQ to a full-precision pre-trained weight and save.
2. Load LoftQ initialization and train.
For step 1, we have provided off-the-shelf LoftQ initializations (see [supported model list](#appendix-off-the-shelf-model-table))
For step 1, we have provided off-the-shelf LoftQ initializations (see [supported model list](#appendix-off-the-shelf-model-list))
in [Huggingface Hub LoftQ](https://huggingface.co/LoftQ).
If you want to do it yourself, jump to [LoftQ DIY](#loftq-diy).

View File

@ -30,7 +30,7 @@ def get_module_kohya_state_dict(
# Set alpha parameter
if "lora_down" in kohya_key:
alpha_key = f'{kohya_key.split(".")[0]}.alpha'
alpha_key = f"{kohya_key.split('.')[0]}.alpha"
kohya_ss_state_dict[alpha_key] = torch.tensor(module.peft_config[adapter_name].lora_alpha).to(dtype)
return kohya_ss_state_dict

View File

@ -8,7 +8,7 @@
import torch
from peft import LoraConfig, get_peft_model
from transformers import AutoTokenizer, AutoModelForCausalLM
from trl import SFTTrainer
from trl import SFTConfig, SFTTrainer
from datasets import load_dataset
model = AutoModelForCausalLM.from_pretrained("facebook/opt-350m", torch_dtype=torch.bfloat16, device_map="auto")
@ -18,11 +18,10 @@ lora_config = LoraConfig(
init_lora_weights="olora"
)
peft_model = get_peft_model(model, lora_config)
training_args = SFTConfig(dataset_text_field="text", max_seq_length=128)
trainer = SFTTrainer(
model=peft_model,
train_dataset=dataset,
dataset_text_field="text",
max_seq_length=512,
tokenizer=tokenizer,
)
trainer.train()
@ -40,6 +39,19 @@ OLoRA also supports quantization. To use 4-bit quantization try:
```bash
python3 examples/olora_finetuning/olora_finetuning.py --base_model facebook/opt-350m --quantize
```
or you can just pass a quantized model without the quantize flag.
If you want to run DDP by [accelerate](https://huggingface.co/docs/accelerate/en/index), please run `accelerate config` to set your ddp config, and run:
```bash
accelerate launch examples/olora_finetuning/olora_finetuning.py --base_model facebook/opt-350m
```
please add `--device_map cpu` if you want to run finetune on CPU.
If you want to train a quantized model like AWQ and GPTQ which do not support olora init method, please pass `--init_lora_weights gaussian`. For example:
```bash
python3 examples/olora_finetuning/olora_finetuning.py --base_model hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4 --init_lora_weights gaussian
```
## Use the model

View File

@ -13,12 +13,13 @@
# limitations under the License.
from typing import List
import os
from typing import List, Optional
import torch
import transformers
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, set_seed
from peft import (
LoraConfig,
@ -43,23 +44,33 @@ def train(
lora_alpha: int = 16,
lora_dropout: float = 0.05,
lora_target_modules: List[str] = None,
torch_dtype: str = "float16",
init_lora_weights="olora",
seed: Optional[int] = None,
):
model = AutoModelForCausalLM.from_pretrained(
base_model,
device_map=device_map,
quantization_config=BitsAndBytesConfig(
# Set device_map to the right place when enabling DDP.
world_size = int(os.environ.get("WORLD_SIZE", 0)) or int(os.environ.get("PMI_SIZE", 0))
if world_size > 1 and device_map != "cpu":
from accelerate import Accelerator
device_map = {"": Accelerator().process_index}
# Set seed
if seed is not None:
set_seed(seed)
model_kwargs = {"torch_dtype": getattr(torch, torch_dtype), "device_map": device_map}
if quantize:
model_kwargs["quantization_config"] = BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_compute_dtype=torch.bfloat16,
bnb_4bit_use_double_quant=True,
bnb_4bit_quant_type="nf4",
)
if quantize
else None,
torch_dtype=torch.float16,
)
model = AutoModelForCausalLM.from_pretrained(base_model, **model_kwargs)
tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)
# For some tokenizer with no pad token like llama
if tokenizer.pad_token is None:
tokenizer.pad_token = tokenizer.eos_token
def tokenize(prompt, add_eos_token=True):
result = tokenizer(
@ -112,7 +123,6 @@ def train(
warmup_steps=100,
num_train_epochs=num_epochs,
learning_rate=learning_rate,
fp16=True,
logging_steps=100,
optim="adamw_torch",
evaluation_strategy="steps",
@ -122,6 +132,7 @@ def train(
output_dir=output_dir,
save_total_limit=3,
load_best_model_at_end=True,
ddp_find_unused_parameters=False if world_size > 1 else None,
),
data_collator=transformers.DataCollatorForSeq2Seq(
tokenizer, pad_to_multiple_of=8, return_tensors="pt", padding=True
@ -159,7 +170,9 @@ if __name__ == "__main__":
parser.add_argument("--lora_alpha", type=int, default=16)
parser.add_argument("--lora_dropout", type=float, default=0.05)
parser.add_argument("--lora_target_modules", type=str, default=None)
parser.add_argument("--torch_dtype", type=str, default="float16")
parser.add_argument("--init_lora_weights", type=str, default="olora")
parser.add_argument("--seed", type=int, default=None)
args = parser.parse_args()
@ -180,5 +193,7 @@ if __name__ == "__main__":
lora_alpha=args.lora_alpha,
lora_dropout=args.lora_dropout,
lora_target_modules=args.lora_target_modules,
torch_dtype=args.torch_dtype,
init_lora_weights=args.init_lora_weights,
seed=args.seed,
)

View File

@ -6,8 +6,7 @@ PiSSA represents a matrix $W\in\mathbb{R}^{m\times n}$ within the model by the p
```python
import torch
from peft import LoraConfig, get_peft_model
from transformers import AutoTokenizer, AutoModelForCausalLM
from trl import SFTTrainer
from transformers import AutoTokenizer, AutoModelForCausalLM
from trl import SFTConfig, SFTTrainer
from datasets import load_dataset
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf", torch_dtype=torch.bfloat16, device_map="auto")
@ -23,11 +22,11 @@ peft_model.print_trainable_parameters()
dataset = load_dataset("imdb", split="train[:1%]")
training_args = SFTConfig(dataset_text_field="text", max_seq_length=128)
trainer = SFTTrainer(
model=peft_model,
args=training_args,
train_dataset=dataset,
dataset_text_field="text",
max_seq_length=128,
tokenizer=tokenizer,
)
trainer.train()
@ -72,7 +71,7 @@ The main advantage of PiSSA is concentrated during the training phase. For a tra
peft_model.save_pretrained(output_dir)
# Given the matrices $A_0$ and $B_0$, initialized by PiSSA and untrained, and the trained matrices $A$ and $B$,
# we can convert these to LoRA by setting $\Delta W = A \times B - A_0 \times B_0 = [A \mid A_0] \times [B \mid -B_0]^T = A'B'$.
peft_model.save_pretrained(output_dir, convert_pissa_to_lora="pissa_init")
peft_model.save_pretrained(output_dir, path_initial_model_for_weight_conversion="pissa_init")
```
This conversion enables the loading of LoRA on top of a standard base model:

View File

@ -14,18 +14,18 @@
import os
from dataclasses import dataclass, field
from typing import List, Optional
from typing import Optional
import torch
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, HfArgumentParser, TrainingArguments
from trl import SFTTrainer
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, HfArgumentParser
from trl import SFTConfig, SFTTrainer
from peft import LoraConfig, PeftModel, get_peft_model, prepare_model_for_kbit_training
@dataclass
class TrainingArguments(TrainingArguments):
class ScriptArguments(SFTConfig):
# model configs
base_model_name_or_path: Optional[str] = field(
default=None, metadata={"help": "The name or path of the fp32/16 base model."}
@ -46,14 +46,9 @@ class TrainingArguments(TrainingArguments):
# dataset configs
data_path: str = field(default="imdb", metadata={"help": "Path to the training data."})
dataset_split: str = field(default="train[:1%]", metadata={"help": "(`['train', 'test', 'eval']`):"})
dataset_field: List[str] = field(default=None, metadata={"help": "Fields of dataset input and output."})
max_seq_length: int = field(
default=512,
metadata={"help": "Maximum sequence length. Sequences will be right padded (and possibly truncated)."},
)
parser = HfArgumentParser(TrainingArguments)
parser = HfArgumentParser(ScriptArguments)
script_args = parser.parse_args_into_dataclasses()[0]
print(script_args)
@ -133,8 +128,6 @@ trainer = SFTTrainer(
model=peft_model,
args=script_args,
train_dataset=dataset,
dataset_text_field="text",
max_seq_length=script_args.max_seq_length,
tokenizer=tokenizer,
)
trainer.train()
@ -143,7 +136,7 @@ trainer.save_state()
if script_args.convert_pissa_to_lora:
peft_model.save_pretrained(
os.path.join(script_args.output_dir, "pissa_lora"),
convert_pissa_to_lora=os.path.join(script_args.residual_model_name_or_path, "pissa_init"),
path_initial_model_for_weight_conversion=os.path.join(script_args.residual_model_name_or_path, "pissa_init"),
)
else:
peft_model.save_pretrained(

View File

@ -0,0 +1,526 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "900b542d-0249-453c-a915-a061b80af69f",
"metadata": {},
"source": [
"# PyTorch AO (torchao) with int8_dynamic_activation_int8_weight"
]
},
{
"cell_type": "markdown",
"id": "10e1acc3-50b8-4d40-bdf3-0133c113cc4b",
"metadata": {},
"source": [
"## Imports"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "a9935ae2",
"metadata": {},
"outputs": [],
"source": [
"import argparse\n",
"import os\n",
"\n",
"import torch\n",
"from torch.optim import AdamW\n",
"from torch.utils.data import DataLoader\n",
"from peft import (\n",
" get_peft_config,\n",
" get_peft_model,\n",
" get_peft_model_state_dict,\n",
" set_peft_model_state_dict,\n",
" LoraConfig,\n",
" PeftType,\n",
" PrefixTuningConfig,\n",
" PromptEncoderConfig,\n",
")\n",
"\n",
"import evaluate\n",
"from datasets import load_dataset\n",
"from transformers import AutoModelForSequenceClassification, AutoTokenizer, TorchAoConfig, get_linear_schedule_with_warmup, set_seed\n",
"from tqdm import tqdm"
]
},
{
"cell_type": "markdown",
"id": "eafdd532-b1eb-4aac-8077-3386a84c7cdb",
"metadata": {},
"source": [
"## Parameters"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "e3b13308",
"metadata": {},
"outputs": [],
"source": [
"batch_size = 16\n",
"model_name_or_path = \"google/gemma-2-2b\"\n",
"task = \"mrpc\"\n",
"device = \"cuda\"\n",
"num_epochs = 5\n",
"lr = 2e-5\n",
"\n",
"lora_rank = 16\n",
"lora_alpha = 32\n",
"lora_dropout = 0.1"
]
},
{
"cell_type": "markdown",
"id": "c7fb69bf-0182-4111-b715-e2e659b42b1d",
"metadata": {},
"source": [
"## Data"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "d2f4d25e-30b9-431f-95c3-adb390dc6fcd",
"metadata": {},
"outputs": [],
"source": [
"if any(k in model_name_or_path for k in (\"gpt\", \"opt\", \"bloom\")):\n",
" padding_side = \"left\"\n",
"else:\n",
" padding_side = \"right\"\n",
"\n",
"tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, padding_side=padding_side)\n",
"if getattr(tokenizer, \"pad_token_id\") is None:\n",
" tokenizer.pad_token_id = tokenizer.eos_token_id\n",
"\n",
"datasets = load_dataset(\"glue\", task)\n",
"metric = evaluate.load(\"glue\", task)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "1ea852bc-a040-4244-8fd3-516307cecd14",
"metadata": {},
"outputs": [],
"source": [
"def tokenize_function(examples):\n",
" # max_length=None => use the model max length (it's actually the default)\n",
" outputs = tokenizer(examples[\"sentence1\"], examples[\"sentence2\"], truncation=True, max_length=None)\n",
" return outputs"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "cf5ef289-f42f-4582-bd5e-9852ad8beff2",
"metadata": {},
"outputs": [],
"source": [
"tokenized_datasets = datasets.map(\n",
" tokenize_function,\n",
" batched=True,\n",
" remove_columns=[\"idx\", \"sentence1\", \"sentence2\"],\n",
")\n",
"\n",
"# We also rename the 'label' column to 'labels' which is the expected name for labels by the models of the\n",
"# transformers library\n",
"tokenized_datasets = tokenized_datasets.rename_column(\"label\", \"labels\")"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "739b3655-9db0-48bc-8542-308c6d5e0b8b",
"metadata": {},
"outputs": [],
"source": [
"def collate_fn(examples):\n",
" return tokenizer.pad(examples, padding=\"longest\", return_tensors=\"pt\")"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "0288f311-8475-4a0e-99af-e4b909d10e01",
"metadata": {},
"outputs": [],
"source": [
"# Instantiate dataloaders.\n",
"train_dataloader = DataLoader(\n",
" tokenized_datasets[\"train\"],\n",
" shuffle=True,\n",
" collate_fn=collate_fn,\n",
" batch_size=batch_size,\n",
")\n",
"eval_dataloader = DataLoader(\n",
" tokenized_datasets[\"validation\"],\n",
" shuffle=False,\n",
" collate_fn=collate_fn,\n",
" batch_size=batch_size,\n",
")"
]
},
{
"cell_type": "markdown",
"id": "fcaf6f9e-c9d1-445a-9f08-18ef462f67ce",
"metadata": {},
"source": [
"## Model"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "e5dfff56-ea80-4561-aeaf-43216bbb9af7",
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "2ac42f98e60d412496fe77ed7eb5c6df",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Loading checkpoint shards: 0%| | 0/3 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Some weights of Gemma2ForSequenceClassification were not initialized from the model checkpoint at google/gemma-2-2b and are newly initialized: ['score.weight']\n",
"You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
]
}
],
"source": [
"quant_config = TorchAoConfig(quant_type=\"int8_dynamic_activation_int8_weight\")\n",
"model = AutoModelForSequenceClassification.from_pretrained(\n",
" model_name_or_path, return_dict=True, device_map=0, torch_dtype=torch.bfloat16, quantization_config=quant_config\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "0526f571",
"metadata": {},
"outputs": [],
"source": [
"peft_config = LoraConfig(\n",
" task_type=\"SEQ_CLS\",\n",
" r=lora_rank,\n",
" lora_alpha=lora_alpha,\n",
" lora_dropout=lora_dropout,\n",
" target_modules=[\"q_proj\", \"v_proj\"],\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "ceeae329-e931-4d52-8a28-9c87e5cdb4cf",
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"trainable params: 3,199,488 || all params: 2,617,545,984 || trainable%: 0.1222\n"
]
}
],
"source": [
"model = get_peft_model(model, peft_config)\n",
"model.print_trainable_parameters()"
]
},
{
"cell_type": "markdown",
"id": "1b3d2544-3028-4e2a-9c56-d4d7d9d674de",
"metadata": {},
"source": [
"## Training"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "0d2d0381",
"metadata": {},
"outputs": [],
"source": [
"optimizer = AdamW(params=model.parameters(), lr=lr)\n",
"\n",
"# Instantiate scheduler\n",
"lr_scheduler = get_linear_schedule_with_warmup(\n",
" optimizer=optimizer,\n",
" num_warmup_steps=0.06 * (len(train_dataloader) * num_epochs),\n",
" num_training_steps=(len(train_dataloader) * num_epochs),\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "f04c88ca-84eb-4184-afe6-3869b6f96b76",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"PeftModelForSequenceClassification(\n",
" (base_model): LoraModel(\n",
" (model): Gemma2ForSequenceClassification(\n",
" (model): Gemma2Model(\n",
" (embed_tokens): Embedding(256000, 2304, padding_idx=0)\n",
" (layers): ModuleList(\n",
" (0-25): 26 x Gemma2DecoderLayer(\n",
" (self_attn): Gemma2Attention(\n",
" (q_proj): lora.TorchaoLoraLinear(\n",
" (base_layer): Linear(in_features=2304, out_features=2048, weight=LinearActivationQuantizedTensor(activation=<function _int8_symm_per_token_reduced_range_quant at 0x7a846f516520>, weight=AffineQuantizedTensor(shape=torch.Size([2048, 2304]), block_size=(1, 2304), device=cuda:0, layout_type=PlainLayoutType(), layout_tensor_dtype=torch.int8, quant_min=None, quant_max=None)))\n",
" (lora_dropout): ModuleDict(\n",
" (default): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=2304, out_features=16, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=16, out_features=2048, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" (lora_magnitude_vector): ModuleDict()\n",
" )\n",
" (k_proj): Linear(in_features=2304, out_features=1024, weight=LinearActivationQuantizedTensor(activation=<function _int8_symm_per_token_reduced_range_quant at 0x7a846f516520>, weight=AffineQuantizedTensor(shape=torch.Size([1024, 2304]), block_size=(1, 2304), device=cuda:0, layout_type=PlainLayoutType(), layout_tensor_dtype=torch.int8, quant_min=None, quant_max=None)))\n",
" (v_proj): lora.TorchaoLoraLinear(\n",
" (base_layer): Linear(in_features=2304, out_features=1024, weight=LinearActivationQuantizedTensor(activation=<function _int8_symm_per_token_reduced_range_quant at 0x7a846f516520>, weight=AffineQuantizedTensor(shape=torch.Size([1024, 2304]), block_size=(1, 2304), device=cuda:0, layout_type=PlainLayoutType(), layout_tensor_dtype=torch.int8, quant_min=None, quant_max=None)))\n",
" (lora_dropout): ModuleDict(\n",
" (default): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=2304, out_features=16, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=16, out_features=1024, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" (lora_magnitude_vector): ModuleDict()\n",
" )\n",
" (o_proj): Linear(in_features=2048, out_features=2304, weight=LinearActivationQuantizedTensor(activation=<function _int8_symm_per_token_reduced_range_quant at 0x7a846f516520>, weight=AffineQuantizedTensor(shape=torch.Size([2304, 2048]), block_size=(1, 2048), device=cuda:0, layout_type=PlainLayoutType(), layout_tensor_dtype=torch.int8, quant_min=None, quant_max=None)))\n",
" (rotary_emb): Gemma2RotaryEmbedding()\n",
" )\n",
" (mlp): Gemma2MLP(\n",
" (gate_proj): Linear(in_features=2304, out_features=9216, weight=LinearActivationQuantizedTensor(activation=<function _int8_symm_per_token_reduced_range_quant at 0x7a846f516520>, weight=AffineQuantizedTensor(shape=torch.Size([9216, 2304]), block_size=(1, 2304), device=cuda:0, layout_type=PlainLayoutType(), layout_tensor_dtype=torch.int8, quant_min=None, quant_max=None)))\n",
" (up_proj): Linear(in_features=2304, out_features=9216, weight=LinearActivationQuantizedTensor(activation=<function _int8_symm_per_token_reduced_range_quant at 0x7a846f516520>, weight=AffineQuantizedTensor(shape=torch.Size([9216, 2304]), block_size=(1, 2304), device=cuda:0, layout_type=PlainLayoutType(), layout_tensor_dtype=torch.int8, quant_min=None, quant_max=None)))\n",
" (down_proj): Linear(in_features=9216, out_features=2304, weight=LinearActivationQuantizedTensor(activation=<function _int8_symm_per_token_reduced_range_quant at 0x7a846f516520>, weight=AffineQuantizedTensor(shape=torch.Size([2304, 9216]), block_size=(1, 9216), device=cuda:0, layout_type=PlainLayoutType(), layout_tensor_dtype=torch.int8, quant_min=None, quant_max=None)))\n",
" (act_fn): PytorchGELUTanh()\n",
" )\n",
" (input_layernorm): Gemma2RMSNorm((2304,), eps=1e-06)\n",
" (post_attention_layernorm): Gemma2RMSNorm((2304,), eps=1e-06)\n",
" (pre_feedforward_layernorm): Gemma2RMSNorm((2304,), eps=1e-06)\n",
" (post_feedforward_layernorm): Gemma2RMSNorm((2304,), eps=1e-06)\n",
" )\n",
" )\n",
" (norm): Gemma2RMSNorm((2304,), eps=1e-06)\n",
" )\n",
" (score): ModulesToSaveWrapper(\n",
" (original_module): Linear(in_features=2304, out_features=2, bias=False)\n",
" (modules_to_save): ModuleDict(\n",
" (default): Linear(in_features=2304, out_features=2, bias=False)\n",
" )\n",
" )\n",
" )\n",
" )\n",
")"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"model.config.use_cache = False\n",
"model.to(device)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "fa0e73be",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
" 0%| | 0/230 [00:00<?, ?it/s]You're using a GemmaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.\n",
"100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 230/230 [00:43<00:00, 5.27it/s]\n",
"100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 26/26 [00:04<00:00, 5.33it/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"epoch 1 | train loss 1.7618 | {'accuracy': 0.46568627450980393, 'f1': 0.5458333333333333}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 230/230 [00:43<00:00, 5.29it/s]\n",
"100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 26/26 [00:04<00:00, 5.47it/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"epoch 2 | train loss 1.1905 | {'accuracy': 0.5245098039215687, 'f1': 0.6325757575757576}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 230/230 [00:43<00:00, 5.32it/s]\n",
"100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 26/26 [00:04<00:00, 5.34it/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"epoch 3 | train loss 1.1478 | {'accuracy': 0.5318627450980392, 'f1': 0.6456400742115028}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 230/230 [00:43<00:00, 5.29it/s]\n",
"100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 26/26 [00:04<00:00, 5.36it/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"epoch 4 | train loss 1.1384 | {'accuracy': 0.5367647058823529, 'f1': 0.6506469500924215}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 230/230 [00:44<00:00, 5.21it/s]\n",
"100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 26/26 [00:04<00:00, 5.43it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"epoch 5 | train loss 1.1365 | {'accuracy': 0.5367647058823529, 'f1': 0.6506469500924215}\n",
"CPU times: user 4min 2s, sys: 399 ms, total: 4min 2s\n",
"Wall time: 4min 2s\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n"
]
}
],
"source": [
"%%time\n",
"for epoch in range(1, num_epochs + 1):\n",
" model.train()\n",
" train_losses = []\n",
" for step, batch in enumerate(tqdm(train_dataloader)):\n",
" batch.to(device)\n",
" outputs = model(**batch)\n",
" loss = outputs.loss\n",
" if not torch.isfinite(loss):\n",
" raise ValueError(\"non-finite loss encountered\")\n",
"\n",
" loss.backward()\n",
" optimizer.step()\n",
" lr_scheduler.step()\n",
" optimizer.zero_grad()\n",
" train_losses.append(loss.item())\n",
"\n",
" model.eval()\n",
" for step, batch in enumerate(tqdm(eval_dataloader)):\n",
" batch.to(device)\n",
" with torch.no_grad():\n",
" outputs = model(**batch)\n",
" predictions = outputs.logits.argmax(dim=-1)\n",
" predictions, references = predictions, batch[\"labels\"]\n",
" metric.add_batch(\n",
" predictions=predictions,\n",
" references=references,\n",
" )\n",
"\n",
" eval_metric = metric.compute()\n",
" train_loss = sum(train_losses) / len(train_losses)\n",
" print(f\"epoch {epoch} | train loss {train_loss:.4f} |\", eval_metric)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "6a1f937b-a0a5-40ec-8e41-5a5a18c6bff6",
"metadata": {},
"outputs": [],
"source": [
"# memory: 4122MiB"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.9"
},
"vscode": {
"interpreter": {
"hash": "aee8b7b246df8f9039afb4144a1f6fd8d2ca17a180786b69acc140d282b71a49"
}
}
},
"nbformat": 4,
"nbformat_minor": 5
}
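As a quick sanity check after loading the quantized model above (not part of the notebook itself), the targeted projections should be wrapped in `lora.TorchaoLoraLinear` and their base weights should be torchao tensor subclasses; the attribute path below follows the module tree printed in cell 12:

```python
# Assumes `model` is the PEFT-wrapped model from the notebook above.
q_proj = model.base_model.model.model.layers[0].self_attn.q_proj
print(type(q_proj).__name__)           # TorchaoLoraLinear
print(type(q_proj.base_layer.weight))  # a torchao tensor subclass, not a plain torch.Tensor
```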

View File

@ -0,0 +1,526 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "412e41ee-72d3-4e71-bd3a-703b37429c57",
"metadata": {},
"source": [
"# PyTorch AO (torchao) with int8_weight_only"
]
},
{
"cell_type": "markdown",
"id": "10e1acc3-50b8-4d40-bdf3-0133c113cc4b",
"metadata": {},
"source": [
"## Imports"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "a9935ae2",
"metadata": {},
"outputs": [],
"source": [
"import argparse\n",
"import os\n",
"\n",
"import torch\n",
"from torch.optim import AdamW\n",
"from torch.utils.data import DataLoader\n",
"from peft import (\n",
" get_peft_config,\n",
" get_peft_model,\n",
" get_peft_model_state_dict,\n",
" set_peft_model_state_dict,\n",
" LoraConfig,\n",
" PeftType,\n",
" PrefixTuningConfig,\n",
" PromptEncoderConfig,\n",
")\n",
"\n",
"import evaluate\n",
"from datasets import load_dataset\n",
"from transformers import AutoModelForSequenceClassification, AutoTokenizer, TorchAoConfig, get_linear_schedule_with_warmup, set_seed\n",
"from tqdm import tqdm"
]
},
{
"cell_type": "markdown",
"id": "eafdd532-b1eb-4aac-8077-3386a84c7cdb",
"metadata": {},
"source": [
"## Parameters"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "e3b13308",
"metadata": {},
"outputs": [],
"source": [
"batch_size = 16\n",
"model_name_or_path = \"google/gemma-2-2b\"\n",
"task = \"mrpc\"\n",
"device = \"cuda\"\n",
"num_epochs = 5\n",
"lr = 2e-5\n",
"\n",
"lora_rank = 16\n",
"lora_alpha = 32\n",
"lora_dropout = 0.1"
]
},
{
"cell_type": "markdown",
"id": "c7fb69bf-0182-4111-b715-e2e659b42b1d",
"metadata": {},
"source": [
"## Data"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "d2f4d25e-30b9-431f-95c3-adb390dc6fcd",
"metadata": {},
"outputs": [],
"source": [
"if any(k in model_name_or_path for k in (\"gpt\", \"opt\", \"bloom\")):\n",
" padding_side = \"left\"\n",
"else:\n",
" padding_side = \"right\"\n",
"\n",
"tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, padding_side=padding_side)\n",
"if getattr(tokenizer, \"pad_token_id\") is None:\n",
" tokenizer.pad_token_id = tokenizer.eos_token_id\n",
"\n",
"datasets = load_dataset(\"glue\", task)\n",
"metric = evaluate.load(\"glue\", task)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "1ea852bc-a040-4244-8fd3-516307cecd14",
"metadata": {},
"outputs": [],
"source": [
"def tokenize_function(examples):\n",
" # max_length=None => use the model max length (it's actually the default)\n",
" outputs = tokenizer(examples[\"sentence1\"], examples[\"sentence2\"], truncation=True, max_length=None)\n",
" return outputs"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "cf5ef289-f42f-4582-bd5e-9852ad8beff2",
"metadata": {},
"outputs": [],
"source": [
"tokenized_datasets = datasets.map(\n",
" tokenize_function,\n",
" batched=True,\n",
" remove_columns=[\"idx\", \"sentence1\", \"sentence2\"],\n",
")\n",
"\n",
"# We also rename the 'label' column to 'labels' which is the expected name for labels by the models of the\n",
"# transformers library\n",
"tokenized_datasets = tokenized_datasets.rename_column(\"label\", \"labels\")"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "739b3655-9db0-48bc-8542-308c6d5e0b8b",
"metadata": {},
"outputs": [],
"source": [
"def collate_fn(examples):\n",
" return tokenizer.pad(examples, padding=\"longest\", return_tensors=\"pt\")"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "0288f311-8475-4a0e-99af-e4b909d10e01",
"metadata": {},
"outputs": [],
"source": [
"# Instantiate dataloaders.\n",
"train_dataloader = DataLoader(\n",
" tokenized_datasets[\"train\"],\n",
" shuffle=True,\n",
" collate_fn=collate_fn,\n",
" batch_size=batch_size,\n",
")\n",
"eval_dataloader = DataLoader(\n",
" tokenized_datasets[\"validation\"],\n",
" shuffle=False,\n",
" collate_fn=collate_fn,\n",
" batch_size=batch_size,\n",
")"
]
},
{
"cell_type": "markdown",
"id": "fcaf6f9e-c9d1-445a-9f08-18ef462f67ce",
"metadata": {},
"source": [
"## Model"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "e5dfff56-ea80-4561-aeaf-43216bbb9af7",
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "512d9dc10a4d4ecc88b9440575b0973a",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Loading checkpoint shards: 0%| | 0/3 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Some weights of Gemma2ForSequenceClassification were not initialized from the model checkpoint at google/gemma-2-2b and are newly initialized: ['score.weight']\n",
"You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
]
}
],
"source": [
"quant_config = TorchAoConfig(quant_type=\"int8_weight_only\")\n",
"model = AutoModelForSequenceClassification.from_pretrained(\n",
" model_name_or_path, return_dict=True, device_map=0, torch_dtype=torch.bfloat16, quantization_config=quant_config\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "0526f571",
"metadata": {},
"outputs": [],
"source": [
"peft_config = LoraConfig(\n",
" task_type=\"SEQ_CLS\",\n",
" r=lora_rank,\n",
" lora_alpha=lora_alpha,\n",
" lora_dropout=lora_dropout,\n",
" target_modules=[\"q_proj\", \"v_proj\"],\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "ceeae329-e931-4d52-8a28-9c87e5cdb4cf",
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"trainable params: 3,199,488 || all params: 2,617,545,984 || trainable%: 0.1222\n"
]
}
],
"source": [
"model = get_peft_model(model, peft_config)\n",
"model.print_trainable_parameters()"
]
},
{
"cell_type": "markdown",
"id": "1b3d2544-3028-4e2a-9c56-d4d7d9d674de",
"metadata": {},
"source": [
"## Training"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "0d2d0381",
"metadata": {},
"outputs": [],
"source": [
"optimizer = AdamW(params=model.parameters(), lr=lr)\n",
"\n",
"# Instantiate scheduler\n",
"lr_scheduler = get_linear_schedule_with_warmup(\n",
" optimizer=optimizer,\n",
" num_warmup_steps=0.06 * (len(train_dataloader) * num_epochs),\n",
" num_training_steps=(len(train_dataloader) * num_epochs),\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "f04c88ca-84eb-4184-afe6-3869b6f96b76",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"PeftModelForSequenceClassification(\n",
" (base_model): LoraModel(\n",
" (model): Gemma2ForSequenceClassification(\n",
" (model): Gemma2Model(\n",
" (embed_tokens): Embedding(256000, 2304, padding_idx=0)\n",
" (layers): ModuleList(\n",
" (0-25): 26 x Gemma2DecoderLayer(\n",
" (self_attn): Gemma2Attention(\n",
" (q_proj): lora.TorchaoLoraLinear(\n",
" (base_layer): Linear(in_features=2304, out_features=2048, weight=AffineQuantizedTensor(shape=torch.Size([2048, 2304]), block_size=(1, 2304), device=cuda:0, layout_type=PlainLayoutType(), layout_tensor_dtype=torch.int8, quant_min=None, quant_max=None))\n",
" (lora_dropout): ModuleDict(\n",
" (default): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=2304, out_features=16, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=16, out_features=2048, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" (lora_magnitude_vector): ModuleDict()\n",
" )\n",
" (k_proj): Linear(in_features=2304, out_features=1024, weight=AffineQuantizedTensor(shape=torch.Size([1024, 2304]), block_size=(1, 2304), device=cuda:0, layout_type=PlainLayoutType(), layout_tensor_dtype=torch.int8, quant_min=None, quant_max=None))\n",
" (v_proj): lora.TorchaoLoraLinear(\n",
" (base_layer): Linear(in_features=2304, out_features=1024, weight=AffineQuantizedTensor(shape=torch.Size([1024, 2304]), block_size=(1, 2304), device=cuda:0, layout_type=PlainLayoutType(), layout_tensor_dtype=torch.int8, quant_min=None, quant_max=None))\n",
" (lora_dropout): ModuleDict(\n",
" (default): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=2304, out_features=16, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=16, out_features=1024, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" (lora_magnitude_vector): ModuleDict()\n",
" )\n",
" (o_proj): Linear(in_features=2048, out_features=2304, weight=AffineQuantizedTensor(shape=torch.Size([2304, 2048]), block_size=(1, 2048), device=cuda:0, layout_type=PlainLayoutType(), layout_tensor_dtype=torch.int8, quant_min=None, quant_max=None))\n",
" (rotary_emb): Gemma2RotaryEmbedding()\n",
" )\n",
" (mlp): Gemma2MLP(\n",
" (gate_proj): Linear(in_features=2304, out_features=9216, weight=AffineQuantizedTensor(shape=torch.Size([9216, 2304]), block_size=(1, 2304), device=cuda:0, layout_type=PlainLayoutType(), layout_tensor_dtype=torch.int8, quant_min=None, quant_max=None))\n",
" (up_proj): Linear(in_features=2304, out_features=9216, weight=AffineQuantizedTensor(shape=torch.Size([9216, 2304]), block_size=(1, 2304), device=cuda:0, layout_type=PlainLayoutType(), layout_tensor_dtype=torch.int8, quant_min=None, quant_max=None))\n",
" (down_proj): Linear(in_features=9216, out_features=2304, weight=AffineQuantizedTensor(shape=torch.Size([2304, 9216]), block_size=(1, 9216), device=cuda:0, layout_type=PlainLayoutType(), layout_tensor_dtype=torch.int8, quant_min=None, quant_max=None))\n",
" (act_fn): PytorchGELUTanh()\n",
" )\n",
" (input_layernorm): Gemma2RMSNorm((2304,), eps=1e-06)\n",
" (post_attention_layernorm): Gemma2RMSNorm((2304,), eps=1e-06)\n",
" (pre_feedforward_layernorm): Gemma2RMSNorm((2304,), eps=1e-06)\n",
" (post_feedforward_layernorm): Gemma2RMSNorm((2304,), eps=1e-06)\n",
" )\n",
" )\n",
" (norm): Gemma2RMSNorm((2304,), eps=1e-06)\n",
" )\n",
" (score): ModulesToSaveWrapper(\n",
" (original_module): Linear(in_features=2304, out_features=2, bias=False)\n",
" (modules_to_save): ModuleDict(\n",
" (default): Linear(in_features=2304, out_features=2, bias=False)\n",
" )\n",
" )\n",
" )\n",
" )\n",
")"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"model.config.use_cache = False\n",
"model.to(device)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "fa0e73be",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
" 0%| | 0/230 [00:00<?, ?it/s]You're using a GemmaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.\n",
"100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 230/230 [00:31<00:00, 7.19it/s]\n",
"100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 26/26 [00:01<00:00, 16.19it/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"epoch 1 | train loss 1.0672 | {'accuracy': 0.6715686274509803, 'f1': 0.7751677852348994}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 230/230 [00:31<00:00, 7.26it/s]\n",
"100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 26/26 [00:01<00:00, 16.19it/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"epoch 2 | train loss 0.6261 | {'accuracy': 0.7377450980392157, 'f1': 0.8201680672268907}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 230/230 [00:31<00:00, 7.25it/s]\n",
"100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 26/26 [00:01<00:00, 16.15it/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"epoch 3 | train loss 0.4743 | {'accuracy': 0.7867647058823529, 'f1': 0.8502581755593803}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 230/230 [00:31<00:00, 7.30it/s]\n",
"100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 26/26 [00:01<00:00, 16.17it/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"epoch 4 | train loss 0.4006 | {'accuracy': 0.803921568627451, 'f1': 0.8586572438162544}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 230/230 [00:31<00:00, 7.26it/s]\n",
"100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 26/26 [00:01<00:00, 16.10it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"epoch 5 | train loss 0.3585 | {'accuracy': 0.8235294117647058, 'f1': 0.8791946308724832}\n",
"CPU times: user 2min 8s, sys: 38 s, total: 2min 46s\n",
"Wall time: 2min 46s\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n"
]
}
],
"source": [
"%%time\n",
"for epoch in range(1, num_epochs + 1):\n",
" model.train()\n",
" train_losses = []\n",
" for step, batch in enumerate(tqdm(train_dataloader)):\n",
" batch.to(device)\n",
" outputs = model(**batch)\n",
" loss = outputs.loss\n",
" if not torch.isfinite(loss):\n",
" raise ValueError(\"non-finite loss encountered\")\n",
"\n",
" loss.backward()\n",
" optimizer.step()\n",
" lr_scheduler.step()\n",
" optimizer.zero_grad()\n",
" train_losses.append(loss.item())\n",
"\n",
" model.eval()\n",
" for step, batch in enumerate(tqdm(eval_dataloader)):\n",
" batch.to(device)\n",
" with torch.no_grad():\n",
" outputs = model(**batch)\n",
" predictions = outputs.logits.argmax(dim=-1)\n",
" predictions, references = predictions, batch[\"labels\"]\n",
" metric.add_batch(\n",
" predictions=predictions,\n",
" references=references,\n",
" )\n",
"\n",
" eval_metric = metric.compute()\n",
" train_loss = sum(train_losses) / len(train_losses)\n",
" print(f\"epoch {epoch} | train loss {train_loss:.4f} |\", eval_metric)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "6a1f937b-a0a5-40ec-8e41-5a5a18c6bff6",
"metadata": {},
"outputs": [],
"source": [
"# memory: 18098MiB"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.9"
},
"vscode": {
"interpreter": {
"hash": "aee8b7b246df8f9039afb4144a1f6fd8d2ca17a180786b69acc140d282b71a49"
}
}
},
"nbformat": 4,
"nbformat_minor": 5
}
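Neither torchao notebook persists the trained adapter; a minimal sketch of saving it and reloading it onto a freshly quantized base model (the `gemma2-mrpc-lora` directory name is an assumption):

```python
import torch
from transformers import AutoModelForSequenceClassification, TorchAoConfig
from peft import PeftModel

# Assumes `model` is the trained PEFT model from the notebook above.
model.save_pretrained("gemma2-mrpc-lora")  # writes only the small LoRA adapter files

quant_config = TorchAoConfig(quant_type="int8_weight_only")
base = AutoModelForSequenceClassification.from_pretrained(
    "google/gemma-2-2b", device_map=0, torch_dtype=torch.bfloat16, quantization_config=quant_config
)
model = PeftModel.from_pretrained(base, "gemma2-mrpc-lora")
```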

View File

@ -29,4 +29,6 @@ When you have access to multiple GPUs, it would be better to use normal LoRA wit
## Multi-GPU SFT with LoRA and FSDP
When you have access to multiple GPUs, it would be better to use normal LoRA with DeepSpeed/FSDP. To use LoRA with DeepSpeed, refer to the docs at [PEFT with FSDP](https://huggingface.co/docs/peft/accelerate/fsdp).
## Tip
Generally try to upgrade to the latest package versions for best results, especially when it comes to `bitsandbytes`, `accelerate`, `transformers`, `trl`, and `peft`.

View File

@ -3,8 +3,8 @@ import sys
from dataclasses import dataclass, field
from typing import Optional
from transformers import HfArgumentParser, TrainingArguments, set_seed
from trl import SFTTrainer
from transformers import HfArgumentParser, set_seed
from trl import SFTConfig, SFTTrainer
from utils import create_and_prepare_model, create_datasets
@ -79,12 +79,6 @@ class DataTrainingArguments:
default="timdettmers/openassistant-guanaco",
metadata={"help": "The preference dataset to use."},
)
packing: Optional[bool] = field(
default=False,
metadata={"help": "Use packing dataset creating."},
)
dataset_text_field: str = field(default="text", metadata={"help": "Dataset field to use as input text."})
max_seq_length: Optional[int] = field(default=512)
append_concat_token: Optional[bool] = field(
default=False,
metadata={"help": "If True, appends `eos_token_id` at the end of each sample being packed."},
@ -112,6 +106,11 @@ def main(model_args, data_args, training_args):
if training_args.gradient_checkpointing:
training_args.gradient_checkpointing_kwargs = {"use_reentrant": model_args.use_reentrant}
training_args.dataset_kwargs = {
"append_concat_token": data_args.append_concat_token,
"add_special_tokens": data_args.add_special_tokens,
}
# datasets
train_dataset, eval_dataset = create_datasets(
tokenizer,
@ -128,13 +127,6 @@ def main(model_args, data_args, training_args):
train_dataset=train_dataset,
eval_dataset=eval_dataset,
peft_config=peft_config,
packing=data_args.packing,
dataset_kwargs={
"append_concat_token": data_args.append_concat_token,
"add_special_tokens": data_args.add_special_tokens,
},
dataset_text_field=data_args.dataset_text_field,
max_seq_length=data_args.max_seq_length,
)
trainer.accelerator.print(f"{trainer.model}")
if hasattr(trainer.model, "print_trainable_parameters"):
@ -153,7 +145,7 @@ def main(model_args, data_args, training_args):
if __name__ == "__main__":
parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
parser = HfArgumentParser((ModelArguments, DataTrainingArguments, SFTConfig))
if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
# If we pass only one argument to the script and it's the path to a json file,
# let's parse it to get our arguments.

View File

@ -1,7 +1,9 @@
import os
from enum import Enum
import packaging.version
import torch
import transformers
from datasets import DatasetDict, load_dataset, load_from_disk
from datasets.builder import DatasetGenerationError
from transformers import (
@ -169,8 +171,17 @@ def create_and_prepare_model(args, data_args, training_args):
trust_remote_code=True,
)
tokenizer.chat_template = chat_template
# make embedding resizing configurable?
model.resize_token_embeddings(len(tokenizer), pad_to_multiple_of=8)
# Transformers 4.46.0+ uses mean_resizing by default, which fails with QLoRA + FSDP because the
# embedding could be on the meta device; therefore, we set mean_resizing=False in that case (i.e. the
# status quo ante). See https://github.com/huggingface/accelerate/issues/1620.
uses_transformers_4_46 = packaging.version.parse(transformers.__version__) >= packaging.version.parse("4.46.0")
uses_fsdp = os.environ.get("ACCELERATE_USE_FSDP", "false").lower() == "true"
if (bnb_config is not None) and uses_fsdp and uses_transformers_4_46:
model.resize_token_embeddings(len(tokenizer), pad_to_multiple_of=8, mean_resizing=False)
else:
model.resize_token_embeddings(len(tokenizer), pad_to_multiple_of=8)
else:
tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token

View File

@ -0,0 +1,780 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Named Entity Recognition with Peft Model 🤗\n",
"\n",
"##### In this notebook, we will learn how to perform Named Entity Recognition(NER) on the CoNLL-2003 dataset using the Trainer class\n",
"\n",
"##### This notebook has been adapted from the main NLP course here - https://huggingface.co/learn/nlp-course/chapter7/2?fw=pt#fine-tuning-the-model"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#install the required libraries\n",
"!pip install -q datasets evaluate transformers seqeval"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"# Import required libraries\n",
"from datasets import load_dataset\n",
"from transformers import AutoTokenizer, AutoModelForTokenClassification, DataCollatorForTokenClassification, TrainingArguments, Trainer, pipeline\n",
"from peft import get_peft_model, LoraConfig, TaskType\n",
"import evaluate\n",
"import numpy as np\n",
"from huggingface_hub import notebook_login"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"DatasetDict({\n",
" train: Dataset({\n",
" features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],\n",
" num_rows: 14041\n",
" })\n",
" validation: Dataset({\n",
" features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],\n",
" num_rows: 3250\n",
" })\n",
" test: Dataset({\n",
" features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],\n",
" num_rows: 3453\n",
" })\n",
"})\n"
]
}
],
"source": [
"raw_datasets = load_dataset(\"conll2003\")\n",
"print(raw_datasets)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.']"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Look at the tokens of the first training example\n",
"raw_datasets[\"train\"][0][\"tokens\"]"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[3, 0, 7, 0, 0, 0, 7, 0, 0]"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Look at the NER tags of the first training example\n",
"raw_datasets[\"train\"][0][\"ner_tags\"]"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Get the label names for the NER tags\n",
"ner_feature = raw_datasets[\"train\"].features[\"ner_tags\"]\n",
"label_names = ner_feature.feature.names\n",
"label_names"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"EU rejects German call to boycott British lamb . \n",
"B-ORG O B-MISC O O O B-MISC O O \n"
]
}
],
"source": [
"words = raw_datasets[\"train\"][0][\"tokens\"]\n",
"labels = raw_datasets[\"train\"][0][\"ner_tags\"]\n",
"line1 = \"\"\n",
"line2 = \"\"\n",
"for word, label in zip(words, labels):\n",
" full_label = label_names[label]\n",
" max_length = max(len(word), len(full_label))\n",
" line1 += word + \" \" * (max_length - len(word) + 1)\n",
" line2 += full_label + \" \" * (max_length - len(full_label) + 1)\n",
"\n",
"print(line1)\n",
"print(line2)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"e:\\open_source\\peft-folder\\ner-examples\\.venv\\Lib\\site-packages\\transformers\\tokenization_utils_base.py:1617: FutureWarning: `clean_up_tokenization_spaces` was not set. It will be set to `True` by default. This behavior will be deprecated in transformers v4.45, and will be then set to `False` by default. For more details check this issue: https://github.com/huggingface/transformers/issues/31884\n",
" warnings.warn(\n"
]
}
],
"source": [
"# Load the tokenizer\n",
"model_checkpoint = \"bert-base-cased\"\n",
"tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['[CLS]',\n",
" 'EU',\n",
" 'rejects',\n",
" 'German',\n",
" 'call',\n",
" 'to',\n",
" 'boycott',\n",
" 'British',\n",
" 'la',\n",
" '##mb',\n",
" '.',\n",
" '[SEP]']"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Tokenize the first training example\n",
"inputs = tokenizer(raw_datasets[\"train\"][0][\"tokens\"], is_split_into_words=True)\n",
"inputs.tokens()"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"def align_labels_with_tokens(labels, word_ids):\n",
" new_labels = []\n",
" current_word = None\n",
" for word_id in word_ids:\n",
" if word_id != current_word:\n",
" # Start of a new word!\n",
" current_word = word_id\n",
" label = -100 if word_id is None else labels[word_id]\n",
" new_labels.append(label)\n",
" elif word_id is None:\n",
" # Special token\n",
" new_labels.append(-100)\n",
" else:\n",
" # Same word as previous token\n",
" label = labels[word_id]\n",
" # If the label is B-XXX we change it to I-XXX\n",
" if label % 2 == 1:\n",
" label += 1\n",
" new_labels.append(label)\n",
"\n",
" return new_labels"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[3, 0, 7, 0, 0, 0, 7, 0, 0]\n",
"[-100, 3, 0, 7, 0, 0, 0, 7, 0, 0, 0, -100]\n"
]
}
],
"source": [
"labels = raw_datasets[\"train\"][0][\"ner_tags\"]\n",
"word_ids = inputs.word_ids()\n",
"print(labels)\n",
"print(align_labels_with_tokens(labels, word_ids))"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"def tokenize_and_align_labels(examples):\n",
" tokenized_inputs = tokenizer(\n",
" examples[\"tokens\"], truncation=True, is_split_into_words=True\n",
" )\n",
" all_labels = examples[\"ner_tags\"]\n",
" new_labels = []\n",
" for i, labels in enumerate(all_labels):\n",
" word_ids = tokenized_inputs.word_ids(i)\n",
" new_labels.append(align_labels_with_tokens(labels, word_ids))\n",
"\n",
" tokenized_inputs[\"labels\"] = new_labels\n",
" return tokenized_inputs"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"tokenized_datasets = raw_datasets.map(\n",
" tokenize_and_align_labels,\n",
" batched=True,\n",
" remove_columns=raw_datasets[\"train\"].column_names,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[-100, 3, 0, 7, 0, 0, 0, 7, 0, 0, 0, -100]\n",
"[-100, 1, 2, -100]\n"
]
}
],
"source": [
"for i in range(2):\n",
" print(tokenized_datasets[\"train\"][i][\"labels\"])"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"metric = evaluate.load(\"seqeval\")"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"# Create label mappings\n",
"id2label = {i: label for i, label in enumerate(label_names)}\n",
"label2id = {v: k for k, v in id2label.items()}"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']\n",
"You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
]
}
],
"source": [
"# Load the pre-trained model\n",
"model = AutoModelForTokenClassification.from_pretrained(\n",
" model_checkpoint,\n",
" id2label=id2label,\n",
" label2id=label2id,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"9"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"model.config.num_labels"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"BertForTokenClassification(\n",
" (bert): BertModel(\n",
" (embeddings): BertEmbeddings(\n",
" (word_embeddings): Embedding(28996, 768, padding_idx=0)\n",
" (position_embeddings): Embedding(512, 768)\n",
" (token_type_embeddings): Embedding(2, 768)\n",
" (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (encoder): BertEncoder(\n",
" (layer): ModuleList(\n",
" (0-11): 12 x BertLayer(\n",
" (attention): BertAttention(\n",
" (self): BertSdpaSelfAttention(\n",
" (query): Linear(in_features=768, out_features=768, bias=True)\n",
" (key): Linear(in_features=768, out_features=768, bias=True)\n",
" (value): Linear(in_features=768, out_features=768, bias=True)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (output): BertSelfOutput(\n",
" (dense): Linear(in_features=768, out_features=768, bias=True)\n",
" (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" (intermediate): BertIntermediate(\n",
" (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
" (intermediate_act_fn): GELUActivation()\n",
" )\n",
" (output): BertOutput(\n",
" (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
" (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" )\n",
" )\n",
" )\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" (classifier): Linear(in_features=768, out_features=9, bias=True)\n",
")"
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"model"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"trainable params: 301,833 || all params: 108,028,434 || trainable%: 0.2794\n"
]
}
],
"source": [
"# Configure LoRA (Low-Rank Adaptation) for fine-tuning\n",
"peft_config = LoraConfig(target_modules = [\"query\", \"key\"], task_type = TaskType.TOKEN_CLS)\n",
"\n",
"model = get_peft_model(model, peft_config)\n",
"model.print_trainable_parameters()"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [],
"source": [
"def compute_metrics(eval_preds):\n",
" logits, labels = eval_preds\n",
" predictions = np.argmax(logits, axis=-1)\n",
"\n",
" # Remove ignored index (special tokens) and convert to labels\n",
" true_labels = [[label_names[l] for l in label if l != -100] for label in labels]\n",
" true_predictions = [\n",
" [label_names[p] for (p, l) in zip(prediction, label) if l != -100]\n",
" for prediction, label in zip(predictions, labels)\n",
" ]\n",
" all_metrics = metric.compute(predictions=true_predictions, references=true_labels)\n",
" return {\n",
" \"precision\": all_metrics[\"overall_precision\"],\n",
" \"recall\": all_metrics[\"overall_recall\"],\n",
" \"f1\": all_metrics[\"overall_f1\"],\n",
" \"accuracy\": all_metrics[\"overall_accuracy\"],\n",
" }"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "60bd54dd23de4822891a157430ff47b9",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"VBox(children=(HTML(value='<center> <img\\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"notebook_login()"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"e:\\open_source\\peft-folder\\ner-examples\\.venv\\Lib\\site-packages\\transformers\\training_args.py:1545: FutureWarning: `evaluation_strategy` is deprecated and will be removed in version 4.46 of 🤗 Transformers. Use `eval_strategy` instead\n",
" warnings.warn(\n"
]
}
],
"source": [
"args = TrainingArguments(\n",
" \"bert-finetuned-ner-lora\",\n",
" evaluation_strategy=\"epoch\",\n",
" per_device_train_batch_size=32, # decrease this for OOM error\n",
" per_device_eval_batch_size=64,\n",
" save_strategy=\"epoch\",\n",
" learning_rate=2e-3,\n",
" num_train_epochs=5,\n",
" weight_decay=0.01,\n",
" load_best_model_at_end=True,\n",
" do_eval=True,\n",
" do_predict=True,\n",
" metric_for_best_model=\"accuracy\",\n",
" label_names=[\"labels\"],\n",
" push_to_hub=True,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "52a6470e20474df6808fcea70c8533ff",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
" 0%| | 0/2195 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"e:\\open_source\\peft-folder\\ner-examples\\.venv\\Lib\\site-packages\\transformers\\models\\bert\\modeling_bert.py:440: UserWarning: 1Torch was not compiled with flash attention. (Triggered internally at C:\\actions-runner\\_work\\pytorch\\pytorch\\builder\\windows\\pytorch\\aten\\src\\ATen\\native\\transformers\\cuda\\sdp_utils.cpp:555.)\n",
" attn_output = torch.nn.functional.scaled_dot_product_attention(\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "7944af882c2543238c82a6e9fd65209e",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
" 0%| | 0/51 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'eval_loss': 0.09396398812532425, 'eval_precision': 0.8329652996845426, 'eval_recall': 0.8887579939414338, 'eval_f1': 0.859957661618629, 'eval_accuracy': 0.9724936716312474, 'eval_runtime': 5.7905, 'eval_samples_per_second': 561.264, 'eval_steps_per_second': 8.808, 'epoch': 1.0}\n",
"{'loss': 0.1864, 'grad_norm': 0.36243128776550293, 'learning_rate': 0.0015444191343963554, 'epoch': 1.14}\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "79c0a83640aa44a1b7c1fb61de11f618",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
" 0%| | 0/51 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'eval_loss': 0.07848585397005081, 'eval_precision': 0.8617173809144496, 'eval_recall': 0.9102995624368899, 'eval_f1': 0.8853424993862018, 'eval_accuracy': 0.9782333549184671, 'eval_runtime': 4.8142, 'eval_samples_per_second': 675.093, 'eval_steps_per_second': 10.594, 'epoch': 2.0}\n",
"{'loss': 0.0867, 'grad_norm': 0.23459061980247498, 'learning_rate': 0.0010888382687927107, 'epoch': 2.28}\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "7af499f0415a41e59ba46c078f837fa0",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
" 0%| | 0/51 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'eval_loss': 0.07186892628669739, 'eval_precision': 0.8772632590930941, 'eval_recall': 0.9214069336923595, 'eval_f1': 0.8987934006402364, 'eval_accuracy': 0.9796903514452228, 'eval_runtime': 4.8141, 'eval_samples_per_second': 675.103, 'eval_steps_per_second': 10.594, 'epoch': 3.0}\n",
"{'loss': 0.066, 'grad_norm': 0.2663992941379547, 'learning_rate': 0.0006332574031890661, 'epoch': 3.42}\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "40d883200bb3473b8dd911177685965f",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
" 0%| | 0/51 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'eval_loss': 0.06592053174972534, 'eval_precision': 0.8809218950064021, 'eval_recall': 0.9262874453046113, 'eval_f1': 0.9030352748154225, 'eval_accuracy': 0.9809854594690057, 'eval_runtime': 4.8969, 'eval_samples_per_second': 663.691, 'eval_steps_per_second': 10.415, 'epoch': 4.0}\n",
"{'loss': 0.0505, 'grad_norm': 0.1911543756723404, 'learning_rate': 0.0001776765375854214, 'epoch': 4.56}\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "0eaa7fa8947f46a3af1e21d12bf730f8",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
" 0%| | 0/51 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'eval_loss': 0.06525895744562149, 'eval_precision': 0.8831439090763566, 'eval_recall': 0.928475260854931, 'eval_f1': 0.9052424317007138, 'eval_accuracy': 0.981515276387826, 'eval_runtime': 4.8357, 'eval_samples_per_second': 672.083, 'eval_steps_per_second': 10.547, 'epoch': 5.0}\n",
"{'train_runtime': 213.3684, 'train_samples_per_second': 329.032, 'train_steps_per_second': 10.287, 'train_loss': 0.09303037256749182, 'epoch': 5.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"No files have been modified since last commit. Skipping to prevent empty commit.\n"
]
},
{
"data": {
"text/plain": [
"TrainOutput(global_step=2195, training_loss=0.09303037256749182, metrics={'train_runtime': 213.3684, 'train_samples_per_second': 329.032, 'train_steps_per_second': 10.287, 'total_flos': 1949278889622816.0, 'train_loss': 0.09303037256749182, 'epoch': 5.0})"
]
},
"execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"trainer = Trainer(\n",
" model=model,\n",
" args=args,\n",
" train_dataset=tokenized_datasets[\"train\"],\n",
" eval_dataset=tokenized_datasets[\"validation\"],\n",
" data_collator=data_collator,\n",
" tokenizer=tokenizer,\n",
" compute_metrics=compute_metrics\n",
")\n",
"trainer.train()"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.\n"
]
},
{
"data": {
"text/plain": [
"[{'entity_group': 'PER',\n",
" 'score': np.float32(0.9957055),\n",
" 'word': 'Jino',\n",
" 'start': 11,\n",
" 'end': 15}]"
]
},
"execution_count": 33,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Replace this with your own checkpoint\n",
"model_checkpoint = \"bert-finetuned-ner-lora\"\n",
"token_classifier = pipeline(\n",
" \"token-classification\", model = model_checkpoint, aggregation_strategy = \"simple\"\n",
")\n",
"token_classifier(\"My name is Jino.\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.1"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
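After training, the LoRA checkpoint that the pipeline cell above loads can also be merged back into the base model for adapter-free inference. A minimal sketch, assuming the local checkpoint name from the notebook and a CoNLL-style label set of 9 classes (both assumptions, not something the notebook states explicitly):

```python
# Sketch: fold the trained LoRA weights into the base model so the result is a plain
# transformers checkpoint. Checkpoint path and num_labels are assumptions.
from peft import AutoPeftModelForTokenClassification

peft_model = AutoPeftModelForTokenClassification.from_pretrained(
    "bert-finetuned-ner-lora",  # adapter checkpoint used in the pipeline cell above
    num_labels=9,               # assumed CoNLL-2003 NER label count
)
merged = peft_model.merge_and_unload()                # LoRA deltas are added to the base weights
merged.save_pretrained("bert-finetuned-ner-merged")   # usable without PEFT installed
```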

View File

@ -9,6 +9,8 @@ line-length = 119
extend-exclude = ["*.ipynb"]
[tool.ruff.lint]
preview = true
explicit-preview-rules = true
extend-select = [
"C", # Complexity
"E", # PEP8 errors
@ -17,10 +19,12 @@ extend-select = [
"UP", # Pyupgrade upgrades
"W", # PEP8 warnings
"PT009", # Pytest assertions
"RUF022", # Sorting of __all__
]
ignore = [
"C901", # Function too complex
"E501", # Line length (handled by ruff-format)
"F841", # unused variable
"UP007", # X | Y style Unions
]

67
scripts/ci_clean_cache.py Normal file
View File

@ -0,0 +1,67 @@
"""
Utility to clean cache files that exceed a specific time in days according to their
last access time recorded in the cache.
Exit code:
- 1 if no candidates are found
- 0 if candidates are found
Deletion can be enabled by passing the `-d` parameter; otherwise the script only lists the candidates.
"""
import sys
from datetime import datetime as dt
from huggingface_hub import scan_cache_dir
def find_old_revisions(scan_results, max_age_days=30):
"""Find commit hashes of objects in the cache. These objects need a last access time that
is above the passed `max_age_days` parameter. Returns an empty list if no objects are found.
Time measurement is based of the current time and the recorded last access tiem in the cache.
"""
now = dt.now()
revisions = [(i.revisions, i.last_accessed) for i in scan_results.repos]
revisions_ages = [(rev, (now - dt.fromtimestamp(ts_access)).days) for rev, ts_access in revisions]
delete_candidates = [rev for rev, age in revisions_ages if age > max_age_days]
hashes = [n.commit_hash for rev in delete_candidates for n in rev]
return hashes
def delete_old_revisions(scan_results, delete_candidates, do_delete=False):
delete_operation = scan_results.delete_revisions(*delete_candidates)
print(f"Would free {delete_operation.expected_freed_size_str}")
print(f"Candidates: {delete_candidates}")
if do_delete:
print("Deleting now.")
delete_operation.execute()
else:
print("Not deleting, pass the -d flag.")
if __name__ == "__main__":
from argparse import ArgumentParser
parser = ArgumentParser()
parser.add_argument("-a", "--max-age", type=int, default=30, help="Max. age in days items in the cache may have.")
parser.add_argument(
"-d",
"--delete",
action="store_true",
help=(
"Delete mode; Really delete items if there are candidates. Exit code = 0 when we found something to delete, 1 "
"otherwise."
),
)
args = parser.parse_args()
scan_results = scan_cache_dir()
delete_candidates = find_old_revisions(scan_results, args.max_age)
if not delete_candidates:
print("No delete candidates found, not deleting anything.")
sys.exit(1)
delete_old_revisions(scan_results, delete_candidates, do_delete=args.delete)
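A dry-run usage sketch for the script above; the plain `ci_clean_cache` import assumes the functions are imported from within the `scripts/` directory, which is an assumption of this example:

```python
# List cache revisions whose last access is older than 60 days without deleting them.
from huggingface_hub import scan_cache_dir

from ci_clean_cache import delete_old_revisions, find_old_revisions  # assumed import path

scan_results = scan_cache_dir()
candidates = find_old_revisions(scan_results, max_age_days=60)
if candidates:
    # do_delete=False only prints the candidates and the space that would be freed
    delete_old_revisions(scan_results, candidates, do_delete=False)
else:
    print("No cached revisions older than 60 days.")
```

On the command line the equivalent dry run is `python scripts/ci_clean_cache.py -a 60`; adding `-d` performs the deletion.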

View File

@ -37,7 +37,7 @@ def main(slack_channel_name=None):
if line.get("nodeid", "") != "":
test = line["nodeid"]
if line.get("duration", None) is not None:
duration = f'{line["duration"]:.4f}'
duration = f"{line['duration']:.4f}"
if line.get("outcome", "") == "failed":
section_num_failed += 1
failed.append([test, duration, log.name.split("_")[0]])
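The hunk above only flips the f-string quoting style; both spellings produce the same output, the outer-double-quote form is simply what the newer formatter (presumably the ruff bump in `setup.py` further down) settles on. A small illustration:

```python
# Both forms format the nested dict value identically; only the quoting differs.
line = {"duration": 0.123456}
old_style = f'{line["duration"]:.4f}'
new_style = f"{line['duration']:.4f}"
assert old_style == new_style == "0.1235"
```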

274
scripts/train_memory.py Normal file
View File

@ -0,0 +1,274 @@
# Copyright 2025-present the HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""This script trains a model on a small text dataset and measures the memory consumption, as well as a few other
useful metrics.
Example:
Get help:
```bash
python train_memory.py --help
```
Train the google/gemma-2-2b model with a LoRA config json at the indicated location.
```bash
python train_memory.py "google/gemma-2-2b" --max_seq_length 256 --batch_size 1 --rank 32 --dtype bfloat16 --path_config <path-to-adapter-config.json>
```
Fully fine-tune the model (i.e. without LoRA) by setting the rank to 0:
```bash
python train_memory.py "google/gemma-2-2b" --rank 0
```
Get an estimate of the size of the hidden states by passing `--monitor_tensors`. This trains for just a single step. For realistic estimates, pass a realistic batch size, for example:
```bash
python train_memory.py "google/gemma-2-2b" --max_seq_length 256 --batch_size 32 --rank 32 --dtype bfloat16 --path_config configs/lora_rank-32_embedding-lora/ --monitor_tensors
```
"""
import argparse
import gc
import os
import sys
import tempfile
import time
import warnings
from collections import Counter
from contextlib import nullcontext
from functools import partial
import torch
from datasets import load_dataset
from torch import nn
from transformers import (
AutoModelForCausalLM,
AutoTokenizer,
BitsAndBytesConfig,
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from peft.utils import CONFIG_NAME, SAFETENSORS_WEIGHTS_NAME
# suppress all warnings
warnings.filterwarnings("ignore")
device = "cuda" if torch.cuda.is_available() else "cpu"
dtype_to_bytes_linear = {"float32": 4, "float16": 2, "bfloat16": 2, "int8": 1, "int4": 0.5}
def init_cuda():
torch.manual_seed(0)
if device == "cpu":
return
torch.cuda.reset_peak_memory_stats()
torch.cuda.manual_seed_all(0)
# might not be necessary, but just to be sure
nn.Linear(1, 1).to(device)
def get_data(tokenizer):
def tokenize(samples):
# For some reason, the max sequence length is not honored by the tokenizer, resulting in IndexErrors. Thus,
# manually ensure that sequences are not too long.
tokenized = tokenizer(samples["quote"])
tokenized["input_ids"] = [input_ids[: tokenizer.model_max_length] for input_ids in tokenized["input_ids"]]
tokenized["attention_mask"] = [
input_ids[: tokenizer.model_max_length] for input_ids in tokenized["attention_mask"]
]
return tokenized
data = load_dataset("ybelkada/english_quotes_copy")
data = data.map(tokenize, batched=True)
# We need to manually remove unused columns. This is because we cannot use remove_unused_columns=True in the
# Trainer, as this leads to errors with torch.compile. We also cannot just leave them in, as they contain
# strings. Therefore, manually remove all unused columns.
data = data.remove_columns(["quote", "author", "tags"])
return data
def train(model_id, rank, dtype, monitor_tensors, max_seq_length, batch_size, max_steps, path_config):
init_cuda()
cuda_memory_init = torch.cuda.max_memory_allocated()
cuda_memory_log = []
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.model_max_length = max_seq_length
if not tokenizer.pad_token:
tokenizer.pad_token = tokenizer.eos_token
data = get_data(tokenizer)
if dtype == "int4":
quant_config = BitsAndBytesConfig(load_in_4bit=True)
model = AutoModelForCausalLM.from_pretrained(model_id, device_map=device, quantization_config=quant_config)
model = prepare_model_for_kbit_training(model)
elif dtype == "int8":
quant_config = BitsAndBytesConfig(load_in_8bit=True)
model = AutoModelForCausalLM.from_pretrained(model_id, device_map=device, quantization_config=quant_config)
model = prepare_model_for_kbit_training(model)
elif dtype == "bfloat16":
model = AutoModelForCausalLM.from_pretrained(model_id, device_map=device, torch_dtype=torch.bfloat16)
elif dtype == "float16":
model = AutoModelForCausalLM.from_pretrained(model_id, device_map=device, torch_dtype=torch.float16)
elif dtype == "float32":
model = AutoModelForCausalLM.from_pretrained(model_id, device_map=device)
else:
raise ValueError(f"Invalid dtype: {dtype}")
if rank > 0:
if path_config is None:
raise RuntimeError("LoRA rank > 0 requires a path to a LoRA config")
if path_config.endswith(CONFIG_NAME):
path_config = path_config.removesuffix(CONFIG_NAME)
config = LoraConfig.from_pretrained(path_config)
model = get_peft_model(model, config)
model.print_trainable_parameters()
else:
print("Not using LoRA")
model.config.use_cache = False
storage = []
def pack(x):
storage.append(x)
return len(storage) - 1
def unpack(x):
return storage[x]
train_ctx = partial(torch.autograd.graph.saved_tensors_hooks, pack, unpack) if monitor_tensors else nullcontext
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)
losses = []
sample = 0
tic_total = time.perf_counter()
for i in range(0, max_steps):
storage.clear()
tic = time.perf_counter()
try:
batch = tokenizer.pad(data["train"][sample : sample + batch_size], return_tensors="pt").to(model.device)
sample += batch_size
# add targets
batch["labels"] = batch["input_ids"].clone()
optimizer.zero_grad()
with train_ctx():
outputs = model(**batch)
loss = outputs.loss
loss.backward()
optimizer.step()
losses.append(loss.item())
cuda_memory_log.append(torch.cuda.memory_allocated() - cuda_memory_init)
torch.cuda.empty_cache()
gc.collect()
toc = time.perf_counter()
print(f"step {i:3d} loss {loss.item():.6f} time {toc - tic:.2f}s", file=sys.stderr)
except KeyboardInterrupt:
print("canceled training")
break
if monitor_tensors:
break
toc_total = time.perf_counter()
cuda_memory_final = torch.cuda.max_memory_allocated()
cuda_memory_avg = int(sum(cuda_memory_log) / len(cuda_memory_log))
print(f"cuda memory avg: {cuda_memory_avg // 2**20}MB")
print(f"cuda memory max: {(cuda_memory_final - cuda_memory_init) // 2**20}MB")
print(f"total time: {toc_total - tic_total:.2f}s")
with tempfile.TemporaryDirectory() as tmp_dir:
model.save_pretrained(tmp_dir)
stat = os.stat(os.path.join(tmp_dir, SAFETENSORS_WEIGHTS_NAME))
file_size = stat.st_size
print(f"file size: {file_size / 2**20:.1f}MB")
if monitor_tensors:
dtype_counts = Counter(t.dtype for t in storage)
shape_counts = Counter(t.shape for t in storage)
param_shape_counts = Counter(p.shape for p in model.parameters())
param_shape_counts_copy = dict(param_shape_counts).copy()
# The shape counts include the params, so we need to subtract them; note that they can be transposed.
# This is only an approximation.
diff_shape_counts = {}
for shape, count in shape_counts.items():
if shape in param_shape_counts_copy:
diff_count = count - param_shape_counts[shape]
if diff_count > 0:
diff_shape_counts[shape] = diff_count
param_shape_counts_copy[shape] = max(0, param_shape_counts_copy[shape] - diff_count)
elif shape[::-1] in param_shape_counts:
diff_count = count - param_shape_counts[shape[::-1]]
if diff_count > 0:
diff_shape_counts[shape] = diff_count
param_shape_counts_copy[shape[::-1]] = max(0, param_shape_counts_copy[shape[::-1]] - diff_count)
else:
diff_shape_counts[shape] = count
total_size = sum(t.numel() * t.element_size() for t in storage)
total_size_mb = f"{total_size // 2**20}MB"
diff_size = 0
for shape, count in diff_shape_counts.items():
diff_size += count * torch.zeros(shape).numel() * dtype_to_bytes_linear[dtype]
param_size = total_size - diff_size
diff_size_mb = f"{diff_size // 2**20}MB"
param_size_mb = f"{param_size // 2**20}MB"
print(f"Dtype counts: {dtype_counts.most_common()}")
print(f"Total size of tensors: {total_size_mb: >12}")
print(f"Total size of activations: {diff_size_mb: >12}")
print(f"Total size of parameters: {param_size_mb: >12}")
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("model_id", type=str, help="Model name on Hugging Face Hub")
parser.add_argument("--rank", type=int, default=8, help="Rank of LoRA, 0 => no LoRA, default 8")
parser.add_argument(
"--dtype",
type=str,
default="float32",
help="Data type, one of float32, float16, bfloat16, int8, int4, default float32",
)
parser.add_argument(
"--monitor_tensors",
action="store_true",
help="Monitor tensor sizes during training for a single training step, off by default",
)
parser.add_argument("--max_seq_length", type=int, default=128, help="Maximum sequence length, default 128")
parser.add_argument("--batch_size", type=int, default=1, help="Batch size, default 1")
parser.add_argument("--max_steps", type=int, default=50, help="Maximum number of training steps, default 50")
parser.add_argument("--path_config", type=str, default=None, help="Path to LoRA config")
args = parser.parse_args()
train(
model_id=args.model_id,
rank=args.rank,
dtype=args.dtype,
monitor_tensors=args.monitor_tensors,
max_seq_length=args.max_seq_length,
batch_size=args.batch_size,
max_steps=args.max_steps,
path_config=args.path_config,
)
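`--path_config` expects a directory containing an `adapter_config.json` (or a path ending in that file). A minimal sketch of producing such a config with PEFT itself; rank, alpha, and target modules are illustrative placeholders:

```python
# Write a LoRA adapter_config.json that can be passed to train_memory.py via --path_config.
from peft import LoraConfig

config = LoraConfig(r=32, lora_alpha=64, target_modules=["q_proj", "v_proj"])
config.save_pretrained("configs/lora_rank-32")  # creates configs/lora_rank-32/adapter_config.json
```

Note that the script reads the actual LoRA hyperparameters from this file; its own `--rank` flag only decides whether LoRA is applied at all (0 disables it).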

View File

@ -15,13 +15,13 @@
from setuptools import find_packages, setup
VERSION = "0.13.0"
VERSION = "0.15.1"
extras = {}
extras["quality"] = [
"black", # doc-builder has an implicit dependency on Black, see huggingface/doc-builder#434
"hf-doc-builder",
"ruff~=0.6.1",
"ruff~=0.9.2",
]
extras["docs_specific"] = [
"black", # doc-builder has an implicit dependency on Black, see huggingface/doc-builder#434
@ -34,8 +34,10 @@ extras["test"] = extras["dev"] + [
"pytest-xdist",
"parameterized",
"datasets",
"diffusers<0.21.0",
"diffusers",
"scipy",
"protobuf",
"sentencepiece",
]
setup(
@ -54,7 +56,7 @@ setup(
packages=find_packages("src"),
package_data={"peft": ["py.typed", "tuners/boft/fbd/fbd_cuda.cpp", "tuners/boft/fbd/fbd_cuda_kernel.cu"]},
entry_points={},
python_requires=">=3.8.0",
python_requires=">=3.9.0",
install_requires=[
"numpy>=1.17",
"packaging>=20.0",
@ -65,7 +67,7 @@ setup(
"tqdm",
"accelerate>=0.21.0",
"safetensors",
"huggingface_hub>=0.17.0",
"huggingface_hub>=0.25.0",
],
extras_require=extras,
classifiers=[
@ -76,13 +78,16 @@ setup(
"License :: OSI Approved :: Apache Software License",
"Operating System :: OS Independent",
"Programming Language :: Python :: 3",
"Programming Language :: Python :: 3.8",
"Programming Language :: Python :: 3.9",
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11",
"Programming Language :: Python :: 3.12",
"Topic :: Scientific/Engineering :: Artificial Intelligence",
],
)
# Release checklist
# 1. Change the version in __init__.py and setup.py to the release version, e.g. from "0.6.0.dev0" to "0.6.0"
# 1. Change the version in __init__.py and setup.py to the release version, e.g. from "0.6.1.dev0" to "0.7.0"
# 2. Check if there are any deprecations that need to be addressed for this release by searching for "# TODO" in the code
# 3. Commit these changes with the message: "Release: VERSION", create a PR and merge it.
# 4. Add a tag in git to mark the release: "git tag -a VERSION -m 'Adds tag VERSION for pypi' "
@ -102,4 +107,4 @@ setup(
# twine upload dist/* -r pypi
# 9. Add release notes to the tag on https://github.com/huggingface/peft/releases once everything is looking hunky-dory.
# Check the notes here: https://docs.google.com/document/d/1k-sOIfykuKjWcOIALqjhFKz4amFEp-myeJUJEzNgjoU/edit?usp=sharing
# 10. Update the version in __init__.py, setup.py to the bumped minor version + ".dev0" (e.g. from "0.6.0" to "0.7.0.dev0")
# 10. Update the version in __init__.py, setup.py to the bumped patch version + ".dev0" (e.g. from "0.7.0" to "0.7.1.dev0")

View File

@ -1,8 +1,3 @@
# flake8: noqa
# There's no way to ignore "F401 '...' imported but unused" warnings in this
# module, but to preserve other warnings. So, don't check this module at all.
# coding=utf-8
# Copyright 2023-present the HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
@ -17,53 +12,73 @@
# See the License for the specific language governing permissions and
# limitations under the License.
__version__ = "0.13.0"
__version__ = "0.15.1"
from .auto import (
MODEL_TYPE_TO_PEFT_MODEL_MAPPING,
AutoPeftModel,
AutoPeftModelForCausalLM,
AutoPeftModelForSequenceClassification,
AutoPeftModelForSeq2SeqLM,
AutoPeftModelForTokenClassification,
AutoPeftModelForQuestionAnswering,
AutoPeftModelForFeatureExtraction,
AutoPeftModelForQuestionAnswering,
AutoPeftModelForSeq2SeqLM,
AutoPeftModelForSequenceClassification,
AutoPeftModelForTokenClassification,
)
from .config import PeftConfig, PromptLearningConfig
from .mapping import (
MODEL_TYPE_TO_PEFT_MODEL_MAPPING,
PEFT_TYPE_TO_CONFIG_MAPPING,
PEFT_TYPE_TO_MIXED_MODEL_MAPPING,
PEFT_TYPE_TO_TUNER_MAPPING,
get_peft_config,
get_peft_model,
inject_adapter_in_model,
)
from .mapping_func import get_peft_model
from .mixed_model import PeftMixedModel
from .peft_model import (
PeftModel,
PeftModelForCausalLM,
PeftModelForFeatureExtraction,
PeftModelForQuestionAnswering,
PeftModelForSeq2SeqLM,
PeftModelForSequenceClassification,
PeftModelForTokenClassification,
PeftModelForQuestionAnswering,
PeftModelForFeatureExtraction,
get_layer_status,
get_model_status,
)
from .tuners import (
AdaLoraConfig,
AdaLoraModel,
AdaptionPromptConfig,
AdaptionPromptModel,
LoraConfig,
LoraRuntimeConfig,
BOFTConfig,
BOFTModel,
BoneConfig,
BoneModel,
CPTConfig,
CPTEmbedding,
EvaConfig,
FourierFTConfig,
FourierFTModel,
HRAConfig,
HRAModel,
IA3Config,
IA3Model,
LNTuningConfig,
LNTuningModel,
LoftQConfig,
LoraModel,
LoHaConfig,
LoHaModel,
LoKrConfig,
LoKrModel,
IA3Config,
IA3Model,
AdaLoraConfig,
AdaLoraModel,
BOFTConfig,
BOFTModel,
LoraConfig,
LoraModel,
LoraRuntimeConfig,
MultitaskPromptTuningConfig,
MultitaskPromptTuningInit,
OFTConfig,
OFTModel,
PolyConfig,
PolyModel,
PrefixEncoder,
PrefixTuningConfig,
PromptEmbedding,
@ -72,37 +87,120 @@ from .tuners import (
PromptEncoderReparameterizationType,
PromptTuningConfig,
PromptTuningInit,
MultitaskPromptTuningConfig,
MultitaskPromptTuningInit,
OFTConfig,
OFTModel,
PolyConfig,
PolyModel,
LNTuningConfig,
LNTuningModel,
TrainableTokensConfig,
TrainableTokensModel,
VBLoRAConfig,
VBLoRAModel,
VeraConfig,
VeraModel,
FourierFTConfig,
FourierFTModel,
XLoraConfig,
XLoraModel,
HRAConfig,
HRAModel,
VBLoRAConfig,
get_eva_state_dict,
initialize_lora_eva_weights,
)
from .utils import (
TRANSFORMERS_MODELS_TO_PREFIX_TUNING_POSTPROCESS_MAPPING,
PeftType,
TaskType,
bloom_model_postprocess_past_key_value,
cast_mixed_precision_params,
get_peft_model_state_dict,
load_peft_weights,
prepare_model_for_kbit_training,
replace_lora_weights_loftq,
set_peft_model_state_dict,
shift_tokens_right,
load_peft_weights,
cast_mixed_precision_params,
)
from .config import PeftConfig, PromptLearningConfig
__all__ = [
"MODEL_TYPE_TO_PEFT_MODEL_MAPPING",
"PEFT_TYPE_TO_CONFIG_MAPPING",
"PEFT_TYPE_TO_MIXED_MODEL_MAPPING",
"PEFT_TYPE_TO_TUNER_MAPPING",
"TRANSFORMERS_MODELS_TO_PREFIX_TUNING_POSTPROCESS_MAPPING",
"AdaLoraConfig",
"AdaLoraModel",
"AdaptionPromptConfig",
"AdaptionPromptModel",
"AutoPeftModel",
"AutoPeftModelForCausalLM",
"AutoPeftModelForFeatureExtraction",
"AutoPeftModelForQuestionAnswering",
"AutoPeftModelForSeq2SeqLM",
"AutoPeftModelForSequenceClassification",
"AutoPeftModelForTokenClassification",
"BOFTConfig",
"BOFTModel",
"BoneConfig",
"BoneModel",
"CPTConfig",
"CPTEmbedding",
"EvaConfig",
"FourierFTConfig",
"FourierFTModel",
"HRAConfig",
"HRAModel",
"IA3Config",
"IA3Model",
"LNTuningConfig",
"LNTuningModel",
"LoHaConfig",
"LoHaModel",
"LoKrConfig",
"LoKrModel",
"LoftQConfig",
"LoraConfig",
"LoraModel",
"LoraRuntimeConfig",
"MultitaskPromptTuningConfig",
"MultitaskPromptTuningInit",
"OFTConfig",
"OFTModel",
"PeftConfig",
"PeftMixedModel",
"PeftModel",
"PeftModelForCausalLM",
"PeftModelForFeatureExtraction",
"PeftModelForQuestionAnswering",
"PeftModelForSeq2SeqLM",
"PeftModelForSequenceClassification",
"PeftModelForTokenClassification",
"PeftType",
"PolyConfig",
"PolyModel",
"PrefixEncoder",
"PrefixTuningConfig",
"PromptEmbedding",
"PromptEncoder",
"PromptEncoderConfig",
"PromptEncoderReparameterizationType",
"PromptLearningConfig",
"PromptTuningConfig",
"PromptTuningInit",
"TaskType",
"TrainableTokensConfig",
"TrainableTokensModel",
"VBLoRAConfig",
"VBLoRAConfig",
"VBLoRAModel",
"VeraConfig",
"VeraModel",
"XLoraConfig",
"XLoraModel",
"bloom_model_postprocess_past_key_value",
"cast_mixed_precision_params",
"get_eva_state_dict",
"get_layer_status",
"get_model_status",
"get_peft_config",
"get_peft_model",
"get_peft_model_state_dict",
"initialize_lora_eva_weights",
"inject_adapter_in_model",
"load_peft_weights",
"prepare_model_for_kbit_training",
"replace_lora_weights_loftq",
"set_peft_model_state_dict",
"shift_tokens_right",
]

View File

@ -29,7 +29,6 @@ from transformers import (
)
from .config import PeftConfig
from .mapping import MODEL_TYPE_TO_PEFT_MODEL_MAPPING
from .peft_model import (
PeftModel,
PeftModelForCausalLM,
@ -43,6 +42,16 @@ from .utils.constants import TOKENIZER_CONFIG_NAME
from .utils.other import check_file_exists_on_hf_hub
MODEL_TYPE_TO_PEFT_MODEL_MAPPING: dict[str, type[PeftModel]] = {
"SEQ_CLS": PeftModelForSequenceClassification,
"SEQ_2_SEQ_LM": PeftModelForSeq2SeqLM,
"CAUSAL_LM": PeftModelForCausalLM,
"TOKEN_CLS": PeftModelForTokenClassification,
"QUESTION_ANS": PeftModelForQuestionAnswering,
"FEATURE_EXTRACTION": PeftModelForFeatureExtraction,
}
class _BaseAutoPeftModel:
_target_class = None
_target_peft_class = None
@ -88,7 +97,7 @@ class _BaseAutoPeftModel:
expected_target_class = MODEL_TYPE_TO_PEFT_MODEL_MAPPING[task_type]
if cls._target_peft_class.__name__ != expected_target_class.__name__:
raise ValueError(
f"Expected target PEFT class: {expected_target_class.__name__}, but you have asked for: {cls._target_peft_class.__name__ }"
f"Expected target PEFT class: {expected_target_class.__name__}, but you have asked for: {cls._target_peft_class.__name__}"
" make sure that you are loading the correct model for your task type."
)
elif task_type is None and getattr(peft_config, "auto_mapping", None) is not None:
@ -121,11 +130,14 @@ class _BaseAutoPeftModel:
token=token,
)
if tokenizer_exists:
if tokenizer_exists and hasattr(base_model, "get_input_embeddings"):
tokenizer = AutoTokenizer.from_pretrained(
pretrained_model_name_or_path, trust_remote_code=kwargs.get("trust_remote_code", False)
)
base_model.resize_token_embeddings(len(tokenizer))
embedding_size = base_model.get_input_embeddings().weight.shape[0]
if len(tokenizer) > embedding_size:
# only resize if the tokenizer has a larger vocab size than there are embeddings
base_model.resize_token_embeddings(len(tokenizer))
return cls._target_peft_class.from_pretrained(
base_model,

View File

@ -24,6 +24,25 @@ from transformers.utils import PushToHubMixin
from .utils import CONFIG_NAME, PeftType, TaskType
# we expect at least these keys to be present in a PEFT adapter_config.json
MIN_EXPECTED_CONFIG_KEYS = {"peft_type"}
def _check_and_remove_unused_kwargs(cls, kwargs):
"""Make PEFT configs forward-compatible by removing unused kwargs that were added in later PEFT versions.
This assumes that removing the unused kwargs will not affect the default behavior.
Returns the filtered kwargs and the set of removed keys.
"""
# it's not pretty but eh
signature_parameters = inspect.signature(cls.__init__).parameters
unexpected_kwargs = set(kwargs.keys()) - set(signature_parameters.keys())
for key in unexpected_kwargs:
del kwargs[key]
return kwargs, unexpected_kwargs
@dataclass
class PeftConfigMixin(PushToHubMixin):
r"""
@ -36,11 +55,19 @@ class PeftConfigMixin(PushToHubMixin):
peft_type (Union[[`~peft.utils.config.PeftType`], `str`]): The type of Peft method to use.
"""
task_type: Optional[TaskType] = field(default=None, metadata={"help": "The type of task."})
peft_type: Optional[PeftType] = field(default=None, metadata={"help": "The type of PEFT model."})
auto_mapping: Optional[dict] = field(
default=None, metadata={"help": "An auto mapping dict to help retrieve the base model class if needed."}
)
def __post_init__(self):
# check for invalid task type
if (self.task_type is not None) and (self.task_type not in list(TaskType)):
raise ValueError(
f"Invalid task type: '{self.task_type}'. Must be one of the following task types: {', '.join(TaskType)}."
)
def to_dict(self) -> Dict:
r"""
Returns the configuration for your adapter model as a dictionary.
@ -116,7 +143,34 @@ class PeftConfigMixin(PushToHubMixin):
else:
config_cls = cls
return config_cls(**kwargs)
try:
config = config_cls(**kwargs)
except TypeError as exc:
# Here we potentially handle forward compatibility. Sometimes new keywords are added to configs, which makes
# new configs incompatible with older PEFT versions. We catch these and remove them to allow the program to
# continue, but warn the user about it.
# First check if the error is due to unexpected keyword arguments, we don't want to accidentally catch
# other TypeErrors.
if "got an unexpected keyword argument" not in str(exc):
raise exc
filtered_kwargs, unexpected_kwargs = _check_and_remove_unused_kwargs(config_cls, kwargs)
if not MIN_EXPECTED_CONFIG_KEYS.issubset(set(filtered_kwargs.keys())):
raise TypeError(
f"The {cls.__name__} config that is trying to be loaded is missing required keys: "
f"{MIN_EXPECTED_CONFIG_KEYS}."
)
warnings.warn(
f"Unexpected keyword arguments {sorted(unexpected_kwargs)} for class {config_cls.__name__}, these are "
"ignored. This probably means that you're loading a configuration file that was saved using a "
"higher version of the library and additional parameters have been introduced since. It is "
"highly recommended to upgrade the PEFT version before continuing (e.g. by running `pip install "
"-U peft`)."
)
config = config_cls.from_peft_type(**filtered_kwargs)
return config
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path: str, subfolder: Optional[str] = None, **kwargs):
@ -149,6 +203,7 @@ class PeftConfigMixin(PushToHubMixin):
loaded_attributes = cls.from_json_file(config_file)
kwargs = {**class_kwargs, **loaded_attributes}
kwargs = cls.check_kwargs(**kwargs)
return cls.from_peft_type(**kwargs)
@classmethod
@ -213,6 +268,15 @@ class PeftConfigMixin(PushToHubMixin):
loaded_attributes = cls.from_json_file(config_file)
return loaded_attributes["peft_type"]
@classmethod
def check_kwargs(cls, **kwargs):
"""Check kwargs before initializing the config instance.
Subclasses can override this method to add specific checks.
"""
return kwargs
@property
def is_prompt_learning(self) -> bool:
r"""

View File

@ -18,8 +18,10 @@ from copy import deepcopy
from functools import update_wrapper
from types import MethodType
from torch import nn
from .peft_model import PeftConfig, PeftModel
from .tuners.lora.layer import LoraLayer
from .tuners.lora import LoraLayer
def update_forward_signature(model: PeftModel) -> None:
@ -168,7 +170,8 @@ def rescale_adapter_scale(model, multiplier):
Args:
model: The model containing `LoraLayer` modules whose scaling is to be adjusted.
multiplier (float or int): The multiplier that rescales the `scaling` attribute. Must be of type float or int.
multiplier (float or int):
The multiplier that rescales the `scaling` attribute. Must be of type float or int.
Raises:
ValueError: If the model does not contain any `LoraLayer`
@ -208,3 +211,42 @@ def rescale_adapter_scale(model, multiplier):
# restore original scaling values after exiting the context
for module, scaling in original_scaling.items():
module.scaling = scaling
@contextmanager
def disable_input_dtype_casting(model: nn.Module, active: bool = True):
"""
Context manager that disables casting of the input dtype to the dtype of the weight.
Currently this only works for LoRA.
Parameters:
model (nn.Module):
The model containing PEFT modules whose input dtype casting is to be adjusted.
active (bool):
Whether the context manager is active (default) or inactive.
"""
# Additional info: Normally, the dtype of the weight and input need to match, which is why the dtype is cast.
# However, in certain circumstances, this is handled by forward hooks, e.g. when using layerwise casting in
# diffusers. In that case, PEFT casting the dtype interferes with the layerwise casting, which is why the option to
# disable it is given.
if not active:
yield
return
original_values = {}
for name, module in model.named_modules():
if not isinstance(module, LoraLayer):
continue
original_values[name] = module.cast_input_dtype_enabled
module.cast_input_dtype_enabled = False
try:
yield
finally:
for name, module in model.named_modules():
if not isinstance(module, LoraLayer):
continue
if name in original_values:
module.cast_input_dtype_enabled = original_values[name]
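A usage sketch for the new context manager; the tiny test model and target module are placeholders used only to keep the example self-contained:

```python
import torch
from transformers import AutoModelForCausalLM

from peft import LoraConfig, get_peft_model
from peft.helpers import disable_input_dtype_casting

# Tiny model purely for illustration.
base = AutoModelForCausalLM.from_pretrained("hf-internal-testing/tiny-random-gpt2")
peft_model = get_peft_model(base, LoraConfig(target_modules=["c_attn"]))

input_ids = torch.tensor([[1, 2, 3]])
with disable_input_dtype_casting(peft_model, active=True):
    # Inside the context, LoRA layers leave the input dtype alone, so external hooks
    # (e.g. diffusers layerwise casting) stay in control of the dtype.
    out = peft_model(input_ids=input_ids)
```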

View File

@ -13,9 +13,11 @@
# limitations under the License.
import importlib
import importlib.metadata as importlib_metadata
import platform
from functools import lru_cache
import packaging.version
import torch
@lru_cache
@ -47,6 +49,33 @@ def is_auto_gptq_available():
)
@lru_cache
def is_gptqmodel_available():
if importlib.util.find_spec("gptqmodel") is not None:
GPTQMODEL_MINIMUM_VERSION = packaging.version.parse("2.0.0")
OPTIMUM_MINIMUM_VERSION = packaging.version.parse("1.24.0")
version_gptqmodel = packaging.version.parse(importlib_metadata.version("gptqmodel"))
if GPTQMODEL_MINIMUM_VERSION <= version_gptqmodel:
if is_optimum_available():
version_optimum = packaging.version.parse(importlib_metadata.version("optimum"))
if OPTIMUM_MINIMUM_VERSION <= version_optimum:
return True
else:
raise ImportError(
f"gptqmodel requires optimum version `{OPTIMUM_MINIMUM_VERSION}` or higher. Found version `{version_optimum}`, "
f"but only versions above `{OPTIMUM_MINIMUM_VERSION}` are supported"
)
else:
raise ImportError(
f"gptqmodel requires optimum version `{OPTIMUM_MINIMUM_VERSION}` or higher to be installed."
)
else:
raise ImportError(
f"Found an incompatible version of gptqmodel. Found version `{version_gptqmodel}`, "
f"but only versions above `{GPTQMODEL_MINIMUM_VERSION}` are supported"
)
@lru_cache
def is_optimum_available() -> bool:
return importlib.util.find_spec("optimum") is not None
@ -87,3 +116,52 @@ def is_eetq_available():
@lru_cache
def is_hqq_available():
return importlib.util.find_spec("hqq") is not None
@lru_cache
def is_torchao_available():
if importlib.util.find_spec("torchao") is None:
return False
TORCHAO_MINIMUM_VERSION = packaging.version.parse("0.4.0")
try:
torchao_version = packaging.version.parse(importlib_metadata.version("torchao"))
except importlib_metadata.PackageNotFoundError:
# Same idea as in diffusers:
# https://github.com/huggingface/diffusers/blob/9f06a0d1a4a998ac6a463c5be728c892f95320a8/src/diffusers/utils/import_utils.py#L351-L357
# It's not clear under what circumstances `importlib_metadata.version("torchao")` can raise an error even
# though `importlib.util.find_spec("torchao") is not None` but it has been observed, so adding this for
# precaution.
return False
if torchao_version < TORCHAO_MINIMUM_VERSION:
raise ImportError(
f"Found an incompatible version of torchao. Found version {torchao_version}, "
f"but only versions above {TORCHAO_MINIMUM_VERSION} are supported"
)
return True
@lru_cache
def is_xpu_available(check_device=False):
"""
Checks if XPU acceleration is available and, optionally, if an XPU device is present in the environment
"""
system = platform.system()
if system == "Darwin":
return False
else:
if check_device:
try:
# Will raise a RuntimeError if no XPU is found
_ = torch.xpu.device_count()
return torch.xpu.is_available()
except RuntimeError:
return False
return hasattr(torch, "xpu") and torch.xpu.is_available()
@lru_cache
def is_diffusers_available():
return importlib.util.find_spec("diffusers") is not None
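These helpers are import guards; a sketch of how calling code might use them to pick a device or gate an optional dependency (the concrete call sites are not part of this diff):

```python
import torch

from peft.import_utils import is_torchao_available, is_xpu_available

# Pick an accelerator, falling back to CPU. check_device=True also verifies that an
# XPU device can actually be enumerated.
if torch.cuda.is_available():
    device = "cuda"
elif is_xpu_available(check_device=True):
    device = "xpu"
else:
    device = "cpu"

if is_torchao_available():
    # torchao-specific quantization code would only run on supported versions
    ...
```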

View File

@ -14,114 +14,23 @@
from __future__ import annotations
import warnings
from typing import TYPE_CHECKING, Any, Optional
from typing import TYPE_CHECKING, Any
import torch
from peft.tuners.xlora.model import XLoraModel
from .config import PeftConfig
from .mixed_model import PeftMixedModel
from .peft_model import (
PeftModel,
PeftModelForCausalLM,
PeftModelForFeatureExtraction,
PeftModelForQuestionAnswering,
PeftModelForSeq2SeqLM,
PeftModelForSequenceClassification,
PeftModelForTokenClassification,
)
from .tuners import (
AdaLoraConfig,
AdaLoraModel,
AdaptionPromptConfig,
BOFTConfig,
BOFTModel,
FourierFTConfig,
FourierFTModel,
HRAConfig,
HRAModel,
IA3Config,
IA3Model,
LNTuningConfig,
LNTuningModel,
LoHaConfig,
LoHaModel,
LoKrConfig,
LoKrModel,
LoraConfig,
LoraModel,
MultitaskPromptTuningConfig,
OFTConfig,
OFTModel,
PolyConfig,
PolyModel,
PrefixTuningConfig,
PromptEncoderConfig,
PromptTuningConfig,
VBLoRAConfig,
VBLoRAModel,
VeraConfig,
VeraModel,
XLoraConfig,
)
from .tuners.tuners_utils import BaseTuner
from .utils import _prepare_prompt_learning_config
from .utils import PeftType
if TYPE_CHECKING:
from transformers import PreTrainedModel
from .config import PeftConfig
from .tuners.tuners_utils import BaseTuner
MODEL_TYPE_TO_PEFT_MODEL_MAPPING: dict[str, type[PeftModel]] = {
"SEQ_CLS": PeftModelForSequenceClassification,
"SEQ_2_SEQ_LM": PeftModelForSeq2SeqLM,
"CAUSAL_LM": PeftModelForCausalLM,
"TOKEN_CLS": PeftModelForTokenClassification,
"QUESTION_ANS": PeftModelForQuestionAnswering,
"FEATURE_EXTRACTION": PeftModelForFeatureExtraction,
}
PEFT_TYPE_TO_CONFIG_MAPPING: dict[str, type[PeftConfig]] = {
"ADAPTION_PROMPT": AdaptionPromptConfig,
"PROMPT_TUNING": PromptTuningConfig,
"PREFIX_TUNING": PrefixTuningConfig,
"P_TUNING": PromptEncoderConfig,
"LORA": LoraConfig,
"LOHA": LoHaConfig,
"LORAPLUS": LoraConfig,
"LOKR": LoKrConfig,
"ADALORA": AdaLoraConfig,
"BOFT": BOFTConfig,
"IA3": IA3Config,
"MULTITASK_PROMPT_TUNING": MultitaskPromptTuningConfig,
"OFT": OFTConfig,
"POLY": PolyConfig,
"LN_TUNING": LNTuningConfig,
"VERA": VeraConfig,
"FOURIERFT": FourierFTConfig,
"XLORA": XLoraConfig,
"HRA": HRAConfig,
"VBLORA": VBLoRAConfig,
}
PEFT_TYPE_TO_TUNER_MAPPING: dict[str, type[BaseTuner]] = {
"LORA": LoraModel,
"LOHA": LoHaModel,
"LOKR": LoKrModel,
"ADALORA": AdaLoraModel,
"BOFT": BOFTModel,
"IA3": IA3Model,
"OFT": OFTModel,
"POLY": PolyModel,
"LN_TUNING": LNTuningModel,
"VERA": VeraModel,
"FOURIERFT": FourierFTModel,
"XLORA": XLoraModel,
"HRA": HRAModel,
"VBLORA": VBLoRAModel,
}
# these will be filled by the register_peft_method function
PEFT_TYPE_TO_CONFIG_MAPPING: dict[PeftType, type[PeftConfig]] = {}
PEFT_TYPE_TO_TUNER_MAPPING: dict[PeftType, type[BaseTuner]] = {}
PEFT_TYPE_TO_MIXED_MODEL_MAPPING: dict[PeftType, type[BaseTuner]] = {}
PEFT_TYPE_TO_PREFIX_MAPPING: dict[PeftType, str] = {}
def get_peft_config(config_dict: dict[str, Any]) -> PeftConfig:
@ -135,66 +44,6 @@ def get_peft_config(config_dict: dict[str, Any]) -> PeftConfig:
return PEFT_TYPE_TO_CONFIG_MAPPING[config_dict["peft_type"]](**config_dict)
def get_peft_model(
model: PreTrainedModel,
peft_config: PeftConfig,
adapter_name: str = "default",
mixed: bool = False,
autocast_adapter_dtype: bool = True,
revision: Optional[str] = None,
) -> PeftModel | PeftMixedModel:
"""
Returns a Peft model object from a model and a config.
Args:
model ([`transformers.PreTrainedModel`]):
Model to be wrapped.
peft_config ([`PeftConfig`]):
Configuration object containing the parameters of the Peft model.
adapter_name (`str`, `optional`, defaults to `"default"`):
The name of the adapter to be injected, if not provided, the default adapter name is used ("default").
mixed (`bool`, `optional`, defaults to `False`):
Whether to allow mixing different (compatible) adapter types.
autocast_adapter_dtype (`bool`, *optional*):
Whether to autocast the adapter dtype. Defaults to `True`. Right now, this will only cast adapter weights
using float16 or bfloat16 to float32, as this is typically required for stable training, and only affect
select PEFT tuners.
revision (`str`, `optional`, defaults to `main`):
The revision of the base model. If this isn't set, the saved peft model will load the `main` revision for
the base model
"""
model_config = BaseTuner.get_model_config(model)
old_name = peft_config.base_model_name_or_path
new_name = model.__dict__.get("name_or_path", None)
peft_config.base_model_name_or_path = new_name
if (old_name is not None) and (old_name != new_name):
warnings.warn(
f"The PEFT config's `base_model_name_or_path` was renamed from '{old_name}' to '{new_name}'. "
"Please ensure that the correct base model is loaded when loading this checkpoint."
)
if revision is not None:
if peft_config.revision is not None and peft_config.revision != revision:
warnings.warn(
f"peft config has already set base model revision to {peft_config.revision}, overwriting with revision {revision}"
)
peft_config.revision = revision
if mixed:
# note: PeftMixedModel does not support autocast_adapter_dtype, so don't pass it
return PeftMixedModel(model, peft_config, adapter_name=adapter_name)
if peft_config.task_type not in MODEL_TYPE_TO_PEFT_MODEL_MAPPING.keys() and not peft_config.is_prompt_learning:
return PeftModel(model, peft_config, adapter_name=adapter_name, autocast_adapter_dtype=autocast_adapter_dtype)
if peft_config.is_prompt_learning:
peft_config = _prepare_prompt_learning_config(peft_config, model_config)
return MODEL_TYPE_TO_PEFT_MODEL_MAPPING[peft_config.task_type](
model, peft_config, adapter_name=adapter_name, autocast_adapter_dtype=autocast_adapter_dtype
)
def inject_adapter_in_model(
peft_config: PeftConfig, model: torch.nn.Module, adapter_name: str = "default", low_cpu_mem_usage: bool = False
) -> torch.nn.Module:

129
src/peft/mapping_func.py Normal file
View File

@ -0,0 +1,129 @@
# Copyright 2024-present the HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import annotations
import warnings
from typing import Optional
from transformers import PreTrainedModel
from .auto import MODEL_TYPE_TO_PEFT_MODEL_MAPPING
from .config import PeftConfig
from .mapping import PEFT_TYPE_TO_CONFIG_MAPPING, PEFT_TYPE_TO_PREFIX_MAPPING
from .mixed_model import PeftMixedModel
from .peft_model import PeftModel
from .tuners.tuners_utils import BaseTuner, BaseTunerLayer
from .utils import _prepare_prompt_learning_config
def get_peft_model(
model: PreTrainedModel,
peft_config: PeftConfig,
adapter_name: str = "default",
mixed: bool = False,
autocast_adapter_dtype: bool = True,
revision: Optional[str] = None,
low_cpu_mem_usage: bool = False,
) -> PeftModel | PeftMixedModel:
"""
Returns a Peft model object from a model and a config, where the model will be modified in-place.
Args:
model ([`transformers.PreTrainedModel`]):
Model to be wrapped.
peft_config ([`PeftConfig`]):
Configuration object containing the parameters of the Peft model.
adapter_name (`str`, `optional`, defaults to `"default"`):
The name of the adapter to be injected, if not provided, the default adapter name is used ("default").
mixed (`bool`, `optional`, defaults to `False`):
Whether to allow mixing different (compatible) adapter types.
autocast_adapter_dtype (`bool`, *optional*):
Whether to autocast the adapter dtype. Defaults to `True`. Right now, this will only cast adapter weights
using float16 or bfloat16 to float32, as this is typically required for stable training, and only affect
select PEFT tuners.
revision (`str`, `optional`, defaults to `main`):
The revision of the base model. If this isn't set, the saved peft model will load the `main` revision for
the base model
low_cpu_mem_usage (`bool`, `optional`, defaults to `False`):
Create empty adapter weights on meta device. Useful to speed up the loading process. Leave this setting as
False if you intend on training the model, unless the adapter weights will be replaced by different weights
before training starts.
"""
model_config = BaseTuner.get_model_config(model)
old_name = peft_config.base_model_name_or_path
new_name = model.__dict__.get("name_or_path", None)
peft_config.base_model_name_or_path = new_name
# Especially in notebook environments there could be a case that a user wants to experiment with different
# configuration values. However, it is likely that there won't be any changes for new configs on an already
# initialized PEFT model. The best we can do is warn the user about it.
if any(isinstance(module, BaseTunerLayer) for module in model.modules()):
warnings.warn(
"You are trying to modify a model with PEFT for a second time. If you want to reload the model with a "
"different config, make sure to call `.unload()` before."
)
if (old_name is not None) and (old_name != new_name):
warnings.warn(
f"The PEFT config's `base_model_name_or_path` was renamed from '{old_name}' to '{new_name}'. "
"Please ensure that the correct base model is loaded when loading this checkpoint."
)
if revision is not None:
if peft_config.revision is not None and peft_config.revision != revision:
warnings.warn(
f"peft config has already set base model revision to {peft_config.revision}, overwriting with revision {revision}"
)
peft_config.revision = revision
if (
(isinstance(peft_config, PEFT_TYPE_TO_CONFIG_MAPPING["LORA"]))
and (peft_config.init_lora_weights == "eva")
and not low_cpu_mem_usage
):
warnings.warn(
"lora with eva initialization used with low_cpu_mem_usage=False. "
"Setting low_cpu_mem_usage=True can improve the maximum batch size possible for eva initialization."
)
prefix = PEFT_TYPE_TO_PREFIX_MAPPING.get(peft_config.peft_type)
if prefix and adapter_name in prefix:
warnings.warn(
f"Adapter name {adapter_name} should not be contained in the prefix {prefix}."
"This may lead to reinitialization of the adapter weights during loading."
)
if mixed:
# note: PeftMixedModel does not support autocast_adapter_dtype, so don't pass it
return PeftMixedModel(model, peft_config, adapter_name=adapter_name)
if peft_config.task_type not in MODEL_TYPE_TO_PEFT_MODEL_MAPPING.keys() and not peft_config.is_prompt_learning:
return PeftModel(
model,
peft_config,
adapter_name=adapter_name,
autocast_adapter_dtype=autocast_adapter_dtype,
low_cpu_mem_usage=low_cpu_mem_usage,
)
if peft_config.is_prompt_learning:
peft_config = _prepare_prompt_learning_config(peft_config, model_config)
return MODEL_TYPE_TO_PEFT_MODEL_MAPPING[peft_config.task_type](
model,
peft_config,
adapter_name=adapter_name,
autocast_adapter_dtype=autocast_adapter_dtype,
low_cpu_mem_usage=low_cpu_mem_usage,
)
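An end-to-end sketch of the function above; the model name and LoRA hyperparameters are placeholders:

```python
from transformers import AutoModelForCausalLM

from peft import LoraConfig, get_peft_model

base = AutoModelForCausalLM.from_pretrained("hf-internal-testing/tiny-random-gpt2")
config = LoraConfig(task_type="CAUSAL_LM", r=8, lora_alpha=16, target_modules=["c_attn"])

# task_type routes through MODEL_TYPE_TO_PEFT_MODEL_MAPPING to PeftModelForCausalLM.
peft_model = get_peft_model(base, config, adapter_name="default")
peft_model.print_trainable_parameters()
```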

View File

@ -27,27 +27,8 @@ from peft.utils.constants import DUMMY_MODEL_CONFIG
from .config import PeftConfig
from .peft_model import PeftModel
from .tuners import (
AdaLoraModel,
IA3Model,
LoHaModel,
LoKrModel,
LoraModel,
MixedModel,
OFTModel,
)
from .tuners.mixed import COMPATIBLE_TUNER_TYPES
from .utils import PeftType, _set_adapter, _set_trainable
PEFT_TYPE_TO_MODEL_MAPPING = {
PeftType.LORA: LoraModel,
PeftType.LOHA: LoHaModel,
PeftType.LOKR: LoKrModel,
PeftType.ADALORA: AdaLoraModel,
PeftType.IA3: IA3Model,
PeftType.OFT: OFTModel,
}
from .tuners import MixedModel
from .utils import _set_adapter, _set_trainable
def _prepare_model_for_gradient_checkpointing(model: nn.Module) -> None:
@ -74,6 +55,8 @@ def _prepare_model_for_gradient_checkpointing(model: nn.Module) -> None:
def _check_config_compatible(peft_config: PeftConfig) -> None:
from .tuners.mixed import COMPATIBLE_TUNER_TYPES
if peft_config.peft_type not in COMPATIBLE_TUNER_TYPES:
raise ValueError(
f"The provided `peft_type` '{peft_config.peft_type.value}' is not compatible with the `PeftMixedModel`. "
@ -268,7 +251,7 @@ class PeftMixedModel(PushToHubMixin, torch.nn.Module):
self.modules_to_save = set(modules_to_save)
else:
self.modules_to_save.update(modules_to_save)
_set_trainable(self, adapter_name)
_set_trainable(self, adapter_name, module_names=peft_config.modules_to_save)
def set_adapter(self, adapter_name: Union[str, list[str]]) -> None:
"""
@ -350,6 +333,9 @@ class PeftMixedModel(PushToHubMixin, torch.nn.Module):
def _split_kwargs(cls, kwargs: dict[str, Any]):
return PeftModel._split_kwargs(kwargs)
def _check_new_adapter_config(self, peft_config: PeftConfig, is_trainable: bool) -> None:
return PeftModel._check_new_adapter_config(self, peft_config, is_trainable=is_trainable)
def load_adapter(self, model_id: str, adapter_name: str, *args: Any, **kwargs: Any):
"""
Load a trained adapter into the model.
@ -439,7 +425,7 @@ class PeftMixedModel(PushToHubMixin, torch.nn.Module):
Additional keyword arguments passed along to the specific PEFT configuration class.
"""
# note: adapted from PeftModel.from_pretrained
from .mapping import PEFT_TYPE_TO_CONFIG_MAPPING
from .mapping import PEFT_TYPE_TO_CONFIG_MAPPING, PEFT_TYPE_TO_MIXED_MODEL_MAPPING
# load the config
if config is None:
@ -458,7 +444,7 @@ class PeftMixedModel(PushToHubMixin, torch.nn.Module):
raise ValueError(f"The input config must be a PeftConfig, got {config.__class__}")
# note: this is different from PeftModel.from_pretrained
if config.peft_type not in PEFT_TYPE_TO_MODEL_MAPPING:
if config.peft_type not in PEFT_TYPE_TO_MIXED_MODEL_MAPPING:
raise ValueError(f"Adapter of type {config.peft_type} is not supported for mixed models.")
if (getattr(model, "hf_device_map", None) is not None) and len(

View File

@ -27,44 +27,25 @@ from typing import Any, Literal, Optional, Union
import packaging.version
import torch
import transformers
from accelerate import dispatch_model, infer_auto_device_map, init_empty_weights
from accelerate import dispatch_model, infer_auto_device_map
from accelerate.hooks import AlignDevicesHook, add_hook_to_module, remove_hook_from_submodules
from accelerate.utils import get_balanced_memory, named_module_tensors
from huggingface_hub import HfFileSystem, ModelCard, ModelCardData, hf_hub_download
from safetensors import safe_open
from safetensors.torch import save_file as safe_save_file
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
from transformers import PreTrainedModel
from transformers import Cache, DynamicCache, EncoderDecoderCache, PreTrainedModel
from transformers.modeling_outputs import QuestionAnsweringModelOutput, SequenceClassifierOutput, TokenClassifierOutput
from transformers.utils import PushToHubMixin
from peft.tuners.tuners_utils import BaseTuner, BaseTunerLayer
from peft.utils.constants import DUMMY_MODEL_CONFIG
from peft.utils.integrations import init_empty_weights
from peft.utils.other import TrainableTokensWrapper
from . import __version__
from .config import PeftConfig
from .tuners import (
AdaLoraModel,
AdaptionPromptModel,
BOFTModel,
FourierFTModel,
HRAModel,
IA3Model,
LNTuningModel,
LoHaModel,
LoKrModel,
LoraModel,
MultitaskPromptEmbedding,
OFTModel,
PolyModel,
PrefixEncoder,
PromptEmbedding,
PromptEncoder,
VBLoRAModel,
VeraModel,
XLoraConfig,
XLoraModel,
)
from .tuners.tuners_utils import BaseTuner, BaseTunerLayer
from .mapping import PEFT_TYPE_TO_CONFIG_MAPPING, PEFT_TYPE_TO_PREFIX_MAPPING, PEFT_TYPE_TO_TUNER_MAPPING
from .utils import (
SAFETENSORS_WEIGHTS_NAME,
TRANSFORMERS_MODELS_TO_PREFIX_TUNING_POSTPROCESS_MAPPING,
@ -72,6 +53,7 @@ from .utils import (
PeftType,
TaskType,
_get_batch_size,
_get_input_embeddings_name,
_prepare_prompt_learning_config,
_set_adapter,
_set_trainable,
@ -79,33 +61,12 @@ from .utils import (
id_tensor_storage,
infer_device,
load_peft_weights,
map_cache_to_layer_device_map,
set_peft_model_state_dict,
shift_tokens_right,
)
PEFT_TYPE_TO_MODEL_MAPPING = {
PeftType.LORA: LoraModel,
PeftType.LOHA: LoHaModel,
PeftType.LOKR: LoKrModel,
PeftType.PROMPT_TUNING: PromptEmbedding,
PeftType.P_TUNING: PromptEncoder,
PeftType.PREFIX_TUNING: PrefixEncoder,
PeftType.ADALORA: AdaLoraModel,
PeftType.BOFT: BOFTModel,
PeftType.ADAPTION_PROMPT: AdaptionPromptModel,
PeftType.IA3: IA3Model,
PeftType.OFT: OFTModel,
PeftType.POLY: PolyModel,
PeftType.LN_TUNING: LNTuningModel,
PeftType.VERA: VeraModel,
PeftType.FOURIERFT: FourierFTModel,
PeftType.XLORA: XLoraModel,
PeftType.HRA: HRAModel,
PeftType.VBLORA: VBLoRAModel,
}
class PeftModel(PushToHubMixin, torch.nn.Module):
"""
Base model encompassing various Peft methods.
@ -165,11 +126,12 @@ class PeftModel(PushToHubMixin, torch.nn.Module):
self.add_adapter(adapter_name, peft_config, low_cpu_mem_usage=low_cpu_mem_usage)
else:
self._peft_config = None
cls = PEFT_TYPE_TO_MODEL_MAPPING[peft_config.peft_type]
cls = PEFT_TYPE_TO_TUNER_MAPPING[peft_config.peft_type]
ctx = init_empty_weights if low_cpu_mem_usage else nullcontext
with ctx():
self.base_model = cls(model, {adapter_name: peft_config}, adapter_name)
self.set_additional_trainable_modules(peft_config, adapter_name)
self.set_additional_trainable_modules(peft_config, adapter_name)
if hasattr(self.base_model, "_cast_adapter_dtype"):
self.base_model._cast_adapter_dtype(
@ -224,7 +186,6 @@ class PeftModel(PushToHubMixin, torch.nn.Module):
selected_adapters: Optional[list[str]] = None,
save_embedding_layers: Union[str, bool] = "auto",
is_main_process: bool = True,
convert_pissa_to_lora: Optional[str] = None,
path_initial_model_for_weight_conversion: Optional[str] = None,
**kwargs: Any,
) -> None:
@ -248,16 +209,14 @@ class PeftModel(PushToHubMixin, torch.nn.Module):
is_main_process (`bool`, *optional*):
Whether the process calling this is the main process or not. Will default to `True`. Will not save the
checkpoint if not on the main process, which is important for multi device setups (e.g. DDP).
convert_pissa_to_lora (`str, *optional*`):
Deprecated. Use `path_initial_model_for_weight_conversion` instead.
path_initial_model_for_weight_conversion (`str, *optional*`):
The path to the initialized adapter, which is obtained after initializing the model with PiSSA or OLoRA
and before performing any training. When `path_initial_model_for_weight_conversion` is not None, the
difference in adapter before and after fine-tuning is calculated. This difference can be represented as
the parameters of a standard LoRA adapter. Using this converted adapter does not require changes to the
base model, thus conveniently allowing the use of multiple PiSSA or OLoRA adapters with LoRA adapters,
and the activation or deactivation of any adapters. Note that this conversion is not supported if
`rslora` is used in combination with `rank_pattern` or `alpha_pattern`.
The path to the initialized adapter, which is obtained after initializing the model with
PiSSA/CorDA/OLoRA and before performing any training. When `path_initial_model_for_weight_conversion`
is not None, the difference in adapter before and after fine-tuning is calculated. This difference can
be represented as the parameters of a standard LoRA adapter. Using this converted adapter does not
require changes to the base model, thus conveniently allowing the use of multiple PiSSA/CorDA/OLoRA
adapters with LoRA adapters, and the activation or deactivation of any adapters. Note that this
conversion is not supported if `rslora` is used in combination with `rank_pattern` or `alpha_pattern`.
kwargs (additional keyword arguments, *optional*):
Additional keyword arguments passed along to the `push_to_hub` method.
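A sketch of the PiSSA-to-LoRA conversion flow that `path_initial_model_for_weight_conversion` enables; the model name, paths, and the elided training step are placeholders:

```python
from transformers import AutoModelForCausalLM

from peft import LoraConfig, get_peft_model

base = AutoModelForCausalLM.from_pretrained("hf-internal-testing/tiny-random-gpt2")
config = LoraConfig(init_lora_weights="pissa", r=8, target_modules=["c_attn"])
model = get_peft_model(base, config)

# 1. Save the freshly initialized adapter before any training. Its stored config must say
#    init_lora_weights=True, otherwise the conversion below refuses to load it (see the
#    ValueError guarding this further down in the diff).
model.peft_config["default"].init_lora_weights = True
model.save_pretrained("pissa-init")
model.peft_config["default"].init_lora_weights = "pissa"

# ... fine-tuning happens here ...

# 2. Save the difference between the trained and the initial adapter as a plain LoRA adapter.
model.save_pretrained("pissa-finetuned", path_initial_model_for_weight_conversion="pissa-init")
```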
@ -276,13 +235,6 @@ class PeftModel(PushToHubMixin, torch.nn.Module):
f"You passed an invalid `selected_adapters` arguments, current supported adapter names are"
f" {list(self.peft_config.keys())} - got {selected_adapters}."
)
# TODO: remove deprecated parameter in PEFT v0.14.0
if convert_pissa_to_lora is not None:
warnings.warn(
"`convert_pissa_to_lora` is deprecated and will be removed in a future version. "
"Use `path_initial_model_for_weight_conversion` instead."
)
path_initial_model_for_weight_conversion = convert_pissa_to_lora
def save_mutated_as_lora(peft_config, path_initial_model_for_weight_conversion, output_state_dict, kwargs):
if peft_config.use_rslora and (peft_config.rank_pattern or peft_config.alpha_pattern):
@ -293,10 +245,11 @@ class PeftModel(PushToHubMixin, torch.nn.Module):
raise ValueError(msg)
if not any(
str(peft_config.init_lora_weights).lower().startswith(prefix) for prefix in ["pissa", "olora", "true"]
str(peft_config.init_lora_weights).lower().startswith(prefix)
for prefix in ["pissa", "corda", "olora", "true"]
):
warnings.warn(
"`path_initial_model_for_weight_conversion` only works for converting a PiSSA or OLoRA adapter to "
"`path_initial_model_for_weight_conversion` only works for converting a PiSSA/CorDA/OLoRA adapter to "
"a LoRA adapter"
)
initial_adapter_name = os.path.basename(path_initial_model_for_weight_conversion)
@ -307,8 +260,9 @@ class PeftModel(PushToHubMixin, torch.nn.Module):
adapter_name=initial_adapter_name,
)
is_pissa = str(self.peft_config[initial_adapter_name].init_lora_weights).lower().startswith("pissa")
is_corda = str(self.peft_config[initial_adapter_name].init_lora_weights).lower() == "corda"
is_olora = str(self.peft_config[initial_adapter_name].init_lora_weights).lower() == "olora"
if is_pissa or is_olora:
if is_pissa or is_corda or is_olora:
raise ValueError(
"The `init_lora_weights` parameter of the initial adapter should be set to `True`. "
"Otherwise, `self.load_adapter` will subtract the decomposed values again based on the "
@ -476,7 +430,8 @@ class PeftModel(PushToHubMixin, torch.nn.Module):
kwargs: (`optional`):
Additional keyword arguments passed along to the specific PEFT configuration class.
"""
from .mapping import MODEL_TYPE_TO_PEFT_MODEL_MAPPING, PEFT_TYPE_TO_CONFIG_MAPPING
from .auto import MODEL_TYPE_TO_PEFT_MODEL_MAPPING
from .tuners import XLoraConfig, XLoraModel
# load the config
if config is None:
@ -583,7 +538,7 @@ class PeftModel(PushToHubMixin, torch.nn.Module):
low_cpu_mem_usage=low_cpu_mem_usage,
)
model.load_adapter(
load_result = model.load_adapter(
model_id,
adapter_name,
is_trainable=is_trainable,
@ -592,6 +547,27 @@ class PeftModel(PushToHubMixin, torch.nn.Module):
**kwargs,
)
# 1. Remove VB-LoRA vector bank, since it's a shared parameter set via the VBLoRAModel
# 2. Remove the prompt encoder, as it does not need to be part of the checkpoint
missing_keys = [
k for k in load_result.missing_keys if "vblora_vector_bank" not in k and "prompt_encoder" not in k
]
if missing_keys:
# Let's warn here since (in contrast to load_adapter) we don't return the load result, so it could be quite
# difficult for users to even notice that something might have gone wrong here. As we filter out non PEFT
# keys from the missing keys, this gives no false positives.
warn_message = f"Found missing adapter keys while loading the checkpoint: {missing_keys}."
prefix = PEFT_TYPE_TO_PREFIX_MAPPING.get(config.peft_type)
if prefix and adapter_name in prefix:
warn_message += (
f"Adapter name {adapter_name} should not be contained in the prefix {prefix}."
"This could be the potential reason for missing adapter keys."
)
warnings.warn(warn_message)
return model
def _setup_prompt_encoder(self, adapter_name: str):
@ -614,29 +590,44 @@ class PeftModel(PushToHubMixin, torch.nn.Module):
if config.num_transformer_submodules is None:
config.num_transformer_submodules = 2 if config.task_type == TaskType.SEQ_2_SEQ_LM else 1
for named_param, value in list(transformer_backbone.named_parameters()):
# for ZeRO-3, the tensor is sharded across accelerators and deepspeed modifies it to a tensor with shape [0]
# the actual unsharded shape is stored in "ds_shape" attribute
# special handling is needed in case the model is initialized in deepspeed.zero.Init() context or HfDeepSpeedConfig
# has been called before
# For reference refer to issue: https://github.com/huggingface/peft/issues/996
deepspeed_distributed_tensor_shape = getattr(value, "ds_shape", None)
# determine the word embeddings
word_embeddings = None
try:
# First try to find the word embeddings based on the module name, this should work for models like Bert,
# Roberta, Deberta, etc.
word_embeddings = self.base_model.get_submodule("embeddings.word_embeddings")
except AttributeError:
pass
if value.shape[0] == self.base_model.config.vocab_size or (
deepspeed_distributed_tensor_shape is not None
and deepspeed_distributed_tensor_shape[0] == self.base_model.config.vocab_size
):
self.word_embeddings = transformer_backbone.get_submodule(named_param.replace(".weight", ""))
break
if word_embeddings is None:
# Word embeddings could not be determined. Next try to guess them by checking which parameter has the size
# of the vocab.
for named_param, value in list(transformer_backbone.named_parameters()):
# for ZeRO-3, the tensor is sharded across accelerators and deepspeed modifies it to a tensor with shape
# [0]; the actual unsharded shape is stored in the "ds_shape" attribute. Special handling is needed in case
# the model is initialized in a deepspeed.zero.Init() context or HfDeepSpeedConfig has been called before.
# For reference, see issue: https://github.com/huggingface/peft/issues/996
deepspeed_distributed_tensor_shape = getattr(value, "ds_shape", None)
if config.peft_type == PeftType.PROMPT_TUNING:
prompt_encoder = PromptEmbedding(config, self.word_embeddings)
elif config.peft_type == PeftType.MULTITASK_PROMPT_TUNING:
prompt_encoder = MultitaskPromptEmbedding(config, self.word_embeddings)
if value.shape[0] == self.base_model.config.vocab_size or (
deepspeed_distributed_tensor_shape is not None
and deepspeed_distributed_tensor_shape[0] == self.base_model.config.vocab_size
):
word_embeddings = transformer_backbone.get_submodule(named_param.replace(".weight", ""))
break
self.word_embeddings = word_embeddings
model_cls = PEFT_TYPE_TO_TUNER_MAPPING[config.peft_type]
if config.peft_type in (PeftType.PROMPT_TUNING, PeftType.MULTITASK_PROMPT_TUNING, PeftType.CPT):
prompt_encoder = model_cls(config, self.word_embeddings)
elif config.peft_type == PeftType.P_TUNING:
prompt_encoder = PromptEncoder(config)
prompt_encoder = model_cls(config)
elif config.peft_type == PeftType.PREFIX_TUNING:
prompt_encoder = PrefixEncoder(config)
# prefix tuning now uses Cache but that won't work with gradient checkpointing
if any(getattr(module, "gradient_checkpointing", False) for module in self.get_base_model().modules()):
raise ValueError("Prefix tuning does not work with gradient checkpointing.")
prompt_encoder = model_cls(config)
else:
raise ValueError("Not supported")
@ -674,11 +665,13 @@ class PeftModel(PushToHubMixin, torch.nn.Module):
prompt_tokens = (
self.prompt_tokens[adapter_name].unsqueeze(0).expand(1, -1).to(prompt_encoder.embedding.weight.device)
)
peft_type = self.peft_config[adapter_name].peft_type
if self.peft_config[adapter_name].peft_type == PeftType.PREFIX_TUNING:
prompt_tokens = prompt_tokens[:, : self.peft_config[adapter_name].num_virtual_tokens]
if self.peft_config[adapter_name].peft_type == PeftType.MULTITASK_PROMPT_TUNING:
prompt_embeddings = super(MultitaskPromptEmbedding, prompt_encoder).forward(prompt_tokens)
prompt_embedding_cls = PEFT_TYPE_TO_TUNER_MAPPING[peft_type]
prompt_embeddings = super(prompt_embedding_cls, prompt_encoder).forward(prompt_tokens)
else:
prompt_embeddings = prompt_encoder(prompt_tokens)
@ -719,6 +712,19 @@ class PeftModel(PushToHubMixin, torch.nn.Module):
if TRANSFORMERS_MODELS_TO_PREFIX_TUNING_POSTPROCESS_MAPPING.get(self.config.model_type, None) is not None:
post_process_fn = TRANSFORMERS_MODELS_TO_PREFIX_TUNING_POSTPROCESS_MAPPING[self.config.model_type]
past_key_values = post_process_fn(past_key_values)
elif peft_config.num_transformer_submodules == 1:
# Don't apply this to encoder-decoder models or to models requiring special processing.
# local import in case users use a very old transformers version
past_key_values = DynamicCache.from_legacy_cache(past_key_values)
elif peft_config.num_transformer_submodules == 2 and self.base_model._supports_cache_class:
# Don't apply this to encoder-decoder models that don't support the new Cache format yet
# If we don't apply this, prefix-tuning fails to update cross-attn cache
past_key_values = EncoderDecoderCache.from_legacy_cache(past_key_values)
past_key_values.cross_attention_cache = DynamicCache()
past_key_values.is_updated = {
layer_idx: False for layer_idx in range(len(past_key_values.cross_attention_cache.key_cache))
}
map_cache_to_layer_device_map(self.get_base_model(), past_key_values) # no-op if not a Cache instance
return past_key_values
else:
if peft_config.peft_type == PeftType.MULTITASK_PROMPT_TUNING:
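As a point of reference for the cache handling above, a minimal sketch of the legacy-tuple-to-Cache conversion (shapes are arbitrary; assumes a transformers version that exports DynamicCache):

import torch
from transformers import DynamicCache

# Legacy format: one (key, value) pair per layer, each shaped [batch, num_heads, seq_len, head_dim].
legacy_past_key_values = tuple(
    (torch.zeros(1, 2, 4, 8), torch.zeros(1, 2, 4, 8)) for _ in range(3)
)

cache = DynamicCache.from_legacy_cache(legacy_past_key_values)
print(cache.get_seq_length())  # 4 -- the prefix length the model sees as already-cached context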
@ -903,6 +909,13 @@ class PeftModel(PushToHubMixin, torch.nn.Module):
adapters. Don't use this option when creating a new PEFT adapter for training.
"""
prefix = PEFT_TYPE_TO_PREFIX_MAPPING.get(peft_config.peft_type)
if prefix and adapter_name in prefix:
warnings.warn(
f"Adapter name {adapter_name} should not be contained in the prefix {prefix}."
"This may lead to reinitialization of the adapter weights during loading."
)
if peft_config.peft_type != self.peft_type:
raise ValueError(
f"Cannot combine adapters with different peft types. "
@ -939,7 +952,61 @@ class PeftModel(PushToHubMixin, torch.nn.Module):
self.modules_to_save = set(peft_config.modules_to_save)
else:
self.modules_to_save.update(peft_config.modules_to_save)
_set_trainable(self, adapter_name) # this may add a new ModulesToSaveWrapper
# this may add a new ModulesToSaveWrapper
_set_trainable(self, adapter_name, module_names=peft_config.modules_to_save)
if getattr(peft_config, "trainable_token_indices", None) is not None:
if isinstance(peft_config.trainable_token_indices, dict):
target_layers = peft_config.trainable_token_indices
else:
layer_name = _get_input_embeddings_name(self.model, "embed_tokens")
target_layers = {layer_name: peft_config.trainable_token_indices}
if self.modules_to_save:
for target_layer in target_layers:
if target_layer in self.modules_to_save:
raise ValueError(
"The embedding layer is already marked to be trained fully, either specify "
f'`modules_to_save=[..., "{target_layer}", ...]` or '
f"`trainable_tokens={{'{target_layer}': x}}` but not both."
)
# we are not adding these module names to `self.modules_to_save` as this is strictly reserved for the
# `ModulesToSaveWrapper`.
for target_layer, token_indices in target_layers.items():
_set_trainable(
self,
adapter_name,
module_names=[target_layer],
strict_module_check=True,
wrapper_cls=TrainableTokensWrapper,
token_indices=token_indices,
)
# It is possible that output weights are tied to the input weights.
# In that case we will tie any module that wants tied weights to the token adapter to make sure that
# any modification is reflected in the tied layers as well.
model_config = BaseTuner.get_model_config(self)
if (
model_config.get("tie_word_embeddings", False)
# some models may be misconfigured to have weight tying enabled but don't define tied weights keys
and self.model._tied_weights_keys is not None
and isinstance(self.model.get_input_embeddings(), TrainableTokensWrapper)
):
# the embedding layer is modified and we want weight tying.
module_keys = [".".join(n.split(".")[:-1]) for n in self.model._tied_weights_keys]
token_adapter = self.model.get_input_embeddings().token_adapter
_set_trainable(
self,
adapter_name,
module_names=module_keys,
strict_module_check=True,
wrapper_cls=TrainableTokensWrapper,
token_indices=token_adapter.token_indices[adapter_name],
tied_adapter=self.model.get_input_embeddings().token_adapter,
)
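To illustrate the configuration side of this block, a small sketch of requesting trainable tokens through a LoRA config; the embedding module name and token indices are hypothetical:

from peft import LoraConfig

config = LoraConfig(
    target_modules=["q_proj", "v_proj"],
    # Train only a few embedding rows instead of marking the whole embedding as a module to save.
    # The dict form targets a specific embedding layer by name; passing a plain list of indices
    # instead falls back to the input embedding layer that is determined automatically.
    trainable_token_indices={"embed_tokens": [0, 1, 2]},
)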
def get_layer_status(self) -> list[TunerLayerStatus]:
"""Get the status of each adapter layer in the model.
@ -1105,6 +1172,36 @@ class PeftModel(PushToHubMixin, torch.nn.Module):
os.makedirs(base_name)
safe_save_file(safe_dict, new_fname, metadata=metadata)
def _check_new_adapter_config(self, peft_config: PeftConfig, is_trainable: bool) -> None:
"""Perform checks on newly added PEFT configs to ensure integrity."""
if peft_config.is_prompt_learning and is_trainable:
raise ValueError("Cannot set a prompt learning adapter to trainable when loading pretrained adapter.")
# Since PiSSA/CorDA/OLoRA modifies the base weights, it should not be combined with other adapters.
all_configs = [peft_config] + list(self.peft_config.values())
if len(all_configs) > 1:
if any(getattr(config, "init_lora_weights", None) == "pissa" for config in all_configs):
msg = (
"PiSSA changes the base weights of the model and should thus not be used with other adapters. "
"Consider converting the PiSSA adapter into a normal LoRA adapter: "
"https://github.com/huggingface/peft/tree/main/examples/pissa_finetuning#convert-pissa-to-lora"
)
warnings.warn(msg)
elif any(getattr(config, "init_lora_weights", None) == "corda" for config in all_configs):
msg = (
"CorDA changes the base weights of the model and should thus not be used with other adapters. "
"Consider converting the CorDA adapter into a normal LoRA adapter: "
"https://github.com/huggingface/peft/tree/main/examples/corda_finetuning#convert-corda-to-lora"
)
warnings.warn(msg)
elif any(getattr(config, "init_lora_weights", None) == "olora" for config in all_configs):
msg = (
"OLoRA changes the base weights of the model and should thus not be used with other adapters. "
"Consider converting the OLoRA adapter into a normal LoRA adapter: "
"https://github.com/huggingface/peft/tree/main/examples/olora_finetuning#olora-and-lora"
)
warnings.warn(msg)
def load_adapter(
self,
model_id: Union[str, os.PathLike],
@ -1168,10 +1265,8 @@ class PeftModel(PushToHubMixin, torch.nn.Module):
ephemeral_gpu_offload=ephemeral_gpu_offload,
**hf_hub_download_kwargs,
)
if peft_config.is_prompt_learning and is_trainable:
raise ValueError("Cannot set a prompt learning adapter to trainable when loading pretrained adapter.")
else:
peft_config.inference_mode = not is_trainable
self._check_new_adapter_config(peft_config, is_trainable=is_trainable)
peft_config.inference_mode = not is_trainable
self.add_adapter(adapter_name, peft_config, low_cpu_mem_usage=low_cpu_mem_usage)
adapters_weights = load_peft_weights(model_id, device=torch_device, **hf_hub_download_kwargs)
@ -1185,6 +1280,19 @@ class PeftModel(PushToHubMixin, torch.nn.Module):
ignore_mismatched_sizes=ignore_mismatched_sizes,
low_cpu_mem_usage=low_cpu_mem_usage,
)
tuner = self.peft_config[adapter_name].peft_type
tuner_prefix = PEFT_TYPE_TO_PREFIX_MAPPING.get(tuner, "")
adapter_missing_keys = []
# Filter missing keys specific to the current adapter and tuner prefix.
for key in load_result.missing_keys:
if tuner_prefix in key and adapter_name in key:
adapter_missing_keys.append(key)
load_result.missing_keys.clear()
load_result.missing_keys.extend(adapter_missing_keys)
if (
(getattr(self, "hf_device_map", None) is not None)
and (len(set(self.hf_device_map.values()).intersection({"cpu", "disk"})) > 0)
@ -1395,9 +1503,9 @@ class PeftModelForSequenceClassification(PeftModel):
break
# to make sure classifier layer is trainable; this may add a new ModulesToSaveWrapper
_set_trainable(self, adapter_name)
_set_trainable(self, adapter_name, module_names=peft_config.modules_to_save)
def add_adapter(self, adapter_name: str, peft_config: PeftConfig) -> None:
def add_adapter(self, adapter_name: str, peft_config: PeftConfig, low_cpu_mem_usage: bool = False) -> None:
"""
Add an adapter to the model based on the passed configuration.
@ -1413,6 +1521,10 @@ class PeftModelForSequenceClassification(PeftModel):
The name of the adapter to be added.
peft_config ([`PeftConfig`]):
The configuration of the adapter to be added.
low_cpu_mem_usage (`bool`, `optional`, defaults to `False`):
Create empty adapter weights on meta device. Useful to speed up the process when loading saved
adapters. Don't use this option when creating a new PEFT adapter for training.
"""
# ensure that additional adapters also add the classifier layer to modules_to_save
if hasattr(peft_config, "modules_to_save"):
@ -1422,7 +1534,7 @@ class PeftModelForSequenceClassification(PeftModel):
else:
peft_config.modules_to_save.extend(classifier_module_names)
return super().add_adapter(adapter_name, peft_config)
return super().add_adapter(adapter_name, peft_config, low_cpu_mem_usage=low_cpu_mem_usage)
def forward(
self,
@ -1678,6 +1790,8 @@ class PeftModelForCausalLM(PeftModel):
# overwrite past_kv in kwargs
kwargs["past_key_values"] = self.get_prompt(batch_size)
return self.base_model(input_ids=input_ids, inputs_embeds=inputs_embeds, **kwargs)
elif peft_config.peft_type == PeftType.CPT:
return self._cpt_forward(input_ids, inputs_embeds, peft_config, task_ids, batch_size, **kwargs)
else:
if inputs_embeds is None:
inputs_embeds = self.word_embeddings(input_ids)
@ -1690,6 +1804,62 @@ class PeftModelForCausalLM(PeftModel):
inputs_embeds = torch.cat((prompts, inputs_embeds), dim=1)
return self.base_model(inputs_embeds=inputs_embeds, **kwargs)
def _cpt_forward(self, input_ids, inputs_embeds, peft_config, task_ids, batch_size, **kwargs):
# Extract labels from kwargs
labels = kwargs.pop("labels")
device = [i.device for i in [input_ids, inputs_embeds, labels] if i is not None][0]
# Extract input_type_mask from kwargs and move it to the same device as labels
if "input_type_mask" in kwargs.keys():
input_type_mask = kwargs.pop("input_type_mask").to(device)
else:
if input_ids is None:
N_tokens = inputs_embeds.shape[1]
else:
N_tokens = input_ids.shape[1]
input_type_mask = torch.ones((batch_size, N_tokens)).to(device) * 4
cpt_token_ids = peft_config.cpt_token_ids
cpt_tokens_type_mask = peft_config.cpt_tokens_type_mask
# Generate embeddings if not provided
if inputs_embeds is None:
inputs_embeds = self.word_embeddings(input_ids)
# Get prompt and concatenate with input embeddings
prompts = self.get_prompt(batch_size=batch_size, task_ids=task_ids)
prompts = prompts.to(inputs_embeds.dtype)
inputs_embeds = torch.cat((prompts, inputs_embeds), dim=1)
# If labels are provided, generate prefix labels and type mask
cpt_labels = None
if labels is not None:
# Generate prefix labels and concatenate with the input labels
prefix_labels = torch.Tensor(cpt_token_ids).long().view(1, -1)
prefix_labels = prefix_labels.repeat(batch_size, 1).to(labels.device)
cpt_labels = torch.cat((prefix_labels, labels), dim=1)
# Generate prefix type mask and shift input type mask values to avoid conflicts
prefix_type_mask = torch.Tensor(cpt_tokens_type_mask).long().view(1, -1)
prefix_type_mask = prefix_type_mask.repeat(batch_size, 1).to(labels.device)
adjusted_input_type_mask = input_type_mask
adjusted_input_type_mask[adjusted_input_type_mask > 0] += prefix_type_mask.max()
# Concatenate prefix and shifted input type masks
cpt_type_mask = torch.cat((prefix_type_mask, adjusted_input_type_mask), dim=1)
# Identify valid label positions and mask invalid ones with -100
labels_idx = (cpt_type_mask > 0) & (cpt_type_mask % 4 == 0)
cpt_labels[~labels_idx] = -100
# Update kwargs with the modified labels
kwargs["labels"] = cpt_labels
# Pass the modified inputs to the base model
base_model_output = self.base_model(inputs_embeds=inputs_embeds, **kwargs)
if labels is None:
return base_model_output
else:
# Calculate the loss using the custom CPT loss function
cpt_embedding = PEFT_TYPE_TO_TUNER_MAPPING[peft_config.peft_type]
base_model_output = cpt_embedding.calculate_loss(
base_model_output, cpt_labels, cpt_type_mask, self.peft_config["default"]
)
return base_model_output
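For context, a rough sketch of the config fields that _cpt_forward consumes. The attribute accesses above suggest these can be set directly on CPTConfig; the token ids and type-mask values below are purely hypothetical:

from peft import CPTConfig

config = CPTConfig(
    # Template token ids that form the context prompt prepended to the input (hypothetical values).
    cpt_token_ids=[101, 2023, 2003, 102],
    # Per-token type labels for the template; in _cpt_forward, concatenated positions whose type is a
    # positive multiple of 4 are kept as label positions, everything else is masked out with -100.
    cpt_tokens_type_mask=[1, 2, 2, 4],
)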
def generate(self, *args, **kwargs):
peft_config = self.active_peft_config
self.base_model.prepare_inputs_for_generation = self.prepare_inputs_for_generation
@ -1732,7 +1902,7 @@ class PeftModelForCausalLM(PeftModel):
if peft_config.peft_type == PeftType.POLY:
model_kwargs["task_ids"] = task_ids
if peft_config.is_prompt_learning:
if uses_cache and (model_kwargs["past_key_values"] is not None):
if uses_cache and (model_kwargs.get("past_key_values", None) is not None):
# A change in the logic of `prepare_inputs_for_generation` makes the code below necessary.
# In prompt learning methods, past key values are longer than the `input_ids`.
# As such, only consider the last input ids in the autoregressive generation phase.
@ -1762,8 +1932,9 @@ class PeftModelForCausalLM(PeftModel):
kwargs["token_type_ids"] = None
# no past_key_values, or past_key_values is an empty cache
requires_prompt_injection = (model_kwargs["past_key_values"] is None) or (
isinstance(model_kwargs["past_key_values"], transformers.Cache) and not model_kwargs["past_key_values"]
requires_prompt_injection = (model_kwargs.get("past_key_values", None) is None) or (
isinstance(model_kwargs["past_key_values"], transformers.Cache)
and not model_kwargs["past_key_values"].get_seq_length()
)
if requires_prompt_injection and peft_config.peft_type == PeftType.PREFIX_TUNING:
@ -2037,10 +2208,20 @@ class PeftModelForSeq2SeqLM(PeftModel):
model_kwargs = self.base_model_prepare_inputs_for_generation(*args, **kwargs)
if peft_config.peft_type == PeftType.POLY:
model_kwargs["task_ids"] = task_ids
if model_kwargs["past_key_values"] is None and peft_config.peft_type == PeftType.PREFIX_TUNING:
batch_size = model_kwargs["decoder_input_ids"].shape[0]
past_key_values = self.get_prompt(batch_size)
model_kwargs["past_key_values"] = past_key_values
elif peft_config.peft_type == PeftType.PREFIX_TUNING:
past_key_values = model_kwargs.get("past_key_values", None)
cache_position = model_kwargs.get("cache_position", [None])
# check prefill stage
is_prefill_stage = (
# old cache implementation
(past_key_values is None)
# new cache implementation
or (isinstance(past_key_values, Cache) and (cache_position[0] == 0))
)
if is_prefill_stage:
batch_size = model_kwargs["decoder_input_ids"].shape[0]
new_past_key_values = self.get_prompt(batch_size)
model_kwargs["past_key_values"] = new_past_key_values
return model_kwargs
@ -2113,9 +2294,9 @@ class PeftModelForTokenClassification(PeftModel):
break
# to make sure classifier layer is trainable; this may add a new ModulesToSaveWrapper
_set_trainable(self, adapter_name)
_set_trainable(self, adapter_name, module_names=peft_config.modules_to_save)
def add_adapter(self, adapter_name: str, peft_config: PeftConfig) -> None:
def add_adapter(self, adapter_name: str, peft_config: PeftConfig, low_cpu_mem_usage: bool = False) -> None:
"""
Add an adapter to the model based on the passed configuration.
@ -2131,6 +2312,10 @@ class PeftModelForTokenClassification(PeftModel):
The name of the adapter to be added.
peft_config ([`PeftConfig`]):
The configuration of the adapter to be added.
low_cpu_mem_usage (`bool`, `optional`, defaults to `False`):
Create empty adapter weights on meta device. Useful to speed up the process when loading saved
adapters. Don't use this option when creating a new PEFT adapter for training.
"""
# ensure that additional adapters also add the classifier layer to modules_to_save
if hasattr(peft_config, "modules_to_save"):
@ -2140,7 +2325,7 @@ class PeftModelForTokenClassification(PeftModel):
else:
peft_config.modules_to_save.extend(classifier_module_names)
return super().add_adapter(adapter_name, peft_config)
return super().add_adapter(adapter_name, peft_config, low_cpu_mem_usage=low_cpu_mem_usage)
def forward(
self,
@ -2330,9 +2515,9 @@ class PeftModelForQuestionAnswering(PeftModel):
break
# to make sure classifier layer is trainable; this may add a new ModulesToSaveWrapper
_set_trainable(self, adapter_name)
_set_trainable(self, adapter_name, module_names=peft_config.modules_to_save)
def add_adapter(self, adapter_name: str, peft_config: PeftConfig) -> None:
def add_adapter(self, adapter_name: str, peft_config: PeftConfig, low_cpu_mem_usage: bool = False) -> None:
"""
Add an adapter to the model based on the passed configuration.
@ -2348,6 +2533,10 @@ class PeftModelForQuestionAnswering(PeftModel):
The name of the adapter to be added.
peft_config ([`PeftConfig`]):
The configuration of the adapter to be added.
low_cpu_mem_usage (`bool`, `optional`, defaults to `False`):
Create empty adapter weights on meta device. Useful to speed up the process when loading saved
adapters. Don't use this option when creating a new PEFT adapter for training.
"""
# ensure that additional adapters also add the classifier layer to modules_to_save
if hasattr(peft_config, "modules_to_save"):
@ -2357,7 +2546,7 @@ class PeftModelForQuestionAnswering(PeftModel):
else:
peft_config.modules_to_save.extend(qa_module_names)
return super().add_adapter(adapter_name, peft_config)
return super().add_adapter(adapter_name, peft_config, low_cpu_mem_usage=low_cpu_mem_usage)
def forward(
self,
@ -2890,3 +3079,19 @@ def get_model_status(model: torch.nn.Module) -> TunerModelStatus:
devices=devices,
)
return adapter_model_status
def __getattr__(name):
if name == "PEFT_TYPE_TO_MODEL_MAPPING":
# This is for backwards compatibility: In #2282, PEFT_TYPE_TO_MODEL_MAPPING was removed as it was redundant with
# PEFT_TYPE_TO_TUNER_MAPPING. However, third party code could still use this mapping, e.g.:
# https://github.com/AutoGPTQ/AutoGPTQ/blob/6689349625de973b9ee3016c28c11f32acf7f02c/auto_gptq/utils/peft_utils.py#L8
# TODO: Remove after 2026-01
msg = (
"PEFT_TYPE_TO_MODEL_MAPPING is deprecated, please use `from peft import PEFT_TYPE_TO_TUNER_MAPPING` instead. "
"The deprecated variable will be removed in 2026."
)
warnings.warn(msg, category=DeprecationWarning)
return PEFT_TYPE_TO_TUNER_MAPPING
raise AttributeError(f"module {__name__!r} has no attribute {name!r}")


@ -1,8 +1,3 @@
# flake8: noqa
# There's no way to ignore "F401 '...' imported but unused" warnings in this
# module, but to preserve other warnings. So, don't check this module at all
# coding=utf-8
# Copyright 2023-present the HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
@ -17,23 +12,91 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from .adalora import AdaLoraConfig, AdaLoraModel
from .adaption_prompt import AdaptionPromptConfig, AdaptionPromptModel
from .lora import LoraConfig, LoraModel, LoftQConfig, LoraRuntimeConfig
from .boft import BOFTConfig, BOFTModel
from .bone import BoneConfig, BoneModel
from .cpt import CPTConfig, CPTEmbedding
from .fourierft import FourierFTConfig, FourierFTModel
from .hra import HRAConfig, HRAModel
from .ia3 import IA3Config, IA3Model
from .ln_tuning import LNTuningConfig, LNTuningModel
from .loha import LoHaConfig, LoHaModel
from .lokr import LoKrConfig, LoKrModel
from .ia3 import IA3Config, IA3Model
from .adalora import AdaLoraConfig, AdaLoraModel
from .boft import BOFTConfig, BOFTModel
from .p_tuning import PromptEncoder, PromptEncoderConfig, PromptEncoderReparameterizationType
from .prefix_tuning import PrefixEncoder, PrefixTuningConfig
from .prompt_tuning import PromptEmbedding, PromptTuningConfig, PromptTuningInit
from .lora import (
EvaConfig,
LoftQConfig,
LoraConfig,
LoraModel,
LoraRuntimeConfig,
get_eva_state_dict,
initialize_lora_eva_weights,
)
from .mixed import MixedModel
from .multitask_prompt_tuning import MultitaskPromptEmbedding, MultitaskPromptTuningConfig, MultitaskPromptTuningInit
from .oft import OFTConfig, OFTModel
from .mixed import MixedModel
from .p_tuning import PromptEncoder, PromptEncoderConfig, PromptEncoderReparameterizationType
from .poly import PolyConfig, PolyModel
from .ln_tuning import LNTuningConfig, LNTuningModel
from .vera import VeraConfig, VeraModel
from .fourierft import FourierFTConfig, FourierFTModel
from .xlora import XLoraConfig, XLoraModel
from .hra import HRAConfig, HRAModel
from .prefix_tuning import PrefixEncoder, PrefixTuningConfig
from .prompt_tuning import PromptEmbedding, PromptTuningConfig, PromptTuningInit
from .trainable_tokens import TrainableTokensConfig, TrainableTokensModel
from .vblora import VBLoRAConfig, VBLoRAModel
from .vera import VeraConfig, VeraModel
from .xlora import XLoraConfig, XLoraModel
__all__ = [
"AdaLoraConfig",
"AdaLoraModel",
"AdaptionPromptConfig",
"AdaptionPromptModel",
"BOFTConfig",
"BOFTModel",
"BoneConfig",
"BoneModel",
"CPTConfig",
"CPTEmbedding",
"EvaConfig",
"FourierFTConfig",
"FourierFTModel",
"HRAConfig",
"HRAModel",
"IA3Config",
"IA3Model",
"LNTuningConfig",
"LNTuningModel",
"LoHaConfig",
"LoHaModel",
"LoKrConfig",
"LoKrModel",
"LoftQConfig",
"LoraConfig",
"LoraModel",
"LoraRuntimeConfig",
"MixedModel",
"MultitaskPromptEmbedding",
"MultitaskPromptTuningConfig",
"MultitaskPromptTuningInit",
"OFTConfig",
"OFTModel",
"PolyConfig",
"PolyModel",
"PrefixEncoder",
"PrefixTuningConfig",
"PromptEmbedding",
"PromptEncoder",
"PromptEncoderConfig",
"PromptEncoderReparameterizationType",
"PromptTuningConfig",
"PromptTuningInit",
"TrainableTokensConfig",
"TrainableTokensModel",
"VBLoRAConfig",
"VBLoRAModel",
"VeraConfig",
"VeraModel",
"XLoraConfig",
"XLoraModel",
"get_eva_state_dict",
"initialize_lora_eva_weights",
]


@ -55,11 +55,11 @@ class BufferDict(Module):
(string, `torch.Tensor`).
"""
super().__init__()
self.persistent = persistent
if buffers is not None:
self.update(buffers)
self.persistent = persistent
def __getitem__(self, key):
return self._buffers[key]
@ -125,19 +125,17 @@ class BufferDict(Module):
"iterable of key/value pairs, but got " + type(buffers).__name__
)
if isinstance(buffers, collections.abc.Mapping):
if isinstance(buffers, (OrderedDict, BufferDict)):
for key, buffer in buffers.items():
self[key] = buffer
else:
for key, buffer in sorted(buffers.items()):
self[key] = buffer
if isinstance(buffers, (OrderedDict, BufferDict)):
for key, buffer in buffers.items():
self[key] = buffer
elif isinstance(buffers, collections.abc.Mapping):
for key, buffer in sorted(buffers.items()):
self[key] = buffer
else:
for j, p in enumerate(buffers):
if not isinstance(p, collections.abc.Iterable):
raise TypeError(
"BufferDict update sequence element "
"#" + str(j) + " should be Iterable; is" + type(p).__name__
"BufferDict update sequence element #" + str(j) + " should be Iterable; is" + type(p).__name__
)
if not len(p) == 2:
raise ValueError(


@ -13,6 +13,7 @@
# limitations under the License.
from peft.import_utils import is_bnb_4bit_available, is_bnb_available
from peft.utils import register_peft_method
from .config import AdaLoraConfig
from .gptq import SVDQuantLinear
@ -20,7 +21,12 @@ from .layer import AdaLoraLayer, RankAllocator, SVDLinear
from .model import AdaLoraModel
__all__ = ["AdaLoraConfig", "AdaLoraLayer", "AdaLoraModel", "SVDLinear", "RankAllocator", "SVDQuantLinear"]
__all__ = ["AdaLoraConfig", "AdaLoraLayer", "AdaLoraModel", "RankAllocator", "SVDLinear", "SVDQuantLinear"]
register_peft_method(
name="adalora", config_cls=AdaLoraConfig, model_cls=AdaLoraModel, prefix="lora_", is_mixed_compatible=True
)
def __getattr__(name):


@ -129,9 +129,7 @@ if is_bnb_4bit_available():
requires_conversion = not torch.is_autocast_enabled()
if requires_conversion:
expected_dtype = result.dtype
compute_dtype = lora_A.dtype
if x.dtype != compute_dtype:
x = x.to(compute_dtype)
x = self._cast_input_dtype(x, lora_A.dtype)
output = dropout(x) @ (lora_A * lora_E).T @ lora_B.T
if requires_conversion:


@ -25,11 +25,29 @@ class AdaLoraConfig(LoraConfig):
"""
This is the configuration class to store the configuration of a [`~peft.AdaLora`].
AdaLoRA has three phases defined by `tinit`, `tfinal` and `total_step`.
The initial phase can be understood as a warm-up for the adapters so that, when their rank is reduced later, they
already encode useful information rather than random matrices. The length of this phase is set by supplying
`tinit`.
After the initial phase is over (`tinit` steps have passed) and the final phase has not begun, AdaLoRA reduces the
budget of how much rank each layer is allowed to have with each step. This is where the rank reduction happens; it
goes on until `total_step - tfinal` steps are reached.
The last phase, beginning once `total_step - tfinal` steps are reached, does not change the layer ranks anymore but
fine-tunes the reduced-rank layers that resulted from the previous phase.
A practical example: `tinit` is 10, `tfinal` is 20, `total_step` is 100. We spend 10 steps doing pre-training
without rank reduction because our budget is constant (init phase), then we spend 70 (100 - 10 - 20) steps in the
reduction phase where our budget decreases step-wise and, finally, 20 steps in the final fine-tuning stage without
reduction.
Args:
target_r (`int`): The target average rank of incremental matrix.
init_r (`int`): The initial rank for each incremental matrix.
tinit (`int`): The steps of initial fine-tuning warmup.
tfinal (`int`): The step of final fine-tuning.
tfinal (`int`): The number of steps of final fine-tuning.
deltaT (`int`): The time interval between two budget allocations.
beta1 (`float`): The hyperparameter of EMA for sensitivity smoothing.
beta2 (`float`): The hyperparameter of EMA for uncertainty quantification.
@ -50,6 +68,7 @@ class AdaLoraConfig(LoraConfig):
rank_pattern: Optional[dict] = field(default=None, metadata={"help": "The saved rank pattern."})
def __post_init__(self):
super().__post_init__()
self.peft_type = PeftType.ADALORA
if self.use_dora:
@ -61,13 +80,16 @@ class AdaLoraConfig(LoraConfig):
self.target_modules = (
set(self.target_modules) if isinstance(self.target_modules, list) else self.target_modules
)
self.exclude_modules = (
set(self.exclude_modules) if isinstance(self.exclude_modules, list) else self.exclude_modules
)
# if target_modules is a regex expression, then layers_to_transform should be None
if isinstance(self.target_modules, str) and self.layers_to_transform is not None:
raise ValueError("`layers_to_transform` cannot be used when `target_modules` is a str.")
# if target_modules is a regex expression, then layers_pattern should be None
if isinstance(self.target_modules, str) and self.layers_pattern is not None:
raise ValueError("`layers_pattern` cannot be used when `target_modules` is a str.")
# check for layers_to_transform and layers_pattern
if self.layers_pattern and not self.layers_to_transform:
raise ValueError("When `layers_pattern` is specified, `layers_to_transform` must also be specified. ")
# Check if 'r' has been set to a non-default value
if self.r != 8: # 8 is the default value for 'r' in LoraConfig
@ -75,3 +97,12 @@ class AdaLoraConfig(LoraConfig):
"Note that `r` is not used in AdaLora and will be ignored."
"If you intended to set the initial rank, use `init_r` instead."
)
if self.total_step is None or self.total_step <= 0:
raise ValueError("AdaLoRA does not work when `total_step` is None, supply a value > 0.")
if self.tinit >= (self.total_step - self.tfinal):
raise ValueError(
"The supplied schedule values don't allow for a budgeting phase. Decrease `tfinal`/`tinit` or "
"increase `total_step`."
)
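Tying the schedule description and these checks together, a minimal sketch that uses the values from the docstring's example; the targeted modules and rank values are hypothetical:

from peft import AdaLoraConfig

# 10 warm-up steps, 70 budget-reduction steps, 20 final fine-tuning steps.
config = AdaLoraConfig(
    init_r=12,
    target_r=4,
    tinit=10,
    tfinal=20,
    total_step=100,  # must be set and satisfy tinit < total_step - tfinal, per the checks above
    target_modules=["q_proj", "v_proj"],
)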


@ -55,8 +55,7 @@ class SVDQuantLinear(torch.nn.Module, AdaLoraLayer):
requires_conversion = not torch.is_autocast_enabled()
if requires_conversion:
expected_dtype = result.dtype
if x.dtype != torch.float32:
x = x.float()
x = self._cast_input_dtype(x, torch.float32)
output = (dropout(x) @ (lora_A * lora_E).T @ lora_B.T) * scaling / ranknum
# TODO: here, the dtype conversion is applied on the *whole expression*,

Some files were not shown because too many files have changed in this diff.