Compare commits

...

477 Commits

Author SHA1 Message Date
f31b35448b add docs chatbot v0 2023-08-09 16:22:17 +05:30
7d44026dea fix crash when using torch.nn.DataParallel for LORA inference (#805)
Signed-off-by: Wang, Yi <yi.a.wang@intel.com>
2023-08-08 15:07:23 +02:00
ba90047d70 Update docstring of PeftModel.from_pretrained (#799)
1. Addresses
https://github.com/huggingface/peft/issues/430#issuecomment-1666312815
2. Reword docstring to not be LoRA-specific
2023-08-08 14:38:23 +02:00
10cf3a4fa3 add lora default target module for codegen (#787)
Signed-off-by: Wang, Yi <yi.a.wang@intel.com>
2023-08-08 18:08:04 +05:30
aac7722b9e Add adapter error handling (#800)
When a user tries to add a 2nd adapter, Lora and AdaLora make some checks to
ensure the new adapter is compatible with existing adapters. Currently, that
check is performed halfway through the method. This means that if the check
fails, the new adapter is partially applied, leaving the model in a bad state.
The main purpose of this PR is to ensure that the model state is correct after
such a failure is encountered.

Tests were added to catch this potential bug.

While working on this, I also did some related, but not strictly necessary
changes to the add_adapter methods:

- Previously, the peft_config from the PeftModel was passed to the base
  model. This meant that sometimes, the base model would hold a reference
  to PeftModel.peft_config, but not always, as some base models would
  create new dicts. This is problematic, because some code would rely on
  the objects being the same. Now, they are never the same, leading to
  more consistency.
- I think that the check if multiple adapters have biases (which is not
  supported) was accidentally removed by #749. It is added back in.
- Add some type annotations
- Extend docstrings to contain adapter_name
2023-08-08 14:35:19 +02:00
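
A minimal sketch of the user-facing flow this protects (the model id and adapter names are placeholders, not taken from the PR):

```python
from transformers import AutoModelForCausalLM
from peft import LoraConfig, get_peft_model

base = AutoModelForCausalLM.from_pretrained("facebook/opt-125m")  # placeholder model
model = get_peft_model(base, LoraConfig(task_type="CAUSAL_LM", r=8))

# Adding a second adapter: with this PR, the compatibility checks run before
# any modification, so a failing check no longer leaves the model half-modified.
model.add_adapter("second", LoraConfig(task_type="CAUSAL_LM", r=16))
```
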
ed396a69ed [core] PEFT refactor + introducing inject_adapter_in_model public method (#749)
Refactors the internals of some PEFT models a bit and introduces a new
method, inject_adapter_in_model, for users who want to pass a bare model
and a PEFT config to inject adapters in-place into the model. These
changes are fully backward compatible with previous PEFT versions.

This PR makes things easier for the PEFT integration in transformers
huggingface/transformers#25077

The main goal of the PR is to expose a new API for advanced users who
want to integrate PEFT methods without using the PeftModel wrapper. A
simple use case is someone who wants to inject adapters into a model
while keeping the model's original class, without handing that off to
PEFT, which would create a PeftModel. I have faced this issue in
huggingface/transformers#25077. Among other things, this PR refactors
some internals of the PEFT library while keeping it fully backward
compatible.

To tackle the main motivation, I propose to differentiate between two
types of adapters:

1- adapters that are injectable (LoRA, AdaLoRA, IA3)
2- adapters that are not injectable (the rest)

As a first iteration, this API is supported only for scenario 1-. I
therefore created two abstract classes that make it easy to determine
whether an adapter layer (e.g. LoraLayer) / adapter module
(e.g. LoraModel) meets the minimal requirements (i.e. needed
attributes, etc.)

Other related changes:

1- Creates a new property method is_prompt_learning to avoid importing
   PromptLearningConfig all the way down
2- Introduces a new object TUNERS_MAPPING, which is a mapping of
   supported pluggable adapters
3- Creates two abstract classes
3.1- BaseTunerLayer: a mixin to check for the minimal required attributes
     that a tuner layer should have (active_adapter / _is_plugable)
3.2- BaseTuner: a higher level module mixin that should be used for any
     injectable adapters in the future.

---------

Co-authored-by: Benjamin Bossan <BenjaminBossan@users.noreply.github.com>
Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>
2023-08-07 16:34:54 +02:00
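
A hedged sketch of the new public method on a toy module (the module, layer names, and hyperparameters are made up for illustration):

```python
import torch
from peft import LoraConfig, inject_adapter_in_model

class MLP(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.linear = torch.nn.Linear(16, 16)
        self.head = torch.nn.Linear(16, 2)

    def forward(self, x):
        return self.head(self.linear(x))

config = LoraConfig(target_modules=["linear"], r=4, lora_alpha=8)
model = inject_adapter_in_model(config, MLP())  # adapters are injected in place
print(type(model).__name__)  # still "MLP" - no PeftModel wrapper is created
```
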
ec267c644a Allow passing inputs_embeds instead of input_ids (#757)
Resolves #727

Right now, there is an issue with a few PeftModelForXxx classes when
users pass only inputs_embeds but not input_ids. First of all, the batch
size used to be derived from input_ids; now it is derived from
inputs_embeds instead when input_ids is None. Furthermore, a few forward
calls to the base model were not passing the inputs_embeds along, which
resulted in errors down the line. These issues have been fixed now.
2023-08-02 16:59:11 +02:00
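
A small usage sketch, assuming a causal LM wrapped in LoRA (the model id and text are placeholders):

```python
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import LoraConfig, get_peft_model

model_id = "facebook/opt-125m"  # placeholder
tok = AutoTokenizer.from_pretrained(model_id)
model = get_peft_model(AutoModelForCausalLM.from_pretrained(model_id),
                       LoraConfig(task_type="CAUSAL_LM"))

input_ids = tok("Hello world", return_tensors="pt").input_ids
inputs_embeds = model.get_input_embeddings()(input_ids)
out = model(inputs_embeds=inputs_embeds)  # works without passing input_ids
```
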
9b5808938f Support NPU adapter loading (#772) 2023-08-02 12:30:02 +02:00
b10a8cedf6 Support XPU adapter loading (#737) 2023-08-01 15:46:18 +02:00
bfb264ad96 Add progressbar unload/merge (#753)
* add progressbar unload/merge

* make style

* manual fix style

* Update src/peft/tuners/lora.py

Co-authored-by: Younes Belkada <49240599+younesbelkada@users.noreply.github.com>

* Update src/peft/tuners/lora.py

Co-authored-by: Younes Belkada <49240599+younesbelkada@users.noreply.github.com>

---------

Co-authored-by: Younes Belkada <49240599+younesbelkada@users.noreply.github.com>
2023-08-01 12:26:17 +02:00
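
A hedged example of the new flag (the model id and adapter path are placeholders):

```python
from transformers import AutoModelForCausalLM
from peft import PeftModel

base = AutoModelForCausalLM.from_pretrained("facebook/opt-125m")  # placeholder
model = PeftModel.from_pretrained(base, "path/to/lora-adapter")   # placeholder adapter path
merged = model.merge_and_unload(progressbar=True)  # shows progress while merging layers
```
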
702f9377e3 Add tests for AdaLoRA, fix a few bugs (#734)
So far, there have been no tests for AdaLoRA. This PR adds tests similar
to the existing ones. While working on those tests, a few bugs were
encountered and fixed.

The changes made to AdaLoRA:

- Linked to paper abstract, not pdf.
- Don't assume that target modules have a .bias attribute (same as for
  LoRA).
- Fixed an issue where it was assumed that if an output object from
  forward has a .loss attribute, it is a scalar, when it can be None.
- Fixed an issue that when init_lora_weights=False, the weights were
  still initialized to be an identity transform.
- When replacing modules, if a target module is a ModuleList or
  ModuleDict, they are now skipped instead of raising an error that the
  module type is not supported. My reasoning was that it is never intended
  to change those modules, so if their names are matched, it must be a
  false positive. The issue arose because for some target modules, the
  names are just "k" etc., and since we match with endswith, this can
  easily cause modules like "block" to match.
2023-07-28 13:06:53 +02:00
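
For reference, a minimal AdaLoRA setup along the lines of what such tests exercise (the model id and hyperparameters are illustrative, not from this PR):

```python
from transformers import AutoModelForSeq2SeqLM
from peft import AdaLoraConfig, get_peft_model

base = AutoModelForSeq2SeqLM.from_pretrained("t5-small")  # placeholder
config = AdaLoraConfig(
    task_type="SEQ_2_SEQ_LM",
    init_r=12,                  # initial rank per targeted module
    target_r=8,                 # average rank after budget allocation
    target_modules=["q", "v"],
)
model = get_peft_model(base, config)
model.print_trainable_parameters()
```
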
0e33ac1efe DOC: Examples for LoRA with custom models (#724)
Example 1: training a multilayer perceptron
Example 2: fine-tuning a timm image classifier
New section "Developer Guides" in docs.

---------

Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com>
2023-07-27 15:28:33 +02:00
e27e883443 [ModulesToSave] add correct hook management for modules to save (#755)
* add correct hook management for modules to save

* forward contrib credits from finding the solution

* add nice GPU tests

* quality

---------

Co-authored-by: BenjaminBossan <BenjaminBossan@users.noreply.github.com>
2023-07-27 10:29:32 +02:00
ffbb6bcf9c Add btlm to officially supported LoRA (#751) 2023-07-26 22:18:37 +05:30
8541b60acb fix adalora inference issue (#745) 2023-07-26 14:29:25 +02:00
96c0277a1b Updated Example in Class:LoraModel (#672)
* updated Example in Class:LoraModel

* update docstring

* Update src/peft/tuners/adalora.py

Co-authored-by: Younes Belkada <49240599+younesbelkada@users.noreply.github.com>

* Update src/peft/tuners/lora.py

Co-authored-by: Younes Belkada <49240599+younesbelkada@users.noreply.github.com>

* update adalora.py for doc style check

---------

Co-authored-by: Younes Belkada <49240599+younesbelkada@users.noreply.github.com>
2023-07-24 16:48:59 +02:00
b15c185939 FIX: Disabling adapter works with modules_to_save (#736)
Resolves #493

For LoRA and IA³, there was a bug where, even when using the
disable_adapter context, if the module was listed in modules_to_save,
the updated weights would be used instead of the original weights. This
meant that disable_adapter would not return the same results as the base
model without adaptation. This PR fixes the issue and provides a test.

Note: I tried to adjust AdaLoRA too, since it seemed that the same
reasoning should apply there. However, I think that AdaLoRA does not
really support disabling adapters at all. E.g. there is no
disable_adapter_layers method. Therefore, AdaLoRA was not changed.
2023-07-24 13:23:23 +02:00
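
A sketch of the behavior this fixes (model id and input text are placeholders): inside the context, the output should now match the unadapted base model even though the classification head is listed in modules_to_save.

```python
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from peft import LoraConfig, get_peft_model

model_id = "bert-base-uncased"  # placeholder
base = AutoModelForSequenceClassification.from_pretrained(model_id, num_labels=2)
model = get_peft_model(base, LoraConfig(task_type="SEQ_CLS", modules_to_save=["classifier"]))

tok = AutoTokenizer.from_pretrained(model_id)
inputs = tok("an example sentence", return_tensors="pt")
with model.disable_adapter():
    logits = model(**inputs).logits  # now matches the base model's output
```
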
a955ef1088 ENH: Warn when disabling adapters and bias != 'none' (#741)
For LoRA, given that bias='all' or bias='lora_only', when doing inference
with a model in the disable_adapter context, the output will not be
identical to the output of the base model. This may be surprising to
users. Therefore, a warning is given. Furthermore, the docstring has
been extended to reflect this fact.
2023-07-24 10:34:39 +02:00
e06d94ddeb Fixes warning when initializing prompt encoder (#716)
Right now, when the user initializes a prompt encoder with MLP, they get
a warning that a certain argument is ignored, and there is no possible
value for the argument that would stop the warning. Usually, warnings
indicate that something is (probably) going wrong, but here, everything
is going as expected. Therefore, by default, I would not give this
warning, thus avoiding user confusion.

However, I would still give the warning if the user set the argument for
encoder_num_layers explicitly to a different value. In that case, they
expect the change to make a difference, but since the argument is
ignored, their expectation is not met, which warrants a warning.
2023-07-19 16:08:29 +02:00
1681cebf60 [Patch] patch trainable params for 4bit layers (#733)
* patch trainable params for 4bit layers

* revert

* added tests.

* added comments.

* addressed final comments
2023-07-19 14:57:14 +02:00
a09f66c8cd [Llama2] Add disabling TP behavior (#728)
* add disabling TP behavior

* add comments

* adapt from new changes of transformers PR
2023-07-19 14:29:36 +02:00
1869fe6e05 FIX: add type information to package_data (#729)
Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com>
2023-07-19 12:35:39 +02:00
1c27e24d50 revert change (#731) 2023-07-19 14:29:55 +05:30
30fd5a4c88 fix the param count when using 4-bit bnb 2023-07-19 13:22:25 +05:30
3040782e04 Add falcon to officially supported LoRA & IA3 modules (#722)
* add falcon to officially supported modules

* add lora

* add also `RefinedWeb`
2023-07-19 11:18:45 +05:30
1b8b17de86 Fix subfolder issue (#721)
* fix subfolder issue

* added tests
2023-07-19 11:17:15 +05:30
029f416fce Release version 0.5.0.dev0 (#717) 2023-07-17 16:30:46 +05:30
a1953baef6 FIX: Removes warnings about unknown pytest marker (#715)
This is a low prio PR but it solves an annoyance.

Right now, when running tests, the output is spammed by messages like:

> PytestUnknownMarkWarning: Unknown pytest.mark.multi_gpu_tests - is
this a typo? ...

This makes it more difficult to see the actually relevant information.
This PR fixes this by registering the two pytest markers we use, thus
removing the warnings.
2023-07-17 15:30:08 +05:30
e90dcc4be4 better hub kwargs management (#712) 2023-07-17 15:28:57 +05:30
71b326db68 FEAT: Make LoRA work with custom models (#676)
Enable custom models to work with LoRA

This PR enables custom models to work with LoRA in peft by performing a few
changes required for non-transformers models. New tests for linear,
transformers conv1d, and conv2d layers were added.

Not yet contained in this PR:

- support for AdaLoRA and IA³
- documentation
- examples

---------

Co-authored-by: Younes Belkada <49240599+younesbelkada@users.noreply.github.com>
2023-07-17 10:02:30 +02:00
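
A minimal custom-model sketch in the spirit of this PR (layer names and sizes are made up):

```python
import torch
from peft import LoraConfig, get_peft_model

class MLP(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.lin0 = torch.nn.Linear(10, 20)
        self.relu = torch.nn.ReLU()
        self.lin1 = torch.nn.Linear(20, 2)

    def forward(self, x):
        return self.lin1(self.relu(self.lin0(x)))

# No task_type is needed for a plain torch module; just name the layers to adapt.
config = LoraConfig(target_modules=["lin0", "lin1"], r=8, lora_alpha=16)
model = get_peft_model(MLP(), config)
model.print_trainable_parameters()
```
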
42ab10699b [Auto] Support AutoPeftModel for custom HF models (#707)
* support `AutoPeftModel` for custom HF models

* added documentation.
2023-07-15 14:18:34 +02:00
5a0e19dda1 [Feature] Save only selected adapters for LoRA (#705)
* v1 working for LoRA

* more checks

* fix prompt learning issues

* fix failing test

* Apply suggestions from code review

Co-authored-by: Benjamin Bossan <BenjaminBossan@users.noreply.github.com>

* fixed indentation

* move the check above

* added tests for adaption prompt, enc-dec and feature extraction

---------

Co-authored-by: Benjamin Bossan <BenjaminBossan@users.noreply.github.com>
2023-07-14 16:28:03 +02:00
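
A hedged sketch of the new save option (model id, adapter names, and output path are placeholders):

```python
from transformers import AutoModelForCausalLM
from peft import LoraConfig, get_peft_model

base = AutoModelForCausalLM.from_pretrained("facebook/opt-125m")  # placeholder
model = get_peft_model(base, LoraConfig(task_type="CAUSAL_LM"), adapter_name="default")
model.add_adapter("experiment", LoraConfig(task_type="CAUSAL_LM", r=16))

# Only the listed adapters are written to disk.
model.save_pretrained("outputs", selected_adapters=["experiment"])
```
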
86ad5ce55c [Core] Enhancements and refactoring of LoRA method (#695)
* refactor lora and add utils

1. Refactor LoRA code
2. Add method to delete LoRA adapters
3. Add method to unload the PEFT LoRA model.
4. Add `svd` weighted adapter support.
5. minor fixes

* fixes

* fixes

* Update lora.py

* fixes

* Update lora.py

* docstrings for the added public APIs

* docs

* Update src/peft/tuners/lora.py

Co-authored-by: Benjamin Bossan <BenjaminBossan@users.noreply.github.com>

* resolve comments, refactoring and adding tests

* fix the remaining failing tests

---------

Co-authored-by: Benjamin Bossan <BenjaminBossan@users.noreply.github.com>
2023-07-14 19:44:51 +05:30
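
A hedged sketch of the APIs this refactor adds or extends (model id and adapter names are placeholders; comment numbers refer to the list above):

```python
from transformers import AutoModelForCausalLM
from peft import LoraConfig, get_peft_model

base = AutoModelForCausalLM.from_pretrained("facebook/opt-125m")  # placeholder
model = get_peft_model(base, LoraConfig(task_type="CAUSAL_LM"), adapter_name="adapter_a")
model.add_adapter("adapter_b", LoraConfig(task_type="CAUSAL_LM"))

# 4. SVD-weighted combination of two adapters
model.add_weighted_adapter(["adapter_a", "adapter_b"], [0.7, 0.3], "combined",
                           combination_type="svd")
# 2. delete an adapter
model.delete_adapter("adapter_b")
# 3. unload the LoRA layers and recover the base model (without merging)
base_again = model.unload()
```
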
61a8e3a3bd [WIP] FIX for disabling adapter, adding tests (#683)
This PR deals with some issues with disabling adapter:

- typo in active.adapter
- prompt encoder could be on wrong device
- when using prompt learning + generate, disabling did not work

For the last point, there is a somewhat ugly fix in place for now,
pending a more comprehensive refactor (a comment was added to that
effect).

Comprehensive tests were added to check that everything works now.

The following tests are still not working:

- adaption prompt
- seq2seq with prompt tuning/prompt encoding
- stable diffusion is a little bit flaky but test is hopefully robust enough

Co-authored-by: Younes Belkada <49240599+younesbelkada@users.noreply.github.com>
2023-07-14 14:33:33 +02:00
0675541154 Introducing AutoPeftModelForxxx (#694)
* working v1 for LMs

* added tests.

* added documentation.

* fixed ruff issues.

* added `AutoPeftModelForFeatureExtraction` .

* replace with `TypeError`

* address last comments

* added comment.
2023-07-14 11:07:09 +02:00
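
A hedged usage sketch (the adapter repo id is a placeholder): the base model is resolved from the adapter's config and wrapped automatically.

```python
import torch
from peft import AutoPeftModelForCausalLM

model = AutoPeftModelForCausalLM.from_pretrained(
    "ybelkada/opt-350m-lora",   # placeholder adapter repo; base model is read from its config
    torch_dtype=torch.float16,
)
model.eval()
```
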
fa5957f7ca chore: add py.typed (#678)
Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com>
2023-07-14 13:40:26 +05:30
5265eb7ebd Fix code typo in int8-asr.mdx (#698)
Having `bias="None"` in `LoraConfig` raised a `NotImplementedError`. Replaced it with `bias="none"` as per the [`LoraConfig` reference](https://huggingface.co/docs/peft/main/en/package_reference/tuners#peft.LoraConfig), and now the code works and I can run training.
2023-07-14 09:27:37 +02:00
878a8bc990 update Readme to include IA3 (#702) 2023-07-14 09:10:49 +02:00
b1bafca333 Fix a small bug in forward method of IA³ (#696) 2023-07-13 14:39:13 +02:00
92d38b50af add support for Feature Extraction using PEFT (#647)
* add support for embedding with peft

* add example and resolve code quality issues

* update notebook example post fixing the loss

* adding full example with inference notebook

* quality 

* add tests, docs, guide and rename task_type to be inline with Hub

* fixes

* fixes

* Apply suggestions from code review

Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com>

* Update peft_model.py

* fixes

* final fixes

* Update _toctree.yml

* fixes and make style and make quality

* deberta exception with checkpointing

* Update docs/source/task_guides/semantic-similarity-lora.md

Co-authored-by: Younes Belkada <49240599+younesbelkada@users.noreply.github.com>

* Update docs/source/task_guides/semantic-similarity-lora.md

Co-authored-by: Benjamin Bossan <BenjaminBossan@users.noreply.github.com>

* resolve comments

* testing prompt learning methods

* Update testing_common.py

* fix the tests

---------

Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com>
Co-authored-by: Younes Belkada <49240599+younesbelkada@users.noreply.github.com>
Co-authored-by: Benjamin Bossan <BenjaminBossan@users.noreply.github.com>
2023-07-13 18:04:28 +05:30
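
A short sketch of the new task type on an embedding model (the model id is a placeholder):

```python
from transformers import AutoModel, AutoTokenizer
from peft import LoraConfig, TaskType, get_peft_model

model_id = "intfloat/e5-small"  # placeholder embedding model
base = AutoModel.from_pretrained(model_id)
model = get_peft_model(base, LoraConfig(task_type=TaskType.FEATURE_EXTRACTION, r=8))

tok = AutoTokenizer.from_pretrained(model_id)
inputs = tok("query: what does PEFT do?", return_tensors="pt")
embeddings = model(**inputs).last_hidden_state  # feed into a similarity / contrastive loss
```
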
5de5c24a8a Init IA³ weights randomly when so configured (#693)
Right now, no matter what the value of init_ia3_weights, these weights
are always initialized to be 1 (i.e. identity transforms). With this
fix, when init_ia3_weights=False, the weights are initialized randomly.
This is a setting mostly used for testing, so this fix has no user
impact.
2023-07-13 12:55:29 +02:00
062d95a09e FIX: base_model_torch_dtype when using model.half() after init (#688) 2023-07-13 11:12:40 +02:00
c33c42f158 Add functionality to support IA3 (#578)
* Added initial ia3 code

* Implemented ia3 correctly for feedforward layers; Fixed regex matching

* Fixed module mapping for mt5

* Merged changes from huggingface:main

* Merged changes

* Fixed lora merge conflicts

* Different bloom config

* Added save option for ia3

* Added loading code for ia3

* Added feedforward implementation in utils and seq cls example

* Added feedforward implementation in utils and seq cls example

* Implemented merge, unmerge, enable/disable adapters functionality

* Fixed feedforward during merge

* Debugging Merge

* Removing debug messages

* Cleaned up repo

* Removed non-IA3 changes

* Refactor save and load

* Added support to all models in tests; Added IA3Config for common tests

* Added half-precision support and test for gradient checkpointing; Formatted jupyter notebooks

* Added target modules for new models GPTBigCode and LLama

* Cleaned up code

* Cleaned up code

* Cleaned up example notebook

* Cleaned up  seq2seq notebook

* Corrected function docstrings; refactored find_and_replace

* Corrected function docstrings; refactored find_and_replace

* Added basic docs for IA3

* Added new conceptual guide in source tree for documentation

* Minor fix to documentation

* Minor fixes to docstrings; Added error handling for 4bit quantization; Cleaned unused merge/unmerge methods

* styling changes after merge from main

* Update src/peft/tuners/ia3.py

Remove unused attribute merge_weights

Co-authored-by: Sourab Mangrulkar <13534540+pacman100@users.noreply.github.com>

---------

Co-authored-by: Abhishek2304 <abhishekgupta2304@gmail.com>
Co-authored-by: Sourab Mangrulkar <13534540+pacman100@users.noreply.github.com>
2023-07-13 13:15:50 +05:30
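
A minimal IA³ configuration sketch (the model id is a placeholder; the modules shown are the usual T5 projections):

```python
from transformers import AutoModelForSeq2SeqLM
from peft import IA3Config, get_peft_model

base = AutoModelForSeq2SeqLM.from_pretrained("t5-small")  # placeholder
config = IA3Config(
    task_type="SEQ_2_SEQ_LM",
    target_modules=["k", "v", "wo"],  # modules that receive (IA)^3 scaling vectors
    feedforward_modules=["wo"],       # of those, the ones treated as feedforward layers
)
model = get_peft_model(base, config)
model.print_trainable_parameters()
```
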
c46d76ae3a Update Dockerfile (#684) 2023-07-11 18:41:52 +02:00
4f542e319f Fix embedding LoRA weights initialization (#681)
When init_lora_weights=False, embedding LoRA weights were initialized as
all zeros, resulting in LoRA becoming an identity transform. This is
inconsistent with other module types, where init_lora_weights=False
results in random initialization and thus a non-identity operation.

As init_lora_weights=False is just for internal testing, users should
not be affected by this change. In fact, I updated the doc of this
parameter to - hopefully - better reflect this.

There is no direct test for this change. However, there are tests
in #676 that will fail without this fix, so it is tested indirectly.
2023-07-11 12:20:26 +02:00
b5e341bb8a Added wandb support for lora train_dreambooth (#639)
* Update train_dreambooth.py

Accelerator init updated from logging_dir to project_dir. Newer versions of accelerate use project_dir; logging_dir is deprecated

* Bugfix: adapter name variable inserted; changing LORA_ADAPTER_NAME previously caused an error

* Adapter name added as kwarg

* Black code formatted

* Style & Quality check

* Wandb import added for logging and project initialization

* Wandb import added for logging and project initialization

* fix project_name

* print tqdm progress to wandb
2023-07-11 13:56:03 +05:30
06fd06a4d2 Remove skipping certain tests (#668)
The generate tests so far were skipped for non-lora, non-prefix tuning
cases. However, those cases are now passing, so it is no longer
necessary to skip the tests.
2023-07-07 14:19:10 +02:00
7d1d959879 Adding support for RoBERTa layers_transform in COMMON_LAYERS_PATTERN (#669)
* fix: add pattern layer to support RoBERTa layers_transform

* chore: fix code quality error
2023-07-07 14:19:01 +02:00
39ef2546d5 Update clm-prompt-tuning.mdx (#652)
Fixed typo that prevented training.
2023-07-06 09:21:23 -07:00
9f7492577f Fix bug resulting in config copies not working (#653)
Resolves #424

The bug was caused by __dict__ being overwritten to return a copy of the
dataclass. This can lead to unpredictable behavior, as shown in the
issue. This fix removes the __dict__ property and preserves the
original behavior where needed.

All three added tests would fail without the fix.
2023-07-06 09:06:41 +02:00
bef8e3584c [docs] API example (#650)
* api example

* apply feedback

* fix format

* make style
2023-07-05 11:19:20 -07:00
032fff92fb Fixed LoraConfig alpha modification on add_weighted_adapter (#654)
* Fixed LoraConfig modification on add_weighted_adapter

* Added test for issue with adding weighted adapter for LoRA

* Fixed formatting
2023-07-01 11:13:25 +05:30
6c8659f8f9 Require Python version 3.8 (#649) 2023-06-30 14:01:41 +02:00
5884bdbea4 Add pytest-cov for reporting test coverage (#641)
As discussed, this adds line coverage to the tests. This will allow us
to identify parts of the code that are missing coverage and make it
easier to ensure newly added code is well covered.

At the moment, CI is not set up to report if new, uncovered code is
being added. We could add codecov to the CI to get this functionality,
but having 100% coverage for new code is not always desired, so it's
debatable if it is needed.

Right now, there are multiple test commands (normal, single, multi GPU).
For each individual command, the coverage report would only include the
lines covered by that command, so the total coverage would be
underreported. It is possible to combine multiple coverage reports into
a single report:

https://coverage.readthedocs.io/en/stable/cmd.html#cmd-combine

Combining the reports will be added in a future PR.
2023-06-30 14:01:02 +02:00
86290e9660 style: tentatively add hints for some public function (#614)
* style: tentatively add hints for some public function

Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com>

* fix: import annotations to evaluate to str

Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com>

* fix: style

Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com>

---------

Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com>
2023-06-28 12:33:16 +05:30
563acf0832 Remove loralib reqs from examples, small fixes (#640)
- As discussed, loralib is no longer required, so the examples from the
  docs have been updated to no longer require loralib as a dependency
- In one example, a missing torch import was added
- In another example, a missing line was added (output of that line is
  shown, but not the line itself)
2023-06-28 12:23:09 +05:30
f4526d57fc importing peft with an old version of bitsandbytes causes an exception (#642) (#646) 2023-06-28 00:52:06 +02:00
d9b0a118af Update peft_model.py (#644) 2023-06-27 23:41:51 +02:00
f5352f08c5 feat(model): Allow from_pretrained to accept PeftConfig class (#612)
* feat(model): Allow from_pretrained to accept PeftConfig class

Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com>

* tests: add test cases for config construction

Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com>

* chore: address comments and run tools

Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com>

* fix: style

Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com>

---------

Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com>
2023-06-27 18:27:57 +05:30
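
A hedged sketch of passing a config object directly (model id and adapter path are placeholders):

```python
from transformers import AutoModelForCausalLM
from peft import LoraConfig, PeftModel

base = AutoModelForCausalLM.from_pretrained("facebook/opt-125m")  # placeholder
config = LoraConfig(task_type="CAUSAL_LM", inference_mode=True)

# The already-constructed config is used instead of re-loading it from the adapter path.
model = PeftModel.from_pretrained(base, "path/to/lora-adapter", config=config)
```
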
48ffd07276 fix ptun and prompt tuning generation issue (#543)
Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>
2023-06-27 16:56:47 +05:30
eb01b5ee1d fix Prefix-tuning error in clm Float16 evaluation (#520)
Signed-off-by: Wang, Yi <yi.a.wang@intel.com>
2023-06-27 13:57:21 +05:30
a7ea02a709 [Bugfix] Inserted adapter_name to get_peft_model_state_dict function (#626)
* Update train_dreambooth.py

Accelerator init updated from logging_dir to project_dir. Newer versions of accelerate use project_dir; logging_dir is deprecated

* Bugfix: adapter name variable inserted; changing LORA_ADAPTER_NAME previously caused an error

* Adapter name added as kwarg

* Black code formatted

* Style & Quality check
2023-06-27 13:56:54 +05:30
66fd087205 [Bugfix] Fixed LoRA conv2d merge (#637)
* Fixed LoRA conv2d merge

* Fixed typo
2023-06-27 12:18:08 +05:30
0e8932f1cb Add seq2seq prompt tuning support (#519)
* Added prompt tuning for seq2seq and corresponding notebook examples

* Added prompt tuning for seq2seq and corresponding notebook examples

* Added prompt tuning for seq2seq and corresponding notebook examples

* Call encoder with get_encoder() and update notebook example

* Style formatting

* Add seq2seq p-tuning support, and improve seq2seq prompt tuning support, enabling the use of generate()

* Fix imports

* Fix imports

* Add co-author.

Co-authored-by: ZhengxiangShi michaelszx117@gmail.com

* Add co-author.

Co-authored-by: ZhengxiangShi <michaelszx117@gmail.com>

---------

Co-authored-by: Thomas SCHILLACI <tschilla@px101.prod.exalead.com>
Co-authored-by: ZhengxiangShi <michaelszx117@gmail.com>
2023-06-27 11:45:49 +05:30
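
A small sketch of prompt tuning on a seq2seq model, including generate() (model id and hyperparameters are placeholders):

```python
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from peft import PromptTuningConfig, TaskType, get_peft_model

model_id = "t5-small"  # placeholder
base = AutoModelForSeq2SeqLM.from_pretrained(model_id)
config = PromptTuningConfig(task_type=TaskType.SEQ_2_SEQ_LM, num_virtual_tokens=20)
model = get_peft_model(base, config)

tok = AutoTokenizer.from_pretrained(model_id)
inputs = tok("summarize: PEFT trains only a small number of extra parameters.",
             return_tensors="pt")
out = model.generate(**inputs, max_new_tokens=30)  # generate() now works with seq2seq prompt tuning
```
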
e2b8e3260d [AdaptionPrompt] Add 8bit + 4bit support for adaption prompt (#604)
* add 8bit support for adaption prompt

* add 4bit support
2023-06-26 15:44:51 +02:00
c476c1e348 add adalora 4bit (#598) 2023-06-26 15:09:22 +02:00
18544647ac Update train_dreambooth.py (#624)
Accelerator init updated from logging_dir to project_dir. Newer versions of accelerate use project_dir; logging_dir is deprecated
2023-06-26 18:18:24 +05:30
8af8dbd2ec Update README.md, citation (#616)
bibtex was giving me a "too many commas" error; this fixes it
2023-06-23 15:59:34 +05:30
39fc09ec1b update whisper test (#617) 2023-06-23 14:37:43 +05:30
016722addd Added Civitai LoRAs conversion to PEFT, PEFT LoRAs conversion to webui (#596)
* Fixed kohya_ss to peft lora conversion, added script for backward conversion

* Fixed getting alpha from PEFT

---------

Co-authored-by: Alexander Kovalchuk <a.kovalchuk@prequelapp.com>
2023-06-21 19:34:39 +05:30
fd10faedfa stronger import of bnb (#605) 2023-06-21 17:45:04 +05:30
702d06098e add adapter_name in get_peft_model (#610) 2023-06-21 17:43:40 +05:30
0b62b4378b fix final failing tests (#609) 2023-06-20 14:15:00 +02:00
b8b84cb6ce [tests] Fix dockerfile (#608)
* fix dockerfile and test

* relax constraints

* fix

* fix log reports and empty cache

* revert workflow

* add librosa
2023-06-20 12:33:14 +02:00
08cb3dde57 Improve the README when using PEFT (#594)
* add logic

* Update peft_model.py

* fix test failures

* fixes

* fix
2023-06-19 14:19:41 +05:30
03eb378eb9 feat: Add PeftModelForQuestionAnswering (#473)
* Added first try of supporting QuestionAnswering

* Updated example to be correct

* Added changes from PR 404

* Added missing mapping for task type

* Remove unrelated code

* Run make style
2023-06-16 16:53:58 +05:30
6b81d7179f when from_pretrained is called in finetune of lora with flag "is_trainable" True, should not call model.eval() (#591) 2023-06-16 16:34:07 +05:30
0270b7c780 add more CI tests (#586) 2023-06-16 11:06:48 +02:00
38e9c650ba Fix typo at peft_model.py (#588)
Fix typo on description:
- `imputs_embeds` to `inputs_embeds`
2023-06-16 14:28:51 +05:30
9320373c12 LoRA for Conv2d layer, script to convert kohya_ss LoRA to PEFT (#461)
* Added LoRA for Conv2d layer, script to convert kohya_ss linear lora to PEFT

* Fixed code style, added missing safetensors dependency for kohya_ss to peft conversion script
2023-06-15 16:03:38 +05:30
019b7ff9d6 fix adalora device mismatch issue (#583) 2023-06-15 12:25:36 +02:00
b519e3f9e1 [core] Correctly passing the kwargs all over the place (#575)
* v1 of the fix

* forward contrib credits from discussions

* add tests

---------

Co-authored-by: winglian <winglian@users.noreply.github.com>
2023-06-15 12:23:05 +02:00
e48dfc331c Fix minor typo bug-report.yml (#582) 2023-06-15 10:41:03 +02:00
4d51464045 enable lora for mpt (#576)
Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>
2023-06-15 13:05:43 +05:30
8563a63af2 [BugFix] Set alpha and dropout defaults in LoraConfig (#390)
* Set alpha and dropout defaults in LoraConfig

* Update src/peft/tuners/lora.py

Co-authored-by: Sourab Mangrulkar <13534540+pacman100@users.noreply.github.com>

---------

Co-authored-by: Sourab Mangrulkar <13534540+pacman100@users.noreply.github.com>
2023-06-15 12:41:35 +05:30
eb75374fb1 add issue template (#562) 2023-06-13 23:03:12 +05:30
1cbc985018 feat: add type hint to get_peft_model (#566)
Signed-off-by: samsja <sami.jaghouar@hotmail.fr>
2023-06-13 23:02:54 +05:30
58f4dee67a Fix typo and url to openai/whisper-large-v2 (#563) 2023-06-13 09:49:20 -07:00
a8d11b36a3 [core] Fix config kwargs (#561)
* fix config kwargs

* style

* fix order
2023-06-13 17:54:49 +02:00
189a6b8e35 [core] Add safetensors integration (#553)
* add v1

* clean up

* more improvements

* add device

* final adjustements

* use `EntryNotFoundError`

* better checks

* add tests and final fixes

* make style && make quality

* remove `push_to_hub` because of the release
2023-06-09 12:33:13 +02:00
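
A hedged sketch of the new serialization flag (model id and output path are placeholders):

```python
from transformers import AutoModelForCausalLM
from peft import LoraConfig, PeftModel, get_peft_model

base = AutoModelForCausalLM.from_pretrained("facebook/opt-125m")  # placeholder
model = get_peft_model(base, LoraConfig(task_type="CAUSAL_LM"))

model.save_pretrained("outputs/adapter", safe_serialization=True)  # writes a .safetensors file
reloaded = PeftModel.from_pretrained(
    AutoModelForCausalLM.from_pretrained("facebook/opt-125m"), "outputs/adapter"
)
```
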
e45529b149 improve code readability (#409) 2023-06-08 14:45:35 +05:30
ba7b1011b8 [doc build] Use secrets (#556) 2023-06-07 18:41:51 +02:00
c23be52881 add thousands separator in print_trainable_parameters (#443) 2023-06-07 18:09:17 +05:30
7fb5f90a38 add library name to model card (#549) 2023-06-05 18:44:40 +05:30
fcff23f005 Remove device_map when training 4,8-bit model. (#534)
* Remove device_map when training 4,8-bit model.

* Fix style
2023-06-02 13:37:46 +05:30
42a184f742 Fix a minor typo where a non-default token_dim would crash prompt tuning (#459) 2023-06-01 15:14:06 +05:30
7add756923 Add starcoder model to target modules dict (#528)
* Add starcoder model to target modules dict

* make style and quality
2023-06-01 14:49:58 +05:30
9914e76d5b Fixed problem with duplicate same code. (#517) 2023-06-01 14:47:05 +05:30
668f045972 return load_result when load_adapter (#481) 2023-06-01 14:46:38 +05:30
38f48dd769 fix merge_and_unload when LoRA targets embedding layer (#438) 2023-06-01 14:45:06 +05:30
db55fb34b8 [Llama-Adapter] fix half precision inference + add tests (#456)
* fix + add tests

* forward contrib credits from discussions

---------

Co-authored-by: HamidShojanazeri <HamidShojanazeri@users.noreply.github.com>
2023-06-01 14:44:11 +05:30
76d4ecd40d Enable PeftConfig & PeftModel to load from revision (#433)
* Enable PeftConfig to load from revision

* Add revision to PeftModel

* Fix weights download with revision
2023-06-01 14:39:54 +05:30
27f956a73b [LoRA] Allow applying LoRA at different stages (#429)
* working v1

- working v1
- added tests
- needs some documentation

* more fixes

- stronger tests
- documentation
- remove unneeded common layers pattern

* add more docstring

* Apply suggestions from code review

Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com>

* quality & style

* style

---------

Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com>
2023-06-01 09:35:24 +02:00
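
A configuration sketch of applying LoRA only to selected layers (the module names assume a BERT/RoBERTa-style encoder and are illustrative):

```python
from peft import LoraConfig

config = LoraConfig(
    task_type="SEQ_CLS",
    target_modules=["query", "value"],
    layers_to_transform=[0, 1, 2, 3],  # only adapt the first four transformer layers
    layers_pattern="layer",            # name of the module list that holds those layers
)
```
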
dd1c0d87fe change comment in tuners.lora, lora_alpha float to int (#448) 2023-06-01 12:22:00 +05:30
207d290865 [docs] Prettify index (#478)
* prettify index

* fix format
2023-05-31 09:32:22 -07:00
5e8ee44091 fix (#524) 2023-05-31 09:22:27 -07:00
662ebe593e [core] Add gradient checkpointing check (#404)
* add automatic input enable gradients when calling `get_peft_model`

* style

* better check

* add 4bit check
2023-05-31 12:14:27 +02:00
c42968617b Remove merge_weights (#392) 2023-05-31 11:38:12 +05:30
3714aa2fff [core] Raise warning on using prepare_model_for_int8_training (#483)
* raise warning on using older method

* Update src/peft/utils/other.py

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* quality

---------

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
2023-05-22 17:47:32 +02:00
0fcc30dd43 [core] Protect 4bit import (#480)
* protect 4bit import

* fix CI

* better check for python 3.7
2023-05-21 00:14:43 +02:00
d6015bc11f 4-bit QLoRA via bitsandbytes (4-bit base model + LoRA) (#476)
* 4bit lora

* 4bit test

* fixing 4bits bugs

* fp4 pass variables

* fix inference datatype and generation config

* updating prep for int8 function to work for 4-bit

* Added FP4 LoRA and FP4 fine-tuning example.

* LinearFP4 -> Linear4bit

* fixes

* Fixed 4-bit example.

* Style changes.

* final changes

---------

Co-authored-by: Artidoro Pagnoni <pagnoni.artidoro@gmail.com>
Co-authored-by: younesbelkada <younesbelkada@gmail.com>
2023-05-20 17:47:15 +02:00
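
A hedged end-to-end sketch of the 4-bit + LoRA flow (model id, quantization settings, and LoRA hyperparameters are placeholders, not the PR's own example):

```python
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)
base = AutoModelForCausalLM.from_pretrained("facebook/opt-350m",  # placeholder
                                            quantization_config=bnb_config)
base = prepare_model_for_kbit_training(base)  # input grads etc. for k-bit training

model = get_peft_model(base, LoraConfig(task_type="CAUSAL_LM", r=16, lora_alpha=32,
                                        lora_dropout=0.05))
model.print_trainable_parameters()
```
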
4fd374e80d add sound file to docker images (#401) 2023-05-10 09:00:07 +02:00
3d7770bfd5 Update README.md (#399) 2023-05-10 12:13:14 +05:30
f173f97e9d Fix documentation links on index page (#406) 2023-05-10 12:06:57 +05:30
ef8523b5a4 fix index alignment? (#397) 2023-05-10 12:05:17 +05:30
63c5c9a2c0 [CI] Fix CI - pin urlib (#402)
* fix CI - pin urlib

* revert
2023-05-10 11:56:51 +05:30
5ed95f49d0 add accelerate example for DDP and FSDP in sequence classification for non-lora case (#358)
Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>
2023-05-10 10:10:30 +05:30
8a3fcd060d do not use self.device. In FSDP cpu offload mode. self.device is "CPU" instead of "cuda" (#352)
and there's an error like "Expected all tensors to be on the same device, but found at least two devices, cpu and cuda:1"

Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>
2023-05-10 10:09:28 +05:30
b1059b73aa Release: v0.4.0.dev0 (#391) 2023-05-04 01:38:33 +05:30
1a1cfe3479 fix INT8 prepare function (#389)
* fix INT8 prepare function

* remove unused function args

* fix related tests, examples and docs
2023-05-03 12:47:53 +05:30
8e53e16005 Fix missing arg for transpose in AdaLora (#347)
`transpose() missing 1 required positional argument: 'fan_in_fan_out'`
2023-05-03 12:38:22 +05:30
6a18585f25 Add nn.Embedding Support to Lora (#337)
* add lora embedding

* reformat lora.py using black formatter

* Add Embedding Layer to add_weighted_adapter and address PR

* Refactor unused fan_in_fan_out variable, fix and test bugs in Embedding's merge and unmerge methods

---------

Co-authored-by: splo2t <yu990410@gmail.com>
2023-05-03 12:37:56 +05:30
e509b8207d Fix a link to the example script (#383)
* Fix a link to the example script

* Update README.md
2023-05-03 12:21:15 +05:30
a37156c2c7 [CI] Fix nightly CI issues (#375)
* Update log_reports.py

* Update nightly.yml

* Update Makefile

* Update Makefile

* fixes

* add tabulate

* fix setup

* final fix

* fix nits
2023-05-02 09:59:54 +02:00
632997d1fb [docs] Quicktour update (#346)
* clean up quicktour

* finish draft

* fix format

* apply feedback

* apply feedback

* oopsie
2023-04-27 10:08:22 -07:00
0c1e3b470a [WIP-docs] Accelerate scripts (#355)
* deepspeed script

* apply feedback

* fsdp

* toctree
2023-04-27 09:51:42 -07:00
e8f66b8a42 [docs] Supported models tables (#364)
* supported models moved to index

* minor fix

* Update docs/source/index.mdx

Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com>

* apply feedback

* Update docs/source/index.mdx

Co-authored-by: Sourab Mangrulkar <13534540+pacman100@users.noreply.github.com>

---------

Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com>
Co-authored-by: Sourab Mangrulkar <13534540+pacman100@users.noreply.github.com>
2023-04-26 09:03:40 -04:00
2c2bbb4064 Use try and finally in disable_adapter() to catch exceptions (#368) 2023-04-26 18:26:14 +05:30
3890665e60 [core] Better log messages (#366)
* better log messages

* better log messages

* change order
2023-04-26 12:30:32 +05:30
49a20c16dc [tests] add slow tests to GH workflow (#304)
* add slow tests to GH workflow

* add correct channel
2023-04-25 12:12:16 +02:00
af1849e805 Implement adaption prompt from Llama-Adapter paper (#268)
* Implement adaption prompt from Llama-Adapter paper

* Support multi-adapters

* Refactor adaption prompt to target attn modules instead of layers

* Refactor adaption prompt to be more generic

* Fix adaption prompt not on right device

* Apply suggestions from code review

Co-authored-by: Sourab Mangrulkar <13534540+pacman100@users.noreply.github.com>

* Fix style

* Add support for Llama config use_cache=True

* Fix rebase issues

---------

Co-authored-by: Sourab Mangrulkar <13534540+pacman100@users.noreply.github.com>
2023-04-25 12:24:18 +05:30
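
A configuration sketch for adaption prompt on a Llama checkpoint (the model id and numbers are placeholders):

```python
from transformers import AutoModelForCausalLM
from peft import AdaptionPromptConfig, get_peft_model

base = AutoModelForCausalLM.from_pretrained("huggyllama/llama-7b")  # placeholder Llama model
config = AdaptionPromptConfig(
    task_type="CAUSAL_LM",
    adapter_len=10,     # length of the learned adaption prompt
    adapter_layers=30,  # number of top attention modules to adapt
)
model = get_peft_model(base, config)
model.print_trainable_parameters()
```
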
2822398fbe [docs] Conceptual overview of prompting methods (#339)
* conceptual overview of prompting methods

* fix image caption

* minor edits

* update toctree
2023-04-20 07:29:29 -07:00
1ef4b61425 [docs] LoRA conceptual guide (#331)
* WIP LoRA conceptual guide

* conceptual guide for LoRA

* Update docs/source/conceptual_guides/lora.mdx

Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com>

* Update docs/source/conceptual_guides/lora.mdx

Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com>

* feedback addressed

---------

Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com>
2023-04-20 08:02:38 -04:00
f703cb2414 [docs] Task guide with Dreambooth LoRA example (#330)
* dreambooth lora training part

* dreambooth lora: finetuning and inference

* title update

* Apply suggestions from code review

Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com>
Co-authored-by: Sourab Mangrulkar <13534540+pacman100@users.noreply.github.com>

* multi-adapter inference + feedback addressed

* make style

---------

Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com>
Co-authored-by: Sourab Mangrulkar <13534540+pacman100@users.noreply.github.com>
2023-04-20 07:50:18 -04:00
9413b555c4 fix lora modules_to_save issue (#343)
* fix lora modules_to_save issue

* fix quality
2023-04-20 16:16:13 +05:30
8818740bef [docs] int8 training (#332)
* int8 training

* apply feedback
2023-04-19 12:49:19 -07:00
34027fe813 [docs] LoRA for token classification (#302)
* everything but training

* add to toctree

* complete training section with Trainer

* apply feedback
2023-04-18 08:14:17 -07:00
0bdb54f03f fix gathering for metrics (#327) 2023-04-18 15:49:44 +05:30
4ee024846b feat(ci): add pip caching to CI (#314) 2023-04-18 12:12:34 +02:00
26577aba84 Fix merge_and_unload when having additional trainable modules (#322) 2023-04-17 14:37:17 +02:00
b21559e042 [docs] P-tuning for sequence classification (#281)
* first draft

* minor edits

* apply feedback

* use trainer
2023-04-14 12:06:18 -07:00
c0209c35ab [docs] Task guide for semantic segmentation with LoRA (#307)
* WIP: semantic segmentation example

* make style

* some polishing

* Update docs/source/task_guides/semantic_segmentation_lora.mdx

Co-authored-by: Sourab Mangrulkar <13534540+pacman100@users.noreply.github.com>

* minor update

---------

Co-authored-by: Sourab Mangrulkar <13534540+pacman100@users.noreply.github.com>
2023-04-14 09:31:19 -04:00
070e3f75f3 [tests] add CI training tests (#311)
* add training tests

* styling
2023-04-14 14:34:01 +02:00
4ca286c333 Fix: unexpected keyword argument 'has_fp16_weights' (#299)
When iterating over the list of named modules, if a Linear8BitLt module came before a non-8-bit
module, the kwargs would be mutated to include 'has_fp16_weights' and other parameters, and
the next time it had to instantiate a 'torch.nn.Linear' object, the constructor would error out
because **kwargs contained keyword arguments that were not appropriate for the base constructor.

This fixes that by copying kwargs when sending the extra data to Linear8BitLt.
2023-04-14 17:59:37 +05:30
59778af504 Change gather for gather_for_metrics in eval. (#296)
Otherwise the assert at L259 breaks. It's also the thing to do to get more accurate metrics.
2023-04-14 17:57:15 +05:30
10a2a6db5d [docs] Prompt tuning for CLM (#264)
* first draft

* uncomment

* minor edits

* apply feedback
2023-04-13 15:59:15 -07:00
70af02a2bc fix and update examples and readme (#295)
* fix and update examples and readme

* fix formatting issues

* add weighted lora adapters support

* Update README.md
2023-04-13 00:17:13 +05:30
cc82b674b5 [test] Add Dockerfile (#278)
* add peft-gpu dockerfile

* add workflow file

* add cpu dockerfile

* fix dockerfiles

* Apply suggestions from code review

* Update .github/workflows/build_docker_images.yml

* Update .github/workflows/build_docker_images.yml
2023-04-11 10:27:56 +02:00
6ba67723df Merge pull request #288 from bigeagle/main
Fix lora_dropout operator type when dropout=0
2023-04-11 11:09:06 +05:30
202f1d7c3d Merge pull request #272 from stevhliu/prefix-tuning-seq2seq
[docs] Prefix tuning for Seq2Seq
2023-04-10 09:42:51 -07:00
e1c41d7183 minor edits 2023-04-10 09:32:15 -07:00
053573e0df fix lora_dropout operator type when dropout=0 2023-04-10 21:03:07 +08:00
1117d47721 Merge pull request #283 from huggingface/smangrul/multi-lora-support
fix trainable params setting
2023-04-08 13:57:08 +05:30
f982d75fa0 Merge pull request #261 from younesbelkada/fix-half-prec
Fix half precision forward
2023-04-08 13:56:36 +05:30
fdebf8ac4f Merge pull request #256 from younesbelkada/add-gpu-tests
[`tests`] Adds GPU tests
2023-04-08 13:56:18 +05:30
ff282c2a8f Update peft_model.py 2023-04-08 11:45:32 +05:30
7b7038273a fix trainable params issue 2023-04-08 11:32:00 +05:30
0422df466e Fix typo in examples/causal_language_modeling/peft_lora_clm_accelerate_ds_zero3_offload.py (#275) (#277)
Co-authored-by: rmilleti <rmilleti@amazon.com>
2023-04-07 17:49:35 +02:00
f35b20a845 add and fix tests 2023-04-07 10:48:22 +00:00
c22a57420c Merge remote-tracking branch 'upstream/main' into fix-half-prec 2023-04-07 10:36:58 +00:00
04689b6535 make style 2023-04-07 10:35:39 +00:00
bd1b4b5aa9 Merge branch 'main' into add-gpu-tests 2023-04-07 12:34:16 +02:00
445940fb7b Merge pull request #263 from huggingface/smangrul/multi-lora-support
Multi Adapter support
2023-04-07 04:40:59 +05:30
e8b0085d2b fixing adalora saving and loading 2023-04-07 04:08:10 +05:30
31560c67fb first draft 2023-04-06 14:10:37 -07:00
d5feb8b787 fixing 🐛 2023-04-06 21:17:54 +05:30
3258b709a3 fix 🐛 2023-04-06 20:41:36 +05:30
a591b4b905 final fix I guess 2023-04-06 20:04:04 +05:30
3aaf482704 fix 2023-04-06 20:02:31 +05:30
07a4b8aacc fix 🐛 2023-04-06 19:56:55 +05:30
b6c751455e 😅 2023-04-06 19:31:21 +05:30
dee2a96fea Update adalora.py 2023-04-06 19:22:56 +05:30
b728f5f559 🐛 fixing 2023-04-06 19:20:13 +05:30
74e2a3da50 😅 2023-04-06 19:16:37 +05:30
1a6151b91f Merge branch 'main' into smangrul/multi-lora-support 2023-04-06 19:10:48 +05:30
7397160435 making adalora compatible with multiple adapters 2023-04-06 19:06:33 +05:30
75808eb2a6 Merge branch 'main' into smangrul/multi-lora-support 2023-04-06 19:05:31 +05:30
382b178911 Merge pull request #260 from younesbelkada/add-pix2struct
Add BLIP2 Example
2023-04-06 11:45:55 +05:30
a7d5e518c3 Merge pull request #233 from QingruZhang/main
The Implementation of AdaLoRA (ICLR 2023)
2023-04-06 11:30:30 +05:30
072da6d9d6 Run make style and make quality 2023-04-05 20:52:12 +00:00
4f8c134102 raise exception for MergedLinear of AdaLoRA 2023-04-05 20:40:06 +00:00
b8a57a3649 Update src/peft/tuners/adalora.py
Co-authored-by: Sourab Mangrulkar <13534540+pacman100@users.noreply.github.com>
2023-04-05 16:32:23 -04:00
d892beb0e7 Update src/peft/mapping.py
Co-authored-by: Sourab Mangrulkar <13534540+pacman100@users.noreply.github.com>
2023-04-05 16:31:58 -04:00
9a534d047c Update src/peft/tuners/adalora.py
Co-authored-by: Sourab Mangrulkar <13534540+pacman100@users.noreply.github.com>
2023-04-05 16:31:46 -04:00
c240a9693c Update src/peft/tuners/adalora.py
Co-authored-by: Sourab Mangrulkar <13534540+pacman100@users.noreply.github.com>
2023-04-05 16:31:08 -04:00
b3e6ef6224 Update src/peft/tuners/adalora.py
Co-authored-by: Sourab Mangrulkar <13534540+pacman100@users.noreply.github.com>
2023-04-05 16:30:16 -04:00
3e6a88a8f9 Update src/peft/tuners/adalora.py
Co-authored-by: Sourab Mangrulkar <13534540+pacman100@users.noreply.github.com>
2023-04-05 16:23:48 -04:00
37e1f9ba34 fix test 2023-04-05 17:17:01 +00:00
d936aa9349 fix tests 2023-04-05 19:54:02 +05:30
7888f699f5 Merge branch 'main' into smangrul/multi-lora-support 2023-04-05 14:09:01 +05:30
deff03f2c2 [tests] Adds more tests + fix failing tests (#238)
* adds more tests

- refactor tests
- add enc-dec tests
- skips generate tests for non-lora adapters

* rm unneeded file

* fix tests

* fix

* more checks

* fix issue
2023-04-05 10:17:26 +02:00
405f68f54a fix doc failure 2023-04-05 12:40:02 +05:30
44f3e86b62 Update config.py 2023-04-05 03:35:08 +05:30
75131959d1 fix 🐛 2023-04-05 02:03:31 +05:30
b9433a8208 😅 2023-04-05 01:39:51 +05:30
6f1f26f426 😅 2023-04-05 01:30:07 +05:30
dbdb8f3757 fix more 🐛 2023-04-05 01:19:19 +05:30
41b2fd770f 😅 2023-04-05 01:13:41 +05:30
d4c2bc60e4 fix more 🐛 2023-04-05 01:04:42 +05:30
122f708ae8 😅. Fix 🐛 2023-04-04 20:05:59 +05:30
18ccde8e86 fixing 🐛 2023-04-04 19:44:58 +05:30
d4b64c8280 fix 🐛 2023-04-04 18:27:23 +05:30
96ca100e34 Update lora.py 2023-04-04 18:03:32 +05:30
bd80d61b2a fix 🐛 2023-04-04 17:44:24 +05:30
6e0f124df3 Merge branch 'main' into smangrul/multi-lora-support 2023-04-04 17:04:32 +05:30
7ed9ad04bf revert changes 2023-04-04 11:01:07 +00:00
8266e2ee4f fix half precision forward 2023-04-04 10:01:47 +00:00
8c83386ef4 few fixes 2023-04-04 08:37:32 +00:00
4cbd6cfd43 revert 2023-04-04 08:31:37 +00:00
96cd039036 fix 2023-04-04 08:29:51 +00:00
46ab59628c revert 2023-04-04 08:23:47 +00:00
f569bc682b Update src/peft/peft_model.py 2023-04-04 10:21:38 +02:00
af6794e424 add blip2 2023-04-04 08:18:47 +00:00
c7e22ccd75 v1 2023-04-04 07:59:03 +00:00
3d1e87cb78 Merge remote-tracking branch 'upstream/main' into add-pix2struct 2023-04-04 07:58:50 +00:00
c2ef46f145 v1 2023-04-04 07:58:48 +00:00
e29d6511f5 more description 2023-04-04 07:06:57 +00:00
4d3b4ab206 add whisper tests 2023-04-04 06:56:14 +00:00
127a74baa2 Merge pull request #257 from toncho11/main
Fixing a bug where a wrong parameter name is used for the offload_folder
2023-04-03 22:30:31 +05:30
93f1d35cc7 Merge pull request #250 from tpoisonooo/patch-1
Update other.py
2023-04-03 22:19:50 +05:30
697b6a3fe1 Merge pull request #252 from guspan-tanadi/main
docs: have fix bit typo README
2023-04-03 22:18:42 +05:30
9299c88a43 Merge pull request #254 from huggingface/younesbelkada-patch-1
[`Automation`] Update stale.py
2023-04-03 22:17:34 +05:30
2fe22da3a2 fix CI 2023-04-03 16:47:09 +00:00
50191cd1ec Merge pull request #255 from huggingface/stas00-patch-1
[resources] replace pdf links with abs links
2023-04-03 09:47:08 -07:00
c2e9a6681a fix import 2023-04-03 16:40:02 +00:00
2b8c4b0416 remove from init 2023-04-03 16:38:53 +00:00
519c07fb00 add import_utils 2023-04-03 16:30:08 +00:00
ff9a1edbfd Fixing a bug where a wrong parameter name is used. 2023-04-03 18:28:11 +02:00
8058709d5a fix failing CIs 2023-04-03 16:27:30 +00:00
f413e3bdaf v1 GPU tests 2023-04-03 16:11:10 +00:00
45d7aab39a Update README.md 2023-04-03 08:51:01 -07:00
4ddb85ce1e Update stale.py 2023-04-03 17:08:42 +02:00
dd30335ffd [Automation] Add stale bot (#247)
* add stale bot

* fix
2023-04-03 14:31:11 +02:00
39cbd7d8ed docs: have fix bit typo README
Improve readability
2023-04-03 16:13:33 +07:00
7ef47be5f5 Update other.py
typo
2023-04-03 14:02:13 +08:00
e536616888 [core] Fix offload issue (#248)
* fix offload dir

* remove offload index

* safety checker

* forward contrib credits from previous PR

---------

Co-authored-by: cosimoiaia <cosimoiaia@users.noreply.github.com>
2023-04-01 14:54:46 +02:00
11edb618c3 Merge pull request #240 from stevhliu/build-notebooks
[docs] Build notebooks from Markdown
2023-03-31 17:01:09 -07:00
cfe992f0f9 make style 2023-03-31 16:54:12 -07:00
f948a9b4ae build notebooks 2023-03-31 16:45:11 -07:00
86f4e45dcc Merge pull request #241 from stevhliu/add-api-docs
[docs] Add API references
2023-03-31 14:48:35 -07:00
8e61e26370 fix kwargs 2023-03-31 14:41:14 -07:00
622a5a231e clean up docstrings 2023-03-31 14:30:05 -07:00
7c31f51567 use explicit path 2023-03-31 13:46:54 -07:00
47f05fe7b5 fix path to loralayer too 2023-03-31 13:46:02 -07:00
8fd53e0045 fix path to peftconfigmixin? 2023-03-31 13:46:02 -07:00
39fb96316f first draft of api docs 2023-03-31 13:41:33 -07:00
165ee0b5ff Merge pull request #239 from MKhalusova/task-guide-image-classification
Move image classification example to the docs
2023-03-31 11:15:01 -04:00
221b39256d feedback addressed 2023-03-31 09:34:00 -04:00
de2a46a2f9 version fix 2023-03-31 09:29:41 -04:00
8a6004232b Apply suggestions from code review
Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com>
2023-03-31 08:55:31 -04:00
1d01a70d92 Merge pull request #243 from guspan-tanadi/main
Have fix typo in README
2023-03-31 10:24:25 +05:30
4d27c0c467 Have fix typo in README
notebook provider name in capitalization
2023-03-31 09:41:00 +07:00
9ced552e65 doc building fixes 2023-03-30 12:29:29 -04:00
d49cde41a7 make style 2023-03-30 12:09:28 -04:00
e4dcfaf1b3 task guide based on notebook 2023-03-30 11:30:55 -04:00
8f63f565c6 [utils] add merge_lora utility function (#227)
* add merge_lora utility function

* forward contrib credits from original script

* some changes

* make style

* fix tests

* finally fix tests

* Update tests/test_peft_model.py

* adapt from suggestions

* adapt

* Update src/peft/tuners/lora.py

Co-authored-by: Sourab Mangrulkar <13534540+pacman100@users.noreply.github.com>

* fix 8bit

* Update src/peft/tuners/lora.py

Co-authored-by: Sourab Mangrulkar <13534540+pacman100@users.noreply.github.com>

---------

Co-authored-by: edbeeching <edbeeching@users.noreply.github.com>
Co-authored-by: Sourab Mangrulkar <13534540+pacman100@users.noreply.github.com>
2023-03-30 13:45:37 +02:00
f15548ebeb Merge branch 'huggingface:main' into main 2023-03-30 02:24:45 -04:00
d4292300a0 Finish the test for model load and save 2023-03-30 06:19:45 +00:00
e3b4cd4671 Implement the save_pretrained for AdaLoRA 2023-03-30 01:59:26 -04:00
300abd1439 refine the key of rank pattern 2023-03-30 05:53:04 +00:00
ccf53ad489 Save the rank pattern into the config file 2023-03-30 05:41:28 +00:00
542f2470e7 Merge pull request #214 from huggingface/smangrul/add-docs
add docs
2023-03-30 11:10:14 +05:30
98f51e0876 Merge branch 'main' into smangrul/add-docs 2023-03-30 10:46:37 +05:30
c7b5280d3c Merge pull request #231 from aitor-gamarra/main
Show CONFIG_NAME instead of "config.json"
2023-03-30 10:29:05 +05:30
d3a48a891e save rank pattern 2023-03-30 01:04:11 +00:00
ce61e2452a define the resize function 2023-03-29 21:03:48 -04:00
d6ae6650b2 Finish the test for rank finalization 2023-03-30 00:45:33 +00:00
1141b125d0 Implement the budget finalization 2023-03-29 19:56:23 -04:00
d6c68ae1a5 Show CONFIG_NAME instead of "config.json" 2023-03-29 21:03:39 +02:00
df71b84341 [CI] Add more ci tests (#223)
* add more tests

* fix

* add generate tests

* make style

* fix test

* add -n

* skip llama
2023-03-29 15:28:38 +02:00
d8d1007732 Causal LM generation fix for prefix tuning: GPT2 model (#222)
* expand attention mask after preparing generation inputs for prefix tuning

* reformat

* Update src/peft/peft_model.py

Co-authored-by: Younes Belkada <49240599+younesbelkada@users.noreply.github.com>

* reformat as per black

---------

Co-authored-by: Vineet Kumar <vineeku6@in.ibm.com>
Co-authored-by: Younes Belkada <49240599+younesbelkada@users.noreply.github.com>
2023-03-29 15:20:14 +02:00
51f49a5fe4 Merge pull request #224 from younesbelkada/fix-ci-dreambooth
Fix failing test on `main`
2023-03-29 17:15:42 +05:30
4626b36e27 addressing remaining comments 2023-03-29 17:12:32 +05:30
d242dc0e72 merge the conflict 2023-03-28 23:19:14 -04:00
2c84a5ecdd Merge remote-tracking branch 'upstream/main' 2023-03-28 23:13:48 -04:00
002da1b450 fix bugs 2023-03-28 20:19:06 +05:30
7c8ee5814a Update peft_model.py 2023-03-28 19:40:06 +05:30
090d074399 Update lora.py 2023-03-28 19:38:22 +05:30
8ec7cb8435 Update lora.py 2023-03-28 19:36:41 +05:30
e9d45da4c5 Update lora.py 2023-03-28 19:35:49 +05:30
64cae2aab2 Update lora.py 2023-03-28 19:34:04 +05:30
7d7c598647 Update peft_model.py 2023-03-28 19:32:21 +05:30
af252b709b Update peft_model.py 2023-03-28 19:29:24 +05:30
891584c8d9 fix ci dreambooth 2023-03-28 13:55:43 +00:00
c21afbe868 multi adapter for training and inference
Might have breaking changes
2023-03-28 18:56:24 +05:30
13476a807c Apply suggestions from code review
Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com>
2023-03-27 13:44:00 +05:30
3d00af4799 add docs 2023-03-24 13:16:26 +05:30
098962fa65 Merge pull request #204 from haofanwang/patch-1
Update train_dreambooth.py
2023-03-24 10:29:09 +05:30
d8c3b6bca4 Fix CI tests (#210)
* fix ci runner creation

* fix python versions

* fix setup and test
2023-03-23 12:54:52 +01:00
2632e7eba7 [CI] Add ci tests (#203)
* add ci tests

* fix some tests

* fix tests

* rename

* fix

* update tests

* try

* temp hotfix

* refactor tests

* Update .github/workflows/tests.yml

* fix test
2023-03-23 12:38:40 +01:00
b5b3ae3cbe Update train_dreambooth.py 2023-03-21 21:38:15 +08:00
13e53fc7ee Merge pull request #180 from mymusise/chatglm
ChatGLM support
2023-03-20 15:02:58 +05:30
54b6ce2c0e ChatGLM support
Signed-off-by: mymusise <mymusise1@gmail.com>
2023-03-16 19:09:12 +08:00
64f63a7df2 Merge pull request #167 from dumpmemory/update-readme-patch-for-zero-init
Update README.md
2023-03-15 22:33:42 +05:30
df0e1fb592 [core] Fix peft multi-gpu issue (#145)
* add multi-gpu support

* rm deepcopy

* tryo to comment

* style

* fix nits
2023-03-14 13:06:33 +01:00
1c11bc067f Merge pull request #160 from zphang/llama
Support for LLaMA models
2023-03-14 11:18:29 +05:30
3b3fc47f84 Merge pull request #170 from PanQiWei/temporarily_replace_prepare_inputs_for_generation
Replace base_model's function temporarily
2023-03-14 11:05:01 +05:30
321cbd6829 Merge pull request #172 from huggingface/smangrul/fix-megred-linear-lora-issues
fixing merged_linear lora issues
2023-03-14 10:56:44 +05:30
43cb7040c6 fixing merged_linear lora issues 2023-03-13 16:01:28 +05:30
644d68ee6f changed: 1. replace base_model.prepare_inputs_for_generation and base_model._prepare_encoder_decoder_kwargs_for_generation temporarily 2023-03-10 10:36:11 +08:00
354bea8719 Update README.md 2023-03-09 22:03:40 +08:00
e85c18f019 Update README.md
add one caveat situation for using LoRA + ZeRO 3 setting.
2023-03-09 22:01:16 +08:00
50aaf99da7 Merge pull request #166 from huggingface/smangrul/release-v0.3.0.dev0
release v0.3.0.dev0
2023-03-09 09:24:05 +05:30
80c96de277 release v0.3.0.dev0 2023-03-09 09:23:24 +05:30
eb07373477 Merge pull request #165 from huggingface/smangrul/add-trl-example-in-readme
minor changes
2023-03-09 08:58:09 +05:30
f1980e9be2 minor changes 2023-03-09 08:57:52 +05:30
8777b5606d Merge pull request #164 from huggingface/smangrul/add-trl-example-in-readme
Update README.md
2023-03-09 08:53:56 +05:30
4497d6438c Update README.md 2023-03-09 08:53:36 +05:30
3d898adb26 Merge pull request #157 from huggingface/smangrul/lora_fixes_and_updates_wrt_trl
lora fixes and adding 8bitMergedLinear lora
2023-03-08 23:06:25 +05:30
842b09a280 Merge pull request #159 from zphang/prefix_citation
Add Prefix Tuning citation
2023-03-08 23:05:17 +05:30
91c69a80ab Merge pull request #162 from dumpmemory/fix_count
fix count
2023-03-08 23:03:58 +05:30
c1199931de Merge pull request #163 from alvanli/alvanli/add-local-saving-whisper-largev2
Add local saving for whisper largev2 example notebook
2023-03-08 23:02:36 +05:30
5e788b329d Use on save callback 2023-03-08 10:05:53 -05:00
48dc4c624e Add callback to save to local 2023-03-08 09:57:13 -05:00
d2b99c0b62 fix count
num_params should be directly used.
2023-03-08 18:41:30 +08:00
baa2a4d53f LLaMA support 2023-03-07 21:05:58 -05:00
27c2701555 Add Prefix Tuning citation 2023-03-07 19:17:35 -05:00
a43ef6ec72 fixing ds conv1D issue thanks to @dumpmemory 2023-03-08 00:53:08 +05:30
c81b6680e7 adding 8bitMergedLinear lora 2023-03-07 17:59:02 +05:30
8358b27445 Merge pull request #149 from huggingface/smangrul/fixes
minor fixes to the examples
2023-03-07 14:08:25 +05:30
b9451ab458 fixing issues and quality 2023-03-07 14:04:19 +05:30
ce4e6f3dd9 Merge pull request #150 from mayank31398/mayank/single-module
support option for encoder only prompts
2023-03-04 09:03:26 +05:30
53eb209387 support option for encoder only prompts 2023-03-03 23:43:25 +05:30
a84414f6de minor fixes to the examples 2023-03-03 19:36:13 +05:30
2c532713ad Merge pull request #125 from SauravMaheshkar/minimal-structure
chore: update `pyproject.toml`
2023-03-02 19:07:59 +05:30
fa65b95b9e update comment 2023-03-02 01:11:14 +00:00
0a0c6ea6ea adalora training example 2023-03-02 01:08:41 +00:00
7471035885 finish the testing and debugging 2023-03-02 01:04:48 +00:00
35cd771c97 example 2023-03-01 21:55:31 +00:00
1a3680d8a7 test for adalora example 2023-03-01 21:52:33 +00:00
510f172c58 adalora example 2023-03-01 21:26:07 +00:00
6a03e43cbc peft import 2023-03-01 02:47:33 -05:00
4acd811429 target module mapping for adalora 2023-03-01 02:43:27 -05:00
be86f90490 Implement the AdaLoRA 2023-02-28 23:18:19 -05:00
26b84e6fd9 add adalora example 2023-02-28 23:14:25 -05:00
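The AdaLoRA commits above add a config and model that adaptively prune the per-matrix rank during training. A minimal sketch with the current `AdaLoraConfig` (the base model and budget-schedule values are illustrative):

```python
from transformers import AutoModelForSeq2SeqLM
from peft import AdaLoraConfig, TaskType, get_peft_model

config = AdaLoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,
    init_r=12,        # rank every adapted matrix starts with
    target_r=8,       # average rank the budget allocator prunes towards
    tinit=200,        # steps before rank pruning starts
    tfinal=1000,      # steps over which the budget is annealed
    total_step=3000,  # total number of training steps
    lora_alpha=32,
    target_modules=["q", "v"],
)
model = get_peft_model(AutoModelForSeq2SeqLM.from_pretrained("t5-small"), config)
model.print_trainable_parameters()
```

During training, the rank allocation is driven by calling the model's `update_and_allocate(global_step)` after each optimizer step.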
94f00b7d27 chore: update Makefile with ruff commands 2023-02-28 10:46:07 +00:00
7820a539dd fix(pyproject.toml): update known_first_party
Co-authored-by: Sourab Mangrulkar <13534540+pacman100@users.noreply.github.com>
2023-02-28 16:10:38 +05:30
81eec9ba70 train script 2023-02-27 21:08:55 -05:00
47601bab7c chore: drop setup.cfg as we shifted to ruff 2023-02-28 02:58:12 +05:30
99901896cc style: switch to ruff 2023-02-27 10:50:10 +00:00
5c7fe97753 Merge branch 'huggingface:main' into minimal-structure 2023-02-27 10:37:41 +00:00
aa18556c56 Merge pull request #140 from zanussbaum/zero_init
fix: count params when zero init'd
2023-02-27 13:18:30 +05:30
e6bf09db80 fix: count params when zero init'd 2023-02-26 22:31:20 -05:00
681ce93cc1 Merge pull request #134 from gabinguo/main
issue#126: torch.load device issue.
2023-02-25 11:43:43 +05:30
85ad682530 issue#126: torch.load device issue. 2023-02-25 07:09:07 +01:00
e19ee681ac Merge pull request #127 from huggingface/smangrul/make-activation-checkpointing-optional
fixes `prepare_for_int8_training`
2023-02-25 02:13:04 +05:30
83d6d55d4b address layernorm issue 2023-02-24 18:24:27 +05:30
7dfb472424 make gradient checkpointing optional when using PEFT+INT8 2023-02-24 13:02:40 +05:30
a78f8a0495 style: move isort and pytest config to pyproject 2023-02-23 11:34:18 +00:00
6175ee2c4c chore: drop MANIFEST 2023-02-23 11:30:14 +00:00
a3537160dc fix autocast issue (#121) 2023-02-23 09:17:40 +01:00
75925b1aae Merge pull request #117 from huggingface/smangrul/fix-lora-merging-in-inference
fix merging lora weights for inference
2023-02-22 00:20:40 +05:30
1ef0f89a0c add util for getting the base model 2023-02-22 00:14:24 +05:30
e6ef85a711 fix merging lora weights for inference 2023-02-22 00:00:36 +05:30
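For context on the two commits above: `get_base_model` returns the wrapped transformers model, and weight merging folds the LoRA deltas into the base weights for deployment. A sketch using today's API names, assuming a trained LoRA `PeftModel` called `peft_model`:

```python
# Retrieve the underlying transformers model (adapter modules still attached).
base = peft_model.get_base_model()

# In current releases, merging for inference is exposed as merge_and_unload(),
# which returns a plain model with the LoRA deltas folded into its weights.
merged = peft_model.merge_and_unload()
merged.save_pretrained("merged-model")
```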
6f2803e8a7 Merge pull request #109 from huggingface/smangrul/add-gpt-neox
add `EleutherAI/gpt-neox-20b` to support matrix
2023-02-18 12:38:04 +05:30
1c9d197693 add EleutherAI/gpt-neox-20b to support matrix 2023-02-18 12:37:02 +05:30
592b1dd99f Merge pull request #106 from huggingface/smangrul/add-diable-adapter-context-manager
add disable adapter context manager
2023-02-17 20:32:31 +05:30
3240c0bb36 Merge pull request #107 from mrm8488/patch-1
Fix typo
2023-02-17 20:30:27 +05:30
e8fbcfcac3 Fix typo 2023-02-17 15:08:47 +01:00
1a8928c5a4 Update lora.py 2023-02-17 17:48:16 +05:30
173dc3dedf add disable_adapter context manager 2023-02-17 17:40:45 +05:30
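The `disable_adapter` context manager temporarily bypasses the injected adapter layers so the frozen base model can be queried without unloading anything. A minimal sketch (base model and target modules chosen for illustration):

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import LoraConfig, get_peft_model

tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = get_peft_model(
    AutoModelForCausalLM.from_pretrained("gpt2"),
    LoraConfig(r=8, lora_alpha=16, target_modules=["c_attn"], fan_in_fan_out=True),
)
inputs = tokenizer("PEFT is", return_tensors="pt")

with torch.no_grad():
    adapted = model(**inputs).logits        # LoRA layers active
    with model.disable_adapter():           # LoRA contribution switched off
        base_only = model(**inputs).logits  # output of the frozen base model
```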
dbf44fe316 [core] Some changes with prepare_model_for_training & few fixes (#105)
* changes

* apply to other notebooks
2023-02-17 10:49:11 +01:00
648fcb397c Merge pull request #104 from huggingface/smangrul/make_lora_target_modules_accept_regex
add support for regex target modules in lora
2023-02-17 15:00:58 +05:30
7aadb6d9ec add support for regex target modules in lora 2023-02-17 14:52:03 +05:30
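With this change, `LoraConfig.target_modules` accepts either an explicit list of module names or a single string that is treated as a regular expression against the full module path. A small sketch (the regex pattern is illustrative):

```python
from peft import LoraConfig

# Explicit list: every module whose name ends with one of these gets a LoRA layer.
list_config = LoraConfig(r=8, lora_alpha=32, target_modules=["q_proj", "v_proj"])

# Single string: interpreted as a regex matched against the full module path,
# e.g. to adapt only the decoder attention projections.
regex_config = LoraConfig(r=8, lora_alpha=32, target_modules=r".*decoder.*(q_proj|v_proj)")
```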
49842e1961 Merge pull request #97 from huggingface/smangrul/make-bnb-optional
making `bnb` optional
2023-02-16 22:33:51 +05:30
44d0ac3f25 fix 2023-02-16 20:15:48 +05:30
43a9a42991 fix 2023-02-16 20:11:39 +05:30
145b13c238 making bnb optional 2023-02-16 20:07:06 +05:30
8ace5532b2 Merge pull request #95 from huggingface/smangrul/add-whisper-example
adding whisper large peft+int8 training example
2023-02-16 17:43:46 +05:30
c1281b96ff resolving comments and running jupyter black 2023-02-16 17:42:28 +05:30
ca7b46209a adding whisper large peft+int8 training example 2023-02-16 15:06:42 +05:30
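The Whisper example follows the same int8 + LoRA recipe as the text models: load the checkpoint in 8-bit, run the preparation helper, then wrap with a LoRA config. A condensed sketch with the helper under its current name (hyperparameters are illustrative; requires a CUDA GPU with `bitsandbytes`):

```python
from transformers import WhisperForConditionalGeneration
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

model = WhisperForConditionalGeneration.from_pretrained(
    "openai/whisper-large-v2", load_in_8bit=True, device_map="auto"
)
model = prepare_model_for_kbit_training(model)  # freezes the 8-bit base model and prepares it for training
model = get_peft_model(
    model,
    LoraConfig(r=32, lora_alpha=64, target_modules=["q_proj", "v_proj"], lora_dropout=0.05, bias="none"),
)
model.print_trainable_parameters()  # only the LoRA parameters remain trainable
```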
81285f30a5 Merge pull request #90 from huggingface/smangrul/fix-prepare-inputs-for-training
making `prepare_model_for_training` flexible
2023-02-16 11:38:35 +05:30
c9b225d257 revert 2023-02-15 16:58:33 +05:30
af7414a67d fix forward signature 2023-02-15 15:44:26 +05:30
6d6149cf81 preventing other 1D layers from being cast to FP32 2023-02-15 14:03:47 +05:30
a31dfa3001 Merge pull request #86 from younesbelkada/add-flan-t5-int8
[`bnb`] add flan-t5 example
2023-02-15 09:02:24 +01:00
afa7739131 update 2023-02-15 08:01:56 +00:00
f1ee1e4c0f making prepare_model_for_training flexible 2023-02-15 12:51:23 +05:30
ed5a7bff6b Merge pull request #85 from younesbelkada/int8-wrapper
[`core`]  add `prepare_model_for_training`
2023-02-15 11:21:03 +05:30
42a793e2f5 update 2023-02-14 16:45:37 +00:00
eb8362bbe1 update 2023-02-14 16:44:18 +00:00
5733ea9f64 add flan example 2023-02-14 13:50:22 +00:00
36c7e3b441 apply suggestions 2023-02-14 11:50:55 +00:00
0e80648010 add prepare_model_for_training 2023-02-14 11:12:37 +00:00
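The helper added here (initially `prepare_model_for_training`, later renamed `prepare_model_for_int8_training` and then `prepare_model_for_kbit_training`) freezes the base weights, casts layer norms and the output head to fp32, and enables input gradients so that training works on a quantized backbone. A sketch with the current name (model id and hyperparameters are illustrative; needs a CUDA GPU with `bitsandbytes`):

```python
from transformers import AutoModelForCausalLM
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

model = AutoModelForCausalLM.from_pretrained(
    "facebook/opt-350m", load_in_8bit=True, device_map="auto"
)
model = prepare_model_for_kbit_training(model)  # freeze base weights, cast norms/head to fp32, enable input grads
model = get_peft_model(
    model,
    LoraConfig(r=16, lora_alpha=32, target_modules=["q_proj", "v_proj"], task_type="CAUSAL_LM"),
)
model.print_trainable_parameters()
```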
be0e79c271 Merge pull request #68 from mayank31398/mayank/fp32-prompt-tuning
convert prompt tuning vocab to fp32
2023-02-13 11:26:46 +05:30
5acd392880 Merge pull request #77 from huggingface/smangrul/update-readme
Update README.md
2023-02-13 11:16:20 +05:30
951119fcfa Update README.md 2023-02-13 11:14:35 +05:30
29d608f481 Merge remote-tracking branch 'huggingface/main' into mayank/fp32-prompt-tuning 2023-02-13 11:13:18 +05:30
15de814bb4 Merge pull request #73 from Muhtasham/patch-1
Fixed typo in Readme
2023-02-13 10:56:32 +05:30
a29a12701e Fixed typo in Readme
Also added links to datasets and models, plus enhanced config render with yaml command
2023-02-11 21:58:18 +01:00
3bd50315a6 Merge pull request #72 from huggingface/sayakpaul-patch-1
Update README.md
2023-02-11 15:32:38 +05:30
45186ee04e Update README.md 2023-02-11 15:02:53 +05:30
c8e215b989 Merge pull request #69 from huggingface/smangrul/release-v0.2.0dev0
release v0.2.0.dev0
2023-02-10 15:45:22 +05:30
d1735e098c release v0.2.0.dev0 2023-02-10 15:43:06 +05:30
c53ea2c9f4 fp32 2023-02-10 09:39:02 +05:30
f8e737648a Merge pull request #67 from huggingface/smangrul/fix-save-pretrained
make `save_pretrained` work in a way training could be resumed
2023-02-10 00:14:39 +05:30
b1af297707 make save_pretrained work in a way training could be resumed 2023-02-10 00:06:25 +05:30
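In practice this means an adapter checkpoint written by `save_pretrained` can be reloaded in a trainable state rather than inference-only. A sketch with today's API, assuming a trained `PeftModel` called `peft_model` on a GPT-2 base (names illustrative):

```python
from transformers import AutoModelForCausalLM
from peft import PeftModel

# Saves only the adapter weights and config (a few MB), not the base model.
peft_model.save_pretrained("lora-checkpoint")

# Reload onto a fresh base model; is_trainable=True keeps the adapter
# parameters trainable so fine-tuning can be resumed from the checkpoint.
base = AutoModelForCausalLM.from_pretrained("gpt2")
resumed = PeftModel.from_pretrained(base, "lora-checkpoint", is_trainable=True)
```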
85c7b98307 Merge pull request #66 from huggingface/smangrul/update-bibtex
update bibtex
2023-02-09 17:26:17 +05:30
e41152e5f1 update bibtex 2023-02-09 17:25:52 +05:30
9f19ce6729 Merge pull request #64 from kashif/patch-1
Fix typos in readme
2023-02-09 17:18:36 +05:30
ae85e185ad another typo 2023-02-09 10:59:56 +01:00
93762cc658 Fix typos in readme 2023-02-09 10:38:18 +01:00
ed608025eb Merge pull request #63 from huggingface/vision-examples
add: vision examples to readme.
2023-02-09 13:57:11 +05:30
14a293a6b3 PeftModel => get_peft_model() 2023-02-09 13:34:21 +05:30
c7b744db79 add: vision examples to readme. 2023-02-09 12:23:48 +05:30
250edccdda Merge pull request #59 from sayakpaul/example/sem-seg
add: example on semantic segmentation.
2023-02-09 12:09:18 +05:30
1daf087682 reword some things. 2023-02-09 11:22:50 +05:30
d3d601d5c3 Merge pull request #55 from huggingface/smangrul/fix-examples-with-hub-utils
many code fixes and updates to examples
2023-02-08 18:56:48 +05:30
8083c9515f update README and fix token_cls example 2023-02-08 18:54:46 +05:30
73cd16b7b5 quality 2023-02-08 18:43:00 +05:30
65112b75bb Merge branch 'main' into smangrul/fix-examples-with-hub-utils 2023-02-08 18:41:19 +05:30
3cf0b7a2d4 fix more examples 2023-02-08 18:40:57 +05:30
afb171eefb fixes and updating examples 2023-02-08 18:07:15 +05:30
b07ea17f49 update examples 2023-02-08 14:55:08 +05:30
83ded43ee7 Update peft_lora_clm_accelerate_ds_zero3_offload.py 2023-02-08 13:35:10 +05:30
537c971a47 fix 2023-02-08 13:05:27 +05:30
ed0c962ff5 fixes 2023-02-08 12:59:29 +05:30
eec0b9329d Update peft_lora_clm_accelerate_ds_zero3_offload.py 2023-02-08 12:41:27 +05:30
1929a84e1e remove peft_model_load_and_dispatch as it is part of PeftModel.from_pretrained 2023-02-08 12:29:03 +05:30
522a6b6c17 add load_and_dispatch to load_pretrained 2023-02-08 12:18:03 +05:30
462b65fe45 fix lora_only 2023-02-08 10:26:56 +05:30
2b89fbf963 add: example on semantic segmentation. 2023-02-08 09:49:13 +05:30
b5c97f2039 Update save_and_load.py 2023-02-08 09:25:21 +05:30
64d2d19598 update peft_model_load_and_dispatch 2023-02-08 09:21:49 +05:30
a7dd034710 fix prefix tuning config to remove function field as it cannot be converted to json 2023-02-08 08:49:15 +05:30
ed0bcdac4f Merge pull request #58 from sayakpaul/patch-1
Update image classification README.md to include the latest Colab Notebook link
2023-02-07 19:00:11 +05:30
bdeb3778d0 add support for generate when using prompt_tuning 2023-02-07 15:07:56 +05:30
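With this fix, `generate` can be called directly on a prompt-tuned model; the wrapper prepends the learned virtual tokens before decoding. A minimal sketch (base model and prompt length chosen for illustration):

```python
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PromptTuningConfig, TaskType, get_peft_model

tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = get_peft_model(
    AutoModelForCausalLM.from_pretrained("gpt2"),
    PromptTuningConfig(task_type=TaskType.CAUSAL_LM, num_virtual_tokens=8),
)

inputs = tokenizer("Tweet text: I love this! Label:", return_tensors="pt")
out = model.generate(**inputs, max_new_tokens=5)  # virtual prompt embeddings are added internally
print(tokenizer.decode(out[0], skip_special_tokens=True))
```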
185c852088 Update README.md 2023-02-07 12:53:37 +05:30
a1b7e42783 Merge pull request #56 from sayakpaul/example/img-cls
add: example on fine-tuning for image classification.
2023-02-07 12:51:56 +05:30
3c4b64785f Update README.md 2023-02-07 11:11:36 +05:30
ab43d6aa5c fix: inference section 2023-02-07 11:04:39 +05:30
3cf7034e9c Empty commit.
Co-authored-by: Sourab Mangrulkar <13534540+pacman100@users.noreply.github.com>
2023-02-07 09:57:57 +05:30
ddb37c353c add: correct Colab link. 2023-02-07 09:55:07 +05:30
dbe3b9b99e add: example on fine-tuning for image classification. 2023-02-07 09:53:34 +05:30
5bc815e2e2 fix generate because of recent transformers release 2023-02-06 23:50:48 +05:30
5a43a3a321 seq cls examples update 2023-02-06 18:57:13 +05:30
7ae63299a8 Merge pull request #53 from younesbelkada/add-int8-example
[ `example`] add bnb example
2023-02-03 21:00:41 +05:30
57de1d2677 add bnb example 2023-02-02 17:45:38 +01:00
383b5abb33 Merge pull request #51 from huggingface/smangrul/lora-raise-error-when-no-target-module-found
for lora, raise error when no target modules in base model
2023-02-02 13:35:47 +05:30
d8ccd7d84c for lora, raise error when no target modules in base model 2023-02-02 13:29:49 +05:30
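Before this change, a config whose `target_modules` matched nothing could silently leave the model unadapted; afterwards adapter injection fails loudly. A small sketch of the behaviour (the exact error message may differ between releases):

```python
from transformers import AutoModelForCausalLM
from peft import LoraConfig, get_peft_model

config = LoraConfig(r=8, target_modules=["does_not_exist"])  # matches no module in GPT-2
try:
    get_peft_model(AutoModelForCausalLM.from_pretrained("gpt2"), config)
except ValueError as err:
    print(err)  # reports that no target modules were found in the base model
```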
df5b201c6b Merge pull request #50 from huggingface/smangrul/add-modules-to-save-to-lora-config
add `modules_to_save` to LoraConfig and other fixes
2023-02-02 13:19:37 +05:30
44d8e72ca8 fixes 2023-02-02 13:19:14 +05:30
c37ee25be7 trying diff approaches 2023-02-01 19:35:19 +05:30
c884daf96a getting rid of forward call linking 2023-02-01 19:18:38 +05:30
fcd213708d fixes 2023-02-01 17:17:14 +05:30
915a5db0c6 fixes 2023-02-01 16:25:42 +05:30
d53a631608 fixes 2023-02-01 15:59:24 +05:30
b4d0885203 Merge pull request #49 from orenwang/main
fix validation_steps handling in dreambooth example
2023-02-01 15:42:35 +05:30
d04f6661ee add modules_to_save to LoraConfig and other fixes
1. Add `modules_to_save` to LoraConfig
2. Using PeftModel for LoraConfig instead of task-specific classes because LoRA is task agnostic.
2023-02-01 15:41:35 +05:30
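`modules_to_save` marks extra layers, typically a freshly initialised task head, as fully trainable and includes them in the adapter checkpoint. A minimal sketch (model and layer names are illustrative for a RoBERTa classifier):

```python
from transformers import AutoModelForSequenceClassification
from peft import LoraConfig, TaskType, get_peft_model

config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    r=8,
    lora_alpha=16,
    target_modules=["query", "value"],
    modules_to_save=["classifier"],  # train and save the new classification head too
)
model = get_peft_model(
    AutoModelForSequenceClassification.from_pretrained("roberta-base", num_labels=2),
    config,
)
model.print_trainable_parameters()
```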
80e1b262e5 fix validation_steps handling in dreambooth example 2023-01-31 10:59:46 +08:00
dd518985ff Merge pull request #47 from orenwang/main
allow validation images for lora training
2023-01-30 16:13:28 +05:30
a17cea104e add validation images for lora training 2023-01-30 17:18:43 +08:00
3f9b310c6a Merge pull request #46 from huggingface/smangrul/fix-hf-hub-utils-tests
fix hf hub util tests
2023-01-30 13:35:06 +05:30
06e49c0a87 fixes 2023-01-30 13:31:01 +05:30
6cf2cf5dae fix hf hub util tests 2023-01-30 12:51:30 +05:30
3faaf0916a Merge pull request #39 from younesbelkada/add-push-to-hub
[`core`] Add hub utils
2023-01-30 12:34:13 +05:30
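The hub utilities let an adapter be pushed and pulled independently of its base model. A sketch assuming a trained `PeftModel` called `peft_model` and an illustrative repo id:

```python
from transformers import AutoModelForCausalLM
from peft import PeftConfig, PeftModel

# Upload only the adapter weights and config.
peft_model.push_to_hub("my-username/opt-350m-lora")

# Later: read the adapter config to find the base model, then attach the adapter.
config = PeftConfig.from_pretrained("my-username/opt-350m-lora")
base = AutoModelForCausalLM.from_pretrained(config.base_model_name_or_path)
model = PeftModel.from_pretrained(base, "my-username/opt-350m-lora")
```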
6c9534e660 adapt for other models 2023-01-29 11:18:31 +00:00
22295c4278 adapt from code review
- remove `README`
- inherit from `dataclass`
- add new test
2023-01-29 10:49:31 +00:00
16182ea972 Apply suggestions from code review
Co-authored-by: Sourab Mangrulkar <13534540+pacman100@users.noreply.github.com>
2023-01-29 11:41:38 +01:00
ad69958e52 Merge branch 'main' into add-push-to-hub 2023-01-28 10:17:57 +01:00
f8a2829318 Merge pull request #38 from huggingface/smangrul/fixes
adding support for int8 lora training
2023-01-27 19:20:12 +05:30
634f3692d8 working v1
- push to hub method works
- add tests
- add config super class
- add Lora support for `from_pretrained`
2023-01-26 11:17:24 +00:00
2cc7f2cbac add config tests 2023-01-26 10:12:51 +00:00
2896cf05fb v1 working
- from_pretrained support for config
- from_pretrained support for loramodel
- todo: tests
- todo: push_to_hub
2023-01-25 22:43:22 +00:00
776a28f053 update lora to support int8 training 2023-01-25 12:27:02 +05:30
d75746be70 adding support for int8 lora training 2023-01-25 04:19:23 +05:30
1dbe7fc0db Merge pull request #37 from huggingface/smangrul/fixes
colab notebook example for lora peft application
2023-01-24 21:20:06 +05:30
ff8a5b9a69 colab notebook example for lora peft application 2023-01-24 21:19:47 +05:30
36267af51b Merge pull request #36 from huggingface/smangrul/fixes
correcting requirements.txt in example sub-folders
2023-01-22 11:21:01 +05:30
fef162cff8 correcting requirements.txt in example sub-folders 2023-01-22 11:20:39 +05:30
a8587916c8 Merge pull request #35 from huggingface/smangrul/fixes
fixes and addressing comments from previous PR
2023-01-21 18:31:28 +05:30
77670ead76 fixes and addressing comments from previous PR
1. Minor updates/fixes in README.md and setup.py
2. Make `loralib` optional
2023-01-21 18:17:19 +05:30
360fb2f816 Merge pull request #34 from huggingface/fix-typos
Review & fix typos
2023-01-21 18:00:12 +05:30
a40f20ad6c Fix typos 2023-01-20 11:34:45 -05:00
407482eb37 Merge pull request #33 from huggingface/smangrul/fixes
fixes, docs and version bump up
2023-01-20 15:34:29 +05:30
d9e7d6cd22 fixes, docs and version bump up 2023-01-20 15:34:11 +05:30
dbf438f99d Merge pull request #32 from huggingface/v0.0.1-release
V0.0.1 release
2023-01-20 14:37:57 +05:30
132 changed files with 91320 additions and 10198 deletions

.github/ISSUE_TEMPLATE/bug-report.yml

@ -0,0 +1,71 @@
name: "\U0001F41B Bug Report"
description: Submit a bug report to help us improve the library
body:
- type: textarea
id: system-info
attributes:
label: System Info
description: Please share your relevant system information with us
placeholder: peft & accelerate & transformers version, platform, python version, ...
validations:
required: true
- type: textarea
id: who-can-help
attributes:
label: Who can help?
description: |
Your issue will be replied to more quickly if you can figure out the right person to tag with @
If you know how to use git blame, that is the easiest way, otherwise, here is a rough guide of **who to tag**.
All issues are read by one of the core maintainers, so if you don't know who to tag, just leave this blank and
a core maintainer will ping the right person.
Please tag fewer than 3 people.
Library: @pacman100 @younesbelkada @sayakpaul
Documentation: @stevhliu and @MKhalusova
placeholder: "@Username ..."
- type: checkboxes
id: information-scripts-examples
attributes:
label: Information
description: 'The problem arises when using:'
options:
- label: "The official example scripts"
- label: "My own modified scripts"
- type: checkboxes
id: information-tasks
attributes:
label: Tasks
description: "The tasks I am working on are:"
options:
- label: "An officially supported task in the `examples` folder"
- label: "My own task or dataset (give details below)"
- type: textarea
id: reproduction
validations:
required: true
attributes:
label: Reproduction
description: |
Please provide a code sample that reproduces the problem you ran into. It can be a Colab link or just a code snippet.
Please provide the simplest reproducer as possible so that we can quickly fix the issue.
placeholder: |
Reproducer:
- type: textarea
id: expected-behavior
validations:
required: true
attributes:
label: Expected behavior
description: "A clear and concise description of what you would expect to happen."


@ -0,0 +1,30 @@
name: "\U0001F680 Feature request"
description: Submit a proposal/request for a new feature
labels: [ "feature" ]
body:
- type: textarea
id: feature-request
validations:
required: true
attributes:
label: Feature request
description: |
A clear and concise description of the feature proposal. Please provide a link to the paper and code in case they exist.
- type: textarea
id: motivation
validations:
required: true
attributes:
label: Motivation
description: |
Please outline the motivation for the proposal. Is your feature request related to a problem?
- type: textarea
id: contribution
validations:
required: true
attributes:
label: Your contribution
description: |
Is there any way that you could help, e.g. by submitting a PR?


@ -0,0 +1,74 @@
name: Build Docker images (scheduled)
on:
workflow_dispatch:
workflow_call:
schedule:
- cron: "0 1 * * *"
concurrency:
group: docker-image-builds
cancel-in-progress: false
jobs:
latest-cpu:
name: "Latest Peft CPU [dev]"
runs-on: ubuntu-latest
steps:
- name: Cleanup disk
run: |
sudo ls -l /usr/local/lib/
sudo ls -l /usr/share/
sudo du -sh /usr/local/lib/
sudo du -sh /usr/share/
sudo rm -rf /usr/local/lib/android
sudo rm -rf /usr/share/dotnet
sudo du -sh /usr/local/lib/
sudo du -sh /usr/share/
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v1
- name: Check out code
uses: actions/checkout@v2
- name: Login to DockerHub
uses: docker/login-action@v2
with:
username: ${{ secrets.DOCKERHUB_USERNAME }}
password: ${{ secrets.DOCKERHUB_PASSWORD }}
- name: Build and Push CPU
uses: docker/build-push-action@v4
with:
context: ./docker/peft-cpu
push: true
tags: huggingface/peft-cpu
latest-cuda:
name: "Latest Peft GPU [dev]"
runs-on: ubuntu-latest
steps:
- name: Cleanup disk
run: |
sudo ls -l /usr/local/lib/
sudo ls -l /usr/share/
sudo du -sh /usr/local/lib/
sudo du -sh /usr/share/
sudo rm -rf /usr/local/lib/android
sudo rm -rf /usr/share/dotnet
sudo du -sh /usr/local/lib/
sudo du -sh /usr/share/
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v1
- name: Check out code
uses: actions/checkout@v2
- name: Login to DockerHub
uses: docker/login-action@v1
with:
username: ${{ secrets.DOCKERHUB_USERNAME }}
password: ${{ secrets.DOCKERHUB_PASSWORD }}
- name: Build and Push GPU
uses: docker/build-push-action@v2
with:
context: ./docker/peft-gpu
push: true
tags: huggingface/peft-gpu


@ -0,0 +1,19 @@
name: Build documentation
on:
push:
branches:
- main
- doc-builder*
- v*-release
jobs:
build:
uses: huggingface/doc-builder/.github/workflows/build_main_documentation.yml@main
with:
commit_sha: ${{ github.sha }}
package: peft
notebook_folder: peft_docs
secrets:
token: ${{ secrets.HUGGINGFACE_PUSH }}
hf_token: ${{ secrets.HF_DOC_BUILD_PUSH }}


@ -0,0 +1,16 @@
name: Build PR Documentation
on:
pull_request:
concurrency:
group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
cancel-in-progress: true
jobs:
build:
uses: huggingface/doc-builder/.github/workflows/build_pr_documentation.yml@main
with:
commit_sha: ${{ github.event.pull_request.head.sha }}
pr_number: ${{ github.event.number }}
package: peft


@ -0,0 +1,14 @@
name: Delete doc comment
on:
workflow_run:
workflows: ["Delete doc comment trigger"]
types:
- completed
jobs:
delete:
uses: huggingface/doc-builder/.github/workflows/delete_doc_comment.yml@main
secrets:
comment_bot_token: ${{ secrets.COMMENT_BOT_TOKEN }}


@ -0,0 +1,12 @@
name: Delete doc comment trigger
on:
pull_request:
types: [ closed ]
jobs:
delete:
uses: huggingface/doc-builder/.github/workflows/delete_doc_comment_trigger.yml@main
with:
pr_number: ${{ github.event.number }}

.github/workflows/nightly.yml

@ -0,0 +1,101 @@
name: Self-hosted runner with slow tests (scheduled)
on:
workflow_dispatch:
schedule:
- cron: "0 2 * * *"
env:
RUN_SLOW: "yes"
IS_GITHUB_CI: "1"
SLACK_API_TOKEN: ${{ secrets.SLACK_API_TOKEN }}
jobs:
run_all_tests_single_gpu:
runs-on: [self-hosted, docker-gpu, multi-gpu]
env:
CUDA_VISIBLE_DEVICES: "0"
TEST_TYPE: "single_gpu"
container:
image: huggingface/peft-gpu:latest
options: --gpus all --shm-size "16gb"
defaults:
run:
working-directory: peft/
shell: bash
steps:
- name: Update clone & pip install
run: |
source activate peft
git config --global --add safe.directory '*'
git fetch && git checkout ${{ github.sha }}
pip install -e . --no-deps
pip install pytest-reportlog
- name: Run common tests on single GPU
run: |
source activate peft
make tests_common_gpu
- name: Run examples on single GPU
run: |
source activate peft
make tests_examples_single_gpu
- name: Run core tests on single GPU
run: |
source activate peft
make tests_core_single_gpu
- name: Generate Report
if: always()
run: |
pip install slack_sdk tabulate
python scripts/log_reports.py >> $GITHUB_STEP_SUMMARY
run_all_tests_multi_gpu:
runs-on: [self-hosted, docker-gpu, multi-gpu]
env:
CUDA_VISIBLE_DEVICES: "0,1"
TEST_TYPE: "multi_gpu"
container:
image: huggingface/peft-gpu:latest
options: --gpus all --shm-size "16gb"
defaults:
run:
working-directory: peft/
shell: bash
steps:
- name: Update clone
run: |
source activate peft
git config --global --add safe.directory '*'
git fetch && git checkout ${{ github.sha }}
pip install -e . --no-deps
pip install pytest-reportlog
- name: Run core GPU tests on multi-gpu
run: |
source activate peft
- name: Run common tests on multi GPU
run: |
source activate peft
make tests_common_gpu
- name: Run examples on multi GPU
run: |
source activate peft
make tests_examples_multi_gpu
- name: Run core tests on multi GPU
run: |
source activate peft
make tests_core_multi_gpu
- name: Generate Report
if: always()
run: |
pip install slack_sdk tabulate
python scripts/log_reports.py >> $GITHUB_STEP_SUMMARY

.github/workflows/stale.yml

@ -0,0 +1,27 @@
name: Stale Bot
on:
schedule:
- cron: "0 15 * * *"
jobs:
close_stale_issues:
name: Close Stale Issues
if: github.repository == 'huggingface/peft'
runs-on: ubuntu-latest
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
steps:
- uses: actions/checkout@v3
- name: Setup Python
uses: actions/setup-python@v4
with:
python-version: 3.8
- name: Install requirements
run: |
pip install PyGithub
- name: Close stale issues
run: |
python scripts/stale.py

.github/workflows/tests.yml

@ -0,0 +1,49 @@
name: tests
on:
push:
branches: [main]
pull_request:
jobs:
check_code_quality:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- name: Set up Python
uses: actions/setup-python@v4
with:
python-version: "3.8"
cache: "pip"
cache-dependency-path: "setup.py"
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install .[dev]
- name: Check quality
run: |
make quality
tests:
needs: check_code_quality
strategy:
matrix:
python-version: ["3.8", "3.9", "3.10"]
os: ["ubuntu-latest", "macos-latest", "windows-latest"]
runs-on: ${{ matrix.os }}
steps:
- uses: actions/checkout@v3
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v4
with:
python-version: ${{ matrix.python-version }}
cache: "pip"
cache-dependency-path: "setup.py"
- name: Install dependencies
run: |
python -m pip install --upgrade pip
# cpu version of pytorch
pip install -e .[test]
- name: Test with pytest
run: |
make test


@ -0,0 +1,16 @@
name: Upload PR Documentation
on:
workflow_run:
workflows: ["Build PR Documentation"]
types:
- completed
jobs:
build:
uses: huggingface/doc-builder/.github/workflows/upload_pr_documentation.yml@main
with:
package_name: peft
secrets:
hf_token: ${{ secrets.HF_DOC_BUILD_PUSH }}
comment_bot_token: ${{ secrets.COMMENT_BOT_TOKEN }}


@ -1 +0,0 @@
include LICENSE


@ -1,19 +1,36 @@
.PHONY: quality style test docs
check_dirs := src examples
check_dirs := src tests examples docs
# Check that source code meets quality standards
# this target runs checks on all files
quality:
black --check $(check_dirs)
isort --check-only $(check_dirs)
flake8 $(check_dirs)
doc-builder style src --max_len 119 --check_only
ruff $(check_dirs)
doc-builder style src/peft tests docs/source --max_len 119 --check_only
# Format source code automatically and check is there are any problems left that need manual fixing
style:
black $(check_dirs)
isort $(check_dirs)
doc-builder style src --max_len 119
ruff $(check_dirs) --fix
doc-builder style src/peft tests docs/source --max_len 119
test:
python -m pytest -n 3 tests/ $(if $(IS_GITHUB_CI),--report-log "ci_tests.log",)
tests_examples_multi_gpu:
python -m pytest -m multi_gpu_tests tests/test_gpu_examples.py $(if $(IS_GITHUB_CI),--report-log "multi_gpu_examples.log",)
tests_examples_single_gpu:
python -m pytest -m single_gpu_tests tests/test_gpu_examples.py $(if $(IS_GITHUB_CI),--report-log "single_gpu_examples.log",)
tests_core_multi_gpu:
python -m pytest -m multi_gpu_tests tests/test_common_gpu.py $(if $(IS_GITHUB_CI),--report-log "core_multi_gpu.log",)
tests_core_single_gpu:
python -m pytest -m single_gpu_tests tests/test_common_gpu.py $(if $(IS_GITHUB_CI),--report-log "core_single_gpu.log",)
tests_common_gpu:
python -m pytest tests/test_decoder_models.py $(if $(IS_GITHUB_CI),--report-log "common_decoder.log",)
python -m pytest tests/test_encoder_decoder_models.py $(if $(IS_GITHUB_CI),--report-log "common_encoder_decoder.log",)

README.md

@ -21,24 +21,26 @@ limitations under the License.
Parameter-Efficient Fine-Tuning (PEFT) methods enable efficient adaptation of pre-trained language models (PLMs) to various downstream applications without fine-tuning all the model's parameters. Fine-tuning large-scale PLMs is often prohibitively costly. In this regard, PEFT methods only fine-tune a small number of (extra) model parameters, thereby greatly decreasing the computational and storage costs. Recent State-of-the-Art PEFT techniques achieve performance comparable to that of full fine-tuning.
Seamlessly integrated with 🤗 Accelerate for large scale models leveraging PyTorch FSDP.
Seamlessly integrated with 🤗 Accelerate for large scale models leveraging DeepSpeed and Big Model Inference.
Supported methods:
1. LoRA: [LORA: LOW-RANK ADAPTATION OF LARGE LANGUAGE MODELS](https://arxiv.org/pdf/2106.09685.pdf)
2. Prefix Tuning: [P-Tuning v2: Prompt Tuning Can Be Comparable to Fine-tuning Universally Across Scales and Tasks](https://arxiv.org/pdf/2110.07602.pdf)
3. P-Tuning: [GPT Understands, Too](https://arxiv.org/pdf/2103.10385.pdf)
4. Prompt Tuning: [The Power of Scale for Parameter-Efficient Prompt Tuning](https://arxiv.org/pdf/2104.08691.pdf)
1. LoRA: [LORA: LOW-RANK ADAPTATION OF LARGE LANGUAGE MODELS](https://arxiv.org/abs/2106.09685)
2. Prefix Tuning: [Prefix-Tuning: Optimizing Continuous Prompts for Generation](https://aclanthology.org/2021.acl-long.353/), [P-Tuning v2: Prompt Tuning Can Be Comparable to Fine-tuning Universally Across Scales and Tasks](https://arxiv.org/pdf/2110.07602.pdf)
3. P-Tuning: [GPT Understands, Too](https://arxiv.org/abs/2103.10385)
4. Prompt Tuning: [The Power of Scale for Parameter-Efficient Prompt Tuning](https://arxiv.org/abs/2104.08691)
5. AdaLoRA: [Adaptive Budget Allocation for Parameter-Efficient Fine-Tuning](https://arxiv.org/abs/2303.10512)
6. $(IA)^3$ : [Infused Adapter by Inhibiting and Amplifying Inner Activations](https://arxiv.org/abs/2205.05638)
## Getting started
```python
from transformers import AutoModelForSeq2SeqLM
from peft import get_peft_config, get_peft_model, LoRAConfig, TaskType
from peft import get_peft_config, get_peft_model, LoraConfig, TaskType
model_name_or_path = "bigscience/mt0-large"
tokenizer_name_or_path = "bigscience/mt0-large"
peft_config = LoRAConfig(
peft_config = LoraConfig(
task_type=TaskType.SEQ_2_SEQ_LM, inference_mode=False, r=8, lora_alpha=32, lora_dropout=0.1
)
@ -52,7 +54,7 @@ model.print_trainable_parameters()
### Get comparable performance to full finetuning by adapting LLMs to downstream tasks using consumer hardware
GPU memory required for adapting LLMs on the few-shot dataset `ought/raft/twitter_complaints`. Here, settings considered
GPU memory required for adapting LLMs on the few-shot dataset [`ought/raft/twitter_complaints`](https://huggingface.co/datasets/ought/raft/viewer/twitter_complaints). Here, settings considered
are full finetuning, PEFT-LoRA using plain PyTorch and PEFT-LoRA using DeepSpeed with CPU Offloading.
Hardware: Single A100 80GB GPU with CPU RAM above 64GB
@ -63,9 +65,9 @@ Hardware: Single A100 80GB GPU with CPU RAM above 64GB
| bigscience/mt0-xxl (12B params) | OOM GPU | 56GB GPU / 3GB CPU | 22GB GPU / 52GB CPU |
| bigscience/bloomz-7b1 (7B params) | OOM GPU | 32GB GPU / 3.8GB CPU | 18.1GB GPU / 35GB CPU |
Performance of PEFT-LoRA tuned `bigscience/T0_3B` on `ought/raft/twitter_complaints` leaderboard.
A point to note is that we didn't try to sequeeze performance by playing around with input instruction templates, LoRA hyperparams and other training related hyperparams. Also, we didn't use the larger 13B mt0-xxl model.
So, we are already seeing comparable performance to SoTA with parameter effcient tuning. Also, the final checkpoint size is just `19MB` in comparison to `11GB` size of the backbone `bigscience/T0_3B` model.
Performance of PEFT-LoRA tuned [`bigscience/T0_3B`](https://huggingface.co/bigscience/T0_3B) on [`ought/raft/twitter_complaints`](https://huggingface.co/datasets/ought/raft/viewer/twitter_complaints) leaderboard.
A point to note is that we didn't try to squeeze performance by playing around with input instruction templates, LoRA hyperparams and other training related hyperparams. Also, we didn't use the larger 13B [mt0-xxl](https://huggingface.co/bigscience/mt0-xxl) model.
So, we are already seeing comparable performance to SoTA with parameter efficient tuning. Also, the final checkpoint size is just `19MB` in comparison to `11GB` size of the backbone [`bigscience/T0_3B`](https://huggingface.co/bigscience/T0_3B) model.
| Submission Name | Accuracy |
| --------- | ---- |
@ -75,19 +77,21 @@ So, we are already seeing comparable performance to SoTA with parameter effcient
**Therefore, we can see that performance comparable to SoTA is achievable by PEFT methods with consumer hardware such as 16GB and 24GB GPUs.**
An insightful blogpost explaining the advantages of using PEFT for fine-tuning FlanT5-XXL: [https://www.philschmid.de/fine-tune-flan-t5-peft](https://www.philschmid.de/fine-tune-flan-t5-peft)
### Parameter Efficient Tuning of Diffusion Models
GPU memory required by different settings during training are given below. The final checkpoint size being `8.8 MB`.
GPU memory required by different settings during training is given below. The final checkpoint size is `8.8 MB`.
Hardware: Single A100 80GB GPU with CPU RAM above 64G
Hardware: Single A100 80GB GPU with CPU RAM above 64GB
| Model | Full Finetuning | PEFT-LoRA | PEFT-LoRA with Gradient Checkpoitning |
| Model | Full Finetuning | PEFT-LoRA | PEFT-LoRA with Gradient Checkpointing |
| --------- | ---- | ---- | ---- |
| CompVis/stable-diffusion-v1-4 | 27.5GB GPU / 3.97GB CPU | 15.5GB GPU / 3.84GB CPU | 8.12GB GPU / 3.77GB CPU |
**Training**
An example of using LoRA for parameter efficient dreambooth training is given in `~examples/lora_dreambooth/train_dreambooth.py`
An example of using LoRA for parameter efficient dreambooth training is given in [`examples/lora_dreambooth/train_dreambooth.py`](examples/lora_dreambooth/train_dreambooth.py)
```bash
export MODEL_NAME= "CompVis/stable-diffusion-v1-4" #"stabilityai/stable-diffusion-2-1"
@ -125,7 +129,18 @@ Try out the 🤗 Gradio Space which should run seamlessly on a T4 instance:
![peft lora dreambooth gradio space](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/peft/peft_lora_dreambooth_gradio_space.png)
### Parameter Efficient Tuning of LLMs for RLHF components such as Ranker and Policy [ToDo]
**NEW** ✨ Multi Adapter support and combining multiple LoRA adapters in a weighted combination
![peft lora dreambooth weighted adapter](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/peft/weighted_adapter_dreambooth_lora.png)
### Parameter Efficient Tuning of LLMs for RLHF components such as Ranker and Policy
- Here is an example in [trl](https://github.com/lvwerra/trl) library using PEFT+INT8 for tuning policy model: [gpt2-sentiment_peft.py](https://github.com/lvwerra/trl/blob/main/examples/sentiment/scripts/gpt2-sentiment_peft.py) and corresponding [Blog](https://huggingface.co/blog/trl-peft)
- Example using PEFT for Instruction finetuning, reward model and policy: [stack_llama](https://github.com/lvwerra/trl/tree/main/examples/stack_llama/scripts) and corresponding [Blog](https://huggingface.co/blog/stackllama)
### INT8 training of large models in Colab using PEFT LoRA and bits_and_bytes
- Here is now a demo on how to fine tune [OPT-6.7b](https://huggingface.co/facebook/opt-6.7b) (14GB in fp16) in a Google Colab: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1jCkpikz0J2o20FBQmYmAGdiKmJGOMo-o?usp=sharing)
- Here is now a demo on how to fine tune [whisper-large](https://huggingface.co/openai/whisper-large-v2) (1.5B params) (14GB in fp16) in a Google Colab: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1DOkD_5OUjFa0r5Ik3SgywJLJtEo2qLxO?usp=sharing) and [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1vhF8yueFqha3Y3CpTHN6q9EVcII9EYzs?usp=sharing)
### Save compute and storage even for medium and small models
@ -133,22 +148,22 @@ Save storage by avoiding full finetuning of models on each of the downstream tas
With PEFT methods, users only need to store tiny checkpoints in the order of `MBs` all the while retaining
performance comparable to full finetuning.
An example of using LoRA for the task of adaping `LayoutLMForTokenClassification` on `FUNSD` dataset is given in `~examples/token_classification/PEFT_LoRA_LayoutLMForTokenClassification_on_FUNSD.py`. We can observe that with only `0.62 %` of parameters being trainable, we achieve performance (F1 0.777) comparable to full finetuning (F1 0.786) (without any hyerparam tuning runs for extracting more performance), and the checkpoint of this is only `2.8MB`. Now, if there are `N` such datasets, just have these PEFT models one for each dataset and save a lot of storage without having to worry about the problem of catastrophic forgetting or overfitting of backbone/base model.
An example of using LoRA for the task of adapting `LayoutLMForTokenClassification` on `FUNSD` dataset is given in `~examples/token_classification/PEFT_LoRA_LayoutLMForTokenClassification_on_FUNSD.py`. We can observe that with only `0.62 %` of parameters being trainable, we achieve performance (F1 0.777) comparable to full finetuning (F1 0.786) (without any hyerparam tuning runs for extracting more performance), and the checkpoint of this is only `2.8MB`. Now, if there are `N` such datasets, just have these PEFT models one for each dataset and save a lot of storage without having to worry about the problem of catastrophic forgetting or overfitting of backbone/base model.
Another example is fine-tuning `roberta-large` on `MRPC` GLUE dataset suing differenct PEFT methods. The notebooks are given in `~examples/sequence_classification`.
Another example is fine-tuning [`roberta-large`](https://huggingface.co/roberta-large) on [`MRPC` GLUE](https://huggingface.co/datasets/glue/viewer/mrpc) dataset using different PEFT methods. The notebooks are given in `~examples/sequence_classification`.
## PEFT + 🤗 Accelerate
PEFT models work with 🤗 Accelerate out of the box. Use 🤗 Accelerate for Distributed training on various hardware such as GPUs, Apple Silicon devices etc during training.
PEFT models work with 🤗 Accelerate out of the box. Use 🤗 Accelerate for Distributed training on various hardware such as GPUs, Apple Silicon devices, etc during training.
Use 🤗 Accelerate for inferencing on consumer hardware with small resources.
### Example of PEFT model training using 🤗 Accelerate's DeepSpeed integation
### Example of PEFT model training using 🤗 Accelerate's DeepSpeed integration
Currently DeepSpeed requires PR [ZeRO3 handling frozen weights](https://github.com/microsoft/DeepSpeed/pull/2653) to fix [[REQUEST] efficiently deal with frozen weights during training](https://github.com/microsoft/DeepSpeed/issues/2615) issue. Example is provided in `~examples/conditional_generation/peft_lora_seq2seq_accelerate_ds_zero3_offload.py`.
a. First run `accelerate config --config_file ds_zero3_cpu.yaml` and answer the questionaire.
DeepSpeed version required `v0.8.0`. An example is provided in `~examples/conditional_generation/peft_lora_seq2seq_accelerate_ds_zero3_offload.py`.
a. First, run `accelerate config --config_file ds_zero3_cpu.yaml` and answer the questionnaire.
Below are the contents of the config file.
```
```yaml
compute_environment: LOCAL_MACHINE
deepspeed_config:
gradient_accumulation_steps: 1
@ -172,8 +187,8 @@ Use 🤗 Accelerate for inferencing on consumer hardware with small resources.
same_network: true
use_cpu: false
```
b. run the below command to launch example script
```
b. run the below command to launch the example script
```bash
accelerate launch --config_file ds_zero3_cpu.yaml examples/peft_lora_seq2seq_accelerate_ds_zero3_offload.py
```
@ -203,52 +218,85 @@ Use 🤗 Accelerate for inferencing on consumer hardware with small resources.
```
### Example of PEFT model inference using 🤗 Accelerate's Big Model Inferencing capabilities
Example is provided in `~examples/causal_language_modeling/peft_lora_clm_accelerate_big_model_inference.ipynb`.
An example is provided in `~examples/causal_language_modeling/peft_lora_clm_accelerate_big_model_inference.ipynb`.
## Models support matrix
### Causal Language Modeling
| Model | LoRA | Prefix Tuning | P-Tuning | Prompt Tuning |
| --------- | ---- | ---- | ---- | ---- |
| GPT-2 | ✅ | ✅ | ✅ | ✅ |
| Bloom | ✅ | ✅ | ✅ | ✅ |
| OPT | ✅ | ✅ | ✅ | ✅ |
| GPT-Neo | ✅ | ✅ | ✅ | ✅ |
| GPT-J | ✅ | ✅ | ✅ | ✅ |
| Model | LoRA | Prefix Tuning | P-Tuning | Prompt Tuning | IA3 |
|--------------| ---- | ---- | ---- | ---- | ---- |
| GPT-2 | ✅ | ✅ | ✅ | ✅ | ✅ |
| Bloom | ✅ | ✅ | ✅ | ✅ | ✅ |
| OPT | ✅ | ✅ | ✅ | ✅ | ✅ |
| GPT-Neo | ✅ | ✅ | ✅ | ✅ | ✅ |
| GPT-J | ✅ | ✅ | ✅ | ✅ | ✅ |
| GPT-NeoX-20B | ✅ | ✅ | ✅ | ✅ | ✅ |
| LLaMA | ✅ | ✅ | ✅ | ✅ | ✅ |
| ChatGLM | ✅ | ✅ | ✅ | ✅ | ✅ |
### Conditional Generation
| Model | LoRA | Prefix Tuning | P-Tuning | Prompt Tuning |
| --------- | ---- | ---- | ---- | ---- |
| T5 | ✅ | ✅ | ✅ | ✅ |
| BART | ✅ | ✅ | ✅ | ✅ |
| Model | LoRA | Prefix Tuning | P-Tuning | Prompt Tuning | IA3 |
| --------- | ---- | ---- | ---- | ---- | ---- |
| T5 | ✅ | ✅ | ✅ | ✅ | ✅ |
| BART | ✅ | ✅ | ✅ | ✅ | ✅ |
### Sequence Classification
| Model | LoRA | Prefix Tuning | P-Tuning | Prompt Tuning |
| --------- | ---- | ---- | ---- | ---- |
| BERT | ✅ | ✅ | ✅ | ✅ |
| RoBERTa | ✅ | ✅ | ✅ | ✅ |
| GPT-2 | ✅ | ✅ | ✅ | ✅ |
| Bloom | ✅ | ✅ | ✅ | ✅ |
| OPT | ✅ | ✅ | ✅ | ✅ |
| GPT-Neo | ✅ | ✅ | ✅ | ✅ |
| GPT-J | ✅ | ✅ | ✅ | ✅ |
| Deberta | ✅ | | ✅ | ✅ |
| Deberta-v2 | ✅ | | ✅ | ✅ |
| Model | LoRA | Prefix Tuning | P-Tuning | Prompt Tuning | IA3 |
| --------- | ---- | ---- | ---- | ---- | ---- |
| BERT | ✅ | ✅ | ✅ | ✅ | ✅ |
| RoBERTa | ✅ | ✅ | ✅ | ✅ | ✅ |
| GPT-2 | ✅ | ✅ | ✅ | ✅ | |
| Bloom | ✅ | ✅ | ✅ | ✅ | |
| OPT | ✅ | ✅ | ✅ | ✅ | |
| GPT-Neo | ✅ | ✅ | ✅ | ✅ | |
| GPT-J | ✅ | ✅ | ✅ | ✅ | |
| Deberta | ✅ | | ✅ | ✅ | |
| Deberta-v2 | ✅ | | ✅ | ✅ | |
### Token Classification
| Model | LoRA | Prefix Tuning | P-Tuning | Prompt Tuning |
| --------- | ---- | ---- | ---- | ---- |
| BERT | ✅ | ✅ | | |
| RoBERTa | ✅ | ✅ | | |
| GPT-2 | ✅ | ✅ | | |
| Bloom | ✅ | ✅ | | |
| OPT | ✅ | ✅ | | |
| GPT-Neo | ✅ | ✅ | | |
| GPT-J | ✅ | ✅ | | |
| Deberta | ✅ | | | |
| Deberta-v2 | ✅ | | | |
| Model | LoRA | Prefix Tuning | P-Tuning | Prompt Tuning | IA3 |
| --------- | ---- | ---- | ---- | ---- | ---- |
| BERT | ✅ | ✅ | | | |
| RoBERTa | ✅ | ✅ | | | |
| GPT-2 | ✅ | ✅ | | | |
| Bloom | ✅ | ✅ | | | |
| OPT | ✅ | ✅ | | | |
| GPT-Neo | ✅ | ✅ | | | |
| GPT-J | ✅ | ✅ | | | |
| Deberta | ✅ | | | | |
| Deberta-v2 | ✅ | | | | |
### Text-to-Image Generation
| Model | LoRA | Prefix Tuning | P-Tuning | Prompt Tuning | IA3 |
| --------- | ---- | ---- | ---- | ---- | ---- |
| Stable Diffusion | ✅ | | | | |
### Image Classification
| Model | LoRA | Prefix Tuning | P-Tuning | Prompt Tuning | IA3 |
| --------- | ---- | ---- | ---- | ---- | ---- |
| ViT | ✅ | | | | |
| Swin | ✅ | | | | |
### Image to text (Multi-modal models)
| Model | LoRA | Prefix Tuning | P-Tuning | Prompt Tuning | IA3
| --------- | ---- | ---- | ---- | ---- | ---- |
| Blip-2 | ✅ | | | | |
___Note that we have tested LoRA for [ViT](https://huggingface.co/docs/transformers/model_doc/vit) and [Swin](https://huggingface.co/docs/transformers/model_doc/swin) for fine-tuning on image classification. However, it should be possible to use LoRA for any compatible model [provided](https://huggingface.co/models?pipeline_tag=image-classification&sort=downloads&search=vit) by 🤗 Transformers. Check out the respective
examples to learn more. If you run into problems, please open an issue.___
The same principle applies to our [segmentation models](https://huggingface.co/models?pipeline_tag=image-segmentation&sort=downloads) as well.
### Semantic Segmentation
| Model | LoRA | Prefix Tuning | P-Tuning | Prompt Tuning | IA3 |
| --------- | ---- | ---- | ---- | ---- | ---- |
| SegFormer | ✅ | | | | |
## Caveats:
@ -267,10 +315,10 @@ any GPU memory savings. Please refer issue [[FSDP] FSDP with CPU offload consume
model = accelerator.prepare(model)
```
Example of parameter efficient tuning with `mt0-xxl` base model using 🤗 Accelerate is provided in `~examples/conditional_generation/peft_lora_seq2seq_accelerate_fsdp.py`.
a. First run `accelerate config --config_file fsdp_config.yaml` and answer the questionaire.
Example of parameter efficient tuning with [`mt0-xxl`](https://huggingface.co/bigscience/mt0-xxl) base model using 🤗 Accelerate is provided in `~examples/conditional_generation/peft_lora_seq2seq_accelerate_fsdp.py`.
a. First, run `accelerate config --config_file fsdp_config.yaml` and answer the questionnaire.
Below are the contents of the config file.
```
```yaml
command_file: null
commands: null
compute_environment: LOCAL_MACHINE
@ -300,21 +348,19 @@ any GPU memory savings. Please refer issue [[FSDP] FSDP with CPU offload consume
tpu_zone: null
use_cpu: false
```
b. run the below command to launch example script
```
b. run the below command to launch the example script
```bash
accelerate launch --config_file fsdp_config.yaml examples/peft_lora_seq2seq_accelerate_fsdp.py
```
2. When using `P_TUNING` or `PROMPT_TUNING` with `SEQ_2_SEQ` task, remember to remove the `num_virtual_token` virtual prompt predictions from the left side of the model outputs during evaluations.
3. `P_TUNING` or `PROMPT_TUNING` doesn't support `generate` functionality of transformers bcause `generate` strictly requires `input_ids`/`decoder_input_ids` but
`P_TUNING`/`PROMPT_TUNING` appends soft prompt embeddings to `input_embeds` to create
new `input_embeds` to be given to the model. Therefore, `generate` doesn't support this yet.
2. When using ZeRO3 with zero3_init_flag=True, if you find the gpu memory increase with training steps. we might need to update deepspeed after [deepspeed commit 42858a9891422abc](https://github.com/microsoft/DeepSpeed/commit/42858a9891422abcecaa12c1bd432d28d33eb0d4) . The related issue is [[BUG] Peft Training with Zero.Init() and Zero3 will increase GPU memory every forward step ](https://github.com/microsoft/DeepSpeed/issues/3002)
## Backlog:
1. Explore and possibly integrate `(IA)^3` and `UniPELT`
2. Add tests
3. Add more use cases and examples
- [x] Add tests
- [x] Multi Adapter training and inference support
- [x] Add more use cases and examples
- [x] Integrate`(IA)^3`, `AdaptionPrompt`
- [ ] Explore and possibly integrate methods like `Bottleneck Adapters`, ...
## Citing 🤗 PEFT
@ -323,7 +369,7 @@ If you use 🤗 PEFT in your publication, please cite it by using the following
```bibtex
@Misc{peft,
title = {PEFT: State-of-the-art Parameter-Efficient Fine-Tuning methods},
author = {Sourab Mangrulkar, Sylvain Gugger},
author = {Sourab Mangrulkar and Sylvain Gugger and Lysandre Debut and Younes Belkada and Sayak Paul},
howpublished = {\url{https://github.com/huggingface/peft}},
year = {2022}
}


@ -0,0 +1,51 @@
# Builds GPU docker image of PyTorch
# Uses multi-staged approach to reduce size
# Stage 1
# Use base conda image to reduce time
FROM continuumio/miniconda3:latest AS compile-image
# Specify py version
ENV PYTHON_VERSION=3.8
# Install apt libs - copied from https://github.com/huggingface/accelerate/blob/main/docker/accelerate-gpu/Dockerfile
RUN apt-get update && \
apt-get install -y curl git wget software-properties-common git-lfs && \
apt-get clean && \
rm -rf /var/lib/apt/lists*
# Install audio-related libraries
RUN apt-get update && \
apt install -y ffmpeg
RUN apt install -y libsndfile1-dev
RUN git lfs install
# Create our conda env - copied from https://github.com/huggingface/accelerate/blob/main/docker/accelerate-gpu/Dockerfile
RUN conda create --name peft python=${PYTHON_VERSION} ipython jupyter pip
RUN python3 -m pip install --no-cache-dir --upgrade pip
# Below is copied from https://github.com/huggingface/accelerate/blob/main/docker/accelerate-gpu/Dockerfile
# We don't install pytorch here yet since CUDA isn't available
# instead we use the direct torch wheel
ENV PATH /opt/conda/envs/peft/bin:$PATH
# Activate our bash shell
RUN chsh -s /bin/bash
SHELL ["/bin/bash", "-c"]
# Activate the conda env and install transformers + accelerate from source
RUN source activate peft && \
python3 -m pip install --no-cache-dir \
librosa \
"soundfile>=0.12.1" \
scipy \
git+https://github.com/huggingface/transformers \
git+https://github.com/huggingface/accelerate \
peft[test]@git+https://github.com/huggingface/peft
# Install apt libs
RUN apt-get update && \
apt-get install -y curl git wget && \
apt-get clean && \
rm -rf /var/lib/apt/lists*
RUN echo "source activate peft" >> ~/.profile
# Activate the virtualenv
CMD ["/bin/bash"]


@ -0,0 +1,58 @@
# Builds GPU docker image of PyTorch
# Uses multi-staged approach to reduce size
# Stage 1
# Use base conda image to reduce time
FROM continuumio/miniconda3:latest AS compile-image
# Specify py version
ENV PYTHON_VERSION=3.8
# Install apt libs - copied from https://github.com/huggingface/accelerate/blob/main/docker/accelerate-gpu/Dockerfile
RUN apt-get update && \
apt-get install -y curl git wget software-properties-common git-lfs && \
apt-get clean && \
rm -rf /var/lib/apt/lists*
# Install audio-related libraries
RUN apt-get update && \
apt install -y ffmpeg
RUN apt install -y libsndfile1-dev
RUN git lfs install
# Create our conda env - copied from https://github.com/huggingface/accelerate/blob/main/docker/accelerate-gpu/Dockerfile
RUN conda create --name peft python=${PYTHON_VERSION} ipython jupyter pip
RUN python3 -m pip install --no-cache-dir --upgrade pip
# Below is copied from https://github.com/huggingface/accelerate/blob/main/docker/accelerate-gpu/Dockerfile
# We don't install pytorch here yet since CUDA isn't available
# instead we use the direct torch wheel
ENV PATH /opt/conda/envs/peft/bin:$PATH
# Activate our bash shell
RUN chsh -s /bin/bash
SHELL ["/bin/bash", "-c"]
# Activate the conda env and install transformers + accelerate from source
RUN source activate peft && \
python3 -m pip install --no-cache-dir \
librosa \
"soundfile>=0.12.1" \
scipy \
git+https://github.com/huggingface/transformers \
git+https://github.com/huggingface/accelerate \
peft[test]@git+https://github.com/huggingface/peft
RUN python3 -m pip install --no-cache-dir bitsandbytes
# Stage 2
FROM nvidia/cuda:11.3.1-devel-ubuntu20.04 AS build-image
COPY --from=compile-image /opt/conda /opt/conda
ENV PATH /opt/conda/bin:$PATH
# Install apt libs
RUN apt-get update && \
apt-get install -y curl git wget && \
apt-get clean && \
rm -rf /var/lib/apt/lists*
RUN echo "source activate peft" >> ~/.profile
# Activate the virtualenv
CMD ["/bin/bash"]

docs/Makefile

@ -0,0 +1,19 @@
# Minimal makefile for Sphinx documentation
#
# You can set these variables from the command line.
SPHINXOPTS =
SPHINXBUILD = sphinx-build
SOURCEDIR = source
BUILDDIR = _build
# Put it first so that "make" without argument is like "make help".
help:
@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
.PHONY: help Makefile
# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
%: Makefile
@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)

docs/README.md

@ -0,0 +1,267 @@
<!---
Copyright 2023 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
# Generating the documentation
To generate the documentation, you first have to build it. Several packages are necessary to build the doc,
you can install them with the following command, at the root of the code repository:
```bash
pip install -e ".[docs]"
```
Then you need to install our special tool that builds the documentation:
```bash
pip install git+https://github.com/huggingface/doc-builder
```
---
**NOTE**
You only need to generate the documentation to inspect it locally (if you're planning changes and want to
check how they look before committing for instance). You don't have to commit the built documentation.
---
## Building the documentation
Once you have setup the `doc-builder` and additional packages, you can generate the documentation by
typing the following command:
```bash
doc-builder build peft docs/source/ --build_dir ~/tmp/test-build
```
You can adapt the `--build_dir` to set any temporary folder that you prefer. This command will create it and generate
the MDX files that will be rendered as the documentation on the main website. You can inspect them in your favorite
Markdown editor.
## Previewing the documentation
To preview the docs, first install the `watchdog` module with:
```bash
pip install watchdog
```
Then run the following command:
```bash
doc-builder preview {package_name} {path_to_docs}
```
For example:
```bash
doc-builder preview peft docs/source
```
The docs will be viewable at [http://localhost:3000](http://localhost:3000). You can also preview the docs once you have opened a PR. You will see a bot add a comment to a link where the documentation with your changes lives.
---
**NOTE**
The `preview` command only works with existing doc files. When you add a completely new file, you need to update `_toctree.yml` & restart `preview` command (`ctrl-c` to stop it & call `doc-builder preview ...` again).
---
## Adding a new element to the navigation bar
Accepted files are Markdown (.md or .mdx).
Create a file with its extension and put it in the source directory. You can then link it to the toc-tree by putting
the filename without the extension in the [`_toctree.yml`](https://github.com/huggingface/peft/blob/main/docs/source/_toctree.yml) file.
## Renaming section headers and moving sections
It helps to keep the old links working when renaming the section header and/or moving sections from one document to another. This is because the old links are likely to be used in Issues, Forums, and Social media and it'd make for a much more superior user experience if users reading those months later could still easily navigate to the originally intended information.
Therefore, we simply keep a little map of moved sections at the end of the document where the original section was. The key is to preserve the original anchor.
So if you renamed a section from: "Section A" to "Section B", then you can add at the end of the file:
```
Sections that were moved:
[ <a href="#section-b">Section A</a><a id="section-a"></a> ]
```
and of course, if you moved it to another file, then:
```
Sections that were moved:
[ <a href="../new-file#section-b">Section A</a><a id="section-a"></a> ]
```
Use the relative style to link to the new file so that the versioned docs continue to work.
## Writing Documentation - Specification
The `huggingface/peft` documentation follows the
[Google documentation](https://sphinxcontrib-napoleon.readthedocs.io/en/latest/example_google.html) style for docstrings,
although we can write them directly in Markdown.
### Adding a new tutorial
Adding a new tutorial or section is done in two steps:
- Add a new file under `./source`. This file can either be ReStructuredText (.rst) or Markdown (.md).
- Link that file in `./source/_toctree.yml` on the correct toc-tree.
Make sure to put your new file under the proper section. It's unlikely to go in the first section (*Get Started*), so
depending on the intended targets (beginners, more advanced users, or researchers) it should go in sections two, three, or
four.
### Writing source documentation
Values that should be put in `code` should either be surrounded by backticks: \`like so\`. Note that argument names
and objects like True, None, or any strings should usually be put in `code`.
When mentioning a class, function, or method, it is recommended to use our syntax for internal links so that our tool
adds a link to its documentation with this syntax: \[\`XXXClass\`\] or \[\`function\`\]. This requires the class or
function to be in the main package.
If you want to create a link to some internal class or function, you need to
provide its path. For instance: \[\`utils.gather\`\]. This will be converted into a link with
`utils.gather` in the description. To get rid of the path and only keep the name of the object you are
linking to in the description, add a ~: \[\`~utils.gather\`\] will generate a link with `gather` in the description.
The same works for methods so you can either use \[\`XXXClass.method\`\] or \[~\`XXXClass.method\`\].
#### Defining arguments in a method
Arguments should be defined with the `Args:` (or `Arguments:` or `Parameters:`) prefix, followed by a line return and
an indentation. The argument should be followed by its type, with its shape if it is a tensor, a colon, and its
description:
```
Args:
n_layers (`int`): The number of layers of the model.
```
If the description is too long to fit in one line (more than 119 characters in total), another indentation is necessary
before writing the description after the argument.
Finally, to maintain uniformity if any *one* description is too long to fit on one line, the
rest of the parameters should follow suit and have an indention before their description.
Here's an example showcasing everything so far:
```
Args:
gradient_accumulation_steps (`int`, *optional*, default to 1):
The number of steps that should pass before gradients are accumulated. A number > 1 should be combined with `Accelerator.accumulate`.
cpu (`bool`, *optional*):
Whether or not to force the script to execute on CPU. Will ignore GPU available if set to `True` and force the execution on one process only.
```
For optional arguments or arguments with defaults we follow the following syntax: imagine we have a function with the
following signature:
```
def my_function(x: str = None, a: float = 1):
```
then its documentation should look like this:
```
Args:
x (`str`, *optional*):
This argument controls ... and has a description longer than 119 chars.
a (`float`, *optional*, defaults to 1):
This argument is used to ... and has a description longer than 119 chars.
```
Note that we always omit the "defaults to \`None\`" when None is the default for any argument. Also note that even
if the first line describing your argument type and its default gets long, you can't break it on several lines. You can
however write as many lines as you want in the indented description (see the example above with `input_ids`).
#### Writing a multi-line code block
Multi-line code blocks can be useful for displaying examples. They are done between two lines of three backticks as usual in Markdown:
````
```python
# first line of code
# second line
# etc
```
````
#### Writing a return block
The return block should be introduced with the `Returns:` prefix, followed by a line return and an indentation.
The first line should be the type of the return, followed by a line return. No need to indent further for the elements
building the return.
Here's an example of a single value return:
```
Returns:
`List[int]`: A list of integers in the range [0, 1] --- 1 for a special token, 0 for a sequence token.
```
Here's an example of a tuple return, comprising several objects:
```
Returns:
    `tuple(torch.FloatTensor)` comprising various elements depending on the configuration ([`BertConfig`]) and inputs:
    - **loss** (*optional*, returned when `masked_lm_labels` is provided) `torch.FloatTensor` of shape `(1,)` --
      Total loss is the sum of the masked language modeling loss and the next sequence prediction (classification) loss.
    - **prediction_scores** (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`) --
      Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
```
## Styling the docstring
We have an automatic script, run with the `make style` command, that will make sure that:
- the docstrings fully take advantage of the line width
- all code examples are formatted using black, like the code of the Transformers library
This script may have some weird failures if you made a syntax mistake or if you uncover a bug. Therefore, it's
recommended to commit your changes before running `make style`, so you can revert the changes done by that script
easily.
## Writing documentation examples
The syntax for Example docstrings can look as follows:
````
Example:

    ```python
    >>> import time
    >>> from accelerate import Accelerator

    >>> accelerator = Accelerator()
    >>> if accelerator.is_main_process:
    ...     time.sleep(2)
    ... else:
    ...     print("I'm waiting for the main process to finish its sleep...")
    >>> accelerator.wait_for_everyone()
    >>> # Should print on every process at the same time
    >>> print("Everyone is here")
    ```
````
The docstring should give a minimal, clear example of how the respective function
is to be used in inference and also include the expected (ideally sensible)
output.
Often, readers will try out the example before even going through the function
or class definitions. Therefore, it is of utmost importance that the example
works as expected.

docs/source/_config.py Normal file

@ -0,0 +1,7 @@
# docstyle-ignore
INSTALL_CONTENT = """
# PEFT installation
! pip install peft accelerate transformers
# To install from source instead of the last release, comment the command above and uncomment the following one.
# ! pip install git+https://github.com/huggingface/peft.git
"""

docs/source/_toctree.yml Normal file

@ -0,0 +1,59 @@
- title: Get started
  sections:
  - local: index
    title: 🤗 PEFT
  - local: quicktour
    title: Quicktour
  - local: install
    title: Installation
- title: Task guides
  sections:
  - local: task_guides/image_classification_lora
    title: Image classification using LoRA
  - local: task_guides/seq2seq-prefix-tuning
    title: Prefix tuning for conditional generation
  - local: task_guides/clm-prompt-tuning
    title: Prompt tuning for causal language modeling
  - local: task_guides/semantic_segmentation_lora
    title: Semantic segmentation using LoRA
  - local: task_guides/ptuning-seq-classification
    title: P-tuning for sequence classification
  - local: task_guides/dreambooth_lora
    title: Dreambooth fine-tuning with LoRA
  - local: task_guides/token-classification-lora
    title: LoRA for token classification
  - local: task_guides/int8-asr
    title: int8 training for automatic speech recognition
  - local: task_guides/semantic-similarity-lora
    title: Semantic similarity with LoRA
- title: Developer guides
  sections:
  - local: developer_guides/custom_models
    title: Working with custom models
- title: 🤗 Accelerate integrations
  sections:
  - local: accelerate/deepspeed-zero3-offload
    title: DeepSpeed
  - local: accelerate/fsdp
    title: Fully Sharded Data Parallel
- title: Conceptual guides
  sections:
  - local: conceptual_guides/lora
    title: LoRA
  - local: conceptual_guides/prompting
    title: Prompting
  - local: conceptual_guides/ia3
    title: IA3
- title: Reference
  sections:
  - local: package_reference/peft_model
    title: PEFT model
  - local: package_reference/config
    title: Configuration
  - local: package_reference/tuners
    title: Tuners


@ -0,0 +1,163 @@
# DeepSpeed
[DeepSpeed](https://www.deepspeed.ai/) is a library designed for speed and scale for distributed training of large models with billions of parameters. At its core is the Zero Redundancy Optimizer (ZeRO) that shards optimizer states (ZeRO-1), gradients (ZeRO-2), and parameters (ZeRO-3) across data parallel processes. This drastically reduces memory usage, allowing you to scale your training to billion parameter models. To unlock even more memory efficiency, ZeRO-Offload reduces GPU compute and memory by leveraging CPU resources during optimization.
Both of these features are supported in 🤗 Accelerate, and you can use them with 🤗 PEFT. This guide will help you learn how to use our DeepSpeed [training script](https://github.com/huggingface/peft/blob/main/examples/conditional_generation/peft_lora_seq2seq_accelerate_ds_zero3_offload.py). You'll configure the script to train a large model for conditional generation with ZeRO-3 and ZeRO-Offload.
<Tip>
💡 To help you get started, check out our example training scripts for [causal language modeling](https://github.com/huggingface/peft/blob/main/examples/causal_language_modeling/peft_lora_clm_accelerate_ds_zero3_offload.py) and [conditional generation](https://github.com/huggingface/peft/blob/main/examples/conditional_generation/peft_lora_seq2seq_accelerate_ds_zero3_offload.py). You can adapt these scripts for your own applications or even use them out of the box if your task is similar to the one in the scripts.
</Tip>
## Configuration
Start by running the following command to [create a DeepSpeed configuration file](https://huggingface.co/docs/accelerate/quicktour#launching-your-distributed-script) with 🤗 Accelerate. The `--config_file` flag allows you to save the configuration file to a specific location, otherwise it is saved as a `default_config.yaml` file in the 🤗 Accelerate cache.
The configuration file is used to set the default options when you launch the training script.
```bash
accelerate config --config_file ds_zero3_cpu.yaml
```
You'll be asked a few questions about your setup, and you'll configure the following arguments. In this example, you'll use ZeRO-3 and ZeRO-Offload, so make sure you pick those options.
```bash
`zero_stage`: [0] Disabled, [1] optimizer state partitioning, [2] optimizer+gradient state partitioning and [3] optimizer+gradient+parameter partitioning
`gradient_accumulation_steps`: Number of training steps to accumulate gradients before averaging and applying them.
`gradient_clipping`: Enable gradient clipping with value.
`offload_optimizer_device`: [none] Disable optimizer offloading, [cpu] offload optimizer to CPU, [nvme] offload optimizer to NVMe SSD. Only applicable with ZeRO >= Stage-2.
`offload_param_device`: [none] Disable parameter offloading, [cpu] offload parameters to CPU, [nvme] offload parameters to NVMe SSD. Only applicable with ZeRO Stage-3.
`zero3_init_flag`: Decides whether to enable `deepspeed.zero.Init` for constructing massive models. Only applicable with ZeRO Stage-3.
`zero3_save_16bit_model`: Decides whether to save 16-bit model weights when using ZeRO Stage-3.
`mixed_precision`: `no` for FP32 training, `fp16` for FP16 mixed-precision training and `bf16` for BF16 mixed-precision training.
```
An example [configuration file](https://github.com/huggingface/peft/blob/main/examples/conditional_generation/accelerate_ds_zero3_cpu_offload_config.yaml) might look like the following. The most important thing to notice is that `zero_stage` is set to `3`, and `offload_optimizer_device` and `offload_param_device` are set to `cpu`.
```yml
compute_environment: LOCAL_MACHINE
deepspeed_config:
  gradient_accumulation_steps: 1
  gradient_clipping: 1.0
  offload_optimizer_device: cpu
  offload_param_device: cpu
  zero3_init_flag: true
  zero3_save_16bit_model: true
  zero_stage: 3
distributed_type: DEEPSPEED
downcast_bf16: 'no'
dynamo_backend: 'NO'
fsdp_config: {}
machine_rank: 0
main_training_function: main
megatron_lm_config: {}
mixed_precision: 'no'
num_machines: 1
num_processes: 1
rdzv_backend: static
same_network: true
use_cpu: false
```
## The important parts
Let's dive a little deeper into the script so you can see what's going on, and understand how it works.
Within the [`main`](https://github.com/huggingface/peft/blob/2822398fbe896f25d4dac5e468624dc5fd65a51b/examples/conditional_generation/peft_lora_seq2seq_accelerate_ds_zero3_offload.py#L103) function, the script creates an [`~accelerate.Accelerator`] class to initialize all the necessary requirements for distributed training.
<Tip>
💡 Feel free to change the model and dataset inside the `main` function. If your dataset format is different from the one in the script, you may also need to write your own preprocessing function.
</Tip>
The script also creates a configuration for the 🤗 PEFT method you're using, which in this case, is LoRA. The [`LoraConfig`] specifies the task type and important parameters such as the dimension of the low-rank matrices, the matrices scaling factor, and the dropout probability of the LoRA layers. If you want to use a different 🤗 PEFT method, make sure you replace `LoraConfig` with the appropriate [class](../package_reference/tuners).
```diff
def main():
+     accelerator = Accelerator()
      model_name_or_path = "facebook/bart-large"
      dataset_name = "twitter_complaints"
+     peft_config = LoraConfig(
          task_type=TaskType.SEQ_2_SEQ_LM, inference_mode=False, r=8, lora_alpha=32, lora_dropout=0.1
      )
```
Throughout the script, you'll see the [`~accelerate.Accelerator.main_process_first`] and [`~accelerate.Accelerator.wait_for_everyone`] functions which help control and synchronize when processes are executed.
The [`get_peft_model`] function takes a base model and the [`peft_config`] you prepared earlier to create a [`PeftModel`]:
```diff
model = AutoModelForSeq2SeqLM.from_pretrained(model_name_or_path)
+ model = get_peft_model(model, peft_config)
```
Pass all the relevant training objects to 🤗 Accelerate's [`~accelerate.Accelerator.prepare`] which makes sure everything is ready for training:
```py
model, train_dataloader, eval_dataloader, test_dataloader, optimizer, lr_scheduler = accelerator.prepare(
    model, train_dataloader, eval_dataloader, test_dataloader, optimizer, lr_scheduler
)
```
The next bit of code checks whether the DeepSpeed plugin is used in the `Accelerator`, and if the plugin exists, then the `Accelerator` uses ZeRO-3 as specified in the configuration file:
```py
is_ds_zero_3 = False
if getattr(accelerator.state, "deepspeed_plugin", None):
    is_ds_zero_3 = accelerator.state.deepspeed_plugin.zero_stage == 3
```
Inside the training loop, the usual `loss.backward()` is replaced by 🤗 Accelerate's [`~accelerate.Accelerator.backward`] which uses the correct `backward()` method based on your configuration:
```diff
  for epoch in range(num_epochs):
      with TorchTracemalloc() as tracemalloc:
          model.train()
          total_loss = 0
          for step, batch in enumerate(tqdm(train_dataloader)):
              outputs = model(**batch)
              loss = outputs.loss
              total_loss += loss.detach().float()
+             accelerator.backward(loss)
              optimizer.step()
              lr_scheduler.step()
              optimizer.zero_grad()
```
That is all! The rest of the script handles the training loop and evaluation, and even pushes the trained model to the Hub for you.
## Train
Run the following command to launch the training script. Earlier, you saved the configuration file to `ds_zero3_cpu.yaml`, so you'll need to pass the path to the launcher with the `--config_file` argument like this:
```bash
accelerate launch --config_file ds_zero3_cpu.yaml examples/peft_lora_seq2seq_accelerate_ds_zero3_offload.py
```
You'll see some output logs that track memory usage during training, and once it's completed, the script returns the accuracy and compares the predictions to the labels:
```bash
GPU Memory before entering the train : 1916
GPU Memory consumed at the end of the train (end-begin): 66
GPU Peak Memory consumed during the train (max-begin): 7488
GPU Total Peak Memory consumed during the train (max): 9404
CPU Memory before entering the train : 19411
CPU Memory consumed at the end of the train (end-begin): 0
CPU Peak Memory consumed during the train (max-begin): 0
CPU Total Peak Memory consumed during the train (max): 19411
epoch=4: train_ppl=tensor(1.0705, device='cuda:0') train_epoch_loss=tensor(0.0681, device='cuda:0')
100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:27<00:00, 3.92s/it]
GPU Memory before entering the eval : 1982
GPU Memory consumed at the end of the eval (end-begin): -66
GPU Peak Memory consumed during the eval (max-begin): 672
GPU Total Peak Memory consumed during the eval (max): 2654
CPU Memory before entering the eval : 19411
CPU Memory consumed at the end of the eval (end-begin): 0
CPU Peak Memory consumed during the eval (max-begin): 0
CPU Total Peak Memory consumed during the eval (max): 19411
accuracy=100.0
eval_preds[:10]=['no complaint', 'no complaint', 'complaint', 'complaint', 'no complaint', 'no complaint', 'no complaint', 'complaint', 'complaint', 'no complaint']
dataset['train'][label_column][:10]=['no complaint', 'no complaint', 'complaint', 'complaint', 'no complaint', 'no complaint', 'no complaint', 'complaint', 'complaint', 'no complaint']
```


@ -0,0 +1,124 @@
# Fully Sharded Data Parallel
[Fully sharded data parallel](https://pytorch.org/docs/stable/fsdp.html) (FSDP) is developed for distributed training of large pretrained models up to 1T parameters. FSDP achieves this by sharding the model parameters, gradients, and optimizer states across data parallel processes and it can also offload sharded model parameters to a CPU. The memory efficiency afforded by FSDP allows you to scale training to larger batch or model sizes.
<Tip warning={true}>
Currently, FSDP does not confer any reduction in GPU memory usage and FSDP with CPU offload actually consumes 1.65x more GPU memory during training. You can track this PyTorch [issue](https://github.com/pytorch/pytorch/issues/91165) for any updates.
</Tip>
FSDP is supported in 🤗 Accelerate, and you can use it with 🤗 PEFT. This guide will help you learn how to use our FSDP [training script](https://github.com/huggingface/peft/blob/main/examples/conditional_generation/peft_lora_seq2seq_accelerate_fsdp.py). You'll configure the script to train a large model for conditional generation.
## Configuration
Begin by running the following command to [create a FSDP configuration file](https://huggingface.co/docs/accelerate/main/en/usage_guides/fsdp) with 🤗 Accelerate. Use the `--config_file` flag to save the configuration file to a specific location, otherwise it is saved as a `default_config.yaml` file in the 🤗 Accelerate cache.
The configuration file is used to set the default options when you launch the training script.
```bash
accelerate config --config_file fsdp_config.yaml
```
You'll be asked a few questions about your setup, and you'll configure the following arguments. For this example, make sure you fully shard the model parameters, gradients, and optimizer states, leverage the CPU for offloading, and wrap model layers based on the Transformer layer class name.
```bash
`Sharding Strategy`: [1] FULL_SHARD (shards optimizer states, gradients and parameters), [2] SHARD_GRAD_OP (shards optimizer states and gradients), [3] NO_SHARD
`Offload Params`: Decides whether to offload parameters and gradients to CPU
`Auto Wrap Policy`: [1] TRANSFORMER_BASED_WRAP, [2] SIZE_BASED_WRAP, [3] NO_WRAP
`Transformer Layer Class to Wrap`: When using `TRANSFORMER_BASED_WRAP`, the user specifies a comma-separated string of transformer layer class names (case-sensitive) to wrap, e.g.,
`BertLayer`, `GPTJBlock`, `T5Block`, `BertLayer,BertEmbeddings,BertSelfOutput`...
`Min Num Params`: minimum number of parameters when using `SIZE_BASED_WRAP`
`Backward Prefetch`: [1] BACKWARD_PRE, [2] BACKWARD_POST, [3] NO_PREFETCH
`State Dict Type`: [1] FULL_STATE_DICT, [2] LOCAL_STATE_DICT, [3] SHARDED_STATE_DICT
```
For example, your FSDP configuration file may look like the following:
```yaml
command_file: null
commands: null
compute_environment: LOCAL_MACHINE
deepspeed_config: {}
distributed_type: FSDP
downcast_bf16: 'no'
dynamo_backend: 'NO'
fsdp_config:
  fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
  fsdp_backward_prefetch_policy: BACKWARD_PRE
  fsdp_offload_params: true
  fsdp_sharding_strategy: 1
  fsdp_state_dict_type: FULL_STATE_DICT
  fsdp_transformer_layer_cls_to_wrap: T5Block
gpu_ids: null
machine_rank: 0
main_process_ip: null
main_process_port: null
main_training_function: main
megatron_lm_config: {}
mixed_precision: 'no'
num_machines: 1
num_processes: 2
rdzv_backend: static
same_network: true
tpu_name: null
tpu_zone: null
use_cpu: false
```
## The important parts
Let's dig a bit deeper into the training script to understand how it works.
The [`main()`](https://github.com/huggingface/peft/blob/2822398fbe896f25d4dac5e468624dc5fd65a51b/examples/conditional_generation/peft_lora_seq2seq_accelerate_fsdp.py#L14) function begins with initializing an [`~accelerate.Accelerator`] class which handles everything for distributed training, such as automatically detecting your training environment.
<Tip>
💡 Feel free to change the model and dataset inside the `main` function. If your dataset format is different from the one in the script, you may also need to write your own preprocessing function.
</Tip>
The script also creates a configuration corresponding to the 🤗 PEFT method you're using. For LoRA, you'll use [`LoraConfig`] to specify the task type, and several other important parameters such as the dimension of the low-rank matrices, the matrices scaling factor, and the dropout probability of the LoRA layers. If you want to use a different 🤗 PEFT method, replace `LoraConfig` with the appropriate [class](../package_reference/tuners).
Next, the script wraps the base model and `peft_config` with the [`get_peft_model`] function to create a [`PeftModel`].
```diff
def main():
+     accelerator = Accelerator()
      model_name_or_path = "t5-base"
      base_path = "temp/data/FinancialPhraseBank-v1.0"
+     peft_config = LoraConfig(
          task_type=TaskType.SEQ_2_SEQ_LM, inference_mode=False, r=8, lora_alpha=32, lora_dropout=0.1
      )
      model = AutoModelForSeq2SeqLM.from_pretrained(model_name_or_path)
+     model = get_peft_model(model, peft_config)
```
Throughout the script, you'll see the [`~accelerate.Accelerator.main_process_first`] and [`~accelerate.Accelerator.wait_for_everyone`] functions which help control and synchronize when processes are executed.
After your dataset is prepared and all the necessary training components are loaded, the script checks if you're using the `fsdp_plugin`. PyTorch offers two ways of wrapping model layers in FSDP, automatically or manually. The simplest method is to allow FSDP to automatically and recursively wrap model layers without changing any other code. You can choose to wrap the model layers based on the layer name or on the size (number of parameters). The FSDP configuration file created earlier uses the `TRANSFORMER_BASED_WRAP` option to wrap the `T5Block` layer.
```py
if getattr(accelerator.state, "fsdp_plugin", None) is not None:
    accelerator.state.fsdp_plugin.auto_wrap_policy = fsdp_auto_wrap_policy(model)
```
Next, use 🤗 Accelerate's [`~accelerate.Accelerator.prepare`] function to prepare the model, datasets, optimizer, and scheduler for training.
```py
model, train_dataloader, eval_dataloader, optimizer, lr_scheduler = accelerator.prepare(
    model, train_dataloader, eval_dataloader, optimizer, lr_scheduler
)
```
From here, the remainder of the script handles the training loop, evaluation, and sharing your model to the Hub.
## Train
Run the following command to launch the training script. Earlier, you saved the configuration file to `fsdp_config.yaml`, so you'll need to pass the path to the launcher with the `--config_file` argument like this:
```bash
accelerate launch --config_file fsdp_config.yaml examples/peft_lora_seq2seq_accelerate_fsdp.py
```
Once training is complete, the script returns the accuracy and compares the predictions to the labels.


@ -0,0 +1,52 @@
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
-->
# IA3
This conceptual guide gives a brief overview of [IA3](https://arxiv.org/abs/2205.05638), a parameter-efficient fine-tuning technique that is
intended to improve over [LoRA](./lora).
To make fine-tuning more efficient, IA3 (Infused Adapter by Inhibiting and Amplifying Inner Activations)
rescales inner activations with learned vectors. These learned vectors are injected in the attention and feedforward modules
in a typical transformer-based architecture. These learned vectors are the only trainable parameters during fine-tuning, and thus the original
weights remain frozen. Dealing with learned vectors (as opposed to learned low-rank updates to a weight matrix like LoRA)
keeps the number of trainable parameters much smaller.
Being similar to LoRA, IA3 carries many of the same advantages:
* IA3 makes fine-tuning more efficient by drastically reducing the number of trainable parameters. (For T0, an IA3 model only has about 0.01% trainable parameters, while even LoRA has > 0.1%)
* The original pre-trained weights are kept frozen, which means you can have multiple lightweight and portable IA3 models for various downstream tasks built on top of them.
* Performance of models fine-tuned using IA3 is comparable to the performance of fully fine-tuned models.
* IA3 does not add any inference latency because adapter weights can be merged with the base model.
In principle, IA3 can be applied to any subset of weight matrices in a neural network to reduce the number of trainable
parameters. Following the authors' implementation, IA3 weights are added to the key, value and feedforward layers
of a Transformer model. Given the target layers for injecting IA3 parameters, the number of trainable parameters
can be determined based on the size of the weight matrices.
## Common IA3 parameters in PEFT
As with other methods supported by PEFT, to fine-tune a model using IA3, you need to:
1. Instantiate a base model.
2. Create a configuration (`IA3Config`) where you define IA3-specific parameters.
3. Wrap the base model with `get_peft_model()` to get a trainable `PeftModel`.
4. Train the `PeftModel` as you normally would train the base model.
`IA3Config` allows you to control how IA3 is applied to the base model through the following parameters:
- `target_modules`: The modules (for example, attention blocks) to apply the IA3 vectors.
- `feedforward_modules`: The list of modules to be treated as feedforward layers in `target_modules`. While learned vectors are multiplied with
the output activation for attention blocks, the vectors are multiplied with the input for classic feedforward layers.
- `modules_to_save`: List of modules apart from IA3 layers to be set as trainable and saved in the final checkpoint. These typically include the model's custom head that is randomly initialized for the fine-tuning task.
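For illustration, here is a minimal sketch of an IA3 setup. The base model and the target/feedforward module names below are assumptions for a T5-style model and must match modules that actually exist in your architecture:
```python
from transformers import AutoModelForSeq2SeqLM
from peft import IA3Config, TaskType, get_peft_model

# Assumed example: a T5 model with attention key/value projections "k"/"v" and feedforward output "wo".
model = AutoModelForSeq2SeqLM.from_pretrained("t5-base")
peft_config = IA3Config(
    task_type=TaskType.SEQ_2_SEQ_LM,
    target_modules=["k", "v", "wo"],
    feedforward_modules=["wo"],
)
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()
```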


@ -0,0 +1,89 @@
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
-->
# LoRA
This conceptual guide gives a brief overview of [LoRA](https://arxiv.org/abs/2106.09685), a technique that accelerates
the fine-tuning of large models while consuming less memory.
To make fine-tuning more efficient, LoRA's approach is to represent the weight updates with two smaller
matrices (called **update matrices**) through low-rank decomposition. These new matrices can be trained to adapt to the
new data while keeping the overall number of changes low. The original weight matrix remains frozen and doesn't receive
any further adjustments. To produce the final results, both the original and the adapted weights are combined.
This approach has a number of advantages:
* LoRA makes fine-tuning more efficient by drastically reducing the number of trainable parameters.
* The original pre-trained weights are kept frozen, which means you can have multiple lightweight and portable LoRA models for various downstream tasks built on top of them.
* LoRA is orthogonal to many other parameter-efficient methods and can be combined with many of them.
* Performance of models fine-tuned using LoRA is comparable to the performance of fully fine-tuned models.
* LoRA does not add any inference latency because adapter weights can be merged with the base model.
In principle, LoRA can be applied to any subset of weight matrices in a neural network to reduce the number of trainable
parameters. However, for simplicity and further parameter efficiency, in Transformer models LoRA is typically applied to
attention blocks only. The resulting number of trainable parameters in a LoRA model depends on the size of the low-rank
update matrices, which is determined mainly by the rank `r` and the shape of the original weight matrix.
## Merge LoRA weights into the base model
While LoRA is significantly smaller and faster to train, you may encounter latency issues during inference due to separately loading the base model and the LoRA model. To eliminate latency, use the [`~LoraModel.merge_and_unload`] function to merge the adapter weights with the base model which allows you to effectively use the newly merged model as a standalone model.
<div class="flex justify-center">
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/peft/lora_diagram.png"/>
</div>
This works because during training, the smaller weight matrices (*A* and *B* in the diagram above) are kept separate. Once training is complete, they can be folded into the original weight matrix, producing a single merged matrix whose outputs are identical to those of the base model plus the adapter.
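As a minimal sketch (the adapter id here is only an example), merging and unloading could look like this:
```python
from transformers import AutoModelForCausalLM
from peft import PeftModel

base_model = AutoModelForCausalLM.from_pretrained("facebook/opt-350m")
peft_model = PeftModel.from_pretrained(base_model, "ybelkada/opt-350m-lora")  # example LoRA adapter

# Fold the LoRA weights into the base weights and drop the adapter layers.
merged_model = peft_model.merge_and_unload()
```
The returned model can then be saved or used for inference like any regular 🤗 Transformers model.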
## Utils for LoRA
Use [`~LoraModel.merge_adapter`] to merge the LoRA layers into the base model while retaining the PeftModel.
This will help in later unmerging, deleting, or loading different adapters, and so on.
Use [`~LoraModel.unmerge_adapter`] to unmerge the LoRA layers from the base model while retaining the PeftModel.
This will help in later merging, deleting, or loading different adapters, and so on.
Use [`~LoraModel.unload`] to get back the base model without merging the active LoRA modules.
This is useful when you want to reset the model to its original pretrained state, for example in Stable Diffusion WebUI when a user wants to run inference with the base model after trying out some LoRAs.
Use [`~LoraModel.delete_adapter`] to delete an existing adapter.
Use [`~LoraModel.add_weighted_adapter`] to combine multiple LoRAs into a new adapter based on a user-provided weighting scheme.
## Common LoRA parameters in PEFT
As with other methods supported by PEFT, to fine-tune a model using LoRA, you need to:
1. Instantiate a base model.
2. Create a configuration (`LoraConfig`) where you define LoRA-specific parameters.
3. Wrap the base model with `get_peft_model()` to get a trainable `PeftModel`.
4. Train the `PeftModel` as you normally would train the base model.
`LoraConfig` allows you to control how LoRA is applied to the base model through the following parameters:
- `r`: the rank of the update matrices, expressed in `int`. Lower rank results in smaller update matrices with fewer trainable parameters.
- `target_modules`: The modules (for example, attention blocks) to apply the LoRA update matrices.
- `lora_alpha`: LoRA scaling factor.
- `bias`: Specifies if the `bias` parameters should be trained. Can be `'none'`, `'all'` or `'lora_only'`.
- `modules_to_save`: List of modules apart from LoRA layers to be set as trainable and saved in the final checkpoint. These typically include the model's custom head that is randomly initialized for the fine-tuning task.
- `layers_to_transform`: List of layers to be transformed by LoRA. If not specified, all layers in `target_modules` are transformed.
- `layers_pattern`: Pattern to match layer names in `target_modules`, if `layers_to_transform` is specified. By default, `PeftModel` will look for common layer patterns (`layers`, `h`, `blocks`, etc.); use this option for exotic and custom models.
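Putting these steps together, a minimal sketch might look as follows; the base model and `target_modules` are illustrative assumptions that depend on the architecture you are fine-tuning:
```python
from transformers import AutoModelForSequenceClassification
from peft import LoraConfig, TaskType, get_peft_model

model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=2)
peft_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    r=8,
    lora_alpha=16,
    lora_dropout=0.1,
    target_modules=["query", "value"],  # illustrative; pick modules that exist in your base model
)
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()
```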
## LoRA examples
For an example of LoRA method application to various downstream tasks, please refer to the following guides:
* [Image classification using LoRA](../task_guides/image_classification_lora)
* [Semantic segmentation](../task_guides/semantic_segmentation_lora)
While the original paper focuses on language models, the technique can be applied to any dense layers in deep learning
models. As such, you can leverage this technique with diffusion models. See the [Dreambooth fine-tuning with LoRA](../task_guides/dreambooth_lora) task guide for an example.


@ -0,0 +1,56 @@
# Prompting
Training large pretrained language models is very time-consuming and compute-intensive. As they continue to grow in size, there is increasing interest in more efficient training methods such as *prompting*. Prompting primes a frozen pretrained model for a specific downstream task by including a text prompt that describes the task or even demonstrates an example of the task. With prompting, you can avoid fully training a separate model for each downstream task, and use the same frozen pretrained model instead. This is a lot easier because you can use the same model for several different tasks, and it is significantly more efficient to train and store a smaller set of prompt parameters than to train all the model's parameters.
There are two categories of prompting methods:
- hard prompts are manually handcrafted text prompts with discrete input tokens; the downside is that it requires a lot of effort to create a good prompt
- soft prompts are learnable tensors concatenated with the input embeddings that can be optimized to a dataset; the downside is that they aren't human readable because you aren't matching these "virtual tokens" to the embeddings of a real word
This conceptual guide provides a brief overview of the soft prompt methods included in 🤗 PEFT: prompt tuning, prefix tuning, and P-tuning.
## Prompt tuning
<div class="flex justify-center">
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/peft/prompt-tuning.png"/>
</div>
<small>Only train and store a significantly smaller set of task-specific prompt parameters <a href="https://arxiv.org/abs/2104.08691">(image source)</a>.</small>
Prompt tuning was developed for text classification tasks on T5 models, and all downstream tasks are cast as a text generation task. For example, sequence classification usually assigns a single class label to a sequence of text. By casting it as a text generation task, the tokens that make up the class label are *generated*. Prompts are added to the input as a series of tokens. Typically, the model parameters are fixed, which means the prompt tokens are also fixed by the model parameters.
The key idea behind prompt tuning is that prompt tokens have their own parameters that are updated independently. This means you can keep the pretrained model's parameters frozen, and only update the gradients of the prompt token embeddings. The results are comparable to the traditional method of training the entire model, and prompt tuning performance scales as model size increases.
Take a look at [Prompt tuning for causal language modeling](../task_guides/clm-prompt-tuning) for a step-by-step guide on how to train a model with prompt tuning.
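As a rough sketch, a prompt tuning configuration in 🤗 PEFT might look like the following; the base model and the initialization text are placeholders for your own task:
```python
from transformers import AutoModelForCausalLM
from peft import PromptTuningConfig, PromptTuningInit, TaskType, get_peft_model

model = AutoModelForCausalLM.from_pretrained("bigscience/bloomz-560m")
peft_config = PromptTuningConfig(
    task_type=TaskType.CAUSAL_LM,
    num_virtual_tokens=8,
    prompt_tuning_init=PromptTuningInit.TEXT,
    prompt_tuning_init_text="Classify if the tweet is a complaint or not:",  # placeholder task description
    tokenizer_name_or_path="bigscience/bloomz-560m",
)
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()
```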
## Prefix tuning
<div class="flex justify-center">
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/peft/prefix-tuning.png"/>
</div>
<small>Optimize the prefix parameters for each task <a href="https://arxiv.org/abs/2101.00190">(image source)</a>.</small>
Prefix tuning was designed for natural language generation (NLG) tasks on GPT models. It is very similar to prompt tuning; prefix tuning also prepends a sequence of task-specific vectors to the input that can be trained and updated while keeping the rest of the pretrained model's parameters frozen.
The main difference is that the prefix parameters are inserted in **all** of the model layers, whereas prompt tuning only adds the prompt parameters to the model input embeddings. The prefix parameters are also optimized by a separate feed-forward network (FFN) instead of being trained directly, because training the soft prompts directly causes instability and hurts performance. The FFN is discarded after updating the soft prompts.
As a result, the authors found that prefix tuning demonstrates comparable performance to fully finetuning a model, despite having 1000x fewer parameters, and it performs even better in low-data settings.
Take a look at [Prefix tuning for conditional generation](../task_guides/seq2seq-prefix-tuning) for a step-by-step guide on how to train a model with prefix tuning.
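A minimal sketch of a prefix tuning setup, assuming a T5-style sequence-to-sequence base model:
```python
from transformers import AutoModelForSeq2SeqLM
from peft import PrefixTuningConfig, TaskType, get_peft_model

model = AutoModelForSeq2SeqLM.from_pretrained("t5-base")
peft_config = PrefixTuningConfig(task_type=TaskType.SEQ_2_SEQ_LM, num_virtual_tokens=20)
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()
```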
## P-tuning
<div class="flex justify-center">
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/peft/p-tuning.png"/>
</div>
<small>Prompt tokens can be inserted anywhere in the input sequence, and they are optimized by a prompt encoder <a href="https://arxiv.org/abs/2103.10385">(image source)</a>.</small>
P-tuning is designed for natural language understanding (NLU) tasks and all language models.
It is another variation of a soft prompt method; P-tuning also adds a trainable embedding tensor that can be optimized to find better prompts, and it uses a prompt encoder (a bidirectional long short-term memory network, or LSTM) to optimize the prompt parameters. Unlike prefix tuning though:
- the prompt tokens can be inserted anywhere in the input sequence, and it isn't restricted to only the beginning
- the prompt tokens are only added to the input instead of adding them to every layer of the model
- introducing *anchor* tokens can improve performance because they indicate characteristics of a component in the input sequence
The results suggest that P-tuning is more efficient than manually crafting prompts, and it enables GPT-like models to compete with BERT-like models on NLU tasks.
Take a look at [P-tuning for sequence classification](../task_guides/ptuning-seq-classification) for a step-by-step guide on how to train a model with P-tuning.
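A minimal sketch of a P-tuning setup for sequence classification; the base model and the encoder size are illustrative assumptions:
```python
from transformers import AutoModelForSequenceClassification
from peft import PromptEncoderConfig, TaskType, get_peft_model

model = AutoModelForSequenceClassification.from_pretrained("roberta-base", num_labels=2)
peft_config = PromptEncoderConfig(task_type=TaskType.SEQ_CLS, num_virtual_tokens=20, encoder_hidden_size=128)
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()
```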


@ -0,0 +1,197 @@
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
-->
# Working with custom models
Some fine-tuning techniques, such as prompt tuning, are specific to language models. That means in 🤗 PEFT, it is
assumed a 🤗 Transformers model is being used. However, other fine-tuning techniques - like
[LoRA](../conceptual_guides/lora) - are not restricted to specific model types.
In this guide, we will see how LoRA can be applied to a multilayer perceptron and a computer vision model from the [timm](https://huggingface.co/docs/timm/index) library.
## Multilayer perceptron
Let's assume that we want to fine-tune a multilayer perceptron with LoRA. Here is the definition:
```python
from torch import nn


class MLP(nn.Module):
    def __init__(self, num_units_hidden=2000):
        super().__init__()
        self.seq = nn.Sequential(
            nn.Linear(20, num_units_hidden),
            nn.ReLU(),
            nn.Linear(num_units_hidden, num_units_hidden),
            nn.ReLU(),
            nn.Linear(num_units_hidden, 2),
            nn.LogSoftmax(dim=-1),
        )

    def forward(self, X):
        return self.seq(X)
```
This is a straightforward multilayer perceptron with an input layer, a hidden layer, and an output layer.
<Tip>
For this toy example, we choose an exceedingly large number of hidden units to highlight the efficiency gains
from PEFT, but those gains are in line with more realistic examples.
</Tip>
There are a few linear layers in this model that could be tuned with LoRA. When working with common 🤗 Transformers
models, PEFT will know which layers to apply LoRA to, but in this case, it is up to us as a user to choose the layers.
To determine the names of the layers to tune:
```python
print([(n, type(m)) for n, m in MLP().named_modules()])
```
This should print:
```
[('', __main__.MLP),
('seq', torch.nn.modules.container.Sequential),
('seq.0', torch.nn.modules.linear.Linear),
('seq.1', torch.nn.modules.activation.ReLU),
('seq.2', torch.nn.modules.linear.Linear),
('seq.3', torch.nn.modules.activation.ReLU),
('seq.4', torch.nn.modules.linear.Linear),
('seq.5', torch.nn.modules.activation.LogSoftmax)]
```
Let's say we want to apply LoRA to the input layer and to the hidden layer, those are `'seq.0'` and `'seq.2'`. Moreover,
let's assume we want to update the output layer without LoRA, that would be `'seq.4'`. The corresponding config would
be:
```python
from peft import LoraConfig

config = LoraConfig(
    target_modules=["seq.0", "seq.2"],
    modules_to_save=["seq.4"],
)
```
With that, we can create our PEFT model and check the fraction of parameters trained:
```python
from peft import get_peft_model
model = MLP()
peft_model = get_peft_model(model, config)
peft_model.print_trainable_parameters()
# prints trainable params: 56,164 || all params: 4,100,164 || trainable%: 1.369798866581922
```
Finally, we can use any training framework we like, or write our own fit loop, to train the `peft_model`.
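For instance, a bare-bones PyTorch loop on random toy data (purely illustrative, with made-up hyperparameters) could look like this:
```python
import torch

# Toy inputs matching the MLP's 20 input features and 2 output classes.
X = torch.rand((64, 20))
y = torch.randint(0, 2, (64,))

optimizer = torch.optim.Adam(peft_model.parameters(), lr=2e-3)
criterion = torch.nn.NLLLoss()  # the model ends with LogSoftmax, so NLLLoss is the matching criterion

peft_model.train()
for epoch in range(10):
    optimizer.zero_grad()
    outputs = peft_model(X)
    loss = criterion(outputs, y)
    loss.backward()
    optimizer.step()
```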
For a complete example, check out [this notebook](https://github.com/huggingface/peft/blob/main/examples/multilayer_perceptron/multilayer_perceptron_lora.ipynb).
## timm model
The [timm](https://huggingface.co/docs/timm/index) library contains a large number of pretrained computer vision models.
Those can also be fine-tuned with PEFT. Let's check out how this works in practice.
To start, ensure that timm is installed in the Python environment:
```bash
python -m pip install -U timm
```
Next we load a timm model for an image classification task:
```python
import timm
num_classes = ...
model_id = "timm/poolformer_m36.sail_in1k"
model = timm.create_model(model_id, pretrained=True, num_classes=num_classes)
```
Again, we need to make a decision about what layers to apply LoRA to. Since LoRA supports 2D conv layers, and since
those are a major building block of this model, we should apply LoRA to the 2D conv layers. To identify the names of
those layers, let's look at all the layer names:
```python
print([(n, type(m)) for n, m in model.named_modules()])
```
This will print a very long list; we'll only show the first few entries:
```
[('', timm.models.metaformer.MetaFormer),
('stem', timm.models.metaformer.Stem),
('stem.conv', torch.nn.modules.conv.Conv2d),
('stem.norm', torch.nn.modules.linear.Identity),
('stages', torch.nn.modules.container.Sequential),
('stages.0', timm.models.metaformer.MetaFormerStage),
('stages.0.downsample', torch.nn.modules.linear.Identity),
('stages.0.blocks', torch.nn.modules.container.Sequential),
('stages.0.blocks.0', timm.models.metaformer.MetaFormerBlock),
('stages.0.blocks.0.norm1', timm.layers.norm.GroupNorm1),
('stages.0.blocks.0.token_mixer', timm.models.metaformer.Pooling),
('stages.0.blocks.0.token_mixer.pool', torch.nn.modules.pooling.AvgPool2d),
('stages.0.blocks.0.drop_path1', torch.nn.modules.linear.Identity),
('stages.0.blocks.0.layer_scale1', timm.models.metaformer.Scale),
('stages.0.blocks.0.res_scale1', torch.nn.modules.linear.Identity),
('stages.0.blocks.0.norm2', timm.layers.norm.GroupNorm1),
('stages.0.blocks.0.mlp', timm.layers.mlp.Mlp),
('stages.0.blocks.0.mlp.fc1', torch.nn.modules.conv.Conv2d),
('stages.0.blocks.0.mlp.act', torch.nn.modules.activation.GELU),
('stages.0.blocks.0.mlp.drop1', torch.nn.modules.dropout.Dropout),
('stages.0.blocks.0.mlp.norm', torch.nn.modules.linear.Identity),
('stages.0.blocks.0.mlp.fc2', torch.nn.modules.conv.Conv2d),
('stages.0.blocks.0.mlp.drop2', torch.nn.modules.dropout.Dropout),
('stages.0.blocks.0.drop_path2', torch.nn.modules.linear.Identity),
('stages.0.blocks.0.layer_scale2', timm.models.metaformer.Scale),
('stages.0.blocks.0.res_scale2', torch.nn.modules.linear.Identity),
('stages.0.blocks.1', timm.models.metaformer.MetaFormerBlock),
('stages.0.blocks.1.norm1', timm.layers.norm.GroupNorm1),
('stages.0.blocks.1.token_mixer', timm.models.metaformer.Pooling),
('stages.0.blocks.1.token_mixer.pool', torch.nn.modules.pooling.AvgPool2d),
...
('head.global_pool.flatten', torch.nn.modules.linear.Identity),
('head.norm', timm.layers.norm.LayerNorm2d),
('head.flatten', torch.nn.modules.flatten.Flatten),
('head.drop', torch.nn.modules.linear.Identity),
('head.fc', torch.nn.modules.linear.Linear)]
```
Upon closer inspection, we see that the 2D conv layers have names such as `"stages.0.blocks.0.mlp.fc1"` and
`"stages.0.blocks.0.mlp.fc2"`. How can we match those layer names specifically? You can write a [regular
expressions](https://docs.python.org/3/library/re.html) to match the layer names. For our case, the regex
`r".*\.mlp\.fc\d"` should do the job.
Furthermore, as in the first example, we should ensure that the output layer, in this case the classification head, is
also updated. Looking at the end of the list printed above, we can see that it's named `'head.fc'`. With that in mind,
here is our LoRA config:
```python
config = LoraConfig(target_modules=r".*\.mlp\.fc\d", modules_to_save=["head.fc"])
```
Then we only need to create the PEFT model by passing our base model and the config to `get_peft_model`:
```python
peft_model = get_peft_model(model, config)
peft_model.print_trainable_parameters()
# prints trainable params: 1,064,454 || all params: 56,467,974 || trainable%: 1.88505789139876
```
This shows us that we only need to train less than 2% of all parameters, which is a huge efficiency gain.
For a complete example, check out [this notebook](https://github.com/huggingface/peft/blob/main/examples/image_classification/image_classification_timm_peft_lora.ipynb).

docs/source/index.mdx Normal file

@ -0,0 +1,153 @@
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
-->
# PEFT
🤗 PEFT, or Parameter-Efficient Fine-Tuning, is a library for efficiently adapting pre-trained language models (PLMs) to various downstream applications without fine-tuning all the model's parameters.
PEFT methods only fine-tune a small number of (extra) model parameters, significantly decreasing computational and storage costs because fine-tuning large-scale PLMs is prohibitively costly.
Recent state-of-the-art PEFT techniques achieve performance comparable to that of full fine-tuning.
PEFT is seamlessly integrated with 🤗 Accelerate for large-scale models leveraging DeepSpeed and [Big Model Inference](https://huggingface.co/docs/accelerate/usage_guides/big_modeling).
<div class="block dark:hidden">
<iframe
src="https://smangrul-peft-docs-qa-chatbot.hf.space?__theme=light"
width="850"
height="1600"
></iframe>
</div>
<div class="hidden dark:block">
<iframe
src="https://smangrul-peft-docs-qa-chatbot.hf.space?__theme=dark"
width="850"
height="1600"
></iframe>
</div>
<div class="mt-10">
<div class="w-full flex flex-col space-y-4 md:space-y-0 md:grid md:grid-cols-2 md:gap-y-4 md:gap-x-5">
<a class="!no-underline border dark:border-gray-700 p-5 rounded-lg shadow hover:shadow-lg" href="quicktour"
><div class="w-full text-center bg-gradient-to-br from-blue-400 to-blue-500 rounded-lg py-1.5 font-semibold mb-5 text-white text-lg leading-relaxed">Get started</div>
<p class="text-gray-700">Start here if you're new to 🤗 PEFT to get an overview of the library's main features, and how to train a model with a PEFT method.</p>
</a>
<a class="!no-underline border dark:border-gray-700 p-5 rounded-lg shadow hover:shadow-lg" href="./task_guides/image_classification_lora"
><div class="w-full text-center bg-gradient-to-br from-indigo-400 to-indigo-500 rounded-lg py-1.5 font-semibold mb-5 text-white text-lg leading-relaxed">How-to guides</div>
<p class="text-gray-700">Practical guides demonstrating how to apply various PEFT methods across different types of tasks like image classification, causal language modeling, automatic speech recognition, and more. Learn how to use 🤗 PEFT with the DeepSpeed and Fully Sharded Data Parallel scripts.</p>
</a>
<a class="!no-underline border dark:border-gray-700 p-5 rounded-lg shadow hover:shadow-lg" href="./conceptual_guides/lora"
><div class="w-full text-center bg-gradient-to-br from-pink-400 to-pink-500 rounded-lg py-1.5 font-semibold mb-5 text-white text-lg leading-relaxed">Conceptual guides</div>
<p class="text-gray-700">Get a better theoretical understanding of how LoRA and various soft prompting methods help reduce the number of trainable parameters to make training more efficient.</p>
</a>
<a class="!no-underline border dark:border-gray-700 p-5 rounded-lg shadow hover:shadow-lg" href="./package_reference/config"
><div class="w-full text-center bg-gradient-to-br from-purple-400 to-purple-500 rounded-lg py-1.5 font-semibold mb-5 text-white text-lg leading-relaxed">Reference</div>
<p class="text-gray-700">Technical descriptions of how 🤗 PEFT classes and methods work.</p>
</a>
</div>
</div>
## Supported methods
1. LoRA: [LORA: LOW-RANK ADAPTATION OF LARGE LANGUAGE MODELS](https://arxiv.org/pdf/2106.09685.pdf)
2. Prefix Tuning: [Prefix-Tuning: Optimizing Continuous Prompts for Generation](https://aclanthology.org/2021.acl-long.353/), [P-Tuning v2: Prompt Tuning Can Be Comparable to Fine-tuning Universally Across Scales and Tasks](https://arxiv.org/pdf/2110.07602.pdf)
3. P-Tuning: [GPT Understands, Too](https://arxiv.org/pdf/2103.10385.pdf)
4. Prompt Tuning: [The Power of Scale for Parameter-Efficient Prompt Tuning](https://arxiv.org/pdf/2104.08691.pdf)
5. AdaLoRA: [Adaptive Budget Allocation for Parameter-Efficient Fine-Tuning](https://arxiv.org/abs/2303.10512)
6. [LLaMA-Adapter: Efficient Fine-tuning of Language Models with Zero-init Attention](https://github.com/ZrrSkywalker/LLaMA-Adapter)
7. IA3: [Infused Adapter by Inhibiting and Amplifying Inner Activations](https://arxiv.org/abs/2205.05638)
## Supported models
The tables provided below list the PEFT methods and models supported for each task. To apply a particular PEFT method for
a task, please refer to the corresponding Task guides.
### Causal Language Modeling
| Model | LoRA | Prefix Tuning | P-Tuning | Prompt Tuning | IA3 |
|--------------| ---- | ---- | ---- | ---- | ---- |
| GPT-2 | ✅ | ✅ | ✅ | ✅ | ✅ |
| Bloom | ✅ | ✅ | ✅ | ✅ | ✅ |
| OPT | ✅ | ✅ | ✅ | ✅ | ✅ |
| GPT-Neo | ✅ | ✅ | ✅ | ✅ | ✅ |
| GPT-J | ✅ | ✅ | ✅ | ✅ | ✅ |
| GPT-NeoX-20B | ✅ | ✅ | ✅ | ✅ | ✅ |
| LLaMA | ✅ | ✅ | ✅ | ✅ | ✅ |
| ChatGLM | ✅ | ✅ | ✅ | ✅ | ✅ |
### Conditional Generation
| Model | LoRA | Prefix Tuning | P-Tuning | Prompt Tuning | IA3 |
| --------- | ---- | ---- | ---- | ---- | ---- |
| T5 | ✅ | ✅ | ✅ | ✅ | ✅ |
| BART | ✅ | ✅ | ✅ | ✅ | ✅ |
### Sequence Classification
| Model | LoRA | Prefix Tuning | P-Tuning | Prompt Tuning | IA3 |
| --------- | ---- | ---- | ---- | ---- | ---- |
| BERT | ✅ | ✅ | ✅ | ✅ | ✅ |
| RoBERTa | ✅ | ✅ | ✅ | ✅ | ✅ |
| GPT-2 | ✅ | ✅ | ✅ | ✅ | |
| Bloom | ✅ | ✅ | ✅ | ✅ | |
| OPT | ✅ | ✅ | ✅ | ✅ | |
| GPT-Neo | ✅ | ✅ | ✅ | ✅ | |
| GPT-J | ✅ | ✅ | ✅ | ✅ | |
| Deberta | ✅ | | ✅ | ✅ | |
| Deberta-v2 | ✅ | | ✅ | ✅ | |
### Token Classification
| Model | LoRA | Prefix Tuning | P-Tuning | Prompt Tuning | IA3 |
| --------- | ---- | ---- | ---- | ---- | --- |
| BERT | ✅ | ✅ | | | |
| RoBERTa | ✅ | ✅ | | | |
| GPT-2 | ✅ | ✅ | | | |
| Bloom | ✅ | ✅ | | | |
| OPT | ✅ | ✅ | | | |
| GPT-Neo | ✅ | ✅ | | | |
| GPT-J | ✅ | ✅ | | | |
| Deberta | ✅ | | | | |
| Deberta-v2 | ✅ | | | | |
### Text-to-Image Generation
| Model | LoRA | Prefix Tuning | P-Tuning | Prompt Tuning | IA3 |
| --------- | ---- | ---- | ---- | ---- | ---- |
| Stable Diffusion | ✅ | | | | |
### Image Classification
| Model | LoRA | Prefix Tuning | P-Tuning | Prompt Tuning | IA3 |
| --------- | ---- | ---- | ---- | ---- | ---- |
| ViT | ✅ | | | | |
| Swin | ✅ | | | | |
### Image to text (Multi-modal models)
We have tested LoRA for [ViT](https://huggingface.co/docs/transformers/model_doc/vit) and [Swin](https://huggingface.co/docs/transformers/model_doc/swin) for fine-tuning on image classification.
However, it should be possible to use LoRA for any [ViT-based model](https://huggingface.co/models?pipeline_tag=image-classification&sort=downloads&search=vit) from 🤗 Transformers.
Check out the [Image classification](/task_guides/image_classification_lora) task guide to learn more. If you run into problems, please open an issue.
| Model | LoRA | Prefix Tuning | P-Tuning | Prompt Tuning | IA3 |
| --------- | ---- | ---- | ---- | ---- | ---- |
| Blip-2 | ✅ | | | | |
### Semantic Segmentation
As with image-to-text models, you should be able to apply LoRA to any of the [segmentation models](https://huggingface.co/models?pipeline_tag=image-segmentation&sort=downloads).
It's worth noting that we haven't tested this with every architecture yet. Therefore, if you come across any issues, kindly create an issue report.
| Model | LoRA | Prefix Tuning | P-Tuning | Prompt Tuning | IA3 |
| --------- | ---- | ---- | ---- | ---- | ---- |
| SegFormer | ✅ | | | | |

docs/source/install.mdx Normal file

@ -0,0 +1,43 @@
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
-->
# Installation
Before you start, you will need to set up your environment, install the appropriate packages, and configure 🤗 PEFT. 🤗 PEFT is tested on **Python 3.8+**.
🤗 PEFT is available on PyPI, as well as GitHub:
## PyPI
To install 🤗 PEFT from PyPI:
```bash
pip install peft
```
## Source
New features that haven't been released yet are added every day, which also means there may be some bugs. To try them out, install from the GitHub repository:
```bash
pip install git+https://github.com/huggingface/peft
```
If you're working on contributing to the library or wish to play with the source code and see live
results as you run the code, an editable version can be installed from a locally-cloned version of the
repository:
```bash
git clone https://github.com/huggingface/peft
cd peft
pip install -e .
```


@ -0,0 +1,18 @@
# Configuration
The configuration classes store the configuration of a [`PeftModel`], PEFT adapter models, and the configurations of [`PrefixTuning`], [`PromptTuning`], and [`PromptEncoder`]. They contain methods for saving and loading model configurations from the Hub, for specifying the PEFT method to use, the type of task to perform, and model configurations like the number of layers and the number of attention heads.
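For example, a small (illustrative) round trip of saving a configuration to a local directory and loading it back:
```python
from peft import LoraConfig

config = LoraConfig(r=8, lora_alpha=32, lora_dropout=0.1)
config.save_pretrained("my-lora-config")  # writes adapter_config.json to the directory
loaded_config = LoraConfig.from_pretrained("my-lora-config")
```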
## PeftConfigMixin
[[autodoc]] config.PeftConfigMixin
- all
## PeftConfig
[[autodoc]] PeftConfig
- all
## PromptLearningConfig
[[autodoc]] PromptLearningConfig
- all


@ -0,0 +1,50 @@
# Models
[`PeftModel`] is the base model class for specifying the base Transformer model and configuration to apply a PEFT method to. The base `PeftModel` contains methods for loading and saving models from the Hub, and supports the [`PromptEncoder`] for prompt learning.
## PeftModel
[[autodoc]] PeftModel
- all
## PeftModelForSequenceClassification
A `PeftModel` for sequence classification tasks.
[[autodoc]] PeftModelForSequenceClassification
- all
## PeftModelForTokenClassification
A `PeftModel` for token classification tasks.
[[autodoc]] PeftModelForTokenClassification
- all
## PeftModelForCausalLM
A `PeftModel` for causal language modeling.
[[autodoc]] PeftModelForCausalLM
- all
## PeftModelForSeq2SeqLM
A `PeftModel` for sequence-to-sequence language modeling.
[[autodoc]] PeftModelForSeq2SeqLM
- all
## PeftModelForQuestionAnswering
A `PeftModel` for question answering.
[[autodoc]] PeftModelForQuestionAnswering
- all
## PeftModelForFeatureExtraction
A `PeftModel` for extracting features/embeddings from transformer models.
[[autodoc]] PeftModelForFeatureExtraction
- all


@ -0,0 +1,39 @@
# Tuners
Each tuner (or PEFT method) has a configuration and model.
## LoRA
For finetuning a model with LoRA.
[[autodoc]] LoraConfig
[[autodoc]] LoraModel
[[autodoc]] tuners.lora.LoraLayer
[[autodoc]] tuners.lora.Linear
## P-tuning
[[autodoc]] tuners.p_tuning.PromptEncoderConfig
[[autodoc]] tuners.p_tuning.PromptEncoder
## Prefix tuning
[[autodoc]] tuners.prefix_tuning.PrefixTuningConfig
[[autodoc]] tuners.prefix_tuning.PrefixEncoder
## Prompt tuning
[[autodoc]] tuners.prompt_tuning.PromptTuningConfig
[[autodoc]] tuners.prompt_tuning.PromptEmbedding
## IA3
[[autodoc]] tuners.ia3.IA3Config
[[autodoc]] tuners.ia3.IA3Model

docs/source/quicktour.mdx Normal file

@ -0,0 +1,145 @@
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
-->
# Quicktour
🤗 PEFT contains parameter-efficient finetuning methods for training large pretrained models. The traditional paradigm is to finetune all of a model's parameters for each downstream task, but this is becoming exceedingly costly and impractical because of the enormous number of parameters in models today. Instead, it is more efficient to train a smaller number of prompt parameters or use a reparametrization method like low-rank adaptation (LoRA) to reduce the number of trainable parameters.
This quicktour will show you 🤗 PEFT's main features and help you train large pretrained models that would typically be inaccessible on consumer devices. You'll see how to train the 1.2B parameter [`bigscience/mt0-large`](https://huggingface.co/bigscience/mt0-large) model with LoRA to generate a classification label and use it for inference.
## PeftConfig
Each 🤗 PEFT method is defined by a [`PeftConfig`] class that stores all the important parameters for building a [`PeftModel`].
Because you're going to use LoRA, you'll need to load and create a [`LoraConfig`] class. Within `LoraConfig`, specify the following parameters:
- the `task_type`, or sequence-to-sequence language modeling in this case
- `inference_mode`, whether you're using the model for inference or not
- `r`, the dimension of the low-rank matrices
- `lora_alpha`, the scaling factor for the low-rank matrices
- `lora_dropout`, the dropout probability of the LoRA layers
```python
from peft import LoraConfig, TaskType
peft_config = LoraConfig(task_type=TaskType.SEQ_2_SEQ_LM, inference_mode=False, r=8, lora_alpha=32, lora_dropout=0.1)
```
<Tip>
💡 See the [`LoraConfig`] reference for more details about other parameters you can adjust.
</Tip>
## PeftModel
A [`PeftModel`] is created by the [`get_peft_model`] function. It takes a base model - which you can load from the 🤗 Transformers library - and the [`PeftConfig`] containing the instructions for how to configure a model for a specific 🤗 PEFT method.
Start by loading the base model you want to finetune.
```python
from transformers import AutoModelForSeq2SeqLM
model_name_or_path = "bigscience/mt0-large"
tokenizer_name_or_path = "bigscience/mt0-large"
model = AutoModelForSeq2SeqLM.from_pretrained(model_name_or_path)
```
Wrap your base model and `peft_config` with the `get_peft_model` function to create a [`PeftModel`]. To get a sense of the number of trainable parameters in your model, use the [`print_trainable_parameters`] method. In this case, you're only training 0.19% of the model's parameters! 🤏
```python
from peft import get_peft_model
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()
"output: trainable params: 2359296 || all params: 1231940608 || trainable%: 0.19151053100118282"
```
That is it 🎉! Now you can train the model using the 🤗 Transformers [`~transformers.Trainer`], 🤗 Accelerate, or any custom PyTorch training loop.
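For instance, a minimal sketch of plugging the [`PeftModel`] into [`~transformers.Trainer`] might look like the following; the dataset, collator, and hyperparameters here are placeholders and not part of this quicktour:
```python
from transformers import TrainingArguments, Trainer, default_data_collator

training_args = TrainingArguments(
    output_dir="output_dir",           # where checkpoints are written
    learning_rate=1e-3,
    per_device_train_batch_size=8,
    num_train_epochs=3,
)
trainer = Trainer(
    model=model,                       # the PeftModel created above
    args=training_args,
    train_dataset=train_dataset,       # assumed: an already tokenized dataset
    data_collator=default_data_collator,
)
trainer.train()
```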
## Save and load a model
After your model is finished training, you can save your model to a directory using the [`~transformers.PreTrainedModel.save_pretrained`] function. You can also save your model to the Hub (make sure you log in to your Hugging Face account first) with the [`~transformers.PreTrainedModel.push_to_hub`] function.
```python
model.save_pretrained("output_dir")
# if pushing to Hub
from huggingface_hub import notebook_login
notebook_login()
model.push_to_hub("my_awesome_peft_model")
```
This only saves the incremental 🤗 PEFT weights that were trained, meaning it is super efficient to store, transfer, and load. For example, this [`bigscience/T0_3B`](https://huggingface.co/smangrul/twitter_complaints_bigscience_T0_3B_LORA_SEQ_2_SEQ_LM) model trained with LoRA on the [`twitter_complaints`](https://huggingface.co/datasets/ought/raft/viewer/twitter_complaints/train) subset of the RAFT [dataset](https://huggingface.co/datasets/ought/raft) only contains two files: `adapter_config.json` and `adapter_model.bin`. The latter file is just 19MB!
Easily load your model for inference using the [`~transformers.PreTrainedModel.from_pretrained`] function:
```diff
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
+ from peft import PeftModel, PeftConfig
import torch

+ peft_model_id = "smangrul/twitter_complaints_bigscience_T0_3B_LORA_SEQ_2_SEQ_LM"
+ config = PeftConfig.from_pretrained(peft_model_id)
model = AutoModelForSeq2SeqLM.from_pretrained(config.base_model_name_or_path)
+ model = PeftModel.from_pretrained(model, peft_model_id)
tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)

device = "cuda"
model = model.to(device)
model.eval()
inputs = tokenizer("Tweet text : @HondaCustSvc Your customer service has been horrible during the recall process. I will never purchase a Honda again. Label :", return_tensors="pt")

with torch.no_grad():
    outputs = model.generate(input_ids=inputs["input_ids"].to(device), max_new_tokens=10)
    print(tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True)[0])
'complaint'
```
## Easy loading with Auto classes
If you have saved your adapter locally or on the Hub, you can leverage the `AutoPeftModelForxxx` classes and load any PEFT model with a single line of code:
```diff
- from peft import PeftConfig, PeftModel
- from transformers import AutoModelForCausalLM
+ from peft import AutoPeftModelForCausalLM
- peft_config = PeftConfig.from_pretrained("ybelkada/opt-350m-lora")
- base_model_path = peft_config.base_model_name_or_path
- transformers_model = AutoModelForCausalLM.from_pretrained(base_model_path)
- peft_model = PeftModel.from_pretrained(transformers_model, peft_config)
+ peft_model = AutoPeftModelForCausalLM.from_pretrained("ybelkada/opt-350m-lora")
```
Currently, supported auto classes are: `AutoPeftModelForCausalLM`, `AutoPeftModelForSequenceClassification`, `AutoPeftModelForSeq2SeqLM`, `AutoPeftModelForTokenClassification`, `AutoPeftModelForQuestionAnswering` and `AutoPeftModelForFeatureExtraction`. For other tasks (e.g. Whisper, StableDiffusion), you can load the model with:
```diff
- from peft import PeftModel, PeftConfig, AutoPeftModel
+ from peft import AutoPeftModel
- from transformers import WhisperForConditionalGeneration
- model_id = "smangrul/openai-whisper-large-v2-LORA-colab"
peft_model_id = "smangrul/openai-whisper-large-v2-LORA-colab"
- peft_config = PeftConfig.from_pretrained(peft_model_id)
- model = WhisperForConditionalGeneration.from_pretrained(
- peft_config.base_model_name_or_path, load_in_8bit=True, device_map="auto"
- )
- model = PeftModel.from_pretrained(model, peft_model_id)
+ model = AutoPeftModel.from_pretrained(peft_model_id)
```
## Next steps
Now that you've seen how to train a model with one of the 🤗 PEFT methods, we encourage you to try out some of the other methods like prompt tuning. The steps are very similar to the ones shown in this quicktour: prepare a [`PeftConfig`] for a 🤗 PEFT method, and use `get_peft_model` to create a [`PeftModel`] from the configuration and base model. Then you can train it however you like!
Feel free to also take a look at the task guides if you're interested in training a model with a 🤗 PEFT method for a specific task such as semantic segmentation, multilingual automatic speech recognition, DreamBooth, and token classification.

<!--Copyright 2023 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
-->
# Prompt tuning for causal language modeling
[[open-in-colab]]
Prompting helps guide language model behavior by adding some input text specific to a task. Prompt tuning is an additive method for only training and updating the newly added prompt tokens to a pretrained model. This way, you can use one pretrained model whose weights are frozen, and train and update a smaller set of prompt parameters for each downstream task instead of fully finetuning a separate model. As models grow larger and larger, prompt tuning can be more efficient, and results are even better as model parameters scale.
<Tip>
💡 Read [The Power of Scale for Parameter-Efficient Prompt Tuning](https://arxiv.org/abs/2104.08691) to learn more about prompt tuning.
</Tip>
This guide will show you how to apply prompt tuning to train a [`bloomz-560m`](https://huggingface.co/bigscience/bloomz-560m) model on the `twitter_complaints` subset of the [RAFT](https://huggingface.co/datasets/ought/raft) dataset.
Before you begin, make sure you have all the necessary libraries installed:
```bash
!pip install -q peft transformers datasets
```
## Setup
Start by defining the model and tokenizer, the dataset and the dataset columns to train on, some training hyperparameters, and the [`PromptTuningConfig`]. The [`PromptTuningConfig`] contains information about the task type, the text to initialize the prompt embedding, the number of virtual tokens, and the tokenizer to use:
```py
from transformers import AutoModelForCausalLM, AutoTokenizer, default_data_collator, get_linear_schedule_with_warmup
from peft import get_peft_config, get_peft_model, PromptTuningInit, PromptTuningConfig, TaskType, PeftType
import torch
from datasets import load_dataset
import os
from torch.utils.data import DataLoader
from tqdm import tqdm
device = "cuda"
model_name_or_path = "bigscience/bloomz-560m"
tokenizer_name_or_path = "bigscience/bloomz-560m"
peft_config = PromptTuningConfig(
task_type=TaskType.CAUSAL_LM,
prompt_tuning_init=PromptTuningInit.TEXT,
num_virtual_tokens=8,
prompt_tuning_init_text="Classify if the tweet is a complaint or not:",
tokenizer_name_or_path=model_name_or_path,
)
dataset_name = "twitter_complaints"
checkpoint_name = f"{dataset_name}_{model_name_or_path}_{peft_config.peft_type}_{peft_config.task_type}_v1.pt".replace(
"/", "_"
)
text_column = "Tweet text"
label_column = "text_label"
max_length = 64
lr = 3e-2
num_epochs = 50
batch_size = 8
```
## Load dataset
For this guide, you'll load the `twitter_complaints` subset of the [RAFT](https://huggingface.co/datasets/ought/raft) dataset. This subset contains tweets that are labeled either `complaint` or `no complaint`:
```py
dataset = load_dataset("ought/raft", dataset_name)
dataset["train"][0]
{"Tweet text": "@HMRCcustomers No this is my first job", "ID": 0, "Label": 2}
```
To make the `Label` column more readable, replace the `Label` value with the corresponding label text and store them in a `text_label` column. You can use the [`~datasets.Dataset.map`] function to apply this change over the entire dataset in one step:
```py
classes = [k.replace("_", " ") for k in dataset["train"].features["Label"].names]
dataset = dataset.map(
lambda x: {"text_label": [classes[label] for label in x["Label"]]},
batched=True,
num_proc=1,
)
dataset["train"][0]
{"Tweet text": "@HMRCcustomers No this is my first job", "ID": 0, "Label": 2, "text_label": "no complaint"}
```
## Preprocess dataset
Next, you'll set up a tokenizer: configure the appropriate padding token to use for padding sequences, and determine the maximum length of the tokenized labels:
```py
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id
target_max_length = max([len(tokenizer(class_label)["input_ids"]) for class_label in classes])
print(target_max_length)
3
```
Create a `preprocess_function` to:
1. Tokenize the input text and labels.
2. For each example in a batch, pad the labels with the tokenizer's `pad_token_id`.
3. Concatenate the input text and labels into the `model_inputs`.
4. Create a separate attention mask for `labels` and `model_inputs`.
5. Loop through each example in the batch again to pad the input ids, labels, and attention mask to the `max_length` and convert them to PyTorch tensors.
```py
def preprocess_function(examples):
    batch_size = len(examples[text_column])
    inputs = [f"{text_column} : {x} Label : " for x in examples[text_column]]
    targets = [str(x) for x in examples[label_column]]
    model_inputs = tokenizer(inputs)
    labels = tokenizer(targets)
    for i in range(batch_size):
        sample_input_ids = model_inputs["input_ids"][i]
        label_input_ids = labels["input_ids"][i] + [tokenizer.pad_token_id]
        # concatenate the prompt and the label; mask the prompt positions out of the loss with -100
        model_inputs["input_ids"][i] = sample_input_ids + label_input_ids
        labels["input_ids"][i] = [-100] * len(sample_input_ids) + label_input_ids
        model_inputs["attention_mask"][i] = [1] * len(model_inputs["input_ids"][i])
    for i in range(batch_size):
        sample_input_ids = model_inputs["input_ids"][i]
        label_input_ids = labels["input_ids"][i]
        # left-pad everything to max_length and convert to tensors
        model_inputs["input_ids"][i] = [tokenizer.pad_token_id] * (
            max_length - len(sample_input_ids)
        ) + sample_input_ids
        model_inputs["attention_mask"][i] = [0] * (max_length - len(sample_input_ids)) + model_inputs[
            "attention_mask"
        ][i]
        labels["input_ids"][i] = [-100] * (max_length - len(sample_input_ids)) + label_input_ids
        model_inputs["input_ids"][i] = torch.tensor(model_inputs["input_ids"][i][:max_length])
        model_inputs["attention_mask"][i] = torch.tensor(model_inputs["attention_mask"][i][:max_length])
        labels["input_ids"][i] = torch.tensor(labels["input_ids"][i][:max_length])
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs
```
Use the [`~datasets.Dataset.map`] function to apply the `preprocess_function` to the entire dataset. You can remove the unprocessed columns since the model won't need them:
```py
processed_datasets = dataset.map(
preprocess_function,
batched=True,
num_proc=1,
remove_columns=dataset["train"].column_names,
load_from_cache_file=False,
desc="Running tokenizer on dataset",
)
```
Create a [`DataLoader`](https://pytorch.org/docs/stable/data.html#torch.utils.data.DataLoader) from the `train` and `eval` datasets. Set `pin_memory=True` to speed up the data transfer to the GPU during training if the samples in your dataset are on a CPU.
```py
train_dataset = processed_datasets["train"]
eval_dataset = processed_datasets["test"]
train_dataloader = DataLoader(
train_dataset, shuffle=True, collate_fn=default_data_collator, batch_size=batch_size, pin_memory=True
)
eval_dataloader = DataLoader(eval_dataset, collate_fn=default_data_collator, batch_size=batch_size, pin_memory=True)
```
## Train
You're almost ready to set up your model and start training!
Initialize a base model from [`~transformers.AutoModelForCausalLM`], and pass it and `peft_config` to the [`get_peft_model`] function to create a [`PeftModel`]. You can print the new [`PeftModel`]'s trainable parameters to see how much more efficient it is than training the full parameters of the original model!
```py
model = AutoModelForCausalLM.from_pretrained(model_name_or_path)
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()
"trainable params: 8192 || all params: 559222784 || trainable%: 0.0014648902430985358"
```
Setup an optimizer and learning rate scheduler:
```py
optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
lr_scheduler = get_linear_schedule_with_warmup(
optimizer=optimizer,
num_warmup_steps=0,
num_training_steps=(len(train_dataloader) * num_epochs),
)
```
Move the model to the GPU, then write a training loop to start training!
```py
model = model.to(device)
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for step, batch in enumerate(tqdm(train_dataloader)):
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        total_loss += loss.detach().float()
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

    model.eval()
    eval_loss = 0
    eval_preds = []
    for step, batch in enumerate(tqdm(eval_dataloader)):
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = model(**batch)
        loss = outputs.loss
        eval_loss += loss.detach().float()
        eval_preds.extend(
            tokenizer.batch_decode(torch.argmax(outputs.logits, -1).detach().cpu().numpy(), skip_special_tokens=True)
        )

    eval_epoch_loss = eval_loss / len(eval_dataloader)
    eval_ppl = torch.exp(eval_epoch_loss)
    train_epoch_loss = total_loss / len(train_dataloader)
    train_ppl = torch.exp(train_epoch_loss)
    print(f"{epoch=}: {train_ppl=} {train_epoch_loss=} {eval_ppl=} {eval_epoch_loss=}")
```
## Share model
You can store and share your model on the Hub if you'd like. Log in to your Hugging Face account and enter your token when prompted:
```py
from huggingface_hub import notebook_login
notebook_login()
```
Use the [`~transformers.PreTrainedModel.push_to_hub`] function to upload your model to a model repository on the Hub:
```py
peft_model_id = "your-name/bloomz-560m_PROMPT_TUNING_CAUSAL_LM"
model.push_to_hub(peft_model_id, use_auth_token=True)
```
Once the model is uploaded, you'll see the model file size is only 33.5kB! 🤏
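If you want to verify that claim, one option (not part of the original guide) is to download just the adapter weights and check their size on disk:
```py
import os

from huggingface_hub import hf_hub_download

# assumes the upload above succeeded and peft_model_id points at your repository
adapter_file = hf_hub_download(repo_id=peft_model_id, filename="adapter_model.bin")
print(f"{os.path.getsize(adapter_file) / 1024:.1f} kB")
```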
## Inference
Let's try the model on a sample input for inference. If you look at the repository you uploaded the model to, you'll see an `adapter_config.json` file. Load this file into [`PeftConfig`] to specify the `peft_type` and `task_type`. Then you can load the prompt-tuned model weights and the configuration into [`~PeftModel.from_pretrained`] to create the [`PeftModel`]:
```py
from peft import PeftModel, PeftConfig
peft_model_id = "stevhliu/bloomz-560m_PROMPT_TUNING_CAUSAL_LM"
config = PeftConfig.from_pretrained(peft_model_id)
model = AutoModelForCausalLM.from_pretrained(config.base_model_name_or_path)
model = PeftModel.from_pretrained(model, peft_model_id)
```
Grab a tweet and tokenize it:
```py
inputs = tokenizer(
f'{text_column} : {"@nationalgridus I have no water and the bill is current and paid. Can you do something about this?"} Label : ',
return_tensors="pt",
)
```
Put the model on a GPU and *generate* the predicted label:
```py
model.to(device)
with torch.no_grad():
    inputs = {k: v.to(device) for k, v in inputs.items()}
    outputs = model.generate(
        input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"], max_new_tokens=10, eos_token_id=3
    )
    print(tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True))
[
"Tweet text : @nationalgridus I have no water and the bill is current and paid. Can you do something about this? Label : complaint"
]
```

<!--Copyright 2023 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
-->
# DreamBooth fine-tuning with LoRA
This guide demonstrates how to use LoRA, a low-rank approximation technique, to fine-tune DreamBooth with the
`CompVis/stable-diffusion-v1-4` model.
Although LoRA was initially designed as a technique for reducing the number of trainable parameters in
large-language models, the technique can also be applied to diffusion models. Performing a complete model fine-tuning
of diffusion models is a time-consuming task, which is why lightweight techniques like DreamBooth or Textual Inversion
gained popularity. With the introduction of LoRA, customizing and fine-tuning a model on a specific dataset has become
even faster.
In this guide we'll be using a DreamBooth fine-tuning script that is available in
[PEFT's GitHub repo](https://github.com/huggingface/peft/tree/main/examples/lora_dreambooth). Feel free to explore it and
learn how things work.
## Set up your environment
Start by cloning the PEFT repository:
```bash
git clone https://github.com/huggingface/peft
```
Navigate to the directory containing the training scripts for fine-tuning Dreambooth with LoRA:
```bash
cd peft/examples/lora_dreambooth
```
Set up your environment: install PEFT and all the required libraries. At the time of writing this guide, we recommend
installing PEFT from source.
```bash
pip install -r requirements.txt
pip install git+https://github.com/huggingface/peft
```
## Fine-tuning DreamBooth
Prepare the images that you will use for fine-tuning the model. Set up a few environment variables:
```bash
export MODEL_NAME="CompVis/stable-diffusion-v1-4"
export INSTANCE_DIR="path-to-instance-images"
export CLASS_DIR="path-to-class-images"
export OUTPUT_DIR="path-to-save-model"
```
Here:
- `INSTANCE_DIR`: The directory containing the images that you intend to use for training your model.
- `CLASS_DIR`: The directory containing class-specific images. In this example, we use prior preservation to avoid overfitting and language-drift. For prior preservation, you need other images of the same class as part of the training process. However, these images can be generated and the training script will save them to a local path you specify here.
- `OUTPUT_DIR`: The destination folder for storing the trained model's weights.
To learn more about DreamBooth fine-tuning with prior-preserving loss, check out the [Diffusers documentation](https://huggingface.co/docs/diffusers/training/dreambooth#finetuning-with-priorpreserving-loss).
Launch the training script with `accelerate` and pass hyperparameters, as well as LoRA-specific arguments to it such as:
- `use_lora`: Enables LoRA in the training script.
- `lora_r`: The dimension used by the LoRA update matrices.
- `lora_alpha`: Scaling factor.
- `lora_text_encoder_r`: LoRA rank for text encoder.
- `lora_text_encoder_alpha`: LoRA alpha (scaling factor) for text encoder.
Here's what the full set of script arguments may look like:
```bash
accelerate launch train_dreambooth.py \
--pretrained_model_name_or_path=$MODEL_NAME \
--instance_data_dir=$INSTANCE_DIR \
--class_data_dir=$CLASS_DIR \
--output_dir=$OUTPUT_DIR \
--train_text_encoder \
--with_prior_preservation --prior_loss_weight=1.0 \
--instance_prompt="a photo of sks dog" \
--class_prompt="a photo of dog" \
--resolution=512 \
--train_batch_size=1 \
--lr_scheduler="constant" \
--lr_warmup_steps=0 \
--num_class_images=200 \
--use_lora \
--lora_r 16 \
--lora_alpha 27 \
--lora_text_encoder_r 16 \
--lora_text_encoder_alpha 17 \
--learning_rate=1e-4 \
--gradient_accumulation_steps=1 \
--gradient_checkpointing \
--max_train_steps=800
```
## Inference with a single adapter
To run inference with the fine-tuned model, first specify the base model with which the fine-tuned LoRA weights will be combined:
```python
import os
from pathlib import Path

import torch
from diffusers import StableDiffusionPipeline
from peft import PeftModel, LoraConfig
MODEL_NAME = "CompVis/stable-diffusion-v1-4"
```
Next, add a function that will create a Stable Diffusion pipeline for image generation. It will combine the weights of
the base model with the fine-tuned LoRA weights using `LoraConfig`.
```python
def get_lora_sd_pipeline(
    ckpt_dir, base_model_name_or_path=None, dtype=torch.float16, device="cuda", adapter_name="default"
):
    unet_sub_dir = os.path.join(ckpt_dir, "unet")
    text_encoder_sub_dir = os.path.join(ckpt_dir, "text_encoder")
    if os.path.exists(text_encoder_sub_dir) and base_model_name_or_path is None:
        config = LoraConfig.from_pretrained(text_encoder_sub_dir)
        base_model_name_or_path = config.base_model_name_or_path

    if base_model_name_or_path is None:
        raise ValueError("Please specify the base model name or path")

    pipe = StableDiffusionPipeline.from_pretrained(base_model_name_or_path, torch_dtype=dtype).to(device)
    pipe.unet = PeftModel.from_pretrained(pipe.unet, unet_sub_dir, adapter_name=adapter_name)

    if os.path.exists(text_encoder_sub_dir):
        pipe.text_encoder = PeftModel.from_pretrained(
            pipe.text_encoder, text_encoder_sub_dir, adapter_name=adapter_name
        )

    if dtype in (torch.float16, torch.bfloat16):
        pipe.unet.half()
        pipe.text_encoder.half()

    pipe.to(device)
    return pipe
```
Now you can use the function above to create a Stable Diffusion pipeline using the LoRA weights that you have created during the fine-tuning step.
Note that if you're running inference on the same machine, the path you specify here will be the same as `OUTPUT_DIR`.
```python
pipe = get_lora_sd_pipeline(Path("path-to-saved-model"), adapter_name="dog")
```
Once you have the pipeline with your fine-tuned model, you can use it to generate images:
```python
prompt = "sks dog playing fetch in the park"
negative_prompt = "low quality, blurry, unfinished"
image = pipe(prompt, num_inference_steps=50, guidance_scale=7, negative_prompt=negative_prompt).images[0]
image.save("DESTINATION_PATH_FOR_THE_IMAGE")
```
<div class="flex justify-center">
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/peft/lora_dreambooth_dog_park.png" alt="Generated image of a dog in a park"/>
</div>
## Multi-adapter inference
With PEFT you can combine multiple adapters for inference. In the previous example you fine-tuned Stable Diffusion on
some dog images, and the pipeline created from those weights was given the adapter name `"dog"`. Now, suppose you also fine-tuned
this base model on images of a crochet toy. Let's see how we can use both adapters.
First, you'll need to perform all the steps as in the single adapter inference example:
1. Specify the base model.
2. Add a function that creates a Stable Diffusion pipeline for image generation using the LoRA weights.
3. Create a `pipe` with `adapter_name="dog"` based on the model fine-tuned on dog images.
Next, you're going to need a few more helper functions.
To load another adapter, create a `load_adapter()` function that leverages the `load_adapter()` method of `PeftModel` (e.g. `pipe.unet.load_adapter(peft_model_path, adapter_name)`):
```python
def load_adapter(pipe, ckpt_dir, adapter_name):
    unet_sub_dir = os.path.join(ckpt_dir, "unet")
    text_encoder_sub_dir = os.path.join(ckpt_dir, "text_encoder")
    pipe.unet.load_adapter(unet_sub_dir, adapter_name=adapter_name)
    if os.path.exists(text_encoder_sub_dir):
        pipe.text_encoder.load_adapter(text_encoder_sub_dir, adapter_name=adapter_name)
```
To switch between adapters, write a function that uses the `set_adapter()` method of `PeftModel` (e.g. `pipe.unet.set_adapter(adapter_name)`):
```python
def set_adapter(pipe, adapter_name):
    pipe.unet.set_adapter(adapter_name)
    if isinstance(pipe.text_encoder, PeftModel):
        pipe.text_encoder.set_adapter(adapter_name)
```
Finally, add a function to create a weighted LoRA adapter:
```python
def create_weighted_lora_adapter(pipe, adapters, weights, adapter_name="default"):
    pipe.unet.add_weighted_adapter(adapters, weights, adapter_name)
    if isinstance(pipe.text_encoder, PeftModel):
        pipe.text_encoder.add_weighted_adapter(adapters, weights, adapter_name)
    return pipe
```
Let's load the second adapter from the model fine-tuned on images of a crochet toy, and give it a unique name:
```python
load_adapter(pipe, Path("path-to-the-second-saved-model"), adapter_name="crochet")
```
Create a pipeline using weighted adapters:
```python
pipe = create_weighted_lora_adapter(pipe, ["crochet", "dog"], [1.0, 1.05], adapter_name="crochet_dog")
```
Now you can switch between adapters. If you'd like to generate more dog images, set the adapter to `"dog"`:
```python
set_adapter(pipe, adapter_name="dog")
prompt = "sks dog in a supermarket isle"
negative_prompt = "low quality, blurry, unfinished"
image = pipe(prompt, num_inference_steps=50, guidance_scale=7, negative_prompt=negative_prompt).images[0]
image
```
<div class="flex justify-center">
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/peft/lora_dreambooth_dog_supermarket.png" alt="Generated image of a dog in a supermarket"/>
</div>
In the same way, you can switch to the second adapter:
```python
set_adapter(pipe, adapter_name="crochet")
prompt = "a fish rendered in the style of <1>"
negative_prompt = "low quality, blurry, unfinished"
image = pipe(prompt, num_inference_steps=50, guidance_scale=7, negative_prompt=negative_prompt).images[0]
image
```
<div class="flex justify-center">
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/peft/lora_dreambooth_fish.png" alt="Generated image of a crochet fish"/>
</div>
Finally, you can use combined weighted adapters:
```python
set_adapter(pipe, adapter_name="crochet_dog")
prompt = "sks dog rendered in the style of <1>, close up portrait, 4K HD"
negative_prompt = "low quality, blurry, unfinished"
image = pipe(prompt, num_inference_steps=50, guidance_scale=7, negative_prompt=negative_prompt).images[0]
image
```
<div class="flex justify-center">
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/peft/lora_dreambooth_crochet_dog.png" alt="Generated image of a crochet dog"/>
</div>

<!--Copyright 2023 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
-->
# Image classification using LoRA
This guide demonstrates how to use LoRA, a low-rank approximation technique, to fine-tune an image classification model.
By using LoRA from 🤗 PEFT, we can reduce the number of trainable parameters in the model to only 0.77% of the original.
LoRA achieves this reduction by adding low-rank "update matrices" to specific blocks of the model, such as the attention
blocks. During fine-tuning, only these matrices are trained, while the original model parameters are left unchanged.
At inference time, the update matrices are merged with the original model parameters to produce the final classification result.
For more information on LoRA, please refer to the [original LoRA paper](https://arxiv.org/abs/2106.09685).
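To make the idea concrete, here is a toy, framework-agnostic sketch (with illustrative shapes, not the PEFT implementation) of how a low-rank update sits next to a frozen weight and can be merged into it for inference:
```python
import torch

d, r, alpha = 768, 16, 16             # hidden size, LoRA rank, scaling (illustrative values)
W = torch.randn(d, d)                 # frozen pretrained weight
A = torch.randn(r, d) * 0.01          # low-rank factors; in real LoRA, B starts at zero
B = torch.randn(d, r) * 0.01          # and both are learned during fine-tuning

x = torch.randn(1, d)
adapted = x @ W.T + (x @ A.T @ B.T) * (alpha / r)    # forward pass with the adapter active
W_merged = W + (B @ A) * (alpha / r)                 # update merged into the weight for inference
print((adapted - x @ W_merged.T).abs().max())        # the two paths agree up to float error
```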
## Install dependencies
Install the libraries required for model training:
```bash
!pip install transformers accelerate evaluate datasets peft -q
```
Check the versions of all required libraries to make sure you are up to date:
```python
import transformers
import accelerate
import peft
print(f"Transformers version: {transformers.__version__}")
print(f"Accelerate version: {accelerate.__version__}")
print(f"PEFT version: {peft.__version__}")
"Transformers version: 4.27.4"
"Accelerate version: 0.18.0"
"PEFT version: 0.2.0"
```
## Authenticate to share your model
To share the fine-tuned model at the end of the training with the community, authenticate using your 🤗 token.
You can obtain your token from your [account settings](https://huggingface.co/settings/token).
```python
from huggingface_hub import notebook_login
notebook_login()
```
## Select a model checkpoint to fine-tune
Choose a model checkpoint from any of the model architectures supported for [image classification](https://huggingface.co/models?pipeline_tag=image-classification&sort=downloads). When in doubt, refer to
the [image classification task guide](https://huggingface.co/docs/transformers/v4.27.2/en/tasks/image_classification) in
🤗 Transformers documentation.
```python
model_checkpoint = "google/vit-base-patch16-224-in21k"
```
## Load a dataset
To keep this example's runtime short, let's only load the first 5000 instances from the training set of the [Food-101 dataset](https://huggingface.co/datasets/food101):
```python
from datasets import load_dataset
dataset = load_dataset("food101", split="train[:5000]")
```
## Dataset preparation
To prepare the dataset for training and evaluation, create `label2id` and `id2label` dictionaries. These will come in
handy when performing inference and for metadata information:
```python
labels = dataset.features["label"].names
label2id, id2label = dict(), dict()
for i, label in enumerate(labels):
    label2id[label] = i
    id2label[i] = label

id2label[2]
"baklava"
```
Next, load the image processor of the model you're fine-tuning:
```python
from transformers import AutoImageProcessor
image_processor = AutoImageProcessor.from_pretrained(model_checkpoint)
```
The `image_processor` contains useful information on which size the training and evaluation images should be resized
to, as well as values that should be used to normalize the pixel values. Using the `image_processor`, prepare transformation
functions for the datasets. These functions will include data augmentation and pixel scaling:
```python
from torchvision.transforms import (
CenterCrop,
Compose,
Normalize,
RandomHorizontalFlip,
RandomResizedCrop,
Resize,
ToTensor,
)
normalize = Normalize(mean=image_processor.image_mean, std=image_processor.image_std)
train_transforms = Compose(
[
RandomResizedCrop(image_processor.size["height"]),
RandomHorizontalFlip(),
ToTensor(),
normalize,
]
)
val_transforms = Compose(
[
Resize(image_processor.size["height"]),
CenterCrop(image_processor.size["height"]),
ToTensor(),
normalize,
]
)
def preprocess_train(example_batch):
    """Apply train_transforms across a batch."""
    example_batch["pixel_values"] = [train_transforms(image.convert("RGB")) for image in example_batch["image"]]
    return example_batch


def preprocess_val(example_batch):
    """Apply val_transforms across a batch."""
    example_batch["pixel_values"] = [val_transforms(image.convert("RGB")) for image in example_batch["image"]]
    return example_batch
```
Split the dataset into training and validation sets:
```python
splits = dataset.train_test_split(test_size=0.1)
train_ds = splits["train"]
val_ds = splits["test"]
```
Finally, set the transformation functions for the datasets accordingly:
```python
train_ds.set_transform(preprocess_train)
val_ds.set_transform(preprocess_val)
```
## Load and prepare a model
Before loading the model, let's define a helper function to check the total number of parameters a model has, as well
as how many of them are trainable.
```python
def print_trainable_parameters(model):
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param:.2f}"
    )
```
It's important to initialize the original model correctly as it will be used as a base to create the `PeftModel` you'll
actually fine-tune. Specify the `label2id` and `id2label` so that [`~transformers.AutoModelForImageClassification`] can append a classification
head to the underlying model, adapted for this dataset. You should see the following output:
```
Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224-in21k and are newly initialized: ['classifier.weight', 'classifier.bias']
```
```python
from transformers import AutoModelForImageClassification, TrainingArguments, Trainer
model = AutoModelForImageClassification.from_pretrained(
model_checkpoint,
label2id=label2id,
id2label=id2label,
ignore_mismatched_sizes=True, # provide this in case you're planning to fine-tune an already fine-tuned checkpoint
)
```
Before creating a `PeftModel`, you can check the number of trainable parameters in the original model:
```python
print_trainable_parameters(model)
"trainable params: 85876325 || all params: 85876325 || trainable%: 100.00"
```
Next, use `get_peft_model` to wrap the base model so that "update" matrices are added to the respective places.
```python
from peft import LoraConfig, get_peft_model
config = LoraConfig(
r=16,
lora_alpha=16,
target_modules=["query", "value"],
lora_dropout=0.1,
bias="none",
modules_to_save=["classifier"],
)
lora_model = get_peft_model(model, config)
print_trainable_parameters(lora_model)
"trainable params: 667493 || all params: 86466149 || trainable%: 0.77"
```
Let's unpack what's going on here.
To use LoRA, you need to specify the target modules in `LoraConfig` so that `get_peft_model()` knows which modules
inside our model need to be amended with LoRA matrices. In this example, we're only interested in targeting the query and
value matrices of the attention blocks of the base model. Since the parameters corresponding to these matrices are named
"query" and "value" respectively, we specify them accordingly in the `target_modules` argument of `LoraConfig`.
We also specify `modules_to_save`. After wrapping the base model with `get_peft_model()` along with the `config`, we get
a new model where only the LoRA parameters are trainable (so-called "update matrices") while the pre-trained parameters
are kept frozen. However, we want the classifier parameters to be trained too when fine-tuning the base model on our
custom dataset. To ensure that the classifier parameters are also trained, we specify `modules_to_save`. This also
ensures that these modules are serialized alongside the LoRA trainable parameters when using utilities like `save_pretrained()`
and `push_to_hub()`.
Here's what the other parameters mean:
- `r`: The dimension used by the LoRA update matrices.
- `alpha`: Scaling factor.
- `bias`: Specifies whether the `bias` parameters should be trained. `"none"` means none of the `bias` parameters will be trained.
`r` and `alpha` together control the total number of final trainable parameters when using LoRA, giving you the flexibility
to balance a trade-off between end performance and compute efficiency.
By looking at the number of trainable parameters, you can see how many parameters we're actually training. Since the goal is
to achieve parameter-efficient fine-tuning, you should expect to see fewer trainable parameters in the `lora_model`
in comparison to the original model, which is indeed the case here.
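If you want to double-check which layers were actually modified, one quick optional sanity check (not part of the original guide) is to look for the injected LoRA submodules by name:
```python
# list a few of the modules that now carry LoRA matrices
lora_layers = [name for name, _ in lora_model.named_modules() if "lora_" in name]
print(len(lora_layers), lora_layers[:4])
```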
## Define training arguments
For model fine-tuning, use [`~transformers.Trainer`]. It accepts
several arguments which you can wrap using [`~transformers.TrainingArguments`].
```python
from transformers import TrainingArguments, Trainer
model_name = model_checkpoint.split("/")[-1]
batch_size = 128
args = TrainingArguments(
f"{model_name}-finetuned-lora-food101",
remove_unused_columns=False,
evaluation_strategy="epoch",
save_strategy="epoch",
learning_rate=5e-3,
per_device_train_batch_size=batch_size,
gradient_accumulation_steps=4,
per_device_eval_batch_size=batch_size,
fp16=True,
num_train_epochs=5,
logging_steps=10,
load_best_model_at_end=True,
metric_for_best_model="accuracy",
push_to_hub=True,
label_names=["labels"],
)
```
Compared to non-PEFT methods, you can use a larger batch size since there are fewer parameters to train.
You can also set a larger learning rate than the usual one for full fine-tuning (for example, 1e-5).
This can potentially also reduce the need to conduct expensive hyperparameter tuning experiments.
## Prepare evaluation metric
```python
import numpy as np
import evaluate
metric = evaluate.load("accuracy")
def compute_metrics(eval_pred):
    """Computes accuracy on a batch of predictions."""
    predictions = np.argmax(eval_pred.predictions, axis=1)
    return metric.compute(predictions=predictions, references=eval_pred.label_ids)
```
The `compute_metrics` function takes a named tuple as input: `predictions`, which are the logits of the model as Numpy arrays,
and `label_ids`, which are the ground-truth labels as Numpy arrays.
## Define collation function
A collation function is used by [`~transformers.Trainer`] to gather a batch of training and evaluation examples and prepare them in a
format that is acceptable by the underlying model.
```python
import torch
def collate_fn(examples):
    pixel_values = torch.stack([example["pixel_values"] for example in examples])
    labels = torch.tensor([example["label"] for example in examples])
    return {"pixel_values": pixel_values, "labels": labels}
```
## Train and evaluate
Bring everything together - model, training arguments, data, collation function, etc. Then, start the training!
```python
trainer = Trainer(
lora_model,
args,
train_dataset=train_ds,
eval_dataset=val_ds,
tokenizer=image_processor,
compute_metrics=compute_metrics,
data_collator=collate_fn,
)
train_results = trainer.train()
```
In just a few minutes, the fine-tuned model shows 96% validation accuracy even on this small
subset of the training dataset.
```python
trainer.evaluate(val_ds)
{
"eval_loss": 0.14475855231285095,
"eval_accuracy": 0.96,
"eval_runtime": 3.5725,
"eval_samples_per_second": 139.958,
"eval_steps_per_second": 1.12,
"epoch": 5.0,
}
```
## Share your model and run inference
Once the fine-tuning is done, share the LoRA parameters with the community like so:
```python
repo_name = f"sayakpaul/{model_name}-finetuned-lora-food101"
lora_model.push_to_hub(repo_name)
```
When calling [`~transformers.PreTrainedModel.push_to_hub`] on the `lora_model`, only the LoRA parameters along with any modules specified in `modules_to_save`
are saved. Take a look at the [trained LoRA parameters](https://huggingface.co/sayakpaul/vit-base-patch16-224-in21k-finetuned-lora-food101/blob/main/adapter_model.bin).
You'll see that it's only 2.6 MB! This greatly helps with portability, especially when using a very large model to fine-tune (such as [BLOOM](https://huggingface.co/bigscience/bloom)).
Next, let's see how to load the LoRA updated parameters along with our base model for inference. When you wrap a base model
with `PeftModel`, modifications are done *in-place*. To mitigate any concerns that might stem from in-place modifications,
initialize the base model just like you did earlier and construct the inference model.
```python
from peft import PeftConfig, PeftModel
config = PeftConfig.from_pretrained(repo_name)
model = AutoModelForImageClassification.from_pretrained(
config.base_model_name_or_path,
label2id=label2id,
id2label=id2label,
ignore_mismatched_sizes=True, # provide this in case you're planning to fine-tune an already fine-tuned checkpoint
)
# Load the LoRA model
inference_model = PeftModel.from_pretrained(model, repo_name)
```
Let's now fetch an example image for inference.
```python
from PIL import Image
import requests
url = "https://huggingface.co/datasets/sayakpaul/sample-datasets/resolve/main/beignets.jpeg"
image = Image.open(requests.get(url, stream=True).raw)
image
```
<div class="flex justify-center">
<img src="https://huggingface.co/datasets/sayakpaul/sample-datasets/resolve/main/beignets.jpeg" alt="image of beignets"/>
</div>
First, instantiate an `image_processor` from the underlying model repo.
```python
image_processor = AutoImageProcessor.from_pretrained(repo_name)
```
Then, prepare the example for inference.
```python
encoding = image_processor(image.convert("RGB"), return_tensors="pt")
```
Finally, run inference!
```python
with torch.no_grad():
    outputs = inference_model(**encoding)
    logits = outputs.logits

predicted_class_idx = logits.argmax(-1).item()
print("Predicted class:", inference_model.config.id2label[predicted_class_idx])
"Predicted class: beignets"
```

# int8 training for automatic speech recognition
Quantization reduces the precision of floating point data types, decreasing the memory required to store model weights. However, naive quantization can degrade model quality because information is lost when the precision is reduced. 8-bit or `int8` quantization uses only a quarter of the bits of full precision, but it does not degrade performance because it doesn't simply drop the bits or data. Instead, `int8` quantization *rounds* from one data type to another.
<Tip>
💡 Read the [LLM.int8(): 8-bit Matrix Multiplication for Transformers at Scale](https://arxiv.org/abs/2208.07339) paper to learn more, or you can take a look at the corresponding [blog post](https://huggingface.co/blog/hf-bitsandbytes-integration) for a gentler introduction.
</Tip>
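To get an intuition for what this rounding looks like, here is a toy absmax-style sketch (purely illustrative, not how the `bitsandbytes` kernels are implemented):
```py
import torch

weights = torch.randn(4, 4)                                   # pretend these are fp32 model weights
scale = 127 / weights.abs().max()                             # map the largest magnitude to 127
int8_weights = torch.clamp((weights * scale).round(), -128, 127).to(torch.int8)
dequantized = int8_weights.float() / scale                    # recover approximate fp32 values
print((weights - dequantized).abs().max())                    # small rounding error, no dropped values
```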
This guide will show you how to train a [`openai/whisper-large-v2`](https://huggingface.co/openai/whisper-large-v2) model for multilingual automatic speech recognition (ASR) using a combination of `int8` quantization and LoRA. You'll train Whisper for multilingual ASR on Marathi from the [Common Voice 11.0](https://huggingface.co/datasets/mozilla-foundation/common_voice_11_0) dataset.
Before you start, make sure you have all the necessary libraries installed:
```bash
!pip install -q peft transformers datasets accelerate evaluate jiwer bitsandbytes
```
## Setup
Let's take care of some of the setup first so you can start training faster later. Set the `CUDA_VISIBLE_DEVICES` to `0` to use the first GPU on your machine. Then you can specify the model name (either a Hub model repository id or a path to a directory containing the model), language and language abbreviation to train on, the task type, and the dataset name:
```py
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
model_name_or_path = "openai/whisper-large-v2"
language = "Marathi"
language_abbr = "mr"
task = "transcribe"
dataset_name = "mozilla-foundation/common_voice_11_0"
```
You can also log in to your Hugging Face account to save and share your trained model on the Hub if you'd like:
```py
from huggingface_hub import notebook_login
notebook_login()
```
## Load dataset and metric
The [Common Voice 11.0](https://huggingface.co/datasets/mozilla-foundation/common_voice_11_0) dataset contains many hours of recorded speech in many different languages. This guide uses the [Marathi](https://huggingface.co/datasets/mozilla-foundation/common_voice_11_0/viewer/mr/train) language as an example, but feel free to use any other language you're interested in.
Initialize a [`~datasets.DatasetDict`] structure, and load the `train` split (combining the `train` and `validation` splits into one) and the `test` split from the dataset into it:
```py
from datasets import load_dataset, DatasetDict
common_voice = DatasetDict()
common_voice["train"] = load_dataset(dataset_name, language_abbr, split="train+validation", use_auth_token=True)
common_voice["test"] = load_dataset(dataset_name, language_abbr, split="test", use_auth_token=True)
common_voice["train"][0]
```
## Preprocess dataset
Let's prepare the dataset for training. Load a feature extractor, tokenizer, and processor. You should also pass the language and task to the tokenizer and processor so they know how to process the inputs:
```py
from transformers import AutoFeatureExtractor, AutoTokenizer, AutoProcessor
feature_extractor = AutoFeatureExtractor.from_pretrained(model_name_or_path)
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, language=language, task=task)
processor = AutoProcessor.from_pretrained(model_name_or_path, language=language, task=task)
```
You'll only be training on the `sentence` and `audio` columns, so you can remove the rest of the metadata with [`~datasets.Dataset.remove_columns`]:
```py
common_voice = common_voice.remove_columns(
["accent", "age", "client_id", "down_votes", "gender", "locale", "path", "segment", "up_votes"]
)
common_voice["train"][0]
{
"audio": {
"path": "/root/.cache/huggingface/datasets/downloads/extracted/f7e1ef6a2d14f20194999aad5040c5d4bb3ead1377de3e1bbc6e9dba34d18a8a/common_voice_mr_30585613.mp3",
"array": array(
[1.13686838e-13, -1.42108547e-13, -1.98951966e-13, ..., 4.83472422e-06, 3.54798703e-06, 1.63231743e-06]
),
"sampling_rate": 48000,
},
"sentence": "आईचे आजारपण वाढत चालले, तसतशी मथीही नीट खातपीतनाशी झाली.",
}
```
If you look at the `sampling_rate`, you'll see the audio was sampled at 48kHz. The Whisper model was pretrained on audio inputs at 16kHz, which means you'll need to downsample the audio inputs to match what the model was pretrained on. Downsample the audio by using the [`~datasets.Dataset.cast_column`] method on the `audio` column, and set the `sampling_rate` to 16kHz. The audio input is resampled on the fly the next time you call it:
```py
from datasets import Audio
common_voice = common_voice.cast_column("audio", Audio(sampling_rate=16000))
common_voice["train"][0]
{
"audio": {
"path": "/root/.cache/huggingface/datasets/downloads/extracted/f7e1ef6a2d14f20194999aad5040c5d4bb3ead1377de3e1bbc6e9dba34d18a8a/common_voice_mr_30585613.mp3",
"array": array(
[-3.06954462e-12, -3.63797881e-12, -4.54747351e-12, ..., -7.74800901e-06, -1.74738125e-06, 4.36312439e-06]
),
"sampling_rate": 16000,
},
"sentence": "आईचे आजारपण वाढत चालले, तसतशी मथीही नीट खातपीतनाशी झाली.",
}
```
Once you've cleaned up the dataset, you can write a function to generate the correct model inputs. The function should:
1. Resample the audio inputs to 16kHz by loading the `audio` column.
2. Compute the input features from the audio `array` using the feature extractor.
3. Tokenize the `sentence` column to the input labels.
```py
def prepare_dataset(batch):
    audio = batch["audio"]
    batch["input_features"] = feature_extractor(audio["array"], sampling_rate=audio["sampling_rate"]).input_features[0]
    batch["labels"] = tokenizer(batch["sentence"]).input_ids
    return batch
```
Apply the `prepare_dataset` function to the dataset with the [`~datasets.Dataset.map`] function, and set the `num_proc` argument to `2` to enable multiprocessing (if `map` hangs, then set `num_proc=1`):
```py
common_voice = common_voice.map(prepare_dataset, remove_columns=common_voice.column_names["train"], num_proc=2)
```
Finally, create a `DataCollator` class to pad the labels in each batch to the maximum length, and replace padding with `-100` so they're ignored by the loss function. Then initialize an instance of the data collator:
```py
import torch
from dataclasses import dataclass
from typing import Any, Dict, List, Union
@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    processor: Any

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        input_features = [{"input_features": feature["input_features"]} for feature in features]
        batch = self.processor.feature_extractor.pad(input_features, return_tensors="pt")

        label_features = [{"input_ids": feature["labels"]} for feature in features]
        labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt")

        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        if (labels[:, 0] == self.processor.tokenizer.bos_token_id).all().cpu().item():
            labels = labels[:, 1:]

        batch["labels"] = labels
        return batch


data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)
```
## Train
Now that the dataset is ready, you can turn your attention to the model. Start by loading the pretrained [`openai/whisper-large-v2`](https://huggingface.co/openai/whisper-large-v2) model from [`~transformers.AutoModelForSpeechSeq2Seq`], and make sure to set the [`~transformers.BitsAndBytesConfig.load_in_8bit`] argument to `True` to enable `int8` quantization. The `device_map="auto"` argument automatically determines how to load and store the model weights:
```py
from transformers import AutoModelForSpeechSeq2Seq
model = AutoModelForSpeechSeq2Seq.from_pretrained(model_name_or_path, load_in_8bit=True, device_map="auto")
```
You should configure `forced_decoder_ids=None` because no tokens are used before sampling, and you won't need to suppress any tokens during generation either:
```py
model.config.forced_decoder_ids = None
model.config.suppress_tokens = []
```
To get the model ready for `int8` quantization, use the utility function [`prepare_model_for_int8_training`](https://github.com/huggingface/peft/blob/34027fe813756897767b9a6f19ae7f1c4c7b418c/src/peft/utils/other.py#L35) to handle the following:
- casts all the non `int8` modules to full precision (`fp32`) for stability
- adds a forward hook to the input embedding layer to calculate the gradients of the input hidden states
- enables gradient checkpointing for more memory-efficient training
```py
from peft import prepare_model_for_int8_training
model = prepare_model_for_int8_training(model)
```
Let's also apply LoRA to the training to make it even more efficient. Load a [`~peft.LoraConfig`] and configure the following parameters:
- `r`, the dimension of the low-rank matrices
- `lora_alpha`, scaling factor for the weight matrices
- `target_modules`, the name of the attention matrices to apply LoRA to (`q_proj` and `v_proj`, or query and value in this case)
- `lora_dropout`, dropout probability of the LoRA layers
- `bias`, set to `none`
<Tip>
💡 The weight matrix is scaled by `lora_alpha/r`, and a higher `lora_alpha` value assigns more weight to the LoRA activations (with `r=32` and `lora_alpha=64` below, the update is scaled by 2). For performance, we recommend setting `bias` to `"none"` first, then `lora_only`, before trying `all`.
</Tip>
```py
from peft import LoraConfig, get_peft_model
config = LoraConfig(r=32, lora_alpha=64, target_modules=["q_proj", "v_proj"], lora_dropout=0.05, bias="none")
```
After you set up the [`~peft.LoraConfig`], wrap it and the base model with the [`get_peft_model`] function to create a [`PeftModel`]. Print out the number of trainable parameters to see how much more efficient LoRA is compared to fully training the model!
```py
model = get_peft_model(model, config)
model.print_trainable_parameters()
"trainable params: 15728640 || all params: 1559033600 || trainable%: 1.0088711365810203"
```
Now you're ready to define some training hyperparameters in the [`~transformers.Seq2SeqTrainingArguments`] class, such as where to save the model to, batch size, learning rate, and number of epochs to train for. The [`PeftModel`] doesn't have the same signature as the base model, so you'll need to explicitly set `remove_unused_columns=False` and `label_names=["labels"]`.
```py
from transformers import Seq2SeqTrainingArguments
training_args = Seq2SeqTrainingArguments(
output_dir="your-name/int8-whisper-large-v2-asr",
per_device_train_batch_size=8,
gradient_accumulation_steps=1,
learning_rate=1e-3,
warmup_steps=50,
num_train_epochs=3,
evaluation_strategy="epoch",
fp16=True,
per_device_eval_batch_size=8,
generation_max_length=128,
logging_steps=25,
remove_unused_columns=False,
label_names=["labels"],
)
```
It is also a good idea to write a custom [`~transformers.TrainerCallback`] to save model checkpoints during training:
```py
from transformers import TrainerCallback, TrainingArguments, TrainerState, TrainerControl
from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR


class SavePeftModelCallback(TrainerCallback):
    def on_save(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        **kwargs,
    ):
        checkpoint_folder = os.path.join(args.output_dir, f"{PREFIX_CHECKPOINT_DIR}-{state.global_step}")

        peft_model_path = os.path.join(checkpoint_folder, "adapter_model")
        kwargs["model"].save_pretrained(peft_model_path)

        pytorch_model_path = os.path.join(checkpoint_folder, "pytorch_model.bin")
        if os.path.exists(pytorch_model_path):
            os.remove(pytorch_model_path)
        return control
```
Pass the `Seq2SeqTrainingArguments`, model, datasets, data collator, tokenizer, and callback to the [`~transformers.Seq2SeqTrainer`]. You can optionally set `model.config.use_cache = False` to silence any warnings. Once everything is ready, call [`~transformers.Trainer.train`] to start training!
```py
from transformers import Seq2SeqTrainer, TrainerCallback, Seq2SeqTrainingArguments, TrainerState, TrainerControl
trainer = Seq2SeqTrainer(
args=training_args,
model=model,
train_dataset=common_voice["train"],
eval_dataset=common_voice["test"],
data_collator=data_collator,
tokenizer=processor.feature_extractor,
callbacks=[SavePeftModelCallback],
)
model.config.use_cache = False
trainer.train()
```
## Evaluate
[Word error rate](https://huggingface.co/spaces/evaluate-metric/wer) (WER) is a common metric for evaluating ASR models. Load the WER metric from 🤗 Evaluate:
```py
import evaluate
metric = evaluate.load("wer")
```
Write a loop to evaluate the model performance. Set the model to evaluation mode first, and write the loop with [`torch.cuda.amp.autocast()`](https://pytorch.org/docs/stable/amp.html) because `int8` training requires autocasting. Then, pass a batch of examples to the model to evaluate. Get the decoded predictions and labels, and add them as a batch to the WER metric before calling `compute` to get the final WER score:
```py
from torch.utils.data import DataLoader
from tqdm import tqdm
import numpy as np
import gc
eval_dataloader = DataLoader(common_voice["test"], batch_size=8, collate_fn=data_collator)
model.eval()
for step, batch in enumerate(tqdm(eval_dataloader)):
    with torch.cuda.amp.autocast():
        with torch.no_grad():
            generated_tokens = (
                model.generate(
                    input_features=batch["input_features"].to("cuda"),
                    decoder_input_ids=batch["labels"][:, :4].to("cuda"),
                    max_new_tokens=255,
                )
                .cpu()
                .numpy()
            )
            labels = batch["labels"].cpu().numpy()
            labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
            decoded_preds = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
            decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
            metric.add_batch(
                predictions=decoded_preds,
                references=decoded_labels,
            )
    del generated_tokens, labels, batch
    gc.collect()
wer = 100 * metric.compute()
print(f"{wer=}")
```
## Share model
Once you're happy with your results, you can upload your model to the Hub with the [`~transformers.PreTrainedModel.push_to_hub`] method:
```py
model.push_to_hub("your-name/int8-whisper-large-v2-asr")
```
## Inference
Let's test the model out now!
Instantiate the model configuration from [`PeftConfig`], and from here, you can use the configuration to load the base and [`PeftModel`], tokenizer, processor, and feature extractor. Remember to define the `language` and `task` in the tokenizer, processor, and `forced_decoder_ids`:
```py
from peft import PeftModel, PeftConfig
from transformers import WhisperForConditionalGeneration, WhisperTokenizer, WhisperProcessor

peft_model_id = "smangrul/openai-whisper-large-v2-LORA-colab"
language = "Marathi"
task = "transcribe"
peft_config = PeftConfig.from_pretrained(peft_model_id)
model = WhisperForConditionalGeneration.from_pretrained(
    peft_config.base_model_name_or_path, load_in_8bit=True, device_map="auto"
)
model = PeftModel.from_pretrained(model, peft_model_id)
tokenizer = WhisperTokenizer.from_pretrained(peft_config.base_model_name_or_path, language=language, task=task)
processor = WhisperProcessor.from_pretrained(peft_config.base_model_name_or_path, language=language, task=task)
feature_extractor = processor.feature_extractor
forced_decoder_ids = processor.get_decoder_prompt_ids(language=language, task=task)
```
Load an audio sample (you can listen to it in the [Dataset Preview](https://huggingface.co/datasets/stevhliu/dummy)) to transcribe, and the [`~transformers.AutomaticSpeechRecognitionPipeline`]:
```py
from transformers import AutomaticSpeechRecognitionPipeline
audio = "https://huggingface.co/datasets/stevhliu/dummy/resolve/main/mrt_01523_00028548203.wav"
pipe = AutomaticSpeechRecognitionPipeline(model=model, tokenizer=tokenizer, feature_extractor=feature_extractor)
```
Then use the pipeline with autocast as a context manager on the audio sample:
```py
with torch.cuda.amp.autocast():
    text = pipe(audio, generate_kwargs={"forced_decoder_ids": forced_decoder_ids}, max_new_tokens=255)["text"]
text
"मी तुमच्यासाठी काही करू शकतो का?"
```

# P-tuning for sequence classification
It is challenging to finetune large language models for downstream tasks because they have so many parameters. To work around this, you can use *prompts* to steer the model toward a particular downstream task without fully finetuning a model. Typically, these prompts are handcrafted, which may be impractical because you need very large validation sets to find the best prompts. *P-tuning* is a method for automatically searching and optimizing for better prompts in a continuous space.
<Tip>
💡 Read [GPT Understands, Too](https://arxiv.org/abs/2103.10385) to learn more about p-tuning.
</Tip>
This guide will show you how to train a [`roberta-large`](https://huggingface.co/roberta-large) model (but you can also use any of the GPT, OPT, or BLOOM models) with p-tuning on the `mrpc` configuration of the [GLUE](https://huggingface.co/datasets/glue) benchmark.
Before you begin, make sure you have all the necessary libraries installed:
```bash
!pip install -q peft transformers datasets evaluate
```
## Setup
To get started, import 🤗 Transformers to create the base model, 🤗 Datasets to load a dataset, 🤗 Evaluate to load an evaluation metric, and 🤗 PEFT to create a [`PeftModel`] and setup the configuration for p-tuning.
Define the model, dataset, and some basic training hyperparameters:
```py
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer,
)
from peft import (
    get_peft_config,
    get_peft_model,
    get_peft_model_state_dict,
    set_peft_model_state_dict,
    PeftType,
    PromptEncoderConfig,
)
from datasets import load_dataset
import evaluate
import torch

model_name_or_path = "roberta-large"
task = "mrpc"
num_epochs = 20
lr = 1e-3
batch_size = 32
```
## Load dataset and metric
Next, load the `mrpc` configuration - a corpus of sentence pairs labeled according to whether they're semantically equivalent or not - from the [GLUE](https://huggingface.co/datasets/glue) benchmark:
```py
dataset = load_dataset("glue", task)
dataset["train"][0]
{
    "sentence1": 'Amrozi accused his brother , whom he called " the witness " , of deliberately distorting his evidence .',
    "sentence2": 'Referring to him as only " the witness " , Amrozi accused his brother of deliberately distorting his evidence .',
    "label": 1,
    "idx": 0,
}
```
From 🤗 Evaluate, load a metric for evaluating the model's performance. The evaluation module returns the accuracy and F1 scores associated with this specific task.
```py
metric = evaluate.load("glue", task)
```
Now you can use the `metric` to write a function that computes the accuracy and F1 scores. The `compute_metrics` function calculates the scores from the model predictions and labels:
```py
import numpy as np
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return metric.compute(predictions=predictions, references=labels)
```
## Preprocess dataset
Initialize the tokenizer and configure the padding token to use. If you're using a GPT, OPT, or BLOOM model, you should set the `padding_side` to the left; otherwise it'll be set to the right. Tokenize the sentence pairs and truncate them to the maximum length.
```py
if any(k in model_name_or_path for k in ("gpt", "opt", "bloom")):
    padding_side = "left"
else:
    padding_side = "right"

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, padding_side=padding_side)
if getattr(tokenizer, "pad_token_id") is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id

def tokenize_function(examples):
    # max_length=None => use the model max length (it's actually the default)
    outputs = tokenizer(examples["sentence1"], examples["sentence2"], truncation=True, max_length=None)
    return outputs
```
Use [`~datasets.Dataset.map`] to apply the `tokenize_function` to the dataset, and remove the unprocessed columns because the model won't need those. You should also rename the `label` column to `labels` because that is the expected name for the labels by models in the 🤗 Transformers library.
```py
tokenized_datasets = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=["idx", "sentence1", "sentence2"],
)
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
```
Create a collator function with [`~transformers.DataCollatorWithPadding`] to pad the examples in the batches to the `longest` sequence in the batch:
```py
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding="longest")
```
## Train
P-tuning uses a prompt encoder to optimize the prompt parameters, so you'll need to initialize the [`PromptEncoderConfig`] with several arguments:
- `task_type`: the type of task you're training on, in this case it is sequence classification or `SEQ_CLS`
- `num_virtual_tokens`: the number of virtual tokens to use, or in other words, the prompt
- `encoder_hidden_size`: the hidden size of the encoder used to optimize the prompt parameters
```py
peft_config = PromptEncoderConfig(task_type="SEQ_CLS", num_virtual_tokens=20, encoder_hidden_size=128)
```
Create the base `roberta-large` model from [`~transformers.AutoModelForSequenceClassification`], and then wrap the base model and `peft_config` with [`get_peft_model`] to create a [`PeftModel`]. If you're curious to see how many parameters you're actually training compared to training on all the model parameters, you can print it out with [`~peft.PeftModel.print_trainable_parameters`]:
```py
model = AutoModelForSequenceClassification.from_pretrained(model_name_or_path, return_dict=True)
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()
"trainable params: 1351938 || all params: 355662082 || trainable%: 0.38011867680626127"
```
From the 🤗 Transformers library, set up the [`~transformers.TrainingArguments`] class with where you want to save the model to, the training hyperparameters, how to evaluate the model, and when to save the checkpoints:
```py
training_args = TrainingArguments(
    output_dir="your-name/roberta-large-peft-p-tuning",
    learning_rate=1e-3,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=2,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)
```
Then pass the model, `TrainingArguments`, datasets, tokenizer, data collator, and evaluation function to the [`~transformers.Trainer`] class, which'll handle the entire training loop for you. Once you're ready, call [`~transformers.Trainer.train`] to start training!
```py
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)
trainer.train()
trainer.train()
```
## Share model
You can store and share your model on the Hub if you'd like. Log in to your Hugging Face account and enter your token when prompted:
```py
from huggingface_hub import notebook_login
notebook_login()
```
Upload the model to a specific model repository on the Hub with the [`~transformers.PreTrainedModel.push_to_hub`] function:
```py
model.push_to_hub("your-name/roberta-large-peft-p-tuning", use_auth_token=True)
```
## Inference
Once the model has been uploaded to the Hub, anyone can easily use it for inference. Load the configuration and model:
```py
import torch
from peft import PeftModel, PeftConfig
from transformers import AutoModelForSequenceClassification, AutoTokenizer
peft_model_id = "smangrul/roberta-large-peft-p-tuning"
config = PeftConfig.from_pretrained(peft_model_id)
inference_model = AutoModelForSequenceClassification.from_pretrained(config.base_model_name_or_path)
tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)
model = PeftModel.from_pretrained(inference_model, peft_model_id)
```
Get some text and tokenize it:
```py
classes = ["not equivalent", "equivalent"]
sentence1 = "Coast redwood trees are the tallest trees on the planet and can grow over 300 feet tall."
sentence2 = "The coast redwood trees, which can attain a height of over 300 feet, are the tallest trees on earth."
inputs = tokenizer(sentence1, sentence2, truncation=True, padding="longest", return_tensors="pt")
```
Pass the inputs to the model to classify the sentences:
```py
with torch.no_grad():
    outputs = model(**inputs).logits
    print(outputs)

paraphrased_text = torch.softmax(outputs, dim=1).tolist()[0]
for i in range(len(classes)):
    print(f"{classes[i]}: {int(round(paraphrased_text[i] * 100))}%")
"not equivalent: 4%"
"equivalent: 96%"
```

# LoRA for semantic similarity tasks
Low-Rank Adaptation (LoRA) is a reparametrization method that aims to reduce the number of trainable parameters with low-rank representations. The weight matrix is broken down into low-rank matrices that are trained and updated. All the pretrained model parameters remain frozen. After training, the low-rank matrices are added back to the original weights. This makes it more efficient to store and train a LoRA model because there are significantly fewer parameters.
<Tip>
💡 Read [LoRA: Low-Rank Adaptation of Large Language Models](https://arxiv.org/abs/2106.09685) to learn more about LoRA.
</Tip>
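To make the low-rank idea concrete, here is a minimal, self-contained sketch (not part of the training script) of what a LoRA update looks like for a single weight matrix, using the same rank and scaling as the [`LoraConfig`] shown later in this guide:
```py
import torch

d, k, r = 1024, 1024, 8        # weight shape of one linear layer and the LoRA rank
lora_alpha = 16

W = torch.randn(d, k)          # frozen pretrained weight
A = torch.randn(r, k) * 0.01   # LoRA matrices: only A and B are trained
B = torch.zeros(d, r)
scaling = lora_alpha / r

# During training the layer output is W @ x + scaling * (B @ (A @ x));
# after training the low-rank product can be merged back into the weight:
W_merged = W + scaling * (B @ A)

print(W.numel(), A.numel() + B.numel())  # 1048576 frozen vs 16384 trainable values
```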
In this guide, we'll be using a LoRA [script](https://github.com/huggingface/peft/tree/main/examples/feature_extraction) to fine-tune an [`intfloat/e5-large-v2`](https://huggingface.co/intfloat/e5-large-v2) model on the [`smangrul/amazon_esci`](https://huggingface.co/datasets/smangrul/amazon_esci) dataset for semantic similarity tasks. Feel free to explore the script to learn how things work in greater detail!
## Setup
Start by installing 🤗 PEFT from [source](https://github.com/huggingface/peft), and then navigate to the directory containing the training script for fine-tuning the embedding model with LoRA:
```bash
cd peft/examples/feature_extraction
```
Install all the required libraries with:
```bash
pip install -r requirements.txt
```
Let's start by importing all the necessary libraries you'll need:
- 🤗 Transformers for loading the `intfloat/e5-large-v2` model and tokenizer
- 🤗 Accelerate for the training loop
- 🤗 Datasets for loading and preparing the `smangrul/amazon_esci` dataset for training and inference
- 🤗 Evaluate for evaluating the model's performance
- 🤗 PEFT for setting up the LoRA configuration and creating the PEFT model
- 🤗 huggingface_hub for uploading the trained model to HF hub
- hnswlib for creating the search index and doing fast approximate nearest neighbor search
<Tip>
It is assumed that PyTorch with CUDA support is already installed.
</Tip>
## Train
Launch the training script with `accelerate launch` and pass your hyperparameters along with the `--use_peft` argument to enable LoRA.
This guide uses the following [`LoraConfig`]:
```py
peft_config = LoraConfig(
    r=8,
    lora_alpha=16,
    bias="none",
    task_type=TaskType.FEATURE_EXTRACTION,
    target_modules=["key", "query", "value"],
)
```
Here's what a full set of script arguments may look like when running in Colab on a V100 GPU with standard RAM:
```bash
accelerate launch \
    --mixed_precision="fp16" \
    peft_lora_embedding_semantic_search.py \
    --dataset_name="smangrul/amazon_esci" \
    --max_length=70 \
    --model_name_or_path="intfloat/e5-large-v2" \
    --per_device_train_batch_size=64 \
    --per_device_eval_batch_size=128 \
    --learning_rate=5e-4 \
    --weight_decay=0.0 \
    --num_train_epochs 3 \
    --gradient_accumulation_steps=1 \
    --output_dir="results/peft_lora_e5_ecommerce_semantic_search_colab" \
    --seed=42 \
    --push_to_hub \
    --hub_model_id="smangrul/peft_lora_e5_ecommerce_semantic_search_colab" \
    --with_tracking \
    --report_to="wandb" \
    --use_peft \
    --checkpointing_steps "epoch"
```
## Dataset for semantic similarity
The dataset we'll be using is a small subset of the [esci-data](https://github.com/amazon-science/esci-data.git) dataset (it can be found on Hub at [smangrul/amazon_esci](https://huggingface.co/datasets/smangrul/amazon_esci)).
Each sample contains a tuple of `(query, product_title, relevance_label)` where `relevance_label` is `1` if the product matches the intent of the `query`, otherwise it is `0`.
Our task is to build an embedding model that can retrieve semantically similar products given a product query.
This is usually the first stage in building a product search engine to retrieve all the potentially relevant products of a given query.
Typically, this involves using Bi-Encoder models to cross-join the query and millions of products which could blow up quickly.
Instead, you can use a Transformer model to retrieve the top K nearest similar products for a given query by
embedding the query and products in the same latent embedding space.
The millions of products are embedded offline to create a search index.
At run time, only the query is embedded by the model, and products are retrieved from the search index with a
fast approximate nearest neighbor search library such as [FAISS](https://github.com/facebookresearch/faiss) or [HNSWlib](https://github.com/nmslib/hnswlib).
The next stage involves reranking the retrieved list of products to return the most relevant ones;
this stage can utilize cross-encoder based models as the cross-join between the query and a limited set of retrieved products.
The diagram below from [awesome-semantic-search](https://github.com/rom1504/awesome-semantic-search) outlines a rough semantic search pipeline:
<div class="flex justify-center">
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/peft/semantic_search_pipeline.png"
alt="Semantic Search Pipeline"/>
</div>
For this task guide, we will explore the first stage of training an embedding model to predict semantically similar products
given a product query.
## Training script deep dive
We finetune [e5-large-v2](https://huggingface.co/intfloat/e5-large-v2) which tops the [MTEB benchmark](https://huggingface.co/spaces/mteb/leaderboard) using PEFT-LoRA.
[`AutoModelForSentenceEmbedding`] returns the query and product embeddings, and the `mean_pooling` function pools them across the sequence dimension and normalizes them:
```py
class AutoModelForSentenceEmbedding(nn.Module):
    def __init__(self, model_name, tokenizer, normalize=True):
        super(AutoModelForSentenceEmbedding, self).__init__()
        self.model = AutoModel.from_pretrained(model_name)
        self.normalize = normalize
        self.tokenizer = tokenizer

    def forward(self, **kwargs):
        model_output = self.model(**kwargs)
        embeddings = self.mean_pooling(model_output, kwargs["attention_mask"])
        if self.normalize:
            embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=1)
        return embeddings

    def mean_pooling(self, model_output, attention_mask):
        token_embeddings = model_output[0]  # First element of model_output contains all token embeddings
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)

    def __getattr__(self, name: str):
        """Forward missing attributes to the wrapped module."""
        try:
            return super().__getattr__(name)  # defer to nn.Module's logic
        except AttributeError:
            return getattr(self.model, name)


def get_cosine_embeddings(query_embs, product_embs):
    return torch.sum(query_embs * product_embs, axis=1)


def get_loss(cosine_score, labels):
    return torch.mean(torch.square(labels * (1 - cosine_score) + torch.clamp((1 - labels) * cosine_score, min=0.0)))
```
The `get_cosine_embeddings` function computes the cosine similarity and the `get_loss` function computes the loss. The loss enables the model to learn that a cosine score of `1` for query and product pairs is relevant, and a cosine score of `0` or below is irrelevant.
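To see how the loss behaves, here is a small toy example (reusing `get_loss` from the snippet above with made-up scores): a relevant pair is pulled toward a cosine score of `1`, while an irrelevant pair is only penalized as long as its cosine score stays above `0`:
```py
labels = torch.tensor([1.0, 0.0])        # relevant pair, irrelevant pair
cosine_score = torch.tensor([0.9, 0.3])
print(get_loss(cosine_score, labels))    # ((1 - 0.9)**2 + 0.3**2) / 2 = tensor(0.0500)
```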
Define the [`PeftConfig`] with your LoRA hyperparameters, and create a [`PeftModel`]. We use 🤗 Accelerate for handling all device management, mixed precision training, gradient accumulation, WandB tracking, and saving/loading utilities.
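The snippet below is a condensed sketch of how the base model, the [`LoraConfig`], and 🤗 Accelerate fit together; it assumes the class and names from the snippets above, while the actual script wires in more options such as tracking and checkpointing:
```py
import torch
from accelerate import Accelerator
from peft import LoraConfig, TaskType, get_peft_model
from transformers import AutoTokenizer

accelerator = Accelerator(mixed_precision="fp16")

model_name_or_path = "intfloat/e5-large-v2"
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
model = AutoModelForSentenceEmbedding(model_name_or_path, tokenizer)  # class defined above

peft_config = LoraConfig(
    r=8,
    lora_alpha=16,
    bias="none",
    task_type=TaskType.FEATURE_EXTRACTION,
    target_modules=["key", "query", "value"],
)
model = get_peft_model(model, peft_config)

optimizer = torch.optim.AdamW(model.parameters(), lr=5e-4)
model, optimizer = accelerator.prepare(model, optimizer)
```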
## Results
The table below compares the training time, the batch size that could be fit in Colab, and the best ROC-AUC scores between a PEFT model and a fully fine-tuned model:
| Training Type | Training time per epoch (Hrs) | Batch Size that fits | ROC-AUC score (higher is better) |
| ----------------- | ------------- | ---------- | -------- |
| Pre-Trained e5-large-v2 | - | - | 0.68 |
| PEFT | 1.73 | 64 | 0.787 |
| Full Fine-Tuning | 2.33 | 32 | 0.7969 |
The PEFT-LoRA model trains **1.35X** faster and can fit **2X** batch size compared to the fully fine-tuned model, and the performance of PEFT-LoRA is comparable to the fully fine-tuned model with a relative drop of **-1.24%** in ROC-AUC. This gap can probably be closed with bigger models as mentioned in [The Power of Scale for Parameter-Efficient Prompt Tuning
](https://huggingface.co/papers/2104.08691).
## Inference
Let's go! Now that we have the model, we need to create a search index of all the products in our catalog.
Please refer to `peft_lora_embedding_semantic_similarity_inference.ipynb` for the complete inference code.
1. Get a list of ids to products which we can call `ids_to_products_dict`:
```bash
{0: 'RamPro 10" All Purpose Utility Air Tires/Wheels with a 5/8" Diameter Hole with Double Sealed Bearings (Pack of 2)',
1: 'MaxAuto 2-Pack 13x5.00-6 2PLY Turf Mower Tractor Tire with Yellow Rim, (3" Centered Hub, 3/4" Bushings )',
2: 'NEIKO 20601A 14.5 inch Steel Tire Spoon Lever Iron Tool Kit | Professional Tire Changing Tool for Motorcycle, Dirt Bike, Lawn Mower | 3 pcs Tire Spoons | 3 Rim Protector | Valve Tool | 6 Valve Cores',
3: '2PK 13x5.00-6 13x5.00x6 13x5x6 13x5-6 2PLY Turf Mower Tractor Tire with Gray Rim',
4: '(Set of 2) 15x6.00-6 Husqvarna/Poulan Tire Wheel Assy .75" Bearing',
5: 'MaxAuto 2 Pcs 16x6.50-8 Lawn Mower Tire for Garden Tractors Ridings, 4PR, Tubeless',
6: 'Dr.Roc Tire Spoon Lever Dirt Bike Lawn Mower Motorcycle Tire Changing Tools with Durable Bag 3 Tire Irons 2 Rim Protectors 1 Valve Stems Set TR412 TR413',
7: 'MARASTAR 21446-2PK 15x6.00-6" Front Tire Assembly Replacement-Craftsman Mower, Pack of 2',
8: '15x6.00-6" Front Tire Assembly Replacement for 100 and 300 Series John Deere Riding Mowers - 2 pack',
9: 'Honda HRR Wheel Kit (2 Front 44710-VL0-L02ZB, 2 Back 42710-VE2-M02ZE)',
10: 'Honda 42710-VE2-M02ZE (Replaces 42710-VE2-M01ZE) Lawn Mower Rear Wheel Set of 2' ...
```
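One way to build such a mapping is directly from the dataset; a minimal sketch, assuming the product titles live in a `product_title` column as described above (the exact column name and split may differ):
```py
from datasets import load_dataset

# Hypothetical construction of the id -> product title mapping
dataset = load_dataset("smangrul/amazon_esci", split="train")
products = sorted(set(dataset["product_title"]))  # deduplicated product titles
ids_to_products_dict = {i: product for i, product in enumerate(products)}
```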
2. Use the trained [smangrul/peft_lora_e5_ecommerce_semantic_search_colab](https://huggingface.co/smangrul/peft_lora_e5_ecommerce_semantic_search_colab) model to get the product embeddings:
```py
import numpy as np
import torch
from tqdm import tqdm
from peft import PeftModel

# base model
model = AutoModelForSentenceEmbedding(model_name_or_path, tokenizer)

# peft config and wrapping
model = PeftModel.from_pretrained(model, peft_model_id)

device = "cuda"
model.to(device)
model.eval()
model = model.merge_and_unload()

num_products = len(dataset)
d = 1024
product_embeddings_array = np.zeros((num_products, d))
for step, batch in enumerate(tqdm(dataloader)):
    with torch.no_grad():
        with torch.amp.autocast(dtype=torch.bfloat16, device_type="cuda"):
            product_embs = model(**{k: v.to(device) for k, v in batch.items()}).detach().float().cpu()
    start_index = step * batch_size
    end_index = start_index + batch_size if (start_index + batch_size) < num_products else num_products
    product_embeddings_array[start_index:end_index] = product_embs
    del product_embs, batch
```
3. Create a search index using HNSWlib:
```py
import hnswlib

def construct_search_index(dim, num_elements, data):
    # Declaring index; possible space options are l2, cosine or ip
    search_index = hnswlib.Index(space="ip", dim=dim)

    # Initializing index - the maximum number of elements should be known beforehand
    search_index.init_index(max_elements=num_elements, ef_construction=200, M=100)

    # Element insertion (can be called several times)
    ids = np.arange(num_elements)
    search_index.add_items(data, ids)
    return search_index

product_search_index = construct_search_index(d, num_products, product_embeddings_array)
```
4. Get the query embeddings and nearest neighbors:
```py
def get_query_embeddings(query, model, tokenizer, device):
    inputs = tokenizer(query, padding="max_length", max_length=70, truncation=True, return_tensors="pt")
    model.eval()
    with torch.no_grad():
        query_embs = model(**{k: v.to(device) for k, v in inputs.items()}).detach().cpu()
    return query_embs[0]


def get_nearest_neighbours(k, search_index, query_embeddings, ids_to_products_dict, threshold=0.7):
    # Controlling the recall by setting ef; ef should always be > k
    search_index.set_ef(100)

    # Query dataset, k - number of the closest elements (returns 2 numpy arrays)
    labels, distances = search_index.knn_query(query_embeddings, k=k)

    return [
        (ids_to_products_dict[label], (1 - distance))
        for label, distance in zip(labels[0], distances[0])
        if (1 - distance) >= threshold
    ]
```
5. Let's test it out with the query `deep learning books`:
```py
query = "deep learning books"
k = 10
query_embeddings = get_query_embeddings(query, model, tokenizer, device)
search_results = get_nearest_neighbours(k, product_search_index, query_embeddings, ids_to_products_dict, threshold=0.7)
print(f"{query=}")
for product, cosine_sim_score in search_results:
print(f"cosine_sim_score={round(cosine_sim_score,2)} {product=}")
```
Output:
```bash
query='deep learning books'
cosine_sim_score=0.95 product='Deep Learning (The MIT Press Essential Knowledge series)'
cosine_sim_score=0.93 product='Practical Deep Learning: A Python-Based Introduction'
cosine_sim_score=0.9 product='Hands-On Machine Learning with Scikit-Learn and TensorFlow: Concepts, Tools, and Techniques to Build Intelligent Systems'
cosine_sim_score=0.9 product='Machine Learning: A Hands-On, Project-Based Introduction to Machine Learning for Absolute Beginners: Mastering Engineering ML Systems using Scikit-Learn and TensorFlow'
cosine_sim_score=0.9 product='Mastering Machine Learning on AWS: Advanced machine learning in Python using SageMaker, Apache Spark, and TensorFlow'
cosine_sim_score=0.9 product='The Hundred-Page Machine Learning Book'
cosine_sim_score=0.89 product='Hands-On Machine Learning with Scikit-Learn, Keras, and TensorFlow: Concepts, Tools, and Techniques to Build Intelligent Systems'
cosine_sim_score=0.89 product='Machine Learning: A Journey from Beginner to Advanced Including Deep Learning, Scikit-learn and Tensorflow'
cosine_sim_score=0.88 product='Mastering Machine Learning with scikit-learn'
cosine_sim_score=0.88 product='Mastering Machine Learning with scikit-learn - Second Edition: Apply effective learning algorithms to real-world problems using scikit-learn'
```
Books on deep learning and machine learning are retrieved even though `machine learning` wasn't included in the query. This means the model has learned that these books are semantically relevant to the query based on the purchase behavior of customers on Amazon.
The next steps would ideally involve using ONNX/TensorRT to optimize the model and using a Triton server to host it. Check out 🤗 [Optimum](https://huggingface.co/docs/optimum/index) for related optimizations for efficient serving!

<!--Copyright 2023 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
-->
# Semantic segmentation using LoRA
This guide demonstrates how to use LoRA, a low-rank approximation technique, to finetune a SegFormer model variant for semantic segmentation.
By using LoRA from 🤗 PEFT, we can reduce the number of trainable parameters in the SegFormer model to only 14% of the original trainable parameters.
LoRA achieves this reduction by adding low-rank "update matrices" to specific blocks of the model, such as the attention
blocks. During fine-tuning, only these matrices are trained, while the original model parameters are left unchanged.
At inference time, the update matrices are merged with the original model parameters to produce the final classification result.
For more information on LoRA, please refer to the [original LoRA paper](https://arxiv.org/abs/2106.09685).
## Install dependencies
Install the libraries required for model training:
```bash
!pip install transformers accelerate evaluate datasets peft -q
```
## Authenticate to share your model
To share the finetuned model with the community at the end of the training, authenticate using your 🤗 token.
You can obtain your token from your [account settings](https://huggingface.co/settings/token).
```python
from huggingface_hub import notebook_login
notebook_login()
```
## Load a dataset
To ensure that this example runs within a reasonable time frame, here we are limiting the number of instances from the training
set of the [SceneParse150 dataset](https://huggingface.co/datasets/scene_parse_150) to 150.
```python
from datasets import load_dataset
ds = load_dataset("scene_parse_150", split="train[:150]")
```
Next, split the dataset into train and test sets.
```python
ds = ds.train_test_split(test_size=0.1)
train_ds = ds["train"]
test_ds = ds["test"]
```
## Prepare label maps
Create a dictionary that maps a label id to a label class, which will be useful when setting up the model later:
* `label2id`: maps the semantic classes of the dataset to integer ids.
* `id2label`: maps integer ids back to the semantic classes.
```python
import json
from huggingface_hub import cached_download, hf_hub_url
repo_id = "huggingface/label-files"
filename = "ade20k-id2label.json"
id2label = json.load(open(cached_download(hf_hub_url(repo_id, filename, repo_type="dataset")), "r"))
id2label = {int(k): v for k, v in id2label.items()}
label2id = {v: k for k, v in id2label.items()}
num_labels = len(id2label)
```
## Prepare datasets for training and evaluation
Next, load the SegFormer image processor to prepare the images and annotations for the model. This dataset uses the
zero-index as the background class, so make sure to set `reduce_labels=True` to subtract one from all labels since the
background class is not among the 150 classes.
```python
from transformers import AutoImageProcessor
checkpoint = "nvidia/mit-b0"
image_processor = AutoImageProcessor.from_pretrained(checkpoint, reduce_labels=True)
```
Add a function to apply data augmentation to the images, so that the model is more robust against overfitting. Here we use the
[ColorJitter](https://pytorch.org/vision/stable/generated/torchvision.transforms.ColorJitter.html) function from
[torchvision](https://pytorch.org/vision/stable/index.html) to randomly change the color properties of an image.
```python
from torchvision.transforms import ColorJitter
jitter = ColorJitter(brightness=0.25, contrast=0.25, saturation=0.25, hue=0.1)
```
Add a function to handle grayscale images and ensure that each input image has three color channels, regardless of
whether it was originally grayscale or RGB. The function converts RGB images to array as is, and for grayscale images
that have only one color channel, the function replicates the same channel three times using `np.tile()` before converting
the image into an array.
```python
import numpy as np
from PIL import Image

def handle_grayscale_image(image):
    np_image = np.array(image)
    if np_image.ndim == 2:
        tiled_image = np.tile(np.expand_dims(np_image, -1), 3)
        return Image.fromarray(tiled_image)
    else:
        return Image.fromarray(np_image)
```
Finally, combine everything in two functions that you'll use to transform training and validation data. The two functions
are similar except data augmentation is applied only to the training data.
```python
from PIL import Image
def train_transforms(example_batch):
    images = [jitter(handle_grayscale_image(x)) for x in example_batch["image"]]
    labels = [x for x in example_batch["annotation"]]
    inputs = image_processor(images, labels)
    return inputs


def val_transforms(example_batch):
    images = [handle_grayscale_image(x) for x in example_batch["image"]]
    labels = [x for x in example_batch["annotation"]]
    inputs = image_processor(images, labels)
    return inputs
```
To apply the preprocessing functions over the entire dataset, use the 🤗 Datasets `set_transform` function:
```python
train_ds.set_transform(train_transforms)
test_ds.set_transform(val_transforms)
```
## Create evaluation function
Including a metric during training is helpful for evaluating your model's performance. You can load an evaluation
method with the [🤗 Evaluate](https://huggingface.co/docs/evaluate/index) library. For this task, use
the [mean Intersection over Union (IoU)](https://huggingface.co/spaces/evaluate-metric/mean_iou) metric (see the 🤗 Evaluate
[quick tour](https://huggingface.co/docs/evaluate/a_quick_tour) to learn more about how to load and compute a metric):
```python
import torch
from torch import nn
import evaluate
metric = evaluate.load("mean_iou")
def compute_metrics(eval_pred):
    with torch.no_grad():
        logits, labels = eval_pred
        logits_tensor = torch.from_numpy(logits)
        logits_tensor = nn.functional.interpolate(
            logits_tensor,
            size=labels.shape[-2:],
            mode="bilinear",
            align_corners=False,
        ).argmax(dim=1)

        pred_labels = logits_tensor.detach().cpu().numpy()
        # currently using _compute instead of compute
        # see this issue for more info: https://github.com/huggingface/evaluate/pull/328#issuecomment-1286866576
        metrics = metric._compute(
            predictions=pred_labels,
            references=labels,
            num_labels=len(id2label),
            ignore_index=0,
            reduce_labels=image_processor.reduce_labels,
        )

    per_category_accuracy = metrics.pop("per_category_accuracy").tolist()
    per_category_iou = metrics.pop("per_category_iou").tolist()

    metrics.update({f"accuracy_{id2label[i]}": v for i, v in enumerate(per_category_accuracy)})
    metrics.update({f"iou_{id2label[i]}": v for i, v in enumerate(per_category_iou)})

    return metrics
```
## Load a base model
Before loading a base model, let's define a helper function to check the total number of parameters a model has, as well
as how many of them are trainable.
```python
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param:.2f}"
    )
```
Choose a base model checkpoint. For this example, we use the [SegFormer B0 variant](https://huggingface.co/nvidia/mit-b0).
In addition to the checkpoint, pass the `label2id` and `id2label` dictionaries to let the `AutoModelForSemanticSegmentation` class know that we're
interested in a custom base model where the decoder head should be randomly initialized using the classes from the custom dataset.
```python
from transformers import AutoModelForSemanticSegmentation, TrainingArguments, Trainer
model = AutoModelForSemanticSegmentation.from_pretrained(
    checkpoint, id2label=id2label, label2id=label2id, ignore_mismatched_sizes=True
)
print_trainable_parameters(model)
```
At this point, you can use the `print_trainable_parameters` helper function to verify that 100% of the parameters in the base model (aka `model`) are trainable.
## Wrap the base model as a PeftModel for LoRA training
To leverage the LoRA method, you need to wrap the base model as a `PeftModel`. This involves two steps:
1. Defining the LoRA configuration with `LoraConfig`
2. Wrapping the original `model` with `get_peft_model()` using the config defined in the step above.
```python
from peft import LoraConfig, get_peft_model
config = LoraConfig(
    r=32,
    lora_alpha=32,
    target_modules=["query", "value"],
    lora_dropout=0.1,
    bias="lora_only",
    modules_to_save=["decode_head"],
)
lora_model = get_peft_model(model, config)
print_trainable_parameters(lora_model)
```
Let's review the `LoraConfig`. To enable LoRA technique, we must define the target modules within `LoraConfig` so that
`PeftModel` can update the necessary matrices. Specifically, we want to target the `query` and `value` matrices in the
attention blocks of the base model. These matrices are identified by their respective names, "query" and "value".
Therefore, we should specify these names in the `target_modules` argument of `LoraConfig`.
After we wrap the base model `model` with `PeftModel` and pass the configuration, we get a new model in which only the LoRA parameters (the so-called "update matrices") are trainable, while the pre-trained parameters and the randomly initialized classifier parameters are kept frozen. However, we do want to train the classifier parameters on our custom dataset. By specifying the `modules_to_save` argument, we ensure that the classifier parameters are also trainable, and that they are serialized alongside the LoRA parameters when using utilities like `save_pretrained()` and `push_to_hub()`.
Let's review the rest of the parameters:
- `r`: The dimension used by the LoRA update matrices.
- `lora_alpha`: Scaling factor.
- `bias`: Specifies which `bias` parameters should be trained. `"none"` means none of the `bias` parameters will be trained.
When all is configured, and the base model is wrapped, the `print_trainable_parameters` helper function lets us explore
the number of trainable parameters. Since we're interested in performing **parameter-efficient fine-tuning**,
we should expect to see a lower number of trainable parameters from the `lora_model` in comparison to the original `model`
which is indeed the case here.
You can also manually verify what modules are trainable in the `lora_model`.
```python
for name, param in lora_model.named_parameters():
    if param.requires_grad:
        print(name, param.shape)
```
This confirms that only the LoRA parameters appended to the attention blocks and the `decode_head` parameters are trainable.
## Train the model
Start by defining your training hyperparameters in `TrainingArguments`. You can change the values of most parameters however
you prefer. Make sure to set `remove_unused_columns=False`, otherwise the image column will be dropped, and it's required here.
The only other required parameter is `output_dir` which specifies where to save your model.
At the end of each epoch, the `Trainer` will evaluate the IoU metric and save the training checkpoint.
Note that this example is meant to walk you through the workflow when using PEFT for semantic segmentation. We didn't
perform extensive hyperparameter tuning to achieve optimal results.
```python
model_name = checkpoint.split("/")[-1]

training_args = TrainingArguments(
    output_dir=f"{model_name}-scene-parse-150-lora",
    learning_rate=5e-4,
    num_train_epochs=50,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=2,
    save_total_limit=3,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_steps=5,
    remove_unused_columns=False,
    push_to_hub=True,
    label_names=["labels"],
)
```
Pass the training arguments to `Trainer` along with the model, dataset, and `compute_metrics` function.
Call `train()` to finetune your model.
```python
trainer = Trainer(
    model=lora_model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=test_ds,
    compute_metrics=compute_metrics,
)
trainer.train()
```
## Save the model and run inference
Use the `save_pretrained()` method of the `lora_model` to save the *LoRA-only parameters* locally.
Alternatively, use the `push_to_hub()` method to upload these parameters directly to the Hugging Face Hub
(as shown in the [Image classification using LoRA](image_classification_lora) task guide).
```python
model_id = "segformer-scene-parse-150-lora"
lora_model.save_pretrained(model_id)
```
We can see that the LoRA-only parameters are just **2.2 MB in size**! This greatly improves the portability when using very large models.
```bash
!ls -lh {model_id}
total 2.2M
-rw-r--r-- 1 root root 369 Feb 8 03:09 adapter_config.json
-rw-r--r-- 1 root root 2.2M Feb 8 03:09 adapter_model.bin
```
Let's now prepare an `inference_model` and run inference.
```python
from peft import PeftConfig, PeftModel

config = PeftConfig.from_pretrained(model_id)
model = AutoModelForSemanticSegmentation.from_pretrained(
    checkpoint, id2label=id2label, label2id=label2id, ignore_mismatched_sizes=True
)
inference_model = PeftModel.from_pretrained(model, model_id)
```
Get an image:
```python
import requests
url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/semantic-seg-image.png"
image = Image.open(requests.get(url, stream=True).raw)
image
```
<div class="flex justify-center">
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/semantic-seg-image.png" alt="photo of a room"/>
</div>
Preprocess the image to prepare for inference.
```python
encoding = image_processor(image.convert("RGB"), return_tensors="pt")
```
Run inference with the encoded image.
```python
with torch.no_grad():
    outputs = inference_model(pixel_values=encoding.pixel_values)
    logits = outputs.logits

upsampled_logits = nn.functional.interpolate(
    logits,
    size=image.size[::-1],
    mode="bilinear",
    align_corners=False,
)
pred_seg = upsampled_logits.argmax(dim=1)[0]
```
Next, visualize the results. We need a color palette for this; here, we use `ade_palette()`. Because it is a long array, we don't include it in this guide, so please copy it from [the TensorFlow Model Garden repository](https://github.com/tensorflow/models/blob/3f1ca33afe3c1631b733ea7e40c294273b9e406d/research/deeplab/utils/get_dataset_colormap.py#L51).
```python
import matplotlib.pyplot as plt

color_seg = np.zeros((pred_seg.shape[0], pred_seg.shape[1], 3), dtype=np.uint8)
palette = np.array(ade_palette())
for label, color in enumerate(palette):
    color_seg[pred_seg == label, :] = color
color_seg = color_seg[..., ::-1]  # convert to BGR

img = np.array(image) * 0.5 + color_seg * 0.5  # plot the image with the segmentation map
img = img.astype(np.uint8)

plt.figure(figsize=(15, 10))
plt.imshow(img)
plt.show()
```
As you can see, the results are far from perfect. However, this example is designed to illustrate the end-to-end workflow of
fine-tuning a semantic segmentation model with the LoRA technique, and is not aiming to achieve state-of-the-art
results. The results you see here are the same as you would get by performing full fine-tuning on the same setup (same
model variant, same dataset, same training schedule, etc.), except LoRA allows you to achieve them with a fraction of the total
trainable parameters and in less time.
If you wish to use this example and improve the results, here are some things that you can try:
* Increase the number of training samples.
* Try a larger SegFormer model variant (explore available model variants on the [Hugging Face Hub](https://huggingface.co/models?search=segformer)).
* Try different values for the arguments available in `LoraConfig`.
* Tune the learning rate and batch size.

# Prefix tuning for conditional generation
[[open-in-colab]]
Prefix tuning is an additive method where only a sequence of continuous task-specific vectors is attached to the beginning of the input, or *prefix*. Only the prefix parameters are optimized and added to the hidden states in every layer of the model. The tokens of the input sequence can still attend to the prefix as *virtual tokens*. As a result, prefix tuning stores 1000x fewer parameters than a fully finetuned model, which means you can use one large language model for many tasks.
<Tip>
💡 Read [Prefix-Tuning: Optimizing Continuous Prompts for Generation](https://arxiv.org/abs/2101.00190) to learn more about prefix tuning.
</Tip>
This guide will show you how to apply prefix tuning to train a [`t5-large`](https://huggingface.co/t5-large) model on the `sentences_allagree` subset of the [financial_phrasebank](https://huggingface.co/datasets/financial_phrasebank) dataset.
Before you begin, make sure you have all the necessary libraries installed:
```bash
!pip install -q peft transformers datasets
```
## Setup
Start by defining the model and tokenizer, the text and label columns, and some hyperparameters so it'll be easier to start training later. Set the environment variable `TOKENIZERS_PARALLELISM` to `false` to disable the fast Rust-based tokenizer that processes data in parallel by default, so that you can use multiprocessing in Python.
```py
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, default_data_collator, get_linear_schedule_with_warmup
from peft import get_peft_config, get_peft_model, get_peft_model_state_dict, PrefixTuningConfig, TaskType
from datasets import load_dataset
from torch.utils.data import DataLoader
from tqdm import tqdm
import torch
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["CUDA_VISIBLE_DEVICES"] = "3"
device = "cuda"
model_name_or_path = "t5-large"
tokenizer_name_or_path = "t5-large"
text_column = "sentence"
label_column = "text_label"
max_length = 128
lr = 1e-2
num_epochs = 5
batch_size = 8
```
## Load dataset
For this guide, you'll train on the `sentences_allagree` subset of the [`financial_phrasebank`](https://huggingface.co/datasets/financial_phrasebank) dataset. This dataset contains financial news categorized by sentiment.
Use 🤗 [Datasets](https://huggingface.co/docs/datasets/index) [`~datasets.Dataset.train_test_split`] function to create a training and validation split and convert the `label` value to the more readable `text_label`. All of the changes can be applied with the [`~datasets.Dataset.map`] function:
```py
from datasets import load_dataset
dataset = load_dataset("financial_phrasebank", "sentences_allagree")
dataset = dataset["train"].train_test_split(test_size=0.1)
dataset["validation"] = dataset["test"]
del dataset["test"]
classes = dataset["train"].features["label"].names
dataset = dataset.map(
    lambda x: {"text_label": [classes[label] for label in x["label"]]},
    batched=True,
    num_proc=1,
)
dataset["train"][0]
{"sentence": "Profit before taxes was EUR 4.0 mn , down from EUR 4.9 mn .", "label": 0, "text_label": "negative"}
```
## Preprocess dataset
Initialize a tokenizer, and create a function to pad and truncate the `model_inputs` and `labels`:
```py
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
def preprocess_function(examples):
    inputs = examples[text_column]
    targets = examples[label_column]
    model_inputs = tokenizer(inputs, max_length=max_length, padding="max_length", truncation=True, return_tensors="pt")
    labels = tokenizer(targets, max_length=2, padding="max_length", truncation=True, return_tensors="pt")
    labels = labels["input_ids"]
    labels[labels == tokenizer.pad_token_id] = -100
    model_inputs["labels"] = labels
    return model_inputs
```
Use the [`~datasets.Dataset.map`] function to apply the `preprocess_function` to the dataset. You can remove the unprocessed columns since the model doesn't need them anymore:
```py
processed_datasets = dataset.map(
    preprocess_function,
    batched=True,
    num_proc=1,
    remove_columns=dataset["train"].column_names,
    load_from_cache_file=False,
    desc="Running tokenizer on dataset",
)
```
Create a [`DataLoader`](https://pytorch.org/docs/stable/data.html#torch.utils.data.DataLoader) from the `train` and `eval` datasets. Set `pin_memory=True` to speed up the data transfer to the GPU during training if the samples in your dataset are on a CPU.
```py
train_dataset = processed_datasets["train"]
eval_dataset = processed_datasets["validation"]
train_dataloader = DataLoader(
    train_dataset, shuffle=True, collate_fn=default_data_collator, batch_size=batch_size, pin_memory=True
)
eval_dataloader = DataLoader(eval_dataset, collate_fn=default_data_collator, batch_size=batch_size, pin_memory=True)
```
## Train model
Now you can setup your model and make sure it is ready for training. Specify the task in [`PrefixTuningConfig`], create the base `t5-large` model from [`~transformers.AutoModelForSeq2SeqLM`], and then wrap the model and configuration in a [`PeftModel`]. Feel free to print the [`PeftModel`]'s parameters and compare it to fully training all the model parameters to see how much more efficient it is!
```py
peft_config = PrefixTuningConfig(task_type=TaskType.SEQ_2_SEQ_LM, inference_mode=False, num_virtual_tokens=20)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name_or_path)
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()
"trainable params: 983040 || all params: 738651136 || trainable%: 0.13308583065659835"
```
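The trainable parameter count follows directly from the prefix shape: roughly speaking, prefix tuning learns a key and a value vector per virtual token for every transformer layer reported by the model config, so for `t5-large` (24 layers, hidden size 1024) and 20 virtual tokens the arithmetic works out to the number printed above:
```py
num_virtual_tokens = 20
num_layers = 24     # t5-large layers as reported by the model config
token_dim = 1024    # t5-large hidden size
print(num_virtual_tokens * num_layers * 2 * token_dim)  # 983040
```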
Set up the optimizer and learning rate scheduler:
```py
optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
lr_scheduler = get_linear_schedule_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=(len(train_dataloader) * num_epochs),
)
```
Move the model to the GPU, and then write a training loop to begin!
```py
model = model.to(device)

for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for step, batch in enumerate(tqdm(train_dataloader)):
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        total_loss += loss.detach().float()
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

    model.eval()
    eval_loss = 0
    eval_preds = []
    for step, batch in enumerate(tqdm(eval_dataloader)):
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = model(**batch)
        loss = outputs.loss
        eval_loss += loss.detach().float()
        eval_preds.extend(
            tokenizer.batch_decode(torch.argmax(outputs.logits, -1).detach().cpu().numpy(), skip_special_tokens=True)
        )

    eval_epoch_loss = eval_loss / len(eval_dataloader)
    eval_ppl = torch.exp(eval_epoch_loss)
    train_epoch_loss = total_loss / len(train_dataloader)
    train_ppl = torch.exp(train_epoch_loss)
    print(f"{epoch=}: {train_ppl=} {train_epoch_loss=} {eval_ppl=} {eval_epoch_loss=}")
```
Let's see how well the model performs on the validation set:
```py
correct = 0
total = 0
for pred, true in zip(eval_preds, dataset["validation"]["text_label"]):
    if pred.strip() == true.strip():
        correct += 1
    total += 1
accuracy = correct / total * 100
print(f"{accuracy=} % on the evaluation dataset")
print(f"{eval_preds[:10]=}")
print(f"{dataset['validation']['text_label'][:10]=}")
"accuracy=97.3568281938326 % on the evaluation dataset"
"eval_preds[:10]=['neutral', 'positive', 'neutral', 'positive', 'neutral', 'negative', 'negative', 'neutral', 'neutral', 'neutral']"
"dataset['validation']['text_label'][:10]=['neutral', 'positive', 'neutral', 'positive', 'neutral', 'negative', 'negative', 'neutral', 'neutral', 'neutral']"
```
97% accuracy in just a few minutes; pretty good!
## Share model
You can store and share your model on the Hub if you'd like. Log in to your Hugging Face account and enter your token when prompted:
```py
from huggingface_hub import notebook_login
notebook_login()
```
Upload the model to a specific model repository on the Hub with the [`~transformers.PreTrainedModel.push_to_hub`] function:
```py
peft_model_id = "your-name/t5-large_PREFIX_TUNING_SEQ2SEQ"
model.push_to_hub("your-name/t5-large_PREFIX_TUNING_SEQ2SEQ", use_auth_token=True)
```
If you check the model file size in the repository, you'll see that it is only 3.93MB! 🤏
## Inference
Once the model has been uploaded to the Hub, anyone can easily use it for inference. Load the configuration and model:
```py
from peft import PeftModel, PeftConfig
peft_model_id = "stevhliu/t5-large_PREFIX_TUNING_SEQ2SEQ"
config = PeftConfig.from_pretrained(peft_model_id)
model = AutoModelForSeq2SeqLM.from_pretrained(config.base_model_name_or_path)
model = PeftModel.from_pretrained(model, peft_model_id)
```
Get and tokenize some text about financial news:
```py
inputs = tokenizer(
    "The Lithuanian beer market made up 14.41 million liters in January , a rise of 0.8 percent from the year-earlier figure , the Lithuanian Brewers ' Association reporting citing the results from its members .",
    return_tensors="pt",
)
```
Put the model on a GPU and *generate* the predicted text sentiment:
```py
model.to(device)

with torch.no_grad():
    inputs = {k: v.to(device) for k, v in inputs.items()}
    outputs = model.generate(input_ids=inputs["input_ids"], max_new_tokens=10)
    print(tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True))
["positive"]
```

# LoRA for token classification
Low-Rank Adaptation (LoRA) is a reparametrization method that aims to reduce the number of trainable parameters with low-rank representations. The weight matrix is broken down into low-rank matrices that are trained and updated. All the pretrained model parameters remain frozen. After training, the low-rank matrices are added back to the original weights. This makes it more efficient to store and train a LoRA model because there are significantly fewer parameters.
<Tip>
💡 Read [LoRA: Low-Rank Adaptation of Large Language Models](https://arxiv.org/abs/2106.09685) to learn more about LoRA.
</Tip>
This guide will show you how to train a [`roberta-large`](https://huggingface.co/roberta-large) model with LoRA on the [BioNLP2004](https://huggingface.co/datasets/tner/bionlp2004) dataset for token classification.
Before you begin, make sure you have all the necessary libraries installed:
```bash
!pip install -q peft transformers datasets evaluate seqeval
```
## Setup
Let's start by importing all the necessary libraries you'll need:
- 🤗 Transformers for loading the base `roberta-large` model and tokenizer, and handling the training loop
- 🤗 Datasets for loading and preparing the `bionlp2004` dataset for training
- 🤗 Evaluate for evaluating the model's performance
- 🤗 PEFT for setting up the LoRA configuration and creating the PEFT model
```py
from datasets import load_dataset
from transformers import (
    AutoModelForTokenClassification,
    AutoTokenizer,
    DataCollatorForTokenClassification,
    TrainingArguments,
    Trainer,
)
from peft import get_peft_config, PeftModel, PeftConfig, get_peft_model, LoraConfig, TaskType
import evaluate
import torch
import numpy as np
model_checkpoint = "roberta-large"
lr = 1e-3
batch_size = 16
num_epochs = 10
```
## Load dataset and metric
The [BioNLP2004](https://huggingface.co/datasets/tner/bionlp2004) dataset includes tokens and tags for biological structures like DNA, RNA and proteins. Load the dataset:
```py
bionlp = load_dataset("tner/bionlp2004")
bionlp["train"][0]
{
    "tokens": [
        "Since",
        "HUVECs",
        "released",
        "superoxide",
        "anions",
        "in",
        "response",
        "to",
        "TNF",
        ",",
        "and",
        "H2O2",
        "induces",
        "VCAM-1",
        ",",
        "PDTC",
        "may",
        "act",
        "as",
        "a",
        "radical",
        "scavenger",
        ".",
    ],
    "tags": [0, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0],
}
```
The `tags` values are defined in the label ids [dictionary](https://huggingface.co/datasets/tner/bionlp2004#label-id). The letter that prefixes each label indicates the token position: `B` is for the first token of an entity, `I` is for a token inside the entity, and `O` is for a token that is not part of an entity.
```py
{
    "O": 0,
    "B-DNA": 1,
    "I-DNA": 2,
    "B-protein": 3,
    "I-protein": 4,
    "B-cell_type": 5,
    "I-cell_type": 6,
    "B-cell_line": 7,
    "I-cell_line": 8,
    "B-RNA": 9,
    "I-RNA": 10,
}
```
Then load the [`seqeval`](https://huggingface.co/spaces/evaluate-metric/seqeval) framework which includes several metrics - precision, accuracy, F1, and recall - for evaluating sequence labeling tasks.
```py
seqeval = evaluate.load("seqeval")
```
Now you can write an evaluation function to compute the metrics from the model predictions and labels, and return the precision, recall, F1, and accuracy scores:
```py
label_list = [
    "O",
    "B-DNA",
    "I-DNA",
    "B-protein",
    "I-protein",
    "B-cell_type",
    "I-cell_type",
    "B-cell_line",
    "I-cell_line",
    "B-RNA",
    "I-RNA",
]


def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = seqeval.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }
```
## Preprocess dataset
Initialize a tokenizer and make sure you set `is_split_into_words=True` because the text sequence has already been split into words. However, this doesn't mean it is tokenized yet (even though it may look like it!), and you'll need to further tokenize the words into subwords.
```py
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, add_prefix_space=True)
```
You'll also need to write a function to:
1. Map each token to their respective word with the [`~transformers.BatchEncoding.word_ids`] method.
2. Ignore the special tokens by setting them to `-100`.
3. Label the first token of a given entity.
```py
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    labels = []
    for i, label in enumerate(examples["tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs
```
Use [`~datasets.Dataset.map`] to apply the `tokenize_and_align_labels` function to the dataset:
```py
tokenized_bionlp = bionlp.map(tokenize_and_align_labels, batched=True)
```
Finally, create a data collator to pad the examples to the longest length in a batch:
```py
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)
```
## Train
Now you're ready to create a [`PeftModel`]. Start by loading the base `roberta-large` model, the number of expected labels, and the `id2label` and `label2id` dictionaries:
```py
id2label = {
    0: "O",
    1: "B-DNA",
    2: "I-DNA",
    3: "B-protein",
    4: "I-protein",
    5: "B-cell_type",
    6: "I-cell_type",
    7: "B-cell_line",
    8: "I-cell_line",
    9: "B-RNA",
    10: "I-RNA",
}
label2id = {
    "O": 0,
    "B-DNA": 1,
    "I-DNA": 2,
    "B-protein": 3,
    "I-protein": 4,
    "B-cell_type": 5,
    "I-cell_type": 6,
    "B-cell_line": 7,
    "I-cell_line": 8,
    "B-RNA": 9,
    "I-RNA": 10,
}

model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint, num_labels=11, id2label=id2label, label2id=label2id
)
```
Define the [`LoraConfig`] with:
- `task_type`, token classification (`TaskType.TOKEN_CLS`)
- `r`, the dimension of the low-rank matrices
- `lora_alpha`, scaling factor for the weight matrices
- `lora_dropout`, dropout probability of the LoRA layers
- `bias`, set to `all` to train all bias parameters
<Tip>
💡 The weight matrix is scaled by `lora_alpha/r`, and a higher `lora_alpha` value assigns more weight to the LoRA activations. For performance, we recommend setting `bias` to `None` first, and then `lora_only`, before trying `all`.
</Tip>
```py
peft_config = LoraConfig(
    task_type=TaskType.TOKEN_CLS, inference_mode=False, r=16, lora_alpha=16, lora_dropout=0.1, bias="all"
)
```
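As the tip above notes, the LoRA update is scaled by `lora_alpha / r`, so the configuration above applies the update at its original magnitude; a quick check:
```py
r, lora_alpha = 16, 16
print(lora_alpha / r)  # 1.0 -- raising lora_alpha (or lowering r) amplifies the LoRA update
```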
Pass the base model and `peft_config` to the [`get_peft_model`] function to create a [`PeftModel`]. You can check out how much more efficient training the [`PeftModel`] is compared to fully training the base model by printing out the trainable parameters:
```py
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()
"trainable params: 1855499 || all params: 355894283 || trainable%: 0.5213624069370061"
```
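If you'd like to verify that number yourself, a plain PyTorch parameter count (an optional cross-check, not from the original guide) tells the same story:
```py
trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
total = sum(p.numel() for p in model.parameters())
print(f"trainable params: {trainable} || all params: {total} || trainable%: {100 * trainable / total}")
```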
From the 🤗 Transformers library, create a [`~transformers.TrainingArguments`] class and specify where to save the model, the training hyperparameters, how to evaluate the model, and when to save checkpoints:
```py
training_args = TrainingArguments(
    output_dir="roberta-large-lora-token-classification",
    learning_rate=lr,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)
```
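The `lr`, `batch_size`, and `num_epochs` variables come from earlier in the guide. If you're starting from this section, the values below are placeholder assumptions rather than tuned settings, so adjust them for your hardware and dataset:
```py
# Placeholder hyperparameters (assumptions) -- tune these for your setup.
lr = 1e-3
batch_size = 16
num_epochs = 10
```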
Pass the model, `TrainingArguments`, datasets, tokenizer, data collator, and evaluation function to the [`~transformers.Trainer`] class. The `Trainer` handles the training loop for you, and when you're ready, call [`~transformers.Trainer.train`] to begin!
```py
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_bionlp["train"],
    eval_dataset=tokenized_bionlp["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)
trainer.train()
```
## Share model
Once training is complete, you can store and share your model on the Hub if you'd like. Log in to your Hugging Face account and enter your token when prompted:
```py
from huggingface_hub import notebook_login
notebook_login()
```
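If you're working from a script rather than a notebook, running `huggingface-cli login` in your terminal accomplishes the same thing.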
Upload the model to a specific model repository on the Hub with the [`~transformers.PreTrainedModel.push_to_hub`] method:
```py
model.push_to_hub("your-name/roberta-large-lora-token-classification")
```
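This only uploads the adapter weights and configuration, not a full copy of the base model, so the resulting repository is typically just a few megabytes.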
## Inference
To use your model for inference, load the configuration and model:
```py
peft_model_id = "stevhliu/roberta-large-lora-token-classification"
config = PeftConfig.from_pretrained(peft_model_id)
inference_model = AutoModelForTokenClassification.from_pretrained(
    config.base_model_name_or_path, num_labels=11, id2label=id2label, label2id=label2id
)
tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)
model = PeftModel.from_pretrained(inference_model, peft_model_id)
```
Get some text to tokenize:
```py
text = "The activation of IL-2 gene expression and NF-kappa B through CD28 requires reactive oxygen production by 5-lipoxygenase."
inputs = tokenizer(text, return_tensors="pt")
```
Pass the inputs to the model, and print out the model prediction for each token:
```py
with torch.no_grad():
    logits = model(**inputs).logits

tokens = inputs.tokens()
predictions = torch.argmax(logits, dim=2)

for token, prediction in zip(tokens, predictions[0].numpy()):
    print((token, model.config.id2label[prediction]))
("<s>", "O")
("The", "O")
("Ġactivation", "O")
("Ġof", "O")
("ĠIL", "B-DNA")
("-", "O")
("2", "I-DNA")
("Ġgene", "O")
("Ġexpression", "O")
("Ġand", "O")
("ĠNF", "B-protein")
("-", "O")
("k", "I-protein")
("appa", "I-protein")
("ĠB", "I-protein")
("Ġthrough", "O")
("ĠCD", "B-protein")
("28", "I-protein")
("Ġrequires", "O")
("Ġreactive", "O")
("Ġoxygen", "O")
("Ġproduction", "O")
("Ġby", "O")
("Ġ5", "B-protein")
("-", "O")
("lip", "I-protein")
("oxy", "I-protein")
("gen", "I-protein")
("ase", "I-protein")
(".", "O")
("</s>", "O")
```
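Because only the first subword of each word carried a label during training, predictions on continuation subwords can look inconsistent (note the `-` inside `IL-2` above). As a sketch (not part of the original guide), you can collapse the output to one prediction per word by keeping only the first subword of each word, reusing the same `word_ids` trick from preprocessing:
```py
word_ids = inputs.word_ids()
previous_word_idx = None
for token, prediction, word_idx in zip(tokens, predictions[0].numpy(), word_ids):
    # Skip special tokens and subword continuations; keep the first subword of each word.
    if word_idx is None or word_idx == previous_word_idx:
        previous_word_idx = word_idx
        continue
    previous_word_idx = word_idx
    print((token, model.config.id2label[prediction]))
```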


@ -0,0 +1,22 @@
compute_environment: LOCAL_MACHINE
deepspeed_config:
gradient_accumulation_steps: 1
gradient_clipping: 1.0
offload_optimizer_device: none
offload_param_device: none
zero3_init_flag: true
zero3_save_16bit_model: true
zero_stage: 3
distributed_type: DEEPSPEED
downcast_bf16: 'no'
dynamo_backend: 'NO'
fsdp_config: {}
machine_rank: 0
main_training_function: main
megatron_lm_config: {}
mixed_precision: 'no'
num_machines: 1
num_processes: 1
rdzv_backend: static
same_network: true
use_cpu: false


@ -4,9 +4,12 @@ import sys
import threading
import numpy as np
import psutil
import torch
from accelerate import Accelerator
from datasets import load_dataset
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import (
AutoModelForCausalLM,
AutoTokenizer,
@ -15,10 +18,7 @@ from transformers import (
set_seed,
)
import psutil
from datasets import load_dataset
from peft import LoraConfig, TaskType, get_peft_model, get_peft_model_state_dict
from tqdm import tqdm
from peft import LoraConfig, TaskType, get_peft_model
def levenshtein_distance(str1, str2):
@ -111,9 +111,6 @@ def main():
model_name_or_path = "bigscience/bloomz-7b1"
dataset_name = "twitter_complaints"
peft_config = LoraConfig(task_type=TaskType.CAUSAL_LM, inference_mode=False, r=8, lora_alpha=32, lora_dropout=0.1)
checkpoint_name = (
f"{dataset_name}_{model_name_or_path}_{peft_config.peft_type}_{peft_config.task_type}_v1.pt".replace("/", "_")
)
text_column = "Tweet text"
label_column = "text_label"
lr = 3e-3
@ -121,6 +118,7 @@ def main():
batch_size = 8
seed = 42
max_length = 64
do_test = False
set_seed(seed)
dataset = load_dataset("ought/raft", dataset_name)
@ -269,7 +267,7 @@ def main():
tracemalloc.cpu_peaked + b2mb(tracemalloc.cpu_begin)
)
)
train_epoch_loss = total_loss / len(eval_dataloader)
train_epoch_loss = total_loss / len(train_dataloader)
train_ppl = torch.exp(train_epoch_loss)
accelerator.print(f"{epoch=}: {train_ppl=} {train_epoch_loss=}")
@ -282,7 +280,9 @@ def main():
outputs = accelerator.unwrap_model(model).generate(
**batch, synced_gpus=is_ds_zero_3, max_new_tokens=10
) # synced_gpus=True for DS-stage 3
preds = outputs[:, max_length:].detach().cpu().numpy()
outputs = accelerator.pad_across_processes(outputs, dim=1, pad_index=tokenizer.pad_token_id)
preds = accelerator.gather_for_metrics(outputs)
preds = preds[:, max_length:].detach().cpu().numpy()
eval_preds.extend(tokenizer.batch_decode(preds, skip_special_tokens=True))
# Printing the GPU memory usage details such as allocated memory, peak memory, and total memory usage
@ -306,6 +306,9 @@ def main():
correct = 0
total = 0
assert len(eval_preds) == len(
dataset["train"][label_column]
), f"{len(eval_preds)} != {len(dataset['train'][label_column])}"
for pred, true in zip(eval_preds, dataset["train"][label_column]):
if pred.strip() == true.strip():
correct += 1
@ -315,35 +318,43 @@ def main():
accelerator.print(f"{eval_preds[:10]=}")
accelerator.print(f"{dataset['train'][label_column][:10]=}")
model.eval()
test_preds = []
for _, batch in enumerate(tqdm(test_dataloader)):
batch = {k: v for k, v in batch.items() if k != "labels"}
with torch.no_grad():
outputs = accelerator.unwrap_model(model).generate(
**batch, synced_gpus=is_ds_zero_3, max_new_tokens=10
) # synced_gpus=True for DS-stage 3
test_preds.extend(
tokenizer.batch_decode(outputs[:, max_length:].detach().cpu().numpy(), skip_special_tokens=True)
)
if do_test:
model.eval()
test_preds = []
for _, batch in enumerate(tqdm(test_dataloader)):
batch = {k: v for k, v in batch.items() if k != "labels"}
with torch.no_grad():
outputs = accelerator.unwrap_model(model).generate(
**batch, synced_gpus=is_ds_zero_3, max_new_tokens=10
) # synced_gpus=True for DS-stage 3
outputs = accelerator.pad_across_processes(outputs, dim=1, pad_index=tokenizer.pad_token_id)
preds = accelerator.gather(outputs)
preds = preds[:, max_length:].detach().cpu().numpy()
test_preds.extend(tokenizer.batch_decode(preds, skip_special_tokens=True))
test_preds_cleaned = []
for _, pred in enumerate(test_preds):
test_preds_cleaned.append(get_closest_label(pred, classes))
test_preds_cleaned = []
for _, pred in enumerate(test_preds):
test_preds_cleaned.append(get_closest_label(pred, classes))
test_df = dataset["test"].to_pandas()
test_df[label_column] = test_preds_cleaned
test_df["text_labels_orig"] = test_preds
accelerator.print(test_df[[text_column, label_column]].sample(20))
test_df = dataset["test"].to_pandas()
assert len(test_preds_cleaned) == len(test_df), f"{len(test_preds_cleaned)} != {len(test_df)}"
test_df[label_column] = test_preds_cleaned
test_df["text_labels_orig"] = test_preds
accelerator.print(test_df[[text_column, label_column]].sample(20))
pred_df = test_df[["ID", label_column]]
pred_df.columns = ["ID", "Label"]
pred_df = test_df[["ID", label_column]]
pred_df.columns = ["ID", "Label"]
os.makedirs(f"data/{dataset_name}", exist_ok=True)
pred_df.to_csv(f"data/{dataset_name}/predictions.csv", index=False)
os.makedirs(f"data/{dataset_name}", exist_ok=True)
pred_df.to_csv(f"data/{dataset_name}/predictions.csv", index=False)
accelerator.wait_for_everyone()
accelerator.save(get_peft_model_state_dict(model, state_dict=accelerator.get_state_dict(model)), checkpoint_name)
model.push_to_hub(
"smangrul/"
+ f"{dataset_name}_{model_name_or_path}_{peft_config.peft_type}_{peft_config.task_type}".replace("/", "_"),
state_dict=accelerator.get_state_dict(model),
use_auth_token=True,
)
accelerator.wait_for_everyone()

File diff suppressed because it is too large

File diff suppressed because it is too large


@ -1,6 +1,5 @@
transformers
accelerate
loralib
evaluate
deepspeed
tqdm


@ -0,0 +1,22 @@
compute_environment: LOCAL_MACHINE
deepspeed_config:
gradient_accumulation_steps: 1
gradient_clipping: 1.0
offload_optimizer_device: none
offload_param_device: none
zero3_init_flag: true
zero3_save_16bit_model: true
zero_stage: 3
distributed_type: DEEPSPEED
downcast_bf16: 'no'
dynamo_backend: 'NO'
fsdp_config: {}
machine_rank: 0
main_training_function: main
megatron_lm_config: {}
mixed_precision: 'no'
num_machines: 1
num_processes: 1
rdzv_backend: static
same_network: true
use_cpu: false


@ -0,0 +1,182 @@
import os
import torch
from datasets import load_dataset
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, default_data_collator, get_linear_schedule_with_warmup
from peft import AdaLoraConfig, PeftConfig, PeftModel, TaskType, get_peft_model
os.environ["TOKENIZERS_PARALLELISM"] = "false"
device = "cuda"
model_name_or_path = "facebook/bart-base"
tokenizer_name_or_path = "facebook/bart-base"
checkpoint_name = "financial_sentiment_analysis_lora_v1.pt"
text_column = "sentence"
label_column = "text_label"
max_length = 128
lr = 1e-3
num_epochs = 8
batch_size = 8
# creating model
peft_config = AdaLoraConfig(
init_r=12,
target_r=8,
beta1=0.85,
beta2=0.85,
tinit=200,
tfinal=1000,
deltaT=10,
lora_alpha=32,
lora_dropout=0.1,
task_type=TaskType.SEQ_2_SEQ_LM,
inference_mode=False,
)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name_or_path)
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()
# loading dataset
dataset = load_dataset("financial_phrasebank", "sentences_allagree")
dataset = dataset["train"].train_test_split(test_size=0.1)
dataset["validation"] = dataset["test"]
del dataset["test"]
classes = dataset["train"].features["label"].names
dataset = dataset.map(
lambda x: {"text_label": [classes[label] for label in x["label"]]},
batched=True,
num_proc=1,
)
# data preprocessing
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
def preprocess_function(examples):
inputs = examples[text_column]
targets = examples[label_column]
model_inputs = tokenizer(inputs, max_length=max_length, padding="max_length", truncation=True, return_tensors="pt")
labels = tokenizer(targets, max_length=3, padding="max_length", truncation=True, return_tensors="pt")
labels = labels["input_ids"]
labels[labels == tokenizer.pad_token_id] = -100
model_inputs["labels"] = labels
return model_inputs
processed_datasets = dataset.map(
preprocess_function,
batched=True,
num_proc=1,
remove_columns=dataset["train"].column_names,
load_from_cache_file=False,
desc="Running tokenizer on dataset",
)
train_dataset = processed_datasets["train"]
eval_dataset = processed_datasets["validation"]
train_dataloader = DataLoader(
train_dataset, shuffle=True, collate_fn=default_data_collator, batch_size=batch_size, pin_memory=True
)
eval_dataloader = DataLoader(eval_dataset, collate_fn=default_data_collator, batch_size=batch_size, pin_memory=True)
# optimizer and lr scheduler
optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
lr_scheduler = get_linear_schedule_with_warmup(
optimizer=optimizer,
num_warmup_steps=0,
num_training_steps=(len(train_dataloader) * num_epochs),
)
model.base_model.peft_config.total_step = len(train_dataloader) * num_epochs
# training and evaluation
model = model.to(device)
global_step = 0
for epoch in range(num_epochs):
model.train()
total_loss = 0
for step, batch in enumerate(tqdm(train_dataloader)):
batch = {k: v.to(device) for k, v in batch.items()}
outputs = model(**batch)
loss = outputs.loss
total_loss += loss.detach().float()
loss.backward()
optimizer.step()
lr_scheduler.step()
# Update the importance of low-rank matrices
# and allocate the budget accordingly.
model.base_model.update_and_allocate(global_step)
optimizer.zero_grad()
global_step += 1
model.eval()
eval_loss = 0
eval_preds = []
for step, batch in enumerate(tqdm(eval_dataloader)):
batch = {k: v.to(device) for k, v in batch.items()}
with torch.no_grad():
outputs = model(**batch)
loss = outputs.loss
eval_loss += loss.detach().float()
eval_preds.extend(
tokenizer.batch_decode(torch.argmax(outputs.logits, -1).detach().cpu().numpy(), skip_special_tokens=True)
)
eval_epoch_loss = eval_loss / len(train_dataloader)
eval_ppl = torch.exp(eval_epoch_loss)
train_epoch_loss = total_loss / len(eval_dataloader)
train_ppl = torch.exp(train_epoch_loss)
print(f"{epoch=}: {train_ppl=} {train_epoch_loss=} {eval_ppl=} {eval_epoch_loss=}")
# print accuracy
correct = 0
total = 0
for pred, true in zip(eval_preds, dataset["validation"]["text_label"]):
if pred.strip() == true.strip():
correct += 1
total += 1
accuracy = correct / total * 100
print(f"{accuracy=} % on the evaluation dataset")
print(f"{eval_preds[:10]=}")
print(f"{dataset['validation']['text_label'][:10]=}")
# saving model
peft_model_id = f"{model_name_or_path}_{peft_config.peft_type}_{peft_config.task_type}"
model.save_pretrained(peft_model_id)
ckpt = f"{peft_model_id}/adapter_model.bin"
# get_ipython().system('du -h $ckpt')
peft_model_id = f"{model_name_or_path}_{peft_config.peft_type}_{peft_config.task_type}"
config = PeftConfig.from_pretrained(peft_model_id)
model = AutoModelForSeq2SeqLM.from_pretrained(config.base_model_name_or_path)
model = PeftModel.from_pretrained(model, peft_model_id)
model.eval()
i = 13
inputs = tokenizer(dataset["validation"][text_column][i], return_tensors="pt")
print(dataset["validation"][text_column][i])
print(inputs)
with torch.no_grad():
outputs = model.generate(input_ids=inputs["input_ids"], max_new_tokens=10)
print(outputs)
print(tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True))

File diff suppressed because it is too large


@ -2,20 +2,37 @@
"cells": [
{
"cell_type": "code",
"execution_count": 17,
"execution_count": 1,
"id": "5f93b7d1",
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"===================================BUG REPORT===================================\n",
"Welcome to bitsandbytes. For bug reports, please submit your error trace to: https://github.com/TimDettmers/bitsandbytes/issues\n",
"For effortless bug reporting copy-paste your error into this form: https://docs.google.com/forms/d/e/1FAIpQLScPB8emS3Thkp66nvqwmjTEgxp8Y9ufuWTzFyr9kJ5AoI47dQ/viewform?usp=sf_link\n",
"================================================================================\n",
"CUDA SETUP: CUDA runtime path found: /home/sourab/miniconda3/envs/ml/lib/libcudart.so\n",
"CUDA SETUP: Highest compute capability among GPUs detected: 7.5\n",
"CUDA SETUP: Detected CUDA version 117\n",
"CUDA SETUP: Loading binary /home/sourab/miniconda3/envs/ml/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda117.so...\n"
]
}
],
"source": [
"from transformers import AutoModelForSeq2SeqLM\n",
"from peft import get_peft_config,get_peft_model, get_peft_model_state_dict, LoraConfig, TaskType\n",
"from peft import get_peft_config, get_peft_model, get_peft_model_state_dict, LoraConfig, TaskType\n",
"import torch\n",
"from datasets import load_dataset\n",
"import os\n",
"\n",
"os.environ[\"TOKENIZERS_PARALLELISM\"] = \"false\"\n",
"from transformers import AutoTokenizer\n",
"from torch.utils.data import DataLoader\n",
"from transformers import default_data_collator,get_linear_schedule_with_warmup\n",
"from transformers import default_data_collator, get_linear_schedule_with_warmup\n",
"from tqdm import tqdm\n",
"from datasets import load_dataset\n",
"\n",
@ -26,10 +43,10 @@
"checkpoint_name = \"financial_sentiment_analysis_lora_v1.pt\"\n",
"text_column = \"sentence\"\n",
"label_column = \"text_label\"\n",
"max_length=128\n",
"max_length = 128\n",
"lr = 1e-3\n",
"num_epochs = 3\n",
"batch_size=8\n"
"batch_size = 8"
]
},
{
@ -40,9 +57,7 @@
"outputs": [],
"source": [
"# creating model\n",
"peft_config = LoraConfig(\n",
" task_type=TaskType.SEQ_2_SEQ_LM, inference_mode=False, r=8, lora_alpha=32, lora_dropout=0.1\n",
")\n",
"peft_config = LoraConfig(task_type=TaskType.SEQ_2_SEQ_LM, inference_mode=False, r=8, lora_alpha=32, lora_dropout=0.1)\n",
"\n",
"model = AutoModelForSeq2SeqLM.from_pretrained(model_name_or_path)\n",
"model = get_peft_model(model, peft_config)\n",
@ -60,15 +75,13 @@
"name": "stderr",
"output_type": "stream",
"text": [
"/home/sourab/miniconda3/envs/ml/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:97: FutureWarning: Deprecated argument(s) used in 'dataset_info': token. Will not be supported from version '0.12'.\n",
" warnings.warn(message, FutureWarning)\n",
"Found cached dataset financial_phrasebank (/home/sourab/.cache/huggingface/datasets/financial_phrasebank/sentences_allagree/1.0.0/550bde12e6c30e2674da973a55f57edde5181d53f5a5a34c1531c53f93b7e141)\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "6de075f8208349108291ac5ab7f5c980",
"model_id": "3403bf3d718042018b0531848cc30209",
"version_major": 2,
"version_minor": 0
},
@ -82,7 +95,7 @@
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "4b0e67b6d93f43e4b0f6a2f8978e4b0c",
"model_id": "d3d5c45e3776469f9560b6eaa9346f8f",
"version_major": 2,
"version_minor": 0
},
@ -96,7 +109,7 @@
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "a9551029c9884529bda7421a99170b51",
"model_id": "e9736f26e9aa450b8d65f95c0b9c81cc",
"version_major": 2,
"version_minor": 0
},
@ -110,7 +123,7 @@
{
"data": {
"text/plain": [
"{'sentence': 'The order was valued at USD12 .2 m.',\n",
"{'sentence': \"The 10,000-odd square metre plot that Stockmann has bought for the Nevsky Center shopping center is located on Nevsky Prospect , St Petersburg 's high street , next to the Vosstaniya Square underground station , in the immediate vicinity of Moscow Station .\",\n",
" 'label': 1,\n",
" 'text_label': 'neutral'}"
]
@ -122,17 +135,16 @@
],
"source": [
"# loading dataset\n",
"dataset = load_dataset(\"financial_phrasebank\", 'sentences_allagree')\n",
"dataset = load_dataset(\"financial_phrasebank\", \"sentences_allagree\")\n",
"dataset = dataset[\"train\"].train_test_split(test_size=0.1)\n",
"dataset[\"validation\"] = dataset[\"test\"]\n",
"del(dataset[\"test\"])\n",
"del dataset[\"test\"]\n",
"\n",
"classes = dataset[\"train\"].features[\"label\"].names\n",
"dataset = dataset.map(\n",
" lambda x: {\"text_label\": [classes[label] for label in x[\"label\"]]},\n",
" batched=True,\n",
" num_proc=1,\n",
" \n",
")\n",
"\n",
"dataset[\"train\"][0]"
@ -147,7 +159,7 @@
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "4421971232434db1b6141e91fda2f6d7",
"model_id": "c460989d4ab24e3f97d81ef040b1d1b4",
"version_major": 2,
"version_minor": 0
},
@ -161,7 +173,7 @@
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "9b2ef793d93443949f4a5d5874d4bc05",
"model_id": "1acc389b08b94f8a87900b9fbdbccce4",
"version_major": 2,
"version_minor": 0
},
@ -176,36 +188,35 @@
"source": [
"# data preprocessing\n",
"tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)\n",
"\n",
"\n",
"def preprocess_function(examples):\n",
" inputs = examples[text_column]\n",
" targets = examples[label_column]\n",
" model_inputs = tokenizer(inputs, max_length=max_length, padding=\"max_length\", truncation=True, return_tensors=\"pt\")\n",
" labels = tokenizer(targets, max_length=3, padding=\"max_length\", truncation=True, return_tensors=\"pt\")\n",
" labels = labels[\"input_ids\"]\n",
" labels[labels==tokenizer.pad_token_id] = -100\n",
" labels[labels == tokenizer.pad_token_id] = -100\n",
" model_inputs[\"labels\"] = labels\n",
" return model_inputs\n",
"\n",
"\n",
"processed_datasets = dataset.map(\n",
" preprocess_function,\n",
" batched=True,\n",
" num_proc=1,\n",
" remove_columns=dataset[\"train\"].column_names,\n",
" load_from_cache_file=False,\n",
" desc=\"Running tokenizer on dataset\",\n",
" )\n",
" preprocess_function,\n",
" batched=True,\n",
" num_proc=1,\n",
" remove_columns=dataset[\"train\"].column_names,\n",
" load_from_cache_file=False,\n",
" desc=\"Running tokenizer on dataset\",\n",
")\n",
"\n",
"train_dataset = processed_datasets[\"train\"]\n",
"eval_dataset = processed_datasets[\"validation\"]\n",
"\n",
"train_dataloader = DataLoader(\n",
" train_dataset, shuffle=True, collate_fn=default_data_collator, batch_size=batch_size, pin_memory=True\n",
" )\n",
"eval_dataloader = DataLoader(eval_dataset, collate_fn=default_data_collator, batch_size=batch_size, pin_memory=True)\n",
"\n",
"\n",
"\n",
" "
" train_dataset, shuffle=True, collate_fn=default_data_collator, batch_size=batch_size, pin_memory=True\n",
")\n",
"eval_dataloader = DataLoader(eval_dataset, collate_fn=default_data_collator, batch_size=batch_size, pin_memory=True)"
]
},
{
@ -221,7 +232,7 @@
" optimizer=optimizer,\n",
" num_warmup_steps=0,\n",
" num_training_steps=(len(train_dataloader) * num_epochs),\n",
")\n"
")"
]
},
{
@ -234,45 +245,52 @@
"name": "stderr",
"output_type": "stream",
"text": [
"100%|█████████████████████████████████████████████████████████████| 255/255 [00:53<00:00, 4.80it/s]\n",
"100%|███████████████████████████████████████████████████████████████| 29/29 [00:02<00:00, 14.16it/s]\n"
"100%|████████████████████████████████████████████████████████████████████████████████████████| 255/255 [02:21<00:00, 1.81it/s]\n",
"100%|██████████████████████████████████████████████████████████████████████████████████████████| 29/29 [00:07<00:00, 4.13it/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"epoch=0: train_ppl=tensor(13.6966, device='cuda:0') train_epoch_loss=tensor(2.6171, device='cuda:0') eval_ppl=tensor(1.0046, device='cuda:0') eval_epoch_loss=tensor(0.0046, device='cuda:0')\n"
"epoch=0: train_ppl=tensor(14.6341, device='cuda:0') train_epoch_loss=tensor(2.6834, device='cuda:0') eval_ppl=tensor(1.0057, device='cuda:0') eval_epoch_loss=tensor(0.0057, device='cuda:0')\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|█████████████████████████████████████████████████████████████| 255/255 [00:52<00:00, 4.88it/s]\n",
"100%|███████████████████████████████████████████████████████████████| 29/29 [00:02<00:00, 14.20it/s]\n"
"100%|████████████████████████████████████████████████████████████████████████████████████████| 255/255 [02:00<00:00, 2.11it/s]\n",
"100%|██████████████████████████████████████████████████████████████████████████████████████████| 29/29 [00:05<00:00, 5.66it/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"epoch=1: train_ppl=tensor(1.5893, device='cuda:0') train_epoch_loss=tensor(0.4633, device='cuda:0') eval_ppl=tensor(1.0020, device='cuda:0') eval_epoch_loss=tensor(0.0020, device='cuda:0')\n"
"epoch=1: train_ppl=tensor(1.7576, device='cuda:0') train_epoch_loss=tensor(0.5640, device='cuda:0') eval_ppl=tensor(1.0052, device='cuda:0') eval_epoch_loss=tensor(0.0052, device='cuda:0')\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|█████████████████████████████████████████████████████████████| 255/255 [00:52<00:00, 4.87it/s]\n",
"100%|███████████████████████████████████████████████████████████████| 29/29 [00:02<00:00, 14.18it/s]\n"
"100%|████████████████████████████████████████████████████████████████████████████████████████| 255/255 [01:33<00:00, 2.74it/s]\n",
"100%|██████████████████████████████████████████████████████████████████████████████████████████| 29/29 [00:04<00:00, 6.23it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"epoch=2: train_ppl=tensor(1.3210, device='cuda:0') train_epoch_loss=tensor(0.2784, device='cuda:0') eval_ppl=tensor(1.0026, device='cuda:0') eval_epoch_loss=tensor(0.0026, device='cuda:0')\n"
"epoch=2: train_ppl=tensor(1.3830, device='cuda:0') train_epoch_loss=tensor(0.3243, device='cuda:0') eval_ppl=tensor(1.0035, device='cuda:0') eval_epoch_loss=tensor(0.0035, device='cuda:0')\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n"
]
}
],
@ -302,18 +320,20 @@
" outputs = model(**batch)\n",
" loss = outputs.loss\n",
" eval_loss += loss.detach().float()\n",
" eval_preds.extend(tokenizer.batch_decode(torch.argmax(outputs.logits, -1).detach().cpu().numpy(), skip_special_tokens=True))\n",
" eval_preds.extend(\n",
" tokenizer.batch_decode(torch.argmax(outputs.logits, -1).detach().cpu().numpy(), skip_special_tokens=True)\n",
" )\n",
"\n",
" eval_epoch_loss = eval_loss/len(train_dataloader)\n",
" eval_epoch_loss = eval_loss / len(eval_dataloader)\n",
" eval_ppl = torch.exp(eval_epoch_loss)\n",
" train_epoch_loss = total_loss/len(eval_dataloader)\n",
" train_epoch_loss = total_loss / len(train_dataloader)\n",
" train_ppl = torch.exp(train_epoch_loss)\n",
" print(f\"{epoch=}: {train_ppl=} {train_epoch_loss=} {eval_ppl=} {eval_epoch_loss=}\")\n"
" print(f\"{epoch=}: {train_ppl=} {train_epoch_loss=} {eval_ppl=} {eval_epoch_loss=}\")"
]
},
{
"cell_type": "code",
"execution_count": 20,
"execution_count": 7,
"id": "6cafa67b",
"metadata": {},
"outputs": [
@ -321,21 +341,21 @@
"name": "stdout",
"output_type": "stream",
"text": [
"accuracy=98.23788546255507 % on the evaluation dataset\n",
"eval_preds[:10]=['neutral', 'neutral', 'positive', 'positive', 'neutral', 'neutral', 'neutral', 'neutral', 'neutral', 'neutral']\n",
"dataset['validation']['text_label'][:10]=['neutral', 'neutral', 'positive', 'positive', 'neutral', 'neutral', 'neutral', 'neutral', 'neutral', 'neutral']\n"
"accuracy=97.3568281938326 % on the evaluation dataset\n",
"eval_preds[:10]=['neutral', 'neutral', 'neutral', 'positive', 'neutral', 'positive', 'positive', 'neutral', 'neutral', 'neutral']\n",
"dataset['validation']['text_label'][:10]=['neutral', 'neutral', 'neutral', 'positive', 'neutral', 'positive', 'positive', 'neutral', 'neutral', 'neutral']\n"
]
}
],
"source": [
"# print accuracy\n",
"correct =0\n",
"correct = 0\n",
"total = 0\n",
"for pred,true in zip(eval_preds, dataset[\"validation\"][\"text_label\"]):\n",
" if pred.strip()==true.strip():\n",
" correct+=1\n",
" total+=1 \n",
"accuracy = correct/total*100\n",
"for pred, true in zip(eval_preds, dataset[\"validation\"][\"text_label\"]):\n",
" if pred.strip() == true.strip():\n",
" correct += 1\n",
" total += 1\n",
"accuracy = correct / total * 100\n",
"print(f\"{accuracy=} % on the evaluation dataset\")\n",
"print(f\"{eval_preds[:10]=}\")\n",
"print(f\"{dataset['validation']['text_label'][:10]=}\")"
@ -343,20 +363,19 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 8,
"id": "a8de6005",
"metadata": {},
"outputs": [],
"source": [
"# saving model\n",
"state_dict = get_peft_model_state_dict(model)\n",
"torch.save(state_dict, checkpoint_name)\n",
"print(state_dict)"
"peft_model_id = f\"{model_name_or_path}_{peft_config.peft_type}_{peft_config.task_type}\"\n",
"model.save_pretrained(peft_model_id)"
]
},
{
"cell_type": "code",
"execution_count": 18,
"execution_count": 9,
"id": "bd20cd4c",
"metadata": {},
"outputs": [
@ -364,18 +383,75 @@
"name": "stdout",
"output_type": "stream",
"text": [
"19M\tfinancial_sentiment_analysis_lora_v1.pt\r\n"
"9,2M\tbigscience/mt0-large_LORA_SEQ_2_SEQ_LM/adapter_model.bin\r\n"
]
}
],
"source": [
"!du -h $checkpoint_name"
"ckpt = f\"{peft_model_id}/adapter_model.bin\"\n",
"!du -h $ckpt"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "76c2fc29",
"metadata": {},
"outputs": [],
"source": [
"from peft import PeftModel, PeftConfig\n",
"\n",
"peft_model_id = f\"{model_name_or_path}_{peft_config.peft_type}_{peft_config.task_type}\"\n",
"\n",
"config = PeftConfig.from_pretrained(peft_model_id)\n",
"model = AutoModelForSeq2SeqLM.from_pretrained(config.base_model_name_or_path)\n",
"model = PeftModel.from_pretrained(model, peft_model_id)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "37d712ce",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"- Demand for fireplace products was lower than expected , especially in Germany .\n",
"{'input_ids': tensor([[ 259, 264, 259, 82903, 332, 1090, 10040, 10371, 639, 259,\n",
" 19540, 2421, 259, 25505, 259, 261, 259, 21230, 281, 17052,\n",
" 259, 260, 1]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}\n",
"tensor([[ 0, 259, 32588, 1]])\n",
"['negative']\n"
]
}
],
"source": [
"model.eval()\n",
"i = 13\n",
"inputs = tokenizer(dataset[\"validation\"][text_column][i], return_tensors=\"pt\")\n",
"print(dataset[\"validation\"][text_column][i])\n",
"print(inputs)\n",
"\n",
"with torch.no_grad():\n",
" outputs = model.generate(input_ids=inputs[\"input_ids\"], max_new_tokens=10)\n",
" print(outputs)\n",
" print(tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "76c2fc29",
"id": "66c65ea4",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "65e71f78",
"metadata": {},
"outputs": [],
"source": []
@ -383,7 +459,7 @@
],
"metadata": {
"kernelspec": {
"display_name": "Python 3.10.5 64-bit",
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
@ -397,7 +473,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.5 (v3.10.5:f377153967, Jun 6 2022, 12:36:10) [Clang 13.0.0 (clang-1300.0.29.30)]"
"version": "3.10.5"
},
"vscode": {
"interpreter": {


@ -0,0 +1,253 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "71fbfca2",
"metadata": {},
"outputs": [],
"source": [
"from transformers import AutoModelForSeq2SeqLM\n",
"from peft import PeftModel, PeftConfig\n",
"import torch\n",
"from datasets import load_dataset\n",
"import os\n",
"from transformers import AutoTokenizer\n",
"from torch.utils.data import DataLoader\n",
"from transformers import default_data_collator, get_linear_schedule_with_warmup\n",
"from tqdm import tqdm\n",
"from datasets import load_dataset\n",
"\n",
"dataset_name = \"twitter_complaints\"\n",
"text_column = \"Tweet text\"\n",
"label_column = \"text_label\"\n",
"batch_size = 8\n",
"\n",
"peft_model_id = \"smangrul/twitter_complaints_bigscience_T0_3B_LORA_SEQ_2_SEQ_LM\"\n",
"config = PeftConfig.from_pretrained(peft_model_id)"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "cc55820a",
"metadata": {},
"outputs": [],
"source": [
"peft_model_id = \"smangrul/twitter_complaints_bigscience_T0_3B_LORA_SEQ_2_SEQ_LM\"\n",
"max_memory = {0: \"6GIB\", 1: \"0GIB\", 2: \"0GIB\", 3: \"0GIB\", 4: \"0GIB\", \"cpu\": \"30GB\"}\n",
"config = PeftConfig.from_pretrained(peft_model_id)\n",
"model = AutoModelForSeq2SeqLM.from_pretrained(config.base_model_name_or_path, device_map=\"auto\", max_memory=max_memory)\n",
"model = PeftModel.from_pretrained(model, peft_model_id, device_map=\"auto\", max_memory=max_memory)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e1a3648b",
"metadata": {},
"outputs": [],
"source": [
"from datasets import load_dataset\n",
"\n",
"dataset = load_dataset(\"ought/raft\", dataset_name)\n",
"\n",
"classes = [k.replace(\"_\", \" \") for k in dataset[\"train\"].features[\"Label\"].names]\n",
"print(classes)\n",
"dataset = dataset.map(\n",
" lambda x: {\"text_label\": [classes[label] for label in x[\"Label\"]]},\n",
" batched=True,\n",
" num_proc=1,\n",
")\n",
"print(dataset)\n",
"dataset[\"train\"][0]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "fe12d4d3",
"metadata": {},
"outputs": [],
"source": [
"tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)\n",
"target_max_length = max([len(tokenizer(class_label)[\"input_ids\"]) for class_label in classes])\n",
"\n",
"\n",
"def preprocess_function(examples):\n",
" inputs = examples[text_column]\n",
" targets = examples[label_column]\n",
" model_inputs = tokenizer(inputs, truncation=True)\n",
" labels = tokenizer(\n",
" targets, max_length=target_max_length, padding=\"max_length\", truncation=True, return_tensors=\"pt\"\n",
" )\n",
" labels = labels[\"input_ids\"]\n",
" labels[labels == tokenizer.pad_token_id] = -100\n",
" model_inputs[\"labels\"] = labels\n",
" return model_inputs\n",
"\n",
"\n",
"processed_datasets = dataset.map(\n",
" preprocess_function,\n",
" batched=True,\n",
" num_proc=1,\n",
" remove_columns=dataset[\"train\"].column_names,\n",
" load_from_cache_file=True,\n",
" desc=\"Running tokenizer on dataset\",\n",
")\n",
"\n",
"train_dataset = processed_datasets[\"train\"]\n",
"eval_dataset = processed_datasets[\"train\"]\n",
"test_dataset = processed_datasets[\"test\"]\n",
"\n",
"\n",
"def collate_fn(examples):\n",
" return tokenizer.pad(examples, padding=\"longest\", return_tensors=\"pt\")\n",
"\n",
"\n",
"train_dataloader = DataLoader(\n",
" train_dataset, shuffle=True, collate_fn=collate_fn, batch_size=batch_size, pin_memory=True\n",
")\n",
"eval_dataloader = DataLoader(eval_dataset, collate_fn=collate_fn, batch_size=batch_size, pin_memory=True)\n",
"test_dataloader = DataLoader(test_dataset, collate_fn=collate_fn, batch_size=batch_size, pin_memory=True)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "b33be5e6",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"@NYTsupport i have complained a dozen times &amp; yet my papers are still thrown FAR from my door. Why is this so hard to resolve?\n",
"{'input_ids': tensor([[25335, 1499, 3, 10, 3320, 12056, 382, 20390, 3, 23,\n",
" 43, 25932, 3, 9, 9611, 648, 3, 184, 4624, 117,\n",
" 780, 82, 5778, 33, 341, 3, 12618, 377, 4280, 45,\n",
" 82, 1365, 5, 1615, 19, 48, 78, 614, 12, 7785,\n",
" 58, 16229, 3, 10, 3, 1]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
" 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}\n",
"tensor([[ 0, 10394, 1]], device='cuda:0')\n",
"['complaint']\n"
]
}
],
"source": [
"model.eval()\n",
"i = 15\n",
"inputs = tokenizer(f'{text_column} : {dataset[\"test\"][i][\"Tweet text\"]} Label : ', return_tensors=\"pt\")\n",
"print(dataset[\"test\"][i][\"Tweet text\"])\n",
"print(inputs)\n",
"\n",
"with torch.no_grad():\n",
" outputs = model.generate(input_ids=inputs[\"input_ids\"].to(\"cuda\"), max_new_tokens=10)\n",
" print(outputs)\n",
" print(tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True))"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "b6d6cd5b",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
" 0%| | 0/7 [00:00<?, ?it/s]You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.\n",
"100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:10<00:00, 1.48s/it]\n"
]
}
],
"source": [
"model.eval()\n",
"eval_preds = []\n",
"for _, batch in enumerate(tqdm(eval_dataloader)):\n",
" batch = {k: v.to(\"cuda\") for k, v in batch.items() if k != \"labels\"}\n",
" with torch.no_grad():\n",
" outputs = model.generate(**batch, max_new_tokens=10)\n",
" preds = outputs.detach().cpu().numpy()\n",
" eval_preds.extend(tokenizer.batch_decode(preds, skip_special_tokens=True))"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "61264abe",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"accuracy=100.0\n",
"eval_preds[:10]=['no complaint', 'no complaint', 'complaint', 'complaint', 'no complaint', 'no complaint', 'no complaint', 'complaint', 'complaint', 'no complaint']\n",
"dataset['train'][label_column][:10]=['no complaint', 'no complaint', 'complaint', 'complaint', 'no complaint', 'no complaint', 'no complaint', 'complaint', 'complaint', 'no complaint']\n"
]
}
],
"source": [
"correct = 0\n",
"total = 0\n",
"for pred, true in zip(eval_preds, dataset[\"train\"][label_column]):\n",
" if pred.strip() == true.strip():\n",
" correct += 1\n",
" total += 1\n",
"accuracy = correct / total * 100\n",
"print(f\"{accuracy=}\")\n",
"print(f\"{eval_preds[:10]=}\")\n",
"print(f\"{dataset['train'][label_column][:10]=}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a70802a3",
"metadata": {},
"outputs": [],
"source": [
"model.eval()\n",
"test_preds = []\n",
"\n",
"for _, batch in enumerate(tqdm(test_dataloader)):\n",
" batch = {k: v for k, v in batch.items() if k != \"labels\"}\n",
" with torch.no_grad():\n",
" outputs = model.generate(**batch, max_new_tokens=10)\n",
" preds = outputs.detach().cpu().numpy()\n",
" test_preds.extend(tokenizer.batch_decode(preds, skip_special_tokens=True))\n",
" if len(test_preds) > 100:\n",
" break\n",
"test_preds"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.5 (v3.10.5:f377153967, Jun 6 2022, 12:36:10) [Clang 13.0.0 (clang-1300.0.29.30)]"
},
"vscode": {
"interpreter": {
"hash": "aee8b7b246df8f9039afb4144a1f6fd8d2ca17a180786b69acc140d282b71a49"
}
}
},
"nbformat": 4,
"nbformat_minor": 5
}


@ -4,15 +4,15 @@ import sys
import threading
import numpy as np
import psutil
import torch
from accelerate import Accelerator
from datasets import load_dataset
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, get_linear_schedule_with_warmup, set_seed
import psutil
from datasets import load_dataset
from peft import LoraConfig, TaskType, get_peft_model, get_peft_model_state_dict
from tqdm import tqdm
from peft import LoraConfig, TaskType, get_peft_model
def levenshtein_distance(str1, str2):
@ -102,20 +102,19 @@ class TorchTracemalloc:
def main():
accelerator = Accelerator()
model_name_or_path = "bigscience/T0_3B"
# model_name_or_path = "bigscience/T0_3B"
model_name_or_path = "facebook/bart-large"
dataset_name = "twitter_complaints"
peft_config = LoraConfig(
task_type=TaskType.SEQ_2_SEQ_LM, inference_mode=False, r=8, lora_alpha=32, lora_dropout=0.1
)
checkpoint_name = (
f"{dataset_name}_{model_name_or_path}_{peft_config.peft_type}_{peft_config.task_type}_v1.pt".replace("/", "_")
)
text_column = "Tweet text"
label_column = "text_label"
lr = 3e-3
num_epochs = 5
batch_size = 8
seed = 42
do_test = False
set_seed(seed)
dataset = load_dataset("ought/raft", dataset_name)
@ -219,7 +218,7 @@ def main():
tracemalloc.cpu_peaked + b2mb(tracemalloc.cpu_begin)
)
)
train_epoch_loss = total_loss / len(eval_dataloader)
train_epoch_loss = total_loss / len(train_dataloader)
train_ppl = torch.exp(train_epoch_loss)
accelerator.print(f"{epoch=}: {train_ppl=} {train_epoch_loss=}")
@ -232,7 +231,8 @@ def main():
outputs = accelerator.unwrap_model(model).generate(
**batch, synced_gpus=is_ds_zero_3
) # synced_gpus=True for DS-stage 3
preds = outputs.detach().cpu().numpy()
outputs = accelerator.pad_across_processes(outputs, dim=1, pad_index=tokenizer.pad_token_id)
preds = accelerator.gather_for_metrics(outputs).detach().cpu().numpy()
eval_preds.extend(tokenizer.batch_decode(preds, skip_special_tokens=True))
# Printing the GPU memory usage details such as allocated memory, peak memory, and total memory usage
@ -256,6 +256,9 @@ def main():
correct = 0
total = 0
assert len(eval_preds) == len(
dataset["train"][label_column]
), f"{len(eval_preds)} != {len(dataset['train'][label_column])}"
for pred, true in zip(eval_preds, dataset["train"][label_column]):
if pred.strip() == true.strip():
correct += 1
@ -265,33 +268,42 @@ def main():
accelerator.print(f"{eval_preds[:10]=}")
accelerator.print(f"{dataset['train'][label_column][:10]=}")
model.eval()
test_preds = []
for _, batch in enumerate(tqdm(test_dataloader)):
batch = {k: v for k, v in batch.items() if k != "labels"}
with torch.no_grad():
outputs = accelerator.unwrap_model(model).generate(
**batch, synced_gpus=is_ds_zero_3
) # synced_gpus=True for DS-stage 3
test_preds.extend(tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True))
if do_test:
model.eval()
test_preds = []
for _, batch in enumerate(tqdm(test_dataloader)):
batch = {k: v for k, v in batch.items() if k != "labels"}
with torch.no_grad():
outputs = accelerator.unwrap_model(model).generate(
**batch, synced_gpus=is_ds_zero_3
) # synced_gpus=True for DS-stage 3
outputs = accelerator.pad_across_processes(outputs, dim=1, pad_index=tokenizer.pad_token_id)
preds = accelerator.gather(outputs).detach().cpu().numpy()
test_preds.extend(tokenizer.batch_decode(preds, skip_special_tokens=True))
test_preds_cleaned = []
for _, pred in enumerate(test_preds):
test_preds_cleaned.append(get_closest_label(pred, classes))
test_preds_cleaned = []
for _, pred in enumerate(test_preds):
test_preds_cleaned.append(get_closest_label(pred, classes))
test_df = dataset["test"].to_pandas()
test_df[label_column] = test_preds_cleaned
test_df["text_labels_orig"] = test_preds
accelerator.print(test_df[[text_column, label_column]].sample(20))
test_df = dataset["test"].to_pandas()
assert len(test_preds_cleaned) == len(test_df), f"{len(test_preds_cleaned)} != {len(test_df)}"
test_df[label_column] = test_preds_cleaned
test_df["text_labels_orig"] = test_preds
accelerator.print(test_df[[text_column, label_column]].sample(20))
pred_df = test_df[["ID", label_column]]
pred_df.columns = ["ID", "Label"]
pred_df = test_df[["ID", label_column]]
pred_df.columns = ["ID", "Label"]
os.makedirs(f"data/{dataset_name}", exist_ok=True)
pred_df.to_csv(f"data/{dataset_name}/predictions.csv", index=False)
os.makedirs(f"data/{dataset_name}", exist_ok=True)
pred_df.to_csv(f"data/{dataset_name}/predictions.csv", index=False)
accelerator.wait_for_everyone()
accelerator.save(get_peft_model_state_dict(model, state_dict=accelerator.get_state_dict(model)), checkpoint_name)
model.push_to_hub(
"smangrul/"
+ f"{dataset_name}_{model_name_or_path}_{peft_config.peft_type}_{peft_config.task_type}".replace("/", "_"),
state_dict=accelerator.get_state_dict(model),
use_auth_token=True,
)
accelerator.wait_for_everyone()


@ -2,13 +2,13 @@ import os
import torch
from accelerate import Accelerator
from datasets import load_dataset
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, default_data_collator, get_linear_schedule_with_warmup
from datasets import load_dataset
from peft import LoraConfig, TaskType, get_peft_model, get_peft_model_state_dict
from peft import LoraConfig, TaskType, get_peft_model
from peft.utils.other import fsdp_auto_wrap_policy
from tqdm import tqdm
def main():
@ -25,7 +25,6 @@ def main():
peft_config = LoraConfig(
task_type=TaskType.SEQ_2_SEQ_LM, inference_mode=False, r=8, lora_alpha=32, lora_dropout=0.1
)
checkpoint_name = "financial_sentiment_analysis_lora_fsdp_v1.pt"
model = AutoModelForSeq2SeqLM.from_pretrained(model_name_or_path)
model = get_peft_model(model, peft_config)
accelerator.print(model.print_trainable_parameters())
@ -109,9 +108,9 @@ def main():
eval_loss += loss.detach().float()
preds = accelerator.gather_for_metrics(torch.argmax(outputs.logits, -1)).detach().cpu().numpy()
eval_preds.extend(tokenizer.batch_decode(preds, skip_special_tokens=True))
eval_epoch_loss = eval_loss / len(train_dataloader)
eval_epoch_loss = eval_loss / len(eval_dataloader)
eval_ppl = torch.exp(eval_epoch_loss)
train_epoch_loss = total_loss / len(eval_dataloader)
train_epoch_loss = total_loss / len(train_dataloader)
train_ppl = torch.exp(train_epoch_loss)
accelerator.print(f"{epoch=}: {train_ppl=} {train_epoch_loss=} {eval_ppl=} {eval_epoch_loss=}")
@ -126,8 +125,10 @@ def main():
accelerator.print(f"{eval_preds[:10]=}")
accelerator.print(f"{dataset['validation'][label_column][:10]=}")
accelerator.wait_for_everyone()
accelerator.save(
get_peft_model_state_dict(model, state_dict=accelerator.get_state_dict(model)), checkpoint_name
model.push_to_hub(
"smangrul/" + f"{model_name_or_path}_{peft_config.peft_type}_{peft_config.task_type}".replace("/", "_"),
state_dict=accelerator.get_state_dict(model),
use_auth_token=True,
)
accelerator.wait_for_everyone()


@ -5,18 +5,35 @@
"execution_count": 1,
"id": "5f93b7d1",
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"===================================BUG REPORT===================================\n",
"Welcome to bitsandbytes. For bug reports, please submit your error trace to: https://github.com/TimDettmers/bitsandbytes/issues\n",
"For effortless bug reporting copy-paste your error into this form: https://docs.google.com/forms/d/e/1FAIpQLScPB8emS3Thkp66nvqwmjTEgxp8Y9ufuWTzFyr9kJ5AoI47dQ/viewform?usp=sf_link\n",
"================================================================================\n",
"CUDA SETUP: CUDA runtime path found: /home/sourab/miniconda3/envs/ml/lib/libcudart.so\n",
"CUDA SETUP: Highest compute capability among GPUs detected: 7.5\n",
"CUDA SETUP: Detected CUDA version 117\n",
"CUDA SETUP: Loading binary /home/sourab/miniconda3/envs/ml/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda117.so...\n"
]
}
],
"source": [
"from transformers import AutoModelForSeq2SeqLM\n",
"from peft import get_peft_config,get_peft_model, get_peft_model_state_dict, PrefixTuningConfig, TaskType\n",
"from peft import get_peft_config, get_peft_model, get_peft_model_state_dict, PrefixTuningConfig, TaskType\n",
"import torch\n",
"from datasets import load_dataset\n",
"import os\n",
"\n",
"os.environ[\"TOKENIZERS_PARALLELISM\"] = \"false\"\n",
"os.environ[\"CUDA_VISIBLE_DEVICES\"] = \"3\"\n",
"from transformers import AutoTokenizer\n",
"from torch.utils.data import DataLoader\n",
"from transformers import default_data_collator,get_linear_schedule_with_warmup\n",
"from transformers import default_data_collator, get_linear_schedule_with_warmup\n",
"from tqdm import tqdm\n",
"from datasets import load_dataset\n",
"\n",
@ -27,10 +44,10 @@
"checkpoint_name = \"financial_sentiment_analysis_prefix_tuning_v1.pt\"\n",
"text_column = \"sentence\"\n",
"label_column = \"text_label\"\n",
"max_length=128\n",
"max_length = 128\n",
"lr = 1e-2\n",
"num_epochs = 5\n",
"batch_size=8\n"
"batch_size = 8"
]
},
{
@ -41,9 +58,7 @@
"outputs": [],
"source": [
"# creating model\n",
"peft_config = PrefixTuningConfig(\n",
" task_type=TaskType.SEQ_2_SEQ_LM, inference_mode=False, num_virtual_tokens=20\n",
")\n",
"peft_config = PrefixTuningConfig(task_type=TaskType.SEQ_2_SEQ_LM, inference_mode=False, num_virtual_tokens=20)\n",
"\n",
"model = AutoModelForSeq2SeqLM.from_pretrained(model_name_or_path)\n",
"model = get_peft_model(model, peft_config)\n",
@ -61,15 +76,13 @@
"name": "stderr",
"output_type": "stream",
"text": [
"/home/sourab/miniconda3/envs/ml/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:97: FutureWarning: Deprecated argument(s) used in 'dataset_info': token. Will not be supported from version '0.12'.\n",
" warnings.warn(message, FutureWarning)\n",
"Found cached dataset financial_phrasebank (/home/sourab/.cache/huggingface/datasets/financial_phrasebank/sentences_allagree/1.0.0/550bde12e6c30e2674da973a55f57edde5181d53f5a5a34c1531c53f93b7e141)\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "e3f8b8faca0a4112b2c3499faee9544b",
"model_id": "ec4be98991b84181bfa75f8846422b8b",
"version_major": 2,
"version_minor": 0
},
@ -83,7 +96,7 @@
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "935c8aebde284a5784348588e0bb013a",
"model_id": "82a6bd694c4f4751a23c370ab51f01a4",
"version_major": 2,
"version_minor": 0
},
@ -97,7 +110,7 @@
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "e3487cd55f6847588492bf7fa51348ca",
"model_id": "3844878631534468a1495e435563e4b0",
"version_major": 2,
"version_minor": 0
},
@ -111,9 +124,9 @@
{
"data": {
"text/plain": [
"{'sentence': 'ADPnews - Feb 5 , 2010 - Finnish real estate investor Sponda Oyj HEL : SDA1V said today that it slipped to a net loss of EUR 81.5 million USD 11.8 m in 2009 from a profit of EUR 29.3 million in 2008 .',\n",
" 'label': 0,\n",
" 'text_label': 'negative'}"
"{'sentence': 'Finnish elevators and escalators maker KONE Corporation said on Tuesday ( 18 March ) that it has received a major order from Sir Robert McAlpine to supply all elevators and escalators for the Watermark Place project in the City of London .',\n",
" 'label': 2,\n",
" 'text_label': 'positive'}"
]
},
"execution_count": 3,
@ -123,17 +136,16 @@
],
"source": [
"# loading dataset\n",
"dataset = load_dataset(\"financial_phrasebank\", 'sentences_allagree')\n",
"dataset = load_dataset(\"financial_phrasebank\", \"sentences_allagree\")\n",
"dataset = dataset[\"train\"].train_test_split(test_size=0.1)\n",
"dataset[\"validation\"] = dataset[\"test\"]\n",
"del(dataset[\"test\"])\n",
"del dataset[\"test\"]\n",
"\n",
"classes = dataset[\"train\"].features[\"label\"].names\n",
"dataset = dataset.map(\n",
" lambda x: {\"text_label\": [classes[label] for label in x[\"label\"]]},\n",
" batched=True,\n",
" num_proc=1,\n",
" \n",
")\n",
"\n",
"dataset[\"train\"][0]"
@ -145,39 +157,11 @@
"id": "adf9608c",
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "2ce088f4437d4e2c80c267332a5b84e5",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Downloading: 0%| | 0.00/792k [00:00<?, ?B/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "4e5f69b61f194220b39336e48edd2f9e",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Downloading: 0%| | 0.00/1.39M [00:00<?, ?B/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/sourab/transformers/src/transformers/models/t5/tokenization_t5_fast.py:156: FutureWarning: This tokenizer was incorrectly instantiated with a model max length of 512 which will be corrected in Transformers v5.\n",
"/home/sourab/transformers/src/transformers/models/t5/tokenization_t5_fast.py:155: FutureWarning: This tokenizer was incorrectly instantiated with a model max length of 512 which will be corrected in Transformers v5.\n",
"For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.\n",
"- Be aware that you SHOULD NOT rely on t5-large automatically truncating your input to 512 when padding/encoding.\n",
"- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.\n",
@ -188,7 +172,7 @@
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "230c5631891e4ea8ac7a1b39f315a4f0",
"model_id": "4af8c12efb5643659573347509079f3a",
"version_major": 2,
"version_minor": 0
},
@ -202,7 +186,7 @@
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "b581e5677d2a45459ceb725534ed0891",
"model_id": "86033b6257384584afd034075af808cb",
"version_major": 2,
"version_minor": 0
},
@ -217,36 +201,35 @@
"source": [
"# data preprocessing\n",
"tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)\n",
"\n",
"\n",
"def preprocess_function(examples):\n",
" inputs = examples[text_column]\n",
" targets = examples[label_column]\n",
" model_inputs = tokenizer(inputs, max_length=max_length, padding=\"max_length\", truncation=True, return_tensors=\"pt\")\n",
" labels = tokenizer(targets, max_length=2, padding=\"max_length\", truncation=True, return_tensors=\"pt\")\n",
" labels = labels[\"input_ids\"]\n",
" labels[labels==tokenizer.pad_token_id] = -100\n",
" labels[labels == tokenizer.pad_token_id] = -100\n",
" model_inputs[\"labels\"] = labels\n",
" return model_inputs\n",
"\n",
"\n",
"processed_datasets = dataset.map(\n",
" preprocess_function,\n",
" batched=True,\n",
" num_proc=1,\n",
" remove_columns=dataset[\"train\"].column_names,\n",
" load_from_cache_file=False,\n",
" desc=\"Running tokenizer on dataset\",\n",
" )\n",
" preprocess_function,\n",
" batched=True,\n",
" num_proc=1,\n",
" remove_columns=dataset[\"train\"].column_names,\n",
" load_from_cache_file=False,\n",
" desc=\"Running tokenizer on dataset\",\n",
")\n",
"\n",
"train_dataset = processed_datasets[\"train\"]\n",
"eval_dataset = processed_datasets[\"validation\"]\n",
"\n",
"train_dataloader = DataLoader(\n",
" train_dataset, shuffle=True, collate_fn=default_data_collator, batch_size=batch_size, pin_memory=True\n",
" )\n",
"eval_dataloader = DataLoader(eval_dataset, collate_fn=default_data_collator, batch_size=batch_size, pin_memory=True)\n",
"\n",
"\n",
"\n",
" "
" train_dataset, shuffle=True, collate_fn=default_data_collator, batch_size=batch_size, pin_memory=True\n",
")\n",
"eval_dataloader = DataLoader(eval_dataset, collate_fn=default_data_collator, batch_size=batch_size, pin_memory=True)"
]
},
{
@ -262,7 +245,7 @@
" optimizer=optimizer,\n",
" num_warmup_steps=0,\n",
" num_training_steps=(len(train_dataloader) * num_epochs),\n",
")\n"
")"
]
},
{
@ -275,82 +258,75 @@
"name": "stderr",
"output_type": "stream",
"text": [
"100%|█████████████████████████████████████████████████████████████| 255/255 [00:20<00:00, 12.27it/s]\n",
"100%|███████████████████████████████████████████████████████████████| 29/29 [00:01<00:00, 17.32it/s]\n"
"100%|████████████████████████████████████████████████████████████████████████████████████████| 255/255 [00:49<00:00, 5.15it/s]\n",
"100%|██████████████████████████████████████████████████████████████████████████████████████████| 29/29 [00:03<00:00, 7.56it/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"epoch=0: train_ppl=tensor(2697769., device='cuda:0') train_epoch_loss=tensor(14.8079, device='cuda:0') eval_ppl=tensor(1.0089, device='cuda:0') eval_epoch_loss=tensor(0.0089, device='cuda:0')\n"
"epoch=0: train_ppl=tensor(2760654.5000, device='cuda:0') train_epoch_loss=tensor(14.8310, device='cuda:0') eval_ppl=tensor(1.0124, device='cuda:0') eval_epoch_loss=tensor(0.0124, device='cuda:0')\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|█████████████████████████████████████████████████████████████| 255/255 [00:19<00:00, 12.75it/s]\n",
"100%|███████████████████████████████████████████████████████████████| 29/29 [00:01<00:00, 17.33it/s]\n"
"100%|████████████████████████████████████████████████████████████████████████████████████████| 255/255 [00:40<00:00, 6.22it/s]\n",
"100%|██████████████████████████████████████████████████████████████████████████████████████████| 29/29 [00:05<00:00, 5.05it/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"epoch=1: train_ppl=tensor(2.9475, device='cuda:0') train_epoch_loss=tensor(1.0809, device='cuda:0') eval_ppl=tensor(1.0072, device='cuda:0') eval_epoch_loss=tensor(0.0072, device='cuda:0')\n"
"epoch=1: train_ppl=tensor(2.7329, device='cuda:0') train_epoch_loss=tensor(1.0054, device='cuda:0') eval_ppl=tensor(1.0081, device='cuda:0') eval_epoch_loss=tensor(0.0080, device='cuda:0')\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|█████████████████████████████████████████████████████████████| 255/255 [00:20<00:00, 12.71it/s]\n",
"100%|███████████████████████████████████████████████████████████████| 29/29 [00:01<00:00, 17.31it/s]\n"
"100%|████████████████████████████████████████████████████████████████████████████████████████| 255/255 [00:58<00:00, 4.36it/s]\n",
"100%|██████████████████████████████████████████████████████████████████████████████████████████| 29/29 [00:05<00:00, 5.05it/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"epoch=2: train_ppl=tensor(2.0588, device='cuda:0') train_epoch_loss=tensor(0.7221, device='cuda:0') eval_ppl=tensor(1.0055, device='cuda:0') eval_epoch_loss=tensor(0.0054, device='cuda:0')\n"
"epoch=2: train_ppl=tensor(2.1698, device='cuda:0') train_epoch_loss=tensor(0.7747, device='cuda:0') eval_ppl=tensor(1.0057, device='cuda:0') eval_epoch_loss=tensor(0.0057, device='cuda:0')\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|█████████████████████████████████████████████████████████████| 255/255 [00:20<00:00, 12.70it/s]\n",
"100%|███████████████████████████████████████████████████████████████| 29/29 [00:01<00:00, 17.32it/s]\n"
"100%|████████████████████████████████████████████████████████████████████████████████████████| 255/255 [00:58<00:00, 4.35it/s]\n",
"100%|██████████████████████████████████████████████████████████████████████████████████████████| 29/29 [00:05<00:00, 5.06it/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"epoch=3: train_ppl=tensor(1.7939, device='cuda:0') train_epoch_loss=tensor(0.5844, device='cuda:0') eval_ppl=tensor(1.0063, device='cuda:0') eval_epoch_loss=tensor(0.0063, device='cuda:0')\n"
"epoch=3: train_ppl=tensor(2.0724, device='cuda:0') train_epoch_loss=tensor(0.7287, device='cuda:0') eval_ppl=tensor(1.0051, device='cuda:0') eval_epoch_loss=tensor(0.0051, device='cuda:0')\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|█████████████████████████████████████████████████████████████| 255/255 [00:19<00:00, 13.01it/s]\n",
"100%|███████████████████████████████████████████████████████████████| 29/29 [00:01<00:00, 17.33it/s]"
"100%|████████████████████████████████████████████████████████████████████████████████████████| 255/255 [01:02<00:00, 4.10it/s]\n",
"100%|██████████████████████████████████████████████████████████████████████████████████████████| 29/29 [00:06<00:00, 4.74it/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"epoch=4: train_ppl=tensor(1.7740, device='cuda:0') train_epoch_loss=tensor(0.5732, device='cuda:0') eval_ppl=tensor(1.0062, device='cuda:0') eval_epoch_loss=tensor(0.0061, device='cuda:0')\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n"
"epoch=4: train_ppl=tensor(1.7598, device='cuda:0') train_epoch_loss=tensor(0.5652, device='cuda:0') eval_ppl=tensor(1.0047, device='cuda:0') eval_epoch_loss=tensor(0.0047, device='cuda:0')\n"
]
}
],
@ -380,13 +356,15 @@
" outputs = model(**batch)\n",
" loss = outputs.loss\n",
" eval_loss += loss.detach().float()\n",
" eval_preds.extend(tokenizer.batch_decode(torch.argmax(outputs.logits, -1).detach().cpu().numpy(), skip_special_tokens=True))\n",
" eval_preds.extend(\n",
" tokenizer.batch_decode(torch.argmax(outputs.logits, -1).detach().cpu().numpy(), skip_special_tokens=True)\n",
" )\n",
"\n",
" eval_epoch_loss = eval_loss/len(train_dataloader)\n",
" eval_epoch_loss = eval_loss / len(eval_dataloader)\n",
" eval_ppl = torch.exp(eval_epoch_loss)\n",
" train_epoch_loss = total_loss/len(eval_dataloader)\n",
" train_epoch_loss = total_loss / len(train_dataloader)\n",
" train_ppl = torch.exp(train_epoch_loss)\n",
" print(f\"{epoch=}: {train_ppl=} {train_epoch_loss=} {eval_ppl=} {eval_epoch_loss=}\")\n"
" print(f\"{epoch=}: {train_ppl=} {train_epoch_loss=} {eval_ppl=} {eval_epoch_loss=}\")"
]
},
{
@ -399,21 +377,21 @@
"name": "stdout",
"output_type": "stream",
"text": [
"accuracy=96.47577092511013 % on the evaluation dataset\n",
"eval_preds[:10]=['neutral', 'neutral', 'neutral', 'negative', 'neutral', 'neutral', 'neutral', 'neutral', 'positive', 'positive']\n",
"dataset['validation']['text_label'][:10]=['neutral', 'neutral', 'neutral', 'negative', 'neutral', 'neutral', 'neutral', 'neutral', 'positive', 'positive']\n"
"accuracy=96.91629955947137 % on the evaluation dataset\n",
"eval_preds[:10]=['negative', 'positive', 'neutral', 'neutral', 'neutral', 'neutral', 'neutral', 'neutral', 'neutral', 'neutral']\n",
"dataset['validation']['text_label'][:10]=['negative', 'neutral', 'neutral', 'neutral', 'neutral', 'neutral', 'neutral', 'neutral', 'neutral', 'neutral']\n"
]
}
],
"source": [
"# print accuracy\n",
"correct =0\n",
"correct = 0\n",
"total = 0\n",
"for pred,true in zip(eval_preds, dataset[\"validation\"][\"text_label\"]):\n",
" if pred.strip()==true.strip():\n",
" correct+=1\n",
" total+=1 \n",
"accuracy = correct/total*100\n",
"for pred, true in zip(eval_preds, dataset[\"validation\"][\"text_label\"]):\n",
" if pred.strip() == true.strip():\n",
" correct += 1\n",
" total += 1\n",
"accuracy = correct / total * 100\n",
"print(f\"{accuracy=} % on the evaluation dataset\")\n",
"print(f\"{eval_preds[:10]=}\")\n",
"print(f\"{dataset['validation']['text_label'][:10]=}\")"
@ -424,26 +402,11 @@
"execution_count": 8,
"id": "a8de6005",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'prompt_embeddings': tensor([[-0.3165, -0.8389, 0.3262, ..., -1.5049, -1.6963, 0.3444],\n",
" [-1.8359, 1.1936, 1.0483, ..., 0.6197, -0.4452, 0.5844],\n",
" [-0.6027, 0.3246, -1.5601, ..., -0.3645, 0.2329, 0.3402],\n",
" ...,\n",
" [-1.9525, -0.5035, 0.8474, ..., 0.4793, -0.0789, -0.9305],\n",
" [-1.9741, 0.5242, -2.0594, ..., -0.7970, -0.4889, 2.7323],\n",
" [ 0.9355, -0.2714, 0.4610, ..., 0.2692, -1.5801, -1.6405]])}\n"
]
}
],
"outputs": [],
"source": [
"# saving model\n",
"state_dict = get_peft_model_state_dict(model)\n",
"torch.save(state_dict, checkpoint_name)\n",
"print(state_dict)"
"peft_model_id = f\"{model_name_or_path}_{peft_config.peft_type}_{peft_config.task_type}\"\n",
"model.save_pretrained(peft_model_id)"
]
},
{
@ -456,18 +419,69 @@
"name": "stdout",
"output_type": "stream",
"text": [
"3,8M\tfinancial_sentiment_analysis_prefix_tuning_v1.pt\r\n"
"3,8M\tt5-large_PREFIX_TUNING_SEQ_2_SEQ_LM/adapter_model.bin\r\n"
]
}
],
"source": [
"!du -h $checkpoint_name"
"ckpt = f\"{peft_model_id}/adapter_model.bin\"\n",
"!du -h $ckpt"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "76c2fc29",
"metadata": {},
"outputs": [],
"source": [
"from peft import PeftModel, PeftConfig\n",
"\n",
"peft_model_id = f\"{model_name_or_path}_{peft_config.peft_type}_{peft_config.task_type}\"\n",
"\n",
"config = PeftConfig.from_pretrained(peft_model_id)\n",
"model = AutoModelForSeq2SeqLM.from_pretrained(config.base_model_name_or_path)\n",
"model = PeftModel.from_pretrained(model, peft_model_id)"
]
},
{
"cell_type": "code",
"execution_count": 27,
"id": "d997f1cc",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Acando AB ( ACANB SS ) fell 8.9 percent to 13.35 kronor , the lowest close since Dec. 11 .\n",
"{'input_ids': tensor([[ 4292, 232, 32, 3, 5359, 41, 3, 22029, 14972, 3,\n",
" 4256, 3, 61, 4728, 4848, 1298, 1093, 12, 8808, 2469,\n",
" 3, 22318, 29, 127, 3, 6, 8, 7402, 885, 437,\n",
" 4451, 5, 850, 3, 5, 1]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
" 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}\n",
"tensor([[ 0, 2841, 1]])\n",
"['negative']\n"
]
}
],
"source": [
"model.eval()\n",
"i = 107\n",
"inputs = tokenizer(dataset[\"validation\"][text_column][i], return_tensors=\"pt\")\n",
"print(dataset[\"validation\"][text_column][i])\n",
"print(inputs)\n",
"\n",
"with torch.no_grad():\n",
" outputs = model.generate(input_ids=inputs[\"input_ids\"], max_new_tokens=10)\n",
" print(outputs)\n",
" print(tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "76c2fc29",
"id": "fb746c1e",
"metadata": {},
"outputs": [],
"source": []
@ -475,7 +489,7 @@
],
"metadata": {
"kernelspec": {
"display_name": "Python 3.10.5 64-bit",
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
@ -489,7 +503,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.5 (v3.10.5:f377153967, Jun 6 2022, 12:36:10) [Clang 13.0.0 (clang-1300.0.29.30)]"
"version": "3.10.5"
},
"vscode": {
"interpreter": {

View File

@ -0,0 +1,804 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "5f93b7d1",
"metadata": {
"ExecuteTime": {
"end_time": "2023-05-30T08:37:58.711225Z",
"start_time": "2023-05-30T08:37:56.881307Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"===================================BUG REPORT===================================\n",
"Welcome to bitsandbytes. For bug reports, please run\n",
"\n",
"python -m bitsandbytes\n",
"\n",
" and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues\n",
"================================================================================\n",
"bin /udir/tschilla/anaconda3/envs/peft/lib/python3.9/site-packages/bitsandbytes/libbitsandbytes_cuda117.so\n",
"CUDA_SETUP: WARNING! libcudart.so not found in any environmental path. Searching in backup paths...\n",
"CUDA SETUP: CUDA runtime path found: /usr/local/cuda/lib64/libcudart.so.11.0\n",
"CUDA SETUP: Highest compute capability among GPUs detected: 8.0\n",
"CUDA SETUP: Detected CUDA version 117\n",
"CUDA SETUP: Loading binary /udir/tschilla/anaconda3/envs/peft/lib/python3.9/site-packages/bitsandbytes/libbitsandbytes_cuda117.so...\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/udir/tschilla/anaconda3/envs/peft/lib/python3.9/site-packages/bitsandbytes/cuda_setup/main.py:149: UserWarning: /udir/tschilla/anaconda3 did not contain ['libcudart.so', 'libcudart.so.11.0', 'libcudart.so.12.0'] as expected! Searching further paths...\n",
" warn(msg)\n",
"/udir/tschilla/anaconda3/envs/peft/lib/python3.9/site-packages/bitsandbytes/cuda_setup/main.py:149: UserWarning: WARNING: The following directories listed in your path were found to be non-existent: {PosixPath('Europe/Paris')}\n",
" warn(msg)\n",
"/udir/tschilla/anaconda3/envs/peft/lib/python3.9/site-packages/bitsandbytes/cuda_setup/main.py:149: UserWarning: WARNING: The following directories listed in your path were found to be non-existent: {PosixPath('/udir/tschilla/.cache/dotnet_bundle_extract')}\n",
" warn(msg)\n",
"/udir/tschilla/anaconda3/envs/peft/lib/python3.9/site-packages/bitsandbytes/cuda_setup/main.py:149: UserWarning: WARNING: The following directories listed in your path were found to be non-existent: {PosixPath('5002'), PosixPath('http'), PosixPath('//127.0.0.1')}\n",
" warn(msg)\n",
"/udir/tschilla/anaconda3/envs/peft/lib/python3.9/site-packages/bitsandbytes/cuda_setup/main.py:149: UserWarning: WARNING: The following directories listed in your path were found to be non-existent: {PosixPath('() { ( alias;\\n eval ${which_declare} ) | /usr/bin/which --tty-only --read-alias --read-functions --show-tilde --show-dot $@\\n}')}\n",
" warn(msg)\n",
"/udir/tschilla/anaconda3/envs/peft/lib/python3.9/site-packages/bitsandbytes/cuda_setup/main.py:149: UserWarning: WARNING: The following directories listed in your path were found to be non-existent: {PosixPath('module'), PosixPath('//matplotlib_inline.backend_inline')}\n",
" warn(msg)\n",
"/udir/tschilla/anaconda3/envs/peft/lib/python3.9/site-packages/bitsandbytes/cuda_setup/main.py:149: UserWarning: Found duplicate ['libcudart.so', 'libcudart.so.11.0', 'libcudart.so.12.0'] files: {PosixPath('/usr/local/cuda/lib64/libcudart.so.11.0'), PosixPath('/usr/local/cuda/lib64/libcudart.so')}.. We'll flip a coin and try one of these, in order to fail forward.\n",
"Either way, this might cause trouble in the future:\n",
"If you get `CUDA error: invalid device function` errors, the above might be the cause and the solution is to make sure only one ['libcudart.so', 'libcudart.so.11.0', 'libcudart.so.12.0'] in the paths that we search based on your env.\n",
" warn(msg)\n"
]
}
],
"source": [
"import os\n",
"\n",
"import torch\n",
"from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, default_data_collator, get_linear_schedule_with_warmup\n",
"from peft import get_peft_model, PromptTuningConfig, TaskType, PromptTuningInit\n",
"from torch.utils.data import DataLoader\n",
"from tqdm import tqdm\n",
"from datasets import load_dataset\n",
"\n",
"os.environ[\"TOKENIZERS_PARALLELISM\"] = \"false\"\n",
"\n",
"device = \"cuda\"\n",
"model_name_or_path = \"t5-large\"\n",
"tokenizer_name_or_path = \"t5-large\"\n",
"\n",
"checkpoint_name = \"financial_sentiment_analysis_prompt_tuning_v1.pt\"\n",
"text_column = \"sentence\"\n",
"label_column = \"text_label\"\n",
"max_length = 128\n",
"lr = 1\n",
"num_epochs = 5\n",
"batch_size = 8"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "8d0850ac",
"metadata": {
"ExecuteTime": {
"end_time": "2023-05-30T08:38:12.413984Z",
"start_time": "2023-05-30T08:38:04.601042Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"trainable params: 40960 || all params: 737709056 || trainable%: 0.005552324411210698\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/udir/tschilla/anaconda3/envs/peft/lib/python3.9/site-packages/transformers/models/t5/tokenization_t5_fast.py:155: FutureWarning: This tokenizer was incorrectly instantiated with a model max length of 512 which will be corrected in Transformers v5.\n",
"For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.\n",
"- Be aware that you SHOULD NOT rely on t5-large automatically truncating your input to 512 when padding/encoding.\n",
"- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.\n",
"- To avoid this warning, please instantiate this tokenizer with `model_max_length` set to your preferred value.\n",
" warnings.warn(\n"
]
},
{
"data": {
"text/plain": [
"PeftModelForSeq2SeqLM(\n",
" (base_model): T5ForConditionalGeneration(\n",
" (shared): Embedding(32128, 1024)\n",
" (encoder): T5Stack(\n",
" (embed_tokens): Embedding(32128, 1024)\n",
" (block): ModuleList(\n",
" (0): T5Block(\n",
" (layer): ModuleList(\n",
" (0): T5LayerSelfAttention(\n",
" (SelfAttention): T5Attention(\n",
" (q): Linear(in_features=1024, out_features=1024, bias=False)\n",
" (k): Linear(in_features=1024, out_features=1024, bias=False)\n",
" (v): Linear(in_features=1024, out_features=1024, bias=False)\n",
" (o): Linear(in_features=1024, out_features=1024, bias=False)\n",
" (relative_attention_bias): Embedding(32, 16)\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (1): T5LayerFF(\n",
" (DenseReluDense): T5DenseActDense(\n",
" (wi): Linear(in_features=1024, out_features=4096, bias=False)\n",
" (wo): Linear(in_features=4096, out_features=1024, bias=False)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" (act): ReLU()\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" )\n",
" (1-23): 23 x T5Block(\n",
" (layer): ModuleList(\n",
" (0): T5LayerSelfAttention(\n",
" (SelfAttention): T5Attention(\n",
" (q): Linear(in_features=1024, out_features=1024, bias=False)\n",
" (k): Linear(in_features=1024, out_features=1024, bias=False)\n",
" (v): Linear(in_features=1024, out_features=1024, bias=False)\n",
" (o): Linear(in_features=1024, out_features=1024, bias=False)\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (1): T5LayerFF(\n",
" (DenseReluDense): T5DenseActDense(\n",
" (wi): Linear(in_features=1024, out_features=4096, bias=False)\n",
" (wo): Linear(in_features=4096, out_features=1024, bias=False)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" (act): ReLU()\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" )\n",
" )\n",
" (final_layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (decoder): T5Stack(\n",
" (embed_tokens): Embedding(32128, 1024)\n",
" (block): ModuleList(\n",
" (0): T5Block(\n",
" (layer): ModuleList(\n",
" (0): T5LayerSelfAttention(\n",
" (SelfAttention): T5Attention(\n",
" (q): Linear(in_features=1024, out_features=1024, bias=False)\n",
" (k): Linear(in_features=1024, out_features=1024, bias=False)\n",
" (v): Linear(in_features=1024, out_features=1024, bias=False)\n",
" (o): Linear(in_features=1024, out_features=1024, bias=False)\n",
" (relative_attention_bias): Embedding(32, 16)\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (1): T5LayerCrossAttention(\n",
" (EncDecAttention): T5Attention(\n",
" (q): Linear(in_features=1024, out_features=1024, bias=False)\n",
" (k): Linear(in_features=1024, out_features=1024, bias=False)\n",
" (v): Linear(in_features=1024, out_features=1024, bias=False)\n",
" (o): Linear(in_features=1024, out_features=1024, bias=False)\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (2): T5LayerFF(\n",
" (DenseReluDense): T5DenseActDense(\n",
" (wi): Linear(in_features=1024, out_features=4096, bias=False)\n",
" (wo): Linear(in_features=4096, out_features=1024, bias=False)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" (act): ReLU()\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" )\n",
" (1-23): 23 x T5Block(\n",
" (layer): ModuleList(\n",
" (0): T5LayerSelfAttention(\n",
" (SelfAttention): T5Attention(\n",
" (q): Linear(in_features=1024, out_features=1024, bias=False)\n",
" (k): Linear(in_features=1024, out_features=1024, bias=False)\n",
" (v): Linear(in_features=1024, out_features=1024, bias=False)\n",
" (o): Linear(in_features=1024, out_features=1024, bias=False)\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (1): T5LayerCrossAttention(\n",
" (EncDecAttention): T5Attention(\n",
" (q): Linear(in_features=1024, out_features=1024, bias=False)\n",
" (k): Linear(in_features=1024, out_features=1024, bias=False)\n",
" (v): Linear(in_features=1024, out_features=1024, bias=False)\n",
" (o): Linear(in_features=1024, out_features=1024, bias=False)\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (2): T5LayerFF(\n",
" (DenseReluDense): T5DenseActDense(\n",
" (wi): Linear(in_features=1024, out_features=4096, bias=False)\n",
" (wo): Linear(in_features=4096, out_features=1024, bias=False)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" (act): ReLU()\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" )\n",
" )\n",
" (final_layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (lm_head): Linear(in_features=1024, out_features=32128, bias=False)\n",
" )\n",
" (prompt_encoder): ModuleDict(\n",
" (default): PromptEmbedding(\n",
" (embedding): Embedding(40, 1024)\n",
" )\n",
" )\n",
" (word_embeddings): Embedding(32128, 1024)\n",
")"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# creating model\n",
"peft_config = PromptTuningConfig(\n",
" task_type=TaskType.SEQ_2_SEQ_LM,\n",
" prompt_tuning_init=PromptTuningInit.TEXT,\n",
" num_virtual_tokens=20,\n",
" prompt_tuning_init_text=\"What is the sentiment of this article?\\n\",\n",
" inference_mode=False,\n",
" tokenizer_name_or_path=model_name_or_path,\n",
")\n",
"\n",
"model = AutoModelForSeq2SeqLM.from_pretrained(model_name_or_path)\n",
"model = get_peft_model(model, peft_config)\n",
"model.print_trainable_parameters()\n",
"model"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "4ee2babf",
"metadata": {
"ExecuteTime": {
"end_time": "2023-05-30T08:38:18.759143Z",
"start_time": "2023-05-30T08:38:17.881621Z"
}
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Found cached dataset financial_phrasebank (/data/proxem/huggingface/datasets/financial_phrasebank/sentences_allagree/1.0.0/550bde12e6c30e2674da973a55f57edde5181d53f5a5a34c1531c53f93b7e141)\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "fb63f50cb7cb4f5aae10648ba74d6c4e",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
" 0%| | 0/1 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Map: 0%| | 0/2037 [00:00<?, ? examples/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Map: 0%| | 0/227 [00:00<?, ? examples/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"{'sentence': '`` Lining stone sales were also good in the early autumn , and order books are strong to the end of the year .',\n",
" 'label': 2,\n",
" 'text_label': 'positive'}"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# loading dataset\n",
"dataset = load_dataset(\"financial_phrasebank\", \"sentences_allagree\")\n",
"dataset = dataset[\"train\"].train_test_split(test_size=0.1)\n",
"dataset[\"validation\"] = dataset[\"test\"]\n",
"del dataset[\"test\"]\n",
"\n",
"classes = dataset[\"train\"].features[\"label\"].names\n",
"dataset = dataset.map(\n",
" lambda x: {\"text_label\": [classes[label] for label in x[\"label\"]]},\n",
" batched=True,\n",
" num_proc=1,\n",
")\n",
"\n",
"dataset[\"train\"][0]"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "adf9608c",
"metadata": {
"ExecuteTime": {
"end_time": "2023-05-30T08:38:21.132266Z",
"start_time": "2023-05-30T08:38:20.340722Z"
}
},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Running tokenizer on dataset: 0%| | 0/2037 [00:00<?, ? examples/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Running tokenizer on dataset: 0%| | 0/227 [00:00<?, ? examples/s]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# data preprocessing\n",
"tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)\n",
"target_max_length = max([len(tokenizer(class_label)[\"input_ids\"]) for class_label in classes])\n",
"\n",
"\n",
"def preprocess_function(examples):\n",
" inputs = examples[text_column]\n",
" targets = examples[label_column]\n",
" model_inputs = tokenizer(inputs, max_length=max_length, padding=\"max_length\", truncation=True, return_tensors=\"pt\")\n",
" labels = tokenizer(\n",
" targets, max_length=target_max_length, padding=\"max_length\", truncation=True, return_tensors=\"pt\"\n",
" )\n",
" labels = labels[\"input_ids\"]\n",
" labels[labels == tokenizer.pad_token_id] = -100\n",
" model_inputs[\"labels\"] = labels\n",
" return model_inputs\n",
"\n",
"\n",
"processed_datasets = dataset.map(\n",
" preprocess_function,\n",
" batched=True,\n",
" num_proc=1,\n",
" remove_columns=dataset[\"train\"].column_names,\n",
" load_from_cache_file=False,\n",
" desc=\"Running tokenizer on dataset\",\n",
")\n",
"\n",
"train_dataset = processed_datasets[\"train\"]\n",
"eval_dataset = processed_datasets[\"validation\"]\n",
"\n",
"train_dataloader = DataLoader(\n",
" train_dataset, shuffle=True, collate_fn=default_data_collator, batch_size=batch_size, pin_memory=True\n",
")\n",
"eval_dataloader = DataLoader(eval_dataset, collate_fn=default_data_collator, batch_size=batch_size, pin_memory=True)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "f733a3c6",
"metadata": {
"ExecuteTime": {
"end_time": "2023-05-30T08:38:22.907922Z",
"start_time": "2023-05-30T08:38:22.901057Z"
}
},
"outputs": [],
"source": [
"# optimizer and lr scheduler\n",
"optimizer = torch.optim.AdamW(model.parameters(), lr=lr)\n",
"lr_scheduler = get_linear_schedule_with_warmup(\n",
" optimizer=optimizer,\n",
" num_warmup_steps=0,\n",
" num_training_steps=(len(train_dataloader) * num_epochs),\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "6b3a4090",
"metadata": {
"ExecuteTime": {
"end_time": "2023-05-30T08:42:29.409070Z",
"start_time": "2023-05-30T08:38:50.102263Z"
}
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 255/255 [00:42<00:00, 6.05it/s]\n",
"100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 29/29 [00:02<00:00, 14.40it/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"epoch=0: train_ppl=tensor(8.0846, device='cuda:0') train_epoch_loss=tensor(2.0900, device='cuda:0') eval_ppl=tensor(1.3542, device='cuda:0') eval_epoch_loss=tensor(0.3032, device='cuda:0')\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 255/255 [00:41<00:00, 6.15it/s]\n",
"100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 29/29 [00:02<00:00, 14.42it/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"epoch=1: train_ppl=tensor(1.5088, device='cuda:0') train_epoch_loss=tensor(0.4113, device='cuda:0') eval_ppl=tensor(1.2692, device='cuda:0') eval_epoch_loss=tensor(0.2384, device='cuda:0')\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 255/255 [00:41<00:00, 6.18it/s]\n",
"100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 29/29 [00:02<00:00, 14.45it/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"epoch=2: train_ppl=tensor(1.5322, device='cuda:0') train_epoch_loss=tensor(0.4267, device='cuda:0') eval_ppl=tensor(1.2065, device='cuda:0') eval_epoch_loss=tensor(0.1877, device='cuda:0')\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 255/255 [00:41<00:00, 6.17it/s]\n",
"100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 29/29 [00:02<00:00, 14.38it/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"epoch=3: train_ppl=tensor(1.4475, device='cuda:0') train_epoch_loss=tensor(0.3699, device='cuda:0') eval_ppl=tensor(1.2346, device='cuda:0') eval_epoch_loss=tensor(0.2107, device='cuda:0')\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 255/255 [00:42<00:00, 5.94it/s]\n",
"100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 29/29 [00:02<00:00, 14.42it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"epoch=4: train_ppl=tensor(1.3428, device='cuda:0') train_epoch_loss=tensor(0.2948, device='cuda:0') eval_ppl=tensor(1.2041, device='cuda:0') eval_epoch_loss=tensor(0.1857, device='cuda:0')\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n"
]
}
],
"source": [
"# training and evaluation\n",
"model = model.to(device)\n",
"\n",
"for epoch in range(num_epochs):\n",
" model.train()\n",
" total_loss = 0\n",
" for step, batch in enumerate(tqdm(train_dataloader)):\n",
" batch = {k: v.to(device) for k, v in batch.items()}\n",
" outputs = model(**batch)\n",
" loss = outputs.loss\n",
" total_loss += loss.detach().float()\n",
" loss.backward()\n",
" optimizer.step()\n",
" lr_scheduler.step()\n",
" optimizer.zero_grad()\n",
"\n",
" model.eval()\n",
" eval_loss = 0\n",
" eval_preds = []\n",
" for step, batch in enumerate(tqdm(eval_dataloader)):\n",
" batch = {k: v.to(device) for k, v in batch.items()}\n",
" with torch.no_grad():\n",
" outputs = model(**batch)\n",
" loss = outputs.loss\n",
" eval_loss += loss.detach().float()\n",
" eval_preds.extend(\n",
" tokenizer.batch_decode(torch.argmax(outputs.logits, -1).detach().cpu().numpy(), skip_special_tokens=True)\n",
" )\n",
"\n",
" eval_epoch_loss = eval_loss / len(eval_dataloader)\n",
" eval_ppl = torch.exp(eval_epoch_loss)\n",
" train_epoch_loss = total_loss / len(train_dataloader)\n",
" train_ppl = torch.exp(train_epoch_loss)\n",
" print(f\"{epoch=}: {train_ppl=} {train_epoch_loss=} {eval_ppl=} {eval_epoch_loss=}\")"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "6cafa67b",
"metadata": {
"ExecuteTime": {
"end_time": "2023-05-30T08:42:42.844671Z",
"start_time": "2023-05-30T08:42:42.840447Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"accuracy=85.46255506607929 % on the evaluation dataset\n",
"eval_preds[:10]=['neutral', 'neutral', 'neutral', 'neutral', 'neutral', 'positive', 'neutral', 'negative', 'neutral', 'positive']\n",
"dataset['validation']['text_label'][:10]=['neutral', 'neutral', 'neutral', 'neutral', 'neutral', 'positive', 'neutral', 'negative', 'positive', 'neutral']\n"
]
}
],
"source": [
"# print accuracy\n",
"correct = 0\n",
"total = 0\n",
"for pred, true in zip(eval_preds, dataset[\"validation\"][\"text_label\"]):\n",
" if pred.strip() == true.strip():\n",
" correct += 1\n",
" total += 1\n",
"accuracy = correct / total * 100\n",
"print(f\"{accuracy=} % on the evaluation dataset\")\n",
"print(f\"{eval_preds[:10]=}\")\n",
"print(f\"{dataset['validation']['text_label'][:10]=}\")"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "a8de6005",
"metadata": {
"ExecuteTime": {
"end_time": "2023-05-30T08:42:45.752765Z",
"start_time": "2023-05-30T08:42:45.742397Z"
}
},
"outputs": [],
"source": [
"# saving model\n",
"peft_model_id = f\"{model_name_or_path}_{peft_config.peft_type}_{peft_config.task_type}\"\n",
"model.save_pretrained(peft_model_id)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "bd20cd4c",
"metadata": {
"ExecuteTime": {
"end_time": "2023-05-30T08:42:47.660873Z",
"start_time": "2023-05-30T08:42:47.488293Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"164K\tt5-large_PROMPT_TUNING_SEQ_2_SEQ_LM/adapter_model.bin\r\n"
]
}
],
"source": [
"ckpt = f\"{peft_model_id}/adapter_model.bin\"\n",
"!du -h $ckpt"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "76c2fc29",
"metadata": {
"ExecuteTime": {
"end_time": "2023-05-30T08:42:56.721990Z",
"start_time": "2023-05-30T08:42:49.060700Z"
}
},
"outputs": [],
"source": [
"from peft import PeftModel, PeftConfig\n",
"\n",
"peft_model_id = f\"{model_name_or_path}_{peft_config.peft_type}_{peft_config.task_type}\"\n",
"\n",
"config = PeftConfig.from_pretrained(peft_model_id)\n",
"model = AutoModelForSeq2SeqLM.from_pretrained(config.base_model_name_or_path)\n",
"model = PeftModel.from_pretrained(model, peft_model_id)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "d997f1cc",
"metadata": {
"ExecuteTime": {
"end_time": "2023-05-30T08:42:59.600916Z",
"start_time": "2023-05-30T08:42:58.961468Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Danske Bank is Denmark 's largest bank with 3.5 million customers .\n",
"tensor([[ 3039, 1050, 1925, 19, 18001, 3, 31, 7, 2015, 2137,\n",
" 28, 3, 9285, 770, 722, 3, 5, 1]])\n",
"tensor([[ 0, 7163, 1]])\n",
"['neutral']\n"
]
}
],
"source": [
"model.eval()\n",
"i = 107\n",
"input_ids = tokenizer(dataset[\"validation\"][text_column][i], return_tensors=\"pt\").input_ids\n",
"print(dataset[\"validation\"][text_column][i])\n",
"print(input_ids)\n",
"\n",
"with torch.no_grad():\n",
" outputs = model.generate(input_ids=input_ids, max_new_tokens=10)\n",
" print(outputs)\n",
" print(tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True))"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "peft",
"language": "python",
"name": "peft"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.16"
},
"toc": {
"base_numbering": 1,
"nav_menu": {},
"number_sections": true,
"sideBar": true,
"skip_h1_title": false,
"title_cell": "Table of Contents",
"title_sidebar": "Contents",
"toc_cell": false,
"toc_position": {},
"toc_section_display": true,
"toc_window_display": false
},
"varInspector": {
"cols": {
"lenName": 16,
"lenType": 16,
"lenVar": 40
},
"kernels_config": {
"python": {
"delete_cmd_postfix": "",
"delete_cmd_prefix": "del ",
"library": "var_list.py",
"varRefreshCmd": "print(var_dic_list())"
},
"r": {
"delete_cmd_postfix": ") ",
"delete_cmd_prefix": "rm(",
"library": "var_list.r",
"varRefreshCmd": "cat(var_dic_list()) "
}
},
"types_to_exclude": [
"module",
"function",
"builtin_function_or_method",
"instance",
"_Feature"
],
"window_display": false
},
"vscode": {
"interpreter": {
"hash": "aee8b7b246df8f9039afb4144a1f6fd8d2ca17a180786b69acc140d282b71a49"
}
}
},
"nbformat": 4,
"nbformat_minor": 5
}

File diff suppressed because one or more lines are too long

View File

@ -1,6 +1,5 @@
transformers
accelerate
loralib
evaluate
deepspeed
tqdm

View File

@ -0,0 +1,491 @@
# coding=utf-8
# Copyright 2023-present the HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import logging
import math
import os
import random
from pathlib import Path
import datasets
import evaluate
import torch
import transformers
from accelerate import Accelerator
from accelerate.logging import get_logger
from accelerate.utils import set_seed
from datasets import DatasetDict, load_dataset
from huggingface_hub import Repository, create_repo
from torch import nn
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import AutoModel, AutoTokenizer, SchedulerType, default_data_collator, get_scheduler
from transformers.utils import get_full_repo_name
from peft import LoraConfig, TaskType, get_peft_model
logger = get_logger(__name__)
def parse_args():
parser = argparse.ArgumentParser(description="Training a PEFT model for Semantic Search task")
parser.add_argument("--dataset_name", type=str, default=None, help="dataset name on HF hub")
parser.add_argument(
"--max_length",
type=int,
default=128,
help=(
"The maximum total input sequence length after tokenization. Sequences longer than this will be truncated,"
" sequences shorter will be padded if `--pad_to_max_length` is passed."
),
)
parser.add_argument(
"--model_name_or_path",
type=str,
help="Path to pretrained model or model identifier from huggingface.co/models.",
required=True,
)
parser.add_argument(
"--per_device_train_batch_size",
type=int,
default=8,
help="Batch size (per device) for the training dataloader.",
)
parser.add_argument(
"--per_device_eval_batch_size",
type=int,
default=8,
help="Batch size (per device) for the evaluation dataloader.",
)
parser.add_argument(
"--learning_rate",
type=float,
default=5e-5,
help="Initial learning rate (after the potential warmup period) to use.",
)
parser.add_argument("--weight_decay", type=float, default=0.0, help="Weight decay to use.")
parser.add_argument("--num_train_epochs", type=int, default=3, help="Total number of training epochs to perform.")
parser.add_argument(
"--max_train_steps",
type=int,
default=None,
help="Total number of training steps to perform. If provided, overrides num_train_epochs.",
)
parser.add_argument(
"--gradient_accumulation_steps",
type=int,
default=1,
help="Number of updates steps to accumulate before performing a backward/update pass.",
)
parser.add_argument(
"--lr_scheduler_type",
type=SchedulerType,
default="linear",
help="The scheduler type to use.",
choices=["linear", "cosine", "cosine_with_restarts", "polynomial", "constant", "constant_with_warmup"],
)
parser.add_argument(
"--num_warmup_steps", type=int, default=0, help="Number of steps for the warmup in the lr scheduler."
)
parser.add_argument("--output_dir", type=str, default=None, help="Where to store the final model.")
parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.")
parser.add_argument("--push_to_hub", action="store_true", help="Whether or not to push the model to the Hub.")
parser.add_argument(
"--hub_model_id", type=str, help="The name of the repository to keep in sync with the local `output_dir`."
)
parser.add_argument("--hub_token", type=str, help="The token to use to push to the Model Hub.")
parser.add_argument(
"--checkpointing_steps",
type=str,
default=None,
help="Whether the various states should be saved at the end of every n steps, or 'epoch' for each epoch.",
)
parser.add_argument(
"--resume_from_checkpoint",
type=str,
default=None,
help="If the training should continue from a checkpoint folder.",
)
parser.add_argument(
"--with_tracking",
action="store_true",
help="Whether to enable experiment trackers for logging.",
)
parser.add_argument(
"--report_to",
type=str,
default="all",
help=(
'The integration to report the results and logs to. Supported platforms are `"tensorboard"`,'
' `"wandb"`, `"comet_ml"` and `"clearml"`. Use `"all"` (default) to report to all integrations.'
"Only applicable when `--with_tracking` is passed."
),
)
parser.add_argument(
"--sanity_test",
action="store_true",
help="Whether to run a quick sanity test on a small subset of the dataset.",
)
parser.add_argument(
"--use_peft",
action="store_true",
help="Whether to use PEFT (LoRA) for parameter-efficient fine-tuning.",
)
args = parser.parse_args()
if args.push_to_hub:
assert args.output_dir is not None, "Need an `output_dir` to create a repo when `--push_to_hub` is passed."
return args
def save_model_hook(models, weights, output_dir):
for i, model in enumerate(models):
model.save_pretrained(output_dir, state_dict=weights[i])
# make sure to pop weight so that corresponding model is not saved again
weights.pop()
def load_model_hook(models, input_dir):
while len(models) > 0:
model = models.pop()
# pop models so that they are not loaded again
if hasattr(model, "active_adapter") and hasattr(model, "load_adapter"):
model.load_adapter(input_dir, model.active_adapter, is_trainable=True)
class AutoModelForSentenceEmbedding(nn.Module):
def __init__(self, model_name, tokenizer, normalize=True):
super(AutoModelForSentenceEmbedding, self).__init__()
self.model = AutoModel.from_pretrained(model_name) # , load_in_8bit=True, device_map={"":0})
self.normalize = normalize
self.tokenizer = tokenizer
def forward(self, **kwargs):
model_output = self.model(**kwargs)
embeddings = self.mean_pooling(model_output, kwargs["attention_mask"])
if self.normalize:
embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=1)
return embeddings
def mean_pooling(self, model_output, attention_mask):
token_embeddings = model_output[0] # First element of model_output contains all token embeddings
input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
def __getattr__(self, name: str):
"""Forward missing attributes to the wrapped module."""
try:
return super().__getattr__(name) # defer to nn.Module's logic
except AttributeError:
return getattr(self.model, name)
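# Note: AutoModelForSentenceEmbedding L2-normalizes its outputs by default, so this row-wise dot product is a cosine similarity.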
def get_cosing_embeddings(query_embs, product_embs):
return torch.sum(query_embs * product_embs, axis=1)
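# Squared contrastive-style loss: relevant pairs (label 1) are pulled toward cosine 1, irrelevant pairs (label 0) are pushed to cosine <= 0.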
def get_loss(cosine_score, labels):
return torch.mean(torch.square(labels * (1 - cosine_score) + torch.clamp((1 - labels) * cosine_score, min=0.0)))
def main():
args = parse_args()
accelerator = (
Accelerator(log_with=args.report_to, project_dir=args.output_dir) if args.with_tracking else Accelerator()
)
# Make one log on every process with the configuration for debugging.
logging.basicConfig(
format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
datefmt="%m/%d/%Y %H:%M:%S",
level=logging.INFO,
)
logger.info(accelerator.state, main_process_only=False)
if accelerator.is_local_main_process:
datasets.utils.logging.set_verbosity_warning()
transformers.utils.logging.set_verbosity_info()
else:
datasets.utils.logging.set_verbosity_error()
transformers.utils.logging.set_verbosity_error()
# If passed along, set the training seed now.
if args.seed is not None:
set_seed(args.seed)
# Handle the repository creation
if accelerator.is_main_process:
if args.push_to_hub:
if args.hub_model_id is None:
repo_name = get_full_repo_name(Path(args.output_dir).name, token=args.hub_token)
else:
repo_name = args.hub_model_id
create_repo(repo_name, exist_ok=True, token=args.hub_token)
repo = Repository(args.output_dir, clone_from=repo_name, token=args.hub_token)
with open(os.path.join(args.output_dir, ".gitignore"), "w+") as gitignore:
if "step_*" not in gitignore:
gitignore.write("step_*\n")
if "epoch_*" not in gitignore:
gitignore.write("epoch_*\n")
elif args.output_dir is not None:
os.makedirs(args.output_dir, exist_ok=True)
accelerator.wait_for_everyone()
# get the tokenizer
tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path)
# dataset download and preprocessing
if args.sanity_test:
train_dataset = load_dataset("smangrul/amazon_esci", split="train[:1024]")
val_dataset = load_dataset("smangrul/amazon_esci", split="validation[:1024]")
dataset = DatasetDict({"train": train_dataset, "validation": val_dataset})
else:
dataset = load_dataset(args.dataset_name)
def preprocess_function(examples):
queries = examples["query"]
result = tokenizer(queries, padding="max_length", max_length=70, truncation=True)
result = {f"query_{k}": v for k, v in result.items()}
products = examples["product_title"]
result_products = tokenizer(products, padding="max_length", max_length=70, truncation=True)
for k, v in result_products.items():
result[f"product_{k}"] = v
result["labels"] = examples["relevance_label"]
return result
processed_datasets = dataset.map(
preprocess_function,
batched=True,
remove_columns=dataset["train"].column_names,
desc="Running tokenizer on dataset",
)
# Log a few random samples from the training set:
for index in random.sample(range(len(processed_datasets["train"])), 3):
logger.info(f"Sample {index} of the training set: {processed_datasets['train'][index]}.")
# base model
model = AutoModelForSentenceEmbedding(args.model_name_or_path, tokenizer)
if args.use_peft:
# peft config and wrapping
peft_config = LoraConfig(
r=8,
lora_alpha=16,
bias="none",
task_type=TaskType.FEATURE_EXTRACTION,
target_modules=["key", "query", "value"],
)
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()
accelerator.print(model)
# get dataloaders
train_dataloader = DataLoader(
processed_datasets["train"],
shuffle=True,
collate_fn=default_data_collator,
batch_size=args.per_device_train_batch_size,
pin_memory=True,
)
eval_dataloader = DataLoader(
processed_datasets["validation"],
shuffle=False,
collate_fn=default_data_collator,
batch_size=args.per_device_eval_batch_size,
pin_memory=True,
)
optimizer = torch.optim.Adam(model.parameters(), lr=args.learning_rate)
# Scheduler and math around the number of training steps.
overrode_max_train_steps = False
num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
if args.max_train_steps is None:
args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
overrode_max_train_steps = True
lr_scheduler = get_scheduler(
name=args.lr_scheduler_type,
optimizer=optimizer,
num_warmup_steps=args.num_warmup_steps,
num_training_steps=args.max_train_steps,
)
# Prepare everything with our `accelerator`.
model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = accelerator.prepare(
model, optimizer, train_dataloader, eval_dataloader, lr_scheduler
)
# We need to recalculate our total training steps as the size of the training dataloader may have changed
num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
if overrode_max_train_steps:
args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
# Afterwards we recalculate our number of training epochs
args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
# Figure out how many steps we should save the Accelerator states
checkpointing_steps = args.checkpointing_steps
if checkpointing_steps is not None and checkpointing_steps.isdigit():
checkpointing_steps = int(checkpointing_steps)
# We need to initialize the trackers we use, and also store our configuration.
# The trackers initialize automatically on the main process.
if args.with_tracking:
experiment_config = vars(args)
# TensorBoard cannot log Enums, need the raw value
experiment_config["lr_scheduler_type"] = experiment_config["lr_scheduler_type"].value
accelerator.init_trackers("peft_semantic_search", experiment_config)
metric = evaluate.load("roc_auc")
total_batch_size = args.per_device_train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps
if args.use_peft:
# saving and loading checkpoints for resuming training
accelerator.register_save_state_pre_hook(save_model_hook)
accelerator.register_load_state_pre_hook(load_model_hook)
logger.info("***** Running training *****")
logger.info(f" Num examples = {len(processed_datasets['train'])}")
logger.info(f" Num Epochs = {args.num_train_epochs}")
logger.info(f" Instantaneous batch size per device = {args.per_device_train_batch_size}")
logger.info(f" Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
logger.info(f" Gradient Accumulation steps = {args.gradient_accumulation_steps}")
logger.info(f" Total optimization steps = {args.max_train_steps}")
# Only show the progress bar once on each machine.
progress_bar = tqdm(range(args.max_train_steps), disable=not accelerator.is_local_main_process)
completed_steps = 0
starting_epoch = 0
# Potentially load in the weights and states from a previous save
if args.resume_from_checkpoint:
if args.resume_from_checkpoint is not None or args.resume_from_checkpoint != "":
accelerator.print(f"Resumed from checkpoint: {args.resume_from_checkpoint}")
accelerator.load_state(args.resume_from_checkpoint)
path = os.path.basename(args.resume_from_checkpoint)
else:
# Get the most recent checkpoint
dirs = [f.name for f in os.scandir(os.getcwd()) if f.is_dir()]
dirs.sort(key=os.path.getctime)
path = dirs[-1] # Sorts folders by date modified, most recent checkpoint is the last
# Extract `epoch_{i}` or `step_{i}`
training_difference = os.path.splitext(path)[0]
if "epoch" in training_difference:
starting_epoch = int(training_difference.replace("epoch_", "")) + 1
resume_step = None
completed_steps = starting_epoch * num_update_steps_per_epoch
else:
# need to multiply by `gradient_accumulation_steps` to reflect real steps
resume_step = int(training_difference.replace("step_", "")) * args.gradient_accumulation_steps
starting_epoch = resume_step // len(train_dataloader)
resume_step -= starting_epoch * len(train_dataloader)
completed_steps = resume_step // args.gradient_accumulation_steps
# update the progress_bar if load from checkpoint
progress_bar.update(completed_steps)
for epoch in range(starting_epoch, args.num_train_epochs):
model.train()
if args.with_tracking:
total_loss = 0
if args.resume_from_checkpoint and epoch == starting_epoch and resume_step is not None:
# We skip the first `n` batches in the dataloader when resuming from a checkpoint
active_dataloader = accelerator.skip_first_batches(train_dataloader, resume_step)
else:
active_dataloader = train_dataloader
for step, batch in enumerate(active_dataloader):
with accelerator.accumulate(model):
query_embs = model(**{k.replace("query_", ""): v for k, v in batch.items() if "query" in k})
product_embs = model(**{k.replace("product_", ""): v for k, v in batch.items() if "product" in k})
loss = get_loss(get_cosing_embeddings(query_embs, product_embs), batch["labels"])
total_loss += accelerator.reduce(loss.detach().float(), reduction="sum")
accelerator.backward(loss)
optimizer.step()
lr_scheduler.step()
model.zero_grad()
# Checks if the accelerator has performed an optimization step behind the scenes
if accelerator.sync_gradients:
progress_bar.update(1)
completed_steps += 1
if (step + 1) % 100 == 0:
logger.info(f"Step: {step+1}, Loss: {total_loss/(step+1)}")
if args.with_tracking:
accelerator.log({"train/loss": total_loss / (step + 1)}, step=completed_steps)
if isinstance(checkpointing_steps, int):
if completed_steps % checkpointing_steps == 0:
output_dir = f"step_{completed_steps}"
if args.output_dir is not None:
output_dir = os.path.join(args.output_dir, output_dir)
accelerator.save_state(output_dir)
if completed_steps >= args.max_train_steps:
break
model.eval()
for step, batch in enumerate(eval_dataloader):
with torch.no_grad():
query_embs = model(**{k.replace("query_", ""): v for k, v in batch.items() if "query" in k})
product_embs = model(**{k.replace("product_", ""): v for k, v in batch.items() if "product" in k})
prediction_scores = get_cosing_embeddings(query_embs, product_embs)
prediction_scores, references = accelerator.gather_for_metrics((prediction_scores, batch["labels"]))
metric.add_batch(
prediction_scores=prediction_scores,
references=references,
)
result = metric.compute()
result = {f"eval/{k}": v for k, v in result.items()}
# Use accelerator.print to print only on the main process.
accelerator.print(f"epoch {epoch}:", result)
if args.with_tracking:
result["train/epoch_loss"] = total_loss.item() / len(train_dataloader)
accelerator.log(result, step=completed_steps)
if args.output_dir is not None:
accelerator.wait_for_everyone()
if accelerator.is_main_process:
if isinstance(checkpointing_steps, str):
accelerator.save_state(os.path.join(args.output_dir, f"epoch_{epoch}"))
accelerator.unwrap_model(model).save_pretrained(
args.output_dir, state_dict=accelerator.get_state_dict(accelerator.unwrap_model(model))
)
tokenizer.save_pretrained(args.output_dir)
if args.push_to_hub:
commit_message = (
f"Training in progress epoch {epoch}"
if epoch < args.num_train_epochs - 1
else "End of training"
)
repo.push_to_hub(commit_message=commit_message, blocking=False, auto_lfs_prune=True)
accelerator.wait_for_everyone()
accelerator.end_training()
if __name__ == "__main__":
main()

View File

@ -0,0 +1,10 @@
git+https://github.com/huggingface/peft
git+https://github.com/huggingface/accelerate
git+https://github.com/huggingface/transformers
datasets
evaluate
hnswlib
pandas
tqdm
huggingface_hub
wandb

View File

@ -0,0 +1,198 @@
import os
import torch
import torch.nn as nn
import transformers
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
# -*- coding: utf-8 -*-
"""Finetune-opt-bnb-peft.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1jCkpikz0J2o20FBQmYmAGdiKmJGOMo-o
## Fine-tune large models using 🤗 `peft` adapters, `transformers` & `bitsandbytes`
In this tutorial we cover how to fine-tune large language models using the `peft` library together with `bitsandbytes` for loading large models in 8-bit.
The fine-tuning method relies on "Low-Rank Adapters" (LoRA): instead of fine-tuning the entire model, you only fine-tune these small adapters and load them properly inside the model.
After fine-tuning, you can share your adapters on the 🤗 Hub and load them back very easily. Let's get started!
### Install requirements
First, run the cells below to install the requirements:
"""
"""### Model loading
Here, let's load the `opt-6.7b` model; its weights in half-precision (float16) take about 13GB on the Hub. If we load them in 8-bit, we only need around 7GB of memory instead.
"""
free_in_GB = int(torch.cuda.mem_get_info()[0] / 1024**3)
max_memory = f"{free_in_GB-2}GB"
n_gpus = torch.cuda.device_count()
max_memory = {i: max_memory for i in range(n_gpus)}
model = AutoModelForCausalLM.from_pretrained(
"facebook/opt-350m",
max_memory=max_memory,
quantization_config=BitsAndBytesConfig(
load_in_4bit=True,
llm_int8_threshold=6.0,
llm_int8_has_fp16_weight=False,
bnb_4bit_compute_dtype=torch.float16,
bnb_4bit_use_double_quant=True,
bnb_4bit_quant_type="nf4",
),
torch_dtype=torch.float16,
)
tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m")
"""### Post-processing on the model
Finally, we need to apply some post-processing to the quantized model to enable training: we freeze all the layers and cast the layer norms to `float32` for stability. We also cast the output of the last layer to `float32` for the same reason.
"""
print(model)
for param in model.parameters():
param.requires_grad = False # freeze the model - train adapters later
if param.ndim == 1:
# cast the small parameters (e.g. layernorm) to fp32 for stability
param.data = param.data.to(torch.float32)
# model.gradient_checkpointing_enable() # reduce number of stored activations
# model.model.decoder.project_in = lambda x: x.requires_grad_(True)
class CastOutputToFloat(nn.Sequential):
def forward(self, x):
return super().forward(x).to(torch.float32)
model.lm_head = CastOutputToFloat(model.lm_head)
"""### Apply LoRA
Here comes the magic with `peft`! Let's create a `PeftModel` and specify that we are going to use low-rank adapters (LoRA) via the `get_peft_model` utility function from `peft`.
"""
def print_trainable_parameters(model):
"""
Prints the number of trainable parameters in the model.
"""
trainable_params = 0
all_param = 0
for _, param in model.named_parameters():
all_param += param.numel()
if param.requires_grad:
trainable_params += param.numel()
print(
f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
)
config = LoraConfig(
r=64,
lora_alpha=32,
target_modules=["q_proj", "v_proj", "out_proj", "fc1", "fc2"],
lora_dropout=0.01,
bias="none",
task_type="CAUSAL_LM",
)
model = get_peft_model(model, config)
print_trainable_parameters(model)
# Verifying the datatypes.
dtypes = {}
for _, p in model.named_parameters():
dtype = p.dtype
if dtype not in dtypes:
dtypes[dtype] = 0
dtypes[dtype] += p.numel()
total = 0
for k, v in dtypes.items():
total += v
for k, v in dtypes.items():
print(k, v, v / total)
"""### Training"""
data = load_dataset("Abirate/english_quotes")
data = data.map(lambda samples: tokenizer(samples["quote"]), batched=True)
trainer = transformers.Trainer(
model=model,
train_dataset=data["train"],
args=transformers.TrainingArguments(
per_device_train_batch_size=4,
gradient_accumulation_steps=4,
warmup_steps=10,
max_steps=20,
learning_rate=3e-4,
fp16=True,
logging_steps=1,
output_dir="outputs",
),
data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)
model.config.use_cache = False # silence the warnings. Please re-enable for inference!
trainer.train()
# from huggingface_hub import notebook_login
# notebook_login()
# model.push_to_hub("ybelkada/opt-6.7b-lora", use_auth_token=True)
"""## Load adapters from the Hub
You can also directly load adapters from the Hub using the commands below:
"""
# import torch
# from peft import PeftModel, PeftConfig
# from transformers import AutoModelForCausalLM, AutoTokenizer
#
# peft_model_id = "ybelkada/opt-6.7b-lora"
# config = PeftConfig.from_pretrained(peft_model_id)
# model = AutoModelForCausalLM.from_pretrained(config.base_model_name_or_path, return_dict=True, load_in_8bit=True, device_map='auto')
# tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)
#
## Load the Lora model
# model = PeftModel.from_pretrained(model, peft_model_id)
#
# """## Inference
#
# You can then directly use the trained model or the model that you have loaded from the 🤗 Hub for inference as you would do it usually in `transformers`.
# """
#
batch = tokenizer("Two things are infinite: ", return_tensors="pt")
model.config.use_cache = False # silence the warnings. Please re-enable for inference!
model.eval()
with torch.cuda.amp.autocast():
output_tokens = model.generate(**batch, max_new_tokens=50)
print("\n\n", tokenizer.decode(output_tokens[0], skip_special_tokens=True))
# model.save('./test.pt')
# """As you can see, by fine-tuning for just a few steps we have almost recovered the quote from Albert Einstein that is present in the [training data](https://huggingface.co/datasets/Abirate/english_quotes)."""

View File

@ -0,0 +1,15 @@
# Fine-tuning for image classification using LoRA and 🤗 PEFT
## Vision Transformer model from transformers
[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/peft/blob/main/examples/image_classification/image_classification_peft_lora.ipynb)
We provide a notebook (`image_classification_peft_lora.ipynb`) where we learn how to use [LoRA](https://arxiv.org/abs/2106.09685) from 🤗 PEFT to fine-tune an image classification model by ONLY using **0.7%** of the original trainable parameters of the model.
LoRA adds low-rank "update matrices" to certain blocks in the underlying model (in this case the attention blocks) and ONLY trains those matrices during fine-tuning. During inference, these update matrices are _merged_ with the original model parameters. For more details, check out the [original LoRA paper](https://arxiv.org/abs/2106.09685).
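In case you want a quick feel for what the notebook does, a minimal sketch of the LoRA setup for a ViT classifier looks roughly like this (the checkpoint name, label count, and hyperparameters here are illustrative; see the notebook for the exact configuration):
```python
from transformers import AutoModelForImageClassification
from peft import LoraConfig, get_peft_model

# Illustrative checkpoint and number of classes -- adapt to your dataset.
model = AutoModelForImageClassification.from_pretrained(
    "google/vit-base-patch16-224-in21k",
    num_labels=10,
)

config = LoraConfig(
    r=16,
    lora_alpha=16,
    target_modules=["query", "value"],  # attention projections inside the ViT blocks
    lora_dropout=0.1,
    bias="none",
    modules_to_save=["classifier"],  # keep the newly initialized classification head trainable
)
model = get_peft_model(model, config)
model.print_trainable_parameters()  # only the LoRA matrices and the head require gradients
```
The wrapped model can then be trained with a regular `Trainer`; at save time only the small adapter weights (plus the head) are written out.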
## PoolFormer model from timm
[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/peft/blob/main/examples/image_classification/image_classification_timm_peft_lora.ipynb)
The notebook `image_classification_timm_peft_lora.ipynb` showcases fine-tuning an image classification model from the [timm](https://huggingface.co/docs/timm/index) library. Again, LoRA is used to reduce the number of trainable parameters to a fraction of the total.
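As a rough sketch of what that looks like (the module names below are assumptions about timm's PoolFormer implementation, not necessarily what the notebook uses; inspect `model.named_modules()` to confirm them for your model):
```python
import timm
from peft import LoraConfig, get_peft_model

# Illustrative model name and class count.
model = timm.create_model("poolformer_m36", pretrained=True, num_classes=10)

# It helps to list candidate layers first, e.g.:
# print([name for name, _ in model.named_modules() if name.endswith(("fc1", "fc2"))])

config = LoraConfig(
    r=8,
    lora_alpha=8,
    target_modules=["mlp.fc1", "mlp.fc2"],  # assumption: PoolFormer's MLP layers in timm
    modules_to_save=["head"],  # assumption: the classifier head module
)
model = get_peft_model(model, config)
model.print_trainable_parameters()
```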

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@ -0,0 +1,103 @@
# coding=utf-8
# Copyright 2023-present the HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch
from datasets import load_dataset
from torch.utils.data import DataLoader, Dataset
from transformers import AutoModelForVision2Seq, AutoProcessor
from peft import LoraConfig, get_peft_model
# Let's define the LoraConfig
config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
)
# We load our model and processor using `transformers`
model = AutoModelForVision2Seq.from_pretrained("Salesforce/blip2-opt-2.7b", load_in_8bit=True)
processor = AutoProcessor.from_pretrained("Salesforce/blip2-opt-2.7b")
# Get our peft model and print the number of trainable parameters
model = get_peft_model(model, config)
model.print_trainable_parameters()
# Let's load the dataset here!
dataset = load_dataset("ybelkada/football-dataset", split="train")
class ImageCaptioningDataset(Dataset):
    def __init__(self, dataset, processor):
        self.dataset = dataset
        self.processor = processor

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        item = self.dataset[idx]
        encoding = self.processor(images=item["image"], padding="max_length", return_tensors="pt")
        # remove batch dimension
        encoding = {k: v.squeeze() for k, v in encoding.items()}
        encoding["text"] = item["text"]
        return encoding


def collator(batch):
    # pad the input_ids and attention_mask
    processed_batch = {}
    for key in batch[0].keys():
        if key != "text":
            processed_batch[key] = torch.stack([example[key] for example in batch])
        else:
            text_inputs = processor.tokenizer(
                [example["text"] for example in batch], padding=True, return_tensors="pt"
            )
            processed_batch["input_ids"] = text_inputs["input_ids"]
            processed_batch["attention_mask"] = text_inputs["attention_mask"]
    return processed_batch
train_dataset = ImageCaptioningDataset(dataset, processor)
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=2, collate_fn=collator)
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
device = "cuda" if torch.cuda.is_available() else "cpu"
model.train()
for epoch in range(50):
    print("Epoch:", epoch)
    for idx, batch in enumerate(train_dataloader):
        input_ids = batch.pop("input_ids").to(device)
        pixel_values = batch.pop("pixel_values").to(device, torch.float16)
        outputs = model(input_ids=input_ids, pixel_values=pixel_values, labels=input_ids)
        loss = outputs.loss
        print("Loss:", loss.item())
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        if idx % 10 == 0:
            generated_output = model.generate(pixel_values=pixel_values)
            print(processor.batch_decode(generated_output, skip_special_tokens=True))

View File

@ -0,0 +1,773 @@
import argparse
import gc
import json
import logging
import math
import os
from dataclasses import dataclass
from datetime import datetime
from pathlib import Path
from random import randint
from typing import Any, Dict, List, Union
# datasets imports
import datasets
# metric imports
import evaluate
import numpy as np
import torch
import transformers
import wandb
# accelerate imports
from accelerate import Accelerator, dispatch_model
from accelerate.logging import get_logger
from datasets import Audio, DatasetDict, IterableDatasetDict, interleave_datasets, load_dataset
# hf imports
from huggingface_hub import Repository
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import (
SchedulerType,
WhisperForConditionalGeneration,
WhisperProcessor,
get_scheduler,
set_seed,
)
from transformers.models.whisper.english_normalizer import BasicTextNormalizer
from transformers.utils import get_full_repo_name
# peft imports
from peft import AdaLoraConfig, LoraConfig, PeftModel, get_peft_model
logger = get_logger(__name__, log_level="INFO")
def parse_args():
parser = argparse.ArgumentParser(description="Whisper Fine-Tuning with AdaLora")
parser.add_argument(
"--model_name_or_path",
type=str,
help="Path to pretrained model or model identifier from huggingface.co/models.",
required=True,
)
parser.add_argument("--language", type=str, help="Language to use for training; e.g., 'Hindi' ", required=True)
parser.add_argument("--language_abbr", type=str, help="Language to use for training; e.g., 'hi' ", required=True)
parser.add_argument(
"--task", type=str, default="transcribe", help="Task to use for training; e.g., 'transcribe' ", required=False
)
parser.add_argument(
"--dataset_name",
type=str,
default="mozilla-foundation/common_voice_11_0",
help="Dataset to use for training; e.g., 'whisper' ",
required=False,
)
parser.add_argument(
"--dataset_in_streaming_mode",
action="store_true",
help="Whether to use streaming mode for the dataset.",
)
parser.add_argument(
"--do_lower_case", action="store_true", help="lowercase the transcribed text before tokenizing"
)
parser.add_argument(
"--do_remove_punctuation", action="store_true", help="remove punctuation from the transcribed text"
)
parser.add_argument("--push_to_hub", action="store_true", help="Whether or not to push the model to the Hub.")
parser.add_argument(
"--overwrite_cache", type=bool, default=False, help="Overwrite the cached training and evaluation sets"
)
parser.add_argument("--max_audio_input_length", type=float, default=30.0, help="Maximum audio length in seconds.")
parser.add_argument(
"--preprocessing_num_workers",
type=int,
default=None,
help="The number of processes to use for the preprocessing.",
)
parser.add_argument(
"--per_device_train_batch_size",
type=int,
default=8,
help="Batch size (per device) for the training dataloader.",
)
parser.add_argument(
"--per_device_eval_batch_size",
type=int,
default=8,
help="Batch size (per device) for the evaluation dataloader.",
)
parser.add_argument(
"--buffer_size",
type=int,
default=5000,
help="Number of samples to prefetch in the streaming mode.",
)
parser.add_argument(
"--dataloader_pin_memory",
action="store_true",
help="Whether or not to pin memory for the DataLoader.",
)
parser.add_argument(
"--dataloader_num_workers",
type=int,
default=0,
help="Number of subprocesses to use for data loading.",
)
parser.add_argument(
"--learning_rate",
type=float,
default=5e-5,
help="Initial learning rate (after the potential warmup period) to use.",
)
parser.add_argument("--weight_decay", type=float, default=0.0, help="Weight decay to use.")
parser.add_argument("--num_train_epochs", type=int, default=3, help="Total number of training epochs to perform.")
parser.add_argument(
"--max_train_steps",
type=int,
default=None,
help="Total number of training steps to perform. If provided, overrides num_train_epochs.",
)
parser.add_argument(
"--gradient_accumulation_steps",
type=int,
default=1,
help="Number of updates steps to accumulate before performing a backward/update pass.",
)
parser.add_argument(
"--lr_scheduler_type",
type=SchedulerType,
default="linear",
help="The scheduler type to use.",
choices=["linear", "cosine", "cosine_with_restarts", "polynomial", "constant", "constant_with_warmup"],
)
parser.add_argument(
"--num_warmup_steps", type=int, default=0, help="Number of steps for the warmup in the lr scheduler."
)
parser.add_argument("--output_dir", type=str, default=None, help="Where to store the final model.")
parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.")
parser.add_argument(
"--load_best_model",
action="store_true",
help="Whether to load the best model at the end of training",
)
parser.add_argument(
"--with_tracking",
action="store_true",
help="Whether to enable experiment trackers for logging.",
)
parser.add_argument(
"--report_to",
type=str,
default="all",
help=(
'The integration to report the results and logs to. Supported platforms are `"tensorboard"`,'
' `"wandb"` and `"comet_ml"`. Use `"all"` (default) to report to all integrations.'
"Only applicable when `--with_tracking` is passed."
),
)
parser.add_argument("--hub_token", type=str, help="The token to use to push to the Model Hub.")
parser.add_argument(
"--hub_model_id", type=str, help="The name of the repository to keep in sync with the local `output_dir`."
)
parser.add_argument(
"--checkpointing_steps",
type=int,
default=500,
help="Whether the various states should be saved at the end of every n steps, or 'epoch' for each epoch.",
)
parser.add_argument(
"--logging_steps",
type=int,
default=100,
help="Whether the various states should be saved at the end of every n steps, or 'epoch' for each epoch.",
)
parser.add_argument(
"--evaluation_steps",
type=int,
default=500,
help="Whether the various states should be saved at the end of every n steps, or 'epoch' for each epoch.",
)
parser.add_argument(
"--resume_from_checkpoint",
type=str,
default=None,
help="If the training should continue from a checkpoint folder.",
)
# lora/adalora specific args
parser.add_argument(
"--use_peft",
action="store_true",
help="Whether to use PEFT",
)
parser.add_argument(
"--use_adalora",
action="store_true",
help="Whether to use AdaLoRA or LoRA. If set, uses AdaLoRA instead of the default LoRA.",
)
parser.add_argument(
"--init_r",
type=int,
default=12,
help="Initial AdaLoRA rank",
)
parser.add_argument(
"--target_r",
type=int,
default=4,
help="Target AdaLoRA rank",
)
parser.add_argument(
"--tinit",
type=int,
default=200,
help="number of warmup steps for AdaLoRA wherein no pruning is performed",
)
parser.add_argument(
"--tfinal",
type=int,
default=1000,
help=" fix the resulting budget distribution and fine-tune the model for tfinal steps when using AdaLoRA ",
)
parser.add_argument(
"--delta_t",
type=int,
default=10,
help="interval of steps for AdaLoRA to update rank",
)
parser.add_argument(
"--lora_alpha",
type=int,
default=32,
help="LORA alpha",
)
parser.add_argument(
"--r",
type=int,
default=8,
help="LORA rank",
)
parser.add_argument(
"--lora_dropout",
type=float,
default=0.1,
help="LORA dropout",
)
parser.add_argument(
"--orth_reg_weight",
type=float,
default=0.5,
help="Orthogonal regularization weight",
)
parser.add_argument(
"--debug_mode",
action="store_true",
help="Whether to use debug mode",
)
args = parser.parse_args()
if args.push_to_hub:
assert args.output_dir is not None, "Need an `output_dir` to create a repo when `--push_to_hub` is passed."
return args
def load_streaming_dataset(dataset_name, dataset_config_name, split, **kwargs):
if "+" in split:
# load multiple splits separated by the `+` symbol *with* streaming mode
dataset_splits = [
load_dataset(dataset_name, dataset_config_name, split=split_name, streaming=True, **kwargs)
for split_name in split.split("+")
]
# interleave multiple splits to form one dataset
interleaved_dataset = interleave_datasets(dataset_splits)
return interleaved_dataset
else:
# load a single split *with* streaming mode
dataset = load_dataset(dataset_name, dataset_config_name, split=split, streaming=True, **kwargs)
return dataset
def prepare_dataset_wrapper(do_lower_case, do_remove_punctuation, processor, normalizer):
def prepare_dataset(batch):
# load and (possibly) resample audio data to 16kHz
audio = batch["audio"]
# compute log-Mel input features from input audio array
batch["input_features"] = processor.feature_extractor(
audio["array"], sampling_rate=audio["sampling_rate"]
).input_features[0]
# compute input length of audio sample in seconds
batch["input_length"] = len(audio["array"]) / audio["sampling_rate"]
# optional pre-processing steps
transcription = batch["sentence"]
if do_lower_case:
transcription = transcription.lower()
if do_remove_punctuation:
transcription = normalizer(transcription).strip()
# encode target text to label ids
batch["labels"] = processor.tokenizer(transcription).input_ids
return batch
return prepare_dataset
def save_model_hook(models, weights, output_dir):
for model in models:
model.save_pretrained(output_dir)
# make sure to pop weight so that corresponding model is not saved again
weights.pop()
def load_model_hook(models, input_dir):
while len(models) > 0:
model = models.pop()
# pop models so that they are not loaded again
PeftModel.from_pretrained(model.base_model.model, input_dir)
@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
processor: Any
def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
# split inputs and labels since they have to be of different lengths and need different padding methods
# first treat the audio inputs by simply returning torch tensors
input_features = [{"input_features": feature["input_features"]} for feature in features]
batch = self.processor.feature_extractor.pad(input_features, return_tensors="pt")
# get the tokenized label sequences
label_features = [{"input_ids": feature["labels"]} for feature in features]
# pad the labels to max length
labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt")
# replace padding with -100 to ignore loss correctly
labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)
# if bos token is appended in previous tokenization step,
# cut bos token here as it's append later anyways
if (labels[:, 0] == self.processor.tokenizer.bos_token_id).all().cpu().item():
labels = labels[:, 1:]
batch["labels"] = labels
return batch
def get_audio_length_processor(max_input_length):
def is_audio_in_length_range(length):
return length < max_input_length
return is_audio_in_length_range
def evaluation_loop(model, eval_dataloader, processor, normalizer, metric, forced_decoder_ids, accelerator):
model.eval()
predictions = []
references = []
normalized_predictions = []
normalized_references = []
for _, batch in enumerate(tqdm(eval_dataloader)):
with torch.cuda.amp.autocast():
with torch.no_grad():
generated_tokens = (
model.generate(
input_features=batch["input_features"],
forced_decoder_ids=forced_decoder_ids,
max_new_tokens=255,
)
.cpu()
.numpy()
)
labels = batch["labels"].cpu().numpy()
labels = np.where(labels != -100, labels, processor.tokenizer.pad_token_id)
decoded_preds = processor.tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
decoded_labels = processor.tokenizer.batch_decode(labels, skip_special_tokens=True)
predictions.extend(decoded_preds)
references.extend(decoded_labels)
normalized_predictions.extend([normalizer(pred).strip() for pred in decoded_preds])
normalized_references.extend([normalizer(label).strip() for label in decoded_labels])
del generated_tokens, labels, batch
gc.collect()
wer = 100 * metric.compute(predictions=predictions, references=references)
normalized_wer = 100 * metric.compute(predictions=normalized_predictions, references=normalized_references)
eval_metrics = {"eval/wer": wer, "eval/normalized_wer": normalized_wer}
if accelerator.get_tracker("wandb"):
sample_size = min(len(predictions), 256)
ids = [randint(0, len(predictions) - 1) for p in range(0, sample_size)]
sample_predictions = [predictions[i] for i in ids]
sample_references = [references[i] for i in ids]
sample_normalized_predictions = [normalized_predictions[i] for i in ids]
sample_normalized_references = [normalized_references[i] for i in ids]
table_rows = [
list(r)
for r in zip(
sample_predictions, sample_references, sample_normalized_predictions, sample_normalized_references
)
]
eval_metrics["eval_samples"] = wandb.Table(
columns=["predictions", "references", "normalized_predictions", "normalized_references"],
rows=table_rows,
)
return eval_metrics
def main():
args = parse_args()
# initialize accelerator
accelerator = (
Accelerator(
log_with=args.report_to,
project_dir=args.output_dir,
gradient_accumulation_steps=args.gradient_accumulation_steps,
)
if args.with_tracking
else Accelerator(gradient_accumulation_steps=args.gradient_accumulation_steps)
)
# Make one log on every process with the configuration for debugging.
logging.basicConfig(
format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
datefmt="%m/%d/%Y %H:%M:%S",
level=logging.INFO,
)
logger.info(accelerator.state, main_process_only=False)
if accelerator.is_local_main_process:
datasets.utils.logging.set_verbosity_warning()
transformers.utils.logging.set_verbosity_info()
else:
datasets.utils.logging.set_verbosity_error()
transformers.utils.logging.set_verbosity_error()
# If passed along, set the training seed now.
if args.seed is not None:
set_seed(args.seed)
# Handle the repository creation
if accelerator.is_main_process:
if args.push_to_hub:
if args.hub_model_id is None:
repo_name = get_full_repo_name(Path(args.output_dir).name, token=args.hub_token)
else:
repo_name = args.hub_model_id
repo = Repository(args.output_dir, clone_from=repo_name)
with open(os.path.join(args.output_dir, ".gitignore"), "w+") as gitignore:
if "step_*" not in gitignore:
gitignore.write("step_*\n")
if "epoch_*" not in gitignore:
gitignore.write("epoch_*\n")
elif args.output_dir is not None:
os.makedirs(args.output_dir, exist_ok=True)
accelerator.wait_for_everyone()
# load dataset either in streaming mode or not
processor = WhisperProcessor.from_pretrained(args.model_name_or_path, language=args.language, task=args.task)
normalizer = BasicTextNormalizer()
prepare_dataset = prepare_dataset_wrapper(args.do_lower_case, args.do_remove_punctuation, processor, normalizer)
is_audio_in_length_range = get_audio_length_processor(args.max_audio_input_length)
data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)
if args.dataset_in_streaming_mode:
raw_datasets = IterableDatasetDict()
loading_method = load_streaming_dataset
else:
raw_datasets = DatasetDict()
loading_method = load_dataset
if args.debug_mode:
train_split = "train[:100]"
test_split = "test[:10]"
else:
train_split = "train+validation"
test_split = "test"
raw_datasets["train"] = loading_method(
args.dataset_name, args.language_abbr, split=train_split, use_auth_token=True
)
raw_datasets["test"] = loading_method(args.dataset_name, args.language_abbr, split=test_split, use_auth_token=True)
raw_datasets = raw_datasets.cast_column("audio", Audio(sampling_rate=16000))
logger.info("Dataset loaded: %s", raw_datasets)
logger.info(f'{raw_datasets["train"][0]}')
vectorized_datasets = raw_datasets.map(
prepare_dataset,
remove_columns=list(next(iter(raw_datasets.values())).features),
num_proc=args.preprocessing_num_workers,
).with_format("torch")
if args.dataset_in_streaming_mode:
vectorized_datasets["train"] = vectorized_datasets["train"].shuffle(
buffer_size=args.buffer_size,
seed=args.seed,
)
# filter out audio files that are too long from the training set
is_audio_in_length_range = get_audio_length_processor(args.max_audio_input_length)
vectorized_datasets["train"] = vectorized_datasets["train"].filter(
is_audio_in_length_range, input_columns=["input_length"]
)
# get dataloaders
train_dataloader = DataLoader(
vectorized_datasets["train"],
batch_size=args.per_device_train_batch_size,
shuffle=True,
collate_fn=data_collator,
num_workers=args.dataloader_num_workers,
pin_memory=args.dataloader_pin_memory,
)
eval_dataloader = DataLoader(
vectorized_datasets["test"],
batch_size=args.per_device_eval_batch_size,
collate_fn=data_collator,
num_workers=args.dataloader_num_workers,
pin_memory=args.dataloader_pin_memory,
)
# metric
metric = evaluate.load("wer")
# model
model = WhisperForConditionalGeneration.from_pretrained(args.model_name_or_path, load_in_8bit=True)
model.config.forced_decoder_ids = None
model.config.suppress_tokens = []
if len(set(model.hf_device_map.values()).intersection({"cpu", "disk"})) > 0:
raise ValueError("Training on CPU or disk is not supported.")
if len(set(model.hf_device_map.values())) > 1:
device_map = model.hf_device_map.copy()
# Required because `labels` are on the main execution device (0) while the output of `proj_out` is on another device,
# which leads to a device mismatch error when computing the cross-entropy between logits and labels.
# This doesn't arise during inference because `labels` aren't supplied then.
# Instead of changing the device of just one of the tied modules, it has to be done for all tied modules,
# otherwise the execution device of the remaining tied modules isn't changed.
device_map["model.decoder.embed_tokens"] = model._hf_hook.execution_device
device_map["model.decoder.embed_positions"] = model._hf_hook.execution_device
device_map["proj_out"] = model._hf_hook.execution_device
dispatch_model(model, device_map=device_map)
# preparing peft model
if args.use_peft:
from peft import prepare_model_for_int8_training
model = prepare_model_for_int8_training(model)
# as Whisper model uses Conv layer in encoder, checkpointing disables grad computation
# to avoid this, make the inputs trainable
def make_inputs_require_grad(module, input, output):
output.requires_grad_(True)
model.model.encoder.conv1.register_forward_hook(make_inputs_require_grad)
# wrapping model with adalora tuner
if args.use_adalora:
config = AdaLoraConfig(
init_r=args.init_r,
target_r=args.target_r,
beta1=0.85,
beta2=0.85,
tinit=args.tinit,
tfinal=args.tfinal,
deltaT=args.delta_t,
lora_alpha=args.lora_alpha,
lora_dropout=args.lora_dropout,
target_modules=["k_proj", "q_proj", "v_proj", "out_proj", "fc1", "fc2"],
orth_reg_weight=args.orth_reg_weight,
)
else:
config = LoraConfig(
r=args.r,
lora_alpha=args.lora_alpha,
target_modules=["q_proj", "v_proj"],
lora_dropout=args.lora_dropout,
)
model = get_peft_model(model, config)
model.print_trainable_parameters()
# optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=args.learning_rate, weight_decay=args.weight_decay)
# compute this up front so it is available in both branches below
num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
if args.max_train_steps is None:
    args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
else:
    args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
# scheduler
lr_scheduler = get_scheduler(
name=args.lr_scheduler_type,
optimizer=optimizer,
num_warmup_steps=args.num_warmup_steps,
num_training_steps=args.max_train_steps,
)
# Prepare everything with our `accelerator`.
model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = accelerator.prepare(
model, optimizer, train_dataloader, eval_dataloader, lr_scheduler
)
accelerator.print(model)
# Note here that the max steps is adjusted by the accelerator's num_processes
args.max_train_steps = math.ceil(args.max_train_steps / accelerator.num_processes)
if args.use_peft and args.use_adalora:
model.base_model.peft_config["default"].total_step = args.max_train_steps
# model.base_model.peft_config.total_step = args.max_train_steps
# We need to initialize the trackers we use, and also store our configuration.
# The trackers initializes automatically on the main process.
if args.with_tracking:
run_name = f"run-{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}"
experiment_config = vars(args)
# TensorBoard cannot log Enums, need the raw value
experiment_config["lr_scheduler_type"] = experiment_config["lr_scheduler_type"].value
accelerator.init_trackers(
"Whisper PEFT Fine-Tuning", config=experiment_config, init_kwargs={"wandb": {"name": run_name}}
)
# saving and loading checkpoints for resuming training
accelerator.register_save_state_pre_hook(save_model_hook)
accelerator.register_load_state_pre_hook(load_model_hook)
total_batch_size = args.per_device_train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps
logger.info("***** Running training *****")
logger.info(f" Num Epochs = {args.num_train_epochs}")
logger.info(f" Instantaneous batch size per device = {args.per_device_train_batch_size}")
logger.info(f" Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
logger.info(f" Gradient Accumulation steps = {args.gradient_accumulation_steps}")
logger.info(f" Total optimization steps = {args.max_train_steps}")
# Only show the progress bar once on each machine.
progress_bar = tqdm(range(args.max_train_steps), disable=not accelerator.is_local_main_process)
global_step = 0
starting_epoch = 0
best_metric = None
resume_step = 0
forced_decoder_ids = processor.get_decoder_prompt_ids(language=args.language, task=args.task)
# Potentially load in the weights and states from a previous save
if args.resume_from_checkpoint:
accelerator.load_state(args.resume_from_checkpoint)
path = os.path.basename(args.resume_from_checkpoint)
training_difference = os.path.splitext(path)[0]
global_step = resume_step = int(training_difference.replace("step_", ""))
starting_epoch = resume_step // len(train_dataloader)
resume_step -= starting_epoch * len(train_dataloader)
# We need to adjust the progress bar to the current step
progress_bar.update(resume_step)
for epoch in range(starting_epoch, args.num_train_epochs):
model.train()
if args.with_tracking:
total_loss = 0
running_loss = 0
for step, batch in enumerate(accelerator.skip_first_batches(train_dataloader, num_batches=resume_step)):
with accelerator.accumulate(model):
outputs = model(**batch)
loss = outputs.loss
accelerator.backward(loss)
optimizer.step()
lr_scheduler.step()
# Update the importance of low-rank matrices
# and allocate the budget accordingly.
# This is only needed for AdaLora.
# Note that this requires parameter gradients.
# Hence being called before optimizer.zero_grad().
if args.use_peft and args.use_adalora:
model.update_and_allocate(global_step)
optimizer.zero_grad()
global_step += 1
progress_bar.update(1)
if args.with_tracking:
step_loss = accelerator.reduce(loss.detach().clone()).item()
total_loss += step_loss
running_loss += step_loss
if global_step % args.checkpointing_steps == 0:
output_dir = os.path.join(args.output_dir, f"step_{global_step}")
accelerator.save_state(output_dir)
if global_step % args.logging_steps == 0:
if args.with_tracking:
accelerator.log({"train/running_loss": running_loss / args.logging_steps}, step=global_step)
running_loss = 0
if global_step % args.evaluation_steps == 0:
eval_metrics = evaluation_loop(
model, eval_dataloader, processor, normalizer, metric, forced_decoder_ids, accelerator
)
if args.with_tracking:
logger.info(f"Step {global_step} eval metrics: {eval_metrics}")
accelerator.log(eval_metrics, step=global_step)
if best_metric is None or eval_metrics["eval/wer"] < best_metric:
best_metric = eval_metrics["eval/wer"]
accelerator.save_state(os.path.join(args.output_dir, "best_checkpoint"))
model.train()
if global_step >= args.max_train_steps:
break
if args.with_tracking:
train_epoch_loss = total_loss / (step + 1)
logger.info(f"Epoch {epoch} train loss: {train_epoch_loss}")
accelerator.log({"epoch/train_loss": train_epoch_loss}, step=epoch)
if args.push_to_hub and epoch <= args.num_train_epochs - 1:
accelerator.wait_for_everyone()
unwrapped_model = accelerator.unwrap_model(model)
unwrapped_model.save_pretrained(args.output_dir, is_main_process=accelerator.is_main_process)
# evaluate the model at the end of training
eval_metrics = evaluation_loop(
model, eval_dataloader, processor, normalizer, metric, forced_decoder_ids, accelerator
)
if args.with_tracking:
logger.info(f"Step {global_step} eval metrics: {eval_metrics}")
accelerator.log(eval_metrics, step=global_step)
if best_metric is None or eval_metrics["eval/wer"] < best_metric:
best_metric = eval_metrics["eval/wer"]
accelerator.save_state(os.path.join(args.output_dir, "best_checkpoint"))
if accelerator.is_main_process:
processor.tokenizer.save_pretrained(args.output_dir)
repo.push_to_hub(
commit_message=f"Training in progress epoch {epoch}", blocking=False, auto_lfs_prune=True
)
if args.load_best_model:
# load the best model
accelerator.load_state(os.path.join(args.output_dir, "best_checkpoint"))
model.resize_modules_by_rank_pattern(model.peft_config["default"].rank_pattern, "default")
eval_metrics = evaluation_loop(
model, eval_dataloader, processor, normalizer, metric, forced_decoder_ids, accelerator
)
if args.with_tracking:
best_metrics = {"best_" + k: v for k, v in eval_metrics.items()}
accelerator.log(best_metrics, step=global_step)
accelerator.wait_for_everyone()
unwrapped_model = accelerator.unwrap_model(model)
unwrapped_model.save_pretrained(args.output_dir, is_main_process=accelerator.is_main_process)
if accelerator.is_main_process:
processor.tokenizer.save_pretrained(args.output_dir)
if args.push_to_hub:
repo.push_to_hub(commit_message="End of training", auto_lfs_prune=True)
with open(os.path.join(args.output_dir, "all_results.json"), "w") as f:
eval_metrics.pop("eval_samples")
json.dump(eval_metrics, f)
if __name__ == "__main__":
main()

File diff suppressed because it is too large

View File

@ -0,0 +1,37 @@
accelerate launch --config_file config.yaml peft_adalora_whisper_large_training.py \
--model_name_or_path "openai/whisper-large-v2" \
--language "Marathi" \
--language_abbr "mr" \
--task "transcribe" \
--dataset_name "mozilla-foundation/common_voice_11_0" \
--push_to_hub \
--preprocessing_num_workers 2 \
--per_device_train_batch_size 8 \
--per_device_eval_batch_size 8 \
--dataloader_pin_memory \
--dataloader_num_workers 2 \
--learning_rate 1e-3 \
--weight_decay 1e-4 \
--num_train_epochs 3 \
--gradient_accumulation_steps 1 \
--lr_scheduler_type "linear" \
--num_warmup_steps 50 \
--output_dir "adalora_whisper_large_marathi_multi_adapter" \
--seed 42 \
--load_best_model \
--with_tracking \
--report_to "wandb" \
--hub_token $HUB_TOKEN \
--checkpointing_steps 2000 \
--evaluation_steps 2000 \
--logging_steps 25 \
--use_peft \
--use_adalora \
--init_r 12 \
--target_r 8 \
--tinit 100 \
--tfinal 800 \
--delta_t 10 \
--lora_alpha 32 \
--lora_dropout 0.1 \
--orth_reg_weight 0.5

View File

@ -0,0 +1,54 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "kdOhtpergLCQ"
},
"outputs": [],
"source": [
"!git clone https://huggingface.co/spaces/smangrul/peft-lora-sd-dreambooth"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "_LuGk9mihPx7"
},
"outputs": [],
"source": [
"%cd \"peft-lora-sd-dreambooth\"\n",
"!pip install -r requirements.txt"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "BYKO8e5ElJOX"
},
"outputs": [],
"source": [
"!python colab.py"
]
}
],
"metadata": {
"accelerator": "GPU",
"colab": {
"provenance": []
},
"gpuClass": "premium",
"kernelspec": {
"display_name": "Python 3",
"name": "python3"
},
"language_info": {
"name": "python"
}
},
"nbformat": 4,
"nbformat_minor": 0
}

View File

@ -0,0 +1,167 @@
import argparse
import os
import re
from typing import Callable, List, Optional, Union
import safetensors
import torch
import torch.nn as nn
from diffusers import UNet2DConditionModel
from transformers import CLIPTextModel
from peft import LoraConfig, get_peft_model, get_peft_model_state_dict, set_peft_model_state_dict
# Default kohya_ss LoRA replacement modules
# https://github.com/kohya-ss/sd-scripts/blob/c924c47f374ac1b6e33e71f82948eb1853e2243f/networks/lora.py#L661
UNET_TARGET_REPLACE_MODULE = ["Transformer2DModel", "Attention"]
UNET_TARGET_REPLACE_MODULE_CONV2D_3X3 = ["ResnetBlock2D", "Downsample2D", "Upsample2D"]
TEXT_ENCODER_TARGET_REPLACE_MODULE = ["CLIPAttention", "CLIPMLP"]
LORA_PREFIX_UNET = "lora_unet"
LORA_PREFIX_TEXT_ENCODER = "lora_te"
def get_modules_names(
root_module: nn.Module,
target_replace_modules_linear: Optional[List[str]] = [],
target_replace_modules_conv2d: Optional[List[str]] = [],
):
# Combine replacement modules
target_replace_modules = target_replace_modules_linear + target_replace_modules_conv2d
# Store result
modules_names = set()
# https://github.com/kohya-ss/sd-scripts/blob/c924c47f374ac1b6e33e71f82948eb1853e2243f/networks/lora.py#L720
for name, module in root_module.named_modules():
if module.__class__.__name__ in target_replace_modules:
if len(name) == 0:
continue
for child_name, child_module in module.named_modules():
if len(child_name) == 0:
continue
is_linear = child_module.__class__.__name__ == "Linear"
is_conv2d = child_module.__class__.__name__ == "Conv2d"
if (is_linear and module.__class__.__name__ in target_replace_modules_linear) or (
is_conv2d and module.__class__.__name__ in target_replace_modules_conv2d
):
modules_names.add(f"{name}.{child_name}")
return sorted(modules_names)
def get_rank_alpha(
layer_names: List[str],
value_getter: Callable[[str], Union[int, float]],
filter_string: str,
) -> Union[int, float]:
values = [value_getter(p) for p in filter(lambda x: bool(re.search(filter_string, x)), layer_names)]
value = values[0]
assert all(v == value for v in values), f"All LoRA ranks and alphas must be same, found: {values}"
return value
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--sd_checkpoint", default=None, type=str, required=True, help="SD checkpoint to use")
parser.add_argument(
"--kohya_lora_path", default=None, type=str, required=True, help="Path to kohya_ss trained LoRA"
)
parser.add_argument("--dump_path", default=None, type=str, required=True, help="Path to the output model.")
parser.add_argument("--half", action="store_true", help="Save weights in half precision.")
args = parser.parse_args()
# Find text encoder modules to add LoRA to
text_encoder = CLIPTextModel.from_pretrained(args.sd_checkpoint, subfolder="text_encoder")
text_encoder_modules_names = get_modules_names(
text_encoder, target_replace_modules_linear=TEXT_ENCODER_TARGET_REPLACE_MODULE
)
# Find unet2d modules to add LoRA to
unet = UNet2DConditionModel.from_pretrained(args.sd_checkpoint, subfolder="unet")
unet_modules_names = get_modules_names(
unet,
target_replace_modules_linear=UNET_TARGET_REPLACE_MODULE,
target_replace_modules_conv2d=UNET_TARGET_REPLACE_MODULE,
)
# Open kohya_ss checkpoint
with safetensors.safe_open(args.kohya_lora_path, framework="pt", device="cpu") as f:
# Extract information about LoRA structure
metadata = f.metadata()
if (metadata is not None) and ("ss_network_dim" in metadata) and ("ss_network_alpha" in metadata):
# LoRA rank and alpha are in safetensors metadata, just get it
lora_r = lora_text_encoder_r = int(metadata["ss_network_dim"])
lora_alpha = lora_text_encoder_alpha = float(metadata["ss_network_alpha"])
else:
# LoRA rank and alpha are not present, so infer them
lora_r = get_rank_alpha(
f.keys(), lambda n: f.get_tensor(n).size(0), f"^{LORA_PREFIX_UNET}\w+\.lora_down\.weight$"
)
lora_text_encoder_r = get_rank_alpha(
f.keys(), lambda n: f.get_tensor(n).size(0), f"^{LORA_PREFIX_TEXT_ENCODER}\w+\.lora_down\.weight$"
)
lora_alpha = get_rank_alpha(f.keys(), lambda n: f.get_tensor(n).item(), f"^{LORA_PREFIX_UNET}\w+\.alpha$")
lora_text_encoder_alpha = get_rank_alpha(
f.keys(), lambda n: f.get_tensor(n).item(), f"^{LORA_PREFIX_TEXT_ENCODER}\w+\.alpha$"
)
# Create LoRA for text encoder
text_encoder_config = LoraConfig(
r=lora_text_encoder_r,
lora_alpha=lora_text_encoder_alpha,
target_modules=text_encoder_modules_names,
lora_dropout=0.0,
bias="none",
)
text_encoder = get_peft_model(text_encoder, text_encoder_config)
text_encoder_lora_state_dict = {x: None for x in get_peft_model_state_dict(text_encoder).keys()}
# Load text encoder values from kohya_ss LoRA
for peft_te_key in text_encoder_lora_state_dict.keys():
kohya_ss_te_key = peft_te_key.replace("base_model.model", LORA_PREFIX_TEXT_ENCODER)
kohya_ss_te_key = kohya_ss_te_key.replace("lora_A", "lora_down")
kohya_ss_te_key = kohya_ss_te_key.replace("lora_B", "lora_up")
kohya_ss_te_key = kohya_ss_te_key.replace(".", "_", kohya_ss_te_key.count(".") - 2)
text_encoder_lora_state_dict[peft_te_key] = f.get_tensor(kohya_ss_te_key).to(text_encoder.dtype)
# Load converted kohya_ss text encoder LoRA back to PEFT
set_peft_model_state_dict(text_encoder, text_encoder_lora_state_dict)
if args.half:
text_encoder.to(torch.float16)
# Save text encoder result
text_encoder.save_pretrained(
os.path.join(args.dump_path, "text_encoder"),
)
# Create LoRA for unet2d
unet_config = LoraConfig(
r=lora_r, lora_alpha=lora_alpha, target_modules=unet_modules_names, lora_dropout=0.0, bias="none"
)
unet = get_peft_model(unet, unet_config)
unet_lora_state_dict = {x: None for x in get_peft_model_state_dict(unet).keys()}
# Load unet2d values from kohya_ss LoRA
for peft_unet_key in unet_lora_state_dict.keys():
kohya_ss_unet_key = peft_unet_key.replace("base_model.model", LORA_PREFIX_UNET)
kohya_ss_unet_key = kohya_ss_unet_key.replace("lora_A", "lora_down")
kohya_ss_unet_key = kohya_ss_unet_key.replace("lora_B", "lora_up")
kohya_ss_unet_key = kohya_ss_unet_key.replace(".", "_", kohya_ss_unet_key.count(".") - 2)
unet_lora_state_dict[peft_unet_key] = f.get_tensor(kohya_ss_unet_key).to(unet.dtype)
# Load converted kohya_ss unet LoRA back to PEFT
set_peft_model_state_dict(unet, unet_lora_state_dict)
if args.half:
unet.to(torch.float16)
# Save unet result
unet.save_pretrained(
os.path.join(args.dump_path, "unet"),
)

View File

@ -0,0 +1,101 @@
import argparse
import os
from typing import Dict
import torch
from diffusers import UNet2DConditionModel
from safetensors.torch import save_file
from transformers import CLIPTextModel
from peft import PeftModel, get_peft_model_state_dict
# Default kohya_ss LoRA replacement modules
# https://github.com/kohya-ss/sd-scripts/blob/c924c47f374ac1b6e33e71f82948eb1853e2243f/networks/lora.py#L664
LORA_PREFIX_UNET = "lora_unet"
LORA_PREFIX_TEXT_ENCODER = "lora_te"
LORA_ADAPTER_NAME = "default"
def get_module_kohya_state_dict(
module: PeftModel, prefix: str, dtype: torch.dtype, adapter_name: str = LORA_ADAPTER_NAME
) -> Dict[str, torch.Tensor]:
kohya_ss_state_dict = {}
for peft_key, weight in get_peft_model_state_dict(module, adapter_name=adapter_name).items():
kohya_key = peft_key.replace("base_model.model", prefix)
kohya_key = kohya_key.replace("lora_A", "lora_down")
kohya_key = kohya_key.replace("lora_B", "lora_up")
kohya_key = kohya_key.replace(".", "_", kohya_key.count(".") - 2)
kohya_ss_state_dict[kohya_key] = weight.to(dtype)
# Set alpha parameter
if "lora_down" in kohya_key:
alpha_key = f'{kohya_key.split(".")[0]}.alpha'
kohya_ss_state_dict[alpha_key] = torch.tensor(module.peft_config[adapter_name].lora_alpha).to(dtype)
return kohya_ss_state_dict
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--sd_checkpoint",
default=None,
type=str,
required=True,
help="Path to pretrained model or model identifier from huggingface.co/models.",
)
parser.add_argument(
"--sd_checkpoint_revision",
type=str,
default=None,
required=False,
help="Revision of pretrained model identifier from huggingface.co/models.",
)
parser.add_argument("--peft_lora_path", default=None, type=str, required=True, help="Path to peft trained LoRA")
parser.add_argument(
"--dump_path",
default=None,
type=str,
required=True,
help="Path to the output safetensors file for use with webui.",
)
parser.add_argument("--half", action="store_true", help="Save weights in half precision.")
args = parser.parse_args()
# Store kohya_ss state dict
kohya_ss_state_dict = {}
dtype = torch.float16 if args.half else torch.float32
# Load Text Encoder LoRA model
text_encoder_peft_lora_path = os.path.join(args.peft_lora_path, "text_encoder")
if os.path.exists(text_encoder_peft_lora_path):
text_encoder = CLIPTextModel.from_pretrained(
args.sd_checkpoint, subfolder="text_encoder", revision=args.sd_checkpoint_revision
)
text_encoder = PeftModel.from_pretrained(
text_encoder, text_encoder_peft_lora_path, adapter_name=LORA_ADAPTER_NAME
)
kohya_ss_state_dict.update(
get_module_kohya_state_dict(text_encoder, LORA_PREFIX_TEXT_ENCODER, dtype, LORA_ADAPTER_NAME)
)
# Load UNet LoRA model
unet_peft_lora_path = os.path.join(args.peft_lora_path, "unet")
if os.path.exists(unet_peft_lora_path):
unet = UNet2DConditionModel.from_pretrained(
args.sd_checkpoint, subfolder="unet", revision=args.sd_checkpoint_revision
)
unet = PeftModel.from_pretrained(unet, unet_peft_lora_path, adapter_name=LORA_ADAPTER_NAME)
kohya_ss_state_dict.update(get_module_kohya_state_dict(unet, LORA_PREFIX_UNET, dtype, LORA_ADAPTER_NAME))
# Save state dict
save_file(
kohya_ss_state_dict,
args.dump_path,
)

File diff suppressed because one or more lines are too long

View File

@ -1,11 +1,11 @@
transformers
accelerate
loralib
evaluate
deepspeed
tqdm
datasets
diffusers
Pillow
torchvision
huggingface_hub
huggingface_hub
safetensors
wandb

View File

@ -2,7 +2,6 @@ import argparse
import gc
import hashlib
import itertools
import json
import logging
import math
import os
@ -11,6 +10,10 @@ import warnings
from pathlib import Path
from typing import Optional
import datasets
import diffusers
import numpy as np
import psutil
import torch
import torch.nn.functional as F
import torch.utils.checkpoint
@ -18,21 +21,24 @@ import transformers
from accelerate import Accelerator
from accelerate.logging import get_logger
from accelerate.utils import set_seed
from torch.utils.data import Dataset
from transformers import AutoTokenizer, PretrainedConfig
import datasets
import diffusers
import psutil
from diffusers import AutoencoderKL, DDPMScheduler, DiffusionPipeline, UNet2DConditionModel
from diffusers import (
AutoencoderKL,
DDPMScheduler,
DiffusionPipeline,
DPMSolverMultistepScheduler,
UNet2DConditionModel,
)
from diffusers.optimization import get_scheduler
from diffusers.utils import check_min_version
from diffusers.utils.import_utils import is_xformers_available
from huggingface_hub import HfFolder, Repository, whoami
from peft import LoraConfig, LoraModel, get_peft_model_state_dict
from PIL import Image
from torch.utils.data import Dataset
from torchvision import transforms
from tqdm.auto import tqdm
from transformers import AutoTokenizer, PretrainedConfig
from peft import LoraConfig, get_peft_model
# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
@ -129,6 +135,27 @@ def parse_args(input_args=None):
" class_data_dir, additional images will be sampled with class_prompt."
),
)
parser.add_argument(
"--validation_prompt",
type=str,
default=None,
help="A prompt that is used during validation to verify that the model is learning.",
)
parser.add_argument(
"--num_validation_images",
type=int,
default=4,
help="Number of images that should be generated during validation with `validation_prompt`.",
)
parser.add_argument(
"--validation_steps",
type=int,
default=100,
help=(
"Run dreambooth validation every X steps. Dreambooth validation consists of running the prompt"
" `args.validation_prompt` multiple times: `args.num_validation_images`."
),
)
parser.add_argument(
"--output_dir",
type=str,
@ -302,6 +329,18 @@ def parse_args(input_args=None):
' (default), `"wandb"` and `"comet_ml"`. Use `"all"` to report to all integrations.'
),
)
parser.add_argument(
"--wandb_key",
type=str,
default=None,
help=("If report to option is set to wandb, api-key for wandb used for login to wandb "),
)
parser.add_argument(
"--wandb_project_name",
type=str,
default=None,
help=("If report to option is set to wandb, project name in wandb for log tracking "),
)
parser.add_argument(
"--mixed_precision",
type=str,
@ -405,21 +444,6 @@ class TorchTracemalloc:
# print(f"delta used/peak {self.used:4d}/{self.peaked:4d}")
def print_trainable_parameters(model):
"""
Prints the number of trainable parameters in the model.
"""
trainable_params = 0
all_param = 0
for _, param in model.named_parameters():
all_param += param.numel()
if param.requires_grad:
trainable_params += param.numel()
print(
f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
)
class DreamBoothDataset(Dataset):
"""
A dataset to prepare the instance and class images with the prompts for fine-tuning the model.
@ -557,9 +581,13 @@ def main(args):
gradient_accumulation_steps=args.gradient_accumulation_steps,
mixed_precision=args.mixed_precision,
log_with=args.report_to,
logging_dir=logging_dir,
project_dir=logging_dir,
)
if args.report_to == "wandb":
import wandb
wandb.login(key=args.wandb_key)
wandb.init(project=args.wandb_project_name)
# Currently, it's not possible to do gradient accumulation when training two models with accelerate.accumulate
# This will be enabled soon in accelerate. For now, we don't allow gradient accumulation when training two models.
# TODO (patil-suraj): Remove this check when gradient accumulation with two models is enabled in accelerate.
@ -689,8 +717,8 @@ def main(args):
lora_dropout=args.lora_dropout,
bias=args.lora_bias,
)
unet = LoraModel(config, unet)
print_trainable_parameters(unet)
unet = get_peft_model(unet, config)
unet.print_trainable_parameters()
print(unet)
vae.requires_grad_(False)
@ -704,8 +732,8 @@ def main(args):
lora_dropout=args.lora_text_encoder_dropout,
bias=args.lora_text_encoder_bias,
)
text_encoder = LoraModel(config, text_encoder)
print_trainable_parameters(text_encoder)
text_encoder = get_peft_model(text_encoder, config)
text_encoder.print_trainable_parameters()
print(text_encoder)
if args.enable_xformers_memory_efficient_attention:
@ -871,6 +899,8 @@ def main(args):
if args.resume_from_checkpoint and epoch == first_epoch and step < resume_step:
if step % args.gradient_accumulation_steps == 0:
progress_bar.update(1)
if args.report_to == "wandb":
accelerator.print(progress_bar)
continue
with accelerator.accumulate(unet):
@ -936,6 +966,8 @@ def main(args):
# Checks if the accelerator has performed an optimization step behind the scenes
if accelerator.sync_gradients:
progress_bar.update(1)
if args.report_to == "wandb":
accelerator.print(progress_bar)
global_step += 1
# if global_step % args.checkpointing_steps == 0:
@ -948,6 +980,57 @@ def main(args):
progress_bar.set_postfix(**logs)
accelerator.log(logs, step=global_step)
if (
args.validation_prompt is not None
and (step + num_update_steps_per_epoch * epoch) % args.validation_steps == 0
):
logger.info(
f"Running validation... \n Generating {args.num_validation_images} images with prompt:"
f" {args.validation_prompt}."
)
# create pipeline
pipeline = DiffusionPipeline.from_pretrained(
args.pretrained_model_name_or_path,
safety_checker=None,
revision=args.revision,
)
# set `keep_fp32_wrapper` to True because we do not want to remove
# mixed precision hooks while we are still training
pipeline.unet = accelerator.unwrap_model(unet, keep_fp32_wrapper=True)
pipeline.text_encoder = accelerator.unwrap_model(text_encoder, keep_fp32_wrapper=True)
pipeline.scheduler = DPMSolverMultistepScheduler.from_config(pipeline.scheduler.config)
pipeline = pipeline.to(accelerator.device)
pipeline.set_progress_bar_config(disable=True)
# run inference
if args.seed is not None:
generator = torch.Generator(device=accelerator.device).manual_seed(args.seed)
else:
generator = None
images = []
for _ in range(args.num_validation_images):
image = pipeline(args.validation_prompt, num_inference_steps=25, generator=generator).images[0]
images.append(image)
for tracker in accelerator.trackers:
if tracker.name == "tensorboard":
np_images = np.stack([np.asarray(img) for img in images])
tracker.writer.add_images("validation", np_images, epoch, dataformats="NHWC")
if tracker.name == "wandb":
import wandb
tracker.log(
{
"validation": [
wandb.Image(image, caption=f"{i}: {args.validation_prompt}")
for i, image in enumerate(images)
]
}
)
del pipeline
torch.cuda.empty_cache()
if global_step >= args.max_train_steps:
break
# Printing the GPU memory usage details such as allocated memory, peak memory, and total memory usage
@ -973,21 +1056,16 @@ def main(args):
accelerator.wait_for_everyone()
if accelerator.is_main_process:
if args.use_lora:
lora_config = {}
state_dict = get_peft_model_state_dict(unet, state_dict=accelerator.get_state_dict(unet))
lora_config["peft_config"] = unet.get_peft_config_as_dict(inference=True)
unwrapped_unet = accelerator.unwrap_model(unet)
unwrapped_unet.save_pretrained(
os.path.join(args.output_dir, "unet"), state_dict=accelerator.get_state_dict(unet)
)
if args.train_text_encoder:
text_encoder_state_dict = get_peft_model_state_dict(
text_encoder, state_dict=accelerator.get_state_dict(text_encoder)
unwrapped_text_encoder = accelerator.unwrap_model(text_encoder)
unwrapped_text_encoder.save_pretrained(
os.path.join(args.output_dir, "text_encoder"),
state_dict=accelerator.get_state_dict(text_encoder),
)
text_encoder_state_dict = {f"text_encoder_{k}": v for k, v in text_encoder_state_dict.items()}
state_dict.update(text_encoder_state_dict)
lora_config["text_encoder_peft_config"] = text_encoder.get_peft_config_as_dict(inference=True)
accelerator.print(state_dict)
accelerator.save(state_dict, os.path.join(args.output_dir, f"{args.instance_prompt}_lora.pt"))
with open(os.path.join(args.output_dir, f"{args.instance_prompt}_lora_config.json"), "w") as f:
json.dump(lora_config, f)
else:
pipeline = DiffusionPipeline.from_pretrained(
args.pretrained_model_name_or_path,
@ -997,6 +1075,9 @@ def main(args):
)
pipeline.save_pretrained(args.output_dir)
if args.push_to_hub:
repo.push_to_hub(commit_message="End of training", blocking=False, auto_lfs_prune=True)
accelerator.end_training()

View File

@ -0,0 +1,358 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "jONLwzXgLg-I",
"metadata": {
"id": "jONLwzXgLg-I"
},
"outputs": [],
"source": [
"!pip install -q git+https://github.com/huggingface/transformers.git\n",
"!pip install -q git+https://github.com/huggingface/peft.git\n",
"!pip install -q git+https://github.com/huggingface/accelerate.git@main\n",
"!pip install huggingface_hub\n",
"!pip install bitsandbytes\n",
"!pip install SentencePiece"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "36460935",
"metadata": {
"id": "36460935"
},
"outputs": [],
"source": [
"import os\n",
"\n",
"os.environ[\"CUDA_VISIBLE_DEVICES\"] = \"0\""
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1351e04c",
"metadata": {
"id": "1351e04c"
},
"outputs": [],
"source": [
"from huggingface_hub import notebook_login\n",
"import torch\n",
"\n",
"notebook_login()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d85af699",
"metadata": {
"id": "d85af699"
},
"outputs": [],
"source": [
"from peft import PeftModel\n",
"from transformers import LlamaTokenizer, LlamaForCausalLM, GenerationConfig\n",
"\n",
"model_name = \"decapoda-research/llama-7b-hf\"\n",
"tokenizer = LlamaTokenizer.from_pretrained(model_name)\n",
"model = LlamaForCausalLM.from_pretrained(model_name, load_in_8bit=True, device_map=\"auto\", use_auth_token=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f0f515ed",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "f0f515ed",
"outputId": "312488a5-f4f8-48a4-8c63-7b4a59e80418"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 14.3 s, sys: 3.98 s, total: 18.3 s\n",
"Wall time: 19.3 s\n"
]
}
],
"source": [
"%%time\n",
"model = PeftModel.from_pretrained(model, \"tloen/alpaca-lora-7b\", adapter_name=\"eng_alpaca\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "67a0c121",
"metadata": {
"id": "67a0c121"
},
"outputs": [],
"source": [
"%%time\n",
"model.load_adapter(\"22h/cabrita-lora-v0-1\", adapter_name=\"portuguese_alpaca\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4b655fca",
"metadata": {
"id": "4b655fca"
},
"outputs": [],
"source": [
"model"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e9ebd572",
"metadata": {
"id": "e9ebd572"
},
"outputs": [],
"source": [
"model.to(\"cuda\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "138805b3",
"metadata": {
"id": "138805b3"
},
"outputs": [],
"source": [
"import torch\n",
"\n",
"device = \"cuda\"\n",
"\n",
"\n",
"def generate_prompt(instruction, input=None):\n",
" if input:\n",
" return f\"\"\"Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n",
"### Instruction:\n",
"{instruction}\n",
"### Input:\n",
"{input}\n",
"### Response:\"\"\"\n",
" else:\n",
" return f\"\"\"Below is an instruction that describes a task. Write a response that appropriately completes the request.\n",
"### Instruction:\n",
"{instruction}\n",
"### Response:\"\"\"\n",
"\n",
"\n",
"def evaluate(\n",
" instruction,\n",
" input=None,\n",
" temperature=0.1,\n",
" top_p=0.75,\n",
" top_k=40,\n",
" num_beams=4,\n",
" max_new_tokens=256,\n",
" **kwargs,\n",
"):\n",
" prompt = generate_prompt(instruction, input)\n",
" inputs = tokenizer(prompt, return_tensors=\"pt\")\n",
" input_ids = inputs[\"input_ids\"].to(device)\n",
" generation_config = GenerationConfig(\n",
" temperature=temperature,\n",
" top_p=top_p,\n",
" top_k=top_k,\n",
" num_beams=num_beams,\n",
" no_repeat_ngram_size=3,\n",
" **kwargs,\n",
" )\n",
"\n",
" with torch.no_grad():\n",
" generation_output = model.generate(\n",
" input_ids=input_ids,\n",
" generation_config=generation_config,\n",
" return_dict_in_generate=True,\n",
" output_scores=True,\n",
" max_new_tokens=max_new_tokens,\n",
" )\n",
" s = generation_output.sequences[0]\n",
" output = tokenizer.decode(s)\n",
" return output.split(\"### Response:\")[1].strip()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "fd5e6b3b",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "fd5e6b3b",
"outputId": "ec72241b-c427-4258-b02f-2101df0d171a"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 4.98 ms, sys: 0 ns, total: 4.98 ms\n",
"Wall time: 5.19 ms\n"
]
}
],
"source": [
"%%time\n",
"model.set_adapter(\"eng_alpaca\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "33650851",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "33650851",
"outputId": "aae24052-0f09-4812-88c3-6fb53dec656c"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"The alpaca (Vicugna pacos) is a domesticated species of South American camelid. It resembles a small llama in appearance, but unlike the llama, it is not used as a beast of burden. It is kept primarily for its fiber, which can be spun into yarn. Alpaca fiber is warmer, lighter, and softer than sheep's wool, and is highly valued in the textile industry. The fiber comes in a variety of natural colors, including white, beige, cream, and fawn. It can also be dyed in a wide range of colors.\n",
"Alpaca herds can be found in the highlands of Peru, Bolivia, Chile, Ecuador, and Colombia. They are also raised in the United States, Canada, Australia, New Zealand, and Europe. The animals graze on grasses, herbs, and shrubs, and can survive in temperatures as low as -30°F (-34°C). They are social animals, living in herds of up to 20 individuals.\n",
"The fiber of the alpaka is used to make clothing\n"
]
}
],
"source": [
"instruction = \"Tell me about alpacas.\"\n",
"\n",
"print(evaluate(instruction))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "fdc7196e",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "fdc7196e",
"outputId": "44cb6742-066b-470e-f507-cbf21e5ae030"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 5.58 ms, sys: 97 µs, total: 5.68 ms\n",
"Wall time: 5.63 ms\n"
]
}
],
"source": [
"%%time\n",
"model.set_adapter(\"portuguese_alpaca\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "31997da3",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "31997da3",
"outputId": "8071de75-dc9d-4e89-e85f-674f1de22658"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\"Eu preciso ficar em casa para cuidar de meu gato.\"\n"
]
}
],
"source": [
"instruction = \"Invente uma desculpa criativa pra dizer que não preciso ir à festa.\"\n",
"\n",
"print(evaluate(instruction))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8b8e4e9a",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "8b8e4e9a",
"outputId": "84226223-e018-4feb-e189-969c344fd940"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"I'm sorry, but I can't go to the party. I'm sick. I have a cold. I don't feel well. I need to stay at home and rest.\n",
"I have a lot of homework to do. My dog ate my homework. My homework is too hard. I didn't have time to do it. It's too late. I forgot about it.\n",
"My parents won't let me go. My parents are out of town. They're on vacation. They have to work. They are sick. They need to take care of my brother.\n",
"They're not home. They went to the grocery store. They took the car to the mechanic. They had to go to a meeting. They were in a hurry. They forgot about me.\n",
"Their car broke down. Their car ran out of gas. They got a flat tire. They couldn't find a parking space. They didn' t have enough money. They lost their wallet.\n",
"It's raining. The roads are icy. There's a blizzard. There are too many cars on the road. There was an accident.\n"
]
}
],
"source": [
"with model.disable_adapter():\n",
" instruction = \"Invente uma desculpa criativa pra dizer que não preciso ir à festa.\"\n",
"\n",
" print(evaluate(instruction))"
]
}
],
"metadata": {
"accelerator": "GPU",
"colab": {
"provenance": []
},
"gpuClass": "standard",
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.4"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

View File

@ -0,0 +1,5 @@
# Fine-tuning a multilayer perceptron using LoRA and 🤗 PEFT
[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/peft/blob/main/examples/multilayer_perceptron/multilayer_perceptron_lora.ipynb)
PEFT supports fine-tuning any type of model as long as the layers it uses are supported; the model does not have to be a transformers model, for instance. To demonstrate this, the accompanying notebook `multilayer_perceptron_lora.ipynb` shows how to apply LoRA to a simple multilayer perceptron and train it on a classification task.
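As a quick illustration of the same idea, here is a minimal sketch of wrapping a plain `torch.nn.Module` with LoRA (the module and layer names below are made up for this example; the notebook uses its own MLP):

```python
import torch.nn as nn
from peft import LoraConfig, get_peft_model

# A toy, non-transformers model; the layer names are illustrative.
class TinyNet(nn.Module):
    def __init__(self):
        super().__init__()
        self.lin1 = nn.Linear(20, 64)
        self.act = nn.ReLU()
        self.lin2 = nn.Linear(64, 2)

    def forward(self, x):
        return self.lin2(self.act(self.lin1(x)))

# Apply LoRA to `lin1` and keep the output layer `lin2` fully trainable.
config = LoraConfig(r=8, target_modules=["lin1"], modules_to_save=["lin2"])
peft_net = get_peft_model(TinyNet(), config)
peft_net.print_trainable_parameters()
```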

View File

@ -0,0 +1,800 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "8e8743c8",
"metadata": {},
"source": [
"# Using PEFT with custom models"
]
},
{
"cell_type": "markdown",
"id": "c42c67e1",
"metadata": {},
"source": [
"`peft` allows us to fine-tune models efficiently with LoRA. In this short notebook, we will demonstrate how to train a simple multilayer perceptron (MLP) using `peft`."
]
},
{
"cell_type": "markdown",
"id": "ce314af5",
"metadata": {},
"source": [
"## Imports"
]
},
{
"cell_type": "markdown",
"id": "b28b214d",
"metadata": {},
"source": [
"Make sure that you have the latest version of `peft` installed. To ensure that, run this in your Python environment:\n",
" \n",
" python -m pip install --upgrade peft"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "4d9da3d9",
"metadata": {},
"outputs": [],
"source": [
"import copy\n",
"import os\n",
"\n",
"# ignore bnb warnings\n",
"os.environ[\"BITSANDBYTES_NOWELCOME\"] = \"1\""
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "44075f54",
"metadata": {},
"outputs": [],
"source": [
"import peft\n",
"import torch\n",
"from torch import nn\n",
"import torch.nn.functional as F"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "f72acdfb",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<torch._C.Generator at 0x7f2a64177510>"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"torch.manual_seed(0)"
]
},
{
"cell_type": "markdown",
"id": "2b127a78",
"metadata": {},
"source": [
"## Data"
]
},
{
"cell_type": "markdown",
"id": "f265da76",
"metadata": {},
"source": [
"We will create a toy dataset consisting of random data for a classification task. There is a little bit of signal in the data, so we should expect that the loss of the model can improve during training."
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "b355567e",
"metadata": {},
"outputs": [],
"source": [
"X = torch.rand((1000, 20))\n",
"y = (X.sum(1) > 10).long()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "a60a869d",
"metadata": {},
"outputs": [],
"source": [
"n_train = 800\n",
"batch_size = 64"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "8859572e",
"metadata": {},
"outputs": [],
"source": [
"train_dataloader = torch.utils.data.DataLoader(\n",
" torch.utils.data.TensorDataset(X[:n_train], y[:n_train]),\n",
" batch_size=batch_size,\n",
" shuffle=True,\n",
")\n",
"eval_dataloader = torch.utils.data.DataLoader(\n",
" torch.utils.data.TensorDataset(X[n_train:], y[n_train:]),\n",
" batch_size=batch_size,\n",
")"
]
},
{
"cell_type": "markdown",
"id": "97bddd2c",
"metadata": {},
"source": [
"## Model"
]
},
{
"cell_type": "markdown",
"id": "db694a58",
"metadata": {},
"source": [
"As a model, we use a simple multilayer perceptron (MLP). For demonstration purposes, we use a very large number of hidden units. This is totally overkill for this task but it helps to demonstrate the advantages of `peft`. In more realistic settings, models will also be quite large on average, so this is not far-fetched."
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "1b43cd8f",
"metadata": {},
"outputs": [],
"source": [
"class MLP(nn.Module):\n",
" def __init__(self, num_units_hidden=2000):\n",
" super().__init__()\n",
" self.seq = nn.Sequential(\n",
" nn.Linear(20, num_units_hidden),\n",
" nn.ReLU(),\n",
" nn.Linear(num_units_hidden, num_units_hidden),\n",
" nn.ReLU(),\n",
" nn.Linear(num_units_hidden, 2),\n",
" nn.LogSoftmax(dim=-1),\n",
" )\n",
"\n",
" def forward(self, X):\n",
" return self.seq(X)"
]
},
{
"cell_type": "markdown",
"id": "1277bf00",
"metadata": {},
"source": [
"## Training"
]
},
{
"cell_type": "markdown",
"id": "02caf26a",
"metadata": {},
"source": [
"Here are just a few training hyper-parameters and a simple function that performs the training and evaluation loop."
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "5d14c0c4",
"metadata": {},
"outputs": [],
"source": [
"lr = 0.002\n",
"batch_size = 64\n",
"max_epochs = 30\n",
"device = 'cpu' if not torch.cuda.is_available() else 'cuda'"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "657d6b3e",
"metadata": {},
"outputs": [],
"source": [
"def train(model, optimizer, criterion, train_dataloader, eval_dataloader, epochs):\n",
" for epoch in range(epochs):\n",
" model.train()\n",
" train_loss = 0\n",
" for xb, yb in train_dataloader:\n",
" xb = xb.to(device)\n",
" yb = yb.to(device)\n",
" outputs = model(xb)\n",
" loss = criterion(outputs, yb)\n",
" train_loss += loss.detach().float()\n",
" loss.backward()\n",
" optimizer.step()\n",
" optimizer.zero_grad()\n",
"\n",
" model.eval()\n",
" eval_loss = 0\n",
" for xb, yb in eval_dataloader:\n",
" xb = xb.to(device)\n",
" yb = yb.to(device)\n",
" with torch.no_grad():\n",
" outputs = model(xb)\n",
" loss = criterion(outputs, yb)\n",
" eval_loss += loss.detach().float()\n",
"\n",
" eval_loss_total = (eval_loss / len(eval_dataloader)).item()\n",
" train_loss_total = (train_loss / len(train_dataloader)).item()\n",
" print(f\"{epoch=:<2} {train_loss_total=:.4f} {eval_loss_total=:.4f}\")"
]
},
{
"cell_type": "markdown",
"id": "b382dcbe",
"metadata": {},
"source": [
"### Training without peft"
]
},
{
"cell_type": "markdown",
"id": "b40d4873",
"metadata": {},
"source": [
"Let's start without using `peft` to see what we can expect from the model training."
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "f059ced4",
"metadata": {},
"outputs": [],
"source": [
"module = MLP().to(device)\n",
"optimizer = torch.optim.Adam(module.parameters(), lr=lr)\n",
"criterion = nn.CrossEntropyLoss()"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "17698863",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"epoch=0 train_loss_total=0.7970 eval_loss_total=0.6472\n",
"epoch=1 train_loss_total=0.5597 eval_loss_total=0.4898\n",
"epoch=2 train_loss_total=0.3696 eval_loss_total=0.3323\n",
"epoch=3 train_loss_total=0.2364 eval_loss_total=0.5454\n",
"epoch=4 train_loss_total=0.2428 eval_loss_total=0.2843\n",
"epoch=5 train_loss_total=0.1251 eval_loss_total=0.2514\n",
"epoch=6 train_loss_total=0.0952 eval_loss_total=0.2068\n",
"epoch=7 train_loss_total=0.0831 eval_loss_total=0.2395\n",
"epoch=8 train_loss_total=0.0655 eval_loss_total=0.2524\n",
"epoch=9 train_loss_total=0.0380 eval_loss_total=0.3650\n",
"epoch=10 train_loss_total=0.0363 eval_loss_total=0.3495\n",
"epoch=11 train_loss_total=0.0231 eval_loss_total=0.2360\n",
"epoch=12 train_loss_total=0.0162 eval_loss_total=0.2276\n",
"epoch=13 train_loss_total=0.0094 eval_loss_total=0.2716\n",
"epoch=14 train_loss_total=0.0065 eval_loss_total=0.2237\n",
"epoch=15 train_loss_total=0.0054 eval_loss_total=0.2366\n",
"epoch=16 train_loss_total=0.0035 eval_loss_total=0.2673\n",
"epoch=17 train_loss_total=0.0028 eval_loss_total=0.2630\n",
"epoch=18 train_loss_total=0.0023 eval_loss_total=0.2835\n",
"epoch=19 train_loss_total=0.0021 eval_loss_total=0.2727\n",
"epoch=20 train_loss_total=0.0018 eval_loss_total=0.2597\n",
"epoch=21 train_loss_total=0.0016 eval_loss_total=0.2553\n",
"epoch=22 train_loss_total=0.0014 eval_loss_total=0.2712\n",
"epoch=23 train_loss_total=0.0013 eval_loss_total=0.2637\n",
"epoch=24 train_loss_total=0.0012 eval_loss_total=0.2733\n",
"epoch=25 train_loss_total=0.0011 eval_loss_total=0.2738\n",
"epoch=26 train_loss_total=0.0010 eval_loss_total=0.2476\n",
"epoch=27 train_loss_total=0.0010 eval_loss_total=0.2583\n",
"epoch=28 train_loss_total=0.0009 eval_loss_total=0.2842\n",
"epoch=29 train_loss_total=0.0008 eval_loss_total=0.2634\n",
"CPU times: user 1.26 s, sys: 187 ms, total: 1.45 s\n",
"Wall time: 1.45 s\n"
]
}
],
"source": [
"%time train(module, optimizer, criterion, train_dataloader, eval_dataloader, epochs=max_epochs)"
]
},
{
"cell_type": "markdown",
"id": "4cef0029",
"metadata": {},
"source": [
"Okay, so we got an eval loss of ~0.26, which is much better than random."
]
},
{
"cell_type": "markdown",
"id": "4f106078",
"metadata": {},
"source": [
"### Training with peft"
]
},
{
"cell_type": "markdown",
"id": "8dd47aa4",
"metadata": {},
"source": [
"Now let's train with `peft`. First we check the names of the modules, so that we can configure `peft` to fine-tune the right modules."
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "922db29b",
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/plain": [
"[('', __main__.MLP),\n",
" ('seq', torch.nn.modules.container.Sequential),\n",
" ('seq.0', torch.nn.modules.linear.Linear),\n",
" ('seq.1', torch.nn.modules.activation.ReLU),\n",
" ('seq.2', torch.nn.modules.linear.Linear),\n",
" ('seq.3', torch.nn.modules.activation.ReLU),\n",
" ('seq.4', torch.nn.modules.linear.Linear),\n",
" ('seq.5', torch.nn.modules.activation.LogSoftmax)]"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"[(n, type(m)) for n, m in MLP().named_modules()]"
]
},
{
"cell_type": "markdown",
"id": "5efb275d",
"metadata": {},
"source": [
"Next we can define the LoRA config. There is nothing special going on here. We set the LoRA rank to 8 and select the layers `seq.0` and `seq.2` to be used for LoRA fine-tuning. As for `seq.4`, which is the output layer, we set it as `module_to_save`, which means it is also trained but no LoRA is applied."
]
},
{
"cell_type": "markdown",
"id": "cf2c608d",
"metadata": {},
"source": [
"*Note: Not all layers types can be fine-tuned with LoRA. At the moment, linear layers, embeddings, `Conv2D` and `transformers.pytorch_utils.Conv1D` are supported."
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "b342438f",
"metadata": {},
"outputs": [],
"source": [
"config = peft.LoraConfig(\n",
" r=8,\n",
" target_modules=[\"seq.0\", \"seq.2\"],\n",
" modules_to_save=[\"seq.4\"],\n",
")"
]
},
{
"cell_type": "markdown",
"id": "829b4e2d",
"metadata": {},
"source": [
"Now let's create the `peft` model by passing our initial MLP, as well as the config we just defined, to `get_peft_model`."
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "602b6658",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"trainable params: 56,164 || all params: 4,100,164 || trainable%: 1.369798866581922\n"
]
}
],
"source": [
"module = MLP().to(device)\n",
"module_copy = copy.deepcopy(module) # we keep a copy of the original model for later\n",
"peft_model = peft.get_peft_model(module, config)\n",
"optimizer = torch.optim.Adam(peft_model.parameters(), lr=lr)\n",
"criterion = nn.CrossEntropyLoss()\n",
"peft_model.print_trainable_parameters()"
]
},
{
"cell_type": "markdown",
"id": "2103737d",
"metadata": {},
"source": [
"Checking the numbers, we see that only ~1% of parameters are actually trained, which is what we like to see.\n",
"\n",
"Now let's start the training:"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "9200cbc6",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"epoch=0 train_loss_total=0.6918 eval_loss_total=0.6518\n",
"epoch=1 train_loss_total=0.5975 eval_loss_total=0.6125\n",
"epoch=2 train_loss_total=0.5402 eval_loss_total=0.4929\n",
"epoch=3 train_loss_total=0.3886 eval_loss_total=0.3476\n",
"epoch=4 train_loss_total=0.2677 eval_loss_total=0.3185\n",
"epoch=5 train_loss_total=0.1938 eval_loss_total=0.2294\n",
"epoch=6 train_loss_total=0.1712 eval_loss_total=0.2653\n",
"epoch=7 train_loss_total=0.1555 eval_loss_total=0.2764\n",
"epoch=8 train_loss_total=0.1218 eval_loss_total=0.2104\n",
"epoch=9 train_loss_total=0.0846 eval_loss_total=0.1756\n",
"epoch=10 train_loss_total=0.0710 eval_loss_total=0.1873\n",
"epoch=11 train_loss_total=0.0372 eval_loss_total=0.1539\n",
"epoch=12 train_loss_total=0.0350 eval_loss_total=0.2348\n",
"epoch=13 train_loss_total=0.0298 eval_loss_total=0.4605\n",
"epoch=14 train_loss_total=0.0355 eval_loss_total=0.2208\n",
"epoch=15 train_loss_total=0.0099 eval_loss_total=0.1583\n",
"epoch=16 train_loss_total=0.0051 eval_loss_total=0.2042\n",
"epoch=17 train_loss_total=0.0029 eval_loss_total=0.2045\n",
"epoch=18 train_loss_total=0.0022 eval_loss_total=0.2285\n",
"epoch=19 train_loss_total=0.0015 eval_loss_total=0.2118\n",
"epoch=20 train_loss_total=0.0012 eval_loss_total=0.2237\n",
"epoch=21 train_loss_total=0.0010 eval_loss_total=0.2363\n",
"epoch=22 train_loss_total=0.0009 eval_loss_total=0.2531\n",
"epoch=23 train_loss_total=0.0008 eval_loss_total=0.2528\n",
"epoch=24 train_loss_total=0.0007 eval_loss_total=0.2443\n",
"epoch=25 train_loss_total=0.0006 eval_loss_total=0.2267\n",
"epoch=26 train_loss_total=0.0006 eval_loss_total=0.2379\n",
"epoch=27 train_loss_total=0.0005 eval_loss_total=0.2658\n",
"epoch=28 train_loss_total=0.0005 eval_loss_total=0.2326\n",
"epoch=29 train_loss_total=0.0004 eval_loss_total=0.2520\n",
"CPU times: user 950 ms, sys: 4.7 ms, total: 955 ms\n",
"Wall time: 957 ms\n"
]
}
],
"source": [
"%time train(peft_model, optimizer, criterion, train_dataloader, eval_dataloader, epochs=max_epochs)"
]
},
{
"cell_type": "markdown",
"id": "20f6f452",
"metadata": {},
"source": [
"In the end, we see that the eval loss is very similar to the one we saw earlier when we trained without `peft`. This is quite nice to see, given that we are training a much smaller number of parameters."
]
},
{
"cell_type": "markdown",
"id": "fa55d1d4",
"metadata": {},
"source": [
"#### Check which parameters were updated"
]
},
{
"cell_type": "markdown",
"id": "a6e2146b",
"metadata": {},
"source": [
"Finally, just to check that LoRA was applied as expected, we check what original weights were updated what weights stayed the same."
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "c7dcde21",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"New parameter model.seq.0.lora_A.default.weight | 160 parameters | updated\n",
"New parameter model.seq.0.lora_B.default.weight | 16000 parameters | updated\n",
"New parameter model.seq.2.lora_A.default.weight | 16000 parameters | updated\n",
"New parameter model.seq.2.lora_B.default.weight | 16000 parameters | updated\n"
]
}
],
"source": [
"for name, param in peft_model.base_model.named_parameters():\n",
" if \"lora\" not in name:\n",
" continue\n",
"\n",
" print(f\"New parameter {name:<13} | {param.numel():>5} parameters | updated\")"
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "022e6c41",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Parameter seq.0.weight | 40000 parameters | not updated\n",
"Parameter seq.0.bias | 2000 parameters | not updated\n",
"Parameter seq.2.weight | 4000000 parameters | not updated\n",
"Parameter seq.2.bias | 2000 parameters | not updated\n",
"Parameter seq.4.weight | 4000 parameters | not updated\n",
"Parameter seq.4.bias | 2 parameters | not updated\n",
"Parameter seq.4.weight | 4000 parameters | updated\n",
"Parameter seq.4.bias | 2 parameters | updated\n"
]
}
],
"source": [
"params_before = dict(module_copy.named_parameters())\n",
"for name, param in peft_model.base_model.named_parameters():\n",
" if \"lora\" in name:\n",
" continue\n",
"\n",
" name_before = name.partition(\".\")[-1].replace(\"original_\", \"\").replace(\"module.\", \"\").replace(\"modules_to_save.default.\", \"\")\n",
" param_before = params_before[name_before]\n",
" if torch.allclose(param, param_before):\n",
" print(f\"Parameter {name_before:<13} | {param.numel():>7} parameters | not updated\")\n",
" else:\n",
" print(f\"Parameter {name_before:<13} | {param.numel():>7} parameters | updated\")"
]
},
{
"cell_type": "markdown",
"id": "4c09b43d",
"metadata": {},
"source": [
"So we can see that apart from the new LoRA weights that were added, only the last layer was updated. Since the LoRA weights and the last layer have comparitively few parameters, this gives us a big boost in efficiency."
]
},
{
"cell_type": "markdown",
"id": "b46c6198",
"metadata": {},
"source": [
"## Sharing the model through Hugging Face Hub"
]
},
{
"cell_type": "markdown",
"id": "6289e647",
"metadata": {},
"source": [
"### Pushing the model to HF Hub"
]
},
{
"cell_type": "markdown",
"id": "06dcdfa0",
"metadata": {},
"source": [
"With the `peft` model, it is also very easy to push a model the Hugging Face Hub. Below, we demonstrate how it works. It is assumed that you have a valid Hugging Face account and are logged in:"
]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "1b91a0af",
"metadata": {},
"outputs": [],
"source": [
"user = \"BenjaminB\" # put your user name here\n",
"model_name = \"peft-lora-with-custom-model\"\n",
"model_id = f\"{user}/{model_name}\""
]
},
{
"cell_type": "code",
"execution_count": 19,
"id": "1430fffd",
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "8163dba7aa8e4012830d72fd7342e9b6",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"adapter_model.bin: 0%| | 0.00/211k [00:00<?, ?B/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "5370fdac247e4a4180406a59e5f1ed63",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Upload 1 LFS files: 0%| | 0/1 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"peft_model.push_to_hub(model_id);"
]
},
{
"cell_type": "markdown",
"id": "632bd799",
"metadata": {},
"source": [
"As we can see, the adapter size is only 211 kB."
]
},
{
"cell_type": "markdown",
"id": "4ff78c0c",
"metadata": {},
"source": [
"### Loading the model from HF Hub"
]
},
{
"cell_type": "markdown",
"id": "e5c7e87f",
"metadata": {},
"source": [
"Now, it only takes one step to load the model from HF Hub. To do this, we can use `PeftModel.from_pretrained`, passing our base model and the model ID:"
]
},
{
"cell_type": "code",
"execution_count": 20,
"id": "ce0fcced",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"peft.peft_model.PeftModel"
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"loaded = peft.PeftModel.from_pretrained(module_copy, model_id)\n",
"type(loaded)"
]
},
{
"cell_type": "markdown",
"id": "cd4b4eac",
"metadata": {},
"source": [
"Let's check that the two models produce the same output:"
]
},
{
"cell_type": "code",
"execution_count": 21,
"id": "f2cf6ac4",
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"y_peft = peft_model(X.to(device))\n",
"y_loaded = loaded(X.to(device))\n",
"torch.allclose(y_peft, y_loaded)"
]
},
{
"cell_type": "markdown",
"id": "eeeb653f",
"metadata": {},
"source": [
"### Clean up"
]
},
{
"cell_type": "markdown",
"id": "61c60355",
"metadata": {},
"source": [
"Finally, as a clean up step, you may want to delete the repo."
]
},
{
"cell_type": "code",
"execution_count": 22,
"id": "b747038f",
"metadata": {},
"outputs": [],
"source": [
"from huggingface_hub import delete_repo"
]
},
{
"cell_type": "code",
"execution_count": 23,
"id": "7e5ab237",
"metadata": {},
"outputs": [],
"source": [
"delete_repo(model_id)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.11"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

View File

@ -0,0 +1,7 @@
# Fine-tuning for semantic segmentation using LoRA and 🤗 PEFT
[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/peft/blob/main/examples/semantic_segmentation/semantic_segmentation_peft_lora.ipynb)
We provide a notebook (`semantic_segmentation_peft_lora.ipynb`) where we learn how to use [LoRA](https://arxiv.org/abs/2106.09685) from 🤗 PEFT to fine-tune a semantic segmentation model by ONLY using **14%** of the original trainable parameters of the model.
LoRA adds low-rank "update matrices" to certain blocks in the underlying model (in this case the attention blocks) and ONLY trains those matrices during fine-tuning. During inference, these update matrices are _merged_ with the original model parameters. For more details, check out the [original LoRA paper](https://arxiv.org/abs/2106.09685).
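A minimal sketch of such a setup is shown below; the checkpoint name and target module names are assumptions based on a typical SegFormer configuration, so refer to the notebook for the exact values used there:

```python
from transformers import SegformerForSemanticSegmentation
from peft import LoraConfig, get_peft_model

# Base segmentation model (illustrative checkpoint).
model = SegformerForSemanticSegmentation.from_pretrained("nvidia/mit-b0")

config = LoraConfig(
    r=32,
    lora_alpha=32,
    target_modules=["query", "value"],  # low-rank updates on the attention blocks only
    lora_dropout=0.1,
    modules_to_save=["decode_head"],    # keep the segmentation head fully trainable
)
lora_model = get_peft_model(model, config)
lora_model.print_trainable_parameters()
```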

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@ -0,0 +1,214 @@
import argparse
import evaluate
import torch
from accelerate import Accelerator, DistributedDataParallelKwargs
from datasets import load_dataset
from torch.optim import AdamW
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import AutoModelForSequenceClassification, AutoTokenizer, get_linear_schedule_with_warmup, set_seed
from peft import (
PrefixTuningConfig,
PromptEncoderConfig,
PromptTuningConfig,
get_peft_model,
)
from peft.utils.other import fsdp_auto_wrap_policy
def parse_args():
parser = argparse.ArgumentParser(description="PEFT a transformers model on a sequence classification task")
parser.add_argument(
"--num_virtual_tokens",
type=int,
default=20,
help="num_virtual_tokens if the number of virtual tokens used in prompt/prefix/P tuning.",
)
parser.add_argument(
"--encoder_hidden_size",
type=int,
default=128,
help="encoder_hidden_size if the encoder hidden size used in P tuninig/Prefix tuning.",
)
parser.add_argument(
"--model_name_or_path",
type=str,
help="Path to pretrained model or model identifier from huggingface.co/models.",
required=True,
)
parser.add_argument(
"--per_device_train_batch_size",
type=int,
default=8,
help="Batch size (per device) for the training dataloader.",
)
parser.add_argument(
"--per_device_eval_batch_size",
type=int,
default=8,
help="Batch size (per device) for the evaluation dataloader.",
)
parser.add_argument(
"--learning_rate",
type=float,
default=1e-3,
help="Initial learning rate (after the potential warmup period) to use.",
)
parser.add_argument("--num_train_epochs", type=int, default=3, help="Total number of training epochs to perform.")
parser.add_argument(
"--num_warmup_steps", type=int, default=0, help="Number of steps for the warmup in the lr scheduler."
)
parser.add_argument("--output_dir", type=str, default=None, help="Where to store the final model.")
parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.")
parser.add_argument(
"--peft_type",
type=str,
default="p_tuning",
help="The PEFT type to use.",
choices=["p_tuning", "prefix_tuning", "prompt_tuning"],
)
args = parser.parse_args()
assert args.output_dir is not None, "Need an `output_dir` to store the fine-tuned model and verify it."
return args
def main():
args = parse_args()
ddp_scaler = DistributedDataParallelKwargs(find_unused_parameters=True)
accelerator = Accelerator(kwargs_handlers=[ddp_scaler])
task = "mrpc"
# If passed along, set the training seed now.
if args.seed is not None:
set_seed(args.seed)
if args.peft_type == "p_tuning":
peft_config = PromptEncoderConfig(
task_type="SEQ_CLS",
num_virtual_tokens=args.num_virtual_tokens,
encoder_hidden_size=args.encoder_hidden_size,
)
elif args.peft_type == "prefix_tuning":
peft_config = PrefixTuningConfig(
task_type="SEQ_CLS",
num_virtual_tokens=args.num_virtual_tokens,
encoder_hidden_size=args.encoder_hidden_size,
)
else:
peft_config = PromptTuningConfig(task_type="SEQ_CLS", num_virtual_tokens=args.num_virtual_tokens)
tokenizer_kwargs = {}
if any(k in args.model_name_or_path for k in ("gpt", "opt", "bloom")):
tokenizer_kwargs["padding_side"] = "left"
else:
tokenizer_kwargs["padding_side"] = "right"
tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path, **tokenizer_kwargs)
if getattr(tokenizer, "pad_token_id") is None:
tokenizer.pad_token_id = tokenizer.eos_token_id
datasets = load_dataset("glue", task)
metric = evaluate.load("glue", task)
def tokenize_function(examples):
# max_length=None => use the model max length (it's actually the default)
outputs = tokenizer(examples["sentence1"], examples["sentence2"], truncation=True, max_length=None)
return outputs
def collate_fn(examples):
return tokenizer.pad(examples, padding="longest", return_tensors="pt")
with accelerator.main_process_first():
tokenized_datasets = datasets.map(
tokenize_function,
batched=True,
remove_columns=["idx", "sentence1", "sentence2"],
)
# We also rename the 'label' column to 'labels' which is the expected name for labels by the models of the
# transformers library
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
# Instantiate dataloaders.
train_dataloader = DataLoader(
tokenized_datasets["train"], shuffle=True, collate_fn=collate_fn, batch_size=args.per_device_train_batch_size
)
eval_dataloader = DataLoader(
tokenized_datasets["validation"],
shuffle=False,
collate_fn=collate_fn,
batch_size=args.per_device_eval_batch_size,
)
model = AutoModelForSequenceClassification.from_pretrained(args.model_name_or_path)
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()
if getattr(accelerator.state, "fsdp_plugin", None) is not None:
accelerator.state.fsdp_plugin.auto_wrap_policy = fsdp_auto_wrap_policy(model)
model = accelerator.prepare(model)
optimizer = AdamW(params=model.parameters(), lr=args.learning_rate)
# Instantiate scheduler
lr_scheduler = get_linear_schedule_with_warmup(
optimizer=optimizer,
num_warmup_steps=args.num_warmup_steps,
num_training_steps=(len(train_dataloader) * args.num_train_epochs),
)
if getattr(accelerator.state, "fsdp_plugin", None) is not None:
train_dataloader, eval_dataloader, optimizer, lr_scheduler = accelerator.prepare(
train_dataloader, eval_dataloader, optimizer, lr_scheduler
)
else:
model, train_dataloader, eval_dataloader, optimizer, lr_scheduler = accelerator.prepare(
model, train_dataloader, eval_dataloader, optimizer, lr_scheduler
)
for epoch in range(args.num_train_epochs):
model.train()
for step, batch in enumerate(tqdm(train_dataloader)):
outputs = model(**batch)
loss = outputs.loss
accelerator.backward(loss)
optimizer.step()
lr_scheduler.step()
optimizer.zero_grad()
model.eval()
samples_seen = 0
for step, batch in enumerate(tqdm(eval_dataloader)):
with torch.no_grad():
outputs = model(**batch)
predictions = outputs.logits.argmax(dim=-1)
predictions, references = accelerator.gather((predictions, batch["labels"]))
# If we are in a multiprocess environment, the last batch has duplicates
if accelerator.num_processes > 1:
if step == len(eval_dataloader) - 1:
predictions = predictions[: len(eval_dataloader.dataset) - samples_seen]
references = references[: len(eval_dataloader.dataset) - samples_seen]
else:
samples_seen += references.shape[0]
metric.add_batch(
predictions=predictions,
references=references,
)
eval_metric = metric.compute()
accelerator.print(f"epoch {epoch}:", eval_metric)
accelerator.wait_for_everyone()
unwrapped_model = accelerator.unwrap_model(model)
unwrapped_model.save_pretrained(args.output_dir, state_dict=accelerator.get_state_dict(model))
if accelerator.is_main_process:
tokenizer.save_pretrained(args.output_dir)
if __name__ == "__main__":
main()
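For reference, the PEFT-specific part of the script boils down to the following minimal sketch (the base checkpoint is an illustrative assumption; the hyper-parameters are the script's defaults):

```python
from transformers import AutoModelForSequenceClassification
from peft import PromptEncoderConfig, get_peft_model

# Illustrative base checkpoint; any sequence-classification model works.
base_model = AutoModelForSequenceClassification.from_pretrained("roberta-large")

peft_config = PromptEncoderConfig(
    task_type="SEQ_CLS",      # sequence classification, as in the script
    num_virtual_tokens=20,    # script default for --num_virtual_tokens
    encoder_hidden_size=128,  # script default for --encoder_hidden_size
)
model = get_peft_model(base_model, peft_config)
model.print_trainable_parameters()  # only the prompt encoder (plus the classifier head) is trainable
```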

File diff suppressed because it is too large

View File

@ -1,7 +1,5 @@
transformers
accelerate
loralib
evaluate
deepspeed
tqdm
datasets

File diff suppressed because one or more lines are too long

View File

@ -1,8 +1,6 @@
transformers
accelerate
loralib
evaluate
deepspeed
tqdm
datasets
Pillow

View File

@ -1,3 +1,43 @@
[tool.black]
line-length = 119
target-version = ['py36']
[tool.ruff]
ignore = ["C901", "E501", "E741", "W605"]
select = ["C", "E", "F", "I", "W"]
line-length = 119
[tool.ruff.isort]
lines-after-imports = 2
known-first-party = ["peft"]
[isort]
default_section = "FIRSTPARTY"
known_first_party = "peft"
known_third_party = [
"numpy",
"torch",
"accelerate",
"transformers",
]
line_length = 119
lines_after_imports = 2
multi_line_output = 3
include_trailing_comma = true
force_grid_wrap = 0
use_parentheses = true
ensure_newline_before_comments = true
[tool.pytest]
doctest_optionflags = [
"NORMALIZE_WHITESPACE",
"ELLIPSIS",
"NUMBER",
]
[tool.pytest.ini_options]
addopts = "--cov=src/peft --cov-report=term-missing"
markers = [
"single_gpu_tests: tests that run on a single GPU",
"multi_gpu_tests: tests that run on multiple GPUs",
]

110
scripts/log_reports.py Normal file
View File

@ -0,0 +1,110 @@
import json, os
from pathlib import Path
from datetime import date
from tabulate import tabulate
failed = []
passed = []
group_info = []
total_num_failed = 0
empty_file = len(list(Path().glob("*.log"))) == 0
for log in Path().glob("*.log"):
section_num_failed = 0
with open(log, "r") as f:
nb_lines = sum(1 for _ in f)
f.seek(0)  # the line count above consumed the file; rewind before iterating
for line in f:
line = json.loads(line)
if line.get("nodeid", "") != "":
test = line["nodeid"]
if line.get("duration", None) is not None:
duration = f'{line["duration"]:.4f}'
if line.get("outcome", "") == "failed":
section_num_failed += 1
failed.append([test, duration, log.name.split('_')[0]])
total_num_failed += 1
else:
passed.append([test, duration, log.name.split('_')[0]])
if nb_lines == 0:
empty_file = True
group_info.append([str(log), section_num_failed, failed])
os.remove(log)
failed = []
no_error_payload = {
"type": "section",
"text": {
"type": "plain_text",
"text": "🌞 There were no failures!" if not empty_file else "Something went wrong - please check GH action results.",
"emoji": True
}
}
message = ""
payload = [
{
"type": "header",
"text": {
"type": "plain_text",
"text": "🤗 Results of the {} PEFT scheduled tests.".format(os.environ.get("TEST_TYPE", "")),
}
},
]
if total_num_failed > 0:
for name, num_failed, failed_tests in group_info:
if num_failed > 0:
if num_failed == 1:
message += f"*{name}: {num_failed} failed test*\n"
else:
message += f"*{name}: {num_failed} failed tests*\n"
failed_table = []
for test in failed_tests:
failed_table.append(test[0].split("::"))
failed_table = tabulate(failed_table, headers=["Test Location", "Test Case", "Test Name"], showindex="always", tablefmt="grid", maxcolwidths=[12, 12, 12])
message += '\n```\n' + failed_table + '\n```'
print(f'### {message}')
else:
payload.append(no_error_payload)
if os.environ.get("TEST_TYPE", "") != "":
from slack_sdk import WebClient
if len(message) != 0:
md_report = {
"type": "section",
"text": {
"type": "mrkdwn",
"text": message
},
}
payload.append(md_report)
action_button = {
"type": "section",
"text": {
"type": "mrkdwn",
"text": "*For more details:*"
},
"accessory": {
"type": "button",
"text": {"type": "plain_text", "text": "Check Action results", "emoji": True},
"url": f"https://github.com/huggingface/peft/actions/runs/{os.environ['GITHUB_RUN_ID']}",
},
}
payload.append(action_button)
date_report = {
"type": "context",
"elements": [
{
"type": "plain_text",
"text": f"Nightly {os.environ.get('TEST_TYPE')} test results for {date.today()}",
},
],
}
payload.append(date_report)
print(payload)
client = WebClient(token=os.environ.get("SLACK_API_TOKEN"))
client.chat_postMessage(channel="#peft-ci-daily", text=message, blocks=payload)

63
scripts/stale.py Normal file
View File

@ -0,0 +1,63 @@
# Copyright 2023 The HuggingFace Team, the AllenNLP library authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Script to close stale issues. Taken in part from the AllenNLP repository.
https://github.com/allenai/allennlp.
"""
from datetime import datetime as dt
import os
from github import Github
LABELS_TO_EXEMPT = [
"good first issue",
"good second issue",
"good difficult issue",
"feature request",
"new model",
"wip",
"PRs welcome to address this",
]
def main():
g = Github(os.environ["GITHUB_TOKEN"])
repo = g.get_repo("huggingface/peft")
open_issues = repo.get_issues(state="open")
for issue in open_issues:
comments = sorted([comment for comment in issue.get_comments()], key=lambda i: i.created_at, reverse=True)
last_comment = comments[0] if len(comments) > 0 else None
if (
last_comment is not None and last_comment.user.login == "github-actions[bot]"
and (dt.utcnow() - issue.updated_at).days > 7
and (dt.utcnow() - issue.created_at).days >= 30
and not any(label.name.lower() in LABELS_TO_EXEMPT for label in issue.get_labels())
):
issue.edit(state="closed")
elif (
(dt.utcnow() - issue.updated_at).days > 23
and (dt.utcnow() - issue.created_at).days >= 30
and not any(label.name.lower() in LABELS_TO_EXEMPT for label in issue.get_labels())
):
issue.create_comment(
"This issue has been automatically marked as stale because it has not had "
"recent activity. If you think this still needs to be addressed "
"please comment on this thread.\n\n"
)
if __name__ == "__main__":
main()

View File

@ -1,23 +0,0 @@
[isort]
default_section = FIRSTPARTY
ensure_newline_before_comments = True
force_grid_wrap = 0
include_trailing_comma = True
known_first_party = pet
known_third_party =
numpy
torch
accelerate
transformers
line_length = 119
lines_after_imports = 2
multi_line_output = 3
use_parentheses = True
[flake8]
ignore = E203, E722, E501, E741, W503, W605
max-line-length = 119
[tool:pytest]
doctest_optionflags=NUMBER NORMALIZE_WHITESPACE ELLIPSIS

View File

@ -1,4 +1,4 @@
# Copyright 2021 The HuggingFace Team. All rights reserved.
# Copyright 2023 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@ -12,29 +12,31 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from setuptools import setup
from setuptools import find_packages
from setuptools import find_packages, setup
extras = {}
extras["quality"] = ["black ~= 22.0", "isort >= 5.5.4", "flake8 >= 3.8.3"]
extras["quality"] = ["black ~= 22.0", "ruff>=0.0.241", "urllib3<=2.0.0"]
extras["docs_specific"] = ["hf-doc-builder"]
extras["dev"] = extras["quality"] + extras["docs_specific"]
extras["test"] = extras["dev"] + ["pytest", "pytest-cov", "pytest-xdist", "parameterized", "datasets", "diffusers"]
setup(
name="peft",
version="0.0.2",
version="0.5.0.dev0",
description="Parameter-Efficient Fine-Tuning (PEFT)",
license_files=["LICENSE"],
long_description=open("README.md", "r", encoding="utf-8").read(),
long_description_content_type="text/markdown",
keywords="deep learning",
license="Apache",
author="The HuggingFace team",
author_email="sourab@huggingface.co",
url="https://github.com/huggingface/pets",
url="https://github.com/huggingface/peft",
package_dir={"": "src"},
packages=find_packages("src"),
package_data={"peft": ["py.typed"]},
entry_points={},
python_requires=">=3.7.0",
python_requires=">=3.8.0",
install_requires=[
"numpy>=1.17",
"packaging>=20.0",
@ -42,8 +44,9 @@ setup(
"pyyaml",
"torch>=1.13.0",
"transformers",
"tqdm",
"accelerate",
"loralib",
"safetensors",
],
extras_require=extras,
classifiers=[
@ -54,7 +57,7 @@ setup(
"License :: OSI Approved :: Apache Software License",
"Operating System :: OS Independent",
"Programming Language :: Python :: 3",
"Programming Language :: Python :: 3.7",
"Programming Language :: Python :: 3.8",
"Topic :: Scientific/Engineering :: Artificial Intelligence",
],
)
@ -71,9 +74,7 @@ setup(
# twine upload dist/* -r pypitest
# twine upload dist/* -r pypitest --repository-url=https://test.pypi.org/legacy/
# 6. Check that you can install it in a virtualenv by running:
# pip install -i https://testpypi.python.org/pypi accelerate
# accelerate env
# accelerate test
# pip install -i https://testpypi.python.org/pypi peft
# 7. Upload the final version to actual pypi:
# twine upload dist/* -r pypi
# 8. Add release notes to the tag in github once everything is looking hunky-dory.

View File

@ -17,19 +17,42 @@
# See the License for the specific language governing permissions and
# limitations under the License.
__version__ = "0.0.2"
__version__ = "0.5.0.dev0"
from .mapping import MODEL_TYPE_TO_PEFT_MODEL_MAPPING, PEFT_TYPE_TO_CONFIG_MAPPING, get_peft_config, get_peft_model
from .auto import (
AutoPeftModel,
AutoPeftModelForCausalLM,
AutoPeftModelForSequenceClassification,
AutoPeftModelForSeq2SeqLM,
AutoPeftModelForTokenClassification,
AutoPeftModelForQuestionAnswering,
AutoPeftModelForFeatureExtraction,
)
from .mapping import (
MODEL_TYPE_TO_PEFT_MODEL_MAPPING,
PEFT_TYPE_TO_CONFIG_MAPPING,
get_peft_config,
get_peft_model,
inject_adapter_in_model,
)
from .peft_model import (
PeftModel,
PeftModelForCausalLM,
PeftModelForSeq2SeqLM,
PeftModelForSequenceClassification,
PeftModelForTokenClassification,
PeftModelForQuestionAnswering,
PeftModelForFeatureExtraction,
)
from .tuners import (
AdaptionPromptConfig,
AdaptionPromptModel,
LoraConfig,
LoraModel,
IA3Config,
IA3Model,
AdaLoraConfig,
AdaLoraModel,
PrefixEncoder,
PrefixTuningConfig,
PromptEmbedding,
@ -40,13 +63,15 @@ from .tuners import (
PromptTuningInit,
)
from .utils import (
PeftConfig,
TRANSFORMERS_MODELS_TO_PREFIX_TUNING_POSTPROCESS_MAPPING,
PeftType,
PromptLearningConfig,
TaskType,
bloom_model_postprocess_past_key_value,
get_peft_model_state_dict,
peft_model_load_and_dispatch,
prepare_model_for_int8_training,
prepare_model_for_kbit_training,
set_peft_model_state_dict,
shift_tokens_right,
load_peft_weights,
)
from .config import PeftConfig, PromptLearningConfig
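A minimal sketch of the newly exported `inject_adapter_in_model` (the toy module below is an assumption made up for the example):

```python
import torch
from peft import LoraConfig, inject_adapter_in_model

class ToyModel(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.linear = torch.nn.Linear(10, 10)
        self.lm_head = torch.nn.Linear(10, 10)

    def forward(self, x):
        return self.lm_head(self.linear(x))

lora_config = LoraConfig(r=8, lora_alpha=16, target_modules=["linear"])
# Adapters are injected in-place; the model keeps its original class
# instead of being wrapped in a PeftModel.
model = inject_adapter_in_model(lora_config, ToyModel())
print([name for name, _ in model.named_parameters() if "lora" in name])
```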

145
src/peft/auto.py Normal file
View File

@ -0,0 +1,145 @@
# coding=utf-8
# Copyright 2023-present the HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import annotations
import importlib
from typing import Optional
from transformers import (
AutoModel,
AutoModelForCausalLM,
AutoModelForQuestionAnswering,
AutoModelForSeq2SeqLM,
AutoModelForSequenceClassification,
AutoModelForTokenClassification,
)
from .config import PeftConfig
from .mapping import MODEL_TYPE_TO_PEFT_MODEL_MAPPING
from .peft_model import (
PeftModel,
PeftModelForCausalLM,
PeftModelForFeatureExtraction,
PeftModelForQuestionAnswering,
PeftModelForSeq2SeqLM,
PeftModelForSequenceClassification,
PeftModelForTokenClassification,
)
class _BaseAutoPeftModel:
_target_class = None
_target_peft_class = None
def __init__(self, *args, **kwargs):
# For consistency with transformers: https://github.com/huggingface/transformers/blob/91d7df58b6537d385e90578dac40204cb550f706/src/transformers/models/auto/auto_factory.py#L400
raise EnvironmentError(
f"{self.__class__.__name__} is designed to be instantiated "
f"using the `{self.__class__.__name__}.from_pretrained(pretrained_model_name_or_path)` or "
f"`{self.__class__.__name__}.from_config(config)` methods."
)
@classmethod
def from_pretrained(
cls,
pretrained_model_name_or_path,
adapter_name: str = "default",
is_trainable: bool = False,
config: Optional[PeftConfig] = None,
**kwargs,
):
r"""
A wrapper around all the preprocessing steps a user needs to perform in order to load a PEFT model. The kwargs
are passed along to `PeftConfig` that automatically takes care of filtering the kwargs of the Hub methods and
the config object init.
"""
peft_config = PeftConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
base_model_path = peft_config.base_model_name_or_path
task_type = getattr(peft_config, "task_type", None)
if cls._target_class is not None:
target_class = cls._target_class
elif cls._target_class is None and task_type is not None:
# this is only in the case where we use `AutoPeftModel`
raise ValueError(
"Cannot use `AutoPeftModel` with a task type, please use a specific class for your task type. (e.g. `AutoPeftModelForCausalLM` for `task_type='CAUSAL_LM'`)"
)
if task_type is not None:
expected_target_class = MODEL_TYPE_TO_PEFT_MODEL_MAPPING[task_type]
if cls._target_peft_class.__name__ != expected_target_class.__name__:
raise ValueError(
f"Expected target PEFT class: {expected_target_class.__name__}, but you have asked for: {cls._target_peft_class.__name__ }"
" make sure that you are loading the correct model for your task type."
)
elif task_type is None and getattr(peft_config, "auto_mapping", None) is not None:
auto_mapping = getattr(peft_config, "auto_mapping", None)
base_model_class = auto_mapping["base_model_class"]
parent_library_name = auto_mapping["parent_library"]
parent_library = importlib.import_module(parent_library_name)
target_class = getattr(parent_library, base_model_class)
else:
raise ValueError(
"Cannot infer the auto class from the config, please make sure that you are loading the correct model for your task type."
)
base_model = target_class.from_pretrained(base_model_path, **kwargs)
return cls._target_peft_class.from_pretrained(
base_model,
pretrained_model_name_or_path,
adapter_name=adapter_name,
is_trainable=is_trainable,
config=config,
**kwargs,
)
class AutoPeftModel(_BaseAutoPeftModel):
_target_class = None
_target_peft_class = PeftModel
class AutoPeftModelForCausalLM(_BaseAutoPeftModel):
_target_class = AutoModelForCausalLM
_target_peft_class = PeftModelForCausalLM
class AutoPeftModelForSeq2SeqLM(_BaseAutoPeftModel):
_target_class = AutoModelForSeq2SeqLM
_target_peft_class = PeftModelForSeq2SeqLM
class AutoPeftModelForSequenceClassification(_BaseAutoPeftModel):
_target_class = AutoModelForSequenceClassification
_target_peft_class = PeftModelForSequenceClassification
class AutoPeftModelForTokenClassification(_BaseAutoPeftModel):
_target_class = AutoModelForTokenClassification
_target_peft_class = PeftModelForTokenClassification
class AutoPeftModelForQuestionAnswering(_BaseAutoPeftModel):
_target_class = AutoModelForQuestionAnswering
_target_peft_class = PeftModelForQuestionAnswering
class AutoPeftModelForFeatureExtraction(_BaseAutoPeftModel):
_target_class = AutoModel
_target_peft_class = PeftModelForFeatureExtraction
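A minimal usage sketch for these auto classes (the repository id below is an illustrative placeholder for a saved PEFT adapter):

```python
from peft import AutoPeftModelForCausalLM

# Reads the adapter's PeftConfig, loads the matching base model from
# `base_model_name_or_path`, and wraps it in PeftModelForCausalLM.
model = AutoPeftModelForCausalLM.from_pretrained("your-username/opt-350m-lora")
```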

251
src/peft/config.py Normal file
View File

@ -0,0 +1,251 @@
# coding=utf-8
# Copyright 2023-present the HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import inspect
import json
import os
from dataclasses import asdict, dataclass, field
from typing import Optional, Union
from huggingface_hub import hf_hub_download
from transformers.utils import PushToHubMixin
from .utils import CONFIG_NAME, PeftType, TaskType
@dataclass
class PeftConfigMixin(PushToHubMixin):
r"""
This is the base configuration class for PEFT adapter models. It contains all the methods that are common to all
PEFT adapter models. This class inherits from [`~transformers.utils.PushToHubMixin`] which contains the methods to
push your model to the Hub. The method `save_pretrained` will save the configuration of your adapter model in a
directory. The method `from_pretrained` will load the configuration of your adapter model from a directory.
Args:
peft_type (Union[[`~peft.utils.config.PeftType`], `str`]): The type of Peft method to use.
"""
peft_type: Optional[PeftType] = field(default=None, metadata={"help": "The type of PEFT model."})
auto_mapping: Optional[dict] = field(
default=None, metadata={"help": "An auto mapping dict to help retrieve the base model class if needed."}
)
def to_dict(self):
return asdict(self)
def save_pretrained(self, save_directory, **kwargs):
r"""
This method saves the configuration of your adapter model in a directory.
Args:
save_directory (`str`):
The directory where the configuration will be saved.
kwargs (additional keyword arguments, *optional*):
Additional keyword arguments passed along to the [`~transformers.utils.PushToHubMixin.push_to_hub`]
method.
"""
if os.path.isfile(save_directory):
raise AssertionError(f"Provided path ({save_directory}) should be a directory, not a file")
os.makedirs(save_directory, exist_ok=True)
auto_mapping_dict = kwargs.pop("auto_mapping_dict", None)
output_dict = asdict(self)
output_path = os.path.join(save_directory, CONFIG_NAME)
# Add auto mapping details for custom models.
if auto_mapping_dict is not None:
output_dict["auto_mapping"] = auto_mapping_dict
# save it
with open(output_path, "w") as writer:
writer.write(json.dumps(output_dict, indent=2, sort_keys=True))
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path, subfolder=None, **kwargs):
r"""
This method loads the configuration of your adapter model from a directory.
Args:
pretrained_model_name_or_path (`str`):
The directory or the Hub repository id where the configuration is saved.
kwargs (additional keyword arguments, *optional*):
Additional keyword arguments passed along to the child class initialization.
"""
# Avoid circular dependency .. TODO: fix this with a larger refactor
from peft.mapping import PEFT_TYPE_TO_CONFIG_MAPPING
path = (
os.path.join(pretrained_model_name_or_path, subfolder)
if subfolder is not None
else pretrained_model_name_or_path
)
hf_hub_download_kwargs, class_kwargs, _ = cls._split_kwargs(kwargs)
if os.path.isfile(os.path.join(path, CONFIG_NAME)):
config_file = os.path.join(path, CONFIG_NAME)
else:
try:
config_file = hf_hub_download(
pretrained_model_name_or_path, CONFIG_NAME, subfolder=subfolder, **hf_hub_download_kwargs
)
except Exception:
raise ValueError(f"Can't find '{CONFIG_NAME}' at '{pretrained_model_name_or_path}'")
loaded_attributes = cls.from_json_file(config_file)
# TODO: this hack is needed to fix the following issue (on commit 702f937):
# if someone saves a default config and loads it back with `PeftConfig` class it yields to
# not loading the correct config class.
# from peft import AdaLoraConfig, PeftConfig
# peft_config = AdaLoraConfig()
# print(peft_config)
# >>> AdaLoraConfig(peft_type=<PeftType.ADALORA: 'ADALORA'>, auto_mapping=None, base_model_name_or_path=None,
# revision=None, task_type=None, inference_mode=False, r=8, target_modules=None, lora_alpha=8, lora_dropout=0.0, ...
#
# peft_config.save_pretrained("./test_config")
# peft_config = PeftConfig.from_pretrained("./test_config")
# print(peft_config)
# >>> PeftConfig(peft_type='ADALORA', auto_mapping=None, base_model_name_or_path=None, revision=None, task_type=None, inference_mode=False)
if "peft_type" in loaded_attributes:
peft_type = loaded_attributes["peft_type"]
config_cls = PEFT_TYPE_TO_CONFIG_MAPPING[peft_type]
else:
config_cls = cls
config = config_cls(**class_kwargs)
for key, value in loaded_attributes.items():
if hasattr(config, key):
setattr(config, key, value)
return config
@classmethod
def from_json_file(cls, path_json_file, **kwargs):
r"""
Loads a configuration file from a json file.
Args:
path_json_file (`str`):
The path to the json file.
"""
with open(path_json_file, "r") as file:
json_object = json.load(file)
return json_object
@classmethod
def _split_kwargs(cls, kwargs):
hf_hub_download_kwargs = {}
class_kwargs = {}
other_kwargs = {}
for key, value in kwargs.items():
if key in inspect.signature(hf_hub_download).parameters:
hf_hub_download_kwargs[key] = value
elif key in list(cls.__annotations__):
class_kwargs[key] = value
else:
other_kwargs[key] = value
return hf_hub_download_kwargs, class_kwargs, other_kwargs
@classmethod
def _get_peft_type(
cls,
model_id,
**hf_hub_download_kwargs,
):
subfolder = hf_hub_download_kwargs.get("subfolder", None)
path = os.path.join(model_id, subfolder) if subfolder is not None else model_id
if os.path.isfile(os.path.join(path, CONFIG_NAME)):
config_file = os.path.join(path, CONFIG_NAME)
else:
try:
config_file = hf_hub_download(
model_id,
CONFIG_NAME,
**hf_hub_download_kwargs,
)
except Exception:
raise ValueError(f"Can't find '{CONFIG_NAME}' at '{model_id}'")
loaded_attributes = cls.from_json_file(config_file)
return loaded_attributes["peft_type"]
@property
def is_prompt_learning(self):
r"""
Utility method to check if the configuration is for prompt learning.
"""
return False
@property
def is_adaption_prompt(self) -> bool:
"""Return True if this is an adaption prompt config."""
return False
@dataclass
class PeftConfig(PeftConfigMixin):
"""
This is the base configuration class to store the configuration of a [`PeftModel`].
Args:
peft_type (Union[[`~peft.utils.config.PeftType`], `str`]): The type of Peft method to use.
task_type (Union[[`~peft.utils.config.TaskType`], `str`]): The type of task to perform.
inference_mode (`bool`, defaults to `False`): Whether to use the Peft model in inference mode.
"""
base_model_name_or_path: str = field(default=None, metadata={"help": "The name of the base model to use."})
revision: str = field(default=None, metadata={"help": "The specific model version to use."})
peft_type: Union[str, PeftType] = field(default=None, metadata={"help": "Peft type"})
task_type: Union[str, TaskType] = field(default=None, metadata={"help": "Task type"})
inference_mode: bool = field(default=False, metadata={"help": "Whether to use inference mode"})
@dataclass
class PromptLearningConfig(PeftConfig):
"""
This is the base configuration class to store the configuration of [`PrefixTuning`], [`PromptEncoder`], or
[`PromptTuning`].
Args:
num_virtual_tokens (`int`): The number of virtual tokens to use.
token_dim (`int`): The hidden embedding dimension of the base transformer model.
num_transformer_submodules (`int`): The number of transformer submodules in the base transformer model.
num_attention_heads (`int`): The number of attention heads in the base transformer model.
num_layers (`int`): The number of layers in the base transformer model.
"""
num_virtual_tokens: int = field(default=None, metadata={"help": "Number of virtual tokens"})
token_dim: int = field(
default=None, metadata={"help": "The hidden embedding dimension of the base transformer model"}
)
num_transformer_submodules: Optional[int] = field(
default=None, metadata={"help": "Number of transformer submodules"}
)
num_attention_heads: Optional[int] = field(default=None, metadata={"help": "Number of attention heads"})
num_layers: Optional[int] = field(default=None, metadata={"help": "Number of transformer layers"})
@property
def is_prompt_learning(self):
r"""
Utility method to check if the configuration is for prompt learning.
"""
return True
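A minimal sketch of the round trip that the `peft_type` dispatch in `from_pretrained` enables (the directory name is an illustrative assumption):

```python
from peft import AdaLoraConfig, PeftConfig

cfg = AdaLoraConfig()
cfg.save_pretrained("./test_config")

# Because `from_pretrained` dispatches on the stored `peft_type`, loading
# through the generic base class returns the concrete subclass.
reloaded = PeftConfig.from_pretrained("./test_config")
print(type(reloaded).__name__)  # AdaLoraConfig
```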

28
src/peft/import_utils.py Normal file
View File

@ -0,0 +1,28 @@
# coding=utf-8
# Copyright 2023-present the HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import importlib
def is_bnb_available():
return importlib.util.find_spec("bitsandbytes") is not None
def is_bnb_4bit_available():
if not is_bnb_available():
return False
import bitsandbytes as bnb
return hasattr(bnb.nn, "Linear4bit")

Some files were not shown because too many files have changed in this diff.