mirror of https://github.com/huggingface/transformers.git
synced 2025-11-04 20:14:36 +08:00

Compare commits

160 Commits

default-fa ... hqq_serial
| SHA1 | Author | Date | |
|---|---|---|---|
| a8704d266e | |||
| bc9cb55d8d | |||
| f2ea032e40 | |||
| 75dfe0a9c6 | |||
| 51ab25e293 | |||
| e3d8285a84 | |||
| ca59d6f77c | |||
| b4727a1216 | |||
| db8c7caeb6 | |||
| 2229ebe722 | |||
| 05c1f9af9a | |||
| 9e28284032 | |||
| 48ed24c50a | |||
| e234061cdd | |||
| 9451a38526 | |||
| 453e74884f | |||
| 14ee2326e5 | |||
| 53f0c9c290 | |||
| 92abe60334 | |||
| b46bd8b9d2 | |||
| ef177a5e1c | |||
| 5f1fcc299c | |||
| b75ad56620 | |||
| 7f552e28e0 | |||
| a3264332cf | |||
| 6e2d04e429 | |||
| 026a173a64 | |||
| 516af4bb63 | |||
| 62c60a3018 | |||
| 1627108033 | |||
| bd54ed2ed7 | |||
| e68ec18ce2 | |||
| 2fbbcf5007 | |||
| 084b5094eb | |||
| 20528f067c | |||
| 934fe1504e | |||
| 3e8106d253 | |||
| f0bc49e7f6 | |||
| a24a9a66f4 | |||
| 811a9caa21 | |||
| 7f5d644e69 | |||
| 3fbaaaa64d | |||
| 7ffe25f2b9 | |||
| 49928892d6 | |||
| 6494479f1d | |||
| 535fe78b9f | |||
| a2ad9d5ad5 | |||
| 5019aabfac | |||
| f2122cc6eb | |||
| f739687684 | |||
| 44f6fdd74f | |||
| 8da9068730 | |||
| 81233c069c | |||
| 27c7f971c0 | |||
| 5f841c74b6 | |||
| f9756d9edb | |||
| b8e5cd5396 | |||
| 1c7ebf1d6e | |||
| c46edfb823 | |||
| fad15fba78 | |||
| 4ab33c2d81 | |||
| 9d6c0641c4 | |||
| 3a83ec48a6 | |||
| 6ed0bf1e85 | |||
| df6eee9201 | |||
| de2318894e | |||
| 9b9a54e61b | |||
| 1ecedf1d9e | |||
| f53a5dec7b | |||
| 5658e749ad | |||
| 85a1269e19 | |||
| edd68f4ed8 | |||
| 1c122a46dc | |||
| af0e4b7b37 | |||
| 1392a6867f | |||
| 8d2534c4d0 | |||
| e0182f3bd7 | |||
| 165116bc14 | |||
| 5f4ee98a7a | |||
| 8678879f1d | |||
| 01be5b4879 | |||
| c85510f958 | |||
| bc2adb0112 | |||
| 23f6a43f82 | |||
| d5a99dfcee | |||
| ff0d708fe6 | |||
| d2c687b3f1 | |||
| 9cf4f2aa9a | |||
| 7d92009af6 | |||
| 63700628ad | |||
| a009fbdab3 | |||
| 3263b34354 | |||
| 034b477847 | |||
| bab32d6fe9 | |||
| 9ced33ca7f | |||
| a5b226ce98 | |||
| a1844a3209 | |||
| 2e113422b3 | |||
| 5a4a76edb7 | |||
| 1535a2c93d | |||
| 34b43211d7 | |||
| 7405c1c77e | |||
| 605f3245dc | |||
| 2782aadae2 | |||
| f83c6f1d02 | |||
| 3aefb4ec7f | |||
| 251a2409c6 | |||
| 96a074fa7e | |||
| bd9dca3b85 | |||
| 817a676bd7 | |||
| 74d0eb3fed | |||
| 7987710696 | |||
| 12b6880c81 | |||
| d1ec36b94f | |||
| 7ba028fccb | |||
| 5a649ff3ec | |||
| f2a1e3ca68 | |||
| 0fcfc5ccc9 | |||
| c38c55f4fb | |||
| aa8f86a421 | |||
| b381880597 | |||
| 0fdea8607d | |||
| fe008d6ebe | |||
| 62aa270f2a | |||
| 89575b567e | |||
| 46835ec6ae | |||
| 4bd8f12972 | |||
| 566b0f1fbf | |||
| e316c5214f | |||
| 22f888b3fa | |||
| cd48553fc8 | |||
| 56a7745704 | |||
| b873234cb6 | |||
| 271fd8e60d | |||
| 8f0d26c55e | |||
| c75969ee28 | |||
| 4c040aba02 | |||
| fa8a9f55c0 | |||
| ff40f1a9e1 | |||
| c50e0551fd | |||
| c25dde1fc9 | |||
| 673d30b826 | |||
| 765732e92c | |||
| 1c37e8c1a6 | |||
| b31d595040 | |||
| cb23d1b20b | |||
| bc36c26fa6 | |||
| 63be8e6f39 | |||
| 72fb02c47d | |||
| 691586b0dc | |||
| 24cfcc2114 | |||
| 4037a2b5b1 | |||
| 6f40a213eb | |||
| e391706420 | |||
| c22efa6196 | |||
| 88e0813d8d | |||
| 036d3de23d | |||
| 89eec5cf20 | |||
| 999981daf4 | |||
| 693cb828ff | |||
@@ -142,6 +142,7 @@ jobs:
            - run: python utils/custom_init_isort.py --check_only
            - run: python utils/sort_auto_mappings.py --check_only
            - run: python utils/check_doc_toc.py
            - run: python utils/check_docstrings.py --check_all

    check_repository_consistency:
        working_directory: ~/transformers
@@ -190,4 +191,4 @@ workflows:
            - check_circleci_user
            - check_code_quality
            - check_repository_consistency
            - fetch_all_tests
            - fetch_all_tests

.github/ISSUE_TEMPLATE/bug-report.yml | 17 (vendored)
@@ -1,6 +1,17 @@
name: "\U0001F41B Bug Report"
description: Submit a bug report to help us improve transformers
labels: [ "bug" ]
body:
  - type: markdown
    attributes:
      value: |
        Thanks for taking the time to fill out this bug report! 🤗

        Before you submit your bug report:

          - If it is your first time submitting, be sure to check our [bug report guidelines](https://github.com/huggingface/transformers/blob/main/CONTRIBUTING.md#did-you-find-a-bug)
          - Try our [docs bot](https://huggingface.co/spaces/huggingchat/hf-docs-chat) -- it might be able to help you with your issue

  - type: textarea
    id: system-info
    attributes:
@@ -25,7 +36,7 @@ body:

        Models:

          - text models: @ArthurZucker
          - vision models: @amyeroberts
          - speech models: @sanchit-gandhi
          - graph models: @clefourrier
@@ -38,9 +49,9 @@ body:
          - tensorflow: @gante and @Rocketknight1
          - tokenizers: @ArthurZucker
          - trainer: @muellerzr @SunMarc

        Integrations:

          - deepspeed: HF Trainer/Accelerate: @muellerzr
          - ray/raytune: @richardliaw, @amogkam
          - Big Model Inference: @SunMarc

.github/PULL_REQUEST_TEMPLATE.md | 4 (vendored)
@@ -58,9 +58,9 @@ Integrations:
- deepspeed: HF Trainer/Accelerate: @muellerzr
- ray/raytune: @richardliaw, @amogkam
- Big Model Inference: @SunMarc
- quantization (bitsandbytes, autogpt): @SunMarc

Documentation: @stevhliu and @MKhalusova
Documentation: @stevhliu

HF projects:

.github/workflows/self-pr-slow-ci.yml | 2 (vendored)
@@ -4,7 +4,7 @@ on:
  pull_request:
    paths:
      - "src/transformers/models/*/modeling_*.py"
      - "tests/models/*/test_*.py"
      - "tests/**/test_*.py"

concurrency:
  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}

.github/workflows/trufflehog.yml | 23 (vendored)
@@ -10,20 +10,9 @@ jobs:
  trufflehog:
    runs-on: ubuntu-latest
    steps:
    - shell: bash
      run: |
        if [ "${{ github.event_name }}" == "push" ]; then
          echo "depth=$(($(jq length <<< '${{ toJson(github.event.commits) }}') + 2))" >> $GITHUB_ENV
          echo "branch=${{ github.ref_name }}" >> $GITHUB_ENV
        fi
        if [ "${{ github.event_name }}" == "pull_request" ]; then
          echo "depth=$((${{ github.event.pull_request.commits }}+2))" >> $GITHUB_ENV
          echo "branch=${{ github.event.pull_request.head.ref }}" >> $GITHUB_ENV
        fi
    - name: Checkout code
      uses: actions/checkout@v4
      with:
        ref: ${{env.branch}}
        fetch-depth: ${{env.depth}}
    - name: Secret Scanning
      uses: trufflesecurity/trufflehog@main
      - name: Checkout code
        uses: actions/checkout@v4
        with:
          fetch-depth: 0
      - name: Secret Scanning
        uses: trufflesecurity/trufflehog@main
@@ -61,7 +61,10 @@ feedback.
The 🤗 Transformers library is robust and reliable thanks to users who report the problems they encounter.

Before you report an issue, we would really appreciate it if you could **make sure the bug was not
already reported** (use the search bar on GitHub under Issues). Your issue should also be related to bugs in the library itself, and not your code. If you're unsure whether the bug is in your code or the library, please ask in the [forum](https://discuss.huggingface.co/) first. This helps us respond quicker to fixing issues related to the library versus general questions.
already reported** (use the search bar on GitHub under Issues). Your issue should also be related to bugs in the library itself, and not your code. If you're unsure whether the bug is in your code or the library, please ask in the [forum](https://discuss.huggingface.co/) or on our [discord](https://discord.com/invite/hugging-face-879548962464493619) first. This helps us respond quicker to fixing issues related to the library versus general questions.

> [!TIP]
> We have a [docs bot](https://huggingface.co/spaces/huggingchat/hf-docs-chat), and we highly encourage you to ask all your questions there. There is always a chance your bug can be fixed with a simple flag 👾🔫

Once you've confirmed the bug hasn't already been reported, please include the following information in your issue so we can quickly resolve it:
@@ -129,7 +132,7 @@ You will need basic `git` proficiency to contribute to
manual. Type `git --help` in a shell and enjoy! If you prefer books, [Pro
Git](https://git-scm.com/book/en/v2) is a very good reference.

You'll need **[Python 3.8](https://github.com/huggingface/transformers/blob/main/setup.py#L426)** or above to contribute to 🤗 Transformers. Follow the steps below to start contributing:
You'll need **[Python 3.8](https://github.com/huggingface/transformers/blob/main/setup.py#L449)** or above to contribute to 🤗 Transformers. Follow the steps below to start contributing:

1. Fork the [repository](https://github.com/huggingface/transformers) by
   clicking on the **[Fork](https://github.com/huggingface/transformers/fork)** button on the repository's page. This creates a copy of the code
@@ -160,7 +163,7 @@ You'll need **[Python 3.8](https://github.com/huggingface/transformers/blob/main
   If 🤗 Transformers was already installed in the virtual environment, remove
   it with `pip uninstall transformers` before reinstalling it in editable
   mode with the `-e` flag.

   Depending on your OS, and since the number of optional dependencies of Transformers is growing, you might get a
   failure with this command. If that's the case make sure to install the Deep Learning framework you are working with
   (PyTorch, TensorFlow and/or Flax) then do:
@@ -219,7 +222,7 @@ You'll need **[Python 3.8](https://github.com/huggingface/transformers/blob/main

   If you're modifying documents under the `docs/source` directory, make sure the documentation can still be built. This check will also run in the CI when you open a pull request. To run a local check
   make sure you install the documentation builder:

   ```bash
   pip install ".[docs]"
   ```
@@ -338,12 +341,12 @@ RUN_SLOW=yes python -m pytest -n auto --dist=loadfile -s -v ./tests/models/my_ne
RUN_SLOW=yes python -m pytest -n auto --dist=loadfile -s -v ./examples/pytorch/text-classification
```

Like the slow tests, there are other environment variables available which not enabled by default during testing:
Like the slow tests, there are other environment variables available which are not enabled by default during testing:
- `RUN_CUSTOM_TOKENIZERS`: Enables tests for custom tokenizers.
- `RUN_PT_FLAX_CROSS_TESTS`: Enables tests for PyTorch + Flax integration.
- `RUN_PT_TF_CROSS_TESTS`: Enables tests for TensorFlow + PyTorch integration.

More environment variables and additional information can be found in the [testing_utils.py](src/transformers/testing_utils.py).
More environment variables and additional information can be found in the [testing_utils.py](https://github.com/huggingface/transformers/blob/main/src/transformers/testing_utils.py).

🤗 Transformers uses `pytest` as a test runner only. It doesn't use any
`pytest`-specific features in the test suite itself.

Makefile | 1
@@ -56,6 +56,7 @@ quality:
	python utils/custom_init_isort.py --check_only
	python utils/sort_auto_mappings.py --check_only
	python utils/check_doc_toc.py
	python utils/check_docstrings.py --check_all

# Format source code automatically and check is there are any problems left that need manual fixing
@@ -8,7 +8,7 @@ RUN pip install uv && uv venv && uv pip install --no-cache-dir -U pip setuptools
RUN uv pip install --no-cache-dir --upgrade 'torch' --index-url https://download.pytorch.org/whl/cpu
# tensorflow pin matching setup.py
RUN uv pip install --no-cache-dir "tensorflow-cpu<2.16" "tf-keras<2.16"
RUN uv pip install --no-cache-dir "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[flax,quality,vision,testing]"
RUN uv pip install --no-cache-dir "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[flax,quality,speech,vision,testing]"
RUN git lfs install

RUN pip uninstall -y transformers
@@ -9,7 +9,7 @@ SHELL ["sh", "-lc"]
# The following `ARG` are mainly used to specify the versions explicitly & directly in this docker file, and not meant
# to be used as arguments for docker build (so far).

ARG PYTORCH='2.3.0'
ARG PYTORCH='2.4.0'
# (not always a valid torch version)
ARG INTEL_TORCH_EXT='2.3.0'
# Example: `cu102`, `cu113`, etc.
@@ -11,7 +11,7 @@ ARG REF=main
RUN git clone https://github.com/huggingface/transformers && cd transformers && git checkout $REF

# If set to nothing, will install the latest version
ARG PYTORCH='2.3.0'
ARG PYTORCH='2.4.0'
ARG TORCH_VISION=''
ARG TORCH_AUDIO=''
# Example: `cu102`, `cu113`, etc.
@@ -92,6 +92,8 @@
      title: Visual Question Answering
    - local: tasks/text-to-speech
      title: Text to speech
    - local: tasks/image_text_to_text
      title: Image-text-to-text
    title: Multimodal
  - isExpanded: false
    sections:
@@ -155,6 +157,8 @@
    title: EETQ
  - local: quantization/hqq
    title: HQQ
  - local: quantization/fbgemm_fp8
    title: FBGEMM_FP8
  - local: quantization/optimum
    title: Optimum
  - local: quantization/contribute
@@ -758,6 +762,8 @@
        title: BridgeTower
      - local: model_doc/bros
        title: BROS
      - local: model_doc/chameleon
        title: Chameleon
      - local: model_doc/chinese_clip
        title: Chinese-CLIP
      - local: model_doc/clip
@@ -509,3 +509,54 @@ agent = ReactCodeAgent(tools=[search_tool])

agent.run("How many more blocks (also denoted as layers) in BERT base encoder than the encoder from the architecture proposed in Attention is All You Need?")
```

## Gradio interface

You can leverage `gradio.Chatbot` to display your agent's thoughts using `stream_to_gradio`; here is an example:

```py
import gradio as gr
from transformers import (
    load_tool,
    ReactCodeAgent,
    HfEngine,
    stream_to_gradio,
)

# Import tool from Hub
image_generation_tool = load_tool("m-ric/text-to-image")

llm_engine = HfEngine("meta-llama/Meta-Llama-3-70B-Instruct")

# Initialize the agent with the image generation tool
agent = ReactCodeAgent(tools=[image_generation_tool], llm_engine=llm_engine)


def interact_with_agent(task):
    messages = []
    messages.append(gr.ChatMessage(role="user", content=task))
    yield messages
    for msg in stream_to_gradio(agent, task):
        messages.append(msg)
        yield messages + [
            gr.ChatMessage(role="assistant", content="⏳ Task not finished yet!")
        ]
    yield messages


with gr.Blocks() as demo:
    text_input = gr.Textbox(lines=1, label="Chat Message", value="Make me a picture of the Statue of Liberty.")
    submit = gr.Button("Run illustrator agent!")
    chatbot = gr.Chatbot(
        label="Agent",
        type="messages",
        avatar_images=(
            None,
            "https://em-content.zobj.net/source/twitter/53/robot-face_1f916.png",
        ),
    )
    submit.click(interact_with_agent, [text_input], [chatbot])

if __name__ == "__main__":
    demo.launch()
```
@@ -580,7 +580,7 @@ default template for that model class is used instead. Let's take a look at the
>>> from transformers import AutoTokenizer
>>> tokenizer = AutoTokenizer.from_pretrained("facebook/blenderbot-400M-distill")

>>> tokenizer.default_chat_template
>>> tokenizer.chat_template
"{% for message in messages %}{% if message['role'] == 'user' %}{{ ' ' }}{% endif %}{{ message['content'] }}{% if not loop.last %}{{ '  ' }}{% endif %}{% endfor %}{{ eos_token }}"
```
@@ -704,23 +704,6 @@ with other names, pass the name of the template you want to the `chat_template`
We find that this can be a bit confusing for users, though - so if you're writing a template yourself, we recommend
trying to put it all in a single template where possible!

### What are "default" templates?

Before the introduction of chat templates, chat handling was hardcoded at the model class level. For backwards
compatibility, we have retained this class-specific handling as default templates, also set at the class level. If a
model does not have a chat template set, but there is a default template for its model class, the `TextGenerationPipeline`
class and methods like `apply_chat_template` will use the class template instead. You can find out what the default
template for your tokenizer is by checking the `tokenizer.default_chat_template` attribute.

This is something we do purely for backward compatibility reasons, to avoid breaking any existing workflows. Even when
the class template is appropriate for your model, we strongly recommend overriding the default template by
setting the `chat_template` attribute explicitly to make it clear to users that your model has been correctly configured
for chat.

Now that actual chat templates have been adopted more widely, default templates have been deprecated and will be
removed in a future release. We strongly recommend setting the `chat_template` attribute for any tokenizers that
still depend on them!
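A minimal sketch of the recommendation above (not part of the diff), assuming the Blenderbot tokenizer from the earlier hunk and an illustrative Jinja string; adapt the template to how the model was actually trained:

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("facebook/blenderbot-400M-distill")

# Set an explicit template instead of relying on the deprecated class-level default.
tokenizer.chat_template = (
    "{% for message in messages %}"
    "{{ message['content'] }}{% if not loop.last %}{{ '  ' }}{% endif %}"
    "{% endfor %}{{ eos_token }}"
)

chat = [{"role": "user", "content": "Hello, how are you?"}]
print(tokenizer.apply_chat_template(chat, tokenize=False))
# tokenizer.save_pretrained(...) or tokenizer.push_to_hub(...) persists the template with the tokenizer.
```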

### What template should I use?

When setting the template for a model that's already been trained for chat, you should ensure that the template
@@ -195,7 +195,7 @@ inputs = {key: tensor.to(model.device) for key, tensor in inputs.items()}
print("Tokenized inputs:\n", inputs)

# 4: Generate text from the model
outputs = model.generate(**inputs, max_new_tokens=512, temperature=0.)
outputs = model.generate(**inputs, max_new_tokens=512, temperature=0.1)
print("Generated tokens:\n", outputs)

# 5: Decode the output back to a string
@@ -211,6 +211,80 @@ I like rock music because it's loud and energetic. It's a great way to express m
I like rock music because it's loud and energetic. I like to listen to it when I'm feeling
```

## KV Cache Offloading

Similarly to KV cache quantization, this strategy aims to reduce GPU VRAM usage.
It does so by moving the KV cache for most layers to the CPU.
As the model's `forward()` method iterates over the layers, this strategy maintains the current layer cache on the GPU.
At the same time it asynchronously prefetches the next layer cache and sends the previous layer cache back to the CPU.
Unlike KV cache quantization, this strategy always produces the same result as the default KV cache implementation.
Thus, it can serve as a drop-in replacement or a fallback for it.

Depending on your model and the characteristics of your generation task (size of context, number of generated tokens, number of beams, etc.)
you may notice a small degradation in generation throughput compared to the default KV cache implementation.

To enable KV cache offloading, pass `cache_implementation="offloaded"` in the `generation_config`.

```python
>>> import torch
>>> from transformers import AutoTokenizer, AutoModelForCausalLM
>>> ckpt = "microsoft/Phi-3-mini-4k-instruct"

>>> tokenizer = AutoTokenizer.from_pretrained(ckpt)
>>> model = AutoModelForCausalLM.from_pretrained(ckpt, torch_dtype=torch.float16).to("cuda:0")
>>> inputs = tokenizer("Fun fact: The shortest", return_tensors="pt").to(model.device)

>>> out = model.generate(**inputs, do_sample=False, max_new_tokens=23, cache_implementation="offloaded")
>>> print(tokenizer.batch_decode(out, skip_special_tokens=True)[0])
Fun fact: The shortest war in history was between Britain and Zanzibar on August 27, 1896.

>>> out = model.generate(**inputs, do_sample=False, max_new_tokens=23)
>>> print(tokenizer.batch_decode(out, skip_special_tokens=True)[0])
Fun fact: The shortest war in history was between Britain and Zanzibar on August 27, 1896.
```

<Tip warning={true}>

Cache offloading requires a GPU and can be slower than the default KV cache. Use it if you are getting CUDA out of memory errors.

</Tip>

The example below shows how KV cache offloading can be used as a fallback strategy.
```python
>>> import torch
>>> from transformers import AutoTokenizer, AutoModelForCausalLM
>>> def resilient_generate(model, *args, **kwargs):
...     oom = False
...     try:
...         return model.generate(*args, **kwargs)
...     except torch.cuda.OutOfMemoryError as e:
...         print(e)
...         print("retrying with cache_implementation='offloaded'")
...         oom = True
...     if oom:
...         torch.cuda.empty_cache()
...         kwargs["cache_implementation"] = "offloaded"
...         return model.generate(*args, **kwargs)
...
...
>>> ckpt = "microsoft/Phi-3-mini-4k-instruct"
>>> tokenizer = AutoTokenizer.from_pretrained(ckpt)
>>> model = AutoModelForCausalLM.from_pretrained(ckpt, torch_dtype=torch.float16).to("cuda:0")
>>> prompt = ["okay "*1000 + "Fun fact: The most"]
>>> inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
>>> beams = { "num_beams": 40, "num_beam_groups": 40, "num_return_sequences": 40, "diversity_penalty": 1.0, "max_new_tokens": 23, "early_stopping": True, }
>>> out = resilient_generate(model, **inputs, **beams)
>>> responses = tokenizer.batch_decode(out[:,-28:], skip_special_tokens=True)
```

On a GPU with 50 GB of RAM, running this code will print
```
CUDA out of memory. Tried to allocate 4.83 GiB. GPU
retrying with cache_implementation='offloaded'
```
before successfully generating 40 beams.


## Watermarking

The `generate()` method supports watermarking the generated text by randomly marking a portion of tokens as "green".
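A minimal sketch of enabling the watermarking mentioned above (not part of the diff), reusing the Phi-3 checkpoint from the earlier snippets; the `WatermarkingConfig` values are illustrative assumptions:

```python
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, WatermarkingConfig

ckpt = "microsoft/Phi-3-mini-4k-instruct"
tokenizer = AutoTokenizer.from_pretrained(ckpt)
model = AutoModelForCausalLM.from_pretrained(ckpt, torch_dtype=torch.float16).to("cuda:0")
inputs = tokenizer("Fun fact: The shortest", return_tensors="pt").to(model.device)

# Bias sampling towards a pseudo-random "green" token list so the output can later be detected as watermarked
watermarking_config = WatermarkingConfig(bias=2.5, seeding_scheme="selfhash")
out = model.generate(**inputs, do_sample=True, max_new_tokens=23, watermarking_config=watermarking_config)
print(tokenizer.batch_decode(out, skip_special_tokens=True)[0])
```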
@@ -88,6 +88,7 @@ Flax), PyTorch, and/or TensorFlow.
|                          [ByT5](model_doc/byt5)                          |       ✅        |         ✅         |      ✅      |
|                     [CamemBERT](model_doc/camembert)                     |       ✅        |         ✅         |      ❌      |
|                        [CANINE](model_doc/canine)                        |       ✅        |         ❌         |      ❌      |
|                     [Chameleon](model_doc/chameleon)                     |       ✅        |         ❌         |      ❌      |
|                  [Chinese-CLIP](model_doc/chinese_clip)                  |       ✅        |         ❌         |      ❌      |
|                          [CLAP](model_doc/clap)                          |       ✅        |         ❌         |      ❌      |
|                          [CLIP](model_doc/clip)                          |       ✅        |         ✅         |      ✅      |
@@ -18,59 +18,109 @@ Basic inference is slow because LLMs have to be called repeatedly to generate th
This guide will show you how to use the optimization techniques available in Transformers to accelerate LLM inference.

> [!TIP]
> Hugging Face also provides [Text Generation Inference (TGI)](https://hf.co/docs/text-generation-inference), a library dedicated to deploying and serving highly optimized LLMs for inference. It includes more optimization features not included in Transformers, such as continuous batching for increasing throughput and tensor parallelism for multi-GPU inference.
> Hugging Face also provides [Text Generation Inference (TGI)](https://hf.co/docs/text-generation-inference), a library dedicated to deploying and serving highly optimized LLMs for inference. It includes deployment-oriented optimization features not included in Transformers, such as continuous batching for increasing throughput and tensor parallelism for multi-GPU inference.

## Static kv-cache and torch.compile
## Static kv-cache and `torch.compile`

During decoding, a LLM computes the key-value (kv) values for each input token and since it is autoregressive, it computes the same kv values each time because the generated output becomes part of the input now. This is not very efficient because you're recomputing the same kv values each time.

To optimize this, you can use a kv-cache to store the past keys and values instead of recomputing them each time. However, since the kv-cache grows with each generation step and is dynamic, it prevents you from taking advantage of [torch.compile](./perf_torch_compile), a powerful optimization tool that fuses PyTorch code into fast and optimized kernels.
To optimize this, you can use a kv-cache to store the past keys and values instead of recomputing them each time. However, since the kv-cache grows with each generation step and is dynamic, it prevents you from taking advantage of [`torch.compile`](./perf_torch_compile), a powerful optimization tool that fuses PyTorch code into fast and optimized kernels.

The *static kv-cache* solves this issue by pre-allocating the kv-cache size to a maximum value which allows you to combine it with torch.compile for up to a 4x speed up.
The *static kv-cache* solves this issue by pre-allocating the kv-cache size to a maximum value which allows you to combine it with `torch.compile` for up to a 4x speed up. Your speed up may vary depending on the model size (larger models have a smaller speed up) and hardware.

> [!WARNING]
> Currently, only [Llama](./model_doc/llama2) and a few other models support static kv-cache and torch.compile. Check [this issue](https://github.com/huggingface/transformers/issues/28981) for a live model compatibility list.
> Currently, only [Llama](./model_doc/llama2) and a few other models support static kv-cache and `torch.compile`. Check [this issue](https://github.com/huggingface/transformers/issues/28981) for a live model compatibility list.

For this example, let's load the [Gemma](https://hf.co/google/gemma-2b) model.
There are three flavors of static kv-cache usage, depending on the complexity of your task:
1. Basic usage: simply set a flag in `generation_config` (recommended);
2. Advanced usage: handle a cache object for multi-turn generation or a custom generation loop;
3. Advanced usage: compile the entire `generate` function into a single graph, if having a single graph is relevant for you.

Select the correct tab below for further instructions on each of these flavors.

> [!TIP]
> Regardless of the strategy used with `torch.compile`, you can avoid shape-related recompilations if you left-pad your LLM inputs to a limited set of values. The [`pad_to_multiple_of` tokenizer flag](https://huggingface.co/docs/transformers/main_classes/tokenizer#transformers.PreTrainedTokenizer.__call__.pad_to_multiple_of) is your friend!
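A minimal sketch of that tip (not part of the diff), assuming the Gemma tokenizer used later in this section; `pad_to_multiple_of` keeps the padded input length within a small set of values so `torch.compile` does not recompile for every new prompt length:

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("google/gemma-2b", padding_side="left")
inputs = tokenizer(
    "The theory of special relativity states ",
    padding=True,
    pad_to_multiple_of=64,  # sequence length is left-padded up to the next multiple of 64
    return_tensors="pt",
)
print(inputs["input_ids"].shape)  # e.g. torch.Size([1, 64])
```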
<hfoptions id="static-kv">
<hfoption id="basic usage: generation_config">

For this example, let's use the [Gemma](https://hf.co/google/gemma-2b) model. All we need to do is to:
1. Access the model's `generation_config` attribute and set the `cache_implementation` to "static";
2. Call `torch.compile` on the model to compile the forward pass with the static kv-cache.

And that's it!

```py
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"  # To prevent long warnings :)

tokenizer = AutoTokenizer.from_pretrained("google/gemma-2b")
model = AutoModelForCausalLM.from_pretrained(
    "google/gemma-2b", device_map="auto"
)
```
model = AutoModelForCausalLM.from_pretrained("google/gemma-2b", device_map="auto")

There are two ways you can configure the model to use a static kv-cache. For a 7B model on an A100, both methods get a 4x speed up in the forward pass. Your speed up may vary depending on the model size (larger models have a smaller speed up) and hardware. If you're using the [`~GenerationMixin.generate`] method, the speed up is ~3x. The forward pass (which still gets 4x speed up) is only a part of the whole [`~GenerationMixin.generate`] code.

<hfoptions id="static-kv">
<hfoption id="generation_config">

Access the model's `generation_config` attribute and set the `cache_implementation` to "static".

```py
model.generation_config.cache_implementation = "static"
```

Call torch.compile on the model to compile the forward pass with the static kv-cache.

```py
compiled_model = torch.compile(model, mode="reduce-overhead", fullgraph=True)
model.forward = torch.compile(model.forward, mode="reduce-overhead", fullgraph=True)
input_text = "The theory of special relativity states "
input_ids = tokenizer(input_text, return_tensors="pt").to("cuda")

outputs = compiled_model.generate(**input_ids)
tokenizer.batch_decode(outputs, skip_special_tokens=True)
outputs = model.generate(**input_ids)
print(tokenizer.batch_decode(outputs, skip_special_tokens=True))
['The theory of special relativity states 1. The speed of light is constant in all inertial reference']
```

Under the hood, `generate` will attempt to reuse the same cache object, removing the need for re-compilation at each call. However, if the batch size or the maximum output length increase between calls, the cache will have to be reinitialized, triggering a new compilation.
Under the hood, `generate` will attempt to reuse the same cache object, removing the need for re-compilation at each call. Avoiding re-compilation is critical to get the most out of `torch.compile`, and you should be aware of the following:
1. If the batch size changes or the maximum output length increases between calls, the cache will have to be reinitialized, triggering a new compilation;
2. The first couple of calls of the compiled function are slower, as the function is being compiled.

> [!WARNING]
> For a more advanced usage of the static cache, such as multi-turn conversations, we recommend instantiating and manipulating the cache object outside [`~GenerationMixin.generate`]. See the advanced usage tab.

</hfoption>
<hfoption id="Static Cache">
<hfoption id="advanced usage: control Static Cache">

A [`StaticCache`] object can be passed to the model's forward pass under the `past_key_values` argument, enabling the use of this object as a static kv-cache. Using this strategy, you can write your own function to decode the next token given the current token and position and cache position of previously generated tokens. You can also pass the [`StaticCache`] object to [`~GenerationMixin.generate`] and use it across calls, like you would do with a dynamic cache.
A [`StaticCache`] object can be passed to the model's [`~GenerationMixin.generate`] under the `past_key_values` argument. The object will retain the cache contents, so you can pass it to a new [`~GenerationMixin.generate`] call to continue generation, like you would do with a dynamic cache.

```py
from transformers import AutoTokenizer, AutoModelForCausalLM, StaticCache
import torch
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"  # To prevent long warnings :)

tokenizer = AutoTokenizer.from_pretrained("google/gemma-2b")
model = AutoModelForCausalLM.from_pretrained("google/gemma-2b", device_map="auto")

model.forward = torch.compile(model.forward, mode="reduce-overhead", fullgraph=True)
input_text = "The theory of special relativity states "
input_ids = tokenizer(input_text, return_tensors="pt").to("cuda")
prompt_length = input_ids.input_ids.shape[1]
model.generation_config.max_new_tokens = 16

past_key_values = StaticCache(
    config=model.config,
    max_batch_size=1,
    # If you plan to reuse the cache, make sure the cache length is large enough for all cases
    max_cache_len=prompt_length+(model.generation_config.max_new_tokens*2),
    device=model.device,
    dtype=model.dtype
)
outputs = model.generate(**input_ids, past_key_values=past_key_values)
print(tokenizer.batch_decode(outputs, skip_special_tokens=True))
['The theory of special relativity states 1. The speed of light is constant in all inertial reference frames. 2']

# pass in the generated text and the same cache object to continue generation from where it left off. Optionally, in a
# multi-turn conversation, append the new user input to the generated text.
new_input_ids = outputs
outputs = model.generate(new_input_ids, past_key_values=past_key_values)
print(tokenizer.batch_decode(outputs, skip_special_tokens=True))
['The theory of special relativity states 1. The speed of light is constant in all inertial reference frames. 2. The speed of light is constant in all inertial reference frames. 3.']
```

> [!TIP]
> If you want to reuse the same [`StaticCache`] object on a new prompt, be sure to reset its contents with the `.reset()` method between calls

If you want to go further down a level, the [`StaticCache`] object can also be passed to the model's forward pass under the same `past_key_values` argument. Using this strategy, you can write your own function to decode the next token given the current token and position and cache position of previously generated tokens.

```py
from transformers import LlamaTokenizer, LlamaForCausalLM, StaticCache, logging
@@ -102,12 +152,9 @@ def decode_one_tokens(model, cur_token, input_pos, cache_position, past_key_valu
    return new_token
```

There are a few important things you must do to enable static kv-cache and torch.compile with the `StaticCache` method:

There are a few important things you must do to enable static kv-cache and `torch.compile` with the `StaticCache` method:
1. Initialize the [`StaticCache`] instance before using the model for inference. There you can configure parameters like the maximum batch size and sequence length.

2. Call torch.compile on the model to compile the forward pass with the static kv-cache.

2. Call `torch.compile` on the model to compile the forward pass with the static kv-cache.
3. Set `enable_math=True` in the [torch.backends.cuda.sdp_kernel](https://pytorch.org/docs/master/generated/torch.nn.functional.scaled_dot_product_attention.html) context manager to enable the native PyTorch C++ implementation of scaled dot product attention to speed up inference even more.

```py
@@ -142,8 +189,34 @@ text
 'My favorite all time favorite condiment is ketchup. I love it on everything. I love it on my eggs, my fries, my chicken, my burgers, my hot dogs, my sandwiches, my salads, my p']
```

> [!TIP]
> If you want to reuse the [`StaticCache`] object on a new prompt, be sure to reset its contents with the `.reset()` method
</hfoption>
<hfoption id="advanced usage: end-to-end generate compilation">

Compiling the entire `generate` function, in terms of code, is even simpler than in the basic usage: call `torch.compile` on `generate` to compile the entire function. No need to specify the use of the static cache: although it is compatible, dynamic cache (default) was faster in our benchmarks.

```py
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"  # To prevent long warnings :)

tokenizer = AutoTokenizer.from_pretrained("google/gemma-2b")
model = AutoModelForCausalLM.from_pretrained("google/gemma-2b", device_map="auto")

model.generate = torch.compile(model.generate, mode="reduce-overhead", fullgraph=True)
input_text = "The theory of special relativity states "
input_ids = tokenizer(input_text, return_tensors="pt").to("cuda")

outputs = model.generate(**input_ids)
print(tokenizer.batch_decode(outputs, skip_special_tokens=True))
['The theory of special relativity states 1. The speed of light is constant in all inertial reference']
```

As a result, we compile not only the model forward pass, but also all input preparation, logit processor operations, and so on. The result should be a slightly faster `generate` call, compared to the basic usage example, and the compiled graph may be better suited to more exotic hardware devices or use cases. However, there are severe drawbacks in using this approach:
1. Compilation is much slower;
2. All parameterization of `generate` must be done through `generation_config`;
3. Many warnings and exceptions are suppressed -- we suggest testing with its uncompiled form first;
4. Although we are working on it, it is heavily feature restricted (for instance, at the time of writing, generation does not stop if an EOS token is selected).

</hfoption>
</hfoptions>
@@ -72,6 +72,10 @@ We provide two types of agents, based on the main [`Agent`] class:

[[autodoc]] launch_gradio_demo

### stream_to_gradio

[[autodoc]] stream_to_gradio

### ToolCollection

[[autodoc]] ToolCollection
@@ -25,11 +25,11 @@ A backbone is a model used for feature extraction for higher level computer visi

Backbones are supported for the following models:

* [BEiT](..model_doc/beit)
* [BEiT](../model_doc/beit)
* [BiT](../model_doc/bit)
* [ConvNet](../model_doc/convnext)
* [ConvNext](../model_doc/convnext)
* [ConvNextV2](../model_doc/convnextv2)
* [DiNAT](..model_doc/dinat)
* [DiNAT](../model_doc/dinat)
* [DINOV2](../model_doc/dinov2)
* [FocalNet](../model_doc/focalnet)
* [MaskFormer](../model_doc/maskformer)
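A minimal sketch (not part of the diff) of loading one of the listed architectures as a feature-extraction backbone; the DINOv2 checkpoint name is an illustrative assumption:

```python
import torch
from transformers import AutoBackbone

# Load a supported architecture as a backbone; by default it returns the last stage's feature map.
backbone = AutoBackbone.from_pretrained("facebook/dinov2-base")

pixel_values = torch.rand(1, 3, 224, 224)  # dummy image batch
outputs = backbone(pixel_values)
print([feature_map.shape for feature_map in outputs.feature_maps])
```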
@@ -66,3 +66,8 @@ Examples of use can be found in the [example scripts](../examples) or [example n
    - numpy_mask_tokens
    - tf_mask_tokens
    - torch_mask_tokens

## DataCollatorWithFlattening

[[autodoc]] data.data_collator.DataCollatorWithFlattening
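A minimal sketch (not part of the diff) of what the newly documented collator does, with hypothetical toy features: it concatenates the examples of a batch into one padding-free sequence and emits `position_ids` that restart at each example boundary:

```python
from transformers import DataCollatorWithFlattening

collator = DataCollatorWithFlattening()
features = [
    {"input_ids": [1, 2, 3], "labels": [1, 2, 3]},
    {"input_ids": [4, 5], "labels": [4, 5]},
]
batch = collator(features)
print(batch["input_ids"])     # one flattened sequence, no padding tokens
print(batch["position_ids"])  # positions restart from 0 at each example boundary
```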
@@ -56,3 +56,8 @@ Learn how to quantize models in the [Quantization](../quantization) guide.
## HqqConfig

[[autodoc]] HqqConfig

## FbgemmFp8Config

[[autodoc]] FbgemmFp8Config
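A minimal sketch (not part of the diff) of passing the newly documented config to `from_pretrained`; it assumes the `fbgemm-gpu` package, a compatible GPU, and an illustrative checkpoint name:

```python
from transformers import AutoModelForCausalLM, FbgemmFp8Config

quantization_config = FbgemmFp8Config()  # FP8 weights via FBGEMM kernels
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Meta-Llama-3-8B",  # illustrative checkpoint
    device_map="auto",
    quantization_config=quantization_config,
)
```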
docs/source/en/model_doc/chameleon.md | 192 (new file)
@@ -0,0 +1,192 @@
<!--Copyright 2024 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.

⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.

-->

# Chameleon

## Overview

The Chameleon model was proposed in [Chameleon: Mixed-Modal Early-Fusion Foundation Models](https://arxiv.org/abs/2405.09818v1) by the META AI Chameleon Team. Chameleon is a vision-language model that uses vector quantization to tokenize images, which enables the model to generate multimodal output. The model takes images and text as input, including an interleaved format, and generates a textual response. The image generation module is not released yet.

The abstract from the paper is the following:

*We present Chameleon, a family of early-fusion token-based mixed-modal models capable of understanding and generating images and text in any arbitrary sequence. We outline a stable training
approach from inception, an alignment recipe, and an architectural parameterization tailored for the
early-fusion, token-based, mixed-modal setting. The models are evaluated on a comprehensive range
of tasks, including visual question answering, image captioning, text generation, image generation, and
long-form mixed modal generation. Chameleon demonstrates broad and general capabilities, including
state-of-the-art performance in image captioning tasks, outperforms Llama-2 in text-only tasks while
being competitive with models such as Mixtral 8x7B and Gemini-Pro, and performs non-trivial image
generation, all in a single model. It also matches or exceeds the performance of much larger models,
including Gemini Pro and GPT-4V, according to human judgments on a new long-form mixed-modal
generation evaluation, where either the prompt or outputs contain mixed sequences of both images and
text. Chameleon marks a significant step forward in unified modeling of full multimodal documents*

<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/model_doc/chameleon_arch.png"
alt="drawing" width="600"/>

<small> Chameleon incorporates a vector quantizer module to transform images into discrete tokens. That also enables image generation using an auto-regressive transformer. Taken from the <a href="https://arxiv.org/abs/2405.09818v1">original paper.</a> </small>

This model was contributed by [joaogante](https://huggingface.co/joaogante) and [RaushanTurganbay](https://huggingface.co/RaushanTurganbay).
The original code can be found [here](https://github.com/facebookresearch/chameleon).

## Usage tips

- We advise users to use `padding_side="left"` when computing batched generation as it leads to more accurate results. Simply make sure to set `processor.tokenizer.padding_side = "left"` before generating (see the short sketch after these tips).

- Note that Chameleon was tuned for safety alignment. If the model is refusing to answer, consider asking a more concrete question instead of an open question.

- Chameleon generates in chat format, which means that the generated text will always be the "assistant's turn". You can enable text completion generation by passing `return_for_text_completion=True` when calling the processor.

> [!NOTE]
> The Chameleon implementation in Transformers uses a special image token to indicate where to merge image embeddings. For the special image token we didn't add a new one but used one of the reserved tokens: `<reserved08707>`. You have to add `<image>` to your prompt in the place where the image should be embedded for correct generation.
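A minimal sketch of the left-padding tip (not part of the new file), assuming the `facebook/chameleon-7b` processor used in the examples below:

```python
from transformers import ChameleonProcessor

processor = ChameleonProcessor.from_pretrained("facebook/chameleon-7b")
processor.tokenizer.padding_side = "left"  # left-pad batched prompts for more accurate generation
```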
## Usage example
 | 
			
		||||
 | 
			
		||||
### Single image inference
 | 
			
		||||
 | 
			
		||||
Chameleon is a gated model so make sure to have access and login to Hugging Face Hub using a token. 
 | 
			
		||||
Here's how to load the model and perform inference in half-precision (`torch.bfloat16`):
 | 
			
		||||
 | 
			
		||||
```python
 | 
			
		||||
from transformers import ChameleonProcessor, ChameleonForConditionalGeneration
 | 
			
		||||
import torch
 | 
			
		||||
from PIL import Image
 | 
			
		||||
import requests
 | 
			
		||||
 | 
			
		||||
processor = ChameleonProcessor.from_pretrained("facebook/chameleon-7b")
 | 
			
		||||
model = ChameleonForConditionalGeneration.from_pretrained("facebook/chameleon-7b", torch_dtype=torch.bfloat16, device_map="cuda")
 | 
			
		||||
 | 
			
		||||
# prepare image and text prompt
 | 
			
		||||
url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
 | 
			
		||||
image = Image.open(requests.get(url, stream=True).raw)
 | 
			
		||||
prompt = "What do you see in this image?<image>"
 | 
			
		||||
 | 
			
		||||
inputs = processor(prompt, image, return_tensors="pt").to(model.device)
 | 
			
		||||
 | 
			
		||||
# autoregressively complete prompt
 | 
			
		||||
output = model.generate(**inputs, max_new_tokens=50)
 | 
			
		||||
print(processor.decode(output[0], skip_special_tokens=True))
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
### Multi image inference
 | 
			
		||||
 | 
			
		||||
Chameleon can perform inference with multiple images as input, where images either belong to the same prompt or different prompts (in batched inference). Here is how you can do it:
 | 
			
		||||
 | 
			
		||||
```python
 | 
			
		||||
from transformers import ChameleonProcessor, ChameleonForConditionalGeneration
 | 
			
		||||
import torch
 | 
			
		||||
from PIL import Image
 | 
			
		||||
import requests
 | 
			
		||||
 | 
			
		||||
processor = ChameleonProcessor.from_pretrained("facebook/chameleon-7b")
 | 
			
		||||
 | 
			
		||||
model = ChameleonForConditionalGeneration.from_pretrained("facebook/chameleon-7b", torch_dtype=torch.bfloat16, device_map="cuda")
 | 
			
		||||
 | 
			
		||||
# Get three different images
 | 
			
		||||
url = "https://www.ilankelman.org/stopsigns/australia.jpg"
 | 
			
		||||
image_stop = Image.open(requests.get(url, stream=True).raw)
 | 
			
		||||
 | 
			
		||||
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
 | 
			
		||||
image_cats = Image.open(requests.get(url, stream=True).raw)
 | 
			
		||||
 | 
			
		||||
url = "https://huggingface.co/microsoft/kosmos-2-patch14-224/resolve/main/snowman.jpg"
 | 
			
		||||
image_snowman = Image.open(requests.get(url, stream=True).raw)
 | 
			
		||||
 | 
			
		||||
# Prepare a batched prompt, where the first one is a multi-image prompt and the second is not
 | 
			
		||||
prompts = [
 | 
			
		||||
    "What do these images have in common?<image><image>",
 | 
			
		||||
    "<image>What is shown in this image?"
 | 
			
		||||
]
 | 
			
		||||
 | 
			
		||||
# We can simply feed images in the order they have to be used in the text prompt
 | 
			
		||||
# Each "<image>" token uses one image leaving the next for the subsequent "<image>" tokens
 | 
			
		||||
inputs = processor(text=prompts, images=[image_stop, image_cats, image_snowman], padding=True, return_tensors="pt").to(device="cuda", dtype=torch.bfloat16)
 | 
			
		||||
 | 
			
		||||
# Generate
 | 
			
		||||
generate_ids = model.generate(**inputs, max_new_tokens=50)
 | 
			
		||||
processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
## Model optimization
 | 
			
		||||
 | 
			
		||||
### Quantization using Bitsandbytes
 | 
			
		||||
 | 
			
		||||
The model can be loaded in 8-bit or 4-bit precision, greatly reducing the memory requirements while maintaining the performance of the original model. First make sure to install bitsandbytes (`pip install bitsandbytes`) and that you have access to a CUDA-compatible GPU device. Then simply change the snippet above as follows:
 | 
			
		||||
 | 
			
		||||
```python
 | 
			
		||||
import torch
from transformers import ChameleonForConditionalGeneration, BitsAndBytesConfig
 | 
			
		||||
 | 
			
		||||
# specify how to quantize the model
 | 
			
		||||
quantization_config = BitsAndBytesConfig(
 | 
			
		||||
    load_in_4bit=True,
 | 
			
		||||
    bnb_4bit_quant_type="nf4",
 | 
			
		||||
    bnb_4bit_compute_dtype=torch.float16,
 | 
			
		||||
)
 | 
			
		||||
 | 
			
		||||
model = ChameleonForConditionalGeneration.from_pretrained("facebook/chameleon-7b", quantization_config=quantization_config, device_map="cuda")
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
### Use Flash-Attention 2 and SDPA to further speed-up generation
 | 
			
		||||
 | 
			
		||||
The model supports both Flash Attention 2 and PyTorch's [`torch.nn.functional.scaled_dot_product_attention`](https://pytorch.org/docs/master/generated/torch.nn.functional.scaled_dot_product_attention.html), which can be enabled for optimization. SDPA is the default option when you load the model. If you want to switch to Flash Attention 2, first make sure to install flash-attn; refer to the [original repository](https://github.com/Dao-AILab/flash-attention) for installation instructions. Then simply change the snippet above as follows:
 | 
			
		||||
 | 
			
		||||
```python
 | 
			
		||||
import torch
from transformers import ChameleonForConditionalGeneration
 | 
			
		||||
 | 
			
		||||
model_id = "facebook/chameleon-7b"
 | 
			
		||||
model = ChameleonForConditionalGeneration.from_pretrained(
 | 
			
		||||
    model_id, 
 | 
			
		||||
    torch_dtype=torch.bfloat16, 
 | 
			
		||||
    low_cpu_mem_usage=True,
 | 
			
		||||
    attn_implementation="flash_attention_2"
 | 
			
		||||
).to(0)
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
## ChameleonConfig
 | 
			
		||||
 | 
			
		||||
[[autodoc]] ChameleonConfig
 | 
			
		||||
 | 
			
		||||
## ChameleonVQVAEConfig
 | 
			
		||||
 | 
			
		||||
[[autodoc]] ChameleonVQVAEConfig
 | 
			
		||||
 | 
			
		||||
## ChameleonProcessor
 | 
			
		||||
 | 
			
		||||
[[autodoc]] ChameleonProcessor
 | 
			
		||||
 | 
			
		||||
## ChameleonImageProcessor
 | 
			
		||||
 | 
			
		||||
[[autodoc]] ChameleonImageProcessor
 | 
			
		||||
    - preprocess
 | 
			
		||||
 | 
			
		||||
## ChameleonVQVAE
 | 
			
		||||
 | 
			
		||||
[[autodoc]] ChameleonVQVAE
 | 
			
		||||
    - forward
 | 
			
		||||
 | 
			
		||||
## ChameleonModel
 | 
			
		||||
 | 
			
		||||
[[autodoc]] ChameleonModel
 | 
			
		||||
    - forward
 | 
			
		||||
 | 
			
		||||
## ChameleonForConditionalGeneration
 | 
			
		||||
 | 
			
		||||
[[autodoc]] ChameleonForConditionalGeneration
 | 
			
		||||
    - forward
 | 
			
		||||
@ -79,6 +79,123 @@ encode the text and prepare the images. The following example shows how to get t
 | 
			
		||||
>>> probs = logits_per_image.softmax(dim=1)  # we can take the softmax to get the label probabilities
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
### Combining CLIP and Flash Attention 2
 | 
			
		||||
 | 
			
		||||
First, make sure to install the latest version of Flash Attention 2.
 | 
			
		||||
 | 
			
		||||
```bash
 | 
			
		||||
pip install -U flash-attn --no-build-isolation
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
Also make sure that your hardware is compatible with Flash Attention 2. Read more about it in the official documentation of the flash-attn repository. Also make sure to load your model in half-precision (e.g. `torch.float16`).
 | 
			
		||||
 | 
			
		||||
<Tip warning={true}>
 | 
			
		||||
 | 
			
		||||
For small batch sizes, you might notice a slowdown in your model when using flash attention. Refer to the section [Expected speedups with Flash Attention and SDPA](#Expected-speedups-with-Flash-Attention-and-SDPA) below and select an appropriate attention implementation.
 | 
			
		||||
 | 
			
		||||
</Tip>
 | 
			
		||||
 | 
			
		||||
To load and run a model using Flash Attention 2, refer to the snippet below:
 | 
			
		||||
 | 
			
		||||
```python
 | 
			
		||||
>>> import torch
 | 
			
		||||
>>> import requests
 | 
			
		||||
>>> from PIL import Image
 | 
			
		||||
 | 
			
		||||
>>> from transformers import CLIPProcessor, CLIPModel
 | 
			
		||||
 | 
			
		||||
>>> device = "cuda"
 | 
			
		||||
>>> torch_dtype = torch.float16
 | 
			
		||||
 | 
			
		||||
>>> model = CLIPModel.from_pretrained(
 | 
			
		||||
...     "openai/clip-vit-base-patch32",
 | 
			
		||||
...     attn_implementation="flash_attention_2",
 | 
			
		||||
...     device_map=device,
 | 
			
		||||
...     torch_dtype=torch_dtype,
 | 
			
		||||
... )
 | 
			
		||||
>>> processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
 | 
			
		||||
 | 
			
		||||
>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
 | 
			
		||||
>>> image = Image.open(requests.get(url, stream=True).raw)
 | 
			
		||||
 | 
			
		||||
>>> inputs = processor(text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True)
 | 
			
		||||
>>> inputs.to(device)
 | 
			
		||||
 | 
			
		||||
>>> with torch.no_grad():
 | 
			
		||||
...     with torch.autocast(device):
 | 
			
		||||
...         outputs = model(**inputs)
 | 
			
		||||
 | 
			
		||||
>>> logits_per_image = outputs.logits_per_image  # this is the image-text similarity score
 | 
			
		||||
>>> probs = logits_per_image.softmax(dim=1)  # we can take the softmax to get the label probabilities
 | 
			
		||||
>>> print(probs)
 | 
			
		||||
tensor([[0.9946, 0.0052]], device='cuda:0', dtype=torch.float16)
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
### Using Scaled Dot Product Attention (SDPA)
 | 
			
		||||
 | 
			
		||||
PyTorch includes a native scaled dot-product attention (SDPA) operator as part of `torch.nn.functional`. This function 
 | 
			
		||||
encompasses several implementations that can be applied depending on the inputs and the hardware in use. See the 
 | 
			
		||||
[official documentation](https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html) 
 | 
			
		||||
or the [GPU Inference](https://huggingface.co/docs/transformers/main/en/perf_infer_gpu_one#pytorch-scaled-dot-product-attention)
 | 
			
		||||
page for more information.
 | 
			
		||||
 | 
			
		||||
SDPA is used by default for `torch>=2.1.1` when an implementation is available, but you may also set 
 | 
			
		||||
`attn_implementation="sdpa"` in `from_pretrained()` to explicitly request SDPA to be used.
 | 
			
		||||
 | 
			
		||||
```python
 | 
			
		||||
import torch
from transformers import CLIPModel
 | 
			
		||||
 | 
			
		||||
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32", torch_dtype=torch.float16, attn_implementation="sdpa")
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
For the best speedups, we recommend loading the model in half-precision (e.g. `torch.float16` or `torch.bfloat16`).
 | 
			
		||||
 | 
			
		||||
### Expected speedups with Flash Attention and SDPA
 | 
			
		||||
 | 
			
		||||
On a local benchmark (NVIDIA A10G, PyTorch 2.3.1+cu121) with `float16`, we saw the following speedups during inference for `"openai/clip-vit-large-patch14"` checkpoint ([code](https://gist.github.com/qubvel/ac691a54e54f9fae8144275f866a7ff8)):
 | 
			
		||||
 | 
			
		||||
#### CLIPTextModel
 | 
			
		||||
 | 
			
		||||
|   Num text labels |   Eager (s/iter) |   FA2 (s/iter) |   FA2 speedup |   SDPA (s/iter) |   SDPA speedup |
 | 
			
		||||
|------------------:|-----------------:|---------------:|--------------:|----------------:|---------------:|
 | 
			
		||||
|                 4 |            0.009 |          0.012 |         0.737 |           0.007 |          1.269 |
 | 
			
		||||
|                16 |            0.009 |          0.014 |         0.659 |           0.008 |          1.187 |
 | 
			
		||||
|                32 |            0.018 |          0.021 |         0.862 |           0.016 |          1.142 |
 | 
			
		||||
|                64 |            0.034 |          0.034 |         1.001 |           0.03  |          1.163 |
 | 
			
		||||
|               128 |            0.063 |          0.058 |         1.09  |           0.054 |          1.174 |
 | 
			
		||||
 | 
			
		||||

 | 
			
		||||
 | 
			
		||||
#### CLIPVisionModel
 | 
			
		||||
 | 
			
		||||
|   Image batch size |   Eager (s/iter) |   FA2 (s/iter) |   FA2 speedup |   SDPA (s/iter) |   SDPA speedup |
 | 
			
		||||
|-------------------:|-----------------:|---------------:|--------------:|----------------:|---------------:|
 | 
			
		||||
|                  1 |            0.016 |          0.013 |         1.247 |           0.012 |          1.318 |
 | 
			
		||||
|                  4 |            0.025 |          0.021 |         1.198 |           0.021 |          1.202 |
 | 
			
		||||
|                 16 |            0.093 |          0.075 |         1.234 |           0.075 |          1.24  |
 | 
			
		||||
|                 32 |            0.181 |          0.147 |         1.237 |           0.146 |          1.241 |
 | 
			
		||||
 | 
			
		||||

 | 
			
		||||
 | 
			
		||||
#### CLIPModel
 | 
			
		||||
 | 
			
		||||
|   Image batch size |   Num text labels |   Eager (s/iter) |   FA2 (s/iter) |   FA2 speedup |   SDPA (s/iter) |   SDPA speedup |
 | 
			
		||||
|-------------------:|------------------:|-----------------:|---------------:|--------------:|----------------:|---------------:|
 | 
			
		||||
|                  1 |                 4 |            0.025 |          0.026 |         0.954 |           0.02  |          1.217 |
 | 
			
		||||
|                  1 |                16 |            0.026 |          0.028 |         0.918 |           0.02  |          1.287 |
 | 
			
		||||
|                  1 |                64 |            0.042 |          0.046 |         0.906 |           0.036 |          1.167 |
 | 
			
		||||
|                  4 |                 4 |            0.028 |          0.033 |         0.849 |           0.024 |          1.189 |
 | 
			
		||||
|                  4 |                16 |            0.034 |          0.035 |         0.955 |           0.029 |          1.169 |
 | 
			
		||||
|                  4 |                64 |            0.059 |          0.055 |         1.072 |           0.05  |          1.179 |
 | 
			
		||||
|                 16 |                 4 |            0.096 |          0.088 |         1.091 |           0.078 |          1.234 |
 | 
			
		||||
|                 16 |                16 |            0.102 |          0.09  |         1.129 |           0.083 |          1.224 |
 | 
			
		||||
|                 16 |                64 |            0.127 |          0.11  |         1.157 |           0.105 |          1.218 |
 | 
			
		||||
|                 32 |                 4 |            0.185 |          0.159 |         1.157 |           0.149 |          1.238 |
 | 
			
		||||
|                 32 |                16 |            0.19  |          0.162 |         1.177 |           0.154 |          1.233 |
 | 
			
		||||
|                 32 |                64 |            0.216 |          0.181 |         1.19  |           0.176 |          1.228 |
 | 
			
		||||
 | 
			
		||||
## Resources
 | 
			
		||||
 | 
			
		||||
A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with CLIP.
 | 
			
		||||
 | 
			
		||||
@ -57,7 +57,7 @@ print((last_hidden_states - traced_outputs[0]).abs().max())
 | 
			
		||||
 | 
			
		||||
## Resources
 | 
			
		||||
 | 
			
		||||
A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with DPT.
 | 
			
		||||
A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with DINOv2.
 | 
			
		||||
 | 
			
		||||
- Demo notebooks for DINOv2 can be found [here](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/DINOv2). 🌎
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@ -41,33 +41,40 @@ The original code can be found [here](https://github.com/IDEA-Research/Grounding
 | 
			
		||||
Here's how to use the model for zero-shot object detection:
 | 
			
		||||
 | 
			
		||||
```python
 | 
			
		||||
import requests
 | 
			
		||||
>>> import requests
 | 
			
		||||
 | 
			
		||||
import torch
 | 
			
		||||
from PIL import Image
 | 
			
		||||
from transformers import AutoProcessor, AutoModelForZeroShotObjectDetection, 
 | 
			
		||||
>>> import torch
 | 
			
		||||
>>> from PIL import Image
 | 
			
		||||
>>> from transformers import AutoProcessor, AutoModelForZeroShotObjectDetection
 | 
			
		||||
 | 
			
		||||
model_id = "IDEA-Research/grounding-dino-tiny"
 | 
			
		||||
>>> model_id = "IDEA-Research/grounding-dino-tiny"
 | 
			
		||||
>>> device = "cuda"
 | 
			
		||||
 | 
			
		||||
processor = AutoProcessor.from_pretrained(model_id)
 | 
			
		||||
model = AutoModelForZeroShotObjectDetection.from_pretrained(model_id).to(device)
 | 
			
		||||
>>> processor = AutoProcessor.from_pretrained(model_id)
 | 
			
		||||
>>> model = AutoModelForZeroShotObjectDetection.from_pretrained(model_id).to(device)
 | 
			
		||||
 | 
			
		||||
image_url = "http://images.cocodataset.org/val2017/000000039769.jpg"
 | 
			
		||||
image = Image.open(requests.get(image_url, stream=True).raw)
 | 
			
		||||
# Check for cats and remote controls
 | 
			
		||||
text = "a cat. a remote control."
 | 
			
		||||
>>> image_url = "http://images.cocodataset.org/val2017/000000039769.jpg"
 | 
			
		||||
>>> image = Image.open(requests.get(image_url, stream=True).raw)
 | 
			
		||||
>>> # Check for cats and remote controls
 | 
			
		||||
>>> text = "a cat. a remote control."
 | 
			
		||||
 | 
			
		||||
inputs = processor(images=image, text=text, return_tensors="pt").to(device)
 | 
			
		||||
with torch.no_grad():
 | 
			
		||||
    outputs = model(**inputs)
 | 
			
		||||
>>> inputs = processor(images=image, text=text, return_tensors="pt").to(device)
 | 
			
		||||
>>> with torch.no_grad():
 | 
			
		||||
...     outputs = model(**inputs)
 | 
			
		||||
 | 
			
		||||
results = processor.post_process_grounded_object_detection(
 | 
			
		||||
    outputs,
 | 
			
		||||
    inputs.input_ids,
 | 
			
		||||
    box_threshold=0.4,
 | 
			
		||||
    text_threshold=0.3,
 | 
			
		||||
    target_sizes=[image.size[::-1]]
 | 
			
		||||
)
 | 
			
		||||
>>> results = processor.post_process_grounded_object_detection(
 | 
			
		||||
...     outputs,
 | 
			
		||||
...     inputs.input_ids,
 | 
			
		||||
...     box_threshold=0.4,
 | 
			
		||||
...     text_threshold=0.3,
 | 
			
		||||
...     target_sizes=[image.size[::-1]]
 | 
			
		||||
... )
 | 
			
		||||
>>> print(results)
 | 
			
		||||
[{'boxes': tensor([[344.6959,  23.1090, 637.1833, 374.2751],
 | 
			
		||||
        [ 12.2666,  51.9145, 316.8582, 472.4392],
 | 
			
		||||
        [ 38.5742,  70.0015, 176.7838, 118.1806]], device='cuda:0'),
 | 
			
		||||
  'labels': ['a cat', 'a cat', 'a remote control'],
 | 
			
		||||
  'scores': tensor([0.4785, 0.4381, 0.4776], device='cuda:0')}]
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
## Grounded SAM
 | 
			
		||||
 | 
			
		||||
@ -26,8 +26,22 @@ The abstract from the paper is the following:
 | 
			
		||||
 | 
			
		||||
*Modern hierarchical vision transformers have added several vision-specific components in the pursuit of supervised classification performance. While these components lead to effective accuracies and attractive FLOP counts, the added complexity actually makes these transformers slower than their vanilla ViT counterparts. In this paper, we argue that this additional bulk is unnecessary. By pretraining with a strong visual pretext task (MAE), we can strip out all the bells-and-whistles from a state-of-the-art multi-stage vision transformer without losing accuracy. In the process, we create Hiera, an extremely simple hierarchical vision transformer that is more accurate than previous models while being significantly faster both at inference and during training. We evaluate Hiera on a variety of tasks for image and video recognition. Our code and models are available at https://github.com/facebookresearch/hiera.*
 | 
			
		||||
 | 
			
		||||
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/model_doc/hiera_overview.png"
 | 
			
		||||
alt="drawing" width="600"/>
 | 
			
		||||
 | 
			
		||||
<small> Hiera architecture. Taken from the <a href="https://arxiv.org/abs/2306.00989">original paper.</a> </small>
 | 
			
		||||
 | 
			
		||||
This model was a joint contribution by [EduardoPacheco](https://huggingface.co/EduardoPacheco) and [namangarg110](https://huggingface.co/namangarg110). The original code can be found [here](https://github.com/facebookresearch/hiera).
 | 
			
		||||
 | 
			
		||||
## Resources
 | 
			
		||||
 | 
			
		||||
A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with Hiera. If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! The resource should ideally demonstrate something new instead of duplicating an existing resource.
 | 
			
		||||
 | 
			
		||||
<PipelineTag pipeline="image-classification"/>
 | 
			
		||||
 | 
			
		||||
- [`HieraForImageClassification`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/pytorch/image-classification) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/image_classification.ipynb).
 | 
			
		||||
- See also: [Image classification task guide](../tasks/image_classification)
 | 
			
		||||
 | 
			
		||||
## HieraConfig
 | 
			
		||||
 | 
			
		||||
[[autodoc]] HieraConfig
 | 
			
		||||
 | 
			
		||||
@ -43,6 +43,13 @@ The original code can be found [here](https://github.com/LLaVA-VL/LLaVA-NeXT/tre
 | 
			
		||||
 | 
			
		||||
- We advise users to use `padding_side="left"` when computing batched generation as it leads to more accurate results. Simply make sure to call `processor.tokenizer.padding_side = "left"` before generating.
 | 
			
		||||
 | 
			
		||||
<Tip warning={true}>
 | 
			
		||||
 | 
			
		||||
- Llava-Next uses a different number of patches for each image and thus has to pad the inputs inside the modeling code, aside from the padding done when processing the inputs. The default setting is "left-padding" if the model is in `eval()` mode, otherwise "right-padding".
 | 
			
		||||
 | 
			
		||||
</Tip>
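Concretely, the batched-generation advice above boils down to one line on the processor. A minimal sketch, assuming the [LLaVA-NeXT-Video-7B-hf](https://huggingface.co/llava-hf/LLaVA-NeXT-Video-7B-hf) checkpoint used below:

```python
from transformers import AutoProcessor

processor = AutoProcessor.from_pretrained("llava-hf/LLaVA-NeXT-Video-7B-hf")
# Left-padding gives more accurate results for batched generation
processor.tokenizer.padding_side = "left"
```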
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
- Note that each checkpoint has been trained with a specific prompt format, depending on which large language model (LLM) was used. You can use the tokenizer's `apply_chat_template` to format your prompts correctly. Below is an example of how to do that.
 | 
			
		||||
 | 
			
		||||
We will use [LLaVA-NeXT-Video-7B-hf](https://huggingface.co/llava-hf/LLaVA-NeXT-Video-7B-hf) and a conversation history of videos and images. Each content field has to be a list of dicts, as follows:
 | 
			
		||||
 | 
			
		||||
@ -40,7 +40,42 @@ The original code can be found [here](https://github.com/haotian-liu/LLaVA/tree/
 | 
			
		||||
 | 
			
		||||
- Note the model has not been explicitly trained to process multiple images in the same prompt, although this is technically possible, you may experience inaccurate results.
 | 
			
		||||
 | 
			
		||||
- For better results, we recommend users to prompt the model with the correct prompt format. Below is a list of prompt formats accepted by each llava checkpoint:
 | 
			
		||||
- For better results, we recommend that users use the processor's `apply_chat_template()` method to format the prompt correctly. For that you need to construct a conversation history; passing in a plain string will not format your prompt. Each message in the conversation history for chat templates is a dictionary with keys "role" and "content". The "content" should be a list of dictionaries for the "text" and "image" modalities, as follows:
 | 
			
		||||
 | 
			
		||||
```python
 | 
			
		||||
from transformers import AutoProcessor
 | 
			
		||||
 | 
			
		||||
processor = AutoProcessor.from_pretrained("llava-hf/llava-1.5-7b-hf")
 | 
			
		||||
 | 
			
		||||
conversation = [
 | 
			
		||||
    {
 | 
			
		||||
        "role": "user",
 | 
			
		||||
        "content": [
 | 
			
		||||
            {"type": "image"},
 | 
			
		||||
            {"type": "text", "text": "What’s shown in this image?"},
 | 
			
		||||
            ],
 | 
			
		||||
    },
 | 
			
		||||
    {
 | 
			
		||||
        "role": "assistant",
 | 
			
		||||
        "content": [{"type": "text", "text": "This image shows a red stop sign."},]
 | 
			
		||||
    },
 | 
			
		||||
    {
 | 
			
		||||
 | 
			
		||||
        "role": "user",
 | 
			
		||||
        "content": [
 | 
			
		||||
            {"type": "text", "text": "Describe the image in more details."},
 | 
			
		||||
        ],
 | 
			
		||||
    },
 | 
			
		||||
]
 | 
			
		||||
 | 
			
		||||
text_prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
 | 
			
		||||
 | 
			
		||||
# Note that the template simply formats your prompt, you still have to tokenize it and obtain pixel values for your images
 | 
			
		||||
print(text_prompt)
 | 
			
		||||
>>> "USER: <image>\n<What’s shown in this image? ASSISTANT: This image shows a red stop sign.</s>USER: Describe the image in more details. ASSISTANT:"
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
- If you want to construct a chat prompt yourself, below is a list of prompt formats accepted by each llava checkpoint:
 | 
			
		||||
 | 
			
		||||
[llava-interleave models](https://huggingface.co/collections/llava-hf/llava-interleave-668e19a97da0036aad4a2f19) require the following format:
 | 
			
		||||
```bash
 | 
			
		||||
@ -64,6 +99,7 @@ For multiple turns conversation:
 | 
			
		||||
"USER: <image>\n<prompt1> ASSISTANT: <answer1></s>USER: <prompt2> ASSISTANT: <answer2></s>USER: <prompt3> ASSISTANT:"
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
### Using Flash Attention 2
 | 
			
		||||
 | 
			
		||||
Flash Attention 2 is an even faster, optimized version of the previous optimization; please refer to the [Flash Attention 2 section of the performance docs](https://huggingface.co/docs/transformers/perf_infer_gpu_one).
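As a quick illustration (a minimal sketch, assuming the `llava-hf/llava-1.5-7b-hf` checkpoint used above and that `flash-attn` is installed), loading the model with Flash Attention 2 looks like this:

```python
import torch
from transformers import LlavaForConditionalGeneration

model = LlavaForConditionalGeneration.from_pretrained(
    "llava-hf/llava-1.5-7b-hf",
    torch_dtype=torch.float16,  # Flash Attention 2 requires half-precision
    attn_implementation="flash_attention_2",
    device_map="cuda",
)
```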
 | 
			
		||||
 | 
			
		||||
@ -46,26 +46,79 @@ The original code can be found [here](https://github.com/haotian-liu/LLaVA/tree/
 | 
			
		||||
 | 
			
		||||
- We advise users to use `padding_side="left"` when computing batched generation as it leads to more accurate results. Simply make sure to call `processor.tokenizer.padding_side = "left"` before generating.
 | 
			
		||||
 | 
			
		||||
- Note that each checkpoint has been trained with a specific prompt format, depending on which large language model (LLM) was used. Below, we list the correct prompt formats to use for the text prompt "What is shown in this image?":
 | 
			
		||||
<Tip warning={true}>
 | 
			
		||||
 | 
			
		||||
- Llava-Next uses a different number of patches for each image and thus has to pad the inputs inside the modeling code, aside from the padding done when processing the inputs. The default setting is "left-padding" if the model is in `eval()` mode, otherwise "right-padding".
 | 
			
		||||
 | 
			
		||||
</Tip>
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
- Note that each checkpoint has been trained with a specific prompt format, depending on which large language model (LLM) was used. You can use the processor's `apply_chat_template` to format your prompts correctly. For that you have to construct a conversation history; passing a plain string will not format your prompt. Each message in the conversation history for chat templates is a dictionary with keys "role" and "content". The "content" should be a list of dictionaries for the "text" and "image" modalities. Below is an example of how to do that and the list of formats accepted by each checkpoint.
 | 
			
		||||
 | 
			
		||||
We will use [llava-v1.6-mistral-7b-hf](https://huggingface.co/llava-hf/llava-v1.6-mistral-7b-hf) and a conversation history of text and image. Each content field has to be a list of dicts, as follows:
 | 
			
		||||
 | 
			
		||||
```python
 | 
			
		||||
from transformers import LlavaNextProcessor
 | 
			
		||||
 | 
			
		||||
processor = LlavaNextProcessor.from_pretrained("llava-hf/llava-v1.6-mistral-7b-hf")
 | 
			
		||||
 | 
			
		||||
conversation = [
 | 
			
		||||
    {
 | 
			
		||||
        "role": "user",
 | 
			
		||||
        "content": [
 | 
			
		||||
            {"type": "image"},
 | 
			
		||||
            {"type": "text", "text": "What’s shown in this image?"},
 | 
			
		||||
        ],
 | 
			
		||||
    },
 | 
			
		||||
    {
 | 
			
		||||
        "role": "assistant",
 | 
			
		||||
        "content": [{"type": "text", "text": "This image shows a red stop sign."},]
 | 
			
		||||
    },
 | 
			
		||||
    {
 | 
			
		||||
 | 
			
		||||
        "role": "user",
 | 
			
		||||
        "content": [
 | 
			
		||||
            {"type": "text", "text": "Describe the image in more details."},
 | 
			
		||||
        ],
 | 
			
		||||
    },
 | 
			
		||||
]
 | 
			
		||||
 | 
			
		||||
text_prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
 | 
			
		||||
 | 
			
		||||
# Note that the template simply formats your prompt, you still have to tokenize it and obtain pixel values for your images
 | 
			
		||||
print(text_prompt)
 | 
			
		||||
>>> "[INST] <image>\nWhat's shown in this image? [/INST] This image shows a red stop sign. [INST] Describe the image in more details. [/INST]"
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
- If you want to construct a chat prompt yourself, below is a list of possible formats.
 | 
			
		||||
[llava-v1.6-mistral-7b-hf](https://huggingface.co/llava-hf/llava-v1.6-mistral-7b-hf) requires the following format:
 | 
			
		||||
 | 
			
		||||
```bash
 | 
			
		||||
"[INST] <image>\nWhat is shown in this image? [/INST]"
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
[llava-v1.6-vicuna-7b-hf](https://huggingface.co/llava-hf/llava-v1.6-vicuna-7b-hf) and [llava-v1.6-vicuna-13b-hf](https://huggingface.co/llava-hf/llava-v1.6-vicuna-13b-hf) require the following format:
 | 
			
		||||
 | 
			
		||||
```bash
 | 
			
		||||
"A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions. USER: <image>\nWhat is shown in this image? ASSISTANT:"
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
[llava-v1.6-34b-hf](https://huggingface.co/llava-hf/llava-v1.6-34b-hf) requires the following format:
 | 
			
		||||
 | 
			
		||||
```bash
 | 
			
		||||
"<|im_start|>system\nAnswer the questions.<|im_end|><|im_start|>user\n<image>\nWhat is shown in this image?<|im_end|><|im_start|>assistant\n"
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
[llama3-llava-next-8b-hf](https://huggingface.co/llava-hf/llama3-llava-next-8b-hf) requires the following format:
 | 
			
		||||
 | 
			
		||||
```bash
 | 
			
		||||
"<|start_header_id|>system<|end_header_id|>\n\nYou are a helpful language and vision assistant. You are able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language.<|eot_id|><|start_header_id|><|start_header_id|>user<|end_header_id|>\n\n<image>\nWhat is shown in this image?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
[llava-next-72b-hf](https://huggingface.co/llava-hf/llava-next-72b-hf) and [llava-next-110b-hf](https://huggingface.co/llava-hf/llava-next-110b-hf) require the following format:
 | 
			
		||||
 | 
			
		||||
```bash
 | 
			
		||||
"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<image>\nWhat is shown in this image?<|im_end|>\n<|im_start|>assistant\n"
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
## Usage example
 | 
			
		||||
 | 
			
		||||
### Single image inference
 | 
			
		||||
@ -86,8 +139,17 @@ model.to("cuda:0")
 | 
			
		||||
# prepare image and text prompt, using the appropriate prompt template
 | 
			
		||||
url = "https://github.com/haotian-liu/LLaVA/blob/1a91fc274d7c35a9b50b3cb29c4247ae5837ce39/images/llava_v1_5_radar.jpg?raw=true"
 | 
			
		||||
image = Image.open(requests.get(url, stream=True).raw)
 | 
			
		||||
prompt = "[INST] <image>\nWhat is shown in this image? [/INST]"
 | 
			
		||||
 | 
			
		||||
conversation = [
 | 
			
		||||
    {
 | 
			
		||||
        "role": "user",
 | 
			
		||||
        "content": [
 | 
			
		||||
            {"type": "image"},
 | 
			
		||||
            {"type": "text", "text": "What is shown in this image?"},
 | 
			
		||||
        ],
 | 
			
		||||
    },
 | 
			
		||||
]
 | 
			
		||||
prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
 | 
			
		||||
inputs = processor(prompt, image, return_tensors="pt").to("cuda:0")
 | 
			
		||||
 | 
			
		||||
# autoregressively complete prompt
 | 
			
		||||
@ -120,15 +182,47 @@ image_cats = Image.open(requests.get(url, stream=True).raw)
 | 
			
		||||
url = "https://huggingface.co/microsoft/kosmos-2-patch14-224/resolve/main/snowman.jpg"
 | 
			
		||||
image_snowman = Image.open(requests.get(url, stream=True).raw)
 | 
			
		||||
 | 
			
		||||
# Prepare a batched prompt, where the first one is a multi-turn conversation and the second is not
 | 
			
		||||
prompt = [
 | 
			
		||||
    "[INST] <image>\nWhat is shown in this image? [/INST] There is a red stop sign in the image. [INST] <image>\nWhat about this image? How many cats do you see [/INST]",
 | 
			
		||||
    "[INST] <image>\nWhat is shown in this image? [/INST]"
 | 
			
		||||
# Prepare a batch of two prompts, where the first one is a multi-turn conversation and the second is not
 | 
			
		||||
conversation_1 = [
 | 
			
		||||
    {
 | 
			
		||||
        "role": "user",
 | 
			
		||||
        "content": [
 | 
			
		||||
            {"type": "image"},
 | 
			
		||||
            {"type": "text", "text": "What is shown in this image?"},
 | 
			
		||||
            ],
 | 
			
		||||
    },
 | 
			
		||||
    {
 | 
			
		||||
        "role": "assistant",
 | 
			
		||||
        "content": [
 | 
			
		||||
            {"type": "text", "text": "There is a red stop sign in the image."},
 | 
			
		||||
            ],
 | 
			
		||||
    },
 | 
			
		||||
    {
 | 
			
		||||
        "role": "user",
 | 
			
		||||
        "content": [
 | 
			
		||||
            {"type": "image"},
 | 
			
		||||
            {"type": "text", "text": "What about this image? How many cats do you see?"},
 | 
			
		||||
            ],
 | 
			
		||||
    },
 | 
			
		||||
]
 | 
			
		||||
 | 
			
		||||
conversation_2 = [
 | 
			
		||||
    {
 | 
			
		||||
        "role": "user",
 | 
			
		||||
        "content": [
 | 
			
		||||
            {"type": "image"},
 | 
			
		||||
            {"type": "text", "text": "What is shown in this image?"},
 | 
			
		||||
            ],
 | 
			
		||||
    },
 | 
			
		||||
]
 | 
			
		||||
 | 
			
		||||
prompt_1 = processor.apply_chat_template(conversation_1, add_generation_prompt=True)
 | 
			
		||||
prompt_2 = processor.apply_chat_template(conversation_2, add_generation_prompt=True)
 | 
			
		||||
prompts = [prompt_1, prompt_2]
 | 
			
		||||
 | 
			
		||||
# We can simply feed images in the order they have to be used in the text prompt
 | 
			
		||||
# Each "<image>" token uses one image leaving the next for the subsequent "<image>" tokens
 | 
			
		||||
inputs = processor(text=prompt, images=[image_stop, image_cats, image_snowman], padding=True, return_tensors="pt").to(model.device)
 | 
			
		||||
inputs = processor(text=prompts, images=[image_stop, image_cats, image_snowman], padding=True, return_tensors="pt").to(model.device)
 | 
			
		||||
 | 
			
		||||
# Generate
 | 
			
		||||
generate_ids = model.generate(**inputs, max_new_tokens=30)
 | 
			
		||||
 | 
			
		||||
@ -105,7 +105,7 @@ from huggingface_hub import list_models
 | 
			
		||||
 | 
			
		||||
model_list = list_models()
 | 
			
		||||
org = "Helsinki-NLP"
 | 
			
		||||
model_ids = [x.modelId for x in model_list if x.modelId.startswith(org)]
 | 
			
		||||
model_ids = [x.id for x in model_list if x.id.startswith(org)]
 | 
			
		||||
suffix = [x.split("/")[1] for x in model_ids]
 | 
			
		||||
old_style_multi_models = [f"{org}/{s}" for s in suffix if s != s.lower()]
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
@ -18,7 +18,7 @@ rendered properly in your Markdown viewer.
 | 
			
		||||
 | 
			
		||||
## Overview
 | 
			
		||||
 | 
			
		||||
Qwen2 is the new model series of large language models from the Qwen team. Previously, we released the Qwen series, including Qwen-72B, Qwen-1.8B, Qwen-VL, Qwen-Audio, etc.
 | 
			
		||||
Qwen2 is the new model series of large language models from the Qwen team. The Qwen2 series includes Qwen2-0.5B, Qwen2-1.5B, Qwen2-7B, Qwen2-57B-A14B, Qwen2-72B, Qwen2-Audio, etc.
 | 
			
		||||
 | 
			
		||||
### Model Details
 | 
			
		||||
 | 
			
		||||
@ -27,16 +27,16 @@ Qwen2 is a language model series including decoder language models of different
 | 
			
		||||
 | 
			
		||||
## Usage tips
 | 
			
		||||
 | 
			
		||||
`Qwen2-7B-beta` and `Qwen2-7B-Chat-beta` can be found on the [Huggingface Hub](https://huggingface.co/Qwen)
 | 
			
		||||
`Qwen2-7B` and `Qwen2-7B-Instruct` can be found on the [Huggingface Hub](https://huggingface.co/Qwen)
 | 
			
		||||
 | 
			
		||||
In the following, we demonstrate how to use `Qwen2-7B-Chat-beta` for the inference. Note that we have used the ChatML format for dialog, in this demo we show how to leverage `apply_chat_template` for this purpose.
 | 
			
		||||
In the following, we demonstrate how to use `Qwen2-7B-Instruct` for inference. Note that we use the ChatML format for dialog; in this demo we show how to leverage `apply_chat_template` for this purpose.
 | 
			
		||||
 | 
			
		||||
```python
 | 
			
		||||
>>> from transformers import AutoModelForCausalLM, AutoTokenizer
 | 
			
		||||
>>> device = "cuda" # the device to load the model onto
 | 
			
		||||
 | 
			
		||||
>>> model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen1.5-7B-Chat", device_map="auto")
 | 
			
		||||
>>> tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen1.5-7B-Chat")
 | 
			
		||||
>>> model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2-7B-Instruct", device_map="auto")
 | 
			
		||||
>>> tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-7B-Instruct")
 | 
			
		||||
 | 
			
		||||
>>> prompt = "Give me a short introduction to large language model."
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@ -51,19 +51,19 @@ This model was contributed by [julien-c](https://huggingface.co/julien-c). The o
 | 
			
		||||
 | 
			
		||||
## Usage tips
 | 
			
		||||
 | 
			
		||||
- This implementation is the same as [`BertModel`] with a tiny embeddings tweak as well as a setup
 | 
			
		||||
  for Roberta pretrained models.
 | 
			
		||||
- RoBERTa has the same architecture as BERT, but uses a byte-level BPE as a tokenizer (same as GPT-2) and uses a
 | 
			
		||||
- This implementation is the same as [`BertModel`] with a minor tweak to the embeddings, as well as a setup
 | 
			
		||||
  for RoBERTa pretrained models.
 | 
			
		||||
- RoBERTa has the same architecture as BERT but uses a byte-level BPE as a tokenizer (same as GPT-2) and uses a
 | 
			
		||||
  different pretraining scheme.
 | 
			
		||||
- RoBERTa doesn't have `token_type_ids`, you don't need to indicate which token belongs to which segment. Just
 | 
			
		||||
  separate your segments with the separation token `tokenizer.sep_token` (or `</s>`)
 | 
			
		||||
- Same as BERT with better pretraining tricks:
 | 
			
		||||
- RoBERTa doesn't have `token_type_ids`, so you don't need to indicate which token belongs to which segment. Just
  separate your segments with the separation token `tokenizer.sep_token` (or `</s>`), as shown in the short example after this list.
 | 
			
		||||
- RoBERTa is similar to BERT but with better pretraining techniques:
 | 
			
		||||
 | 
			
		||||
    * dynamic masking: tokens are masked differently at each epoch, whereas BERT does it once and for all
 | 
			
		||||
    * together to reach 512 tokens (so the sentences are in an order than may span several documents)
 | 
			
		||||
    * train with larger batches
 | 
			
		||||
    * use BPE with bytes as a subunit and not characters (because of unicode characters)
 | 
			
		||||
- [CamemBERT](camembert) is a wrapper around RoBERTa. Refer to this page for usage examples.
 | 
			
		||||
    * Dynamic masking: tokens are masked differently at each epoch, whereas BERT does it once and for all.
 | 
			
		||||
    * Sentence packing: Sentences are packed together to reach 512 tokens (so the sentences are in an order that may span several documents).
 | 
			
		||||
    * Larger batches: Training uses larger batches.
 | 
			
		||||
    * Byte-level BPE vocabulary: Uses BPE with bytes as a subunit instead of characters, accommodating Unicode characters.
 | 
			
		||||
- [CamemBERT](camembert) is a wrapper around RoBERTa. Refer to its model page for usage examples.
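To make the segment-separation tip concrete, here is a minimal sketch (assuming the `roberta-base` checkpoint) of encoding a pair of segments without any `token_type_ids`:

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("roberta-base")

# No token_type_ids are needed: pass the two segments and the tokenizer
# joins them with the separator token(s) itself.
encoded = tokenizer("How are you?", "I am fine, thanks.")
print(tokenizer.decode(encoded["input_ids"]))
# Expected to look roughly like: <s>How are you?</s></s>I am fine, thanks.</s>
```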
 | 
			
		||||
 | 
			
		||||
## Resources
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@ -98,7 +98,7 @@ indices = np.arange(0, total_frames, total_frames / 8).astype(int)
 | 
			
		||||
video = read_video_pyav(container, indices)
 | 
			
		||||
 | 
			
		||||
# For better results, we recommend prompting the model in the following format
 | 
			
		||||
prompt = "USER: <video>Why is this funny? ASSISTANT:"
 | 
			
		||||
prompt = "USER: <video>\nWhy is this funny? ASSISTANT:"
 | 
			
		||||
inputs = processor(text=prompt, videos=video, return_tensors="pt")
 | 
			
		||||
 | 
			
		||||
out = model.generate(**inputs, max_new_tokens=60)
 | 
			
		||||
@ -108,7 +108,7 @@ processor.batch_decode(out, skip_special_tokens=True, clean_up_tokenization_spac
 | 
			
		||||
For multiple turns conversation change the prompt format to:
 | 
			
		||||
 | 
			
		||||
```bash
 | 
			
		||||
"USER: <video>What do you see in this video? ASSISTANT: A baby reading a book. USER: Why is the it funny? ASSISTANT:"
 | 
			
		||||
"USER: <video>\nWhat do you see in this video? ASSISTANT: A baby reading a book. USER: Why is the it funny? ASSISTANT:"
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
### Mixed Media Mode
 | 
			
		||||
@ -123,7 +123,7 @@ import requests
 | 
			
		||||
# Load an image and write a new prompt
 | 
			
		||||
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
 | 
			
		||||
image = Image.open(requests.get(url, stream=True).raw)
 | 
			
		||||
prompt = "USER: <image> How many cats are there in the image? ASSISTANT: There are two cats. USER: <video>Why is this video funny? ASSISTANT:"
 | 
			
		||||
prompt = "USER: <image>\nHow many cats are there in the image? ASSISTANT: There are two cats. USER: <video>\nWhy is this video funny? ASSISTANT:"
 | 
			
		||||
 | 
			
		||||
inputs = processor(text=prompt, images=image, videos=clip, padding=True, return_tensors="pt")
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@ -26,7 +26,12 @@ The abstract from the paper is the following:
 | 
			
		||||
 | 
			
		||||
*While existing large vision-language multimodal models focus on whole image understanding, there is a prominent gap in achieving region-specific comprehension. Current approaches that use textual coordinates or spatial encodings often fail to provide a user-friendly interface for visual prompting. To address this challenge, we introduce a novel multimodal model capable of decoding arbitrary visual prompts. This allows users to intuitively mark images and interact with the model using natural cues like a "red bounding box" or "pointed arrow". Our simple design directly overlays visual markers onto the RGB image, eliminating the need for complex region encodings, yet achieves state-of-the-art performance on region-understanding tasks like Visual7W, PointQA, and Visual Commonsense Reasoning benchmark. Furthermore, we present ViP-Bench, a comprehensive benchmark to assess the capability of models in understanding visual prompts across multiple dimensions, enabling future research in this domain. Code, data, and model are publicly available.*
 | 
			
		||||
 | 
			
		||||
Tips:
 | 
			
		||||
The original code can be found [here](https://github.com/mu-cai/ViP-LLaVA).
 | 
			
		||||
 | 
			
		||||
This model was contributed by [Younes Belkada](https://huggingface.co/ybelkada)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
## Usage tips:
 | 
			
		||||
 | 
			
		||||
- The architecture is similar to the LLaVA architecture, except that the multi-modal projector takes a set of concatenated vision hidden states and has an additional layernorm layer on that module.
 | 
			
		||||
 | 
			
		||||
@ -34,22 +39,51 @@ Tips:
 | 
			
		||||
 | 
			
		||||
- Note the model has not been explicitly trained to process multiple images in the same prompt, although this is technically possible, you may experience inaccurate results.
 | 
			
		||||
 | 
			
		||||
- For better results, we recommend users to prompt the model with the correct prompt format: 
 | 
			
		||||
- For better results, we recommend that users use the processor's `apply_chat_template()` method to format the prompt correctly. For that you need to construct a conversation history; passing in a plain string will not format your prompt. Each message in the conversation history for chat templates is a dictionary with keys "role" and "content". The "content" should be a list of dictionaries for the "text" and "image" modalities, as follows:
 | 
			
		||||
 | 
			
		||||
```python
 | 
			
		||||
from transformers import AutoProcessor
 | 
			
		||||
 | 
			
		||||
processor = AutoProcessor.from_pretrained("llava-hf/vip-llava-7b-hf")
 | 
			
		||||
 | 
			
		||||
conversation = [
 | 
			
		||||
    {
 | 
			
		||||
        "role": "user",
 | 
			
		||||
        "content": [
 | 
			
		||||
            {"type": "image"},
 | 
			
		||||
            {"type": "text", "text": "What’s shown in this image?"},
 | 
			
		||||
        ],
 | 
			
		||||
    },
 | 
			
		||||
    {
 | 
			
		||||
        "role": "assistant",
 | 
			
		||||
        "content": [{"type": "text", "text": "This image shows a red stop sign."},]
 | 
			
		||||
    },
 | 
			
		||||
    {
 | 
			
		||||
 | 
			
		||||
        "role": "user",
 | 
			
		||||
        "content": [
 | 
			
		||||
            {"type": "text", "text": "Describe the image in more details."},
 | 
			
		||||
        ],
 | 
			
		||||
    },
 | 
			
		||||
]
 | 
			
		||||
 | 
			
		||||
text_prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
 | 
			
		||||
 | 
			
		||||
# Note that the template simply formats your prompt, you still have to tokenize it and obtain pixel values for your images
 | 
			
		||||
print(text_prompt)
 | 
			
		||||
>>> "###Human: <image>\nWhat’s shown in this image?###Assistant: This image shows a red stop sign.###Human: Describe the image in more details.###Assistant:"
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
- If you want to construct a chat prompt yourself, below is a list of prompt formats accepted by VipLLaVa checkpoints:
 | 
			
		||||
```bash
 | 
			
		||||
A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.###Human: <image>\n<prompt>###Assistant:
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
For multiple turns conversation:
 | 
			
		||||
 | 
			
		||||
```bash
 | 
			
		||||
A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.###Human: <image>\n<prompt1>###Assistant: <answer1>###Human: <prompt2>###Assistant:
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
The original code can be found [here](https://github.com/mu-cai/ViP-LLaVA).
 | 
			
		||||
 | 
			
		||||
This model was contributed by [Younes Belkada](https://huggingface.co/ybelkada)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
## VipLlavaConfig
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@ -72,7 +72,7 @@ Here is a step-by-step guide to transcribing an audio sample using a pre-trained
 | 
			
		||||
' Mr. Quilter is the apostle of the middle classes, and we are glad to welcome his gospel.'
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
Whisper is compatible with the following optimisations:
 | 
			
		||||
Whisper is compatible with the following optimisations for both short and long-form generation:
 | 
			
		||||
- [PyTorch Scaled Dot Product Attention (SDPA)](../perf_infer_gpu_one#pytorch-scaled-dot-product-attention): flash attention and memory-efficient attention kernels. Enabled by default for `torch>=2.1.1`.
 | 
			
		||||
- [Flash Attention 2](../perf_infer_gpu_one#flashattention-2): improved implementation of flash attention through better parallelism and work partitioning. 
 | 
			
		||||
- [torch.compile](../llm_optims#static-kv-cache-and-torchcompile): JIT-compile the forward pass to dispatch to efficient fused kernels.
 | 
			
		||||
@ -101,7 +101,8 @@ As an example, the following codesnippet enables SDPA and `torch.compile` for up
 | 
			
		||||
... ).input_features
 | 
			
		||||
 | 
			
		||||
>>> # Compile the forward pass
 | 
			
		||||
>>> _ = model.generate(input_features)
 | 
			
		||||
>>> for _ in range(2):
 | 
			
		||||
...     model.generate(input_features)
 | 
			
		||||
 | 
			
		||||
>>> # Generate token ids using compiled graph (fast!)
 | 
			
		||||
>>> predicted_ids = model.generate(input_features)
 | 
			
		||||
 | 
			
		||||
@ -77,7 +77,7 @@ Then use `notebook_login` to sign-in to the Hub, and follow the link [here](http
 | 
			
		||||
 | 
			
		||||
To ensure your model can be used by someone working with a different framework, we recommend you convert and upload your model with both PyTorch and TensorFlow checkpoints. While users are still able to load your model from a different framework if you skip this step, it will be slower because 🤗 Transformers will need to convert the checkpoint on-the-fly.
 | 
			
		||||
 | 
			
		||||
Converting a checkpoint for another framework is easy. Make sure you have PyTorch and TensorFlow installed (see [here](installation) for installation instructions), and then find the specific model for your task in the other framework. 
 | 
			
		||||
Converting a checkpoint for another framework is easy. Make sure you have PyTorch and TensorFlow installed (see [here](installation) for installation instructions), and then find the specific model for your task in the other framework.
 | 
			
		||||
 | 
			
		||||
<frameworkcontent>
 | 
			
		||||
<pt>
 | 
			
		||||
 | 
			
		||||
@ -39,6 +39,8 @@ FlashAttention-2 is experimental and may change considerably in future versions.
 | 
			
		||||
FlashAttention-2 is currently supported for the following architectures:
 | 
			
		||||
* [Bark](https://huggingface.co/docs/transformers/model_doc/bark#transformers.BarkModel)
 | 
			
		||||
* [Bart](https://huggingface.co/docs/transformers/model_doc/bart#transformers.BartModel)
 | 
			
		||||
* [Chameleon](https://huggingface.co/docs/transformers/model_doc/chameleon#transformers.Chameleon)
 | 
			
		||||
* [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPModel)
 | 
			
		||||
* [Cohere](https://huggingface.co/docs/transformers/model_doc/cohere#transformers.CohereModel)
 | 
			
		||||
* [Dbrx](https://huggingface.co/docs/transformers/model_doc/dbrx#transformers.DbrxModel)
 | 
			
		||||
* [DistilBert](https://huggingface.co/docs/transformers/model_doc/distilbert#transformers.DistilBertModel)
 | 
			
		||||
@ -198,6 +200,8 @@ For now, Transformers supports SDPA inference and training for the following arc
 | 
			
		||||
* [Audio Spectrogram Transformer](https://huggingface.co/docs/transformers/model_doc/audio-spectrogram-transformer#transformers.ASTModel)
 | 
			
		||||
* [Bart](https://huggingface.co/docs/transformers/model_doc/bart#transformers.BartModel)
 | 
			
		||||
* [Bert](https://huggingface.co/docs/transformers/model_doc/bert#transformers.BertModel)
 | 
			
		||||
* [Chameleon](https://huggingface.co/docs/transformers/model_doc/chameleon#transformers.Chameleon)
 | 
			
		||||
* [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPModel)
 | 
			
		||||
* [Cohere](https://huggingface.co/docs/transformers/model_doc/cohere#transformers.CohereModel)
 | 
			
		||||
* [Dbrx](https://huggingface.co/docs/transformers/model_doc/dbrx#transformers.DbrxModel)
 | 
			
		||||
* [DeiT](https://huggingface.co/docs/transformers/model_doc/deit#transformers.DeiTModel)
 | 
			
		||||
 | 
			
		||||
@ -98,7 +98,7 @@ Below you can find the list of the models we benchmarked.
 | 
			
		||||
- [google/vit-base-patch16-224](https://huggingface.co/google/vit-base-patch16-224)
 | 
			
		||||
- [microsoft/beit-base-patch16-224-pt22k-ft22k](https://huggingface.co/microsoft/beit-base-patch16-224-pt22k-ft22k)
 | 
			
		||||
- [facebook/convnext-large-224](https://huggingface.co/facebook/convnext-large-224)
 | 
			
		||||
- [microsoft/resnet-50](https://huggingface.co/)
 | 
			
		||||
- [microsoft/resnet-50](https://huggingface.co/microsoft/resnet-50)
 | 
			
		||||
 | 
			
		||||
**Image Segmentation** 
 | 
			
		||||
- [nvidia/segformer-b0-finetuned-ade-512-512](https://huggingface.co/nvidia/segformer-b0-finetuned-ade-512-512)
 | 
			
		||||
 | 
			
		||||
							
								
								
									
docs/source/en/quantization/fbgemm_fp8.md (new file, 58 lines)
@ -0,0 +1,58 @@
 | 
			
		||||
<!--Copyright 2024 The HuggingFace Team. All rights reserved.
 | 
			
		||||
 | 
			
		||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
 | 
			
		||||
the License. You may obtain a copy of the License at
 | 
			
		||||
 | 
			
		||||
http://www.apache.org/licenses/LICENSE-2.0
 | 
			
		||||
 | 
			
		||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
 | 
			
		||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
 | 
			
		||||
specific language governing permissions and limitations under the License.
 | 
			
		||||
 | 
			
		||||
⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
 | 
			
		||||
rendered properly in your Markdown viewer.
 | 
			
		||||
 | 
			
		||||
-->
 | 
			
		||||
 | 
			
		||||
# FBGEMM FP8
 | 
			
		||||
 | 
			
		||||
With the FBGEMM FP8 quantization method, you can quantize your model to FP8 (W8A8):
- the weights are quantized to 8-bit (FP8) per channel
- the activations are quantized to 8-bit (FP8) per token
 | 
			
		||||
 | 
			
		||||
It relies on the [FBGEMM](https://github.com/pytorch/FBGEMM) library which provides efficient low-precision general matrix multiplication for small batch sizes and support for accuracy-loss minimizing techniques such as row-wise quantization and outlier-aware quantization. 
 | 
			
		||||
 | 
			
		||||
> [!TIP]
 | 
			
		||||
> You need a GPU with compute capability >= 9.0 (e.g. H100).
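As a quick sanity check (a minimal sketch, assuming PyTorch is installed and a CUDA device is visible), you can inspect the compute capability of your GPU:

```python
import torch

# (9, 0) or higher corresponds to Hopper-class GPUs such as the H100
major, minor = torch.cuda.get_device_capability()
print(f"Compute capability: {major}.{minor}")
```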
 | 
			
		||||
 | 
			
		||||
Before you begin, make sure the following libraries are installed with their latest version:
 | 
			
		||||
 | 
			
		||||
```bash
 | 
			
		||||
pip install --upgrade accelerate fbgemm-gpu torch
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
If you are having issues with the fbgemm-gpu and torch libraries, you might need to install the nightly release. You can follow the instructions [here](https://pytorch.org/FBGEMM/fbgemm_gpu-development/InstallationInstructions.html#fbgemm-gpu-install-libraries:~:text=found%20here.-,Install%20the%20FBGEMM_GPU%20Package,-Install%20through%20PyTorch).
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
```py
 | 
			
		||||
from transformers import FbgemmFp8Config, AutoModelForCausalLM, AutoTokenizer
 | 
			
		||||
 | 
			
		||||
model_name = "meta-llama/Meta-Llama-3-8B"
 | 
			
		||||
quantization_config = FbgemmFp8Config()
 | 
			
		||||
quantized_model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", quantization_config=quantization_config)
 | 
			
		||||
 | 
			
		||||
tokenizer = AutoTokenizer.from_pretrained(model_name)
 | 
			
		||||
input_text = "What are we having for dinner?"
 | 
			
		||||
input_ids = tokenizer(input_text, return_tensors="pt").to("cuda")
 | 
			
		||||
 | 
			
		||||
output = quantized_model.generate(**input_ids, max_new_tokens=10)
 | 
			
		||||
print(tokenizer.decode(output[0], skip_special_tokens=True))
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
A quantized model can be saved with `save_pretrained` and reloaded later with `from_pretrained`.
 | 
			
		||||
 | 
			
		||||
```py
 | 
			
		||||
quant_path = "/path/to/save/quantized/model"
 | 
			
		||||
quantized_model.save_pretrained(quant_path)
 | 
			
		||||
model = AutoModelForCausalLM.from_pretrained(quant_path, device_map="auto")
 | 
			
		||||
```
 | 
			
		||||
@ -55,4 +55,5 @@ Use the table below to help you decide which quantization method to use.
 | 
			
		||||
| [GPTQ](./gptq)                                | 🔴                       | 🔴   | 🟢        | 🟢              | 🔴                     | 🔴                       | 2 - 3 - 4 - 8          | 🟢                                   | 🟢            | 🟢                      | https://github.com/AutoGPTQ/AutoGPTQ        |
 | 
			
		||||
| [HQQ](./hqq)                                 | 🟢                       | 🟢    | 🟢        | 🔴              | 🔴                     | 🟢                       | 1 - 8          | 🟢                                   | 🔴            | 🟢                      | https://github.com/mobiusml/hqq/            |
 | 
			
		||||
| [Quanto](./quanto)                              | 🟢                       | 🟢   | 🟢        | 🔴              | 🟢                     | 🟢                       | 2 / 4 / 8      | 🔴                                   | 🔴            | 🟢                      | https://github.com/huggingface/quanto       |
 | 
			
		||||
| [FBGEMM_FP8](./fbgemm_fp8)                              | 🟢                       | 🔴    | 🟢        | 🔴              | 🔴                      | 🔴                        | 8      | 🔴                                   | 🟢            | 🟢                      | https://github.com/pytorch/FBGEMM       |
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
							
								
								
									
docs/source/en/tasks/image_text_to_text.md (new file, 232 lines)
@ -0,0 +1,232 @@
 | 
			
		||||
<!--Copyright 2024 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.

⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.

-->

# Image-text-to-text

[[open-in-colab]]

Image-text-to-text models, also known as vision language models (VLMs), are language models that take an image input. These models can tackle various tasks, from visual question answering to image segmentation. This task shares many similarities with image-to-text, and some use cases, such as image captioning, overlap. Image-to-text models only take image inputs and often accomplish a specific task, whereas VLMs take open-ended text and image inputs and are more generalist models.

In this guide, we provide a brief overview of VLMs and show how to use them with Transformers for inference.

To begin with, there are multiple types of VLMs:
- base models used for fine-tuning
- chat fine-tuned models for conversation
- instruction fine-tuned models

This guide focuses on inference with an instruction-tuned model.

Let's begin by installing the dependencies.

```bash
pip install -q transformers accelerate flash_attn
```

Let's initialize the model and the processor.

```python
from transformers import AutoProcessor, Idefics2ForConditionalGeneration
import torch

device = torch.device("cuda")
model = Idefics2ForConditionalGeneration.from_pretrained(
    "HuggingFaceM4/idefics2-8b",
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
).to(device)

processor = AutoProcessor.from_pretrained("HuggingFaceM4/idefics2-8b")
```

This model has a [chat template](./chat_templating) that helps users parse chat outputs. Moreover, the model can accept multiple images as input in a single conversation or message. We will now prepare the inputs.

The image inputs look like the following.

<div class="flex justify-center">
     <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/cats.png" alt="Two cats sitting on a net"/>
</div>

<div class="flex justify-center">
     <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/bee.jpg" alt="A bee on a pink flower"/>
</div>

```python
from PIL import Image
import requests

img_urls = ["https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/cats.png",
            "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/bee.jpg"]
images = [Image.open(requests.get(img_urls[0], stream=True).raw),
          Image.open(requests.get(img_urls[1], stream=True).raw)]
```

Below is an example of the chat template. We can feed conversation turns and the latest message as an input by appending it to the end of the template.

```python
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image"},
            {"type": "text", "text": "What do we see in this image?"},
        ]
    },
    {
        "role": "assistant",
        "content": [
            {"type": "text", "text": "In this image we can see two cats on the nets."},
        ]
    },
    {
        "role": "user",
        "content": [
            {"type": "image"},
            {"type": "text", "text": "And how about this image?"},
        ]
    },
]
```

We will now call the processor's [`~ProcessorMixin.apply_chat_template`] method to format these messages into a prompt, and then preprocess its output together with the image inputs.

```python
prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
inputs = processor(text=prompt, images=[images[0], images[1]], return_tensors="pt").to(device)
```

We can now pass the preprocessed inputs to the model.

```python
with torch.no_grad():
    generated_ids = model.generate(**inputs, max_new_tokens=500)
generated_texts = processor.batch_decode(generated_ids, skip_special_tokens=True)

print(generated_texts)
## ['User: What do we see in this image? \nAssistant: In this image we can see two cats on the nets. \nUser: And how about this image? \nAssistant: In this image we can see flowers, plants and insect.']
```

## Streaming

We can use [text streaming](./generation_strategies#streaming) for a better generation experience. Transformers supports streaming with the [`TextStreamer`] or [`TextIteratorStreamer`] classes. We will use the [`TextIteratorStreamer`] with IDEFICS2-8B.

Assume we have an application that keeps chat history and takes in the new user input. We will preprocess the inputs as usual and initialize [`TextIteratorStreamer`] to handle the generation in a separate thread. This allows you to stream the generated text tokens in real time. The streamer, together with any other generation arguments, is then passed to the model's `generate` call running in that thread.

```python
import time
from transformers import TextIteratorStreamer
from threading import Thread

def model_inference(
    user_prompt,
    chat_history,
    max_new_tokens,
    images
):
    user_prompt = {
        "role": "user",
        "content": [
            {"type": "image"},
            {"type": "text", "text": user_prompt},
        ]
    }
    chat_history.append(user_prompt)
    streamer = TextIteratorStreamer(
        processor.tokenizer,
        skip_prompt=True,
        timeout=5.0,
    )

    generation_args = {
        "max_new_tokens": max_new_tokens,
        "streamer": streamer,
        "do_sample": False
    }

    # add_generation_prompt=True makes model generate bot response
    prompt = processor.apply_chat_template(chat_history, add_generation_prompt=True)
    inputs = processor(
        text=prompt,
        images=images,
        return_tensors="pt",
    ).to(device)
    generation_args.update(inputs)

    thread = Thread(
        target=model.generate,
        kwargs=generation_args,
    )
    thread.start()

    acc_text = ""
    for text_token in streamer:
        time.sleep(0.04)
        acc_text += text_token
        if acc_text.endswith("<end_of_utterance>"):
            acc_text = acc_text[:-18]  # strip the trailing "<end_of_utterance>" token (18 characters)
        yield acc_text

    thread.join()
```

Now let's call the `model_inference` function we created and stream the values.

```python
generator = model_inference(
    user_prompt="And what is in this image?",
    chat_history=messages,
    max_new_tokens=100,
    images=images
)

for value in generator:
    print(value)

# In
# In this
# In this image ...
```

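The generator yields the accumulated text at each step, so if you prefer printing only the newly generated suffix (for example in a console UI), you can keep track of what has already been printed. A small sketch; note that a generator can only be consumed once, so re-create it before running this:

```python
previous = ""
for value in generator:
    print(value[len(previous):], end="", flush=True)  # print only the new part
    previous = value
print()
```
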
## Fit models in smaller hardware

VLMs are often large and need to be optimized to fit in smaller hardware. Transformers supports many model quantization libraries, and here we will only show int8 quantization with [Quanto](./quantization/quanto#quanto). int8 quantization offers memory improvements of up to 75 percent (if all weights are quantized). However, it is no free lunch: since 8-bit is not a CUDA-native precision, the weights are quantized back and forth on the fly, which adds latency.

First, install dependencies.

```bash
pip install -U quanto bitsandbytes
```

To quantize a model during loading, we need to first create [`QuantoConfig`]. Then load the model as usual, but pass `quantization_config` during model initialization.

```python
from transformers import Idefics2ForConditionalGeneration, AutoTokenizer, QuantoConfig

model_id = "HuggingFaceM4/idefics2-8b"
quantization_config = QuantoConfig(weights="int8")
quantized_model = Idefics2ForConditionalGeneration.from_pretrained(model_id, device_map="cuda", quantization_config=quantization_config)
```

And that's it: we can use the model the same way, with no changes.

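For example, reusing the processor, images, and chat messages prepared earlier, generation with the quantized model looks identical; a minimal sketch:

```python
prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
inputs = processor(text=prompt, images=[images[0], images[1]], return_tensors="pt").to("cuda")

with torch.no_grad():
    generated_ids = quantized_model.generate(**inputs, max_new_tokens=50)
print(processor.batch_decode(generated_ids, skip_special_tokens=True)[0])
```
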
## Further Reading

Here are some more resources for the image-text-to-text task.

- [Image-text-to-text task page](https://huggingface.co/tasks/image-text-to-text) covers model types, use cases, datasets, and more.
- [Vision Language Models Explained](https://huggingface.co/blog/vlms) is a blog post that covers everything about vision language models and supervised fine-tuning using [TRL](https://huggingface.co/docs/trl/en/index).

@ -157,7 +157,7 @@ Execution time -- 79.0 ms

Execution time -- 78.9 ms
```
The first call to `xla_generate()` is time-consuming because of tracing, but the successive calls are orders of magnitude faster. Keep in mind that any change in the generation options at any point with trigger re-tracing and thus leading to slow-downs in the generation time.
The first call to `xla_generate()` is time-consuming because of tracing, but the successive calls are orders of magnitude faster. Keep in mind that any change in the generation options at any point will trigger re-tracing, thus leading to slow-downs in the generation time.
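To make the re-tracing caveat concrete, here is a hedged sketch of how such an `xla_generate()` is typically built; the checkpoint and the fixed padding length are placeholder choices, and padding to a constant length is what keeps input shapes stable between calls:

```python
import tensorflow as tf
from transformers import AutoTokenizer, TFAutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2", padding_side="left", pad_token="</s>")
model = TFAutoModelForCausalLM.from_pretrained("openai-community/gpt2")

# Compile generate once; calls with the same input shapes and generation options reuse the traced graph.
xla_generate = tf.function(model.generate, jit_compile=True)

tokenized = tokenizer(["TensorFlow is"], padding="max_length", max_length=32, return_tensors="tf")
_ = xla_generate(**tokenized, max_new_tokens=16)  # first call: traces, slow
_ = xla_generate(**tokenized, max_new_tokens=16)  # same shapes and options: fast
_ = xla_generate(**tokenized, max_new_tokens=24)  # changed generation option: re-traces, slow again
```
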

We didn’t cover all the text generation options 🤗 Transformers provides in this document. We encourage you to read the documentation for advanced use cases.

@ -171,4 +171,4 @@ Here, we leave you with some additional resources if you want to delve deeper in
* Recommended posts for learning more about XLA and TensorFlow graphs in general:
    * [XLA: Optimizing Compiler for Machine Learning](https://www.tensorflow.org/xla)
    * [Introduction to graphs and tf.function](https://www.tensorflow.org/guide/intro_to_graphs)
    * [Better performance with tf.function](https://www.tensorflow.org/guide/function) 
    * [Better performance with tf.function](https://www.tensorflow.org/guide/function)

@ -278,7 +278,7 @@ args = TrainingArguments(
    max_steps=100,
    per_device_train_batch_size=2,
    optim="galore_adamw",
    optim_target_modules=["attn", "mlp"]
    optim_target_modules=[r".*.attn.*", r".*.mlp.*"]
)

model_id = "google/gemma-2b"
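For context, the surrounding example in this document roughly looks like the following; the dataset, the `trl.SFTTrainer` usage, and the output directory are illustrative assumptions rather than part of this change, and `galore_torch` plus `trl` need to be installed:

```python
import datasets
import trl
from transformers import TrainingArguments, AutoModelForCausalLM, AutoTokenizer

train_dataset = datasets.load_dataset("imdb", split="train")

args = TrainingArguments(
    output_dir="./galore-test",  # placeholder output directory
    max_steps=100,
    per_device_train_batch_size=2,
    optim="galore_adamw",
    optim_target_modules=[r".*.attn.*", r".*.mlp.*"],  # regex patterns, as updated above
)

model_id = "google/gemma-2b"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id)

trainer = trl.SFTTrainer(
    model=model,
    args=args,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    dataset_text_field="text",
    max_seq_length=512,
)
trainer.train()
```
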
@ -315,7 +315,7 @@ args = TrainingArguments(
    max_steps=100,
    per_device_train_batch_size=2,
    optim="galore_adamw",
    optim_target_modules=["attn", "mlp"],
    optim_target_modules=[r".*.attn.*", r".*.mlp.*"],
    optim_args="rank=64, update_proj_gap=100, scale=0.10",
)

@ -359,7 +359,7 @@ args = TrainingArguments(
    max_steps=100,
    per_device_train_batch_size=2,
    optim="galore_adamw_layerwise",
    optim_target_modules=["attn", "mlp"]
    optim_target_modules=[r".*.attn.*", r".*.mlp.*"]
)

model_id = "google/gemma-2b"

@ -220,7 +220,7 @@ La plantilla de chat para un modelo se almacena en el atributo `tokenizer.chat_t
 | 
			
		||||
>>> from transformers import AutoTokenizer
 | 
			
		||||
>>> tokenizer = AutoTokenizer.from_pretrained("facebook/blenderbot-400M-distill")
 | 
			
		||||
 | 
			
		||||
>>> tokenizer.default_chat_template
 | 
			
		||||
>>> tokenizer.chat_template
 | 
			
		||||
"{% for message in messages %}{% if message['role'] == 'user' %}{{ ' ' }}{% endif %}{{ message['content'] }}{% if not loop.last %}{{ '  ' }}{% endif %}{% endfor %}{{ eos_token }}"
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
@ -307,12 +307,6 @@ Si estás ajustando finamente un modelo para chat, además de establecer una pla
 | 
			
		||||
 | 
			
		||||
</Tip>
 | 
			
		||||
 | 
			
		||||
### ¿Qué son las plantillas "default"?
 | 
			
		||||
 | 
			
		||||
Antes de la introducción de las plantillas de chat, el manejo del chat estaba codificado en el nivel de la clase del modelo. Por razones de compatibilidad con versiones anteriores, hemos conservado este manejo específico de la clase como plantillas predeterminadas, también establecidas a nivel de clase. Si un modelo no tiene una plantilla de chat establecida, pero hay una plantilla predeterminada para su clase de modelo, la clase `TextGenerationPipeline` y métodos como `apply_chat_template` usarán la plantilla de clase en su lugar. Puedes averiguar cuál es la plantilla predeterminada para tu tokenizador comprobando el atributo `tokenizer.default_chat_template`.
 | 
			
		||||
 | 
			
		||||
Esto es algo que hacemos puramente por razones de compatibilidad con versiones anteriores, para evitar romper cualquier flujo de trabajo existente. Incluso cuando la plantilla de clase es apropiada para tu modelo, recomendamos encarecidamente anular la plantilla predeterminada estableciendo explícitamente el atributo `chat_template` para dejar claro a los usuarios que tu modelo ha sido configurado correctamente para el chat, y para estar preparados para el futuro en caso de que las plantillas predeterminadas alguna vez se alteren o se eliminen.
 | 
			
		||||
 | 
			
		||||
### ¿Qué plantilla debería usar?
 | 
			
		||||
 | 
			
		||||
Cuando establezcas la plantilla para un modelo que ya ha sido entrenado para chat, debes asegurarte de que la plantilla coincida exactamente con el formato de mensajes que el modelo vio durante el entrenamiento, o de lo contrario es probable que experimentes degradación del rendimiento. Esto es cierto incluso si estás entrenando aún más el modelo; probablemente obtendrás el mejor rendimiento si mantienes constantes los tokens de chat. Esto es muy análogo a la tokenización: generalmente obtienes el mejor rendimiento para la inferencia o el ajuste fino cuando coincides precisamente con la tokenización utilizada durante el entrenamiento.
 | 
			
		||||
 | 
			
		||||
@ -85,7 +85,7 @@ LLM(Language Model)のますます一般的な使用事例の1つは「チ
 | 
			
		||||
>>> from transformers import AutoTokenizer
 | 
			
		||||
>>> tokenizer = AutoTokenizer.from_pretrained("facebook/blenderbot-400M-distill")
 | 
			
		||||
 | 
			
		||||
>>> tokenizer.default_chat_template
 | 
			
		||||
>>> tokenizer.chat_template
 | 
			
		||||
"{% for message in messages %}{% if message['role'] == 'user' %}{{ ' ' }}{% endif %}{{ message['content'] }}{% if not loop.last %}{{ '  ' }}{% endif %}{% endfor %}{{ eos_token }}"
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@ -27,6 +27,8 @@
 | 
			
		||||
    title: 에이전트
 | 
			
		||||
  - local: llm_tutorial
 | 
			
		||||
    title: 대규모 언어 모델로 생성하기
 | 
			
		||||
  - local: in_translation
 | 
			
		||||
    title: (번역중)Chatting with Transformers
 | 
			
		||||
  title: 튜토리얼
 | 
			
		||||
- sections:
 | 
			
		||||
  - isExpanded: false
 | 
			
		||||
@ -131,21 +133,41 @@
 | 
			
		||||
    title: (번역중) Notebooks with examples
 | 
			
		||||
  - local: community
 | 
			
		||||
    title: 커뮤니티 리소스
 | 
			
		||||
  - local: custom_tools
 | 
			
		||||
    title: 사용자 정의 도구와 프롬프트
 | 
			
		||||
  - local: troubleshooting
 | 
			
		||||
    title: 문제 해결
 | 
			
		||||
  - local: in_translation
 | 
			
		||||
    title: (번역중) Contribute new quantization method
 | 
			
		||||
    title: (번역중) Interoperability with GGUF files
 | 
			
		||||
  title: (번역중) 개발자 가이드
 | 
			
		||||
- sections:
 | 
			
		||||
  - local: in_translation
 | 
			
		||||
    title: (번역중) Getting started
 | 
			
		||||
  - local: in_translation
 | 
			
		||||
    title: (번역중) bitsandbytes
 | 
			
		||||
  - local: in_translation
 | 
			
		||||
    title: (번역중) GPTQ
 | 
			
		||||
  - local: in_translation
 | 
			
		||||
    title: (번역중) AWQ
 | 
			
		||||
  - local: in_translation
 | 
			
		||||
    title: (번역중) AQLM
 | 
			
		||||
  - local: in_translation
 | 
			
		||||
    title: (번역중) Quanto
 | 
			
		||||
  - local: in_translation
 | 
			
		||||
    title: (번역중) EETQ
 | 
			
		||||
  - local: in_translation
 | 
			
		||||
    title: (번역중) HQQ
 | 
			
		||||
  - local: in_translation
 | 
			
		||||
    title: (번역중) Optimum
 | 
			
		||||
  - local: in_translation
 | 
			
		||||
    title: (번역중) Contribute new quantization method
 | 
			
		||||
  title: (번역중) 경량화 메소드
 | 
			
		||||
- sections:
 | 
			
		||||
  - local: performance
 | 
			
		||||
    title: 성능 및 확장성
 | 
			
		||||
  - local: in_translation
 | 
			
		||||
    title: (번역중) Quantization
 | 
			
		||||
    title: (번역중) LLM inference optimization
 | 
			
		||||
  - sections:
 | 
			
		||||
    - local: in_translation
 | 
			
		||||
      title: (번역중) Training on one GPU
 | 
			
		||||
      title: (번역중) Methods and tools for efficient training on a single GPU
 | 
			
		||||
    - local: perf_train_gpu_many
 | 
			
		||||
      title: 다중 GPU에서 훈련 진행하기
 | 
			
		||||
    - local: in_translation
 | 
			
		||||
@ -191,7 +213,7 @@
 | 
			
		||||
      title: 테스트
 | 
			
		||||
    - local: pr_checks
 | 
			
		||||
      title: Pull Request에 대한 검사
 | 
			
		||||
  title: (번역중) 기여하기
 | 
			
		||||
  title: 기여하기
 | 
			
		||||
- sections:
 | 
			
		||||
  - local: philosophy
 | 
			
		||||
    title: 이념과 목표
 | 
			
		||||
 | 
			
		||||
@ -1,22 +0,0 @@
 | 
			
		||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
 | 
			
		||||
 | 
			
		||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
 | 
			
		||||
the License. You may obtain a copy of the License at
 | 
			
		||||
 | 
			
		||||
http://www.apache.org/licenses/LICENSE-2.0
 | 
			
		||||
 | 
			
		||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
 | 
			
		||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
 | 
			
		||||
specific language governing permissions and limitations under the License.
 | 
			
		||||
-->
 | 
			
		||||
 | 
			
		||||
# 사용자 정의 도구와 프롬프트[[custom-tools-and-prompts]]
 | 
			
		||||
 | 
			
		||||
<Tip warning={true}>
 | 
			
		||||
 | 
			
		||||
The Agents framework has significantly changed in version v4.41.0.
 | 
			
		||||
This document has been removed as it was referencing an older API.
 | 
			
		||||
 | 
			
		||||
We eagerly welcome new contributions for the updated API.
 | 
			
		||||
 | 
			
		||||
</Tip>
 | 
			
		||||
@ -78,6 +78,8 @@
 | 
			
		||||
    title: 如何将流水线添加到 🤗 Transformers?
 | 
			
		||||
  title: 贡献
 | 
			
		||||
- sections:
 | 
			
		||||
  - local: philosophy
 | 
			
		||||
    title: Transformers的设计理念
 | 
			
		||||
  - local: task_summary
 | 
			
		||||
    title: 🤗Transformers能做什么
 | 
			
		||||
  - local: tokenizer_summary
 | 
			
		||||
 | 
			
		||||
@ -228,7 +228,7 @@ The sun.</s>
 | 
			
		||||
>>> from transformers import AutoTokenizer
 | 
			
		||||
>>> tokenizer = AutoTokenizer.from_pretrained("facebook/blenderbot-400M-distill")
 | 
			
		||||
 | 
			
		||||
>>> tokenizer.default_chat_template
 | 
			
		||||
>>> tokenizer.chat_template
 | 
			
		||||
"{% for message in messages %}{% if message['role'] == 'user' %}{{ ' ' }}{% endif %}{{ message['content'] }}{% if not loop.last %}{{ '  ' }}{% endif %}{% endfor %}{{ eos_token }}"
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||

docs/source/zh/philosophy.md (67 lines, new file)

@ -0,0 +1,67 @@
 | 
			
		||||
<!--Copyright 2020 The HuggingFace Team. All rights reserved.
 | 
			
		||||
 | 
			
		||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
 | 
			
		||||
the License. You may obtain a copy of the License at
 | 
			
		||||
 | 
			
		||||
http://www.apache.org/licenses/LICENSE-2.0
 | 
			
		||||
 | 
			
		||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
 | 
			
		||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
 | 
			
		||||
specific language governing permissions and limitations under the License.
 | 
			
		||||
 | 
			
		||||
⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
 | 
			
		||||
rendered properly in your Markdown viewer.
 | 
			
		||||
 | 
			
		||||
-->
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# Transformers 的设计理念
 | 
			
		||||
 | 
			
		||||
🤗 Transformers 是一个专为以下用户群体构建的库:
 | 
			
		||||
 | 
			
		||||
- 寻求使用、研究或扩展大规模 Transformers 模型的机器学习研究人员和教育者。
 | 
			
		||||
- 希望微调这些模型或在生产环境中使用它们(或两者兼而有之)的实际操作者。
 | 
			
		||||
- 只想下载预训练模型并将其用于解决给定机器学习任务的工程师。
 | 
			
		||||
 | 
			
		||||
Transformers 设计时有两个主要目标:
 | 
			
		||||
 | 
			
		||||
1. 尽可能简单快速地使用:
 | 
			
		||||
 | 
			
		||||
   - 我们尽可能地限制用户能接触的抽象层,实际上几乎没有抽象。用户只需学习三个标准类即可使用每个模型:[configuration](main_classes/configuration)、[models](main_classes/model) 和一个预处理类(用于 NLP 的 [tokenizer](main_classes/tokenizer),用于视觉的 [image processor](main_classes/image_processor),用于音频的 [feature extractor](main_classes/feature_extractor),以及用于多模态输入的 [processor](main_classes/processors))。
 | 
			
		||||
   - 所有这些类都可以通过一个通用的 `from_pretrained()` 方法从预训练实例中简单统一地初始化,该方法会从提供在 [Hugging Face Hub](https://huggingface.co/models) 上的预训练检查点(如果需要的话)下载、缓存和加载相关类实例及相关数据(配置的超参数、分词器的词汇表和模型的权重)。
 | 
			
		||||
   - 在这三个基本类之上,该库提供了两种 API:[`pipeline`] 用于快速在给定任务上使用模型进行推断,以及 [`Trainer`] 用于快速训练或微调 PyTorch 模型(所有 TensorFlow 模型与 `Keras.fit` 兼容)。
 | 
			
		||||
   - 因此,Transformers 不是神经网络的模块化工具箱。如果要基于 Transformers 扩展或搭建新项目,请使用常规的 Python、PyTorch、TensorFlow、Keras 模块,并从 Transformers 的基类继承以重用模型加载和保存等功能。如果想了解更多有关我们的模型代码的设计理念,请查看我们的[重复自己](https://huggingface.co/blog/transformers-design-philosophy)博文。
 | 
			
		||||
 | 
			
		||||
2. 提供与原始模型性能尽可能接近的最新模型:
 | 
			
		||||
 | 
			
		||||
   - 我们为每种架构提供至少一个示例,复现了该架构官方作者提供的结果。
 | 
			
		||||
   - 代码通常尽可能接近原始代码库,这意味着某些 PyTorch 代码可能不够*pytorchic*,因为它是转换后的 TensorFlow 代码,反之亦然。
 | 
			
		||||
 | 
			
		||||
其他几个目标:
 | 
			
		||||
 | 
			
		||||
- 尽可能一致地公开模型的内部:
 | 
			
		||||
 | 
			
		||||
   - 我们使用单一 API 提供对完整隐藏状态和注意力权重的访问。
 | 
			
		||||
   - 预处理类和基本模型 API 标准化,便于在不同模型之间轻松切换。
 | 
			
		||||
 | 
			
		||||
- 结合主观选择的有前途的工具进行模型微调和调查:
 | 
			
		||||
 | 
			
		||||
   - 简单一致的方法来向词汇表和嵌入中添加新标记以进行微调。
 | 
			
		||||
   - 简单的方法来屏蔽和修剪 Transformer 头部。
 | 
			
		||||
 | 
			
		||||
- 轻松在 PyTorch、TensorFlow 2.0 和 Flax 之间切换,允许使用一个框架进行训练并使用另一个进行推断。
 | 
			
		||||
 | 
			
		||||
## 主要概念
 | 
			
		||||
 | 
			
		||||
该库围绕每个模型的三类类构建:
 | 
			
		||||
 | 
			
		||||
- **模型类** 可以是 PyTorch 模型([torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module))、Keras 模型([tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model))或 JAX/Flax 模型([flax.linen.Module](https://flax.readthedocs.io/en/latest/api_reference/flax.linen/module.html)),这些模型可以使用库中提供的预训练权重。
 | 
			
		||||
- **配置类** 存储构建模型所需的超参数(如层数和隐藏大小)。通常情况下,如果您使用不进行任何修改的预训练模型,则创建模型将自动处理配置的实例化(配置是模型的一部分)。
 | 
			
		||||
- **预处理类** 将原始数据转换为模型可接受的格式。一个 [tokenizer](main_classes/tokenizer) 存储每个模型的词汇表,并提供编码和解码字符串为要馈送到模型的令牌嵌入索引列表的方法。[Image processors](main_classes/image_processor) 预处理视觉输入,[feature extractors](main_classes/feature_extractor) 预处理音频输入,而 [processor](main_classes/processors) 则处理多模态输入。
 | 
			
		||||
 | 
			
		||||
所有这些类都可以从预训练实例中实例化、本地保存,并通过以下三种方法与 Hub 共享:
 | 
			
		||||
 | 
			
		||||
- `from_pretrained()` 允许您从库自身提供的预训练版本(支持的模型可在 [Model Hub](https://huggingface.co/models) 上找到)或用户本地(或服务器上)存储的版本实例化模型、配置和预处理类。
 | 
			
		||||
- `save_pretrained()` 允许您本地保存模型、配置和预处理类,以便可以使用 `from_pretrained()` 重新加载。
 | 
			
		||||
- `push_to_hub()` 允许您将模型、配置和预处理类共享到 Hub,以便所有人都可以轻松访问。
 | 
			
		||||
@ -61,7 +61,7 @@ from transformers.utils import check_min_version, send_example_telemetry
 | 
			
		||||
logger = logging.getLogger(__name__)
 | 
			
		||||
 | 
			
		||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
 | 
			
		||||
check_min_version("4.43.0.dev0")
 | 
			
		||||
check_min_version("4.44.0.dev0")
 | 
			
		||||
 | 
			
		||||
Array = Any
 | 
			
		||||
Dataset = datasets.arrow_dataset.Dataset
 | 
			
		||||
 | 
			
		||||
@ -60,7 +60,7 @@ from transformers.utils.versions import require_version
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risk.
 | 
			
		||||
check_min_version("4.43.0.dev0")
 | 
			
		||||
check_min_version("4.44.0.dev0")
 | 
			
		||||
 | 
			
		||||
require_version("datasets>=2.14.0", "To fix: pip install -r examples/flax/speech-recognition/requirements.txt")
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@ -56,7 +56,7 @@ from transformers.utils import check_min_version, send_example_telemetry
 | 
			
		||||
 | 
			
		||||
logger = logging.getLogger(__name__)
 | 
			
		||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
 | 
			
		||||
check_min_version("4.43.0.dev0")
 | 
			
		||||
check_min_version("4.44.0.dev0")
 | 
			
		||||
 | 
			
		||||
Array = Any
 | 
			
		||||
Dataset = datasets.arrow_dataset.Dataset
 | 
			
		||||
@ -484,7 +484,7 @@ def main():
 | 
			
		||||
            label_to_id = {i: label_name_to_id[label_list[i]] for i in range(num_labels)}
 | 
			
		||||
        else:
 | 
			
		||||
            logger.warning(
 | 
			
		||||
                "Your model seems to have been trained with labels, but they don't match the dataset: ",
 | 
			
		||||
                "Your model seems to have been trained with labels, but they don't match the dataset: "
 | 
			
		||||
                f"model labels: {sorted(label_name_to_id.keys())}, dataset labels: {sorted(label_list)}."
 | 
			
		||||
                "\nIgnoring the model labels as a result.",
 | 
			
		||||
            )
 | 
			
		||||
 | 
			
		||||
@ -57,7 +57,7 @@ from transformers.utils.versions import require_version
 | 
			
		||||
 | 
			
		||||
logger = logging.getLogger(__name__)
 | 
			
		||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
 | 
			
		||||
check_min_version("4.43.0.dev0")
 | 
			
		||||
check_min_version("4.44.0.dev0")
 | 
			
		||||
 | 
			
		||||
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/token-classification/requirements.txt")
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@ -45,7 +45,7 @@ from transformers.utils.versions import require_version
 | 
			
		||||
logger = logging.getLogger(__name__)
 | 
			
		||||
 | 
			
		||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
 | 
			
		||||
check_min_version("4.43.0.dev0")
 | 
			
		||||
check_min_version("4.44.0.dev0")
 | 
			
		||||
 | 
			
		||||
require_version("datasets>=1.14.0", "To fix: pip install -r examples/pytorch/audio-classification/requirements.txt")
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@ -54,7 +54,7 @@ from transformers.utils.versions import require_version
 | 
			
		||||
logger = logging.getLogger(__name__)
 | 
			
		||||
 | 
			
		||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
 | 
			
		||||
check_min_version("4.43.0.dev0")
 | 
			
		||||
check_min_version("4.44.0.dev0")
 | 
			
		||||
 | 
			
		||||
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/contrastive-image-text/requirements.txt")
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@ -56,7 +56,7 @@ from transformers.utils.versions import require_version
 | 
			
		||||
logger = logging.getLogger(__name__)
 | 
			
		||||
 | 
			
		||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
 | 
			
		||||
check_min_version("4.43.0.dev0")
 | 
			
		||||
check_min_version("4.44.0.dev0")
 | 
			
		||||
 | 
			
		||||
require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/image-classification/requirements.txt")
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@ -49,7 +49,7 @@ from transformers.utils.versions import require_version
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
 | 
			
		||||
check_min_version("4.43.0.dev0")
 | 
			
		||||
check_min_version("4.44.0.dev0")
 | 
			
		||||
 | 
			
		||||
logger = get_logger(__name__)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@ -43,7 +43,7 @@ from transformers.utils.versions import require_version
 | 
			
		||||
logger = logging.getLogger(__name__)
 | 
			
		||||
 | 
			
		||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
 | 
			
		||||
check_min_version("4.43.0.dev0")
 | 
			
		||||
check_min_version("4.44.0.dev0")
 | 
			
		||||
 | 
			
		||||
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/image-pretraining/requirements.txt")
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@ -48,7 +48,7 @@ Any model supported by the AutoModelForMaskedImageModeling API can be used.
 | 
			
		||||
logger = logging.getLogger(__name__)
 | 
			
		||||
 | 
			
		||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
 | 
			
		||||
check_min_version("4.43.0.dev0")
 | 
			
		||||
check_min_version("4.44.0.dev0")
 | 
			
		||||
 | 
			
		||||
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/image-pretraining/requirements.txt")
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@ -53,7 +53,7 @@ Any model supported by the AutoModelForMaskedImageModeling API can be used.
 | 
			
		||||
logger = logging.getLogger(__name__)
 | 
			
		||||
 | 
			
		||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
 | 
			
		||||
check_min_version("4.43.0.dev0")
 | 
			
		||||
check_min_version("4.44.0.dev0")
 | 
			
		||||
 | 
			
		||||
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/image-pretraining/requirements.txt")
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@ -46,7 +46,7 @@ from transformers.utils.versions import require_version
 | 
			
		||||
logger = logging.getLogger(__name__)
 | 
			
		||||
 | 
			
		||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
 | 
			
		||||
check_min_version("4.43.0.dev0")
 | 
			
		||||
check_min_version("4.44.0.dev0")
 | 
			
		||||
 | 
			
		||||
require_version("datasets>=2.0.0", "To fix: pip install -r examples/pytorch/instance-segmentation/requirements.txt")
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@ -52,7 +52,7 @@ from transformers.utils.versions import require_version
 | 
			
		||||
logger = logging.getLogger(__name__)
 | 
			
		||||
 | 
			
		||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
 | 
			
		||||
check_min_version("4.43.0.dev0")
 | 
			
		||||
check_min_version("4.44.0.dev0")
 | 
			
		||||
 | 
			
		||||
require_version("datasets>=2.0.0", "To fix: pip install -r examples/pytorch/instance-segmentation/requirements.txt")
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@ -55,7 +55,7 @@ from transformers.utils.versions import require_version
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
 | 
			
		||||
check_min_version("4.43.0.dev0")
 | 
			
		||||
check_min_version("4.44.0.dev0")
 | 
			
		||||
 | 
			
		||||
require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt")
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@ -57,7 +57,7 @@ from transformers.utils.versions import require_version
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
 | 
			
		||||
check_min_version("4.43.0.dev0")
 | 
			
		||||
check_min_version("4.44.0.dev0")
 | 
			
		||||
 | 
			
		||||
logger = get_logger(__name__)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@ -58,7 +58,7 @@ from transformers.utils.versions import require_version
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
 | 
			
		||||
check_min_version("4.43.0.dev0")
 | 
			
		||||
check_min_version("4.44.0.dev0")
 | 
			
		||||
 | 
			
		||||
require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt")
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@ -60,7 +60,7 @@ from transformers.utils.versions import require_version
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
 | 
			
		||||
check_min_version("4.43.0.dev0")
 | 
			
		||||
check_min_version("4.44.0.dev0")
 | 
			
		||||
 | 
			
		||||
logger = get_logger(__name__)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@ -54,7 +54,7 @@ from transformers.utils.versions import require_version
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
 | 
			
		||||
check_min_version("4.43.0.dev0")
 | 
			
		||||
check_min_version("4.44.0.dev0")
 | 
			
		||||
 | 
			
		||||
require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt")
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@ -57,7 +57,7 @@ from transformers.utils.versions import require_version
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
 | 
			
		||||
check_min_version("4.43.0.dev0")
 | 
			
		||||
check_min_version("4.44.0.dev0")
 | 
			
		||||
 | 
			
		||||
logger = get_logger(__name__)
 | 
			
		||||
require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt")
 | 
			
		||||
 | 
			
		||||
@ -47,7 +47,7 @@ from transformers.utils.versions import require_version
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
 | 
			
		||||
check_min_version("4.43.0.dev0")
 | 
			
		||||
check_min_version("4.44.0.dev0")
 | 
			
		||||
 | 
			
		||||
require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt")
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@ -47,7 +47,7 @@ from transformers.utils import PaddingStrategy, check_min_version, send_example_
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
 | 
			
		||||
check_min_version("4.43.0.dev0")
 | 
			
		||||
check_min_version("4.44.0.dev0")
 | 
			
		||||
 | 
			
		||||
logger = logging.getLogger(__name__)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@ -56,7 +56,7 @@ from transformers.utils import PaddingStrategy, check_min_version, send_example_
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
 | 
			
		||||
check_min_version("4.43.0.dev0")
 | 
			
		||||
check_min_version("4.44.0.dev0")
 | 
			
		||||
 | 
			
		||||
logger = get_logger(__name__)
 | 
			
		||||
# You should update this to your particular problem to have better documentation of `model_type`
 | 
			
		||||
 | 
			
		||||
@ -48,7 +48,7 @@ from transformers.utils.versions import require_version
 | 
			
		||||
logger = logging.getLogger(__name__)
 | 
			
		||||
 | 
			
		||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
 | 
			
		||||
check_min_version("4.43.0.dev0")
 | 
			
		||||
check_min_version("4.44.0.dev0")
 | 
			
		||||
 | 
			
		||||
require_version("datasets>=2.0.0", "To fix: pip install -r examples/pytorch/object-detection/requirements.txt")
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@ -51,7 +51,7 @@ from transformers.utils.versions import require_version
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
 | 
			
		||||
check_min_version("4.43.0.dev0")
 | 
			
		||||
check_min_version("4.44.0.dev0")
 | 
			
		||||
 | 
			
		||||
logging.basicConfig(level=logging.INFO)
 | 
			
		||||
logger = get_logger(__name__)
 | 
			
		||||
 | 
			
		||||
@ -50,7 +50,7 @@ from transformers.utils.versions import require_version
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
 | 
			
		||||
check_min_version("4.43.0.dev0")
 | 
			
		||||
check_min_version("4.44.0.dev0")
 | 
			
		||||
 | 
			
		||||
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt")
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@ -48,7 +48,7 @@ from transformers.utils.versions import require_version
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
 | 
			
		||||
check_min_version("4.43.0.dev0")
 | 
			
		||||
check_min_version("4.44.0.dev0")
 | 
			
		||||
 | 
			
		||||
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt")
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@ -56,7 +56,7 @@ from transformers.utils.versions import require_version
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
 | 
			
		||||
check_min_version("4.43.0.dev0")
 | 
			
		||||
check_min_version("4.44.0.dev0")
 | 
			
		||||
 | 
			
		||||
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt")
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@ -57,7 +57,7 @@ from transformers.utils.versions import require_version
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
 | 
			
		||||
check_min_version("4.43.0.dev0")
 | 
			
		||||
check_min_version("4.44.0.dev0")
 | 
			
		||||
 | 
			
		||||
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt")
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@ -46,7 +46,7 @@ from transformers.utils.versions import require_version
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
 | 
			
		||||
check_min_version("4.43.0.dev0")
 | 
			
		||||
check_min_version("4.44.0.dev0")
 | 
			
		||||
 | 
			
		||||
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt")
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@ -51,7 +51,7 @@ from transformers.utils.versions import require_version
 | 
			
		||||
logger = logging.getLogger(__name__)
 | 
			
		||||
 | 
			
		||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
 | 
			
		||||
check_min_version("4.43.0.dev0")
 | 
			
		||||
check_min_version("4.44.0.dev0")
 | 
			
		||||
 | 
			
		||||
require_version("datasets>=2.0.0", "To fix: pip install -r examples/pytorch/semantic-segmentation/requirements.txt")
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@ -50,7 +50,7 @@ from transformers.utils.versions import require_version
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
 | 
			
		||||
check_min_version("4.43.0.dev0")
 | 
			
		||||
check_min_version("4.44.0.dev0")
 | 
			
		||||
 | 
			
		||||
logger = get_logger(__name__)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@ -50,7 +50,7 @@ from transformers.utils.versions import require_version
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
 | 
			
		||||
check_min_version("4.43.0.dev0")
 | 
			
		||||
check_min_version("4.44.0.dev0")
 | 
			
		||||
 | 
			
		||||
require_version("datasets>=1.18.0", "To fix: pip install -r examples/pytorch/speech-recognition/requirements.txt")
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@ -53,7 +53,7 @@ from transformers.utils.versions import require_version
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
 | 
			
		||||
check_min_version("4.43.0.dev0")
 | 
			
		||||
check_min_version("4.44.0.dev0")
 | 
			
		||||
 | 
			
		||||
require_version("datasets>=1.18.0", "To fix: pip install -r examples/pytorch/speech-recognition/requirements.txt")
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@ -48,7 +48,7 @@ from transformers.utils.versions import require_version
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
 | 
			
		||||
check_min_version("4.43.0.dev0")
 | 
			
		||||
check_min_version("4.44.0.dev0")
 | 
			
		||||
 | 
			
		||||
require_version("datasets>=1.18.0", "To fix: pip install -r examples/pytorch/speech-recognition/requirements.txt")
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@ -52,7 +52,7 @@ from transformers.utils.versions import require_version
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
 | 
			
		||||
check_min_version("4.43.0.dev0")
 | 
			
		||||
check_min_version("4.44.0.dev0")
 | 
			
		||||
 | 
			
		||||
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/summarization/requirements.txt")
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@ -56,7 +56,7 @@ from transformers.utils.versions import require_version
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
 | 
			
		||||
check_min_version("4.43.0.dev0")
 | 
			
		||||
check_min_version("4.44.0.dev0")
 | 
			
		||||
 | 
			
		||||
logger = get_logger(__name__)
 | 
			
		||||
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/summarization/requirements.txt")
 | 
			
		||||
 | 
			
		||||
@ -47,7 +47,7 @@ from transformers.utils.versions import require_version
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
 | 
			
		||||
check_min_version("4.43.0.dev0")
 | 
			
		||||
check_min_version("4.44.0.dev0")
 | 
			
		||||
 | 
			
		||||
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/text-classification/requirements.txt")
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@ -48,7 +48,7 @@ from transformers.utils.versions import require_version
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
 | 
			
		||||
check_min_version("4.43.0.dev0")
 | 
			
		||||
check_min_version("4.44.0.dev0")
 | 
			
		||||
 | 
			
		||||
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/text-classification/requirements.txt")
 | 
			
		||||
 | 
			
		||||
@ -428,7 +428,7 @@ def main():
 | 
			
		||||
            label_to_id = {i: int(label_name_to_id[label_list[i]]) for i in range(num_labels)}
 | 
			
		||||
        else:
 | 
			
		||||
            logger.warning(
 | 
			
		||||
                "Your model seems to have been trained with labels, but they don't match the dataset: ",
 | 
			
		||||
                "Your model seems to have been trained with labels, but they don't match the dataset: "
 | 
			
		||||
                f"model labels: {sorted(label_name_to_id.keys())}, dataset labels: {sorted(label_list)}."
 | 
			
		||||
                "\nIgnoring the model labels as a result.",
 | 
			
		||||
            )
 | 
			
		||||
 | 
			
		||||
@ -49,7 +49,7 @@ from transformers.utils.versions import require_version
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
 | 
			
		||||
check_min_version("4.43.0.dev0")
 | 
			
		||||
check_min_version("4.44.0.dev0")
 | 
			
		||||
 | 
			
		||||
logger = get_logger(__name__)
 | 
			
		||||
 | 
			
		||||
@ -370,7 +370,7 @@ def main():
 | 
			
		||||
            label_to_id = {i: label_name_to_id[label_list[i]] for i in range(num_labels)}
 | 
			
		||||
        else:
 | 
			
		||||
            logger.warning(
 | 
			
		||||
                "Your model seems to have been trained with labels, but they don't match the dataset: ",
 | 
			
		||||
                "Your model seems to have been trained with labels, but they don't match the dataset: "
 | 
			
		||||
                f"model labels: {sorted(label_name_to_id.keys())}, dataset labels: {sorted(label_list)}."
 | 
			
		||||
                "\nIgnoring the model labels as a result.",
 | 
			
		||||
            )
 | 
			
		||||
 | 
			
		||||
@ -48,7 +48,7 @@ from transformers.utils.versions import require_version
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
 | 
			
		||||
check_min_version("4.43.0.dev0")
 | 
			
		||||
check_min_version("4.44.0.dev0")
 | 
			
		||||
 | 
			
		||||
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/text-classification/requirements.txt")
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@ -49,7 +49,7 @@ from transformers.utils.versions import require_version
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
 | 
			
		||||
check_min_version("4.43.0.dev0")
 | 
			
		||||
check_min_version("4.44.0.dev0")
 | 
			
		||||
 | 
			
		||||
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/token-classification/requirements.txt")
 | 
			
		||||
 | 
			
		||||
@ -417,7 +417,7 @@ def main():
 | 
			
		||||
                label_to_id = {l: i for i, l in enumerate(label_list)}
 | 
			
		||||
        else:
 | 
			
		||||
            logger.warning(
 | 
			
		||||
                "Your model seems to have been trained with labels, but they don't match the dataset: ",
 | 
			
		||||
                "Your model seems to have been trained with labels, but they don't match the dataset: "
 | 
			
		||||
                f"model labels: {sorted(model.config.label2id.keys())}, dataset labels:"
 | 
			
		||||
                f" {sorted(label_list)}.\nIgnoring the model labels as a result.",
 | 
			
		||||
            )
 | 
			
		||||
 | 
			
		||||
@ -56,7 +56,7 @@ from transformers.utils.versions import require_version
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
 | 
			
		||||
check_min_version("4.43.0.dev0")
 | 
			
		||||
check_min_version("4.44.0.dev0")
 | 
			
		||||
 | 
			
		||||
logger = get_logger(__name__)
 | 
			
		||||
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/token-classification/requirements.txt")
 | 
			
		||||
@ -458,7 +458,7 @@ def main():
 | 
			
		||||
                label_to_id = {l: i for i, l in enumerate(label_list)}
 | 
			
		||||
        else:
 | 
			
		||||
            logger.warning(
 | 
			
		||||
                "Your model seems to have been trained with labels, but they don't match the dataset: ",
 | 
			
		||||
                "Your model seems to have been trained with labels, but they don't match the dataset: "
 | 
			
		||||
                f"model labels: {sorted(model.config.label2id.keys())}, dataset labels:"
 | 
			
		||||
                f" {sorted(label_list)}.\nIgnoring the model labels as a result.",
 | 
			
		||||
            )
 | 
			
		||||
 | 
			
		||||
@ -52,7 +52,7 @@ from transformers.utils.versions import require_version
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
 | 
			
		||||
check_min_version("4.43.0.dev0")
 | 
			
		||||
check_min_version("4.44.0.dev0")
 | 
			
		||||
 | 
			
		||||
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/translation/requirements.txt")
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@ -57,7 +57,7 @@ from transformers.utils.versions import require_version
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
 | 
			
		||||
check_min_version("4.43.0.dev0")
 | 
			
		||||
check_min_version("4.44.0.dev0")
 | 
			
		||||
 | 
			
		||||
logger = get_logger(__name__)
 | 
			
		||||
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/translation/requirements.txt")
 | 
			
		||||
 | 
			
		||||
@ -557,7 +557,7 @@ class MultiHeadedAttention(nn.Module):
 | 
			
		||||
            return context
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class DecoderState(object):
 | 
			
		||||
class DecoderState:
 | 
			
		||||
    """Interface for grouping together the current state of a recurrent
 | 
			
		||||
    decoder. In the simplest case just represents the hidden state of
 | 
			
		||||
    the model.  But can also be used for implementing various forms of
 | 
			
		||||
@ -694,7 +694,7 @@ def build_predictor(args, tokenizer, symbols, model, logger=None):
 | 
			
		||||
    return translator
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class GNMTGlobalScorer(object):
 | 
			
		||||
class GNMTGlobalScorer:
 | 
			
		||||
    """
 | 
			
		||||
    NMT re-ranking score from
 | 
			
		||||
    "Google's Neural Machine Translation System" :cite:`wu2016google`
 | 
			
		||||
@ -717,7 +717,7 @@ class GNMTGlobalScorer(object):
 | 
			
		||||
        return normalized_probs
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class PenaltyBuilder(object):
 | 
			
		||||
class PenaltyBuilder:
 | 
			
		||||
    """
 | 
			
		||||
    Returns the Length and Coverage Penalty function for Beam Search.
 | 
			
		||||
 | 
			
		||||
@ -763,7 +763,7 @@ class PenaltyBuilder(object):
 | 
			
		||||
        return logprobs
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class Translator(object):
 | 
			
		||||
class Translator:
 | 
			
		||||
    """
 | 
			
		||||
    Uses a model to translate a batch of sentences.
 | 
			
		||||
 | 
			
		||||
@ -1002,7 +1002,7 @@ def tile(x, count, dim=0):
 | 
			
		||||
#
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class BertSumOptimizer(object):
 | 
			
		||||
class BertSumOptimizer:
 | 
			
		||||
    """Specific optimizer for BertSum.
 | 
			
		||||
 | 
			
		||||
    As described in [1], the authors fine-tune BertSum for abstractive
 | 
			
		||||
 | 
			
		||||
@ -2,4 +2,4 @@ datasets==2.3.2
 | 
			
		||||
transformers==4.38.0
 | 
			
		||||
wandb==0.13.1
 | 
			
		||||
evaluate==0.2.2
 | 
			
		||||
scikit-learn==1.1.2
 | 
			
		||||
scikit-learn==1.5.0
 | 
			
		||||
@ -187,7 +187,7 @@ rsa==4.8
 | 
			
		||||
s3transfer==0.3.7
 | 
			
		||||
sacrebleu==1.5.1
 | 
			
		||||
sacremoses==0.0.49
 | 
			
		||||
scikit-learn==1.0.2
 | 
			
		||||
scikit-learn==1.5.0
 | 
			
		||||
scipy==1.8.0
 | 
			
		||||
segments==2.2.0
 | 
			
		||||
sentencepiece==0.1.96
 | 
			
		||||
 | 
			
		||||
@ -59,7 +59,7 @@ class GroupedBatchSampler(BatchSampler):
 | 
			
		||||
 | 
			
		||||
    def __init__(self, sampler, group_ids, batch_size):
 | 
			
		||||
        if not isinstance(sampler, Sampler):
 | 
			
		||||
            raise ValueError(
 | 
			
		||||
            raise TypeError(
 | 
			
		||||
                "sampler should be an instance of torch.utils.data.Sampler, but got sampler={}".format(sampler)
 | 
			
		||||
            )
 | 
			
		||||
        self.sampler = sampler
 | 
			
		||||
 | 
			
		||||
@ -3,7 +3,7 @@ import torch
 | 
			
		||||
from transformers import AutoTokenizer
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class FSNERTokenizerUtils(object):
 | 
			
		||||
class FSNERTokenizerUtils:
 | 
			
		||||
    def __init__(self, pretrained_model_name_or_path):
 | 
			
		||||
        self.tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@ -417,7 +417,7 @@ class ShapeSpec(namedtuple("_ShapeSpec", ["channels", "height", "width", "stride
 | 
			
		||||
        return super().__new__(cls, channels, height, width, stride)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class Box2BoxTransform(object):
 | 
			
		||||
class Box2BoxTransform:
 | 
			
		||||
    """
 | 
			
		||||
    This R-CNN transformation scales the box's width and height
 | 
			
		||||
    by exp(dw), exp(dh) and shifts a box's center by the offset
 | 
			
		||||
@ -519,7 +519,7 @@ class Box2BoxTransform(object):
 | 
			
		||||
        return pred_boxes
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class Matcher(object):
 | 
			
		||||
class Matcher:
 | 
			
		||||
    """
 | 
			
		||||
    This class assigns to each predicted "element" (e.g., a box) a ground-truth
 | 
			
		||||
    element. Each predicted element will have exactly zero or one matches; each
 | 
			
		||||
@ -622,7 +622,7 @@ class Matcher(object):
 | 
			
		||||
        match_labels[pred_inds_with_highest_quality] = 1
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class RPNOutputs(object):
 | 
			
		||||
class RPNOutputs:
 | 
			
		||||
    def __init__(
 | 
			
		||||
        self,
 | 
			
		||||
        box2box_transform,
 | 
			
		||||
@ -1132,7 +1132,7 @@ class ROIPooler(nn.Module):
 | 
			
		||||
        return output
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class ROIOutputs(object):
 | 
			
		||||
class ROIOutputs:
 | 
			
		||||
    def __init__(self, cfg, training=False):
 | 
			
		||||
        self.smooth_l1_beta = cfg.ROI_BOX_HEAD.SMOOTH_L1_BETA
 | 
			
		||||
        self.box2box_transform = Box2BoxTransform(weights=cfg.ROI_BOX_HEAD.BBOX_REG_WEIGHTS)
 | 
			
		||||
 | 
			
		||||
Some files were not shown because too many files have changed in this diff.