Compare commits

..

11 Commits

209 changed files with 3148 additions and 18167 deletions

View File

@ -3,7 +3,7 @@ name: Build docker images (scheduled)
on:
push:
branches:
- flex_attn_example
- build_ci_docker_image*
repository_dispatch:
workflow_call:
inputs:
@ -43,4 +43,351 @@ jobs:
build-args: |
REF=main
push: true
tags: huggingface/transformers-all-latest-gpu${{ inputs.image_postfix }}-test
tags: huggingface/transformers-all-latest-gpu${{ inputs.image_postfix }}
# Push CI images still need to be re-built daily
-
name: Build and push (for Push CI) in a daily basis
# This condition allows `schedule` events, or `push` events that trigger this workflow NOT via `workflow_call`.
# The latter case is useful for building images manually for debugging purposes. Use another tag in this case!
if: inputs.image_postfix != '-push-ci'
uses: docker/build-push-action@v5
with:
context: ./docker/transformers-all-latest-gpu
build-args: |
REF=main
push: true
tags: huggingface/transformers-all-latest-gpu-push-ci
- name: Post to Slack
if: always()
uses: huggingface/hf-workflows/.github/actions/post-slack@main
with:
slack_channel: ${{ secrets.CI_SLACK_CHANNEL_DOCKER }}
title: 🤗 Results of the transformers-all-latest-gpu-push-ci docker build
status: ${{ job.status }}
slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
# Job: builds the image from ./docker/transformers-pytorch-deepspeed-latest-gpu (build arg REF=main) and pushes it
# as huggingface/transformers-pytorch-deepspeed-latest-gpu, with `inputs.image_postfix` appended to the tag.
# The final step passes the job status to the post-slack action.
latest-torch-deepspeed-docker:
name: "Latest PyTorch + DeepSpeed"
runs-on:
group: aws-general-8-plus
steps:
-
name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
-
name: Check out code
uses: actions/checkout@v4
-
name: Login to DockerHub
uses: docker/login-action@v3
with:
username: ${{ secrets.DOCKERHUB_USERNAME }}
password: ${{ secrets.DOCKERHUB_PASSWORD }}
-
name: Build and push
uses: docker/build-push-action@v5
with:
context: ./docker/transformers-pytorch-deepspeed-latest-gpu
build-args: |
REF=main
push: true
tags: huggingface/transformers-pytorch-deepspeed-latest-gpu${{ inputs.image_postfix }}
# `if: always()` keeps this step running even when an earlier step failed, so failures are reported as well.
- name: Post to Slack
if: always()
uses: huggingface/hf-workflows/.github/actions/post-slack@main
with:
slack_channel: ${{ secrets.CI_SLACK_CHANNEL_DOCKER}}
title: 🤗 Results of the transformers-pytorch-deepspeed-latest-gpu docker build
status: ${{ job.status }}
slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
# Can't build 2 images in a single job `latest-torch-deepspeed-docker` (for `nvcr.io/nvidia`),
# so the `-push-ci` variant of the PyTorch + DeepSpeed image gets its own job.
latest-torch-deepspeed-docker-for-push-ci-daily-build:
name: "Latest PyTorch + DeepSpeed (Push CI - Daily Build)"
runs-on:
group: aws-general-8-plus
steps:
-
name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
-
name: Check out code
uses: actions/checkout@v4
-
name: Login to DockerHub
uses: docker/login-action@v3
with:
username: ${{ secrets.DOCKERHUB_USERNAME }}
password: ${{ secrets.DOCKERHUB_PASSWORD }}
# Push CI images still need to be re-built daily
-
name: Build and push (for Push CI) in a daily basis
# This condition allows `schedule` events, or `push` events that trigger this workflow NOT via `workflow_call`.
# The latter case is useful for building images manually for debugging purposes. Use another tag in this case!
if: inputs.image_postfix != '-push-ci'
uses: docker/build-push-action@v5
with:
context: ./docker/transformers-pytorch-deepspeed-latest-gpu
build-args: |
REF=main
push: true
tags: huggingface/transformers-pytorch-deepspeed-latest-gpu-push-ci
# `if: always()` keeps this step running even when an earlier step failed, so failures are reported as well.
- name: Post to Slack
if: always()
uses: huggingface/hf-workflows/.github/actions/post-slack@main
with:
slack_channel: ${{ secrets.CI_SLACK_CHANNEL_DOCKER }}
title: 🤗 Results of the transformers-pytorch-deepspeed-latest-gpu-push-ci docker build
status: ${{ job.status }}
slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
# Job: builds the image from ./docker/transformers-doc-builder and pushes it as
# huggingface/transformers-doc-builder (no build args, no tag postfix). The final step passes the job status
# to the post-slack action.
doc-builder:
name: "Doc builder"
# Push CI doesn't need this image
if: inputs.image_postfix != '-push-ci'
runs-on:
group: aws-general-8-plus
steps:
-
name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
-
name: Check out code
uses: actions/checkout@v4
-
name: Login to DockerHub
uses: docker/login-action@v3
with:
username: ${{ secrets.DOCKERHUB_USERNAME }}
password: ${{ secrets.DOCKERHUB_PASSWORD }}
-
name: Build and push
uses: docker/build-push-action@v5
with:
context: ./docker/transformers-doc-builder
push: true
tags: huggingface/transformers-doc-builder
# `if: always()` keeps this step running even when an earlier step failed, so failures are reported as well.
- name: Post to Slack
if: always()
uses: huggingface/hf-workflows/.github/actions/post-slack@main
with:
slack_channel: ${{ secrets.CI_SLACK_CHANNEL_DOCKER }}
title: 🤗 Results of the huggingface/transformers-doc-builder docker build
status: ${{ job.status }}
slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
# Job: builds the image from ./docker/transformers-pytorch-gpu (build arg REF=main) and pushes it as
# huggingface/transformers-pytorch-gpu. The final step passes the job status to the post-slack action.
latest-pytorch:
name: "Latest PyTorch [dev]"
# Push CI doesn't need this image
if: inputs.image_postfix != '-push-ci'
runs-on:
group: aws-general-8-plus
steps:
-
name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
-
name: Check out code
uses: actions/checkout@v4
-
name: Login to DockerHub
uses: docker/login-action@v3
with:
username: ${{ secrets.DOCKERHUB_USERNAME }}
password: ${{ secrets.DOCKERHUB_PASSWORD }}
-
name: Build and push
uses: docker/build-push-action@v5
with:
context: ./docker/transformers-pytorch-gpu
build-args: |
REF=main
push: true
tags: huggingface/transformers-pytorch-gpu
# `if: always()` keeps this step running even when an earlier step failed, so failures are reported as well.
- name: Post to Slack
if: always()
uses: huggingface/hf-workflows/.github/actions/post-slack@main
with:
slack_channel: ${{ secrets.CI_SLACK_CHANNEL_DOCKER }}
# The image name and the word "docker" are separate words (the original title fused them as "gpudocker").
title: 🤗 Results of the huggingface/transformers-pytorch-gpu docker build
status: ${{ job.status }}
slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
# Job: builds the image from ./docker/transformers-pytorch-amd-gpu (build arg REF=main) up to twice:
# once tagged with `inputs.image_postfix`, and once tagged `-push-ci` (second build step, conditional).
# The final step passes the job status to the post-slack action.
latest-pytorch-amd:
name: "Latest PyTorch (AMD) [dev]"
runs-on:
group: aws-general-8-plus
steps:
-
name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
-
name: Check out code
uses: actions/checkout@v4
-
name: Login to DockerHub
uses: docker/login-action@v3
with:
username: ${{ secrets.DOCKERHUB_USERNAME }}
password: ${{ secrets.DOCKERHUB_PASSWORD }}
-
name: Build and push
uses: docker/build-push-action@v5
with:
context: ./docker/transformers-pytorch-amd-gpu
build-args: |
REF=main
push: true
tags: huggingface/transformers-pytorch-amd-gpu${{ inputs.image_postfix }}
# Push CI images still need to be re-built daily
-
name: Build and push (for Push CI) in a daily basis
# This condition allows `schedule` events, or `push` events that trigger this workflow NOT via `workflow_call`.
# The latter case is useful for building images manually for debugging purposes. Use another tag in this case!
# When `inputs.image_postfix` is '-push-ci', the previous step already pushes the `-push-ci` tag, so this step is skipped.
if: inputs.image_postfix != '-push-ci'
uses: docker/build-push-action@v5
with:
context: ./docker/transformers-pytorch-amd-gpu
build-args: |
REF=main
push: true
tags: huggingface/transformers-pytorch-amd-gpu-push-ci
# `if: always()` keeps this step running even when an earlier step failed, so failures are reported as well.
# NOTE(review): the title names only the `-push-ci` image although this job also builds the image tagged with
# `inputs.image_postfix` above — confirm this is intended.
- name: Post to Slack
if: always()
uses: huggingface/hf-workflows/.github/actions/post-slack@main
with:
slack_channel: ${{ secrets.CI_SLACK_CHANNEL_DOCKER }}
title: 🤗 Results of the huggingface/transformers-pytorch-amd-gpu-push-ci build
status: ${{ job.status }}
slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
# Job: builds the image from ./docker/transformers-tensorflow-gpu (build arg REF=main) and pushes it as
# huggingface/transformers-tensorflow-gpu. The final step passes the job status to the post-slack action.
latest-tensorflow:
name: "Latest TensorFlow [dev]"
# Push CI doesn't need this image
if: inputs.image_postfix != '-push-ci'
runs-on:
group: aws-general-8-plus
steps:
-
name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
-
name: Check out code
uses: actions/checkout@v4
-
name: Login to DockerHub
uses: docker/login-action@v3
with:
username: ${{ secrets.DOCKERHUB_USERNAME }}
password: ${{ secrets.DOCKERHUB_PASSWORD }}
-
name: Build and push
uses: docker/build-push-action@v5
with:
context: ./docker/transformers-tensorflow-gpu
build-args: |
REF=main
push: true
tags: huggingface/transformers-tensorflow-gpu
# `if: always()` keeps this step running even when an earlier step failed, so failures are reported as well.
- name: Post to Slack
if: always()
uses: huggingface/hf-workflows/.github/actions/post-slack@main
with:
slack_channel: ${{ secrets.CI_SLACK_CHANNEL_DOCKER }}
title: 🤗 Results of the huggingface/transformers-tensorflow-gpu build
status: ${{ job.status }}
slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
# Job: builds the image from ./docker/transformers-pytorch-deepspeed-amd-gpu (build arg REF=main) up to twice:
# once tagged with `inputs.image_postfix`, and once tagged `-push-ci` (second build step, conditional).
# The final step passes the job status to the post-slack action.
latest-pytorch-deepspeed-amd:
name: "PyTorch + DeepSpeed (AMD) [dev]"
runs-on:
group: aws-general-8-plus
steps:
-
name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
-
name: Check out code
uses: actions/checkout@v4
-
name: Login to DockerHub
uses: docker/login-action@v3
with:
username: ${{ secrets.DOCKERHUB_USERNAME }}
password: ${{ secrets.DOCKERHUB_PASSWORD }}
-
name: Build and push
uses: docker/build-push-action@v5
with:
context: ./docker/transformers-pytorch-deepspeed-amd-gpu
build-args: |
REF=main
push: true
tags: huggingface/transformers-pytorch-deepspeed-amd-gpu${{ inputs.image_postfix }}
# Push CI images still need to be re-built daily
-
name: Build and push (for Push CI) in a daily basis
# This condition allows `schedule` events, or `push` events that trigger this workflow NOT via `workflow_call`.
# The latter case is useful for building images manually for debugging purposes. Use another tag in this case!
# When `inputs.image_postfix` is '-push-ci', the previous step already pushes the `-push-ci` tag, so this step is skipped.
if: inputs.image_postfix != '-push-ci'
uses: docker/build-push-action@v5
with:
context: ./docker/transformers-pytorch-deepspeed-amd-gpu
build-args: |
REF=main
push: true
tags: huggingface/transformers-pytorch-deepspeed-amd-gpu-push-ci
# `if: always()` keeps this step running even when an earlier step failed, so failures are reported as well.
- name: Post to Slack
if: always()
uses: huggingface/hf-workflows/.github/actions/post-slack@main
with:
slack_channel: ${{ secrets.CI_SLACK_CHANNEL_DOCKER }}
title: 🤗 Results of the transformers-pytorch-deepspeed-amd-gpu build
status: ${{ job.status }}
slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
# Job: builds the image from ./docker/transformers-quantization-latest-gpu (build arg REF=main) and pushes it as
# huggingface/transformers-quantization-latest-gpu, with `inputs.image_postfix` appended to the tag.
# The final step passes the job status to the post-slack action.
latest-quantization-torch-docker:
name: "Latest Pytorch + Quantization [dev]"
# Push CI doesn't need this image
if: inputs.image_postfix != '-push-ci'
runs-on:
group: aws-general-8-plus
steps:
-
name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
-
name: Check out code
uses: actions/checkout@v4
-
name: Login to DockerHub
uses: docker/login-action@v3
with:
username: ${{ secrets.DOCKERHUB_USERNAME }}
password: ${{ secrets.DOCKERHUB_PASSWORD }}
-
name: Build and push
uses: docker/build-push-action@v5
with:
context: ./docker/transformers-quantization-latest-gpu
build-args: |
REF=main
push: true
tags: huggingface/transformers-quantization-latest-gpu${{ inputs.image_postfix }}
# `if: always()` keeps this step running even when an earlier step failed, so failures are reported as well.
- name: Post to Slack
if: always()
uses: huggingface/hf-workflows/.github/actions/post-slack@main
with:
slack_channel: ${{ secrets.CI_SLACK_CHANNEL_DOCKER }}
title: 🤗 Results of the transformers-quantization-latest-gpu build
status: ${{ job.status }}
slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}

View File

@ -1,4 +1,4 @@
FROM nvidia/cuda:12.4.1-cudnn-devel-ubuntu22.04
FROM nvidia/cuda:12.1.0-cudnn8-devel-ubuntu20.04
LABEL maintainer="Hugging Face"
ARG DEBIAN_FRONTEND=noninteractive
@ -6,6 +6,14 @@ ARG DEBIAN_FRONTEND=noninteractive
# Use login shell to read variables from `~/.profile` (to pass dynamic created variables between RUN commands)
SHELL ["sh", "-lc"]
# The following `ARG` are mainly used to specify the versions explicitly & directly in this docker file, and not meant
# to be used as arguments for docker build (so far).
ARG PYTORCH='2.4.0'
# (not always a valid torch version)
ARG INTEL_TORCH_EXT='2.3.0'
# Example: `cu102`, `cu113`, etc.
ARG CUDA='cu121'
RUN apt update
RUN apt install -y git libsndfile1-dev tesseract-ocr espeak-ng python3 python3-pip ffmpeg git-lfs
@ -18,8 +26,44 @@ RUN git clone https://github.com/huggingface/transformers && cd transformers &&
# 1. Put several commands in a single `RUN` to avoid image/layer exporting issue. Could be revised in the future.
# 2. Regarding `torch` part, We might need to specify proper versions for `torchvision` and `torchaudio`.
# Currently, let's not bother to specify their versions explicitly (so installed with their latest release versions).
RUN python3 -m pip install --no-cache-dir -U torch==2.5.0 --index-url https://download.pytorch.org/whl/test/cu124
RUN python3 -m pip install --no-cache-dir -U tensorflow==2.13 protobuf==3.20.3 tensorflow_text tensorflow_probability && python3 -m pip install --no-cache-dir -e ./transformers[dev,onnxruntime] && [ ${#PYTORCH} -gt 0 -a "$PYTORCH" != "pre" ] && VERSION='torch=='$PYTORCH'.*' || VERSION='torch'; echo "export VERSION='$VERSION'" >> ~/.profile && echo torch=$VERSION && [ "$PYTORCH" != "pre" ] && python3 -m pip install --no-cache-dir -U $VERSION torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/$CUDA || python3 -m pip install --no-cache-dir -U --pre torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/nightly/$CUDA
RUN python3 -m pip uninstall -y flax jax
RUN python3 -m pip install --no-cache-dir intel_extension_for_pytorch==$INTEL_TORCH_EXT -f https://developer.intel.com/ipex-whl-stable-cpu
RUN python3 -m pip install --no-cache-dir git+https://github.com/facebookresearch/detectron2.git pytesseract
RUN python3 -m pip install -U "itsdangerous<2.1.0"
RUN python3 -m pip install --no-cache-dir git+https://github.com/huggingface/accelerate@main#egg=accelerate
RUN python3 -m pip install --no-cache-dir git+https://github.com/huggingface/peft@main#egg=peft
# For bettertransformer
RUN python3 -m pip install --no-cache-dir git+https://github.com/huggingface/optimum@main#egg=optimum
# For video model testing
RUN python3 -m pip install --no-cache-dir decord av==9.2.0
# Some slow tests require bnb
RUN python3 -m pip install --no-cache-dir bitsandbytes
# Some tests require quanto
RUN python3 -m pip install --no-cache-dir quanto
# `quanto` will install `ninja` which leads to many `CUDA error: an illegal memory access ...` in some model tests
# (`deformable_detr`, `rwkv`, `mra`)
RUN python3 -m pip uninstall -y ninja
# For `dinat` model
# The `XXX` part in `torchXXX` needs to match `PYTORCH` (to some extent)
RUN python3 -m pip install --no-cache-dir natten==0.15.1+torch220$CUDA -f https://shi-labs.com/natten/wheels
# For `nougat` tokenizer
RUN python3 -m pip install --no-cache-dir python-Levenshtein
# For `FastSpeech2ConformerTokenizer` tokenizer
RUN python3 -m pip install --no-cache-dir g2p-en
# When installing in editable mode, `transformers` is not recognized as a package.
# This line must be added in order for Python to be aware of `transformers`.

View File

@ -177,8 +177,6 @@
title: Optimum
- local: quantization/torchao
title: TorchAO
- local: quantization/compressed_tensors
title: compressed-tensors
- local: quantization/contribute
title: Contribute new quantization method
title: Quantization Methods
@ -830,8 +828,6 @@
title: IDEFICS
- local: model_doc/idefics2
title: Idefics2
- local: model_doc/idefics3
title: Idefics3
- local: model_doc/instructblip
title: InstructBLIP
- local: model_doc/instructblipvideo
@ -862,12 +858,8 @@
title: MatCha
- local: model_doc/mgp-str
title: MGP-STR
- local: model_doc/mllama
title: mllama
- local: model_doc/nougat
title: Nougat
- local: model_doc/omdet-turbo
title: OmDet-Turbo
- local: model_doc/oneformer
title: OneFormer
- local: model_doc/owlvit

View File

@ -169,7 +169,6 @@ Flax), PyTorch, and/or TensorFlow.
| [I-BERT](model_doc/ibert) | ✅ | ❌ | ❌ |
| [IDEFICS](model_doc/idefics) | ✅ | ✅ | ❌ |
| [Idefics2](model_doc/idefics2) | ✅ | ❌ | ❌ |
| [Idefics3](model_doc/idefics3) | ✅ | ❌ | ❌ |
| [ImageGPT](model_doc/imagegpt) | ✅ | ❌ | ❌ |
| [Informer](model_doc/informer) | ✅ | ❌ | ❌ |
| [InstructBLIP](model_doc/instructblip) | ✅ | ❌ | ❌ |
@ -215,7 +214,6 @@ Flax), PyTorch, and/or TensorFlow.
| [Mimi](model_doc/mimi) | ✅ | ❌ | ❌ |
| [Mistral](model_doc/mistral) | ✅ | ✅ | ✅ |
| [Mixtral](model_doc/mixtral) | ✅ | ❌ | ❌ |
| [Mllama](model_doc/mllama) | ✅ | ❌ | ❌ |
| [mLUKE](model_doc/mluke) | ✅ | ❌ | ❌ |
| [MMS](model_doc/mms) | ✅ | ✅ | ✅ |
| [MobileBERT](model_doc/mobilebert) | ✅ | ✅ | ❌ |
@ -239,7 +237,6 @@ Flax), PyTorch, and/or TensorFlow.
| [Nyströmformer](model_doc/nystromformer) | ✅ | ❌ | ❌ |
| [OLMo](model_doc/olmo) | ✅ | ❌ | ❌ |
| [OLMoE](model_doc/olmoe) | ✅ | ❌ | ❌ |
| [OmDet-Turbo](model_doc/omdet-turbo) | ✅ | ❌ | ❌ |
| [OneFormer](model_doc/oneformer) | ✅ | ❌ | ❌ |
| [OpenAI GPT](model_doc/openai-gpt) | ✅ | ✅ | ❌ |
| [OpenAI GPT-2](model_doc/gpt2) | ✅ | ✅ | ✅ |
@ -258,7 +255,7 @@ Flax), PyTorch, and/or TensorFlow.
| [Phi3](model_doc/phi3) | ✅ | ❌ | ❌ |
| [PhoBERT](model_doc/phobert) | ✅ | ✅ | ✅ |
| [Pix2Struct](model_doc/pix2struct) | ✅ | ❌ | ❌ |
| [Pixtral](model_doc/pixtral) | | ❌ | ❌ |
| [Pixtral](model_doc/pixtral) | | ❌ | ❌ |
| [PLBart](model_doc/plbart) | ✅ | ❌ | ❌ |
| [PoolFormer](model_doc/poolformer) | ✅ | ❌ | ❌ |
| [Pop2Piano](model_doc/pop2piano) | ✅ | ❌ | ❌ |

View File

@ -61,10 +61,7 @@ Learn how to quantize models in the [Quantization](../quantization) guide.
[[autodoc]] FbgemmFp8Config
## CompressedTensorsConfig
[[autodoc]] CompressedTensorsConfig
## TorchAoConfig
[[autodoc]] TorchAoConfig

View File

@ -46,7 +46,7 @@ url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)
candidate_labels = ["an image of a cat", "an image of a dog"]
inputs = processor(images=image ,text=candidate_labels, return_tensors="pt")
inputs = processor(text=candidate_labels, images=image, return_tensors="pt")
with torch.no_grad():
outputs = model(**inputs)

View File

@ -18,16 +18,16 @@ rendered properly in your Markdown viewer.
## Overview
The Fuyu model was created by [ADEPT](https://www.adept.ai/blog/fuyu-8b), and authored by Rohan Bavishi, Erich Elsen, Curtis Hawthorne, Maxwell Nye, Augustus Odena, Arushi Somani, Sağnak Taşırlar.
The Fuyu model was created by [ADEPT](https://www.adept.ai/blog/fuyu-8b), and authored by Rohan Bavishi, Erich Elsen, Curtis Hawthorne, Maxwell Nye, Augustus Odena, Arushi Somani, Sağnak Taşırlar.
The authors introduced Fuyu-8B, a decoder-only multimodal model based on the classic transformers architecture, with query and key normalization. A linear encoder is added to create multimodal embeddings from image inputs.
The authors introduced Fuyu-8B, a decoder-only multimodal model based on the classic transformers architecture, with query and key normalization. A linear encoder is added to create multimodal embeddings from image inputs.
By treating image tokens like text tokens and using a special image-newline character, the model knows when an image line ends. Image positional embeddings are removed. This avoids the need for different training phases for various image resolutions. With 8 billion parameters and licensed under CC-BY-NC, Fuyu-8B is notable for its ability to handle both text and images, its impressive context size of 16K, and its overall performance.
<Tip warning={true}>
The `Fuyu` models were trained using `bfloat16`, but the original inference uses `float16`. The checkpoints uploaded on the hub use `torch_dtype = 'float16'` which will be
used by the `AutoModel` API to cast the checkpoints from `torch.float32` to `torch.float16`.
used by the `AutoModel` API to cast the checkpoints from `torch.float32` to `torch.float16`.
The `dtype` of the online weights is mostly irrelevant, unless you are using `torch_dtype="auto"` when initializing a model using `model = AutoModelForCausalLM.from_pretrained("path", torch_dtype = "auto")`. The reason is that the model will first be downloaded (using the `dtype` of the checkpoints online) then it will be cast to the default `dtype` of `torch` (becomes `torch.float32`). Users should specify the `torch_dtype` they want, and if they don't, it will be `torch.float32`.
@ -56,7 +56,7 @@ tar -xvf 8b_base_model_release.tar
```
Then, the model can be loaded via:
```py
```py
from transformers import FuyuConfig, FuyuForCausalLM
model_config = FuyuConfig()
model = FuyuForCausalLM(model_config).from_pretrained('/output/path')
@ -81,7 +81,7 @@ text_prompt = "Generate a coco-style caption.\\n"
bus_image_url = "https://huggingface.co/datasets/hf-internal-testing/fixtures-captioning/resolve/main/bus.png"
bus_image_pil = Image.open(io.BytesIO(requests.get(bus_image_url).content))
inputs_to_model = processor(images=bus_image_pil, text=text_prompt)
inputs_to_model = processor(text=text_prompt, images=bus_image_pil)
```
@ -90,7 +90,7 @@ This model was contributed by [Molbap](https://huggingface.co/Molbap).
The original code can be found [here](https://github.com/persimmon-ai-labs/adept-inference).
- Fuyu uses a `sentencepiece` based tokenizer, with a `Unigram` model. It supports bytefallback, which is only available in `tokenizers==0.14.0` for the fast tokenizer.
The `LlamaTokenizer` is used as it is a standard wrapper around sentencepiece.
The `LlamaTokenizer` is used as it is a standard wrapper around sentencepiece.
- The authors suggest using the following prompt for image captioning: `f"Generate a coco-style caption.\\n"`

View File

@ -1,73 +0,0 @@
<!--Copyright 2024 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.
-->
# Idefics3
## Overview
The Idefics3 model was proposed in [Building and better understanding vision-language models: insights and future directions](https://huggingface.co/papers/2408.12637) by Hugo Laurençon, Andrés Marafioti, Victor Sanh, and Léo Tronchon.
Idefics3 is an adaptation of the Idefics2 model with three main differences:
- It uses Llama3 for the text model.
- It uses an updated processing logic for the images.
- It removes the perceiver.
The abstract from the paper is the following:
*The field of vision-language models (VLMs), which take images and texts as inputs and output texts, is rapidly evolving and has yet to reach consensus on several key aspects of the development pipeline, including data, architecture, and training methods. This paper can be seen as a tutorial for building a VLM. We begin by providing a comprehensive overview of the current state-of-the-art approaches, highlighting the strengths and weaknesses of each, addressing the major challenges in the field, and suggesting promising research directions for underexplored areas. We then walk through the practical steps to build Idefics3-8B, a powerful VLM that significantly outperforms its predecessor Idefics2-8B, while being trained efficiently, exclusively on open datasets, and using a straightforward pipeline. These steps include the creation of Docmatix, a dataset for improving document understanding capabilities, which is 240 times larger than previously available datasets. We release the model along with the datasets created for its training.*
## Usage tips
Input images are processed either by upsampling (if resizing is enabled) or at their original resolution. The resizing behavior depends on two parameters: `do_resize` and `size`.
If `do_resize` is set to `True`, the model resizes images so that the longest edge is 4*364 pixels by default.
The default resizing behavior can be customized by passing a dictionary to the `size` parameter. For example, `{"longest_edge": 4 * 364}` is the default, but you can change it to a different value if needed.
Here's how to control resizing and set a custom size:
```python
image_processor = Idefics3ImageProcessor(do_resize=True, size={"longest_edge": 2 * 364}, max_image_size=364)
```
Additionally, the `max_image_size` parameter, which controls the size of each square patch the image is decomposed into, is set to 364 by default but can be adjusted as needed. After resizing (if applicable), the image processor decomposes the images into square patches based on the `max_image_size` parameter.
This model was contributed by [amyeroberts](https://huggingface.co/amyeroberts) and [andimarafioti](https://huggingface.co/andito).
## Idefics3Config
[[autodoc]] Idefics3Config
## Idefics3Model
[[autodoc]] Idefics3Model
- forward
## Idefics3ForConditionalGeneration
[[autodoc]] Idefics3ForConditionalGeneration
- forward
## Idefics3ImageProcessor
[[autodoc]] Idefics3ImageProcessor
- preprocess
## Idefics3Processor
[[autodoc]] Idefics3Processor
- __call__

View File

@ -133,7 +133,7 @@ import requests
processor = LlavaNextProcessor.from_pretrained("llava-hf/llava-v1.6-mistral-7b-hf")
model = LlavaNextForConditionalGeneration.from_pretrained("llava-hf/llava-v1.6-mistral-7b-hf", torch_dtype=torch.float16, low_cpu_mem_usage=True)
model = LlavaNextForConditionalGeneration.from_pretrained("llava-hf/llava-v1.6-mistral-7b-hf", torch_dtype=torch.float16, low_cpu_mem_usage=True)
model.to("cuda:0")
# prepare image and text prompt, using the appropriate prompt template
@ -150,7 +150,7 @@ conversation = [
},
]
prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
inputs = processor(image, prompt, return_tensors="pt").to("cuda:0")
inputs = processor(prompt, image, return_tensors="pt").to("cuda:0")
# autoregressively complete prompt
output = model.generate(**inputs, max_new_tokens=100)
@ -222,7 +222,7 @@ prompts = [prompt_1, prompt_2]
# We can simply feed images in the order they have to be used in the text prompt
# Each "<image>" token uses one image leaving the next for the subsequent "<image>" tokens
inputs = processor(images=[image_stop, image_cats, image_snowman], text=prompts, padding=True, return_tensors="pt").to(model.device)
inputs = processor(text=prompts, images=[image_stop, image_cats, image_snowman], padding=True, return_tensors="pt").to(model.device)
# Generate
generate_ids = model.generate(**inputs, max_new_tokens=30)
@ -266,8 +266,8 @@ First make sure to install flash-attn. Refer to the [original repository of Flas
from transformers import LlavaNextForConditionalGeneration
model = LlavaNextForConditionalGeneration.from_pretrained(
model_id,
torch_dtype=torch.float16,
model_id,
torch_dtype=torch.float16,
low_cpu_mem_usage=True,
use_flash_attention_2=True
).to(0)

View File

@ -163,21 +163,3 @@ Below is an expected speedup diagram that compares pure inference time between t
<div style="text-align: center">
<img src="https://huggingface.co/datasets/visheratin/documentation-images/resolve/main/nllb-speedup.webp">
</div>
## Using Scaled Dot Product Attention (SDPA)
PyTorch includes a native scaled dot-product attention (SDPA) operator as part of `torch.nn.functional`. This function
encompasses several implementations that can be applied depending on the inputs and the hardware in use. See the
[official documentation](https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html)
or the [GPU Inference](https://huggingface.co/docs/transformers/main/en/perf_infer_gpu_one#pytorch-scaled-dot-product-attention)
page for more information.
SDPA is used by default for `torch>=2.1.1` when an implementation is available, but you may also set
`attn_implementation="sdpa"` in `from_pretrained()` to explicitly request SDPA to be used.
```python
from transformers import M2M100ForConditionalGeneration
model = M2M100ForConditionalGeneration.from_pretrained("facebook/m2m100_418M", torch_dtype=torch.float16, attn_implementation="sdpa")
...
```
For the best speedups, we recommend loading the model in half-precision (e.g. `torch.float16` or `torch.bfloat16`).

View File

@ -1,124 +0,0 @@
<!--Copyright 2024 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.
-->
# Mllama
## Overview
The Llama 3.2-Vision collection of multimodal large language models (LLMs) is a collection of pretrained and instruction-tuned image reasoning generative models in 11B and 90B sizes (text \+ images in / text out). The Llama 3.2-Vision instruction-tuned models are optimized for visual recognition, image reasoning, captioning, and answering general questions about an image.
**Model Architecture:** Llama 3.2-Vision is built on top of Llama 3.1 text-only model, which is an auto-regressive language model that uses an optimized transformer architecture. The tuned versions use supervised fine-tuning (SFT) and reinforcement learning with human feedback (RLHF) to align with human preferences for helpfulness and safety. To support image recognition tasks, the Llama 3.2-Vision model uses a separately trained vision adapter that integrates with the pre-trained Llama 3.1 language model. The adapter consists of a series of cross-attention layers that feed image encoder representations into the core LLM.
## Usage Tips
- For image+text and text inputs use `MllamaForConditionalGeneration`.
- For text-only inputs use `MllamaForCausalLM` for generation to avoid loading the vision tower.
- Each sample can contain multiple images, and the number of images can vary between samples. The processor will pad the inputs to the maximum number of images across samples and to a maximum number of tiles within each image.
- The text passed to the processor should have the `"<|image|>"` tokens where the images should be inserted.
- The processor has its own `apply_chat_template` method to convert chat messages to text that can then be passed as text to the processor.
## Usage Example
#### Instruct model
```python
import requests
import torch
from PIL import Image
from transformers import MllamaForConditionalGeneration, AutoProcessor
model_id = "meta-llama/Llama-3.2-11B-Vision-Instruct"
model = MllamaForConditionalGeneration.from_pretrained(model_id, device_map="auto", torch_dtype=torch.bfloat16)
processor = AutoProcessor.from_pretrained(model_id)
messages = [
[
{
"role": "user",
"content": [
{"type": "image"},
{"type": "text", "text": "What does the image show?"}
]
}
],
]
text = processor.apply_chat_template(messages, add_generation_prompt=True)
url = "https://llava-vl.github.io/static/images/view.jpg"
image = Image.open(requests.get(url, stream=True).raw)
inputs = processor(text=text, images=image, return_tensors="pt").to(model.device)
output = model.generate(**inputs, max_new_tokens=25)
print(processor.decode(output[0]))
```
#### Base model
```python
import requests
import torch
from PIL import Image
from transformers import MllamaForConditionalGeneration, AutoProcessor
model_id = "meta-llama/Llama-3.2-11B-Vision"
model = MllamaForConditionalGeneration.from_pretrained(model_id, device_map="auto", torch_dtype=torch.bfloat16)
processor = AutoProcessor.from_pretrained(model_id)
prompt = "<|image|>If I had to write a haiku for this one"
url = "https://llava-vl.github.io/static/images/view.jpg"
raw_image = Image.open(requests.get(url, stream=True).raw)
inputs = processor(text=prompt, images=raw_image, return_tensors="pt").to(model.device)
output = model.generate(**inputs, do_sample=False, max_new_tokens=25)
print(processor.decode(output[0], skip_special_tokens=True))
```
## MllamaConfig
[[autodoc]] MllamaConfig
## MllamaProcessor
[[autodoc]] MllamaProcessor
## MllamaImageProcessor
[[autodoc]] MllamaImageProcessor
## MllamaForConditionalGeneration
[[autodoc]] MllamaForConditionalGeneration
- forward
## MllamaForCausalLM
[[autodoc]] MllamaForCausalLM
- forward
## MllamaTextModel
[[autodoc]] MllamaTextModel
- forward
## MllamaForCausalLM
[[autodoc]] MllamaForCausalLM
- forward
## MllamaVisionModel
[[autodoc]] MllamaVisionModel
- forward

View File

@ -188,21 +188,3 @@ Below is an expected speedup diagram that compares pure inference time between t
<div style="text-align: center">
<img src="https://huggingface.co/datasets/visheratin/documentation-images/resolve/main/nllb-speedup.webp">
</div>
## Using Scaled Dot Product Attention (SDPA)
PyTorch includes a native scaled dot-product attention (SDPA) operator as part of `torch.nn.functional`. This function
encompasses several implementations that can be applied depending on the inputs and the hardware in use. See the
[official documentation](https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html)
or the [GPU Inference](https://huggingface.co/docs/transformers/main/en/perf_infer_gpu_one#pytorch-scaled-dot-product-attention)
page for more information.
SDPA is used by default for `torch>=2.1.1` when an implementation is available, but you may also set
`attn_implementation="sdpa"` in `from_pretrained()` to explicitly request SDPA to be used.
```python
from transformers import AutoModelForSeq2SeqLM
model = AutoModelForSeq2SeqLM.from_pretrained("facebook/nllb-200-distilled-600M", torch_dtype=torch.float16, attn_implementation="sdpa")
...
```
For the best speedups, we recommend loading the model in half-precision (e.g. `torch.float16` or `torch.bfloat16`).

View File

@ -1,164 +0,0 @@
<!--Copyright 2024 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.
-->
# OmDet-Turbo
## Overview
The OmDet-Turbo model was proposed in [Real-time Transformer-based Open-Vocabulary Detection with Efficient Fusion Head](https://arxiv.org/abs/2403.06892) by Tiancheng Zhao, Peng Liu, Xuan He, Lu Zhang, Kyusong Lee. OmDet-Turbo incorporates components from RT-DETR and introduces a swift multimodal fusion module to achieve real-time open-vocabulary object detection capabilities while maintaining high accuracy. The base model achieves performance of up to 100.2 FPS and 53.4 AP on COCO zero-shot.
The abstract from the paper is the following:
*End-to-end transformer-based detectors (DETRs) have shown exceptional performance in both closed-set and open-vocabulary object detection (OVD) tasks through the integration of language modalities. However, their demanding computational requirements have hindered their practical application in real-time object detection (OD) scenarios. In this paper, we scrutinize the limitations of two leading models in the OVDEval benchmark, OmDet and Grounding-DINO, and introduce OmDet-Turbo. This novel transformer-based real-time OVD model features an innovative Efficient Fusion Head (EFH) module designed to alleviate the bottlenecks observed in OmDet and Grounding-DINO. Notably, OmDet-Turbo-Base achieves a 100.2 frames per second (FPS) with TensorRT and language cache techniques applied. Notably, in zero-shot scenarios on COCO and LVIS datasets, OmDet-Turbo achieves performance levels nearly on par with current state-of-the-art supervised models. Furthermore, it establishes new state-of-the-art benchmarks on ODinW and OVDEval, boasting an AP of 30.1 and an NMS-AP of 26.86, respectively. The practicality of OmDet-Turbo in industrial applications is underscored by its exceptional performance on benchmark datasets and superior inference speed, positioning it as a compelling choice for real-time object detection tasks.*
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/model_doc/omdet_turbo_architecture.jpeg" alt="drawing" width="600"/>
<small> OmDet-Turbo architecture overview. Taken from the <a href="https://arxiv.org/abs/2403.06892">original paper</a>. </small>
This model was contributed by [yonigozlan](https://huggingface.co/yonigozlan).
The original code can be found [here](https://github.com/om-ai-lab/OmDet).
## Usage tips
One unique property of OmDet-Turbo compared to other zero-shot object detection models, such as [Grounding DINO](grounding-dino), is the decoupled classes and prompt embedding structure that allows caching of text embeddings. This means that the model needs both classes and task as inputs, where classes is a list of objects we want to detect and task is the grounded text used to guide open-vocabulary detection. This approach limits the scope of the open-vocabulary detection and makes the decoding process faster.
[`OmDetTurboProcessor`] is used to prepare the classes, task and image triplet. The task input is optional, and when not provided, it will default to `"Detect [class1], [class2], [class3], ..."`. To process the results from the model, one can use `post_process_grounded_object_detection` from [`OmDetTurboProcessor`]. Notably, this function takes in the input classes, as unlike other zero-shot object detection models, the decoupling of classes and task embeddings means that no decoding of the predicted class embeddings is needed in the post-processing step, and the predicted classes can be matched to the inputted ones directly.
## Usage example
### Single image inference
Here's how to load the model and prepare the inputs to perform zero-shot object detection on a single image:
```python
import requests
from PIL import Image
from transformers import AutoProcessor, OmDetTurboForObjectDetection
processor = AutoProcessor.from_pretrained("omlab/omdet-turbo-swin-tiny-hf")
model = OmDetTurboForObjectDetection.from_pretrained("omlab/omdet-turbo-swin-tiny-hf")
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)
classes = ["cat", "remote"]
inputs = processor(image, text=classes, return_tensors="pt")
outputs = model(**inputs)
# convert outputs (bounding boxes and class logits)
results = processor.post_process_grounded_object_detection(
outputs,
classes=classes,
target_sizes=[image.size[::-1]],
score_threshold=0.3,
nms_threshold=0.3,
)[0]
for score, class_name, box in zip(
results["scores"], results["classes"], results["boxes"]
):
box = [round(i, 1) for i in box.tolist()]
print(
f"Detected {class_name} with confidence "
f"{round(score.item(), 2)} at location {box}"
)
```
### Multi image inference
OmDet-Turbo can perform batched multi-image inference, with support for different text prompts and classes in the same batch:
```python
>>> import torch
>>> import requests
>>> from io import BytesIO
>>> from PIL import Image
>>> from transformers import AutoProcessor, OmDetTurboForObjectDetection
>>> processor = AutoProcessor.from_pretrained("omlab/omdet-turbo-swin-tiny-hf")
>>> model = OmDetTurboForObjectDetection.from_pretrained("omlab/omdet-turbo-swin-tiny-hf")
>>> url1 = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image1 = Image.open(BytesIO(requests.get(url1).content)).convert("RGB")
>>> classes1 = ["cat", "remote"]
>>> task1 = "Detect {}.".format(", ".join(classes1))
>>> url2 = "http://images.cocodataset.org/train2017/000000257813.jpg"
>>> image2 = Image.open(BytesIO(requests.get(url2).content)).convert("RGB")
>>> classes2 = ["boat"]
>>> task2 = "Detect everything that looks like a boat."
>>> url3 = "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"
>>> image3 = Image.open(BytesIO(requests.get(url3).content)).convert("RGB")
>>> classes3 = ["statue", "trees"]
>>> task3 = "Focus on the foreground, detect statue and trees."
>>> inputs = processor(
... images=[image1, image2, image3],
... text=[classes1, classes2, classes3],
... task=[task1, task2, task3],
... return_tensors="pt",
... )
>>> with torch.no_grad():
... outputs = model(**inputs)
>>> # convert outputs (bounding boxes and class logits)
>>> results = processor.post_process_grounded_object_detection(
... outputs,
... classes=[classes1, classes2, classes3],
... target_sizes=[image1.size[::-1], image2.size[::-1], image3.size[::-1]],
... score_threshold=0.2,
... nms_threshold=0.3,
... )
>>> for i, result in enumerate(results):
... for score, class_name, box in zip(
... result["scores"], result["classes"], result["boxes"]
... ):
... box = [round(i, 1) for i in box.tolist()]
... print(
... f"Detected {class_name} with confidence "
... f"{round(score.item(), 2)} at location {box} in image {i}"
... )
Detected remote with confidence 0.77 at location [39.9, 70.4, 176.7, 118.0] in image 0
Detected cat with confidence 0.72 at location [11.6, 54.2, 314.8, 474.0] in image 0
Detected remote with confidence 0.56 at location [333.4, 75.8, 370.7, 187.0] in image 0
Detected cat with confidence 0.55 at location [345.2, 24.0, 639.8, 371.7] in image 0
Detected boat with confidence 0.32 at location [146.9, 219.8, 209.6, 250.7] in image 1
Detected boat with confidence 0.3 at location [319.1, 223.2, 403.2, 238.4] in image 1
Detected boat with confidence 0.27 at location [37.7, 220.3, 84.0, 235.9] in image 1
Detected boat with confidence 0.22 at location [407.9, 207.0, 441.7, 220.2] in image 1
Detected statue with confidence 0.73 at location [544.7, 210.2, 651.9, 502.8] in image 2
Detected trees with confidence 0.25 at location [3.9, 584.3, 391.4, 785.6] in image 2
Detected trees with confidence 0.25 at location [1.4, 621.2, 118.2, 787.8] in image 2
Detected statue with confidence 0.2 at location [428.1, 205.5, 767.3, 759.5] in image 2
```
## OmDetTurboConfig
[[autodoc]] OmDetTurboConfig
## OmDetTurboProcessor
[[autodoc]] OmDetTurboProcessor
- post_process_grounded_object_detection
## OmDetTurboForObjectDetection
[[autodoc]] OmDetTurboForObjectDetection
- forward

View File

@ -18,22 +18,20 @@ rendered properly in your Markdown viewer.
## Overview
The Pixtral model was released by the Mistral AI team on [vLLM](https://github.com/vllm-project/vllm/pull/8377), where a version of the code can be found!
The Pixtral model was released by the Mistral AI team on [Vllm](https://github.com/vllm-project/vllm/pull/8377), where a version of the code can be found!
Tips:
- Pixtral is a multimodal model, taking images and text as input, and producing text as output.
- This model follows the [Llava](llava) family, meaning image embeddings are placed instead of the `[IMG]` token placeholders. The model uses [`PixtralVisionModel`] for its vision encoder, and [`MistralForCausalLM`] for its language decoder.
- The main contribution is the 2d ROPE (rotary position embeddings) on the images, and support for arbitrary image sizes (the images are not padded together nor are they resized).
- Pixtral is a multimodal model, the main contribution is the 2d ROPE on the images, and support for arbitrary image size (the images are not padded together nor are they resized)
- This model follows the `Llava` family, meaning image embeddings are placed instead of the `[IMG]` token placeholders.
- The format for one or multiple prompts is the following:
```
"<s>[INST][IMG]\nWhat are the things I should be cautious about when I visit this place?[/INST]"
```
Then, the processor will replace each `[IMG]` token with a number of `[IMG]` tokens that depends on the height and the width of the image. Each *row* of the image is separated by a `[IMG_BREAK]` token, and each image is separated by a `[IMG_END]` token.
This model was contributed by [amyeroberts](https://huggingface.co/amyeroberts) and [ArthurZ](https://huggingface.co/ArthurZ). The original code can be found [here](https://github.com/vllm-project/vllm/pull/8377).
## Usage
This model was contributed by [amyeroberts](https://huggingface.co/amyeroberts) and [ArthurZ](https://huggingface.co/ArthurZ)
Here is an example of how to run it:
@ -85,9 +83,9 @@ Each image captures a different scene, from a close-up of a dog to expansive nat
[[autodoc]] PixtralVisionConfig
## PixtralVisionModel
## PixtralModel
[[autodoc]] PixtralVisionModel
[[autodoc]] PixtralModel
- forward
## PixtralImageProcessor

View File

@ -54,7 +54,6 @@ FlashAttention-2 is currently supported for the following architectures:
* [Granite](https://huggingface.co/docs/transformers/model_doc/granite#transformers.GraniteModel)
* [GraniteMoe](https://huggingface.co/docs/transformers/model_doc/granitemoe#transformers.GraniteMoeModel)
* [Idefics2](https://huggingface.co/docs/transformers/model_doc/idefics2#transformers.Idefics2Model)
* [Idefics3](https://huggingface.co/docs/transformers/model_doc/idefics3#transformers.Idefics3Model)
* [Falcon](https://huggingface.co/docs/transformers/model_doc/falcon#transformers.FalconModel)
* [JetMoe](https://huggingface.co/docs/transformers/model_doc/jetmoe#transformers.JetMoeModel)
* [Jamba](https://huggingface.co/docs/transformers/model_doc/jamba#transformers.JambaModel)
@ -234,14 +233,11 @@ For now, Transformers supports SDPA inference and training for the following arc
* [Jamba](https://huggingface.co/docs/transformers/model_doc/jamba#transformers.JambaModel)
* [Llama](https://huggingface.co/docs/transformers/model_doc/llama#transformers.LlamaModel)
* [LLaVA-Onevision](https://huggingface.co/docs/transformers/model_doc/llava_onevision)
* [M2M100](https://huggingface.co/docs/transformers/model_doc/m2m_100#transformers.M2M100Model)
* [Mimi](https://huggingface.co/docs/transformers/model_doc/mimi)
* [Mistral](https://huggingface.co/docs/transformers/model_doc/mistral#transformers.MistralModel)
* [Mllama](https://huggingface.co/docs/transformers/model_doc/mllama#transformers.MllamaForConditionalGeneration)
* [Mixtral](https://huggingface.co/docs/transformers/model_doc/mixtral#transformers.MixtralModel)
* [Musicgen](https://huggingface.co/docs/transformers/model_doc/musicgen#transformers.MusicgenModel)
* [MusicGen Melody](https://huggingface.co/docs/transformers/model_doc/musicgen_melody#transformers.MusicgenMelodyModel)
* [NLLB](https://huggingface.co/docs/transformers/model_doc/nllb)
* [OLMo](https://huggingface.co/docs/transformers/model_doc/olmo#transformers.OlmoModel)
* [OLMoE](https://huggingface.co/docs/transformers/model_doc/olmoe#transformers.OlmoeModel)
* [PaliGemma](https://huggingface.co/docs/transformers/model_doc/paligemma#transformers.PaliGemmaForConditionalGeneration)

View File

@ -186,7 +186,7 @@ If you prefer to use 🤗 Accelerate, find the 🤗 Accelerate example [further
If you have access to an Ampere or newer hardware you can use bf16 for mixed precision training and evaluation. While
bf16 has a worse precision than fp16, it has a much bigger dynamic range. In fp16 the biggest number you can have
is `65504` and any number above that will result in an overflow. A bf16 number can be as large as `3.39e+38` (!) which
is `65504` and any number above that will result in an overflow. A bf16 number can be as large as `3.39e+38` (!) which
is about the same as fp32 - because both have 8-bits used for the numerical range.
You can enable BF16 in the 🤗 Trainer with:

View File

@ -1,230 +0,0 @@
<!--Copyright 2024 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.
-->
# Compressed Tensors
The [`compressed-tensors`](https://github.com/neuralmagic/compressed-tensors) library provides a versatile and efficient way to store and manage compressed model checkpoints. This library supports various quantization and sparsity schemes, making it a unified format for handling different model optimizations like GPTQ, AWQ, SmoothQuant, INT8, FP8, SparseGPT, and more.
Some of the supported formats include:
1. `dense`
2. `int-quantized`: INT8 quantized models
- sample [model/config](https://huggingface.co/nm-testing/tinyllama-w8a8-compressed-hf-quantizer)
3. `float-quantized`: FP8 quantized models; currently supports E4M3
- sample [model/config](https://huggingface.co/nm-testing/Meta-Llama-3-8B-Instruct-fp8-hf_compat/tree/main)
4. `pack-quantized`: INT4 or INT8 weight-quantized models, packed into INT32. For INT4, the weights have an INT4 range but are stored as INT8 and then packed into INT32.
- sample [model/config](https://huggingface.co/nm-testing/tinyllama-w4a16-compressed-hf-quantizer)
Compressed models can be easily created using [llm-compressor](https://github.com/vllm-project/llm-compressor).
Alternatively, models can be created independently and serialized with a compressed tensors config.
To find existing models on the Hugging Face Model Hub, search for the [`compressed-tensors` tag](https://huggingface.co/models?other=compressed-tensors).
#### Features:
- Weight and activation precisions: FP8, INT4, INT8 (for Q/DQ arbitrary precision is allowed for INT)
- Quantization scales and zero-points strategies: [tensor, channel, group, block, token](https://github.com/neuralmagic/compressed-tensors/blob/83b2e7a969d70606421a76b9a3d112646077c8de/src/compressed_tensors/quantization/quant_args.py#L43-L52)
- Dynamic per-token activation quantization (or any static strategy)
- Sparsity can be
- Supports quantization of arbitrary modules, not just Linear modules
- Targeted support or ignoring of modules by name or class
## Installation
It is recommended to install stable releases of compressed-tensors from [PyPI](https://pypi.org/project/compressed-tensors):
```bash
pip install compressed-tensors
```
Developers who want to experiment with the latest features can also install the package from source:
```bash
git clone https://github.com/neuralmagic/compressed-tensors
cd compressed-tensors
pip install -e .
```
## Quickstart Model Load
Quantized models can be easily loaded for inference as shown below. Only models that have already been quantized can be loaded at the moment. To quantize a model into the compressed-tensors format see [llm-compressor](https://github.com/vllm-project/llm-compressor).
```python
from transformers import AutoModelForCausalLM
# Load the model in compressed-tensors format
ct_model = AutoModelForCausalLM.from_pretrained("nm-testing/Meta-Llama-3.1-8B-Instruct-FP8-hf")
# Measure memory usage
mem_params = sum([param.nelement()*param.element_size() for param in ct_model.parameters()])
print(f"{mem_params/2**30:.4f} GB")
# 8.4575 GB
```
We can see just above that the compressed-tensors FP8 checkpoint of Llama 3.1 8B is able to be loaded for inference using half of the memory of the unquantized reference checkpoint.
## Sample Use Cases - Load and run an FP8 model
```python
from transformers import AutoModelForCausalLM, AutoTokenizer
prompt = [
"Hello, my name is",
"The capital of France is",
"The future of AI is"
]
model_name = "nm-testing/Meta-Llama-3-8B-Instruct-fp8-hf_compat"
quantized_model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(model_name)
inputs = tokenizer(prompt, return_tensors="pt")
generated_ids = quantized_model.generate(**inputs, max_length=50, do_sample=False)
outputs = tokenizer.batch_decode(generated_ids)
print(outputs)
"""
['<|begin_of_text|>Hello, my name is [Name]. I am a [Your Profession/Student] and I am here to learn about the [Course/Program] at [University/Institution]. I am excited to be here and I am looking forward to', '<|begin_of_text|>The capital of France is Paris, which is located in the north-central part of the country. Paris is the most populous city in France and is known for its stunning architecture, art museums, fashion, and romantic atmosphere. The city is home to', "<|begin_of_text|>The future of AI is here, and it's already changing the way we live and work. From virtual assistants to self-driving cars, AI is transforming industries and revolutionizing the way we interact with technology. But what does the future of AI hold"]
"""
```
The above shows a quick example for running generation using a `compressed-tensors`
model. Currently, once loaded the model cannot be saved.
## Deep dive into a compressed-tensors model checkpoint
In this example we will examine how the compressed-tensors model nm-testing/Meta-Llama-3.1-8B-Instruct-FP8-hf is defined through its configuration entry and see how this translates to the loaded model representation.
First, let us look at the [`quantization_config` of the model](https://huggingface.co/nm-testing/Meta-Llama-3.1-8B-Instruct-FP8-hf/blob/main/config.json). At a glance it looks overwhelming with the number of entries but this is because compressed-tensors is a format that allows for flexible expression both during and after model compression.
In practice for checkpoint loading and inference the configuration can be simplified to not include all the default or empty entries, so we will do that here to focus on what compression is actually represented.
```yaml
"quantization_config": {
"config_groups": {
"group_0": {
"input_activations": {
"num_bits": 8,
"strategy": "tensor",
"type": "float"
},
"targets": ["Linear"],
"weights": {
"num_bits": 8,
"strategy": "tensor",
"type": "float"
}
}
},
"format": "naive-quantized",
"ignore": ["lm_head"],
"quant_method": "compressed-tensors",
"quantization_status": "frozen"
},
```
We can see from the above configuration that it is specifying one config group that includes weight and activation quantization to FP8 with a static per-tensor strategy. It is also worth noting that in the `ignore` list there is an entry to skip quantization of the `lm_head` module, so that module should be untouched in the checkpoint.
To see the result of the configuration in practice, we can simply use the [safetensors viewer](https://huggingface.co/nm-testing/Meta-Llama-3.1-8B-Instruct-FP8-hf?show_file_info=model.safetensors.index.json) on the model card to see the quantized weights, input_scale, and weight_scale for all of the Linear modules in the first model layer (and so on for the rest of the layers).
| Tensors | Shape | Precision |
| ------- | ----- | --------- |
model.layers.0.input_layernorm.weight | [4096] | BF16
model.layers.0.mlp.down_proj.input_scale | [1] | BF16
model.layers.0.mlp.down_proj.weight | [4096, 14336] | F8_E4M3
model.layers.0.mlp.down_proj.weight_scale | [1] | BF16
model.layers.0.mlp.gate_proj.input_scale | [1] | BF16
model.layers.0.mlp.gate_proj.weight | [14336, 4096] | F8_E4M3
model.layers.0.mlp.gate_proj.weight_scale | [1] | BF16
model.layers.0.mlp.up_proj.input_scale| [1] |BF16
model.layers.0.mlp.up_proj.weight | [14336, 4096] | F8_E4M3
model.layers.0.mlp.up_proj.weight_scale | [1] | BF16
model.layers.0.post_attention_layernorm.weight | [4096] |BF16
model.layers.0.self_attn.k_proj.input_scale | [1] | BF16
model.layers.0.self_attn.k_proj.weight | [1024, 4096]| F8_E4M3
model.layers.0.self_attn.k_proj.weight_scale |[1] | BF16
model.layers.0.self_attn.o_proj.input_scale | [1] | BF16
model.layers.0.self_attn.o_proj.weight | [4096, 4096] | F8_E4M3
model.layers.0.self_attn.o_proj.weight_scale | [1] | BF16
model.layers.0.self_attn.q_proj.input_scale | [1] | BF16
model.layers.0.self_attn.q_proj.weight | [4096, 4096] | F8_E4M3
model.layers.0.self_attn.q_proj.weight_scale | [1] | BF16
model.layers.0.self_attn.v_proj.input_scale | [1] | BF16
model.layers.0.self_attn.v_proj.weight | [1024, 4096] | F8_E4M3
model.layers.0.self_attn.v_proj.weight_scale | [1] | BF16
When we load the model with the compressed-tensors HFQuantizer integration, we can see that all of the Linear modules that are specified within the quantization configuration have been replaced by `CompressedLinear` modules that manage the compressed weights and forward pass for inference. Note that the `lm_head` mentioned before in the ignore list is still kept as an unquantized Linear module.
```python
from transformers import AutoModelForCausalLM
ct_model = AutoModelForCausalLM.from_pretrained("nm-testing/Meta-Llama-3.1-8B-Instruct-FP8-hf")
print(ct_model)
"""
LlamaForCausalLM(
(model): LlamaModel(
(embed_tokens): Embedding(128256, 4096)
(layers): ModuleList(
(0-31): 32 x LlamaDecoderLayer(
(self_attn): LlamaSdpaAttention(
(q_proj): CompressedLinear(
in_features=4096, out_features=4096, bias=False
(input_observer): MovingAverageMinMaxObserver()
(weight_observer): MovingAverageMinMaxObserver()
)
(k_proj): CompressedLinear(
in_features=4096, out_features=1024, bias=False
(input_observer): MovingAverageMinMaxObserver()
(weight_observer): MovingAverageMinMaxObserver()
)
(v_proj): CompressedLinear(
in_features=4096, out_features=1024, bias=False
(input_observer): MovingAverageMinMaxObserver()
(weight_observer): MovingAverageMinMaxObserver()
)
(o_proj): CompressedLinear(
in_features=4096, out_features=4096, bias=False
(input_observer): MovingAverageMinMaxObserver()
(weight_observer): MovingAverageMinMaxObserver()
)
(rotary_emb): LlamaRotaryEmbedding()
)
(mlp): LlamaMLP(
(gate_proj): CompressedLinear(
in_features=4096, out_features=14336, bias=False
(input_observer): MovingAverageMinMaxObserver()
(weight_observer): MovingAverageMinMaxObserver()
)
(up_proj): CompressedLinear(
in_features=4096, out_features=14336, bias=False
(input_observer): MovingAverageMinMaxObserver()
(weight_observer): MovingAverageMinMaxObserver()
)
(down_proj): CompressedLinear(
in_features=14336, out_features=4096, bias=False
(input_observer): MovingAverageMinMaxObserver()
(weight_observer): MovingAverageMinMaxObserver()
)
(act_fn): SiLU()
)
(input_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
(post_attention_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
)
)
(norm): LlamaRMSNorm((4096,), eps=1e-05)
(rotary_emb): LlamaRotaryEmbedding()
)
(lm_head): Linear(in_features=4096, out_features=128256, bias=False)
)
"""
```

View File

@ -50,7 +50,6 @@ Use the table below to help you decide which quantization method to use.
| [AQLM](./aqlm) | 🔴 | 🟢 | 🟢 | 🔴 | 🔴 | 🟢 | 1 / 2 | 🟢 | 🟢 | 🟢 | https://github.com/Vahe1994/AQLM |
| [AWQ](./awq) | 🔴 | 🔴 | 🟢 | 🟢 | 🔴 | ? | 4 | 🟢 | 🟢 | 🟢 | https://github.com/casper-hansen/AutoAWQ |
| [bitsandbytes](./bitsandbytes) | 🟢 | 🟡 * | 🟢 | 🟡 * | 🔴 ** | 🔴 (soon!) | 4 / 8 | 🟢 | 🟢 | 🟢 | https://github.com/bitsandbytes-foundation/bitsandbytes |
| [compressed-tensors](./compressed_tensors) | 🔴 | 🟢 | 🟢 | 🟢 | 🔴 | 🔴 | 1 - 8 | 🟢 | 🟢 | 🟢 | https://github.com/neuralmagic/compressed-tensors |
| [EETQ](./eetq) | 🟢 | 🔴 | 🟢 | 🔴 | 🔴 | ? | 8 | 🟢 | 🟢 | 🟢 | https://github.com/NetEase-FuXi/EETQ |
| GGUF / GGML (llama.cpp) | 🟢 | 🟢 | 🟢 | 🔴 | 🟢 | 🔴 | 1 - 8 | 🔴 | [See GGUF section](../gguf) | [See GGUF section](../gguf) | https://github.com/ggerganov/llama.cpp |
| [GPTQ](./gptq) | 🔴 | 🔴 | 🟢 | 🟢 | 🔴 | 🔴 | 2 - 3 - 4 - 8 | 🟢 | 🟢 | 🟢 | https://github.com/AutoGPTQ/AutoGPTQ |

View File

@ -61,7 +61,7 @@ from transformers.utils import check_min_version, send_example_telemetry
logger = logging.getLogger(__name__)
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.46.0.dev0")
check_min_version("4.45.0.dev0")
Array = Any
Dataset = datasets.arrow_dataset.Dataset

View File

@ -60,7 +60,7 @@ from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risk.
check_min_version("4.46.0.dev0")
check_min_version("4.45.0.dev0")
require_version("datasets>=2.14.0", "To fix: pip install -r examples/flax/speech-recognition/requirements.txt")

View File

@ -56,7 +56,7 @@ from transformers.utils import check_min_version, send_example_telemetry
logger = logging.getLogger(__name__)
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.46.0.dev0")
check_min_version("4.45.0.dev0")
Array = Any
Dataset = datasets.arrow_dataset.Dataset

View File

@ -57,7 +57,7 @@ from transformers.utils.versions import require_version
logger = logging.getLogger(__name__)
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.46.0.dev0")
check_min_version("4.45.0.dev0")
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/token-classification/requirements.txt")

View File

@ -45,7 +45,7 @@ from transformers.utils.versions import require_version
logger = logging.getLogger(__name__)
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.46.0.dev0")
check_min_version("4.45.0.dev0")
require_version("datasets>=1.14.0", "To fix: pip install -r examples/pytorch/audio-classification/requirements.txt")

View File

@ -54,7 +54,7 @@ from transformers.utils.versions import require_version
logger = logging.getLogger(__name__)
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.46.0.dev0")
check_min_version("4.45.0.dev0")
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/contrastive-image-text/requirements.txt")

View File

@ -56,7 +56,7 @@ from transformers.utils.versions import require_version
logger = logging.getLogger(__name__)
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.46.0.dev0")
check_min_version("4.45.0.dev0")
require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/image-classification/requirements.txt")

View File

@ -49,7 +49,7 @@ from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.46.0.dev0")
check_min_version("4.45.0.dev0")
logger = get_logger(__name__)

View File

@ -43,7 +43,7 @@ from transformers.utils.versions import require_version
logger = logging.getLogger(__name__)
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.46.0.dev0")
check_min_version("4.45.0.dev0")
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/image-pretraining/requirements.txt")

View File

@ -48,7 +48,7 @@ Any model supported by the AutoModelForMaskedImageModeling API can be used.
logger = logging.getLogger(__name__)
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.46.0.dev0")
check_min_version("4.45.0.dev0")
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/image-pretraining/requirements.txt")

View File

@ -53,7 +53,7 @@ Any model supported by the AutoModelForMaskedImageModeling API can be used.
logger = logging.getLogger(__name__)
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.46.0.dev0")
check_min_version("4.45.0.dev0")
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/image-pretraining/requirements.txt")

View File

@ -46,7 +46,7 @@ from transformers.utils.versions import require_version
logger = logging.getLogger(__name__)
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.46.0.dev0")
check_min_version("4.45.0.dev0")
require_version("datasets>=2.0.0", "To fix: pip install -r examples/pytorch/instance-segmentation/requirements.txt")

View File

@ -52,7 +52,7 @@ from transformers.utils.versions import require_version
logger = logging.getLogger(__name__)
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.46.0.dev0")
check_min_version("4.45.0.dev0")
require_version("datasets>=2.0.0", "To fix: pip install -r examples/pytorch/instance-segmentation/requirements.txt")

View File

@ -55,7 +55,7 @@ from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.46.0.dev0")
check_min_version("4.45.0.dev0")
require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt")

View File

@ -57,7 +57,7 @@ from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.46.0.dev0")
check_min_version("4.45.0.dev0")
logger = get_logger(__name__)

View File

@ -58,7 +58,7 @@ from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.46.0.dev0")
check_min_version("4.45.0.dev0")
require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt")

View File

@ -60,7 +60,7 @@ from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.46.0.dev0")
check_min_version("4.45.0.dev0")
logger = get_logger(__name__)

View File

@ -54,7 +54,7 @@ from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.46.0.dev0")
check_min_version("4.45.0.dev0")
require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt")

View File

@ -57,7 +57,7 @@ from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.46.0.dev0")
check_min_version("4.45.0.dev0")
logger = get_logger(__name__)
require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt")

View File

@ -47,7 +47,7 @@ from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.46.0.dev0")
check_min_version("4.45.0.dev0")
require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt")

View File

@ -47,7 +47,7 @@ from transformers.utils import PaddingStrategy, check_min_version, send_example_
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.46.0.dev0")
check_min_version("4.45.0.dev0")
logger = logging.getLogger(__name__)

View File

@ -56,7 +56,7 @@ from transformers.utils import PaddingStrategy, check_min_version, send_example_
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.46.0.dev0")
check_min_version("4.45.0.dev0")
logger = get_logger(__name__)
# You should update this to your particular problem to have better documentation of `model_type`

View File

@ -48,7 +48,7 @@ from transformers.utils.versions import require_version
logger = logging.getLogger(__name__)
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.46.0.dev0")
check_min_version("4.45.0.dev0")
require_version("datasets>=2.0.0", "To fix: pip install -r examples/pytorch/object-detection/requirements.txt")

View File

@ -51,7 +51,7 @@ from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.46.0.dev0")
check_min_version("4.45.0.dev0")
logging.basicConfig(level=logging.INFO)
logger = get_logger(__name__)

View File

@ -50,7 +50,7 @@ from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.46.0.dev0")
check_min_version("4.45.0.dev0")
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt")

View File

@ -48,7 +48,7 @@ from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.46.0.dev0")
check_min_version("4.45.0.dev0")
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt")

View File

@ -56,7 +56,7 @@ from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.46.0.dev0")
check_min_version("4.45.0.dev0")
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt")

View File

@ -57,7 +57,7 @@ from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.46.0.dev0")
check_min_version("4.45.0.dev0")
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt")

View File

@ -46,7 +46,7 @@ from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.46.0.dev0")
check_min_version("4.45.0.dev0")
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt")

View File

@ -51,7 +51,7 @@ from transformers.utils.versions import require_version
logger = logging.getLogger(__name__)
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.46.0.dev0")
check_min_version("4.45.0.dev0")
require_version("datasets>=2.0.0", "To fix: pip install -r examples/pytorch/semantic-segmentation/requirements.txt")

View File

@ -50,7 +50,7 @@ from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.46.0.dev0")
check_min_version("4.45.0.dev0")
logger = get_logger(__name__)

View File

@ -50,7 +50,7 @@ from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.46.0.dev0")
check_min_version("4.45.0.dev0")
require_version("datasets>=1.18.0", "To fix: pip install -r examples/pytorch/speech-recognition/requirements.txt")

View File

@ -53,7 +53,7 @@ from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.46.0.dev0")
check_min_version("4.45.0.dev0")
require_version("datasets>=1.18.0", "To fix: pip install -r examples/pytorch/speech-recognition/requirements.txt")

View File

@ -48,7 +48,7 @@ from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.46.0.dev0")
check_min_version("4.45.0.dev0")
require_version("datasets>=1.18.0", "To fix: pip install -r examples/pytorch/speech-recognition/requirements.txt")

View File

@ -52,7 +52,7 @@ from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.46.0.dev0")
check_min_version("4.45.0.dev0")
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/summarization/requirements.txt")

View File

@ -56,7 +56,7 @@ from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.46.0.dev0")
check_min_version("4.45.0.dev0")
logger = get_logger(__name__)
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/summarization/requirements.txt")

View File

@ -47,7 +47,7 @@ from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.46.0.dev0")
check_min_version("4.45.0.dev0")
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/text-classification/requirements.txt")

View File

@ -48,7 +48,7 @@ from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.46.0.dev0")
check_min_version("4.45.0.dev0")
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/text-classification/requirements.txt")

View File

@ -49,7 +49,7 @@ from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.46.0.dev0")
check_min_version("4.45.0.dev0")
logger = get_logger(__name__)

View File

@ -48,7 +48,7 @@ from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.46.0.dev0")
check_min_version("4.45.0.dev0")
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/text-classification/requirements.txt")

View File

@ -49,7 +49,7 @@ from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.46.0.dev0")
check_min_version("4.45.0.dev0")
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/token-classification/requirements.txt")

View File

@ -56,7 +56,7 @@ from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.46.0.dev0")
check_min_version("4.45.0.dev0")
logger = get_logger(__name__)
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/token-classification/requirements.txt")

View File

@ -52,7 +52,7 @@ from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.46.0.dev0")
check_min_version("4.45.0.dev0")
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/translation/requirements.txt")

View File

@ -57,7 +57,7 @@ from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.46.0.dev0")
check_min_version("4.45.0.dev0")
logger = get_logger(__name__)
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/translation/requirements.txt")

View File

@ -51,7 +51,7 @@ from transformers.utils.versions import require_version
logger = logging.getLogger(__name__)
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.46.0.dev0")
check_min_version("4.45.0.dev0")
require_version(
"datasets>=1.8.0", "To fix: pip install -r examples/tensorflow/contrastive-image-text/requirements.txt"

View File

@ -55,7 +55,7 @@ from transformers.utils.versions import require_version
logger = logging.getLogger(__name__)
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.46.0.dev0")
check_min_version("4.45.0.dev0")
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/image-classification/requirements.txt")

View File

@ -50,7 +50,7 @@ from transformers.utils import PaddingStrategy, check_min_version, send_example_
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.46.0.dev0")
check_min_version("4.45.0.dev0")
logger = logging.getLogger(__name__)

View File

@ -62,7 +62,7 @@ except (ModuleNotFoundError, ImportError):
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.46.0.dev0")
check_min_version("4.45.0.dev0")
logger = logging.getLogger(__name__)

View File

@ -53,7 +53,7 @@ from transformers.utils.versions import require_version
# region Checking dependencies
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.46.0.dev0")
check_min_version("4.45.0.dev0")
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/summarization/requirements.txt")

View File

@ -47,7 +47,7 @@ from transformers.utils import check_min_version, send_example_telemetry
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.46.0.dev0")
check_min_version("4.45.0.dev0")
task_to_keys = {
"cola": ("sentence", None),

View File

@ -56,7 +56,7 @@ from transformers.utils.versions import require_version
# region Dependencies and constants
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.46.0.dev0")
check_min_version("4.45.0.dev0")
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/summarization/requirements.txt")

View File

@ -181,7 +181,7 @@ _deps = [
"timeout-decorator",
"tiktoken",
"timm<=0.9.16",
"tokenizers>=0.20,<0.21",
"tokenizers>=0.19,<0.20",
"torch",
"torchaudio",
"torchvision",
@ -436,7 +436,7 @@ install_requires = [
setup(
name="transformers",
version="4.46.0.dev0", # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots)
version="4.45.0.dev0", # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots)
author="The Hugging Face team (past and future) with the help of all our contributors (https://github.com/huggingface/transformers/graphs/contributors)",
author_email="transformers@huggingface.co",
description="State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow",

View File

@ -18,7 +18,7 @@
# to defer the actual importing for when the objects are requested. This way `import transformers` provides the names
# in the namespace without actually importing anything (and especially none of the backends).
__version__ = "4.46.0.dev0"
__version__ = "4.45.0.dev0"
from typing import TYPE_CHECKING
@ -481,7 +481,6 @@ _import_structure = {
"models.ibert": ["IBertConfig"],
"models.idefics": ["IdeficsConfig"],
"models.idefics2": ["Idefics2Config"],
"models.idefics3": ["Idefics3Config"],
"models.imagegpt": ["ImageGPTConfig"],
"models.informer": ["InformerConfig"],
"models.instructblip": [
@ -578,10 +577,6 @@ _import_structure = {
"models.mimi": ["MimiConfig"],
"models.mistral": ["MistralConfig"],
"models.mixtral": ["MixtralConfig"],
"models.mllama": [
"MllamaConfig",
"MllamaProcessor",
],
"models.mluke": [],
"models.mobilebert": [
"MobileBertConfig",
@ -614,10 +609,6 @@ _import_structure = {
"models.nystromformer": ["NystromformerConfig"],
"models.olmo": ["OlmoConfig"],
"models.olmoe": ["OlmoeConfig"],
"models.omdet_turbo": [
"OmDetTurboConfig",
"OmDetTurboProcessor",
],
"models.oneformer": [
"OneFormerConfig",
"OneFormerProcessor",
@ -967,7 +958,6 @@ _import_structure = {
"AqlmConfig",
"AwqConfig",
"BitsAndBytesConfig",
"CompressedTensorsConfig",
"EetqConfig",
"FbgemmFp8Config",
"GPTQConfig",
@ -1192,7 +1182,6 @@ else:
_import_structure["models.grounding_dino"].extend(["GroundingDinoImageProcessor"])
_import_structure["models.idefics"].extend(["IdeficsImageProcessor"])
_import_structure["models.idefics2"].extend(["Idefics2ImageProcessor"])
_import_structure["models.idefics3"].extend(["Idefics3ImageProcessor"])
_import_structure["models.imagegpt"].extend(["ImageGPTFeatureExtractor", "ImageGPTImageProcessor"])
_import_structure["models.instructblipvideo"].extend(["InstructBlipVideoImageProcessor"])
_import_structure["models.layoutlmv2"].extend(["LayoutLMv2FeatureExtractor", "LayoutLMv2ImageProcessor"])
@ -1205,7 +1194,6 @@ else:
)
_import_structure["models.mask2former"].append("Mask2FormerImageProcessor")
_import_structure["models.maskformer"].extend(["MaskFormerFeatureExtractor", "MaskFormerImageProcessor"])
_import_structure["models.mllama"].extend(["MllamaImageProcessor"])
_import_structure["models.mobilenet_v1"].extend(["MobileNetV1FeatureExtractor", "MobileNetV1ImageProcessor"])
_import_structure["models.mobilenet_v2"].extend(["MobileNetV2FeatureExtractor", "MobileNetV2ImageProcessor"])
_import_structure["models.mobilevit"].extend(["MobileViTFeatureExtractor", "MobileViTImageProcessor"])
@ -2422,14 +2410,6 @@ else:
"Idefics2Processor",
]
)
_import_structure["models.idefics3"].extend(
[
"Idefics3ForConditionalGeneration",
"Idefics3Model",
"Idefics3PreTrainedModel",
"Idefics3Processor",
]
)
_import_structure["models.imagegpt"].extend(
[
"ImageGPTForCausalImageModeling",
@ -2719,16 +2699,6 @@ else:
"MixtralPreTrainedModel",
]
)
_import_structure["models.mllama"].extend(
[
"MllamaForCausalLM",
"MllamaForConditionalGeneration",
"MllamaPreTrainedModel",
"MllamaProcessor",
"MllamaTextModel",
"MllamaVisionModel",
]
)
_import_structure["models.mobilebert"].extend(
[
"MobileBertForMaskedLM",
@ -2890,12 +2860,6 @@ else:
"OlmoePreTrainedModel",
]
)
_import_structure["models.omdet_turbo"].extend(
[
"OmDetTurboForObjectDetection",
"OmDetTurboPreTrainedModel",
]
)
_import_structure["models.oneformer"].extend(
[
"OneFormerForUniversalSegmentation",
@ -3030,7 +2994,7 @@ else:
"Pix2StructVisionModel",
]
)
_import_structure["models.pixtral"].extend(["PixtralPreTrainedModel", "PixtralVisionModel"])
_import_structure["models.pixtral"].extend(["PixtralModel", "PixtralPreTrainedModel"])
_import_structure["models.plbart"].extend(
[
"PLBartForCausalLM",
@ -5299,7 +5263,6 @@ if TYPE_CHECKING:
IdeficsConfig,
)
from .models.idefics2 import Idefics2Config
from .models.idefics3 import Idefics3Config
from .models.imagegpt import ImageGPTConfig
from .models.informer import InformerConfig
from .models.instructblip import (
@ -5403,10 +5366,6 @@ if TYPE_CHECKING:
)
from .models.mistral import MistralConfig
from .models.mixtral import MixtralConfig
from .models.mllama import (
MllamaConfig,
MllamaProcessor,
)
from .models.mobilebert import (
MobileBertConfig,
MobileBertTokenizer,
@ -5447,10 +5406,6 @@ if TYPE_CHECKING:
)
from .models.olmo import OlmoConfig
from .models.olmoe import OlmoeConfig
from .models.omdet_turbo import (
OmDetTurboConfig,
OmDetTurboProcessor,
)
from .models.oneformer import (
OneFormerConfig,
OneFormerProcessor,
@ -5847,7 +5802,6 @@ if TYPE_CHECKING:
AqlmConfig,
AwqConfig,
BitsAndBytesConfig,
CompressedTensorsConfig,
EetqConfig,
FbgemmFp8Config,
GPTQConfig,
@ -6048,7 +6002,6 @@ if TYPE_CHECKING:
from .models.grounding_dino import GroundingDinoImageProcessor
from .models.idefics import IdeficsImageProcessor
from .models.idefics2 import Idefics2ImageProcessor
from .models.idefics3 import Idefics3ImageProcessor
from .models.imagegpt import ImageGPTFeatureExtractor, ImageGPTImageProcessor
from .models.instructblipvideo import InstructBlipVideoImageProcessor
from .models.layoutlmv2 import (
@ -6068,7 +6021,6 @@ if TYPE_CHECKING:
MaskFormerFeatureExtractor,
MaskFormerImageProcessor,
)
from .models.mllama import MllamaImageProcessor
from .models.mobilenet_v1 import (
MobileNetV1FeatureExtractor,
MobileNetV1ImageProcessor,
@ -7083,12 +7035,6 @@ if TYPE_CHECKING:
Idefics2PreTrainedModel,
Idefics2Processor,
)
from .models.idefics3 import (
Idefics3ForConditionalGeneration,
Idefics3Model,
Idefics3PreTrainedModel,
Idefics3Processor,
)
from .models.imagegpt import (
ImageGPTForCausalImageModeling,
ImageGPTForImageClassification,
@ -7308,14 +7254,6 @@ if TYPE_CHECKING:
MixtralModel,
MixtralPreTrainedModel,
)
from .models.mllama import (
MllamaForCausalLM,
MllamaForConditionalGeneration,
MllamaPreTrainedModel,
MllamaProcessor,
MllamaTextModel,
MllamaVisionModel,
)
from .models.mobilebert import (
MobileBertForMaskedLM,
MobileBertForMultipleChoice,
@ -7443,10 +7381,6 @@ if TYPE_CHECKING:
OlmoeModel,
OlmoePreTrainedModel,
)
from .models.omdet_turbo import (
OmDetTurboForObjectDetection,
OmDetTurboPreTrainedModel,
)
from .models.oneformer import (
OneFormerForUniversalSegmentation,
OneFormerModel,
@ -7552,8 +7486,8 @@ if TYPE_CHECKING:
Pix2StructVisionModel,
)
from .models.pixtral import (
PixtralModel,
PixtralPreTrainedModel,
PixtralVisionModel,
)
from .models.plbart import (
PLBartForCausalLM,

View File

@ -80,12 +80,10 @@ class Cache(torch.nn.Module):
def reorder_cache(self, beam_idx: torch.LongTensor):
"""Reorders the cache for beam search, given the selected beam indices."""
for layer_idx in range(len(self.key_cache)):
if self.key_cache[layer_idx] != []:
device = self.key_cache[layer_idx].device
self.key_cache[layer_idx] = self.key_cache[layer_idx].index_select(0, beam_idx.to(device))
if self.value_cache[layer_idx] != []:
device = self.value_cache[layer_idx].device
self.value_cache[layer_idx] = self.value_cache[layer_idx].index_select(0, beam_idx.to(device))
device = self.key_cache[layer_idx].device
self.key_cache[layer_idx] = self.key_cache[layer_idx].index_select(0, beam_idx.to(device))
device = self.value_cache[layer_idx].device
self.value_cache[layer_idx] = self.value_cache[layer_idx].index_select(0, beam_idx.to(device))
@property
def seen_tokens(self):
@ -360,14 +358,10 @@ class DynamicCache(Cache):
```
"""
def __init__(self, num_hidden_layers: Optional[int] = None) -> None:
def __init__(self) -> None:
super().__init__()
if num_hidden_layers is None:
self.key_cache: List[torch.Tensor] = []
self.value_cache: List[torch.Tensor] = []
else:
self.key_cache: List[torch.Tensor] = [[] for _ in range(num_hidden_layers)]
self.value_cache: List[torch.Tensor] = [[] for _ in range(num_hidden_layers)]
self.key_cache: List[torch.Tensor] = []
self.value_cache: List[torch.Tensor] = []
self._seen_tokens = 0 # Used in `generate` to keep tally of how many tokens the cache has seen
def __getitem__(self, layer_idx: int) -> List[Tuple[torch.Tensor]]:
@ -426,11 +420,6 @@ class DynamicCache(Cache):
if len(self.key_cache) <= layer_idx:
self.key_cache.append(key_states)
self.value_cache.append(value_states)
# content on layer cache can be a tensor and checking not tensor causes errors
# so we explicitly check for the empty list
elif self.key_cache[layer_idx] == []:
self.key_cache[layer_idx] = key_states
self.value_cache[layer_idx] = value_states
else:
self.key_cache[layer_idx] = torch.cat([self.key_cache[layer_idx], key_states], dim=-2)
self.value_cache[layer_idx] = torch.cat([self.value_cache[layer_idx], value_states], dim=-2)
@ -440,7 +429,7 @@ class DynamicCache(Cache):
def get_seq_length(self, layer_idx: Optional[int] = 0) -> int:
"""Returns the sequence length of the cached states. A layer index can be optionally passed."""
# TODO: deprecate this function in favor of `cache_position`
if len(self.key_cache) <= layer_idx or (len(self.key_cache) > layer_idx and self.key_cache[layer_idx] == []):
if len(self.key_cache) <= layer_idx:
return 0
return self.key_cache[layer_idx].shape[-2]
@ -457,12 +446,10 @@ class DynamicCache(Cache):
return legacy_cache
@classmethod
def from_legacy_cache(
cls, past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, num_hidden_layers: int = None
) -> "DynamicCache":
def from_legacy_cache(cls, past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None) -> "DynamicCache":
"""Converts a cache in the legacy cache format into an equivalent `DynamicCache`. Used for
backward compatibility."""
cache = cls(num_hidden_layers)
cache = cls()
if past_key_values is not None:
for layer_idx in range(len(past_key_values)):
key_states, value_states = past_key_values[layer_idx]
@ -481,16 +468,15 @@ class DynamicCache(Cache):
self._seen_tokens = max_length
for idx in range(len(self.key_cache)):
if self.key_cache[idx] != []:
self.key_cache[idx] = self.key_cache[idx][..., :max_length, :]
self.value_cache[idx] = self.value_cache[idx][..., :max_length, :]
self.key_cache[idx] = self.key_cache[idx][..., :max_length, :]
self.value_cache[idx] = self.value_cache[idx][..., :max_length, :]
def batch_split(self, full_batch_size: int, split_size: int, num_hidden_layers: int) -> List["DynamicCache"]:
def batch_split(self, full_batch_size: int, split_size: int) -> List["DynamicCache"]:
"""Split the current instance into a list of `DynamicCache` by the batch size. This will be used by
`_split_model_inputs()` in `generation.utils`"""
out = []
for i in range(0, full_batch_size, split_size):
current_split = DynamicCache(num_hidden_layers)
current_split = DynamicCache()
current_split._seen_tokens = self._seen_tokens
current_split.key_cache = [tensor[i : i + split_size] for tensor in self.key_cache]
current_split.value_cache = [tensor[i : i + split_size] for tensor in self.value_cache]
@ -498,17 +484,14 @@ class DynamicCache(Cache):
return out
@classmethod
def from_batch_splits(cls, splits: List["DynamicCache"], num_hidden_layers: int) -> "DynamicCache":
def from_batch_splits(cls, splits: List["DynamicCache"]) -> "DynamicCache":
"""This is the opposite of the above `batch_split()` method. This will be used by `stack_model_outputs` in
`generation.utils`"""
cache = cls(num_hidden_layers)
cache = cls()
for idx in range(len(splits[0])):
key_cache = [current.key_cache[idx] for current in splits if current.key_cache[idx] != []]
value_cache = [current.key_cache[idx] for current in splits if current.key_cache[idx] != []]
if key_cache != []:
layer_keys = torch.cat(key_cache, dim=0)
layer_values = torch.cat(value_cache, dim=0)
cache.update(layer_keys, layer_values, idx)
layer_keys = torch.cat([current.key_cache[idx] for current in splits], dim=0)
layer_values = torch.cat([current.value_cache[idx] for current in splits], dim=0)
cache.update(layer_keys, layer_values, idx)
return cache
def batch_repeat_interleave(self, repeats: int):
@ -1408,13 +1391,10 @@ class EncoderDecoderCache(Cache):
@classmethod
def from_legacy_cache(
cls, past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, num_hidden_layers: int = None
cls, past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
) -> "EncoderDecoderCache":
"""Converts a cache in the legacy cache format into an equivalent `EncoderDecoderCache`."""
cache = cls(
self_attention_cache=DynamicCache(num_hidden_layers),
cross_attention_cache=DynamicCache(num_hidden_layers),
)
cache = cls(self_attention_cache=DynamicCache(), cross_attention_cache=DynamicCache())
if past_key_values is not None:
for layer_idx in range(len(past_key_values)):
key_states, value_states = past_key_values[layer_idx][:2]
@ -1427,10 +1407,7 @@ class EncoderDecoderCache(Cache):
def get_seq_length(self, layer_idx: Optional[int] = 0) -> int:
"""Returns the sequence length of the cached states. A layer index can be optionally passed."""
# check if empty list because in case of static cache it will be a tensors and we can't check `if not torch.Tensor`
if self.self_attention_cache.key_cache == []:
return 0
if len(self.self_attention_cache.key_cache) > 1 and self.self_attention_cache.key_cache[layer_idx] == []:
if len(self.self_attention_cache.key_cache) <= layer_idx:
return 0
return (self.self_attention_cache.key_cache[layer_idx][0, 0].any(dim=-1)).sum()
@ -1471,14 +1448,12 @@ class EncoderDecoderCache(Cache):
self.check_dynamic_cache(self.crop.__name__)
self.self_attention_cache.crop(maximum_length)
def batch_split(
self, full_batch_size: int, split_size: int, num_hidden_layers: int
) -> "List[EncoderDecoderCache]":
def batch_split(self, full_batch_size: int, split_size: int) -> "List[EncoderDecoderCache]":
"""Split the current instance into a list of `DynamicCache` by the batch size. This will be used by
`_split_model_inputs()` in `generation.utils`"""
self.check_dynamic_cache(self.batch_split.__name__)
self_attention_cache = self.self_attention_cache.batch_split(full_batch_size, split_size, num_hidden_layers)
cross_attention_cache = self.cross_attention_cache.batch_split(full_batch_size, split_size, num_hidden_layers)
self_attention_cache = self.self_attention_cache.batch_split(full_batch_size, split_size)
cross_attention_cache = self.cross_attention_cache.batch_split(full_batch_size, split_size)
out = []
for self_attn, cross_attn in zip(self_attention_cache, cross_attention_cache):
@ -1486,11 +1461,11 @@ class EncoderDecoderCache(Cache):
return out
@classmethod
def from_batch_splits(cls, splits: List["EncoderDecoderCache"], num_hidden_layers: int) -> "EncoderDecoderCache":
def from_batch_splits(cls, splits: List["EncoderDecoderCache"]) -> "EncoderDecoderCache":
"""This is the opposite of the above `batch_split()` method. This will be used by `stack_model_outputs` in
`generation.utils`"""
self_attention_cache = DynamicCache(num_hidden_layers)
cross_attention_cache = DynamicCache(num_hidden_layers)
self_attention_cache = DynamicCache()
cross_attention_cache = DynamicCache()
for idx in range(len(splits[0])):
layer_keys = torch.cat([current.self_attention_cache.key_cache[idx] for current in splits], dim=0)
layer_values = torch.cat([current.self_attention_cache.value_cache[idx] for current in splits], dim=0)

View File

@ -609,12 +609,33 @@ class SpmConverter(Converter):
for id, p in enumerate(proto.pieces)
if p.type in [3, 4]
]
tokenizer.add_tokens(
[
AddedToken(token, normalized=False, special=special)
for id, token, special in sorted(spm_added_tokens, key=lambda x: x[0])
]
)
tokens_to_add = [
AddedToken(token, normalized=False, special=special)
for id, token, special in sorted(spm_added_tokens, key=lambda x: x[0])
]
if len(tokens_to_add) > 0:
# super hack: if a token.special is set, tokenizer ignores it for now so FIXME @ArthurZ
# Accumulate added tokens into batches of special/non-special tokens, because calling add_tokens() for
# individual tokens would repeatedly rebuild a trie, which can be slow.
is_last_special = None
tokens = []
for token in tokens_to_add:
is_special = token.special
if is_last_special is None or is_last_special == is_special:
tokens.append(token)
else:
if is_last_special:
tokenizer.add_special_tokens(tokens)
else:
tokenizer.add_tokens(tokens)
tokens = [token]
is_last_special = is_special
if tokens:
if is_last_special:
tokenizer.add_special_tokens(tokens)
else:
tokenizer.add_tokens(tokens)
return tokenizer

View File

@ -86,7 +86,7 @@ deps = {
"timeout-decorator": "timeout-decorator",
"tiktoken": "tiktoken",
"timm": "timm<=0.9.16",
"tokenizers": "tokenizers>=0.20,<0.21",
"tokenizers": "tokenizers>=0.19,<0.20",
"torch": "torch",
"torchaudio": "torchaudio",
"torchvision": "torchvision",

View File

@ -159,8 +159,6 @@ class AssistedCandidateGenerator(CandidateGenerator):
self.generation_config.return_dict_in_generate = True
self.generation_config.output_scores = True
self.generation_config.assistant_confidence_threshold = self.assistant_confidence_threshold
# this flag allow us set the confidence stopping criteria for assistant model generation.
self.generation_config.is_assistant = True
# avoid unnecessary warnings that min_length is larger than max_new_tokens
# remove the `MinLengthLogitsProcessor` if exists (NOTE: no need to check for `MinNewTokensLogitsProcessor`)
@ -398,15 +396,12 @@ def _crop_past_key_values(model, past_key_values, max_length):
past_key_values.crop(max_length)
elif past_key_values is not None:
for idx in range(len(past_key_values)):
if past_key_values[idx] != ([], []):
new_past.append(
(
past_key_values[idx][0][:, :, :max_length, :],
past_key_values[idx][1][:, :, :max_length, :],
)
new_past.append(
(
past_key_values[idx][0][:, :, :max_length, :],
past_key_values[idx][1][:, :, :max_length, :],
)
else:
new_past.append((past_key_values[idx][0], past_key_values[idx][1]))
)
past_key_values = tuple(new_past)
return past_key_values

View File

@ -338,20 +338,19 @@ class GenerationConfig(PushToHubMixin):
(e.g. multilingual models with different target languages in one batch)
> Generation parameters exclusive to assistant generation
is_assistant (`bool`, *optional*, defaults to `False`):
Whether the model is an assistant (draft) model.
num_assistant_tokens (`int`, *optional*, defaults to 20):
num_assistant_tokens (`int`, *optional*, defaults to 5):
Defines the number of _speculative tokens_ that shall be generated by the assistant model before being
checked by the target model at each iteration. Higher values for `num_assistant_tokens` make the generation
more _speculative_ : If the assistant model is performant larger speed-ups can be reached, if the assistant
model requires lots of corrections, lower speed-ups are reached.
num_assistant_tokens_schedule (`str`, *optional*, defaults to `"constant"`):
num_assistant_tokens_schedule (`str`, *optional*, defaults to `"heuristic"`):
Defines the schedule at which max assistant tokens shall be changed during inference.
- `"heuristic"`: When all speculative tokens are correct, increase `num_assistant_tokens` by 2 else
reduce by 1. `num_assistant_tokens` value is persistent over multiple generation calls with the same assistant model.
- `"heuristic_transient"`: Same as `"heuristic"` but `num_assistant_tokens` is reset to its initial value after each generation call.
- `"constant"`: `num_assistant_tokens` stays unchanged during generation
assistant_confidence_threshold (`float`, *optional*, defaults to 0.4):
assistant_confidence_threshold (`float`, *optional*):
The confidence threshold for the assistant model. If the assistant model's confidence in its prediction for the current token is lower
than this threshold, the assistant model stops the current token generation iteration, even if the number of _speculative tokens_
(defined by `num_assistant_tokens`) is not yet reached. It is an unsupervised version of the dynamic speculation lookahead
@ -453,10 +452,9 @@ class GenerationConfig(PushToHubMixin):
self.decoder_start_token_id = kwargs.pop("decoder_start_token_id", None)
# Assistant generation
self.is_assistant = False
self.num_assistant_tokens = kwargs.pop("num_assistant_tokens", 20)
self.num_assistant_tokens_schedule = kwargs.pop("num_assistant_tokens_schedule", "constant")
self.assistant_confidence_threshold = kwargs.pop("assistant_confidence_threshold", 0.4)
self.num_assistant_tokens = kwargs.pop("num_assistant_tokens", 5)
self.num_assistant_tokens_schedule = kwargs.pop("num_assistant_tokens_schedule", "heuristic")
self.assistant_confidence_threshold = kwargs.pop("assistant_confidence_threshold", None)
# Prompt lookup decoding
self.prompt_lookup_num_tokens = kwargs.pop("prompt_lookup_num_tokens", None)

View File

@ -32,7 +32,6 @@ from ..cache_utils import (
OffloadedCache,
QuantizedCacheConfig,
)
from ..configuration_utils import PretrainedConfig
from ..integrations.deepspeed import is_deepspeed_zero3_enabled
from ..modeling_outputs import CausalLMOutputWithPast, Seq2SeqLMOutput
from ..pytorch_utils import isin_mps_friendly
@ -954,8 +953,7 @@ class GenerationMixin:
if generation_config._eos_token_tensor is not None:
criteria.append(EosTokenCriteria(eos_token_id=generation_config._eos_token_tensor))
if (
generation_config.is_assistant
and generation_config.assistant_confidence_threshold is not None
generation_config.assistant_confidence_threshold is not None
and generation_config.assistant_confidence_threshold > 0
):
criteria.append(
@ -1602,11 +1600,10 @@ class GenerationMixin:
# Use DynamicCache() instance by default. This will avoid back and forth from legacy format that
# keeps copying the cache thus using much more memory
else:
num_hidden_layers = self.config.get_text_config().num_hidden_layers
model_kwargs[cache_name] = (
DynamicCache(num_hidden_layers)
DynamicCache()
if not requires_cross_attention_cache
else EncoderDecoderCache(DynamicCache(num_hidden_layers), DynamicCache(num_hidden_layers))
else EncoderDecoderCache(DynamicCache(), DynamicCache())
)
def _supports_num_logits_to_keep(self) -> bool:
@ -2386,7 +2383,11 @@ class GenerationMixin:
this_peer_finished = False
# prepare layers for DoLa decoding
final_layer = self.config.get_text_config().num_hidden_layers
final_layer = (
self.config.text_config.num_hidden_layers
if hasattr(self.config, "text_config")
else self.config.num_hidden_layers
)
# if the model has tied word embeddings, we skip the word embeddings (0-th) layer and start from the 2nd layer,
# as the early exit from word embeddings will become identity function
# if the model is really shallow (<=2 layers), we use the 1st layer if it's not the final layer and the 0-th
@ -2734,7 +2735,7 @@ class GenerationMixin:
model_kwargs["past_key_values"].crop(-1)
all_outputs.append(outputs)
outputs = stack_model_outputs(all_outputs, self.config.get_text_config())
outputs = stack_model_outputs(all_outputs)
else:
# compute the candidate tokens by the language model and collect their hidden_states
@ -3012,7 +3013,8 @@ class GenerationMixin:
# Clone is needed to avoid keeping a hanging ref to outputs.logits which may be very large for first iteration
# (the clone itself is always small)
next_token_logits = outputs.logits.clone()[:, -1, :].float()
# .float() is needed to retain precision for later logits manipulations
next_token_logits = outputs.logits[:, -1, :].clone().float()
# pre-process distribution
next_token_scores = logits_processor(input_ids, next_token_logits)
@ -3239,16 +3241,13 @@ class GenerationMixin:
)
inputs_per_sub_batches = _split_model_inputs(
model_inputs,
split_size=batch_size,
full_batch_size=batch_beam_size,
config=self.config.get_text_config(),
model_inputs, split_size=batch_size, full_batch_size=batch_beam_size
)
outputs_per_sub_batch = [
self(**inputs_per_sub_batch, return_dict=True) for inputs_per_sub_batch in inputs_per_sub_batches
]
outputs = stack_model_outputs(outputs_per_sub_batch, self.config.get_text_config())
outputs = stack_model_outputs(outputs_per_sub_batch)
else: # Unchanged original behavior
outputs = self(**model_inputs, return_dict=True)
@ -4004,7 +4003,7 @@ class GenerationMixin:
isinstance(past_key_values, EncoderDecoderCache)
and isinstance(past_key_values.self_attention_cache, DynamicCache)
):
if past_key_values.get_seq_length() == 0:
if len(past_key_values) == 0:
start_from_empty_dynamic_cache = True
this_peer_finished = False
@ -4313,7 +4312,7 @@ def _ranking_fast(
return selected_idx
def _split(data, full_batch_size: int, num_hidden_layers: int, split_size: int = None):
def _split(data, full_batch_size: int, split_size: int = None):
"""
Takes care of three cases:
1. data is a tensor: e.g. last_hidden_state, pooler_output etc. split them on the batch_size dim
@ -4331,7 +4330,7 @@ def _split(data, full_batch_size: int, num_hidden_layers: int, split_size: int =
elif isinstance(data, DynamicCache) or (
isinstance(data, EncoderDecoderCache) and isinstance(data.self_attention_cache, DynamicCache)
):
return data.batch_split(full_batch_size, split_size, num_hidden_layers)
return data.batch_split(full_batch_size, split_size)
elif isinstance(data, tuple):
# If the elements of the tuple are also tuples (e.g., past_key_values in our earlier example)
if isinstance(data[0], tuple):
@ -4350,7 +4349,7 @@ def _split(data, full_batch_size: int, num_hidden_layers: int, split_size: int =
def _split_model_inputs(
model_input: Union[ModelOutput, Dict], split_size: int, full_batch_size: int, config: PretrainedConfig
model_input: Union[ModelOutput, Dict], split_size: int, full_batch_size: int
) -> List[Union[ModelOutput, Dict]]:
"""
Split a ModelOutput object (or its subclasses) or Dict into a list of same-class objects based on a specified split
@ -4384,20 +4383,16 @@ def _split_model_inputs(
keys_to_ignore = ["cache_position", "encoder_outputs", "num_logits_to_keep"]
non_bool_keys = [k for k in keys if not isinstance(model_input[k], bool) and k not in keys_to_ignore]
num_hidden_layers = config.get_text_config().num_hidden_layers
# we split the tensors and tuples of tensors
data_split_list = [
{k: _split(model_input[k], full_batch_size, num_hidden_layers, split_size)[i] for k in non_bool_keys}
{k: _split(model_input[k], full_batch_size, split_size)[i] for k in non_bool_keys}
for i in range(full_batch_size // split_size)
]
# bool values are the same and replicated for each split
bool_data = {k: model_input[k] for k in bool_keys}
# encoder_outputs is a ModelOutput object and should be split by its own
if "encoder_outputs" in model_input:
encoder_outputs_split = _split_model_inputs(
model_input["encoder_outputs"], split_size, full_batch_size, config.get_text_config()
)
encoder_outputs_split = _split_model_inputs(model_input["encoder_outputs"], split_size, full_batch_size)
data_split_list = [
{**data_split, "encoder_outputs": encoder_outputs_split[i]} for i, data_split in enumerate(data_split_list)
]
@ -4415,7 +4410,7 @@ def _split_model_inputs(
return split_model_inputs
def stack_model_outputs(model_outputs: List[ModelOutput], config: PretrainedConfig) -> ModelOutput:
def stack_model_outputs(model_outputs: List[ModelOutput]) -> ModelOutput:
"""
Stack a list of ModelOutput objects (or its subclasses) along the batch_size dimension. The function infers the
specific ModelOutput subclass from the list provided.
@ -4425,7 +4420,6 @@ def stack_model_outputs(model_outputs: List[ModelOutput], config: PretrainedConf
# Infer the class from the first object in the list
model_output_cls = type(model_outputs[0])
num_hidden_layers = config.get_text_config().num_hidden_layers
# Ensure all objects are of the same type
if not all(isinstance(obj, model_output_cls) for obj in model_outputs):
@ -4442,9 +4436,9 @@ def stack_model_outputs(model_outputs: List[ModelOutput], config: PretrainedConf
return torch.cat(data, dim=0)
# New cache format
elif isinstance(data[0], DynamicCache):
return DynamicCache.from_batch_splits(data, num_hidden_layers=num_hidden_layers)
return DynamicCache.from_batch_splits(data)
elif isinstance(data[0], EncoderDecoderCache):
return EncoderDecoderCache.from_batch_splits(data, num_hidden_layers=num_hidden_layers)
return EncoderDecoderCache.from_batch_splits(data)
elif isinstance(data[0], tuple):
# If the elements of the tuple are also tuples (e.g., past_key_values in our earlier example)
if isinstance(data[0][0], tuple):

View File

@ -162,7 +162,6 @@ def _rescale_for_pil_conversion(image):
def to_pil_image(
image: Union[np.ndarray, "PIL.Image.Image", "torch.Tensor", "tf.Tensor", "jnp.ndarray"],
do_rescale: Optional[bool] = None,
image_mode: Optional[str] = None,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
) -> "PIL.Image.Image":
"""
@ -176,8 +175,6 @@ def to_pil_image(
Whether or not to apply the scaling factor (to make pixel values integers between 0 and 255). Will default
to `True` if the image type is a floating type and casting to `int` would result in a loss of precision,
and `False` otherwise.
image_mode (`str`, *optional*):
The mode to use for the PIL image. If unset, will use the default mode for the input image type.
input_data_format (`ChannelDimension`, *optional*):
The channel dimension format of the input image. If unset, will use the inferred format from the input.
@ -210,7 +207,7 @@ def to_pil_image(
image = rescale(image, 255)
image = image.astype(np.uint8)
return PIL.Image.fromarray(image, mode=image_mode)
return PIL.Image.fromarray(image)
# Logic adapted from torchvision resizing logic: https://github.com/pytorch/vision/blob/511924c1ced4ce0461197e5caa64ce5b9e558aab/torchvision/transforms/functional.py#L366

View File

@ -163,7 +163,7 @@ def convert_pytorch_state_dict_to_flax(pt_state_dict, flax_model):
# numpy currently does not support bfloat16, need to go over float32 in this case to not lose precision
if v.dtype == bfloat16:
v = v.float()
pt_state_dict[k] = v.cpu().numpy()
pt_state_dict[k] = v.numpy()
model_prefix = flax_model.base_model_prefix

View File

@ -1698,10 +1698,6 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix
raise ImportError(
f"{preface} you need flash_attn package version to be greater or equal than 2.1.0. Detected version {flash_attention_version}. {install_message}"
)
elif not torch.cuda.is_available():
raise ValueError(
f"{preface} Flash Attention 2 is not available on CPU. Please make sure torch can access a CUDA device."
)
else:
raise ImportError(f"{preface} Flash Attention 2 is not available. {install_message}")
elif torch.version.hip:

View File

@ -115,7 +115,6 @@ from . import (
ibert,
idefics,
idefics2,
idefics3,
imagegpt,
informer,
instructblip,
@ -154,7 +153,6 @@ from . import (
mimi,
mistral,
mixtral,
mllama,
mluke,
mobilebert,
mobilenet_v1,
@ -175,7 +173,6 @@ from . import (
nystromformer,
olmo,
olmoe,
omdet_turbo,
oneformer,
openai,
opt,

View File

@ -1575,7 +1575,7 @@ class AlignModel(AlignPreTrainedModel):
>>> image = Image.open(requests.get(url, stream=True).raw)
>>> inputs = processor(
... images=image, text=["a photo of a cat", "a photo of a dog"], return_tensors="pt", padding=True
... text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True
... )
>>> outputs = model(**inputs)

View File

@ -19,7 +19,11 @@ Image/Text processor class for ALIGN
from typing import List, Union
from ...image_utils import ImageInput
from ...processing_utils import ProcessingKwargs, ProcessorMixin, Unpack, _validate_images_text_input_order
from ...processing_utils import (
ProcessingKwargs,
ProcessorMixin,
Unpack,
)
from ...tokenization_utils_base import BatchEncoding, PreTokenizedInput, TextInput
@ -72,8 +76,8 @@ class AlignProcessor(ProcessorMixin):
def __call__(
self,
images: ImageInput = None,
text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None,
images: ImageInput = None,
audio=None,
videos=None,
**kwargs: Unpack[AlignProcessorKwargs],
@ -86,13 +90,13 @@ class AlignProcessor(ProcessorMixin):
to the doctsring of the above two methods for more information.
Args:
images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
tensor. Both channels-first and channels-last formats are supported.
text (`str`, `List[str]`):
The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
(pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
`is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
tensor. Both channels-first and channels-last formats are supported.
return_tensors (`str` or [`~utils.TensorType`], *optional*):
If set, will return tensors of a particular framework. Acceptable values are:
- `'tf'`: Return TensorFlow `tf.constant` objects.
@ -110,9 +114,6 @@ class AlignProcessor(ProcessorMixin):
"""
if text is None and images is None:
raise ValueError("You must specify either text or images.")
# check if images and text inputs are reversed for BC
images, text = _validate_images_text_input_order(images, text)
output_kwargs = self._merge_kwargs(
AlignProcessorKwargs,
tokenizer_init_kwargs=self.tokenizer.init_kwargs,

View File

@ -32,7 +32,7 @@ from ...modeling_outputs import (
)
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer
from ...utils import ModelOutput, add_start_docstrings_to_model_forward, logging, replace_return_docstrings, torch_int
from ...utils import ModelOutput, add_start_docstrings_to_model_forward, logging, replace_return_docstrings
from .configuration_altclip import AltCLIPConfig, AltCLIPTextConfig, AltCLIPVisionConfig
@ -100,8 +100,6 @@ ALTCLIP_VISION_INPUTS_DOCSTRING = r"""
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
interpolate_pos_encoding (`bool`, *optional*, defaults `False`):
Whether to interpolate the pre-trained position encodings.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
@ -139,8 +137,6 @@ ALTCLIP_INPUTS_DOCSTRING = r"""
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
interpolate_pos_encoding (`bool`, *optional*, defaults `False`):
Whether to interpolate the pre-trained position encodings.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
@ -1013,63 +1009,15 @@ class AltCLIPVisionEmbeddings(nn.Module):
self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim)
self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1)), persistent=False)
def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor:
"""
This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
images. This method is also adapted to support torch.jit tracing.
Adapted from:
- https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
- https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
"""
num_patches = embeddings.shape[1] - 1
self.position_embeddings = self.position_embedding.weight.unsqueeze(0)
num_positions = self.position_embeddings.shape[1] - 1
# always interpolate when tracing to ensure the exported model works for dynamic input shapes
if not torch.jit.is_tracing() and num_patches == num_positions and height == width:
return self.position_embeddings
class_pos_embed = self.position_embeddings[:, :1]
patch_pos_embed = self.position_embeddings[:, 1:]
dim = embeddings.shape[-1]
new_height = height // self.patch_size
new_width = width // self.patch_size
sqrt_num_positions = torch_int(num_positions**0.5)
patch_pos_embed = patch_pos_embed.reshape(1, sqrt_num_positions, sqrt_num_positions, dim)
patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2)
patch_pos_embed = nn.functional.interpolate(
patch_pos_embed,
size=(new_height, new_width),
mode="bicubic",
align_corners=False,
)
patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)
return torch.cat((class_pos_embed, patch_pos_embed), dim=1)
def forward(self, pixel_values: torch.FloatTensor, interpolate_pos_encoding=False) -> torch.Tensor:
batch_size, _, height, width = pixel_values.shape
if not interpolate_pos_encoding and (height != self.image_size or width != self.image_size):
raise ValueError(
f"Input image size ({height}*{width}) doesn't match model" f" ({self.image_size}*{self.image_size})."
)
def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor:
batch_size = pixel_values.shape[0]
target_dtype = self.patch_embedding.weight.dtype
patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype)) # shape = [*, width, grid, grid]
patch_embeds = patch_embeds.flatten(2).transpose(1, 2)
class_embeds = self.class_embedding.expand(batch_size, 1, -1)
embeddings = torch.cat([class_embeds, patch_embeds], dim=1)
if interpolate_pos_encoding:
embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width)
else:
embeddings = embeddings + self.position_embedding(self.position_ids)
embeddings = embeddings + self.position_embedding(self.position_ids)
return embeddings
@ -1149,7 +1097,6 @@ class AltCLIPVisionTransformer(nn.Module):
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
interpolate_pos_encoding: Optional[bool] = False,
) -> Union[Tuple, BaseModelOutputWithPooling]:
r"""
Returns:
@ -1164,7 +1111,7 @@ class AltCLIPVisionTransformer(nn.Module):
if pixel_values is None:
raise ValueError("You have to specify pixel_values")
hidden_states = self.embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding)
hidden_states = self.embeddings(pixel_values)
hidden_states = self.pre_layrnorm(hidden_states)
encoder_outputs = self.encoder(
@ -1209,7 +1156,6 @@ class AltCLIPVisionModel(AltCLIPPreTrainedModel):
pixel_values: Optional[torch.FloatTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
interpolate_pos_encoding: bool = False,
return_dict: Optional[bool] = None,
) -> Union[Tuple, BaseModelOutputWithPooling]:
r"""
@ -1240,7 +1186,6 @@ class AltCLIPVisionModel(AltCLIPPreTrainedModel):
pixel_values=pixel_values,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
interpolate_pos_encoding=interpolate_pos_encoding,
return_dict=return_dict,
)
@ -1601,7 +1546,6 @@ class AltCLIPModel(AltCLIPPreTrainedModel):
pixel_values: Optional[torch.FloatTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
interpolate_pos_encoding: bool = False,
return_dict: Optional[bool] = None,
) -> torch.FloatTensor:
r"""
@ -1634,7 +1578,6 @@ class AltCLIPModel(AltCLIPPreTrainedModel):
pixel_values=pixel_values,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
interpolate_pos_encoding=interpolate_pos_encoding,
return_dict=return_dict,
)
@ -1655,7 +1598,6 @@ class AltCLIPModel(AltCLIPPreTrainedModel):
return_loss: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
interpolate_pos_encoding: bool = False,
return_dict: Optional[bool] = None,
) -> Union[Tuple, AltCLIPOutput]:
r"""
@ -1700,7 +1642,6 @@ class AltCLIPModel(AltCLIPPreTrainedModel):
pixel_values=pixel_values,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
interpolate_pos_encoding=interpolate_pos_encoding,
return_dict=return_dict,
)

View File

@ -60,7 +60,6 @@ CONFIG_MAPPING_NAMES = OrderedDict(
("chinese_clip_vision_model", "ChineseCLIPVisionConfig"),
("clap", "ClapConfig"),
("clip", "CLIPConfig"),
("clip_text_model", "CLIPTextConfig"),
("clip_vision_model", "CLIPVisionConfig"),
("clipseg", "CLIPSegConfig"),
("clvp", "ClvpConfig"),
@ -133,7 +132,6 @@ CONFIG_MAPPING_NAMES = OrderedDict(
("ibert", "IBertConfig"),
("idefics", "IdeficsConfig"),
("idefics2", "Idefics2Config"),
("idefics3", "Idefics3Config"),
("imagegpt", "ImageGPTConfig"),
("informer", "InformerConfig"),
("instructblip", "InstructBlipConfig"),
@ -173,7 +171,6 @@ CONFIG_MAPPING_NAMES = OrderedDict(
("mimi", "MimiConfig"),
("mistral", "MistralConfig"),
("mixtral", "MixtralConfig"),
("mllama", "MllamaConfig"),
("mobilebert", "MobileBertConfig"),
("mobilenet_v1", "MobileNetV1Config"),
("mobilenet_v2", "MobileNetV2Config"),
@ -194,7 +191,6 @@ CONFIG_MAPPING_NAMES = OrderedDict(
("nystromformer", "NystromformerConfig"),
("olmo", "OlmoConfig"),
("olmoe", "OlmoeConfig"),
("omdet-turbo", "OmDetTurboConfig"),
("oneformer", "OneFormerConfig"),
("open-llama", "OpenLlamaConfig"),
("openai-gpt", "OpenAIGPTConfig"),
@ -352,7 +348,6 @@ MODEL_NAMES_MAPPING = OrderedDict(
("chinese_clip_vision_model", "ChineseCLIPVisionModel"),
("clap", "CLAP"),
("clip", "CLIP"),
("clip_text_model", "CLIPTextModel"),
("clip_vision_model", "CLIPVisionModel"),
("clipseg", "CLIPSeg"),
("clvp", "CLVP"),
@ -433,7 +428,6 @@ MODEL_NAMES_MAPPING = OrderedDict(
("ibert", "I-BERT"),
("idefics", "IDEFICS"),
("idefics2", "Idefics2"),
("idefics3", "Idefics3"),
("imagegpt", "ImageGPT"),
("informer", "Informer"),
("instructblip", "InstructBLIP"),
@ -480,7 +474,6 @@ MODEL_NAMES_MAPPING = OrderedDict(
("mimi", "Mimi"),
("mistral", "Mistral"),
("mixtral", "Mixtral"),
("mllama", "Mllama"),
("mluke", "mLUKE"),
("mms", "MMS"),
("mobilebert", "MobileBERT"),
@ -504,7 +497,6 @@ MODEL_NAMES_MAPPING = OrderedDict(
("nystromformer", "Nyströmformer"),
("olmo", "OLMo"),
("olmoe", "OLMoE"),
("omdet-turbo", "OmDet-Turbo"),
("oneformer", "OneFormer"),
("open-llama", "OpenLlama"),
("openai-gpt", "OpenAI GPT"),
@ -673,7 +665,6 @@ SPECIAL_MODEL_TYPE_TO_MODULE_NAME = OrderedDict(
("xclip", "x_clip"),
("clip_vision_model", "clip"),
("qwen2_audio_encoder", "qwen2_audio"),
("clip_text_model", "clip"),
("siglip_vision_model", "siglip"),
("chinese_clip_vision_model", "chinese_clip"),
("rt_detr_resnet", "rt_detr"),

View File

@ -89,7 +89,6 @@ else:
("hiera", ("BitImageProcessor",)),
("idefics", ("IdeficsImageProcessor",)),
("idefics2", ("Idefics2ImageProcessor",)),
("idefics3", ("Idefics3ImageProcessor",)),
("imagegpt", ("ImageGPTImageProcessor",)),
("instructblip", ("BlipImageProcessor",)),
("instructblipvideo", ("InstructBlipVideoImageProcessor",)),
@ -104,7 +103,6 @@ else:
("mask2former", ("Mask2FormerImageProcessor",)),
("maskformer", ("MaskFormerImageProcessor",)),
("mgp-str", ("ViTImageProcessor", "ViTImageProcessorFast")),
("mllama", ("MllamaImageProcessor",)),
("mobilenet_v1", ("MobileNetV1ImageProcessor",)),
("mobilenet_v2", ("MobileNetV2ImageProcessor",)),
("mobilevit", ("MobileViTImageProcessor",)),

View File

@ -60,7 +60,6 @@ MODEL_MAPPING_NAMES = OrderedDict(
("chinese_clip_vision_model", "ChineseCLIPVisionModel"),
("clap", "ClapModel"),
("clip", "CLIPModel"),
("clip_text_model", "CLIPTextModel"),
("clip_vision_model", "CLIPVisionModel"),
("clipseg", "CLIPSegModel"),
("clvp", "ClvpModelForConditionalGeneration"),
@ -130,7 +129,6 @@ MODEL_MAPPING_NAMES = OrderedDict(
("ibert", "IBertModel"),
("idefics", "IdeficsModel"),
("idefics2", "Idefics2Model"),
("idefics3", "Idefics3Model"),
("imagegpt", "ImageGPTModel"),
("informer", "InformerModel"),
("jamba", "JambaModel"),
@ -183,7 +181,6 @@ MODEL_MAPPING_NAMES = OrderedDict(
("nystromformer", "NystromformerModel"),
("olmo", "OlmoModel"),
("olmoe", "OlmoeModel"),
("omdet-turbo", "OmDetTurboForObjectDetection"),
("oneformer", "OneFormerModel"),
("open-llama", "OpenLlamaModel"),
("openai-gpt", "OpenAIGPTModel"),
@ -198,7 +195,7 @@ MODEL_MAPPING_NAMES = OrderedDict(
("persimmon", "PersimmonModel"),
("phi", "PhiModel"),
("phi3", "Phi3Model"),
("pixtral", "PixtralVisionModel"),
("pixtral", "PixtralModel"),
("plbart", "PLBartModel"),
("poolformer", "PoolFormerModel"),
("prophetnet", "ProphetNetModel"),
@ -316,7 +313,6 @@ MODEL_FOR_PRETRAINING_MAPPING_NAMES = OrderedDict(
("ibert", "IBertForMaskedLM"),
("idefics", "IdeficsForVisionText2Text"),
("idefics2", "Idefics2ForConditionalGeneration"),
("idefics3", "Idefics3ForConditionalGeneration"),
("layoutlm", "LayoutLMForMaskedLM"),
("llava", "LlavaForConditionalGeneration"),
("llava_next", "LlavaNextForConditionalGeneration"),
@ -329,7 +325,6 @@ MODEL_FOR_PRETRAINING_MAPPING_NAMES = OrderedDict(
("mamba2", "Mamba2ForCausalLM"),
("mega", "MegaForMaskedLM"),
("megatron-bert", "MegatronBertForPreTraining"),
("mllama", "MllamaForConditionalGeneration"),
("mobilebert", "MobileBertForPreTraining"),
("mpnet", "MPNetForMaskedLM"),
("mpt", "MptForCausalLM"),
@ -503,7 +498,6 @@ MODEL_FOR_CAUSAL_LM_MAPPING_NAMES = OrderedDict(
("megatron-bert", "MegatronBertForCausalLM"),
("mistral", "MistralForCausalLM"),
("mixtral", "MixtralForCausalLM"),
("mllama", "MllamaForCausalLM"),
("mpt", "MptForCausalLM"),
("musicgen", "MusicgenForCausalLM"),
("musicgen_melody", "MusicgenMelodyForCausalLM"),
@ -570,7 +564,6 @@ MODEL_FOR_IMAGE_MAPPING_NAMES = OrderedDict(
("hiera", "HieraModel"),
("imagegpt", "ImageGPTModel"),
("levit", "LevitModel"),
("mllama", "MllamaVisionModel"),
("mobilenet_v1", "MobileNetV1Model"),
("mobilenet_v2", "MobileNetV2Model"),
("mobilevit", "MobileViTModel"),
@ -735,7 +728,6 @@ MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES = OrderedDict(
("chameleon", "ChameleonForConditionalGeneration"),
("git", "GitForCausalLM"),
("idefics2", "Idefics2ForConditionalGeneration"),
("idefics3", "Idefics3ForConditionalGeneration"),
("instructblip", "InstructBlipForConditionalGeneration"),
("instructblipvideo", "InstructBlipVideoForConditionalGeneration"),
("kosmos-2", "Kosmos2ForConditionalGeneration"),
@ -743,7 +735,6 @@ MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES = OrderedDict(
("llava_next", "LlavaNextForConditionalGeneration"),
("llava_next_video", "LlavaNextVideoForConditionalGeneration"),
("llava_onevision", "LlavaOnevisionForConditionalGeneration"),
("mllama", "MllamaForConditionalGeneration"),
("paligemma", "PaliGemmaForConditionalGeneration"),
("pix2struct", "Pix2StructForConditionalGeneration"),
("qwen2_vl", "Qwen2VLForConditionalGeneration"),
@ -821,7 +812,6 @@ MODEL_FOR_ZERO_SHOT_OBJECT_DETECTION_MAPPING_NAMES = OrderedDict(
[
# Model for Zero Shot Object Detection mapping
("grounding-dino", "GroundingDinoForObjectDetection"),
("omdet-turbo", "OmDetTurboForObjectDetection"),
("owlv2", "Owlv2ForObjectDetection"),
("owlvit", "OwlViTForObjectDetection"),
]
@ -1336,7 +1326,6 @@ MODEL_FOR_TEXT_ENCODING_MAPPING_NAMES = OrderedDict(
("albert", "AlbertModel"),
("bert", "BertModel"),
("big_bird", "BigBirdModel"),
("clip_text_model", "CLIPTextModel"),
("data2vec-text", "Data2VecTextModel"),
("deberta", "DebertaModel"),
("deberta-v2", "DebertaV2Model"),
@ -1345,7 +1334,6 @@ MODEL_FOR_TEXT_ENCODING_MAPPING_NAMES = OrderedDict(
("flaubert", "FlaubertModel"),
("ibert", "IBertModel"),
("longformer", "LongformerModel"),
("mllama", "MllamaTextModel"),
("mobilebert", "MobileBertModel"),
("mt5", "MT5EncoderModel"),
("nystromformer", "NystromformerModel"),

View File

@ -65,7 +65,6 @@ PROCESSOR_MAPPING_NAMES = OrderedDict(
("hubert", "Wav2Vec2Processor"),
("idefics", "IdeficsProcessor"),
("idefics2", "Idefics2Processor"),
("idefics3", "Idefics3Processor"),
("instructblip", "InstructBlipProcessor"),
("instructblipvideo", "InstructBlipVideoProcessor"),
("kosmos-2", "Kosmos2Processor"),
@ -78,7 +77,6 @@ PROCESSOR_MAPPING_NAMES = OrderedDict(
("markuplm", "MarkupLMProcessor"),
("mctct", "MCTCTProcessor"),
("mgp-str", "MgpstrProcessor"),
("mllama", "MllamaProcessor"),
("oneformer", "OneFormerProcessor"),
("owlv2", "Owlv2Processor"),
("owlvit", "OwlViTProcessor"),

View File

@ -219,7 +219,6 @@ else:
("ibert", ("RobertaTokenizer", "RobertaTokenizerFast" if is_tokenizers_available() else None)),
("idefics", (None, "LlamaTokenizerFast" if is_tokenizers_available() else None)),
("idefics2", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)),
("idefics3", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)),
("instructblip", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)),
("instructblipvideo", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)),
(
@ -306,7 +305,6 @@ else:
"LlamaTokenizerFast" if is_tokenizers_available() else None,
),
),
("mllama", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)),
("mluke", ("MLukeTokenizer" if is_sentencepiece_available() else None, None)),
("mobilebert", ("MobileBertTokenizer", "MobileBertTokenizerFast" if is_tokenizers_available() else None)),
("mpnet", ("MPNetTokenizer", "MPNetTokenizerFast" if is_tokenizers_available() else None)),
@ -346,10 +344,6 @@ else:
),
("olmo", (None, "GPTNeoXTokenizerFast" if is_tokenizers_available() else None)),
("olmoe", (None, "GPTNeoXTokenizerFast" if is_tokenizers_available() else None)),
(
"omdet-turbo",
("CLIPTokenizer", "CLIPTokenizerFast" if is_tokenizers_available() else None),
),
("oneformer", ("CLIPTokenizer", "CLIPTokenizerFast" if is_tokenizers_available() else None)),
(
"openai-gpt",

View File

@ -34,13 +34,7 @@ from ...modeling_outputs import (
)
from ...modeling_utils import PreTrainedModel, apply_chunking_to_forward
from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer
from ...utils import (
add_start_docstrings,
add_start_docstrings_to_model_forward,
logging,
replace_return_docstrings,
torch_int,
)
from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings
from .configuration_bridgetower import BridgeTowerConfig, BridgeTowerTextConfig, BridgeTowerVisionConfig
@ -117,8 +111,6 @@ BRIDGETOWER_INPUTS_DOCSTRING = r"""
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
interpolate_pos_encoding (`bool`, defaults to `False`):
Whether to interpolate the pre-trained position encodings.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
@ -284,63 +276,15 @@ class BridgeTowerVisionEmbeddings(nn.Module):
self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim)
self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1)), persistent=False)
def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor:
"""
This method interpolates the pre-trained position encodings so that the model can be used on higher-resolution
images. It is also adapted to support torch.jit tracing.
Adapted from:
- https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
- https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
"""
num_patches = embeddings.shape[1] - 1
self.position_embeddings = self.position_embedding.weight.unsqueeze(0)
num_positions = self.position_embeddings.shape[1] - 1
# always interpolate when tracing to ensure the exported model works for dynamic input shapes
if not torch.jit.is_tracing() and num_patches == num_positions and height == width:
return self.position_embeddings
class_pos_embed = self.position_embeddings[:, :1]
patch_pos_embed = self.position_embeddings[:, 1:]
dim = embeddings.shape[-1]
new_height = height // self.patch_size
new_width = width // self.patch_size
sqrt_num_positions = torch_int(num_positions**0.5)
patch_pos_embed = patch_pos_embed.reshape(1, sqrt_num_positions, sqrt_num_positions, dim)
patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2)
patch_pos_embed = nn.functional.interpolate(
patch_pos_embed,
size=(new_height, new_width),
mode="bicubic",
align_corners=False,
)
patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)
return torch.cat((class_pos_embed, patch_pos_embed), dim=1)
def forward(self, pixel_values: torch.FloatTensor, interpolate_pos_encoding=False) -> torch.Tensor:
batch_size, _, height, width = pixel_values.shape
if not interpolate_pos_encoding and (height != self.image_size or width != self.image_size):
raise ValueError(
f"Input image size ({height}*{width}) doesn't match model" f" ({self.image_size}*{self.image_size})."
)
def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor:
batch_size = pixel_values.shape[0]
target_dtype = self.patch_embedding.weight.dtype
patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype)) # shape = [*, width, grid, grid]
patch_embeds = patch_embeds.flatten(2).transpose(1, 2)
class_embeds = self.class_embedding.expand(batch_size, 1, -1)
embeddings = torch.cat([class_embeds, patch_embeds], dim=1)
if interpolate_pos_encoding:
embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width)
else:
embeddings = embeddings + self.position_embedding(self.position_ids)
embeddings = embeddings + self.position_embedding(self.position_ids)
return embeddings
@ -358,13 +302,8 @@ class BridgeTowerVisionTransformer(nn.Module):
[nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) for _ in range(config.num_hidden_layers)]
)
def forward(
self,
pixel_values: torch.Tensor,
attention_mask,
interpolate_pos_encoding: bool = False,
):
hidden_states = self.embeddings(pixel_values, interpolate_pos_encoding)
def forward(self, pixel_values: torch.Tensor, attention_mask):
hidden_states = self.embeddings(pixel_values)
hidden_states = self.ln_pre(hidden_states)
# NLD -> LND
hidden_states = hidden_states.permute(1, 0, 2)
@ -385,12 +324,8 @@ class BridgeTowerVisionTransformer(nn.Module):
hidden_states = torch.stack(hidden_states_stack, dim=0)
return hidden_states
def forward_pre(
self,
pixel_values: torch.Tensor,
interpolate_pos_encoding: bool = False,
):
hidden_states = self.embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding)
def forward_pre(self, pixel_values: torch.Tensor):
hidden_states = self.embeddings(pixel_values)
hidden_states = self.ln_pre(hidden_states)
# NLD -> LND
hidden_states = hidden_states.permute(1, 0, 2)
@ -1080,8 +1015,8 @@ class BridgeTowerVisionModel(BridgeTowerPreTrainedModel):
def dtype(self):
return self.visual.embeddings.patch_embedding.weight.dtype
def forward(self, image, image_mask=None, interpolate_pos_encoding=False):
return self.visual(image.type(self.dtype), image_mask, interpolate_pos_encoding)
def forward(self, image, image_mask=None):
return self.visual(image.type(self.dtype), image_mask)
class BridgeTowerTextModel(BridgeTowerPreTrainedModel):
@ -1345,7 +1280,6 @@ class BridgeTowerModel(BridgeTowerPreTrainedModel):
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
labels: Optional[torch.LongTensor] = None,
interpolate_pos_encoding: bool = False,
) -> Union[Tuple[torch.Tensor], BridgeTowerModelOutput]:
r"""
output_hidden_states (`bool`, *optional*):
@ -1418,9 +1352,7 @@ class BridgeTowerModel(BridgeTowerPreTrainedModel):
all_hidden_states_text += (text_embeds,)
if image_embeds is None:
image_embeds = self.vision_model.visual.forward_pre(
pixel_values.type(self.vision_model.dtype), interpolate_pos_encoding=interpolate_pos_encoding
)
image_embeds = self.vision_model.visual.forward_pre(pixel_values.type(self.vision_model.dtype))
else:
# Permute as BridgeTowerResidualAttention has batch_first=True
image_embeds = image_embeds.permute(1, 0, 2)

View File

@ -38,7 +38,6 @@ from ...utils import (
add_start_docstrings_to_model_forward,
logging,
replace_return_docstrings,
torch_int,
)
from .configuration_chinese_clip import ChineseCLIPConfig, ChineseCLIPTextConfig, ChineseCLIPVisionConfig
@ -189,63 +188,15 @@ class ChineseCLIPVisionEmbeddings(nn.Module):
self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim)
self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1)), persistent=False)
def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor:
"""
This method interpolates the pre-trained position encodings so that the model can be used on higher-resolution
images. It is also adapted to support torch.jit tracing.
Adapted from:
- https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
- https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
"""
num_patches = embeddings.shape[1] - 1
self.position_embeddings = self.position_embedding.weight.unsqueeze(0)
num_positions = self.position_embeddings.shape[1] - 1
# always interpolate when tracing to ensure the exported model works for dynamic input shapes
if not torch.jit.is_tracing() and num_patches == num_positions and height == width:
return self.position_embeddings
class_pos_embed = self.position_embeddings[:, :1]
patch_pos_embed = self.position_embeddings[:, 1:]
dim = embeddings.shape[-1]
new_height = height // self.patch_size
new_width = width // self.patch_size
sqrt_num_positions = torch_int(num_positions**0.5)
patch_pos_embed = patch_pos_embed.reshape(1, sqrt_num_positions, sqrt_num_positions, dim)
patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2)
patch_pos_embed = nn.functional.interpolate(
patch_pos_embed,
size=(new_height, new_width),
mode="bicubic",
align_corners=False,
)
patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)
return torch.cat((class_pos_embed, patch_pos_embed), dim=1)
def forward(self, pixel_values: torch.FloatTensor, interpolate_pos_encoding=False) -> torch.Tensor:
batch_size, _, height, width = pixel_values.shape
if not interpolate_pos_encoding and (height != self.image_size or width != self.image_size):
raise ValueError(
f"Input image size ({height}*{width}) doesn't match model" f" ({self.image_size}*{self.image_size})."
)
def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor:
batch_size = pixel_values.shape[0]
target_dtype = self.patch_embedding.weight.dtype
patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype)) # shape = [*, width, grid, grid]
patch_embeds = patch_embeds.flatten(2).transpose(1, 2)
class_embeds = self.class_embedding.expand(batch_size, 1, -1)
embeddings = torch.cat([class_embeds, patch_embeds], dim=1)
if interpolate_pos_encoding:
embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width)
else:
embeddings = embeddings + self.position_embedding(self.position_ids)
embeddings = embeddings + self.position_embedding(self.position_ids)
return embeddings
@ -847,8 +798,6 @@ CHINESE_CLIP_TEXT_INPUTS_DOCSTRING = r"""
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
interpolate_pos_encoding (`bool`, *optional*, defaults `False`):
Whether to interpolate the pre-trained position encodings.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
@ -864,8 +813,6 @@ CHINESE_CLIP_VISION_INPUTS_DOCSTRING = r"""
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
interpolate_pos_encoding (`bool`, *optional*, defaults `False`):
Whether to interpolate the pre-trained position encodings.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
@ -1105,7 +1052,6 @@ class ChineseCLIPVisionTransformer(nn.Module):
pixel_values: Optional[torch.FloatTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
interpolate_pos_encoding: bool = False,
return_dict: Optional[bool] = None,
) -> Union[Tuple, BaseModelOutputWithPooling]:
r"""
@ -1120,7 +1066,7 @@ class ChineseCLIPVisionTransformer(nn.Module):
if pixel_values is None:
raise ValueError("You have to specify pixel_values")
hidden_states = self.embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding)
hidden_states = self.embeddings(pixel_values)
hidden_states = self.pre_layrnorm(hidden_states)
encoder_outputs = self.encoder(
@ -1353,7 +1299,6 @@ class ChineseCLIPVisionModel(ChineseCLIPPreTrainedModel):
pixel_values: Optional[torch.FloatTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
interpolate_pos_encoding: bool = False,
return_dict: Optional[bool] = None,
) -> Union[Tuple, BaseModelOutputWithPooling]:
r"""
@ -1384,7 +1329,6 @@ class ChineseCLIPVisionModel(ChineseCLIPPreTrainedModel):
pixel_values=pixel_values,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
interpolate_pos_encoding=interpolate_pos_encoding,
return_dict=return_dict,
)
@ -1481,7 +1425,6 @@ class ChineseCLIPModel(ChineseCLIPPreTrainedModel):
pixel_values: Optional[torch.FloatTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
interpolate_pos_encoding: bool = False,
return_dict: Optional[bool] = None,
) -> torch.FloatTensor:
r"""
@ -1518,7 +1461,6 @@ class ChineseCLIPModel(ChineseCLIPPreTrainedModel):
pixel_values=pixel_values,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
interpolate_pos_encoding=interpolate_pos_encoding,
return_dict=return_dict,
)
@ -1539,7 +1481,6 @@ class ChineseCLIPModel(ChineseCLIPPreTrainedModel):
return_loss: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
interpolate_pos_encoding: bool = False,
return_dict: Optional[bool] = None,
) -> Union[Tuple, ChineseCLIPOutput]:
r"""
@ -1575,7 +1516,6 @@ class ChineseCLIPModel(ChineseCLIPPreTrainedModel):
pixel_values=pixel_values,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
interpolate_pos_encoding=interpolate_pos_encoding,
return_dict=return_dict,
)

View File

@ -36,7 +36,6 @@ from ...utils import (
is_flash_attn_greater_or_equal_2_10,
logging,
replace_return_docstrings,
torch_int,
)
from .configuration_clip import CLIPConfig, CLIPTextConfig, CLIPVisionConfig
@ -197,63 +196,15 @@ class CLIPVisionEmbeddings(nn.Module):
self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim)
self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1)), persistent=False)
def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor:
"""
This method interpolates the pre-trained position encodings so that the model can be used on higher-resolution
images. It is also adapted to support torch.jit tracing.
Adapted from:
- https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
- https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
"""
num_patches = embeddings.shape[1] - 1
self.position_embeddings = self.position_embedding.weight.unsqueeze(0)
num_positions = self.position_embeddings.shape[1] - 1
# always interpolate when tracing to ensure the exported model works for dynamic input shapes
if not torch.jit.is_tracing() and num_patches == num_positions and height == width:
return self.position_embeddings
class_pos_embed = self.position_embeddings[:, :1]
patch_pos_embed = self.position_embeddings[:, 1:]
dim = embeddings.shape[-1]
new_height = height // self.patch_size
new_width = width // self.patch_size
sqrt_num_positions = torch_int(num_positions**0.5)
patch_pos_embed = patch_pos_embed.reshape(1, sqrt_num_positions, sqrt_num_positions, dim)
patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2)
patch_pos_embed = nn.functional.interpolate(
patch_pos_embed,
size=(new_height, new_width),
mode="bicubic",
align_corners=False,
)
patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)
return torch.cat((class_pos_embed, patch_pos_embed), dim=1)
def forward(self, pixel_values: torch.FloatTensor, interpolate_pos_encoding=False) -> torch.Tensor:
batch_size, _, height, width = pixel_values.shape
if not interpolate_pos_encoding and (height != self.image_size or width != self.image_size):
raise ValueError(
f"Input image size ({height}*{width}) doesn't match model" f" ({self.image_size}*{self.image_size})."
)
def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor:
batch_size = pixel_values.shape[0]
target_dtype = self.patch_embedding.weight.dtype
patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype)) # shape = [*, width, grid, grid]
patch_embeds = patch_embeds.flatten(2).transpose(1, 2)
class_embeds = self.class_embedding.expand(batch_size, 1, -1)
embeddings = torch.cat([class_embeds, patch_embeds], dim=1)
if interpolate_pos_encoding:
embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width)
else:
embeddings = embeddings + self.position_embedding(self.position_ids)
embeddings = embeddings + self.position_embedding(self.position_ids)
return embeddings
@ -753,8 +704,6 @@ CLIP_VISION_INPUTS_DOCSTRING = r"""
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
interpolate_pos_encoding (`bool`, *optional*, defaults `False`):
Whether to interpolate the pre-trained position encodings.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
@ -792,8 +741,6 @@ CLIP_INPUTS_DOCSTRING = r"""
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
interpolate_pos_encoding (`bool`, *optional*, defaults `False`):
Whether to interpolate the pre-trained position encodings.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
@ -1076,7 +1023,6 @@ class CLIPVisionTransformer(nn.Module):
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
interpolate_pos_encoding: Optional[bool] = False,
) -> Union[Tuple, BaseModelOutputWithPooling]:
r"""
Returns:
@ -1091,7 +1037,7 @@ class CLIPVisionTransformer(nn.Module):
if pixel_values is None:
raise ValueError("You have to specify pixel_values")
hidden_states = self.embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding)
hidden_states = self.embeddings(pixel_values)
hidden_states = self.pre_layrnorm(hidden_states)
encoder_outputs = self.encoder(
@ -1141,7 +1087,6 @@ class CLIPVisionModel(CLIPPreTrainedModel):
pixel_values: Optional[torch.FloatTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
interpolate_pos_encoding: bool = False,
return_dict: Optional[bool] = None,
) -> Union[Tuple, BaseModelOutputWithPooling]:
r"""
@ -1173,7 +1118,6 @@ class CLIPVisionModel(CLIPPreTrainedModel):
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
interpolate_pos_encoding=interpolate_pos_encoding,
)
@ -1270,7 +1214,6 @@ class CLIPModel(CLIPPreTrainedModel):
pixel_values: Optional[torch.FloatTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
interpolate_pos_encoding: bool = False,
return_dict: Optional[bool] = None,
) -> torch.FloatTensor:
r"""
@ -1306,7 +1249,6 @@ class CLIPModel(CLIPPreTrainedModel):
pixel_values=pixel_values,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
interpolate_pos_encoding=interpolate_pos_encoding,
return_dict=return_dict,
)
@ -1326,7 +1268,6 @@ class CLIPModel(CLIPPreTrainedModel):
return_loss: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
interpolate_pos_encoding: bool = False,
return_dict: Optional[bool] = None,
) -> Union[Tuple, CLIPOutput]:
r"""
@ -1364,7 +1305,6 @@ class CLIPModel(CLIPPreTrainedModel):
pixel_values=pixel_values,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
interpolate_pos_encoding=interpolate_pos_encoding,
return_dict=return_dict,
)
@ -1526,7 +1466,6 @@ class CLIPVisionModelWithProjection(CLIPPreTrainedModel):
pixel_values: Optional[torch.FloatTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
interpolate_pos_encoding: bool = False,
return_dict: Optional[bool] = None,
) -> Union[Tuple, CLIPVisionModelOutput]:
r"""
@ -1556,7 +1495,6 @@ class CLIPVisionModelWithProjection(CLIPPreTrainedModel):
pixel_values=pixel_values,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
interpolate_pos_encoding=interpolate_pos_encoding,
return_dict=return_dict,
)

View File

@ -33,7 +33,6 @@ from ...utils import (
add_start_docstrings_to_model_forward,
logging,
replace_return_docstrings,
torch_int,
)
from .configuration_clipseg import CLIPSegConfig, CLIPSegTextConfig, CLIPSegVisionConfig
@ -164,62 +163,40 @@ class CLIPSegVisionEmbeddings(nn.Module):
self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim)
self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1)), persistent=False)
def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor:
"""
This method interpolates the pre-trained position encodings so that the model can be used on higher-resolution
images. It is also adapted to support torch.jit tracing.
def interpolate_position_embeddings(self, new_size):
if len(new_size) != 2:
raise ValueError("new_size should consist of 2 values")
Adapted from:
- https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
- https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
"""
num_patches = embeddings.shape[1] - 1
self.position_embeddings = self.position_embedding.weight.unsqueeze(0)
num_positions = self.position_embeddings.shape[1] - 1
# always interpolate when tracing to ensure the exported model works for dynamic input shapes
if not torch.jit.is_tracing() and num_patches == num_positions and height == width:
return self.position_embeddings
class_pos_embed = self.position_embeddings[:, :1]
patch_pos_embed = self.position_embeddings[:, 1:]
dim = embeddings.shape[-1]
new_height = height // self.patch_size
new_width = width // self.patch_size
sqrt_num_positions = torch_int(num_positions**0.5)
patch_pos_embed = patch_pos_embed.reshape(1, sqrt_num_positions, sqrt_num_positions, dim)
patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2)
patch_pos_embed = nn.functional.interpolate(
patch_pos_embed,
size=(new_height, new_width),
mode="bicubic",
align_corners=False,
num_patches_one_direction = int(self.num_patches**0.5)
# we interpolate the position embeddings in 2D
a = self.position_embedding.weight[1:].T.view(
1, self.config.hidden_size, num_patches_one_direction, num_patches_one_direction
)
b = (
nn.functional.interpolate(a, new_size, mode="bicubic", align_corners=False)
.squeeze(0)
.view(self.config.hidden_size, new_size[0] * new_size[1])
.T
)
result = torch.cat([self.position_embedding.weight[:1], b])
patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)
return result
return torch.cat((class_pos_embed, patch_pos_embed), dim=1)
def forward(self, pixel_values: torch.FloatTensor, interpolate_pos_encoding=False) -> torch.Tensor:
batch_size, _, height, width = pixel_values.shape
if not interpolate_pos_encoding and (height != self.image_size or width != self.image_size):
raise ValueError(
f"Input image size ({height}*{width}) doesn't match model" f" ({self.image_size}*{self.image_size})."
)
def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor:
batch_size = pixel_values.shape[0]
patch_embeds = self.patch_embedding(pixel_values) # shape = [*, width, grid, grid]
patch_embeds = patch_embeds.flatten(2).transpose(1, 2)
class_embeds = self.class_embedding.expand(batch_size, 1, -1)
embeddings = torch.cat([class_embeds, patch_embeds], dim=1)
if interpolate_pos_encoding:
embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width)
if embeddings.shape[1] != self.num_positions:
new_shape = int(math.sqrt(embeddings.shape[1] - 1))
embeddings = embeddings + self.interpolate_position_embeddings((new_shape, new_shape))
embeddings = embeddings.to(embeddings.dtype)
else:
embeddings = embeddings + self.position_embedding(self.position_ids)
return embeddings
@ -535,8 +512,6 @@ CLIPSEG_VISION_INPUTS_DOCSTRING = r"""
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
interpolate_pos_encoding (`bool`, *optional*, defaults to `False`):
Whether to interpolate the pre-trained position encodings.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
@ -574,8 +549,6 @@ CLIPSEG_INPUTS_DOCSTRING = r"""
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
interpolate_pos_encoding (`bool`, *optional*, defaults to `False`):
Whether to interpolate the pre-trained position encodings.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
@ -852,7 +825,6 @@ class CLIPSegVisionTransformer(nn.Module):
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
interpolate_pos_encoding: Optional[bool] = False,
) -> Union[Tuple, BaseModelOutputWithPooling]:
r"""
Returns:
@ -867,7 +839,7 @@ class CLIPSegVisionTransformer(nn.Module):
if pixel_values is None:
raise ValueError("You have to specify pixel_values")
hidden_states = self.embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding)
hidden_states = self.embeddings(pixel_values)
hidden_states = self.pre_layrnorm(hidden_states)
encoder_outputs = self.encoder(
@ -912,7 +884,6 @@ class CLIPSegVisionModel(CLIPSegPreTrainedModel):
pixel_values: Optional[torch.FloatTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
interpolate_pos_encoding: Optional[bool] = False,
return_dict: Optional[bool] = None,
) -> Union[Tuple, BaseModelOutputWithPooling]:
r"""
@ -941,7 +912,6 @@ class CLIPSegVisionModel(CLIPSegPreTrainedModel):
pixel_values=pixel_values,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
interpolate_pos_encoding=interpolate_pos_encoding,
return_dict=return_dict,
)
@ -1035,7 +1005,6 @@ class CLIPSegModel(CLIPSegPreTrainedModel):
pixel_values: Optional[torch.FloatTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
interpolate_pos_encoding: bool = False,
return_dict: Optional[bool] = None,
) -> torch.FloatTensor:
r"""
@ -1071,7 +1040,6 @@ class CLIPSegModel(CLIPSegPreTrainedModel):
pixel_values=pixel_values,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
interpolate_pos_encoding=interpolate_pos_encoding,
return_dict=return_dict,
)
@ -1091,7 +1059,6 @@ class CLIPSegModel(CLIPSegPreTrainedModel):
return_loss: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
interpolate_pos_encoding: bool = False,
return_dict: Optional[bool] = None,
) -> Union[Tuple, CLIPSegOutput]:
r"""
@ -1129,7 +1096,6 @@ class CLIPSegModel(CLIPSegPreTrainedModel):
pixel_values=pixel_values,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
interpolate_pos_encoding=interpolate_pos_encoding,
return_dict=return_dict,
)
@ -1397,7 +1363,6 @@ class CLIPSegForImageSegmentation(CLIPSegPreTrainedModel):
labels: Optional[torch.LongTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
interpolate_pos_encoding: bool = False,
return_dict: Optional[bool] = None,
) -> Union[Tuple, CLIPSegOutput]:
r"""
@ -1437,7 +1402,6 @@ class CLIPSegForImageSegmentation(CLIPSegPreTrainedModel):
pixel_values=pixel_values,
output_attentions=output_attentions,
output_hidden_states=True, # we need the intermediate hidden states
interpolate_pos_encoding=interpolate_pos_encoding,
return_dict=return_dict,
)
pooled_output = self.clip.visual_projection(vision_outputs[1])

View File

@ -1,10 +1,11 @@
# coding=utf-8
# Copyright 2024 Cohere team. All rights reserved.
# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
# This file was automatically generated from <path_to_modular_file.py>.
# Do NOT edit this file manually as any edits will be overwritten by the generation of
# the file from the modular. If any change should be done, please apply the change to the
# modular_xxx.py file directly. One of our CI enforces this
# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
# Copyright 2024 Google Inc. HuggingFace Inc. team. All rights reserved.
#
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
# and OPT implementations in this library. It has been modified from its
# original forms to accommodate minor architectural differences compared
# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@ -17,41 +18,32 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Cohere model configuration"""
from ...configuration_utils import PretrainedConfig
from ...modeling_rope_utils import rope_config_validation
from ...utils import logging
logger = logging.get_logger(__name__)
class CohereConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`CohereModel`]. It is used to instantiate a Cohere
model according to the specified arguments, defining the model architecture.
model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
defaults will yield a similar configuration to that of the Cohere-7B.
e.g. [google/cohere-7b](https://huggingface.co/google/cohere-7b)
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information. Instantiating a configuration
with the defaults will yield a similar configuration to that of the [CohereForAI/c4ai-command-r-v01](https://huggingface.co/CohereForAI/c4ai-command-r-v01) model.
documentation from [`PretrainedConfig`] for more information.
Args:
vocab_size (`int`, *optional*, defaults to 256000):
Vocabulary size of the Cohere model. Defines the number of different tokens that can be represented by the
`inputs_ids` passed when calling [`CohereModel`]
hidden_size (`int`, *optional*, defaults to 8192):
hidden_size (`int`, *optional*, defaults to 3072):
Dimension of the hidden representations.
intermediate_size (`int`, *optional*, defaults to 22528):
intermediate_size (`int`, *optional*, defaults to 24576):
Dimension of the MLP representations.
logit_scale (`float`, *optional*, defaults to 0.0625):
The scaling factor for the output logits.
num_hidden_layers (`int`, *optional*, defaults to 40):
num_hidden_layers (`int`, *optional*, defaults to 28):
Number of hidden layers in the Transformer decoder.
num_attention_heads (`int`, *optional*, defaults to 64):
num_attention_heads (`int`, *optional*, defaults to 16):
Number of attention heads for each attention layer in the Transformer decoder.
num_key_value_heads (`int`, *optional*):
num_key_value_heads (`int`, *optional*, defaults to 16):
This is the number of key_value heads that should be used to implement Grouped Query Attention. If
`num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
`num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
@ -59,83 +51,48 @@ class CohereConfig(PretrainedConfig):
by mean-pooling all the original heads within that group. For more details, check out [this
paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
`num_attention_heads`.
hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
The non-linear activation function (function or string) in the decoder.
head_dim (`int`, *optional*, defaults to 256):
The attention head dimension.
hidden_act (`str` or `function`, *optional*, defaults to `"gelu_pytorch_tanh"`):
The legacy activation function. It is overwritten by the `hidden_activation`.
hidden_activation (`str` or `function`, *optional*):
The non-linear activation function (function or string) in the decoder. Will default to `"gelu_pytorch_tanh"`
if not specified. `"gelu_pytorch_tanh"` uses an approximation of the `"gelu"` activation function.
max_position_embeddings (`int`, *optional*, defaults to 8192):
The maximum sequence length that this model might ever be used with.
initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
layer_norm_eps (`float`, *optional*, defaults to 1e-05):
The epsilon used by the layer normalization.
rms_norm_eps (`float`, *optional*, defaults to 1e-06):
The epsilon used by the rms normalization layers.
use_cache (`bool`, *optional*, defaults to `True`):
Whether or not the model should return the last key/values attentions (not used by all models). Only
relevant if `config.is_decoder=True`.
pad_token_id (`int`, *optional*, defaults to 0):
Padding token id.
bos_token_id (`int`, *optional*, defaults to 5):
Beginning of stream token id.
eos_token_id (`int`, *optional*, defaults to 255001):
eos_token_id (`int`, *optional*, defaults to 1):
End of stream token id.
bos_token_id (`int`, *optional*, defaults to 2):
Beginning of stream token id.
tie_word_embeddings (`bool`, *optional*, defaults to `True`):
Whether to tie weight embeddings
rope_theta (`float`, *optional*, defaults to 10000.0):
The base period of the RoPE embeddings.
rope_scaling (`Dict`, *optional*):
Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type
and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
accordingly.
Expected contents:
`rope_type` (`str`):
The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
'llama3'], with 'default' being the original RoPE implementation.
`factor` (`float`, *optional*):
Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
most scaling types, a `factor` of x will enable the model to handle sequences of length x *
original maximum pre-trained length.
`original_max_position_embeddings` (`int`, *optional*):
Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
pretraining.
`attention_factor` (`float`, *optional*):
Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
computation. If unspecified, it defaults to value recommended by the implementation, using the
`factor` field to infer the suggested value.
`beta_fast` (`float`, *optional*):
Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
ramp function. If unspecified, it defaults to 32.
`beta_slow` (`float`, *optional*):
Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
ramp function. If unspecified, it defaults to 1.
`short_factor` (`List[float]`, *optional*):
Only used with 'longrope'. The scaling factor to be applied to short contexts (<
`original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
size divided by the number of attention heads divided by 2
`long_factor` (`List[float]`, *optional*):
Only used with 'longrope'. The scaling factor to be applied to long contexts (>
`original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
size divided by the number of attention heads divided by 2
`low_freq_factor` (`float`, *optional*):
Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
`high_freq_factor` (`float`, *optional*):
Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
attention_bias (`bool`, *optional*, defaults to `False`):
Whether to use a bias in the query, key, value and output projection layers during self-attention.
attention_dropout (`float`, *optional*, defaults to 0.0):
The dropout ratio for the attention probabilities.
use_qk_norm (`bool`, *optional*, defaults to `False`):
Whether to use query-key normalization in the attention
```python
>>> from transformers import CohereModel, CohereConfig
>>> # Initializing a Cohere model configuration
>>> # Initializing a Cohere cohere-7b style configuration
>>> configuration = CohereConfig()
>>> # Initializing a model from the Cohere configuration
>>> model = CohereModel(configuration) # doctest: +SKIP
>>> # Initializing a model from the cohere-7b style configuration
>>> model = CohereModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config # doctest: +SKIP
```"""
>>> configuration = model.config
```
use_qk_norm (`bool`, *optional*, defaults to `False`):
Whether to use query-key normalization in the attention
"""
model_type = "cohere"
keys_to_ignore_at_inference = ["past_key_values"]
@ -145,51 +102,57 @@ class CohereConfig(PretrainedConfig):
vocab_size=256000,
hidden_size=8192,
intermediate_size=22528,
logit_scale=0.0625,
num_hidden_layers=40,
num_attention_heads=64,
num_key_value_heads=None,
head_dim=256,
hidden_act="silu",
hidden_activation=None,
max_position_embeddings=8192,
initializer_range=0.02,
layer_norm_eps=1e-5,
rms_norm_eps=1e-05,
use_cache=True,
pad_token_id=0,
bos_token_id=5,
eos_token_id=255001,
bos_token_id=5,
tie_word_embeddings=True,
rope_theta=10000.0,
rope_scaling=None,
attention_bias=False,
attention_dropout=0.0,
use_qk_norm=False,
layer_norm_eps=1e-05,
logit_scale=0.0625,
**kwargs,
):
self.vocab_size = vocab_size
self.max_position_embeddings = max_position_embeddings
self.hidden_size = hidden_size
self.logit_scale = logit_scale
self.intermediate_size = intermediate_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
# for backward compatibility
if num_key_value_heads is None:
num_key_value_heads = num_attention_heads
self.head_dim = head_dim
self.num_key_value_heads = num_key_value_heads
self.hidden_act = hidden_act
self.hidden_activation = hidden_activation
self.initializer_range = initializer_range
self.layer_norm_eps = layer_norm_eps
self.rms_norm_eps = layer_norm_eps
self.use_cache = use_cache
self.rope_theta = rope_theta
self.rope_scaling = rope_scaling
self.attention_bias = attention_bias
self.attention_dropout = attention_dropout
self.use_qk_norm = use_qk_norm
self.layer_norm_eps = layer_norm_eps
self.itermediate_size = intermediate_size
# Validate the correctness of rotary position embeddings parameters
rope_config_validation(self)
self.bos_token_id = bos_token_id
self.eos_token_id = eos_token_id
self.logit_scale = logit_scale
if num_key_value_heads is None:
self.num_key_value_heads = num_attention_heads
super().__init__(
pad_token_id=pad_token_id,

View File

@ -1,10 +1,11 @@
# coding=utf-8
# Copyright 2024 Cohere team. All rights reserved.
# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
# This file was automatically generated from <path_to_modular_file.py>.
# Do NOT edit this file manually as any edits will be overwritten by the generation of
# the file from the modular. If any change should be done, please apply the change to the
# modular_xxx.py file directly. One of our CI enforces this
# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
# Copyright 2024 Google Inc. HuggingFace Inc. team. All rights reserved.
#
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
# and OPT implementations in this library. It has been modified from its
# original forms to accommodate minor architectural differences compared
# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@ -17,11 +18,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is based on the LLama model definition file in transformers
"""PyTorch Cohere model."""
import math
from typing import List, Optional, Tuple, Union
@ -38,31 +34,21 @@ from ...modeling_outputs import (
BaseModelOutputWithPast,
CausalLMOutputWithPast,
)
from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import ALL_LAYERNORM_LAYERS
from ...utils import (
add_start_docstrings,
add_start_docstrings_to_model_forward,
is_flash_attn_2_available,
is_flash_attn_greater_or_equal_2_10,
is_torchdynamo_compiling,
logging,
replace_return_docstrings,
)
from .configuration_cohere import CohereConfig
if is_flash_attn_2_available():
from ...modeling_flash_attention_utils import _flash_attention_forward
logger = logging.get_logger(__name__)
_CONFIG_FOR_DOC = "CohereConfig"
# Copied from transformers.models.llama.modeling_llama._prepare_4d_causal_attention_mask_with_cache_position
def _prepare_4d_causal_attention_mask_with_cache_position(
attention_mask: torch.Tensor,
sequence_length: int,
@ -116,7 +102,7 @@ def _prepare_4d_causal_attention_mask_with_cache_position(
return causal_mask
class CohereLayerNorm(nn.Module):
class CohereRMSNorm(nn.Module):
def __init__(self, hidden_size=None, eps=1e-5, bias=False):
"""The hidden size can be a tuple or an int. The tuple is used for QKNorm to normalize across head_dim"""
super().__init__()
@ -133,105 +119,45 @@ class CohereLayerNorm(nn.Module):
return hidden_states.to(input_dtype)
ALL_LAYERNORM_LAYERS.append(CohereLayerNorm)
class CohereLayerNorm(CohereRMSNorm):
    """Deprecated name for `CohereRMSNorm`, kept so existing callers keep working.

    The constructor emits the deprecation message through `logger.warning_once` and then forwards every
    positional and keyword argument, untouched, to the parent constructor.
    """

    def __init__(self, *args, **kwargs):
        deprecation_message = "CohereLayerNorm is deprecated by CohereRMSNorm and will be removed in v4.45."
        logger.warning_once(deprecation_message)
        super().__init__(*args, **kwargs)
class CohereRotaryEmbedding(nn.Module):
# Note: the forward pass of this RoPE is slightly different from Llama's, resulting in different `sin`/`cos` for
# the same parameterization. The differences are highlighted with a comment.
def __init__(
self,
dim=None,
max_position_embeddings=2048,
base=10000,
device=None,
scaling_factor=1.0,
rope_type="default",
config: Optional[CohereConfig] = None,
):
def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0):
super().__init__()
# TODO (joao): remove the `if` below, only used for BC
self.rope_kwargs = {}
if config is None:
logger.warning_once(
"`CohereRotaryEmbedding` can now be fully parameterized by passing the model config through the "
"`config` argument. All other arguments will be removed in v4.46"
)
self.rope_kwargs = {
"rope_type": rope_type,
"factor": scaling_factor,
"dim": dim,
"base": base,
"max_position_embeddings": max_position_embeddings,
}
self.rope_type = rope_type
self.max_seq_len_cached = max_position_embeddings
self.original_max_seq_len = max_position_embeddings
else:
# BC: "rope_type" was originally "type"
if config.rope_scaling is not None:
self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type"))
else:
self.rope_type = "default"
self.max_seq_len_cached = config.max_position_embeddings
self.original_max_seq_len = config.max_position_embeddings
self.config = config
self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]
inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device, **self.rope_kwargs)
self.scaling_factor = scaling_factor
self.dim = dim
self.max_position_embeddings = max_position_embeddings
self.base = base
inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2, dtype=torch.int64).float().to(device) / self.dim))
self.register_buffer("inv_freq", inv_freq, persistent=False)
self.original_inv_freq = self.inv_freq
def _dynamic_frequency_update(self, position_ids, device):
"""
dynamic RoPE layers should recompute `inv_freq` in the following situations:
1 - growing beyond the cached sequence length (allow scaling)
2 - the current sequence length is in the original scale (avoid losing precision with small sequences)
"""
seq_len = torch.max(position_ids) + 1
if seq_len > self.max_seq_len_cached: # growth
inv_freq, self.attention_scaling = self.rope_init_fn(
self.config, device, seq_len=seq_len, **self.rope_kwargs
)
self.register_buffer("inv_freq", inv_freq, persistent=False) # TODO joao: may break with compilation
self.max_seq_len_cached = seq_len
if seq_len < self.original_max_seq_len and self.max_seq_len_cached > self.original_max_seq_len: # reset
self.register_buffer("inv_freq", self.original_inv_freq, persistent=False)
self.max_seq_len_cached = self.original_max_seq_len
@torch.no_grad()
def forward(self, x, position_ids):
if "dynamic" in self.rope_type:
self._dynamic_frequency_update(position_ids, device=x.device)
# Core RoPE block
# x: [bs, num_attention_heads, seq_len, head_size]
inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1)
position_ids_expanded = position_ids[:, None, :].float()
# Force float32 (see https://github.com/huggingface/transformers/pull/29285)
# Force float32 since bfloat16 loses precision on long contexts
# See https://github.com/huggingface/transformers/pull/29285
device_type = x.device.type
device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu"
with torch.autocast(device_type=device_type, enabled=False):
freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
emb = torch.repeat_interleave(freqs, 2, dim=-1) # This line differs from Llama's implementation
emb = torch.repeat_interleave(freqs, 2, dim=-1)
cos = emb.cos()
sin = emb.sin()
# Advanced RoPE types (e.g. yarn) apply a post-processing scaling factor, equivalent to scaling attention
cos = cos * self.attention_scaling
sin = sin * self.attention_scaling
return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
return cos, sin
def rotate_half(x):
# Split and rotate. Note that this function is different from e.g. Llama.
x1 = x[..., ::2]
x2 = x[..., 1::2]
rot_x = torch.stack([-x2, x1], dim=-1).flatten(-2)
return rot_x
"""Rotates half the hidden dims of the input."""
x1 = x[..., : x.shape[-1] // 2]
x2 = x[..., x.shape[-1] // 2 :]
return torch.cat((-x2, x1), dim=-1)
def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
@ -254,14 +180,11 @@ def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
Returns:
`tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
"""
dtype = q.dtype
q = q.float()
k = k.float()
cos = cos.unsqueeze(unsqueeze_dim)
sin = sin.unsqueeze(unsqueeze_dim)
q_embed = (q * cos) + (rotate_half(q) * sin)
k_embed = (k * cos) + (rotate_half(k) * sin)
return q_embed.to(dtype=dtype), k_embed.to(dtype=dtype)
return q_embed, k_embed
class CohereMLP(nn.Module):
@ -275,13 +198,11 @@ class CohereMLP(nn.Module):
self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
self.act_fn = ACT2FN[config.hidden_act]
# Ignore copy
def forward(self, x):
    """Gated MLP: activate `gate_proj(x)`, multiply it by `up_proj(x)`, then apply `down_proj` to the product."""
    gated = self.act_fn(self.gate_proj(x))
    return self.down_proj(gated * self.up_proj(x))
# Copied from transformers.models.llama.modeling_llama.repeat_kv
def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
"""
This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
@ -336,9 +257,14 @@ class CohereAttention(nn.Module):
self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias)
self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias)
self.o_proj = nn.Linear(self.hidden_size, self.hidden_size, bias=config.attention_bias)
self._init_rope()
# TODO (joao): remove in v4.46 (RoPE is computed in the model, not in the decoder layers)
self.rotary_emb = CohereRotaryEmbedding(config=self.config)
def _init_rope(self):
    """Create `self.rotary_emb` from this module's `head_dim`, `max_position_embeddings` and `rope_theta`."""
    rope_settings = {
        "max_position_embeddings": self.max_position_embeddings,
        "base": self.rope_theta,
    }
    self.rotary_emb = CohereRotaryEmbedding(self.head_dim, **rope_settings)
def forward(
self,
@ -349,7 +275,6 @@ class CohereAttention(nn.Module):
output_attentions: bool = False,
use_cache: bool = False,
cache_position: Optional[torch.LongTensor] = None,
position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # will become mandatory in v4.46
**kwargs,
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
bsz, q_len, _ = hidden_states.size()
@ -368,16 +293,7 @@ class CohereAttention(nn.Module):
key_states = key_states.transpose(1, 2)
value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
if position_embeddings is None:
logger.warning_once(
"The attention layers in this model are transitioning from computing the RoPE embeddings internally "
"through `position_ids` (2D tensor with the indexes of the tokens), to using externally computed "
"`position_embeddings` (Tuple of tensors, containing cos and sin). In v4.46 `position_ids` will be "
"removed and `position_embeddings` will be mandatory."
)
cos, sin = self.rotary_emb(value_states, position_ids)
else:
cos, sin = position_embeddings
cos, sin = self.rotary_emb(value_states, position_ids)
query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
if past_key_value is not None:
@ -417,136 +333,7 @@ class CohereAttention(nn.Module):
return attn_output, attn_weights, past_key_value
# Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2 with Llama->Cohere
class CohereFlashAttention2(CohereAttention):
"""
Cohere flash attention module. This module inherits from `CohereAttention` as the weights of the module stays
untouched. The only required change would be on the forward pass where it needs to correctly call the public API of
flash attention and deal with padding tokens in case the input contains any of them.
"""
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
# TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1.
# flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignement, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0.
# Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left).
self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10()
# Ignore copy
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_value: Optional[Cache] = None,
output_attentions: bool = False,
use_cache: bool = False,
cache_position: Optional[torch.LongTensor] = None,
position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # will become mandatory in v4.46
**kwargs,
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
"""
Attention forward pass that delegates the attention computation to `_flash_attention_forward`.

Raises a `ValueError` when `past_key_value` is a `StaticCache`. `output_attentions` is overridden to `False`,
so the second element of the returned `(attn_output, attn_weights, past_key_value)` tuple is always `None`.
`cos`/`sin` are taken from `position_embeddings` when given, otherwise computed by `self.rotary_emb` from
`position_ids` (with a one-time warning). When the query states are float32, query/key/value are cast to the
autocast dtype, else the config's `_pre_quantization_dtype`, else the `q_proj` weight dtype.
`use_cache` and `**kwargs` are accepted but not read in this method.
"""
if isinstance(past_key_value, StaticCache):
raise ValueError(
"`static` cache implementation is not compatible with `attn_implementation==flash_attention_2` "
"make sure to use `sdpa` in the mean time, and open an issue at https://github.com/huggingface/transformers"
)
# Attention weights are never returned on this path, so the caller's flag is discarded.
output_attentions = False
bsz, q_len, _ = hidden_states.size()
query_states = self.q_proj(hidden_states)
key_states = self.k_proj(hidden_states)
value_states = self.v_proj(hidden_states)
query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim)
key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim)
# Optional query/key normalization, applied before the heads axis is moved in front of the sequence axis.
if self.use_qk_norm:
query_states = self.q_norm(query_states)
key_states = self.k_norm(key_states)
query_states = query_states.transpose(1, 2)
key_states = key_states.transpose(1, 2)
value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
if position_embeddings is None:
logger.warning_once(
"The attention layers in this model are transitioning from computing the RoPE embeddings internally "
"through `position_ids` (2D tensor with the indexes of the tokens), to using externally computed "
"`position_embeddings` (Tuple of tensors, containing cos and sin). In v4.46 `position_ids` will be "
"removed and `position_embeddings` will be mandatory."
)
cos, sin = self.rotary_emb(value_states, position_ids)
else:
cos, sin = position_embeddings
query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
if past_key_value is not None:
# sin and cos are specific to RoPE models; position_ids needed for the static cache
cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
# TODO: These transpose are quite inefficient but Flash Attention requires the layout [batch_size, sequence_length, num_heads, head_dim]. We would need to refactor the KV cache
# to be able to avoid many of these transpose/reshape/view.
query_states = query_states.transpose(1, 2)
key_states = key_states.transpose(1, 2)
value_states = value_states.transpose(1, 2)
# Dropout is only active in training mode.
dropout_rate = self.attention_dropout if self.training else 0.0
# In PEFT, usually we cast the layer norms in float32 for training stability reasons
# therefore the input hidden states gets silently casted in float32. Hence, we need
# cast them back in the correct dtype just to be sure everything works as expected.
# This might slowdown training & inference so it is recommended to not cast the LayerNorms
# in fp32. (CohereLayerNorm handles it correctly)
input_dtype = query_states.dtype
if input_dtype == torch.float32:
if torch.is_autocast_enabled():
target_dtype = torch.get_autocast_gpu_dtype()
# Handle the case where the model is quantized
elif hasattr(self.config, "_pre_quantization_dtype"):
target_dtype = self.config._pre_quantization_dtype
else:
target_dtype = self.q_proj.weight.dtype
logger.warning_once(
f"The input hidden states seems to be silently casted in float32, this might be related to"
f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in"
f" {target_dtype}."
)
query_states = query_states.to(target_dtype)
key_states = key_states.to(target_dtype)
value_states = value_states.to(target_dtype)
attn_output = _flash_attention_forward(
query_states,
key_states,
value_states,
attention_mask,
q_len,
dropout=dropout_rate,
use_top_left_mask=self._flash_attn_uses_top_left_mask,
is_causal=self.is_causal,
)
# Merge the heads back into a single hidden dimension before the output projection.
attn_output = attn_output.reshape(bsz, q_len, self.hidden_size).contiguous()
attn_output = self.o_proj(attn_output)
if not output_attentions:
attn_weights = None
return attn_output, attn_weights, past_key_value
class CohereSdpaAttention(CohereAttention):
"""
Cohere attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from
`CohereAttention` as the weights of the module stays untouched. The only changes are on the forward pass to adapt to
SDPA API.
"""
def forward(
self,
hidden_states: torch.Tensor,
@ -556,7 +343,6 @@ class CohereSdpaAttention(CohereAttention):
output_attentions: bool = False,
use_cache: bool = False,
cache_position: Optional[torch.LongTensor] = None,
position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # will become mandatory in v4.46
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
if output_attentions:
# TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented.
@ -590,16 +376,7 @@ class CohereSdpaAttention(CohereAttention):
key_states = key_states.transpose(1, 2)
value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
if position_embeddings is None:
logger.warning_once(
"The attention layers in this model are transitioning from computing the RoPE embeddings internally "
"through `position_ids` (2D tensor with the indexes of the tokens), to using externally computed "
"`position_embeddings` (Tuple of tensors, containing cos and sin). In v4.46 `position_ids` will be "
"removed and `position_embeddings` will be mandatory."
)
cos, sin = self.rotary_emb(value_states, position_ids)
else:
cos, sin = position_embeddings
cos, sin = self.rotary_emb(value_states, position_ids)
query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
if past_key_value is not None:
@ -643,6 +420,97 @@ class CohereSdpaAttention(CohereAttention):
return attn_output, None, past_key_value
class CohereFlashAttention2(CohereAttention):
    """
    Flash-attention variant of `CohereAttention`. Only `forward` is defined here; the projections, optional
    query/key norms and the rotary embedding are the attributes read from `self`.
    """

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Cache] = None,
        output_attentions: bool = False,
        use_cache: bool = False,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        """
        Project, rotate and (optionally) cache the query/key/value states, then run `self._flash_attention_forward`.

        Raises a `ValueError` when `past_key_value` is a `StaticCache`. Attention weights are never produced on
        this path, so the returned tuple is always `(attn_output, None, past_key_value)` regardless of
        `output_attentions`. `use_cache` and `**kwargs` are accepted but not read in this method.
        """
        if isinstance(past_key_value, StaticCache):
            raise ValueError(
                "`static` cache implementation is not compatible with `attn_implementation==flash_attention_2` "
                "make sure to use `sdpa` in the mean time, and open an issue at https://github.com/huggingface/transformers"
            )

        batch, seq_len, _ = hidden_states.size()

        q = self.q_proj(hidden_states)
        k = self.k_proj(hidden_states)
        v = self.v_proj(hidden_states)

        # Split the projected features into heads; the optional QK norm runs before heads and sequence are swapped.
        q = q.view(batch, seq_len, self.num_heads, self.head_dim)
        k = k.view(batch, seq_len, self.num_key_value_heads, self.head_dim)
        if self.use_qk_norm:
            q = self.q_norm(q)
            k = self.k_norm(k)
        q, k = q.transpose(1, 2), k.transpose(1, 2)
        v = v.view(batch, seq_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)

        cos, sin = self.rotary_emb(v, position_ids)
        q, k = apply_rotary_pos_emb(q, k, cos, sin)

        if past_key_value is not None:
            # sin/cos are RoPE-specific extras; cache_position is needed for the static cache
            rope_cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
            k, v = past_key_value.update(k, v, self.layer_idx, rope_cache_kwargs)

        # TODO: these transposes are inefficient, but Flash Attention requires the layout
        # [batch_size, sequence_length, num_heads, head_dim]; avoiding them needs a KV cache refactor.
        q, k, v = (states.transpose(1, 2) for states in (q, k, v))

        dropout_rate = 0.0 if not self.training else self.attention_dropout

        # Ignore copy
        # With PEFT the layer norms are usually kept in float32 for training stability, which silently upcasts
        # the hidden states. Cast q/k/v back to the expected dtype so the attention call receives the right
        # precision. This can slow training & inference, so keeping the LayerNorms out of fp32 is recommended.
        # (CohereLayerNorm handles it correctly)
        if q.dtype == torch.float32:
            if torch.is_autocast_enabled():
                target_dtype = torch.get_autocast_gpu_dtype()
            elif hasattr(self.config, "_pre_quantization_dtype"):
                # Quantized model: use the dtype recorded before quantization.
                target_dtype = self.config._pre_quantization_dtype
            else:
                target_dtype = self.q_proj.weight.dtype

            logger.warning_once(
                f"The input hidden states seems to be silently casted in float32, this might be related to"
                f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in"
                f" {target_dtype}."
            )
            q, k, v = q.to(target_dtype), k.to(target_dtype), v.to(target_dtype)

        # NOTE(review): `_flash_attention_forward` is looked up on `self` and is not defined in this class;
        # presumably it is provided by `CohereAttention` or one of its bases. Confirm it exists.
        attn_output = self._flash_attention_forward(q, k, v, attention_mask, seq_len, dropout=dropout_rate)

        merged_heads = attn_output.reshape(batch, seq_len, self.hidden_size).contiguous()
        attn_output = self.o_proj(merged_heads)

        return attn_output, None, past_key_value
COHERE_ATTENTION_CLASSES = {
"eager": CohereAttention,
"flash_attention_2": CohereFlashAttention2,
@ -669,7 +537,6 @@ class CohereDecoderLayer(nn.Module):
output_attentions: Optional[bool] = False,
use_cache: Optional[bool] = False,
cache_position: Optional[torch.LongTensor] = None,
position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # will become mandatory in v4.46
) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
"""
Args:
@ -684,11 +551,6 @@ class CohereDecoderLayer(nn.Module):
If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
(see `past_key_values`).
past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
Indices depicting the position of the input sequence tokens in the sequence
position_embeddings (`Tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*):
Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`,
with `head_dim` being the embedding dimension of each attention head.
"""
residual = hidden_states
@ -703,7 +565,6 @@ class CohereDecoderLayer(nn.Module):
output_attentions=output_attentions,
use_cache=use_cache,
cache_position=cache_position,
position_embeddings=position_embeddings,
)
# Fully Connected
@ -725,7 +586,8 @@ class CohereDecoderLayer(nn.Module):
COHERE_START_DOCSTRING = r"""
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its models (such as downloading or saving, resizing the input embeddings etc.).
library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
@ -743,7 +605,6 @@ COHERE_START_DOCSTRING = r"""
"The bare Cohere Model outputting raw hidden-states without any specific head on top.",
COHERE_START_DOCSTRING,
)
# Copied from transformers.models.llama.modeling_llama.LlamaPreTrainedModel with Llama->Cohere
class CoherePreTrainedModel(PreTrainedModel):
config_class = CohereConfig
base_model_prefix = "model"
@ -836,6 +697,10 @@ COHERE_INPUTS_DOCSTRING = r"""
more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
Indices depicting the position of the input sequence tokens in the sequence. Contrary to `position_ids`,
this tensor is not affected by padding. It is used to update the cache in the correct position and to infer
the complete sequence length.
"""
@ -843,7 +708,6 @@ COHERE_INPUTS_DOCSTRING = r"""
"The bare Cohere Model outputting raw hidden-states without any specific head on top.",
COHERE_START_DOCSTRING,
)
# Copied from transformers.models.llama.modeling_llama.LlamaModel with Llama->Cohere, LLAMA->COHERE
class CohereModel(CoherePreTrainedModel):
"""
Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`CohereDecoderLayer`]
@ -852,7 +716,6 @@ class CohereModel(CoherePreTrainedModel):
config: CohereConfig
"""
# Ignore copy
def __init__(self, config: CohereConfig):
super().__init__(config)
self.padding_idx = config.pad_token_id
@ -862,7 +725,7 @@ class CohereModel(CoherePreTrainedModel):
self.layers = nn.ModuleList(
[CohereDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
)
self.norm = CohereLayerNorm(hidden_size=(config.hidden_size), eps=config.layer_norm_eps)
self.norm = CohereRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
self.rotary_emb = CohereRotaryEmbedding(config=config)
self.gradient_checkpointing = False
@ -1067,18 +930,15 @@ class CohereModel(CoherePreTrainedModel):
return causal_mask
# Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM with Llama->Cohere
class CohereForCausalLM(CoherePreTrainedModel, GenerationMixin):
_tied_weights_keys = ["lm_head.weight"]
# Ignore copy
def __init__(self, config):
super().__init__(config)
self.model = CohereModel(config)
self.vocab_size = config.vocab_size
self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
self.logit_scale = config.logit_scale
self.tie_word_embeddings = config.tie_word_embeddings
# Initialize weights and apply final processing
self.post_init()
@ -1100,7 +960,6 @@ class CohereForCausalLM(CoherePreTrainedModel, GenerationMixin):
def get_decoder(self):
return self.model
# Ignore copy
@add_start_docstrings_to_model_forward(COHERE_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
def forward(
@ -1116,7 +975,6 @@ class CohereForCausalLM(CoherePreTrainedModel, GenerationMixin):
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
cache_position: Optional[torch.LongTensor] = None,
num_logits_to_keep: int = 0,
) -> Union[Tuple, CausalLMOutputWithPast]:
r"""
Args:
@ -1125,11 +983,6 @@ class CohereForCausalLM(CoherePreTrainedModel, GenerationMixin):
config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
(masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
num_logits_to_keep (`int`, *optional*):
Calculate logits for the last `num_logits_to_keep` tokens. If `0`, calculate logits for all
`input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that
token can save memory, which becomes pretty significant for long sequences or large vocabulary size.
Returns:
Example:
@ -1169,19 +1022,12 @@ class CohereForCausalLM(CoherePreTrainedModel, GenerationMixin):
)
hidden_states = outputs[0]
if labels is None and not is_torchdynamo_compiling():
logger.warning_once(
"Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)"
)
# Only compute necessary logits, and do not upcast them to float if we are not computing the loss
# TODO: remove the float() operation in v4.46
logits = self.lm_head(hidden_states[:, -num_logits_to_keep:, :]).float()
logits = logits * self.logit_scale
logits = self.lm_head(hidden_states)
logits = logits * self.config.logit_scale
logits = logits.float()
loss = None
if labels is not None:
# Upcast to float if we need to compute the loss to avoid potential precision issues
logits = logits.float()
# Shift so that tokens < n predict n
shift_logits = logits[..., :-1, :].contiguous()
shift_labels = labels[..., 1:].contiguous()
@ -1278,3 +1124,17 @@ class CohereForCausalLM(CoherePreTrainedModel, GenerationMixin):
}
)
return model_inputs
@property
def logit_scale(self):
logger.warning(
"`logit_scale` attribute is going to be deprecated in future versions, please use `model.config.logit_scale` instead."
)
return self.config.logit_scale
@property
def tie_word_embeddings(self):
logger.warning(
"`tie_word_embeddings` attribute is going to be deprecated in future versions, please use `model.config.tie_word_embeddings` instead."
)
return self.config.tie_word_embeddings

View File

@ -0,0 +1,663 @@
# Copyright 2024 Google Inc. HuggingFace Inc. team. All rights reserved.
#
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
from typing import List, Optional, Tuple, Union
import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import CrossEntropyLoss
from transformers.models.gemma.configuration_gemma import GemmaConfig
from transformers.models.llama.modeling_llama import (
LlamaFlashAttention2,
LlamaForCausalLM,
LlamaSdpaAttention,
repeat_kv,
)
from ...activations import ACT2FN
from ...cache_utils import Cache, StaticCache
from ...modeling_outputs import CausalLMOutputWithPast
from ...pytorch_utils import ALL_LAYERNORM_LAYERS
from ...utils import logging
logger = logging.get_logger(__name__)
class CohereConfig(GemmaConfig):
    r"""
    Configuration for Cohere models, built on top of [`GemmaConfig`].

    Only the arguments listed below are handled here; every other keyword argument is forwarded unchanged to
    the parent configuration.

    Args:
        hidden_size (`int`, *optional*, defaults to 8192):
            Stored as `hidden_size`.
        intermediate_size (`int`, *optional*, defaults to 22528):
            Stored as `intermediate_size`.
        num_hidden_layers (`int`, *optional*, defaults to 40):
            Stored as `num_hidden_layers`.
        hidden_act (`str`, *optional*, defaults to `"silu"`):
            Stored as `hidden_act`.
        use_qk_norm (`bool`, *optional*, defaults to `False`):
            Whether to use query-key normalization in the attention
        layer_norm_eps (`float`, *optional*, defaults to 1e-05):
            Stored as `layer_norm_eps` and also mirrored into `rms_norm_eps`.
        num_attention_heads (`int`, *optional*, defaults to 64):
            Stored as `num_attention_heads`.
        num_key_value_heads (`int`, *optional*):
            Stored as `num_key_value_heads`. When `None`, falls back to `num_attention_heads`.
        logit_scale (`float`, *optional*, defaults to 0.0625):
            Stored as `logit_scale`.
        rms_norm_eps (`float`, *optional*, defaults to 1e-05):
            Accepted for signature compatibility; see the note in `__init__`.
        bos_token_id (`int`, *optional*, defaults to 5):
            Stored as `bos_token_id`.
        eos_token_id (`int`, *optional*, defaults to 255001):
            Stored as `eos_token_id`.
    """

    def __init__(
        self,
        hidden_size=8192,
        intermediate_size=22528,
        num_hidden_layers=40,
        hidden_act="silu",
        use_qk_norm=False,
        layer_norm_eps=1e-05,
        num_attention_heads=64,
        num_key_value_heads=None,
        logit_scale=0.0625,
        rms_norm_eps=1e-05,
        bos_token_id=5,
        eos_token_id=255001,
        **super_kwargs,
    ):
        # `super()` already binds `self`; passing it again would hand the config object to the parent's
        # first positional parameter.
        super().__init__(**super_kwargs)
        self.use_qk_norm = use_qk_norm
        self.layer_norm_eps = layer_norm_eps
        # NOTE(review): `rms_norm_eps` has always mirrored `layer_norm_eps` here and the `rms_norm_eps`
        # argument is ignored. Kept as-is so configs that only define `layer_norm_eps` load the same
        # value -- confirm whether this aliasing is intentional.
        self.rms_norm_eps = layer_norm_eps
        self.hidden_size = hidden_size
        # Previously written to the misspelled attribute `itermediate_size`, which left
        # `intermediate_size` at whatever the parent configuration set.
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.hidden_act = hidden_act
        self.bos_token_id = bos_token_id
        self.eos_token_id = eos_token_id
        self.logit_scale = logit_scale
        # Store the head counts explicitly: an explicit `num_key_value_heads` used to be dropped and
        # `num_attention_heads` was never assigned.
        self.num_attention_heads = num_attention_heads
        if num_key_value_heads is None:
            num_key_value_heads = num_attention_heads
        self.num_key_value_heads = num_key_value_heads
class CohereRMSNorm(nn.Module):
    """
    Normalization over the last dimension with a learnable elementwise scale.

    The input is mean-centered, divided by the square root of its variance (plus `eps`) and multiplied by
    `weight`. All arithmetic is done in float32 and the result is cast back to the input dtype.
    """

    def __init__(self, hidden_size=None, eps=1e-5, bias=False):
        """
        Args:
            hidden_size (`int` or `tuple`): shape of the learnable scale. A tuple is used for QKNorm to
                normalize across head_dim.
            eps (`float`): value added to the variance before taking the reciprocal square root.
            bias (`bool`): accepted but not used by this implementation.
        """
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.variance_epsilon = eps

    def forward(self, hidden_states):
        """Normalize `hidden_states` over its last dimension and return a tensor of the same dtype."""
        original_dtype = hidden_states.dtype
        states_fp32 = hidden_states.to(torch.float32)
        # Center once and reuse the centered values for both the variance and the normalization.
        centered = states_fp32 - states_fp32.mean(-1, keepdim=True)
        variance = centered.pow(2).mean(-1, keepdim=True)
        normalized = centered * torch.rsqrt(variance + self.variance_epsilon)
        scaled = self.weight.to(torch.float32) * normalized
        return scaled.to(original_dtype)
# Add CohereRMSNorm to the shared ALL_LAYERNORM_LAYERS list imported from `pytorch_utils`.
# NOTE(review): presumably so library code that special-cases normalization layers also picks up this
# class -- confirm against the consumers of that list.
ALL_LAYERNORM_LAYERS.append(CohereRMSNorm)
class CohereLayerNorm(CohereRMSNorm):
    """Deprecated subclass of `CohereRMSNorm` that only adds a one-time deprecation warning on construction."""

    def __init__(self, *args, **kwargs):
        # Warn first, then let the parent build the layer with the untouched arguments.
        deprecation_notice = "CohereLayerNorm is deprecated by CohereRMSNorm and will be removed in v4.45."
        logger.warning_once(deprecation_notice)
        super().__init__(*args, **kwargs)
class CohereRotaryEmbedding(nn.Module):
    """
    Produces the cosine and sine tables used for rotary position embeddings.

    Each frequency is repeated twice in a row along the last dimension (`repeat_interleave`), so adjacent
    pairs of channels share the same rotation angle.
    """

    def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0):
        """
        Args:
            dim (`int`): size of the last dimension of the returned tables.
            max_position_embeddings (`int`): stored on the module; not used in the computation here.
            base (`int` or `float`): base of the geometric progression of inverse frequencies.
            device: device on which the inverse-frequency buffer is created.
            scaling_factor (`float`): stored on the module; not used in the computation here.
        """
        super().__init__()
        self.scaling_factor = scaling_factor
        self.dim = dim
        self.max_position_embeddings = max_position_embeddings
        self.base = base
        exponents = torch.arange(0, self.dim, 2, dtype=torch.int64).float().to(device) / self.dim
        inv_freq = 1.0 / (self.base**exponents)
        # Non-persistent: the buffer is recomputed at construction and is not saved in the state dict.
        self.register_buffer("inv_freq", inv_freq, persistent=False)

    @torch.no_grad()
    def forward(self, x, position_ids):
        """
        Args:
            x (`torch.Tensor`): only its device type is read, to configure autocast.
            position_ids (`torch.Tensor`): indexed as `(batch, seq_len)`.

        Returns:
            `(cos, sin)`, each of shape `(batch, seq_len, dim)` in float32.
        """
        batch = position_ids.shape[0]
        freq_column = self.inv_freq[None, :, None].float().expand(batch, -1, 1)
        position_row = position_ids[:, None, :].float()

        # Force float32 since bfloat16 loses precision on long contexts
        # See https://github.com/huggingface/transformers/pull/29285
        autocast_device = x.device.type
        if not isinstance(autocast_device, str) or autocast_device == "mps":
            autocast_device = "cpu"

        with torch.autocast(device_type=autocast_device, enabled=False):
            angles = (freq_column.float() @ position_row.float()).transpose(1, 2)
            interleaved = torch.repeat_interleave(angles, 2, dim=-1)
            cos = interleaved.cos()
            sin = interleaved.sin()
        return cos, sin
def rotate_half(x):
    """
    Rotate adjacent channel pairs of the last dimension: each pair `(a, b)` becomes `(-b, a)`.

    The output has the same shape as `x`.
    """
    even_lanes = x[..., 0::2]
    odd_lanes = x[..., 1::2]
    # Stack as (..., pairs, 2) and flatten the two trailing dims back into one interleaved dimension.
    return torch.stack((-odd_lanes, even_lanes), dim=-1).flatten(-2)
def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
    """Applies Rotary Position Embedding to the query and key tensors.

    The rotation is computed in float32 and both outputs are cast back to the dtype of `q`.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        position_ids (`torch.Tensor`, *optional*):
            Deprecated and unused.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            Dimension along which `cos` and `sin` are unsqueezed so that they broadcast against `q` and `k`.
            For example, with `cos`/`sin` of shape [batch_size, seq_len, head_dim] and `q`/`k` of shape
            [batch_size, heads, seq_len, head_dim], use `unsqueeze_dim=1`; if `q`/`k` have shape
            [batch_size, seq_len, heads, head_dim], use `unsqueeze_dim=2`.

    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    """
    output_dtype = q.dtype
    cos_b = cos.unsqueeze(unsqueeze_dim)
    sin_b = sin.unsqueeze(unsqueeze_dim)

    def _rotate(states):
        # Upcast, rotate, then return to the dtype the query arrived in.
        states = states.float()
        rotated = (states * cos_b) + (rotate_half(states) * sin_b)
        return rotated.to(dtype=output_dtype)

    return _rotate(q), _rotate(k)
class CohereMLP(nn.Module):
    """Gated feed-forward block: `down_proj(act_fn(gate_proj(x)) * up_proj(x))`, all projections bias-free."""

    def __init__(self, config):
        """Build the three linear projections and look up the activation named by `config.hidden_act`."""
        super().__init__()
        self.config = config
        self.hidden_size = config.hidden_size
        self.intermediate_size = config.intermediate_size
        self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
        self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
        self.act_fn = ACT2FN[config.hidden_act]

    def forward(self, x):
        """Apply the gated feed-forward transformation to `x`."""
        gate = self.act_fn(self.gate_proj(x))
        lifted = self.up_proj(x)
        return self.down_proj(gate * lifted)
class CohereAttention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(self, config: CohereConfig, layer_idx: Optional[int] = None):
        """
        Args:
            config: model configuration; head counts, dropout, RoPE settings and bias flags are read from it.
            layer_idx: index of this layer, forwarded to the cache on update. A warning is logged when it
                is missing.
        """
        super().__init__()
        self.config = config
        self.layer_idx = layer_idx
        if layer_idx is None:
            logger.warning_once(
                f"Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will "
                "lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` "
                "when creating this class."
            )

        self.attention_dropout = config.attention_dropout
        self.hidden_size = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_dim = self.hidden_size // self.num_heads
        self.num_key_value_heads = config.num_key_value_heads
        self.num_key_value_groups = self.num_heads // self.num_key_value_heads
        self.max_position_embeddings = config.max_position_embeddings
        self.rope_theta = config.rope_theta
        self.is_causal = True
        self.use_qk_norm = config.use_qk_norm

        if (self.head_dim * self.num_heads) != self.hidden_size:
            raise ValueError(
                f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
                f" and `num_heads`: {self.num_heads})."
            )

        if self.use_qk_norm:
            # When sharding the model using Tensor Parallelism, need to be careful to use n_local_heads
            self.q_norm = CohereLayerNorm(hidden_size=(self.num_heads, self.head_dim), eps=config.layer_norm_eps)
            self.k_norm = CohereLayerNorm(
                hidden_size=(self.num_key_value_heads, self.head_dim), eps=config.layer_norm_eps
            )

        query_width = self.num_heads * self.head_dim
        kv_width = self.num_key_value_heads * self.head_dim
        use_bias = config.attention_bias
        self.q_proj = nn.Linear(self.hidden_size, query_width, bias=use_bias)
        self.k_proj = nn.Linear(self.hidden_size, kv_width, bias=use_bias)
        self.v_proj = nn.Linear(self.hidden_size, kv_width, bias=use_bias)
        self.o_proj = nn.Linear(self.hidden_size, self.hidden_size, bias=use_bias)
        self._init_rope()

    def _init_rope(self):
        """Create the rotary embedding module from the head size and the RoPE settings stored on `self`."""
        self.rotary_emb = CohereRotaryEmbedding(
            self.head_dim,
            max_position_embeddings=self.max_position_embeddings,
            base=self.rope_theta,
        )

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Cache] = None,
        output_attentions: bool = False,
        use_cache: bool = False,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        """
        Eager attention: explicit `softmax(QK^T / sqrt(head_dim) + mask)` followed by dropout and `@ V`.

        Returns:
            `(attn_output, attn_weights, past_key_value)`; `attn_weights` is `None` unless
            `output_attentions` is set.
        """
        batch_size, seq_len, _ = hidden_states.size()

        queries = self.q_proj(hidden_states)
        keys = self.k_proj(hidden_states)
        values = self.v_proj(hidden_states)

        # Split the projected features into heads; the optional QK norm runs on this layout.
        queries = queries.view(batch_size, seq_len, self.num_heads, self.head_dim)
        keys = keys.view(batch_size, seq_len, self.num_key_value_heads, self.head_dim)
        values = values.view(batch_size, seq_len, self.num_key_value_heads, self.head_dim)
        if self.use_qk_norm:
            queries = self.q_norm(queries)
            keys = self.k_norm(keys)

        # Move the head dimension in front of the sequence dimension.
        queries = queries.transpose(1, 2)
        keys = keys.transpose(1, 2)
        values = values.transpose(1, 2)

        cos, sin = self.rotary_emb(values, position_ids)
        queries, keys = apply_rotary_pos_emb(queries, keys, cos, sin)

        if past_key_value is not None:
            # sin and cos are specific to RoPE models; cache_position is passed along for the cache update
            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
            keys, values = past_key_value.update(keys, values, self.layer_idx, cache_kwargs)

        keys = repeat_kv(keys, self.num_key_value_groups)
        values = repeat_kv(values, self.num_key_value_groups)

        scores = torch.matmul(queries, keys.transpose(2, 3)) / math.sqrt(self.head_dim)
        if attention_mask is not None:
            # no matter the length, we just slice it
            scores = scores + attention_mask[:, :, :, : keys.shape[-2]]

        # upcast attention to fp32
        probs = nn.functional.softmax(scores, dim=-1, dtype=torch.float32).to(queries.dtype)
        probs = nn.functional.dropout(probs, p=self.attention_dropout, training=self.training)
        context = torch.matmul(probs, values)

        expected_shape = (batch_size, self.num_heads, seq_len, self.head_dim)
        if context.size() != expected_shape:
            raise ValueError(
                f"`attn_output` should be of size {expected_shape}, but is"
                f" {context.size()}"
            )

        context = context.transpose(1, 2).contiguous()
        context = context.reshape(batch_size, seq_len, self.hidden_size)
        attn_output = self.o_proj(context)

        attn_weights = probs if output_attentions else None
        return attn_output, attn_weights, past_key_value
class CohereSdpaAttention(CohereAttention):
    """
    Attention variant that computes the attention output with
    `torch.nn.functional.scaled_dot_product_attention`, reusing the projections built by `CohereAttention`.
    """

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Cache] = None,
        output_attentions: bool = False,
        use_cache: bool = False,
        cache_position: Optional[torch.LongTensor] = None,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        """
        Returns:
            `(attn_output, None, past_key_value)`. When `output_attentions` is set, the call is delegated to
            the parent implementation and its result is returned instead.
        """
        if output_attentions:
            # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented.
            logger.warning_once(
                "CohereModel is using CohereSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, "
                'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
            )
            return super().forward(
                hidden_states=hidden_states,
                attention_mask=attention_mask,
                position_ids=position_ids,
                past_key_value=past_key_value,
                output_attentions=output_attentions,
                use_cache=use_cache,
                cache_position=cache_position,
            )

        batch_size, seq_len, _ = hidden_states.size()

        queries = self.q_proj(hidden_states)
        keys = self.k_proj(hidden_states)
        values = self.v_proj(hidden_states)

        queries = queries.view(batch_size, seq_len, self.num_heads, self.head_dim)
        keys = keys.view(batch_size, seq_len, self.num_key_value_heads, self.head_dim)
        values = values.view(batch_size, seq_len, self.num_key_value_heads, self.head_dim)
        if self.use_qk_norm:
            queries = self.q_norm(queries)
            keys = self.k_norm(keys)

        queries = queries.transpose(1, 2)
        keys = keys.transpose(1, 2)
        values = values.transpose(1, 2)

        cos, sin = self.rotary_emb(values, position_ids)
        queries, keys = apply_rotary_pos_emb(queries, keys, cos, sin)

        if past_key_value is not None:
            # sin and cos are specific to RoPE models; cache_position needed for the static cache
            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
            keys, values = past_key_value.update(keys, values, self.layer_idx, cache_kwargs)

        keys = repeat_kv(keys, self.num_key_value_groups)
        values = repeat_kv(values, self.num_key_value_groups)

        # Trim the mask's last dimension to the number of key positions actually present.
        causal_mask = None if attention_mask is None else attention_mask[:, :, :, : keys.shape[-2]]

        # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask,
        # Reference: https://github.com/pytorch/pytorch/issues/112577.
        if causal_mask is not None and queries.device.type == "cuda":
            queries = queries.contiguous()
            keys = keys.contiguous()
            values = values.contiguous()

        # We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` if statement instead of an inline conditional assignment
        # in SDPA to support both torch.compile's dynamic shapes and full graph options. An inline conditional prevents dynamic shapes from compiling.
        is_causal = True if causal_mask is None and seq_len > 1 else False

        context = torch.nn.functional.scaled_dot_product_attention(
            queries,
            keys,
            values,
            attn_mask=causal_mask,
            dropout_p=self.attention_dropout if self.training else 0.0,
            is_causal=is_causal,
        )

        context = context.transpose(1, 2).contiguous()
        context = context.view(batch_size, seq_len, self.hidden_size)
        attn_output = self.o_proj(context)

        return attn_output, None, past_key_value
class CohereFlashAttention2(CohereAttention):
    """
    Attention variant that hands the attention computation to `self._flash_attention_forward`, reusing the
    projections built by `CohereAttention`. Attention weights are never returned.
    """

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Cache] = None,
        output_attentions: bool = False,
        use_cache: bool = False,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        """
        Returns:
            `(attn_output, None, past_key_value)`.

        Raises:
            ValueError: if `past_key_value` is a `StaticCache`.
        """
        if isinstance(past_key_value, StaticCache):
            raise ValueError(
                "`static` cache implementation is not compatible with `attn_implementation==flash_attention_2` "
                "make sure to use `sdpa` in the mean time, and open an issue at https://github.com/huggingface/transformers"
            )

        # Attention weights are not produced on this path, whatever the caller asked for.
        output_attentions = False

        batch_size, seq_len, _ = hidden_states.size()

        queries = self.q_proj(hidden_states)
        keys = self.k_proj(hidden_states)
        values = self.v_proj(hidden_states)

        queries = queries.view(batch_size, seq_len, self.num_heads, self.head_dim)
        keys = keys.view(batch_size, seq_len, self.num_key_value_heads, self.head_dim)
        values = values.view(batch_size, seq_len, self.num_key_value_heads, self.head_dim)
        if self.use_qk_norm:
            queries = self.q_norm(queries)
            keys = self.k_norm(keys)

        queries = queries.transpose(1, 2)
        keys = keys.transpose(1, 2)
        values = values.transpose(1, 2)

        cos, sin = self.rotary_emb(values, position_ids)
        queries, keys = apply_rotary_pos_emb(queries, keys, cos, sin)

        if past_key_value is not None:
            # sin and cos are specific to RoPE models; cache_position is passed along for the cache update
            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
            keys, values = past_key_value.update(keys, values, self.layer_idx, cache_kwargs)

        # TODO: These transpose are quite inefficient but Flash Attention requires the layout [batch_size, sequence_length, num_heads, head_dim]. We would need to refactor the KV cache
        # to be able to avoid many of these transpose/reshape/view.
        queries = queries.transpose(1, 2)
        keys = keys.transpose(1, 2)
        values = values.transpose(1, 2)

        dropout_rate = self.attention_dropout if self.training else 0.0

        # Ignore copy
        # In PEFT, usually we cast the layer norms in float32 for training stability reasons
        # therefore the input hidden states gets silently casted in float32. Hence, we need
        # cast them back in the correct dtype just to be sure everything works as expected.
        # This might slowdown training & inference so it is recommended to not cast the LayerNorms
        # in fp32. (CohereLayerNorm handles it correctly)
        if queries.dtype == torch.float32:
            if torch.is_autocast_enabled():
                target_dtype = torch.get_autocast_gpu_dtype()
            # Handle the case where the model is quantized
            elif hasattr(self.config, "_pre_quantization_dtype"):
                target_dtype = self.config._pre_quantization_dtype
            else:
                target_dtype = self.q_proj.weight.dtype

            logger.warning_once(
                f"The input hidden states seems to be silently casted in float32, this might be related to"
                f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in"
                f" {target_dtype}."
            )

            queries = queries.to(target_dtype)
            keys = keys.to(target_dtype)
            values = values.to(target_dtype)

        # NOTE(review): `_flash_attention_forward` is not defined on this class or on `CohereAttention`
        # in this file -- presumably supplied elsewhere; confirm it resolves at runtime.
        context = self._flash_attention_forward(
            queries, keys, values, attention_mask, seq_len, dropout=dropout_rate
        )

        context = context.reshape(batch_size, seq_len, self.hidden_size).contiguous()
        attn_output = self.o_proj(context)

        # `output_attentions` was forced to False above, so the weights slot is always None.
        return attn_output, None, past_key_value
# Maps an attention implementation name to the attention class to instantiate.
# CohereDecoderLayer indexes this table with `config._attn_implementation`.
COHERE_ATTENTION_CLASSES = {
    "eager": CohereAttention,
    "flash_attention_2": CohereFlashAttention2,
    "sdpa": CohereSdpaAttention,
}
class CohereDecoderLayer(nn.Module):
    """
    Decoder layer with a parallel residual: attention and MLP both read the same normalized input, and
    their outputs are added to the un-normalized input in a single sum.
    """

    def __init__(self, config: CohereConfig, layer_idx: int):
        """Build the attention module selected by `config._attn_implementation`, the MLP and the input norm."""
        super().__init__()
        self.hidden_size = config.hidden_size

        attention_cls = COHERE_ATTENTION_CLASSES[config._attn_implementation]
        self.self_attn = attention_cls(config=config, layer_idx=layer_idx)

        self.mlp = CohereMLP(config)
        self.input_layernorm = CohereLayerNorm(hidden_size=(config.hidden_size), eps=config.layer_norm_eps)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Tuple[torch.Tensor]] = None,
        output_attentions: Optional[bool] = False,
        use_cache: Optional[bool] = False,
        cache_position: Optional[torch.LongTensor] = None,
    ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
        """
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`, *optional*):
                attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1,
                query_sequence_length, key_sequence_length)` if default attention is used.
            position_ids (`torch.LongTensor`, *optional*): forwarded unchanged to the attention module.
            past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            cache_position (`torch.LongTensor`, *optional*): forwarded unchanged to the attention module.

        Returns:
            A tuple starting with the new hidden states, followed by the attention weights when
            `output_attentions` is set and by the attention module's cache output when `use_cache` is set.
        """
        normed_states = self.input_layernorm(hidden_states)

        # Self Attention
        attention_out, attention_weights, present_key_value = self.self_attn(
            hidden_states=normed_states,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_value=past_key_value,
            output_attentions=output_attentions,
            use_cache=use_cache,
            cache_position=cache_position,
        )

        # Fully Connected (reads the same normalized input as the attention branch)
        mlp_out = self.mlp(normed_states)

        # Add everything together
        collected = [hidden_states + attention_out + mlp_out]
        if output_attentions:
            collected.append(attention_weights)
        if use_cache:
            collected.append(present_key_value)
        return tuple(collected)
class CohereForCausalLM(LlamaForCausalLM):
    """Causal language model whose logits are multiplied by `config.logit_scale` before being returned."""

    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
    ) -> Union[Tuple, CausalLMOutputWithPast]:
        r"""
        Args:
            labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
                Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
                config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
                (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Returns:

        Example:

        ```python
        >>> from transformers import AutoTokenizer, CohereForCausalLM

        >>> model = CohereForCausalLM.from_pretrained("CohereForAI/c4ai-command-r-v01")
        >>> tokenizer = AutoTokenizer.from_pretrained("CohereForAI/c4ai-command-r-v01")

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```"""
        # Fall back to the configuration for every flag the caller left unset.
        if output_attentions is None:
            output_attentions = self.config.output_attentions
        if output_hidden_states is None:
            output_hidden_states = self.config.output_hidden_states
        if return_dict is None:
            return_dict = self.config.use_return_dict

        # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
        outputs = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            cache_position=cache_position,
        )

        # Project to the vocabulary, apply the configured scale, then upcast to float32.
        scaled_logits = self.lm_head(outputs[0]) * self.config.logit_scale
        logits = scaled_logits.float()

        loss = None
        if labels is not None:
            # Shift so that tokens < n predict n, then flatten the tokens
            flat_logits = logits[..., :-1, :].contiguous().view(-1, self.config.vocab_size)
            flat_labels = labels[..., 1:].contiguous().view(-1)
            # Enable model parallelism
            flat_labels = flat_labels.to(flat_logits.device)
            loss = CrossEntropyLoss()(flat_logits, flat_labels)

        if return_dict:
            return CausalLMOutputWithPast(
                loss=loss,
                logits=logits,
                past_key_values=outputs.past_key_values,
                hidden_states=outputs.hidden_states,
                attentions=outputs.attentions,
            )

        output = (logits,) + outputs[1:]
        return output if loss is None else (loss,) + output

    @property
    def logit_scale(self):
        """Read-through to `config.logit_scale`; logs a deprecation warning on every access."""
        logger.warning(
            "`logit_scale` attribute is going to be deprecated in future versions, please use `model.config.logit_scale` instead."
        )
        return self.config.logit_scale

    @property
    def tie_word_embeddings(self):
        """Read-through to `config.tie_word_embeddings`; logs a deprecation warning on every access."""
        logger.warning(
            "`tie_word_embeddings` attribute is going to be deprecated in future versions, please use `model.config.tie_word_embeddings` instead."
        )
        return self.config.tie_word_embeddings

View File

@ -265,7 +265,7 @@ class FuyuForCausalLM(FuyuPreTrainedModel, GenerationMixin):
>>> image = Image.open(requests.get(url, stream=True).raw)
>>> prompt = "Generate a coco-style caption.\n"
>>> inputs = processor(images=image, text=prompt, return_tensors="pt")
>>> inputs = processor(text=prompt, images=image, return_tensors="pt")
>>> outputs = model(**inputs)
>>> generated_ids = model.generate(**inputs, max_new_tokens=7)

View File

@ -21,10 +21,9 @@ from typing import Dict, List, Optional, Tuple, Union
import numpy as np
from ...image_utils import ImageInput
from ...processing_utils import ProcessingKwargs, ProcessorMixin, Unpack, _validate_images_text_input_order
from ...tokenization_utils_base import PreTokenizedInput, TextInput
from ...utils import is_torch_available, logging, requires_backends
from ...processing_utils import ProcessorMixin
from ...tokenization_utils_base import PaddingStrategy, TruncationStrategy
from ...utils import TensorType, is_torch_available, logging, requires_backends
if is_torch_available():
@ -50,24 +49,6 @@ TOKEN_POINT_CLOSE_STRING = "<0x03>" # </point>
BEGINNING_OF_ANSWER_STRING = "<0x04>" # <boa>
class FuyuProcessorKwargs(ProcessingKwargs, total=False):
_defaults = {
"text_kwargs": {
"add_special_tokens": True,
"padding": False,
"stride": 0,
"return_attention_mask": True,
"return_overflowing_tokens": False,
"return_special_tokens_mask": False,
"return_offsets_mapping": False,
"return_token_type_ids": False,
"return_length": False,
"verbose": True,
},
"images_kwargs": {},
}
def full_unpacked_stream_to_tensor(
all_bi_tokens_to_place: List[int],
full_unpacked_stream: List["torch.Tensor"],
@ -471,11 +452,23 @@ class FuyuProcessor(ProcessorMixin):
def __call__(
self,
images: ImageInput = None,
text: Optional[Union[str, List[str], TextInput, PreTokenizedInput]] = None,
audio=None,
videos=None,
**kwargs: Unpack[FuyuProcessorKwargs],
text=None,
images=None,
add_special_tokens: bool = True,
return_attention_mask: bool = True,
padding: Union[bool, str, PaddingStrategy] = False,
truncation: Union[bool, str, TruncationStrategy] = None,
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
return_overflowing_tokens: bool = False,
return_special_tokens_mask: bool = False,
return_offsets_mapping: bool = False,
return_token_type_ids: bool = False,
return_length: bool = False,
verbose: bool = True,
return_tensors: Optional[Union[str, TensorType]] = None,
**kwargs,
) -> "FuyuBatchFeature":
"""
Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
@ -485,13 +478,13 @@ class FuyuProcessor(ProcessorMixin):
of the above two methods for more information.
Args:
images (`PIL.Image.Image`, `List[PIL.Image.Image]`):
The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
tensor. Both channels-first and channels-last formats are supported.
text (`str`, `List[str]`):
The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
(pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
`is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
images (`PIL.Image.Image`, `List[PIL.Image.Image]`):
The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
tensor. Both channels-first and channels-last formats are supported.
Returns:
[`FuyuBatchEncoding`]: A [`FuyuBatchEncoding`] with the following fields:
@ -505,24 +498,31 @@ class FuyuProcessor(ProcessorMixin):
requires_backends(self, ["torch"])
# --- Check input validity ---
if not return_attention_mask:
raise ValueError("`return_attention_mask=False` is not supported for this model.")
if text is None and images is None:
raise ValueError("You have to specify either text or images. Both cannot be None.")
# check if images and text inputs are reversed for BC
images, text = _validate_images_text_input_order(images, text)
output_kwargs = self._merge_kwargs(
FuyuProcessorKwargs,
tokenizer_init_kwargs=self.tokenizer.init_kwargs,
**kwargs,
)
if not output_kwargs["text_kwargs"].setdefault("return_attention_mask", True):
raise ValueError("`return_attention_mask=False` is not supported for this model.")
if text is not None and images is None:
logger.warning("You are processing a text with no associated image. Make sure it is intended.")
self.current_processor = self.tokenizer
text_encoding = self.tokenizer(text, **output_kwargs["text_kwargs"])
text_encoding = self.tokenizer(
text=text,
add_special_tokens=add_special_tokens,
padding=padding,
truncation=truncation,
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
return_attention_mask=return_attention_mask,
return_overflowing_tokens=return_overflowing_tokens,
return_special_tokens_mask=return_special_tokens_mask,
return_offsets_mapping=return_offsets_mapping,
return_token_type_ids=return_token_type_ids,
return_length=return_length,
verbose=verbose,
return_tensors=return_tensors,
**kwargs,
)
return text_encoding
if text is None and images is not None:
@ -537,8 +537,7 @@ class FuyuProcessor(ProcessorMixin):
# --- Preprocess images using self.image_processor ---
# FIXME - We hard code "pt" here because the rest of the processing assumes torch tensors
output_kwargs["images_kwargs"]["return_tensors"] = "pt"
image_encoding = self.image_processor.preprocess(images, **output_kwargs["images_kwargs"])
image_encoding = self.image_processor.preprocess(images, return_tensors="pt")
batch_images = image_encoding["images"]
image_unpadded_heights = image_encoding["image_unpadded_heights"]
image_unpadded_widths = image_encoding["image_unpadded_widths"]
@ -569,7 +568,7 @@ class FuyuProcessor(ProcessorMixin):
)
all_encodings.append(sample_encoding)
batch_encoding = self._left_pad_inputs_with_attention_mask(
model_inputs=all_encodings, return_attention_mask=True
model_inputs=all_encodings, return_attention_mask=return_attention_mask
)
return FuyuBatchFeature(data=batch_encoding)

View File

@ -85,7 +85,6 @@ class Gemma2Config(PretrainedConfig):
size of the sliding window.
final_logit_softcapping (`float`, *optional*, defaults to 30.0): scaling factor when applying tanh softcapping on the logits.
attn_logit_softcapping (`float`, *optional*, defaults to 50.0): scaling factor when applying tanh softcapping on the attention scores.
cache_implementation (`str`, *optional*, defaults to `"hybrid"`): the cache type to be used with `generate`.
```python
>>> from transformers import Gemma2Model, Gemma2Config
@ -99,6 +98,7 @@ class Gemma2Config(PretrainedConfig):
model_type = "gemma2"
keys_to_ignore_at_inference = ["past_key_values"]
cache_implementation = "hybrid"
def __init__(
self,
@ -125,7 +125,6 @@ class Gemma2Config(PretrainedConfig):
sliding_window=4096,
final_logit_softcapping=30.0,
attn_logit_softcapping=50.0,
cache_implementation="hybrid",
**kwargs,
):
super().__init__(
@ -154,4 +153,3 @@ class Gemma2Config(PretrainedConfig):
self.sliding_window = sliding_window
self.final_logit_softcapping = final_logit_softcapping
self.attn_logit_softcapping = attn_logit_softcapping
self.cache_implementation = cache_implementation

View File

@ -117,7 +117,6 @@ class Gemma2Config(PretrainedConfig):
size of the sliding window.
final_logit_softcapping (`float`, *optional*, defaults to 30.0): scaling factor when applying tanh softcapping on the logits.
attn_logit_softcapping (`float`, *optional*, defaults to 50.0): scaling factor when applying tanh softcapping on the attention scores.
cache_implementation (`str`, *optional*, defaults to `"hybrid"`): the cache type to be used with `generate`.
```python
>>> from transformers import Gemma2Model, Gemma2Config
@ -131,6 +130,7 @@ class Gemma2Config(PretrainedConfig):
model_type = "gemma2"
keys_to_ignore_at_inference = ["past_key_values"]
cache_implementation = "hybrid"
def __init__(
self,
@ -157,7 +157,6 @@ class Gemma2Config(PretrainedConfig):
sliding_window=4096,
final_logit_softcapping=30.0,
attn_logit_softcapping=50.0,
cache_implementation="hybrid",
**kwargs,
):
super().__init__(
@ -186,7 +185,6 @@ class Gemma2Config(PretrainedConfig):
self.sliding_window = sliding_window
self.final_logit_softcapping = final_logit_softcapping
self.attn_logit_softcapping = attn_logit_softcapping
self.cache_implementation = cache_implementation
class Gemma2RMSNorm(GemmaRMSNorm):

Some files were not shown because too many files have changed in this diff Show More