Compare commits


1 Commit

Author SHA1 Message Date
6c9f50deec fix audio pipeline with torchcodec input 2025-07-09 15:55:27 +02:00
404 changed files with 4467 additions and 16164 deletions

View File

@@ -303,7 +303,7 @@ non_model_job = CircleCIJob(
docker_image=[{"image": "huggingface/transformers-torch-light"}],
# networkx==3.3 (after #36957) causes some issues
# TODO: remove this once it works directly
install_steps=["uv venv && uv pip install .[serving]"],
install_steps=["uv venv && uv pip install ."],
marker="not generate",
parallelism=6,
)

View File

@@ -3,7 +3,7 @@ name: Build docker images (scheduled)
on:
push:
branches:
- build_with_2404
- build_ci_docker_image*
repository_dispatch:
workflow_call:
inputs:
@@ -43,4 +43,313 @@ jobs:
build-args: |
REF=main
push: true
tags: huggingface/transformers-all-latest-gpu${{ inputs.image_postfix }}-test
tags: huggingface/transformers-all-latest-gpu${{ inputs.image_postfix }}
# Push CI images still need to be re-built daily
-
name: Build and push (for Push CI) on a daily basis
# This condition allows `schedule` events, or `push` events that trigger this workflow NOT via `workflow_call`.
# The latter case is useful for manual image building for debugging purposes. Use another tag in this case!
if: inputs.image_postfix != '-push-ci'
uses: docker/build-push-action@v5
with:
context: ./docker/transformers-all-latest-gpu
build-args: |
REF=main
push: true
tags: huggingface/transformers-all-latest-gpu-push-ci
- name: Post to Slack
if: always()
uses: huggingface/hf-workflows/.github/actions/post-slack@main
with:
slack_channel: ${{ secrets.CI_SLACK_CHANNEL_DOCKER }}
title: 🤗 Results of the transformers-all-latest-gpu-push-ci docker build
status: ${{ job.status }}
slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
latest-torch-deepspeed-docker:
name: "Latest PyTorch + DeepSpeed"
runs-on:
group: aws-g4dn-2xlarge-cache
steps:
-
name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
-
name: Check out code
uses: actions/checkout@v4
-
name: Login to DockerHub
uses: docker/login-action@v3
with:
username: ${{ secrets.DOCKERHUB_USERNAME }}
password: ${{ secrets.DOCKERHUB_PASSWORD }}
-
name: Build and push
uses: docker/build-push-action@v5
with:
context: ./docker/transformers-pytorch-deepspeed-latest-gpu
build-args: |
REF=main
push: true
tags: huggingface/transformers-pytorch-deepspeed-latest-gpu${{ inputs.image_postfix }}
- name: Post to Slack
if: always()
uses: huggingface/hf-workflows/.github/actions/post-slack@main
with:
slack_channel: ${{ secrets.CI_SLACK_CHANNEL_DOCKER }}
title: 🤗 Results of the transformers-pytorch-deepspeed-latest-gpu docker build
status: ${{ job.status }}
slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
# Can't build 2 images in a single job `latest-torch-deepspeed-docker` (for `nvcr.io/nvidia`)
latest-torch-deepspeed-docker-for-push-ci-daily-build:
name: "Latest PyTorch + DeepSpeed (Push CI - Daily Build)"
runs-on:
group: aws-general-8-plus
steps:
-
name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
-
name: Check out code
uses: actions/checkout@v4
-
name: Login to DockerHub
uses: docker/login-action@v3
with:
username: ${{ secrets.DOCKERHUB_USERNAME }}
password: ${{ secrets.DOCKERHUB_PASSWORD }}
# Push CI images still need to be re-built daily
-
name: Build and push (for Push CI) on a daily basis
# This condition allows `schedule` events, or `push` events that trigger this workflow NOT via `workflow_call`.
# The latter case is useful for manual image building for debugging purposes. Use another tag in this case!
if: inputs.image_postfix != '-push-ci'
uses: docker/build-push-action@v5
with:
context: ./docker/transformers-pytorch-deepspeed-latest-gpu
build-args: |
REF=main
push: true
tags: huggingface/transformers-pytorch-deepspeed-latest-gpu-push-ci
- name: Post to Slack
if: always()
uses: huggingface/hf-workflows/.github/actions/post-slack@main
with:
slack_channel: ${{ secrets.CI_SLACK_CHANNEL_DOCKER }}
title: 🤗 Results of the transformers-pytorch-deepspeed-latest-gpu-push-ci docker build
status: ${{ job.status }}
slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
doc-builder:
name: "Doc builder"
# Push CI doesn't need this image
if: inputs.image_postfix != '-push-ci'
runs-on:
group: aws-general-8-plus
steps:
-
name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
-
name: Check out code
uses: actions/checkout@v4
-
name: Login to DockerHub
uses: docker/login-action@v3
with:
username: ${{ secrets.DOCKERHUB_USERNAME }}
password: ${{ secrets.DOCKERHUB_PASSWORD }}
-
name: Build and push
uses: docker/build-push-action@v5
with:
context: ./docker/transformers-doc-builder
push: true
tags: huggingface/transformers-doc-builder
- name: Post to Slack
if: always()
uses: huggingface/hf-workflows/.github/actions/post-slack@main
with:
slack_channel: ${{ secrets.CI_SLACK_CHANNEL_DOCKER }}
title: 🤗 Results of the huggingface/transformers-doc-builder docker build
status: ${{ job.status }}
slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
latest-pytorch:
name: "Latest PyTorch [dev]"
# Push CI doesn't need this image
if: inputs.image_postfix != '-push-ci'
runs-on:
group: aws-general-8-plus
steps:
-
name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
-
name: Check out code
uses: actions/checkout@v4
-
name: Login to DockerHub
uses: docker/login-action@v3
with:
username: ${{ secrets.DOCKERHUB_USERNAME }}
password: ${{ secrets.DOCKERHUB_PASSWORD }}
-
name: Build and push
uses: docker/build-push-action@v5
with:
context: ./docker/transformers-pytorch-gpu
build-args: |
REF=main
push: true
tags: huggingface/transformers-pytorch-gpu
- name: Post to Slack
if: always()
uses: huggingface/hf-workflows/.github/actions/post-slack@main
with:
slack_channel: ${{ secrets.CI_SLACK_CHANNEL_DOCKER }}
title: 🤗 Results of the huggingface/transformers-pytorch-gpu docker build
status: ${{ job.status }}
slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
latest-pytorch-amd:
name: "Latest PyTorch (AMD) [dev]"
runs-on:
group: aws-general-8-plus
steps:
-
name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
-
name: Check out code
uses: actions/checkout@v4
-
name: Login to DockerHub
uses: docker/login-action@v3
with:
username: ${{ secrets.DOCKERHUB_USERNAME }}
password: ${{ secrets.DOCKERHUB_PASSWORD }}
-
name: Build and push
uses: docker/build-push-action@v5
with:
context: ./docker/transformers-pytorch-amd-gpu
build-args: |
REF=main
push: true
tags: huggingface/transformers-pytorch-amd-gpu${{ inputs.image_postfix }}
# Push CI images still need to be re-built daily
-
name: Build and push (for Push CI) on a daily basis
# This condition allows `schedule` events, or `push` events that trigger this workflow NOT via `workflow_call`.
# The latter case is useful for manual image building for debugging purposes. Use another tag in this case!
if: inputs.image_postfix != '-push-ci'
uses: docker/build-push-action@v5
with:
context: ./docker/transformers-pytorch-amd-gpu
build-args: |
REF=main
push: true
tags: huggingface/transformers-pytorch-amd-gpu-push-ci
- name: Post to Slack
if: always()
uses: huggingface/hf-workflows/.github/actions/post-slack@main
with:
slack_channel: ${{ secrets.CI_SLACK_CHANNEL_DOCKER }}
title: 🤗 Results of the huggingface/transformers-pytorch-amd-gpu-push-ci build
status: ${{ job.status }}
slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
latest-pytorch-deepspeed-amd:
name: "PyTorch + DeepSpeed (AMD) [dev]"
runs-on:
group: aws-general-8-plus
steps:
-
name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
-
name: Check out code
uses: actions/checkout@v4
-
name: Login to DockerHub
uses: docker/login-action@v3
with:
username: ${{ secrets.DOCKERHUB_USERNAME }}
password: ${{ secrets.DOCKERHUB_PASSWORD }}
-
name: Build and push
uses: docker/build-push-action@v5
with:
context: ./docker/transformers-pytorch-deepspeed-amd-gpu
build-args: |
REF=main
push: true
tags: huggingface/transformers-pytorch-deepspeed-amd-gpu${{ inputs.image_postfix }}
# Push CI images still need to be re-built daily
-
name: Build and push (for Push CI) on a daily basis
# This condition allows `schedule` events, or `push` events that trigger this workflow NOT via `workflow_call`.
# The latter case is useful for manual image building for debugging purposes. Use another tag in this case!
if: inputs.image_postfix != '-push-ci'
uses: docker/build-push-action@v5
with:
context: ./docker/transformers-pytorch-deepspeed-amd-gpu
build-args: |
REF=main
push: true
tags: huggingface/transformers-pytorch-deepspeed-amd-gpu-push-ci
- name: Post to Slack
if: always()
uses: huggingface/hf-workflows/.github/actions/post-slack@main
with:
slack_channel: ${{ secrets.CI_SLACK_CHANNEL_DOCKER }}
title: 🤗 Results of the transformers-pytorch-deepspeed-amd-gpu build
status: ${{ job.status }}
slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
latest-quantization-torch-docker:
name: "Latest Pytorch + Quantization [dev]"
# Push CI doesn't need this image
if: inputs.image_postfix != '-push-ci'
runs-on:
group: aws-general-8-plus
steps:
-
name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
-
name: Check out code
uses: actions/checkout@v4
-
name: Login to DockerHub
uses: docker/login-action@v3
with:
username: ${{ secrets.DOCKERHUB_USERNAME }}
password: ${{ secrets.DOCKERHUB_PASSWORD }}
-
name: Build and push
uses: docker/build-push-action@v5
with:
context: ./docker/transformers-quantization-latest-gpu
build-args: |
REF=main
push: true
tags: huggingface/transformers-quantization-latest-gpu${{ inputs.image_postfix }}
- name: Post to Slack
if: always()
uses: huggingface/hf-workflows/.github/actions/post-slack@main
with:
slack_channel: ${{ secrets.CI_SLACK_CHANNEL_DOCKER }}
title: 🤗 Results of the transformers-quantization-latest-gpu build
status: ${{ job.status }}
slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}

View File

@@ -29,7 +29,7 @@ jobs:
runs-on: ubuntu-22.04
name: Get PR number
# For security: only allow team members to run
if: ${{ github.event.issue.state == 'open' && contains(fromJSON('["ydshieh", "ArthurZucker", "zucchini-nlp", "qubvel", "molbap", "gante", "LysandreJik", "Cyrilvallez", "Rocketknight1", "SunMarc", "muellerzr", "eustlb", "MekkCyber", "manueldeprada", "vasqu", "ivarflakstad", "stevhliu"]'), github.actor) && (startsWith(github.event.comment.body, 'run-slow') || startsWith(github.event.comment.body, 'run slow') || startsWith(github.event.comment.body, 'run_slow')) }}
if: ${{ github.event.issue.state == 'open' && contains(fromJSON('["ydshieh", "ArthurZucker", "zucchini-nlp", "qubvel", "molbap", "gante", "LysandreJik", "Cyrilvallez", "Rocketknight1", "SunMarc", "muellerzr", "eustlb", "MekkCyber", "manueldeprada", "vasqu", "ivarflakstad"]'), github.actor) && (startsWith(github.event.comment.body, 'run-slow') || startsWith(github.event.comment.body, 'run slow') || startsWith(github.event.comment.body, 'run_slow')) }}
outputs:
PR_NUMBER: ${{ steps.set_pr_number.outputs.PR_NUMBER }}
steps:

.gitignore (vendored, 3 changed lines)
View File

@@ -167,6 +167,3 @@ tags
# ruff
.ruff_cache
# modular conversion
*.modular_backup

View File

@@ -28,7 +28,6 @@ from transformers.testing_utils import HfDoctestModule, HfDocTestParser
NOT_DEVICE_TESTS = {
"test_tokenization",
"test_tokenization_mistral_common",
"test_processor",
"test_processing",
"test_beam_constraints",

View File

@@ -1,4 +1,4 @@
FROM nvidia/cuda:12.6.0-cudnn-devel-ubuntu24.04
FROM nvidia/cuda:12.6.0-cudnn-devel-ubuntu22.04
LABEL maintainer="Hugging Face"
ARG DEBIAN_FRONTEND=noninteractive
@@ -30,8 +30,6 @@ RUN python3 -m pip install --no-cache-dir -e ./transformers[dev,onnxruntime] &&
RUN python3 -m pip uninstall -y flax jax
RUN python3 -m pip install --no-cache-dir -U timm
RUN python3 -m pip install --no-cache-dir git+https://github.com/facebookresearch/detectron2.git pytesseract
RUN python3 -m pip install -U "itsdangerous<2.1.0"

View File

@@ -517,8 +517,6 @@
title: Jukebox
- local: model_doc/led
title: LED
- local: model_doc/lfm2
title: LFM2
- local: model_doc/llama
title: LLaMA
- local: model_doc/llama2
@@ -563,8 +561,6 @@
title: MobileBERT
- local: model_doc/modernbert
title: ModernBert
- local: model_doc/modernbert-decoder
title: ModernBERTDecoder
- local: model_doc/mpnet
title: MPNet
- local: model_doc/mpt
@@ -713,8 +709,6 @@
title: D-FINE
- local: model_doc/dab-detr
title: DAB-DETR
- local: model_doc/deepseek_v2
title: DeepSeek-V2
- local: model_doc/deformable_detr
title: Deformable DETR
- local: model_doc/deit
@@ -1041,8 +1035,6 @@
title: PaliGemma
- local: model_doc/perceiver
title: Perceiver
- local: model_doc/perception_lm
title: PerceptionLM
- local: model_doc/phi4_multimodal
title: Phi4 Multimodal
- local: model_doc/pix2struct

View File

@@ -99,6 +99,8 @@ self.value_cache[layer_idx] = torch.cat([self.value_cache[layer_idx], value_stat
2. The cache grows dynamically as more tokens are processed. The sequence length dimension (`seq_len`) increases with each new token.
3. The cache maintains a count of seen tokens through `self._seen_tokens`. This is updated when the first layer processes a new token.
The example below demonstrates how to create a generation loop with [`DynamicCache`]. As discussed, the attention mask is a concatenation of past and current token values and `1` is added to the cache position for the next token.
```py
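# The rest of this example is cut off by the diff hunk above; what follows is a minimal
# sketch of such a generation loop, not the page's original code. The small open checkpoint
# "HuggingFaceTB/SmolLM2-135M" is an assumption chosen for illustration.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, DynamicCache

model_id = "HuggingFaceTB/SmolLM2-135M"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id)

inputs = tokenizer("The advantage of a dynamic cache is", return_tensors="pt")
generated_ids = inputs.input_ids
attention_mask = inputs.attention_mask
past_key_values = DynamicCache()
cache_position = torch.arange(generated_ids.shape[1], dtype=torch.int64)
input_ids = generated_ids

for _ in range(10):
    outputs = model(
        input_ids=input_ids,
        attention_mask=attention_mask,
        past_key_values=past_key_values,
        cache_position=cache_position,
        use_cache=True,
    )
    next_token = outputs.logits[:, -1:].argmax(-1)
    generated_ids = torch.cat([generated_ids, next_token], dim=-1)
    # the attention mask is a concatenation of past and current tokens, so append a 1 for the new token
    attention_mask = torch.cat([attention_mask, attention_mask.new_ones((attention_mask.shape[0], 1))], dim=-1)
    # only the new token is fed on the next step; add 1 to the cache position for it
    input_ids = next_token
    cache_position = cache_position[-1:] + 1

print(tokenizer.decode(generated_ids[0], skip_special_tokens=True))
```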

View File

@@ -14,105 +14,49 @@ rendered properly in your Markdown viewer.
-->
<div style="float: right;">
<div class="flex flex-wrap space-x-1">
<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
<img alt="TensorFlow" src="https://img.shields.io/badge/TensorFlow-FF6F00?style=flat&logo=tensorflow&logoColor=white">
<img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
</div>
</div>
# CamemBERT
[CamemBERT](https://huggingface.co/papers/1911.03894) is a language model based on [RoBERTa](./roberta), but trained specifically on French text from the OSCAR dataset, making it more effective for French language tasks.
<div class="flex flex-wrap space-x-1">
<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
<img alt="TensorFlow" src="https://img.shields.io/badge/TensorFlow-FF6F00?style=flat&logo=tensorflow&logoColor=white">
<img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
</div>
What sets CamemBERT apart is that it was trained on a large, high-quality collection of French data, as opposed to a mix of many languages. This helps it understand French better than many multilingual models.
## Overview
Common applications of CamemBERT include masked language modeling (Fill-mask prediction), text classification (sentiment analysis), token classification (entity recognition) and sentence pair classification (entailment tasks).
The CamemBERT model was proposed in [CamemBERT: a Tasty French Language Model](https://huggingface.co/papers/1911.03894) by
[Louis Martin](https://huggingface.co/louismartin), [Benjamin Muller](https://huggingface.co/benjamin-mlr), [Pedro Javier Ortiz Suárez](https://huggingface.co/pjox), Yoann Dupont, Laurent Romary, Éric Villemonte de la
Clergerie, [Djamé Seddah](https://huggingface.co/Djame), and [Benoît Sagot](https://huggingface.co/sagot). It is based on Facebook's RoBERTa model released in 2019. It is a model
trained on 138GB of French text.
You can find all the original CamemBERT checkpoints under the [ALMAnaCH](https://huggingface.co/almanach/models?search=camembert) organization.
The abstract from the paper is the following:
> [!TIP]
> This model was contributed by the [ALMAnaCH (Inria)](https://huggingface.co/almanach) team.
>
> Click on the CamemBERT models in the right sidebar for more examples of how to apply CamemBERT to different NLP tasks.
*Pretrained language models are now ubiquitous in Natural Language Processing. Despite their success, most available
models have either been trained on English data or on the concatenation of data in multiple languages. This makes
practical use of such models --in all languages except English-- very limited. Aiming to address this issue for French,
we release CamemBERT, a French version of the Bi-directional Encoders for Transformers (BERT). We measure the
performance of CamemBERT compared to multilingual models in multiple downstream tasks, namely part-of-speech tagging,
dependency parsing, named-entity recognition, and natural language inference. CamemBERT improves the state of the art
for most of the tasks considered. We release the pretrained model for CamemBERT hoping to foster research and
downstream applications for French NLP.*
The examples below demonstrate how to predict the `<mask>` token with [`Pipeline`], [`AutoModel`], and from the command line.
This model was contributed by [the ALMAnaCH team (Inria)](https://huggingface.co/almanach). The original code can be found [here](https://camembert-model.fr/).
<hfoptions id="usage">
<Tip>
<hfoption id="Pipeline">
This implementation is the same as RoBERTa. Refer to the [documentation of RoBERTa](roberta) for usage examples as well
as the information relative to the inputs and outputs.
```python
import torch
from transformers import pipeline
</Tip>
pipeline = pipeline("fill-mask", model="camembert-base", torch_dtype=torch.float16, device=0)
pipeline("Le camembert est un délicieux fromage <mask>.")
```
</hfoption>
## Resources
<hfoption id="AutoModel">
```python
import torch
from transformers import AutoTokenizer, AutoModelForMaskedLM
tokenizer = AutoTokenizer.from_pretrained("camembert-base")
model = AutoModelForMaskedLM.from_pretrained("camembert-base", torch_dtype="auto", device_map="auto", attn_implementation="sdpa")
inputs = tokenizer("Le camembert est un délicieux fromage <mask>.", return_tensors="pt").to("cuda")
with torch.no_grad():
outputs = model(**inputs)
predictions = outputs.logits
masked_index = torch.where(inputs['input_ids'] == tokenizer.mask_token_id)[1]
predicted_token_id = predictions[0, masked_index].argmax(dim=-1)
predicted_token = tokenizer.decode(predicted_token_id)
print(f"The predicted token is: {predicted_token}")
```
</hfoption>
<hfoption id="transformers CLI">
```bash
echo -e "Le camembert est un délicieux fromage <mask>." | transformers run --task fill-mask --model camembert-base --device 0
```
</hfoption>
</hfoptions>
Quantization reduces the memory burden of large models by representing weights in lower precision. Refer to the [Quantization](../quantization/overview) overview for available options.
The example below uses [bitsandbytes](../quantization/bitsandbytes) quantization to quantize the weights to 8-bits.
```python
from transformers import AutoTokenizer, AutoModelForMaskedLM, BitsAndBytesConfig
import torch
quant_config = BitsAndBytesConfig(load_in_8bit=True)
model = AutoModelForMaskedLM.from_pretrained(
"almanach/camembert-large",
quantization_config=quant_config,
device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained("almanach/camembert-large")
inputs = tokenizer("Le camembert est un délicieux fromage <mask>.", return_tensors="pt").to("cuda")
with torch.no_grad():
outputs = model(**inputs)
predictions = outputs.logits
masked_index = torch.where(inputs["input_ids"] == tokenizer.mask_token_id)[1]
predicted_token_id = predictions[0, masked_index].argmax(dim=-1)
predicted_token = tokenizer.decode(predicted_token_id)
print(f"The predicted token is: {predicted_token}")
```
- [Text classification task guide](../tasks/sequence_classification)
- [Token classification task guide](../tasks/token_classification)
- [Question answering task guide](../tasks/question_answering)
- [Causal language modeling task guide](../tasks/language_modeling)
- [Masked language modeling task guide](../tasks/masked_language_modeling)
- [Multiple choice task guide](../tasks/multiple_choice)
## CamembertConfig
@@ -193,4 +137,5 @@ print(f"The predicted token is: {predicted_token}")
[[autodoc]] TFCamembertForQuestionAnswering
</tf>
</frameworkcontent>
</frameworkcontent>

View File

@@ -1,49 +0,0 @@
<!--Copyright 2025 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.
-->
# DeepSeek-V2
## Overview
The DeepSeek-V2 model was proposed in [DeepSeek-V2: A Strong, Economical, and Efficient Mixture-of-Experts Language Model](https://arxiv.org/abs/2405.04434) by DeepSeek-AI Team.
The abstract from the paper is the following:
We present DeepSeek-V2, a strong Mixture-of-Experts (MoE) language model characterized by economical training and efficient inference. It comprises 236B total parameters, of which 21B are activated for each token, and supports a context length of 128K tokens. DeepSeek-V2 adopts innovative architectures including Multi-head Latent Attention (MLA) and DeepSeekMoE. MLA guarantees efficient inference through significantly compressing the Key-Value (KV) cache into a latent vector, while DeepSeekMoE enables training strong models at an economical cost through sparse computation. Compared with DeepSeek 67B, DeepSeek-V2 achieves significantly stronger performance, and meanwhile saves 42.5% of training costs, reduces the KV cache by 93.3%, and boosts the maximum generation throughput to 5.76 times. We pretrain DeepSeek-V2 on a high-quality and multi-source corpus consisting of 8.1T tokens, and further perform Supervised Fine-Tuning (SFT) and Reinforcement Learning (RL) to fully unlock its potential. Evaluation results show that, even with only 21B activated parameters, DeepSeek-V2 and its chat versions still achieve top-tier performance among open-source models.
This model was contributed by [VladOS95-cyber](https://github.com/VladOS95-cyber).
The original code can be found [here](https://huggingface.co/deepseek-ai/DeepSeek-V2).
### Usage tips
The model uses Multi-head Latent Attention (MLA) and DeepSeekMoE architectures for efficient inference and cost-effective training: MLA compresses the Key-Value (KV) cache into a latent vector, while DeepSeekMoE activates only 21B of the 236B parameters per token. After pretraining on 8.1T tokens and going through Supervised Fine-Tuning and Reinforcement Learning stages, the model can be used for a wide range of language tasks.
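A minimal generation sketch (not part of the original page): the checkpoint name and generation settings below are assumptions for illustration; the smaller `DeepSeek-V2-Lite-Chat` variant is easier to run locally than the full 236B model, and depending on the checkpoint revision you may need `trust_remote_code=True`.
```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# assumed checkpoint for illustration
model_id = "deepseek-ai/DeepSeek-V2-Lite-Chat"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, device_map="auto")

messages = [{"role": "user", "content": "Explain Multi-head Latent Attention in one sentence."}]
input_ids = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt").to(model.device)

output = model.generate(input_ids, max_new_tokens=64)
# decode only the newly generated tokens
print(tokenizer.decode(output[0][input_ids.shape[1]:], skip_special_tokens=True))
```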
## DeepseekV2Config
[[autodoc]] DeepseekV2Config
## DeepseekV2Model
[[autodoc]] DeepseekV2Model
- forward
## DeepseekV2ForCausalLM
[[autodoc]] DeepseekV2ForCausalLM
- forward
## DeepseekV2ForSequenceClassification
[[autodoc]] DeepseekV2ForSequenceClassification
- forward

View File

@@ -14,88 +14,115 @@ rendered properly in your Markdown viewer.
-->
<div style="float: right;">
<div class="flex flex-wrap space-x-1">
<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
<img alt="TensorFlow" src="https://img.shields.io/badge/TensorFlow-FF6F00?style=flat&logo=tensorflow&logoColor=white">
<img alt="Flax" src="https://img.shields.io/badge/Flax-29a79b.svg?style=flat&logo=data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAC0AAAAtCAMAAAANxBKoAAAC7lBMVEUAAADg5vYHPVgAoJH+/v76+v39/f9JbLP///9+AIgAnY3///+mcqzt8fXy9fgkXa3Ax9709fr+///9/f8qXq49qp5AaLGMwrv8/P0eW60VWawxYq8yqJzG2dytt9Wyu9elzci519Lf3O3S2efY3OrY0+Xp7PT///////+dqNCexMc6Z7AGpJeGvbenstPZ5ejQ1OfJzOLa7ejh4+/r8fT29vpccbklWK8PVa0AS6ghW63O498vYa+lsdKz1NDRt9Kw1c672tbD3tnAxt7R6OHp5vDe7OrDyuDn6vLl6/EAQKak0MgATakkppo3ZK/Bz9y8w9yzu9jey97axdvHzeG21NHH4trTwthKZrVGZLSUSpuPQJiGAI+GAI8SWKydycLL4d7f2OTi1+S9xNzL0ePT6OLGzeEAo5U0qJw/aLEAo5JFa7JBabEAp5Y4qZ2QxLyKmsm3kL2xoMOehrRNb7RIbbOZgrGre68AUqwAqZqNN5aKJ5N/lMq+qsd8kMa4pcWzh7muhLMEV69juq2kbKqgUaOTR5uMMZWLLZSGAI5VAIdEAH+ovNDHuNCnxcy3qcaYx8K8msGplrx+wLahjbYdXrV6vbMvYK9DrZ8QrZ8tqJuFms+Sos6sw8ecy8RffsNVeMCvmb43aLltv7Q4Y7EZWK4QWa1gt6meZKUdr6GOAZVeA4xPAISyveLUwtivxtKTpNJ2jcqfvcltiMiwwcfAoMVxhL+Kx7xjdrqTe60tsaNQs6KaRKACrJ6UTZwkqpqTL5pkHY4AloSgsd2ptNXPvNOOncuxxsqFl8lmg8apt8FJcr9EbryGxLqlkrkrY7dRa7ZGZLQ5t6iXUZ6PPpgVpZeJCJFKAIGareTa0+KJod3H0deY2M+esM25usmYu8d2zsJOdcBVvrCLbqcAOaaHaKQAMaScWqKBXqCXMJ2RHpiLF5NmJZAdAHN2kta11dKu1M+DkcZLdb+Mcql3TppyRJdzQ5ZtNZNlIY+DF4+voCOQAAAAZ3RSTlMABAT+MEEJ/RH+/TP+Zlv+pUo6Ifz8+fco/fz6+evr39S9nJmOilQaF/7+/f38+smmoYp6b1T+/v7++vj189zU0tDJxsGzsrKSfv34+Pf27dDOysG9t6+n/vv6+vr59uzr1tG+tZ6Qg9Ym3QAABR5JREFUSMeNlVVUG1EQhpcuxEspXqS0SKEtxQp1d3d332STTRpIQhIISQgJhODu7lAoDoUCpe7u7u7+1puGpqnCPOyZvffbOXPm/PsP9JfQgyCC+tmTABTOcbxDz/heENS7/1F+9nhvkHePG0wNDLbGWwdXL+rbLWvpmZHXD8+gMfBjTh+aSe6Gnn7lwQIOTR0c8wfX3PWgv7avbdKwf/ZoBp1Gp/PvuvXW3vw5ib7emnTW4OR+3D4jB9vjNJ/7gNvfWWeH/TO/JyYrsiKCRjVEZA3UB+96kON+DxOQ/NLE8PE5iUYgIXjFnCOlxEQMaSGVxjg4gxOnEycGz8bptuNjVx08LscIgrzH3umcn+KKtiBIyvzOO2O99aAdR8cF19oZalnCtvREUw79tCd5sow1g1UKM6kXqUx4T8wsi3sTjJ3yzDmmhenLXLpo8u45eG5y4Vvbk6kkC4LLtJMowkSQxmk4ggVJEG+7c6QpHT8vvW9X7/o7+3ELmiJi2mEzZJiz8cT6TBlanBk70cB5GGIGC1gRDdZ00yADLW1FL6gqhtvNXNG5S9gdSrk4M1qu7JAsmYshzDS4peoMrU/gT7qQdqYGZaYhxZmVbGJAm/CS/HloWyhRUlknQ9KYcExTwS80d3VNOxUZJpITYyspl0LbhArhpZCD9cRWEQuhYkNGMHToQ/2Cs6swJlb39CsllxdXX6IUKh/H5jbnSsPKjgmoaFQ1f8wRLR0UnGE/RcDEjj2jXG1WVTwUs8+zxfcrVO+vSsuOpVKxCfYZiQ0/aPKuxQbQ8lIz+DClxC8u+snlcJ7Yr1z1JPqUH0V+GDXbOwAib931Y4Imaq0NTIXPXY+N5L18GJ37SVWu+hwXff8l72Ds9XuwYIBaXPq6Shm4l+Vl/5QiOlV+uTk6YR9PxKsI9xNJny31ygK1e+nIRC1N97EGkFPI+jCpiHe5PCEy7oWqWSwRrpOvhFzcbTWMbm3ZJAOn1rUKpYIt/lDhW/5RHHteeWFN60qo98YJuoq1nK3uW5AabyspC1BcIEpOhft+SZAShYoLSvnmSfnYADUERP5jJn2h5XtsgCRuhYQqAvwTwn33+YWEKUI72HX5AtfSAZDe8F2DtPPm77afhl0EkthzuCQU0BWApgQIH9+KB0JhopMM7bJrdTRoleM2JAVNMyPF+wdoaz+XJpGoVAQ7WXUkcV7gT3oUZyi/ISIJAVKhgNp+4b4veCFhYVJw4locdSjZCp9cPUhLF9EZ3KKzURepMEtCDPP3VcWFx4UIiZIklIpFNfHpdEafIF2aRmOcrUmjohbT2WUllbmRvgfbythbQO3222fpDJoufaQPncYYuqoGtUEsCJZL6/3PR5b4syeSjZMQG/T2maGANlXT2v8S4AULWaUkCxfLyW8iW4kdka+nEMjxpL2NCwsYNBp+Q61PF43zyDg9Bm9+3NNySn78jMZUUkumqE4Gp7JmFOdP1vc8PpRrzj9+wPinCy8K1PiJ4aYbnTYpCCbDkBSbzhu2QJ1Gd82t8jI8TH51+OzvXoWbnXUOBkNW+0mWFwGcGOUVpU81/n3TOHb5oMt2FgYGjzau0Nif0Ss7Q3XB33hjjQHjHA5E5aOyIQc8CBrLdQSs3j92VG+3nNEjbkbdbBr9zm04ruvw37vh0QKOdeGIkckc80fX3KH/h7PT4BOjgCty8VZ5ux1MoO5Cf5naca2LAsEgehI+drX8o/0Nu+W0m6K/I9gGPd/dfx/EN/wN62AhsBWuAAAAAElFTkSuQmCC
">
<img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
</div>
</div>
# Encoder Decoder Models
[`EncoderDecoderModel`](https://huggingface.co/papers/1706.03762) initializes a sequence-to-sequence model with any pretrained autoencoder and pretrained autoregressive model. It is effective for sequence generation tasks as demonstrated in [Text Summarization with Pretrained Encoders](https://huggingface.co/papers/1908.08345) which uses [`BertModel`] as the encoder and decoder.
<div class="flex flex-wrap space-x-1">
<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
<img alt="TensorFlow" src="https://img.shields.io/badge/TensorFlow-FF6F00?style=flat&logo=tensorflow&logoColor=white">
<img alt="Flax" src="https://img.shields.io/badge/Flax-29a79b.svg?style=flat&logo=data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAC0AAAAtCAMAAAANxBKoAAAC7lBMVEUAAADg5vYHPVgAoJH+/v76+v39/f9JbLP///9+AIgAnY3///+mcqzt8fXy9fgkXa3Ax9709fr+///9/f8qXq49qp5AaLGMwrv8/P0eW60VWawxYq8yqJzG2dytt9Wyu9elzci519Lf3O3S2efY3OrY0+Xp7PT///////+dqNCexMc6Z7AGpJeGvbenstPZ5ejQ1OfJzOLa7ejh4+/r8fT29vpccbklWK8PVa0AS6ghW63O498vYa+lsdKz1NDRt9Kw1c672tbD3tnAxt7R6OHp5vDe7OrDyuDn6vLl6/EAQKak0MgATakkppo3ZK/Bz9y8w9yzu9jey97axdvHzeG21NHH4trTwthKZrVGZLSUSpuPQJiGAI+GAI8SWKydycLL4d7f2OTi1+S9xNzL0ePT6OLGzeEAo5U0qJw/aLEAo5JFa7JBabEAp5Y4qZ2QxLyKmsm3kL2xoMOehrRNb7RIbbOZgrGre68AUqwAqZqNN5aKJ5N/lMq+qsd8kMa4pcWzh7muhLMEV69juq2kbKqgUaOTR5uMMZWLLZSGAI5VAIdEAH+ovNDHuNCnxcy3qcaYx8K8msGplrx+wLahjbYdXrV6vbMvYK9DrZ8QrZ8tqJuFms+Sos6sw8ecy8RffsNVeMCvmb43aLltv7Q4Y7EZWK4QWa1gt6meZKUdr6GOAZVeA4xPAISyveLUwtivxtKTpNJ2jcqfvcltiMiwwcfAoMVxhL+Kx7xjdrqTe60tsaNQs6KaRKACrJ6UTZwkqpqTL5pkHY4AloSgsd2ptNXPvNOOncuxxsqFl8lmg8apt8FJcr9EbryGxLqlkrkrY7dRa7ZGZLQ5t6iXUZ6PPpgVpZeJCJFKAIGareTa0+KJod3H0deY2M+esM25usmYu8d2zsJOdcBVvrCLbqcAOaaHaKQAMaScWqKBXqCXMJ2RHpiLF5NmJZAdAHN2kta11dKu1M+DkcZLdb+Mcql3TppyRJdzQ5ZtNZNlIY+DF4+voCOQAAAAZ3RSTlMABAT+MEEJ/RH+/TP+Zlv+pUo6Ifz8+fco/fz6+evr39S9nJmOilQaF/7+/f38+smmoYp6b1T+/v7++vj189zU0tDJxsGzsrKSfv34+Pf27dDOysG9t6+n/vv6+vr59uzr1tG+tZ6Qg9Ym3QAABR5JREFUSMeNlVVUG1EQhpcuxEspXqS0SKEtxQp1d3d332STTRpIQhIISQgJhODu7lAoDoUCpe7u7u7+1puGpqnCPOyZvffbOXPm/PsP9JfQgyCC+tmTABTOcbxDz/heENS7/1F+9nhvkHePG0wNDLbGWwdXL+rbLWvpmZHXD8+gMfBjTh+aSe6Gnn7lwQIOTR0c8wfX3PWgv7avbdKwf/ZoBp1Gp/PvuvXW3vw5ib7emnTW4OR+3D4jB9vjNJ/7gNvfWWeH/TO/JyYrsiKCRjVEZA3UB+96kON+DxOQ/NLE8PE5iUYgIXjFnCOlxEQMaSGVxjg4gxOnEycGz8bptuNjVx08LscIgrzH3umcn+KKtiBIyvzOO2O99aAdR8cF19oZalnCtvREUw79tCd5sow1g1UKM6kXqUx4T8wsi3sTjJ3yzDmmhenLXLpo8u45eG5y4Vvbk6kkC4LLtJMowkSQxmk4ggVJEG+7c6QpHT8vvW9X7/o7+3ELmiJi2mEzZJiz8cT6TBlanBk70cB5GGIGC1gRDdZ00yADLW1FL6gqhtvNXNG5S9gdSrk4M1qu7JAsmYshzDS4peoMrU/gT7qQdqYGZaYhxZmVbGJAm/CS/HloWyhRUlknQ9KYcExTwS80d3VNOxUZJpITYyspl0LbhArhpZCD9cRWEQuhYkNGMHToQ/2Cs6swJlb39CsllxdXX6IUKh/H5jbnSsPKjgmoaFQ1f8wRLR0UnGE/RcDEjj2jXG1WVTwUs8+zxfcrVO+vSsuOpVKxCfYZiQ0/aPKuxQbQ8lIz+DClxC8u+snlcJ7Yr1z1JPqUH0V+GDXbOwAib931Y4Imaq0NTIXPXY+N5L18GJ37SVWu+hwXff8l72Ds9XuwYIBaXPq6Shm4l+Vl/5QiOlV+uTk6YR9PxKsI9xNJny31ygK1e+nIRC1N97EGkFPI+jCpiHe5PCEy7oWqWSwRrpOvhFzcbTWMbm3ZJAOn1rUKpYIt/lDhW/5RHHteeWFN60qo98YJuoq1nK3uW5AabyspC1BcIEpOhft+SZAShYoLSvnmSfnYADUERP5jJn2h5XtsgCRuhYQqAvwTwn33+YWEKUI72HX5AtfSAZDe8F2DtPPm77afhl0EkthzuCQU0BWApgQIH9+KB0JhopMM7bJrdTRoleM2JAVNMyPF+wdoaz+XJpGoVAQ7WXUkcV7gT3oUZyi/ISIJAVKhgNp+4b4veCFhYVJw4locdSjZCp9cPUhLF9EZ3KKzURepMEtCDPP3VcWFx4UIiZIklIpFNfHpdEafIF2aRmOcrUmjohbT2WUllbmRvgfbythbQO3222fpDJoufaQPncYYuqoGtUEsCJZL6/3PR5b4syeSjZMQG/T2maGANlXT2v8S4AULWaUkCxfLyW8iW4kdka+nEMjxpL2NCwsYNBp+Q61PF43zyDg9Bm9+3NNySn78jMZUUkumqE4Gp7JmFOdP1vc8PpRrzj9+wPinCy8K1PiJ4aYbnTYpCCbDkBSbzhu2QJ1Gd82t8jI8TH51+OzvXoWbnXUOBkNW+0mWFwGcGOUVpU81/n3TOHb5oMt2FgYGjzau0Nif0Ss7Q3XB33hjjQHjHA5E5aOyIQc8CBrLdQSs3j92VG+3nNEjbkbdbBr9zm04ruvw37vh0QKOdeGIkckc80fX3KH/h7PT4BOjgCty8VZ5ux1MoO5Cf5naca2LAsEgehI+drX8o/0Nu+W0m6K/I9gGPd/dfx/EN/wN62AhsBWuAAAAAElFTkSuQmCC
">
<img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
</div>
> [!TIP]
> This model was contributed by [thomwolf](https://huggingface.co/thomwolf) and the TensorFlow/Flax version by [ydshieh](https://huggingface.co/ydshieh).
>
> Click on the Encoder Decoder models in the right sidebar for more examples of how to apply Encoder Decoder to different language tasks.
## Overview
The example below demonstrates how to generate text with [`Pipeline`], [`AutoModel`], and from the command line.
The [`EncoderDecoderModel`] can be used to initialize a sequence-to-sequence model with any
pretrained autoencoding model as the encoder and any pretrained autoregressive model as the decoder.
<hfoptions id="usage">
<hfoption id="Pipeline">
The effectiveness of initializing sequence-to-sequence models with pretrained checkpoints for sequence generation tasks
was shown in [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://huggingface.co/papers/1907.12461) by
Sascha Rothe, Shashi Narayan, Aliaksei Severyn.
After such an [`EncoderDecoderModel`] has been trained/fine-tuned, it can be saved/loaded just like
any other models (see the examples for more information).
An application of this architecture could be to leverage two pretrained [`BertModel`] as the encoder
and decoder for a summarization model as was shown in: [Text Summarization with Pretrained Encoders](https://huggingface.co/papers/1908.08345) by Yang Liu and Mirella Lapata.
## Randomly initializing `EncoderDecoderModel` from model configurations.
[`EncoderDecoderModel`] can be randomly initialized from an encoder and a decoder config. In the following example, we show how to do this using the default [`BertModel`] configuration for the encoder and the default [`BertForCausalLM`] configuration for the decoder.
```python
from transformers import pipeline
>>> from transformers import BertConfig, EncoderDecoderConfig, EncoderDecoderModel
summarizer = pipeline(
"summarization",
model="patrickvonplaten/bert2bert-cnn_dailymail-fp16",
device=0
)
>>> config_encoder = BertConfig()
>>> config_decoder = BertConfig()
text = "Plants create energy through a process known as photosynthesis. This involves capturing sunlight and converting carbon dioxide and water into glucose and oxygen."
print(summarizer(text))
>>> config = EncoderDecoderConfig.from_encoder_decoder_configs(config_encoder, config_decoder)
>>> model = EncoderDecoderModel(config=config)
```
</hfoption>
<hfoption id="AutoModel">
## Initialising `EncoderDecoderModel` from a pretrained encoder and a pretrained decoder.
[`EncoderDecoderModel`] can be initialized from a pretrained encoder checkpoint and a pretrained decoder checkpoint. Note that any pretrained auto-encoding model (e.g. BERT) can serve as the encoder, while the decoder can be a pretrained auto-encoding model (e.g. BERT), a pretrained causal language model (e.g. GPT-2), or the pretrained decoder part of a sequence-to-sequence model (e.g. the decoder of BART).
Depending on which architecture you choose as the decoder, the cross-attention layers might be randomly initialized.
Initializing [`EncoderDecoderModel`] from a pretrained encoder and decoder checkpoint requires the model to be fine-tuned on a downstream task, as has been shown in [the *Warm-starting-encoder-decoder blog post*](https://huggingface.co/blog/warm-starting-encoder-decoder).
To do so, the `EncoderDecoderModel` class provides a [`EncoderDecoderModel.from_encoder_decoder_pretrained`] method.
```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
>>> from transformers import EncoderDecoderModel, BertTokenizer
tokenizer = AutoTokenizer.from_pretrained("patrickvonplaten/bert2bert-cnn_dailymail-fp16")
model = AutoModelForCausalLM.from_pretrained("patrickvonplaten/bert2bert-cnn_dailymail-fp16", torch_dtype=torch.bfloat16, device_map="auto",attn_implementation="sdpa")
text = "Plants create energy through a process known as photosynthesis. This involves capturing sunlight and converting carbon dioxide and water into glucose and oxygen."
inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True).to(model.device)
summary = model.generate(**inputs, max_length=60, num_beams=4, early_stopping=True)
print(tokenizer.decode(summary[0], skip_special_tokens=True))
>>> tokenizer = BertTokenizer.from_pretrained("google-bert/bert-base-uncased")
>>> model = EncoderDecoderModel.from_encoder_decoder_pretrained("google-bert/bert-base-uncased", "google-bert/bert-base-uncased")
```
</hfoption>
<hfoption id="transformers CLI">
## Loading an existing `EncoderDecoderModel` checkpoint and performing inference.
```bash
echo -e "Plants create energy through a process known as photosynthesis. This involves capturing sunlight and converting carbon dioxide and water into glucose and oxygen." | transformers-cli run --task summarization --model "patrickvonplaten/bert2bert-cnn_dailymail-fp16" --device 0
```
To load fine-tuned checkpoints of the `EncoderDecoderModel` class, [`EncoderDecoderModel`] provides the `from_pretrained(...)` method just like any other model architecture in Transformers.
</hfoption>
</hfoptions>
## Notes
- [`EncoderDecoderModel`] can be initialized using any pretrained encoder and decoder. But depending on the decoder architecture, the cross-attention layers may be randomly initialized.
These models require downstream fine-tuning, as discussed in this [blog post](https://huggingface.co/blog/warm-starting-encoder-decoder). Use [`~EncoderDecoderModel.from_encoder_decoder_pretrained`] to combine encoder and decoder checkpoints.
To perform inference, use the [`generate`] method, which generates text autoregressively. This method supports various forms of decoding, such as greedy, beam search, and multinomial sampling.
```python
from transformers import EncoderDecoderModel, BertTokenizer
>>> from transformers import AutoTokenizer, EncoderDecoderModel
tokenizer = BertTokenizer.from_pretrained("google-bert/bert-base-uncased")
model = EncoderDecoderModel.from_encoder_decoder_pretrained(
"google-bert/bert-base-uncased",
"google-bert/bert-base-uncased"
)
>>> # load a fine-tuned seq2seq model and corresponding tokenizer
>>> model = EncoderDecoderModel.from_pretrained("patrickvonplaten/bert2bert_cnn_daily_mail")
>>> tokenizer = AutoTokenizer.from_pretrained("patrickvonplaten/bert2bert_cnn_daily_mail")
>>> # let's perform inference on a long piece of text
>>> ARTICLE_TO_SUMMARIZE = (
... "PG&E stated it scheduled the blackouts in response to forecasts for high winds "
... "amid dry conditions. The aim is to reduce the risk of wildfires. Nearly 800 thousand customers were "
... "scheduled to be affected by the shutoffs which were expected to last through at least midday tomorrow."
... )
>>> input_ids = tokenizer(ARTICLE_TO_SUMMARIZE, return_tensors="pt").input_ids
>>> # autoregressively generate summary (uses greedy decoding by default)
>>> generated_ids = model.generate(input_ids)
>>> generated_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
>>> print(generated_text)
nearly 800 thousand customers were affected by the shutoffs. the aim is to reduce the risk of wildfires. nearly 800, 000 customers were expected to be affected by high winds amid dry conditions. pg & e said it scheduled the blackouts to last through at least midday tomorrow.
```
- Encoder Decoder models can be fine-tuned like BART, T5 or any other encoder-decoder model. Only 2 inputs are required to compute a loss, `input_ids` and `labels`. Refer to this [notebook](https://colab.research.google.com/drive/1WIk2bxglElfZewOHboPFNj8H44_VAyKE?usp=sharing#scrollTo=ZwQIEhKOrJpl) for a more detailed training example.
## Loading a PyTorch checkpoint into `TFEncoderDecoderModel`.
[`TFEncoderDecoderModel.from_pretrained`] currently doesn't support initializing the model from a
pytorch checkpoint. Passing `from_pt=True` to this method will throw an exception. If there are only pytorch
checkpoints for a particular encoder-decoder model, a workaround is:
```python
>>> # a workaround to load from pytorch checkpoint
>>> from transformers import EncoderDecoderModel, TFEncoderDecoderModel
>>> _model = EncoderDecoderModel.from_pretrained("patrickvonplaten/bert2bert-cnn_dailymail-fp16")
>>> _model.encoder.save_pretrained("./encoder")
>>> _model.decoder.save_pretrained("./decoder")
>>> model = TFEncoderDecoderModel.from_encoder_decoder_pretrained(
... "./encoder", "./decoder", encoder_from_pt=True, decoder_from_pt=True
... )
>>> # This is only for copying some specific attributes of this particular model.
>>> model.config = _model.config
```
## Training
Once the model is created, it can be fine-tuned similar to BART, T5 or any other encoder-decoder model.
As you can see, only 2 inputs are required for the model in order to compute a loss: `input_ids` (which are the
`input_ids` of the encoded input sequence) and `labels` (which are the `input_ids` of the encoded
target sequence).
```python
>>> from transformers import BertTokenizer, EncoderDecoderModel
@@ -120,42 +147,11 @@ model = EncoderDecoderModel.from_encoder_decoder_pretrained(
>>> loss = model(input_ids=input_ids, labels=labels).loss
```
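Since the hunk above elides the middle of the example, here is a minimal self-contained sketch of the same loss computation, reusing the `google-bert/bert-base-uncased` pairing shown elsewhere on this page (the input strings are placeholders):
```python
from transformers import BertTokenizer, EncoderDecoderModel

tokenizer = BertTokenizer.from_pretrained("google-bert/bert-base-uncased")
model = EncoderDecoderModel.from_encoder_decoder_pretrained(
    "google-bert/bert-base-uncased", "google-bert/bert-base-uncased"
)
# the decoder needs explicit start/pad token ids before a loss can be computed
model.config.decoder_start_token_id = tokenizer.cls_token_id
model.config.pad_token_id = tokenizer.pad_token_id

# only input_ids and labels are required to compute the loss
input_ids = tokenizer("A long article to summarize.", return_tensors="pt").input_ids
labels = tokenizer("A short summary.", return_tensors="pt").input_ids
loss = model(input_ids=input_ids, labels=labels).loss
loss.backward()
```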
- [`EncoderDecoderModel`] can be randomly initialized from an encoder and a decoder config as shown below.
Detailed [colab](https://colab.research.google.com/drive/1WIk2bxglElfZewOHboPFNj8H44_VAyKE?usp=sharing#scrollTo=ZwQIEhKOrJpl) for training.
```python
>>> from transformers import BertConfig, EncoderDecoderConfig, EncoderDecoderModel
This model was contributed by [thomwolf](https://github.com/thomwolf). This model's TensorFlow and Flax versions
were contributed by [ydshieh](https://github.com/ydshieh).
>>> config_encoder = BertConfig()
>>> config_decoder = BertConfig()
>>> config = EncoderDecoderConfig.from_encoder_decoder_configs(config_encoder, config_decoder)
>>> model = EncoderDecoderModel(config=config)
```
- The Encoder Decoder Model can also be used for translation as shown below.
```python
from transformers import AutoTokenizer, EncoderDecoderModel
# Load a pre-trained translation model
model_name = "google/bert2bert_L-24_wmt_en_de"
tokenizer = AutoTokenizer.from_pretrained(model_name, pad_token="<pad>", eos_token="</s>", bos_token="<s>")
model = EncoderDecoderModel.from_pretrained(model_name)
# Input sentence to translate
input_text = "Plants create energy through a process known as"
# Encode the input text
inputs = tokenizer(input_text, return_tensors="pt", add_special_tokens=False).input_ids
# Generate the translated output
outputs = model.generate(inputs)[0]
# Decode the output tokens to get the translated sentence
translated_text = tokenizer.decode(outputs, skip_special_tokens=True)
print("Translated text:", translated_text)
```
## EncoderDecoderConfig

View File

@@ -57,7 +57,7 @@ from transformers import AutoModelForCausalLM, AutoTokenizer
model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2", torch_dtype=torch.float16, device_map="auto", attn_implementation="sdpa")
tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
input_ids = tokenizer("Hello, I'm a language model", return_tensors="pt").to("cuda")
input_ids = tokenzier("Hello, I'm a language model". return_tensors="pt").to("cuda")
output = model.generate(**input_ids, cache_implementation="static")
print(tokenizer.decode(output[0], skip_special_tokens=True))

View File

@@ -1,84 +0,0 @@
<!--Copyright 2025 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.
-->
<div class="flex flex-wrap space-x-1">
<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
</div>
# LFM2
## Overview
[LFM2](https://www.liquid.ai/blog/liquid-foundation-models-v2-our-second-series-of-generative-ai-models) represents a new generation of Liquid Foundation Models developed by [Liquid AI](https://liquid.ai/), specifically designed for edge AI and on-device deployment.
The models are available in three sizes (350M, 700M, and 1.2B parameters) and are engineered to run efficiently on CPU, GPU, and NPU hardware, making them particularly well-suited for applications requiring low latency, offline operation, and privacy.
## Architecture
The architecture consists of 16 blocks total: 10 double-gated short-range convolution blocks and 6 blocks of grouped query attention. This design stems from the concept of dynamical systems, where linear operations are modulated by input-dependent gates, allowing for "liquid" dynamics that can adapt in real-time. The short convolutions are particularly optimized for embedded SoC CPUs, making them ideal for devices that require fast, local inference without relying on cloud connectivity.
The key architectural innovation of LFM2 lies in its systematic approach to balancing quality, latency, and memory efficiency through Liquid AI's STAR neural architecture search engine. Using STAR, Liquid AI optimized the models for real-world performance on embedded hardware, measuring actual peak memory usage and inference speed on Qualcomm Snapdragon processors. This results in models that achieve 2x faster decode and prefill performance compared to similarly sized models, while maintaining superior benchmark performance across knowledge, mathematics, instruction following, and multilingual tasks.
## Example
The following example shows how to generate an answer using the `AutoModelForCausalLM` class.
```python
from transformers import AutoModelForCausalLM, AutoTokenizer
# Load model and tokenizer
model_id = "LiquidAI/LFM2-1.2B"
model = AutoModelForCausalLM.from_pretrained(
model_id,
device_map="auto",
torch_dtype="bfloat16",
)
tokenizer = AutoTokenizer.from_pretrained(model_id)
# Generate answer
prompt = "What is C. elegans?"
input_ids = tokenizer.apply_chat_template(
[{"role": "user", "content": prompt}],
add_generation_prompt=True,
return_tensors="pt",
tokenize=True,
)
output = model.generate(
input_ids,
do_sample=True,
temperature=0.3,
min_p=0.15,
repetition_penalty=1.05,
max_new_tokens=512,
)
print(tokenizer.decode(output[0], skip_special_tokens=False))
```
## Lfm2Config
[[autodoc]] Lfm2Config
## Lfm2Model
[[autodoc]] Lfm2Model
- forward
## Lfm2ForCausalLM
[[autodoc]] Lfm2ForCausalLM
- forward

View File

@@ -14,178 +14,287 @@ rendered properly in your Markdown viewer.
-->
<div style="float: right;">
<div class="flex flex-wrap space-x-1">
<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
<img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
<img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
</div>
</div>
# LLaVA-NeXT
[LLaVANeXT](https://llava-vl.github.io/blog/2024-05-10-llava-next-stronger-llms/) improves on [Llava](./llava) by increasing the input image resolution by 4x more pixels and supporting 3 aspect ratios (up to 672x672, 336x1344, 1344x336) to better grasp visual details. It is also trained on an improved visual instruction tuning dataset covering more scenarios and applications to improve OCR and common sense reasoning.
<div class="flex flex-wrap space-x-1">
<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
<img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
<img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
</div>
You can find all the original LLaVANeXT checkpoints under the [LLaVA-NeXT](https://huggingface.co/collections/llava-hf/llava-next-65f75c4afac77fd37dbbe6cf) collection.
## Overview
> [!TIP]
> This model was contributed by [nielsr](https://huggingface.co/nielsr).
>
> Click on the LLaVANeXT models in the right sidebar for more examples of how to apply Llava-NeXT to different multimodal tasks.
The LLaVA-NeXT model was proposed in [LLaVA-NeXT: Improved reasoning, OCR, and world knowledge](https://llava-vl.github.io/blog/2024-01-30-llava-next/) by Haotian Liu, Chunyuan Li, Yuheng Li, Bo Li, Yuanhan Zhang, Sheng Shen, Yong Jae Lee. LLaVa-NeXT (also called LLaVa-1.6) improves upon [LLaVa](llava) by increasing the input image resolution and training on an improved visual instruction tuning dataset to improve OCR and common sense reasoning.
The example below demonstrates how to generate text based on an image with [`Pipeline`] or the [`AutoModel`] class.
The introduction from the blog is the following:
<hfoptions id="usage">
*In October 2023, we released LLaVA-1.5 with a simple and efficient design along with great performance on a benchmark suite of 12 datasets. It has since served as the foundation of many comprehensive studies of data, model, and capabilities of large multimodal models (LMM), and has enabled various new applications.
<hfoption id="Pipeline">
Today, we are thrilled to present LLaVA-NeXT, with improved reasoning, OCR, and world knowledge. LLaVA-NeXT even exceeds Gemini Pro on several benchmarks.
Compared with LLaVA-1.5, LLaVA-NeXT has several improvements:
Increasing the input image resolution to 4x more pixels. This allows it to grasp more visual details. It supports three aspect ratios, up to 672x672, 336x1344, 1344x336 resolution.
Better visual reasoning and OCR capability with an improved visual instruction tuning data mixture.
Better visual conversation for more scenarios, covering different applications. Better world knowledge and logical reasoning.
Efficient deployment and inference with SGLang.
Along with performance improvements, LLaVA-NeXT maintains the minimalist design and data efficiency of LLaVA-1.5. It re-uses the pretrained connector of LLaVA-1.5, and still uses less than 1M visual instruction tuning samples. The largest 34B variant finishes training in ~1 day with 32 A100s.*
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/model_doc/llava_next_overview.png"
alt="drawing" width="600"/>
<small> LLaVa-NeXT incorporates a higher input resolution by encoding various patches of the input image. Taken from the <a href="https://huggingface.co/papers/2310.03744">original paper.</a> </small>
This model was contributed by [nielsr](https://huggingface.co/nielsr).
The original code can be found [here](https://github.com/haotian-liu/LLaVA/tree/main).
## Usage tips
- We advise users to use `padding_side="left"` when computing batched generation as it leads to more accurate results. Simply make sure to call `processor.tokenizer.padding_side = "left"` before generating.
<Tip warning={true}>
- Llava-Next uses a different number of patches for images and thus has to pad the inputs inside the modeling code, aside from the padding done when processing the inputs. The default setting is "left-padding" if the model is in `eval()` mode, otherwise "right-padding".
</Tip>
> [!NOTE]
> LLaVA models after release v4.46 will raise warnings about adding `processor.patch_size = {{patch_size}}`, `processor.num_additional_image_tokens = {{num_additional_image_tokens}}`, and `processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. It is strongly recommended to add the attributes to the processor if you own the model checkpoint, or open a PR if it is not owned by you.
Adding these attributes means that LLaVA will try to infer the number of image tokens required per image and expand the text with as many `<image>` placeholders as there will be tokens. Usually this is around 500 tokens per image, so make sure the text is not truncated, as otherwise there will be a failure when merging the embeddings.
The attributes can be obtained from the model config, e.g. `model.config.vision_config.patch_size` or `model.config.vision_feature_select_strategy`. The `num_additional_image_tokens` should be `1` if the vision backbone adds a CLS token or `0` if nothing extra is added to the vision patches.
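For reference, a short sketch of copying these values from the model config onto the processor (not from the original page; setting `num_additional_image_tokens = 1` is an assumption that the vision backbone adds a CLS token):
```python
from transformers import LlavaNextForConditionalGeneration, LlavaNextProcessor

model_id = "llava-hf/llava-v1.6-mistral-7b-hf"
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id)

# copy the values the processor needs from the model config
processor.patch_size = model.config.vision_config.patch_size
processor.vision_feature_select_strategy = model.config.vision_feature_select_strategy
processor.num_additional_image_tokens = 1  # assumed: 1 if the vision backbone adds a CLS token, else 0
```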
### Formatting Prompts with Chat Templates
Each **checkpoint** is trained with a specific prompt format, depending on the underlying large language model backbone. To ensure correct formatting, use the processor's `apply_chat_template` method.
**Important:**
- You must construct a conversation history — passing a plain string won't work.
- Each message should be a dictionary with `"role"` and `"content"` keys.
- The `"content"` should be a list of dictionaries for different modalities like `"text"` and `"image"`.
Here's an example of how to structure your input. We will use [llava-v1.6-mistral-7b-hf](https://huggingface.co/llava-hf/llava-v1.6-mistral-7b-hf) and a conversation history of text and image.
```python
import torch
from transformers import pipeline
from transformers import LlavaNextProcessor
pipeline = pipeline(
task="image-text-to-text",
model="llava-hf/llava-v1.6-mistral-7b-hf",
device=0,
torch_dtype=torch.bfloat16
)
messages = [
{
"role": "user",
"content": [
{
"type": "image",
"url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg",
},
{ "type": "text", "text": "Describe this image."},
]
}
]
pipeline(text=messages, max_new_tokens=20, return_full_text=False)
processor = LlavaNextProcessor.from_pretrained("llava-hf/llava-v1.6-mistral-7b-hf")
conversation = [
{
"role": "user",
"content": [
{"type": "image"},
{"type": "text", "text": "Whats shown in this image?"},
],
},
{
"role": "assistant",
"content": [{"type": "text", "text": "This image shows a red stop sign."},]
},
{
"role": "user",
"content": [
{"type": "text", "text": "Describe the image in more details."},
],
},
]
text_prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
# Note that the template simply formats your prompt, you still have to tokenize it and obtain pixel values for your images
print(text_prompt)
>>> "[INST] <image>\nWhat's shown in this image? [/INST] This image shows a red stop sign. [INST] Describe the image in more details. [/INST]"
```
</hfoption>
<hfoption id="AutoModel">
```python
import torch
import requests
from PIL import Image
from transformers import AutoProcessor, LlavaNextForConditionalGeneration
processor = AutoProcessor.from_pretrained("llava-hf/llava-v1.6-mistral-7b-hf")
model = LlavaNextForConditionalGeneration.from_pretrained("llava-hf/llava-v1.6-mistral-7b-hf", torch_dtype=torch.float16).to("cuda")
url = "https://github.com/haotian-liu/LLaVA/blob/1a91fc274d7c35a9b50b3cb29c4247ae5837ce39/images/llava_v1_5_radar.jpg?raw=true"
image = Image.open(requests.get(url, stream=True).raw)
conversation = [
{
"role": "user",
"content": [
{"type": "image"},
{"type": "text", "text": "What is shown in this image?"},
],
},
]
prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
inputs = processor(image, prompt, return_tensors="pt").to("cuda")
output = model.generate(**inputs, max_new_tokens=100)
print(processor.decode(output[0], skip_special_tokens=True))
- If you want to construct a chat prompt yourself, below is a list of possible formats.
[llava-v1.6-mistral-7b-hf](https://huggingface.co/llava-hf/llava-v1.6-mistral-7b-hf) requires the following format:
```bash
"[INST] <image>\nWhat is shown in this image? [/INST]"
```
</hfoption>
</hfoptions>
Quantization reduces the memory burden of large models by representing the weights in a lower precision. Refer to the [Quantization](../quantization/overview) overview for more available quantization backends.
The example below uses [bitsandbytes](../quantization/bitsandbytes) to only quantize the weights to int4.
```python
import torch
import requests
from PIL import Image
from transformers import AutoModelForImageTextToText, AutoProcessor, BitsAndBytesConfig
quant_config = BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_compute_dtype=torch.float16,
bnb_4bit_quant_type="nf4"
)
processor = AutoProcessor.from_pretrained("llava-hf/llava-v1.6-mistral-7b-hf")
model = AutoModelForImageTextToText.from_pretrained("llava-hf/llava-v1.6-mistral-7b-hf", quantization_config=quant_config, device_map="auto")
url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/model_doc/llava_next_ocr.png"
image = Image.open(requests.get(url, stream=True).raw)
conversation = [
{
"role": "user",
"content": [
{"type": "image"},
{"type": "text", "text": "What does this chart show?"},
],
},
]
prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
inputs = processor(image, prompt, return_tensors="pt").to("cuda")
with torch.inference_mode():
output = model.generate(**inputs, max_new_tokens=100)
print(processor.decode(output[0], skip_special_tokens=True))
[llava-v1.6-vicuna-7b-hf](https://huggingface.co/llava-hf/llava-v1.6-vicuna-7b-hf) and [llava-v1.6-vicuna-13b-hf](https://huggingface.co/llava-hf/llava-v1.6-vicuna-13b-hf) require the following format:
```bash
"A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions. USER: <image>\nWhat is shown in this image? ASSISTANT:"
```
## Notes
* Different checkpoints (Mistral, Vicuna, etc.) require a specific prompt format depending on the underlying LLM. Always use [`~ProcessorMixin.apply_chat_template`] to ensure correct formatting. Refer to the [Templates](../chat_templating) guide for more details.
* Set `padding_side="left"` during batched generation for more accurate results.
```py
processor.tokenizer.padding_side = "left"
[llava-v1.6-34b-hf](https://huggingface.co/llava-hf/llava-v1.6-34b-hf) requires the following format:
```bash
"<|im_start|>system\nAnswer the questions.<|im_end|><|im_start|>user\n<image>\nWhat is shown in this image?<|im_end|><|im_start|>assistant\n"
```
* LLaVA-NeXT uses different numbers of patches for images and pads the inputs inside the modeling code except when padding is done during processing. The default setting is *left-padding* if the model is in `eval()` mode, otherwise it is *right-padding*.
[llama3-llava-next-8b-hf](https://huggingface.co/llava-hf/llava-next-8b-hf) requires the following format:
* LLaVA models after v4.46 raise warnings about adding `processor.patch_size = {{patch_size}}`, `processor.num_additional_image_tokens = {{num_additional_image_tokens}}`, and `processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. It is strongly recommended to add these attributes to the processor if you own the model checkpoint, or to open a PR if you don't; a minimal sketch of setting them is shown at the end of these notes.
```bash
"<|start_header_id|>system<|end_header_id|>\n\nYou are a helpful language and vision assistant. You are able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language.<|eot_id|><|start_header_id|><|start_header_id|>user<|end_header_id|>\n\n<image>\nWhat is shown in this image?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"
```
Adding these attributes means LLaVA will try to infer the number of image tokens required per image and expand the text with the same number of `<image>` token placeholders. There are usually ~500 tokens per image, so make sure the text is not truncated because it will cause a failure when merging the embeddings. The attributes can be found in `model.config.vision_config.patch_size` or `model.config.vision_feature_select_strategy`.
[llava-next-72b-hf](https://huggingface.co/llava-hf/llava-next-72b-hf) and [llava-next-110b-hf](https://huggingface.co/llava-hf/llava-next-110b-hf) require the following format:
The `num_additional_image_tokens` should be `1` if the vision backbone adds a `CLS` token or `0` if nothing extra is added.
```bash
"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<image>\nWhat is shown in this image?<|im_end|>\n<|im_start|>assistant\n"
```
* The example below demonstrates inference with multiple input images.
🚀 **Bonus:** If you're using `transformers>=4.49.0`, you can also get a vectorized output from `apply_chat_template`. See the sketch at the end of these notes and the usage examples below for more details on how to use it.
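A minimal sketch of setting the processor attributes described in the notes above; the patch size and feature selection strategy are read from the checkpoint's config, and `num_additional_image_tokens = 1` is only an assumption for a CLIP-style backbone that adds a `CLS` token:
```python
from transformers import AutoConfig, AutoProcessor

checkpoint = "llava-hf/llava-v1.6-mistral-7b-hf"
config = AutoConfig.from_pretrained(checkpoint)
processor = AutoProcessor.from_pretrained(checkpoint)

# attributes the processor needs to expand the <image> placeholders itself
processor.patch_size = config.vision_config.patch_size
processor.num_additional_image_tokens = 1  # 1 if the vision backbone adds a CLS token, 0 otherwise
processor.vision_feature_select_strategy = config.vision_feature_select_strategy
```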
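The vectorized output mentioned in the bonus note can be sketched as follows (assuming `transformers>=4.49.0`; the image URL is the OCR example used elsewhere on this page). With `tokenize=True` and `return_dict=True`, `apply_chat_template` returns ready-to-use tensors instead of a prompt string:
```python
import torch
from transformers import AutoProcessor, LlavaNextForConditionalGeneration

processor = AutoProcessor.from_pretrained("llava-hf/llava-v1.6-mistral-7b-hf")
model = LlavaNextForConditionalGeneration.from_pretrained(
    "llava-hf/llava-v1.6-mistral-7b-hf", torch_dtype=torch.float16, device_map="auto"
)

conversation = [
    {
        "role": "user",
        "content": [
            {"type": "image", "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/model_doc/llava_next_ocr.png"},
            {"type": "text", "text": "What does this chart show?"},
        ],
    },
]

# returns input_ids, attention_mask and pixel_values directly
inputs = processor.apply_chat_template(
    conversation, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt"
).to(model.device, torch.float16)

output = model.generate(**inputs, max_new_tokens=100)
print(processor.decode(output[0], skip_special_tokens=True))
```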
## Usage example
### Single image inference
Here's how to load the model and perform inference in half-precision (`torch.float16`):
```python
from transformers import LlavaNextProcessor, LlavaNextForConditionalGeneration
import torch
from PIL import Image
import requests

processor = LlavaNextProcessor.from_pretrained("llava-hf/llava-v1.6-mistral-7b-hf")
model = LlavaNextForConditionalGeneration.from_pretrained(
    "llava-hf/llava-v1.6-mistral-7b-hf", torch_dtype=torch.float16
).to("cuda")

# prepare image and text prompt, using the appropriate prompt template
url = "https://github.com/haotian-liu/LLaVA/blob/1a91fc274d7c35a9b50b3cb29c4247ae5837ce39/images/llava_v1_5_radar.jpg?raw=true"
image = Image.open(requests.get(url, stream=True).raw)
conversation = [
    {
        "role": "user",
        "content": [
            {"type": "image"},
            {"type": "text", "text": "What is shown in this image?"},
        ],
    },
]
prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
inputs = processor(image, prompt, return_tensors="pt").to("cuda")

# autoregressively complete prompt
output = model.generate(**inputs, max_new_tokens=100)
print(processor.decode(output[0], skip_special_tokens=True))

# Variant: two images in a single prompt (see the multi image inference section below for batched prompts)
url1 = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/model_doc/llava_next_ocr.png"
url2 = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/model_doc/llava_next_comparison.png"
image1 = Image.open(requests.get(url1, stream=True).raw)
image2 = Image.open(requests.get(url2, stream=True).raw)
conversation = [
    {"role": "user", "content": [{"type": "image"}, {"type": "image"}, {"type": "text", "text": "Compare these two images and describe the differences."}]},
]
prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
inputs = processor([image1, image2], prompt, return_tensors="pt").to("cuda")
output = model.generate(**inputs, max_new_tokens=100)
print(processor.decode(output[0], skip_special_tokens=True))
```
### Multi image inference
LLaVA-NeXT can perform inference with multiple images as input, where the images either belong to the same prompt or to different prompts (in batched inference). Here is how you can do it:
```python
import requests
from PIL import Image
import torch
from transformers import AutoProcessor, AutoModelForImageTextToText
# Load the model in half-precision
model = AutoModelForImageTextToText.from_pretrained("llava-hf/llava-v1.6-mistral-7b-hf", torch_dtype=torch.float16, device_map="auto")
processor = AutoProcessor.from_pretrained("llava-hf/llava-v1.6-mistral-7b-hf")
# Get three different images
url = "https://www.ilankelman.org/stopsigns/australia.jpg"
image_stop = Image.open(requests.get(url, stream=True).raw)
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image_cats = Image.open(requests.get(url, stream=True).raw)
url = "https://huggingface.co/microsoft/kosmos-2-patch14-224/resolve/main/snowman.jpg"
image_snowman = Image.open(requests.get(url, stream=True).raw)
# Prepare a batch of two prompts, where the first one is a multi-turn conversation and the second is not
conversation_1 = [
{
"role": "user",
"content": [
{"type": "image"},
{"type": "text", "text": "What is shown in this image?"},
],
},
{
"role": "assistant",
"content": [
{"type": "text", "text": "There is a red stop sign in the image."},
],
},
{
"role": "user",
"content": [
{"type": "image"},
{"type": "text", "text": "What about this image? How many cats do you see?"},
],
},
]
conversation_2 = [
{
"role": "user",
"content": [
{"type": "image"},
{"type": "text", "text": "What is shown in this image?"},
],
},
]
prompt_1 = processor.apply_chat_template(conversation_1, add_generation_prompt=True)
prompt_2 = processor.apply_chat_template(conversation_2, add_generation_prompt=True)
prompts = [prompt_1, prompt_2]
# We can simply feed images in the order they have to be used in the text prompt
# Each "<image>" token uses one image leaving the next for the subsequent "<image>" tokens
inputs = processor(images=[image_stop, image_cats, image_snowman], text=prompts, padding=True, return_tensors="pt").to(model.device)
# Generate
generate_ids = model.generate(**inputs, max_new_tokens=30)
processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)
```
## Model optimization
### Quantization using Bitsandbytes
The model can be loaded in 8 or 4 bits, greatly reducing the memory requirements while maintaining the performance of the original model. First make sure to install bitsandbytes, `pip install bitsandbytes`, and to have access to a GPU/accelerator that is supported by the library.
<Tip>
bitsandbytes is being refactored to support multiple backends beyond CUDA. Currently, ROCm (AMD GPU) and Intel CPU implementations are mature, with Intel XPU in progress and Apple Silicon support expected by Q4/Q1. For installation instructions and the latest backend updates, visit [this link](https://huggingface.co/docs/bitsandbytes/main/en/installation#multi-backend).
We value your feedback to help identify bugs before the full release! Check out [these docs](https://huggingface.co/docs/bitsandbytes/main/en/non_cuda_backends) for more details and feedback links.
</Tip>
Simply change the snippet above to:
```python
import torch
from transformers import AutoModelForImageTextToText, BitsAndBytesConfig
# specify how to quantize the model
quantization_config = BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_quant_type="nf4",
bnb_4bit_compute_dtype=torch.float16,
)
model = AutoModelForImageTextToText.from_pretrained("llava-hf/llava-v1.6-mistral-7b-hf", quantization_config=quantization_config, device_map="auto")
```
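The section above mentions 8-bit loading as well; a minimal 8-bit variant of the same snippet (illustrative, same checkpoint) only changes the quantization config:
```python
from transformers import AutoModelForImageTextToText, BitsAndBytesConfig

quantization_config = BitsAndBytesConfig(load_in_8bit=True)

model = AutoModelForImageTextToText.from_pretrained(
    "llava-hf/llava-v1.6-mistral-7b-hf",
    quantization_config=quantization_config,
    device_map="auto",
)
```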
### Use Flash-Attention 2 to further speed-up generation
First make sure to install flash-attn. Refer to the [original repository of Flash Attention](https://github.com/Dao-AILab/flash-attention) for installation instructions. Then change the snippet above to:
```python
import torch
from transformers import AutoModelForImageTextToText

model_id = "llava-hf/llava-v1.6-mistral-7b-hf"
model = AutoModelForImageTextToText.from_pretrained(
    model_id,
    torch_dtype=torch.float16,
    attn_implementation="flash_attention_2"
).to(0)
```
## LlavaNextConfig

View File

@ -28,7 +28,6 @@ You can find all the original Mamba checkpoints under the [State Space Models](h
> [!TIP]
> This model was contributed by [Molbap](https://huggingface.co/Molbap) and [AntonV](https://huggingface.co/AntonV).
> Click on the Mamba models in the right sidebar for more examples of how to apply Mamba to different language tasks.
The example below demonstrates how to generate text with [`Pipeline`], [`AutoModel`], and from the command line.

View File

@ -26,7 +26,6 @@ rendered properly in your Markdown viewer.
You can find all the original Mamba 2 checkpoints under the [State Space Models](https://huggingface.co/state-spaces) organization, but the examples shown below use [mistralai/Mamba-Codestral-7B-v0.1](https://huggingface.co/mistralai/Mamba-Codestral-7B-v0.1) because a Hugging Face implementation isn't supported yet for the original checkpoints.
> [!TIP]
> This model was contributed by [ArthurZ](https://huggingface.co/ArthurZ).
> Click on the Mamba models in the right sidebar for more examples of how to apply Mamba to different language tasks.
The example below demonstrates how to generate text with [`Pipeline`], [`AutoModel`], and from the command line.

View File

@ -14,139 +14,160 @@ rendered properly in your Markdown viewer.
-->
<div style="float: right;">
<div class="flex flex-wrap space-x-1">
<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
<img alt="TensorFlow" src="https://img.shields.io/badge/TensorFlow-FF6F00?style=flat&logo=tensorflow&logoColor=white">
<img alt="Flax" src="https://img.shields.io/badge/Flax-29a79b.svg?style=flat&logo=data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAC0AAAAtCAMAAAANxBKoAAAC7lBMVEUAAADg5vYHPVgAoJH+/v76+v39/f9JbLP///9+AIgAnY3///+mcqzt8fXy9fgkXa3Ax9709fr+///9/f8qXq49qp5AaLGMwrv8/P0eW60VWawxYq8yqJzG2dytt9Wyu9elzci519Lf3O3S2efY3OrY0+Xp7PT///////+dqNCexMc6Z7AGpJeGvbenstPZ5ejQ1OfJzOLa7ejh4+/r8fT29vpccbklWK8PVa0AS6ghW63O498vYa+lsdKz1NDRt9Kw1c672tbD3tnAxt7R6OHp5vDe7OrDyuDn6vLl6/EAQKak0MgATakkppo3ZK/Bz9y8w9yzu9jey97axdvHzeG21NHH4trTwthKZrVGZLSUSpuPQJiGAI+GAI8SWKydycLL4d7f2OTi1+S9xNzL0ePT6OLGzeEAo5U0qJw/aLEAo5JFa7JBabEAp5Y4qZ2QxLyKmsm3kL2xoMOehrRNb7RIbbOZgrGre68AUqwAqZqNN5aKJ5N/lMq+qsd8kMa4pcWzh7muhLMEV69juq2kbKqgUaOTR5uMMZWLLZSGAI5VAIdEAH+ovNDHuNCnxcy3qcaYx8K8msGplrx+wLahjbYdXrV6vbMvYK9DrZ8QrZ8tqJuFms+Sos6sw8ecy8RffsNVeMCvmb43aLltv7Q4Y7EZWK4QWa1gt6meZKUdr6GOAZVeA4xPAISyveLUwtivxtKTpNJ2jcqfvcltiMiwwcfAoMVxhL+Kx7xjdrqTe60tsaNQs6KaRKACrJ6UTZwkqpqTL5pkHY4AloSgsd2ptNXPvNOOncuxxsqFl8lmg8apt8FJcr9EbryGxLqlkrkrY7dRa7ZGZLQ5t6iXUZ6PPpgVpZeJCJFKAIGareTa0+KJod3H0deY2M+esM25usmYu8d2zsJOdcBVvrCLbqcAOaaHaKQAMaScWqKBXqCXMJ2RHpiLF5NmJZAdAHN2kta11dKu1M+DkcZLdb+Mcql3TppyRJdzQ5ZtNZNlIY+DF4+voCOQAAAAZ3RSTlMABAT+MEEJ/RH+/TP+Zlv+pUo6Ifz8+fco/fz6+evr39S9nJmOilQaF/7+/f38+smmoYp6b1T+/v7++vj189zU0tDJxsGzsrKSfv34+Pf27dDOysG9t6+n/vv6+vr59uzr1tG+tZ6Qg9Ym3QAABR5JREFUSMeNlVVUG1EQhpcuxEspXqS0SKEtxQp1d3d332STTRpIQhIISQgJhODu7lAoDoUCpe7u7u7+1puGpqnCPOyZvffbOXPm/PsP9JfQgyCC+tmTABTOcbxDz/heENS7/1F+9nhvkHePG0wNDLbGWwdXL+rbLWvpmZHXD8+gMfBjTh+aSe6Gnn7lwQIOTR0c8wfX3PWgv7avbdKwf/ZoBp1Gp/PvuvXW3vw5ib7emnTW4OR+3D4jB9vjNJ/7gNvfWWeH/TO/JyYrsiKCRjVEZA3UB+96kON+DxOQ/NLE8PE5iUYgIXjFnCOlxEQMaSGVxjg4gxOnEycGz8bptuNjVx08LscIgrzH3umcn+KKtiBIyvzOO2O99aAdR8cF19oZalnCtvREUw79tCd5sow1g1UKM6kXqUx4T8wsi3sTjJ3yzDmmhenLXLpo8u45eG5y4Vvbk6kkC4LLtJMowkSQxmk4ggVJEG+7c6QpHT8vvW9X7/o7+3ELmiJi2mEzZJiz8cT6TBlanBk70cB5GGIGC1gRDdZ00yADLW1FL6gqhtvNXNG5S9gdSrk4M1qu7JAsmYshzDS4peoMrU/gT7qQdqYGZaYhxZmVbGJAm/CS/HloWyhRUlknQ9KYcExTwS80d3VNOxUZJpITYyspl0LbhArhpZCD9cRWEQuhYkNGMHToQ/2Cs6swJlb39CsllxdXX6IUKh/H5jbnSsPKjgmoaFQ1f8wRLR0UnGE/RcDEjj2jXG1WVTwUs8+zxfcrVO+vSsuOpVKxCfYZiQ0/aPKuxQbQ8lIz+DClxC8u+snlcJ7Yr1z1JPqUH0V+GDXbOwAib931Y4Imaq0NTIXPXY+N5L18GJ37SVWu+hwXff8l72Ds9XuwYIBaXPq6Shm4l+Vl/5QiOlV+uTk6YR9PxKsI9xNJny31ygK1e+nIRC1N97EGkFPI+jCpiHe5PCEy7oWqWSwRrpOvhFzcbTWMbm3ZJAOn1rUKpYIt/lDhW/5RHHteeWFN60qo98YJuoq1nK3uW5AabyspC1BcIEpOhft+SZAShYoLSvnmSfnYADUERP5jJn2h5XtsgCRuhYQqAvwTwn33+YWEKUI72HX5AtfSAZDe8F2DtPPm77afhl0EkthzuCQU0BWApgQIH9+KB0JhopMM7bJrdTRoleM2JAVNMyPF+wdoaz+XJpGoVAQ7WXUkcV7gT3oUZyi/ISIJAVKhgNp+4b4veCFhYVJw4locdSjZCp9cPUhLF9EZ3KKzURepMEtCDPP3VcWFx4UIiZIklIpFNfHpdEafIF2aRmOcrUmjohbT2WUllbmRvgfbythbQO3222fpDJoufaQPncYYuqoGtUEsCJZL6/3PR5b4syeSjZMQG/T2maGANlXT2v8S4AULWaUkCxfLyW8iW4kdka+nEMjxpL2NCwsYNBp+Q61PF43zyDg9Bm9+3NNySn78jMZUUkumqE4Gp7JmFOdP1vc8PpRrzj9+wPinCy8K1PiJ4aYbnTYpCCbDkBSbzhu2QJ1Gd82t8jI8TH51+OzvXoWbnXUOBkNW+0mWFwGcGOUVpU81/n3TOHb5oMt2FgYGjzau0Nif0Ss7Q3XB33hjjQHjHA5E5aOyIQc8CBrLdQSs3j92VG+3nNEjbkbdbBr9zm04ruvw37vh0QKOdeGIkckc80fX3KH/h7PT4BOjgCty8VZ5ux1MoO5Cf5naca2LAsEgehI+drX8o/0Nu+W0m6K/I9gGPd/dfx/EN/wN62AhsBWuAAAAAElFTkSuQmCC
">
<img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
<img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
</div>
</div>
# MarianMT
[MarianMT](https://huggingface.co/papers/1804.00344) is a machine translation model trained with the Marian framework which is written in pure C++. The framework includes its own custom auto-differentiation engine and efficient meta-algorithms to train encoder-decoder models like BART.
All MarianMT models are transformer encoder-decoders with 6 layers in each component, use static sinusoidal positional embeddings, don't have a layernorm embedding, and the model starts generating with the prefix `pad_token_id` instead of `</s>`.
You can find all the original MarianMT checkpoints under the [Language Technology Research Group at the University of Helsinki](https://huggingface.co/Helsinki-NLP/models?search=opus-mt) organization.
> [!TIP]
> This model was contributed by [sshleifer](https://huggingface.co/sshleifer).
>
> Click on the MarianMT models in the right sidebar for more examples of how to apply MarianMT to translation tasks.
The example below demonstrates how to translate text using [`Pipeline`] or the [`AutoModel`] class.
<hfoptions id="usage">
<hfoption id="Pipeline">
```python
import torch
from transformers import pipeline
pipeline = pipeline("translation_en_to_de", model="Helsinki-NLP/opus-mt-en-de", torch_dtype=torch.float16, device=0)
pipeline("Hello, how are you?")
```
</hfoption>
<hfoption id="AutoModel">
```python
import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-de")
model = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-en-de", torch_dtype=torch.float16, attn_implementation="sdpa", device_map="auto")
inputs = tokenizer("Hello, how are you?", return_tensors="pt").to("cuda")
outputs = model.generate(**inputs, cache_implementation="static")
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```
</hfoption>
</hfoptions>
Use the [AttentionMaskVisualizer](https://github.com/huggingface/transformers/blob/beb9b5b02246b9b7ee81ddf938f93f44cfeaad19/src/transformers/utils/attention_visualizer.py#L139) to better understand what tokens the model can and cannot attend to.
```python
from transformers.utils.attention_visualizer import AttentionMaskVisualizer
visualizer = AttentionMaskVisualizer("Helsinki-NLP/opus-mt-en-de")
visualizer("Hello, how are you?")
```
<div class="flex justify-center">
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/model_doc/marianmt-attn-mask.png"/>
<div class="flex flex-wrap space-x-1">
<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
<img alt="TensorFlow" src="https://img.shields.io/badge/TensorFlow-FF6F00?style=flat&logo=tensorflow&logoColor=white">
<img alt="Flax" src="https://img.shields.io/badge/Flax-29a79b.svg?style=flat&logo=data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAC0AAAAtCAMAAAANxBKoAAAC7lBMVEUAAADg5vYHPVgAoJH+/v76+v39/f9JbLP///9+AIgAnY3///+mcqzt8fXy9fgkXa3Ax9709fr+///9/f8qXq49qp5AaLGMwrv8/P0eW60VWawxYq8yqJzG2dytt9Wyu9elzci519Lf3O3S2efY3OrY0+Xp7PT///////+dqNCexMc6Z7AGpJeGvbenstPZ5ejQ1OfJzOLa7ejh4+/r8fT29vpccbklWK8PVa0AS6ghW63O498vYa+lsdKz1NDRt9Kw1c672tbD3tnAxt7R6OHp5vDe7OrDyuDn6vLl6/EAQKak0MgATakkppo3ZK/Bz9y8w9yzu9jey97axdvHzeG21NHH4trTwthKZrVGZLSUSpuPQJiGAI+GAI8SWKydycLL4d7f2OTi1+S9xNzL0ePT6OLGzeEAo5U0qJw/aLEAo5JFa7JBabEAp5Y4qZ2QxLyKmsm3kL2xoMOehrRNb7RIbbOZgrGre68AUqwAqZqNN5aKJ5N/lMq+qsd8kMa4pcWzh7muhLMEV69juq2kbKqgUaOTR5uMMZWLLZSGAI5VAIdEAH+ovNDHuNCnxcy3qcaYx8K8msGplrx+wLahjbYdXrV6vbMvYK9DrZ8QrZ8tqJuFms+Sos6sw8ecy8RffsNVeMCvmb43aLltv7Q4Y7EZWK4QWa1gt6meZKUdr6GOAZVeA4xPAISyveLUwtivxtKTpNJ2jcqfvcltiMiwwcfAoMVxhL+Kx7xjdrqTe60tsaNQs6KaRKACrJ6UTZwkqpqTL5pkHY4AloSgsd2ptNXPvNOOncuxxsqFl8lmg8apt8FJcr9EbryGxLqlkrkrY7dRa7ZGZLQ5t6iXUZ6PPpgVpZeJCJFKAIGareTa0+KJod3H0deY2M+esM25usmYu8d2zsJOdcBVvrCLbqcAOaaHaKQAMaScWqKBXqCXMJ2RHpiLF5NmJZAdAHN2kta11dKu1M+DkcZLdb+Mcql3TppyRJdzQ5ZtNZNlIY+DF4+voCOQAAAAZ3RSTlMABAT+MEEJ/RH+/TP+Zlv+pUo6Ifz8+fco/fz6+evr39S9nJmOilQaF/7+/f38+smmoYp6b1T+/v7++vj189zU0tDJxsGzsrKSfv34+Pf27dDOysG9t6+n/vv6+vr59uzr1tG+tZ6Qg9Ym3QAABR5JREFUSMeNlVVUG1EQhpcuxEspXqS0SKEtxQp1d3d332STTRpIQhIISQgJhODu7lAoDoUCpe7u7u7+1puGpqnCPOyZvffbOXPm/PsP9JfQgyCC+tmTABTOcbxDz/heENS7/1F+9nhvkHePG0wNDLbGWwdXL+rbLWvpmZHXD8+gMfBjTh+aSe6Gnn7lwQIOTR0c8wfX3PWgv7avbdKwf/ZoBp1Gp/PvuvXW3vw5ib7emnTW4OR+3D4jB9vjNJ/7gNvfWWeH/TO/JyYrsiKCRjVEZA3UB+96kON+DxOQ/NLE8PE5iUYgIXjFnCOlxEQMaSGVxjg4gxOnEycGz8bptuNjVx08LscIgrzH3umcn+KKtiBIyvzOO2O99aAdR8cF19oZalnCtvREUw79tCd5sow1g1UKM6kXqUx4T8wsi3sTjJ3yzDmmhenLXLpo8u45eG5y4Vvbk6kkC4LLtJMowkSQxmk4ggVJEG+7c6QpHT8vvW9X7/o7+3ELmiJi2mEzZJiz8cT6TBlanBk70cB5GGIGC1gRDdZ00yADLW1FL6gqhtvNXNG5S9gdSrk4M1qu7JAsmYshzDS4peoMrU/gT7qQdqYGZaYhxZmVbGJAm/CS/HloWyhRUlknQ9KYcExTwS80d3VNOxUZJpITYyspl0LbhArhpZCD9cRWEQuhYkNGMHToQ/2Cs6swJlb39CsllxdXX6IUKh/H5jbnSsPKjgmoaFQ1f8wRLR0UnGE/RcDEjj2jXG1WVTwUs8+zxfcrVO+vSsuOpVKxCfYZiQ0/aPKuxQbQ8lIz+DClxC8u+snlcJ7Yr1z1JPqUH0V+GDXbOwAib931Y4Imaq0NTIXPXY+N5L18GJ37SVWu+hwXff8l72Ds9XuwYIBaXPq6Shm4l+Vl/5QiOlV+uTk6YR9PxKsI9xNJny31ygK1e+nIRC1N97EGkFPI+jCpiHe5PCEy7oWqWSwRrpOvhFzcbTWMbm3ZJAOn1rUKpYIt/lDhW/5RHHteeWFN60qo98YJuoq1nK3uW5AabyspC1BcIEpOhft+SZAShYoLSvnmSfnYADUERP5jJn2h5XtsgCRuhYQqAvwTwn33+YWEKUI72HX5AtfSAZDe8F2DtPPm77afhl0EkthzuCQU0BWApgQIH9+KB0JhopMM7bJrdTRoleM2JAVNMyPF+wdoaz+XJpGoVAQ7WXUkcV7gT3oUZyi/ISIJAVKhgNp+4b4veCFhYVJw4locdSjZCp9cPUhLF9EZ3KKzURepMEtCDPP3VcWFx4UIiZIklIpFNfHpdEafIF2aRmOcrUmjohbT2WUllbmRvgfbythbQO3222fpDJoufaQPncYYuqoGtUEsCJZL6/3PR5b4syeSjZMQG/T2maGANlXT2v8S4AULWaUkCxfLyW8iW4kdka+nEMjxpL2NCwsYNBp+Q61PF43zyDg9Bm9+3NNySn78jMZUUkumqE4Gp7JmFOdP1vc8PpRrzj9+wPinCy8K1PiJ4aYbnTYpCCbDkBSbzhu2QJ1Gd82t8jI8TH51+OzvXoWbnXUOBkNW+0mWFwGcGOUVpU81/n3TOHb5oMt2FgYGjzau0Nif0Ss7Q3XB33hjjQHjHA5E5aOyIQc8CBrLdQSs3j92VG+3nNEjbkbdbBr9zm04ruvw37vh0QKOdeGIkckc80fX3KH/h7PT4BOjgCty8VZ5ux1MoO5Cf5naca2LAsEgehI+drX8o/0Nu+W0m6K/I9gGPd/dfx/EN/wN62AhsBWuAAAAAElFTkSuQmCC
">
<img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
<img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
</div>
## Notes
## Overview
- MarianMT models are ~298MB on disk and there are more than 1000 models. Check this [list](https://huggingface.co/Helsinki-NLP) for supported language pairs. The language codes may be inconsistent. Two digit codes can be found [here](https://developers.google.com/admin-sdk/directory/v1/languages) while three digit codes may require further searching.
- Models that require BPE preprocessing are not supported.
- All model names use the following format: `Helsinki-NLP/opus-mt-{src}-{tgt}`. Language codes formatted like `es_AR` usually refer to the `code_{region}`. For example, `es_AR` refers to Spanish from Argentina.
- If a model can output multiple languages, prepend the desired output language to `src_txt` as shown below. New multilingual models from the [Tatoeba-Challenge](https://github.com/Helsinki-NLP/Tatoeba-Challenge) require 3 character language codes.
A framework for translation models, using the same models as BART. Translations should be similar, but not identical to output in the test set linked to in each model card.
This model was contributed by [sshleifer](https://huggingface.co/sshleifer).
## Implementation Notes
- Each model is about 298 MB on disk, there are more than 1,000 models.
- The list of supported language pairs can be found [here](https://huggingface.co/Helsinki-NLP).
- Models were originally trained by [Jörg Tiedemann](https://researchportal.helsinki.fi/en/persons/j%C3%B6rg-tiedemann) using the [Marian](https://marian-nmt.github.io/) C++ library, which supports fast training and translation.
- All models are transformer encoder-decoders with 6 layers in each component. Each model's performance is documented
in a model card.
- The 80 opus models that require BPE preprocessing are not supported.
- The modeling code is the same as [`BartForConditionalGeneration`] with a few minor modifications:
- static (sinusoid) positional embeddings (`MarianConfig.static_position_embeddings=True`)
- no layernorm_embedding (`MarianConfig.normalize_embedding=False`)
  - the model starts generating with `pad_token_id` (which has 0 as a token_embedding) as the prefix (Bart uses `</s>`); these settings can be checked on a loaded config, as sketched after this list,
- Code to bulk convert models can be found in `convert_marian_to_pytorch.py`.
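A quick way to check these notes against a concrete checkpoint is to inspect its config; this is a minimal sketch using `Helsinki-NLP/opus-mt-en-de` (any other `opus-mt` checkpoint works the same way):
```python
from transformers import MarianConfig

config = MarianConfig.from_pretrained("Helsinki-NLP/opus-mt-en-de")

# 6 encoder and 6 decoder layers, as described above
print(config.encoder_layers, config.decoder_layers)

# generation starts with pad_token_id as the prefix
print(config.decoder_start_token_id == config.pad_token_id)
```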
## Naming
- All model names use the following format: `Helsinki-NLP/opus-mt-{src}-{tgt}` (a short loading sketch is shown after this list)
- The language codes used to name models are inconsistent. Two digit codes can usually be found [here](https://developers.google.com/admin-sdk/directory/v1/languages), three digit codes require googling "language
code {code}".
- Codes formatted like `es_AR` are usually `code_{region}`. That one is Spanish from Argentina.
- The models were converted in two stages. The first 1000 models use ISO-639-2 codes to identify languages, the second
group use a combination of ISO-639-5 codes and ISO-639-2 codes.
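As a small illustration of the naming scheme, a checkpoint id can be built from the source and target codes and loaded directly (assuming the pair exists on the Hub, as `en`-`de` does):
```python
from transformers import MarianMTModel, MarianTokenizer

src, tgt = "en", "de"
model_name = f"Helsinki-NLP/opus-mt-{src}-{tgt}"

tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)
```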
## Examples
- Since Marian models are smaller than many other translation models available in the library, they can be useful for
fine-tuning experiments and integration tests.
- [Fine-tune on GPU](https://github.com/huggingface/transformers/blob/master/examples/legacy/seq2seq/train_distil_marian_enro.sh)
## Multilingual Models
- All model names use the following format: `Helsinki-NLP/opus-mt-{src}-{tgt}`:
- If a model can output multiple languages, you should specify a language code by prepending the desired output language to the `src_text`.
- You can see a model's supported language codes in its model card, under target constituents, like in [opus-mt-en-roa](https://huggingface.co/Helsinki-NLP/opus-mt-en-roa).
- Note that if a model is only multilingual on the source side, like `Helsinki-NLP/opus-mt-roa-en`, no language
codes are required.
New multi-lingual models from the [Tatoeba-Challenge repo](https://github.com/Helsinki-NLP/Tatoeba-Challenge)
require 3 character language codes:
```python
>>> from transformers import MarianMTModel, MarianTokenizer

>>> src_text = [
...     ">>fra<< this is a sentence in english that we want to translate to french",
...     ">>por<< This should go to portuguese",
...     ">>esp<< And this to Spanish",
... ]

>>> model_name = "Helsinki-NLP/opus-mt-en-roa"
>>> tokenizer = MarianTokenizer.from_pretrained(model_name)
>>> print(tokenizer.supported_language_codes)
['>>zlm_Latn<<', '>>mfe<<', '>>hat<<', '>>pap<<', '>>ast<<', '>>cat<<', '>>ind<<', '>>glg<<', '>>wln<<', '>>spa<<', '>>fra<<', '>>ron<<', '>>por<<', '>>ita<<', '>>oci<<', '>>arg<<', '>>min<<']

>>> model = MarianMTModel.from_pretrained(model_name)
>>> translated = model.generate(**tokenizer(src_text, return_tensors="pt", padding=True))
>>> [tokenizer.decode(t, skip_special_tokens=True) for t in translated]
["c'est une phrase en anglais que nous voulons traduire en français",
 'Isto deve ir para o português.',
 'Y esto al español']

>>> # Tatoeba-Challenge models work the same way: prepend the 3-letter target code
>>> model_name = "Helsinki-NLP/opus-mt-mul-mul"  # Tatoeba Challenge model
>>> tokenizer = MarianTokenizer.from_pretrained(model_name)
>>> model = MarianMTModel.from_pretrained(model_name)
>>> src_texts = [">>arb<< Hello, how are you today?"]
>>> inputs = tokenizer(src_texts, return_tensors="pt", padding=True, truncation=True)
>>> translated = model.generate(**inputs)
>>> print(tokenizer.batch_decode(translated, skip_special_tokens=True)[0])
```
- Older multilingual models use 2 character language codes.
Here is the code to see all available pretrained models on the hub:
```python
from huggingface_hub import list_models
from transformers import MarianMTModel, MarianTokenizer

# List all MarianMT checkpoints on the hub and keep the old-style multilingual ones
model_list = list_models()
org = "Helsinki-NLP"
model_ids = [x.id for x in model_list if x.id.startswith(org)]
suffix = [x.split("/")[1] for x in model_ids]
old_style_multi_models = [f"{org}/{s}" for s in suffix if s != s.lower()]

# Example: older multilingual model with 2-letter target codes (English → French, Spanish, Italian, etc.)
model_name = "Helsinki-NLP/opus-mt-en-ROMANCE"
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)

# Prepend the 2-letter ISO 639-1 target language code (older format)
src_texts = [">>fr<< Hello, how are you today?"]

inputs = tokenizer(src_texts, return_tensors="pt", padding=True, truncation=True)
translated = model.generate(**inputs)
print(tokenizer.batch_decode(translated, skip_special_tokens=True)[0])
```
## Old Style Multi-Lingual Models
These are the old style multi-lingual models ported from the OPUS-MT-Train repo, along with the members of each language group:
```python no-style
['Helsinki-NLP/opus-mt-NORTH_EU-NORTH_EU',
'Helsinki-NLP/opus-mt-ROMANCE-en',
'Helsinki-NLP/opus-mt-SCANDINAVIA-SCANDINAVIA',
'Helsinki-NLP/opus-mt-de-ZH',
'Helsinki-NLP/opus-mt-en-CELTIC',
'Helsinki-NLP/opus-mt-en-ROMANCE',
'Helsinki-NLP/opus-mt-es-NORWAY',
'Helsinki-NLP/opus-mt-fi-NORWAY',
'Helsinki-NLP/opus-mt-fi-ZH',
'Helsinki-NLP/opus-mt-fi_nb_no_nn_ru_sv_en-SAMI',
'Helsinki-NLP/opus-mt-sv-NORWAY',
'Helsinki-NLP/opus-mt-sv-ZH']
GROUP_MEMBERS = {
'ZH': ['cmn', 'cn', 'yue', 'ze_zh', 'zh_cn', 'zh_CN', 'zh_HK', 'zh_tw', 'zh_TW', 'zh_yue', 'zhs', 'zht', 'zh'],
'ROMANCE': ['fr', 'fr_BE', 'fr_CA', 'fr_FR', 'wa', 'frp', 'oc', 'ca', 'rm', 'lld', 'fur', 'lij', 'lmo', 'es', 'es_AR', 'es_CL', 'es_CO', 'es_CR', 'es_DO', 'es_EC', 'es_ES', 'es_GT', 'es_HN', 'es_MX', 'es_NI', 'es_PA', 'es_PE', 'es_PR', 'es_SV', 'es_UY', 'es_VE', 'pt', 'pt_br', 'pt_BR', 'pt_PT', 'gl', 'lad', 'an', 'mwl', 'it', 'it_IT', 'co', 'nap', 'scn', 'vec', 'sc', 'ro', 'la'],
'NORTH_EU': ['de', 'nl', 'fy', 'af', 'da', 'fo', 'is', 'no', 'nb', 'nn', 'sv'],
'SCANDINAVIA': ['da', 'fo', 'is', 'no', 'nb', 'nn', 'sv'],
'SAMI': ['se', 'sma', 'smj', 'smn', 'sms'],
'NORWAY': ['nb_NO', 'nb', 'nn_NO', 'nn', 'nog', 'no_nb', 'no'],
'CELTIC': ['ga', 'cy', 'br', 'gd', 'kw', 'gv']
}
```
Example of translating English to many Romance languages, using old-style 2 character language codes:
```python
>>> from transformers import MarianMTModel, MarianTokenizer
>>> src_text = [
... ">>fr<< this is a sentence in english that we want to translate to french",
... ">>pt<< This should go to portuguese",
... ">>es<< And this to Spanish",
... ]
>>> model_name = "Helsinki-NLP/opus-mt-en-ROMANCE"
>>> tokenizer = MarianTokenizer.from_pretrained(model_name)
>>> model = MarianMTModel.from_pretrained(model_name)
>>> translated = model.generate(**tokenizer(src_text, return_tensors="pt", padding=True))
>>> tgt_text = [tokenizer.decode(t, skip_special_tokens=True) for t in translated]
["c'est une phrase en anglais que nous voulons traduire en français",
'Isto deve ir para o português.',
'Y esto al español']
```
## Resources
- [Translation task guide](../tasks/translation)
- [Summarization task guide](../tasks/summarization)
- [Causal language modeling task guide](../tasks/language_modeling)
## MarianConfig
[[autodoc]] MarianConfig

View File

@ -139,10 +139,6 @@ Use the [AttentionMaskVisualizer](https://github.com/huggingface/transformers/bl
[[autodoc]] MistralConfig
## MistralCommonTokenizer
[[autodoc]] MistralCommonTokenizer
## MistralModel
[[autodoc]] MistralModel

View File

@ -227,10 +227,6 @@ This example also how to use `BitsAndBytes` to load the model in 4bit quantizati
[[autodoc]] Mistral3Config
## MistralCommonTokenizer
[[autodoc]] MistralCommonTokenizer
## Mistral3Model
[[autodoc]] Mistral3Model

View File

@ -197,10 +197,6 @@ A list of official Hugging Face and community (indicated by 🌎) resources to h
[[autodoc]] MixtralConfig
## MistralCommonTokenizer
[[autodoc]] MistralCommonTokenizer
## MixtralModel
[[autodoc]] MixtralModel

View File

@ -1,155 +0,0 @@
<!--Copyright 2024 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.
-->
<div style="float: right;">
<div class="flex flex-wrap space-x-1">
<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
<img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
<img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
</div>
</div>
# ModernBERT Decoder
ModernBERT Decoder is the same architecture as [ModernBERT](https://huggingface.co/papers/2412.13663) but trained from scratch with a causal language modeling (CLM) objective. This allows for using the same architecture for comparing encoders and decoders. This is the decoder architecture implementation of ModernBERT, designed for autoregressive text generation tasks.
Like the encoder version, ModernBERT Decoder incorporates modern architectural improvements such as rotary positional embeddings to support sequences of up to 8192 tokens, unpadding to avoid wasting compute on padding tokens, GeGLU layers, and alternating attention patterns. However, it uses causal (unidirectional) attention to enable autoregressive generation.
> [!TIP]
> Click on the ModernBERT Decoder models in the right sidebar for more examples of how to apply ModernBERT Decoder to different text generation tasks.
The example below demonstrates how to use ModernBERT Decoder for text generation with [`Pipeline`], [`AutoModel`], and from the command line.
<hfoptions id="usage">
<hfoption id="Pipeline">
```py
import torch
from transformers import pipeline
generator = pipeline(
task="text-generation",
model="blab-jhu/test-32m-dec",
torch_dtype=torch.float16,
device=0
)
generator("The future of artificial intelligence is", max_length=50, num_return_sequences=1)
# For sequence classification
classifier = pipeline(
task="text-classification",
model="blab-jhu/test-32m-dec",
torch_dtype=torch.float16,
device=0
)
classifier("This movie is really great!")
```
</hfoption>
<hfoption id="AutoModel">
```py
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("blab-jhu/test-32m-dec")
model = AutoModelForCausalLM.from_pretrained(
"blab-jhu/test-32m-dec",
torch_dtype=torch.float16,
device_map="auto",
)
prompt = "The future of artificial intelligence is"
inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
with torch.no_grad():
outputs = model.generate(
**inputs,
max_length=50,
num_return_sequences=1,
temperature=0.7,
do_sample=True,
pad_token_id=tokenizer.eos_token_id
)
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(f"Generated text: {generated_text}")
# For sequence classification
from transformers import AutoModelForSequenceClassification
classifier_model = AutoModelForSequenceClassification.from_pretrained(
"blab-jhu/test-32m-dec",
torch_dtype=torch.float16,
device_map="auto",
num_labels=2
)
text = "This movie is really great!"
inputs = tokenizer(text, return_tensors="pt").to("cuda")
with torch.no_grad():
outputs = classifier_model(**inputs)
predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
predicted_class = torch.argmax(predictions, dim=-1)
print(f"Predicted class: {predicted_class.item()}")
print(f"Prediction probabilities: {predictions}")
```
</hfoption>
<hfoption id="transformers CLI">
```bash
echo "The future of artificial intelligence is" | transformers run --task text-generation --model your-username/modernbert-decoder-base --device 0
```
</hfoption>
</hfoptions>
## ModernBertDecoderConfig
[[autodoc]] ModernBertDecoderConfig
<frameworkcontent>
<pt>
## ModernBertDecoderModel
[[autodoc]] ModernBertDecoderModel
- forward
## ModernBertDecoderForCausalLM
[[autodoc]] ModernBertDecoderForCausalLM
- forward
## ModernBertDecoderForSequenceClassification
[[autodoc]] ModernBertDecoderForSequenceClassification
- forward
### Usage tips
The ModernBertDecoder model can be fine-tuned for various text generation tasks using the HuggingFace Transformers library. It supports efficient inference with features like:
- **Causal attention**: Ensures autoregressive generation by masking future tokens
- **Sliding window attention**: Alternates between local and global attention patterns for efficiency
- **Rotary positional embeddings**: Enables handling of longer sequences up to 8000 tokens
- **FlashAttention support**: Optimized attention computation for faster training and inference
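As a minimal sketch of the tips above, the attention implementation can be selected when loading the model (assuming `flash-attn` is installed and supported on your hardware; otherwise `"sdpa"` or `"eager"` can be passed instead):
```py
import torch
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(
    "blab-jhu/test-32m-dec",
    torch_dtype=torch.float16,
    attn_implementation="flash_attention_2",  # or "sdpa" / "eager"
    device_map="auto",
)
```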
</pt>
</frameworkcontent>

View File

@ -1,68 +0,0 @@
<!--Copyright 2025 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.
-->
# PerceptionLM
## Overview
The PerceptionLM model was proposed in [PerceptionLM: Open-Access Data and Models for Detailed Visual Understanding](https://ai.meta.com/research/publications/perceptionlm-open-access-data-and-models-for-detailed-visual-understanding/) by Jang Hyun Cho et al. It's a fully open, reproducible model for transparent research in image and video understanding. PLM consists of
a vision encoder with a small scale (<8B parameters) LLM decoder.
The abstract from the paper is the following:
*Vision-language models are integral to computer vision research, yet many high-performing models
remain closed-source, obscuring their data, design and training recipe. The research community
has responded by using distillation from black-box models to label training data, achieving strong
benchmark results, at the cost of measurable scientific progress. However, without knowing the details
of the teacher model and its data sources, scientific progress remains difficult to measure. In this
paper, we study building a Perception Language Model (PLM) in a fully open and reproducible
framework for transparent research in image and video understanding. We analyze standard training
pipelines without distillation from proprietary models and explore large-scale synthetic data to identify
critical data gaps, particularly in detailed video understanding. To bridge these gaps, we release 2.8M
human-labeled instances of fine-grained video question-answer pairs and spatio-temporally grounded
video captions. Additionally, we introduce PLMVideoBench, a suite for evaluating challenging video
understanding tasks focusing on the ability to reason about "what", "where", "when", and "how" of a
video. We make our work fully reproducible by providing data, training recipes, code & models.*
This model was contributed by [shumingh](https://huggingface.co/shumingh).
The original code can be found [here](https://github.com/facebookresearch/perception_models).
## PerceptionLMConfig
[[autodoc]] PerceptionLMConfig
## PerceptionLMProcessor
[[autodoc]] PerceptionLMProcessor
## PerceptionLMImageProcessorFast
[[autodoc]] PerceptionLMImageProcessorFast
## PerceptionLMVideoProcessor
[[autodoc]] PerceptionLMVideoProcessor
## PerceptionLMModel
[[autodoc]] PerceptionLMModel
## PerceptionLMForConditionalGeneration
[[autodoc]] PerceptionLMForConditionalGeneration
- forward

View File

@ -9,53 +9,44 @@ specific language governing permissions and limitations under the License.
rendered properly in your Markdown viewer.
-->
<div style="float: right;">
<div class="flex flex-wrap space-x-1">
<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-EE4C2C?logo=pytorch&logoColor=white&style=flat">
</div>
</div>
# Phi4 Multimodal
## Phi4 Multimodal
## Overview
[Phi4 Multimodal](https://huggingface.co/papers/2503.01743) is a multimodal model capable of text, image, and speech and audio inputs or any combination of these. It features a mixture of LoRA adapters for handling different inputs, and each input is routed to the appropriate encoder.
Phi4 Multimodal is a lightweight open multimodal foundation model that leverages the language, vision, and speech research and datasets used for Phi-3.5 and 4.0 models. The model processes text, image, and audio inputs, generating text outputs, and comes with 128K token context length. The model underwent an enhancement process, incorporating both supervised fine-tuning, direct preference optimization and RLHF (Reinforcement Learning from Human Feedback) to support precise instruction adherence and safety measures. The languages that each modal supports are the following:
You can find all the original Phi4 Multimodal checkpoints under the [Phi4](https://huggingface.co/collections/microsoft/phi-4-677e9380e514feb5577a40e4) collection.
- Text: Arabic, Chinese, Czech, Danish, Dutch, English, Finnish, French, German, Hebrew, Hungarian, Italian, Japanese, Korean, Norwegian, Polish, Portuguese, Russian, Spanish, Swedish, Thai, Turkish, Ukrainian
- Vision: English
- Audio: English, Chinese, German, French, Italian, Japanese, Spanish, Portuguese
> [!TIP]
> This model was contributed by [cyrilvallez](https://huggingface.co/cyrilvallez).
>
> Click on the Phi-4 Multimodal in the right sidebar for more examples of how to apply Phi-4 Multimodal to different tasks.
This model was contributed by [Cyril Vallez](https://huggingface.co/cyrilvallez). The most recent code can be
found [here](https://github.com/huggingface/transformers/blob/main/src/transformers/models/phi4_multimodal/modeling_phi4_multimodal.py).
The example below demonstrates how to generate text based on an image with [`Pipeline`] or the [`AutoModel`] class.
<hfoptions id="usage">
<hfoption id="Pipeline">
## Usage tips
`Phi4-multimodal-instruct` can be found on the [Huggingface Hub](https://huggingface.co/microsoft/Phi-4-multimodal-instruct).
```python
from transformers import pipeline

generator = pipeline("text-generation", model="microsoft/Phi-4-multimodal-instruct", torch_dtype="auto", device=0)

prompt = "Explain the concept of multimodal AI in simple terms."
result = generator(prompt, max_length=50)
print(result[0]['generated_text'])
```
</hfoption>
<hfoption id="AutoModel">
In the following, we demonstrate how to use it for inference depending on the input modalities (text, image, audio).
```python
import torch
from transformers import AutoModelForCausalLM, AutoProcessor, GenerationConfig
# Define model path
model_path = "microsoft/Phi-4-multimodal-instruct"
device = "cuda:0"
# Load model and processor
processor = AutoProcessor.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path, device_map=device, torch_dtype=torch.float16)
# Optional: load the adapters (note that without them, the base model will very likely not work well)
model.load_adapter(model_path, adapter_name="speech", device_map=device, adapter_kwargs={"subfolder": 'speech-lora'})
model.load_adapter(model_path, adapter_name="vision", device_map=device, adapter_kwargs={"subfolder": 'vision-lora'})
# Part 1: Image Processing
messages = [
{
"role": "user",
@ -66,7 +57,7 @@ messages = [
},
]
model.set_adapter("vision")
model.set_adapter("vision") # if loaded, activate the vision adapter
inputs = processor.apply_chat_template(
messages,
add_generation_prompt=True,
@ -75,6 +66,7 @@ inputs = processor.apply_chat_template(
return_tensors="pt",
).to(device)
# Generate response
generate_ids = model.generate(
**inputs,
max_new_tokens=1000,
@ -85,27 +77,10 @@ response = processor.batch_decode(
generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
)[0]
print(f'>>> Response\n{response}')
```
</hfoption>
</hfoptions>
## Notes
The example below demonstrates inference with an audio and text input.
```py
import torch
from transformers import AutoModelForCausalLM, AutoProcessor, GenerationConfig
model_path = "microsoft/Phi-4-multimodal-instruct"
device = "cuda:0"
processor = AutoProcessor.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path, device_map=device, torch_dtype=torch.float16)
model.load_adapter(model_path, adapter_name="speech", device_map=device, adapter_kwargs={"subfolder": 'speech-lora'})
model.set_adapter("speech")
# Part 2: Audio Processing
model.set_adapter("speech") # if loaded, activate the speech adapter
audio_url = "https://upload.wikimedia.org/wikipedia/commons/b/b0/Barbara_Sahakian_BBC_Radio4_The_Life_Scientific_29_May_2012_b01j5j24.flac"
messages = [
{
@ -135,7 +110,6 @@ response = processor.batch_decode(
generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
)[0]
print(f'>>> Response\n{response}')
```
## Phi4MultimodalFeatureExtractor

View File

@ -86,10 +86,6 @@ output = processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up
[[autodoc]] PixtralVisionConfig
## MistralCommonTokenizer
[[autodoc]] MistralCommonTokenizer
## PixtralVisionModel
[[autodoc]] PixtralVisionModel

View File

@ -14,90 +14,35 @@ rendered properly in your Markdown viewer.
-->
<div style="float: right;">
<div class="flex flex-wrap space-x-1">
<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
</div>
# SwitchTransformers
<div class="flex flex-wrap space-x-1">
<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
</div>
# Switch Transformers
## Overview
[Switch Transformers](https://huggingface.co/papers/2101.03961) is a sparse T5 model where the MLP layer is replaced by a Mixture-of-Experts (MoE). A routing mechanism associates each token with an expert and each expert is a dense MLP. Sparsity enables better scaling and the routing mechanism allows the model to select relevant weights on the fly which increases model capacity.
The SwitchTransformers model was proposed in [Switch Transformers: Scaling to Trillion Parameter Models with Simple and Efficient Sparsity](https://huggingface.co/papers/2101.03961) by William Fedus, Barret Zoph, Noam Shazeer.
You can find all the original Switch Transformers checkpoints under the [Switch Transformer](https://huggingface.co/collections/google/switch-transformers-release-6548c35c6507968374b56d1f) collection.
The Switch Transformer model uses a sparse T5 encoder-decoder architecture, where the MLPs are replaced by a Mixture of Experts (MoE). A routing mechanism (top 1 in this case) associates each token with one of the experts, where each expert is a dense MLP. While switch transformers have a lot more weights than their equivalent dense models, the sparsity allows better scaling and better finetuning performance at scale.
During a forward pass, only a fraction of the weights are used. The routing mechanism allows the model to select relevant weights on the fly which increases the model capacity without increasing the number of operations.
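To make the routing concrete, here is a minimal, self-contained sketch of top-1 (switch) routing with toy dimensions; it is purely illustrative and not the library's actual sparse MLP implementation:
```python
import torch
import torch.nn as nn
import torch.nn.functional as F

# toy sizes: each token is routed to exactly one of num_experts dense MLPs
hidden_dim, ffn_dim, num_experts = 16, 32, 4
router = nn.Linear(hidden_dim, num_experts, bias=False)
experts = nn.ModuleList(
    [nn.Sequential(nn.Linear(hidden_dim, ffn_dim), nn.ReLU(), nn.Linear(ffn_dim, hidden_dim)) for _ in range(num_experts)]
)

tokens = torch.randn(8, hidden_dim)          # (num_tokens, hidden_dim)
probs = F.softmax(router(tokens), dim=-1)    # routing probabilities per token
expert_idx = probs.argmax(dim=-1)            # top-1: pick a single expert per token

out = torch.zeros_like(tokens)
for i, expert in enumerate(experts):
    mask = expert_idx == i
    if mask.any():
        # scale the expert output by its router probability, as in the Switch Transformer paper
        out[mask] = probs[mask, i].unsqueeze(-1) * expert(tokens[mask])
```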
The abstract from the paper is the following:
> [!TIP]
> This model was contributed by [ybelkada](https://huggingface.co/ybelkada) and [ArthurZ](https://huggingface.co/ArthurZ).
>
> Click on the Switch Transformers models in the right sidebar for more examples of how to apply Switch Transformers to different natural language tasks.
*In deep learning, models typically reuse the same parameters for all inputs. Mixture of Experts (MoE) defies this and instead selects different parameters for each incoming example. The result is a sparsely-activated model -- with outrageous numbers of parameters -- but a constant computational cost. However, despite several notable successes of MoE, widespread adoption has been hindered by complexity, communication costs and training instability -- we address these with the Switch Transformer. We simplify the MoE routing algorithm and design intuitive improved models with reduced communication and computational costs. Our proposed training techniques help wrangle the instabilities and we show large sparse models may be trained, for the first time, with lower precision (bfloat16) formats. We design models based off T5-Base and T5-Large to obtain up to 7x increases in pre-training speed with the same computational resources. These improvements extend into multilingual settings where we measure gains over the mT5-Base version across all 101 languages. Finally, we advance the current scale of language models by pre-training up to trillion parameter models on the "Colossal Clean Crawled Corpus" and achieve a 4x speedup over the T5-XXL model.*
The example below demonstrates how to predict the masked token with [`Pipeline`], [`AutoModel`], and from the command line.
This model was contributed by [Younes Belkada](https://huggingface.co/ybelkada) and [Arthur Zucker](https://huggingface.co/ArthurZ).
The original code can be found [here](https://github.com/google/flaxformer/tree/main/flaxformer/architectures/moe).
<hfoptions id="usage">
<hfoption id="Pipeline">
## Usage tips
- SwitchTransformers uses the [`T5Tokenizer`], which can be loaded directly from each model's repository.
- The released weights are pretrained on English [Masked Language Modeling](https://moon-ci-docs.huggingface.co/docs/transformers/pr_19323/en/glossary#general-terms) task, and should be finetuned.

```python
import torch
from transformers import pipeline
pipeline = pipeline(
task="text2text-generation",
model="google/switch-base-8",
torch_dtype=torch.float16,
device=0
)
print(pipeline("The capital of France is <extra_id_0>."))
```
</hfoption>
<hfoption id="AutoModel">
```python
import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("google/switch-base-8")
model = AutoModelForSeq2SeqLM.from_pretrained("google/switch-base-8", device_map="auto", torch_dtype=torch.float16)
input_text = "The capital of France is <extra_id_0>."
input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to(0)
outputs = model.generate(input_ids)
print(tokenizer.decode(outputs[0]))
```
</hfoption>
<hfoption id="transformers CLI">
```bash
echo -e "The capital of France is <extra_id_0>." | transformers run --task text2text-generation --model google/switch-base-8 --device 0
# [{'generated_text': 'Paris.'}]
```
</hfoption>
</hfoptions>
Quantization reduces the memory burden of large models by representing the weights in a lower precision. Refer to the [Quantization](../quantization/overview) overview for more available quantization backends.
The example below uses [bitsandbytes](../quantization/bitsandbytes/) to only quantize the weights to 8-bits.
```py
# pip install bitsandbytes
import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, BitsAndBytesConfig
tokenizer = AutoTokenizer.from_pretrained("google/switch-base-8")
quantization_config = BitsAndBytesConfig(load_in_8bit=True)
model = AutoModelForSeq2SeqLM.from_pretrained("google/switch-base-8", device_map="auto", quantization_config=quantization_config)
input_text = "The capital of France is <extra_id_0>."
input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to(0)
outputs = model.generate(input_ids)
print(tokenizer.decode(outputs[0]))
```
## Resources
- [Translation task guide](../tasks/translation)
- [Summarization task guide](../tasks/summarization)
## SwitchTransformersConfig

View File

@ -24,7 +24,7 @@ rendered properly in your Markdown viewer.
# T5Gemma
T5Gemma (aka encoder-decoder Gemma) was proposed in a [research paper](https://arxiv.org/abs/2504.06225) by Google. It is a family of encoder-decoder large language models, developed by adapting pretrained decoder-only models into encoder-decoder. T5Gemma includes pretrained and instruction-tuned variants. The architecture is based on transformer encoder-decoder design following T5, with improvements from Gemma 2: GQA, RoPE, GeGLU activation, RMSNorm, and interleaved local/global attention.
T5Gemma (aka encoder-decoder Gemma) was proposed in a [research paper](https://arxiv.org/abs/2504.06225) by Google. It is a family of encoder-decoder large langauge models, developed by adapting pretrained decoder-only models into encoder-decoder. T5Gemma includes pretrained and instruction-tuned variants. The architecture is based on transformer encoder-decoder design following T5, with improvements from Gemma 2: GQA, RoPE, GeGLU activation, RMSNorm, and interleaved local/global attention.
T5Gemma has two groups of model sizes: 1) [Gemma 2](https://ai.google.dev/gemma/docs/core/model_card_2) sizes (2B-2B, 9B-2B, and 9B-9B), which are based on the official Gemma 2 models (2B and 9B); and 2) [T5](https://arxiv.org/abs/1910.10683) sizes (Small, Base, Large, and XL), which are pretrained under the Gemma 2 framework following T5 configuration. In addition, we also provide a model at ML size (medium large, ~2B in total), which is in-between T5 Large and T5 XL.

View File

@ -187,13 +187,13 @@ from torch import nn
from transformers import Trainer
class CustomTrainer(Trainer):
def compute_loss(self, model: nn.Module, inputs: dict[str, Union[torch.Tensor, Any]], return_outputs: bool = False, num_items_in_batch: Optional[torch.Tensor] = None):
def compute_losss(self, model: nn.Module, inputs: dict[str, Union[torch.Tensor, Any]], return_outputs: bool = False, num_items_in_batch: Optional[torch.Tensor] = None):
labels = inputs.pop("labels")
# forward pass
outputs = model(**inputs)
logits = outputs.get("logits")
# compute custom loss for 3 labels with different weights
reduction = "sum" if num_items_in_batch is not None else "mean"
reduction = "mean" if num_items_in_batch is not None else "sum"
loss_fct = nn.CrossEntropyLoss(weight=torch.tensor([1.0, 2.0, 3.0], device=model.device), reduction=reduction)
loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
if num_items_in_batch is not None:

View File

@ -157,8 +157,6 @@
title: (번역중) VPTQ
- local: quantization/quanto
title: Quanto
- local: quantization/quark
title: Quark
- local: quantization/eetq
title: EETQ
- local: in_translation

View File

@ -1,85 +0,0 @@
<!--Copyright 2025 Advanced Micro Devices, Inc. and The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.
-->
# Quark[[quark]]
[Quark](https://quark.docs.amd.com/latest/) is a deep learning quantization toolkit designed to be agnostic to specific data types, algorithms, and hardware. Different pre-processing strategies, algorithms, and data types can be combined in Quark.
The PyTorch support integrated through 🤗 Transformers primarily targets AMD CPUs and GPUs and is mainly intended for evaluation purposes. For example, [lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness) can be used with the 🤗 Transformers backend to seamlessly evaluate a wide range of models quantized with Quark.
Users interested in Quark can refer to the [documentation](https://quark.docs.amd.com/latest/) to get started quantizing models and use them in the supported open-source libraries!
Quark has its own checkpoint/[config format](https://huggingface.co/amd/Llama-3.1-8B-Instruct-FP8-KV-Quark-test/blob/main/config.json#L26), but it also supports producing models with a serialization layout compatible with other quantization/runtime implementations ([AutoAWQ](https://huggingface.co/docs/transformers/quantization/awq), [native fp8](https://huggingface.co/docs/transformers/quantization/finegrained_fp8)).
To load a Quark quantized model in Transformers, the library must be installed first:
```bash
pip install amd-quark
```
## Support matrix[[Support matrix]]
Models quantized with Quark support a wide range of features that can be combined together. Regardless of the configuration, all quantized models can be seamlessly reloaded through `PretrainedModel.from_pretrained`.
The table below shows some of the features supported by Quark:
| **Feature** | **Supported in Quark** | |
|---------------------------------|-----------------------------------------------------------------------------------------------------------|---|
| Data types | int8, int4, int2, bfloat16, float16, fp8_e5m2, fp8_e4m3, fp6_e3m2, fp6_e2m3, fp4, OCP MX, MX6, MX9, bfp16 | |
| Pre-quantization model transformations | SmoothQuant, QuaRot, SpinQuant, AWQ | |
| Quantization algorithms | GPTQ | |
| Supported operators | ``nn.Linear``, ``nn.Conv2d``, ``nn.ConvTranspose2d``, ``nn.Embedding``, ``nn.EmbeddingBag`` | |
| Granularity | per-tensor, per-channel, per-block, per-layer, per-layer type | |
| KV cache | fp8 | |
| Activation calibration | MinMax / Percentile / MSE | |
| Quantization strategies | weight-only, static, dynamic, with or without output quantization | |
## Models on the Hugging Face Hub[[Models on Hugging Face Hub]]
Public models using Quark native serialization can be found at https://huggingface.co/models?other=quark.
Although Quark also supports [models using `quant_method="fp8"`](https://huggingface.co/models?other=fp8) and [models using `quant_method="awq"`](https://huggingface.co/models?other=awq), Transformers loads these models through [AutoAWQ](https://huggingface.co/docs/transformers/quantization/awq) or uses [the native fp8 support in 🤗 Transformers](https://huggingface.co/docs/transformers/quantization/finegrained_fp8) instead.
## Using Quark models in Transformers[[Using Quark models in Transformers]]
Here is an example of how to load a Quark model in Transformers:
```python
from transformers import AutoModelForCausalLM, AutoTokenizer
model_id = "EmbeddedLLM/Llama-3.1-8B-Instruct-w_fp8_per_channel_sym"
model = AutoModelForCausalLM.from_pretrained(model_id)
model = model.to("cuda")
print(model.model.layers[0].self_attn.q_proj)
# QParamsLinear(
# (weight_quantizer): ScaledRealQuantizer()
# (input_quantizer): ScaledRealQuantizer()
# (output_quantizer): ScaledRealQuantizer()
# )
tokenizer = AutoTokenizer.from_pretrained(model_id)
inp = tokenizer("Where is a good place to cycle around Tokyo?", return_tensors="pt")
inp = inp.to("cuda")
res = model.generate(**inp, min_new_tokens=50, max_new_tokens=100)
print(tokenizer.batch_decode(res)[0])
# <|begin_of_text|>Where is a good place to cycle around Tokyo? There are several places in Tokyo that are suitable for cycling, depending on your skill level and interests. Here are a few suggestions:
# 1. Yoyogi Park: This park is a popular spot for cycling and has a wide, flat path that's perfect for beginners. You can also visit the Meiji Shrine, a famous Shinto shrine located in the park.
# 2. Imperial Palace East Garden: This beautiful garden has a large, flat path that's perfect for cycling. You can also visit the
```

View File

@ -1,216 +0,0 @@
# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
# This file was automatically generated from examples/modular-transformers/modular_duplicated_method.py.
# Do NOT edit this file manually as any edits will be overwritten by the generation of
# the file from the modular. If any change should be done, please apply the change to the
# modular_duplicated_method.py file directly. One of our CI enforces this.
# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
from ...configuration_utils import PretrainedConfig
from ...modeling_rope_utils import rope_config_validation
class DuplicatedMethodConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`DuplicatedMethodModel`]. It is used to instantiate an DuplicatedMethod
model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
defaults will yield a similar configuration to that of the DuplicatedMethod-7B.
e.g. [meta-duplicated_method/DuplicatedMethod-2-7b-hf](https://huggingface.co/meta-duplicated_method/DuplicatedMethod-2-7b-hf)
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Args:
vocab_size (`int`, *optional*, defaults to 32000):
Vocabulary size of the DuplicatedMethod model. Defines the number of different tokens that can be represented by the
`inputs_ids` passed when calling [`DuplicatedMethodModel`]
hidden_size (`int`, *optional*, defaults to 4096):
Dimension of the hidden representations.
intermediate_size (`int`, *optional*, defaults to 11008):
Dimension of the MLP representations.
num_hidden_layers (`int`, *optional*, defaults to 32):
Number of hidden layers in the Transformer decoder.
num_attention_heads (`int`, *optional*, defaults to 32):
Number of attention heads for each attention layer in the Transformer decoder.
num_key_value_heads (`int`, *optional*):
This is the number of key_value heads that should be used to implement Grouped Query Attention. If
`num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
`num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
by meanpooling all the original heads within that group. For more details, check out [this
paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to
`num_attention_heads`.
hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
The non-linear activation function (function or string) in the decoder.
max_position_embeddings (`int`, *optional*, defaults to 2048):
The maximum sequence length that this model might ever be used with. DuplicatedMethod 1 supports up to 2048 tokens,
DuplicatedMethod 2 up to 4096, CodeLlama up to 16384.
initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
rms_norm_eps (`float`, *optional*, defaults to 1e-06):
The epsilon used by the rms normalization layers.
use_cache (`bool`, *optional*, defaults to `True`):
Whether or not the model should return the last key/values attentions (not used by all models). Only
relevant if `config.is_decoder=True`.
pad_token_id (`int`, *optional*):
Padding token id.
bos_token_id (`int`, *optional*, defaults to 1):
Beginning of stream token id.
eos_token_id (`int`, *optional*, defaults to 2):
End of stream token id.
pretraining_tp (`int`, *optional*, defaults to 1):
Experimental feature. Tensor parallelism rank used during pretraining. Please refer to [this
document](https://huggingface.co/docs/transformers/main/perf_train_gpu_many#tensor-parallelism) to
understand more about it. This value is necessary to ensure exact reproducibility of the pretraining
results. Please refer to [this issue](https://github.com/pytorch/pytorch/issues/76232).
tie_word_embeddings (`bool`, *optional*, defaults to `False`):
Whether to tie weight embeddings
rope_theta (`float`, *optional*, defaults to 10000.0):
The base period of the RoPE embeddings.
rope_scaling (`Dict`, *optional*):
Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type
and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
accordingly.
Expected contents:
`rope_type` (`str`):
The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
'duplicated_method3'], with 'default' being the original RoPE implementation.
`factor` (`float`, *optional*):
Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
most scaling types, a `factor` of x will enable the model to handle sequences of length x *
original maximum pre-trained length.
`original_max_position_embeddings` (`int`, *optional*):
Used with 'dynamic', 'longrope' and 'duplicated_method3'. The original max position embeddings used during
pretraining.
`attention_factor` (`float`, *optional*):
Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
computation. If unspecified, it defaults to value recommended by the implementation, using the
`factor` field to infer the suggested value.
`beta_fast` (`float`, *optional*):
Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
ramp function. If unspecified, it defaults to 32.
`beta_slow` (`float`, *optional*):
Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
ramp function. If unspecified, it defaults to 1.
`short_factor` (`list[float]`, *optional*):
Only used with 'longrope'. The scaling factor to be applied to short contexts (<
`original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
size divided by the number of attention heads divided by 2
`long_factor` (`list[float]`, *optional*):
Only used with 'longrope'. The scaling factor to be applied to long contexts (<
`original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
size divided by the number of attention heads divided by 2
`low_freq_factor` (`float`, *optional*):
Only used with 'duplicated_method3'. Scaling factor applied to low frequency components of the RoPE
`high_freq_factor` (`float`, *optional*):
Only used with 'duplicated_method3'. Scaling factor applied to high frequency components of the RoPE
attention_bias (`bool`, *optional*, defaults to `False`):
Whether to use a bias in the query, key, value and output projection layers during self-attention.
attention_dropout (`float`, *optional*, defaults to 0.0):
The dropout ratio for the attention probabilities.
mlp_bias (`bool`, *optional*, defaults to `False`):
Whether to use a bias in up_proj, down_proj and gate_proj layers in the MLP layers.
head_dim (`int`, *optional*):
The attention head dimension. If None, it will default to hidden_size // num_attention_heads
```python
>>> from transformers import DuplicatedMethodModel, DuplicatedMethodConfig
>>> # Initializing a DuplicatedMethod duplicated_method-7b style configuration
>>> configuration = DuplicatedMethodConfig()
>>> # Initializing a model from the duplicated_method-7b style configuration
>>> model = DuplicatedMethodModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```"""
model_type = "duplicated_method"
keys_to_ignore_at_inference = ["past_key_values"]
# Default tensor parallel plan for base model `DuplicatedMethodModel`
base_model_tp_plan = {
"layers.*.self_attn.q_proj": "colwise",
"layers.*.self_attn.k_proj": "colwise",
"layers.*.self_attn.v_proj": "colwise",
"layers.*.self_attn.o_proj": "rowwise",
"layers.*.mlp.gate_proj": "colwise",
"layers.*.mlp.up_proj": "colwise",
"layers.*.mlp.down_proj": "rowwise",
}
base_model_pp_plan = {
"embed_tokens": (["input_ids"], ["inputs_embeds"]),
"layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
"norm": (["hidden_states"], ["hidden_states"]),
}
def __init__(
self,
vocab_size=32000,
hidden_size=4096,
intermediate_size=11008,
num_hidden_layers=32,
num_attention_heads=32,
num_key_value_heads=None,
hidden_act="silu",
max_position_embeddings=2048,
initializer_range=0.02,
rms_norm_eps=1e-6,
use_cache=True,
pad_token_id=None,
bos_token_id=1,
eos_token_id=2,
pretraining_tp=1,
tie_word_embeddings=False,
rope_theta=10000.0,
rope_scaling=None,
attention_bias=False,
attention_dropout=0.0,
mlp_bias=False,
head_dim=None,
**kwargs,
):
self.vocab_size = vocab_size
self.max_position_embeddings = max_position_embeddings
self.hidden_size = hidden_size
self.intermediate_size = intermediate_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
# for backward compatibility
if num_key_value_heads is None:
num_key_value_heads = num_attention_heads
self.num_key_value_heads = num_key_value_heads
self.hidden_act = hidden_act
self.initializer_range = initializer_range
self.rms_norm_eps = rms_norm_eps
self.pretraining_tp = pretraining_tp
self.use_cache = use_cache
self.rope_theta = rope_theta
self.rope_scaling = rope_scaling
self.attention_bias = attention_bias
self.attention_dropout = attention_dropout
self.mlp_bias = mlp_bias
self.head_dim = head_dim if head_dim is not None else self.hidden_size // self.num_attention_heads
# Validate the correctness of rotary position embeddings parameters
# BC: if there is a 'type' field, copy it it to 'rope_type'.
if self.rope_scaling is not None and "type" in self.rope_scaling:
self.rope_scaling["rope_type"] = self.rope_scaling["type"]
rope_config_validation(self)
super().__init__(
pad_token_id=pad_token_id,
bos_token_id=bos_token_id,
eos_token_id=eos_token_id,
tie_word_embeddings=tie_word_embeddings,
**kwargs,
)
@property
def vocab_size(self):
return 45
@vocab_size.setter
def vocab_size(self, value):
self.vocab_size = value

View File

@ -498,7 +498,6 @@ class Multimodal2VisionPreTrainedModel(PreTrainedModel):
supports_gradient_checkpointing = True
_supports_sdpa = True
_supports_flash_attn_2 = True
_supports_flash_attn_3 = True
_supports_flex_attn = True
_supports_attention_backend = True

View File

@ -65,7 +65,7 @@ class MyNewModel2RotaryEmbedding(nn.Module):
def __init__(self, config: MyNewModel2Config, device=None):
super().__init__()
# BC: "rope_type" was originally "type"
if hasattr(config, "rope_scaling") and isinstance(config.rope_scaling, dict):
if hasattr(config, "rope_scaling") and config.rope_scaling is not None:
self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type"))
else:
self.rope_type = "default"
@ -290,7 +290,6 @@ class MyNewModel2PreTrainedModel(PreTrainedModel):
_no_split_modules = ["MyNewModel2DecoderLayer"]
_skip_keys_device_placement = ["past_key_values"]
_supports_flash_attn_2 = True
_supports_flash_attn_3 = True
_supports_sdpa = True
_supports_flex_attn = True
_supports_cache_class = True

View File

@ -96,7 +96,6 @@ class NewTaskModelPreTrainedModel(PreTrainedModel):
_supports_quantized_cache = True
_supports_static_cache = True
_supports_flash_attn_2 = True
_supports_flash_attn_3 = True
_supports_sdpa = True
_supports_flex_attn = True
_supports_attention_backend = True

View File

@ -48,7 +48,7 @@ class SuperRotaryEmbedding(nn.Module):
def __init__(self, config: SuperConfig, device=None):
super().__init__()
# BC: "rope_type" was originally "type"
if hasattr(config, "rope_scaling") and isinstance(config.rope_scaling, dict):
if hasattr(config, "rope_scaling") and config.rope_scaling is not None:
self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type"))
else:
self.rope_type = "default"
@ -289,7 +289,6 @@ class SuperPreTrainedModel(PreTrainedModel):
_no_split_modules = ["SuperDecoderLayer"]
_skip_keys_device_placement = ["past_key_values"]
_supports_flash_attn_2 = True
_supports_flash_attn_3 = True
_supports_sdpa = True
_supports_flex_attn = True
_supports_cache_class = True

View File

@ -1,11 +0,0 @@
from transformers.models.llama.configuration_llama import LlamaConfig
class DuplicatedMethodConfig(LlamaConfig):
@property
def vocab_size(self):
return 45
@vocab_size.setter
def vocab_size(self, value):
self.vocab_size = value

View File

@ -65,7 +65,7 @@ examples/pytorch/token-classification/run_ner.py \
Most example scripts should have the first two command line arguments and some have the third one. You can quickly check if a given example supports any of these by passing a `-h` option, e.g.:
```bash
token-classification/run_ner.py -h
examples/pytorch/token-classification/run_ner.py -h
```
## Resuming training
@ -110,7 +110,7 @@ classification MNLI task using the `run_glue` script, with 8 GPUs:
```bash
torchrun \
--nproc_per_node 8 text-classification/run_glue.py \
--nproc_per_node 8 pytorch/text-classification/run_glue.py \
--model_name_or_path google-bert/bert-large-uncased-whole-word-masking \
--task_name mnli \
--do_train \

View File

@ -84,7 +84,7 @@ loaded using the pre-trained weights.
Finally, we can run the example script to train the model:
```bash
python run_clip.py \
python examples/pytorch/contrastive-image-text/run_clip.py \
--output_dir ./clip-roberta-finetuned \
--model_name_or_path ./clip-roberta \
--data_dir $PWD/data \

View File

@ -21,7 +21,7 @@ limitations under the License.
`run_swag` allows you to fine-tune any model from our [hub](https://huggingface.co/models) (as long as its architecture as a `ForMultipleChoice` version in the library) on the SWAG dataset or your own csv/jsonlines files as long as they are structured the same way. To make it works on another dataset, you will need to tweak the `preprocess_function` inside the script.
```bash
python run_swag.py \
python examples/pytorch/multiple-choice/run_swag.py \
--model_name_or_path FacebookAI/roberta-base \
--do_train \
--do_eval \

View File

@ -324,12 +324,13 @@ def main():
args.model_name_or_path, id2label=id2label, label2id=label2id, trust_remote_code=args.trust_remote_code
)
image_processor = AutoImageProcessor.from_pretrained(
args.model_name_or_path, trust_remote_code=args.trust_remote_code, do_reduce_labels=args.do_reduce_labels
args.model_name_or_path, trust_remote_code=args.trust_remote_code
)
model = AutoModelForSemanticSegmentation.from_pretrained(
args.model_name_or_path,
config=config,
trust_remote_code=args.trust_remote_code,
do_reduce_labels=args.do_reduce_labels,
)
# Define transforms to be applied to each image and target.

View File

@ -40,7 +40,7 @@ and you also will find examples of these below.
Here is an example on a summarization task:
```bash
python run_summarization.py \
python examples/pytorch/summarization/run_summarization.py \
--model_name_or_path google-t5/t5-small \
--do_train \
--do_eval \
@ -64,7 +64,7 @@ And here is how you would use it on your own files, after adjusting the values f
`--train_file`, `--validation_file`, `--text_column` and `--summary_column` to match your setup:
```bash
python run_summarization.py \
python examples/pytorch/summarization/run_summarization.py \
--model_name_or_path google-t5/t5-small \
--do_train \
--do_eval \

View File

@ -42,7 +42,7 @@ and you also will find examples of these below.
Here is an example of a translation fine-tuning with a MarianMT model:
```bash
python run_translation.py \
python examples/pytorch/translation/run_translation.py \
--model_name_or_path Helsinki-NLP/opus-mt-en-ro \
--do_train \
--do_eval \
@ -62,7 +62,7 @@ MBart and some T5 models require special handling.
T5 models `google-t5/t5-small`, `google-t5/t5-base`, `google-t5/t5-large`, `google-t5/t5-3b` and `google-t5/t5-11b` must use an additional argument: `--source_prefix "translate {source_lang} to {target_lang}"`. For example:
```bash
python run_translation.py \
python examples/pytorch/translation/run_translation.py \
--model_name_or_path google-t5/t5-small \
--do_train \
--do_eval \
@ -85,7 +85,7 @@ For the aforementioned group of T5 models it's important to remember that if you
MBart models require a different format for `--source_lang` and `--target_lang` values, e.g. instead of `en` it expects `en_XX`, for `ro` it expects `ro_RO`. The full MBart specification for language codes can be found [here](https://huggingface.co/facebook/mbart-large-cc25). For example:
```bash
python run_translation.py \
python examples/pytorch/translation/run_translation.py \
--model_name_or_path facebook/mbart-large-en-ro \
--do_train \
--do_eval \
@ -104,7 +104,7 @@ And here is how you would use the translation finetuning on your own files, afte
values for the arguments `--train_file`, `--validation_file` to match your setup:
```bash
python run_translation.py \
python examples/pytorch/translation/run_translation.py \
--model_name_or_path google-t5/t5-small \
--do_train \
--do_eval \
@ -133,7 +133,7 @@ Here the languages are Romanian (`ro`) and English (`en`).
If you want to use a pre-processed dataset that leads to high BLEU scores, but for the `en-de` language pair, you can use `--dataset_name stas/wmt14-en-de-pre-processed`, as following:
```bash
python run_translation.py \
python examples/pytorch/translation/run_translation.py \
--model_name_or_path google-t5/t5-small \
--do_train \
--do_eval \

View File

@ -52,7 +52,7 @@ To create the package for pypi.
twine upload dist/* -r testpypi --repository-url=https://test.pypi.org/legacy/
Check that you can install it in a virtualenv by running:
pip install -i https://test.pypi.org/simple/ transformers
pip install -i https://testpypi.python.org/pypi transformers
Check you can run the following commands:
python -c "from transformers import pipeline; classifier = pipeline('text-classification'); print(classifier('What a nice release'))"
@ -204,7 +204,6 @@ _deps = [
"opentelemetry-api",
"opentelemetry-exporter-otlp",
"opentelemetry-sdk",
"mistral-common[opencv]>=1.6.3",
]
@ -314,7 +313,7 @@ extras["hub-kernels"] = deps_list("kernels")
extras["integrations"] = extras["hub-kernels"] + extras["optuna"] + extras["ray"] + extras["sigopt"]
extras["serving"] = deps_list("pydantic", "uvicorn", "fastapi", "starlette") + extras["torch"]
extras["serving"] = deps_list("pydantic", "uvicorn", "fastapi", "starlette")
extras["audio"] = deps_list(
"librosa",
"pyctcdecode",
@ -335,7 +334,6 @@ extras["video"] = deps_list("av")
extras["num2words"] = deps_list("num2words")
extras["sentencepiece"] = deps_list("sentencepiece", "protobuf")
extras["tiktoken"] = deps_list("tiktoken", "blobfile")
extras["mistral-common"] = deps_list("mistral-common[opencv]")
extras["testing"] = (
deps_list(
"pytest",
@ -365,7 +363,6 @@ extras["testing"] = (
)
+ extras["retrieval"]
+ extras["modelcreation"]
+ extras["mistral-common"]
)
extras["deepspeed-testing"] = extras["deepspeed"] + extras["testing"] + extras["optuna"] + extras["sentencepiece"]
@ -387,7 +384,6 @@ extras["all"] = (
+ extras["accelerate"]
+ extras["video"]
+ extras["num2words"]
+ extras["mistral-common"]
)

View File

@ -34,7 +34,6 @@ from .utils import (
is_g2p_en_available,
is_keras_nlp_available,
is_librosa_available,
is_mistral_common_available,
is_pretty_midi_available,
is_scipy_available,
is_sentencepiece_available,
@ -311,18 +310,6 @@ else:
"convert_slow_tokenizer",
]
try:
if not (is_mistral_common_available()):
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
from .utils import dummy_mistral_common_objects
_import_structure["utils.dummy_mistral_common_objects"] = [
name for name in dir(dummy_mistral_common_objects) if not name.startswith("_")
]
else:
_import_structure["tokenization_mistral_common"] = ["MistralCommonTokenizer"]
# Vision-specific objects
try:
if not is_vision_available():

View File

@ -1157,7 +1157,9 @@ def fram_wave(waveform: np.array, hop_length: int = 160, fft_window_size: int =
frame = waveform[i : i + fft_window_size]
frame_width = frame.shape[0]
if frame_width < waveform.shape[0]:
frame = np.pad(frame, pad_width=(0, fft_window_size - frame_width), mode="constant", constant_values=0)
frame = np.lib.pad(
frame, pad_width=(0, fft_window_size - frame_width), mode="constant", constant_values=0
)
frames.append(frame)
frames = np.stack(frames, 0)

View File

@ -185,6 +185,17 @@ class Cache:
device = self.value_cache[layer_idx].device
self.value_cache[layer_idx] = self.value_cache[layer_idx].index_select(0, beam_idx.to(device))
@property
def seen_tokens(self):
logger.warning_once(
"The `seen_tokens` attribute is deprecated and will be removed in v4.41. Use the `cache_position` "
"model input instead."
)
if hasattr(self, "_seen_tokens"):
return self._seen_tokens
else:
return None
def get_mask_sizes(self, cache_position: torch.Tensor, layer_idx: int) -> tuple[int, int]:
"""
Return a tuple (kv_length, kv_offset) corresponding to the length and offset that will be returned for
@ -461,6 +472,7 @@ class DynamicCache(Cache):
def __init__(self, _distributed_cache_data: Optional[Iterable] = None) -> None:
super().__init__()
self._seen_tokens = 0 # Used in `generate` to keep tally of how many tokens the cache has seen
self.key_cache: list[torch.Tensor] = []
self.value_cache: list[torch.Tensor] = []
@ -523,6 +535,10 @@ class DynamicCache(Cache):
Return:
A tuple containing the updated key and value states.
"""
# Update the number of seen tokens
if layer_idx == 0:
self._seen_tokens += key_states.shape[-2]
# Update the cache
if key_states is not None:
if len(self.key_cache) <= layer_idx:
@ -589,6 +605,7 @@ class DynamicCache(Cache):
if self.get_seq_length() <= max_length:
return
self._seen_tokens = max_length
for idx in range(len(self.key_cache)):
if self.key_cache[idx].numel():
self.key_cache[idx] = self.key_cache[idx][..., :max_length, :]
@ -600,6 +617,7 @@ class DynamicCache(Cache):
out = []
for i in range(0, full_batch_size, split_size):
current_split = DynamicCache()
current_split._seen_tokens = self._seen_tokens
current_split.key_cache = [tensor[i : i + split_size] for tensor in self.key_cache]
current_split.value_cache = [tensor[i : i + split_size] for tensor in self.value_cache]
out.append(current_split)
@ -797,6 +815,10 @@ class OffloadedCache(DynamicCache):
Return:
A tuple containing the updated key and value states.
"""
# Update the number of seen tokens
if layer_idx == 0:
self._seen_tokens += key_states.shape[-2]
# Update the cache
if len(self.key_cache) < layer_idx:
raise ValueError("OffloadedCache does not support model usage where layers are skipped. Use DynamicCache.")
@ -835,9 +857,6 @@ class QuantizedCache(DynamicCache):
def __init__(self, cache_config: QuantizedCacheConfig) -> None:
super().__init__()
# Used only for QuantCache where the seq-length can't be inferred easily from cache contents
self._seen_tokens = 0
self._quantized_key_cache: list[torch.Tensor] = []
self._quantized_value_cache: list[torch.Tensor] = []
@ -1079,10 +1098,6 @@ class StaticCache(Cache):
Mapping between the layers and its device. This is required when you are manually initializing the cache
and the model is split between different gpus. You can know which layers mapped to which device by
checking the associated device_map: `model.hf_device_map`.
tp_size (`Optional[int]`, *optional*):
The tensor parallel size of the model. This is used to adjust the number of key/value heads in the cache
if the model is using tensor parallelism. If not provided, it defaults to `None`, which means that the
number of key/value heads will not be adjusted.
Example:
@ -1115,7 +1130,6 @@ class StaticCache(Cache):
device: Union[torch.device, str, None] = None,
dtype: torch.dtype = torch.float32,
layer_device_map: Optional[dict[int, Union[str, torch.device, int]]] = None,
tp_size: Optional[int] = None,
) -> None:
super().__init__()
self.max_batch_size = max_batch_size
@ -1130,13 +1144,6 @@ class StaticCache(Cache):
if getattr(config, "num_key_value_heads", None) is None
else config.num_key_value_heads
)
if tp_size is not None and tp_size > 1:
if self.num_key_value_heads % tp_size != 0:
raise ValueError(
f"Number of key value heads {self.num_key_value_heads} must be divisible by tensor parallel size {tp_size}."
)
# If the model is using tensor parallelism, we need to adjust the number of heads accordingly.
self.num_key_value_heads //= tp_size
self.key_cache: list[torch.Tensor] = []
self.value_cache: list[torch.Tensor] = []
@ -1393,19 +1400,6 @@ class EncoderDecoderCache(Cache):
for layer_idx in range(len(cross_attention_cache.key_cache)):
self.is_updated[layer_idx] = bool(cross_attention_cache.get_seq_length(layer_idx) > 0)
def __iter__(self):
"""
Support for backwards-compatible `past_key_value` iteration, e.g. `for x in past_key_value:` to iterate over
keys and values
"""
for layer_idx in range(len(self)):
yield (
self.self_attention_cache.key_cache[layer_idx],
self.self_attention_cache.value_cache[layer_idx],
self.cross_attention_cache.key_cache[layer_idx],
self.cross_attention_cache.value_cache[layer_idx],
)
def __getitem__(self, layer_idx: int) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
"""
Support for backwards-compatible `past_key_value` indexing, e.g. `past_key_value[0][0].shape[2]` to get the
@ -1579,10 +1573,6 @@ class HybridCache(Cache):
Mapping between the layers and its device. This is required when you are manually initializing the cache
and the model is split between different gpus. You can know which layers mapped to which device by
checking the associated device_map: `model.hf_device_map`.
tp_size (`Optional[int]`, *optional*):
The tensor parallel size of the model. This is used to adjust the number of key/value heads in the cache
if the model is using tensor parallelism. If not provided, it defaults to `None`, which means that the
number of key/value heads will not be adjusted.
Example:
@ -1614,7 +1604,6 @@ class HybridCache(Cache):
device: Union[torch.device, str, None] = None,
dtype: torch.dtype = torch.float32,
layer_device_map: Optional[dict[int, Union[str, torch.device, int]]] = None,
tp_size: Optional[int] = None,
) -> None:
super().__init__()
if not hasattr(config, "sliding_window") or config.sliding_window is None:
@ -1638,13 +1627,6 @@ class HybridCache(Cache):
if getattr(config, "num_key_value_heads", None) is None
else config.num_key_value_heads
)
if tp_size is not None and tp_size > 1:
if self.num_key_value_heads % tp_size != 0:
raise ValueError(
f"Number of key value heads {self.num_key_value_heads} must be divisible by tensor parallel size {tp_size}."
)
# If the model is using tensor parallelism, we need to adjust the number of heads accordingly.
self.num_key_value_heads //= tp_size
# If the attribute does not exist in the config, fallback to a simple StaticCache
if hasattr(config, "layer_types"):
@ -2215,10 +2197,6 @@ class OffloadedStaticCache(StaticCache):
Mapping between the layers and its device. This is required when you are manually initializing the cache
and the model is split between different gpus. You can know which layers mapped to which device by
checking the associated device_map: `model.hf_device_map`.
tp_size (`Optional[int]`, *optional*):
The tensor parallel size of the model. This is used to adjust the number of key/value heads in the cache
if the model is using tensor parallelism. If not provided, it defaults to `None`, which means that the
number of key/value heads will not be adjusted.
Example:
@ -2250,7 +2228,6 @@ class OffloadedStaticCache(StaticCache):
dtype: Optional[torch.dtype] = None,
offload_device: Union[str, torch.device] = torch.device("cpu"),
layer_device_map: Optional[dict[int, Union[str, torch.device, int]]] = None,
tp_size: Optional[int] = None,
) -> None:
super(Cache, self).__init__()
@ -2274,13 +2251,6 @@ class OffloadedStaticCache(StaticCache):
if getattr(config, "num_key_value_heads", None) is None
else config.num_key_value_heads
)
if tp_size is not None and tp_size > 1:
if num_key_value_heads % tp_size != 0:
raise ValueError(
f"Number of key value heads {num_key_value_heads} must be divisible by tensor parallel size {tp_size}."
)
# If the model is using tensor parallelism, we need to adjust the number of heads accordingly.
num_key_value_heads //= tp_size
cache_shape = (max_batch_size, num_key_value_heads, self.max_cache_len, head_dim)
@ -2307,6 +2277,10 @@ class OffloadedStaticCache(StaticCache):
self._device_key_cache.append(key_cache)
self._device_value_cache.append(value_cache)
# For backwards compatibility.
# TODO(gante): Remove this.
self._seen_tokens = 0
# Create new CUDA stream for parallel prefetching.
self._prefetch_stream = torch.cuda.Stream() if self.device.type == "cuda" else None
@ -2340,6 +2314,10 @@ class OffloadedStaticCache(StaticCache):
value_states = value_states.to(self.value_cache[layer_idx].dtype)
if layer_idx == 0:
# Update seen tokens.
# TODO(gante): Remove this.
self._seen_tokens += key_states.shape[-2]
# Always there.
k_out = self.key_cache[0]
v_out = self.value_cache[0]
@ -2393,14 +2371,10 @@ class OffloadedStaticCache(StaticCache):
return k_out, v_out
def get_seq_length(self, layer_idx: Optional[int] = 0) -> int:
"""Returns the sequence length of the cached states. A layer index can be optionally passed."""
is_empty_layer = (
len(self.key_cache) == 0 # no cache in any layer
or len(self.key_cache) <= layer_idx # hasn't run a layer with cache after it
or not self.key_cache[layer_idx].numel() # the layer has no cache
)
layer_seq_length = self.key_cache[layer_idx].shape[-2] if not is_empty_layer else 0
return layer_seq_length
"""Returns the sequence length of the cached states that were seen by the model."""
# TODO(gante): Remove this.
return self._seen_tokens
def get_max_cache_shape(self) -> Optional[int]:
"""Returns the maximum sequence length of the cached states."""
@ -2410,12 +2384,22 @@ class OffloadedStaticCache(StaticCache):
def reset(self) -> None:
"""Resets the cache values while preserving the objects."""
# For backwards compatibility.
# TODO(gante): Remove this.
self._seen_tokens = 0
# Zero out cache.
for layer_idx in range(len(self.key_cache)):
# In-place ops prevent breaking the static address.
self.key_cache[layer_idx].zero_()
self.value_cache[layer_idx].zero_()
@property
def seen_tokens(self) -> int:
# For backwards compatibility.
# TODO(gante): Remove this.
return self._seen_tokens
def _create_key_value_cache_tensors(
self, shape: tuple[int, ...], device: torch.device
) -> tuple[torch.Tensor, torch.Tensor]:

View File

@ -14,7 +14,6 @@
import asyncio
import copy
import json
import os
import platform
@ -452,13 +451,11 @@ class ChatCommand(BaseTransformersCLICommand):
)
return processed_generate_flags
def get_generation_parameterization(
self, args: ChatArguments, model_generation_config: GenerationConfig
) -> tuple[GenerationConfig, dict]:
def get_generation_parameterization(self, args: ChatArguments) -> tuple[GenerationConfig, dict]:
"""
Returns a GenerationConfig object holding the generation parameters for the CLI command.
"""
# No generation config arg provided -> use model's default generation config, then apply CLI defaults
# No generation config arg provided -> use base generation config, apply CLI defaults
if args.generation_config is not None:
if ".json" in args.generation_config: # is a local file
dirname = os.path.dirname(args.generation_config)
@ -470,8 +467,7 @@ class ChatCommand(BaseTransformersCLICommand):
# !!!!!!!!!
# This is a chat session, so we have a few non-standard defaults
# !!!!!!!!!
generation_config = copy.deepcopy(model_generation_config)
generation_config.update({"do_sample": True, "max_new_tokens": 256})
generation_config = GenerationConfig(do_sample=True, max_new_tokens=256)
# Finally: parse and apply `generate_flags`
parsed_generate_flags = self.parse_generate_flags(args.generate_flags)
@ -679,8 +675,7 @@ class ChatCommand(BaseTransformersCLICommand):
else:
user = args.user
model_generation_config = GenerationConfig.from_pretrained(args.model_name_or_path)
generation_config, model_kwargs = self.get_generation_parameterization(args, model_generation_config)
generation_config, model_kwargs = self.get_generation_parameterization(args)
interface = RichInterface(model_name=args.model_name_or_path, user_name=user)
interface.clear()
@ -720,7 +715,7 @@ class ChatCommand(BaseTransformersCLICommand):
stream=True,
extra_body={
"request_id": request_id,
"generation_config": generation_config.to_json_string(),
"generation_config": {**generation_config.to_dict()},
"model": model,
},
)

View File

@ -11,7 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import copy
import functools
import json
import re
@ -21,7 +20,12 @@ from dataclasses import dataclass, field
from threading import Thread
from typing import Any, Optional
from huggingface_hub import ModelInfo, model_info
from huggingface_hub import (
ChatCompletionStreamOutputDeltaToolCall,
ChatCompletionStreamOutputFunction,
ModelInfo,
model_info,
)
from transformers.utils.import_utils import is_fastapi_available, is_pydantic_available, is_uvicorn_available
@ -82,9 +86,6 @@ if is_pydantic_available() and is_fastapi_available() and is_uvicorn_available()
# tool_prompt: Optional[str] = None
# top_logprobs: Optional[int] = None
# transformers-specific request fields
generation_config: Optional[str] = None
logger = logging.get_logger(__name__)
@ -109,35 +110,26 @@ def serve_command_factory(args: Namespace):
return ServeCommand(args)
def create_generation_config_from_req(
req: "ChatCompletionInput", model_generation_config: "GenerationConfig", **kwargs
) -> "GenerationConfig":
def create_generation_config_from_req(req: "ChatCompletionInput", **kwargs) -> "GenerationConfig":
"""
Creates a generation config from the parameters of the request. If a generation config is passed in the request,
it will be used as a baseline for parameterization. Otherwise, we will use the model's default generation config.
Other parameters in the request will be applied on top of the baseline.
Creates a generation config from the parameters of the request. Note that we can pass a `GenerationConfig`
(serialized into a `dict`) in `extra_body`, for full `generate` parameterization.
Args:
req (`ChatCompletionInput`):
The request which may optionally contain generation parameters.
model_generation_config (`GenerationConfig`):
The model's default generation config.
req (`ChatCompletionInput`): The request which may optionally contain generation parameters.
Returns:
The prepared `GenerationConfig` object.
"""
# If there is a generation config in the request, it is a json string serialization from a `GenerationConfig`
# object. For simplicity, flags set here take precedence over all other flags.
if req.generation_config is not None:
generation_config = GenerationConfig(**json.loads(req.generation_config))
else:
generation_config = copy.deepcopy(model_generation_config)
if req.extra_body is not None and "generation_config" in req.extra_body:
for key in req.extra_body["generation_config"].keys():
if key in ChatCompletionInput.base_field_names.keys():
raise ValueError("error: Duplicated key in the root request and in the passed generation config.")
non_standard_kwargs = generation_config.update(**kwargs)
# Set extra kwargs that are not in the `GenerationConfig` class (e.g. continuous batching flags)
for k, v in non_standard_kwargs.items():
if v is not None:
setattr(generation_config, k, v)
if req.extra_body is not None and "generation_config" in req.extra_body:
generation_config = GenerationConfig(**(req.extra_body["generation_config"]), **kwargs)
else:
generation_config = GenerationConfig(**kwargs)
if req.frequency_penalty is not None:
generation_config.repetition_penalty = float(req.frequency_penalty)
@ -275,7 +267,7 @@ class ServeCommand(BaseTransformersCLICommand):
content: Optional[str] = None,
role: Optional[str] = None,
finish_reason: Optional[str] = None,
tool_calls: Optional[list[dict]] = None,
tool_calls: Optional[list[ChatCompletionStreamOutputDeltaToolCall]] = None,
) -> str:
"""
Builds a chunk of a streaming response.
@ -292,7 +284,7 @@ class ServeCommand(BaseTransformersCLICommand):
The role of the next content, until a new role is defined.
finish_reason (`str`, *optional*):
The reason the generation by the model has finished.
tool_calls (`list[dict]`, *optional*):
tool_calls (`list[ChatCompletionStreamOutputDeltaToolCall]`, *optional*):
Data about the tool calls, when they are triggered.
Returns:
@ -366,7 +358,7 @@ class ServeCommand(BaseTransformersCLICommand):
{
"id": model.id,
"object": "model",
"created": model.created_at.timestamp(),
"crated": model.created_at.timestamp(),
"owned_by": model.author,
}
for model in get_text_gen_models()
@ -388,7 +380,6 @@ class ServeCommand(BaseTransformersCLICommand):
generation_config = create_generation_config_from_req(
req,
model_generation_config=self.model.generation_config,
eos_token_id=self.tokenizer.eos_token_id,
pad_token_id=self.tokenizer.pad_token_id,
use_cache=False,
@ -422,10 +413,6 @@ class ServeCommand(BaseTransformersCLICommand):
)
queue_is_flushed = False
# Emit the assistant role to start the stream. Other chunks won't have a role, as it is implicit
# they come from the assistant.
yield self.build_chunk(request_id, role="assistant")
for result in self.running_continuous_batching_manager:
if result.request_id != request_id:
continue
@ -437,12 +424,14 @@ class ServeCommand(BaseTransformersCLICommand):
queue_is_flushed = True
finish_reason = "stop" if result.status == RequestStatus.FINISHED else None
if result.status == RequestStatus.FINISHED:
yield self.build_chunk(request_id, finish_reason=finish_reason)
break
else:
yield self.build_chunk(request_id=request_id, content=result.next_token)
yield self.build_chunk(
request_id=request_id, content=result.next_token, finish_reason=finish_reason
)
if result.status == RequestStatus.FINISHED:
break
yield "data: [DONE]\n\n"
except Exception as e:
logger.error(str(e))
yield f'data: {{"error": "{str(e)}"}}'
@ -518,10 +507,7 @@ class ServeCommand(BaseTransformersCLICommand):
generation_streamer = TextIteratorStreamer(self.tokenizer, skip_special_tokens=True, skip_prompt=True)
generation_config = create_generation_config_from_req(
req,
model_generation_config=self.model.generation_config,
)
generation_config = create_generation_config_from_req(req)
max_new_tokens = req.max_tokens or generation_config.max_new_tokens or 1024
generation_config.max_new_tokens = max_new_tokens
@ -584,12 +570,14 @@ class ServeCommand(BaseTransformersCLICommand):
else:
tool_name = tool_name.group(1)
tool_state.has_tool_name_defined = True
tool = {
"function": {"name": tool_name},
"index": 0,
"type": "function",
"id": _request_id + "_tool_call", # Only the first tool call delta has an id
}
tool = ChatCompletionStreamOutputDeltaToolCall(
function=ChatCompletionStreamOutputFunction(
name=tool_name,
),
index=0,
type="function",
id=_request_id + "_tool_call", # Only the first tool call delta has an id
)
# Second step: extract tool arguments. The tool arguments can be seen as a json string
# within the tool json string. We emit a delta for the arguments.
@ -609,11 +597,13 @@ class ServeCommand(BaseTransformersCLICommand):
if tool_state.arg_nesting_level < 0:
result = "".join(result.split("}")[:-2]) + "}" # e.g. "4}}\n" -> "4}"
tool = {
"function": {"arguments": result},
"index": 0,
"type": "function",
}
tool = ChatCompletionStreamOutputDeltaToolCall(
function=ChatCompletionStreamOutputFunction(
arguments=result,
),
index=0,
type="function",
)
yield self.build_chunk(_request_id, tool_calls=[tool])
continue

View File

@ -106,5 +106,4 @@ deps = {
"opentelemetry-api": "opentelemetry-api",
"opentelemetry-exporter-otlp": "opentelemetry-exporter-otlp",
"opentelemetry-sdk": "opentelemetry-sdk",
"mistral-common[opencv]": "mistral-common[opencv]>=1.6.3",
}

View File

@ -162,7 +162,6 @@ class PagedAttentionCache(Cache):
dtype: torch.dtype = torch.float16,
layer_device_map: Optional[dict[int, Union[str, torch.device, int]]] = None,
initial_prompt_shapes: Optional[list[list[int]]] = None,
tp_size: Optional[int] = None,
) -> None:
"""Initialize a paged attention cache for efficient memory usage.
@ -197,16 +196,7 @@ class PagedAttentionCache(Cache):
self.block_size = block_size
self.num_blocks = num_blocks
num_key_value_heads = self.num_key_value_heads
if tp_size is not None and tp_size > 1:
if num_key_value_heads % tp_size != 0:
raise ValueError(
f"Number of key value heads {num_key_value_heads} must be divisible by tensor parallel size {tp_size}."
)
# If the model is using tensor parallelism, we need to adjust the number of heads accordingly.
num_key_value_heads //= tp_size
self.cache_shape = (num_key_value_heads, num_blocks, self.block_size, self.head_dim)
self.cache_shape = (self.num_key_value_heads, num_blocks, self.block_size, self.head_dim)
self.dtype = dtype
self.device = device
@ -650,7 +640,7 @@ def compute_optimal_blocks(
memory_per_token = 2 * num_kv_heads * head_dim * dtype_size * num_hidden_layers # For K and V caches
# Estimate sequence length requirements
tokens_to_generate = getattr(generation_config, "max_new_tokens") or 20
tokens_to_generate = getattr(generation_config, "max_new_tokens", 20)
if median_prefill_length is None and inputs:
non_empty_inputs = [len(seq) for seq in inputs if seq]
@ -1291,7 +1281,6 @@ class ContinuousBatchingManager:
self.generation_config,
self.model.device,
self.model.dtype,
tp_size=getattr(self.model, "tp_size"),
)
scheduler = None

View File

@ -1963,9 +1963,6 @@ class GenerationMixin(ContinuousMixin):
"device": device,
"layer_device_map": layer_device_map,
}
if cache_implementation in ["static", "hybrid", "offloaded_static"]:
cache_kwargs.update({"tp_size": self.tp_size})
self._cache = cache_cls(**cache_kwargs)
if requires_cross_attention_cache:
encoder_kwargs = cache_kwargs.copy()
@ -1989,7 +1986,6 @@ class GenerationMixin(ContinuousMixin):
and "zamba" not in self.__class__.__name__.lower()
and "bamba" not in self.__class__.__name__.lower()
and "minimax" not in self.__class__.__name__.lower()
and "lfm2" not in self.__class__.__name__.lower()
)
def _prepare_cache_for_generation(

View File

@ -13,7 +13,6 @@
# limitations under the License.
from collections.abc import Iterable
from copy import deepcopy
from functools import lru_cache, partial
from typing import Any, Optional, TypedDict, Union
@ -230,7 +229,7 @@ class BaseImageProcessorFast(BaseImageProcessor):
if kwarg is not None:
setattr(self, key, kwarg)
else:
setattr(self, key, deepcopy(getattr(self, key, None)))
setattr(self, key, getattr(self, key, None))
# get valid kwargs names
self._valid_kwargs_names = list(self.valid_kwargs.__annotations__.keys())

View File

@ -15,7 +15,7 @@ from typing import Callable, Optional
import torch
from ..cache_utils import DynamicCache, EncoderDecoderCache, HybridCache, StaticCache
from ..cache_utils import DynamicCache, HybridCache, StaticCache
from ..generation.configuration_utils import GenerationConfig
from ..masking_utils import (
ALL_MASK_ATTENTION_FUNCTIONS,
@ -548,7 +548,7 @@ class Seq2SeqLMDecoderExportableModuleWithStaticCache(torch.nn.Module):
self.lm_head = model.lm_head
self.config = model.config
# Initialize static cache for decoder and DynamicCache for encoder
# Initialize static cache
self.static_cache = StaticCache(
config=self.config,
max_batch_size=batch_size,
@ -556,7 +556,6 @@ class Seq2SeqLMDecoderExportableModuleWithStaticCache(torch.nn.Module):
device="cpu",
dtype=torch.float32,
)
self.cache = EncoderDecoderCache(self.static_cache, DynamicCache())
# Register cache buffers to make them exportable
for i in range(len(self.static_cache.key_cache)):
@ -568,7 +567,7 @@ class Seq2SeqLMDecoderExportableModuleWithStaticCache(torch.nn.Module):
outputs = self.decoder(
input_ids=decoder_input_ids,
encoder_hidden_states=encoder_hidden_states,
past_key_values=self.cache,
past_key_values=self.static_cache,
use_cache=True,
cache_position=cache_position,
)

View File

@ -607,7 +607,7 @@ class AttentionMaskInterface(GeneralInterface):
ALL_MASK_ATTENTION_FUNCTIONS: AttentionMaskInterface = AttentionMaskInterface()
def find_packed_sequence_indices(position_ids: torch.Tensor) -> torch.Tensor:
def find_packed_sequence_indices(position_ids: torch.Tensor) -> Optional[torch.Tensor]:
"""
Find the indices of the sequence to which each new query token in the sequence belongs when using packed
tensor format (i.e. several sequences packed in the same batch dimension).
@ -721,7 +721,7 @@ def create_causal_mask(
attention_mask: Optional[torch.Tensor],
cache_position: torch.Tensor,
past_key_values: Optional[Cache],
position_ids: Optional[torch.Tensor] = None,
position_ids: Optional[torch.Tensor],
or_mask_function: Optional[Callable] = None,
and_mask_function: Optional[Callable] = None,
) -> Optional[Union[torch.Tensor, BlockMask]]:
@ -810,7 +810,7 @@ def create_sliding_window_causal_mask(
attention_mask: Optional[torch.Tensor],
cache_position: torch.Tensor,
past_key_values: Optional[Cache],
position_ids: Optional[torch.Tensor] = None,
position_ids: Optional[torch.Tensor],
or_mask_function: Optional[Callable] = None,
and_mask_function: Optional[Callable] = None,
) -> Optional[Union[torch.Tensor, BlockMask]]:
@ -905,7 +905,7 @@ def create_chunked_causal_mask(
attention_mask: Optional[torch.Tensor],
cache_position: torch.Tensor,
past_key_values: Optional[Cache],
position_ids: Optional[torch.Tensor] = None,
position_ids: Optional[torch.Tensor],
or_mask_function: Optional[Callable] = None,
and_mask_function: Optional[Callable] = None,
) -> Optional[Union[torch.Tensor, BlockMask]]:
@ -1014,7 +1014,7 @@ def create_masks_for_generate(
attention_mask: Optional[torch.Tensor],
cache_position: torch.Tensor,
past_key_values: Optional[Cache],
position_ids: Optional[torch.Tensor] = None,
position_ids: Optional[torch.Tensor],
or_mask_function: Optional[Callable] = None,
and_mask_function: Optional[Callable] = None,
**kwargs,

View File

@ -208,8 +208,6 @@ def _get_unpad_data(attention_mask: torch.Tensor) -> tuple[torch.Tensor, torch.T
"""
seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
# NOTE: Similar to the `.item()` in prepare_fa2_from_position_ids, with torch compile,
# this might cause a graph break
max_seqlen_in_batch = seqlens_in_batch.max().item()
cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))
return (
@ -344,13 +342,7 @@ def _prepare_flash_attention_from_position_ids(query, key, value, position_ids):
)
)
# NOTE: With torch compile, this will cause a graph break if you don't set
# `TORCHDYNAMO_CAPTURE_SCALAR_OUTPUTS=1` in the environment or call
# `torch._dynamo.config.capture_scalar_outputs = True` before doing the forward pass.
# This is a limitation of flash attention API, as the function `flash_attn_varlen_func`
# requires `max_length_q`, `max_length_k` to be passed as `int` and not `torch.Tensor`.
# https://github.com/Dao-AILab/flash-attention/blob/2dd8078adc1d9b74e315ee99718c0dea0de8eeb6/flash_attn/flash_attn_interface.py#L1423-L1424
max_length = position_ids.max().item() + 1
max_length = position_ids.max() + 1
return (query, key, value, indices_q, (cu_seq_lens, cu_seq_lens), (max_length, max_length))

View File

@ -93,6 +93,7 @@ def _compute_default_rope_parameters(
config: Optional[PretrainedConfig] = None,
device: Optional["torch.device"] = None,
seq_len: Optional[int] = None,
**rope_kwargs,
) -> tuple["torch.Tensor", float]:
"""
Computes the inverse frequencies according to the original RoPE implementation
@ -103,14 +104,25 @@ def _compute_default_rope_parameters(
The device to use for initialization of the inverse frequencies.
seq_len (`int`, *optional*):
The current sequence length. Unused for this type of RoPE.
rope_kwargs (`Dict`, *optional*):
BC compatibility with the previous RoPE class instantiation, will be removed in v4.45.
Returns:
Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
"""
base = config.rope_theta
partial_rotary_factor = config.partial_rotary_factor if hasattr(config, "partial_rotary_factor") else 1.0
head_dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads
dim = int(head_dim * partial_rotary_factor)
if config is not None and len(rope_kwargs) > 0:
raise ValueError(
"Unexpected arguments: `**rope_kwargs` and `config` are mutually exclusive in "
f"`_compute_default_rope_parameters`, got `rope_kwargs`={rope_kwargs} and `config`={config}"
)
if len(rope_kwargs) > 0:
base = rope_kwargs["base"]
dim = rope_kwargs["dim"]
elif config is not None:
base = config.rope_theta
partial_rotary_factor = config.partial_rotary_factor if hasattr(config, "partial_rotary_factor") else 1.0
head_dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads
dim = int(head_dim * partial_rotary_factor)
attention_factor = 1.0 # Unused in this type of RoPE
@ -123,6 +135,7 @@ def _compute_linear_scaling_rope_parameters(
config: Optional[PretrainedConfig] = None,
device: Optional["torch.device"] = None,
seq_len: Optional[int] = None,
**rope_kwargs,
) -> tuple["torch.Tensor", float]:
"""
Computes the inverse frequencies with linear scaling. Credits to the Reddit user /u/kaiokendev
@ -133,14 +146,24 @@ def _compute_linear_scaling_rope_parameters(
The device to use for initialization of the inverse frequencies.
seq_len (`int`, *optional*):
The current sequence length. Unused for this type of RoPE.
rope_kwargs (`Dict`, *optional*):
BC compatibility with the previous RoPE class instantiation, will be removed in v4.45.
Returns:
Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
"""
factor = config.rope_scaling["factor"]
if config is not None and len(rope_kwargs) > 0:
raise ValueError(
"Unexpected arguments: `**rope_kwargs` and `config` are mutually exclusive in "
f"`_compute_linear_scaling_rope_parameters`, got `rope_kwargs`={rope_kwargs} and `config`={config}"
)
if len(rope_kwargs) > 0:
factor = rope_kwargs["factor"]
elif config is not None:
factor = config.rope_scaling["factor"]
# Gets the default RoPE parameters
inv_freq, attention_factor = _compute_default_rope_parameters(config, device, seq_len)
inv_freq, attention_factor = _compute_default_rope_parameters(config, device, seq_len, **rope_kwargs)
# Then applies linear scaling to the frequencies.
# NOTE: originally, scaling was applied to the position_ids. However, we get `embs = inv_freq @ position_ids`, so
@ -153,6 +176,7 @@ def _compute_dynamic_ntk_parameters(
config: Optional[PretrainedConfig] = None,
device: Optional["torch.device"] = None,
seq_len: Optional[int] = None,
**rope_kwargs,
) -> tuple["torch.Tensor", float]:
"""
Computes the inverse frequencies with NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla
@ -163,17 +187,30 @@ def _compute_dynamic_ntk_parameters(
The device to use for initialization of the inverse frequencies.
seq_len (`int`, *optional*):
The current sequence length, used to update the dynamic RoPE at inference time.
rope_kwargs (`Dict`, *optional*):
BC compatibility with the previous RoPE class instantiation, will be removed in v4.45.
Returns:
Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
"""
# TODO (joao): use the new `original_max_position_embeddings` from rope_scaling
base = config.rope_theta
partial_rotary_factor = config.partial_rotary_factor if hasattr(config, "partial_rotary_factor") else 1.0
head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
dim = int(head_dim * partial_rotary_factor)
max_position_embeddings = config.max_position_embeddings
factor = config.rope_scaling["factor"]
if config is not None and len(rope_kwargs) > 0:
raise ValueError(
"Unexpected arguments: `**rope_kwargs` and `config` are mutually exclusive in "
f"`_compute_dynamic_ntk_parameters`, got `rope_kwargs`={rope_kwargs} and `config`={config}"
)
if len(rope_kwargs) > 0:
base = rope_kwargs["base"]
dim = rope_kwargs["dim"]
max_position_embeddings = rope_kwargs["max_position_embeddings"]
factor = rope_kwargs["factor"]
elif config is not None:
base = config.rope_theta
partial_rotary_factor = config.partial_rotary_factor if hasattr(config, "partial_rotary_factor") else 1.0
head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
dim = int(head_dim * partial_rotary_factor)
max_position_embeddings = config.max_position_embeddings
factor = config.rope_scaling["factor"]
attention_factor = 1.0 # Unused in this type of RoPE
@ -195,7 +232,7 @@ def _compute_dynamic_ntk_parameters(
def _compute_yarn_parameters(
config: PretrainedConfig, device: "torch.device", seq_len: Optional[int] = None
config: PretrainedConfig, device: "torch.device", seq_len: Optional[int] = None, **rope_kwargs
) -> tuple["torch.Tensor", float]:
"""
Computes the inverse frequencies with NTK scaling. Please refer to the
@ -207,10 +244,17 @@ def _compute_yarn_parameters(
The device to use for initialization of the inverse frequencies.
seq_len (`int`, *optional*):
The current sequence length. Unused for this type of RoPE.
rope_kwargs (`Dict`, *optional*):
BC compatibility with the previous RoPE class instantiation, will be removed in v4.45.
Returns:
Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
post-processing scaling factor applied to the computed cos/sin.
"""
# No need to keep BC with yarn, unreleased when this new pattern was created.
if len(rope_kwargs) > 0:
raise ValueError(
f"Unexpected arguments: `**rope_kwargs` should be unset in `_compute_yarn_parameters`, got {rope_kwargs}"
)
base = config.rope_theta
partial_rotary_factor = config.partial_rotary_factor if hasattr(config, "partial_rotary_factor") else 1.0
@ -284,7 +328,7 @@ def _compute_yarn_parameters(
def _compute_longrope_parameters(
config: PretrainedConfig, device: "torch.device", seq_len: Optional[int] = None
config: PretrainedConfig, device: "torch.device", seq_len: Optional[int] = None, **rope_kwargs
) -> tuple["torch.Tensor", float]:
"""
Computes the inverse frequencies with LongRoPE scaling. Please refer to the
@ -296,11 +340,20 @@ def _compute_longrope_parameters(
The device to use for initialization of the inverse frequencies.
seq_len (`int`, *optional*):
The current sequence length.
rope_kwargs (`Dict`, *optional*):
BC compatibility with the previous RoPE class instantiation, will be removed in v4.45.
Returns:
Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
post-processing scaling factor applied to the computed cos/sin.
"""
# TODO (joao): use the new `original_max_position_embeddings` from rope_scaling
# No need to keep BC with longrope, unreleased when this new pattern was created.
if len(rope_kwargs) > 0:
raise ValueError(
"Unexpected arguments: `**rope_kwargs` should be unset in `_compute_longrope_parameters`, got "
f"{rope_kwargs}"
)
base = config.rope_theta
partial_rotary_factor = config.partial_rotary_factor if hasattr(config, "partial_rotary_factor") else 1.0
head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
@ -338,7 +391,7 @@ def _compute_longrope_parameters(
def _compute_llama3_parameters(
config: PretrainedConfig, device: "torch.device", seq_len: Optional[int] = None
config: PretrainedConfig, device: "torch.device", seq_len: Optional[int] = None, **rope_kwargs
) -> tuple["torch.Tensor", float]:
"""
Computes the inverse frequencies for llama 3.1.
@ -350,12 +403,14 @@ def _compute_llama3_parameters(
The device to use for initialization of the inverse frequencies.
seq_len (`int`, *optional*):
The current sequence length. Unused for this type of RoPE.
rope_kwargs (`Dict`, *optional*):
BC compatibility with the previous RoPE class instantiation, will be removed in v4.45.
Returns:
Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
post-processing scaling factor applied to the computed cos/sin.
"""
# Gets the default RoPE parameters
inv_freq, attention_factor = _compute_default_rope_parameters(config, device, seq_len)
inv_freq, attention_factor = _compute_default_rope_parameters(config, device, seq_len, **rope_kwargs)
factor = config.rope_scaling["factor"] # `8` in the original implementation
low_freq_factor = config.rope_scaling["low_freq_factor"] # `1` in the original implementation

View File

@ -240,7 +240,6 @@ VLMS = [
"mistral3",
"mllama",
"paligemma",
"shieldgemma2",
"qwen2vl",
"qwen2_5_vl",
"videollava",
@ -1961,8 +1960,11 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, PushToHubMixin, PeftAdapterMi
supports_gradient_checkpointing = False
_is_stateful = False
# Flash Attention support
_supports_flash_attn = False
# Flash Attention 2 support
_supports_flash_attn_2 = False
# Flash Attention 3 support
_supports_flash_attn_3 = False
# SDPA support
_supports_sdpa = False
@ -2071,15 +2073,12 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, PushToHubMixin, PeftAdapterMi
"`PretrainedConfig`. To create a model from a pretrained model use "
f"`model = {self.__class__.__name__}.from_pretrained(PRETRAINED_MODEL_NAME)`"
)
if not getattr(config, "_attn_implementation_autoset", False):
# config usually has a `torch_dtype` but we need the next line for the `no_super_init` tests
dtype = config.torch_dtype if hasattr(config, "torch_dtype") else torch.get_default_dtype()
config = self._autoset_attn_implementation(config, torch_dtype=dtype, check_device_map=False)
self.config = config
# The `hasattr` here is used as some Transformers tests for some reason do not call
# PretrainedConfig __init__ (e.g. test_no_super_init_config_and_model)
if hasattr(config, "_attn_implementation_internal") and not getattr(
config, "_attn_implementation_autoset", False
):
self.set_attention_implementation(self.config._attn_implementation_internal)
# for initialization of the loss
loss_type = self.__class__.__name__
if loss_type not in LOSS_MAPPING:
@ -2226,11 +2225,19 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, PushToHubMixin, PeftAdapterMi
config = copy.deepcopy(config) # We do not want to modify the config inplace in _from_config.
if config._attn_implementation_internal is not None:
# In this case, the config has been created with the attn_implementation set by the user, which we should respect.
# In this case, the config has been created with the attn_implementation set by the user, which we
# should respect.
attn_implementation = config._attn_implementation_internal
else:
attn_implementation = None
config._attn_implementation = kwargs.pop("attn_implementation", attn_implementation)
if not getattr(config, "_attn_implementation_autoset", False):
config = cls._autoset_attn_implementation(
config,
check_device_map=False,
torch_dtype=torch_dtype,
)
if is_deepspeed_zero3_enabled() and not _is_quantized and not _is_ds_init_called:
logger.info("Detected DeepSpeed ZeRO-3: activating zero.init() for this model")
@ -2252,65 +2259,81 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, PushToHubMixin, PeftAdapterMi
return model
@classmethod
def _check_attn_implementation(cls, attn_implementation: Union[str, dict]) -> Union[str, dict]:
def _autoset_attn_implementation(
cls,
config,
torch_dtype: Optional[torch.dtype] = None,
device_map: Optional[Union[str, dict[str, int]]] = None,
check_device_map: bool = True,
):
"""
Checks that the requested attention implementation exists and tries to get the kernel from the Hub
if `attn_implementation` matches the HF kernels pattern.
Automatically checks and dispatches to a default attention implementation. In order of priority:
1. An implementation specified in `config._attn_implementation` (set, for example, via the argument attn_implementation="sdpa" in from_pretrained).
2. SDPA implementation, if available and supported by the model type (`LlamaSdpaAttention` for example).
3. The model's default implementation otherwise (`LlamaAttention` for example).
"""
if isinstance(attn_implementation, str) and re.match(r"^[^/:]+/[^/:]+:[^/:]+$", attn_implementation):
if not is_kernels_available():
raise ValueError("kernels is not installed. Please install it with `pip install kernels`.")
# Here we use config._attn_implementation_internal to check whether the attention implementation was explicitly set by the user.
# The property `PretrainedConfig._attn_implementation` is never `None`, for backward compatibility (always fall back on "eager").
# The `hasattr` here is used as some Transformers tests for some reason do not call PretrainedConfig __init__ (e.g. test_no_super_init_config_and_model)
requested_attn_implementation = None
if hasattr(config, "_attn_implementation_internal") and config._attn_implementation_internal is not None:
if isinstance(config._attn_implementation, str) and re.match(
r"^[^/:]+/[^/:]+:[^/:]+$", config._attn_implementation
):
if not is_kernels_available():
raise ValueError("kernels is not installed. Please install it with `pip install kernels`.")
# Extract repo_id and kernel_name from the string
repo_id, kernel_name = attn_implementation.split(":")
kernel_name = kernel_name.strip()
repo_id = repo_id.strip()
# Extract repo_id and kernel_name from the string
repo_id, kernel_name = config._attn_implementation.split(":")
kernel_name = kernel_name.strip()
repo_id = repo_id.strip()
try:
kernel = get_kernel(repo_id)
ALL_ATTENTION_FUNCTIONS.register(f"kernel_{repo_id.replace('/', '_')}", getattr(kernel, kernel_name))
attn_implementation = f"kernel_{repo_id.replace('/', '_')}"
except FileNotFoundError as e:
logger.warning(
f"Could not find a kernel repository '{repo_id}' compatible with your devicein the hub: {e}. Using eager attention implementation instead."
)
attn_implementation = None # try to dispatch SDPA and fallback eager if not available
except AttributeError:
raise ValueError(
"the kernel function name or class specified in the attn_implementation argument is not valid. \
Please check the documentation for the correct format, \
and check that the kernel exports the class and the function correctly."
)
if (
not isinstance(attn_implementation, dict)
and attn_implementation not in ["eager", None] + ALL_ATTENTION_FUNCTIONS.valid_keys()
):
message = f'Specified `attn_implementation="{attn_implementation}"` is not supported. The only possible arguments are `attn_implementation="eager"` (manual attention implementation)'
# check `supports_flash_attn_2` for BC with custom code. TODO: remove after a few releases
if cls._supports_flash_attn or getattr(cls, "_supports_flash_attn_2", False):
message += (
', `"attn_implementation=flash_attention_3"` (implementation using flash attention 3)'
', `"attn_implementation=flash_attention_2"` (implementation using flash attention 2)'
)
if cls._supports_sdpa:
message += ', `"attn_implementation=sdpa"` (implementation using torch.nn.functional.scaled_dot_product_attention)'
if cls._supports_flex_attn:
message += ', `"attn_implementation=flex_attention"` (implementation using torch\'s flex_attention)'
raise ValueError(message + ".")
try:
kernel = get_kernel(repo_id)
ALL_ATTENTION_FUNCTIONS.register(
f"kernel_{repo_id.replace('/', '_')}", getattr(kernel, kernel_name)
)
config._attn_implementation = f"kernel_{repo_id.replace('/', '_')}"
except FileNotFoundError as e:
logger.warning(
f"Could not find a kernel repository '{repo_id}' compatible with your devicein the hub: {e}. Using eager attention implementation instead."
)
config._attn_implementation = "eager"
except AttributeError:
raise ValueError(
"the kernel function name or class specified in the attn_implementation argument is not valid. \
Please check the documentation for the correct format, \
and check that the kernel exports the class and the function correctly."
)
return attn_implementation
if (
not isinstance(config._attn_implementation, dict)
and config._attn_implementation not in ["eager"] + ALL_ATTENTION_FUNCTIONS.valid_keys()
):
message = f'Specified `attn_implementation="{config._attn_implementation}"` is not supported. The only possible arguments are `attn_implementation="eager"` (manual attention implementation)'
if cls._supports_flash_attn_3:
message += ', `"attn_implementation=flash_attention_3"` (implementation using flash attention 3)'
if cls._supports_flash_attn_2:
message += ', `"attn_implementation=flash_attention_2"` (implementation using flash attention 2)'
if cls._supports_sdpa:
message += ', `"attn_implementation=sdpa"` (implementation using torch.nn.functional.scaled_dot_product_attention)'
if cls._supports_flex_attn:
message += (
', `"attn_implementation=flex_attention"` (implementation using torch\'s flex_attention)'
)
raise ValueError(message + ".")
def set_attention_implementation(self, attn_implementation: Union[str, dict]):
"""
Checks and dispatches to the requested attention implementation.
"""
requested_attn_implementation = self._check_attn_implementation(attn_implementation)
# If a config is passed with a preset attn_implementation, we skip the automatic dispatch and use the user-provided config, with hard checks that the requested attention implementation is available.
requested_attn_implementation = config._attn_implementation_internal
# Composite models consisting of several PretrainedModels can specify attention implementation as a dict where
# keys are sub-config names. But most people will specify one `str` which means that should dispatch it for all sub-models.
# See https://github.com/huggingface/transformers/pull/32238
for key in self.config.sub_configs.keys():
sub_config = getattr(self.config, key)
# Composite models consisting of several PretrainedModels have to specify attention impl as a dict
# where keys are sub-config names. But most people will specify one `str` which means that should dispatch it
# for all sub-models.
# Below we check if a config is composite and manually prepare a dict of attn impl if not already passed as a dict.
# Later each sub-module will dispatch with its own attn impl, by calling `XXXModel._from_config(config.text_config)`
# If any of sub-modules doesn't support requested attn, an error will be raised. See https://github.com/huggingface/transformers/pull/32238
for key in config.sub_configs.keys():
sub_config = getattr(config, key)
curr_attn_implementation = (
requested_attn_implementation
if not isinstance(requested_attn_implementation, dict)
@ -2325,26 +2348,50 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, PushToHubMixin, PeftAdapterMi
):
sub_config._attn_implementation_internal = curr_attn_implementation
if requested_attn_implementation == "flash_attention_3" and self._flash_attn_3_can_dispatch():
self.config._attn_implementation = "flash_attention_3"
if requested_attn_implementation == "flash_attention_2" and self._flash_attn_2_can_dispatch():
self.config._attn_implementation = "flash_attention_2"
elif requested_attn_implementation == "flex_attention" and self._flex_attn_can_dispatch():
self.config._attn_implementation = "flex_attention"
elif (
requested_attn_implementation in [None, "sdpa"]
and not is_torch_xla_available()
and self._sdpa_can_dispatch(hard_check_only=requested_attn_implementation is not None)
):
self.config._attn_implementation = "sdpa"
elif requested_attn_implementation in ALL_ATTENTION_FUNCTIONS.valid_keys():
self.config._attn_implementation = requested_attn_implementation
elif isinstance(requested_attn_implementation, dict):
self.config._attn_implementation = requested_attn_implementation.get("", None)
else:
self.config._attn_implementation = "eager"
if config._attn_implementation == "flash_attention_3":
cls._check_and_enable_flash_attn_3(
config,
torch_dtype=torch_dtype,
device_map=device_map,
hard_check_only=False,
check_device_map=check_device_map,
)
elif config._attn_implementation == "flash_attention_2":
cls._check_and_enable_flash_attn_2(
config,
torch_dtype=torch_dtype,
device_map=device_map,
hard_check_only=False,
check_device_map=check_device_map,
)
elif requested_attn_implementation == "flex_attention":
config = cls._check_and_enable_flex_attn(config, hard_check_only=True)
elif requested_attn_implementation in [None, "sdpa"] and not is_torch_xla_available():
# flash_attention_2 takes priority over SDPA, hence SDPA treated in this elif.
config = cls._check_and_enable_sdpa(
config,
hard_check_only=requested_attn_implementation is not None,
)
self.config._attn_implementation_autoset = True
if (
torch.version.hip is not None
and config._attn_implementation == "sdpa"
and torch.cuda.device_count() > 1
and version.parse(torch.__version__) < version.parse("2.4.1")
):
logger.warning_once(
"Using the `SDPA` attention implementation on multi-gpu setup with ROCM may lead to performance issues due to the FA backend. Disabling it to use alternative backends."
)
torch.backends.cuda.enable_flash_sdp(False)
elif requested_attn_implementation in ALL_ATTENTION_FUNCTIONS.valid_keys():
config._attn_implementation = requested_attn_implementation
elif isinstance(requested_attn_implementation, dict):
config._attn_implementation = None
else:
config._attn_implementation = "eager"
config._attn_implementation_autoset = True
return config
@classmethod
def _set_default_torch_dtype(cls, dtype: torch.dtype) -> torch.dtype:
@ -2418,21 +2465,24 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, PushToHubMixin, PeftAdapterMi
# Otherwise, can't generate
return False
def _flash_attn_2_can_dispatch(self) -> bool:
@classmethod
def _check_and_enable_flash_attn_2(
cls,
config,
torch_dtype: Optional[torch.dtype] = None,
device_map: Optional[Union[str, dict[str, int]]] = None,
check_device_map: bool = True,
hard_check_only: bool = False,
) -> PretrainedConfig:
"""
Checks the availability of Flash Attention 2 and compatibility with the current model.
If all checks pass and `hard_check_only` is False, the method will set the config attribute `attn_implementation` to "flash_attention_2" so that the model can initialize the correct attention module.
"""
# Config always has `torch_dtype` but we need the next line for `no_super_init()` tests
torch_dtype = self.config.torch_dtype if hasattr(self.config, "torch_dtype") else torch.get_default_dtype()
device_map = self.hf_device_map if hasattr(self, "hf_device_map") else None
# check `supports_flash_attn_2` for BC with custom code. TODO: remove after a few releases
if not (self._supports_flash_attn or getattr(self, "_supports_flash_attn_2", False)):
if not cls._supports_flash_attn_2:
raise ValueError(
f"{self.__class__.__name__} does not support Flash Attention 2.0 yet. Please request to add support where"
f" the model is hosted, on its model hub page: https://huggingface.co/{self.config._name_or_path}/discussions/new"
f"{cls.__name__} does not support Flash Attention 2.0 yet. Please request to add support where"
f" the model is hosted, on its model hub page: https://huggingface.co/{config._name_or_path}/discussions/new"
" or in the Transformers GitHub repo: https://github.com/huggingface/transformers/issues/new"
)
@ -2440,32 +2490,39 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, PushToHubMixin, PeftAdapterMi
preface = "FlashAttention2 has been toggled on, but it cannot be used due to the following error:"
install_message = "Please refer to the documentation of https://huggingface.co/docs/transformers/perf_infer_gpu_one#flashattention-2 to install Flash Attention 2."
# package `flash-attn` can not be installed on Ascend NPU, ignore related validation logic
if importlib.util.find_spec("flash_attn") is None and not is_torch_npu_available():
raise ImportError(f"{preface} the package flash_attn seems to be not installed. {install_message}")
else:
# Check FA2 installed version compatibility
flash_attention_version = version.parse(importlib.metadata.version("flash_attn"))
if torch.version.cuda:
if flash_attention_version < version.parse("2.1.0"):
raise ImportError(
f"{preface} you need flash_attn package version to be greater or equal than 2.1.0. Detected version {flash_attention_version}. {install_message}"
)
elif not torch.cuda.is_available():
raise ValueError(
f"{preface} Flash Attention 2 is not available on CPU. Please make sure torch can access a CUDA device."
)
else:
raise ImportError(f"{preface} Flash Attention 2 is not available. {install_message}")
elif torch.version.hip:
if flash_attention_version < version.parse("2.0.4"):
raise ImportError(
f"{preface} you need flash_attn package version to be greater or equal than 2.0.4. Detected version {flash_attention_version}. {install_message}"
)
else:
raise ImportError(f"{preface} Flash Attention 2 is not available. {install_message}")
if importlib.util.find_spec("flash_attn") is None:
# package `flash-attn` can not be installed on Ascend NPU, ignore related validation logic and early exit.
if is_torch_npu_available():
if not hard_check_only:
config._attn_implementation = "flash_attention_2"
logger.info("Detect using FlashAttention2 on Ascend NPU.")
return config
else:
raise ImportError(f"{preface} the package flash_attn seems to be not installed. {install_message}")
flash_attention_version = version.parse(importlib.metadata.version("flash_attn"))
if torch.version.cuda:
if flash_attention_version < version.parse("2.1.0"):
raise ImportError(
f"{preface} you need flash_attn package version to be greater or equal than 2.1.0. Detected version {flash_attention_version}. {install_message}"
)
elif not torch.cuda.is_available():
raise ValueError(
f"{preface} Flash Attention 2 is not available on CPU. Please make sure torch can access a CUDA device."
)
else:
raise ImportError(f"{preface} Flash Attention 2 is not available. {install_message}")
elif torch.version.hip:
if flash_attention_version < version.parse("2.0.4"):
raise ImportError(
f"{preface} you need flash_attn package version to be greater or equal than 2.0.4. Make sure to have that version installed - detected version {flash_attention_version}. {install_message}"
)
else:
raise ImportError(f"{preface} Flash Attention 2 is not available. {install_message}")
_is_bettertransformer = getattr(cls, "use_bettertransformer", False)
_is_bettertransformer = getattr(self, "use_bettertransformer", False)
if _is_bettertransformer:
raise ValueError(
"Flash Attention 2 and BetterTransformer API are not compatible. Please make sure to disable BetterTransformers by doing model.reverse_bettertransformer()"
@ -2478,13 +2535,13 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, PushToHubMixin, PeftAdapterMi
elif torch_dtype is not None and torch_dtype not in [torch.float16, torch.bfloat16]:
logger.warning_once(
"Flash Attention 2.0 only supports torch.float16 and torch.bfloat16 dtypes, but"
f" the current dype in {self.__class__.__name__} is {torch_dtype}. You should run training or inference using Automatic Mixed-Precision via the `with torch.autocast(device_type='torch_device'):` decorator,"
f" the current dype in {cls.__name__} is {torch_dtype}. You should run training or inference using Automatic Mixed-Precision via the `with torch.autocast(device_type='torch_device'):` decorator,"
' or load the model with the `torch_dtype` argument. Example: `model = AutoModel.from_pretrained("openai/whisper-tiny", attn_implementation="flash_attention_2", torch_dtype=torch.float16)`'
)
# The check `torch.empty(0).device.type != "cuda"` is needed as the model may be initialized after `torch.set_default_device` has been called,
# or the model may be initialized under the context manager `with torch.device("cuda"):`.
if device_map is None and torch.empty(0).device.type not in ["cuda", "mlu"]:
if check_device_map and device_map is None and torch.empty(0).device.type not in ["cuda", "mlu"]:
if torch.cuda.is_available():
logger.warning_once(
"You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU"
@ -2502,7 +2559,8 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, PushToHubMixin, PeftAdapterMi
"or initialising the model on CPU and then moving it to GPU."
)
elif (
device_map is not None
check_device_map
and device_map is not None
and isinstance(device_map, dict)
and ("cpu" in device_map.values() or "disk" in device_map.values())
):
@ -2510,24 +2568,28 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, PushToHubMixin, PeftAdapterMi
"You are attempting to use Flash Attention 2.0 with a model dispatched on CPU or disk. This is not supported. Please make sure to "
"initialise the model on a GPU by passing a device_map that contains only GPU devices as keys."
)
if not hard_check_only:
config._attn_implementation = "flash_attention_2"
return config
# If no error was raised by this point, we can return `True`
return True
def _flash_attn_3_can_dispatch(self) -> bool:
@classmethod
def _check_and_enable_flash_attn_3(
cls,
config,
torch_dtype: Optional[torch.dtype] = None,
device_map: Optional[Union[str, dict[str, int]]] = None,
check_device_map: bool = True,
hard_check_only: bool = False,
) -> PretrainedConfig:
"""
Checks the availability of Flash Attention 3 and compatibility with the current model.
If all checks pass and `hard_check_only` is False, the method will set the config attribute `attn_implementation` to "flash_attention_3" so that the model can initialize the correct attention module.
"""
# Config always has `torch_dtype` but we need the next line for `no_super_init()` tests
torch_dtype = self.config.torch_dtype if hasattr(self.config, "torch_dtype") else torch.get_default_dtype()
device_map = self.hf_device_map if hasattr(self, "hf_device_map") else None
if not self._supports_flash_attn:
if not cls._supports_flash_attn_3:
raise ValueError(
f"{self.__class__.__name__} does not support Flash Attention 3.0 yet. Please request to add support where"
f" the model is hosted, on its model hub page: https://huggingface.co/{self.config._name_or_path}/discussions/new"
f"{cls.__name__} does not support Flash Attention 3.0 yet. Please request to add support where"
f" the model is hosted, on its model hub page: https://huggingface.co/{config._name_or_path}/discussions/new"
" or in the Transformers GitHub repo: https://github.com/huggingface/transformers/issues/new"
)
@ -2557,22 +2619,22 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, PushToHubMixin, PeftAdapterMi
elif torch_dtype is not None and torch_dtype not in [torch.float16, torch.bfloat16]:
logger.warning_once(
"Flash Attention 3 only supports torch.float16 and torch.bfloat16 dtypes, but"
f" the current dype in {self.__class__.__name__} is {torch_dtype}. You should run training or inference using Automatic Mixed-Precision via the `with torch.autocast(device_type='torch_device'):` decorator,"
f" the current dype in {cls.__name__} is {torch_dtype}. You should run training or inference using Automatic Mixed-Precision via the `with torch.autocast(device_type='torch_device'):` decorator,"
' or load the model with the `torch_dtype` argument. Example: `model = AutoModel.from_pretrained("meta-llama/Llama-3.2-1B", attn_implementation="flash_attention_3", torch_dtype=torch.float16)`'
)
if getattr(self.config, "alibi", False) or getattr(self.config, "use_alibi", False):
if getattr(config, "alibi", False) or getattr(config, "use_alibi", False):
raise ValueError("Model is configured to use ALiBi, which is not supported by Flash Attention 3.")
# Check for attention dropout, which is incompatible with FA3
if hasattr(self.config, "attention_dropout") and self.config.attention_dropout > 0:
if hasattr(config, "attention_dropout") and config.attention_dropout > 0:
raise ValueError(
f"Model has attention_dropout={self.config.attention_dropout}, which is not supported by Flash Attention 3."
f"Model has attention_dropout={config.attention_dropout}, which is not supported by Flash Attention 3."
)
# The check `torch.empty(0).device.type != "cuda"` is needed as the model may be initialized after `torch.set_default_device` has been called,
# or the model may be initialized under the context manager `with torch.device("cuda"):`.
if device_map is None and torch.empty(0).device.type not in ["cuda", "mlu"]:
if check_device_map and device_map is None and torch.empty(0).device.type not in ["cuda", "mlu"]:
if torch.cuda.is_available():
logger.warning_once(
"You are attempting to use Flash Attention 3 with a model not initialized on GPU. Make sure to move the model to GPU"
@ -2585,7 +2647,8 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, PushToHubMixin, PeftAdapterMi
"or initialising the model on CPU and then moving it to GPU."
)
elif (
device_map is not None
check_device_map
and device_map is not None
and isinstance(device_map, dict)
and ("cpu" in device_map.values() or "disk" in device_map.values())
):
@ -2593,18 +2656,21 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, PushToHubMixin, PeftAdapterMi
"You are attempting to use Flash Attention 3 with a model dispatched on CPU or disk. This is not supported. Please make sure to "
"initialise the model on a GPU by passing a device_map that contains only GPU devices as keys."
)
return True
if not hard_check_only:
config._attn_implementation = "flash_attention_3"
return config
def _sdpa_can_dispatch(self, hard_check_only: bool = False) -> bool:
@classmethod
def _check_and_enable_sdpa(cls, config, hard_check_only: bool = False):
"""
Checks the availability of SDPA for a given model.
If all checks pass and `hard_check_only` is False, the method will set the config attribute `_attn_implementation` to "sdpa" so that the model can initialize the correct attention module.
"""
if hard_check_only:
if not self._supports_sdpa:
if not cls._supports_sdpa:
raise ValueError(
f"{self.__class__.__name__} does not support an attention implementation through torch.nn.functional.scaled_dot_product_attention yet."
f"{cls.__name__} does not support an attention implementation through torch.nn.functional.scaled_dot_product_attention yet."
" Please request the support for this architecture: https://github.com/huggingface/transformers/issues/28005. If you believe"
' this error is a bug, please open an issue in Transformers GitHub repository and load your model with the argument `attn_implementation="eager"` meanwhile. Example: `model = AutoModel.from_pretrained("openai/whisper-tiny", attn_implementation="eager")`'
)
@ -2613,44 +2679,45 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, PushToHubMixin, PeftAdapterMi
"PyTorch SDPA requirements in Transformers are not met. Please install torch>=2.1.1."
)
if (
torch.version.hip is not None
and torch.cuda.device_count() > 1
and version.parse(torch.__version__) < version.parse("2.4.1")
):
logger.warning_once(
"Using the `SDPA` attention implementation on multi-gpu setup with ROCM may lead to performance issues due to the FA backend. Disabling it to use alternative backends."
)
torch.backends.cuda.enable_flash_sdp(False)
if not is_torch_sdpa_available() or not cls._supports_sdpa:
return config
# This means we have `hard_check_only=False` and fallback to eager if SDPA isn't supported
_is_bettertransformer = getattr(self, "use_bettertransformer", False)
if not is_torch_sdpa_available() or not self._supports_sdpa or _is_bettertransformer:
return False
_is_bettertransformer = getattr(cls, "use_bettertransformer", False)
if _is_bettertransformer:
return config
return True
if not hard_check_only:
config._attn_implementation = "sdpa"
return config
def _flex_attn_can_dispatch(self) -> bool:
@classmethod
def _check_and_enable_flex_attn(cls, config, hard_check_only: bool = False) -> PretrainedConfig:
"""
Checks the availability of Flex Attention for a given model.
If all checks pass and `hard_check_only` is False, the method will set the config attribute `_attn_implementation` to "flex_attention" so that the model can initialize the correct attention module.
"""
if not self._supports_flex_attn:
raise ValueError(
f"{self.__class__.__name__} does not support an attention implementation through torch's flex_attention."
" Please request the support for this architecture: https://github.com/huggingface/transformers/issues/34809."
" If you believe this error is a bug, please open an issue in Transformers GitHub repository"
' and load your model with the argument `attn_implementation="eager"` meanwhile.'
' Example: `model = AutoModel.from_pretrained("openai/whisper-tiny", attn_implementation="eager")`'
)
if not is_torch_flex_attn_available():
raise ImportError(
"PyTorch Flex Attention requirements in Transformers are not met. Please install torch>=2.5.0."
)
if hard_check_only:
if not cls._supports_flex_attn:
raise ValueError(
f"{cls.__name__} does not support an attention implementation through torch's flex_attention."
" Please request the support for this architecture: https://github.com/huggingface/transformers/issues/34809."
" If you believe this error is a bug, please open an issue in Transformers GitHub repository"
' and load your model with the argument `attn_implementation="eager"` meanwhile.'
' Example: `model = AutoModel.from_pretrained("openai/whisper-tiny", attn_implementation="eager")`'
)
if not is_torch_flex_attn_available():
raise ImportError(
"PyTorch Flex Attention requirements in Transformers are not met. Please install torch>=2.5.0."
)
# If no error was raised by this point, we can return `True`
return True
if not is_torch_flex_attn_available() or not cls._supports_flex_attn:
return config
if not hard_check_only:
config._attn_implementation = "flex_attention"
return config
def enable_input_require_grads(self):
"""
@ -3768,23 +3835,27 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, PushToHubMixin, PeftAdapterMi
# We're going to remove aliases before saving
ptrs = collections.defaultdict(list)
for name, tensor in state_dict.items():
if not isinstance(tensor, torch.Tensor):
# Sometimes in the state_dict we have non-tensor objects.
# e.g. in bitsandbytes we have some `str` objects in the state_dict
# Sometimes in the state_dict we have non-tensor objects.
# e.g. in bitsandbytes we have some `str` objects in the state_dict
if isinstance(tensor, torch.Tensor):
ptrs[id_tensor_storage(tensor)].append(name)
else:
# In the non-tensor case, fall back to the pointer of the object itself
ptrs[id(tensor)].append(name)
elif tensor.device.type == "meta":
# In offloaded cases, there may be meta tensors in the state_dict.
# For these cases, key by the pointer of the original tensor object
# (state_dict tensors are detached and therefore no longer shared)
tensor = self.get_parameter(name)
ptrs[id(tensor)].append(name)
# These are all the pointers of shared tensors
if hasattr(self, "hf_device_map"):
# if the model has offloaded parameters, we must check using find_tied_parameters()
tied_params = find_tied_parameters(self)
if tied_params:
tied_names = tied_params[0]
shared_ptrs = {
ptr: names for ptr, names in ptrs.items() if any(name in tied_names for name in names)
}
else:
ptrs[id_tensor_storage(tensor)].append(name)
shared_ptrs = {ptr: names for ptr, names in ptrs.items() if len(names) > 1}
shared_ptrs = {}
else:
shared_ptrs = {ptr: names for ptr, names in ptrs.items() if len(names) > 1}
# Recursively descend to find tied weight keys
_tied_weights_keys = _get_tied_weight_keys(self)
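Side note on the aliasing pass above: bucketing state_dict entries by their underlying storage is what lets saving spot tensors that share memory (tied weights) before safetensors refuses to serialize them. A self-contained sketch of that grouping idea follows; it uses `untyped_storage().data_ptr()` directly instead of the library's `id_tensor_storage` helper and skips the offload handling, so treat it as an illustration only.

import collections

import torch


def find_shared_tensor_groups(state_dict: dict[str, torch.Tensor]) -> list[list[str]]:
    # Bucket names by the data pointer of the underlying storage: entries that
    # alias the same memory land in the same bucket.
    buckets = collections.defaultdict(list)
    for name, tensor in state_dict.items():
        if isinstance(tensor, torch.Tensor) and tensor.device.type != "meta":
            buckets[tensor.untyped_storage().data_ptr()].append(name)
        else:
            # Non-tensor or meta entries: fall back to the Python object id.
            buckets[id(tensor)].append(name)
    return [names for names in buckets.values() if len(names) > 1]


# Tied embeddings share storage, so both names end up in one group.
emb = torch.nn.Embedding(8, 4)
sd = {"embed.weight": emb.weight, "lm_head.weight": emb.weight, "bias": torch.zeros(4)}
print(find_shared_tensor_groups(sd))  # [['embed.weight', 'lm_head.weight']]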
@ -3828,9 +3899,7 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, PushToHubMixin, PeftAdapterMi
if len(error_names) > 0:
raise RuntimeError(
f"The weights trying to be saved contained shared tensors {error_names} that are mismatching "
"the transformers base configuration. Try saving using `safe_serialization=False`, setting the "
"`_dynamic_tied_weights_keys` attribute for affected modules, or remove this tensor sharing.",
f"The weights trying to be saved contained shared tensors {error_names} that are mismatching the transformers base configuration. Try saving using `safe_serialization=False` or remove this tensor sharing.",
)
# Shard the model if it is too big.
@ -4425,9 +4494,6 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, PushToHubMixin, PeftAdapterMi
raise ValueError("device_mesh must be 1 dimensional and will be used for TP")
device_map = torch.device(device_mesh.device_type, int(os.environ["LOCAL_RANK"]))
if tp_size is None:
tp_size = torch.distributed.get_world_size()
if use_auth_token is not None:
warnings.warn(
"The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers. Please use `token` instead.",
@ -4735,6 +4801,13 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, PushToHubMixin, PeftAdapterMi
model_init_context = cls.get_init_context(is_quantized, _is_ds_init_called)
config = copy.deepcopy(config) # We do not want to modify the config inplace in from_pretrained.
if not getattr(config, "_attn_implementation_autoset", False):
config = cls._autoset_attn_implementation(
config,
torch_dtype=torch_dtype,
device_map=device_map,
)
with ContextManagers(model_init_context):
# Let's make sure we don't run the init function of buffer modules
model = cls(config, *model_args, **model_kwargs)
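To round off this file's hunks, here is how the `attn_implementation` argument validated above is typically passed at load time. The second and third calls are hypothetical: `some-org/some-vlm` stands in for any composite (multi-sub-config) checkpoint, and `org/kernel-repo:my_attention` is only an example of the `repo_id:kernel_name` string shape matched by the regex in the dispatch code, not a real repository.

import torch

from transformers import AutoModel, AutoModelForCausalLM

# Plain string: one backend for the whole model.
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-3.2-1B",
    attn_implementation="flash_attention_2",
    torch_dtype=torch.bfloat16,
)

# Dict form for composite models: keys are sub-config names (see the sub_configs loop above).
vlm = AutoModel.from_pretrained(
    "some-org/some-vlm",  # hypothetical composite checkpoint
    attn_implementation={"text_config": "flash_attention_2", "vision_config": "sdpa"},
)

# Hub kernel form, "<repo_id>:<kernel_name>", matching ^[^/:]+/[^/:]+:[^/:]+$ above.
kernel_model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-3.2-1B",
    attn_implementation="org/kernel-repo:my_attention",  # hypothetical kernel repo
)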

View File

@ -82,7 +82,6 @@ if TYPE_CHECKING:
from .deberta import *
from .deberta_v2 import *
from .decision_transformer import *
from .deepseek_v2 import *
from .deepseek_v3 import *
from .deformable_detr import *
from .deit import *
@ -168,7 +167,6 @@ if TYPE_CHECKING:
from .layoutxlm import *
from .led import *
from .levit import *
from .lfm2 import *
from .lightglue import *
from .lilt import *
from .llama import *
@ -207,7 +205,6 @@ if TYPE_CHECKING:
from .mobilevit import *
from .mobilevitv2 import *
from .modernbert import *
from .modernbert_decoder import *
from .moonshine import *
from .moshi import *
from .mpnet import *
@ -238,7 +235,6 @@ if TYPE_CHECKING:
from .pegasus import *
from .pegasus_x import *
from .perceiver import *
from .perception_lm import *
from .persimmon import *
from .phi import *
from .phi3 import *

View File

@ -444,7 +444,8 @@ class Aimv2PreTrainedModel(PreTrainedModel):
"Aimv2TextEmbeddings",
]
_supports_sdpa = True
_supports_flash_attn = True
_supports_flash_attn_2 = True
_supports_flash_attn_3 = True
_supports_flex_attn = True
def _init_weights(self, module):

View File

@ -441,7 +441,8 @@ class Aimv2PreTrainedModel(PreTrainedModel):
"Aimv2TextEmbeddings",
]
_supports_sdpa = True
_supports_flash_attn = True
_supports_flash_attn_2 = True
_supports_flash_attn_3 = True
_supports_flex_attn = True
def _init_weights(self, module):

View File

@ -19,7 +19,7 @@ Image/Text processor class for ALIGN
from typing import Union
from ...image_utils import ImageInput
from ...processing_utils import ProcessingKwargs, ProcessorMixin, Unpack
from ...processing_utils import ProcessingKwargs, ProcessorMixin, Unpack, _validate_images_text_input_order
from ...tokenization_utils_base import BatchEncoding, PreTokenizedInput, TextInput
@ -110,6 +110,8 @@ class AlignProcessor(ProcessorMixin):
"""
if text is None and images is None:
raise ValueError("You must specify either text or images.")
# check if images and text inputs are reversed for BC
images, text = _validate_images_text_input_order(images, text)
output_kwargs = self._merge_kwargs(
AlignProcessorKwargs,

View File

@ -313,7 +313,8 @@ class ArceePreTrainedModel(PreTrainedModel):
supports_gradient_checkpointing = True
_no_split_modules = ["ArceeDecoderLayer"]
_skip_keys_device_placement = ["past_key_values"]
_supports_flash_attn = True
_supports_flash_attn_2 = True
_supports_flash_attn_3 = True
_supports_sdpa = True
_supports_flex_attn = True
_supports_cache_class = True

View File

@ -629,7 +629,7 @@ class AriaTextPreTrainedModel(PreTrainedModel):
_no_split_modules = ["AriaTextDecoderLayer", "AriaGroupedExpertsGemm"]
supports_gradient_checkpointing = True
_skip_keys_device_placement = "past_key_values"
_supports_flash_attn = False
_supports_flash_attn_2 = False
_supports_sdpa = True
_supports_cache_class = True
_supports_attention_backend = True
@ -661,7 +661,8 @@ class AriaPreTrainedModel(PreTrainedModel):
supports_gradient_checkpointing = True
_no_split_modules = ["AriaDecoderLayer"]
_skip_keys_device_placement = ["past_key_values"]
_supports_flash_attn = True
_supports_flash_attn_2 = True
_supports_flash_attn_3 = True
_supports_sdpa = True
_supports_flex_attn = True
_supports_cache_class = True
@ -919,7 +920,7 @@ class AriaCausalLMOutputWithPast(ModelOutput):
Language modeling loss (for next-token prediction).
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
`(batch_size, num_heads, sequence_length, embed_size_per_head)`)
@ -946,7 +947,7 @@ class AriaCausalLMOutputWithPast(ModelOutput):
)
class AriaModelOutputWithPast(BaseModelOutputWithPast):
r"""
past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
`(batch_size, num_heads, sequence_length, embed_size_per_head)`)
@ -1033,7 +1034,7 @@ class AriaModel(AriaPreTrainedModel):
pixel_mask: torch.LongTensor = None,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_values: Optional[Cache] = None,
past_key_values: Optional[list[torch.FloatTensor]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
@ -1191,7 +1192,7 @@ class AriaForConditionalGeneration(AriaPreTrainedModel, GenerationMixin):
pixel_mask: torch.LongTensor = None,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_values: Optional[Cache] = None,
past_key_values: Optional[list[torch.FloatTensor]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
use_cache: Optional[bool] = None,

View File

@ -18,7 +18,6 @@ from typing import Optional, Union
import numpy as np
from ...activations import ACT2FN
from ...cache_utils import Cache
from ...configuration_utils import PretrainedConfig
from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_patch_output_size, select_best_resolution
from ...image_transforms import PaddingMode, convert_to_rgb, pad, resize, to_channel_dimension_format
@ -1284,7 +1283,7 @@ class AriaTextPreTrainedModel(PreTrainedModel):
_no_split_modules = ["AriaTextDecoderLayer", "AriaGroupedExpertsGemm"]
supports_gradient_checkpointing = True
_skip_keys_device_placement = "past_key_values"
_supports_flash_attn = False
_supports_flash_attn_2 = False
_supports_sdpa = True
_supports_cache_class = True
_supports_attention_backend = True
@ -1432,7 +1431,7 @@ class AriaModel(LlavaModel):
pixel_mask: torch.LongTensor = None,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_values: Optional[Cache] = None,
past_key_values: Optional[list[torch.FloatTensor]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
@ -1529,7 +1528,7 @@ class AriaForConditionalGeneration(LlavaForConditionalGeneration):
pixel_mask: torch.LongTensor = None,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_values: Optional[Cache] = None,
past_key_values: Optional[list[torch.FloatTensor]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
use_cache: Optional[bool] = None,

View File

@ -375,7 +375,8 @@ class ASTPreTrainedModel(PreTrainedModel):
main_input_name = "input_values"
supports_gradient_checkpointing = True
_supports_sdpa = True
_supports_flash_attn = True
_supports_flash_attn_2 = True
_supports_flash_attn_3 = True
_supports_flex_attn = True
_supports_attention_backend = True

View File

@ -101,7 +101,6 @@ CONFIG_MAPPING_NAMES = OrderedDict[str, str](
("deberta", "DebertaConfig"),
("deberta-v2", "DebertaV2Config"),
("decision_transformer", "DecisionTransformerConfig"),
("deepseek_v2", "DeepseekV2Config"),
("deepseek_v3", "DeepseekV3Config"),
("deformable_detr", "DeformableDetrConfig"),
("deit", "DeiTConfig"),
@ -201,7 +200,6 @@ CONFIG_MAPPING_NAMES = OrderedDict[str, str](
("layoutlmv3", "LayoutLMv3Config"),
("led", "LEDConfig"),
("levit", "LevitConfig"),
("lfm2", "Lfm2Config"),
("lightglue", "LightGlueConfig"),
("lilt", "LiltConfig"),
("llama", "LlamaConfig"),
@ -241,7 +239,6 @@ CONFIG_MAPPING_NAMES = OrderedDict[str, str](
("mobilevit", "MobileViTConfig"),
("mobilevitv2", "MobileViTV2Config"),
("modernbert", "ModernBertConfig"),
("modernbert-decoder", "ModernBertDecoderConfig"),
("moonshine", "MoonshineConfig"),
("moshi", "MoshiConfig"),
("mpnet", "MPNetConfig"),
@ -273,8 +270,6 @@ CONFIG_MAPPING_NAMES = OrderedDict[str, str](
("pegasus", "PegasusConfig"),
("pegasus_x", "PegasusXConfig"),
("perceiver", "PerceiverConfig"),
("perception_encoder", "TimmWrapperConfig"),
("perception_lm", "PerceptionLMConfig"),
("persimmon", "PersimmonConfig"),
("phi", "PhiConfig"),
("phi3", "Phi3Config"),
@ -486,7 +481,6 @@ MODEL_NAMES_MAPPING = OrderedDict[str, str](
("deberta", "DeBERTa"),
("deberta-v2", "DeBERTa-v2"),
("decision_transformer", "Decision Transformer"),
("deepseek_v2", "DeepSeek-V2"),
("deepseek_v3", "DeepSeek-V3"),
("deformable_detr", "Deformable DETR"),
("deit", "DeiT"),
@ -595,7 +589,6 @@ MODEL_NAMES_MAPPING = OrderedDict[str, str](
("layoutxlm", "LayoutXLM"),
("led", "LED"),
("levit", "LeViT"),
("lfm2", "Lfm2"),
("lightglue", "LightGlue"),
("lilt", "LiLT"),
("llama", "LLaMA"),
@ -643,7 +636,6 @@ MODEL_NAMES_MAPPING = OrderedDict[str, str](
("mobilevit", "MobileViT"),
("mobilevitv2", "MobileViTV2"),
("modernbert", "ModernBERT"),
("modernbert-decoder", "ModernBertDecoder"),
("moonshine", "Moonshine"),
("moshi", "Moshi"),
("mpnet", "MPNet"),
@ -677,8 +669,6 @@ MODEL_NAMES_MAPPING = OrderedDict[str, str](
("pegasus", "Pegasus"),
("pegasus_x", "PEGASUS-X"),
("perceiver", "Perceiver"),
("perception_encoder", "PerceptionEncoder"),
("perception_lm", "PerceptionLM"),
("persimmon", "Persimmon"),
("phi", "Phi"),
("phi3", "Phi3"),
@ -886,7 +876,6 @@ SPECIAL_MODEL_TYPE_TO_MODULE_NAME = OrderedDict[str, str](
("llama4_text", "llama4"),
("blip_2_qformer", "blip_2"),
("fastspeech2_conformer_with_hifigan", "fastspeech2_conformer"),
("perception_encoder", "perception_lm"),
]
)

View File

@ -134,7 +134,6 @@ else:
("owlvit", ("OwlViTImageProcessor", "OwlViTImageProcessorFast")),
("paligemma", ("SiglipImageProcessor", "SiglipImageProcessorFast")),
("perceiver", ("PerceiverImageProcessor", "PerceiverImageProcessorFast")),
("perception_lm", ("PerceptionLMImageProcessorFast",)),
("phi4_multimodal", ("Phi4MultimodalImageProcessorFast",)),
("pix2struct", ("Pix2StructImageProcessor",)),
("pixtral", ("PixtralImageProcessor", "PixtralImageProcessorFast")),
@ -600,6 +599,7 @@ class AutoImageProcessor:
raise ValueError(
"This image processor cannot be instantiated. Please make sure you have `Pillow` installed."
)
raise ValueError(
f"Unrecognized image processor in {pretrained_model_name_or_path}. Should have a "
f"`image_processor_type` key in its {IMAGE_PROCESSOR_NAME} of {CONFIG_NAME}, or one of the following "

View File

@ -95,7 +95,6 @@ MODEL_MAPPING_NAMES = OrderedDict(
("deberta", "DebertaModel"),
("deberta-v2", "DebertaV2Model"),
("decision_transformer", "DecisionTransformerModel"),
("deepseek_v2", "DeepseekV2Model"),
("deepseek_v3", "DeepseekV3Model"),
("deformable_detr", "DeformableDetrModel"),
("deit", "DeiTModel"),
@ -190,7 +189,6 @@ MODEL_MAPPING_NAMES = OrderedDict(
("layoutlmv3", "LayoutLMv3Model"),
("led", "LEDModel"),
("levit", "LevitModel"),
("lfm2", "Lfm2Model"),
("lightglue", "LightGlueForKeypointMatching"),
("lilt", "LiltModel"),
("llama", "LlamaModel"),
@ -230,7 +228,6 @@ MODEL_MAPPING_NAMES = OrderedDict(
("mobilevit", "MobileViTModel"),
("mobilevitv2", "MobileViTV2Model"),
("modernbert", "ModernBertModel"),
("modernbert-decoder", "ModernBertDecoderModel"),
("moonshine", "MoonshineModel"),
("moshi", "MoshiModel"),
("mpnet", "MPNetModel"),
@ -261,8 +258,6 @@ MODEL_MAPPING_NAMES = OrderedDict(
("pegasus", "PegasusModel"),
("pegasus_x", "PegasusXModel"),
("perceiver", "PerceiverModel"),
("perception_encoder", "PerceptionEncoder"),
("perception_lm", "PerceptionLMModel"),
("persimmon", "PersimmonModel"),
("phi", "PhiModel"),
("phi3", "Phi3Model"),
@ -582,7 +577,6 @@ MODEL_FOR_CAUSAL_LM_MAPPING_NAMES = OrderedDict(
("ctrl", "CTRLLMHeadModel"),
("data2vec-text", "Data2VecTextForCausalLM"),
("dbrx", "DbrxForCausalLM"),
("deepseek_v2", "DeepseekV2ForCausalLM"),
("deepseek_v3", "DeepseekV3ForCausalLM"),
("diffllama", "DiffLlamaForCausalLM"),
("doge", "DogeForCausalLM"),
@ -618,7 +612,6 @@ MODEL_FOR_CAUSAL_LM_MAPPING_NAMES = OrderedDict(
("helium", "HeliumForCausalLM"),
("jamba", "JambaForCausalLM"),
("jetmoe", "JetMoeForCausalLM"),
("lfm2", "Lfm2ForCausalLM"),
("llama", "LlamaForCausalLM"),
("llama4", "Llama4ForCausalLM"),
("llama4_text", "Llama4ForCausalLM"),
@ -632,7 +625,6 @@ MODEL_FOR_CAUSAL_LM_MAPPING_NAMES = OrderedDict(
("mistral", "MistralForCausalLM"),
("mixtral", "MixtralForCausalLM"),
("mllama", "MllamaForCausalLM"),
("modernbert-decoder", "ModernBertDecoderForCausalLM"),
("moshi", "MoshiForCausalLM"),
("mpt", "MptForCausalLM"),
("musicgen", "MusicgenForCausalLM"),
@ -946,7 +938,6 @@ MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES = OrderedDict(
("mistral3", "Mistral3ForConditionalGeneration"),
("mllama", "MllamaForConditionalGeneration"),
("paligemma", "PaliGemmaForConditionalGeneration"),
("perception_lm", "PerceptionLMForConditionalGeneration"),
("pix2struct", "Pix2StructForConditionalGeneration"),
("pixtral", "LlavaForConditionalGeneration"),
("qwen2_5_vl", "Qwen2_5_VLForConditionalGeneration"),
@ -1117,7 +1108,6 @@ MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES = OrderedDict(
("data2vec-text", "Data2VecTextForSequenceClassification"),
("deberta", "DebertaForSequenceClassification"),
("deberta-v2", "DebertaV2ForSequenceClassification"),
("deepseek_v2", "DeepseekV2ForSequenceClassification"),
("diffllama", "DiffLlamaForSequenceClassification"),
("distilbert", "DistilBertForSequenceClassification"),
("doge", "DogeForSequenceClassification"),
@ -1160,7 +1150,6 @@ MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES = OrderedDict(
("mixtral", "MixtralForSequenceClassification"),
("mobilebert", "MobileBertForSequenceClassification"),
("modernbert", "ModernBertForSequenceClassification"),
("modernbert-decoder", "ModernBertDecoderForSequenceClassification"),
("mpnet", "MPNetForSequenceClassification"),
("mpt", "MptForSequenceClassification"),
("mra", "MraForSequenceClassification"),
@ -2000,12 +1989,11 @@ class AutoModelForVideoClassification(_BaseAutoModelClass):
AutoModelForVideoClassification = auto_class_update(AutoModelForVideoClassification, head_doc="video classification")
# Private on purpose, the public class will add the deprecation warnings.
class _AutoModelForVision2Seq(_BaseAutoModelClass):
class AutoModelForVision2Seq(_BaseAutoModelClass):
_model_mapping = MODEL_FOR_VISION_2_SEQ_MAPPING
_AutoModelForVision2Seq = auto_class_update(_AutoModelForVision2Seq, head_doc="vision-to-text modeling")
AutoModelForVision2Seq = auto_class_update(AutoModelForVision2Seq, head_doc="vision-to-text modeling")
class AutoModelForImageTextToText(_BaseAutoModelClass):
@ -2104,26 +2092,6 @@ class AutoModelWithLMHead(_AutoModelWithLMHead):
return super().from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
class AutoModelForVision2Seq(_AutoModelForVision2Seq):
@classmethod
def from_config(cls, config):
warnings.warn(
"The class `AutoModelForVision2Seq` is deprecated and will be removed in v5.0. Please use "
"`AutoModelForImageTextToText` instead.",
FutureWarning,
)
return super().from_config(config)
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
warnings.warn(
"The class `AutoModelForVision2Seq` is deprecated and will be removed in v5.0. Please use "
"`AutoModelForImageTextToText` instead.",
FutureWarning,
)
return super().from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
__all__ = [
"MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING",
"MODEL_FOR_AUDIO_FRAME_CLASSIFICATION_MAPPING",

View File

@ -101,7 +101,6 @@ PROCESSOR_MAPPING_NAMES = OrderedDict(
("owlv2", "Owlv2Processor"),
("owlvit", "OwlViTProcessor"),
("paligemma", "PaliGemmaProcessor"),
("perception_lm", "PerceptionLMProcessor"),
("phi4_multimodal", "Phi4MultimodalProcessor"),
("pix2struct", "Pix2StructProcessor"),
("pixtral", "PixtralProcessor"),

View File

@ -21,8 +21,6 @@ import warnings
from collections import OrderedDict
from typing import Any, Optional, Union
from transformers.utils.import_utils import is_mistral_common_available
from ...configuration_utils import PretrainedConfig
from ...dynamic_module_utils import get_class_from_dynamic_module, resolve_trust_remote_code
from ...modeling_gguf_pytorch_utils import load_gguf_checkpoint
@ -179,13 +177,6 @@ TOKENIZER_MAPPING_NAMES = OrderedDict[str, tuple[Optional[str], Optional[str]]](
"DebertaV2TokenizerFast" if is_tokenizers_available() else None,
),
),
(
"deepseek_v2",
(
"LlamaTokenizer" if is_sentencepiece_available() else None,
"LlamaTokenizerFast" if is_tokenizers_available() else None,
),
),
(
"deepseek_v3",
(
@ -389,19 +380,15 @@ TOKENIZER_MAPPING_NAMES = OrderedDict[str, tuple[Optional[str], Optional[str]]](
(
"mistral",
(
"MistralCommonTokenizer"
if is_mistral_common_available()
else ("LlamaTokenizer" if is_sentencepiece_available() else None),
"LlamaTokenizerFast" if is_tokenizers_available() and not is_mistral_common_available() else None,
"LlamaTokenizer" if is_sentencepiece_available() else None,
"LlamaTokenizerFast" if is_tokenizers_available() else None,
),
),
(
"mixtral",
(
"MistralCommonTokenizer"
if is_mistral_common_available()
else ("LlamaTokenizer" if is_sentencepiece_available() else None),
"LlamaTokenizerFast" if is_tokenizers_available() and not is_mistral_common_available() else None,
"LlamaTokenizer" if is_sentencepiece_available() else None,
"LlamaTokenizerFast" if is_tokenizers_available() else None,
),
),
("mllama", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)),
@ -496,15 +483,7 @@ TOKENIZER_MAPPING_NAMES = OrderedDict[str, tuple[Optional[str], Optional[str]]](
("phimoe", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)),
("phobert", ("PhobertTokenizer", None)),
("pix2struct", ("T5Tokenizer", "T5TokenizerFast" if is_tokenizers_available() else None)),
(
"pixtral",
(
None,
"MistralCommonTokenizer"
if is_mistral_common_available()
else ("PreTrainedTokenizerFast" if is_tokenizers_available() else None),
),
),
("pixtral", (None, "PreTrainedTokenizerFast" if is_tokenizers_available() else None)),
("plbart", ("PLBartTokenizer" if is_sentencepiece_available() else None, None)),
("prophetnet", ("ProphetNetTokenizer", None)),
("qdqbert", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)),
@ -735,10 +714,8 @@ def tokenizer_class_from_name(class_name: str) -> Union[type[Any], None]:
for module_name, tokenizers in TOKENIZER_MAPPING_NAMES.items():
if class_name in tokenizers:
module_name = model_type_to_module_name(module_name)
if module_name in ["mistral", "mixtral"] and class_name == "MistralCommonTokenizer":
module = importlib.import_module(".tokenization_mistral_common", "transformers")
else:
module = importlib.import_module(f".{module_name}", "transformers.models")
module = importlib.import_module(f".{module_name}", "transformers.models")
try:
return getattr(module, class_name)
except AttributeError:

View File

@ -26,7 +26,6 @@ import torch
from torch import nn
from ...activations import ACT2FN
from ...cache_utils import Cache
from ...generation import GenerationMixin
from ...modeling_flash_attention_utils import FlashAttentionKwargs
from ...modeling_outputs import BaseModelOutputWithPast, ModelOutput
@ -94,7 +93,8 @@ class AyaVisionPreTrainedModel(PreTrainedModel):
supports_gradient_checkpointing = True
_skip_keys_device_placement = "past_key_values"
_supports_cache_class = True
_supports_flash_attn = True
_supports_flash_attn_2 = True
_supports_flash_attn_3 = True
_supports_sdpa = True
_supports_quantized_cache = False
_supports_static_cache = False
@ -129,7 +129,7 @@ class AyaVisionCausalLMOutputWithPast(ModelOutput):
Language modeling loss (for next-token prediction).
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
`(batch_size, num_heads, sequence_length, embed_size_per_head)`)
@ -156,7 +156,7 @@ class AyaVisionCausalLMOutputWithPast(ModelOutput):
)
class AyaVisionModelOutputWithPast(BaseModelOutputWithPast):
r"""
past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
`(batch_size, num_heads, sequence_length, embed_size_per_head)`)
@ -261,7 +261,7 @@ class AyaVisionModel(AyaVisionPreTrainedModel):
pixel_values: torch.FloatTensor = None,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_values: Optional[Cache] = None,
past_key_values: Optional[list[torch.FloatTensor]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
vision_feature_layer: Optional[Union[int, list[int]]] = None,
vision_feature_select_strategy: Optional[str] = None,
@ -413,7 +413,7 @@ class AyaVisionForConditionalGeneration(AyaVisionPreTrainedModel, GenerationMixi
pixel_values: Optional[torch.FloatTensor] = None,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_values: Optional[Cache] = None,
past_key_values: Optional[list[torch.FloatTensor]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
vision_feature_layer: Optional[Union[int, list[int]]] = None,
vision_feature_select_strategy: Optional[str] = None,

View File

@ -29,7 +29,6 @@ from transformers.models.llava.modeling_llava import (
)
from ...activations import ACT2FN
from ...cache_utils import Cache
from ...modeling_flash_attention_utils import FlashAttentionKwargs
from ...processing_utils import Unpack
from ...utils import auto_docstring, can_return_tuple, is_torchdynamo_compiling, logging
@ -182,7 +181,7 @@ class AyaVisionModel(LlavaModel):
pixel_values: torch.FloatTensor = None,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_values: Optional[Cache] = None,
past_key_values: Optional[list[torch.FloatTensor]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
vision_feature_layer: Optional[Union[int, list[int]]] = None,
vision_feature_select_strategy: Optional[str] = None,
@ -268,7 +267,7 @@ class AyaVisionForConditionalGeneration(LlavaForConditionalGeneration):
pixel_values: Optional[torch.FloatTensor] = None,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_values: Optional[Cache] = None,
past_key_values: Optional[list[torch.FloatTensor]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
vision_feature_layer: Optional[Union[int, list[int]]] = None,
vision_feature_select_strategy: Optional[str] = None,

View File

@ -32,7 +32,7 @@ from torch import nn
import transformers.models.jamba.modeling_jamba as modeling_jamba
from transformers.activations import ACT2FN
from ...cache_utils import Cache
from ...cache_utils import Cache # we need __iter__ and __len__ of pkv
from ...generation import GenerationMixin
from ...integrations import use_kernel_forward_from_hub
from ...modeling_attn_mask_utils import AttentionMaskConverter
@ -1039,7 +1039,8 @@ class BambaPreTrainedModel(PreTrainedModel):
supports_gradient_checkpointing = True
_no_split_modules = ["BambaDecoderLayer"]
_skip_keys_device_placement = "past_key_values"
_supports_flash_attn = True
_supports_flash_attn_2 = True
_supports_flash_attn_3 = True
_supports_sdpa = True
_supports_cache_class = True # Note: only supports HybridMambaAttentionDynamicCache
_is_stateful = True

View File

@ -810,7 +810,8 @@ class BambaPreTrainedModel(PreTrainedModel):
supports_gradient_checkpointing = True
_no_split_modules = ["BambaDecoderLayer"]
_skip_keys_device_placement = "past_key_values"
_supports_flash_attn = True
_supports_flash_attn_2 = True
_supports_flash_attn_3 = True
_supports_sdpa = True
_supports_cache_class = True # Note: only supports HybridMambaAttentionDynamicCache
_is_stateful = True

View File

@ -355,7 +355,8 @@ class BarkBlock(GradientCheckpointingLayer):
class BarkPreTrainedModel(PreTrainedModel):
config_class = BarkConfig
supports_gradient_checkpointing = False
_supports_flash_attn = True
_supports_flash_attn_2 = True
_supports_flash_attn_3 = True
def _init_weights(self, module):
"""Initialize the weights."""
@ -1683,6 +1684,42 @@ class BarkModel(BarkPreTrainedModel):
return audio
@classmethod
def _check_and_enable_flash_attn_2(
cls,
config,
torch_dtype: Optional[torch.dtype] = None,
device_map: Optional[Union[str, dict[str, int]]] = None,
hard_check_only: bool = False,
check_device_map: bool = False,
):
"""
`_check_and_enable_flash_attn_2` originally doesn't expand flash attention enabling to the model
sub-configurations. We override the original method to make sure that Bark sub-models use Flash Attention
if necessary.
If you don't know about Flash Attention, check out the official repository of flash attention:
https://github.com/Dao-AILab/flash-attention
Flash Attention 1.0 can be used directly via the `BetterTransformer` API; have a look at this
specific section of the documentation to learn more about it:
https://huggingface.co/docs/transformers/main/en/perf_infer_gpu_one#decoder-models
The method checks that the current setup is compatible with Flash Attention, as it requires the model to be in
half precision and not run on the CPU.
If all checks pass and `hard_check_only` is False, the method will set the config attribute `_attn_implementation` to "flash_attention_2" so that the model
can initialize the correct attention module
"""
config = super()._check_and_enable_flash_attn_2(
config, torch_dtype, device_map, hard_check_only=hard_check_only, check_device_map=check_device_map
)
config.semantic_config._attn_implementation = config._attn_implementation
config.coarse_acoustics_config._attn_implementation = config._attn_implementation
config.fine_acoustics_config._attn_implementation = config._attn_implementation
return config
__all__ = [
"BarkFineModel",

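A hedged usage sketch of the override above, assuming the `flash-attn` package and a supported GPU are available: requesting `flash_attention_2` at load time is what routes through `_check_and_enable_flash_attn_2`, and the Bark sub-configs end up with the same `_attn_implementation`.

```python
import torch
from transformers import BarkModel

model = BarkModel.from_pretrained(
    "suno/bark-small",                        # public checkpoint, used here only as an example
    torch_dtype=torch.float16,
    attn_implementation="flash_attention_2",  # requires flash-attn and a half-precision GPU setup
).to("cuda")

# The override propagates the choice to the semantic, coarse and fine sub-configs.
print(model.config.semantic_config._attn_implementation)  # "flash_attention_2"
```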
View File

@ -493,7 +493,8 @@ class BartPreTrainedModel(PreTrainedModel):
_keys_to_ignore_on_load_unexpected = ["encoder.version", "decoder.version"]
_no_split_modules = [r"BartEncoderLayer", r"BartDecoderLayer"]
_skip_keys_device_placement = "past_key_values"
_supports_flash_attn = True
_supports_flash_attn_2 = True
_supports_flash_attn_3 = True
_supports_sdpa = True
_supports_flex_attn = True
_supports_cache_class = True
@ -948,7 +949,7 @@ class BartDecoder(BartPreTrainedModel):
encoder_attention_mask: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.Tensor] = None,
cross_attn_head_mask: Optional[torch.Tensor] = None,
past_key_values: Optional[Cache] = None,
past_key_values: Optional[list[torch.FloatTensor]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
@ -997,7 +998,7 @@ class BartDecoder(BartPreTrainedModel):
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of
shape `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
@ -1229,7 +1230,7 @@ class BartModel(BartPreTrainedModel):
decoder_head_mask: Optional[torch.Tensor] = None,
cross_attn_head_mask: Optional[torch.Tensor] = None,
encoder_outputs: Optional[list[torch.FloatTensor]] = None,
past_key_values: Optional[Cache] = None,
past_key_values: Optional[list[torch.FloatTensor]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
use_cache: Optional[bool] = None,
@ -1401,7 +1402,7 @@ class BartForConditionalGeneration(BartPreTrainedModel, GenerationMixin):
decoder_head_mask: Optional[torch.Tensor] = None,
cross_attn_head_mask: Optional[torch.Tensor] = None,
encoder_outputs: Optional[list[torch.FloatTensor]] = None,
past_key_values: Optional[Cache] = None,
past_key_values: Optional[list[torch.FloatTensor]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
@ -1900,7 +1901,7 @@ class BartForCausalLM(BartPreTrainedModel, GenerationMixin):
encoder_attention_mask: Optional[torch.FloatTensor] = None,
head_mask: Optional[torch.Tensor] = None,
cross_attn_head_mask: Optional[torch.Tensor] = None,
past_key_values: Optional[Cache] = None,
past_key_values: Optional[list[torch.FloatTensor]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
use_cache: Optional[bool] = None,

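The docstring above describes the legacy tuple-of-tuples cache layout for encoder-decoder models. A small illustrative sketch of that layout for a toy configuration (shapes only, no real model involved):

```python
import torch

batch, heads, head_dim = 1, 4, 8
tgt_len, src_len = 5, 7          # decoder and encoder sequence lengths
n_layers = 2

past_key_values = tuple(
    (
        torch.zeros(batch, heads, tgt_len, head_dim),  # decoder self-attention key
        torch.zeros(batch, heads, tgt_len, head_dim),  # decoder self-attention value
        torch.zeros(batch, heads, src_len, head_dim),  # cross-attention key
        torch.zeros(batch, heads, src_len, head_dim),  # cross-attention value
    )
    for _ in range(n_layers)
)
```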
View File

@ -2107,7 +2107,7 @@ class BigBirdPegasusDecoder(BigBirdPegasusPreTrainedModel):
encoder_attention_mask: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
cross_attn_head_mask: Optional[torch.Tensor] = None,
past_key_values: Optional[Cache] = None,
past_key_values: Optional[list[torch.FloatTensor]] = None,
inputs_embeds: Optional[torch.Tensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
@ -2156,7 +2156,7 @@ class BigBirdPegasusDecoder(BigBirdPegasusPreTrainedModel):
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of
shape `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
@ -2381,7 +2381,7 @@ class BigBirdPegasusModel(BigBirdPegasusPreTrainedModel):
decoder_head_mask: Optional[torch.Tensor] = None,
cross_attn_head_mask: Optional[torch.Tensor] = None,
encoder_outputs: Optional[list[torch.FloatTensor]] = None,
past_key_values: Optional[Cache] = None,
past_key_values: Optional[list[torch.FloatTensor]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
use_cache: Optional[bool] = None,
@ -2543,7 +2543,7 @@ class BigBirdPegasusForConditionalGeneration(BigBirdPegasusPreTrainedModel, Gene
decoder_head_mask: Optional[torch.Tensor] = None,
cross_attn_head_mask: Optional[torch.Tensor] = None,
encoder_outputs: Optional[list[torch.FloatTensor]] = None,
past_key_values: Optional[Cache] = None,
past_key_values: Optional[list[torch.FloatTensor]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,

View File

@ -346,7 +346,8 @@ class BioGptPreTrainedModel(PreTrainedModel):
config_class = BioGptConfig
base_model_prefix = "biogpt"
supports_gradient_checkpointing = True
_supports_flash_attn = True
_supports_flash_attn_2 = True
_supports_flash_attn_3 = True
_supports_sdpa = True
_supports_flex_attn = True
_supports_cache_class = True

View File

@ -171,7 +171,8 @@ class BioGptPreTrainedModel(PreTrainedModel):
config_class = BioGptConfig
base_model_prefix = "biogpt"
supports_gradient_checkpointing = True
_supports_flash_attn = True
_supports_flash_attn_2 = True
_supports_flash_attn_3 = True
_supports_sdpa = True
_supports_flex_attn = True
_supports_cache_class = True

View File

@ -308,7 +308,8 @@ class BitNetPreTrainedModel(PreTrainedModel):
supports_gradient_checkpointing = True
_no_split_modules = ["BitNetDecoderLayer"]
_skip_keys_device_placement = ["past_key_values"]
_supports_flash_attn = True
_supports_flash_attn_2 = True
_supports_flash_attn_3 = True
_supports_sdpa = True
_supports_flex_attn = True
_supports_cache_class = True

View File

@ -463,7 +463,8 @@ class BlenderbotPreTrainedModel(PreTrainedModel):
config_class = BlenderbotConfig
base_model_prefix = "model"
supports_gradient_checkpointing = True
_supports_flash_attn = True
_supports_flash_attn_2 = True
_supports_flash_attn_3 = True
_supports_sdpa = True
_supports_flex_attn = True
_supports_cache_class = True
@ -952,7 +953,7 @@ class BlenderbotDecoder(BlenderbotPreTrainedModel):
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of
shape `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
@ -1185,7 +1186,7 @@ class BlenderbotModel(BlenderbotPreTrainedModel):
decoder_head_mask: Optional[torch.Tensor] = None,
cross_attn_head_mask: Optional[torch.Tensor] = None,
encoder_outputs: Optional[Union[tuple, BaseModelOutput]] = None,
past_key_values: Optional[Cache] = None,
past_key_values: Optional[list[torch.FloatTensor]] = None,
inputs_embeds: Optional[torch.Tensor] = None,
decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
use_cache: Optional[bool] = None,
@ -1360,7 +1361,7 @@ class BlenderbotForConditionalGeneration(BlenderbotPreTrainedModel, GenerationMi
decoder_head_mask: Optional[torch.Tensor] = None,
cross_attn_head_mask: Optional[torch.Tensor] = None,
encoder_outputs: Optional[Union[tuple, BaseModelOutput]] = None,
past_key_values: Optional[Cache] = None,
past_key_values: Optional[list[torch.FloatTensor]] = None,
inputs_embeds: Optional[torch.Tensor] = None,
decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
@ -1550,7 +1551,7 @@ class BlenderbotForCausalLM(BlenderbotPreTrainedModel, GenerationMixin):
encoder_attention_mask: Optional[torch.FloatTensor] = None,
head_mask: Optional[torch.Tensor] = None,
cross_attn_head_mask: Optional[torch.Tensor] = None,
past_key_values: Optional[Cache] = None,
past_key_values: Optional[list[torch.FloatTensor]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
use_cache: Optional[bool] = None,

View File

@ -451,7 +451,8 @@ class BlenderbotSmallPreTrainedModel(PreTrainedModel):
config_class = BlenderbotSmallConfig
base_model_prefix = "model"
supports_gradient_checkpointing = True
_supports_flash_attn = True
_supports_flash_attn_2 = True
_supports_flash_attn_3 = True
_supports_sdpa = True
_supports_flex_attn = True
_supports_cache_class = True
@ -933,7 +934,7 @@ class BlenderbotSmallDecoder(BlenderbotSmallPreTrainedModel):
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of
shape `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
@ -1152,7 +1153,7 @@ class BlenderbotSmallModel(BlenderbotSmallPreTrainedModel):
decoder_head_mask: Optional[torch.Tensor] = None,
cross_attn_head_mask: Optional[torch.Tensor] = None,
encoder_outputs: Optional[Union[tuple, BaseModelOutput]] = None,
past_key_values: Optional[Cache] = None,
past_key_values: Optional[list[torch.FloatTensor]] = None,
inputs_embeds: Optional[torch.Tensor] = None,
decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
use_cache: Optional[bool] = None,
@ -1314,7 +1315,7 @@ class BlenderbotSmallForConditionalGeneration(BlenderbotSmallPreTrainedModel, Ge
decoder_head_mask: Optional[torch.Tensor] = None,
cross_attn_head_mask: Optional[torch.Tensor] = None,
encoder_outputs: Optional[Union[tuple, BaseModelOutput]] = None,
past_key_values: Optional[Cache] = None,
past_key_values: Optional[list[torch.FloatTensor]] = None,
inputs_embeds: Optional[torch.Tensor] = None,
decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
@ -1504,7 +1505,7 @@ class BlenderbotSmallForCausalLM(BlenderbotSmallPreTrainedModel, GenerationMixin
encoder_attention_mask: Optional[torch.FloatTensor] = None,
head_mask: Optional[torch.Tensor] = None,
cross_attn_head_mask: Optional[torch.Tensor] = None,
past_key_values: Optional[Cache] = None,
past_key_values: Optional[list[torch.FloatTensor]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
use_cache: Optional[bool] = None,

View File

@ -24,7 +24,6 @@ from torch import nn
from torch.nn import CrossEntropyLoss
from ...activations import ACT2FN
from ...cache_utils import Cache
from ...generation import GenerationMixin
from ...modeling_layers import GradientCheckpointingLayer
from ...modeling_outputs import (
@ -37,7 +36,6 @@ from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
from ...processing_utils import Unpack
from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer
from ...utils import ModelOutput, TransformersKwargs, auto_docstring, logging, torch_int
from ...utils.deprecation import deprecate_kwarg
from ..auto import AutoModelForCausalLM, AutoModelForSeq2SeqLM
from .configuration_blip_2 import Blip2Config, Blip2QFormerConfig, Blip2VisionConfig
@ -409,7 +407,8 @@ class Blip2PreTrainedModel(PreTrainedModel):
base_model_prefix = "blip"
supports_gradient_checkpointing = True
_supports_attention_backend = True
_supports_flash_attn = True
_supports_flash_attn_2 = True
_supports_flash_attn_3 = True
_supports_sdpa = True
_supports_flex_attn = True
@ -642,7 +641,6 @@ class Blip2QFormerMultiHeadAttention(nn.Module):
x = x.view(*new_x_shape)
return x.permute(0, 2, 1, 3)
@deprecate_kwarg("past_key_value", version="4.55.0")
def forward(
self,
hidden_states,
@ -662,6 +660,11 @@ class Blip2QFormerMultiHeadAttention(nn.Module):
key_layer = self.transpose_for_scores(self.key(encoder_hidden_states))
value_layer = self.transpose_for_scores(self.value(encoder_hidden_states))
attention_mask = encoder_attention_mask
elif past_key_value is not None:
key_layer = self.transpose_for_scores(self.key(hidden_states))
value_layer = self.transpose_for_scores(self.value(hidden_states))
key_layer = torch.cat([past_key_value[0], key_layer], dim=2)
value_layer = torch.cat([past_key_value[1], value_layer], dim=2)
else:
key_layer = self.transpose_for_scores(self.key(hidden_states))
value_layer = self.transpose_for_scores(self.value(hidden_states))
@ -670,6 +673,8 @@ class Blip2QFormerMultiHeadAttention(nn.Module):
query_layer = self.transpose_for_scores(mixed_query_layer)
past_key_value = (key_layer, value_layer)
# Take the dot product between "query" and "key" to get the raw attention scores.
attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
@ -716,14 +721,9 @@ class Blip2QFormerMultiHeadAttention(nn.Module):
new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
context_layer = context_layer.view(*new_context_layer_shape)
outputs = (
(
context_layer,
attention_probs,
)
if output_attentions
else (context_layer,)
)
outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)
outputs = outputs + (past_key_value,)
return outputs
@ -767,7 +767,6 @@ class Blip2QFormerAttention(nn.Module):
self.attention.all_head_size = self.attention.attention_head_size * self.attention.num_attention_heads
self.pruned_heads = self.pruned_heads.union(heads)
@deprecate_kwarg("past_key_value", version="4.55.0")
def forward(
self,
hidden_states: torch.Tensor,
@ -775,16 +774,17 @@ class Blip2QFormerAttention(nn.Module):
head_mask: Optional[torch.FloatTensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.FloatTensor] = None,
past_key_value: Optional[Cache] = None,
past_key_value: Optional[tuple[tuple[torch.FloatTensor]]] = None,
output_attentions: Optional[bool] = False,
) -> tuple[torch.Tensor]:
self_outputs = self.attention(
hidden_states=hidden_states,
attention_mask=attention_mask,
head_mask=head_mask,
encoder_hidden_states=encoder_hidden_states,
encoder_attention_mask=encoder_attention_mask,
output_attentions=output_attentions,
hidden_states,
attention_mask,
head_mask,
encoder_hidden_states,
encoder_attention_mask,
past_key_value,
output_attentions,
)
attention_output = self.output(self_outputs[0], hidden_states)
outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them
@ -844,7 +844,6 @@ class Blip2QFormerLayer(GradientCheckpointingLayer):
self.intermediate_query = Blip2QFormerIntermediate(config)
self.output_query = Blip2QFormerOutput(config)
@deprecate_kwarg("past_key_value", version="4.55.0")
def forward(
self,
hidden_states,
@ -856,14 +855,19 @@ class Blip2QFormerLayer(GradientCheckpointingLayer):
output_attentions=False,
query_length=0,
):
# decoder uni-directional self-attention cached key/values tuple is at positions 1,2
self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None
self_attention_outputs = self.attention(
hidden_states=hidden_states,
attention_mask=attention_mask,
head_mask=head_mask,
hidden_states,
attention_mask,
head_mask,
output_attentions=output_attentions,
past_key_value=self_attn_past_key_value,
)
attention_output = self_attention_outputs[0]
outputs = self_attention_outputs[1:]
outputs = self_attention_outputs[1:-1]
present_key_value = self_attention_outputs[-1]
if query_length > 0:
query_attention_output = attention_output[:, :query_length, :]
@ -872,16 +876,16 @@ class Blip2QFormerLayer(GradientCheckpointingLayer):
if encoder_hidden_states is None:
raise ValueError("encoder_hidden_states must be given for cross-attention layers")
cross_attention_outputs = self.crossattention(
hidden_states=query_attention_output,
attention_mask=attention_mask,
head_mask=head_mask,
encoder_hidden_states=encoder_hidden_states,
encoder_attention_mask=encoder_attention_mask,
query_attention_output,
attention_mask,
head_mask,
encoder_hidden_states,
encoder_attention_mask,
output_attentions=output_attentions,
)
query_attention_output = cross_attention_outputs[0]
# add cross attentions if we output attention weights
outputs = outputs + cross_attention_outputs[1:]
outputs = outputs + cross_attention_outputs[1:-1]
layer_output = apply_chunking_to_forward(
self.feed_forward_chunk_query,
@ -907,6 +911,8 @@ class Blip2QFormerLayer(GradientCheckpointingLayer):
)
outputs = (layer_output,) + outputs
outputs = outputs + (present_key_value,)
return outputs
def feed_forward_chunk(self, attention_output):
@ -929,8 +935,6 @@ class Blip2QFormerEncoder(nn.Module):
)
self.gradient_checkpointing = False
@deprecate_kwarg("past_key_value", version="4.55.0")
@deprecate_kwarg("use_cache", version="4.55.0")
def forward(
self,
hidden_states,
@ -949,12 +953,21 @@ class Blip2QFormerEncoder(nn.Module):
all_self_attentions = () if output_attentions else None
all_cross_attentions = () if output_attentions else None
next_decoder_cache = () if use_cache else None
for i in range(self.config.num_hidden_layers):
layer_module = self.layer[i]
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
layer_head_mask = head_mask[i] if head_mask is not None else None
past_key_value = past_key_values[i] if past_key_values is not None else None
if getattr(self.config, "gradient_checkpointing", False) and self.training and use_cache:
logger.warning(
"`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
)
use_cache = False
layer_outputs = layer_module(
hidden_states,
@ -962,14 +975,17 @@ class Blip2QFormerEncoder(nn.Module):
layer_head_mask,
encoder_hidden_states, # as a positional argument for gradient checkpointing
encoder_attention_mask=encoder_attention_mask,
past_key_value=past_key_value,
output_attentions=output_attentions,
query_length=query_length,
)
hidden_states = layer_outputs[0]
if use_cache:
next_decoder_cache += (layer_outputs[-1],)
if output_attentions:
all_self_attentions = all_self_attentions + (layer_outputs[1],)
if query_length > 0 and layer_module.has_cross_attention:
if layer_module.has_cross_attention:
all_cross_attentions = all_cross_attentions + (layer_outputs[2],)
if output_hidden_states:
@ -980,6 +996,7 @@ class Blip2QFormerEncoder(nn.Module):
v
for v in [
hidden_states,
next_decoder_cache,
all_hidden_states,
all_self_attentions,
all_cross_attentions,
@ -988,6 +1005,7 @@ class Blip2QFormerEncoder(nn.Module):
)
return BaseModelOutputWithPastAndCrossAttentions(
last_hidden_state=hidden_states,
past_key_values=next_decoder_cache,
hidden_states=all_hidden_states,
attentions=all_self_attentions,
cross_attentions=all_cross_attentions,
@ -1047,7 +1065,7 @@ class Blip2TextEmbeddings(nn.Module):
)
class Blip2QFormerModel(Blip2PreTrainedModel):
_supports_attention_backend = False # adds position on attn weights before last matmul
_supports_flash_attn = False
_supports_flash_attn_2 = False
_supports_sdpa = False
_supports_flex_attn = False
@ -1119,8 +1137,6 @@ class Blip2QFormerModel(Blip2PreTrainedModel):
extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0
return extended_attention_mask
@deprecate_kwarg("past_key_value", version="4.55.0")
@deprecate_kwarg("use_cache", version="4.55.0")
@auto_docstring
def forward(
self,
@ -1130,7 +1146,7 @@ class Blip2QFormerModel(Blip2PreTrainedModel):
head_mask: Optional[torch.FloatTensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.FloatTensor] = None,
past_key_values: Optional[Cache] = None,
past_key_values: Optional[tuple[tuple[torch.FloatTensor]]] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
@ -1150,6 +1166,11 @@ class Blip2QFormerModel(Blip2PreTrainedModel):
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# past_key_values_length
past_key_values_length = (
past_key_values[0][0].shape[2] - self.config.query_length if past_key_values is not None else 0
)
query_length = (
query_length if query_length is not None else query_embeds.shape[1] if query_embeds is not None else 0
)
@ -1164,7 +1185,7 @@ class Blip2QFormerModel(Blip2PreTrainedModel):
device = embedding_output.device
if attention_mask is None:
attention_mask = torch.ones(((batch_size, seq_length)), device=device)
attention_mask = torch.ones(((batch_size, seq_length + past_key_values_length)), device=device)
# We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
# ourselves in which case we just need to make it broadcastable to all heads.
@ -1206,6 +1227,8 @@ class Blip2QFormerModel(Blip2PreTrainedModel):
head_mask=head_mask,
encoder_hidden_states=encoder_hidden_states,
encoder_attention_mask=encoder_extended_attention_mask,
past_key_values=past_key_values,
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
@ -1237,7 +1260,7 @@ class Blip2Model(Blip2PreTrainedModel):
config_class = Blip2Config
main_input_name = "pixel_values"
_keep_in_fp32_modules = ["query_tokens", "qformer"]
_supports_flash_attn = False # because self.qformer does not support FA2
_supports_flash_attn_2 = False # because self.qformer does not support FA2
def __init__(self, config: Blip2Config):
super().__init__(config)
@ -1622,7 +1645,7 @@ class Blip2Model(Blip2PreTrainedModel):
class Blip2TextModelWithProjection(Blip2PreTrainedModel):
supports_gradient_checkpointing = False
_keep_in_fp32_modules = ["query_tokens", "qformer"]
_supports_flash_attn = False # because self.qformer does not support FA2
_supports_flash_attn_2 = False # because self.qformer does not support FA2
def __init__(self, config: Blip2Config):
super().__init__(config)
@ -1715,7 +1738,7 @@ class Blip2TextModelWithProjection(Blip2PreTrainedModel):
class Blip2VisionModelWithProjection(Blip2PreTrainedModel):
main_input_name = "pixel_values"
_keep_in_fp32_modules = ["query_tokens", "qformer"]
_supports_flash_attn = False # because self.qformer does not support FA2
_supports_flash_attn_2 = False # because self.qformer does not support FA2
def __init__(self, config: Blip2Config):
super().__init__(config)
@ -1835,7 +1858,7 @@ class Blip2ForConditionalGeneration(Blip2PreTrainedModel, GenerationMixin):
_supports_quantized_cache = False # not all LM backbones support it (e.g. T5)
_keep_in_fp32_modules = ["query_tokens", "qformer"]
_supports_flash_attn = False # because self.qformer does not support FA2
_supports_flash_attn_2 = False # because self.qformer does not support FA2
def __init__(self, config: Blip2Config):
super().__init__(config)
@ -2266,7 +2289,7 @@ class Blip2ForConditionalGeneration(Blip2PreTrainedModel, GenerationMixin):
class Blip2ForImageTextRetrieval(Blip2PreTrainedModel):
main_input_name = "pixel_values"
_keep_in_fp32_modules = ["query_tokens", "qformer"]
_supports_flash_attn = False # because self.qformer does not support FA2
_supports_flash_attn_2 = False # because self.qformer does not support FA2
def __init__(self, config: Blip2Config):
super().__init__(config)

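The restored branch above implements the classic cached-attention pattern: previously computed keys and values are concatenated with the new ones along the sequence axis. A standalone toy sketch of that pattern (not Blip-2's actual module):

```python
import torch

def append_to_cache(key, value, past_key_value=None):
    # Keys/values have shape (batch, heads, seq_len, head_dim); the cache grows along dim=2.
    if past_key_value is not None:
        key = torch.cat([past_key_value[0], key], dim=2)
        value = torch.cat([past_key_value[1], value], dim=2)
    return key, value, (key, value)

k = torch.randn(1, 4, 1, 8)                 # one new token
v = torch.randn(1, 4, 1, 8)
k, v, cache = append_to_cache(k, v)
k, v, cache = append_to_cache(torch.randn(1, 4, 1, 8), torch.randn(1, 4, 1, 8), cache)
print(k.shape)                              # torch.Size([1, 4, 2, 8])
```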
View File

@ -328,7 +328,12 @@ class BloomAttention(nn.Module):
output_tensor = self.dense(context_layer)
output_tensor = dropout_add(output_tensor, residual, self.hidden_dropout, self.training)
return output_tensor, attention_probs
outputs = (output_tensor, layer_past)
if output_attentions:
outputs += (attention_probs,)
return outputs
class BloomMLP(nn.Module):
@ -400,7 +405,7 @@ class BloomBlock(GradientCheckpointingLayer):
residual = hidden_states
# Self attention.
attention_output, attn_weights = self.self_attention(
attn_outputs = self.self_attention(
layernorm_output,
residual,
layer_past=layer_past,
@ -412,6 +417,10 @@ class BloomBlock(GradientCheckpointingLayer):
cache_position=cache_position,
)
attention_output = attn_outputs[0]
outputs = attn_outputs[1:]
layernorm_output = self.post_attention_layernorm(attention_output)
# Get residual
@ -423,7 +432,12 @@ class BloomBlock(GradientCheckpointingLayer):
# MLP.
output = self.mlp(layernorm_output, residual)
return output, attn_weights # hidden_states, attentions
if use_cache:
outputs = (output,) + outputs
else:
outputs = (output,) + outputs[1:]
return outputs # hidden_states, past_kv, attentions
@auto_docstring
@ -546,12 +560,19 @@ class BloomModel(BloomPreTrainedModel):
if inputs_embeds is None:
inputs_embeds = self.word_embeddings(input_ids)
# TODO (joao): remove this exception in v4.56 -- it exists for users that try to pass a legacy cache
if not isinstance(past_key_values, (type(None), Cache)):
raise ValueError("The `past_key_values` should be either a `Cache` object or `None`.")
if use_cache and past_key_values is None:
past_key_values = DynamicCache()
# kept for BC (non `Cache` `past_key_values` inputs)
return_legacy_cache = False
if use_cache and not isinstance(past_key_values, Cache):
return_legacy_cache = True
if past_key_values is None:
past_key_values = DynamicCache()
else:
past_key_values = DynamicCache.from_legacy_cache(past_key_values)
logger.warning_once(
"We detected that you are passing `past_key_values` as a tuple of tuples. This is deprecated and "
"will be removed in v4.47. Please convert your cache or use an appropriate `Cache` class "
"(https://huggingface.co/docs/transformers/kv_cache#legacy-cache-format)"
)
batch_size, seq_length, _ = inputs_embeds.shape
past_length = past_key_values.get_seq_length() if past_key_values is not None else 0
@ -566,6 +587,7 @@ class BloomModel(BloomPreTrainedModel):
head_mask = self.get_head_mask(head_mask, self.config.n_layer)
hidden_states = self.word_embeddings_layernorm(inputs_embeds)
next_decoder_cache = None
all_self_attentions = () if output_attentions else None
all_hidden_states = () if output_hidden_states else None
@ -596,8 +618,11 @@ class BloomModel(BloomPreTrainedModel):
)
hidden_states = outputs[0]
if use_cache:
next_decoder_cache = outputs[1]
if output_attentions:
all_self_attentions = all_self_attentions + (outputs[1],)
all_self_attentions = all_self_attentions + (outputs[2 if use_cache else 1],)
# Add last hidden state
hidden_states = self.ln_f(hidden_states)
@ -605,14 +630,18 @@ class BloomModel(BloomPreTrainedModel):
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
next_cache = next_decoder_cache if use_cache else None
if return_legacy_cache:
next_cache = next_cache.to_legacy_cache()
if not return_dict:
return tuple(
v for v in [hidden_states, past_key_values, all_hidden_states, all_self_attentions] if v is not None
v for v in [hidden_states, next_cache, all_hidden_states, all_self_attentions] if v is not None
)
return BaseModelOutputWithPastAndCrossAttentions(
last_hidden_state=hidden_states,
past_key_values=past_key_values,
past_key_values=next_cache,
hidden_states=all_hidden_states,
attentions=all_self_attentions,
)
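
A minimal sketch of the backward-compatibility path restored above, assuming a transformers version where `DynamicCache` exposes `from_legacy_cache`/`to_legacy_cache`: a tuple-of-tuples cache is converted to a `DynamicCache` on the way in and back to the legacy format on the way out.

```python
import torch
from transformers.cache_utils import DynamicCache

legacy = tuple(
    (torch.zeros(1, 4, 3, 8), torch.zeros(1, 4, 3, 8))  # (key, value) per layer
    for _ in range(2)
)
cache = DynamicCache.from_legacy_cache(legacy)
print(cache.get_seq_length())                     # 3
round_tripped = cache.to_legacy_cache()
print(len(round_tripped), len(round_tripped[0]))  # 2 2
```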

View File

@ -821,7 +821,8 @@ class ChameleonPreTrainedModel(PreTrainedModel):
supports_gradient_checkpointing = True
_no_split_modules = ["ChameleonDecoderLayer", "ChameleonSwinDecoderLayer"]
_skip_keys_device_placement = ["past_key_values", "causal_mask"]
_supports_flash_attn = True
_supports_flash_attn_2 = True
_supports_flash_attn_3 = True
_supports_sdpa = True
_supports_quantized_cache = True
_supports_cache_class = True

View File

@ -28,6 +28,7 @@ from ...processing_utils import (
ProcessorMixin,
TextKwargs,
Unpack,
_validate_images_text_input_order,
)
from ...tokenization_utils_base import PreTokenizedInput, TextInput
@ -128,7 +129,8 @@ class ChameleonProcessor(ProcessorMixin):
`None`).
- **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
"""
# check if images and text inputs are reversed for BC
images, text = _validate_images_text_input_order(images, text)
if isinstance(text, str):
text = [text]
elif not isinstance(text, list) and not isinstance(text[0], str):

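For context, `_validate_images_text_input_order` is the backward-compatibility helper that swaps the two arguments if a caller passed them as `(text, images)`. A toy sketch of the idea, not the library's implementation:

```python
def validate_images_text_input_order(images, text):
    # Heuristic only: treat strings / lists of strings as "text-like" and swap if reversed.
    def looks_like_text(x):
        return isinstance(x, str) or (
            isinstance(x, (list, tuple)) and len(x) > 0 and isinstance(x[0], str)
        )

    if looks_like_text(images) and not looks_like_text(text):
        return text, images
    return images, text
```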
View File

@ -428,7 +428,8 @@ class CLIPPreTrainedModel(PreTrainedModel):
base_model_prefix = "clip"
supports_gradient_checkpointing = True
_supports_sdpa = True
_supports_flash_attn = True
_supports_flash_attn_2 = True
_supports_flash_attn_3 = True
_supports_flex_attn = True
_supports_attention_backend = True

View File

@ -217,7 +217,12 @@ class CodeGenAttention(nn.Module):
attn_output = self._merge_heads(attn_output, self.num_attention_heads, self.head_dim)
attn_output = self.out_proj(attn_output)
attn_output = self.resid_dropout(attn_output)
return attn_output, attn_weights
outputs = (attn_output, layer_past)
if output_attentions:
outputs += (attn_weights,)
return outputs # a, present, (attentions)
# Copied from transformers.models.gptj.modeling_gptj.GPTJMLP with GPTJ->CodeGen
@ -263,7 +268,7 @@ class CodeGenBlock(GradientCheckpointingLayer):
) -> Union[tuple[torch.Tensor], Optional[tuple[torch.Tensor, tuple[torch.FloatTensor, ...]]]]:
residual = hidden_states
hidden_states = self.ln_1(hidden_states)
attn_outputs, attn_weights = self.attn(
attn_outputs = self.attn(
hidden_states=hidden_states,
layer_past=layer_past,
attention_mask=attention_mask,
@ -273,10 +278,18 @@ class CodeGenBlock(GradientCheckpointingLayer):
output_attentions=output_attentions,
cache_position=cache_position,
)
feed_forward_hidden_states = self.mlp(hidden_states)
hidden_states = attn_outputs + feed_forward_hidden_states + residual
attn_output = attn_outputs[0] # output_attn: a, present, (attentions)
outputs = attn_outputs[1:]
return hidden_states, attn_weights
feed_forward_hidden_states = self.mlp(hidden_states)
hidden_states = attn_output + feed_forward_hidden_states + residual
if use_cache:
outputs = (hidden_states,) + outputs
else:
outputs = (hidden_states,) + outputs[1:]
return outputs # hidden_states, present, (attentions)
@auto_docstring
@ -377,12 +390,19 @@ class CodeGenModel(CodeGenPreTrainedModel):
if inputs_embeds is None:
inputs_embeds = self.wte(input_ids)
# TODO (joao): remove this exception in v4.56 -- it exists for users that try to pass a legacy cache
if not isinstance(past_key_values, (type(None), Cache)):
raise ValueError("The `past_key_values` should be either a `Cache` object or `None`.")
if use_cache and past_key_values is None:
past_key_values = DynamicCache()
# kept for BC (non `Cache` `past_key_values` inputs)
return_legacy_cache = False
if use_cache and not isinstance(past_key_values, Cache):
return_legacy_cache = True
if past_key_values is None:
past_key_values = DynamicCache()
else:
past_key_values = DynamicCache.from_legacy_cache(past_key_values)
logger.warning_once(
"We detected that you are passing `past_key_values` as a tuple of tuples. This is deprecated and "
"will be removed in v4.47. Please convert your cache or use an appropriate `Cache` class "
"(https://huggingface.co/docs/transformers/kv_cache#legacy-cache-format)"
)
seq_length = inputs_embeds.shape[1]
if cache_position is None:
@ -411,6 +431,7 @@ class CodeGenModel(CodeGenPreTrainedModel):
hidden_states = self.drop(hidden_states)
output_shape = (-1, seq_length, hidden_states.size(-1))
next_decoder_cache = None
all_self_attentions = () if output_attentions else None
all_hidden_states = () if output_hidden_states else None
for i, block in enumerate(self.h):
@ -429,8 +450,11 @@ class CodeGenModel(CodeGenPreTrainedModel):
)
hidden_states = outputs[0]
if use_cache is True:
next_decoder_cache = outputs[1]
if output_attentions:
all_self_attentions = all_self_attentions + (outputs[1],)
all_self_attentions = all_self_attentions + (outputs[2 if use_cache else 1],)
hidden_states = self.ln_f(hidden_states)
@ -439,14 +463,18 @@ class CodeGenModel(CodeGenPreTrainedModel):
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
next_cache = next_decoder_cache if use_cache else None
if return_legacy_cache:
next_cache = next_cache.to_legacy_cache()
if not return_dict:
return tuple(
v for v in [hidden_states, past_key_values, all_hidden_states, all_self_attentions] if v is not None
v for v in [hidden_states, next_cache, all_hidden_states, all_self_attentions] if v is not None
)
return BaseModelOutputWithPast(
last_hidden_state=hidden_states,
past_key_values=past_key_values,
past_key_values=next_cache,
hidden_states=all_hidden_states,
attentions=all_self_attentions,
)
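
The indexing in the hunks above (`outputs[2 if use_cache else 1]`) follows the variable-length tuple convention returned by the blocks. A toy illustration of how such outputs are consumed, hedged as a sketch rather than the library code:

```python
def unpack_block_outputs(outputs, use_cache, output_attentions):
    # Blocks return (hidden_states, present, attentions), dropping the optional entries
    # when use_cache / output_attentions are disabled.
    hidden_states = outputs[0]
    present = outputs[1] if use_cache else None
    attentions = outputs[2 if use_cache else 1] if output_attentions else None
    return hidden_states, present, attentions
```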

View File

@ -341,7 +341,8 @@ class CoherePreTrainedModel(PreTrainedModel):
supports_gradient_checkpointing = True
_no_split_modules = ["CohereDecoderLayer"]
_skip_keys_device_placement = ["past_key_values"]
_supports_flash_attn = True
_supports_flash_attn_2 = True
_supports_flash_attn_3 = True
_supports_sdpa = True
_supports_flex_attn = True
_supports_cache_class = True

View File

@ -19,8 +19,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import warnings
from ...configuration_utils import PretrainedConfig, layer_type_validation
from ...modeling_rope_utils import rope_config_validation
@ -218,29 +216,14 @@ class Cohere2Config(PretrainedConfig):
**kwargs,
)
# BC -> the pattern used to be a simple int, and it's still present in configs on the Hub
self._sliding_window_pattern = kwargs.get("sliding_window_pattern", 4)
if self.layer_types is None:
# BC -> the pattern used to be a simple int, and it's still present in configs on the Hub
self._sliding_window_pattern = getattr(self, "sliding_window_pattern", 4)
sliding_window_pattern = getattr(self, "sliding_window_pattern", 4)
self.layer_types = [
"sliding_attention" if bool((i + 1) % self._sliding_window_pattern) else "full_attention"
"sliding_attention" if bool((i + 1) % sliding_window_pattern) else "full_attention"
for i in range(self.num_hidden_layers)
]
layer_type_validation(self.layer_types)
@property
def sliding_window_pattern(self):
warnings.warn(
"The `sliding_window_pattern` attribute is deprecated and will be removed in v4.55.0.",
FutureWarning,
)
return self._sliding_window_pattern
@sliding_window_pattern.setter
def sliding_window_pattern(self, value):
self._sliding_window_pattern = value
__all__ = ["Cohere2Config"]
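
An illustrative expansion of the default pattern above for a small toy model (8 layers, pattern of 4): every fourth layer uses full attention, the rest sliding-window attention.

```python
sliding_window_pattern = 4
num_hidden_layers = 8
layer_types = [
    "sliding_attention" if bool((i + 1) % sliding_window_pattern) else "full_attention"
    for i in range(num_hidden_layers)
]
print(layer_types)
# ['sliding_attention', 'sliding_attention', 'sliding_attention', 'full_attention',
#  'sliding_attention', 'sliding_attention', 'sliding_attention', 'full_attention']
```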

View File

@ -318,7 +318,8 @@ class Cohere2PreTrainedModel(PreTrainedModel):
supports_gradient_checkpointing = True
_no_split_modules = ["Cohere2DecoderLayer"]
_skip_keys_device_placement = ["past_key_values"]
_supports_flash_attn = True
_supports_flash_attn_2 = True
_supports_flash_attn_3 = True
_supports_sdpa = True
_supports_flex_attn = True
_supports_cache_class = True

View File

@ -13,7 +13,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import warnings
from typing import Callable, Optional
import torch
@ -238,30 +237,15 @@ class Cohere2Config(PretrainedConfig):
**kwargs,
)
# BC -> the pattern used to be a simple int, and it's still present in configs on the Hub
self._sliding_window_pattern = kwargs.get("sliding_window_pattern", 4)
if self.layer_types is None:
# BC -> the pattern used to be a simple int, and it's still present in configs on the Hub
self._sliding_window_pattern = getattr(self, "sliding_window_pattern", 4)
sliding_window_pattern = getattr(self, "sliding_window_pattern", 4)
self.layer_types = [
"sliding_attention" if bool((i + 1) % self._sliding_window_pattern) else "full_attention"
"sliding_attention" if bool((i + 1) % sliding_window_pattern) else "full_attention"
for i in range(self.num_hidden_layers)
]
layer_type_validation(self.layer_types)
@property
def sliding_window_pattern(self):
warnings.warn(
"The `sliding_window_pattern` attribute is deprecated and will be removed in v4.55.0.",
FutureWarning,
)
return self._sliding_window_pattern
@sliding_window_pattern.setter
def sliding_window_pattern(self, value):
self._sliding_window_pattern = value
class Cohere2RotaryEmbedding(CohereRotaryEmbedding):
pass

View File

@ -63,7 +63,7 @@ class ColPaliForRetrievalOutput(ModelOutput):
Language modeling loss (for next-token prediction).
embeddings (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
The embeddings of the model.
past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
`(batch_size, num_heads, sequence_length, embed_size_per_head)`)

View File

@ -41,7 +41,8 @@ class ColQwen2PreTrainedModel(PreTrainedModel):
config_class = ColQwen2Config
base_model_prefix = "model"
_no_split_modules = []
_supports_flash_attn = True
_supports_flash_attn_2 = True
_supports_flash_attn_3 = True
_supports_sdpa = True
_supports_cache_class = True
@ -74,7 +75,7 @@ class ColQwen2ForRetrievalOutput(ModelOutput):
Language modeling loss (for next-token prediction).
embeddings (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
The embeddings of the model.
past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
`(batch_size, num_heads, sequence_length, embed_size_per_head)`)
@ -129,7 +130,7 @@ class ColQwen2ForRetrieval(ColQwen2PreTrainedModel):
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_values: Optional[Cache] = None,
past_key_values: Optional[list[torch.FloatTensor]] = None,
labels: Optional[torch.LongTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
use_cache: Optional[bool] = None,

View File

@ -225,7 +225,8 @@ class ColQwen2Processor(ColPaliProcessor):
class ColQwen2PreTrainedModel(ColPaliPreTrainedModel):
_supports_flash_attn = True
_supports_flash_attn_2 = True
_supports_flash_attn_3 = True
_supports_sdpa = True
_supports_cache_class = True
@ -242,7 +243,7 @@ class ColQwen2ForRetrievalOutput(ModelOutput):
Language modeling loss (for next-token prediction).
embeddings (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
The embeddings of the model.
past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
`(batch_size, num_heads, sequence_length, embed_size_per_head)`)
@ -279,7 +280,7 @@ class ColQwen2ForRetrieval(ColPaliForRetrieval):
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_values: Optional[Cache] = None,
past_key_values: Optional[list[torch.FloatTensor]] = None,
labels: Optional[torch.LongTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
use_cache: Optional[bool] = None,

View File

@ -58,7 +58,7 @@ class CsmOutputWithPast(ModelOutput):
Language modeling loss (for next-token prediction).
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
`(batch_size, num_heads, sequence_length, embed_size_per_head)`)
@ -366,7 +366,8 @@ class CsmPreTrainedModel(PreTrainedModel):
supports_gradient_checkpointing = True
_no_split_modules = ["CsmDecoderLayer"]
_skip_keys_device_placement = ["past_key_values"]
_supports_flash_attn = True
_supports_flash_attn_2 = True
_supports_flash_attn_3 = True
_supports_sdpa = True
# does not because of Mimi codec model
# _supports_flex_attn = True

View File

@ -58,7 +58,7 @@ class CsmOutputWithPast(ModelOutput):
Language modeling loss (for next-token prediction).
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
`(batch_size, num_heads, sequence_length, embed_size_per_head)`)
@ -129,7 +129,8 @@ class CsmPreTrainedModel(PreTrainedModel):
supports_gradient_checkpointing = True
_no_split_modules = ["CsmDecoderLayer"]
_skip_keys_device_placement = ["past_key_values"]
_supports_flash_attn = True
_supports_flash_attn_2 = True
_supports_flash_attn_3 = True
_supports_sdpa = True
# does not because of Mimi codec model
# _supports_flex_attn = True

View File

@ -1674,9 +1674,7 @@ class DFineForObjectDetection(DFinePreTrainedModel):
return_dict: Optional[bool] = None,
**kwargs,
) -> Union[tuple[torch.FloatTensor], DFineObjectDetectionOutput]:
r"""
Example:
"""
```python
>>> import torch
>>> from transformers.image_utils import load_image

View File

@ -917,9 +917,7 @@ class DFineForObjectDetection(RTDetrForObjectDetection, DFinePreTrainedModel):
self.post_init()
def forward(**super_kwargs):
r"""
Example:
"""
```python
>>> import torch
>>> from transformers.image_utils import load_image

View File

@ -17,7 +17,6 @@ import fnmatch
import re
import torch
import torch.nn as nn
from transformers import (
DacConfig,
@ -186,38 +185,6 @@ def recursively_load_weights(orig_dict, hf_model, model_name):
logger.warning(f"Unused weights: {unused_weights}")
def apply_weight_norm(model):
weight_norm = nn.utils.weight_norm
for layer in model.quantizer.quantizers:
weight_norm(layer.in_proj)
weight_norm(layer.out_proj)
weight_norm(model.encoder.conv1)
weight_norm(model.encoder.conv2)
for layer in model.encoder.block:
weight_norm(layer.conv1)
weight_norm(layer.res_unit1.conv1)
weight_norm(layer.res_unit1.conv2)
weight_norm(layer.res_unit2.conv1)
weight_norm(layer.res_unit2.conv2)
weight_norm(layer.res_unit3.conv1)
weight_norm(layer.res_unit3.conv2)
weight_norm(model.decoder.conv1)
weight_norm(model.decoder.conv2)
for layer in model.decoder.block:
weight_norm(layer.conv_t1)
weight_norm(layer.res_unit1.conv1)
weight_norm(layer.res_unit1.conv2)
weight_norm(layer.res_unit2.conv1)
weight_norm(layer.res_unit2.conv2)
weight_norm(layer.res_unit3.conv1)
weight_norm(layer.res_unit3.conv2)
@torch.no_grad()
def convert_checkpoint(
model_name,
@ -247,7 +214,7 @@ def convert_checkpoint(
original_checkpoint = model_dict["state_dict"]
apply_weight_norm(model)
model.apply_weight_norm()
recursively_load_weights(original_checkpoint, model, model_name)
model.remove_weight_norm()
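
A minimal sketch of the apply/remove weight-norm pattern the conversion relies on, using plain `torch.nn.utils` on a toy layer; the real model wraps this in its own `apply_weight_norm`/`remove_weight_norm` methods.

```python
import torch
from torch import nn

conv = nn.Conv1d(4, 4, kernel_size=3)
conv = nn.utils.weight_norm(conv)    # re-parametrizes .weight as weight_g * weight_v / ||weight_v||
# ... load the original checkpoint's weight_g / weight_v tensors here ...
nn.utils.remove_weight_norm(conv)    # folds the parametrization back into a plain .weight
```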

View File

@ -505,7 +505,8 @@ class Data2VecAudioPreTrainedModel(PreTrainedModel):
base_model_prefix = "data2vec_audio"
main_input_name = "input_values"
supports_gradient_checkpointing = True
_supports_flash_attn = True
_supports_flash_attn_2 = True
_supports_flash_attn_3 = True
_supports_sdpa = True
_supports_flex_attn = True

Some files were not shown because too many files have changed in this diff.