Compare commits

..

6 Commits

Author SHA1 Message Date
24575c4f4c work for mixtral 2025-11-17 15:46:42 +00:00
081d800f65 this time for sure 2025-11-17 15:07:52 +00:00
bf0f2cd151 fix convert 2025-11-17 15:05:22 +00:00
b4e6c4dae5 fix the key 2025-11-17 15:02:43 +00:00
a1ccade834 nit 2025-11-17 14:56:35 +00:00
87142d62eb initial commit 2025-11-17 14:53:21 +00:00
263 changed files with 46504 additions and 1423 deletions

View File

@ -420,6 +420,8 @@
title: BLOOM
- local: model_doc/blt
title: BLT
- local: model_doc/bort
title: BORT
- local: model_doc/byt5
title: ByT5
- local: model_doc/camembert
@ -474,6 +476,8 @@
title: Ernie4_5
- local: model_doc/ernie4_5_moe
title: Ernie4_5_MoE
- local: model_doc/ernie_m
title: ErnieM
- local: model_doc/esm
title: ESM
- local: model_doc/exaone4
@ -528,6 +532,8 @@
title: GPTBigCode
- local: model_doc/gpt_oss
title: GptOss
- local: model_doc/gptsan-japanese
title: GPTSAN Japanese
- local: model_doc/gpt-sw3
title: GPTSw3
- local: model_doc/granite
@ -552,6 +558,8 @@
title: Jamba
- local: model_doc/jetmoe
title: JetMoe
- local: model_doc/jukebox
title: Jukebox
- local: model_doc/led
title: LED
- local: model_doc/lfm2
@ -586,6 +594,8 @@
title: MarkupLM
- local: model_doc/mbart
title: MBart and MBart-50
- local: model_doc/mega
title: MEGA
- local: model_doc/megatron-bert
title: MegatronBERT
- local: model_doc/megatron_gpt2
@ -620,6 +630,8 @@
title: myt5
- local: model_doc/nemotron
title: Nemotron
- local: model_doc/nezha
title: NEZHA
- local: model_doc/nllb
title: NLLB
- local: model_doc/nllb-moe
@ -634,6 +646,8 @@
title: Olmo3
- local: model_doc/olmoe
title: OLMoE
- local: model_doc/open-llama
title: Open-Llama
- local: model_doc/opt
title: OPT
- local: model_doc/pegasus
@ -654,6 +668,8 @@
title: PLBart
- local: model_doc/prophetnet
title: ProphetNet
- local: model_doc/qdqbert
title: QDQBert
- local: model_doc/qwen2
title: Qwen2
- local: model_doc/qwen2_moe
@ -666,12 +682,16 @@
title: Qwen3Next
- local: model_doc/rag
title: RAG
- local: model_doc/realm
title: REALM
- local: model_doc/recurrent_gemma
title: RecurrentGemma
- local: model_doc/reformer
title: Reformer
- local: model_doc/rembert
title: RemBERT
- local: model_doc/retribert
title: RetriBERT
- local: model_doc/roberta
title: RoBERTa
- local: model_doc/roberta-prelayernorm
@ -700,6 +720,10 @@
title: T5Gemma
- local: model_doc/t5v1.1
title: T5v1.1
- local: model_doc/tapex
title: TAPEX
- local: model_doc/transfo-xl
title: Transformer XL
- local: model_doc/ul2
title: UL2
- local: model_doc/umt5
@ -712,6 +736,8 @@
title: XGLM
- local: model_doc/xlm
title: XLM
- local: model_doc/xlm-prophetnet
title: XLM-ProphetNet
- local: model_doc/xlm-roberta
title: XLM-RoBERTa
- local: model_doc/xlm-roberta-xl
@ -758,6 +784,8 @@
title: Depth Anything V2
- local: model_doc/depth_pro
title: DepthPro
- local: model_doc/deta
title: DETA
- local: model_doc/detr
title: DETR
- local: model_doc/dinat
@ -772,6 +800,8 @@
title: DiT
- local: model_doc/dpt
title: DPT
- local: model_doc/efficientformer
title: EfficientFormer
- local: model_doc/efficientloftr
title: EfficientLoFTR
- local: model_doc/efficientnet
@ -808,6 +838,8 @@
title: MobileViT
- local: model_doc/mobilevitv2
title: MobileViTV2
- local: model_doc/nat
title: NAT
- local: model_doc/poolformer
title: PoolFormer
- local: model_doc/prompt_depth_anything
@ -854,8 +886,12 @@
title: Timm Wrapper
- local: model_doc/upernet
title: UperNet
- local: model_doc/van
title: VAN
- local: model_doc/vit
title: Vision Transformer (ViT)
- local: model_doc/vit_hybrid
title: ViT Hybrid
- local: model_doc/vitdet
title: ViTDet
- local: model_doc/vit_mae
@ -894,6 +930,8 @@
title: Hubert
- local: model_doc/kyutai_speech_to_text
title: Kyutai Speech-To-Text
- local: model_doc/mctct
title: MCTCT
- local: model_doc/mimi
title: Mimi
- local: model_doc/mms
@ -920,6 +958,8 @@
title: SEW-D
- local: model_doc/speech_to_text
title: Speech2Text
- local: model_doc/speech_to_text_2
title: Speech2Text2
- local: model_doc/speecht5
title: SpeechT5
- local: model_doc/unispeech
@ -1148,6 +1188,8 @@
title: TAPAS
- local: model_doc/trocr
title: TrOCR
- local: model_doc/tvlt
title: TVLT
- local: model_doc/tvp
title: TVP
- local: model_doc/udop
@ -1174,6 +1216,8 @@
- sections:
- local: model_doc/decision_transformer
title: Decision Transformer
- local: model_doc/trajectory_transformer
title: Trajectory Transformer
title: Reinforcement learning models
- sections:
- local: model_doc/autoformer
@ -1189,6 +1233,10 @@
- local: model_doc/timesfm
title: TimesFM
title: Time series models
- sections:
- local: model_doc/graphormer
title: Graphormer
title: Graph models
title: Models
- sections:
- local: internal/modeling_utils

View File

@ -0,0 +1,60 @@
<!--Copyright 2020 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.
-->
*This model was released on 2020-10-20 and added to Hugging Face Transformers on 2023-06-20.*
# BORT
<div class="flex flex-wrap space-x-1">
<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
</div>
<Tip warning={true}>
This model is in maintenance mode only, we do not accept any new PRs changing its code.
If you run into any issues running this model, please reinstall the last version that supported this model: v4.30.0.
You can do so by running the following command: `pip install -U transformers==4.30.0`.
</Tip>
## Overview
The BORT model was proposed in [Optimal Subarchitecture Extraction for BERT](https://huggingface.co/papers/2010.10499) by
Adrian de Wynter and Daniel J. Perry. It is an optimal subset of architectural parameters for the BERT, which the
authors refer to as "Bort".
The abstract from the paper is the following:
*We extract an optimal subset of architectural parameters for the BERT architecture from Devlin et al. (2018) by
applying recent breakthroughs in algorithms for neural architecture search. This optimal subset, which we refer to as
"Bort", is demonstrably smaller, having an effective (that is, not counting the embedding layer) size of 5.5% the
original BERT-large architecture, and 16% of the net size. Bort is also able to be pretrained in 288 GPU hours, which
is 1.2% of the time required to pretrain the highest-performing BERT parametric architectural variant, RoBERTa-large
(Liu et al., 2019), and about 33% of that of the world-record, in GPU hours, required to train BERT-large on the same
hardware. It is also 7.9x faster on a CPU, as well as being better performing than other compressed variants of the
architecture, and some of the non-compressed variants: it obtains performance improvements of between 0.3% and 31%,
absolute, with respect to BERT-large, on multiple public natural language understanding (NLU) benchmarks.*
This model was contributed by [stefan-it](https://huggingface.co/stefan-it). The original code can be found [here](https://github.com/alexa/bort/).
## Usage tips
- BORT's model architecture is based on BERT, refer to [BERT's documentation page](bert) for the
model's API reference as well as usage examples.
- BORT uses the RoBERTa tokenizer instead of the BERT tokenizer, refer to [RoBERTa's documentation page](roberta) for the tokenizer's API reference as well as usage examples.
- BORT requires a specific fine-tuning algorithm, called [Agora](https://adewynter.github.io/notes/bort_algorithms_and_applications.html#fine-tuning-with-algebraic-topology) ,
that is sadly not open-sourced yet. It would be very useful for the community, if someone tries to implement the
algorithm to make BORT fine-tuning work.

View File

@ -0,0 +1,78 @@
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.
-->
*This model was released on 2022-12-12 and added to Hugging Face Transformers on 2023-06-20.*
# DETA
<div class="flex flex-wrap space-x-1">
<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
</div>
<Tip warning={true}>
This model is in maintenance mode only, we don't accept any new PRs changing its code.
If you run into any issues running this model, please reinstall the last version that supported this model: v4.40.2.
You can do so by running the following command: `pip install -U transformers==4.40.2`.
</Tip>
## Overview
The DETA model was proposed in [NMS Strikes Back](https://huggingface.co/papers/2212.06137) by Jeffrey Ouyang-Zhang, Jang Hyun Cho, Xingyi Zhou, Philipp Krähenbühl.
DETA (short for Detection Transformers with Assignment) improves [Deformable DETR](deformable_detr) by replacing the one-to-one bipartite Hungarian matching loss
with one-to-many label assignments used in traditional detectors with non-maximum suppression (NMS). This leads to significant gains of up to 2.5 mAP.
The abstract from the paper is the following:
*Detection Transformer (DETR) directly transforms queries to unique objects by using one-to-one bipartite matching during training and enables end-to-end object detection. Recently, these models have surpassed traditional detectors on COCO with undeniable elegance. However, they differ from traditional detectors in multiple designs, including model architecture and training schedules, and thus the effectiveness of one-to-one matching is not fully understood. In this work, we conduct a strict comparison between the one-to-one Hungarian matching in DETRs and the one-to-many label assignments in traditional detectors with non-maximum supervision (NMS). Surprisingly, we observe one-to-many assignments with NMS consistently outperform standard one-to-one matching under the same setting, with a significant gain of up to 2.5 mAP. Our detector that trains Deformable-DETR with traditional IoU-based label assignment achieved 50.2 COCO mAP within 12 epochs (1x schedule) with ResNet50 backbone, outperforming all existing traditional or transformer-based detectors in this setting. On multiple datasets, schedules, and architectures, we consistently show bipartite matching is unnecessary for performant detection transformers. Furthermore, we attribute the success of detection transformers to their expressive transformer architecture.*
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/model_doc/deta_architecture.jpg"
alt="drawing" width="600"/>
<small> DETA overview. Taken from the <a href="https://huggingface.co/papers/2212.06137">original paper</a>. </small>
This model was contributed by [nielsr](https://huggingface.co/nielsr).
The original code can be found [here](https://github.com/jozhang97/DETA).
## Resources
A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with DETA.
- Demo notebooks for DETA can be found [here](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/DETA).
- Scripts for finetuning [`DetaForObjectDetection`] with [`Trainer`] or [Accelerate](https://huggingface.co/docs/accelerate/index) can be found [here](https://github.com/huggingface/transformers/tree/main/examples/pytorch/object-detection).
- See also: [Object detection task guide](../tasks/object_detection).
If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! The resource should ideally demonstrate something new instead of duplicating an existing resource.
## DetaConfig
[[autodoc]] DetaConfig
## DetaImageProcessor
[[autodoc]] DetaImageProcessor
- preprocess
- post_process_object_detection
## DetaModel
[[autodoc]] DetaModel
- forward
## DetaForObjectDetection
[[autodoc]] DetaForObjectDetection
- forward

View File

@ -0,0 +1,85 @@
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.
-->
*This model was released on 2022-06-02 and added to Hugging Face Transformers on 2023-06-20.*
# EfficientFormer
<div class="flex flex-wrap space-x-1">
<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
</div>
<Tip warning={true}>
This model is in maintenance mode only, we don't accept any new PRs changing its code.
If you run into any issues running this model, please reinstall the last version that supported this model: v4.40.2.
You can do so by running the following command: `pip install -U transformers==4.40.2`.
</Tip>
## Overview
The EfficientFormer model was proposed in [EfficientFormer: Vision Transformers at MobileNet Speed](https://huggingface.co/papers/2206.01191)
by Yanyu Li, Geng Yuan, Yang Wen, Eric Hu, Georgios Evangelidis, Sergey Tulyakov, Yanzhi Wang, Jian Ren. EfficientFormer proposes a
dimension-consistent pure transformer that can be run on mobile devices for dense prediction tasks like image classification, object
detection and semantic segmentation.
The abstract from the paper is the following:
*Vision Transformers (ViT) have shown rapid progress in computer vision tasks, achieving promising results on various benchmarks.
However, due to the massive number of parameters and model design, e.g., attention mechanism, ViT-based models are generally
times slower than lightweight convolutional networks. Therefore, the deployment of ViT for real-time applications is particularly
challenging, especially on resource-constrained hardware such as mobile devices. Recent efforts try to reduce the computation
complexity of ViT through network architecture search or hybrid design with MobileNet block, yet the inference speed is still
unsatisfactory. This leads to an important question: can transformers run as fast as MobileNet while obtaining high performance?
To answer this, we first revisit the network architecture and operators used in ViT-based models and identify inefficient designs.
Then we introduce a dimension-consistent pure transformer (without MobileNet blocks) as a design paradigm.
Finally, we perform latency-driven slimming to get a series of final models dubbed EfficientFormer.
Extensive experiments show the superiority of EfficientFormer in performance and speed on mobile devices.
Our fastest model, EfficientFormer-L1, achieves 79.2% top-1 accuracy on ImageNet-1K with only 1.6 ms inference latency on
iPhone 12 (compiled with CoreML), which { runs as fast as MobileNetV2×1.4 (1.6 ms, 74.7% top-1),} and our largest model,
EfficientFormer-L7, obtains 83.3% accuracy with only 7.0 ms latency. Our work proves that properly designed transformers can
reach extremely low latency on mobile devices while maintaining high performance.*
This model was contributed by [novice03](https://huggingface.co/novice03) and [Bearnardd](https://huggingface.co/Bearnardd).
The original code can be found [here](https://github.com/snap-research/EfficientFormer).
## Documentation resources
- [Image classification task guide](../tasks/image_classification)
## EfficientFormerConfig
[[autodoc]] EfficientFormerConfig
## EfficientFormerImageProcessor
[[autodoc]] EfficientFormerImageProcessor
- preprocess
## EfficientFormerModel
[[autodoc]] EfficientFormerModel
- forward
## EfficientFormerForImageClassification
[[autodoc]] EfficientFormerForImageClassification
- forward
## EfficientFormerForImageClassificationWithTeacher
[[autodoc]] EfficientFormerForImageClassificationWithTeacher
- forward

View File

@ -0,0 +1,97 @@
<!--Copyright 2023 The HuggingFace and Baidu Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.
-->
*This model was released on 2020-12-31 and added to Hugging Face Transformers on 2023-06-20.*
# ErnieM
<div class="flex flex-wrap space-x-1">
<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
</div>
<Tip warning={true}>
This model is in maintenance mode only, we don't accept any new PRs changing its code.
If you run into any issues running this model, please reinstall the last version that supported this model: v4.40.2.
You can do so by running the following command: `pip install -U transformers==4.40.2`.
</Tip>
## Overview
The ErnieM model was proposed in [ERNIE-M: Enhanced Multilingual Representation by Aligning
Cross-lingual Semantics with Monolingual Corpora](https://huggingface.co/papers/2012.15674) by Xuan Ouyang, Shuohuan Wang, Chao Pang, Yu Sun,
Hao Tian, Hua Wu, Haifeng Wang.
The abstract from the paper is the following:
*Recent studies have demonstrated that pre-trained cross-lingual models achieve impressive performance in downstream cross-lingual tasks. This improvement benefits from learning a large amount of monolingual and parallel corpora. Although it is generally acknowledged that parallel corpora are critical for improving the model performance, existing methods are often constrained by the size of parallel corpora, especially for lowresource languages. In this paper, we propose ERNIE-M, a new training method that encourages the model to align the representation of multiple languages with monolingual corpora, to overcome the constraint that the parallel corpus size places on the model performance. Our key insight is to integrate back-translation into the pre-training process. We generate pseudo-parallel sentence pairs on a monolingual corpus to enable the learning of semantic alignments between different languages, thereby enhancing the semantic modeling of cross-lingual models. Experimental results show that ERNIE-M outperforms existing cross-lingual models and delivers new state-of-the-art results in various cross-lingual downstream tasks.*
This model was contributed by [Susnato Dhar](https://huggingface.co/susnato). The original code can be found [here](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/paddlenlp/transformers/ernie_m).
## Usage tips
- Ernie-M is a BERT-like model so it is a stacked Transformer Encoder.
- Instead of using MaskedLM for pretraining (like BERT) the authors used two novel techniques: `Cross-attention Masked Language Modeling` and `Back-translation Masked Language Modeling`. For now these two LMHead objectives are not implemented here.
- It is a multilingual language model.
- Next Sentence Prediction was not used in pretraining process.
## Resources
- [Text classification task guide](../tasks/sequence_classification)
- [Token classification task guide](../tasks/token_classification)
- [Question answering task guide](../tasks/question_answering)
- [Multiple choice task guide](../tasks/multiple_choice)
## ErnieMConfig
[[autodoc]] ErnieMConfig
## ErnieMTokenizer
[[autodoc]] ErnieMTokenizer
- build_inputs_with_special_tokens
- get_special_tokens_mask
- create_token_type_ids_from_sequences
- save_vocabulary
## ErnieMModel
[[autodoc]] ErnieMModel
- forward
## ErnieMForSequenceClassification
[[autodoc]] ErnieMForSequenceClassification
- forward
## ErnieMForMultipleChoice
[[autodoc]] ErnieMForMultipleChoice
- forward
## ErnieMForTokenClassification
[[autodoc]] ErnieMForTokenClassification
- forward
## ErnieMForQuestionAnswering
[[autodoc]] ErnieMForQuestionAnswering
- forward
## ErnieMForInformationExtraction
[[autodoc]] ErnieMForInformationExtraction
- forward

View File

@ -0,0 +1,145 @@
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.
-->
*This model was released on 2023-02-07 and added to Hugging Face Transformers on 2023-06-20.*
# GPTSAN-japanese
<div class="flex flex-wrap space-x-1">
<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
</div>
<Tip warning={true}>
This model is in maintenance mode only, we don't accept any new PRs changing its code.
If you run into any issues running this model, please reinstall the last version that supported this model: v4.40.2.
You can do so by running the following command: `pip install -U transformers==4.40.2`.
</Tip>
## Overview
The [GPTSAN-japanese](https://huggingface.co/Tanrei/GPTSAN-japanese) model was released in the repository by Toshiyuki Sakamoto (tanreinama).
GPTSAN is a Japanese language model using Switch Transformer. It has the same structure as the model introduced as Prefix LM
in the T5 paper, and support both Text Generation and Masked Language Modeling tasks. These basic tasks similarly can
fine-tune for translation or summarization.
### Usage example
The `generate()` method can be used to generate text using GPTSAN-Japanese model.
```python
>>> from transformers import AutoModel, AutoTokenizer
from accelerate import Accelerator
>>> import torch
>>> device = Accelerator().device
>>> tokenizer = AutoTokenizer.from_pretrained("Tanrei/GPTSAN-japanese")
>>> model = AutoModel.from_pretrained("Tanrei/GPTSAN-japanese").to(device)
>>> x_tok = tokenizer("は、", prefix_text="織田信長", return_tensors="pt")
>>> torch.manual_seed(0)
>>> gen_tok = model.generate(x_tok.input_ids.to(model.device), token_type_ids=x_tok.token_type_ids.to(model.device), max_new_tokens=20)
>>> tokenizer.decode(gen_tok[0])
'織田信長は、2004年に『戦国BASARA』のために、豊臣秀吉'
```
## GPTSAN Features
GPTSAN has some unique features. It has a model structure of Prefix-LM. It works as a shifted Masked Language Model for Prefix Input tokens. Un-prefixed inputs behave like normal generative models.
The Spout vector is a GPTSAN specific input. Spout is pre-trained with random inputs, but you can specify a class of text or an arbitrary vector during fine-tuning. This allows you to indicate the tendency of the generated text.
GPTSAN has a sparse Feed Forward based on Switch-Transformer. You can also add other layers and train them partially. See the original GPTSAN repository for details.
### Prefix-LM Model
GPTSAN has the structure of the model named Prefix-LM in the `T5` paper. (The original GPTSAN repository calls it `hybrid`)
In GPTSAN, the `Prefix` part of Prefix-LM, that is, the input position that can be referenced by both tokens, can be specified with any length.
Arbitrary lengths can also be specified differently for each batch.
This length applies to the text entered in `prefix_text` for the tokenizer.
The tokenizer returns the mask of the `Prefix` part of Prefix-LM as `token_type_ids`.
The model treats the part where `token_type_ids` is 1 as a `Prefix` part, that is, the input can refer to both tokens before and after.
## Usage tips
Specifying the Prefix part is done with a mask passed to self-attention.
When token_type_ids=None or all zero, it is equivalent to regular causal mask
for example:
>>> x_token = tokenizer("アイウエ")
```text
input_ids: | SOT | SEG | ア | イ | ウ | エ |
token_type_ids: | 1 | 0 | 0 | 0 | 0 | 0 |
prefix_lm_mask:
SOT | 1 0 0 0 0 0 |
SEG | 1 1 0 0 0 0 |
ア | 1 1 1 0 0 0 |
イ | 1 1 1 1 0 0 |
ウ | 1 1 1 1 1 0 |
エ | 1 1 1 1 1 1 |
```
>>> x_token = tokenizer("", prefix_text="アイウエ")
```text
input_ids: | SOT | ア | イ | ウ | エ | SEG |
token_type_ids: | 1 | 1 | 1 | 1 | 1 | 0 |
prefix_lm_mask:
SOT | 1 1 1 1 1 0 |
ア | 1 1 1 1 1 0 |
イ | 1 1 1 1 1 0 |
ウ | 1 1 1 1 1 0 |
エ | 1 1 1 1 1 0 |
SEG | 1 1 1 1 1 1 |
```
>>> x_token = tokenizer("ウエ", prefix_text="アイ")
```text
input_ids: | SOT | ア | イ | SEG | ウ | エ |
token_type_ids: | 1 | 1 | 1 | 0 | 0 | 0 |
prefix_lm_mask:
SOT | 1 1 1 0 0 0 |
ア | 1 1 1 0 0 0 |
イ | 1 1 1 0 0 0 |
SEG | 1 1 1 1 0 0 |
ウ | 1 1 1 1 1 0 |
エ | 1 1 1 1 1 1 |
```
### Spout Vector
A Spout Vector is a special vector for controlling text generation.
This vector is treated as the first embedding in self-attention to bring extraneous attention to the generated tokens.
In the pre-trained model published from `Tanrei/GPTSAN-japanese`, the Spout Vector is a 128-dimensional vector that passes through 8 fully connected layers in the model and is projected into the space acting as external attention.
The Spout Vector projected by the fully connected layer is split to be passed to all self-attentions.
## GPTSanJapaneseConfig
[[autodoc]] GPTSanJapaneseConfig
## GPTSanJapaneseTokenizer
[[autodoc]] GPTSanJapaneseTokenizer
## GPTSanJapaneseModel
[[autodoc]] GPTSanJapaneseModel
## GPTSanJapaneseForConditionalGeneration
[[autodoc]] GPTSanJapaneseForConditionalGeneration
- forward

View File

@ -0,0 +1,60 @@
<!--Copyright 2022 The HuggingFace Team and Microsoft. All rights reserved.
Licensed under the MIT License; you may not use this file except in compliance with
the License.
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.
-->
*This model was released on 2021-06-09 and added to Hugging Face Transformers on 2023-06-20.*
# Graphormer
<div class="flex flex-wrap space-x-1">
<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
</div>
<Tip warning={true}>
This model is in maintenance mode only, we don't accept any new PRs changing its code.
If you run into any issues running this model, please reinstall the last version that supported this model: v4.40.2.
You can do so by running the following command: `pip install -U transformers==4.40.2`.
</Tip>
## Overview
The Graphormer model was proposed in [Do Transformers Really Perform Bad for Graph Representation?](https://huggingface.co/papers/2106.05234) by
Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen and Tie-Yan Liu. It is a Graph Transformer model, modified to allow computations on graphs instead of text sequences by generating embeddings and features of interest during preprocessing and collation, then using a modified attention.
The abstract from the paper is the following:
*The Transformer architecture has become a dominant choice in many domains, such as natural language processing and computer vision. Yet, it has not achieved competitive performance on popular leaderboards of graph-level prediction compared to mainstream GNN variants. Therefore, it remains a mystery how Transformers could perform well for graph representation learning. In this paper, we solve this mystery by presenting Graphormer, which is built upon the standard Transformer architecture, and could attain excellent results on a broad range of graph representation learning tasks, especially on the recent OGB Large-Scale Challenge. Our key insight to utilizing Transformer in the graph is the necessity of effectively encoding the structural information of a graph into the model. To this end, we propose several simple yet effective structural encoding methods to help Graphormer better model graph-structured data. Besides, we mathematically characterize the expressive power of Graphormer and exhibit that with our ways of encoding the structural information of graphs, many popular GNN variants could be covered as the special cases of Graphormer.*
This model was contributed by [clefourrier](https://huggingface.co/clefourrier). The original code can be found [here](https://github.com/microsoft/Graphormer).
## Usage tips
This model will not work well on large graphs (more than 100 nodes/edges), as it will make the memory explode.
You can reduce the batch size, increase your RAM, or decrease the `UNREACHABLE_NODE_DISTANCE` parameter in algos_graphormer.pyx, but it will be hard to go above 700 nodes/edges.
This model does not use a tokenizer, but instead a special collator during training.
## GraphormerConfig
[[autodoc]] GraphormerConfig
## GraphormerModel
[[autodoc]] GraphormerModel
- forward
## GraphormerForGraphClassification
[[autodoc]] GraphormerForGraphClassification
- forward

View File

@ -0,0 +1,99 @@
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.
-->
*This model was released on 2020-04-30 and added to Hugging Face Transformers on 2023-06-20.*
# Jukebox
<div class="flex flex-wrap space-x-1">
<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
</div>
<Tip warning={true}>
This model is in maintenance mode only, we don't accept any new PRs changing its code.
If you run into any issues running this model, please reinstall the last version that supported this model: v4.40.2.
You can do so by running the following command: `pip install -U transformers==4.40.2`.
</Tip>
## Overview
The Jukebox model was proposed in [Jukebox: A generative model for music](https://huggingface.co/papers/2005.00341)
by Prafulla Dhariwal, Heewoo Jun, Christine Payne, Jong Wook Kim, Alec Radford,
Ilya Sutskever. It introduces a generative music model which can produce minute long samples that can be conditioned on
an artist, genres and lyrics.
The abstract from the paper is the following:
*We introduce Jukebox, a model that generates music with singing in the raw audio domain. We tackle the long context of raw audio using a multiscale VQ-VAE to compress it to discrete codes, and modeling those using autoregressive Transformers. We show that the combined model at scale can generate high-fidelity and diverse songs with coherence up to multiple minutes. We can condition on artist and genre to steer the musical and vocal style, and on unaligned lyrics to make the singing more controllable. We are releasing thousands of non cherry-picked samples, along with model weights and code.*
As shown on the following figure, Jukebox is made of 3 `priors` which are decoder only models. They follow the architecture described in [Generating Long Sequences with Sparse Transformers](https://huggingface.co/papers/1904.10509), modified to support longer context length.
First, a autoencoder is used to encode the text lyrics. Next, the first (also called `top_prior`) prior attends to the last hidden states extracted from the lyrics encoder. The priors are linked to the previous priors respectively via an `AudioConditioner` module. The`AudioConditioner` upsamples the outputs of the previous prior to raw tokens at a certain audio frame per second resolution.
The metadata such as *artist, genre and timing* are passed to each prior, in the form of a start token and positional embedding for the timing data. The hidden states are mapped to the closest codebook vector from the VQVAE in order to convert them to raw audio.
![JukeboxModel](https://gist.githubusercontent.com/ArthurZucker/92c1acaae62ebf1b6a951710bdd8b6af/raw/c9c517bf4eff61393f6c7dec9366ef02bdd059a3/jukebox.svg)
This model was contributed by [Arthur Zucker](https://huggingface.co/ArthurZ).
The original code can be found [here](https://github.com/openai/jukebox).
## Usage tips
- This model only supports inference. This is for a few reasons, mostly because it requires a crazy amount of memory to train. Feel free to open a PR and add what's missing to have a full integration with the hugging face trainer!
- This model is very slow, and takes 8h to generate a minute long audio using the 5b top prior on a V100 GPU. In order automaticallay handle the device on which the model should execute, use `accelerate`.
- Contrary to the paper, the order of the priors goes from `0` to `1` as it felt more intuitive : we sample starting from `0`.
- Primed sampling (conditioning the sampling on raw audio) requires more memory than ancestral sampling and should be used with `fp16` set to `True`.
This model was contributed by [Arthur Zucker](https://huggingface.co/ArthurZ).
The original code can be found [here](https://github.com/openai/jukebox).
## JukeboxConfig
[[autodoc]] JukeboxConfig
## JukeboxPriorConfig
[[autodoc]] JukeboxPriorConfig
## JukeboxVQVAEConfig
[[autodoc]] JukeboxVQVAEConfig
## JukeboxTokenizer
[[autodoc]] JukeboxTokenizer
- save_vocabulary
## JukeboxModel
[[autodoc]] JukeboxModel
- ancestral_sample
- primed_sample
- continue_sample
- upsample
- _sample
## JukeboxPrior
[[autodoc]] JukeboxPrior
- sample
- forward
## JukeboxVQVAE
[[autodoc]] JukeboxVQVAE
- forward
- encode
- decode

View File

@ -0,0 +1,84 @@
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.
-->
*This model was released on 2021-10-30 and added to Hugging Face Transformers on 2023-06-20.*
# M-CTC-T
<div class="flex flex-wrap space-x-1">
<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
</div>
<Tip warning={true}>
This model is in maintenance mode only, so we won't accept any new PRs changing its code.
If you run into any issues running this model, please reinstall the last version that supported this model: v4.30.0.
You can do so by running the following command: `pip install -U transformers==4.30.0`.
</Tip>
## Overview
The M-CTC-T model was proposed in [Pseudo-Labeling For Massively Multilingual Speech Recognition](https://huggingface.co/papers/2111.00161) by Loren Lugosch, Tatiana Likhomanenko, Gabriel Synnaeve, and Ronan Collobert. The model is a 1B-param transformer encoder, with a CTC head over 8065 character labels and a language identification head over 60 language ID labels. It is trained on Common Voice (version 6.1, December 2020 release) and VoxPopuli. After training on Common Voice and VoxPopuli, the model is trained on Common Voice only. The labels are unnormalized character-level transcripts (punctuation and capitalization are not removed). The model takes as input Mel filterbank features from a 16Khz audio signal.
The abstract from the paper is the following:
*Semi-supervised learning through pseudo-labeling has become a staple of state-of-the-art monolingual
speech recognition systems. In this work, we extend pseudo-labeling to massively multilingual speech
recognition with 60 languages. We propose a simple pseudo-labeling recipe that works well even
with low-resource languages: train a supervised multilingual model, fine-tune it with semi-supervised
learning on a target language, generate pseudo-labels for that language, and train a final model using
pseudo-labels for all languages, either from scratch or by fine-tuning. Experiments on the labeled
Common Voice and unlabeled VoxPopuli datasets show that our recipe can yield a model with better
performance for many languages that also transfers well to LibriSpeech.*
This model was contributed by [cwkeam](https://huggingface.co/cwkeam). The original code can be found [here](https://github.com/flashlight/wav2letter/tree/main/recipes/mling_pl).
## Usage tips
The PyTorch version of this model is only available in torch 1.9 and higher.
## Resources
- [Automatic speech recognition task guide](../tasks/asr)
## MCTCTConfig
[[autodoc]] MCTCTConfig
## MCTCTFeatureExtractor
[[autodoc]] MCTCTFeatureExtractor
- __call__
## MCTCTProcessor
[[autodoc]] MCTCTProcessor
- __call__
- from_pretrained
- save_pretrained
- batch_decode
- decode
## MCTCTModel
[[autodoc]] MCTCTModel
- forward
## MCTCTForCTC
[[autodoc]] MCTCTForCTC
- forward

View File

@ -0,0 +1,94 @@
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.
-->
*This model was released on 2022-09-21 and added to Hugging Face Transformers on 2023-06-20.*
# MEGA
<div class="flex flex-wrap space-x-1">
<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
</div>
<Tip warning={true}>
This model is in maintenance mode only, we don't accept any new PRs changing its code.
If you run into any issues running this model, please reinstall the last version that supported this model: v4.40.2.
You can do so by running the following command: `pip install -U transformers==4.40.2`.
</Tip>
## Overview
The MEGA model was proposed in [Mega: Moving Average Equipped Gated Attention](https://huggingface.co/papers/2209.10655) by Xuezhe Ma, Chunting Zhou, Xiang Kong, Junxian He, Liangke Gui, Graham Neubig, Jonathan May, and Luke Zettlemoyer.
MEGA proposes a new approach to self-attention with each encoder layer having a multi-headed exponential moving average in addition to a single head of standard dot-product attention, giving the attention mechanism
stronger positional biases. This allows MEGA to perform competitively to Transformers on standard benchmarks including LRA
while also having significantly fewer parameters. MEGA's compute efficiency allows it to scale to very long sequences, making it an
attractive option for long-document NLP tasks.
The abstract from the paper is the following:
*The design choices in the Transformer attention mechanism, including weak inductive bias and quadratic computational complexity, have limited its application for modeling long sequences. In this paper, we introduce Mega, a simple, theoretically grounded, single-head gated attention mechanism equipped with (exponential) moving average to incorporate inductive bias of position-aware local dependencies into the position-agnostic attention mechanism. We further propose a variant of Mega that offers linear time and space complexity yet yields only minimal quality loss, by efficiently splitting the whole sequence into multiple chunks with fixed length. Extensive experiments on a wide range of sequence modeling benchmarks, including the Long Range Arena, neural machine translation, auto-regressive language modeling, and image and speech classification, show that Mega achieves significant improvements over other sequence models, including variants of Transformers and recent state space models.*
This model was contributed by [mnaylor](https://huggingface.co/mnaylor).
The original code can be found [here](https://github.com/facebookresearch/mega).
## Usage tips
- MEGA can perform quite well with relatively few parameters. See Appendix D in the MEGA paper for examples of architectural specs which perform well in various settings. If using MEGA as a decoder, be sure to set `bidirectional=False` to avoid errors with default bidirectional.
- Mega-chunk is a variant of mega that reduces time and spaces complexity from quadratic to linear. Utilize chunking with MegaConfig.use_chunking and control chunk size with MegaConfig.chunk_size
## Implementation Notes
- The original implementation of MEGA had an inconsistent expectation of attention masks for padding and causal self-attention between the softmax attention and Laplace/squared ReLU method. This implementation addresses that inconsistency.
- The original implementation did not include token type embeddings; this implementation adds support for these, with the option controlled by MegaConfig.add_token_type_embeddings
## MegaConfig
[[autodoc]] MegaConfig
## MegaModel
[[autodoc]] MegaModel
- forward
## MegaForCausalLM
[[autodoc]] MegaForCausalLM
- forward
## MegaForMaskedLM
[[autodoc]] MegaForMaskedLM
- forward
## MegaForSequenceClassification
[[autodoc]] MegaForSequenceClassification
- forward
## MegaForMultipleChoice
[[autodoc]] MegaForMultipleChoice
- forward
## MegaForTokenClassification
[[autodoc]] MegaForTokenClassification
- forward
## MegaForQuestionAnswering
[[autodoc]] MegaForQuestionAnswering
- forward

View File

@ -0,0 +1,101 @@
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.
-->
*This model was released on 2022-04-14 and added to Hugging Face Transformers on 2023-06-20.*
# Neighborhood Attention Transformer
<div class="flex flex-wrap space-x-1">
<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
</div>
<Tip warning={true}>
This model is in maintenance mode only, we don't accept any new PRs changing its code.
If you run into any issues running this model, please reinstall the last version that supported this model: v4.40.2.
You can do so by running the following command: `pip install -U transformers==4.40.2`.
</Tip>
## Overview
NAT was proposed in [Neighborhood Attention Transformer](https://huggingface.co/papers/2204.07143)
by Ali Hassani, Steven Walton, Jiachen Li, Shen Li, and Humphrey Shi.
It is a hierarchical vision transformer based on Neighborhood Attention, a sliding-window self attention pattern.
The abstract from the paper is the following:
*We present Neighborhood Attention (NA), the first efficient and scalable sliding-window attention mechanism for vision.
NA is a pixel-wise operation, localizing self attention (SA) to the nearest neighboring pixels, and therefore enjoys a
linear time and space complexity compared to the quadratic complexity of SA. The sliding-window pattern allows NA's
receptive field to grow without needing extra pixel shifts, and preserves translational equivariance, unlike
Swin Transformer's Window Self Attention (WSA). We develop NATTEN (Neighborhood Attention Extension), a Python package
with efficient C++ and CUDA kernels, which allows NA to run up to 40% faster than Swin's WSA while using up to 25% less
memory. We further present Neighborhood Attention Transformer (NAT), a new hierarchical transformer design based on NA
that boosts image classification and downstream vision performance. Experimental results on NAT are competitive;
NAT-Tiny reaches 83.2% top-1 accuracy on ImageNet, 51.4% mAP on MS-COCO and 48.4% mIoU on ADE20K, which is 1.9%
ImageNet accuracy, 1.0% COCO mAP, and 2.6% ADE20K mIoU improvement over a Swin model with similar size.*
<img
src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/neighborhood-attention-pattern.jpg"
alt="drawing" width="600"/>
<small> Neighborhood Attention compared to other attention patterns.
Taken from the <a href="https://huggingface.co/papers/2204.07143">original paper</a>.</small>
This model was contributed by [Ali Hassani](https://huggingface.co/alihassanijr).
The original code can be found [here](https://github.com/SHI-Labs/Neighborhood-Attention-Transformer).
## Usage tips
- One can use the [`AutoImageProcessor`] API to prepare images for the model.
- NAT can be used as a *backbone*. When `output_hidden_states = True`,
it will output both `hidden_states` and `reshaped_hidden_states`.
The `reshaped_hidden_states` have a shape of `(batch, num_channels, height, width)` rather than
`(batch_size, height, width, num_channels)`.
Notes:
- NAT depends on [NATTEN](https://github.com/SHI-Labs/NATTEN/)'s implementation of Neighborhood Attention.
You can install it with pre-built wheels for Linux by referring to [shi-labs.com/natten](https://shi-labs.com/natten),
or build on your system by running `pip install natten`.
Note that the latter will likely take time to compile. NATTEN does not support Windows devices yet.
- Patch size of 4 is only supported at the moment.
## Resources
A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with NAT.
<PipelineTag pipeline="image-classification"/>
- [`NatForImageClassification`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/pytorch/image-classification) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/image_classification.ipynb).
- See also: [Image classification task guide](../tasks/image_classification)
If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! The resource should ideally demonstrate something new instead of duplicating an existing resource.
## NatConfig
[[autodoc]] NatConfig
## NatModel
[[autodoc]] NatModel
- forward
## NatForImageClassification
[[autodoc]] NatForImageClassification
- forward

View File

@ -0,0 +1,101 @@
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.
-->
*This model was released on 2019-08-31 and added to Hugging Face Transformers on 2023-06-20.*
# Nezha
<div class="flex flex-wrap space-x-1">
<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
</div>
<Tip warning={true}>
This model is in maintenance mode only, we don't accept any new PRs changing its code.
If you run into any issues running this model, please reinstall the last version that supported this model: v4.40.2.
You can do so by running the following command: `pip install -U transformers==4.40.2`.
</Tip>
## Overview
The Nezha model was proposed in [NEZHA: Neural Contextualized Representation for Chinese Language Understanding](https://huggingface.co/papers/1909.00204) by Junqiu Wei et al.
The abstract from the paper is the following:
*The pre-trained language models have achieved great successes in various natural language understanding (NLU) tasks
due to its capacity to capture the deep contextualized information in text by pre-training on large-scale corpora.
In this technical report, we present our practice of pre-training language models named NEZHA (NEural contextualiZed
representation for CHinese lAnguage understanding) on Chinese corpora and finetuning for the Chinese NLU tasks.
The current version of NEZHA is based on BERT with a collection of proven improvements, which include Functional
Relative Positional Encoding as an effective positional encoding scheme, Whole Word Masking strategy,
Mixed Precision Training and the LAMB Optimizer in training the models. The experimental results show that NEZHA
achieves the state-of-the-art performances when finetuned on several representative Chinese tasks, including
named entity recognition (People's Daily NER), sentence matching (LCQMC), Chinese sentiment classification (ChnSenti)
and natural language inference (XNLI).*
This model was contributed by [sijunhe](https://huggingface.co/sijunhe). The original code can be found [here](https://github.com/huawei-noah/Pretrained-Language-Model/tree/master/NEZHA-PyTorch).
## Resources
- [Text classification task guide](../tasks/sequence_classification)
- [Token classification task guide](../tasks/token_classification)
- [Question answering task guide](../tasks/question_answering)
- [Masked language modeling task guide](../tasks/masked_language_modeling)
- [Multiple choice task guide](../tasks/multiple_choice)
## NezhaConfig
[[autodoc]] NezhaConfig
## NezhaModel
[[autodoc]] NezhaModel
- forward
## NezhaForPreTraining
[[autodoc]] NezhaForPreTraining
- forward
## NezhaForMaskedLM
[[autodoc]] NezhaForMaskedLM
- forward
## NezhaForNextSentencePrediction
[[autodoc]] NezhaForNextSentencePrediction
- forward
## NezhaForSequenceClassification
[[autodoc]] NezhaForSequenceClassification
- forward
## NezhaForMultipleChoice
[[autodoc]] NezhaForMultipleChoice
- forward
## NezhaForTokenClassification
[[autodoc]] NezhaForTokenClassification
- forward
## NezhaForQuestionAnswering
[[autodoc]] NezhaForQuestionAnswering
- forward

View File

@ -0,0 +1,66 @@
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.
-->
*This model was released on 2023-04-16 and added to Hugging Face Transformers on 2023-06-20.*
# Open-Llama
<div class="flex flex-wrap space-x-1">
<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
</div>
<Tip warning={true}>
This model is in maintenance mode only, we don't accept any new PRs changing its code.
If you run into any issues running this model, please reinstall the last version that supported this model: v4.31.0.
You can do so by running the following command: `pip install -U transformers==4.31.0`.
</Tip>
<Tip warning={true}>
This model differs from the [OpenLLaMA models](https://huggingface.co/models?search=openllama) on the Hugging Face Hub, which primarily use the [LLaMA](llama) architecture.
</Tip>
## Overview
The Open-Llama model was proposed in the open source Open-Llama project by community developer s-JoL.
The model is mainly based on LLaMA with some modifications, incorporating memory-efficient attention from Xformers, stable embedding from Bloom, and shared input-output embedding from PaLM.
And the model is pre-trained on both Chinese and English, which gives it better performance on Chinese language tasks.
This model was contributed by [s-JoL](https://huggingface.co/s-JoL).
The original code was released on GitHub by [s-JoL](https://github.com/s-JoL), but is now removed.
## OpenLlamaConfig
[[autodoc]] OpenLlamaConfig
## OpenLlamaModel
[[autodoc]] OpenLlamaModel
- forward
## OpenLlamaForCausalLM
[[autodoc]] OpenLlamaForCausalLM
- forward
## OpenLlamaForSequenceClassification
[[autodoc]] OpenLlamaForSequenceClassification
- forward

View File

@ -0,0 +1,183 @@
<!--Copyright 2021 NVIDIA Corporation and The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.
-->
*This model was released on 2020-04-20 and added to Hugging Face Transformers on 2023-06-20.*
# QDQBERT
<div class="flex flex-wrap space-x-1">
<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
</div>
<Tip warning={true}>
This model is in maintenance mode only, we don't accept any new PRs changing its code.
If you run into any issues running this model, please reinstall the last version that supported this model: v4.40.2.
You can do so by running the following command: `pip install -U transformers==4.40.2`.
</Tip>
## Overview
The QDQBERT model can be referenced in [Integer Quantization for Deep Learning Inference: Principles and Empirical
Evaluation](https://huggingface.co/papers/2004.09602) by Hao Wu, Patrick Judd, Xiaojie Zhang, Mikhail Isaev and Paulius
Micikevicius.
The abstract from the paper is the following:
*Quantization techniques can reduce the size of Deep Neural Networks and improve inference latency and throughput by
taking advantage of high throughput integer instructions. In this paper we review the mathematical aspects of
quantization parameters and evaluate their choices on a wide range of neural network models for different application
domains, including vision, speech, and language. We focus on quantization techniques that are amenable to acceleration
by processors with high-throughput integer math pipelines. We also present a workflow for 8-bit quantization that is
able to maintain accuracy within 1% of the floating-point baseline on all networks studied, including models that are
more difficult to quantize, such as MobileNets and BERT-large.*
This model was contributed by [shangz](https://huggingface.co/shangz).
## Usage tips
- QDQBERT model adds fake quantization operations (pair of QuantizeLinear/DequantizeLinear ops) to (i) linear layer
inputs and weights, (ii) matmul inputs, (iii) residual add inputs, in BERT model.
- QDQBERT requires the dependency of [Pytorch Quantization Toolkit](https://github.com/NVIDIA/TensorRT/tree/master/tools/pytorch-quantization). To install `pip install pytorch-quantization --extra-index-url https://pypi.ngc.nvidia.com`
- QDQBERT model can be loaded from any checkpoint of HuggingFace BERT model (for example *google-bert/bert-base-uncased*), and
perform Quantization Aware Training/Post Training Quantization.
- A complete example of using QDQBERT model to perform Quatization Aware Training and Post Training Quantization for
SQUAD task can be found at https://github.com/huggingface/transformers-research-projects/tree/main/quantization-qdqbert.
### Set default quantizers
QDQBERT model adds fake quantization operations (pair of QuantizeLinear/DequantizeLinear ops) to BERT by
`TensorQuantizer` in [Pytorch Quantization Toolkit](https://github.com/NVIDIA/TensorRT/tree/master/tools/pytorch-quantization). `TensorQuantizer` is the module
for quantizing tensors, with `QuantDescriptor` defining how the tensor should be quantized. Refer to [Pytorch
Quantization Toolkit userguide](https://docs.nvidia.com/deeplearning/tensorrt/pytorch-quantization-toolkit/docs/userguide.html) for more details.
Before creating QDQBERT model, one has to set the default `QuantDescriptor` defining default tensor quantizers.
Example:
```python
>>> import pytorch_quantization.nn as quant_nn
>>> from pytorch_quantization.tensor_quant import QuantDescriptor
>>> # The default tensor quantizer is set to use Max calibration method
>>> input_desc = QuantDescriptor(num_bits=8, calib_method="max")
>>> # The default tensor quantizer is set to be per-channel quantization for weights
>>> weight_desc = QuantDescriptor(num_bits=8, axis=((0,)))
>>> quant_nn.QuantLinear.set_default_quant_desc_input(input_desc)
>>> quant_nn.QuantLinear.set_default_quant_desc_weight(weight_desc)
```
### Calibration
Calibration is the terminology of passing data samples to the quantizer and deciding the best scaling factors for
tensors. After setting up the tensor quantizers, one can use the following example to calibrate the model:
```python
>>> # Find the TensorQuantizer and enable calibration
>>> for name, module in model.named_modules():
... if name.endswith("_input_quantizer"):
... module.enable_calib()
... module.disable_quant() # Use full precision data to calibrate
>>> # Feeding data samples
>>> model(x)
>>> # ...
>>> # Finalize calibration
>>> for name, module in model.named_modules():
... if name.endswith("_input_quantizer"):
... module.load_calib_amax()
... module.enable_quant()
>>> # If running on accelerator, it needs to call `.to(xx)` again because new tensors will be created by calibration process
>>> from accelerate import Accelerator
>>> device = Accelerator().device
>>> model.to(device)
>>> # Keep running the quantized model
>>> # ...
```
### Export to ONNX
The goal of exporting to ONNX is to deploy inference by [TensorRT](https://developer.nvidia.com/tensorrt). Fake
quantization will be broken into a pair of QuantizeLinear/DequantizeLinear ONNX ops. After setting static member of
TensorQuantizer to use Pytorch's own fake quantization functions, fake quantized model can be exported to ONNX, follow
the instructions in [torch.onnx](https://pytorch.org/docs/stable/onnx.html). Example:
```python
>>> from pytorch_quantization.nn import TensorQuantizer
>>> TensorQuantizer.use_fb_fake_quant = True
>>> # Load the calibrated model
>>> ...
>>> # ONNX export
>>> torch.onnx.export(...)
```
## Resources
- [Text classification task guide](../tasks/sequence_classification)
- [Token classification task guide](../tasks/token_classification)
- [Question answering task guide](../tasks/question_answering)
- [Causal language modeling task guide](../tasks/language_modeling)
- [Masked language modeling task guide](../tasks/masked_language_modeling)
- [Multiple choice task guide](../tasks/multiple_choice)
## QDQBertConfig
[[autodoc]] QDQBertConfig
## QDQBertModel
[[autodoc]] QDQBertModel
- forward
## QDQBertLMHeadModel
[[autodoc]] QDQBertLMHeadModel
- forward
## QDQBertForMaskedLM
[[autodoc]] QDQBertForMaskedLM
- forward
## QDQBertForSequenceClassification
[[autodoc]] QDQBertForSequenceClassification
- forward
## QDQBertForNextSentencePrediction
[[autodoc]] QDQBertForNextSentencePrediction
- forward
## QDQBertForMultipleChoice
[[autodoc]] QDQBertForMultipleChoice
- forward
## QDQBertForTokenClassification
[[autodoc]] QDQBertForTokenClassification
- forward
## QDQBertForQuestionAnswering
[[autodoc]] QDQBertForQuestionAnswering
- forward

View File

@ -0,0 +1,102 @@
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.
-->
*This model was released on 2020-02-10 and added to Hugging Face Transformers on 2023-06-20.*
# REALM
<div class="flex flex-wrap space-x-1">
<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
</div>
<Tip warning={true}>
This model is in maintenance mode only, we don't accept any new PRs changing its code.
If you run into any issues running this model, please reinstall the last version that supported this model: v4.40.2.
You can do so by running the following command: `pip install -U transformers==4.40.2`.
</Tip>
## Overview
The REALM model was proposed in [REALM: Retrieval-Augmented Language Model Pre-Training](https://huggingface.co/papers/2002.08909) by Kelvin Guu, Kenton Lee, Zora Tung, Panupong Pasupat and Ming-Wei Chang. It's a
retrieval-augmented language model that firstly retrieves documents from a textual knowledge corpus and then
utilizes retrieved documents to process question answering tasks.
The abstract from the paper is the following:
*Language model pre-training has been shown to capture a surprising amount of world knowledge, crucial for NLP tasks
such as question answering. However, this knowledge is stored implicitly in the parameters of a neural network,
requiring ever-larger networks to cover more facts. To capture knowledge in a more modular and interpretable way, we
augment language model pre-training with a latent knowledge retriever, which allows the model to retrieve and attend
over documents from a large corpus such as Wikipedia, used during pre-training, fine-tuning and inference. For the
first time, we show how to pre-train such a knowledge retriever in an unsupervised manner, using masked language
modeling as the learning signal and backpropagating through a retrieval step that considers millions of documents. We
demonstrate the effectiveness of Retrieval-Augmented Language Model pre-training (REALM) by fine-tuning on the
challenging task of Open-domain Question Answering (Open-QA). We compare against state-of-the-art models for both
explicit and implicit knowledge storage on three popular Open-QA benchmarks, and find that we outperform all previous
methods by a significant margin (4-16% absolute accuracy), while also providing qualitative benefits such as
interpretability and modularity.*
This model was contributed by [qqaatw](https://huggingface.co/qqaatw). The original code can be found
[here](https://github.com/google-research/language/tree/master/language/realm).
## RealmConfig
[[autodoc]] RealmConfig
## RealmTokenizer
[[autodoc]] RealmTokenizer
- build_inputs_with_special_tokens
- get_special_tokens_mask
- create_token_type_ids_from_sequences
- save_vocabulary
- batch_encode_candidates
## RealmTokenizerFast
[[autodoc]] RealmTokenizerFast
- batch_encode_candidates
## RealmRetriever
[[autodoc]] RealmRetriever
## RealmEmbedder
[[autodoc]] RealmEmbedder
- forward
## RealmScorer
[[autodoc]] RealmScorer
- forward
## RealmKnowledgeAugEncoder
[[autodoc]] RealmKnowledgeAugEncoder
- forward
## RealmReader
[[autodoc]] RealmReader
- forward
## RealmForOpenQA
[[autodoc]] RealmForOpenQA
- block_embedding_to
- forward

View File

@ -0,0 +1,57 @@
<!--Copyright 2020 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.
-->
*This model was released on 2020-06-12 and added to Hugging Face Transformers on 2023-06-20.*
# RetriBERT
<div class="flex flex-wrap space-x-1">
<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
</div>
<Tip warning={true}>
This model is in maintenance mode only, so we won't accept any new PRs changing its code.
If you run into any issues running this model, please reinstall the last version that supported this model: v4.30.0.
You can do so by running the following command: `pip install -U transformers==4.30.0`.
</Tip>
## Overview
The [RetriBERT](https://huggingface.co/yjernite/retribert-base-uncased/tree/main) model was proposed in the blog post [Explain Anything Like I'm Five: A Model for Open Domain Long Form
Question Answering](https://yjernite.github.io/lfqa.html). RetriBERT is a small model that uses either a single or
pair of BERT encoders with lower-dimension projection for dense semantic indexing of text.
This model was contributed by [yjernite](https://huggingface.co/yjernite). Code to train and use the model can be
found [here](https://github.com/huggingface/transformers/tree/main/examples/research-projects/distillation).
## RetriBertConfig
[[autodoc]] RetriBertConfig
## RetriBertTokenizer
[[autodoc]] RetriBertTokenizer
## RetriBertTokenizerFast
[[autodoc]] RetriBertTokenizerFast
## RetriBertModel
[[autodoc]] RetriBertModel
- forward

View File

@ -0,0 +1,133 @@
<!--Copyright 2021 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.
-->
*This model was released on 2021-04-14 and added to Hugging Face Transformers on 2023-06-20.*
# Speech2Text2
<Tip warning={true}>
This model is in maintenance mode only, we don't accept any new PRs changing its code.
If you run into any issues running this model, please reinstall the last version that supported this model: v4.40.2.
You can do so by running the following command: `pip install -U transformers==4.40.2`.
</Tip>
## Overview
The Speech2Text2 model is used together with [Wav2Vec2](wav2vec2) for Speech Translation models proposed in
[Large-Scale Self- and Semi-Supervised Learning for Speech Translation](https://huggingface.co/papers/2104.06678) by
Changhan Wang, Anne Wu, Juan Pino, Alexei Baevski, Michael Auli, Alexis Conneau.
Speech2Text2 is a *decoder-only* transformer model that can be used with any speech *encoder-only*, such as
[Wav2Vec2](wav2vec2) or [HuBERT](hubert) for Speech-to-Text tasks. Please refer to the
[SpeechEncoderDecoder](speech-encoder-decoder) class on how to combine Speech2Text2 with any speech *encoder-only*
model.
This model was contributed by [Patrick von Platen](https://huggingface.co/patrickvonplaten).
The original code can be found [here](https://github.com/pytorch/fairseq/blob/1f7ef9ed1e1061f8c7f88f8b94c7186834398690/fairseq/models/wav2vec/wav2vec2_asr.py#L266).
## Usage tips
- Speech2Text2 achieves state-of-the-art results on the CoVoST Speech Translation dataset. For more information, see
the [official models](https://huggingface.co/models?other=speech2text2) .
- Speech2Text2 is always used within the [SpeechEncoderDecoder](speech-encoder-decoder) framework.
- Speech2Text2's tokenizer is based on [fastBPE](https://github.com/glample/fastBPE).
## Inference
Speech2Text2's [`SpeechEncoderDecoderModel`] model accepts raw waveform input values from speech and
makes use of [`~generation.GenerationMixin.generate`] to translate the input speech
autoregressively to the target language.
The [`Wav2Vec2FeatureExtractor`] class is responsible for preprocessing the input speech and
[`Speech2Text2Tokenizer`] decodes the generated target tokens to the target string. The
[`Speech2Text2Processor`] wraps [`Wav2Vec2FeatureExtractor`] and
[`Speech2Text2Tokenizer`] into a single instance to both extract the input features and decode the
predicted token ids.
- Step-by-step Speech Translation
```python
>>> from transformers import Speech2Text2Processor, SpeechEncoderDecoderModel
>>> from datasets import load_dataset
>>> model = SpeechEncoderDecoderModel.from_pretrained("facebook/s2t-wav2vec2-large-en-de")
>>> processor = Speech2Text2Processor.from_pretrained("facebook/s2t-wav2vec2-large-en-de")
>>> def map_to_array(example):
... example["speech"] = example["audio"]["array"]
... return example
>>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
>>> ds = ds.map(map_to_array)
>>> inputs = processor(ds["speech"][0], sampling_rate=16_000, return_tensors="pt")
>>> generated_ids = model.generate(inputs=inputs["input_values"], attention_mask=inputs["attention_mask"])
>>> transcription = processor.batch_decode(generated_ids)
```
- Speech Translation via Pipelines
The automatic speech recognition pipeline can also be used to translate speech in just a couple lines of code
```python
>>> from datasets import load_dataset
>>> from transformers import pipeline
>>> librispeech_en = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
>>> asr = pipeline(
... "automatic-speech-recognition",
... model="facebook/s2t-wav2vec2-large-en-de",
... feature_extractor="facebook/s2t-wav2vec2-large-en-de",
... )
>>> translation_de = asr(librispeech_en[0]["file"])
```
See [model hub](https://huggingface.co/models?filter=speech2text2) to look for Speech2Text2 checkpoints.
## Resources
- [Causal language modeling task guide](../tasks/language_modeling)
## Speech2Text2Config
[[autodoc]] Speech2Text2Config
## Speech2TextTokenizer
[[autodoc]] Speech2Text2Tokenizer
- batch_decode
- decode
- save_vocabulary
## Speech2Text2Processor
[[autodoc]] Speech2Text2Processor
- __call__
- from_pretrained
- save_pretrained
- batch_decode
- decode
## Speech2Text2ForCausalLM
[[autodoc]] Speech2Text2ForCausalLM
- forward

View File

@ -0,0 +1,155 @@
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.
-->
*This model was released on 2021-07-16 and added to Hugging Face Transformers on 2023-06-20.*
# TAPEX
<div class="flex flex-wrap space-x-1">
<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
</div>
<Tip warning={true}>
This model is in maintenance mode only, we don't accept any new PRs changing its code.
If you run into any issues running this model, please reinstall the last version that supported this model: v4.30.0.
You can do so by running the following command: `pip install -U transformers==4.30.0`.
</Tip>
## Overview
The TAPEX model was proposed in [TAPEX: Table Pre-training via Learning a Neural SQL Executor](https://huggingface.co/papers/2107.07653) by Qian Liu,
Bei Chen, Jiaqi Guo, Morteza Ziyadi, Zeqi Lin, Weizhu Chen, Jian-Guang Lou. TAPEX pre-trains a BART model to solve synthetic SQL queries, after
which it can be fine-tuned to answer natural language questions related to tabular data, as well as performing table fact checking.
TAPEX has been fine-tuned on several datasets:
- [SQA](https://www.microsoft.com/en-us/download/details.aspx?id=54253) (Sequential Question Answering by Microsoft)
- [WTQ](https://github.com/ppasupat/WikiTableQuestions) (Wiki Table Questions by Stanford University)
- [WikiSQL](https://github.com/salesforce/WikiSQL) (by Salesforce)
- [TabFact](https://tabfact.github.io/) (by USCB NLP Lab).
The abstract from the paper is the following:
*Recent progress in language model pre-training has achieved a great success via leveraging large-scale unstructured textual data. However, it is
still a challenge to apply pre-training on structured tabular data due to the absence of large-scale high-quality tabular data. In this paper, we
propose TAPEX to show that table pre-training can be achieved by learning a neural SQL executor over a synthetic corpus, which is obtained by automatically
synthesizing executable SQL queries and their execution outputs. TAPEX addresses the data scarcity challenge via guiding the language model to mimic a SQL
executor on the diverse, large-scale and high-quality synthetic corpus. We evaluate TAPEX on four benchmark datasets. Experimental results demonstrate that
TAPEX outperforms previous table pre-training approaches by a large margin and achieves new state-of-the-art results on all of them. This includes improvements
on the weakly-supervised WikiSQL denotation accuracy to 89.5% (+2.3%), the WikiTableQuestions denotation accuracy to 57.5% (+4.8%), the SQA denotation accuracy
to 74.5% (+3.5%), and the TabFact accuracy to 84.2% (+3.2%). To our knowledge, this is the first work to exploit table pre-training via synthetic executable programs
and to achieve new state-of-the-art results on various downstream tasks.*
## Usage tips
- TAPEX is a generative (seq2seq) model. One can directly plug in the weights of TAPEX into a BART model.
- TAPEX has checkpoints on the hub that are either pre-trained only, or fine-tuned on WTQ, SQA, WikiSQL and TabFact.
- Sentences + tables are presented to the model as `sentence + " " + linearized table`. The linearized table has the following format:
`col: col1 | col2 | col 3 row 1 : val1 | val2 | val3 row 2 : ...`.
- TAPEX has its own tokenizer, that allows to prepare all data for the model easily. One can pass Pandas DataFrames and strings to the tokenizer,
and it will automatically create the `input_ids` and `attention_mask` (as shown in the usage examples below).
### Usage: inference
Below, we illustrate how to use TAPEX for table question answering. As one can see, one can directly plug in the weights of TAPEX into a BART model.
We use the [Auto API](auto), which will automatically instantiate the appropriate tokenizer ([`TapexTokenizer`]) and model ([`BartForConditionalGeneration`]) for us,
based on the configuration file of the checkpoint on the hub.
```python
>>> from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
>>> import pandas as pd
>>> tokenizer = AutoTokenizer.from_pretrained("microsoft/tapex-large-finetuned-wtq")
>>> model = AutoModelForSeq2SeqLM.from_pretrained("microsoft/tapex-large-finetuned-wtq")
>>> # prepare table + question
>>> data = {"Actors": ["Brad Pitt", "Leonardo Di Caprio", "George Clooney"], "Number of movies": ["87", "53", "69"]}
>>> table = pd.DataFrame.from_dict(data)
>>> question = "how many movies does Leonardo Di Caprio have?"
>>> encoding = tokenizer(table, question, return_tensors="pt")
>>> # let the model generate an answer autoregressively
>>> outputs = model.generate(**encoding)
>>> # decode back to text
>>> predicted_answer = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
>>> print(predicted_answer)
53
```
Note that [`TapexTokenizer`] also supports batched inference. Hence, one can provide a batch of different tables/questions, or a batch of a single table
and multiple questions, or a batch of a single query and multiple tables. Let's illustrate this:
```python
>>> # prepare table + question
>>> data = {"Actors": ["Brad Pitt", "Leonardo Di Caprio", "George Clooney"], "Number of movies": ["87", "53", "69"]}
>>> table = pd.DataFrame.from_dict(data)
>>> questions = [
... "how many movies does Leonardo Di Caprio have?",
... "which actor has 69 movies?",
... "what's the first name of the actor who has 87 movies?",
... ]
>>> encoding = tokenizer(table, questions, padding=True, return_tensors="pt")
>>> # let the model generate an answer autoregressively
>>> outputs = model.generate(**encoding)
>>> # decode back to text
>>> tokenizer.batch_decode(outputs, skip_special_tokens=True)
[' 53', ' george clooney', ' brad pitt']
```
In case one wants to do table verification (i.e. the task of determining whether a given sentence is supported or refuted by the contents
of a table), one can instantiate a [`BartForSequenceClassification`] model. TAPEX has checkpoints on the hub fine-tuned on TabFact, an important
benchmark for table fact checking (it achieves 84% accuracy). The code example below again leverages the [Auto API](auto).
```python
>>> from transformers import AutoTokenizer, AutoModelForSequenceClassification
>>> tokenizer = AutoTokenizer.from_pretrained("microsoft/tapex-large-finetuned-tabfact")
>>> model = AutoModelForSequenceClassification.from_pretrained("microsoft/tapex-large-finetuned-tabfact")
>>> # prepare table + sentence
>>> data = {"Actors": ["Brad Pitt", "Leonardo Di Caprio", "George Clooney"], "Number of movies": ["87", "53", "69"]}
>>> table = pd.DataFrame.from_dict(data)
>>> sentence = "George Clooney has 30 movies"
>>> encoding = tokenizer(table, sentence, return_tensors="pt")
>>> # forward pass
>>> outputs = model(**encoding)
>>> # print prediction
>>> predicted_class_idx = outputs.logits[0].argmax(dim=0).item()
>>> print(model.config.id2label[predicted_class_idx])
Refused
```
<Tip>
TAPEX architecture is the same as BART, except for tokenization. Refer to [BART documentation](bart) for information on
configuration classes and their parameters. TAPEX-specific tokenizer is documented below.
</Tip>
## TapexTokenizer
[[autodoc]] TapexTokenizer
- __call__
- save_vocabulary

View File

@ -0,0 +1,66 @@
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.
-->
*This model was released on 2021-06-03 and added to Hugging Face Transformers on 2023-06-20.*
# Trajectory Transformer
<div class="flex flex-wrap space-x-1">
<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
</div>
<Tip warning={true}>
This model is in maintenance mode only, so we won't accept any new PRs changing its code.
If you run into any issues running this model, please reinstall the last version that supported this model: v4.30.0.
You can do so by running the following command: `pip install -U transformers==4.30.0`.
</Tip>
## Overview
The Trajectory Transformer model was proposed in [Offline Reinforcement Learning as One Big Sequence Modeling Problem](https://huggingface.co/papers/2106.02039) by Michael Janner, Qiyang Li, Sergey Levine.
The abstract from the paper is the following:
*Reinforcement learning (RL) is typically concerned with estimating stationary policies or single-step models,
leveraging the Markov property to factorize problems in time. However, we can also view RL as a generic sequence
modeling problem, with the goal being to produce a sequence of actions that leads to a sequence of high rewards.
Viewed in this way, it is tempting to consider whether high-capacity sequence prediction models that work well
in other domains, such as natural-language processing, can also provide effective solutions to the RL problem.
To this end, we explore how RL can be tackled with the tools of sequence modeling, using a Transformer architecture
to model distributions over trajectories and repurposing beam search as a planning algorithm. Framing RL as sequence
modeling problem simplifies a range of design decisions, allowing us to dispense with many of the components common
in offline RL algorithms. We demonstrate the flexibility of this approach across long-horizon dynamics prediction,
imitation learning, goal-conditioned RL, and offline RL. Further, we show that this approach can be combined with
existing model-free algorithms to yield a state-of-the-art planner in sparse-reward, long-horizon tasks.*
This model was contributed by [CarlCochet](https://huggingface.co/CarlCochet). The original code can be found [here](https://github.com/jannerm/trajectory-transformer).
## Usage tips
This Transformer is used for deep reinforcement learning. To use it, you need to create sequences from
actions, states and rewards from all previous timesteps. This model will treat all these elements together
as one big sequence (a trajectory).
## TrajectoryTransformerConfig
[[autodoc]] TrajectoryTransformerConfig
## TrajectoryTransformerModel
[[autodoc]] TrajectoryTransformerModel
- forward

View File

@ -0,0 +1,136 @@
<!--Copyright 2020 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.
-->
*This model was released on 2019-01-09 and added to Hugging Face Transformers on 2023-06-20.*
# Transformer XL
<div class="flex flex-wrap space-x-1">
<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
</div>
<Tip warning={true}>
This model is in maintenance mode only, so we won't accept any new PRs changing its code. This model was deprecated due to security issues linked to `pickle.load`.
We recommend switching to more recent models for improved security.
In case you would still like to use `TransfoXL` in your experiments, we recommend using the [Hub checkpoint](https://huggingface.co/transfo-xl/transfo-xl-wt103) with a specific revision to ensure you are downloading safe files from the Hub.
You will need to set the environment variable `TRUST_REMOTE_CODE` to `True` in order to allow the
usage of `pickle.load()`:
```python
import os
from transformers import TransfoXLTokenizer, TransfoXLLMHeadModel
os.environ["TRUST_REMOTE_CODE"] = "True"
checkpoint = 'transfo-xl/transfo-xl-wt103'
revision = '40a186da79458c9f9de846edfaea79c412137f97'
tokenizer = TransfoXLTokenizer.from_pretrained(checkpoint, revision=revision)
model = TransfoXLLMHeadModel.from_pretrained(checkpoint, revision=revision)
```
If you run into any issues running this model, please reinstall the last version that supported this model: v4.35.0.
You can do so by running the following command: `pip install -U transformers==4.35.0`.
</Tip>
<div class="flex flex-wrap space-x-1">
<a href="https://huggingface.co/models?filter=transfo-xl">
<img alt="Models" src="https://img.shields.io/badge/All_model_pages-transfo--xl-blueviolet">
</a>
<a href="https://huggingface.co/spaces/docs-demos/transfo-xl-wt103">
<img alt="Spaces" src="https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Spaces-blue">
</a>
</div>
## Overview
The Transformer-XL model was proposed in [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://huggingface.co/papers/1901.02860) by Zihang Dai, Zhilin Yang, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan
Salakhutdinov. It's a causal (uni-directional) transformer with relative positioning (sinusoïdal) embeddings which can
reuse previously computed hidden-states to attend to longer context (memory). This model also uses adaptive softmax
inputs and outputs (tied).
The abstract from the paper is the following:
*Transformers have a potential of learning longer-term dependency, but are limited by a fixed-length context in the
setting of language modeling. We propose a novel neural architecture Transformer-XL that enables learning dependency
beyond a fixed length without disrupting temporal coherence. It consists of a segment-level recurrence mechanism and a
novel positional encoding scheme. Our method not only enables capturing longer-term dependency, but also resolves the
context fragmentation problem. As a result, Transformer-XL learns dependency that is 80% longer than RNNs and 450%
longer than vanilla Transformers, achieves better performance on both short and long sequences, and is up to 1,800+
times faster than vanilla Transformers during evaluation. Notably, we improve the state-of-the-art results of
bpc/perplexity to 0.99 on enwiki8, 1.08 on text8, 18.3 on WikiText-103, 21.8 on One Billion Word, and 54.5 on Penn
Treebank (without finetuning). When trained only on WikiText-103, Transformer-XL manages to generate reasonably
coherent, novel text articles with thousands of tokens.*
This model was contributed by [thomwolf](https://huggingface.co/thomwolf). The original code can be found [here](https://github.com/kimiyoung/transformer-xl).
## Usage tips
- Transformer-XL uses relative sinusoidal positional embeddings. Padding can be done on the left or on the right. The
original implementation trains on SQuAD with padding on the left, therefore the padding defaults are set to left.
- Transformer-XL is one of the few models that has no sequence length limit.
- Same as a regular GPT model, but introduces a recurrence mechanism for two consecutive segments (similar to a regular RNNs with two consecutive inputs). In this context, a segment is a number of consecutive tokens (for instance 512) that may span across multiple documents, and segments are fed in order to the model.
- Basically, the hidden states of the previous segment are concatenated to the current input to compute the attention scores. This allows the model to pay attention to information that was in the previous segment as well as the current one. By stacking multiple attention layers, the receptive field can be increased to multiple previous segments.
- This changes the positional embeddings to positional relative embeddings (as the regular positional embeddings would give the same results in the current input and the current hidden state at a given position) and needs to make some adjustments in the way attention scores are computed.
<Tip warning={true}>
TransformerXL does **not** work with *torch.nn.DataParallel* due to a bug in PyTorch, see [issue #36035](https://github.com/pytorch/pytorch/issues/36035)
</Tip>
## Resources
- [Text classification task guide](../tasks/sequence_classification)
- [Causal language modeling task guide](../tasks/language_modeling)
## TransfoXLConfig
[[autodoc]] TransfoXLConfig
## TransfoXLTokenizer
[[autodoc]] TransfoXLTokenizer
- save_vocabulary
## TransfoXL specific outputs
[[autodoc]] models.deprecated.transfo_xl.modeling_transfo_xl.TransfoXLModelOutput
[[autodoc]] models.deprecated.transfo_xl.modeling_transfo_xl.TransfoXLLMHeadModelOutput
## TransfoXLModel
[[autodoc]] TransfoXLModel
- forward
## TransfoXLLMHeadModel
[[autodoc]] TransfoXLLMHeadModel
- forward
## TransfoXLForSequenceClassification
[[autodoc]] TransfoXLForSequenceClassification
- forward
## Internal Layers
[[autodoc]] AdaptiveEmbedding

View File

@ -0,0 +1,90 @@
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.
-->
*This model was released on 2022-09-28 and added to Hugging Face Transformers on 2023-06-20.*
# TVLT
<div class="flex flex-wrap space-x-1">
<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
</div>
<Tip warning={true}>
This model is in maintenance mode only, we don't accept any new PRs changing its code.
If you run into any issues running this model, please reinstall the last version that supported this model: v4.40.2.
You can do so by running the following command: `pip install -U transformers==4.40.2`.
</Tip>
## Overview
The TVLT model was proposed in [TVLT: Textless Vision-Language Transformer](https://huggingface.co/papers/2209.14156)
by Zineng Tang, Jaemin Cho, Yixin Nie, Mohit Bansal (the first three authors contributed equally). The Textless Vision-Language Transformer (TVLT) is a model that uses raw visual and audio inputs for vision-and-language representation learning, without using text-specific modules such as tokenization or automatic speech recognition (ASR). It can perform various audiovisual and vision-language tasks like retrieval, question answering, etc.
The abstract from the paper is the following:
*In this work, we present the Textless Vision-Language Transformer (TVLT), where homogeneous transformer blocks take raw visual and audio inputs for vision-and-language representation learning with minimal modality-specific design, and do not use text-specific modules such as tokenization or automatic speech recognition (ASR). TVLT is trained by reconstructing masked patches of continuous video frames and audio spectrograms (masked autoencoding) and contrastive modeling to align video and audio. TVLT attains performance comparable to its text-based counterpart on various multimodal tasks, such as visual question answering, image retrieval, video retrieval, and multimodal sentiment analysis, with 28x faster inference speed and only 1/3 of the parameters. Our findings suggest the possibility of learning compact and efficient visual-linguistic representations from low-level visual and audio signals without assuming the prior existence of text.*
<p align="center">
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/model_doc/tvlt_architecture.png"
alt="drawing" width="600"/>
</p>
<small> TVLT architecture. Taken from the <a href="[https://huggingface.co/papers/2102.03334](https://huggingface.co/papers/2209.14156)">original paper</a>. </small>
The original code can be found [here](https://github.com/zinengtang/TVLT). This model was contributed by [Zineng Tang](https://huggingface.co/ZinengTang).
## Usage tips
- TVLT is a model that takes both `pixel_values` and `audio_values` as input. One can use [`TvltProcessor`] to prepare data for the model.
This processor wraps an image processor (for the image/video modality) and an audio feature extractor (for the audio modality) into one.
- TVLT is trained with images/videos and audios of various sizes: the authors resize and crop the input images/videos to 224 and limit the length of audio spectrogram to 2048. To make batching of videos and audios possible, the authors use a `pixel_mask` that indicates which pixels are real/padding and `audio_mask` that indicates which audio values are real/padding.
- The design of TVLT is very similar to that of a standard Vision Transformer (ViT) and masked autoencoder (MAE) as in [ViTMAE](vitmae). The difference is that the model includes embedding layers for the audio modality.
- The PyTorch version of this model is only available in torch 1.10 and higher.
## TvltConfig
[[autodoc]] TvltConfig
## TvltProcessor
[[autodoc]] TvltProcessor
- __call__
## TvltFeatureExtractor
[[autodoc]] TvltFeatureExtractor
- __call__
## TvltImageProcessor
[[autodoc]] TvltImageProcessor
- preprocess
## TvltModel
[[autodoc]] TvltModel
- forward
## TvltForPreTraining
[[autodoc]] TvltForPreTraining
- forward
## TvltForAudioVisualClassification
[[autodoc]] TvltForAudioVisualClassification
- forward

View File

@ -0,0 +1,76 @@
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.
-->
*This model was released on 2022-02-20 and added to Hugging Face Transformers on 2023-06-20.*
# VAN
<div class="flex flex-wrap space-x-1">
<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
</div>
<Tip warning={true}>
This model is in maintenance mode only, we don't accept any new PRs changing its code.
If you run into any issues running this model, please reinstall the last version that supported this model: v4.30.0.
You can do so by running the following command: `pip install -U transformers==4.30.0`.
</Tip>
## Overview
The VAN model was proposed in [Visual Attention Network](https://huggingface.co/papers/2202.09741) by Meng-Hao Guo, Cheng-Ze Lu, Zheng-Ning Liu, Ming-Ming Cheng, Shi-Min Hu.
This paper introduces a new attention layer based on convolution operations able to capture both local and distant relationships. This is done by combining normal and large kernel convolution layers. The latter uses a dilated convolution to capture distant correlations.
The abstract from the paper is the following:
*While originally designed for natural language processing tasks, the self-attention mechanism has recently taken various computer vision areas by storm. However, the 2D nature of images brings three challenges for applying self-attention in computer vision. (1) Treating images as 1D sequences neglects their 2D structures. (2) The quadratic complexity is too expensive for high-resolution images. (3) It only captures spatial adaptability but ignores channel adaptability. In this paper, we propose a novel large kernel attention (LKA) module to enable self-adaptive and long-range correlations in self-attention while avoiding the above issues. We further introduce a novel neural network based on LKA, namely Visual Attention Network (VAN). While extremely simple, VAN outperforms the state-of-the-art vision transformers and convolutional neural networks with a large margin in extensive experiments, including image classification, object detection, semantic segmentation, instance segmentation, etc. Code is available at [this https URL](https://github.com/Visual-Attention-Network/VAN-Classification).*
Tips:
- VAN does not have an embedding layer, thus the `hidden_states` will have a length equal to the number of stages.
The figure below illustrates the architecture of a Visual Attention Layer. Taken from the [original paper](https://huggingface.co/papers/2202.09741).
<img width="600" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/van_architecture.png"/>
This model was contributed by [Francesco](https://huggingface.co/Francesco). The original code can be found [here](https://github.com/Visual-Attention-Network/VAN-Classification).
## Resources
A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with VAN.
<PipelineTag pipeline="image-classification"/>
- [`VanForImageClassification`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/pytorch/image-classification) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/image_classification.ipynb).
- See also: [Image classification task guide](../tasks/image_classification)
If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! The resource should ideally demonstrate something new instead of duplicating an existing resource.
## VanConfig
[[autodoc]] VanConfig
## VanModel
[[autodoc]] VanModel
- forward
## VanForImageClassification
[[autodoc]] VanForImageClassification
- forward

View File

@ -0,0 +1,112 @@
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.
-->
*This model was released on 2020-10-22 and added to Hugging Face Transformers on 2023-06-20.*
# Hybrid Vision Transformer (ViT Hybrid)
<div class="flex flex-wrap space-x-1">
<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
<img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
</div>
<Tip warning={true}>
This model is in maintenance mode only, we don't accept any new PRs changing its code.
If you run into any issues running this model, please reinstall the last version that supported this model: v4.40.2.
You can do so by running the following command: `pip install -U transformers==4.40.2`.
</Tip>
## Overview
The hybrid Vision Transformer (ViT) model was proposed in [An Image is Worth 16x16 Words: Transformers for Image Recognition
at Scale](https://huggingface.co/papers/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk
Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob
Uszkoreit, Neil Houlsby. It's the first paper that successfully trains a Transformer encoder on ImageNet, attaining
very good results compared to familiar convolutional architectures. ViT hybrid is a slight variant of the [plain Vision Transformer](vit),
by leveraging a convolutional backbone (specifically, [BiT](bit)) whose features are used as initial "tokens" for the Transformer.
The abstract from the paper is the following:
*While the Transformer architecture has become the de-facto standard for natural language processing tasks, its
applications to computer vision remain limited. In vision, attention is either applied in conjunction with
convolutional networks, or used to replace certain components of convolutional networks while keeping their overall
structure in place. We show that this reliance on CNNs is not necessary and a pure transformer applied directly to
sequences of image patches can perform very well on image classification tasks. When pre-trained on large amounts of
data and transferred to multiple mid-sized or small image recognition benchmarks (ImageNet, CIFAR-100, VTAB, etc.),
Vision Transformer (ViT) attains excellent results compared to state-of-the-art convolutional networks while requiring
substantially fewer computational resources to train.*
This model was contributed by [nielsr](https://huggingface.co/nielsr). The original code (written in JAX) can be
found [here](https://github.com/google-research/vision_transformer).
## Using Scaled Dot Product Attention (SDPA)
PyTorch includes a native scaled dot-product attention (SDPA) operator as part of `torch.nn.functional`. This function
encompasses several implementations that can be applied depending on the inputs and the hardware in use. See the
[official documentation](https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html)
or the [GPU Inference](https://huggingface.co/docs/transformers/main/en/perf_infer_gpu_one#pytorch-scaled-dot-product-attention)
page for more information.
SDPA is used by default for `torch>=2.1.1` when an implementation is available, but you may also set
`attn_implementation="sdpa"` in `from_pretrained()` to explicitly request SDPA to be used.
```py
from transformers import ViTHybridForImageClassification
model = ViTHybridForImageClassification.from_pretrained("google/vit-hybrid-base-bit-384", attn_implementation="sdpa", dtype=torch.float16)
...
```
For the best speedups, we recommend loading the model in half-precision (e.g. `torch.float16` or `torch.bfloat16`).
On a local benchmark (A100-40GB, PyTorch 2.3.0, OS Ubuntu 22.04) with `float32` and `google/vit-hybrid-base-bit-384` model, we saw the following speedups during inference.
| Batch size | Average inference time (ms), eager mode | Average inference time (ms), sdpa model | Speed up, Sdpa / Eager (x) |
|--------------|-------------------------------------------|-------------------------------------------|------------------------------|
| 1 | 29 | 18 | 1.61 |
| 2 | 26 | 18 | 1.44 |
| 4 | 25 | 18 | 1.39 |
| 8 | 34 | 24 | 1.42 |
## Resources
A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with ViT Hybrid.
<PipelineTag pipeline="image-classification"/>
- [`ViTHybridForImageClassification`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/pytorch/image-classification) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/image_classification.ipynb).
- See also: [Image classification task guide](../tasks/image_classification)
If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! The resource should ideally demonstrate something new instead of duplicating an existing resource.
## ViTHybridConfig
[[autodoc]] ViTHybridConfig
## ViTHybridImageProcessor
[[autodoc]] ViTHybridImageProcessor
- preprocess
## ViTHybridModel
[[autodoc]] ViTHybridModel
- forward
## ViTHybridForImageClassification
[[autodoc]] ViTHybridForImageClassification
- forward

View File

@ -0,0 +1,99 @@
<!--Copyright 2020 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.
-->
*This model was released on 2020-01-13 and added to Hugging Face Transformers on 2023-06-20.*
# XLM-ProphetNet
<div class="flex flex-wrap space-x-1">
<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
</div>
<Tip warning={true}>
This model is in maintenance mode only, we don't accept any new PRs changing its code.
If you run into any issues running this model, please reinstall the last version that supported this model: v4.40.2.
You can do so by running the following command: `pip install -U transformers==4.40.2`.
</Tip>
<div class="flex flex-wrap space-x-1">
<a href="https://huggingface.co/models?filter=xprophetnet">
<img alt="Models" src="https://img.shields.io/badge/All_model_pages-xprophetnet-blueviolet">
</a>
<a href="https://huggingface.co/spaces/docs-demos/xprophetnet-large-wiki100-cased-xglue-ntg">
<img alt="Spaces" src="https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Spaces-blue">
</a>
</div>
**DISCLAIMER:** If you see something strange, file a [Github Issue](https://github.com/huggingface/transformers/issues/new?assignees=&labels=&template=bug-report.md&title) and assign
@patrickvonplaten
## Overview
The XLM-ProphetNet model was proposed in [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training,](https://huggingface.co/papers/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei
Zhang, Ming Zhou on 13 Jan, 2020.
XLM-ProphetNet is an encoder-decoder model and can predict n-future tokens for "ngram" language modeling instead of
just the next token. Its architecture is identical to ProhpetNet, but the model was trained on the multi-lingual
"wiki100" Wikipedia dump. XLM-ProphetNet's model architecture and pretraining objective is same as ProphetNet, but XLM-ProphetNet was pre-trained on the cross-lingual dataset XGLUE.
The abstract from the paper is the following:
*In this paper, we present a new sequence-to-sequence pretraining model called ProphetNet, which introduces a novel
self-supervised objective named future n-gram prediction and the proposed n-stream self-attention mechanism. Instead of
the optimization of one-step ahead prediction in traditional sequence-to-sequence model, the ProphetNet is optimized by
n-step ahead prediction which predicts the next n tokens simultaneously based on previous context tokens at each time
step. The future n-gram prediction explicitly encourages the model to plan for the future tokens and prevent
overfitting on strong local correlations. We pre-train ProphetNet using a base scale dataset (16GB) and a large scale
dataset (160GB) respectively. Then we conduct experiments on CNN/DailyMail, Gigaword, and SQuAD 1.1 benchmarks for
abstractive summarization and question generation tasks. Experimental results show that ProphetNet achieves new
state-of-the-art results on all these datasets compared to the models using the same scale pretraining corpus.*
The Authors' code can be found [here](https://github.com/microsoft/ProphetNet).
## Resources
- [Causal language modeling task guide](../tasks/language_modeling)
- [Translation task guide](../tasks/translation)
- [Summarization task guide](../tasks/summarization)
## XLMProphetNetConfig
[[autodoc]] XLMProphetNetConfig
## XLMProphetNetTokenizer
[[autodoc]] XLMProphetNetTokenizer
## XLMProphetNetModel
[[autodoc]] XLMProphetNetModel
## XLMProphetNetEncoder
[[autodoc]] XLMProphetNetEncoder
## XLMProphetNetDecoder
[[autodoc]] XLMProphetNetDecoder
## XLMProphetNetForConditionalGeneration
[[autodoc]] XLMProphetNetForConditionalGeneration
## XLMProphetNetForCausalLM
[[autodoc]] XLMProphetNetForCausalLM

View File

@ -302,9 +302,10 @@ class PreTrainedConfig(PushToHubMixin):
self.sep_token_id = sep_token_id
self.decoder_start_token_id = decoder_start_token_id
# Parameters for sequence generation saved in the config are popped instead of loading them.
for parameter_name in self._get_global_generation_defaults().keys():
kwargs.pop(parameter_name, None)
# Retrocompatibility: Parameters for sequence generation. While we will keep the ability to load these
# parameters, saving them will be deprecated. In a distant future, we won't need to load them.
for parameter_name, default_value in self._get_global_generation_defaults().items():
setattr(self, parameter_name, kwargs.pop(parameter_name, default_value))
# Name or path to the pretrained checkpoint
self._name_or_path = str(kwargs.pop("name_or_path", ""))
@ -444,11 +445,14 @@ class PreTrainedConfig(PushToHubMixin):
non_default_generation_parameters = self._get_non_default_generation_parameters()
if len(non_default_generation_parameters) > 0:
raise ValueError(
# TODO (joao): this should be an exception if the user has modified the loaded config. See #33886
warnings.warn(
"Some non-default generation parameters are set in the model config. These should go into either a) "
"`model.generation_config` (as opposed to `model.config`); OR b) a GenerationConfig file "
"(https://huggingface.co/docs/transformers/generation_strategies#save-a-custom-decoding-strategy-with-your-model)."
"This warning will become an exception in the future."
f"\nNon-default generation parameters: {str(non_default_generation_parameters)}",
UserWarning,
)
os.makedirs(save_directory, exist_ok=True)
@ -1097,18 +1101,40 @@ class PreTrainedConfig(PushToHubMixin):
non_default_generation_parameters = {}
decoder_attribute_name = None
# Some composite models don't have a default config, use their decoder config as a fallback for default values
# If no known pattern is matched, then `default_config = None` -> check against the global generation defaults
if not self.has_no_defaults_at_init:
default_config = self.__class__()
else:
decoder_config = self.get_text_config(decoder=True)
if decoder_config is not self:
default_config = decoder_config.__class__()
else:
default_config = None
# If it is a composite model, we want to check the subconfig that will be used for generation
self_decoder_config = self if decoder_attribute_name is None else getattr(self, decoder_attribute_name)
for parameter_name, default_global_value in self._get_global_generation_defaults().items():
if hasattr(self_decoder_config, parameter_name):
parameter_value = getattr(self_decoder_config, parameter_name, None)
# Two cases in which is okay for the model config to hold generation config parameters:
is_default_in_config = is_default_generation_value = None
parameter_value = getattr(self_decoder_config, parameter_name)
# Three cases in which is okay for the model config to hold generation config parameters:
# 1. The parameter is set to `None`, effectively delegating its value to the generation config
# 2. The parameter is set the global generation defaults
if parameter_value is None or parameter_value == default_global_value:
if parameter_value is None:
continue
non_default_generation_parameters[parameter_name] = getattr(self_decoder_config, parameter_name)
# 2. If we have a default config, then the instance should hold the same generation defaults
if default_config is not None:
is_default_in_config = parameter_value == getattr(default_config, parameter_name)
# 3. if we don't have a default config, then the instance should hold the global generation defaults
else:
is_default_generation_value = parameter_value == default_global_value
is_non_default = (is_default_in_config is False) or (
is_default_in_config is None and is_default_generation_value is False
)
if is_non_default:
non_default_generation_parameters[parameter_name] = getattr(self_decoder_config, parameter_name)
return non_default_generation_parameters

View File

@ -1731,8 +1731,10 @@ SLOW_TO_FAST_CONVERTERS = {
"OpenAIGPTTokenizer": OpenAIGPTConverter,
"PegasusTokenizer": PegasusConverter,
"Qwen2Tokenizer": Qwen2Converter,
"RealmTokenizer": BertConverter,
"ReformerTokenizer": ReformerConverter,
"RemBertTokenizer": RemBertConverter,
"RetriBertTokenizer": BertConverter,
"RobertaTokenizer": RobertaConverter,
"RoFormerTokenizer": RoFormerConverter,
"SeamlessM4TTokenizer": SeamlessM4TConverter,

View File

@ -46,6 +46,22 @@ if TYPE_CHECKING:
logger = logging.get_logger(__name__)
str_to_torch_dtype = {
"BOOL": torch.bool,
"U8": torch.uint8,
"I8": torch.int8,
"I16": torch.int16,
"F16": torch.float16,
"BF16": torch.bfloat16,
"I32": torch.int32,
"F32": torch.float32,
"F64": torch.float64,
"I64": torch.int64,
"F8_E4M3": torch.float8_e4m3fn,
"F8_E5M2": torch.float8_e5m2,
}
logger = logging.get_logger(__name__)
@ -132,33 +148,34 @@ class ConversionOps:
class Chunk(ConversionOps):
"""Split a tensor along ``dim`` into equally sized chunks or using explicit ``sizes``."""
reverse_op: type[ConversionOps]
def __init__(self, dim: int = 0, chunks: Optional[int] = None, sizes: Optional[Sequence[int]] = None):
if chunks is None and sizes is None:
raise ValueError("`chunks` or `sizes` must be provided for Chunk operations.")
if chunks is not None and chunks <= 0:
raise ValueError("`chunks` must be a strictly positive integer.")
self.dim = dim
self.chunks = chunks
self.sizes = list(sizes) if sizes is not None else None
self.reverse_op = Concatenate
def convert(self, value: torch.Tensor, *args, **kwargs) -> list[torch.Tensor]:
# chunk requires a single tensor input
if len(value) != 1 or len(value[0]) != 1:
raise ValueError("Chunk operation requires a single tensor input.")
@property
def reverse_op(self) -> ConversionOps:
return Concatenate(self.dim)
def convert(self, value: torch.Tensor, concrete_target_keys=None, *args, **kwargs) -> list[torch.Tensor]:
# chunk requires a single tensor input (maybe not when saving actually!)
udpate_ = []
if concrete_target_keys is not None: # when saving we have multiple tensors
for layer in value:
for tensors in layer:
chunk_size = len(concrete_target_keys)
udpate_+= [dict(zip(concrete_target_keys, torch.chunk(tensors, chunks=chunk_size, dim=self.dim)))]
return udpate_
return list(torch.chunk(value[0][0], self.chunks, dim=self.dim))
class Concatenate(ConversionOps):
"""Concatenate tensors along `dim` using a reusable buffer."""
reverse_op: type[ConversionOps]
def __init__(self, dim: int = 0):
self.dim = dim
self.reverse_op = Chunk
@property
def reverse_op(self) -> ConversionOps:
return Chunk(self.dim)
@torch.no_grad
def convert(self, value: Sequence[torch.Tensor], *args, **kwargs) -> torch.Tensor:
@ -171,7 +188,7 @@ class Concatenate(ConversionOps):
return torch.cat(tuple(tensors), dim=self.dim)
class MergeModulelist(Concatenate):
class MergeModulelist(ConversionOps):
"""
Merge a list of tensors into a single tensor along the first dimension.
We explicitly define this because for EP or TP you want to make sure you know what you are doing!
@ -179,8 +196,11 @@ class MergeModulelist(Concatenate):
"""
def __init__(self, dim: int = 0):
super().__init__(dim=dim)
self.reverse_op = SplitModulelist
self.dim = dim
@property
def reverse_op(self) -> ConversionOps:
return SplitModulelist(self.dim)
@torch.no_grad
def convert(self, value: Sequence[torch.Tensor], *args, **kwargs) -> list[torch.Tensor]:
@ -196,26 +216,24 @@ class MergeModulelist(Concatenate):
class SplitModulelist(ConversionOps):
"""Inverse of :class:`MergeModulelist` using explicit split sizes per group."""
def __init__(self, sizes: Sequence[Sequence[int]], dim: int = 0):
if not isinstance(sizes, Sequence) or not all(isinstance(sub, Sequence) and sub for sub in sizes):
raise ValueError("`sizes` must be a sequence of non-empty sequences of integers.")
self.sizes = [list(sub) for sub in sizes]
self.dim = dim
self.reverse_op = MergeModulelist
def __init__(self, dim: int = 0):
self.dim = dim
@property
def reverse_op(self) -> ConversionOps:
return MergeModulelist(self.dim)
@torch.no_grad
def convert(self, value: Sequence[torch.Tensor], *, context: dict[str, Any]) -> list[list[torch.Tensor]]:
if not isinstance(value, Sequence):
raise TypeError("SplitModulelist expects a sequence of tensors.")
if len(value) != len(self.sizes):
raise ValueError("Number of tensors does not match the provided split specifications.")
result: list[list[torch.Tensor]] = []
for tensor, split_sizes in zip(value, self.sizes):
if not isinstance(tensor, torch.Tensor):
raise TypeError("SplitModulelist can only split torch.Tensor instances.")
splits = torch.split(tensor, split_sizes, dim=self.dim)
result.append(list(splits))
def convert(self, value: Sequence[torch.Tensor], concrete_target_keys=None, config=None, *args, **kwargs) -> list[list[torch.Tensor]]:
result = []
for i, layers in enumerate(value):
tmp = {}
if not isinstance(layers, dict):
layers = {concrete_target_keys[i]: layers[i] for i in range(len(layers))}
for k, v in layers.items():
splits = torch.chunk(v, config.num_experts, dim=self.dim)
tmp.update({k.replace("*", str(i)): v for i, v in enumerate(splits)})
result.append(tmp)
return result
@ -296,16 +314,21 @@ class ConversionEntry:
GLOBAL_WORKERS = min(16, (os.cpu_count() or 8) * 2) # NVMe: 8-16; HDD/NFS: 2-4
def _materialize_copy(tensor, device=None, dtype=None):
def _materialize_copy(tensor, dtype=None):
tensor = tensor[...]
if dtype is not None or device is not None:
tensor = tensor.to(device=device, dtype=dtype)
if dtype is not None:
tensor = tensor.to(dtype)
return tensor
def spawn_materialize(thread_pool, tensor, device=None, dtype=None) -> Future:
def spawn_dematerialize(thread_pool, tensor, dtype=None) -> Future:
def _job():
return _materialize_copy(tensor, device, dtype)
return tensor.detach() #.cpu()
return thread_pool.submit(_job)
def spawn_materialize(thread_pool, tensor, dtype=None) -> Future:
def _job():
return _materialize_copy(tensor, dtype)
return thread_pool.submit(_job)
@ -353,7 +376,7 @@ def log_to_misc(
values, target_keys = extras
descriptor = f"{op_name} " if op_name else ""
misc[layer_name] = (
f"{e}\nError: {descriptor}on tensors destined for {target_keys}. Ckpt contains: {len(values[0])}"
f"{e}\nError: {descriptor}on tensors destined for {target_keys}. Ckpt contains: {values}"
)
elif isinstance(extras, str):
suffix = f" via {op_name}" if op_name else ""
@ -373,15 +396,11 @@ def set_param_for_module(
missing_keys: MutableSet[str],
misc: MutableMapping[str, Any],
distributed_operation: Optional[TensorParallelLayer],
hf_quantizer: HfQuantizer,
):
with log_to_misc(layer_name, misc, layer_name):
module_path, _, param_name = layer_name.rpartition(".")
module_obj = model.get_submodule(module_path) if module_path else model
if isinstance(param_value, list):
param_value = param_value[0]
elif not isinstance(param_value, torch.nn.Parameter):
param_value = param_value[...]
param_value = param_value[0] if isinstance(param_value, list) else param_value[...]
ref = getattr(module_obj, param_name)
use_dtensor = hasattr(distributed_operation, "use_dtensor") and distributed_operation.use_dtensor
@ -403,7 +422,7 @@ def set_param_for_module(
# Remove from missing keys (it's either mismatched, or all good)
missing_keys.discard(layer_name)
if ref is not None and ref.shape != param_value.shape and hf_quantizer is None:
if ref is not None and ref.shape != param_value.shape:
mismatch_keys.add((layer_name, param_value.shape, ref.shape))
module_obj.param_name._is_hf_initialized = False # Needs to be initialized
else:
@ -422,7 +441,7 @@ def convert_and_load_state_dict_in_model(
state_dict: dict[str, Any],
weight_mapping: dict[str, WeightConverter] | None,
tp_plan: dict[str, str] | None,
hf_quantizer: HfQuantizer | None,
quantizer: HfQuantizer | None,
dtype: torch.dtype | None = None,
device_map: dict | None = None,
dtype_plan: dict | None = None,
@ -435,10 +454,7 @@ def convert_and_load_state_dict_in_model(
prefix = model.base_model_prefix
tp_plan = tp_plan or {} # {glob_pattern: plan_obj_or_key}
device_map = device_map or {"": "cpu"} # {exact_target_key: device}
device_map_regex = re.compile(
"|".join(rf"({k})" for k in sorted(device_map.keys(), key=lambda x: x.count("."), reverse=True))
)
device_map = device_map or {} # {exact_target_key: device}
dtype_plan = dtype_plan or {} # {glob_pattern: dtype}
weight_mapping = weight_mapping or {} # {glob_pattern: WeightConverter}
meta_model_state_dict = model.state_dict()
@ -487,14 +503,20 @@ def convert_and_load_state_dict_in_model(
unexpected_keys.add(t)
continue
if hf_quantizer is not None and hf_quantizer.param_needs_quantization(model, t):
converter.quantization_operation = hf_quantizer.get_quantize_ops()
_dtype = dtype
matched_dtype_pattern = match_glob(t, dtype_policy_alt, dtype_policy_by_group_name)
if matched_dtype_pattern is not None:
_dtype = dtype_plan[matched_dtype_pattern]
elif empty_param.dtype != _dtype:
_dtype = empty_param.dtype
if quantizer is not None and quantizer.param_needs_quantization(model, t):
if quantizer.__class__.__name__ == "FineGrainedFP8HfQuantizer":
from .integrations.finegrained_fp8 import Fp8Quantize
converter.quantization_operation = Fp8Quantize() # TODO support other methods
else:
raise ValueError("This quantization method is gonna be supported SOOOON")
else:
_dtype = dtype
matched_dtype_pattern = match_glob(t, dtype_policy_alt, dtype_policy_by_group_name)
if matched_dtype_pattern is not None:
_dtype = dtype_plan[matched_dtype_pattern]
elif empty_param.dtype != _dtype:
_dtype = empty_param.dtype
first_target_key = new_target_key[0]
target_key = "|".join(new_target_key)
@ -519,9 +541,7 @@ def convert_and_load_state_dict_in_model(
)
if future is None: # If not TP, async materialize the tensors. TODO handle disk offload?
device_match = device_map_regex.match(first_target_key)
param_device = device_map[device_match.group()] if device_match else device_map.get("", "cpu")
future = spawn_materialize(thread_pool, tensor, param_device, _dtype)
future = spawn_materialize(thread_pool, tensor, _dtype)
entry.collected_tensors[target_key].setdefault(converter_key, []).append(future)
# 2. Actually convert the ckpt
@ -557,7 +577,9 @@ def convert_and_load_state_dict_in_model(
if op := converter.quantization_operation:
with log_to_misc(layer_name, misc, op=op):
realized_value.update(
op.convert({k: realized_value.pop(k)}, model=model, missing_keys=missing_keys)
op.convert(
{k: realized_value.pop(k)}, quant_config=quantizer.quantization_config
)
)
for k, output_value in realized_value.items():
@ -571,7 +593,6 @@ def convert_and_load_state_dict_in_model(
missing_keys,
misc,
converter.distributed_operation,
hf_quantizer,
)
except SkipLayer:
@ -584,19 +605,116 @@ def convert_and_load_state_dict_in_model(
# TODO this is not done yet!
def revert_weight_conversion(model, state_dict):
def revert_weight_conversion(model, state_dict, weight_mapping):
mapping = getattr(model, "_checkpoint_conversion_mapping", {}) # IDK why but setting this will fail all llava.
reverse_key_mapping = [(v, k) for k, v in mapping.items()]
original_state_dict = {}
for key, value in state_dict.items():
for pattern, inverse_converter in reverse_key_mapping:
# TODO FIXME you name it
replacement = inverse_converter.lstrip("^") # strip off un-needed chars and patterns
replacement = re.sub(r"\(.*\)", "", replacement)
key, n_replace = re.subn(pattern, replacement, key)
# Early exit of the loop
if n_replace > 0:
break
original_state_dict[key] = value
state_dict = original_state_dict
return state_dict
reverse_key_mapping = [(v, k) for k, v in mapping.items()] # todo also take it into account
tp_plan = model.tp_plan or {} # {glob_pattern: plan_obj_or_key}
weight_mapping = weight_mapping or {} # {glob_pattern: WeightConverter}
misc = {}
final_state_dict = {}
# Global thread_pool
thread_pool = ThreadPoolExecutor(max_workers=GLOBAL_WORKERS)
_patterns = list(itertools.chain.from_iterable([k.target_keys for k in weight_mapping]))
target_to_source = {sk: k for k in weight_mapping for sk in k.target_keys}
weight_pattern_alt, weight_pattern_by_group_name = build_glob_alt(_patterns)
tp_plan_alt, tp_plan_by_group_name = build_glob_alt(list(tp_plan.keys()))
state_dict = sorted(state_dict.items(), key=lambda kv: dot_natural_key(kv[0]))
# 1. Create the conversion entries
by_conversion_pattern: dict[str, ConversionEntry] = {}
for original_key, tensor in state_dict:
matched_pattern = match_glob(original_key, weight_pattern_alt, weight_pattern_by_group_name)
if matched_pattern is not None:
converter = target_to_source[matched_pattern] # TODO make sure its the ref
sub_with_extractor = partial(re.sub, matched_pattern.replace("*", r"(\d+)"), string=original_key)
entry_key = "|".join(map(sub_with_extractor, converter.source_keys))
target_key = entry_key # at this point we don't know how many we'll collect :)
entry: ConversionEntry = by_conversion_pattern.setdefault(entry_key, ConversionEntry(converter))
converter_key = sub_with_extractor(matched_pattern)
else:
converter = WeightConverter(original_key)
converter_key = entry_key = target_key = original_key
entry = by_conversion_pattern.setdefault(converter_key, ConversionEntry(converter))
if False and quantizer is not None and quantizer.param_needs_quantization(model, t):
if quantizer.__class__.__name__ == "FineGrainedFP8HfQuantizer":
from .integrations.finegrained_fp8 import Fp8Quantize
converter.quantization_operation = Fp8Quantize() # TODO support other methods
else:
raise ValueError("This quantization method is gonna be supported SOOOON")
future = None
# if device_mesh:
# if matched_tp_pattern := match_glob(first_target_key, tp_plan_alt, tp_plan_by_group_name):
# if getattr(converter, "distributed_operation", {}) is None:
# tp_layer = ALL_PARALLEL_STYLES[model.tp_plan[matched_tp_pattern]].__class__
# converter.distributed_operation = tp_layer(
# device_mesh=device_mesh, rank=device_map[""].index, empty_param=empty_param.clone()
# )
# # VERY IMPORTANT: this tells us wether we collected stuffs or not.
# shard_index = len(entry.collected_tensors[target_key].get(converter_key, []))
# future = spawn_tp_dematerialize(
# thread_pool,
# tensor,
# converter.distributed_operation,
# shard_index,
# )
if future is None: # If not TP, async materialize the tensors. TODO handle disk offload?
future = spawn_dematerialize(thread_pool, tensor) # -> should we put it to CPU always?
entry.collected_tensors[target_key].setdefault(converter_key, []).append(future)
# 2. Actually convert the ckpt
keys = list(by_conversion_pattern.keys())
with logging.tqdm(total=len(keys), desc="saving weights") as pbar:
for key in keys[::-1]: # revert to process simple keys first
group = by_conversion_pattern.pop(key)
converter = group.weight_converter
operations = converter.operations if isinstance(converter.operations, list) else [converter.operations]
for layer_name, tensors_for_this_layer in group.collected_tensors.items():
pbar.update(1)
pbar.set_postfix({"Materializing param": layer_name})
pbar.refresh()
concrete_target_keys = layer_name.split("|")
try:
with log_to_misc(layer_name, misc):
values = [[k.result() for k in inner] for inner in tensors_for_this_layer.values()]
for op in operations[::-1]:
with log_to_misc(layer_name, misc, (values, concrete_target_keys), operations):
reverse_op = op.reverse_op
values = reverse_op.convert(values, concrete_target_keys, config=model.config)
values = [values] if not isinstance(values, list) else values
with log_to_misc(layer_name, misc, (values, concrete_target_keys), operations):
if len(values) == 1 and isinstance(values[0], dict):
realized_value = values[0]
else:
realized_value = dict(zip(concrete_target_keys, values))
for k in list(realized_value.keys()).copy():
if op := converter.quantization_operation: # dequantize
with log_to_misc(layer_name, misc, op=op):
realized_value.update(
op.convert(
{k: realized_value.pop(k)}, # quant_config=quantizer.quantization_config
)
)
for k, output_value in realized_value.items():
final_state_dict[k] = output_value[0] if isinstance(output_value, list) else output_value
# TODO @Cyrilvallez handle scheduled saving, gather and etc
# schedule the saving of the weights using the threadpool. `save_file`
except SkipLayer:
continue
del group
print(misc)
return final_state_dict

View File

@ -918,9 +918,7 @@ class GenerationConfig(PushToHubMixin):
else:
logger.info(f"loading configuration file {configuration_file} from cache at {resolved_config_file}")
if kwargs.get("_from_model_config", False):
return cls.from_model_config(config_dict)
elif kwargs.get("return_unused_kwargs") is True:
if kwargs.get("return_unused_kwargs") is True:
config, unused_kwargs = cls.from_dict(config_dict, **kwargs)
config._original_object_hash = hash(config) # Hash to detect whether the instance was modified
return config, unused_kwargs
@ -1086,19 +1084,19 @@ class GenerationConfig(PushToHubMixin):
writer.write(self.to_json_string(use_diff=use_diff))
@classmethod
def from_model_config(cls, model_config: PreTrainedConfig | dict) -> "GenerationConfig":
def from_model_config(cls, model_config: PreTrainedConfig) -> "GenerationConfig":
"""
Instantiates a [`GenerationConfig`] from a [`PreTrainedConfig`]. This function is useful to convert legacy
[`PreTrainedConfig`] objects, which may contain generation parameters, into a stand-alone [`GenerationConfig`].
Args:
model_config (`PreTrainedConfig | dict`):
model_config (`PreTrainedConfig`):
The model config that will be used to instantiate the generation config.
Returns:
[`GenerationConfig`]: The configuration object instantiated from those parameters.
"""
config_dict = model_config.to_dict() if not isinstance(model_config, dict) else model_config
config_dict = model_config.to_dict()
config_dict.pop("_from_model_config", None)
# Removes all `None` from the model config dict -- this lets the generation config defaults to take hold
@ -1108,15 +1106,14 @@ class GenerationConfig(PushToHubMixin):
# Special case: some models have generation attributes set in the decoder. Use them if still unset in the
# generation config (which in turn is defined from the outer attributes of model config).
if not isinstance(model_config, dict):
decoder_config = model_config.get_text_config(decoder=True)
if decoder_config is not model_config:
default_generation_config = GenerationConfig()
decoder_config_dict = decoder_config.to_dict()
for attr in generation_config.to_dict():
is_unset = getattr(generation_config, attr) == getattr(default_generation_config, attr)
if attr in decoder_config_dict and is_unset:
setattr(generation_config, attr, decoder_config_dict[attr])
decoder_config = model_config.get_text_config(decoder=True)
if decoder_config is not model_config:
default_generation_config = GenerationConfig()
decoder_config_dict = decoder_config.to_dict()
for attr in generation_config.to_dict():
is_unset = getattr(generation_config, attr) == getattr(default_generation_config, attr)
if attr in decoder_config_dict and is_unset:
setattr(generation_config, attr, decoder_config_dict[attr])
# If any `output_...` flag is set to `True`, we ensure `return_dict_in_generate` is set to `True`.
if generation_config.return_dict_in_generate is False:

View File

@ -410,14 +410,6 @@ class GenerationMixin(ContinuousMixin):
logger.info(
"Generation config file not found, using a generation config created from the model config."
)
self.generation_config = GenerationConfig.from_pretrained(
pretrained_model_name_or_path,
config_file_name="config.json",
_from_auto=from_auto_class,
_from_pipeline=from_pipeline,
_from_model_config=True,
**repo_loading_kwargs,
)
# Load custom generate function if `pretrained_model_name_or_path` defines it (and override `generate`)
if hasattr(self, "load_custom_generate") and trust_remote_code:
try:
@ -1786,12 +1778,14 @@ class GenerationMixin(ContinuousMixin):
):
new_generation_config = GenerationConfig.from_model_config(self.config)
if new_generation_config != self.generation_config: # 4)
raise ValueError(
"You have modified the pretrained model configuration to control generation."
" This strategy to control generation is not supported anymore. "
warnings.warn(
"You have modified the pretrained model configuration to control generation. This is a"
" deprecated strategy to control generation and will be removed in v5."
" Please use and modify the model generation configuration (see"
" https://huggingface.co/docs/transformers/generation_strategies#default-text-generation-configuration )",
UserWarning,
)
self.generation_config = new_generation_config
generation_config = self.generation_config
using_model_generation_config = True

View File

@ -162,25 +162,6 @@ def copy_(tensor: torch.Tensor, other: torch.Tensor) -> torch.Tensor:
return tensor
# Here, we need to check several modules imported, and hot patch all of them, as sometimes torch does
# something like `from torch.nn.init import xavier_uniform_` in their internals (e.g in torch.nn.modules.activations,
# where MultiHeadAttention lives), so the function name is binded at import time and just doing
# `setattr(torch.nn.init, name, globals()[name])` is thus not enough
# The following list should be enough for all torch versions we work with
TORCH_MODULES_TO_PATCH = (
"torch.nn.init",
"torch.nn.modules.activation",
"torch.nn.modules.transformer",
"torch.nn.modules.linear",
"torch.nn.modules.loss",
"torch.nn.modules.batchnorm",
"torch.nn.modules.conv",
"torch.nn.modules.normalization",
"torch.nn.modules.rnn",
"torch.nn.modules.sparse",
)
@contextmanager
def guard_torch_init_functions():
"""
@ -193,16 +174,18 @@ def guard_torch_init_functions():
originals = defaultdict(dict)
try:
# Replace all torch funcs by the ones in this file
for module_name in TORCH_MODULES_TO_PATCH:
if module_name in sys.modules:
module = sys.modules[module_name]
for func_name in TORCH_INIT_FUNCTIONS.keys():
if hasattr(module, func_name):
originals[module][func_name] = getattr(module, func_name)
setattr(module, func_name, globals()[func_name])
for name in TORCH_INIT_FUNCTIONS.keys():
# Here, we need to check all modules imported, and hot patch all of them, as usually torch does
# something like `from torch.nn.init import xavier_uniform_` in their internals (e.g in torch.nn.modules,
# where MultiHeadAttention lives), so the function name is binded at import time and just doing
# `setattr(torch.nn.init, name, gloabls()[name])` is thus not enough
for module in sys.modules.copy().values():
if module and hasattr(module, name):
originals[module][name] = getattr(module, name)
setattr(module, name, globals()[name])
yield
finally:
# Set back the original functions on all modules
for module, functions in originals.items():
for func_name, func in functions.items():
setattr(module, func_name, func)
for name, func in functions.items():
setattr(module, name, func)

View File

@ -32,8 +32,8 @@ _import_structure = {
"unpack_weights",
],
"bitsandbytes": [
"Bnb4bitQuantize",
"dequantize_and_replace",
"get_keys_to_not_convert",
"replace_with_bnb_linear",
"validate_bnb_backend_availability",
],
@ -177,8 +177,8 @@ if TYPE_CHECKING:
unpack_weights,
)
from .bitsandbytes import (
Bnb4bitQuantize,
dequantize_and_replace,
get_keys_to_not_convert,
replace_with_bnb_linear,
validate_bnb_backend_availability,
)

View File

@ -19,9 +19,9 @@ and simplicity/ease of use.
import copy
import inspect
import os
from collections import OrderedDict, defaultdict
from collections import defaultdict
from contextlib import contextmanager
from typing import TYPE_CHECKING, Optional, Union
from typing import TYPE_CHECKING
from ..utils import (
is_accelerate_available,
@ -39,9 +39,8 @@ if is_torch_available():
import torch.nn as nn
if is_accelerate_available():
from accelerate import dispatch_model
from accelerate.utils import get_max_memory
from accelerate.utils.modeling import clean_device_map, get_max_layer_size, get_module_size_with_ties
from accelerate import dispatch_model, infer_auto_device_map
from accelerate.utils import check_tied_parameters_on_same_device, get_max_memory
if TYPE_CHECKING:
from ..modeling_utils import PreTrainedModel
@ -259,7 +258,7 @@ def check_and_set_device_map(device_map: "torch.device | int | str | dict | None
def compute_module_sizes(
model: "PreTrainedModel", hf_quantizer: "HfQuantizer | None" = None, buffers_only: bool = False
model: "PreTrainedModel", hf_quantizer: "HfQuantizer | None"
) -> tuple[dict[str, int], dict[str, int]]:
"""
Compute the size of each submodule of a given model (in bytes).
@ -269,13 +268,7 @@ def compute_module_sizes(
"""
all_module_sizes = defaultdict(int)
leaves_module_sizes = defaultdict(int)
if buffers_only:
named_tensors = model.named_buffers(recurse=True)
else:
named_tensors = model.state_dict().items()
for name, param in named_tensors:
for name, param in model.state_dict().items():
if hf_quantizer is not None:
dtype_size = hf_quantizer.param_element_size(model, name)
else:
@ -290,14 +283,6 @@ def compute_module_sizes(
return all_module_sizes, leaves_module_sizes
def compute_module_total_buffer_size(model: nn.Module, hf_quantizer: "HfQuantizer | None" = None):
"""
Compute the total size of buffers in each submodule of a given model.
"""
module_sizes, _ = compute_module_sizes(model, hf_quantizer, buffers_only=True)
return module_sizes.get("", 0)
def get_balanced_memory(
model: "PreTrainedModel",
max_memory: dict[int | str, int | str] | None = None,
@ -408,11 +393,20 @@ def _get_device_map(
device_map: dict | str | None,
max_memory: dict | None,
hf_quantizer: "HfQuantizer | None",
dtype: torch.dtype | None,
) -> dict:
"""Compute the final `device_map` to use if we passed a value in ['auto', 'balanced', 'balanced_low_0', 'sequential'].
Otherwise, we check for any device inconsistencies in the device_map.
"""
if isinstance(device_map, str):
special_dtypes = {}
if hf_quantizer is not None:
special_dtypes = hf_quantizer.get_special_dtypes_update(model, dtype)
target_dtype = dtype
if hf_quantizer is not None:
target_dtype = hf_quantizer.adjust_target_dtype(target_dtype)
no_split_modules = model._get_no_split_modules(device_map)
if device_map != "sequential":
@ -444,13 +438,19 @@ def _get_device_map(
device_map = infer_auto_device_map(
model,
max_memory=inferred_max_memory,
dtype=target_dtype,
no_split_module_classes=no_split_modules,
hf_quantizer=hf_quantizer,
special_dtypes=special_dtypes,
)
if hf_quantizer is not None:
hf_quantizer.validate_environment(device_map=device_map)
elif device_map is not None:
tied_params = find_tied_parameters(model)
# check if we don't have tied param in different devices
check_tied_parameters_on_same_device(tied_params, device_map)
return device_map
@ -547,383 +547,3 @@ def accelerate_disk_offload(
else:
disk_offload_index = {}
return disk_offload_index, disk_only_shard_files, is_offloaded_safetensors
def _init_infer_auto_device_map(
model: nn.Module,
max_memory: Optional[dict[Union[int, str], Union[int, str]]] = None,
no_split_module_classes: Optional[list[str]] = None,
tied_parameters: Optional[list[list[str]]] = None,
hf_quantizer: "HfQuantizer | None" = None,
) -> tuple[
list[Union[int, str]],
dict[Union[int, str], Union[int, str]],
list[Union[int, str]],
list[int],
dict[str, int],
list[list[str]],
list[str],
list[tuple[str, nn.Module]],
]:
"""
Initialize variables required for computing the device map for model allocation.
"""
max_memory = get_max_memory(max_memory)
if no_split_module_classes is None:
no_split_module_classes = []
elif not isinstance(no_split_module_classes, (list, tuple)):
no_split_module_classes = [no_split_module_classes]
devices = list(max_memory.keys())
if "disk" not in devices:
devices.append("disk")
gpus = [device for device in devices if device not in ["cpu", "disk"]]
# Devices that need to keep space for a potential offloaded layer.
if "mps" in gpus:
main_devices = ["mps"]
elif len(gpus) > 0:
main_devices = [gpus[0], "cpu"]
else:
main_devices = ["cpu"]
module_sizes, _ = compute_module_sizes(model, hf_quantizer)
if tied_parameters is None:
if len(model.all_tied_weights_keys) > 0:
# create a list of list of tied params
tied_parameters = [list(t) for t in model.all_tied_weights_keys.items()]
else:
tied_parameters = [[]]
# Direct submodules and parameters
modules_to_treat = (
list(model.named_parameters(recurse=False))
+ list(model.named_children())
+ list(model.named_buffers(recurse=False))
)
return (
devices,
max_memory,
main_devices,
gpus,
module_sizes,
tied_parameters,
no_split_module_classes,
modules_to_treat,
)
def infer_auto_device_map(
model: nn.Module,
max_memory: Optional[dict[Union[int, str], Union[int, str]]] = None,
no_split_module_classes: Optional[list[str]] = None,
verbose: bool = False,
clean_result: bool = True,
offload_buffers: bool = False,
tied_parameters: Optional[list[list[str]]] = None,
hf_quantizer: "HfQuantizer | None" = None,
):
"""
Compute a device map for a given model giving priority to GPUs, then offload on CPU and finally offload to disk,
such that:
- we don't exceed the memory available of any of the GPU.
- if offload to the CPU is needed, there is always room left on GPU 0 to put back the layer offloaded on CPU that
has the largest size.
- if offload to the CPU is needed,we don't exceed the RAM available on the CPU.
- if offload to the disk is needed, there is always room left on the CPU to put back the layer offloaded on disk
that has the largest size.
<Tip>
All computation is done analyzing sizes and dtypes of the model parameters. As a result, the model can be on the
meta device (as it would if initialized within the `init_empty_weights` context manager).
</Tip>
Args:
model (`torch.nn.Module`):
The model to analyze.
max_memory (`Dict`, *optional*):
A dictionary device identifier to maximum memory. Will default to the maximum memory available if unset.
Example: `max_memory={0: "1GB"}`.
no_split_module_classes (`List[str]`, *optional*):
A list of layer class names that should never be split across device (for instance any layer that has a
residual connection).
verbose (`bool`, *optional*, defaults to `False`):
Whether or not to provide debugging statements as the function builds the device_map.
clean_result (`bool`, *optional*, defaults to `True`):
Clean the resulting device_map by grouping all submodules that go on the same device together.
offload_buffers (`bool`, *optional*, defaults to `False`):
In the layers that are offloaded on the CPU or the hard drive, whether or not to offload the buffers as
well as the parameters.
"""
# Initialize the variables
(
devices,
max_memory,
main_devices,
gpus,
module_sizes,
tied_parameters,
no_split_module_classes,
modules_to_treat,
) = _init_infer_auto_device_map(model, max_memory, no_split_module_classes, tied_parameters, hf_quantizer)
device_map = OrderedDict()
current_device = 0
device_memory_used = dict.fromkeys(devices, 0)
device_buffer_sizes = {}
device_minimum_assignment_memory = {}
# Initialize maximum largest layer, to know which space to keep in memory
max_layer_size, max_layer_names = get_max_layer_size(modules_to_treat, module_sizes, no_split_module_classes)
# Ready ? This is going to be a bit messy.
while len(modules_to_treat) > 0:
name, module = modules_to_treat.pop(0)
if verbose:
print(f"\nTreating module {name}.")
# Max size in the remaining layers may have changed since we took one, so we maybe update it.
max_layer_names = [n for n in max_layer_names if n != name and not n.startswith(name + ".")]
if len(max_layer_names) == 0:
max_layer_size, max_layer_names = get_max_layer_size(
[(n, m) for n, m in modules_to_treat if isinstance(m, torch.nn.Module)],
module_sizes,
no_split_module_classes,
)
# Assess size needed
module_size = module_sizes[name]
# We keep relevant tied parameters only: one of the tied parameters in the group is inside the current module
# and the other is not.
# Note: If we are currently processing the name `compute.weight`, an other parameter named
# e.g. `compute.weight_submodule.parameter`
# needs to be considered outside the current module, hence the check with additional dots.
tied_param_groups = [
tied_group
for tied_group in tied_parameters
if any(name + "." in k + "." for k in tied_group) and not all(name + "." in k + "." for k in tied_group)
]
if verbose and len(tied_param_groups) > 0:
print(f" Found the relevant tied param groups {tied_param_groups}")
# Then we keep track of all the parameters that are tied to the current module, but not in the current module
tied_params = sum(
[[p for p in tied_group if name + "." not in p + "."] for tied_group in tied_param_groups], []
)
if verbose and len(tied_params) > 0:
print(f" So those parameters need to be taken into account {tied_params}")
device = devices[current_device]
current_max_size = max_memory[device] if device != "disk" else None
current_memory_reserved = 0
# Reduce max size available by the largest layer.
if devices[current_device] in main_devices:
current_max_size = current_max_size - max_layer_size
current_memory_reserved = max_layer_size
module_size_with_ties, tied_module_names, tied_modules = get_module_size_with_ties(
tied_params, module_size, module_sizes, modules_to_treat
)
# The module and its tied modules fit on the current device.
if current_max_size is None or device_memory_used[device] + module_size_with_ties <= current_max_size:
if verbose:
output = f"Putting {name}"
if tied_module_names:
output += f" and {tied_module_names}"
else:
output += f" (size={module_size})"
if current_max_size is not None:
output += f" (available={current_max_size - device_memory_used[device]})"
output += f" on {device}."
print(output)
device_memory_used[device] += module_size_with_ties
# Assign the primary module to the device.
device_map[name] = device
# Assign tied modules if any.
for tied_module_name in tied_module_names:
if tied_module_name in [m[0] for m in modules_to_treat]:
# Find the index of the tied module in the list
tied_module_index = next(i for i, (n, _) in enumerate(modules_to_treat) if n == tied_module_name)
# Remove the tied module from the list to prevent reprocessing
modules_to_treat.pop(tied_module_index)
# Assign the tied module to the device
device_map[tied_module_name] = device
# Buffer Handling
if not offload_buffers and isinstance(module, nn.Module):
# Compute the total buffer size for the module
current_buffer_size = compute_module_total_buffer_size(module, hf_quantizer)
# Update the buffer size on the device
device_buffer_sizes[device] = device_buffer_sizes.get(device, 0) + current_buffer_size
continue
# The current module itself fits, so we try to split the tied modules.
if len(tied_params) > 0 and device_memory_used[device] + module_size <= current_max_size:
# can we split one of the tied modules to make it smaller or do we need to go on the next device?
if verbose:
print(
f"Not enough space on {devices[current_device]} to put {name} and {tied_module_names} (space "
f"available {current_max_size - device_memory_used[device]}, needed size {module_size_with_ties})."
)
split_happened = False
for tied_module_name, tied_module in zip(tied_module_names, tied_modules):
tied_module_children = list(tied_module.named_children())
if len(tied_module_children) == 0 or tied_module.__class__.__name__ in no_split_module_classes:
# can't break this one.
continue
if verbose:
print(f"Splitting {tied_module_name}.")
tied_module_children = list(tied_module.named_parameters(recurse=False)) + tied_module_children
tied_module_children = [(f"{tied_module_name}.{n}", v) for n, v in tied_module_children]
tied_module_index = [i for i, (n, _) in enumerate(modules_to_treat) if n == tied_module_name][0]
modules_to_treat = (
[(name, module)]
+ modules_to_treat[:tied_module_index]
+ tied_module_children
+ modules_to_treat[tied_module_index + 1 :]
)
# Update the max layer size.
max_layer_size, max_layer_names = get_max_layer_size(
[(n, m) for n, m in modules_to_treat if isinstance(m, torch.nn.Module)],
module_sizes,
no_split_module_classes,
)
split_happened = True
break
if split_happened:
continue
# If the tied module is not split, we go to the next device
if verbose:
print("None of the tied module can be split, going to the next device.")
# The current module itself doesn't fit, so we have to split it or go to the next device.
if device_memory_used[device] + module_size >= current_max_size:
# Split or not split?
modules_children = (
[]
if isinstance(module, nn.Parameter) or isinstance(module, torch.Tensor)
else list(module.named_children())
)
if verbose:
print(
f"Not enough space on {devices[current_device]} to put {name} (space available "
f"{current_max_size - device_memory_used[device]}, module size {module_size})."
)
if len(modules_children) == 0 or module.__class__.__name__ in no_split_module_classes:
# -> no split, we go to the next device
if verbose:
print("This module cannot be split, going to the next device.")
else:
# -> split, we replace the module studied by its children + parameters
if verbose:
print(f"Splitting {name}.")
modules_children = list(module.named_parameters(recurse=False)) + modules_children
modules_to_treat = [(f"{name}.{n}", v) for n, v in modules_children] + modules_to_treat
# Update the max layer size.
max_layer_size, max_layer_names = get_max_layer_size(
[(n, m) for n, m in modules_to_treat if isinstance(m, torch.nn.Module)],
module_sizes,
no_split_module_classes,
)
continue
if device_memory_used[device] == 0:
device_minimum_assignment_memory[device] = module_size_with_ties + current_memory_reserved
# Neither the current module nor any tied modules can be split, so we move to the next device.
device_memory_used[device] = device_memory_used[device] + current_memory_reserved
current_device += 1
modules_to_treat = [(name, module)] + modules_to_treat
device_memory_used = {device: mem for device, mem in device_memory_used.items() if mem > 0}
if clean_result:
device_map = clean_device_map(device_map)
non_gpu_buffer_size = device_buffer_sizes.get("cpu", 0) + device_buffer_sizes.get("disk", 0)
if non_gpu_buffer_size > 0 and not offload_buffers:
is_buffer_fit_any_gpu = False
for gpu_device, gpu_max_memory in max_memory.items():
if gpu_device == "cpu" or gpu_device == "disk":
continue
if not is_buffer_fit_any_gpu:
gpu_memory_used = device_memory_used.get(gpu_device, 0)
if gpu_max_memory >= non_gpu_buffer_size + gpu_memory_used:
is_buffer_fit_any_gpu = True
if len(gpus) > 0 and not is_buffer_fit_any_gpu:
logger.warning(
f"Current model requires {non_gpu_buffer_size} bytes of buffer for offloaded layers, which seems does "
f"not fit any GPU's remaining memory. If you are experiencing a OOM later, please consider using "
f"offload_buffers=True."
)
if device_minimum_assignment_memory:
devices_info = "\n".join(
f" - {device}: {mem} bytes required" for device, mem in device_minimum_assignment_memory.items()
)
logger.info(
f"Based on the current allocation process, no modules could be assigned to the following devices due to "
f"insufficient memory:\n"
f"{devices_info}\n"
f"These minimum requirements are specific to this allocation attempt and may vary. Consider increasing "
f"the available memory for these devices to at least the specified minimum, or adjusting the model config."
)
check_tied_parameters_on_same_device(tied_parameters, device_map)
return device_map
def _get_param_device(param, device_map):
if param in device_map:
return device_map[param]
parent_param = ".".join(param.split(".")[:-1])
if parent_param == param:
raise ValueError(f"The `device_map` does not contain the module {param}.")
else:
return _get_param_device(parent_param, device_map)
def check_tied_parameters_on_same_device(tied_params, device_map):
"""
Check if tied parameters are on the same device
Args:
tied_params (`List[List[str]]`):
A list of lists of parameter names being all tied together.
device_map (`Dict[str, Union[int, str, torch.device]]`):
A map that specifies where each submodule should go.
"""
for tie_param in tied_params:
tie_param_devices = {}
for param in tie_param:
tie_param_devices[param] = _get_param_device(param, device_map)
if len(set(tie_param_devices.values())) > 1:
logger.warning(
f"Tied parameters are on different devices: {tie_param_devices}. "
"Please modify your custom device map or set `device_map='auto'`. "
)

View File

@ -1,10 +1,6 @@
import inspect
from collections import defaultdict
from inspect import signature
from typing import Optional
from ..core_model_loading import ConversionOps
from ..quantizers.quantizers_utils import get_module_from_name
from ..utils import (
get_available_devices,
is_accelerate_available,
@ -31,112 +27,12 @@ if is_accelerate_available():
logger = logging.get_logger(__name__)
class Bnb4bitQuantize(ConversionOps):
def __init__(self, hf_quantizer):
self.hf_quantizer = hf_quantizer
def convert(
self, input_dict: torch.Tensor, model: Optional[torch.nn.Module] = None, missing_keys=None, **kwargs
) -> dict[str, torch.Tensor]:
"""
we need to store some parameters to create the quantized weight. For example, bnb requires 6 values that are stored in the checkpoint to recover the quantized weight. So we store them in a dict that it stored in hf_quantizer for now as we can't save it in the op since we create an op per tensor.
"""
target_key, value = tuple(input_dict.items())[0]
value = value[0] if isinstance(value, list) else value
full_name = target_key
# update param name to get the weights instead of the quantized stats
target_key = self.hf_quantizer.get_param_name(target_key)
module, _ = get_module_from_name(model, target_key)
if not self.hf_quantizer.pre_quantized:
# Support models using `Conv1D` in place of `nn.Linear` (e.g. openai-community/gpt2) by transposing the weight matrix prior to quantization.
# Since weights are saved in the correct "orientation", we skip transposing when loading.
if issubclass(module.source_cls, Conv1D):
value = value.T
old_value = model.get_parameter_or_buffer(target_key)
new_value = bnb.nn.Params4bit(value, requires_grad=False, **old_value.__dict__).to(value.device)
# remove missing keys that were create when initializing Params4bit
for key in new_value.quant_state.as_dict(packed=True).keys():
missing_keys.discard(f"{full_name}.{key}")
return {target_key: new_value}
else:
module_name = target_key.rsplit(".", 1)[0]
# Save the states for later quantization when they are all gathered
if not hasattr(self.hf_quantizer, "param_quant_stats"):
self.hf_quantizer.param_quant_stats = defaultdict(dict)
self.hf_quantizer.param_quant_stats[module_name].update({full_name: value})
missing_keys.discard(full_name)
# We are ready for quantization in this case (note, the +1 is for the weight itself)
if len(self.hf_quantizer.param_quant_stats[module_name]) == len(self.hf_quantizer.bnb_keys) + 1:
weight = self.hf_quantizer.param_quant_stats[module_name].pop(f"{module_name}.weight")
new_value = bnb.nn.Params4bit.from_prequantized(
data=weight,
quantized_stats=self.hf_quantizer.param_quant_stats[module_name],
requires_grad=False,
device=value.device,
module=module,
)
del self.hf_quantizer.param_quant_stats[module_name]
return {target_key: new_value}
return {}
class Bnb8bitQuantize(ConversionOps):
def __init__(self, hf_quantizer):
self.hf_quantizer = hf_quantizer
def convert(
self, input_dict: torch.Tensor, model: Optional[torch.nn.Module] = None, missing_keys=None, **kwargs
) -> dict[str, torch.Tensor]:
target_key, value = tuple(input_dict.items())[0]
value = value[0] if isinstance(value, list) else value
module, tensor_name = get_module_from_name(model, target_key)
module_name = target_key.rsplit(".", 1)[0]
if not self.hf_quantizer.pre_quantized:
# Support models using `Conv1D` in place of `nn.Linear` (e.g. openai-community/gpt2) by transposing the weight matrix prior to quantization.
# Since weights are saved in the correct "orientation", we skip transposing when loading.
if issubclass(module.source_cls, Conv1D):
value = value.T
value_device = value.device
kwargs = getattr(module, tensor_name).__dict__
kwargs.pop("SCB", None)
new_value = bnb.nn.Int8Params(value.to("cpu"), requires_grad=False, **kwargs).to(value_device)
missing_keys.discard(f"{module_name}.weight_format")
missing_keys.discard(f"{module_name}.SCB")
return {target_key: new_value}
else:
missing_keys.discard(target_key)
# useless key that gets saved for no reason
if tensor_name.endswith("weight_format"):
return {}
# Save the states for later quantization when they are all gathered
if not hasattr(self.hf_quantizer, "param_quant_stats"):
self.hf_quantizer.param_quant_stats = defaultdict(dict)
self.hf_quantizer.param_quant_stats[module_name].update({target_key: value})
# We are ready for quantization in this case (SCB and the weight)
if len(self.hf_quantizer.param_quant_stats[module_name]) == 2:
weight = self.hf_quantizer.param_quant_stats[module_name].pop(f"{module_name}.weight")
kwargs = getattr(module, "weight").__dict__
weight_device = weight.device
new_value = bnb.nn.Int8Params(weight.to("cpu"), requires_grad=False, **kwargs).to(weight_device)
setattr(new_value, "SCB", self.hf_quantizer.param_quant_stats[module_name][f"{module_name}.SCB"])
del self.hf_quantizer.param_quant_stats[module_name]
# sometimes, weight_format is not saved so we need to remove it from missing keys ...
missing_keys.discard(f"{module_name}.weight_format")
return {f"{module_name}.weight": new_value}
return {}
def _replace_with_bnb_linear(
model,
modules_to_not_convert=None,
current_key_name=None,
quantization_config=None,
has_been_replaced=False,
pre_quantized=False,
):
"""
Private method that wraps the recursion for module replacement.
@ -162,18 +58,13 @@ def _replace_with_bnb_linear(
out_features = module.out_features
if quantization_config.quantization_method() == "llm_int8":
new_module = bnb.nn.Linear8bitLt(
model._modules[name] = bnb.nn.Linear8bitLt(
in_features,
out_features,
module.bias is not None,
has_fp16_weights=quantization_config.llm_int8_has_fp16_weight,
threshold=quantization_config.llm_int8_threshold,
)
# hack to create the correct keys in the state dict with the right dtype
new_module.weight.SCB = torch.empty(1, dtype=torch.float32)
if pre_quantized:
new_module.weight.data = new_module.weight.data.to(dtype=torch.int8)
model._modules[name] = new_module
has_been_replaced = True
else:
if (
@ -187,7 +78,7 @@ def _replace_with_bnb_linear(
if "quant_storage" in list(signature(bnb.nn.Linear4bit).parameters)
else {}
)
new_module = bnb.nn.Linear4bit(
model._modules[name] = bnb.nn.Linear4bit(
in_features,
out_features,
module.bias is not None,
@ -196,30 +87,6 @@ def _replace_with_bnb_linear(
quant_type=quantization_config.bnb_4bit_quant_type,
**extra_kwargs,
)
from bitsandbytes.functional import QuantState
# hack to create the correct keys in the state dict with the right dtype
absmax_dtype = (
torch.uint8 if quantization_config.bnb_4bit_use_double_quant else torch.float32
)
new_module.weight.quant_state = QuantState(
absmax=torch.empty(1, dtype=absmax_dtype),
code=torch.empty(1, dtype=torch.float32),
shape=(1,),
offset=torch.empty(1),
quant_type=quantization_config.bnb_4bit_quant_type,
state2=QuantState(
absmax=torch.empty(1, dtype=torch.float32),
code=torch.empty(1, dtype=torch.float32),
)
if quantization_config.bnb_4bit_use_double_quant
else None,
)
if pre_quantized:
# this is kind of an edge case when supporting both loading and quantization ...
# we need to set the right dtype as we cast the checkpoint with the dtype of the meta model
new_module.weight.data = new_module.weight.data.to(dtype=torch.uint8)
model._modules[name] = new_module
has_been_replaced = True
# Store the module class in case we need to transpose the weight later
model._modules[name].source_cls = type(module)
@ -232,16 +99,13 @@ def _replace_with_bnb_linear(
current_key_name,
quantization_config,
has_been_replaced=has_been_replaced,
pre_quantized=pre_quantized,
)
# Remove the last key for recursion
current_key_name.pop(-1)
return model, has_been_replaced
def replace_with_bnb_linear(
model, modules_to_not_convert=None, current_key_name=None, quantization_config=None, pre_quantized=False
):
def replace_with_bnb_linear(model, modules_to_not_convert=None, current_key_name=None, quantization_config=None):
"""
A helper function to replace all `torch.nn.Linear` modules by `bnb.nn.Linear8bit` modules from the `bitsandbytes`
library. This will enable running your models using mixed int8 precision as described by the paper `LLM.int8():
@ -273,7 +137,7 @@ def replace_with_bnb_linear(
"""
modules_to_not_convert = ["lm_head"] if modules_to_not_convert is None else modules_to_not_convert
model, has_been_replaced = _replace_with_bnb_linear(
model, modules_to_not_convert, current_key_name, quantization_config, pre_quantized=pre_quantized
model, modules_to_not_convert, current_key_name, quantization_config
)
if not has_been_replaced:

View File

@ -549,8 +549,6 @@ def replace_with_fp8_linear(
quantization_config=None,
):
"""Helper function to replace model layers with FP8 versions."""
if modules_to_not_convert is None:
modules_to_not_convert = []
modules_to_not_convert += ["lm_head"]
if quantization_config.modules_to_not_convert is not None:
@ -572,29 +570,35 @@ def replace_with_fp8_linear(
return model
class Fp8Quantize(ConversionOps):
class QuantizationOp(ConversionOps):
"""Base class for quantization operations."""
pass
class Fp8Quantize(QuantizationOp):
"""
A quantization operation that creates two tensors, weight and scale out of a weight.
"""
reverse_op: type[ConversionOps]
def __init__(self, hf_quantizer):
self.hf_quantizer = hf_quantizer
def __init__(self, block_size: Optional[tuple[int, int]] = None):
self.block_size = block_size
self.reverse_op = Fp8Dequantize
def convert(self, input_dict: torch.Tensor, **kwargs) -> dict[str, torch.Tensor]:
def convert(self, input_dict: torch.Tensor, *, quant_config: dict[str, Any]) -> dict[str, torch.Tensor]:
# Unpack single key/value (value may be wrapped in a list)
target_keys, value = tuple(input_dict.items())[0]
value = value[0] if isinstance(value, list) else value
# Resolve block size (support dict-like or attr-like quant_config)
block_size = None
if self.hf_quantizer.quantization_config is not None:
if isinstance(self.hf_quantizer.quantization_config, dict):
block_size = self.hf_quantizer.quantization_config.get("weight_block_size")
if quant_config is not None:
if isinstance(quant_config, dict):
block_size = quant_config.get("weight_block_size")
else:
block_size = getattr(self.hf_quantizer.quantization_config, "weight_block_size", None)
block_size = getattr(quant_config, "weight_block_size", None)
if block_size is None:
block_size = (value.shape[-2], value.shape[-1])
@ -652,7 +656,7 @@ class Fp8Quantize(ConversionOps):
}
class Fp8Dequantize(ConversionOps):
class Fp8Dequantize(QuantizationOp):
"""Inverse operation of :class:`Fp8Quantize`. Takes a pair (weight, scale) and reconstructs the fp32 tensor."""
def __init__(self, block_size: Optional[tuple[int, int]] = None):

View File

@ -654,18 +654,17 @@ def maybe_load_adapters(
token_from_adapter_kwargs = adapter_kwargs.pop("token", None)
if _adapter_model_path is None:
peft_kwargs = adapter_kwargs.copy()
for arg_name in ("cache_dir", "proxies", "subfolder"): # don't override revision
if (arg_name not in peft_kwargs) and (arg_name in download_kwargs):
peft_kwargs[arg_name] = download_kwargs[arg_name]
if "commit_hash" in download_kwargs:
peft_kwargs["_commit_hash"] = download_kwargs["commit_hash"]
peft_kwargs["force_download"] = bool(download_kwargs.get("force_download", False))
peft_kwargs["local_files_only"] = bool(download_kwargs.get("local_files_only", False))
peft_kwargs["token"] = token or token_from_adapter_kwargs
_adapter_model_path = find_adapter_config_file(
pretrained_model_name_or_path,
**peft_kwargs,
cache_dir=download_kwargs.get("cache_dir"),
force_download=bool(download_kwargs.get("force_download", False)),
proxies=download_kwargs.get("proxies"),
token=token or token_from_adapter_kwargs,
revision=download_kwargs.get("revision"),
local_files_only=bool(download_kwargs.get("local_files_only", False)),
subfolder=download_kwargs.get("subfolder", ""),
_commit_hash=download_kwargs.get("commit_hash"),
**adapter_kwargs,
)
if _adapter_model_path is not None and os.path.isfile(_adapter_model_path):

View File

@ -447,7 +447,6 @@ def RTDetrForObjectDetectionLoss(
outputs_loss = {}
outputs_loss["logits"] = logits
outputs_loss["pred_boxes"] = pred_boxes
auxiliary_outputs = None
if config.auxiliary_loss:
if denoising_meta_values is not None:
dn_out_coord, outputs_coord = torch.split(outputs_coord, denoising_meta_values["dn_num_split"], dim=2)

View File

@ -408,7 +408,7 @@ def _prepare_4d_causal_attention_mask_for_sdpa(
# Attend to all tokens in masked rows from the causal_mask, for example the relevant first rows when
# using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path.
# Details: https://github.com/pytorch/pytorch/issues/110213
if not is_tracing_ and expanded_4d_mask.device.type in ["cuda", "xpu"]:
if not is_tracing_ and expanded_4d_mask.device.type == "cuda":
expanded_4d_mask = AttentionMaskConverter._unmask_unattended(
expanded_4d_mask, min_dtype=torch.finfo(inputs_embeds.dtype).min
)

View File

@ -47,11 +47,7 @@ from torch.utils.checkpoint import checkpoint
from . import initialization as init
from .configuration_utils import PreTrainedConfig
from .conversion_mapping import get_checkpoint_conversion_mapping
from .core_model_loading import (
WeightConverter,
convert_and_load_state_dict_in_model,
revert_weight_conversion,
)
from .core_model_loading import WeightConverter, convert_and_load_state_dict_in_model, revert_weight_conversion
from .distributed import DistributedConfig
from .dynamic_module_utils import custom_object_save
from .generation import CompileConfig, GenerationConfig
@ -1460,8 +1456,6 @@ class PreTrainedModel(nn.Module, EmbeddingAccessMixin, ModuleUtilsMixin, PushToH
self.config._attn_implementation_internal = self._check_and_adjust_attn_implementation(
self.config._attn_implementation, is_init_check=True
)
if self.can_generate():
self.generation_config = GenerationConfig.from_model_config(config)
# for initialization of the loss
loss_type = self.__class__.__name__
@ -1476,6 +1470,8 @@ class PreTrainedModel(nn.Module, EmbeddingAccessMixin, ModuleUtilsMixin, PushToH
self.name_or_path = config.name_or_path
self.warnings_issued = {}
self.generation_config = GenerationConfig.from_model_config(config) if self.can_generate() else None
# Overwrite the class attribute to make it an instance attribute, so models like
# `InstructBlipForConditionalGeneration` can dynamically update it without modifying the class attribute
# when a different component (e.g. language_model) is used.
@ -1499,8 +1495,11 @@ class PreTrainedModel(nn.Module, EmbeddingAccessMixin, ModuleUtilsMixin, PushToH
# Attach the different parallel plans and tied weight keys to the top-most model, so that everything is
# easily available
self._tp_plan, self._ep_plan, self._pp_plan = {}, {}, {}
# Current submodel should register its tied weights
self.all_tied_weights_keys = self.get_expanded_tied_weights_keys(all_submodels=False)
# Current submodel should register its tied weights keys only if the config is asking for it
if not self.config.tie_word_embeddings and not self.config.tie_encoder_decoder:
self.all_tied_weights_keys = {}
else:
self.all_tied_weights_keys = self._tied_weights_keys.copy() if self._tied_weights_keys is not None else {}
# If current model is a base model, attach `base_model_tp_plan` and `base_model_pp_plan` from config
if self.base_model is self:
self._pp_plan = self.config.base_model_pp_plan.copy() if self.config.base_model_pp_plan is not None else {}
@ -2338,63 +2337,7 @@ class PreTrainedModel(nn.Module, EmbeddingAccessMixin, ModuleUtilsMixin, PushToH
# Let the magic happen with this simple call
self.smart_apply(self._initialize_weights)
def get_expanded_tied_weights_keys(self, all_submodels: bool = False) -> dict:
"""
Return the expanded tied weight keys (in case they contain modules or regex patterns) for only the current
model, or recursively for all submodels if `all_submodels=True` (i.e. it will re-check the config values for all
submodels).
"""
if all_submodels:
expanded_tied_weights = {}
for prefix, submodule in self.named_modules():
if isinstance(submodule, PreTrainedModel):
# Will dynamically check the config if it has changed
submodel_tied_weights = submodule.get_expanded_tied_weights_keys(all_submodels=False)
if prefix != "":
submodel_tied_weights = {f"{prefix}.{k}": f"{prefix}.{v}" for k, v in submodel_tied_weights.items()}
expanded_tied_weights.update(submodel_tied_weights)
return expanded_tied_weights
tied_mapping = self._tied_weights_keys
# If the config does not specify any tying, return empty dict
if not self.config.tie_word_embeddings and not self.config.tie_encoder_decoder:
return {}
# If None, return empty dict
elif tied_mapping is None:
return {}
# Short-cut for the most common cases: if the tied weights mapping only contains already expanded params,
# return it directly (the regex matches names containing only letters, numbers, dots, and underscores to make
# sure it does not contain a regex pattern, and finishing by "bias" or "weight" to make sure it's not a module)
common_case_regex = re.compile(r"^[A-Za-z0-9_\.]+(weight)|(bias)$")
if all(common_case_regex.match(k) for k in tied_mapping.keys() | tied_mapping.values()):
return tied_mapping.copy()
# We need to expand the regex patterns or the modules into proper parameters
expanded_tied_weights = {}
all_param_names = {k for k, _ in self.named_parameters(remove_duplicate=False)} | {
k for k, _ in self.named_buffers(remove_duplicate=False)
}
for target_name, source_name in tied_mapping.items():
target_name = "^" + target_name
source_name = "^" + source_name
source_params = sorted(filter(lambda x: re.search(source_name, x), all_param_names))
target_params = sorted(filter(lambda x: re.search(target_name, x), all_param_names))
if (
not len(source_params) > 0
or not len(target_params) > 0
or len(target_params) % len(source_params) != 0
):
raise ValueError(
f"There is an issue with your definition of `tie_weights_keys` for {source_name}:{target_name}. We found {source_params} to tie into {target_params}"
)
# we cycle source as it should be dispatch in many target if regex
for target_n, source_n in zip(target_params, cycle(source_params)):
expanded_tied_weights[target_n] = source_n
return expanded_tied_weights
def tie_weights(self, missing_keys: Optional[set[str]] = None, recompute_mapping: bool = True):
def tie_weights(self, missing_keys: Optional[set[str]] = None):
"""
If set in the config, tie the weights between the input embeddings and the output embeddings,
and the encoder and decoder. This relies on the `_tied_weights_keys` dict.
@ -2443,35 +2386,50 @@ class PreTrainedModel(nn.Module, EmbeddingAccessMixin, ModuleUtilsMixin, PushToH
If you call this function, it will always tie. There is only 1 tricky case, if all weights are missing, you still want to mention that
the ones you tied were missing.
"""
# In this case, the keys stored in `all_tied_weights_keys` are already correct
if not recompute_mapping:
tied_keys = self.all_tied_weights_keys
else:
tied_keys = self.get_expanded_tied_weights_keys(all_submodels=True)
# TODO Cyril: using this fixed set of keys (set in post_init()) does not allow to switch the config flag and re-tie
mapping = getattr(self, "all_tied_weights_keys", None)
if not isinstance(mapping, dict):
return
for target_param_name, source_param_name in tied_keys.items():
source_param = self.get_parameter_or_buffer(source_param_name)
if "." in target_param_name:
parent_name, name = target_param_name.rsplit(".", 1)
parent = self.get_submodule(parent_name)
# TODO let's pray this is not too slow :)
top_level_params = dict(self.named_parameters(remove_duplicate=False)) | dict(
self.named_buffers(remove_duplicate=False)
)
for target_name, source_name in mapping.items():
source_name = "^" + source_name
target_name = "^" + target_name
source_is_there = bool(missing_keys) and not re.search(
source_name, "\n".join(missing_keys), flags=re.MULTILINE
)
source_params = sorted(filter(lambda x: re.search(source_name, x), top_level_params.keys()))
target_params = sorted(filter(lambda x: re.search(target_name, x), top_level_params.keys()))
if not len(source_params) > 0 or len(target_params) % len(source_params) != 0:
raise ValueError(
f"There is an issue with your definition of `tie_weights_keys` for {source_name}:{target_name}. We found {source_params} to tie into {target_params}"
)
if len(target_params) > 0:
# we cycle source as it should be dispatch in many target if regex
for target_n, source_n in zip(target_params, cycle(source_params)):
if "." in target_n:
parent_path, last = target_n.rsplit(".", 1)
parent = self.get_submodule(parent_path)
else:
parent_path, last = "", target_n
parent = self # top-level
setattr(parent, last, top_level_params[source_n])
self._adjust_bias(parent, top_level_params[source_n])
if missing_keys and source_is_there: # test_model_weights_reload_no_missing_tied_weights
missing_keys.discard(target_n)
else:
name = target_param_name
parent = self
setattr(parent, name, source_param)
self._adjust_bias(parent, source_param)
if missing_keys is not None:
source_is_there = not re.search(source_param_name, "\n".join(missing_keys), flags=re.MULTILINE)
target_is_there = not re.search(target_param_name, "\n".join(missing_keys), flags=re.MULTILINE)
if source_is_there:
missing_keys.discard(target_param_name)
# If the source is not present, the checkpoint is corrupted
else:
target_present = "not present either" if not target_is_there else "present"
raise ValueError(
f"This checkpoint seem corrupted. The tied weights mapping for this model specifies to tie "
f"{source_param_name} (which should be present and is not), to {target_param_name} (which is "
f"{target_present})"
)
target_is_not_there = missing_keys and re.search(
target_name, "\n".join(missing_keys), flags=re.MULTILINE
)
raise ValueError(
"There is a problem in the way you tie your keys or the way they were saved.\n"
f"source_is_there={source_is_there}, target_is_there={not target_is_not_there}, missing_keys={missing_keys},"
"tie_word_embeddings/tie_encoder_decoder={(self.config.tie_word_embeddings or self.config.tie_encoder_decoder)}"
)
def _adjust_bias(self, output_embeddings, input_embeddings):
if getattr(output_embeddings, "bias", None) is not None and hasattr(output_embeddings, "weight"):
@ -2987,8 +2945,9 @@ class PreTrainedModel(nn.Module, EmbeddingAccessMixin, ModuleUtilsMixin, PushToH
if _init_weights:
# Initialize weights
self.initialize_weights()
# Tie weights needs to be called here, but it can use the pre-computed `all_tied_weights_keys`
self.tie_weights(recompute_mapping=False)
# Tie weights needs to be called as it figures out recursively if sub modules
# need to tie
self.tie_weights()
def gradient_checkpointing_enable(self, gradient_checkpointing_kwargs=None):
"""
@ -3090,7 +3049,7 @@ class PreTrainedModel(nn.Module, EmbeddingAccessMixin, ModuleUtilsMixin, PushToH
variant: Optional[str] = None,
token: Optional[Union[str, bool]] = None,
save_peft_format: bool = True,
save_original_format: bool = False, # TODO next PR will make it go to True
save_original_format: bool = True, # TODO next PR will make it go to True
**kwargs,
):
"""
@ -3215,6 +3174,19 @@ class PreTrainedModel(nn.Module, EmbeddingAccessMixin, ModuleUtilsMixin, PushToH
# Save the config
if is_main_process:
if not _hf_peft_config_loaded:
# If the model config has set attributes that should be in the generation config, move them there.
misplaced_generation_parameters = model_to_save.config._get_non_default_generation_parameters()
if self.can_generate() and len(misplaced_generation_parameters) > 0:
warnings.warn(
"Moving the following attributes in the config to the generation config: "
f"{misplaced_generation_parameters}. You are seeing this warning because you've set "
"generation parameters in the model config, as opposed to in the generation config.",
UserWarning,
)
for param_name, param_value in misplaced_generation_parameters.items():
setattr(model_to_save.generation_config, param_name, param_value)
setattr(model_to_save.config, param_name, None)
model_to_save.config.save_pretrained(save_directory)
if self.can_generate():
model_to_save.generation_config.save_pretrained(save_directory)
@ -3369,7 +3341,8 @@ class PreTrainedModel(nn.Module, EmbeddingAccessMixin, ModuleUtilsMixin, PushToH
# MEGA BIG TODO HERE: self._conversion_ops needs to be used to save the final ckpt
# using what was loaded. Actually self._conversion_ops wont work because we need it
# even if the files are not legacy -> thus no conversion happened
state_dict = revert_weight_conversion(self, state_dict)
weight_mapping = get_checkpoint_conversion_mapping(self.config.model_type)
state_dict = revert_weight_conversion(self, state_dict, weight_mapping)
# Shard the model if it is too big.
if not _hf_peft_config_loaded:
@ -3909,7 +3882,7 @@ class PreTrainedModel(nn.Module, EmbeddingAccessMixin, ModuleUtilsMixin, PushToH
subfolder = kwargs.pop("subfolder", "")
commit_hash = kwargs.pop("_commit_hash", None)
variant = kwargs.pop("variant", None)
adapter_kwargs = (kwargs.pop("adapter_kwargs", {}) or {}).copy()
adapter_kwargs = kwargs.pop("adapter_kwargs", {})
adapter_name = kwargs.pop("adapter_name", "default")
generation_config = kwargs.pop("generation_config", None)
gguf_file = kwargs.pop("gguf_file", None)
@ -4100,7 +4073,7 @@ class PreTrainedModel(nn.Module, EmbeddingAccessMixin, ModuleUtilsMixin, PushToH
# Prepare the full device map
if device_map is not None:
device_map = _get_device_map(model, device_map, max_memory, hf_quantizer)
device_map = _get_device_map(model, device_map, max_memory, hf_quantizer, dtype)
# restore default dtype
if dtype_orig is not None:
@ -4214,6 +4187,9 @@ class PreTrainedModel(nn.Module, EmbeddingAccessMixin, ModuleUtilsMixin, PushToH
expanded_device_map = expand_device_map(device_map, expected_keys)
caching_allocator_warmup(model, expanded_device_map, hf_quantizer)
if device_map is None:
device_map = {"": "cpu"}
keys = sorted(device_map.keys(), key=len, reverse=True)
tp_plan = getattr(model, "_tp_plan", None)
error_msgs = []
@ -4236,18 +4212,33 @@ class PreTrainedModel(nn.Module, EmbeddingAccessMixin, ModuleUtilsMixin, PushToH
missing_keys, unexpected_keys, mismatched_keys, misc = set(), set(), set(), set()
else:
all_pointer = set()
# Checkpoints are safetensors
if checkpoint_files is not None and checkpoint_files[0].endswith(".safetensors"):
pattern = re.compile(r"(" + "|".join(map(re.escape, keys)) + r")")
if sharded_metadata is None:
k_v_iterator = dict.fromkeys(
safe_open(checkpoint_files[0], framework="pt").keys(), checkpoint_files[0].rsplit("/", 1)[1]
).items()
else:
k_v_iterator = sharded_metadata["weight_map"].items()
merged_state_dict = {}
for file in checkpoint_files:
file_pointer = safe_open(file, framework="pt", device="cpu")
for k, v in k_v_iterator:
match = pattern.match(k)
if match and match.group(1) != "":
device = device_map[match.group(1)]
else:
device = device_map.get("", "cpu")
if isinstance(device, torch.device):
device = device.index # safetensors only
if device == "disk":
device = "cpu" # we read to cpu to then write to disk
file_pointer = safe_open(
os.path.join(checkpoint_files[0].rsplit("/", 1)[0], v), framework="pt", device=device
)
all_pointer.add(file_pointer)
for k in file_pointer.keys():
merged_state_dict[k] = file_pointer.get_slice(k) # don't materialize yet
# User passed an explicit state_dict
merged_state_dict[k] = file_pointer.get_slice(k) # don't materialize yet
elif state_dict is not None:
merged_state_dict = state_dict
# Checkpoints are .bin
elif checkpoint_files is not None:
merged_state_dict = {}
for ckpt_file in checkpoint_files:
@ -4283,7 +4274,7 @@ class PreTrainedModel(nn.Module, EmbeddingAccessMixin, ModuleUtilsMixin, PushToH
model._initialize_missing_keys(is_quantized)
# Tie the weights
model.tie_weights(missing_keys=missing_keys, recompute_mapping=False)
model.tie_weights(missing_keys)
# Adjust missing and unexpected keys
missing_keys, unexpected_keys = model._adjust_missing_and_unexpected_keys(missing_keys, unexpected_keys)
@ -4600,8 +4591,13 @@ class PreTrainedModel(nn.Module, EmbeddingAccessMixin, ModuleUtilsMixin, PushToH
This is very important as most embeddings are tied, and they are huge params (vocabularies are often 256k), so
running inits on them is very costly."""
for tied_param in self.all_tied_weights_keys.keys():
param = self.get_parameter(tied_param)
param._is_hf_initialized = True
# It's always a proper weight except for 2 or 3 old models where it's a regex or module set to None
# -> just skip it in those cases (they will just re-init before tying, so they loose the added optimization)
try:
param = self.get_parameter(tied_param)
param._is_hf_initialized = True
except AttributeError:
pass
def get_parameter_or_buffer(self, target: str):
"""
@ -4714,7 +4710,7 @@ def caching_allocator_warmup(model: PreTrainedModel, expanded_device_map: dict,
else None
)
total_byte_count = defaultdict(lambda: 0)
tied_param_names = model.all_tied_weights_keys.keys()
tied_param_names = _get_tied_weight_keys(model)
for param_name, device in accelerator_device_map.items():
# Skip if the parameter has already been accounted for (tied weights)
if param_name in tied_param_names:
@ -4728,9 +4724,6 @@ def caching_allocator_warmup(model: PreTrainedModel, expanded_device_map: dict,
try:
param = model.get_parameter_or_buffer(param_name)
except AttributeError:
# TODO: for now let's skip if we can't find the parameters
if hf_quantizer is not None:
continue
raise AttributeError(f"Parameter {param_name} not found in model")
# The dtype of different parameters may be different with composite models or `keep_in_fp32_modules`

View File

@ -596,7 +596,7 @@ class AriaTextPreTrainedModel(PreTrainedModel):
@auto_docstring
class AriaPreTrainedModel(PreTrainedModel):
config: AriaConfig
base_model_prefix = "model"
base_model_prefix = ""
supports_gradient_checkpointing = True
_no_split_modules = ["AriaDecoderLayer"]
_skip_keys_device_placement = ["past_key_values"]
@ -893,10 +893,6 @@ class AriaModelOutputWithPast(BaseModelOutputWithPast):
"""
)
class AriaModel(AriaPreTrainedModel):
_checkpoint_conversion_mapping = {
r"^language_model.model": "language_model",
}
def __init__(self, config: AriaConfig):
super().__init__(config)
self.vision_tower = AutoModel.from_config(config.vision_config)

View File

@ -1206,7 +1206,7 @@ class AriaTextPreTrainedModel(PreTrainedModel):
class AriaPreTrainedModel(LlamaPreTrainedModel):
config: AriaConfig
base_model_prefix = "model"
base_model_prefix = ""
_can_compile_fullgraph = False # MoE models don't work with torch.compile (dynamic slicing)
_supports_attention_backend = True

View File

@ -288,9 +288,8 @@ class _BaseAutoModelClass:
if is_peft_available():
if adapter_kwargs is None:
adapter_kwargs = {}
adapter_kwargs = adapter_kwargs.copy() # avoid mutating original
if token is not None:
adapter_kwargs["token"] = token
if token is not None:
adapter_kwargs["token"] = token
maybe_adapter_path = find_adapter_config_file(
pretrained_model_name_or_path, _commit_hash=commit_hash, **adapter_kwargs

View File

@ -114,6 +114,7 @@ CONFIG_MAPPING_NAMES = OrderedDict[str, str](
("deit", "DeiTConfig"),
("depth_anything", "DepthAnythingConfig"),
("depth_pro", "DepthProConfig"),
("deta", "DetaConfig"),
("detr", "DetrConfig"),
("dia", "DiaConfig"),
("diffllama", "DiffLlamaConfig"),
@ -131,6 +132,7 @@ CONFIG_MAPPING_NAMES = OrderedDict[str, str](
("edgetam", "EdgeTamConfig"),
("edgetam_video", "EdgeTamVideoConfig"),
("edgetam_vision_model", "EdgeTamVisionConfig"),
("efficientformer", "EfficientFormerConfig"),
("efficientloftr", "EfficientLoFTRConfig"),
("efficientnet", "EfficientNetConfig"),
("electra", "ElectraConfig"),
@ -141,6 +143,7 @@ CONFIG_MAPPING_NAMES = OrderedDict[str, str](
("ernie", "ErnieConfig"),
("ernie4_5", "Ernie4_5Config"),
("ernie4_5_moe", "Ernie4_5_MoeConfig"),
("ernie_m", "ErnieMConfig"),
("esm", "EsmConfig"),
("evolla", "EvollaConfig"),
("exaone4", "Exaone4Config"),
@ -187,12 +190,14 @@ CONFIG_MAPPING_NAMES = OrderedDict[str, str](
("gpt_neox_japanese", "GPTNeoXJapaneseConfig"),
("gpt_oss", "GptOssConfig"),
("gptj", "GPTJConfig"),
("gptsan-japanese", "GPTSanJapaneseConfig"),
("granite", "GraniteConfig"),
("granite_speech", "GraniteSpeechConfig"),
("granitemoe", "GraniteMoeConfig"),
("granitemoehybrid", "GraniteMoeHybridConfig"),
("granitemoeshared", "GraniteMoeSharedConfig"),
("granitevision", "LlavaNextConfig"),
("graphormer", "GraphormerConfig"),
("grounding-dino", "GroundingDinoConfig"),
("groupvit", "GroupViTConfig"),
("helium", "HeliumConfig"),
@ -216,6 +221,7 @@ CONFIG_MAPPING_NAMES = OrderedDict[str, str](
("jamba", "JambaConfig"),
("janus", "JanusConfig"),
("jetmoe", "JetMoeConfig"),
("jukebox", "JukeboxConfig"),
("kosmos-2", "Kosmos2Config"),
("kosmos-2.5", "Kosmos2_5Config"),
("kyutai_speech_to_text", "KyutaiSpeechToTextConfig"),
@ -251,6 +257,8 @@ CONFIG_MAPPING_NAMES = OrderedDict[str, str](
("maskformer", "MaskFormerConfig"),
("maskformer-swin", "MaskFormerSwinConfig"),
("mbart", "MBartConfig"),
("mctct", "MCTCTConfig"),
("mega", "MegaConfig"),
("megatron-bert", "MegatronBertConfig"),
("metaclip_2", "MetaClip2Config"),
("mgp-str", "MgpstrConfig"),
@ -279,7 +287,9 @@ CONFIG_MAPPING_NAMES = OrderedDict[str, str](
("musicgen", "MusicgenConfig"),
("musicgen_melody", "MusicgenMelodyConfig"),
("mvp", "MvpConfig"),
("nat", "NatConfig"),
("nemotron", "NemotronConfig"),
("nezha", "NezhaConfig"),
("nllb-moe", "NllbMoeConfig"),
("nougat", "VisionEncoderDecoderConfig"),
("nystromformer", "NystromformerConfig"),
@ -289,6 +299,7 @@ CONFIG_MAPPING_NAMES = OrderedDict[str, str](
("olmoe", "OlmoeConfig"),
("omdet-turbo", "OmDetTurboConfig"),
("oneformer", "OneFormerConfig"),
("open-llama", "OpenLlamaConfig"),
("openai-gpt", "OpenAIGPTConfig"),
("opt", "OPTConfig"),
("ovis2", "Ovis2Config"),
@ -317,6 +328,7 @@ CONFIG_MAPPING_NAMES = OrderedDict[str, str](
("prophetnet", "ProphetNetConfig"),
("pvt", "PvtConfig"),
("pvt_v2", "PvtV2Config"),
("qdqbert", "QDQBertConfig"),
("qwen2", "Qwen2Config"),
("qwen2_5_omni", "Qwen2_5OmniConfig"),
("qwen2_5_vl", "Qwen2_5_VLConfig"),
@ -335,11 +347,13 @@ CONFIG_MAPPING_NAMES = OrderedDict[str, str](
("qwen3_vl_moe_text", "Qwen3VLMoeTextConfig"),
("qwen3_vl_text", "Qwen3VLTextConfig"),
("rag", "RagConfig"),
("realm", "RealmConfig"),
("recurrent_gemma", "RecurrentGemmaConfig"),
("reformer", "ReformerConfig"),
("regnet", "RegNetConfig"),
("rembert", "RemBertConfig"),
("resnet", "ResNetConfig"),
("retribert", "RetriBertConfig"),
("roberta", "RobertaConfig"),
("roberta-prelayernorm", "RobertaPreLayerNormConfig"),
("roc_bert", "RoCBertConfig"),
@ -373,6 +387,7 @@ CONFIG_MAPPING_NAMES = OrderedDict[str, str](
("smolvlm_vision", "SmolVLMVisionConfig"),
("speech-encoder-decoder", "SpeechEncoderDecoderConfig"),
("speech_to_text", "Speech2TextConfig"),
("speech_to_text_2", "Speech2Text2Config"),
("speecht5", "SpeechT5Config"),
("splinter", "SplinterConfig"),
("squeezebert", "SqueezeBertConfig"),
@ -395,7 +410,10 @@ CONFIG_MAPPING_NAMES = OrderedDict[str, str](
("timesformer", "TimesformerConfig"),
("timm_backbone", "TimmBackboneConfig"),
("timm_wrapper", "TimmWrapperConfig"),
("trajectory_transformer", "TrajectoryTransformerConfig"),
("transfo-xl", "TransfoXLConfig"),
("trocr", "TrOCRConfig"),
("tvlt", "TvltConfig"),
("tvp", "TvpConfig"),
("udop", "UdopConfig"),
("umt5", "UMT5Config"),
@ -403,6 +421,7 @@ CONFIG_MAPPING_NAMES = OrderedDict[str, str](
("unispeech-sat", "UniSpeechSatConfig"),
("univnet", "UnivNetConfig"),
("upernet", "UperNetConfig"),
("van", "VanConfig"),
("vaultgemma", "VaultGemmaConfig"),
("video_llama_3", "VideoLlama3Config"),
("video_llama_3_vision", "VideoLlama3VisionConfig"),
@ -414,6 +433,7 @@ CONFIG_MAPPING_NAMES = OrderedDict[str, str](
("vision-text-dual-encoder", "VisionTextDualEncoderConfig"),
("visual_bert", "VisualBertConfig"),
("vit", "ViTConfig"),
("vit_hybrid", "ViTHybridConfig"),
("vit_mae", "ViTMAEConfig"),
("vit_msn", "ViTMSNConfig"),
("vitdet", "VitDetConfig"),
@ -434,6 +454,7 @@ CONFIG_MAPPING_NAMES = OrderedDict[str, str](
("xcodec", "XcodecConfig"),
("xglm", "XGLMConfig"),
("xlm", "XLMConfig"),
("xlm-prophetnet", "XLMProphetNetConfig"),
("xlm-roberta", "XLMRobertaConfig"),
("xlm-roberta-xl", "XLMRobertaXLConfig"),
("xlnet", "XLNetConfig"),
@ -487,6 +508,7 @@ MODEL_NAMES_MAPPING = OrderedDict[str, str](
("blip_2_qformer", "BLIP-2 QFormer"),
("bloom", "BLOOM"),
("blt", "Blt"),
("bort", "BORT"),
("bridgetower", "BridgeTower"),
("bros", "BROS"),
("byt5", "ByT5"),
@ -538,6 +560,7 @@ MODEL_NAMES_MAPPING = OrderedDict[str, str](
("depth_anything", "Depth Anything"),
("depth_anything_v2", "Depth Anything V2"),
("depth_pro", "DepthPro"),
("deta", "DETA"),
("detr", "DETR"),
("dia", "Dia"),
("dialogpt", "DialoGPT"),
@ -557,6 +580,7 @@ MODEL_NAMES_MAPPING = OrderedDict[str, str](
("edgetam", "EdgeTAM"),
("edgetam_video", "EdgeTamVideo"),
("edgetam_vision_model", "EdgeTamVisionModel"),
("efficientformer", "EfficientFormer"),
("efficientloftr", "EfficientLoFTR"),
("efficientnet", "EfficientNet"),
("electra", "ELECTRA"),
@ -567,6 +591,7 @@ MODEL_NAMES_MAPPING = OrderedDict[str, str](
("ernie", "ERNIE"),
("ernie4_5", "Ernie4_5"),
("ernie4_5_moe", "Ernie4_5_MoE"),
("ernie_m", "ErnieM"),
("esm", "ESM"),
("evolla", "Evolla"),
("exaone4", "EXAONE-4.0"),
@ -616,12 +641,14 @@ MODEL_NAMES_MAPPING = OrderedDict[str, str](
("gpt_neox_japanese", "GPT NeoX Japanese"),
("gpt_oss", "GptOss"),
("gptj", "GPT-J"),
("gptsan-japanese", "GPTSAN-japanese"),
("granite", "Granite"),
("granite_speech", "GraniteSpeech"),
("granitemoe", "GraniteMoeMoe"),
("granitemoehybrid", "GraniteMoeHybrid"),
("granitemoeshared", "GraniteMoeSharedMoe"),
("granitevision", "LLaVA-NeXT"),
("graphormer", "Graphormer"),
("grounding-dino", "Grounding DINO"),
("groupvit", "GroupViT"),
("helium", "Helium"),
@ -646,6 +673,7 @@ MODEL_NAMES_MAPPING = OrderedDict[str, str](
("jamba", "Jamba"),
("janus", "Janus"),
("jetmoe", "JetMoe"),
("jukebox", "Jukebox"),
("kosmos-2", "KOSMOS-2"),
("kosmos-2.5", "KOSMOS-2.5"),
("kyutai_speech_to_text", "KyutaiSpeechToText"),
@ -686,6 +714,8 @@ MODEL_NAMES_MAPPING = OrderedDict[str, str](
("matcha", "MatCha"),
("mbart", "mBART"),
("mbart50", "mBART-50"),
("mctct", "M-CTC-T"),
("mega", "MEGA"),
("megatron-bert", "Megatron-BERT"),
("megatron_gpt2", "Megatron-GPT2"),
("metaclip_2", "MetaCLIP 2"),
@ -718,7 +748,9 @@ MODEL_NAMES_MAPPING = OrderedDict[str, str](
("musicgen_melody", "MusicGen Melody"),
("mvp", "MVP"),
("myt5", "myt5"),
("nat", "NAT"),
("nemotron", "Nemotron"),
("nezha", "Nezha"),
("nllb", "NLLB"),
("nllb-moe", "NLLB-MOE"),
("nougat", "Nougat"),
@ -729,6 +761,7 @@ MODEL_NAMES_MAPPING = OrderedDict[str, str](
("olmoe", "OLMoE"),
("omdet-turbo", "OmDet-Turbo"),
("oneformer", "OneFormer"),
("open-llama", "OpenLlama"),
("openai-gpt", "OpenAI GPT"),
("opt", "OPT"),
("ovis2", "Ovis2"),
@ -759,6 +792,7 @@ MODEL_NAMES_MAPPING = OrderedDict[str, str](
("prophetnet", "ProphetNet"),
("pvt", "PVT"),
("pvt_v2", "PVTv2"),
("qdqbert", "QDQBert"),
("qwen2", "Qwen2"),
("qwen2_5_omni", "Qwen2_5Omni"),
("qwen2_5_vl", "Qwen2_5_VL"),
@ -777,11 +811,13 @@ MODEL_NAMES_MAPPING = OrderedDict[str, str](
("qwen3_vl_moe_text", "Qwen3VLMoe"),
("qwen3_vl_text", "Qwen3VL"),
("rag", "RAG"),
("realm", "REALM"),
("recurrent_gemma", "RecurrentGemma"),
("reformer", "Reformer"),
("regnet", "RegNet"),
("rembert", "RemBERT"),
("resnet", "ResNet"),
("retribert", "RetriBERT"),
("roberta", "RoBERTa"),
("roberta-prelayernorm", "RoBERTa-PreLayerNorm"),
("roc_bert", "RoCBert"),
@ -815,6 +851,7 @@ MODEL_NAMES_MAPPING = OrderedDict[str, str](
("smolvlm_vision", "SmolVLMVisionTransformer"),
("speech-encoder-decoder", "Speech Encoder decoder"),
("speech_to_text", "Speech2Text"),
("speech_to_text_2", "Speech2Text2"),
("speecht5", "SpeechT5"),
("splinter", "Splinter"),
("squeezebert", "SqueezeBERT"),
@ -832,13 +869,17 @@ MODEL_NAMES_MAPPING = OrderedDict[str, str](
("t5v1.1", "T5v1.1"),
("table-transformer", "Table Transformer"),
("tapas", "TAPAS"),
("tapex", "TAPEX"),
("textnet", "TextNet"),
("time_series_transformer", "Time Series Transformer"),
("timesfm", "TimesFm"),
("timesformer", "TimeSformer"),
("timm_backbone", "TimmBackbone"),
("timm_wrapper", "TimmWrapperModel"),
("trajectory_transformer", "Trajectory Transformer"),
("transfo-xl", "Transformer-XL"),
("trocr", "TrOCR"),
("tvlt", "TVLT"),
("tvp", "TVP"),
("udop", "UDOP"),
("ul2", "UL2"),
@ -847,6 +888,7 @@ MODEL_NAMES_MAPPING = OrderedDict[str, str](
("unispeech-sat", "UniSpeechSat"),
("univnet", "UnivNet"),
("upernet", "UPerNet"),
("van", "VAN"),
("vaultgemma", "VaultGemma"),
("video_llama_3", "VideoLlama3"),
("video_llama_3_vision", "VideoLlama3Vision"),
@ -858,6 +900,7 @@ MODEL_NAMES_MAPPING = OrderedDict[str, str](
("vision-text-dual-encoder", "VisionTextDualEncoder"),
("visual_bert", "VisualBERT"),
("vit", "ViT"),
("vit_hybrid", "ViT Hybrid"),
("vit_mae", "ViTMAE"),
("vit_msn", "ViTMSN"),
("vitdet", "VitDet"),
@ -879,6 +922,7 @@ MODEL_NAMES_MAPPING = OrderedDict[str, str](
("xcodec", "X-CODEC"),
("xglm", "XGLM"),
("xlm", "XLM"),
("xlm-prophetnet", "XLM-ProphetNet"),
("xlm-roberta", "XLM-RoBERTa"),
("xlm-roberta-xl", "XLM-RoBERTa-XL"),
("xlm-v", "XLM-V"),
@ -897,7 +941,32 @@ MODEL_NAMES_MAPPING = OrderedDict[str, str](
# This is tied to the processing `-` -> `_` in `model_type_to_module_name`. For example, instead of putting
# `transfo-xl` (as in `CONFIG_MAPPING_NAMES`), we should use `transfo_xl`.
DEPRECATED_MODELS = []
DEPRECATED_MODELS = [
"bort",
"deta",
"efficientformer",
"ernie_m",
"gptsan_japanese",
"graphormer",
"jukebox",
"mctct",
"mega",
"mmbt",
"nat",
"nezha",
"open_llama",
"qdqbert",
"realm",
"retribert",
"speech_to_text_2",
"tapex",
"trajectory_transformer",
"transfo_xl",
"tvlt",
"van",
"vit_hybrid",
"xlm_prophetnet",
]
SPECIAL_MODEL_TYPE_TO_MODULE_NAME = OrderedDict[str, str](
[

View File

@ -50,6 +50,7 @@ FEATURE_EXTRACTOR_MAPPING_NAMES = OrderedDict(
("hubert", "Wav2Vec2FeatureExtractor"),
("kyutai_speech_to_text", "KyutaiSpeechToTextFeatureExtractor"),
("markuplm", "MarkupLMFeatureExtractor"),
("mctct", "MCTCTFeatureExtractor"),
("mimi", "EncodecFeatureExtractor"),
("moonshine", "Wav2Vec2FeatureExtractor"),
("moshi", "EncodecFeatureExtractor"),

View File

@ -89,6 +89,7 @@ else:
("deit", ("DeiTImageProcessor", "DeiTImageProcessorFast")),
("depth_anything", ("DPTImageProcessor", "DPTImageProcessorFast")),
("depth_pro", ("DepthProImageProcessor", "DepthProImageProcessorFast")),
("deta", ("DetaImageProcessor", None)),
("detr", ("DetrImageProcessor", "DetrImageProcessorFast")),
("dinat", ("ViTImageProcessor", "ViTImageProcessorFast")),
("dinov2", ("BitImageProcessor", "BitImageProcessorFast")),
@ -96,6 +97,7 @@ else:
("donut-swin", ("DonutImageProcessor", "DonutImageProcessorFast")),
("dpt", ("DPTImageProcessor", "DPTImageProcessorFast")),
("edgetam", (None, "Sam2ImageProcessorFast")),
("efficientformer", ("EfficientFormerImageProcessor", None)),
("efficientloftr", ("EfficientLoFTRImageProcessor", "EfficientLoFTRImageProcessorFast")),
("efficientnet", ("EfficientNetImageProcessor", "EfficientNetImageProcessorFast")),
("emu3", ("Emu3ImageProcessor", None)),
@ -147,6 +149,7 @@ else:
("mobilenet_v2", ("MobileNetV2ImageProcessor", "MobileNetV2ImageProcessorFast")),
("mobilevit", ("MobileViTImageProcessor", "MobileViTImageProcessorFast")),
("mobilevitv2", ("MobileViTImageProcessor", "MobileViTImageProcessorFast")),
("nat", ("ViTImageProcessor", "ViTImageProcessorFast")),
("nougat", ("NougatImageProcessor", "NougatImageProcessorFast")),
("omdet-turbo", ("DetrImageProcessor", "DetrImageProcessorFast")),
("oneformer", ("OneFormerImageProcessor", "OneFormerImageProcessorFast")),
@ -192,15 +195,18 @@ else:
("timesformer", ("VideoMAEImageProcessor", None)),
("timm_wrapper", ("TimmWrapperImageProcessor", None)),
("trocr", ("ViTImageProcessor", "ViTImageProcessorFast")),
("tvlt", ("TvltImageProcessor", None)),
("tvp", ("TvpImageProcessor", "TvpImageProcessorFast")),
("udop", ("LayoutLMv3ImageProcessor", "LayoutLMv3ImageProcessorFast")),
("upernet", ("SegformerImageProcessor", "SegformerImageProcessorFast")),
("van", ("ConvNextImageProcessor", "ConvNextImageProcessorFast")),
("video_llama_3", ("VideoLlama3ImageProcessor", "VideoLlama3ImageProcessorFast")),
("video_llava", ("VideoLlavaImageProcessor", None)),
("videomae", ("VideoMAEImageProcessor", None)),
("vilt", ("ViltImageProcessor", "ViltImageProcessorFast")),
("vipllava", ("CLIPImageProcessor", "CLIPImageProcessorFast")),
("vit", ("ViTImageProcessor", "ViTImageProcessorFast")),
("vit_hybrid", ("ViTHybridImageProcessor", None)),
("vit_mae", ("ViTImageProcessor", "ViTImageProcessorFast")),
("vit_msn", ("ViTImageProcessor", "ViTImageProcessorFast")),
("vitmatte", ("VitMatteImageProcessor", "VitMatteImageProcessorFast")),

View File

@ -119,6 +119,7 @@ MODEL_MAPPING_NAMES = OrderedDict(
("deformable_detr", "DeformableDetrModel"),
("deit", "DeiTModel"),
("depth_pro", "DepthProModel"),
("deta", "DetaModel"),
("detr", "DetrModel"),
("dia", "DiaModel"),
("diffllama", "DiffLlamaModel"),
@ -136,6 +137,7 @@ MODEL_MAPPING_NAMES = OrderedDict(
("edgetam", "EdgeTamModel"),
("edgetam_video", "EdgeTamVideoModel"),
("edgetam_vision_model", "EdgeTamVisionModel"),
("efficientformer", "EfficientFormerModel"),
("efficientloftr", "EfficientLoFTRModel"),
("efficientnet", "EfficientNetModel"),
("electra", "ElectraModel"),
@ -144,6 +146,7 @@ MODEL_MAPPING_NAMES = OrderedDict(
("ernie", "ErnieModel"),
("ernie4_5", "Ernie4_5Model"),
("ernie4_5_moe", "Ernie4_5_MoeModel"),
("ernie_m", "ErnieMModel"),
("esm", "EsmModel"),
("evolla", "EvollaModel"),
("exaone4", "Exaone4Model"),
@ -190,10 +193,12 @@ MODEL_MAPPING_NAMES = OrderedDict(
("gpt_neox_japanese", "GPTNeoXJapaneseModel"),
("gpt_oss", "GptOssModel"),
("gptj", "GPTJModel"),
("gptsan-japanese", "GPTSanJapaneseForConditionalGeneration"),
("granite", "GraniteModel"),
("granitemoe", "GraniteMoeModel"),
("granitemoehybrid", "GraniteMoeHybridModel"),
("granitemoeshared", "GraniteMoeSharedModel"),
("graphormer", "GraphormerModel"),
("grounding-dino", "GroundingDinoModel"),
("groupvit", "GroupViTModel"),
("helium", "HeliumModel"),
@ -217,6 +222,7 @@ MODEL_MAPPING_NAMES = OrderedDict(
("jamba", "JambaModel"),
("janus", "JanusModel"),
("jetmoe", "JetMoeModel"),
("jukebox", "JukeboxModel"),
("kosmos-2", "Kosmos2Model"),
("kosmos-2.5", "Kosmos2_5Model"),
("kyutai_speech_to_text", "KyutaiSpeechToTextModel"),
@ -251,6 +257,8 @@ MODEL_MAPPING_NAMES = OrderedDict(
("maskformer", "MaskFormerModel"),
("maskformer-swin", "MaskFormerSwinModel"),
("mbart", "MBartModel"),
("mctct", "MCTCTModel"),
("mega", "MegaModel"),
("megatron-bert", "MegatronBertModel"),
("metaclip_2", "MetaClip2Model"),
("mgp-str", "MgpstrForSceneTextRecognition"),
@ -279,7 +287,9 @@ MODEL_MAPPING_NAMES = OrderedDict(
("musicgen", "MusicgenModel"),
("musicgen_melody", "MusicgenMelodyModel"),
("mvp", "MvpModel"),
("nat", "NatModel"),
("nemotron", "NemotronModel"),
("nezha", "NezhaModel"),
("nllb-moe", "NllbMoeModel"),
("nystromformer", "NystromformerModel"),
("olmo", "OlmoModel"),
@ -288,6 +298,7 @@ MODEL_MAPPING_NAMES = OrderedDict(
("olmoe", "OlmoeModel"),
("omdet-turbo", "OmDetTurboForObjectDetection"),
("oneformer", "OneFormerModel"),
("open-llama", "OpenLlamaModel"),
("openai-gpt", "OpenAIGPTModel"),
("opt", "OPTModel"),
("ovis2", "Ovis2Model"),
@ -313,6 +324,7 @@ MODEL_MAPPING_NAMES = OrderedDict(
("prophetnet", "ProphetNetModel"),
("pvt", "PvtModel"),
("pvt_v2", "PvtV2Model"),
("qdqbert", "QDQBertModel"),
("qwen2", "Qwen2Model"),
("qwen2_5_vl", "Qwen2_5_VLModel"),
("qwen2_5_vl_text", "Qwen2_5_VLTextModel"),
@ -332,6 +344,7 @@ MODEL_MAPPING_NAMES = OrderedDict(
("regnet", "RegNetModel"),
("rembert", "RemBertModel"),
("resnet", "ResNetModel"),
("retribert", "RetriBertModel"),
("roberta", "RobertaModel"),
("roberta-prelayernorm", "RobertaPreLayerNormModel"),
("roc_bert", "RoCBertModel"),
@ -382,12 +395,16 @@ MODEL_MAPPING_NAMES = OrderedDict(
("timesformer", "TimesformerModel"),
("timm_backbone", "TimmBackbone"),
("timm_wrapper", "TimmWrapperModel"),
("trajectory_transformer", "TrajectoryTransformerModel"),
("transfo-xl", "TransfoXLModel"),
("tvlt", "TvltModel"),
("tvp", "TvpModel"),
("udop", "UdopModel"),
("umt5", "UMT5Model"),
("unispeech", "UniSpeechModel"),
("unispeech-sat", "UniSpeechSatModel"),
("univnet", "UnivNetModel"),
("van", "VanModel"),
("vaultgemma", "VaultGemmaModel"),
("video_llama_3", "VideoLlama3Model"),
("video_llama_3_vision", "VideoLlama3VisionModel"),
@ -398,6 +415,7 @@ MODEL_MAPPING_NAMES = OrderedDict(
("vision-text-dual-encoder", "VisionTextDualEncoderModel"),
("visual_bert", "VisualBertModel"),
("vit", "ViTModel"),
("vit_hybrid", "ViTHybridModel"),
("vit_mae", "ViTMAEModel"),
("vit_msn", "ViTMSNModel"),
("vitdet", "VitDetModel"),
@ -415,6 +433,7 @@ MODEL_MAPPING_NAMES = OrderedDict(
("xcodec", "XcodecModel"),
("xglm", "XGLMModel"),
("xlm", "XLMModel"),
("xlm-prophetnet", "XLMProphetNetModel"),
("xlm-roberta", "XLMRobertaModel"),
("xlm-roberta-xl", "XLMRobertaXLModel"),
("xlnet", "XLNetModel"),
@ -459,6 +478,7 @@ MODEL_FOR_PRETRAINING_MAPPING_NAMES = OrderedDict(
("gpt-sw3", "GPT2LMHeadModel"),
("gpt2", "GPT2LMHeadModel"),
("gpt_bigcode", "GPTBigCodeForCausalLM"),
("gptsan-japanese", "GPTSanJapaneseForConditionalGeneration"),
("hiera", "HieraForPreTraining"),
("ibert", "IBertForMaskedLM"),
("idefics", "IdeficsForVisionText2Text"),
@ -475,6 +495,7 @@ MODEL_FOR_PRETRAINING_MAPPING_NAMES = OrderedDict(
("lxmert", "LxmertForPreTraining"),
("mamba", "MambaForCausalLM"),
("mamba2", "Mamba2ForCausalLM"),
("mega", "MegaForMaskedLM"),
("megatron-bert", "MegatronBertForPreTraining"),
("mistral3", "Mistral3ForConditionalGeneration"),
("mllama", "MllamaForConditionalGeneration"),
@ -483,10 +504,12 @@ MODEL_FOR_PRETRAINING_MAPPING_NAMES = OrderedDict(
("mpt", "MptForCausalLM"),
("mra", "MraForMaskedLM"),
("mvp", "MvpForConditionalGeneration"),
("nezha", "NezhaForPreTraining"),
("nllb-moe", "NllbMoeForConditionalGeneration"),
("openai-gpt", "OpenAIGPTLMHeadModel"),
("paligemma", "PaliGemmaForConditionalGeneration"),
("qwen2_audio", "Qwen2AudioForConditionalGeneration"),
("retribert", "RetriBertModel"),
("roberta", "RobertaForMaskedLM"),
("roberta-prelayernorm", "RobertaPreLayerNormForMaskedLM"),
("roc_bert", "RoCBertForPreTraining"),
@ -497,6 +520,8 @@ MODEL_FOR_PRETRAINING_MAPPING_NAMES = OrderedDict(
("t5", "T5ForConditionalGeneration"),
("t5gemma", "T5GemmaForConditionalGeneration"),
("tapas", "TapasForMaskedLM"),
("transfo-xl", "TransfoXLLMHeadModel"),
("tvlt", "TvltForPreTraining"),
("unispeech", "UniSpeechForPreTraining"),
("unispeech-sat", "UniSpeechSatForPreTraining"),
("video_llava", "VideoLlavaForConditionalGeneration"),
@ -554,6 +579,7 @@ MODEL_WITH_LM_HEAD_MAPPING_NAMES = OrderedDict(
("gpt_neox", "GPTNeoXForCausalLM"),
("gpt_neox_japanese", "GPTNeoXJapaneseForCausalLM"),
("gptj", "GPTJForCausalLM"),
("gptsan-japanese", "GPTSanJapaneseForConditionalGeneration"),
("ibert", "IBertForMaskedLM"),
("layoutlm", "LayoutLMForMaskedLM"),
("led", "LEDForConditionalGeneration"),
@ -564,6 +590,7 @@ MODEL_WITH_LM_HEAD_MAPPING_NAMES = OrderedDict(
("mamba", "MambaForCausalLM"),
("mamba2", "Mamba2ForCausalLM"),
("marian", "MarianMTModel"),
("mega", "MegaForMaskedLM"),
("megatron-bert", "MegatronBertForCausalLM"),
("mobilebert", "MobileBertForMaskedLM"),
("moonshine", "MoonshineForConditionalGeneration"),
@ -571,12 +598,14 @@ MODEL_WITH_LM_HEAD_MAPPING_NAMES = OrderedDict(
("mpt", "MptForCausalLM"),
("mra", "MraForMaskedLM"),
("mvp", "MvpForConditionalGeneration"),
("nezha", "NezhaForMaskedLM"),
("nllb-moe", "NllbMoeForConditionalGeneration"),
("nystromformer", "NystromformerForMaskedLM"),
("openai-gpt", "OpenAIGPTLMHeadModel"),
("pegasus_x", "PegasusXForConditionalGeneration"),
("plbart", "PLBartForConditionalGeneration"),
("pop2piano", "Pop2PianoForConditionalGeneration"),
("qdqbert", "QDQBertForMaskedLM"),
("reformer", "ReformerModelWithLMHead"),
("rembert", "RemBertForMaskedLM"),
("roberta", "RobertaForMaskedLM"),
@ -590,6 +619,7 @@ MODEL_WITH_LM_HEAD_MAPPING_NAMES = OrderedDict(
("t5", "T5ForConditionalGeneration"),
("t5gemma", "T5GemmaForConditionalGeneration"),
("tapas", "TapasForMaskedLM"),
("transfo-xl", "TransfoXLLMHeadModel"),
("wav2vec2", "Wav2Vec2ForMaskedLM"),
("whisper", "WhisperForConditionalGeneration"),
("xlm", "XLMWithLMHeadModel"),
@ -683,6 +713,7 @@ MODEL_FOR_CAUSAL_LM_MAPPING_NAMES = OrderedDict(
("mamba2", "Mamba2ForCausalLM"),
("marian", "MarianForCausalLM"),
("mbart", "MBartForCausalLM"),
("mega", "MegaForCausalLM"),
("megatron-bert", "MegatronBertForCausalLM"),
("minimax", "MiniMaxForCausalLM"),
("ministral", "MinistralForCausalLM"),
@ -700,6 +731,7 @@ MODEL_FOR_CAUSAL_LM_MAPPING_NAMES = OrderedDict(
("olmo2", "Olmo2ForCausalLM"),
("olmo3", "Olmo3ForCausalLM"),
("olmoe", "OlmoeForCausalLM"),
("open-llama", "OpenLlamaForCausalLM"),
("openai-gpt", "OpenAIGPTLMHeadModel"),
("opt", "OPTForCausalLM"),
("pegasus", "PegasusForCausalLM"),
@ -710,6 +742,7 @@ MODEL_FOR_CAUSAL_LM_MAPPING_NAMES = OrderedDict(
("phimoe", "PhimoeForCausalLM"),
("plbart", "PLBartForCausalLM"),
("prophetnet", "ProphetNetForCausalLM"),
("qdqbert", "QDQBertLMHeadModel"),
("qwen2", "Qwen2ForCausalLM"),
("qwen2_moe", "Qwen2MoeForCausalLM"),
("qwen3", "Qwen3ForCausalLM"),
@ -725,13 +758,16 @@ MODEL_FOR_CAUSAL_LM_MAPPING_NAMES = OrderedDict(
("rwkv", "RwkvForCausalLM"),
("seed_oss", "SeedOssForCausalLM"),
("smollm3", "SmolLM3ForCausalLM"),
("speech_to_text_2", "Speech2Text2ForCausalLM"),
("stablelm", "StableLmForCausalLM"),
("starcoder2", "Starcoder2ForCausalLM"),
("transfo-xl", "TransfoXLLMHeadModel"),
("trocr", "TrOCRForCausalLM"),
("vaultgemma", "VaultGemmaForCausalLM"),
("whisper", "WhisperForCausalLM"),
("xglm", "XGLMForCausalLM"),
("xlm", "XLMWithLMHeadModel"),
("xlm-prophetnet", "XLMProphetNetForCausalLM"),
("xlm-roberta", "XLMRobertaForCausalLM"),
("xlm-roberta-xl", "XLMRobertaXLForCausalLM"),
("xlnet", "XLNetLMHeadModel"),
@ -757,6 +793,7 @@ MODEL_FOR_IMAGE_MAPPING_NAMES = OrderedDict(
("deformable_detr", "DeformableDetrModel"),
("deit", "DeiTModel"),
("depth_pro", "DepthProModel"),
("deta", "DetaModel"),
("detr", "DetrModel"),
("dinat", "DinatModel"),
("dinov2", "Dinov2Model"),
@ -764,6 +801,7 @@ MODEL_FOR_IMAGE_MAPPING_NAMES = OrderedDict(
("dinov3_convnext", "DINOv3ConvNextModel"),
("dinov3_vit", "DINOv3ViTModel"),
("dpt", "DPTModel"),
("efficientformer", "EfficientFormerModel"),
("efficientnet", "EfficientNetModel"),
("focalnet", "FocalNetModel"),
("glpn", "GLPNModel"),
@ -778,6 +816,7 @@ MODEL_FOR_IMAGE_MAPPING_NAMES = OrderedDict(
("mobilenet_v2", "MobileNetV2Model"),
("mobilevit", "MobileViTModel"),
("mobilevitv2", "MobileViTV2Model"),
("nat", "NatModel"),
("poolformer", "PoolFormerModel"),
("pvt", "PvtModel"),
("regnet", "RegNetModel"),
@ -792,8 +831,10 @@ MODEL_FOR_IMAGE_MAPPING_NAMES = OrderedDict(
("timesformer", "TimesformerModel"),
("timm_backbone", "TimmBackbone"),
("timm_wrapper", "TimmWrapperModel"),
("van", "VanModel"),
("videomae", "VideoMAEModel"),
("vit", "ViTModel"),
("vit_hybrid", "ViTHybridModel"),
("vit_mae", "ViTMAEModel"),
("vit_msn", "ViTMSNModel"),
("vitdet", "VitDetModel"),
@ -838,6 +879,13 @@ MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES = OrderedDict(
("dinov2", "Dinov2ForImageClassification"),
("dinov2_with_registers", "Dinov2WithRegistersForImageClassification"),
("donut-swin", "DonutSwinForImageClassification"),
(
"efficientformer",
(
"EfficientFormerForImageClassification",
"EfficientFormerForImageClassificationWithTeacher",
),
),
("efficientnet", "EfficientNetForImageClassification"),
("focalnet", "FocalNetForImageClassification"),
("hgnet_v2", "HGNetV2ForImageClassification"),
@ -853,6 +901,7 @@ MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES = OrderedDict(
("mobilenet_v2", "MobileNetV2ForImageClassification"),
("mobilevit", "MobileViTForImageClassification"),
("mobilevitv2", "MobileViTV2ForImageClassification"),
("nat", "NatForImageClassification"),
(
"perceiver",
(
@ -875,7 +924,9 @@ MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES = OrderedDict(
("swinv2", "Swinv2ForImageClassification"),
("textnet", "TextNetForImageClassification"),
("timm_wrapper", "TimmWrapperForImageClassification"),
("van", "VanForImageClassification"),
("vit", "ViTForImageClassification"),
("vit_hybrid", "ViTHybridForImageClassification"),
("vit_msn", "ViTMSNForImageClassification"),
]
)
@ -1046,14 +1097,17 @@ MODEL_FOR_MASKED_LM_MAPPING_NAMES = OrderedDict(
("longformer", "LongformerForMaskedLM"),
("luke", "LukeForMaskedLM"),
("mbart", "MBartForConditionalGeneration"),
("mega", "MegaForMaskedLM"),
("megatron-bert", "MegatronBertForMaskedLM"),
("mobilebert", "MobileBertForMaskedLM"),
("modernbert", "ModernBertForMaskedLM"),
("mpnet", "MPNetForMaskedLM"),
("mra", "MraForMaskedLM"),
("mvp", "MvpForConditionalGeneration"),
("nezha", "NezhaForMaskedLM"),
("nystromformer", "NystromformerForMaskedLM"),
("perceiver", "PerceiverForMaskedLM"),
("qdqbert", "QDQBertForMaskedLM"),
("reformer", "ReformerForMaskedLM"),
("rembert", "RemBertForMaskedLM"),
("roberta", "RobertaForMaskedLM"),
@ -1078,6 +1132,7 @@ MODEL_FOR_OBJECT_DETECTION_MAPPING_NAMES = OrderedDict(
("d_fine", "DFineForObjectDetection"),
("dab-detr", "DabDetrForObjectDetection"),
("deformable_detr", "DeformableDetrForObjectDetection"),
("deta", "DetaForObjectDetection"),
("detr", "DetrForObjectDetection"),
("rt_detr", "RTDetrForObjectDetection"),
("rt_detr_v2", "RTDetrV2ForObjectDetection"),
@ -1118,6 +1173,7 @@ MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES = OrderedDict(
("blenderbot-small", "BlenderbotSmallForConditionalGeneration"),
("encoder-decoder", "EncoderDecoderModel"),
("fsmt", "FSMTForConditionalGeneration"),
("gptsan-japanese", "GPTSanJapaneseForConditionalGeneration"),
("granite_speech", "GraniteSpeechForConditionalGeneration"),
("led", "LEDForConditionalGeneration"),
("longt5", "LongT5ForConditionalGeneration"),
@ -1139,6 +1195,7 @@ MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES = OrderedDict(
("t5gemma", "T5GemmaForConditionalGeneration"),
("umt5", "UMT5ForConditionalGeneration"),
("voxtral", "VoxtralForConditionalGeneration"),
("xlm-prophetnet", "XLMProphetNetForConditionalGeneration"),
]
)
@ -1184,6 +1241,7 @@ MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES = OrderedDict(
("doge", "DogeForSequenceClassification"),
("electra", "ElectraForSequenceClassification"),
("ernie", "ErnieForSequenceClassification"),
("ernie_m", "ErnieMForSequenceClassification"),
("esm", "EsmForSequenceClassification"),
("exaone4", "Exaone4ForSequenceClassification"),
("falcon", "FalconForSequenceClassification"),
@ -1219,6 +1277,7 @@ MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES = OrderedDict(
("luke", "LukeForSequenceClassification"),
("markuplm", "MarkupLMForSequenceClassification"),
("mbart", "MBartForSequenceClassification"),
("mega", "MegaForSequenceClassification"),
("megatron-bert", "MegatronBertForSequenceClassification"),
("minimax", "MiniMaxForSequenceClassification"),
("ministral", "MinistralForSequenceClassification"),
@ -1233,7 +1292,9 @@ MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES = OrderedDict(
("mt5", "MT5ForSequenceClassification"),
("mvp", "MvpForSequenceClassification"),
("nemotron", "NemotronForSequenceClassification"),
("nezha", "NezhaForSequenceClassification"),
("nystromformer", "NystromformerForSequenceClassification"),
("open-llama", "OpenLlamaForSequenceClassification"),
("openai-gpt", "OpenAIGPTForSequenceClassification"),
("opt", "OPTForSequenceClassification"),
("perceiver", "PerceiverForSequenceClassification"),
@ -1242,6 +1303,7 @@ MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES = OrderedDict(
("phi3", "Phi3ForSequenceClassification"),
("phimoe", "PhimoeForSequenceClassification"),
("plbart", "PLBartForSequenceClassification"),
("qdqbert", "QDQBertForSequenceClassification"),
("qwen2", "Qwen2ForSequenceClassification"),
("qwen2_moe", "Qwen2MoeForSequenceClassification"),
("qwen3", "Qwen3ForSequenceClassification"),
@ -1261,6 +1323,7 @@ MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES = OrderedDict(
("t5", "T5ForSequenceClassification"),
("t5gemma", "T5GemmaForSequenceClassification"),
("tapas", "TapasForSequenceClassification"),
("transfo-xl", "TransfoXLForSequenceClassification"),
("umt5", "UMT5ForSequenceClassification"),
("xlm", "XLMForSequenceClassification"),
("xlm-roberta", "XLMRobertaForSequenceClassification"),
@ -1293,6 +1356,7 @@ MODEL_FOR_QUESTION_ANSWERING_MAPPING_NAMES = OrderedDict(
("distilbert", "DistilBertForQuestionAnswering"),
("electra", "ElectraForQuestionAnswering"),
("ernie", "ErnieForQuestionAnswering"),
("ernie_m", "ErnieMForQuestionAnswering"),
("exaone4", "Exaone4ForQuestionAnswering"),
("falcon", "FalconForQuestionAnswering"),
("flaubert", "FlaubertForQuestionAnsweringSimple"),
@ -1313,6 +1377,7 @@ MODEL_FOR_QUESTION_ANSWERING_MAPPING_NAMES = OrderedDict(
("lxmert", "LxmertForQuestionAnswering"),
("markuplm", "MarkupLMForQuestionAnswering"),
("mbart", "MBartForQuestionAnswering"),
("mega", "MegaForQuestionAnswering"),
("megatron-bert", "MegatronBertForQuestionAnswering"),
("minimax", "MiniMaxForQuestionAnswering"),
("ministral", "MinistralForQuestionAnswering"),
@ -1326,8 +1391,10 @@ MODEL_FOR_QUESTION_ANSWERING_MAPPING_NAMES = OrderedDict(
("mt5", "MT5ForQuestionAnswering"),
("mvp", "MvpForQuestionAnswering"),
("nemotron", "NemotronForQuestionAnswering"),
("nezha", "NezhaForQuestionAnswering"),
("nystromformer", "NystromformerForQuestionAnswering"),
("opt", "OPTForQuestionAnswering"),
("qdqbert", "QDQBertForQuestionAnswering"),
("qwen2", "Qwen2ForQuestionAnswering"),
("qwen2_moe", "Qwen2MoeForQuestionAnswering"),
("qwen3", "Qwen3ForQuestionAnswering"),
@ -1399,6 +1466,7 @@ MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES = OrderedDict(
("distilbert", "DistilBertForTokenClassification"),
("electra", "ElectraForTokenClassification"),
("ernie", "ErnieForTokenClassification"),
("ernie_m", "ErnieMForTokenClassification"),
("esm", "EsmForTokenClassification"),
("exaone4", "Exaone4ForTokenClassification"),
("falcon", "FalconForTokenClassification"),
@ -1425,6 +1493,7 @@ MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES = OrderedDict(
("longformer", "LongformerForTokenClassification"),
("luke", "LukeForTokenClassification"),
("markuplm", "MarkupLMForTokenClassification"),
("mega", "MegaForTokenClassification"),
("megatron-bert", "MegatronBertForTokenClassification"),
("minimax", "MiniMaxForTokenClassification"),
("ministral", "MinistralForTokenClassification"),
@ -1437,10 +1506,12 @@ MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES = OrderedDict(
("mra", "MraForTokenClassification"),
("mt5", "MT5ForTokenClassification"),
("nemotron", "NemotronForTokenClassification"),
("nezha", "NezhaForTokenClassification"),
("nystromformer", "NystromformerForTokenClassification"),
("persimmon", "PersimmonForTokenClassification"),
("phi", "PhiForTokenClassification"),
("phi3", "Phi3ForTokenClassification"),
("qdqbert", "QDQBertForTokenClassification"),
("qwen2", "Qwen2ForTokenClassification"),
("qwen2_moe", "Qwen2MoeForTokenClassification"),
("qwen3", "Qwen3ForTokenClassification"),
@ -1482,18 +1553,22 @@ MODEL_FOR_MULTIPLE_CHOICE_MAPPING_NAMES = OrderedDict(
("distilbert", "DistilBertForMultipleChoice"),
("electra", "ElectraForMultipleChoice"),
("ernie", "ErnieForMultipleChoice"),
("ernie_m", "ErnieMForMultipleChoice"),
("flaubert", "FlaubertForMultipleChoice"),
("fnet", "FNetForMultipleChoice"),
("funnel", "FunnelForMultipleChoice"),
("ibert", "IBertForMultipleChoice"),
("longformer", "LongformerForMultipleChoice"),
("luke", "LukeForMultipleChoice"),
("mega", "MegaForMultipleChoice"),
("megatron-bert", "MegatronBertForMultipleChoice"),
("mobilebert", "MobileBertForMultipleChoice"),
("modernbert", "ModernBertForMultipleChoice"),
("mpnet", "MPNetForMultipleChoice"),
("mra", "MraForMultipleChoice"),
("nezha", "NezhaForMultipleChoice"),
("nystromformer", "NystromformerForMultipleChoice"),
("qdqbert", "QDQBertForMultipleChoice"),
("rembert", "RemBertForMultipleChoice"),
("roberta", "RobertaForMultipleChoice"),
("roberta-prelayernorm", "RobertaPreLayerNormForMultipleChoice"),
@ -1516,6 +1591,8 @@ MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING_NAMES = OrderedDict(
("fnet", "FNetForNextSentencePrediction"),
("megatron-bert", "MegatronBertForNextSentencePrediction"),
("mobilebert", "MobileBertForNextSentencePrediction"),
("nezha", "NezhaForNextSentencePrediction"),
("qdqbert", "QDQBertForNextSentencePrediction"),
]
)
@ -1542,6 +1619,7 @@ MODEL_FOR_CTC_MAPPING_NAMES = OrderedDict(
# Model for Connectionist temporal classification (CTC) mapping
("data2vec-audio", "Data2VecAudioForCTC"),
("hubert", "HubertForCTC"),
("mctct", "MCTCTForCTC"),
("parakeet_ctc", "ParakeetForCTC"),
("sew", "SEWForCTC"),
("sew-d", "SEWDForCTC"),
@ -1635,6 +1713,7 @@ MODEL_FOR_BACKBONE_MAPPING_NAMES = OrderedDict(
("hgnet_v2", "HGNetV2Backbone"),
("hiera", "HieraBackbone"),
("maskformer-swin", "MaskFormerSwinBackbone"),
("nat", "NatBackbone"),
("pvt_v2", "PvtV2Backbone"),
("resnet", "ResNetBackbone"),
("rt_detr_resnet", "RTDetrResNetBackbone"),

View File

@ -102,6 +102,7 @@ PROCESSOR_MAPPING_NAMES = OrderedDict(
("llava_next_video", "LlavaNextVideoProcessor"),
("llava_onevision", "LlavaOnevisionProcessor"),
("markuplm", "MarkupLMProcessor"),
("mctct", "MCTCTProcessor"),
("metaclip_2", "CLIPProcessor"),
("mgp-str", "MgpstrProcessor"),
("mistral3", "PixtralProcessor"),
@ -137,8 +138,10 @@ PROCESSOR_MAPPING_NAMES = OrderedDict(
("siglip2", "Siglip2Processor"),
("smolvlm", "SmolVLMProcessor"),
("speech_to_text", "Speech2TextProcessor"),
("speech_to_text_2", "Speech2Text2Processor"),
("speecht5", "SpeechT5Processor"),
("trocr", "TrOCRProcessor"),
("tvlt", "TvltProcessor"),
("tvp", "TvpProcessor"),
("udop", "UdopProcessor"),
("unispeech", "Wav2Vec2Processor"),

View File

@ -239,6 +239,7 @@ TOKENIZER_MAPPING_NAMES = OrderedDict[str, tuple[Optional[str], Optional[str]]](
("ernie", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)),
("ernie4_5", (None, "LlamaTokenizerFast" if is_tokenizers_available() else None)),
("ernie4_5_moe", (None, "LlamaTokenizerFast" if is_tokenizers_available() else None)),
("ernie_m", ("ErnieMTokenizer" if is_sentencepiece_available() else None, None)),
("esm", ("EsmTokenizer", None)),
("evolla", (None, "LlamaTokenizerFast" if is_tokenizers_available() else None)),
(
@ -320,6 +321,7 @@ TOKENIZER_MAPPING_NAMES = OrderedDict[str, tuple[Optional[str], Optional[str]]](
("gpt_neox_japanese", ("GPTNeoXJapaneseTokenizer", None)),
("gpt_oss", (None, "PreTrainedTokenizerFast" if is_tokenizers_available() else None)),
("gptj", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)),
("gptsan-japanese", ("GPTSanJapaneseTokenizer", None)),
("granite", ("GPT2Tokenizer", None)),
("granite_speech", ("GPT2Tokenizer", None)),
("granitemoe", ("GPT2Tokenizer", None)),
@ -352,6 +354,7 @@ TOKENIZER_MAPPING_NAMES = OrderedDict[str, tuple[Optional[str], Optional[str]]](
"LlamaTokenizerFast" if is_tokenizers_available() else None,
),
),
("jukebox", ("JukeboxTokenizer", None)),
(
"kosmos-2",
(
@ -423,6 +426,7 @@ TOKENIZER_MAPPING_NAMES = OrderedDict[str, tuple[Optional[str], Optional[str]]](
"MBart50TokenizerFast" if is_tokenizers_available() else None,
),
),
("mega", ("RobertaTokenizer", "RobertaTokenizerFast" if is_tokenizers_available() else None)),
("megatron-bert", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)),
(
"metaclip_2",
@ -497,6 +501,7 @@ TOKENIZER_MAPPING_NAMES = OrderedDict[str, tuple[Optional[str], Optional[str]]](
("mvp", ("MvpTokenizer", "MvpTokenizerFast" if is_tokenizers_available() else None)),
("myt5", ("MyT5Tokenizer", None)),
("nemotron", (None, "PreTrainedTokenizerFast" if is_tokenizers_available() else None)),
("nezha", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)),
(
"nllb",
(
@ -585,6 +590,7 @@ TOKENIZER_MAPPING_NAMES = OrderedDict[str, tuple[Optional[str], Optional[str]]](
("plbart", ("PLBartTokenizer" if is_sentencepiece_available() else None, None)),
("pop2piano", ("Pop2PianoTokenizer", None)),
("prophetnet", ("ProphetNetTokenizer", None)),
("qdqbert", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)),
(
"qwen2",
(
@ -628,6 +634,7 @@ TOKENIZER_MAPPING_NAMES = OrderedDict[str, tuple[Optional[str], Optional[str]]](
("qwen3_vl", ("Qwen2Tokenizer", "Qwen2TokenizerFast" if is_tokenizers_available() else None)),
("qwen3_vl_moe", ("Qwen2Tokenizer", "Qwen2TokenizerFast" if is_tokenizers_available() else None)),
("rag", ("RagTokenizer", None)),
("realm", ("RealmTokenizer", "RealmTokenizerFast" if is_tokenizers_available() else None)),
(
"recurrent_gemma",
(
@ -649,6 +656,7 @@ TOKENIZER_MAPPING_NAMES = OrderedDict[str, tuple[Optional[str], Optional[str]]](
"RemBertTokenizerFast" if is_tokenizers_available() else None,
),
),
("retribert", ("RetriBertTokenizer", "RetriBertTokenizerFast" if is_tokenizers_available() else None)),
("roberta", ("RobertaTokenizer", "RobertaTokenizerFast" if is_tokenizers_available() else None)),
(
"roberta-prelayernorm",
@ -689,6 +697,7 @@ TOKENIZER_MAPPING_NAMES = OrderedDict[str, tuple[Optional[str], Optional[str]]](
("smollm3", (None, "PreTrainedTokenizerFast" if is_tokenizers_available() else None)),
("smolvlm", (None, "PreTrainedTokenizerFast" if is_tokenizers_available() else None)),
("speech_to_text", ("Speech2TextTokenizer" if is_sentencepiece_available() else None, None)),
("speech_to_text_2", ("Speech2Text2Tokenizer", None)),
("speecht5", ("SpeechT5Tokenizer" if is_sentencepiece_available() else None, None)),
("splinter", ("SplinterTokenizer", "SplinterTokenizerFast")),
(
@ -719,6 +728,8 @@ TOKENIZER_MAPPING_NAMES = OrderedDict[str, tuple[Optional[str], Optional[str]]](
),
),
("tapas", ("TapasTokenizer", None)),
("tapex", ("TapexTokenizer", None)),
("transfo-xl", ("TransfoXLTokenizer", None)),
("trocr", ("XLMRobertaTokenizer", "XLMRobertaTokenizerFast" if is_tokenizers_available() else None)),
("tvp", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)),
(
@ -769,6 +780,7 @@ TOKENIZER_MAPPING_NAMES = OrderedDict[str, tuple[Optional[str], Optional[str]]](
),
),
("xlm", ("XLMTokenizer", None)),
("xlm-prophetnet", ("XLMProphetNetTokenizer" if is_sentencepiece_available() else None, None)),
(
"xlm-roberta",
(

View File

@ -90,7 +90,6 @@ class AyaVisionMultiModalProjector(nn.Module):
@auto_docstring
class AyaVisionPreTrainedModel(PreTrainedModel):
config: AyaVisionConfig
base_model_prefix = "model"
input_modalities = ["image", "text"]
supports_gradient_checkpointing = True
_skip_keys_device_placement = "past_key_values"
@ -163,10 +162,6 @@ class AyaVisionModelOutputWithPast(BaseModelOutputWithPast):
"""
)
class AyaVisionModel(AyaVisionPreTrainedModel):
_checkpoint_conversion_mapping = {
r"^language_model.model": "language_model",
}
def __init__(self, config: AyaVisionConfig):
super().__init__(config)
self.vision_tower = AutoModel.from_config(config.vision_config)

View File

@ -166,10 +166,11 @@ class BartConfig(PreTrainedConfig):
)
self.tie_encoder_decoder = True
# ensure backward compatibility for BART CNN models
if kwargs.get("force_bos_token_to_be_generated", False):
if self.forced_bos_token_id is None and kwargs.get("force_bos_token_to_be_generated", False):
self.forced_bos_token_id = self.bos_token_id
warnings.warn(
f"Please make sure the generation config includes `forced_bos_token_id={self.bos_token_id}`. "
f"Please make sure the config includes `forced_bos_token_id={self.bos_token_id}` in future versions. "
"The config can simply be saved and uploaded again to be fixed."
)

View File

@ -22,6 +22,7 @@ if TYPE_CHECKING:
from .modeling_bert import *
from .tokenization_bert import *
from .tokenization_bert_fast import *
from .tokenization_bert_tf import *
else:
import sys

View File

@ -430,7 +430,6 @@ class BltCrossAttention(nn.Module):
@auto_docstring
class BltPreTrainedModel(PreTrainedModel):
config: BltConfig
base_model_prefix = "model"
input_modalities = ["image", "text"]
supports_gradient_checkpointing = True
_no_split_modules = ["BltTransformerLayer"]

View File

@ -778,7 +778,7 @@ class ClvpConditioningEncoder(nn.Module):
@auto_docstring
class ClvpPreTrainedModel(PreTrainedModel):
config: ClvpConfig
base_model_prefix = "model"
base_model_prefix = "clvp"
supports_gradient_checkpointing = True
_skip_keys_device_placement = "past_key_values"

View File

@ -129,7 +129,6 @@ class Cohere2VisionCausalLMOutputWithPast(ModelOutput):
@auto_docstring
class Cohere2VisionPreTrainedModel(PreTrainedModel):
config: Cohere2VisionConfig
base_model_prefix = "model"
input_modalities = ["image", "text"]
supports_gradient_checkpointing = True
_skip_keys_device_placement = "past_key_values"
@ -143,6 +142,7 @@ class Cohere2VisionPreTrainedModel(PreTrainedModel):
"hidden_states": "DecoderLayer",
"attentions": "Attention",
}
base_model_prefix = "model"
@auto_docstring(

View File

@ -18,9 +18,30 @@ from ...utils.import_utils import define_import_structure
if TYPE_CHECKING:
pass
# Add models to deprecate like:
# from .XXX import *
from .bort import *
from .deta import *
from .efficientformer import *
from .ernie_m import *
from .gptsan_japanese import *
from .graphormer import *
from .jukebox import *
from .mctct import *
from .mega import *
from .mmbt import *
from .nat import *
from .nezha import *
from .open_llama import *
from .qdqbert import *
from .realm import *
from .retribert import *
from .speech_to_text_2 import *
from .tapex import *
from .trajectory_transformer import *
from .transfo_xl import *
from .tvlt import *
from .van import *
from .vit_hybrid import *
from .xlm_prophetnet import *
else:
import sys

View File

@ -0,0 +1,318 @@
# coding=utf-8
# Copyright 2020, The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Convert Bort checkpoint."""
import argparse
import os
import gluonnlp as nlp
import mxnet as mx
import numpy as np
import torch
from gluonnlp.base import get_home_dir
from gluonnlp.model.bert import BERTEncoder
from gluonnlp.model.utils import _load_vocab
from gluonnlp.vocab import Vocab
from packaging import version
from torch import nn
from transformers import BertConfig, BertForMaskedLM, BertModel, RobertaTokenizer
from transformers.models.bert.modeling_bert import (
BertIntermediate,
BertLayer,
BertOutput,
BertSelfAttention,
BertSelfOutput,
)
from transformers.utils import logging
if version.parse(nlp.__version__) != version.parse("0.8.3"):
raise Exception("requires gluonnlp == 0.8.3")
if version.parse(mx.__version__) != version.parse("1.5.0"):
raise Exception("requires mxnet == 1.5.0")
logging.set_verbosity_info()
logger = logging.get_logger(__name__)
SAMPLE_TEXT = "The Nymphenburg Palace is a beautiful palace in Munich!"
def convert_bort_checkpoint_to_pytorch(bort_checkpoint_path: str, pytorch_dump_folder_path: str):
"""
Convert the original Bort checkpoint (based on MXNET and Gluonnlp) to our BERT structure-
"""
# Original Bort configuration
bort_4_8_768_1024_hparams = {
"attention_cell": "multi_head",
"num_layers": 4,
"units": 1024,
"hidden_size": 768,
"max_length": 512,
"num_heads": 8,
"scaled": True,
"dropout": 0.1,
"use_residual": True,
"embed_size": 1024,
"embed_dropout": 0.1,
"word_embed": None,
"layer_norm_eps": 1e-5,
"token_type_vocab_size": 2,
}
predefined_args = bort_4_8_768_1024_hparams
# Let's construct the original Bort model here
# Taken from official BERT implementation, see:
# https://github.com/alexa/bort/blob/master/bort/bort.py
encoder = BERTEncoder(
attention_cell=predefined_args["attention_cell"],
num_layers=predefined_args["num_layers"],
units=predefined_args["units"],
hidden_size=predefined_args["hidden_size"],
max_length=predefined_args["max_length"],
num_heads=predefined_args["num_heads"],
scaled=predefined_args["scaled"],
dropout=predefined_args["dropout"],
output_attention=False,
output_all_encodings=False,
use_residual=predefined_args["use_residual"],
activation=predefined_args.get("activation", "gelu"),
layer_norm_eps=predefined_args.get("layer_norm_eps", None),
)
# Vocab information needs to be fetched first
# It's the same as RoBERTa, so RobertaTokenizer can be used later
vocab_name = "openwebtext_ccnews_stories_books_cased"
# Specify download folder to Gluonnlp's vocab
gluon_cache_dir = os.path.join(get_home_dir(), "models")
bort_vocab = _load_vocab(vocab_name, None, gluon_cache_dir, cls=Vocab)
original_bort = nlp.model.BERTModel(
encoder,
len(bort_vocab),
units=predefined_args["units"],
embed_size=predefined_args["embed_size"],
embed_dropout=predefined_args["embed_dropout"],
word_embed=predefined_args["word_embed"],
use_pooler=False,
use_token_type_embed=False,
token_type_vocab_size=predefined_args["token_type_vocab_size"],
use_classifier=False,
use_decoder=False,
)
original_bort.load_parameters(bort_checkpoint_path, cast_dtype=True, ignore_extra=True)
params = original_bort._collect_params_with_prefix()
# Build our config 🤗
hf_bort_config_json = {
"architectures": ["BertForMaskedLM"],
"attention_probs_dropout_prob": predefined_args["dropout"],
"hidden_act": "gelu",
"hidden_dropout_prob": predefined_args["dropout"],
"hidden_size": predefined_args["embed_size"],
"initializer_range": 0.02,
"intermediate_size": predefined_args["hidden_size"],
"layer_norm_eps": predefined_args["layer_norm_eps"],
"max_position_embeddings": predefined_args["max_length"],
"model_type": "bort",
"num_attention_heads": predefined_args["num_heads"],
"num_hidden_layers": predefined_args["num_layers"],
"pad_token_id": 1, # 2 = BERT, 1 = RoBERTa
"type_vocab_size": 1, # 2 = BERT, 1 = RoBERTa
"vocab_size": len(bort_vocab),
}
hf_bort_config = BertConfig.from_dict(hf_bort_config_json)
hf_bort_model = BertForMaskedLM(hf_bort_config)
hf_bort_model.eval()
# Parameter mapping table (Gluonnlp to Transformers)
# * denotes layer index
#
# | Gluon Parameter | Transformers Parameter
# | -------------------------------------------------------------- | ----------------------
# | `encoder.layer_norm.beta` | `bert.embeddings.LayerNorm.bias`
# | `encoder.layer_norm.gamma` | `bert.embeddings.LayerNorm.weight`
# | `encoder.position_weight` | `bert.embeddings.position_embeddings.weight`
# | `word_embed.0.weight` | `bert.embeddings.word_embeddings.weight`
# | `encoder.transformer_cells.*.attention_cell.proj_key.bias` | `bert.encoder.layer.*.attention.self.key.bias`
# | `encoder.transformer_cells.*.attention_cell.proj_key.weight` | `bert.encoder.layer.*.attention.self.key.weight`
# | `encoder.transformer_cells.*.attention_cell.proj_query.bias` | `bert.encoder.layer.*.attention.self.query.bias`
# | `encoder.transformer_cells.*.attention_cell.proj_query.weight` | `bert.encoder.layer.*.attention.self.query.weight`
# | `encoder.transformer_cells.*.attention_cell.proj_value.bias` | `bert.encoder.layer.*.attention.self.value.bias`
# | `encoder.transformer_cells.*.attention_cell.proj_value.weight` | `bert.encoder.layer.*.attention.self.value.weight`
# | `encoder.transformer_cells.*.ffn.ffn_2.bias` | `bert.encoder.layer.*.attention.output.dense.bias`
# | `encoder.transformer_cells.*.ffn.ffn_2.weight` | `bert.encoder.layer.*.attention.output.dense.weight`
# | `encoder.transformer_cells.*.layer_norm.beta` | `bert.encoder.layer.*.attention.output.LayerNorm.bias`
# | `encoder.transformer_cells.*.layer_norm.gamma` | `bert.encoder.layer.*.attention.output.LayerNorm.weight`
# | `encoder.transformer_cells.*.ffn.ffn_1.bias` | `bert.encoder.layer.*.intermediate.dense.bias`
# | `encoder.transformer_cells.*.ffn.ffn_1.weight` | `bert.encoder.layer.*.intermediate.dense.weight`
# | `encoder.transformer_cells.*.ffn.layer_norm.beta` | `bert.encoder.layer.*.output.LayerNorm.bias`
# | `encoder.transformer_cells.*.ffn.layer_norm.gamma` | `bert.encoder.layer.*.output.LayerNorm.weight`
# | `encoder.transformer_cells.*.proj.bias` | `bert.encoder.layer.*.output.dense.bias`
# | `encoder.transformer_cells.*.proj.weight` | `bert.encoder.layer.*.output.dense.weight`
# Helper function to convert MXNET Arrays to PyTorch
def to_torch(mx_array) -> nn.Parameter:
return nn.Parameter(torch.FloatTensor(mx_array.data().asnumpy()))
# Check param shapes and map new HF param back
def check_and_map_params(hf_param, gluon_param):
shape_hf = hf_param.shape
gluon_param = to_torch(params[gluon_param])
shape_gluon = gluon_param.shape
assert shape_hf == shape_gluon, (
f"The gluon parameter {gluon_param} has shape {shape_gluon}, but expects shape {shape_hf} for Transformers"
)
return gluon_param
hf_bort_model.bert.embeddings.word_embeddings.weight = check_and_map_params(
hf_bort_model.bert.embeddings.word_embeddings.weight, "word_embed.0.weight"
)
hf_bort_model.bert.embeddings.position_embeddings.weight = check_and_map_params(
hf_bort_model.bert.embeddings.position_embeddings.weight, "encoder.position_weight"
)
hf_bort_model.bert.embeddings.LayerNorm.bias = check_and_map_params(
hf_bort_model.bert.embeddings.LayerNorm.bias, "encoder.layer_norm.beta"
)
hf_bort_model.bert.embeddings.LayerNorm.weight = check_and_map_params(
hf_bort_model.bert.embeddings.LayerNorm.weight, "encoder.layer_norm.gamma"
)
# Inspired by RoBERTa conversion script, we just zero them out (Bort does not use them)
hf_bort_model.bert.embeddings.token_type_embeddings.weight.data = torch.zeros_like(
hf_bort_model.bert.embeddings.token_type_embeddings.weight.data
)
for i in range(hf_bort_config.num_hidden_layers):
layer: BertLayer = hf_bort_model.bert.encoder.layer[i]
# self attention
self_attn: BertSelfAttention = layer.attention.self
self_attn.key.bias.data = check_and_map_params(
self_attn.key.bias.data, f"encoder.transformer_cells.{i}.attention_cell.proj_key.bias"
)
self_attn.key.weight.data = check_and_map_params(
self_attn.key.weight.data, f"encoder.transformer_cells.{i}.attention_cell.proj_key.weight"
)
self_attn.query.bias.data = check_and_map_params(
self_attn.query.bias.data, f"encoder.transformer_cells.{i}.attention_cell.proj_query.bias"
)
self_attn.query.weight.data = check_and_map_params(
self_attn.query.weight.data, f"encoder.transformer_cells.{i}.attention_cell.proj_query.weight"
)
self_attn.value.bias.data = check_and_map_params(
self_attn.value.bias.data, f"encoder.transformer_cells.{i}.attention_cell.proj_value.bias"
)
self_attn.value.weight.data = check_and_map_params(
self_attn.value.weight.data, f"encoder.transformer_cells.{i}.attention_cell.proj_value.weight"
)
# self attention output
self_output: BertSelfOutput = layer.attention.output
self_output.dense.bias = check_and_map_params(
self_output.dense.bias, f"encoder.transformer_cells.{i}.proj.bias"
)
self_output.dense.weight = check_and_map_params(
self_output.dense.weight, f"encoder.transformer_cells.{i}.proj.weight"
)
self_output.LayerNorm.bias = check_and_map_params(
self_output.LayerNorm.bias, f"encoder.transformer_cells.{i}.layer_norm.beta"
)
self_output.LayerNorm.weight = check_and_map_params(
self_output.LayerNorm.weight, f"encoder.transformer_cells.{i}.layer_norm.gamma"
)
# intermediate
intermediate: BertIntermediate = layer.intermediate
intermediate.dense.bias = check_and_map_params(
intermediate.dense.bias, f"encoder.transformer_cells.{i}.ffn.ffn_1.bias"
)
intermediate.dense.weight = check_and_map_params(
intermediate.dense.weight, f"encoder.transformer_cells.{i}.ffn.ffn_1.weight"
)
# output
bert_output: BertOutput = layer.output
bert_output.dense.bias = check_and_map_params(
bert_output.dense.bias, f"encoder.transformer_cells.{i}.ffn.ffn_2.bias"
)
bert_output.dense.weight = check_and_map_params(
bert_output.dense.weight, f"encoder.transformer_cells.{i}.ffn.ffn_2.weight"
)
bert_output.LayerNorm.bias = check_and_map_params(
bert_output.LayerNorm.bias, f"encoder.transformer_cells.{i}.ffn.layer_norm.beta"
)
bert_output.LayerNorm.weight = check_and_map_params(
bert_output.LayerNorm.weight, f"encoder.transformer_cells.{i}.ffn.layer_norm.gamma"
)
# Save space and energy 🎄
hf_bort_model.half()
# Compare output of both models
tokenizer = RobertaTokenizer.from_pretrained("FacebookAI/roberta-base")
input_ids = tokenizer.encode_plus(SAMPLE_TEXT)["input_ids"]
# Get gluon output
gluon_input_ids = mx.nd.array([input_ids])
output_gluon = original_bort(inputs=gluon_input_ids, token_types=[])
# Get Transformer output (save and reload model again)
hf_bort_model.save_pretrained(pytorch_dump_folder_path)
hf_bort_model = BertModel.from_pretrained(pytorch_dump_folder_path)
hf_bort_model.eval()
input_ids = tokenizer.encode_plus(SAMPLE_TEXT, return_tensors="pt")
output_hf = hf_bort_model(**input_ids)[0]
gluon_layer = output_gluon[0].asnumpy()
hf_layer = output_hf[0].detach().numpy()
max_absolute_diff = np.max(np.abs(hf_layer - gluon_layer)).item()
success = np.allclose(gluon_layer, hf_layer, atol=1e-3)
if success:
print("[SUCCESS] Both models do output the same tensors")
else:
print("[FAIL] Both models do **NOT** output the same tensors")
print("Absolute difference is:", max_absolute_diff)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
# Required parameters
parser.add_argument(
"--bort_checkpoint_path", default=None, type=str, required=True, help="Path the official Bort params file."
)
parser.add_argument(
"--pytorch_dump_folder_path", default=None, type=str, required=True, help="Path to the output PyTorch model."
)
args = parser.parse_args()
convert_bort_checkpoint_to_pytorch(args.bort_checkpoint_path, args.pytorch_dump_folder_path)

View File

@ -0,0 +1,28 @@
# Copyright 2022 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import TYPE_CHECKING
from ....utils import _LazyModule
from ....utils.import_utils import define_import_structure
if TYPE_CHECKING:
from .configuration_deta import *
from .image_processing_deta import *
from .modeling_deta import *
else:
import sys
_file = globals()["__file__"]
sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)

View File

@ -0,0 +1,278 @@
# coding=utf-8
# Copyright 2022 SenseTime and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""DETA model configuration"""
from ....configuration_utils import PreTrainedConfig
from ....utils import logging
from ...auto import CONFIG_MAPPING
logger = logging.get_logger(__name__)
class DetaConfig(PreTrainedConfig):
r"""
This is the configuration class to store the configuration of a [`DetaModel`]. It is used to instantiate a DETA
model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
defaults will yield a similar configuration to that of the DETA
[SenseTime/deformable-detr](https://huggingface.co/SenseTime/deformable-detr) architecture.
Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PreTrainedConfig`] for more information.
Args:
backbone_config (`PreTrainedConfig` or `dict`, *optional*, defaults to `ResNetConfig()`):
The configuration of the backbone model.
backbone (`str`, *optional*):
Name of backbone to use when `backbone_config` is `None`. If `use_pretrained_backbone` is `True`, this
will load the corresponding pretrained weights from the timm or transformers library. If `use_pretrained_backbone`
is `False`, this loads the backbone's config and uses that to initialize the backbone with random weights.
use_pretrained_backbone (`bool`, *optional*, `False`):
Whether to use pretrained weights for the backbone.
use_timm_backbone (`bool`, *optional*, `False`):
Whether to load `backbone` from the timm library. If `False`, the backbone is loaded from the transformers
library.
backbone_kwargs (`dict`, *optional*):
Keyword arguments to be passed to AutoBackbone when loading from a checkpoint
e.g. `{'out_indices': (0, 1, 2, 3)}`. Cannot be specified if `backbone_config` is set.
num_queries (`int`, *optional*, defaults to 900):
Number of object queries, i.e. detection slots. This is the maximal number of objects [`DetaModel`] can
detect in a single image. In case `two_stage` is set to `True`, we use `two_stage_num_proposals` instead.
d_model (`int`, *optional*, defaults to 256):
Dimension of the layers.
encoder_layers (`int`, *optional*, defaults to 6):
Number of encoder layers.
decoder_layers (`int`, *optional*, defaults to 6):
Number of decoder layers.
encoder_attention_heads (`int`, *optional*, defaults to 8):
Number of attention heads for each attention layer in the Transformer encoder.
decoder_attention_heads (`int`, *optional*, defaults to 8):
Number of attention heads for each attention layer in the Transformer decoder.
decoder_ffn_dim (`int`, *optional*, defaults to 2048):
Dimension of the "intermediate" (often named feed-forward) layer in decoder.
encoder_ffn_dim (`int`, *optional*, defaults to 2048):
Dimension of the "intermediate" (often named feed-forward) layer in decoder.
activation_function (`str` or `function`, *optional*, defaults to `"relu"`):
The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
`"relu"`, `"silu"` and `"gelu_new"` are supported.
dropout (`float`, *optional*, defaults to 0.1):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
attention_dropout (`float`, *optional*, defaults to 0.0):
The dropout ratio for the attention probabilities.
activation_dropout (`float`, *optional*, defaults to 0.0):
The dropout ratio for activations inside the fully connected layer.
init_std (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
init_xavier_std (`float`, *optional*, defaults to 1):
The scaling factor used for the Xavier initialization gain in the HM Attention map module.
encoder_layerdrop (`float`, *optional*, defaults to 0.0):
The LayerDrop probability for the encoder. See the [LayerDrop paper](see https://huggingface.co/papers/1909.11556)
for more details.
auxiliary_loss (`bool`, *optional*, defaults to `False`):
Whether auxiliary decoding losses (loss at each decoder layer) are to be used.
position_embedding_type (`str`, *optional*, defaults to `"sine"`):
Type of position embeddings to be used on top of the image features. One of `"sine"` or `"learned"`.
class_cost (`float`, *optional*, defaults to 1):
Relative weight of the classification error in the Hungarian matching cost.
bbox_cost (`float`, *optional*, defaults to 5):
Relative weight of the L1 error of the bounding box coordinates in the Hungarian matching cost.
giou_cost (`float`, *optional*, defaults to 2):
Relative weight of the generalized IoU loss of the bounding box in the Hungarian matching cost.
mask_loss_coefficient (`float`, *optional*, defaults to 1):
Relative weight of the Focal loss in the panoptic segmentation loss.
dice_loss_coefficient (`float`, *optional*, defaults to 1):
Relative weight of the DICE/F-1 loss in the panoptic segmentation loss.
bbox_loss_coefficient (`float`, *optional*, defaults to 5):
Relative weight of the L1 bounding box loss in the object detection loss.
giou_loss_coefficient (`float`, *optional*, defaults to 2):
Relative weight of the generalized IoU loss in the object detection loss.
eos_coefficient (`float`, *optional*, defaults to 0.1):
Relative classification weight of the 'no-object' class in the object detection loss.
num_feature_levels (`int`, *optional*, defaults to 5):
The number of input feature levels.
encoder_n_points (`int`, *optional*, defaults to 4):
The number of sampled keys in each feature level for each attention head in the encoder.
decoder_n_points (`int`, *optional*, defaults to 4):
The number of sampled keys in each feature level for each attention head in the decoder.
two_stage (`bool`, *optional*, defaults to `True`):
Whether to apply a two-stage deformable DETR, where the region proposals are also generated by a variant of
DETA, which are further fed into the decoder for iterative bounding box refinement.
two_stage_num_proposals (`int`, *optional*, defaults to 300):
The number of region proposals to be generated, in case `two_stage` is set to `True`.
with_box_refine (`bool`, *optional*, defaults to `True`):
Whether to apply iterative bounding box refinement, where each decoder layer refines the bounding boxes
based on the predictions from the previous layer.
focal_alpha (`float`, *optional*, defaults to 0.25):
Alpha parameter in the focal loss.
assign_first_stage (`bool`, *optional*, defaults to `True`):
Whether to assign each prediction i to the highest overlapping ground truth object if the overlap is larger than a threshold 0.7.
assign_second_stage (`bool`, *optional*, defaults to `True`):
Whether to assign second assignment procedure in the second stage closely follows the first stage assignment procedure.
disable_custom_kernels (`bool`, *optional*, defaults to `True`):
Disable the use of custom CUDA and CPU kernels. This option is necessary for the ONNX export, as custom
kernels are not supported by PyTorch ONNX export.
Examples:
```python
>>> from transformers import DetaConfig, DetaModel
>>> # Initializing a DETA SenseTime/deformable-detr style configuration
>>> configuration = DetaConfig()
>>> # Initializing a model (with random weights) from the SenseTime/deformable-detr style configuration
>>> model = DetaModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```"""
model_type = "deta"
attribute_map = {
"hidden_size": "d_model",
"num_attention_heads": "encoder_attention_heads",
}
def __init__(
self,
backbone_config=None,
backbone=None,
use_pretrained_backbone=False,
use_timm_backbone=False,
backbone_kwargs=None,
num_queries=900,
max_position_embeddings=2048,
encoder_layers=6,
encoder_ffn_dim=2048,
encoder_attention_heads=8,
decoder_layers=6,
decoder_ffn_dim=1024,
decoder_attention_heads=8,
encoder_layerdrop=0.0,
is_encoder_decoder=True,
activation_function="relu",
d_model=256,
dropout=0.1,
attention_dropout=0.0,
activation_dropout=0.0,
init_std=0.02,
init_xavier_std=1.0,
return_intermediate=True,
auxiliary_loss=False,
position_embedding_type="sine",
num_feature_levels=5,
encoder_n_points=4,
decoder_n_points=4,
two_stage=True,
two_stage_num_proposals=300,
with_box_refine=True,
assign_first_stage=True,
assign_second_stage=True,
class_cost=1,
bbox_cost=5,
giou_cost=2,
mask_loss_coefficient=1,
dice_loss_coefficient=1,
bbox_loss_coefficient=5,
giou_loss_coefficient=2,
eos_coefficient=0.1,
focal_alpha=0.25,
disable_custom_kernels=True,
**kwargs,
):
if use_pretrained_backbone:
raise ValueError("Pretrained backbones are not supported yet.")
if backbone_config is not None and backbone is not None:
raise ValueError("You can't specify both `backbone` and `backbone_config`.")
if backbone_config is None and backbone is None:
logger.info("`backbone_config` is `None`. Initializing the config with the default `ResNet` backbone.")
backbone_config = CONFIG_MAPPING["resnet"](out_features=["stage2", "stage3", "stage4"])
else:
if isinstance(backbone_config, dict):
backbone_model_type = backbone_config.pop("model_type")
config_class = CONFIG_MAPPING[backbone_model_type]
backbone_config = config_class.from_dict(backbone_config)
if backbone_kwargs is not None and backbone_kwargs and backbone_config is not None:
raise ValueError("You can't specify both `backbone_kwargs` and `backbone_config`.")
self.backbone_config = backbone_config
self.backbone = backbone
self.use_pretrained_backbone = use_pretrained_backbone
self.use_timm_backbone = use_timm_backbone
self.backbone_kwargs = backbone_kwargs
self.num_queries = num_queries
self.max_position_embeddings = max_position_embeddings
self.d_model = d_model
self.encoder_ffn_dim = encoder_ffn_dim
self.encoder_layers = encoder_layers
self.encoder_attention_heads = encoder_attention_heads
self.decoder_ffn_dim = decoder_ffn_dim
self.decoder_layers = decoder_layers
self.decoder_attention_heads = decoder_attention_heads
self.dropout = dropout
self.attention_dropout = attention_dropout
self.activation_dropout = activation_dropout
self.activation_function = activation_function
self.init_std = init_std
self.init_xavier_std = init_xavier_std
self.encoder_layerdrop = encoder_layerdrop
self.auxiliary_loss = auxiliary_loss
self.position_embedding_type = position_embedding_type
# deformable attributes
self.num_feature_levels = num_feature_levels
self.encoder_n_points = encoder_n_points
self.decoder_n_points = decoder_n_points
self.two_stage = two_stage
self.two_stage_num_proposals = two_stage_num_proposals
self.with_box_refine = with_box_refine
self.assign_first_stage = assign_first_stage
self.assign_second_stage = assign_second_stage
if two_stage is True and with_box_refine is False:
raise ValueError("If two_stage is True, with_box_refine must be True.")
# Hungarian matcher
self.class_cost = class_cost
self.bbox_cost = bbox_cost
self.giou_cost = giou_cost
# Loss coefficients
self.mask_loss_coefficient = mask_loss_coefficient
self.dice_loss_coefficient = dice_loss_coefficient
self.bbox_loss_coefficient = bbox_loss_coefficient
self.giou_loss_coefficient = giou_loss_coefficient
self.eos_coefficient = eos_coefficient
self.focal_alpha = focal_alpha
self.disable_custom_kernels = disable_custom_kernels
super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs)
@property
def num_attention_heads(self) -> int:
return self.encoder_attention_heads
@property
def hidden_size(self) -> int:
return self.d_model
@property
def sub_configs(self):
return (
{"backbone_config": type(self.backbone_config)}
if getattr(self, "backbone_config", None) is not None
else {}
)
__all__ = ["DetaConfig"]

View File

@ -0,0 +1,321 @@
# coding=utf-8
# Copyright 2022 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Convert DETA checkpoints from the original repository.
URL: https://github.com/jozhang97/DETA/tree/master"""
import argparse
import json
from pathlib import Path
import requests
import torch
from huggingface_hub import hf_hub_download
from PIL import Image
from transformers import DetaConfig, DetaForObjectDetection, DetaImageProcessor
from transformers.utils import logging
logging.set_verbosity_info()
logger = logging.get_logger(__name__)
def get_deta_config():
config = DetaConfig(
num_queries=900,
encoder_ffn_dim=2048,
decoder_ffn_dim=2048,
num_feature_levels=5,
assign_first_stage=True,
with_box_refine=True,
two_stage=True,
)
# set labels
config.num_labels = 91
repo_id = "huggingface/label-files"
filename = "coco-detection-id2label.json"
id2label = json.loads(Path(hf_hub_download(repo_id, filename, repo_type="dataset")).read_text())
id2label = {int(k): v for k, v in id2label.items()}
config.id2label = id2label
config.label2id = {v: k for k, v in id2label.items()}
return config
# here we list all keys to be renamed (original name on the left, our name on the right)
def create_rename_keys(config):
rename_keys = []
# stem
# fmt: off
rename_keys.append(("backbone.0.body.conv1.weight", "model.backbone.model.embedder.embedder.convolution.weight"))
rename_keys.append(("backbone.0.body.bn1.weight", "model.backbone.model.embedder.embedder.normalization.weight"))
rename_keys.append(("backbone.0.body.bn1.bias", "model.backbone.model.embedder.embedder.normalization.bias"))
rename_keys.append(("backbone.0.body.bn1.running_mean", "model.backbone.model.embedder.embedder.normalization.running_mean"))
rename_keys.append(("backbone.0.body.bn1.running_var", "model.backbone.model.embedder.embedder.normalization.running_var"))
# stages
for stage_idx in range(len(config.backbone_config.depths)):
for layer_idx in range(config.backbone_config.depths[stage_idx]):
# shortcut
if layer_idx == 0:
rename_keys.append(
(
f"backbone.0.body.layer{stage_idx + 1}.{layer_idx}.downsample.0.weight",
f"model.backbone.model.encoder.stages.{stage_idx}.layers.{layer_idx}.shortcut.convolution.weight",
)
)
rename_keys.append(
(
f"backbone.0.body.layer{stage_idx + 1}.{layer_idx}.downsample.1.weight",
f"model.backbone.model.encoder.stages.{stage_idx}.layers.{layer_idx}.shortcut.normalization.weight",
)
)
rename_keys.append(
(
f"backbone.0.body.layer{stage_idx + 1}.{layer_idx}.downsample.1.bias",
f"model.backbone.model.encoder.stages.{stage_idx}.layers.{layer_idx}.shortcut.normalization.bias",
)
)
rename_keys.append(
(
f"backbone.0.body.layer{stage_idx + 1}.{layer_idx}.downsample.1.running_mean",
f"model.backbone.model.encoder.stages.{stage_idx}.layers.{layer_idx}.shortcut.normalization.running_mean",
)
)
rename_keys.append(
(
f"backbone.0.body.layer{stage_idx + 1}.{layer_idx}.downsample.1.running_var",
f"model.backbone.model.encoder.stages.{stage_idx}.layers.{layer_idx}.shortcut.normalization.running_var",
)
)
# 3 convs
for i in range(3):
rename_keys.append(
(
f"backbone.0.body.layer{stage_idx + 1}.{layer_idx}.conv{i+1}.weight",
f"model.backbone.model.encoder.stages.{stage_idx}.layers.{layer_idx}.layer.{i}.convolution.weight",
)
)
rename_keys.append(
(
f"backbone.0.body.layer{stage_idx + 1}.{layer_idx}.bn{i+1}.weight",
f"model.backbone.model.encoder.stages.{stage_idx}.layers.{layer_idx}.layer.{i}.normalization.weight",
)
)
rename_keys.append(
(
f"backbone.0.body.layer{stage_idx + 1}.{layer_idx}.bn{i+1}.bias",
f"model.backbone.model.encoder.stages.{stage_idx}.layers.{layer_idx}.layer.{i}.normalization.bias",
)
)
rename_keys.append(
(
f"backbone.0.body.layer{stage_idx + 1}.{layer_idx}.bn{i+1}.running_mean",
f"model.backbone.model.encoder.stages.{stage_idx}.layers.{layer_idx}.layer.{i}.normalization.running_mean",
)
)
rename_keys.append(
(
f"backbone.0.body.layer{stage_idx + 1}.{layer_idx}.bn{i+1}.running_var",
f"model.backbone.model.encoder.stages.{stage_idx}.layers.{layer_idx}.layer.{i}.normalization.running_var",
)
)
# transformer encoder
for i in range(config.encoder_layers):
rename_keys.append((f"transformer.encoder.layers.{i}.self_attn.sampling_offsets.weight", f"model.encoder.layers.{i}.self_attn.sampling_offsets.weight"))
rename_keys.append((f"transformer.encoder.layers.{i}.self_attn.sampling_offsets.bias", f"model.encoder.layers.{i}.self_attn.sampling_offsets.bias"))
rename_keys.append((f"transformer.encoder.layers.{i}.self_attn.attention_weights.weight", f"model.encoder.layers.{i}.self_attn.attention_weights.weight"))
rename_keys.append((f"transformer.encoder.layers.{i}.self_attn.attention_weights.bias", f"model.encoder.layers.{i}.self_attn.attention_weights.bias"))
rename_keys.append((f"transformer.encoder.layers.{i}.self_attn.value_proj.weight", f"model.encoder.layers.{i}.self_attn.value_proj.weight"))
rename_keys.append((f"transformer.encoder.layers.{i}.self_attn.value_proj.bias", f"model.encoder.layers.{i}.self_attn.value_proj.bias"))
rename_keys.append((f"transformer.encoder.layers.{i}.self_attn.output_proj.weight", f"model.encoder.layers.{i}.self_attn.output_proj.weight"))
rename_keys.append((f"transformer.encoder.layers.{i}.self_attn.output_proj.bias", f"model.encoder.layers.{i}.self_attn.output_proj.bias"))
rename_keys.append((f"transformer.encoder.layers.{i}.norm1.weight", f"model.encoder.layers.{i}.self_attn_layer_norm.weight"))
rename_keys.append((f"transformer.encoder.layers.{i}.norm1.bias", f"model.encoder.layers.{i}.self_attn_layer_norm.bias"))
rename_keys.append((f"transformer.encoder.layers.{i}.linear1.weight", f"model.encoder.layers.{i}.fc1.weight"))
rename_keys.append((f"transformer.encoder.layers.{i}.linear1.bias", f"model.encoder.layers.{i}.fc1.bias"))
rename_keys.append((f"transformer.encoder.layers.{i}.linear2.weight", f"model.encoder.layers.{i}.fc2.weight"))
rename_keys.append((f"transformer.encoder.layers.{i}.linear2.bias", f"model.encoder.layers.{i}.fc2.bias"))
rename_keys.append((f"transformer.encoder.layers.{i}.norm2.weight", f"model.encoder.layers.{i}.final_layer_norm.weight"))
rename_keys.append((f"transformer.encoder.layers.{i}.norm2.bias", f"model.encoder.layers.{i}.final_layer_norm.bias"))
# transformer decoder
for i in range(config.decoder_layers):
rename_keys.append((f"transformer.decoder.layers.{i}.cross_attn.sampling_offsets.weight", f"model.decoder.layers.{i}.encoder_attn.sampling_offsets.weight"))
rename_keys.append((f"transformer.decoder.layers.{i}.cross_attn.sampling_offsets.bias", f"model.decoder.layers.{i}.encoder_attn.sampling_offsets.bias"))
rename_keys.append((f"transformer.decoder.layers.{i}.cross_attn.attention_weights.weight", f"model.decoder.layers.{i}.encoder_attn.attention_weights.weight"))
rename_keys.append((f"transformer.decoder.layers.{i}.cross_attn.attention_weights.bias", f"model.decoder.layers.{i}.encoder_attn.attention_weights.bias"))
rename_keys.append((f"transformer.decoder.layers.{i}.cross_attn.value_proj.weight", f"model.decoder.layers.{i}.encoder_attn.value_proj.weight"))
rename_keys.append((f"transformer.decoder.layers.{i}.cross_attn.value_proj.bias", f"model.decoder.layers.{i}.encoder_attn.value_proj.bias"))
rename_keys.append((f"transformer.decoder.layers.{i}.cross_attn.output_proj.weight", f"model.decoder.layers.{i}.encoder_attn.output_proj.weight"))
rename_keys.append((f"transformer.decoder.layers.{i}.cross_attn.output_proj.bias", f"model.decoder.layers.{i}.encoder_attn.output_proj.bias"))
rename_keys.append((f"transformer.decoder.layers.{i}.norm1.weight", f"model.decoder.layers.{i}.encoder_attn_layer_norm.weight"))
rename_keys.append((f"transformer.decoder.layers.{i}.norm1.bias", f"model.decoder.layers.{i}.encoder_attn_layer_norm.bias"))
rename_keys.append((f"transformer.decoder.layers.{i}.self_attn.out_proj.weight", f"model.decoder.layers.{i}.self_attn.out_proj.weight"))
rename_keys.append((f"transformer.decoder.layers.{i}.self_attn.out_proj.bias", f"model.decoder.layers.{i}.self_attn.out_proj.bias"))
rename_keys.append((f"transformer.decoder.layers.{i}.norm2.weight", f"model.decoder.layers.{i}.self_attn_layer_norm.weight"))
rename_keys.append((f"transformer.decoder.layers.{i}.norm2.bias", f"model.decoder.layers.{i}.self_attn_layer_norm.bias"))
rename_keys.append((f"transformer.decoder.layers.{i}.linear1.weight", f"model.decoder.layers.{i}.fc1.weight"))
rename_keys.append((f"transformer.decoder.layers.{i}.linear1.bias", f"model.decoder.layers.{i}.fc1.bias"))
rename_keys.append((f"transformer.decoder.layers.{i}.linear2.weight", f"model.decoder.layers.{i}.fc2.weight"))
rename_keys.append((f"transformer.decoder.layers.{i}.linear2.bias", f"model.decoder.layers.{i}.fc2.bias"))
rename_keys.append((f"transformer.decoder.layers.{i}.norm3.weight", f"model.decoder.layers.{i}.final_layer_norm.weight"))
rename_keys.append((f"transformer.decoder.layers.{i}.norm3.bias", f"model.decoder.layers.{i}.final_layer_norm.bias"))
# fmt: on
return rename_keys
def rename_key(dct, old, new):
val = dct.pop(old)
dct[new] = val
def read_in_decoder_q_k_v(state_dict, config):
# transformer decoder self-attention layers
hidden_size = config.d_model
for i in range(config.decoder_layers):
# read in weights + bias of input projection layer of self-attention
in_proj_weight = state_dict.pop(f"transformer.decoder.layers.{i}.self_attn.in_proj_weight")
in_proj_bias = state_dict.pop(f"transformer.decoder.layers.{i}.self_attn.in_proj_bias")
# next, add query, keys and values (in that order) to the state dict
state_dict[f"model.decoder.layers.{i}.self_attn.q_proj.weight"] = in_proj_weight[:hidden_size, :]
state_dict[f"model.decoder.layers.{i}.self_attn.q_proj.bias"] = in_proj_bias[:hidden_size]
state_dict[f"model.decoder.layers.{i}.self_attn.k_proj.weight"] = in_proj_weight[
hidden_size : hidden_size * 2, :
]
state_dict[f"model.decoder.layers.{i}.self_attn.k_proj.bias"] = in_proj_bias[hidden_size : hidden_size * 2]
state_dict[f"model.decoder.layers.{i}.self_attn.v_proj.weight"] = in_proj_weight[-hidden_size:, :]
state_dict[f"model.decoder.layers.{i}.self_attn.v_proj.bias"] = in_proj_bias[-hidden_size:]
# We will verify our results on an image of cute cats
def prepare_img():
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
im = Image.open(requests.get(url, stream=True).raw)
return im
@torch.no_grad()
def convert_deta_checkpoint(model_name, pytorch_dump_folder_path, push_to_hub):
"""
Copy/paste/tweak model's weights to our DETA structure.
"""
# load config
config = get_deta_config()
# load original state dict
if model_name == "deta-resnet-50":
filename = "adet_checkpoint0011.pth"
elif model_name == "deta-resnet-50-24-epochs":
filename = "adet_2x_checkpoint0023.pth"
else:
raise ValueError(f"Model name {model_name} not supported")
checkpoint_path = hf_hub_download(repo_id="nielsr/deta-checkpoints", filename=filename)
state_dict = torch.load(checkpoint_path, map_location="cpu", weights_only=True)["model"]
# rename keys
rename_keys = create_rename_keys(config)
for src, dest in rename_keys:
rename_key(state_dict, src, dest)
read_in_decoder_q_k_v(state_dict, config)
# fix some prefixes
for key in state_dict.copy():
if "transformer.decoder.class_embed" in key or "transformer.decoder.bbox_embed" in key:
val = state_dict.pop(key)
state_dict[key.replace("transformer.decoder", "model.decoder")] = val
if "input_proj" in key:
val = state_dict.pop(key)
state_dict["model." + key] = val
if "level_embed" in key or "pos_trans" in key or "pix_trans" in key or "enc_output" in key:
val = state_dict.pop(key)
state_dict[key.replace("transformer", "model")] = val
# finally, create HuggingFace model and load state dict
model = DetaForObjectDetection(config)
model.load_state_dict(state_dict)
model.eval()
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
# load image processor
processor = DetaImageProcessor(format="coco_detection")
# verify our conversion on image
img = prepare_img()
encoding = processor(images=img, return_tensors="pt")
pixel_values = encoding["pixel_values"]
outputs = model(pixel_values.to(device))
# verify logits
if model_name == "deta-resnet-50":
expected_logits = torch.tensor(
[[-7.3978, -2.5406, -4.1668], [-8.2684, -3.9933, -3.8096], [-7.0515, -3.7973, -5.8516]]
)
expected_boxes = torch.tensor([[0.5043, 0.4973, 0.9998], [0.2542, 0.5489, 0.4748], [0.5490, 0.2765, 0.0570]])
elif model_name == "deta-resnet-50-24-epochs":
expected_logits = torch.tensor(
[[-7.1688, -2.4857, -4.8669], [-7.8630, -3.8154, -4.2674], [-7.2730, -4.1865, -5.5323]]
)
expected_boxes = torch.tensor([[0.5021, 0.4971, 0.9994], [0.2546, 0.5486, 0.4731], [0.1686, 0.1986, 0.2142]])
assert torch.allclose(outputs.logits[0, :3, :3], expected_logits.to(device), atol=1e-4)
assert torch.allclose(outputs.pred_boxes[0, :3, :3], expected_boxes.to(device), atol=1e-4)
print("Everything ok!")
if pytorch_dump_folder_path:
# Save model and processor
logger.info(f"Saving PyTorch model and processor to {pytorch_dump_folder_path}...")
Path(pytorch_dump_folder_path).mkdir(exist_ok=True)
model.save_pretrained(pytorch_dump_folder_path)
processor.save_pretrained(pytorch_dump_folder_path)
# Push to hub
if push_to_hub:
print("Pushing model and processor to hub...")
model.push_to_hub(f"jozhang97/{model_name}")
processor.push_to_hub(f"jozhang97/{model_name}")
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--model_name",
type=str,
default="deta-resnet-50",
choices=["deta-resnet-50", "deta-resnet-50-24-epochs"],
help="Name of the model you'd like to convert.",
)
parser.add_argument(
"--pytorch_dump_folder_path",
default=None,
type=str,
help="Path to the folder to output PyTorch model.",
)
parser.add_argument(
"--push_to_hub",
action="store_true",
help="Whether or not to push the converted model to the Hugging Face hub.",
)
args = parser.parse_args()
convert_deta_checkpoint(args.model_name, args.pytorch_dump_folder_path, args.push_to_hub)

View File

@ -0,0 +1,328 @@
# coding=utf-8
# Copyright 2022 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Convert DETA checkpoints from the original repository.
URL: https://github.com/jozhang97/DETA/tree/master"""
import argparse
import json
from pathlib import Path
import requests
import torch
from huggingface_hub import hf_hub_download
from PIL import Image
from transformers import DetaConfig, DetaForObjectDetection, DetaImageProcessor, SwinConfig
from transformers.utils import logging
logging.set_verbosity_info()
logger = logging.get_logger(__name__)
def get_deta_config(model_name):
backbone_config = SwinConfig(
embed_dim=192,
depths=(2, 2, 18, 2),
num_heads=(6, 12, 24, 48),
window_size=12,
out_features=["stage2", "stage3", "stage4"],
)
config = DetaConfig(
backbone_config=backbone_config,
num_queries=900,
encoder_ffn_dim=2048,
decoder_ffn_dim=2048,
num_feature_levels=5,
assign_first_stage=True,
with_box_refine=True,
two_stage=True,
)
# set labels
repo_id = "huggingface/label-files"
if "o365" in model_name:
num_labels = 366
filename = "object365-id2label.json"
else:
num_labels = 91
filename = "coco-detection-id2label.json"
config.num_labels = num_labels
id2label = json.loads(Path(hf_hub_download(repo_id, filename, repo_type="dataset")).read_text())
id2label = {int(k): v for k, v in id2label.items()}
config.id2label = id2label
config.label2id = {v: k for k, v in id2label.items()}
return config
# here we list all keys to be renamed (original name on the left, our name on the right)
def create_rename_keys(config):
rename_keys = []
# stem
# fmt: off
rename_keys.append(("backbone.0.body.patch_embed.proj.weight", "model.backbone.model.embeddings.patch_embeddings.projection.weight"))
rename_keys.append(("backbone.0.body.patch_embed.proj.bias", "model.backbone.model.embeddings.patch_embeddings.projection.bias"))
rename_keys.append(("backbone.0.body.patch_embed.norm.weight", "model.backbone.model.embeddings.norm.weight"))
rename_keys.append(("backbone.0.body.patch_embed.norm.bias", "model.backbone.model.embeddings.norm.bias"))
# stages
for i in range(len(config.backbone_config.depths)):
for j in range(config.backbone_config.depths[i]):
rename_keys.append((f"backbone.0.body.layers.{i}.blocks.{j}.norm1.weight", f"model.backbone.model.encoder.layers.{i}.blocks.{j}.layernorm_before.weight"))
rename_keys.append((f"backbone.0.body.layers.{i}.blocks.{j}.norm1.bias", f"model.backbone.model.encoder.layers.{i}.blocks.{j}.layernorm_before.bias"))
rename_keys.append((f"backbone.0.body.layers.{i}.blocks.{j}.attn.relative_position_bias_table", f"model.backbone.model.encoder.layers.{i}.blocks.{j}.attention.self.relative_position_bias_table"))
rename_keys.append((f"backbone.0.body.layers.{i}.blocks.{j}.attn.relative_position_index", f"model.backbone.model.encoder.layers.{i}.blocks.{j}.attention.self.relative_position_index"))
rename_keys.append((f"backbone.0.body.layers.{i}.blocks.{j}.attn.proj.weight", f"model.backbone.model.encoder.layers.{i}.blocks.{j}.attention.output.dense.weight"))
rename_keys.append((f"backbone.0.body.layers.{i}.blocks.{j}.attn.proj.bias", f"model.backbone.model.encoder.layers.{i}.blocks.{j}.attention.output.dense.bias"))
rename_keys.append((f"backbone.0.body.layers.{i}.blocks.{j}.norm2.weight", f"model.backbone.model.encoder.layers.{i}.blocks.{j}.layernorm_after.weight"))
rename_keys.append((f"backbone.0.body.layers.{i}.blocks.{j}.norm2.bias", f"model.backbone.model.encoder.layers.{i}.blocks.{j}.layernorm_after.bias"))
rename_keys.append((f"backbone.0.body.layers.{i}.blocks.{j}.mlp.fc1.weight", f"model.backbone.model.encoder.layers.{i}.blocks.{j}.intermediate.dense.weight"))
rename_keys.append((f"backbone.0.body.layers.{i}.blocks.{j}.mlp.fc1.bias", f"model.backbone.model.encoder.layers.{i}.blocks.{j}.intermediate.dense.bias"))
rename_keys.append((f"backbone.0.body.layers.{i}.blocks.{j}.mlp.fc2.weight", f"model.backbone.model.encoder.layers.{i}.blocks.{j}.output.dense.weight"))
rename_keys.append((f"backbone.0.body.layers.{i}.blocks.{j}.mlp.fc2.bias", f"model.backbone.model.encoder.layers.{i}.blocks.{j}.output.dense.bias"))
if i < 3:
rename_keys.append((f"backbone.0.body.layers.{i}.downsample.reduction.weight", f"model.backbone.model.encoder.layers.{i}.downsample.reduction.weight"))
rename_keys.append((f"backbone.0.body.layers.{i}.downsample.norm.weight", f"model.backbone.model.encoder.layers.{i}.downsample.norm.weight"))
rename_keys.append((f"backbone.0.body.layers.{i}.downsample.norm.bias", f"model.backbone.model.encoder.layers.{i}.downsample.norm.bias"))
rename_keys.append(("backbone.0.body.norm1.weight", "model.backbone.model.hidden_states_norms.stage2.weight"))
rename_keys.append(("backbone.0.body.norm1.bias", "model.backbone.model.hidden_states_norms.stage2.bias"))
rename_keys.append(("backbone.0.body.norm2.weight", "model.backbone.model.hidden_states_norms.stage3.weight"))
rename_keys.append(("backbone.0.body.norm2.bias", "model.backbone.model.hidden_states_norms.stage3.bias"))
rename_keys.append(("backbone.0.body.norm3.weight", "model.backbone.model.hidden_states_norms.stage4.weight"))
rename_keys.append(("backbone.0.body.norm3.bias", "model.backbone.model.hidden_states_norms.stage4.bias"))
# transformer encoder
for i in range(config.encoder_layers):
rename_keys.append((f"transformer.encoder.layers.{i}.self_attn.sampling_offsets.weight", f"model.encoder.layers.{i}.self_attn.sampling_offsets.weight"))
rename_keys.append((f"transformer.encoder.layers.{i}.self_attn.sampling_offsets.bias", f"model.encoder.layers.{i}.self_attn.sampling_offsets.bias"))
rename_keys.append((f"transformer.encoder.layers.{i}.self_attn.attention_weights.weight", f"model.encoder.layers.{i}.self_attn.attention_weights.weight"))
rename_keys.append((f"transformer.encoder.layers.{i}.self_attn.attention_weights.bias", f"model.encoder.layers.{i}.self_attn.attention_weights.bias"))
rename_keys.append((f"transformer.encoder.layers.{i}.self_attn.value_proj.weight", f"model.encoder.layers.{i}.self_attn.value_proj.weight"))
rename_keys.append((f"transformer.encoder.layers.{i}.self_attn.value_proj.bias", f"model.encoder.layers.{i}.self_attn.value_proj.bias"))
rename_keys.append((f"transformer.encoder.layers.{i}.self_attn.output_proj.weight", f"model.encoder.layers.{i}.self_attn.output_proj.weight"))
rename_keys.append((f"transformer.encoder.layers.{i}.self_attn.output_proj.bias", f"model.encoder.layers.{i}.self_attn.output_proj.bias"))
rename_keys.append((f"transformer.encoder.layers.{i}.norm1.weight", f"model.encoder.layers.{i}.self_attn_layer_norm.weight"))
rename_keys.append((f"transformer.encoder.layers.{i}.norm1.bias", f"model.encoder.layers.{i}.self_attn_layer_norm.bias"))
rename_keys.append((f"transformer.encoder.layers.{i}.linear1.weight", f"model.encoder.layers.{i}.fc1.weight"))
rename_keys.append((f"transformer.encoder.layers.{i}.linear1.bias", f"model.encoder.layers.{i}.fc1.bias"))
rename_keys.append((f"transformer.encoder.layers.{i}.linear2.weight", f"model.encoder.layers.{i}.fc2.weight"))
rename_keys.append((f"transformer.encoder.layers.{i}.linear2.bias", f"model.encoder.layers.{i}.fc2.bias"))
rename_keys.append((f"transformer.encoder.layers.{i}.norm2.weight", f"model.encoder.layers.{i}.final_layer_norm.weight"))
rename_keys.append((f"transformer.encoder.layers.{i}.norm2.bias", f"model.encoder.layers.{i}.final_layer_norm.bias"))
# transformer decoder
for i in range(config.decoder_layers):
rename_keys.append((f"transformer.decoder.layers.{i}.cross_attn.sampling_offsets.weight", f"model.decoder.layers.{i}.encoder_attn.sampling_offsets.weight"))
rename_keys.append((f"transformer.decoder.layers.{i}.cross_attn.sampling_offsets.bias", f"model.decoder.layers.{i}.encoder_attn.sampling_offsets.bias"))
rename_keys.append((f"transformer.decoder.layers.{i}.cross_attn.attention_weights.weight", f"model.decoder.layers.{i}.encoder_attn.attention_weights.weight"))
rename_keys.append((f"transformer.decoder.layers.{i}.cross_attn.attention_weights.bias", f"model.decoder.layers.{i}.encoder_attn.attention_weights.bias"))
rename_keys.append((f"transformer.decoder.layers.{i}.cross_attn.value_proj.weight", f"model.decoder.layers.{i}.encoder_attn.value_proj.weight"))
rename_keys.append((f"transformer.decoder.layers.{i}.cross_attn.value_proj.bias", f"model.decoder.layers.{i}.encoder_attn.value_proj.bias"))
rename_keys.append((f"transformer.decoder.layers.{i}.cross_attn.output_proj.weight", f"model.decoder.layers.{i}.encoder_attn.output_proj.weight"))
rename_keys.append((f"transformer.decoder.layers.{i}.cross_attn.output_proj.bias", f"model.decoder.layers.{i}.encoder_attn.output_proj.bias"))
rename_keys.append((f"transformer.decoder.layers.{i}.norm1.weight", f"model.decoder.layers.{i}.encoder_attn_layer_norm.weight"))
rename_keys.append((f"transformer.decoder.layers.{i}.norm1.bias", f"model.decoder.layers.{i}.encoder_attn_layer_norm.bias"))
rename_keys.append((f"transformer.decoder.layers.{i}.self_attn.out_proj.weight", f"model.decoder.layers.{i}.self_attn.out_proj.weight"))
rename_keys.append((f"transformer.decoder.layers.{i}.self_attn.out_proj.bias", f"model.decoder.layers.{i}.self_attn.out_proj.bias"))
rename_keys.append((f"transformer.decoder.layers.{i}.norm2.weight", f"model.decoder.layers.{i}.self_attn_layer_norm.weight"))
rename_keys.append((f"transformer.decoder.layers.{i}.norm2.bias", f"model.decoder.layers.{i}.self_attn_layer_norm.bias"))
rename_keys.append((f"transformer.decoder.layers.{i}.linear1.weight", f"model.decoder.layers.{i}.fc1.weight"))
rename_keys.append((f"transformer.decoder.layers.{i}.linear1.bias", f"model.decoder.layers.{i}.fc1.bias"))
rename_keys.append((f"transformer.decoder.layers.{i}.linear2.weight", f"model.decoder.layers.{i}.fc2.weight"))
rename_keys.append((f"transformer.decoder.layers.{i}.linear2.bias", f"model.decoder.layers.{i}.fc2.bias"))
rename_keys.append((f"transformer.decoder.layers.{i}.norm3.weight", f"model.decoder.layers.{i}.final_layer_norm.weight"))
rename_keys.append((f"transformer.decoder.layers.{i}.norm3.bias", f"model.decoder.layers.{i}.final_layer_norm.bias"))
# fmt: on
return rename_keys
def rename_key(dct, old, new):
val = dct.pop(old)
dct[new] = val
# we split up the matrix of each encoder layer into queries, keys and values
def read_in_swin_q_k_v(state_dict, backbone_config):
num_features = [int(backbone_config.embed_dim * 2**i) for i in range(len(backbone_config.depths))]
for i in range(len(backbone_config.depths)):
dim = num_features[i]
for j in range(backbone_config.depths[i]):
# fmt: off
# read in weights + bias of input projection layer (in original implementation, this is a single matrix + bias)
in_proj_weight = state_dict.pop(f"backbone.0.body.layers.{i}.blocks.{j}.attn.qkv.weight")
in_proj_bias = state_dict.pop(f"backbone.0.body.layers.{i}.blocks.{j}.attn.qkv.bias")
# next, add query, keys and values (in that order) to the state dict
state_dict[f"model.backbone.model.encoder.layers.{i}.blocks.{j}.attention.self.query.weight"] = in_proj_weight[:dim, :]
state_dict[f"model.backbone.model.encoder.layers.{i}.blocks.{j}.attention.self.query.bias"] = in_proj_bias[: dim]
state_dict[f"model.backbone.model.encoder.layers.{i}.blocks.{j}.attention.self.key.weight"] = in_proj_weight[
dim : dim * 2, :
]
state_dict[f"model.backbone.model.encoder.layers.{i}.blocks.{j}.attention.self.key.bias"] = in_proj_bias[
dim : dim * 2
]
state_dict[f"model.backbone.model.encoder.layers.{i}.blocks.{j}.attention.self.value.weight"] = in_proj_weight[
-dim :, :
]
state_dict[f"model.backbone.model.encoder.layers.{i}.blocks.{j}.attention.self.value.bias"] = in_proj_bias[-dim :]
# fmt: on
def read_in_decoder_q_k_v(state_dict, config):
# transformer decoder self-attention layers
hidden_size = config.d_model
for i in range(config.decoder_layers):
# read in weights + bias of input projection layer of self-attention
in_proj_weight = state_dict.pop(f"transformer.decoder.layers.{i}.self_attn.in_proj_weight")
in_proj_bias = state_dict.pop(f"transformer.decoder.layers.{i}.self_attn.in_proj_bias")
# next, add query, keys and values (in that order) to the state dict
state_dict[f"model.decoder.layers.{i}.self_attn.q_proj.weight"] = in_proj_weight[:hidden_size, :]
state_dict[f"model.decoder.layers.{i}.self_attn.q_proj.bias"] = in_proj_bias[:hidden_size]
state_dict[f"model.decoder.layers.{i}.self_attn.k_proj.weight"] = in_proj_weight[
hidden_size : hidden_size * 2, :
]
state_dict[f"model.decoder.layers.{i}.self_attn.k_proj.bias"] = in_proj_bias[hidden_size : hidden_size * 2]
state_dict[f"model.decoder.layers.{i}.self_attn.v_proj.weight"] = in_proj_weight[-hidden_size:, :]
state_dict[f"model.decoder.layers.{i}.self_attn.v_proj.bias"] = in_proj_bias[-hidden_size:]
# We will verify our results on an image of cute cats
def prepare_img():
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
im = Image.open(requests.get(url, stream=True).raw)
return im
@torch.no_grad()
def convert_deta_checkpoint(model_name, pytorch_dump_folder_path, push_to_hub):
"""
Copy/paste/tweak model's weights to our DETA structure.
"""
# load config
config = get_deta_config(model_name)
# load original state dict
if model_name == "deta-swin-large":
checkpoint_path = hf_hub_download(repo_id="nielsr/deta-checkpoints", filename="adet_swin_ft.pth")
elif model_name == "deta-swin-large-o365":
checkpoint_path = hf_hub_download(repo_id="jozhang97/deta-swin-l-o365", filename="deta_swin_pt_o365.pth")
else:
raise ValueError(f"Model name {model_name} not supported")
state_dict = torch.load(checkpoint_path, map_location="cpu", weights_only=True)["model"]
# original state dict
for name, param in state_dict.items():
print(name, param.shape)
# rename keys
rename_keys = create_rename_keys(config)
for src, dest in rename_keys:
rename_key(state_dict, src, dest)
read_in_swin_q_k_v(state_dict, config.backbone_config)
read_in_decoder_q_k_v(state_dict, config)
# fix some prefixes
for key in state_dict.copy():
if "transformer.decoder.class_embed" in key or "transformer.decoder.bbox_embed" in key:
val = state_dict.pop(key)
state_dict[key.replace("transformer.decoder", "model.decoder")] = val
if "input_proj" in key:
val = state_dict.pop(key)
state_dict["model." + key] = val
if "level_embed" in key or "pos_trans" in key or "pix_trans" in key or "enc_output" in key:
val = state_dict.pop(key)
state_dict[key.replace("transformer", "model")] = val
# finally, create HuggingFace model and load state dict
model = DetaForObjectDetection(config)
model.load_state_dict(state_dict)
model.eval()
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
# load image processor
processor = DetaImageProcessor(format="coco_detection")
# verify our conversion on image
img = prepare_img()
encoding = processor(images=img, return_tensors="pt")
pixel_values = encoding["pixel_values"]
outputs = model(pixel_values.to(device))
# verify logits
print("Logits:", outputs.logits[0, :3, :3])
print("Boxes:", outputs.pred_boxes[0, :3, :3])
if model_name == "deta-swin-large":
expected_logits = torch.tensor(
[[-7.6308, -2.8485, -5.3737], [-7.2037, -4.5505, -4.8027], [-7.2943, -4.2611, -4.6617]]
)
expected_boxes = torch.tensor([[0.4987, 0.4969, 0.9999], [0.2549, 0.5498, 0.4805], [0.5498, 0.2757, 0.0569]])
elif model_name == "deta-swin-large-o365":
expected_logits = torch.tensor(
[[-8.0122, -3.5720, -4.9717], [-8.1547, -3.6886, -4.6389], [-7.6610, -3.6194, -5.0134]]
)
expected_boxes = torch.tensor([[0.2523, 0.5549, 0.4881], [0.7715, 0.4149, 0.4601], [0.5503, 0.2753, 0.0575]])
assert torch.allclose(outputs.logits[0, :3, :3], expected_logits.to(device), atol=1e-4)
assert torch.allclose(outputs.pred_boxes[0, :3, :3], expected_boxes.to(device), atol=1e-4)
print("Everything ok!")
if pytorch_dump_folder_path:
# Save model and processor
logger.info(f"Saving PyTorch model and processor to {pytorch_dump_folder_path}...")
Path(pytorch_dump_folder_path).mkdir(exist_ok=True)
model.save_pretrained(pytorch_dump_folder_path)
processor.save_pretrained(pytorch_dump_folder_path)
# Push to hub
if push_to_hub:
print("Pushing model and processor to hub...")
model.push_to_hub(f"jozhang97/{model_name}")
processor.push_to_hub(f"jozhang97/{model_name}")
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--model_name",
type=str,
default="deta-swin-large",
choices=["deta-swin-large", "deta-swin-large-o365"],
help="Name of the model you'd like to convert.",
)
parser.add_argument(
"--pytorch_dump_folder_path",
default=None,
type=str,
help="Path to the folder to output PyTorch model.",
)
parser.add_argument(
"--push_to_hub",
action="store_true",
help="Whether or not to push the converted model to the Hugging Face hub.",
)
args = parser.parse_args()
convert_deta_checkpoint(args.model_name, args.pytorch_dump_folder_path, args.push_to_hub)

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,28 @@
# Copyright 2022 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import TYPE_CHECKING
from ....utils import _LazyModule
from ....utils.import_utils import define_import_structure
if TYPE_CHECKING:
from .configuration_efficientformer import *
from .image_processing_efficientformer import *
from .modeling_efficientformer import *
else:
import sys
_file = globals()["__file__"]
sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)

View File

@ -0,0 +1,170 @@
# coding=utf-8
# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""EfficientFormer model configuration"""
from ....configuration_utils import PreTrainedConfig
from ....utils import logging
logger = logging.get_logger(__name__)
class EfficientFormerConfig(PreTrainedConfig):
r"""
This is the configuration class to store the configuration of an [`EfficientFormerModel`]. It is used to
instantiate an EfficientFormer model according to the specified arguments, defining the model architecture.
Instantiating a configuration with the defaults will yield a similar configuration to that of the EfficientFormer
[snap-research/efficientformer-l1](https://huggingface.co/snap-research/efficientformer-l1) architecture.
Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PreTrainedConfig`] for more information.
Args:
depths (`List(int)`, *optional*, defaults to `[3, 2, 6, 4]`)
Depth of each stage.
hidden_sizes (`List(int)`, *optional*, defaults to `[48, 96, 224, 448]`)
Dimensionality of each stage.
downsamples (`List(bool)`, *optional*, defaults to `[True, True, True, True]`)
Whether or not to downsample inputs between two stages.
dim (`int`, *optional*, defaults to 448):
Number of channels in Meta3D layers
key_dim (`int`, *optional*, defaults to 32):
The size of the key in meta3D block.
attention_ratio (`int`, *optional*, defaults to 4):
Ratio of the dimension of the query and value to the dimension of the key in MSHA block
resolution (`int`, *optional*, defaults to 7)
Size of each patch
num_hidden_layers (`int`, *optional*, defaults to 5):
Number of hidden layers in the Transformer encoder.
num_attention_heads (`int`, *optional*, defaults to 8):
Number of attention heads for each attention layer in the 3D MetaBlock.
mlp_expansion_ratio (`int`, *optional*, defaults to 4):
Ratio of size of the hidden dimensionality of an MLP to the dimensionality of its input.
hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
The dropout probability for all fully connected layers in the embeddings and encoder.
patch_size (`int`, *optional*, defaults to 16):
The size (resolution) of each patch.
num_channels (`int`, *optional*, defaults to 3):
The number of input channels.
pool_size (`int`, *optional*, defaults to 3):
Kernel size of pooling layers.
downsample_patch_size (`int`, *optional*, defaults to 3):
The size of patches in downsampling layers.
downsample_stride (`int`, *optional*, defaults to 2):
The stride of convolution kernels in downsampling layers.
downsample_pad (`int`, *optional*, defaults to 1):
Padding in downsampling layers.
drop_path_rate (`int`, *optional*, defaults to 0):
Rate at which to increase dropout probability in DropPath.
num_meta3d_blocks (`int`, *optional*, defaults to 1):
The number of 3D MetaBlocks in the last stage.
distillation (`bool`, *optional*, defaults to `True`):
Whether to add a distillation head.
use_layer_scale (`bool`, *optional*, defaults to `True`):
Whether to scale outputs from token mixers.
layer_scale_init_value (`float`, *optional*, defaults to 1e-5):
Factor by which outputs from token mixers are scaled.
hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
`"relu"`, `"selu"` and `"gelu_new"` are supported.
initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
layer_norm_eps (`float`, *optional*, defaults to 1e-12):
The epsilon used by the layer normalization layers.
image_size (`int`, *optional*, defaults to `224`):
The size (resolution) of each image.
Example:
```python
>>> from transformers import EfficientFormerConfig, EfficientFormerModel
>>> # Initializing a EfficientFormer efficientformer-l1 style configuration
>>> configuration = EfficientFormerConfig()
>>> # Initializing a EfficientFormerModel (with random weights) from the efficientformer-l3 style configuration
>>> model = EfficientFormerModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```"""
model_type = "efficientformer"
def __init__(
self,
depths: list[int] = [3, 2, 6, 4],
hidden_sizes: list[int] = [48, 96, 224, 448],
downsamples: list[bool] = [True, True, True, True],
dim: int = 448,
key_dim: int = 32,
attention_ratio: int = 4,
resolution: int = 7,
num_hidden_layers: int = 5,
num_attention_heads: int = 8,
mlp_expansion_ratio: int = 4,
hidden_dropout_prob: float = 0.0,
patch_size: int = 16,
num_channels: int = 3,
pool_size: int = 3,
downsample_patch_size: int = 3,
downsample_stride: int = 2,
downsample_pad: int = 1,
drop_path_rate: float = 0.0,
num_meta3d_blocks: int = 1,
distillation: bool = True,
use_layer_scale: bool = True,
layer_scale_init_value: float = 1e-5,
hidden_act: str = "gelu",
initializer_range: float = 0.02,
layer_norm_eps: float = 1e-12,
image_size: int = 224,
batch_norm_eps: float = 1e-05,
**kwargs,
) -> None:
super().__init__(**kwargs)
self.hidden_act = hidden_act
self.hidden_dropout_prob = hidden_dropout_prob
self.hidden_sizes = hidden_sizes
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.initializer_range = initializer_range
self.layer_norm_eps = layer_norm_eps
self.patch_size = patch_size
self.num_channels = num_channels
self.depths = depths
self.mlp_expansion_ratio = mlp_expansion_ratio
self.downsamples = downsamples
self.dim = dim
self.key_dim = key_dim
self.attention_ratio = attention_ratio
self.resolution = resolution
self.pool_size = pool_size
self.downsample_patch_size = downsample_patch_size
self.downsample_stride = downsample_stride
self.downsample_pad = downsample_pad
self.drop_path_rate = drop_path_rate
self.num_meta3d_blocks = num_meta3d_blocks
self.distillation = distillation
self.use_layer_scale = use_layer_scale
self.layer_scale_init_value = layer_scale_init_value
self.image_size = image_size
self.batch_norm_eps = batch_norm_eps
__all__ = [
"EfficientFormerConfig",
]

View File

@ -0,0 +1,252 @@
# coding=utf-8
# Copyright 2022 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Convert EfficientFormer checkpoints from the original repository.
URL: https://github.com/snap-research/EfficientFormer
"""
import argparse
import re
from pathlib import Path
import requests
import torch
from PIL import Image
from torchvision.transforms import CenterCrop, Compose, Normalize, Resize, ToTensor
from transformers import (
EfficientFormerConfig,
EfficientFormerForImageClassificationWithTeacher,
EfficientFormerImageProcessor,
)
from transformers.image_utils import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD, PILImageResampling
def rename_key(old_name, num_meta4D_last_stage):
new_name = old_name
if "patch_embed" in old_name:
_, layer, param = old_name.split(".")
if layer == "0":
new_name = old_name.replace("0", "convolution1")
elif layer == "1":
new_name = old_name.replace("1", "batchnorm_before")
elif layer == "3":
new_name = old_name.replace("3", "convolution2")
else:
new_name = old_name.replace("4", "batchnorm_after")
if "network" in old_name and re.search(r"\d\.\d", old_name):
two_digit_num = r"\b\d{2}\b"
if bool(re.search(two_digit_num, old_name)):
match = re.search(r"\d\.\d\d.", old_name).group()
else:
match = re.search(r"\d\.\d.", old_name).group()
if int(match[0]) < 6:
trimmed_name = old_name.replace(match, "")
trimmed_name = trimmed_name.replace("network", match[0] + ".meta4D_layers.blocks." + match[2:-1])
new_name = "intermediate_stages." + trimmed_name
else:
trimmed_name = old_name.replace(match, "")
if int(match[2]) < num_meta4D_last_stage:
trimmed_name = trimmed_name.replace("network", "meta4D_layers.blocks." + match[2])
else:
layer_index = str(int(match[2]) - num_meta4D_last_stage)
trimmed_name = trimmed_name.replace("network", "meta3D_layers.blocks." + layer_index)
if "norm1" in old_name:
trimmed_name = trimmed_name.replace("norm1", "layernorm1")
elif "norm2" in old_name:
trimmed_name = trimmed_name.replace("norm2", "layernorm2")
elif "fc1" in old_name:
trimmed_name = trimmed_name.replace("fc1", "linear_in")
elif "fc2" in old_name:
trimmed_name = trimmed_name.replace("fc2", "linear_out")
new_name = "last_stage." + trimmed_name
elif "network" in old_name and re.search(r".\d.", old_name):
new_name = old_name.replace("network", "intermediate_stages")
if "fc" in new_name:
new_name = new_name.replace("fc", "convolution")
elif ("norm1" in new_name) and ("layernorm1" not in new_name):
new_name = new_name.replace("norm1", "batchnorm_before")
elif ("norm2" in new_name) and ("layernorm2" not in new_name):
new_name = new_name.replace("norm2", "batchnorm_after")
if "proj" in new_name:
new_name = new_name.replace("proj", "projection")
if "dist_head" in new_name:
new_name = new_name.replace("dist_head", "distillation_classifier")
elif "head" in new_name:
new_name = new_name.replace("head", "classifier")
elif "patch_embed" in new_name:
new_name = "efficientformer." + new_name
elif new_name == "norm.weight" or new_name == "norm.bias":
new_name = new_name.replace("norm", "layernorm")
new_name = "efficientformer." + new_name
else:
new_name = "efficientformer.encoder." + new_name
return new_name
def convert_torch_checkpoint(checkpoint, num_meta4D_last_stage):
for key in checkpoint.copy():
val = checkpoint.pop(key)
checkpoint[rename_key(key, num_meta4D_last_stage)] = val
return checkpoint
# We will verify our results on a COCO image
def prepare_img():
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)
return image
def convert_efficientformer_checkpoint(
checkpoint_path: Path, efficientformer_config_file: Path, pytorch_dump_path: Path, push_to_hub: bool
):
orig_state_dict = torch.load(checkpoint_path, map_location="cpu", weights_only=True)["model"]
config = EfficientFormerConfig.from_json_file(efficientformer_config_file)
model = EfficientFormerForImageClassificationWithTeacher(config)
model_name = "_".join(checkpoint_path.split("/")[-1].split(".")[0].split("_")[:-1])
num_meta4D_last_stage = config.depths[-1] - config.num_meta3d_blocks + 1
new_state_dict = convert_torch_checkpoint(orig_state_dict, num_meta4D_last_stage)
model.load_state_dict(new_state_dict)
model.eval()
pillow_resamplings = {
"bilinear": PILImageResampling.BILINEAR,
"bicubic": PILImageResampling.BICUBIC,
"nearest": PILImageResampling.NEAREST,
}
# prepare image
image = prepare_img()
image_size = 256
crop_size = 224
processor = EfficientFormerImageProcessor(
size={"shortest_edge": image_size},
crop_size={"height": crop_size, "width": crop_size},
resample=pillow_resamplings["bicubic"],
)
pixel_values = processor(images=image, return_tensors="pt").pixel_values
# original processing pipeline
image_transforms = Compose(
[
Resize(image_size, interpolation=pillow_resamplings["bicubic"]),
CenterCrop(crop_size),
ToTensor(),
Normalize(IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD),
]
)
original_pixel_values = image_transforms(image).unsqueeze(0)
assert torch.allclose(original_pixel_values, pixel_values)
outputs = model(pixel_values)
logits = outputs.logits
expected_shape = (1, 1000)
if "l1" in model_name:
expected_logits = torch.Tensor(
[-0.1312, 0.4353, -1.0499, -0.5124, 0.4183, -0.6793, -1.3777, -0.0893, -0.7358, -2.4328]
)
assert torch.allclose(logits[0, :10], expected_logits, atol=1e-3)
assert logits.shape == expected_shape
elif "l3" in model_name:
expected_logits = torch.Tensor(
[-1.3150, -1.5456, -1.2556, -0.8496, -0.7127, -0.7897, -0.9728, -0.3052, 0.3751, -0.3127]
)
assert torch.allclose(logits[0, :10], expected_logits, atol=1e-3)
assert logits.shape == expected_shape
elif "l7" in model_name:
expected_logits = torch.Tensor(
[-1.0283, -1.4131, -0.5644, -1.3115, -0.5785, -1.2049, -0.7528, 0.1992, -0.3822, -0.0878]
)
assert logits.shape == expected_shape
else:
raise ValueError(
f"Unknown model checkpoint: {checkpoint_path}. Supported version of efficientformer are l1, l3 and l7"
)
# Save Checkpoints
Path(pytorch_dump_path).mkdir(exist_ok=True)
model.save_pretrained(pytorch_dump_path)
print(f"Checkpoint successfully converted. Model saved at {pytorch_dump_path}")
processor.save_pretrained(pytorch_dump_path)
print(f"Processor successfully saved at {pytorch_dump_path}")
if push_to_hub:
print("Pushing model to the hub...")
model.push_to_hub(
repo_id=f"Bearnardd/{pytorch_dump_path}",
commit_message="Add model",
use_temp_dir=True,
)
processor.push_to_hub(
repo_id=f"Bearnardd/{pytorch_dump_path}",
commit_message="Add image processor",
use_temp_dir=True,
)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
# Required parameters
parser.add_argument(
"--pytorch_model_path",
default=None,
type=str,
required=True,
help="Path to EfficientFormer pytorch checkpoint.",
)
parser.add_argument(
"--config_file",
default=None,
type=str,
required=True,
help="The json file for EfficientFormer model config.",
)
parser.add_argument(
"--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model."
)
parser.add_argument("--push_to_hub", action="store_true", help="Push model and image processor to the hub")
parser.add_argument(
"--no-push_to_hub",
dest="push_to_hub",
action="store_false",
help="Do not push model and image processor to the hub",
)
parser.set_defaults(push_to_hub=True)
args = parser.parse_args()
convert_efficientformer_checkpoint(
checkpoint_path=args.pytorch_model_path,
efficientformer_config_file=args.config_file,
pytorch_dump_path=args.pytorch_dump_path,
push_to_hub=args.push_to_hub,
)

View File

@ -0,0 +1,319 @@
# coding=utf-8
# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Image processor class for EfficientFormer."""
from typing import Optional, Union
import numpy as np
from ....image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict
from ....image_transforms import (
get_resize_output_image_size,
resize,
to_channel_dimension_format,
)
from ....image_utils import (
IMAGENET_DEFAULT_MEAN,
IMAGENET_DEFAULT_STD,
ChannelDimension,
ImageInput,
PILImageResampling,
infer_channel_dimension_format,
is_batched,
is_scaled_image,
to_numpy_array,
valid_images,
validate_kwargs,
validate_preprocess_arguments,
)
from ....utils import TensorType, logging
logger = logging.get_logger(__name__)
class EfficientFormerImageProcessor(BaseImageProcessor):
r"""
Constructs a EfficientFormer image processor.
Args:
do_resize (`bool`, *optional*, defaults to `True`):
Whether to resize the image's (height, width) dimensions to the specified `(size["height"],
size["width"])`. Can be overridden by the `do_resize` parameter in the `preprocess` method.
size (`dict`, *optional*, defaults to `{"height": 224, "width": 224}`):
Size of the output image after resizing. Can be overridden by the `size` parameter in the `preprocess`
method.
resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`):
Resampling filter to use if resizing the image. Can be overridden by the `resample` parameter in the
`preprocess` method.
do_center_crop (`bool`, *optional*, defaults to `True`):
Whether to center crop the image to the specified `crop_size`. Can be overridden by `do_center_crop` in the
`preprocess` method.
crop_size (`dict[str, int]` *optional*, defaults to 224):
Size of the output image after applying `center_crop`. Can be overridden by `crop_size` in the `preprocess`
method.
do_rescale (`bool`, *optional*, defaults to `True`):
Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the `do_rescale`
parameter in the `preprocess` method.
rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
Scale factor to use if rescaling the image. Can be overridden by the `rescale_factor` parameter in the
`preprocess` method.
do_normalize:
Whether to normalize the image. Can be overridden by the `do_normalize` parameter in the `preprocess`
method.
image_mean (`float` or `list[float]`, *optional*, defaults to `IMAGENET_STANDARD_MEAN`):
Mean to use if normalizing the image. This is a float or list of floats the length of the number of
channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method.
image_std (`float` or `list[float]`, *optional*, defaults to `IMAGENET_STANDARD_STD`):
Standard deviation to use if normalizing the image. This is a float or list of floats the length of the
number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method.
"""
model_input_names = ["pixel_values"]
def __init__(
self,
do_resize: bool = True,
size: Optional[dict[str, int]] = None,
resample: PILImageResampling = PILImageResampling.BICUBIC,
do_center_crop: bool = True,
do_rescale: bool = True,
rescale_factor: Union[int, float] = 1 / 255,
crop_size: Optional[dict[str, int]] = None,
do_normalize: bool = True,
image_mean: Optional[Union[float, list[float]]] = None,
image_std: Optional[Union[float, list[float]]] = None,
**kwargs,
) -> None:
super().__init__(**kwargs)
size = size if size is not None else {"height": 224, "width": 224}
size = get_size_dict(size)
crop_size = crop_size if crop_size is not None else {"height": 224, "width": 224}
crop_size = get_size_dict(crop_size, default_to_square=True, param_name="crop_size")
self.do_resize = do_resize
self.do_rescale = do_rescale
self.do_normalize = do_normalize
self.do_center_crop = do_center_crop
self.crop_size = crop_size
self.size = size
self.resample = resample
self.rescale_factor = rescale_factor
self.image_mean = image_mean if image_mean is not None else IMAGENET_DEFAULT_MEAN
self.image_std = image_std if image_std is not None else IMAGENET_DEFAULT_STD
self._valid_processor_keys = [
"images",
"do_resize",
"size",
"resample",
"do_center_crop",
"crop_size",
"do_rescale",
"rescale_factor",
"do_normalize",
"image_mean",
"image_std",
"return_tensors",
"data_format",
"input_data_format",
]
def resize(
self,
image: np.ndarray,
size: dict[str, int],
resample: PILImageResampling = PILImageResampling.BILINEAR,
data_format: Optional[Union[str, ChannelDimension]] = None,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
**kwargs,
) -> np.ndarray:
"""
Resize an image to `(size["height"], size["width"])`.
Args:
image (`np.ndarray`):
Image to resize.
size (`dict[str, int]`):
Dictionary in the format `{"height": int, "width": int}` specifying the size of the output image.
resample:
`PILImageResampling` filter to use when resizing the image e.g. `PILImageResampling.BILINEAR`.
data_format (`ChannelDimension` or `str`, *optional*):
The channel dimension format for the output image. If unset, the channel dimension format of the input
image is used. Can be one of:
- `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
- `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
input_data_format (`ChannelDimension` or `str`, *optional*):
The channel dimension format of the input image. If not provided, it will be inferred.
Returns:
`np.ndarray`: The resized image.
"""
size = get_size_dict(size)
if "shortest_edge" in size:
size = get_resize_output_image_size(
image, size=size["shortest_edge"], default_to_square=False, input_data_format=input_data_format
)
# size = get_resize_output_image_size(image, size["shortest_edge"], size["longest_edge"])
elif "height" in size and "width" in size:
size = (size["height"], size["width"])
else:
raise ValueError(f"Size must contain 'height' and 'width' keys or 'shortest_edge' key. Got {size.keys()}")
return resize(
image, size=size, resample=resample, data_format=data_format, input_data_format=input_data_format, **kwargs
)
def preprocess(
self,
images: ImageInput,
do_resize: Optional[bool] = None,
size: Optional[dict[str, int]] = None,
resample: Optional[PILImageResampling] = None,
do_center_crop: Optional[bool] = None,
crop_size: Optional[int] = None,
do_rescale: Optional[bool] = None,
rescale_factor: Optional[float] = None,
do_normalize: Optional[bool] = None,
image_mean: Optional[Union[float, list[float]]] = None,
image_std: Optional[Union[float, list[float]]] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
**kwargs,
) -> BatchFeature:
"""
Preprocess an image or batch of images.
Args:
images (`ImageInput`):
Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If
passing in images with pixel values between 0 and 1, set `do_rescale=False`.
do_resize (`bool`, *optional*, defaults to `self.do_resize`):
Whether to resize the image.
size (`dict[str, int]`, *optional*, defaults to `self.size`):
Dictionary in the format `{"height": h, "width": w}` specifying the size of the output image after
resizing.
resample (`PILImageResampling` filter, *optional*, defaults to `self.resample`):
`PILImageResampling` filter to use if resizing the image e.g. `PILImageResampling.BILINEAR`. Only has
an effect if `do_resize` is set to `True`.
do_center_crop (`bool`, *optional*, defaults to `self.do_center_crop`):
Whether to center crop the image.
do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
Whether to rescale the image values between [0 - 1].
rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
Rescale factor to rescale the image by if `do_rescale` is set to `True`.
crop_size (`dict[str, int]`, *optional*, defaults to `self.crop_size`):
Size of the center crop. Only has an effect if `do_center_crop` is set to `True`.
do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
Whether to normalize the image.
image_mean (`float` or `list[float]`, *optional*, defaults to `self.image_mean`):
Image mean to use if `do_normalize` is set to `True`.
image_std (`float` or `list[float]`, *optional*, defaults to `self.image_std`):
Image standard deviation to use if `do_normalize` is set to `True`.
return_tensors (`str` or `TensorType`, *optional*):
The type of tensors to return. Can be one of:
- Unset: Return a list of `np.ndarray`.
- `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
- `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
The channel dimension format for the output image. Can be one of:
- `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
- `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
- Unset: Use the channel dimension format of the input image.
input_data_format (`ChannelDimension` or `str`, *optional*):
The channel dimension format for the input image. If unset, the channel dimension format is inferred
from the input image. Can be one of:
- `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
- `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
- `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
"""
do_resize = do_resize if do_resize is not None else self.do_resize
do_rescale = do_rescale if do_rescale is not None else self.do_rescale
do_normalize = do_normalize if do_normalize is not None else self.do_normalize
do_center_crop = do_center_crop if do_center_crop is not None else self.do_center_crop
crop_size = crop_size if crop_size is not None else self.crop_size
crop_size = get_size_dict(crop_size, param_name="crop_size", default_to_square=True)
resample = resample if resample is not None else self.resample
rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
image_mean = image_mean if image_mean is not None else self.image_mean
image_std = image_std if image_std is not None else self.image_std
size = size if size is not None else self.size
size_dict = get_size_dict(size)
validate_kwargs(captured_kwargs=kwargs.keys(), valid_processor_keys=self._valid_processor_keys)
if not is_batched(images):
images = [images]
if not valid_images(images):
raise ValueError("Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, or torch.Tensor")
validate_preprocess_arguments(
do_rescale=do_rescale,
rescale_factor=rescale_factor,
do_normalize=do_normalize,
image_mean=image_mean,
image_std=image_std,
do_center_crop=do_center_crop,
crop_size=crop_size,
do_resize=do_resize,
size=size,
resample=resample,
)
# All transformations expect numpy arrays.
images = [to_numpy_array(image) for image in images]
if do_rescale and is_scaled_image(images[0]):
logger.warning_once(
"It looks like you are trying to rescale already rescaled images. If the input"
" images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
)
if input_data_format is None:
# We assume that all images have the same channel dimension format.
input_data_format = infer_channel_dimension_format(images[0])
if do_resize:
images = [
self.resize(image=image, size=size_dict, resample=resample, input_data_format=input_data_format)
for image in images
]
if do_center_crop:
images = [
self.center_crop(image=image, size=crop_size, input_data_format=input_data_format) for image in images
]
if do_rescale:
images = [
self.rescale(image=image, scale=rescale_factor, input_data_format=input_data_format)
for image in images
]
if do_normalize:
images = [
self.normalize(image=image, mean=image_mean, std=image_std, input_data_format=input_data_format)
for image in images
]
images = [
to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) for image in images
]
data = {"pixel_values": images}
return BatchFeature(data=data, tensor_type=return_tensors)
__all__ = ["EfficientFormerImageProcessor"]

View File

@ -0,0 +1,771 @@
# coding=utf-8
# Copyright 2022 Snapchat Research and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""PyTorch EfficientFormer model."""
import itertools
from dataclasses import dataclass
from typing import Optional, Union
import torch
from torch import nn
from ....activations import ACT2FN
from ....modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling, ImageClassifierOutput
from ....modeling_utils import PreTrainedModel
from ....utils import (
ModelOutput,
add_code_sample_docstrings,
add_start_docstrings,
add_start_docstrings_to_model_forward,
logging,
)
from .configuration_efficientformer import EfficientFormerConfig
logger = logging.get_logger(__name__)
# General docstring
_CONFIG_FOR_DOC = "EfficientFormerConfig"
# Base docstring
_CHECKPOINT_FOR_DOC = "snap-research/efficientformer-l1-300"
_EXPECTED_OUTPUT_SHAPE = [1, 49, 448]
# Image classification docstring
_IMAGE_CLASS_CHECKPOINT = "snap-research/efficientformer-l1-300"
_IMAGE_CLASS_EXPECTED_OUTPUT = "Egyptian cat"
class EfficientFormerPatchEmbeddings(nn.Module):
"""
This class performs downsampling between two stages. For the input tensor with the shape [batch_size, num_channels,
height, width] it produces output tensor with the shape [batch_size, num_channels, height/stride, width/stride]
"""
def __init__(self, config: EfficientFormerConfig, num_channels: int, embed_dim: int, apply_norm: bool = True):
super().__init__()
self.num_channels = num_channels
self.projection = nn.Conv2d(
num_channels,
embed_dim,
kernel_size=config.downsample_patch_size,
stride=config.downsample_stride,
padding=config.downsample_pad,
)
self.norm = nn.BatchNorm2d(embed_dim, eps=config.batch_norm_eps) if apply_norm else nn.Identity()
def forward(self, pixel_values: torch.Tensor) -> torch.Tensor:
batch_size, num_channels, height, width = pixel_values.shape
if num_channels != self.num_channels:
raise ValueError(
"Make sure that the channel dimension of the pixel values match with the one set in the configuration."
)
embeddings = self.projection(pixel_values)
embeddings = self.norm(embeddings)
return embeddings
class EfficientFormerSelfAttention(nn.Module):
def __init__(self, dim: int, key_dim: int, num_heads: int, attention_ratio: int, resolution: int):
super().__init__()
self.num_heads = num_heads
self.key_dim = key_dim
self.attention_ratio = attention_ratio
self.scale = key_dim**-0.5
self.total_key_dim = key_dim * num_heads
self.expanded_key_dim = int(attention_ratio * key_dim)
self.total_expanded_key_dim = int(self.expanded_key_dim * num_heads)
hidden_size = self.total_expanded_key_dim + self.total_key_dim * 2
self.qkv = nn.Linear(dim, hidden_size)
self.projection = nn.Linear(self.total_expanded_key_dim, dim)
points = list(itertools.product(range(resolution), range(resolution)))
num_points = len(points)
attention_offsets = {}
idxs = []
for point_1 in points:
for point_2 in points:
offset = (abs(point_1[0] - point_2[0]), abs(point_1[1] - point_2[1]))
if offset not in attention_offsets:
attention_offsets[offset] = len(attention_offsets)
idxs.append(attention_offsets[offset])
self.attention_biases = torch.nn.Parameter(torch.zeros(num_heads, len(attention_offsets)))
self.register_buffer("attention_bias_idxs", torch.LongTensor(idxs).view(num_points, num_points))
@torch.no_grad()
def train(self, mode=True):
super().train(mode)
if mode and hasattr(self, "ab"):
del self.ab
else:
self.ab = self.attention_biases[:, self.attention_bias_idxs]
def forward(self, hidden_states: torch.Tensor, output_attentions: bool = False) -> tuple[torch.Tensor]:
batch_size, sequence_length, num_channels = hidden_states.shape
qkv = self.qkv(hidden_states)
query_layer, key_layer, value_layer = qkv.reshape(batch_size, sequence_length, self.num_heads, -1).split(
[self.key_dim, self.key_dim, self.expanded_key_dim], dim=3
)
query_layer = query_layer.permute(0, 2, 1, 3)
key_layer = key_layer.permute(0, 2, 1, 3)
value_layer = value_layer.permute(0, 2, 1, 3)
# set `model.to(torch_device)` won't change `self.ab.device`, if there is no follow-up `train` or `eval` call.
# Let's do it manually here, so users won't have to do this everytime.
if not self.training:
self.ab = self.ab.to(self.attention_biases.device)
attention_probs = (torch.matmul(query_layer, key_layer.transpose(-2, -1))) * self.scale + (
self.attention_biases[:, self.attention_bias_idxs] if self.training else self.ab
)
attention_probs = attention_probs.softmax(dim=-1)
context_layer = torch.matmul(attention_probs, value_layer).transpose(1, 2)
context_layer = context_layer.reshape(batch_size, sequence_length, self.total_expanded_key_dim)
context_layer = self.projection(context_layer)
outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)
return outputs
class EfficientFormerConvStem(nn.Module):
def __init__(self, config: EfficientFormerConfig, out_channels: int):
super().__init__()
self.convolution1 = nn.Conv2d(config.num_channels, out_channels // 2, kernel_size=3, stride=2, padding=1)
self.batchnorm_before = nn.BatchNorm2d(out_channels // 2, eps=config.batch_norm_eps)
self.convolution2 = nn.Conv2d(out_channels // 2, out_channels, kernel_size=3, stride=2, padding=1)
self.batchnorm_after = nn.BatchNorm2d(out_channels, eps=config.batch_norm_eps)
self.activation = nn.ReLU()
def forward(self, pixel_values: torch.Tensor) -> torch.Tensor:
features = self.batchnorm_before(self.convolution1(pixel_values))
features = self.activation(features)
features = self.batchnorm_after(self.convolution2(features))
features = self.activation(features)
return features
class EfficientFormerPooling(nn.Module):
def __init__(self, pool_size: int):
super().__init__()
self.pool = nn.AvgPool2d(pool_size, stride=1, padding=pool_size // 2, count_include_pad=False)
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
output = self.pool(hidden_states) - hidden_states
return output
class EfficientFormerDenseMlp(nn.Module):
def __init__(
self,
config: EfficientFormerConfig,
in_features: int,
hidden_features: Optional[int] = None,
out_features: Optional[int] = None,
):
super().__init__()
out_features = out_features or in_features
hidden_features = hidden_features or in_features
self.linear_in = nn.Linear(in_features, hidden_features)
self.activation = ACT2FN[config.hidden_act]
self.dropout = nn.Dropout(config.hidden_dropout_prob)
self.linear_out = nn.Linear(hidden_features, out_features)
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
hidden_states = self.linear_in(hidden_states)
hidden_states = self.activation(hidden_states)
hidden_states = self.dropout(hidden_states)
hidden_states = self.linear_out(hidden_states)
hidden_states = self.dropout(hidden_states)
return hidden_states
class EfficientFormerConvMlp(nn.Module):
def __init__(
self,
config: EfficientFormerConfig,
in_features: int,
hidden_features: Optional[int] = None,
out_features: Optional[int] = None,
drop: float = 0.0,
):
super().__init__()
out_features = out_features or in_features
hidden_features = hidden_features or in_features
self.convolution1 = nn.Conv2d(in_features, hidden_features, 1)
self.activation = ACT2FN[config.hidden_act]
self.convolution2 = nn.Conv2d(hidden_features, out_features, 1)
self.dropout = nn.Dropout(drop)
self.batchnorm_before = nn.BatchNorm2d(hidden_features, eps=config.batch_norm_eps)
self.batchnorm_after = nn.BatchNorm2d(out_features, eps=config.batch_norm_eps)
def forward(self, hidden_state: torch.Tensor) -> torch.Tensor:
hidden_state = self.convolution1(hidden_state)
hidden_state = self.batchnorm_before(hidden_state)
hidden_state = self.activation(hidden_state)
hidden_state = self.dropout(hidden_state)
hidden_state = self.convolution2(hidden_state)
hidden_state = self.batchnorm_after(hidden_state)
hidden_state = self.dropout(hidden_state)
return hidden_state
def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor:
"""
Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
"""
if drop_prob == 0.0 or not training:
return input
keep_prob = 1 - drop_prob
shape = (input.shape[0],) + (1,) * (input.ndim - 1) # work with diff dim tensors, not just 2D ConvNets
random_tensor = keep_prob + torch.rand(shape, dtype=input.dtype, device=input.device)
random_tensor.floor_() # binarize
output = input.div(keep_prob) * random_tensor
return output
class EfficientFormerDropPath(nn.Module):
"""Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""
def __init__(self, drop_prob: Optional[float] = None) -> None:
super().__init__()
self.drop_prob = drop_prob
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
return drop_path(hidden_states, self.drop_prob, self.training)
def extra_repr(self) -> str:
return f"p={self.drop_prob}"
class EfficientFormerFlat(nn.Module):
def __init__(self):
super().__init__()
def forward(self, hidden_states: torch.Tensor) -> tuple[torch.Tensor]:
hidden_states = hidden_states.flatten(2).transpose(1, 2)
return hidden_states
class EfficientFormerMeta3D(nn.Module):
def __init__(self, config: EfficientFormerConfig, dim: int, drop_path: float = 0.0):
super().__init__()
self.token_mixer = EfficientFormerSelfAttention(
dim=config.dim,
key_dim=config.key_dim,
num_heads=config.num_attention_heads,
attention_ratio=config.attention_ratio,
resolution=config.resolution,
)
self.layernorm1 = nn.LayerNorm(dim, eps=config.layer_norm_eps)
self.layernorm2 = nn.LayerNorm(dim, eps=config.layer_norm_eps)
mlp_hidden_dim = int(dim * config.mlp_expansion_ratio)
self.mlp = EfficientFormerDenseMlp(config, in_features=dim, hidden_features=mlp_hidden_dim)
self.drop_path = EfficientFormerDropPath(drop_path) if drop_path > 0.0 else nn.Identity()
self.use_layer_scale = config.use_layer_scale
if config.use_layer_scale:
self.layer_scale_1 = nn.Parameter(config.layer_scale_init_value * torch.ones(dim), requires_grad=True)
self.layer_scale_2 = nn.Parameter(config.layer_scale_init_value * torch.ones(dim), requires_grad=True)
def forward(self, hidden_states: torch.Tensor, output_attentions: bool = False) -> tuple[torch.Tensor]:
self_attention_outputs = self.token_mixer(self.layernorm1(hidden_states), output_attentions)
attention_output = self_attention_outputs[0]
outputs = self_attention_outputs[1:] # add self attentions if we output attention weights
if self.use_layer_scale:
layer_output = hidden_states + self.drop_path(
self.layer_scale_1.unsqueeze(0).unsqueeze(0) * attention_output
)
layer_output = layer_output + self.drop_path(
self.layer_scale_2.unsqueeze(0).unsqueeze(0) * self.mlp(self.layernorm2(layer_output))
)
else:
layer_output = hidden_states + self.drop_path(attention_output)
layer_output = layer_output + self.drop_path(self.mlp(self.layernorm2(layer_output)))
outputs = (layer_output,) + outputs
return outputs
class EfficientFormerMeta3DLayers(nn.Module):
def __init__(self, config: EfficientFormerConfig):
super().__init__()
drop_paths = [
config.drop_path_rate * (block_idx + sum(config.depths[:-1]))
for block_idx in range(config.num_meta3d_blocks)
]
self.blocks = nn.ModuleList(
[EfficientFormerMeta3D(config, config.hidden_sizes[-1], drop_path=drop_path) for drop_path in drop_paths]
)
def forward(self, hidden_states: torch.Tensor, output_attentions: bool = False) -> tuple[torch.Tensor]:
all_attention_outputs = () if output_attentions else None
for layer_module in self.blocks:
if isinstance(hidden_states, tuple):
hidden_states = hidden_states[0]
hidden_states = layer_module(hidden_states, output_attentions)
if output_attentions:
all_attention_outputs = all_attention_outputs + (hidden_states[1],)
if output_attentions:
outputs = (hidden_states[0],) + all_attention_outputs
return outputs
return hidden_states
class EfficientFormerMeta4D(nn.Module):
def __init__(self, config: EfficientFormerConfig, dim: int, drop_path: float = 0.0):
super().__init__()
pool_size = config.pool_size if config.pool_size is not None else 3
self.token_mixer = EfficientFormerPooling(pool_size=pool_size)
mlp_hidden_dim = int(dim * config.mlp_expansion_ratio)
self.mlp = EfficientFormerConvMlp(
config, in_features=dim, hidden_features=mlp_hidden_dim, drop=config.hidden_dropout_prob
)
self.drop_path = EfficientFormerDropPath(drop_path) if drop_path > 0.0 else nn.Identity()
self.use_layer_scale = config.use_layer_scale
if config.use_layer_scale:
self.layer_scale_1 = nn.Parameter(config.layer_scale_init_value * torch.ones(dim), requires_grad=True)
self.layer_scale_2 = nn.Parameter(config.layer_scale_init_value * torch.ones(dim), requires_grad=True)
def forward(self, hidden_states: torch.Tensor) -> tuple[torch.Tensor]:
outputs = self.token_mixer(hidden_states)
if self.use_layer_scale:
layer_output = hidden_states + self.drop_path(self.layer_scale_1.unsqueeze(-1).unsqueeze(-1) * outputs)
layer_output = layer_output + self.drop_path(
self.layer_scale_2.unsqueeze(-1).unsqueeze(-1) * self.mlp(layer_output)
)
else:
layer_output = hidden_states + self.drop_path(outputs)
layer_output = layer_output + self.drop_path(self.mlp(layer_output))
return layer_output
class EfficientFormerMeta4DLayers(nn.Module):
def __init__(self, config: EfficientFormerConfig, stage_idx: int):
super().__init__()
num_layers = (
config.depths[stage_idx] if stage_idx != -1 else config.depths[stage_idx] - config.num_meta3d_blocks
)
drop_paths = [
config.drop_path_rate * (block_idx + sum(config.depths[:stage_idx])) for block_idx in range(num_layers)
]
self.blocks = nn.ModuleList(
[
EfficientFormerMeta4D(config, config.hidden_sizes[stage_idx], drop_path=drop_path)
for drop_path in drop_paths
]
)
def forward(self, hidden_states: torch.Tensor) -> tuple[torch.Tensor]:
for layer_module in self.blocks:
hidden_states = layer_module(hidden_states)
return hidden_states
class EfficientFormerIntermediateStage(nn.Module):
def __init__(self, config: EfficientFormerConfig, index: int):
super().__init__()
self.meta4D_layers = EfficientFormerMeta4DLayers(config, index)
def forward(self, hidden_states: torch.Tensor) -> tuple[torch.Tensor]:
hidden_states = self.meta4D_layers(hidden_states)
return hidden_states
class EfficientFormerLastStage(nn.Module):
def __init__(self, config: EfficientFormerConfig):
super().__init__()
self.meta4D_layers = EfficientFormerMeta4DLayers(config, -1)
self.flat = EfficientFormerFlat()
self.meta3D_layers = EfficientFormerMeta3DLayers(config)
def forward(self, hidden_states: torch.Tensor, output_attentions: bool = False) -> tuple[torch.Tensor]:
hidden_states = self.meta4D_layers(hidden_states)
hidden_states = self.flat(hidden_states)
hidden_states = self.meta3D_layers(hidden_states, output_attentions)
return hidden_states
class EfficientFormerEncoder(nn.Module):
def __init__(self, config: EfficientFormerConfig):
super().__init__()
self.config = config
num_intermediate_stages = len(config.depths) - 1
downsamples = [
config.downsamples[i] or config.hidden_sizes[i] != config.hidden_sizes[i + 1]
for i in range(num_intermediate_stages)
]
intermediate_stages = []
for i in range(num_intermediate_stages):
intermediate_stages.append(EfficientFormerIntermediateStage(config, i))
if downsamples[i]:
intermediate_stages.append(
EfficientFormerPatchEmbeddings(config, config.hidden_sizes[i], config.hidden_sizes[i + 1])
)
self.intermediate_stages = nn.ModuleList(intermediate_stages)
self.last_stage = EfficientFormerLastStage(config)
def forward(
self,
hidden_states: torch.Tensor,
output_hidden_states: bool = False,
output_attentions: bool = False,
return_dict: bool = True,
) -> BaseModelOutput:
all_hidden_states = () if output_hidden_states else None
all_self_attentions = () if output_attentions else None
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
for layer_module in self.intermediate_stages:
hidden_states = layer_module(hidden_states)
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
layer_output = self.last_stage(hidden_states, output_attentions=output_attentions)
if output_attentions:
all_self_attentions = all_self_attentions + layer_output[1:]
if output_hidden_states:
all_hidden_states = all_hidden_states + (layer_output[0],)
if not return_dict:
return tuple(v for v in [layer_output[0], all_hidden_states, all_self_attentions] if v is not None)
return BaseModelOutput(
last_hidden_state=layer_output[0],
hidden_states=all_hidden_states,
attentions=all_self_attentions,
)
class EfficientFormerPreTrainedModel(PreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
config: EfficientFormerConfig
base_model_prefix = "efficientformer"
main_input_name = "pixel_values"
supports_gradient_checkpointing = False
EFFICIENTFORMER_START_DOCSTRING = r"""
This model is a PyTorch [nn.Module](https://pytorch.org/docs/stable/nn.html#nn.Module) subclass. Use it as a
regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and behavior.
Parameters:
config ([`EfficientFormerConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
EFFICIENTFORMER_INPUTS_DOCSTRING = r"""
Args:
pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
Pixel values. Pixel values can be obtained using [`ViTImageProcessor`]. See
[`ViTImageProcessor.preprocess`] for details.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
@add_start_docstrings(
"The bare EfficientFormer Model transformer outputting raw hidden-states without any specific head on top.",
EFFICIENTFORMER_START_DOCSTRING,
)
class EfficientFormerModel(EfficientFormerPreTrainedModel):
def __init__(self, config: EfficientFormerConfig):
super().__init__(config)
self.config = config
_no_split_modules = ["EfficientFormerMeta4D"]
self.patch_embed = EfficientFormerConvStem(config, config.hidden_sizes[0])
self.encoder = EfficientFormerEncoder(config)
self.layernorm = nn.LayerNorm(config.hidden_sizes[-1], eps=config.layer_norm_eps)
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(EFFICIENTFORMER_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=BaseModelOutputWithPooling,
config_class=_CONFIG_FOR_DOC,
modality="vision",
expected_output=_EXPECTED_OUTPUT_SHAPE,
)
def forward(
self,
pixel_values: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[tuple, BaseModelOutput]:
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
if pixel_values is None:
raise ValueError("You have to specify pixel_values")
embedding_output = self.patch_embed(pixel_values)
encoder_outputs = self.encoder(
embedding_output, output_attentions=output_attentions, output_hidden_states=output_hidden_states
)
sequence_output = encoder_outputs[0]
sequence_output = self.layernorm(sequence_output)
if not return_dict:
head_outputs = (sequence_output,)
return head_outputs + encoder_outputs[1:]
return BaseModelOutput(
last_hidden_state=sequence_output,
hidden_states=encoder_outputs.hidden_states,
attentions=encoder_outputs.attentions,
)
@add_start_docstrings(
"""
EfficientFormer Model transformer with an image classification head on top (a linear layer on top of the final
hidden state of the [CLS] token) e.g. for ImageNet.
""",
EFFICIENTFORMER_START_DOCSTRING,
)
class EfficientFormerForImageClassification(EfficientFormerPreTrainedModel):
def __init__(self, config: EfficientFormerConfig):
super().__init__(config)
self.num_labels = config.num_labels
self.efficientformer = EfficientFormerModel(config)
# Classifier head
self.classifier = (
nn.Linear(config.hidden_sizes[-1], config.num_labels) if config.num_labels > 0 else nn.Identity()
)
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(EFFICIENTFORMER_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_IMAGE_CLASS_CHECKPOINT,
output_type=ImageClassifierOutput,
config_class=_CONFIG_FOR_DOC,
expected_output=_IMAGE_CLASS_EXPECTED_OUTPUT,
)
def forward(
self,
pixel_values: Optional[torch.Tensor] = None,
labels: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[tuple, ImageClassifierOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.efficientformer(
pixel_values,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
sequence_output = outputs[0]
logits = self.classifier(sequence_output.mean(-2))
loss = None
if labels is not None:
loss = self.loss_function(labels, logits, self.config)
if not return_dict:
output = (logits,) + outputs[1:]
return ((loss,) + output) if loss is not None else output
return ImageClassifierOutput(
loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
@dataclass
class EfficientFormerForImageClassificationWithTeacherOutput(ModelOutput):
"""
Output type of [`EfficientFormerForImageClassificationWithTeacher`].
Args:
logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
Prediction scores as the average of the cls_logits and distillation logits.
cls_logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
Prediction scores of the classification head (i.e. the linear layer on top of the final hidden state of the
class token).
distillation_logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
Prediction scores of the distillation head (i.e. the linear layer on top of the final hidden state of the
distillation token).
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer
plus the initial embedding outputs.
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in
the self-attention heads.
"""
logits: Optional[torch.FloatTensor] = None
cls_logits: Optional[torch.FloatTensor] = None
distillation_logits: Optional[torch.FloatTensor] = None
hidden_states: Optional[tuple[torch.FloatTensor]] = None
attentions: Optional[tuple[torch.FloatTensor]] = None
@add_start_docstrings(
"""
EfficientFormer Model transformer with image classification heads on top (a linear layer on top of the final hidden
state of the [CLS] token and a linear layer on top of the final hidden state of the distillation token) e.g. for
ImageNet.
<Tip warning={true}>
This model supports inference-only. Fine-tuning with distillation (i.e. with a teacher) is not yet
supported.
</Tip>
""",
EFFICIENTFORMER_START_DOCSTRING,
)
class EfficientFormerForImageClassificationWithTeacher(EfficientFormerPreTrainedModel):
def __init__(self, config: EfficientFormerConfig):
super().__init__(config)
self.num_labels = config.num_labels
self.efficientformer = EfficientFormerModel(config)
# Classifier head
self.classifier = nn.Linear(config.hidden_size, config.num_labels) if config.num_labels > 0 else nn.Identity()
# Distillation head
self.distillation_classifier = (
nn.Linear(config.hidden_size, config.num_labels) if config.num_labels > 0 else nn.Identity()
)
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(EFFICIENTFORMER_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_IMAGE_CLASS_CHECKPOINT,
output_type=EfficientFormerForImageClassificationWithTeacherOutput,
config_class=_CONFIG_FOR_DOC,
expected_output=_IMAGE_CLASS_EXPECTED_OUTPUT,
)
def forward(
self,
pixel_values: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[tuple, EfficientFormerForImageClassificationWithTeacherOutput]:
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.efficientformer(
pixel_values,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
sequence_output = outputs[0]
cls_logits = self.classifier(sequence_output.mean(-2))
distillation_logits = self.distillation_classifier(sequence_output.mean(-2))
# during inference, return the average of both classifier predictions
logits = (cls_logits + distillation_logits) / 2
if not return_dict:
output = (logits, cls_logits, distillation_logits) + outputs[1:]
return output
return EfficientFormerForImageClassificationWithTeacherOutput(
logits=logits,
cls_logits=cls_logits,
distillation_logits=distillation_logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
__all__ = [
"EfficientFormerForImageClassification",
"EfficientFormerForImageClassificationWithTeacher",
"EfficientFormerModel",
"EfficientFormerPreTrainedModel",
]

View File

@ -0,0 +1,28 @@
# Copyright 2023 The HuggingFace and Baidu Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import TYPE_CHECKING
from ....utils import _LazyModule
from ....utils.import_utils import define_import_structure
if TYPE_CHECKING:
from .configuration_ernie_m import *
from .modeling_ernie_m import *
from .tokenization_ernie_m import *
else:
import sys
_file = globals()["__file__"]
sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)

View File

@ -0,0 +1,112 @@
# coding=utf-8
# Copyright 2023 Xuan Ouyang, Shuohuan Wang, Chao Pang, Yu Sun, Hao Tian, Hua Wu, Haifeng Wang and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""ErnieM model configuration"""
# Adapted from original paddlenlp repository.(https://github.com/PaddlePaddle/PaddleNLP/blob/develop/paddlenlp/transformers/ernie_m/configuration.py)
from __future__ import annotations
from ....configuration_utils import PreTrainedConfig
class ErnieMConfig(PreTrainedConfig):
r"""
This is the configuration class to store the configuration of a [`ErnieMModel`]. It is used to instantiate a
Ernie-M model according to the specified arguments, defining the model architecture. Instantiating a configuration
with the defaults will yield a similar configuration to that of the `Ernie-M`
[susnato/ernie-m-base_pytorch](https://huggingface.co/susnato/ernie-m-base_pytorch) architecture.
Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PreTrainedConfig`] for more information.
Args:
vocab_size (`int`, *optional*, defaults to 250002):
Vocabulary size of `inputs_ids` in [`ErnieMModel`]. Also is the vocab size of token embedding matrix.
Defines the number of different tokens that can be represented by the `inputs_ids` passed when calling
[`ErnieMModel`].
hidden_size (`int`, *optional*, defaults to 768):
Dimensionality of the embedding layer, encoder layers and pooler layer.
num_hidden_layers (`int`, *optional*, defaults to 12):
Number of hidden layers in the Transformer encoder.
num_attention_heads (`int`, *optional*, defaults to 12):
Number of attention heads for each attention layer in the Transformer encoder.
intermediate_size (`int`, *optional*, defaults to 3072):
Dimensionality of the feed-forward (ff) layer in the encoder. Input tensors to feed-forward layers are
firstly projected from hidden_size to intermediate_size, and then projected back to hidden_size. Typically
intermediate_size is larger than hidden_size.
hidden_act (`str`, *optional*, defaults to `"gelu"`):
The non-linear activation function in the feed-forward layer. `"gelu"`, `"relu"` and any other torch
supported activation functions are supported.
hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
The dropout probability for all fully connected layers in the embeddings and encoder.
attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
The dropout probability used in `MultiHeadAttention` in all encoder layers to drop some attention target.
max_position_embeddings (`int`, *optional*, defaults to 514):
The maximum value of the dimensionality of position encoding, which dictates the maximum supported length
of an input sequence.
initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the normal initializer for initializing all weight matrices. The index of padding
token in the token vocabulary.
pad_token_id (`int`, *optional*, defaults to 1):
Padding token id.
layer_norm_eps (`float`, *optional*, defaults to 1e-05):
The epsilon used by the layer normalization layers.
classifier_dropout (`float`, *optional*):
The dropout ratio for the classification head.
act_dropout (`float`, *optional*, defaults to 0.0):
This dropout probability is used in `ErnieMEncoderLayer` after activation.
A normal_initializer initializes weight matrices as normal distributions. See
`ErnieMPretrainedModel._init_weights()` for how weights are initialized in `ErnieMModel`.
"""
model_type = "ernie_m"
attribute_map: dict[str, str] = {"dropout": "classifier_dropout", "num_classes": "num_labels"}
def __init__(
self,
vocab_size: int = 250002,
hidden_size: int = 768,
num_hidden_layers: int = 12,
num_attention_heads: int = 12,
intermediate_size: int = 3072,
hidden_act: str = "gelu",
hidden_dropout_prob: float = 0.1,
attention_probs_dropout_prob: float = 0.1,
max_position_embeddings: int = 514,
initializer_range: float = 0.02,
pad_token_id: int = 1,
layer_norm_eps: float = 1e-05,
classifier_dropout=None,
act_dropout=0.0,
**kwargs,
):
super().__init__(pad_token_id=pad_token_id, **kwargs)
self.vocab_size = vocab_size
self.hidden_size = hidden_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.intermediate_size = intermediate_size
self.hidden_act = hidden_act
self.hidden_dropout_prob = hidden_dropout_prob
self.attention_probs_dropout_prob = attention_probs_dropout_prob
self.max_position_embeddings = max_position_embeddings
self.initializer_range = initializer_range
self.layer_norm_eps = layer_norm_eps
self.classifier_dropout = classifier_dropout
self.act_dropout = act_dropout
__all__ = ["ErnieMConfig"]

View File

@ -0,0 +1,987 @@
# coding=utf-8
# Copyright 2023 Xuan Ouyang, Shuohuan Wang, Chao Pang, Yu Sun, Hao Tian, Hua Wu, Haifeng Wang The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""PyTorch ErnieM model."""
import math
from typing import Optional, Union
import torch
from torch import nn, tensor
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
from ....activations import ACT2FN
from ....cache_utils import Cache
from ....modeling_outputs import (
BaseModelOutputWithPastAndCrossAttentions,
BaseModelOutputWithPoolingAndCrossAttentions,
MultipleChoiceModelOutput,
QuestionAnsweringModelOutput,
SequenceClassifierOutput,
TokenClassifierOutput,
)
from ....modeling_utils import PreTrainedModel
from ....utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, logging
from .configuration_ernie_m import ErnieMConfig
logger = logging.get_logger(__name__)
_CHECKPOINT_FOR_DOC = "susnato/ernie-m-base_pytorch"
_CONFIG_FOR_DOC = "ErnieMConfig"
_TOKENIZER_FOR_DOC = "ErnieMTokenizer"
# Adapted from paddlenlp.transformers.ernie_m.modeling.ErnieEmbeddings
class ErnieMEmbeddings(nn.Module):
"""Construct the embeddings from word and position embeddings."""
def __init__(self, config):
super().__init__()
self.hidden_size = config.hidden_size
self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
self.position_embeddings = nn.Embedding(
config.max_position_embeddings, config.hidden_size, padding_idx=config.pad_token_id
)
self.layer_norm = nn.LayerNorm(normalized_shape=config.hidden_size, eps=config.layer_norm_eps)
self.dropout = nn.Dropout(p=config.hidden_dropout_prob)
self.padding_idx = config.pad_token_id
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
inputs_embeds: Optional[torch.LongTensor] = None,
past_key_values_length: int = 0,
) -> torch.Tensor:
if inputs_embeds is None:
inputs_embeds = self.word_embeddings(input_ids)
if position_ids is None:
input_shape = inputs_embeds.size()[:-1]
ones = torch.ones(input_shape, dtype=torch.int64, device=inputs_embeds.device)
seq_length = torch.cumsum(ones, dim=1)
position_ids = seq_length - ones
if past_key_values_length > 0:
position_ids = position_ids + past_key_values_length
# to mimic paddlenlp implementation
position_ids += 2
position_embeddings = self.position_embeddings(position_ids)
embeddings = inputs_embeds + position_embeddings
embeddings = self.layer_norm(embeddings)
embeddings = self.dropout(embeddings)
return embeddings
class ErnieMSelfAttention(nn.Module):
def __init__(self, config, position_embedding_type=None):
super().__init__()
if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
raise ValueError(
f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
f"heads ({config.num_attention_heads})"
)
self.num_attention_heads = config.num_attention_heads
self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
self.all_head_size = self.num_attention_heads * self.attention_head_size
self.q_proj = nn.Linear(config.hidden_size, self.all_head_size)
self.k_proj = nn.Linear(config.hidden_size, self.all_head_size)
self.v_proj = nn.Linear(config.hidden_size, self.all_head_size)
self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
self.position_embedding_type = position_embedding_type or getattr(
config, "position_embedding_type", "absolute"
)
if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
self.max_position_embeddings = config.max_position_embeddings
self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size)
self.is_decoder = config.is_decoder
def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor:
new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
x = x.view(new_x_shape)
return x.permute(0, 2, 1, 3)
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.FloatTensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.FloatTensor] = None,
past_key_values: Optional[Cache] = None,
output_attentions: Optional[bool] = False,
) -> tuple[torch.Tensor]:
mixed_query_layer = self.q_proj(hidden_states)
# If this is instantiated as a cross-attention module, the keys
# and values come from an encoder; the attention mask needs to be
# such that the encoder's padding tokens are not attended to.
is_cross_attention = encoder_hidden_states is not None
if is_cross_attention and past_key_values is not None:
# reuse k,v, cross_attentions
key_layer = past_key_values[0]
value_layer = past_key_values[1]
attention_mask = encoder_attention_mask
elif is_cross_attention:
key_layer = self.transpose_for_scores(self.k_proj(encoder_hidden_states))
value_layer = self.transpose_for_scores(self.v_proj(encoder_hidden_states))
attention_mask = encoder_attention_mask
elif past_key_values is not None:
key_layer = self.transpose_for_scores(self.k_proj(hidden_states))
value_layer = self.transpose_for_scores(self.v_proj(hidden_states))
key_layer = torch.cat([past_key_values[0], key_layer], dim=2)
value_layer = torch.cat([past_key_values[1], value_layer], dim=2)
else:
key_layer = self.transpose_for_scores(self.k_proj(hidden_states))
value_layer = self.transpose_for_scores(self.v_proj(hidden_states))
query_layer = self.transpose_for_scores(mixed_query_layer)
use_cache = past_key_values is not None
if self.is_decoder:
# if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states.
# Further calls to cross_attention layer can then reuse all cross-attention
# key/value_states (first "if" case)
# if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of
# all previous decoder key/value_states. Further calls to uni-directional self-attention
# can concat previous decoder key/value_states to current projected key/value_states (third "elif" case)
# if encoder bi-directional self-attention `past_key_values` is always `None`
past_key_values = (key_layer, value_layer)
# Take the dot product between "query" and "key" to get the raw attention scores.
attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
query_length, key_length = query_layer.shape[2], key_layer.shape[2]
if use_cache:
position_ids_l = torch.tensor(key_length - 1, dtype=torch.long, device=hidden_states.device).view(
-1, 1
)
else:
position_ids_l = torch.arange(query_length, dtype=torch.long, device=hidden_states.device).view(-1, 1)
position_ids_r = torch.arange(key_length, dtype=torch.long, device=hidden_states.device).view(1, -1)
distance = position_ids_l - position_ids_r
positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1)
positional_embedding = positional_embedding.to(dtype=query_layer.dtype) # fp16 compatibility
if self.position_embedding_type == "relative_key":
relative_position_scores = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding)
attention_scores = attention_scores + relative_position_scores
elif self.position_embedding_type == "relative_key_query":
relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding)
relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding)
attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key
attention_scores = attention_scores / math.sqrt(self.attention_head_size)
if attention_mask is not None:
# Apply the attention mask is (precomputed for all layers in ErnieMModel forward() function)
attention_scores = attention_scores + attention_mask
# Normalize the attention scores to probabilities.
attention_probs = nn.functional.softmax(attention_scores, dim=-1)
# This is actually dropping out entire tokens to attend to, which might
# seem a bit unusual, but is taken from the original Transformer paper.
attention_probs = self.dropout(attention_probs)
context_layer = torch.matmul(attention_probs, value_layer)
context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
context_layer = context_layer.view(new_context_layer_shape)
outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)
if self.is_decoder:
outputs = outputs + (past_key_values,)
return outputs
class ErnieMAttention(nn.Module):
def __init__(self, config, position_embedding_type=None):
super().__init__()
self.self_attn = ErnieMSelfAttention(config, position_embedding_type=position_embedding_type)
self.out_proj = nn.Linear(config.hidden_size, config.hidden_size)
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.FloatTensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.FloatTensor] = None,
past_key_values: Optional[Cache] = None,
output_attentions: Optional[bool] = False,
) -> tuple[torch.Tensor]:
self_outputs = self.self_attn(
hidden_states,
attention_mask,
encoder_hidden_states,
encoder_attention_mask,
past_key_values,
output_attentions,
)
attention_output = self.out_proj(self_outputs[0])
outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them
return outputs
class ErnieMEncoderLayer(nn.Module):
def __init__(self, config):
super().__init__()
# to mimic paddlenlp implementation
dropout = 0.1 if config.hidden_dropout_prob is None else config.hidden_dropout_prob
act_dropout = config.hidden_dropout_prob if config.act_dropout is None else config.act_dropout
self.self_attn = ErnieMAttention(config)
self.linear1 = nn.Linear(config.hidden_size, config.intermediate_size)
self.dropout = nn.Dropout(act_dropout)
self.linear2 = nn.Linear(config.intermediate_size, config.hidden_size)
self.norm1 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.norm2 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.dropout1 = nn.Dropout(dropout)
self.dropout2 = nn.Dropout(dropout)
if isinstance(config.hidden_act, str):
self.activation = ACT2FN[config.hidden_act]
else:
self.activation = config.hidden_act
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.FloatTensor] = None,
past_key_values: Optional[Cache] = None,
output_attentions: Optional[bool] = True,
):
residual = hidden_states
if output_attentions:
hidden_states, attention_opt_weights = self.self_attn(
hidden_states=hidden_states,
attention_mask=attention_mask,
past_key_values=past_key_values,
output_attentions=output_attentions,
)
else:
hidden_states = self.self_attn(
hidden_states=hidden_states,
attention_mask=attention_mask,
past_key_values=past_key_values,
output_attentions=output_attentions,
)
hidden_states = residual + self.dropout1(hidden_states)
hidden_states = self.norm1(hidden_states)
residual = hidden_states
hidden_states = self.linear1(hidden_states)
hidden_states = self.activation(hidden_states)
hidden_states = self.dropout(hidden_states)
hidden_states = self.linear2(hidden_states)
hidden_states = residual + self.dropout2(hidden_states)
hidden_states = self.norm2(hidden_states)
if output_attentions:
return hidden_states, attention_opt_weights
else:
return hidden_states
class ErnieMEncoder(nn.Module):
def __init__(self, config):
super().__init__()
self.config = config
self.layers = nn.ModuleList([ErnieMEncoderLayer(config) for _ in range(config.num_hidden_layers)])
def forward(
self,
input_embeds: torch.Tensor,
attention_mask: Optional[torch.FloatTensor] = None,
past_key_values: Optional[Cache] = None,
output_attentions: Optional[bool] = False,
output_hidden_states: Optional[bool] = False,
return_dict: Optional[bool] = True,
) -> Union[tuple[torch.Tensor], BaseModelOutputWithPastAndCrossAttentions]:
hidden_states = () if output_hidden_states else None
attentions = () if output_attentions else None
output = input_embeds
if output_hidden_states:
hidden_states = hidden_states + (output,)
for i, layer in enumerate(self.layers):
output, opt_attn_weights = layer(
hidden_states=output,
attention_mask=attention_mask,
past_key_values=past_key_values[i] if past_key_values is not None else None,
)
if output_hidden_states:
hidden_states = hidden_states + (output,)
if output_attentions:
attentions = attentions + (opt_attn_weights,)
last_hidden_state = output
if not return_dict:
return tuple(v for v in [last_hidden_state, hidden_states, attentions] if v is not None)
return BaseModelOutputWithPastAndCrossAttentions(
last_hidden_state=last_hidden_state, hidden_states=hidden_states, attentions=attentions
)
class ErnieMPooler(nn.Module):
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
self.activation = nn.Tanh()
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
# We "pool" the model by simply taking the hidden state corresponding
# to the first token.
first_token_tensor = hidden_states[:, 0]
pooled_output = self.dense(first_token_tensor)
pooled_output = self.activation(pooled_output)
return pooled_output
class ErnieMPreTrainedModel(PreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
config: ErnieMConfig
base_model_prefix = "ernie_m"
ERNIE_M_START_DOCSTRING = r"""
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use
it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
behavior.
Parameters:
config ([`ErnieMConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
ERNIE_M_INPUTS_DOCSTRING = r"""
Args:
input_ids (`torch.LongTensor` of shape `({0})`):
Indices of input sequence tokens in the vocabulary.
Indices can be obtained using [`ErnieMTokenizer`]. See [`PreTrainedTokenizer.encode`] and
[`PreTrainedTokenizer.__call__`] for details.
[What are input IDs?](../glossary#input-ids)
attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*):
Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
[What are attention masks?](../glossary#attention-mask)
position_ids (`torch.LongTensor` of shape `({0})`, *optional*):
Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
config.max_position_embeddings - 1]`.
[What are position IDs?](../glossary#position-ids)
inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*):
Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
is useful if you want more control over how to convert *input_ids* indices into associated vectors than the
model's internal embedding lookup matrix.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
@add_start_docstrings(
"The bare ErnieM Model transformer outputting raw hidden-states without any specific head on top.",
ERNIE_M_START_DOCSTRING,
)
class ErnieMModel(ErnieMPreTrainedModel):
def __init__(self, config, add_pooling_layer=True):
super().__init__(config)
self.initializer_range = config.initializer_range
self.embeddings = ErnieMEmbeddings(config)
self.encoder = ErnieMEncoder(config)
self.pooler = ErnieMPooler(config) if add_pooling_layer else None
self.post_init()
def get_input_embeddings(self):
return self.embeddings.word_embeddings
def set_input_embeddings(self, value):
self.embeddings.word_embeddings = value
def _prune_heads(self, heads_to_prune):
"""
Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
class PreTrainedModel
"""
for layer, heads in heads_to_prune.items():
self.encoder.layers[layer].self_attn.prune_heads(heads)
@add_start_docstrings_to_model_forward(ERNIE_M_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
processor_class=_TOKENIZER_FOR_DOC,
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=BaseModelOutputWithPastAndCrossAttentions,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[tensor] = None,
position_ids: Optional[tensor] = None,
attention_mask: Optional[tensor] = None,
inputs_embeds: Optional[tensor] = None,
past_key_values: Optional[tuple[tuple[tensor]]] = None,
use_cache: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
output_attentions: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[tuple[torch.FloatTensor], BaseModelOutputWithPoolingAndCrossAttentions]:
if input_ids is not None and inputs_embeds is not None:
raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time.")
# init the default bool value
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.return_dict
past_key_values_length = 0
if past_key_values is not None:
past_key_values_length = past_key_values.get_seq_length()
# Adapted from paddlenlp.transformers.ernie_m.ErnieMModel
if attention_mask is None:
attention_mask = (input_ids == self.config.pad_token_id).to(torch.float32)
attention_mask *= torch.finfo(attention_mask.dtype).min
if past_key_values is not None:
batch_size = past_key_values[0][0].shape[0]
past_mask = torch.zeros([batch_size, 1, 1, past_key_values_length], dtype=attention_mask.dtype)
attention_mask = torch.concat([past_mask, attention_mask], dim=-1)
# For 2D attention_mask from tokenizer
elif attention_mask.ndim == 2:
attention_mask = attention_mask.to(torch.float32)
attention_mask = 1.0 - attention_mask
attention_mask *= torch.finfo(attention_mask.dtype).min
extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(1)
embedding_output = self.embeddings(
input_ids=input_ids,
position_ids=position_ids,
inputs_embeds=inputs_embeds,
past_key_values_length=past_key_values_length,
)
encoder_outputs = self.encoder(
embedding_output,
attention_mask=extended_attention_mask,
past_key_values=past_key_values,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
if not return_dict:
sequence_output = encoder_outputs[0]
pooler_output = self.pooler(sequence_output) if self.pooler is not None else None
return (sequence_output, pooler_output) + encoder_outputs[1:]
sequence_output = encoder_outputs["last_hidden_state"]
pooler_output = self.pooler(sequence_output) if self.pooler is not None else None
hidden_states = None if not output_hidden_states else encoder_outputs["hidden_states"]
attentions = None if not output_attentions else encoder_outputs["attentions"]
return BaseModelOutputWithPoolingAndCrossAttentions(
last_hidden_state=sequence_output,
pooler_output=pooler_output,
hidden_states=hidden_states,
attentions=attentions,
)
@add_start_docstrings(
"""ErnieM Model transformer with a sequence classification/regression head on top (a linear layer on top of
the pooled output) e.g. for GLUE tasks.""",
ERNIE_M_START_DOCSTRING,
)
class ErnieMForSequenceClassification(ErnieMPreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
self.config = config
self.ernie_m = ErnieMModel(config)
classifier_dropout = (
config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
)
self.dropout = nn.Dropout(classifier_dropout)
self.classifier = nn.Linear(config.hidden_size, config.num_labels)
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(ERNIE_M_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
processor_class=_TOKENIZER_FOR_DOC,
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=SequenceClassifierOutput,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
past_key_values: Optional[Cache] = None,
use_cache: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
output_attentions: Optional[bool] = None,
return_dict: Optional[bool] = True,
labels: Optional[torch.Tensor] = None,
) -> Union[tuple[torch.FloatTensor], SequenceClassifierOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.ernie_m(
input_ids,
attention_mask=attention_mask,
position_ids=position_ids,
inputs_embeds=inputs_embeds,
past_key_values=past_key_values,
output_hidden_states=output_hidden_states,
output_attentions=output_attentions,
return_dict=return_dict,
)
pooled_output = outputs[1]
pooled_output = self.dropout(pooled_output)
logits = self.classifier(pooled_output)
loss = None
if labels is not None:
if self.config.problem_type is None:
if self.num_labels == 1:
self.config.problem_type = "regression"
elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
self.config.problem_type = "single_label_classification"
else:
self.config.problem_type = "multi_label_classification"
if self.config.problem_type == "regression":
loss_fct = MSELoss()
if self.num_labels == 1:
loss = loss_fct(logits.squeeze(), labels.squeeze())
else:
loss = loss_fct(logits, labels)
elif self.config.problem_type == "single_label_classification":
loss_fct = CrossEntropyLoss()
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
elif self.config.problem_type == "multi_label_classification":
loss_fct = BCEWithLogitsLoss()
loss = loss_fct(logits, labels)
if not return_dict:
output = (logits,) + outputs[2:]
return ((loss,) + output) if loss is not None else output
return SequenceClassifierOutput(
loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
@add_start_docstrings(
"""ErnieM Model with a multiple choice classification head on top (a linear layer on top of
the pooled output and a softmax) e.g. for RocStories/SWAG tasks.""",
ERNIE_M_START_DOCSTRING,
)
class ErnieMForMultipleChoice(ErnieMPreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.ernie_m = ErnieMModel(config)
classifier_dropout = (
config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
)
self.dropout = nn.Dropout(classifier_dropout)
self.classifier = nn.Linear(config.hidden_size, 1)
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(ERNIE_M_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=MultipleChoiceModelOutput,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
labels: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = True,
) -> Union[tuple[torch.FloatTensor], MultipleChoiceModelOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
`input_ids` above)
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None
attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None
position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None
inputs_embeds = (
inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1))
if inputs_embeds is not None
else None
)
outputs = self.ernie_m(
input_ids,
attention_mask=attention_mask,
position_ids=position_ids,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
pooled_output = outputs[1]
pooled_output = self.dropout(pooled_output)
logits = self.classifier(pooled_output)
reshaped_logits = logits.view(-1, num_choices)
loss = None
if labels is not None:
loss_fct = CrossEntropyLoss()
loss = loss_fct(reshaped_logits, labels)
if not return_dict:
output = (reshaped_logits,) + outputs[2:]
return ((loss,) + output) if loss is not None else output
return MultipleChoiceModelOutput(
loss=loss,
logits=reshaped_logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
@add_start_docstrings(
"""ErnieM Model with a token classification head on top (a linear layer on top of
the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks.""",
ERNIE_M_START_DOCSTRING,
)
class ErnieMForTokenClassification(ErnieMPreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
self.ernie_m = ErnieMModel(config, add_pooling_layer=False)
classifier_dropout = (
config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
)
self.dropout = nn.Dropout(classifier_dropout)
self.classifier = nn.Linear(config.hidden_size, config.num_labels)
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(ERNIE_M_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
processor_class=_TOKENIZER_FOR_DOC,
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=TokenClassifierOutput,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
past_key_values: Optional[Cache] = None,
output_hidden_states: Optional[bool] = None,
output_attentions: Optional[bool] = None,
return_dict: Optional[bool] = True,
labels: Optional[torch.Tensor] = None,
) -> Union[tuple[torch.FloatTensor], TokenClassifierOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.ernie_m(
input_ids,
attention_mask=attention_mask,
position_ids=position_ids,
inputs_embeds=inputs_embeds,
past_key_values=past_key_values,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
sequence_output = outputs[0]
sequence_output = self.dropout(sequence_output)
logits = self.classifier(sequence_output)
loss = None
if labels is not None:
loss_fct = CrossEntropyLoss()
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
if not return_dict:
output = (logits,) + outputs[2:]
return ((loss,) + output) if loss is not None else output
return TokenClassifierOutput(
loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
@add_start_docstrings(
"""ErnieM Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
layers on top of the hidden-states output to compute `span start logits` and `span end logits`).""",
ERNIE_M_START_DOCSTRING,
)
class ErnieMForQuestionAnswering(ErnieMPreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
self.ernie_m = ErnieMModel(config, add_pooling_layer=False)
self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(ERNIE_M_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
processor_class=_TOKENIZER_FOR_DOC,
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=QuestionAnsweringModelOutput,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
start_positions: Optional[torch.Tensor] = None,
end_positions: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = True,
) -> Union[tuple[torch.FloatTensor], QuestionAnsweringModelOutput]:
r"""
start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the start of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
are not taken into account for computing the loss.
end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the end of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
are not taken into account for computing the loss.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.ernie_m(
input_ids,
attention_mask=attention_mask,
position_ids=position_ids,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
sequence_output = outputs[0]
logits = self.qa_outputs(sequence_output)
start_logits, end_logits = logits.split(1, dim=-1)
start_logits = start_logits.squeeze(-1).contiguous()
end_logits = end_logits.squeeze(-1).contiguous()
total_loss = None
if start_positions is not None and end_positions is not None:
# If we are on multi-GPU, split add a dimension
if len(start_positions.size()) > 1:
start_positions = start_positions.squeeze(-1)
if len(end_positions.size()) > 1:
end_positions = end_positions.squeeze(-1)
# sometimes the start/end positions are outside our model inputs, we ignore these terms
ignored_index = start_logits.size(1)
start_positions = start_positions.clamp(0, ignored_index)
end_positions = end_positions.clamp(0, ignored_index)
loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
start_loss = loss_fct(start_logits, start_positions)
end_loss = loss_fct(end_logits, end_positions)
total_loss = (start_loss + end_loss) / 2
if not return_dict:
output = (start_logits, end_logits) + outputs[2:]
return ((total_loss,) + output) if total_loss is not None else output
return QuestionAnsweringModelOutput(
loss=total_loss,
start_logits=start_logits,
end_logits=end_logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
@add_start_docstrings(
"""ErnieMForInformationExtraction is a Ernie-M Model with two linear layer on top of the hidden-states output to
compute `start_prob` and `end_prob`, designed for Universal Information Extraction.""",
ERNIE_M_START_DOCSTRING,
)
class ErnieMForInformationExtraction(ErnieMPreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.ernie_m = ErnieMModel(config)
self.linear_start = nn.Linear(config.hidden_size, 1)
self.linear_end = nn.Linear(config.hidden_size, 1)
self.sigmoid = nn.Sigmoid()
self.post_init()
@add_start_docstrings_to_model_forward(ERNIE_M_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length"))
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
start_positions: Optional[torch.Tensor] = None,
end_positions: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = True,
) -> Union[tuple[torch.FloatTensor], QuestionAnsweringModelOutput]:
r"""
start_positions (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for position (index) for computing the start_positions loss. Position outside of the sequence are
not taken into account for computing the loss.
end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) for computing the end_positions loss. Position outside of the sequence are not
taken into account for computing the loss.
"""
result = self.ernie_m(
input_ids,
attention_mask=attention_mask,
position_ids=position_ids,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
if return_dict:
sequence_output = result.last_hidden_state
elif not return_dict:
sequence_output = result[0]
start_logits = self.linear_start(sequence_output)
start_logits = start_logits.squeeze(-1)
end_logits = self.linear_end(sequence_output)
end_logits = end_logits.squeeze(-1)
total_loss = None
if start_positions is not None and end_positions is not None:
# If we are on multi-GPU, split add a dimension
if len(start_positions.size()) > 1:
start_positions = start_positions.squeeze(-1)
if len(end_positions.size()) > 1:
end_positions = end_positions.squeeze(-1)
# sometimes the start/end positions are outside our model inputs, we ignore these terms
ignored_index = start_logits.size(1)
start_positions = start_positions.clamp(0, ignored_index)
end_positions = end_positions.clamp(0, ignored_index)
loss_fct = BCEWithLogitsLoss()
start_loss = loss_fct(start_logits, start_positions)
end_loss = loss_fct(end_logits, end_positions)
total_loss = (start_loss + end_loss) / 2
if not return_dict:
return tuple(
i
for i in [total_loss, start_logits, end_logits, result.hidden_states, result.attentions]
if i is not None
)
return QuestionAnsweringModelOutput(
loss=total_loss,
start_logits=start_logits,
end_logits=end_logits,
hidden_states=result.hidden_states,
attentions=result.attentions,
)
__all__ = [
"ErnieMForMultipleChoice",
"ErnieMForQuestionAnswering",
"ErnieMForSequenceClassification",
"ErnieMForTokenClassification",
"ErnieMModel",
"ErnieMPreTrainedModel",
"ErnieMForInformationExtraction",
]

View File

@ -0,0 +1,409 @@
# coding=utf-8
# Copyright 2023 Xuan Ouyang, Shuohuan Wang, Chao Pang, Yu Sun, Hao Tian, Hua Wu, Haifeng Wang and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tokenization classes for Ernie-M."""
import os
import unicodedata
from typing import Any, Optional
import sentencepiece as spm
from ....tokenization_utils import PreTrainedTokenizer
from ....utils import logging
from ....utils.import_utils import requires
logger = logging.get_logger(__name__)
SPIECE_UNDERLINE = ""
VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt", "sentencepiece_model_ckpt": "sentencepiece.bpe.model"}
RESOURCE_FILES_NAMES = {
"sentencepiece_model_file": "sentencepiece.bpe.model",
"vocab_file": "vocab.txt",
}
# Adapted from paddlenlp.transformers.ernie_m.tokenizer.ErnieMTokenizer
@requires(backends=("sentencepiece",))
class ErnieMTokenizer(PreTrainedTokenizer):
r"""
Constructs a Ernie-M tokenizer. It uses the `sentencepiece` tools to cut the words to sub-words.
Args:
sentencepiece_model_file (`str`):
The file path of sentencepiece model.
vocab_file (`str`, *optional*):
The file path of the vocabulary.
do_lower_case (`str`, *optional*, defaults to `True`):
Whether or not to lowercase the input when tokenizing.
unk_token (`str`, *optional*, defaults to `"[UNK]"`):
A special token representing the `unknown (out-of-vocabulary)` token. An unknown token is set to be
`unk_token` inorder to be converted to an ID.
sep_token (`str`, *optional*, defaults to `"[SEP]"`):
A special token separating two different sentences in the same input.
pad_token (`str`, *optional*, defaults to `"[PAD]"`):
A special token used to make arrays of tokens the same size for batching purposes.
cls_token (`str`, *optional*, defaults to `"[CLS]"`):
A special token used for sequence classification. It is the last token of the sequence when built with
special tokens.
mask_token (`str`, *optional*, defaults to `"[MASK]"`):
A special token representing a masked token. This is the token used in the masked language modeling task
which the model tries to predict the original unmasked ones.
"""
# Ernie-M model doesn't have token_type embedding.
model_input_names: list[str] = ["input_ids"]
vocab_files_names = VOCAB_FILES_NAMES
resource_files_names = RESOURCE_FILES_NAMES
def __init__(
self,
sentencepiece_model_ckpt,
vocab_file=None,
do_lower_case=False,
encoding="utf8",
unk_token="[UNK]",
sep_token="[SEP]",
pad_token="[PAD]",
cls_token="[CLS]",
mask_token="[MASK]",
sp_model_kwargs: Optional[dict[str, Any]] = None,
**kwargs,
) -> None:
# Mask token behave like a normal word, i.e. include the space before it and
# is included in the raw text, there should be a match in a non-normalized sentence.
self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
self.do_lower_case = do_lower_case
self.sentencepiece_model_ckpt = sentencepiece_model_ckpt
self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
self.sp_model.Load(sentencepiece_model_ckpt)
# to mimic paddlenlp.transformers.ernie_m.tokenizer.ErnieMTokenizer functioning
if vocab_file is not None:
self.vocab = self.load_vocab(filepath=vocab_file)
else:
self.vocab = {self.sp_model.id_to_piece(id): id for id in range(self.sp_model.get_piece_size())}
self.reverse_vocab = {v: k for k, v in self.vocab.items()}
super().__init__(
do_lower_case=do_lower_case,
unk_token=unk_token,
sep_token=sep_token,
pad_token=pad_token,
cls_token=cls_token,
mask_token=mask_token,
vocab_file=vocab_file,
encoding=encoding,
sp_model_kwargs=self.sp_model_kwargs,
**kwargs,
)
def get_offset_mapping(self, text):
if text is None:
return None
split_tokens = self.tokenize(text)
normalized_text, char_mapping = "", []
for i, ch in enumerate(text):
if ch in self.SP_CHAR_MAPPING:
ch = self.SP_CHAR_MAPPING.get(ch)
else:
ch = unicodedata.normalize("NFKC", ch)
if self.is_whitespace(ch):
continue
normalized_text += ch
char_mapping.extend([i] * len(ch))
text, token_mapping, offset = normalized_text, [], 0
if self.do_lower_case:
text = text.lower()
for token in split_tokens:
if token[:1] == "":
token = token[1:]
start = text[offset:].index(token) + offset
end = start + len(token)
token_mapping.append((char_mapping[start], char_mapping[end - 1] + 1))
offset = end
return token_mapping
@property
def vocab_size(self):
return len(self.vocab)
def get_vocab(self):
return dict(self.vocab, **self.added_tokens_encoder)
def __getstate__(self):
state = self.__dict__.copy()
state["sp_model"] = None
return state
def __setstate__(self, d):
self.__dict__ = d
# for backward compatibility
if not hasattr(self, "sp_model_kwargs"):
self.sp_model_kwargs = {}
self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
self.sp_model.Load(self.sentencepiece_model_ckpt)
def clean_text(self, text):
"""Performs invalid character removal and whitespace cleanup on text."""
return "".join(self.SP_CHAR_MAPPING.get(c, c) for c in text)
def _tokenize(self, text, enable_sampling=False, nbest_size=64, alpha=0.1):
"""Tokenize a string."""
if self.sp_model_kwargs.get("enable_sampling") is True:
enable_sampling = True
if self.sp_model_kwargs.get("alpha") is not None:
alpha = self.sp_model_kwargs.get("alpha")
if self.sp_model_kwargs.get("nbest_size") is not None:
nbest_size = self.sp_model_kwargs.get("nbest_size")
if not enable_sampling:
pieces = self.sp_model.EncodeAsPieces(text)
else:
pieces = self.sp_model.SampleEncodeAsPieces(text, nbest_size, alpha)
new_pieces = []
for pi, piece in enumerate(pieces):
if piece == SPIECE_UNDERLINE:
if not pieces[pi + 1].startswith(SPIECE_UNDERLINE) and pi != 0:
new_pieces.append(SPIECE_UNDERLINE)
continue
else:
continue
lst_i = 0
for i, chunk in enumerate(piece):
if chunk == SPIECE_UNDERLINE:
continue
if self.is_ch_char(chunk) or self.is_punct(chunk):
if i > lst_i and piece[lst_i:i] != SPIECE_UNDERLINE:
new_pieces.append(piece[lst_i:i])
new_pieces.append(chunk)
lst_i = i + 1
elif chunk.isdigit() and i > 0 and not piece[i - 1].isdigit():
if i > lst_i and piece[lst_i:i] != SPIECE_UNDERLINE:
new_pieces.append(piece[lst_i:i])
lst_i = i
elif not chunk.isdigit() and i > 0 and piece[i - 1].isdigit():
if i > lst_i and piece[lst_i:i] != SPIECE_UNDERLINE:
new_pieces.append(piece[lst_i:i])
lst_i = i
if len(piece) > lst_i:
new_pieces.append(piece[lst_i:])
return new_pieces
def convert_tokens_to_string(self, tokens):
"""Converts a sequence of tokens (strings for sub-words) in a single string."""
out_string = "".join(tokens).replace(SPIECE_UNDERLINE, " ").strip()
return out_string
def convert_ids_to_string(self, ids):
"""
Converts a sequence of tokens (strings for sub-words) in a single string.
"""
tokens = self.convert_ids_to_tokens(ids)
out_string = "".join(tokens).replace(SPIECE_UNDERLINE, " ").strip()
return out_string
# to mimic paddlenlp.transformers.ernie_m.tokenizer.ErnieMTokenizer functioning
def _convert_token_to_id(self, token):
return self.vocab.get(token, self.vocab.get(self.unk_token))
# to mimic paddlenlp.transformers.ernie_m.tokenizer.ErnieMTokenizer functioning
def _convert_id_to_token(self, index):
"""Converts an index (integer) in a token (str) using the vocab."""
return self.reverse_vocab.get(index, self.unk_token)
def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
r"""
Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
adding special tokens. An ErnieM sequence has the following format:
- single sequence: `[CLS] X [SEP]`
- pair of sequences: `[CLS] A [SEP] [SEP] B [SEP]`
Args:
token_ids_0 (`list[int]`):
List of IDs to which the special tokens will be added.
token_ids_1 (`list[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`list[int]`: List of input_id with the appropriate special tokens.
"""
if token_ids_1 is None:
return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
_cls = [self.cls_token_id]
_sep = [self.sep_token_id]
return _cls + token_ids_0 + _sep + _sep + token_ids_1 + _sep
def build_offset_mapping_with_special_tokens(self, offset_mapping_0, offset_mapping_1=None):
r"""
Build offset map from a pair of offset map by concatenating and adding offsets of special tokens. An Ernie-M
offset_mapping has the following format:
- single sequence: `(0,0) X (0,0)`
- pair of sequences: `(0,0) A (0,0) (0,0) B (0,0)`
Args:
offset_mapping_ids_0 (`list[tuple]`):
List of char offsets to which the special tokens will be added.
offset_mapping_ids_1 (`list[tuple]`, *optional*):
Optional second list of wordpiece offsets for offset mapping pairs.
Returns:
`list[tuple]`: List of wordpiece offsets with the appropriate offsets of special tokens.
"""
if offset_mapping_1 is None:
return [(0, 0)] + offset_mapping_0 + [(0, 0)]
return [(0, 0)] + offset_mapping_0 + [(0, 0), (0, 0)] + offset_mapping_1 + [(0, 0)]
def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False):
r"""
Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
special tokens using the tokenizer `encode` method.
Args:
token_ids_0 (`list[int]`):
List of ids of the first sequence.
token_ids_1 (`list[int]`, *optional*):
Optional second list of IDs for sequence pairs.
already_has_special_tokens (`str`, *optional*, defaults to `False`):
Whether or not the token list is already formatted with special tokens for the model.
Returns:
`list[int]`:
The list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
"""
if already_has_special_tokens:
if token_ids_1 is not None:
raise ValueError(
"You should not supply a second sequence if the provided sequence of "
"ids is already formatted with special tokens for the model."
)
return [1 if x in [self.sep_token_id, self.cls_token_id] else 0 for x in token_ids_0]
if token_ids_1 is not None:
return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1]
return [1] + ([0] * len(token_ids_0)) + [1]
def create_token_type_ids_from_sequences(
self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
) -> list[int]:
"""
Create the token type IDs corresponding to the sequences passed. [What are token type
IDs?](../glossary#token-type-ids) Should be overridden in a subclass if the model has a special way of
building: those.
Args:
token_ids_0 (`list[int]`):
The first tokenized sequence.
token_ids_1 (`list[int]`, *optional*):
The second tokenized sequence.
Returns:
`list[int]`: The token type ids.
"""
# called when `add_special_tokens` is True, so align with `build_inputs_with_special_tokens` method
if token_ids_1 is None:
# [CLS] X [SEP]
return (len(token_ids_0) + 2) * [0]
# [CLS] A [SEP] [SEP] B [SEP]
return [0] * (len(token_ids_0) + 1) + [1] * (len(token_ids_1) + 3)
def is_ch_char(self, char):
"""
is_ch_char
"""
if "\u4e00" <= char <= "\u9fff":
return True
return False
def is_alpha(self, char):
"""
is_alpha
"""
if ("a" <= char <= "z") or ("A" <= char <= "Z"):
return True
return False
def is_punct(self, char):
"""
is_punct
"""
if char in ",;:.?!~,;:。?!《》【】":
return True
return False
def is_whitespace(self, char):
"""
is whitespace
"""
if char == " " or char == "\t" or char == "\n" or char == "\r":
return True
if len(char) == 1:
cat = unicodedata.category(char)
if cat == "Zs":
return True
return False
def load_vocab(self, filepath):
token_to_idx = {}
with open(filepath, "r", encoding="utf-8") as f:
for index, line in enumerate(f):
token = line.rstrip("\n")
token_to_idx[token] = int(index)
return token_to_idx
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
index = 0
if os.path.isdir(save_directory):
vocab_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
)
else:
vocab_file = (filename_prefix + "-" if filename_prefix else "") + save_directory
with open(vocab_file, "w", encoding="utf-8") as writer:
for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]):
if index != token_index:
logger.warning(
f"Saving vocabulary to {vocab_file}: vocabulary indices are not consecutive."
" Please check that the vocabulary is not corrupted!"
)
index = token_index
writer.write(token + "\n")
index += 1
tokenizer_model_file = os.path.join(save_directory, "sentencepiece.bpe.model")
with open(tokenizer_model_file, "wb") as fi:
content_spiece_model = self.sp_model.serialized_model_proto()
fi.write(content_spiece_model)
return (vocab_file,)
__all__ = ["ErnieMTokenizer"]

View File

@ -0,0 +1,28 @@
# Copyright 2022 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import TYPE_CHECKING
from ....utils import _LazyModule
from ....utils.import_utils import define_import_structure
if TYPE_CHECKING:
from .configuration_gptsan_japanese import *
from .modeling_gptsan_japanese import *
from .tokenization_gptsan_japanese import *
else:
import sys
_file = globals()["__file__"]
sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)

View File

@ -0,0 +1,157 @@
# coding=utf-8
# Copyright 2023, HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""GPTSAN-japanese model configuration"""
from ....configuration_utils import PreTrainedConfig
from ....utils import logging
logger = logging.get_logger(__name__)
class GPTSanJapaneseConfig(PreTrainedConfig):
r"""
This is the configuration class to store the configuration of a [`GPTSanJapaneseModel`]. It is used to instantiate
a GPTSANJapanese model according to the specified arguments, defining the model architecture. Instantiating a
configuration with the defaults will yield a similar configuration to that of the GPTSANJapanese
[Tanrei/GPTSAN-japanese](https://huggingface.co/Tanrei/GPTSAN-japanese) architecture.
Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PreTrainedConfig`] for more information.
Arguments:
vocab_size (`int`, *optional*, defaults to 36000):
Vocabulary size of the GPTSANJapanese model. Defines the number of different tokens that can be represented
by the `inputs_ids` passed when calling [`GPTSanJapaneseModel`].
max_position_embeddings (`int`, *optional*, defaults to 1280):
The maximum sequence length that this model might ever be used with. Defaults set this to 1280.
d_model (`int`, *optional*, defaults to 1024):
Size of the encoder layers and the pooler layer.
d_ff (`int`, *optional*, defaults to 8192):
Size of the intermediate feed forward layer in each `SwitchTransformersBlock`.
d_ext (`int`, *optional*, defaults to 4096):
Size of the intermediate feed forward layer in each Extra-layers.
d_spout (`int`, *optional*, defaults to 128):
Size of the `spout` vector.
num_switch_layers (`int`, *optional*, defaults to 10):
Number of layers in the Switch Transformer layer.
num_ext_layers (`int`, *optional*, defaults to 0):
Number of layers in the Extra-layers.
num_heads (`int`, *optional*, defaults to 16):
Number of attention heads for each attention layer in the Transformer encoder.
num_experts (`int`, *optional*, defaults to 16):
Number of experts for each SwitchTransformer layer.
expert_capacity (`int`, *optional*, defaults to 128):
Number of tokens that can be stored in each expert. If set to 1, the model will behave like a regular
Transformer.
dropout_rate (`float`, *optional*, defaults to 0.0):
The ratio for all dropout layers.
layer_norm_eps (`float`, *optional*, defaults to 1e-5):
The epsilon used by the layer normalization layers.
router_bias (`bool`, *optional*, defaults to `False`):
Whether to add a bias to the router.
router_jitter_noise (`float`, *optional*, defaults to 0.0):
Amount of noise to add to the router. Set it to 0.0 during prediction or set small value (usually 1e-2)
during training.
router_dtype (`str`, *optional*, default to `"float32"`):
The `dtype` used for the routers. It is preferable to keep the `dtype` to `"float32"` as specified in the
*selective precision* discussion in [the paper](https://huggingface.co/papers/2101.03961).
router_ignore_padding_tokens (`bool`, *optional*, defaults to `False`):
Whether to ignore padding tokens when routing.
output_hidden_states (`bool`, *optional*, default to `False`):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
output_attentions (`bool`, *optional*, defaults to `False`):
Whether or not to return the attentions tensors of all attention layers.
initializer_factor (`float`, *optional*, defaults to 0.002):
A factor for initializing all weight matrices.
output_router_logits (`bool`, *optional*, default to `False`):
Whether or not to return the router logits of all experts.
use_cache (`bool`, *optional*, defaults to `True`):
Whether or not the model should return the last key/values attentions (not used by all models)
"""
model_type = "gptsan-japanese"
keys_to_ignore_at_inference = [
"past_key_values",
]
attribute_map = {
"hidden_size": "d_model",
"num_attention_heads": "num_heads",
"num_hidden_layers": "num_layers",
}
def __init__(
self,
vocab_size=36000,
max_position_embeddings=1280,
d_model=1024,
d_ff=8192,
d_ext=4096,
d_spout=128,
num_switch_layers=10,
num_ext_layers=0,
num_heads=16,
num_experts=16,
expert_capacity=128,
dropout_rate=0.0,
layer_norm_epsilon=1e-5,
router_bias=False,
router_jitter_noise=0.0,
router_dtype="float32",
router_ignore_padding_tokens=False,
output_hidden_states=False,
output_attentions=False,
initializer_factor=0.002,
output_router_logits=False,
use_cache=True,
separator_token_id=35998,
pad_token_id=35995,
eos_token_id=35999,
**kwargs,
):
self.vocab_size = vocab_size
self.max_position_embeddings = max_position_embeddings
self.d_model = d_model
self.d_ff = d_ff
self.d_ext = d_ext
self.d_spout = d_spout
self.num_switch_layers = num_switch_layers
self.num_ext_layers = num_ext_layers
self.num_layers = num_switch_layers + num_ext_layers
self.num_heads = num_heads
self.num_experts = num_experts
self.expert_capacity = expert_capacity
self.dropout_rate = dropout_rate
self.layer_norm_epsilon = layer_norm_epsilon
self.router_bias = router_bias
self.router_jitter_noise = router_jitter_noise
self.router_dtype = router_dtype
self.router_ignore_padding_tokens = router_ignore_padding_tokens
self.output_hidden_states = output_hidden_states
self.output_attentions = output_attentions
self.initializer_factor = initializer_factor
self.output_router_logits = output_router_logits
self.use_cache = use_cache
super().__init__(
separator_token_id=separator_token_id,
pad_token_id=pad_token_id,
eos_token_id=eos_token_id,
**kwargs,
)
__all__ = ["GPTSanJapaneseConfig"]

View File

@ -0,0 +1,181 @@
# coding=utf-8
# Copyright 2023 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Convert GPTSANJapanese checkpoints from the original repository to pytorch model."""
import argparse
import json
import os
from collections import OrderedDict
import numpy as np
import tensorflow as tf
import torch
def convert_tf_gptsan_to_pt(args):
parameter_file = os.path.join(args.tf_model_dir, "parameters.json")
params = json.loads(open(parameter_file).read())
if not params:
raise ValueError(
f"It seems that the json file at {parameter_file} is empty. Make sure you have a correct json file."
)
if not args.output.endswith(".pt"):
args.output = args.output + ".pt"
new_state = OrderedDict()
with tf.device("/CPU:0"):
reader = tf.train.load_checkpoint(args.tf_model_dir)
shapes = reader.get_variable_to_shape_map()
for key_name in shapes:
vnp = reader.get_tensor(key_name).astype(np.float16)
if key_name.endswith("/adam_m") or key_name.endswith("/adam_v"):
continue
if key_name.startswith("pasts/"):
if key_name.startswith("pasts/mlp"):
player = int(key_name[9])
elif key_name.startswith("pasts/out"):
player = 8
name = "model.sqout.%d.weight" % (player * 2) # enter to nn.Sequential with Tanh, so 2 at a time
state = vnp.transpose([1, 0]).copy() # Mesh-Tensorflow is a diagonal matrix
new_state[name] = torch.tensor(state)
elif key_name.startswith("model/moe"):
player = int(key_name[9:].split("/")[0])
if key_name.endswith("/switch_gating/kernel"):
name = "model.blocks.%d.feed_forward.mlp.router.classifier.weight" % player
state = vnp.transpose([1, 0]).copy() # Mesh-Tensorflow is a diagonal matrix
new_state[name] = torch.tensor(state)
elif key_name.endswith("/softmlp/kernel"):
name = "model.blocks.%d.feed_forward.soft_bypass_mlp.weight" % player
state = vnp.transpose([1, 0]).copy() # Mesh-Tensorflow is a diagonal matrix
new_state[name] = torch.tensor(state)
elif key_name.endswith("/wo/kernel") or key_name.endswith("/wi/kernel"):
nlayer = key_name[-9:-7]
for i in range(16):
name = "model.blocks.%d.feed_forward.mlp.experts.expert_%d.%s.weight" % (player, i, nlayer)
state = (
vnp[i].transpose([1, 0]).copy()
) # In Mesh-Tensorflow, it is one array, so it is divided
new_state[name] = torch.tensor(state)
elif key_name.startswith("model/mlp"):
player = int(key_name[9:].split("/")[0])
if key_name.endswith("/p1/kernel"):
name = "model.blocks.%d.feed_forward.mlp.wi.weight" % player
state = vnp.transpose([1, 0]).copy() # Mesh-Tensorflow is a diagonal matrix
new_state[name] = torch.tensor(state)
elif key_name.endswith("/p1/bias"):
name = "model.blocks.%d.feed_forward.mlp.wi.bias" % player
state = vnp.copy() # same because it is one dimensional
new_state[name] = torch.tensor(state)
elif key_name.endswith("/p2/kernel"):
name = "model.blocks.%d.feed_forward.mlp.wo.weight" % player
state = vnp.transpose([1, 0]).copy() # Mesh-Tensorflow is a diagonal matrix
new_state[name] = torch.tensor(state)
elif key_name.endswith("/p2/bias"):
name = "model.blocks.%d.feed_forward.mlp.wo.bias" % player
state = vnp.copy() # same because it is one dimensional
new_state[name] = torch.tensor(state)
elif key_name.startswith("model/ln"):
player = int(key_name[8:].split("/")[0])
if key_name.endswith("/b"):
name = "model.blocks.%d.feed_forward.norm.bias" % player
state = vnp.copy() # same because it is one dimensional
new_state[name] = torch.tensor(state)
elif key_name.endswith("/g"):
name = "model.blocks.%d.feed_forward.norm.weight" % player
state = vnp.copy() # same because it is one dimensional
new_state[name] = torch.tensor(state)
elif key_name.startswith("model/att"):
player = int(key_name[9:].split("/")[0])
if key_name.endswith("/qkv/kernel"):
state = vnp.copy() # Compute same dimension as Mesh-tensorflow using einsum
state_q = state[:, 0, :, :]
state_k = state[:, 1, :, :]
state_v = state[:, 2, :, :]
state_q = (
state_q.reshape([state_q.shape[0], state_q.shape[1] * state_q.shape[2]])
.transpose([1, 0])
.copy()
) # Mesh-Tensorflow is a diagonal matrix
state_k = (
state_k.reshape([state_k.shape[0], state_k.shape[1] * state_k.shape[2]])
.transpose([1, 0])
.copy()
) # Mesh-Tensorflow is a diagonal matrix
state_v = (
state_v.reshape([state_v.shape[0], state_v.shape[1] * state_v.shape[2]])
.transpose([1, 0])
.copy()
) # Mesh-Tensorflow is a diagonal matrix
name = "model.blocks.%d.self_attn.self_attn.q_proj.weight" % player
new_state[name] = torch.tensor(state_q)
name = "model.blocks.%d.self_attn.self_attn.k_proj.weight" % player
new_state[name] = torch.tensor(state_k)
name = "model.blocks.%d.self_attn.self_attn.v_proj.weight" % player
new_state[name] = torch.tensor(state_v)
elif key_name.endswith("/o/kernel"):
name = "model.blocks.%d.self_attn.self_attn.out_proj.weight" % player
state = (
vnp.reshape([vnp.shape[0] * vnp.shape[1], vnp.shape[2]]).transpose([1, 0]).copy()
) # Mesh-Tensorflow is a diagonal matrix
new_state[name] = torch.tensor(state)
elif key_name.startswith("model/an"):
player = int(key_name[8:].split("/")[0])
if key_name.endswith("/b"):
name = "model.blocks.%d.self_attn.norm.bias" % player
state = vnp.copy() # same because it is one dimensional
new_state[name] = torch.tensor(state)
elif key_name.endswith("/g"):
name = "model.blocks.%d.self_attn.norm.weight" % player
state = vnp.copy() # same because it is one dimensional
new_state[name] = torch.tensor(state)
elif (
key_name.startswith("model/wte")
or key_name.startswith("model/wpe")
or key_name.startswith("model/ete")
):
nlayer = {"wte": "embed_tokens", "wpe": "position_embeddings", "ete": "extra_position_embeddings"}[
key_name[-3:]
]
name = "model.%s.weight" % nlayer
state = vnp.copy() # same in embedded
new_state[name] = torch.tensor(state)
if key_name.startswith("model/wte"):
name = "lm_head.weight"
state = vnp.copy() # same in embedded
new_state[name] = torch.tensor(state)
elif key_name.startswith("model/wob"):
name = "final_logits_bias"
state = vnp.copy() # same in embedded
state = state.reshape((1, -1))
new_state[name] = torch.tensor(state)
elif key_name == "model/dense/kernel":
name = "model.last_project.weight"
state = vnp.transpose([1, 0]).copy() # Mesh-Tensorflow is a diagonal matrix
new_state[name] = torch.tensor(state)
elif key_name == "model/dense_1/bias":
name = "model.last_project.bias"
state = vnp.copy() # same because it is one dimensional
new_state[name] = torch.tensor(state)
torch.save(new_state, args.output)
if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="model converter.", formatter_class=argparse.ArgumentDefaultsHelpFormatter
)
parser.add_argument("--tf_model_dir", metavar="PATH", type=str, required=True, help="import model")
parser.add_argument("--output", metavar="PATH", type=str, required=True, help="output model")
args = parser.parse_args()
convert_tf_gptsan_to_pt(args)

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,518 @@
# coding=utf-8
# Copyright 2023 HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tokenization classes for GPTSANJapanese."""
import collections
import json
import os
import re
import sys
from typing import Optional, Union
import numpy as np
from ....tokenization_utils import PreTrainedTokenizer
from ....tokenization_utils_base import (
BatchEncoding,
PreTokenizedInput,
PreTokenizedInputPair,
TextInput,
TextInputPair,
TruncationStrategy,
)
from ....utils import PaddingStrategy, logging
logger = logging.get_logger(__name__)
VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt", "emoji_file": "emoji.json"}
def load_vocab_and_emoji(vocab_file, emoji_file):
"""Loads a vocabulary file and emoji file into a dictionary."""
with open(emoji_file, "r", encoding="utf-8") as f:
emoji = json.loads(f.read())
vocab = collections.OrderedDict()
raw_vocab = collections.OrderedDict()
ids_to_tokens = collections.OrderedDict()
with open(vocab_file, "r", encoding="utf-8") as f:
token = f.readlines()
token = [[t.rstrip("\n")] if (t == ",\n" or "," not in t) else t.rstrip("\n").split(",") for t in token]
for idx, b in enumerate(token):
ids_to_tokens[idx] = b
raw_vocab[",".join(b)] = idx
for wd in b:
vocab[wd] = idx
return vocab, raw_vocab, ids_to_tokens, emoji
class GPTSanJapaneseTokenizer(PreTrainedTokenizer):
"""
This tokenizer is based on GPTNeoXJapaneseTokenizer and has the following modifications
- Decoding byte0~byte255 tokens correctly
- Added bagofword token handling
- Return token_type_ids for Prefix-LM model
The bagofword token represents a repetition of the previous token and is converted to 3 consecutive tokens when
decoding In addition, the original Japanese special Sub-Word-Encoding has been released in this repository
(https://github.com/tanreinama/Japanese-BPEEncoder_V2). The token_type_ids is a mask indicating the prefix input
position of the Prefix-LM model. To specify a prefix position, specify a prefix input for prefix_text, or specify a
sentence of the prefix part and the part after it as a text pair of batch input.
Example:
```python
>>> from transformers import GPTSanJapaneseTokenizer
>>> tokenizer = GPTSanJapaneseTokenizer.from_pretrained("Tanrei/GPTSAN-japanese")
>>> # You can confirm both 慶応 and 慶應 are encoded to 17750
>>> tokenizer("吾輩は猫である🐯。実は慶応(慶應)大学出身")["input_ids"]
[35993, 35998, 34347, 31459, 30647, 31448, 25, 30659, 35729, 35676, 32417, 30647, 17750, 35589, 17750, 35590, 321, 1281]
>>> # Both 慶応 and 慶應 are decoded to 慶応
>>> tokenizer.decode(tokenizer("吾輩は猫である🐯。実は慶応(慶應)大学出身")["input_ids"])
'吾輩は猫である🐯。実は慶応(慶応)大学出身'
```
Example for Prefix-LM:
```python
>>> from transformers import GPTSanJapaneseTokenizer
>>> tokenizer = GPTSanJapaneseTokenizer.from_pretrained("Tanrei/GPTSAN-japanese")
>>> tokenizer("実は慶応(慶應)大学出身", prefix_text="吾輩は猫である🐯。")["input_ids"]
[35993, 34347, 31459, 30647, 31448, 25, 30659, 35729, 35676, 35998, 32417, 30647, 17750, 35589, 17750, 35590, 321, 1281]
>>> # Mask for Prefix-LM inputs
>>> tokenizer("実は慶応(慶應)大学出身", prefix_text="吾輩は猫である🐯。")["token_type_ids"]
[1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]
```
Example for batch encode:
```python
>>> from transformers import GPTSanJapaneseTokenizer
>>> tokenizer = GPTSanJapaneseTokenizer.from_pretrained("Tanrei/GPTSAN-japanese")
>>> tokenizer([["武田信玄", "は、"], ["織田信長", "の配下の、"]], padding=True)["input_ids"]
[[35993, 35998, 8640, 25948, 35993, 35998, 30647, 35675, 35999, 35999], [35993, 35998, 10382, 9868, 35993, 35998, 30646, 9459, 30646, 35675]]
>>> # Mask for Prefix-LM inputs
>>> tokenizer([["武田信玄", "は、"], ["織田信長", "の配下の、"]], padding=True)["token_type_ids"]
[[1, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0, 0]]
>>> # Mask for padding
>>> tokenizer([["武田信玄", "は、"], ["織田信長", "の配下の、"]], padding=True)["attention_mask"]
[[1, 1, 1, 1, 1, 1, 1, 1, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]
```
Args:
vocab_file (`str`):
File containing the vocabulary.
emoji_file (`str`):
File containing the emoji.
unk_token (`str`, *optional*, defaults to `"<|nottoken|>"`):
The token used for unknown character
pad_token (`str`, *optional*, defaults to `"<|separator|>"`):
The token used for padding
bos_token (`str`, *optional*, defaults to `"<|startoftext|>"`):
The beginning of sequence token.
eos_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
The end of sequence token.
sep_token (`str`, *optional*, defaults to `"<|segmenter|>"`):
A special token to separate token to prefix part and general input part.
do_clean_text (`bool`, *optional*, defaults to `False`):
Whether or not to clean text for URL, EMAIL, TEL, Japanese DATE and Japanese PRICE.
"""
vocab_files_names = VOCAB_FILES_NAMES
model_input_names = ["input_ids", "attention_mask", "token_type_ids"]
def __init__(
self,
vocab_file,
emoji_file,
unk_token="<|nottoken|>",
pad_token="<|separator|>",
bos_token="<|startoftext|>",
eos_token="<|endoftext|>",
sep_token="<|segmenter|>",
do_clean_text=False,
**kwargs,
):
if not os.path.isfile(vocab_file):
raise ValueError(
f"Can't find a vocabulary file at path '{vocab_file}'. To load the vocabulary from a Google pretrained"
" model use `tokenizer = GPTSanJapaneseTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`"
)
if not os.path.isfile(emoji_file):
raise ValueError(
f"Can't find a emoji file at path '{emoji_file}'. To load the emoji information from a Google"
" pretrained model use `tokenizer = GPTSanJapaneseTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`"
)
self.do_clean_text = do_clean_text
self.vocab, self.raw_vocab, self.ids_to_tokens, self.emoji = load_vocab_and_emoji(vocab_file, emoji_file)
self.subword_tokenizer = SubWordJapaneseTokenizer(
vocab=self.vocab, ids_to_tokens=self.ids_to_tokens, emoji=self.emoji
)
super().__init__(
unk_token=unk_token,
pad_token=pad_token,
bos_token=bos_token,
eos_token=eos_token,
sep_token=sep_token,
do_clean_text=do_clean_text,
**kwargs,
)
@property
def vocab_size(self):
# self.vocab contains support for character fluctuation unique to Japanese, and has a large number of vocab
return len(self.raw_vocab)
def get_vocab(self):
return dict(self.raw_vocab, **self.added_tokens_encoder)
def _tokenize(self, text):
return self.subword_tokenizer.tokenize(text, clean=self.do_clean_text)
def _convert_token_to_id(self, token):
"""Converts a token (str) in an id using the vocab."""
return self.vocab.get(token, self.vocab.get(self.unk_token))
def _convert_id_to_token(self, index):
"""Converts an index (integer) in a token (str) using the vocab."""
return self.subword_tokenizer.convert_id_to_token(index)
def convert_tokens_to_string(self, tokens):
"""Converts a sequence of tokens (string) in a single string."""
words = []
byte_tokens = []
for word in tokens:
if word[:6] == "<|byte" and word[-2:] == "|>":
byte_tokens.append(int(word[6:-2]))
else:
if len(byte_tokens) > 0:
words.append(bytearray(byte_tokens).decode("utf-8", errors="replace"))
byte_tokens = []
if word[:7] == "<|emoji" and word[-2:] == "|>":
words.append(self.emoji["emoji_inv"][word])
elif word == "<SP>":
words.append(" ")
elif word == "<BR>":
words.append("\n")
elif word == "<TAB>":
words.append("\t")
elif word == "<BLOCK>":
words.append("")
elif word == "<KIGOU>":
words.append("ǀ")
elif word == "<U2000U2BFF>":
words.append("")
elif word == "<|bagoftoken|>":
if len(words) > 0:
words.append(words[-1])
words.append(words[-1])
words.append(words[-1])
elif word.startswith("<|") and word.endswith("|>"):
words.append("")
else:
words.append(word)
if len(byte_tokens) > 0:
words.append(bytearray(byte_tokens).decode("utf-8", errors="replace"))
text = "".join(words)
return text
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
index = 0
if os.path.isdir(save_directory):
vocab_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
)
emoji_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["emoji_file"]
)
else:
vocab_file = (
(filename_prefix + "-" if filename_prefix else "") + save_directory + VOCAB_FILES_NAMES["vocab_file"]
)
emoji_file = (
(filename_prefix + "-" if filename_prefix else "") + save_directory + VOCAB_FILES_NAMES["emoji_file"]
)
with open(vocab_file, "w", encoding="utf-8") as writer:
for token_index, token in self.ids_to_tokens.items():
if index != token_index:
logger.warning(
f"Saving vocabulary to {vocab_file}: vocabulary indices are not consecutive."
" Please check that the vocabulary is not corrupted!"
)
index = token_index
writer.write(",".join(token) + "\n")
index += 1
with open(emoji_file, "w", encoding="utf-8") as writer:
json.dump(self.emoji, writer)
return vocab_file, emoji_file
def create_token_type_ids_from_sequences(
self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
) -> list[int]:
# docstyle-ignore
"""
The tokenizer returns token_type_ids as separators between the Prefix part and the rest.
token_type_ids is 1 for the Prefix part and 0 for the rest of the token.
Example:
```python
>>> from transformers import GPTSanJapaneseTokenizer
>>> tokenizer = GPTSanJapaneseTokenizer.from_pretrained("Tanrei/GPTSAN-japanese")
>>> x_token = tokenizer("アイウエ")
>>> # input_ids: | SOT | SEG | ア | イ | ウ | エ |
>>> # token_type_ids: | 1 | 0 | 0 | 0 | 0 | 0 |
>>> x_token = tokenizer("", prefix_text="アイウエ")
>>> # input_ids: | SOT | ア | イ | ウ | エ | SEG |
>>> # token_type_ids: | 1 | 1 | 1 | 1 | 1 | 0 |
>>> x_token = tokenizer("ウエ", prefix_text="アイ")
>>> # input_ids: | SOT | ア | イ | SEG | ウ | エ |
>>> # token_type_ids: | 1 | 1 | 1 | 0 | 0 | 0 |
```"""
prefix_len = 0
if self.sep_token in self.vocab:
segid = self.vocab[self.sep_token]
if segid in token_ids_0:
prefix_len = token_ids_0.index(segid)
if token_ids_1 is None:
total_len = len(token_ids_0)
else:
total_len = len(token_ids_0 + token_ids_1)
return prefix_len * [1] + (total_len - prefix_len) * [0]
def prepare_for_tokenization(self, text, prefix_text=None, add_sep_token=None, **kwargs):
# GPTSAN inserts extra SEP tokens in Prefix-LM in addition to SOT for text generation.
# SOT at the beginning of the text, and SEP at the separator between the Prefix part and the rest.
if add_sep_token is None:
add_sep_token = self.sep_token not in text # If insert un-prefix position explicitly
prepared = self.bos_token if self.bos_token in self.vocab else ""
prepared += prefix_text if prefix_text is not None else ""
if add_sep_token:
prepared += self.sep_token if self.sep_token in self.vocab else ""
prepared += text
return (prepared, kwargs)
def _batch_encode_plus(
self,
batch_text_or_text_pairs: Union[
list[TextInput], list[TextInputPair], list[PreTokenizedInput], list[PreTokenizedInputPair]
],
add_special_tokens: bool = True,
padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
max_length: Optional[int] = None,
stride: int = 0,
is_split_into_words: bool = False,
pad_to_multiple_of: Optional[int] = None,
return_tensors: Optional[str] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
return_overflowing_tokens: bool = False,
return_special_tokens_mask: bool = False,
return_offsets_mapping: bool = False,
return_length: bool = False,
verbose: bool = True,
**kwargs,
) -> BatchEncoding:
# This tokenizer converts input text pairs into Prefix input and subsequent input
if isinstance(batch_text_or_text_pairs[0], tuple) or isinstance(tuple(batch_text_or_text_pairs[0]), list):
# As a single text with an explicit un-prefix position
batch_prefix_texts = []
for pref, txt in batch_text_or_text_pairs:
batch_prefix_texts.append(pref + self.sep_token + txt)
batch_text_or_text_pairs = batch_prefix_texts
return super()._batch_encode_plus(
batch_text_or_text_pairs,
add_special_tokens,
padding_strategy,
truncation_strategy,
max_length,
stride,
is_split_into_words,
pad_to_multiple_of,
return_tensors,
return_token_type_ids,
return_attention_mask,
return_overflowing_tokens,
return_special_tokens_mask,
return_offsets_mapping,
return_length,
verbose,
**kwargs,
)
class SubWordJapaneseTokenizer:
"""
This tokenizer is based on GPTNeoXJapaneseTokenizer and has the following modifications
- Decoding byte0~byte255 tokens correctly
- Added bagofword token handling
https://github.com/tanreinama/Japanese-BPEEncoder_V2 This tokenizer class is under MIT License according to the
original repository.
MIT License
Copyright (c) 2020 tanreinama
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
documentation files (the "Software"), to deal in the Software without restriction, including without limitation the
rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all copies or substantial portions of
the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO
THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
"""
def __init__(self, vocab, ids_to_tokens, emoji):
self.vocab = vocab # same as swe
self.ids_to_tokens = ids_to_tokens # same as bpe
self.emoji = emoji
self.maxlen = np.max([len(w) for w in self.vocab])
self.content_repatter1 = re.compile(r"(https?|ftp)(:\/\/[-_\.!~*\'()a-zA-Z0-9;\/?:\@&=\+$,%#]+)")
self.content_repatter2 = re.compile(r"[A-Za-z0-9\._+]*@[\-_0-9A-Za-z]+(\.[A-Za-z]+)*")
self.content_repatter3 = re.compile(r"[\(]{0,1}[0-9]{2,4}[\)\-\(]{0,1}[0-9]{2,4}[\)\-]{0,1}[0-9]{3,4}")
self.content_repatter4 = re.compile(
r"([12]\d{3}[/\-年])*(0?[1-9]|1[0-2])[/\-月]((0?[1-9]|[12][0-9]|3[01])日?)*(\d{1,2}|:|\d{1,2}時|\d{1,2}分|\(日\)|\(月\)|\(火\)|\(水\)|\(木\)|\(金\)|\(土\)|㈰|㈪|㈫|㈬|㈭|㈮|㈯)*"
)
self.content_repatter5 = re.compile(
r"(明治|大正|昭和|平成|令和|㍾|㍽|㍼|㍻|\u32ff)\d{1,2}年(0?[1-9]|1[0-2])月(0?[1-9]|[12][0-9]|3[01])日(\d{1,2}|:|\d{1,2}時|\d{1,2}分|\(日\)|\(月\)|\(火\)|\(水\)|\(木\)|\(金\)|\(土\)|㈰|㈪|㈫|㈬|㈭|㈮|㈯)*"
)
# The original version of this regex displays catastrophic backtracking behaviour. We avoid this using
# possessive quantifiers in Py >= 3.11. In versions below this, we avoid the vulnerability using a slightly
# different regex that should generally have the same behaviour in most non-pathological cases.
if sys.version_info >= (3, 11):
self.content_repatter6 = re.compile(
r"(?:\d,\d{3}|[\d億])*+"
r"(?:\d,\d{3}|[\d万])*+"
r"(?:\d,\d{3}|[\d千])*+"
r"(?:千円|万円|千万円|円|千ドル|万ドル|千万ドル|ドル|千ユーロ|万ユーロ|千万ユーロ|ユーロ)+"
r"(?:\(税込\)|\(税抜\)|\+tax)*"
)
else:
self.content_repatter6 = re.compile(
r"(?:\d,\d{3}|[\d億万千])*"
r"(?:千円|万円|千万円|円|千ドル|万ドル|千万ドル|ドル|千ユーロ|万ユーロ|千万ユーロ|ユーロ)+"
r"(?:\(税込\)|\(税抜\)|\+tax)*"
)
keisen = "─━│┃┄┅┆┇┈┉┊┋┌┍┎┏┐┑┒┓└┕┖┗┘┙┚┛├┝┞┟┠┡┢┣┤┥┦┧┨┩┪┫┬┭┮┯┰┱┲┳┴┵┶┷┸┹┺┻┼┽┾┿╀╁╂╃╄╅╆╇╈╉╊╋╌╍╎╏═║╒╓╔╕╖╗╘╙╚╛╜╝╞╟╠╡╢╣╤╥╦╧╨╩╪╫╬╭╮╯╰╱╲╳╴╵╶╷╸╹╺╻╼╽╾╿"
blocks = "▀▁▂▃▄▅▆▇█▉▊▋▌▍▎▏▐░▒▓▔▕▖▗▘▙▚▛▜▝▞▟"
self.content_trans1 = str.maketrans(dict.fromkeys(keisen + blocks, "<BLOCK>"))
def __len__(self):
return len(self.ids_to_tokens)
def clean_text(self, content):
content = self.content_repatter1.sub("<URL>", content)
content = self.content_repatter2.sub("<EMAIL>", content)
content = self.content_repatter3.sub("<TEL>", content)
content = self.content_repatter4.sub("<DATE>", content)
content = self.content_repatter5.sub("<DATE>", content)
content = self.content_repatter6.sub("<PRICE>", content)
content = content.translate(self.content_trans1)
while "<BLOCK><BLOCK>" in content:
content = content.replace("<BLOCK><BLOCK>", "<BLOCK>")
return content
def tokenize(self, text, clean=False):
text = text.replace(" ", "<SP>")
text = text.replace(" ", "<SP>")
text = text.replace("\r\n", "<BR>")
text = text.replace("\n", "<BR>")
text = text.replace("\r", "<BR>")
text = text.replace("\t", "<TAB>")
text = text.replace("", "")
text = text.replace("", "")
for k, v in self.emoji["emoji"].items():
if k in text:
text = text.replace(k, v)
if clean:
text = self.clean_text(text)
def check_simbol(x):
e = x.encode()
if len(x) == 1 and len(e) == 2:
c = (int(e[0]) << 8) + int(e[1])
if (
(c >= 0xC2A1 and c <= 0xC2BF)
or (c >= 0xC780 and c <= 0xC783)
or (c >= 0xCAB9 and c <= 0xCBBF)
or (c >= 0xCC80 and c <= 0xCDA2)
):
return True
return False
def checku2e(x):
e = x.encode()
if len(x) == 1 and len(e) == 3:
c = (int(e[0]) << 16) + (int(e[1]) << 8) + int(e[2])
if c >= 0xE28080 and c <= 0xE2B07F:
return True
return False
pos = 0
result = []
while pos < len(text):
end = min(len(text), pos + self.maxlen + 1) if text[pos] == "<" else pos + 3
candidates = [] # (token_id, token, pos)
for e in range(end, pos, -1):
wd = text[pos:e]
if wd in self.vocab:
if wd[0] == "<" and len(wd) > 2:
candidates = [(self.vocab[wd], wd, e)]
break
else:
candidates.append((self.vocab[wd], wd, e))
if len(candidates) > 0:
# the smallest token_id is adopted
_, wd, e = min(candidates, key=lambda x: x[0])
result.append(wd)
pos = e
else:
end = pos + 1
wd = text[pos:end]
if check_simbol(wd):
result.append("<KIGOU>")
elif checku2e(wd):
result.append("<U2000U2BFF>")
else:
for i in wd.encode("utf-8"):
result.append("<|byte%d|>" % i)
pos = end
return result
def convert_id_to_token(self, index):
return self.ids_to_tokens[index][0]
__all__ = ["GPTSanJapaneseTokenizer"]

View File

@ -0,0 +1,27 @@
# Copyright 2020 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import TYPE_CHECKING
from ....utils import _LazyModule
from ....utils.import_utils import define_import_structure
if TYPE_CHECKING:
from .configuration_graphormer import *
from .modeling_graphormer import *
else:
import sys
_file = globals()["__file__"]
sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)

View File

@ -0,0 +1,107 @@
# Copyright (c) Microsoft Corporation and HuggingFace
# Licensed under the MIT License.
import cython
cimport numpy
from cython.parallel cimport parallel, prange
import numpy as np
# Reduce this number if matrices are too big for large graphs
UNREACHABLE_NODE_DISTANCE = 510
def floyd_warshall(adjacency_matrix):
"""
Applies the Floyd-Warshall algorithm to the adjacency matrix, to compute the
shortest paths distance between all nodes, up to UNREACHABLE_NODE_DISTANCE.
"""
(nrows, ncols) = adjacency_matrix.shape
assert nrows == ncols
cdef unsigned int n = nrows
adj_mat_copy = adjacency_matrix.astype(np.int32, order='C', casting='safe', copy=True)
assert adj_mat_copy.flags['C_CONTIGUOUS']
cdef numpy.ndarray[numpy.int32_t, ndim=2, mode='c'] M = adj_mat_copy
cdef numpy.ndarray[numpy.int32_t, ndim=2, mode='c'] path = -1 * np.ones([n, n], dtype=np.int32)
cdef unsigned int i, j, k
cdef numpy.int32_t M_ij, M_ik, cost_ikkj
cdef numpy.int32_t* M_ptr = &M[0,0]
cdef numpy.int32_t* M_i_ptr
cdef numpy.int32_t* M_k_ptr
# set unreachable nodes distance to UNREACHABLE_NODE_DISTANCE
for i in range(n):
for j in range(n):
if i == j:
M[i][j] = 0
elif M[i][j] == 0:
M[i][j] = UNREACHABLE_NODE_DISTANCE
# floyed algo
for k in range(n):
M_k_ptr = M_ptr + n*k
for i in range(n):
M_i_ptr = M_ptr + n*i
M_ik = M_i_ptr[k]
for j in range(n):
cost_ikkj = M_ik + M_k_ptr[j]
M_ij = M_i_ptr[j]
if M_ij > cost_ikkj:
M_i_ptr[j] = cost_ikkj
path[i][j] = k
# set unreachable path to UNREACHABLE_NODE_DISTANCE
for i in range(n):
for j in range(n):
if M[i][j] >= UNREACHABLE_NODE_DISTANCE:
path[i][j] = UNREACHABLE_NODE_DISTANCE
M[i][j] = UNREACHABLE_NODE_DISTANCE
return M, path
def get_all_edges(path, i, j):
"""
Recursive function to compute all possible paths between two nodes from the graph adjacency matrix.
"""
cdef int k = path[i][j]
if k == -1:
return []
else:
return get_all_edges(path, i, k) + [k] + get_all_edges(path, k, j)
def gen_edge_input(max_dist, path, edge_feat):
"""
Generates the full edge feature and adjacency matrix.
Shape: num_nodes * num_nodes * max_distance_between_nodes * num_edge_features
Dim 1 is the input node, dim 2 the output node of the edge, dim 3 the depth of the edge, dim 4 the feature
"""
(nrows, ncols) = path.shape
assert nrows == ncols
cdef unsigned int n = nrows
cdef unsigned int max_dist_copy = max_dist
path_copy = path.astype(int, order='C', casting='safe', copy=True)
edge_feat_copy = edge_feat.astype(int, order='C', casting='safe', copy=True)
assert path_copy.flags['C_CONTIGUOUS']
assert edge_feat_copy.flags['C_CONTIGUOUS']
cdef numpy.ndarray[numpy.int32_t, ndim=4, mode='c'] edge_fea_all = -1 * np.ones([n, n, max_dist_copy, edge_feat.shape[-1]], dtype=np.int32)
cdef unsigned int i, j, k, num_path, cur
for i in range(n):
for j in range(n):
if i == j:
continue
if path_copy[i][j] == UNREACHABLE_NODE_DISTANCE:
continue
path = [i] + get_all_edges(path_copy, i, j) + [j]
num_path = len(path) - 1
for k in range(num_path):
edge_fea_all[i, j, k, :] = edge_feat_copy[path[k], path[k+1], :]
return edge_fea_all

View File

@ -0,0 +1,135 @@
# Copyright (c) Microsoft Corporation and HuggingFace
# Licensed under the MIT License.
from collections.abc import Mapping
from typing import Any
import numpy as np
import torch
from ....utils import is_cython_available, requires_backends
if is_cython_available():
import pyximport
pyximport.install(setup_args={"include_dirs": np.get_include()})
from . import algos_graphormer
def convert_to_single_emb(x, offset: int = 512):
feature_num = x.shape[1] if len(x.shape) > 1 else 1
feature_offset = 1 + np.arange(0, feature_num * offset, offset, dtype=np.int64)
x = x + feature_offset
return x
def preprocess_item(item, keep_features=True):
requires_backends(preprocess_item, ["cython"])
if keep_features and "edge_attr" in item: # edge_attr
edge_attr = np.asarray(item["edge_attr"], dtype=np.int64)
else:
edge_attr = np.ones((len(item["edge_index"][0]), 1), dtype=np.int64) # same embedding for all
if keep_features and "node_feat" in item: # input_nodes
node_feature = np.asarray(item["node_feat"], dtype=np.int64)
else:
node_feature = np.ones((item["num_nodes"], 1), dtype=np.int64) # same embedding for all
edge_index = np.asarray(item["edge_index"], dtype=np.int64)
input_nodes = convert_to_single_emb(node_feature) + 1
num_nodes = item["num_nodes"]
if len(edge_attr.shape) == 1:
edge_attr = edge_attr[:, None]
attn_edge_type = np.zeros([num_nodes, num_nodes, edge_attr.shape[-1]], dtype=np.int64)
attn_edge_type[edge_index[0], edge_index[1]] = convert_to_single_emb(edge_attr) + 1
# node adj matrix [num_nodes, num_nodes] bool
adj = np.zeros([num_nodes, num_nodes], dtype=bool)
adj[edge_index[0], edge_index[1]] = True
shortest_path_result, path = algos_graphormer.floyd_warshall(adj)
max_dist = np.amax(shortest_path_result)
input_edges = algos_graphormer.gen_edge_input(max_dist, path, attn_edge_type)
attn_bias = np.zeros([num_nodes + 1, num_nodes + 1], dtype=np.single) # with graph token
# combine
item["input_nodes"] = input_nodes + 1 # we shift all indices by one for padding
item["attn_bias"] = attn_bias
item["attn_edge_type"] = attn_edge_type
item["spatial_pos"] = shortest_path_result.astype(np.int64) + 1 # we shift all indices by one for padding
item["in_degree"] = np.sum(adj, axis=1).reshape(-1) + 1 # we shift all indices by one for padding
item["out_degree"] = item["in_degree"] # for undirected graph
item["input_edges"] = input_edges + 1 # we shift all indices by one for padding
if "labels" not in item:
item["labels"] = item["y"]
return item
class GraphormerDataCollator:
def __init__(self, spatial_pos_max=20, on_the_fly_processing=False):
if not is_cython_available():
raise ImportError("Graphormer preprocessing needs Cython (pyximport)")
self.spatial_pos_max = spatial_pos_max
self.on_the_fly_processing = on_the_fly_processing
def __call__(self, features: list[dict]) -> dict[str, Any]:
if self.on_the_fly_processing:
features = [preprocess_item(i) for i in features]
if not isinstance(features[0], Mapping):
features = [vars(f) for f in features]
batch = {}
max_node_num = max(len(i["input_nodes"]) for i in features)
node_feat_size = len(features[0]["input_nodes"][0])
edge_feat_size = len(features[0]["attn_edge_type"][0][0])
max_dist = max(len(i["input_edges"][0][0]) for i in features)
edge_input_size = len(features[0]["input_edges"][0][0][0])
batch_size = len(features)
batch["attn_bias"] = torch.zeros(batch_size, max_node_num + 1, max_node_num + 1, dtype=torch.float)
batch["attn_edge_type"] = torch.zeros(batch_size, max_node_num, max_node_num, edge_feat_size, dtype=torch.long)
batch["spatial_pos"] = torch.zeros(batch_size, max_node_num, max_node_num, dtype=torch.long)
batch["in_degree"] = torch.zeros(batch_size, max_node_num, dtype=torch.long)
batch["input_nodes"] = torch.zeros(batch_size, max_node_num, node_feat_size, dtype=torch.long)
batch["input_edges"] = torch.zeros(
batch_size, max_node_num, max_node_num, max_dist, edge_input_size, dtype=torch.long
)
for ix, f in enumerate(features):
for k in ["attn_bias", "attn_edge_type", "spatial_pos", "in_degree", "input_nodes", "input_edges"]:
f[k] = torch.tensor(f[k])
if len(f["attn_bias"][1:, 1:][f["spatial_pos"] >= self.spatial_pos_max]) > 0:
f["attn_bias"][1:, 1:][f["spatial_pos"] >= self.spatial_pos_max] = float("-inf")
batch["attn_bias"][ix, : f["attn_bias"].shape[0], : f["attn_bias"].shape[1]] = f["attn_bias"]
batch["attn_edge_type"][ix, : f["attn_edge_type"].shape[0], : f["attn_edge_type"].shape[1], :] = f[
"attn_edge_type"
]
batch["spatial_pos"][ix, : f["spatial_pos"].shape[0], : f["spatial_pos"].shape[1]] = f["spatial_pos"]
batch["in_degree"][ix, : f["in_degree"].shape[0]] = f["in_degree"]
batch["input_nodes"][ix, : f["input_nodes"].shape[0], :] = f["input_nodes"]
batch["input_edges"][
ix, : f["input_edges"].shape[0], : f["input_edges"].shape[1], : f["input_edges"].shape[2], :
] = f["input_edges"]
batch["out_degree"] = batch["in_degree"]
sample = features[0]["labels"]
if len(sample) == 1: # one task
if isinstance(sample[0], float): # regression
batch["labels"] = torch.from_numpy(np.concatenate([i["labels"] for i in features]))
else: # binary classification
batch["labels"] = torch.from_numpy(np.concatenate([i["labels"] for i in features]))
else: # multi task classification, left to float to keep the NaNs
batch["labels"] = torch.from_numpy(np.stack([i["labels"] for i in features], axis=0))
return batch

View File

@ -0,0 +1,220 @@
# coding=utf-8
# Copyright 2022 Microsoft, clefourrier and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Graphormer model configuration"""
from typing import Optional
from ....configuration_utils import PreTrainedConfig
from ....utils import logging
logger = logging.get_logger(__name__)
class GraphormerConfig(PreTrainedConfig):
r"""
This is the configuration class to store the configuration of a [`~GraphormerModel`]. It is used to instantiate an
Graphormer model according to the specified arguments, defining the model architecture. Instantiating a
configuration with the defaults will yield a similar configuration to that of the Graphormer
[graphormer-base-pcqm4mv1](https://huggingface.co/graphormer-base-pcqm4mv1) architecture.
Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PreTrainedConfig`] for more information.
Args:
num_classes (`int`, *optional*, defaults to 1):
Number of target classes or labels, set to n for binary classification of n tasks.
num_atoms (`int`, *optional*, defaults to 512*9):
Number of node types in the graphs.
num_edges (`int`, *optional*, defaults to 512*3):
Number of edges types in the graph.
num_in_degree (`int`, *optional*, defaults to 512):
Number of in degrees types in the input graphs.
num_out_degree (`int`, *optional*, defaults to 512):
Number of out degrees types in the input graphs.
num_edge_dis (`int`, *optional*, defaults to 128):
Number of edge dis in the input graphs.
multi_hop_max_dist (`int`, *optional*, defaults to 20):
Maximum distance of multi hop edges between two nodes.
spatial_pos_max (`int`, *optional*, defaults to 1024):
Maximum distance between nodes in the graph attention bias matrices, used during preprocessing and
collation.
edge_type (`str`, *optional*, defaults to multihop):
Type of edge relation chosen.
max_nodes (`int`, *optional*, defaults to 512):
Maximum number of nodes which can be parsed for the input graphs.
share_input_output_embed (`bool`, *optional*, defaults to `False`):
Shares the embedding layer between encoder and decoder - careful, True is not implemented.
num_layers (`int`, *optional*, defaults to 12):
Number of layers.
embedding_dim (`int`, *optional*, defaults to 768):
Dimension of the embedding layer in encoder.
ffn_embedding_dim (`int`, *optional*, defaults to 768):
Dimension of the "intermediate" (often named feed-forward) layer in encoder.
num_attention_heads (`int`, *optional*, defaults to 32):
Number of attention heads in the encoder.
self_attention (`bool`, *optional*, defaults to `True`):
Model is self attentive (False not implemented).
activation_function (`str` or `function`, *optional*, defaults to `"gelu"`):
The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
`"relu"`, `"silu"` and `"gelu_new"` are supported.
dropout (`float`, *optional*, defaults to 0.1):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
attention_dropout (`float`, *optional*, defaults to 0.1):
The dropout probability for the attention weights.
activation_dropout (`float`, *optional*, defaults to 0.1):
The dropout probability for the activation of the linear transformer layer.
layerdrop (`float`, *optional*, defaults to 0.0):
The LayerDrop probability for the encoder. See the [LayerDrop paper](see https://huggingface.co/papers/1909.11556)
for more details.
bias (`bool`, *optional*, defaults to `True`):
Uses bias in the attention module - unsupported at the moment.
embed_scale(`float`, *optional*, defaults to None):
Scaling factor for the node embeddings.
num_trans_layers_to_freeze (`int`, *optional*, defaults to 0):
Number of transformer layers to freeze.
encoder_normalize_before (`bool`, *optional*, defaults to `False`):
Normalize features before encoding the graph.
pre_layernorm (`bool`, *optional*, defaults to `False`):
Apply layernorm before self attention and the feed forward network. Without this, post layernorm will be
used.
apply_graphormer_init (`bool`, *optional*, defaults to `False`):
Apply a custom graphormer initialisation to the model before training.
freeze_embeddings (`bool`, *optional*, defaults to `False`):
Freeze the embedding layer, or train it along the model.
encoder_normalize_before (`bool`, *optional*, defaults to `False`):
Apply the layer norm before each encoder block.
q_noise (`float`, *optional*, defaults to 0.0):
Amount of quantization noise (see "Training with Quantization Noise for Extreme Model Compression"). (For
more detail, see fairseq's documentation on quant_noise).
qn_block_size (`int`, *optional*, defaults to 8):
Size of the blocks for subsequent quantization with iPQ (see q_noise).
kdim (`int`, *optional*, defaults to None):
Dimension of the key in the attention, if different from the other values.
vdim (`int`, *optional*, defaults to None):
Dimension of the value in the attention, if different from the other values.
use_cache (`bool`, *optional*, defaults to `True`):
Whether or not the model should return the last key/values attentions (not used by all models).
traceable (`bool`, *optional*, defaults to `False`):
Changes return value of the encoder's inner_state to stacked tensors.
Example:
```python
>>> from transformers import GraphormerForGraphClassification, GraphormerConfig
>>> # Initializing a Graphormer graphormer-base-pcqm4mv2 style configuration
>>> configuration = GraphormerConfig()
>>> # Initializing a model from the graphormer-base-pcqm4mv1 style configuration
>>> model = GraphormerForGraphClassification(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```
"""
model_type = "graphormer"
keys_to_ignore_at_inference = ["past_key_values"]
def __init__(
self,
num_classes: int = 1,
num_atoms: int = 512 * 9,
num_edges: int = 512 * 3,
num_in_degree: int = 512,
num_out_degree: int = 512,
num_spatial: int = 512,
num_edge_dis: int = 128,
multi_hop_max_dist: int = 5, # sometimes is 20
spatial_pos_max: int = 1024,
edge_type: str = "multi_hop",
max_nodes: int = 512,
share_input_output_embed: bool = False,
num_hidden_layers: int = 12,
embedding_dim: int = 768,
ffn_embedding_dim: int = 768,
num_attention_heads: int = 32,
dropout: float = 0.1,
attention_dropout: float = 0.1,
activation_dropout: float = 0.1,
layerdrop: float = 0.0,
encoder_normalize_before: bool = False,
pre_layernorm: bool = False,
apply_graphormer_init: bool = False,
activation_fn: str = "gelu",
embed_scale: Optional[float] = None,
freeze_embeddings: bool = False,
num_trans_layers_to_freeze: int = 0,
traceable: bool = False,
q_noise: float = 0.0,
qn_block_size: int = 8,
kdim: Optional[int] = None,
vdim: Optional[int] = None,
bias: bool = True,
self_attention: bool = True,
pad_token_id=0,
bos_token_id=1,
eos_token_id=2,
**kwargs,
):
self.num_classes = num_classes
self.num_atoms = num_atoms
self.num_in_degree = num_in_degree
self.num_out_degree = num_out_degree
self.num_edges = num_edges
self.num_spatial = num_spatial
self.num_edge_dis = num_edge_dis
self.edge_type = edge_type
self.multi_hop_max_dist = multi_hop_max_dist
self.spatial_pos_max = spatial_pos_max
self.max_nodes = max_nodes
self.num_hidden_layers = num_hidden_layers
self.embedding_dim = embedding_dim
self.hidden_size = embedding_dim
self.ffn_embedding_dim = ffn_embedding_dim
self.num_attention_heads = num_attention_heads
self.dropout = dropout
self.attention_dropout = attention_dropout
self.activation_dropout = activation_dropout
self.layerdrop = layerdrop
self.encoder_normalize_before = encoder_normalize_before
self.pre_layernorm = pre_layernorm
self.apply_graphormer_init = apply_graphormer_init
self.activation_fn = activation_fn
self.embed_scale = embed_scale
self.freeze_embeddings = freeze_embeddings
self.num_trans_layers_to_freeze = num_trans_layers_to_freeze
self.share_input_output_embed = share_input_output_embed
self.traceable = traceable
self.q_noise = q_noise
self.qn_block_size = qn_block_size
# These parameters are here for future extensions
# atm, the model only supports self attention
self.kdim = kdim
self.vdim = vdim
self.self_attention = self_attention
self.bias = bias
super().__init__(
pad_token_id=pad_token_id,
bos_token_id=bos_token_id,
eos_token_id=eos_token_id,
**kwargs,
)
__all__ = ["GraphormerConfig"]

View File

@ -0,0 +1,895 @@
# coding=utf-8
# Copyright 2022 Microsoft, clefourrier The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""PyTorch Graphormer model."""
import math
from collections.abc import Iterable, Iterator
from typing import Optional, Union
import torch
import torch.nn as nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
from .... import initialization as init
from ....activations import ACT2FN
from ....modeling_outputs import (
BaseModelOutputWithNoAttention,
SequenceClassifierOutput,
)
from ....modeling_utils import PreTrainedModel
from ....utils import logging
from .configuration_graphormer import GraphormerConfig
logger = logging.get_logger(__name__)
_CHECKPOINT_FOR_DOC = "graphormer-base-pcqm4mv1"
_CONFIG_FOR_DOC = "GraphormerConfig"
def quant_noise(module: nn.Module, p: float, block_size: int):
"""
From:
https://github.com/facebookresearch/fairseq/blob/dd0079bde7f678b0cd0715cbd0ae68d661b7226d/fairseq/modules/quant_noise.py
Wraps modules and applies quantization noise to the weights for subsequent quantization with Iterative Product
Quantization as described in "Training with Quantization Noise for Extreme Model Compression"
Args:
- module: nn.Module
- p: amount of Quantization Noise
- block_size: size of the blocks for subsequent quantization with iPQ
Remarks:
- Module weights must have the right sizes wrt the block size
- Only Linear, Embedding and Conv2d modules are supported for the moment
- For more detail on how to quantize by blocks with convolutional weights, see "And the Bit Goes Down:
Revisiting the Quantization of Neural Networks"
- We implement the simplest form of noise here as stated in the paper which consists in randomly dropping
blocks
"""
# if no quantization noise, don't register hook
if p <= 0:
return module
# supported modules
if not isinstance(module, (nn.Linear, nn.Embedding, nn.Conv2d)):
raise NotImplementedError("Module unsupported for quant_noise.")
# test whether module.weight has the right sizes wrt block_size
is_conv = module.weight.ndim == 4
# 2D matrix
if not is_conv:
if module.weight.size(1) % block_size != 0:
raise AssertionError("Input features must be a multiple of block sizes")
# 4D matrix
else:
# 1x1 convolutions
if module.kernel_size == (1, 1):
if module.in_channels % block_size != 0:
raise AssertionError("Input channels must be a multiple of block sizes")
# regular convolutions
else:
k = module.kernel_size[0] * module.kernel_size[1]
if k % block_size != 0:
raise AssertionError("Kernel size must be a multiple of block size")
def _forward_pre_hook(mod, input):
# no noise for evaluation
if mod.training:
if not is_conv:
# gather weight and sizes
weight = mod.weight
in_features = weight.size(1)
out_features = weight.size(0)
# split weight matrix into blocks and randomly drop selected blocks
mask = torch.zeros(in_features // block_size * out_features, device=weight.device)
mask.bernoulli_(p)
mask = mask.repeat_interleave(block_size, -1).view(-1, in_features)
else:
# gather weight and sizes
weight = mod.weight
in_channels = mod.in_channels
out_channels = mod.out_channels
# split weight matrix into blocks and randomly drop selected blocks
if mod.kernel_size == (1, 1):
mask = torch.zeros(
int(in_channels // block_size * out_channels),
device=weight.device,
)
mask.bernoulli_(p)
mask = mask.repeat_interleave(block_size, -1).view(-1, in_channels)
else:
mask = torch.zeros(weight.size(0), weight.size(1), device=weight.device)
mask.bernoulli_(p)
mask = mask.unsqueeze(2).unsqueeze(3).repeat(1, 1, mod.kernel_size[0], mod.kernel_size[1])
# scale weights and apply mask
mask = mask.to(torch.bool) # x.bool() is not currently supported in TorchScript
s = 1 / (1 - p)
mod.weight.data = s * weight.masked_fill(mask, 0)
module.register_forward_pre_hook(_forward_pre_hook)
return module
class LayerDropModuleList(nn.ModuleList):
"""
From:
https://github.com/facebookresearch/fairseq/blob/dd0079bde7f678b0cd0715cbd0ae68d661b7226d/fairseq/modules/layer_drop.py
A LayerDrop implementation based on [`torch.nn.ModuleList`]. LayerDrop as described in
https://huggingface.co/papers/1909.11556.
We refresh the choice of which layers to drop every time we iterate over the LayerDropModuleList instance. During
evaluation we always iterate over all layers.
Usage:
```python
layers = LayerDropList(p=0.5, modules=[layer1, layer2, layer3])
for layer in layers: # this might iterate over layers 1 and 3
x = layer(x)
for layer in layers: # this might iterate over all layers
x = layer(x)
for layer in layers: # this might not iterate over any layers
x = layer(x)
```
Args:
p (float): probability of dropping out each layer
modules (iterable, optional): an iterable of modules to add
"""
def __init__(self, p: float, modules: Optional[Iterable[nn.Module]] = None):
super().__init__(modules)
self.p = p
def __iter__(self) -> Iterator[nn.Module]:
dropout_probs = torch.empty(len(self)).uniform_()
for i, m in enumerate(super().__iter__()):
if not self.training or (dropout_probs[i] > self.p):
yield m
class GraphormerGraphNodeFeature(nn.Module):
"""
Compute node features for each node in the graph.
"""
def __init__(self, config: GraphormerConfig):
super().__init__()
self.num_heads = config.num_attention_heads
self.num_atoms = config.num_atoms
self.atom_encoder = nn.Embedding(config.num_atoms + 1, config.hidden_size, padding_idx=config.pad_token_id)
self.in_degree_encoder = nn.Embedding(
config.num_in_degree, config.hidden_size, padding_idx=config.pad_token_id
)
self.out_degree_encoder = nn.Embedding(
config.num_out_degree, config.hidden_size, padding_idx=config.pad_token_id
)
self.graph_token = nn.Embedding(1, config.hidden_size)
def forward(
self,
input_nodes: torch.LongTensor,
in_degree: torch.LongTensor,
out_degree: torch.LongTensor,
) -> torch.Tensor:
n_graph, n_node = input_nodes.size()[:2]
node_feature = ( # node feature + graph token
self.atom_encoder(input_nodes).sum(dim=-2) # [n_graph, n_node, n_hidden]
+ self.in_degree_encoder(in_degree)
+ self.out_degree_encoder(out_degree)
)
graph_token_feature = self.graph_token.weight.unsqueeze(0).repeat(n_graph, 1, 1)
graph_node_feature = torch.cat([graph_token_feature, node_feature], dim=1)
return graph_node_feature
class GraphormerGraphAttnBias(nn.Module):
"""
Compute attention bias for each head.
"""
def __init__(self, config: GraphormerConfig):
super().__init__()
self.num_heads = config.num_attention_heads
self.multi_hop_max_dist = config.multi_hop_max_dist
# We do not change edge feature embedding learning, as edge embeddings are represented as a combination of the original features
# + shortest path
self.edge_encoder = nn.Embedding(config.num_edges + 1, config.num_attention_heads, padding_idx=0)
self.edge_type = config.edge_type
if self.edge_type == "multi_hop":
self.edge_dis_encoder = nn.Embedding(
config.num_edge_dis * config.num_attention_heads * config.num_attention_heads,
1,
)
self.spatial_pos_encoder = nn.Embedding(config.num_spatial, config.num_attention_heads, padding_idx=0)
self.graph_token_virtual_distance = nn.Embedding(1, config.num_attention_heads)
def forward(
self,
input_nodes: torch.LongTensor,
attn_bias: torch.Tensor,
spatial_pos: torch.LongTensor,
input_edges: torch.LongTensor,
attn_edge_type: torch.LongTensor,
) -> torch.Tensor:
n_graph, n_node = input_nodes.size()[:2]
graph_attn_bias = attn_bias.clone()
graph_attn_bias = graph_attn_bias.unsqueeze(1).repeat(
1, self.num_heads, 1, 1
) # [n_graph, n_head, n_node+1, n_node+1]
# spatial pos
# [n_graph, n_node, n_node, n_head] -> [n_graph, n_head, n_node, n_node]
spatial_pos_bias = self.spatial_pos_encoder(spatial_pos).permute(0, 3, 1, 2)
graph_attn_bias[:, :, 1:, 1:] = graph_attn_bias[:, :, 1:, 1:] + spatial_pos_bias
# reset spatial pos here
t = self.graph_token_virtual_distance.weight.view(1, self.num_heads, 1)
graph_attn_bias[:, :, 1:, 0] = graph_attn_bias[:, :, 1:, 0] + t
graph_attn_bias[:, :, 0, :] = graph_attn_bias[:, :, 0, :] + t
# edge feature
if self.edge_type == "multi_hop":
spatial_pos_ = spatial_pos.clone()
spatial_pos_[spatial_pos_ == 0] = 1 # set pad to 1
# set 1 to 1, input_nodes > 1 to input_nodes - 1
spatial_pos_ = torch.where(spatial_pos_ > 1, spatial_pos_ - 1, spatial_pos_)
if self.multi_hop_max_dist > 0:
spatial_pos_ = spatial_pos_.clamp(0, self.multi_hop_max_dist)
input_edges = input_edges[:, :, :, : self.multi_hop_max_dist, :]
# [n_graph, n_node, n_node, max_dist, n_head]
input_edges = self.edge_encoder(input_edges).mean(-2)
max_dist = input_edges.size(-2)
edge_input_flat = input_edges.permute(3, 0, 1, 2, 4).reshape(max_dist, -1, self.num_heads)
edge_input_flat = torch.bmm(
edge_input_flat,
self.edge_dis_encoder.weight.reshape(-1, self.num_heads, self.num_heads)[:max_dist, :, :],
)
input_edges = edge_input_flat.reshape(max_dist, n_graph, n_node, n_node, self.num_heads).permute(
1, 2, 3, 0, 4
)
input_edges = (input_edges.sum(-2) / (spatial_pos_.float().unsqueeze(-1))).permute(0, 3, 1, 2)
else:
# [n_graph, n_node, n_node, n_head] -> [n_graph, n_head, n_node, n_node]
input_edges = self.edge_encoder(attn_edge_type).mean(-2).permute(0, 3, 1, 2)
graph_attn_bias[:, :, 1:, 1:] = graph_attn_bias[:, :, 1:, 1:] + input_edges
graph_attn_bias = graph_attn_bias + attn_bias.unsqueeze(1) # reset
return graph_attn_bias
class GraphormerMultiheadAttention(nn.Module):
"""Multi-headed attention.
See "Attention Is All You Need" for more details.
"""
def __init__(self, config: GraphormerConfig):
super().__init__()
self.embedding_dim = config.embedding_dim
self.kdim = config.kdim if config.kdim is not None else config.embedding_dim
self.vdim = config.vdim if config.vdim is not None else config.embedding_dim
self.qkv_same_dim = self.kdim == config.embedding_dim and self.vdim == config.embedding_dim
self.num_heads = config.num_attention_heads
self.attention_dropout_module = torch.nn.Dropout(p=config.attention_dropout, inplace=False)
self.head_dim = config.embedding_dim // config.num_attention_heads
if not (self.head_dim * config.num_attention_heads == self.embedding_dim):
raise AssertionError("The embedding_dim must be divisible by num_heads.")
self.scaling = self.head_dim**-0.5
self.self_attention = True # config.self_attention
if not (self.self_attention):
raise NotImplementedError("The Graphormer model only supports self attention for now.")
if self.self_attention and not self.qkv_same_dim:
raise AssertionError("Self-attention requires query, key and value to be of the same size.")
self.k_proj = quant_noise(
nn.Linear(self.kdim, config.embedding_dim, bias=config.bias),
config.q_noise,
config.qn_block_size,
)
self.v_proj = quant_noise(
nn.Linear(self.vdim, config.embedding_dim, bias=config.bias),
config.q_noise,
config.qn_block_size,
)
self.q_proj = quant_noise(
nn.Linear(config.embedding_dim, config.embedding_dim, bias=config.bias),
config.q_noise,
config.qn_block_size,
)
self.out_proj = quant_noise(
nn.Linear(config.embedding_dim, config.embedding_dim, bias=config.bias),
config.q_noise,
config.qn_block_size,
)
self.onnx_trace = False
def reset_parameters(self):
if self.qkv_same_dim:
# Empirically observed the convergence to be much better with
# the scaled initialization
init.xavier_uniform_(self.k_proj.weight, gain=1 / math.sqrt(2))
init.xavier_uniform_(self.v_proj.weight, gain=1 / math.sqrt(2))
init.xavier_uniform_(self.q_proj.weight, gain=1 / math.sqrt(2))
else:
init.xavier_uniform_(self.k_proj.weight)
init.xavier_uniform_(self.v_proj.weight)
init.xavier_uniform_(self.q_proj.weight)
init.xavier_uniform_(self.out_proj.weight)
if self.out_proj.bias is not None:
init.constant_(self.out_proj.bias, 0.0)
def forward(
self,
query: torch.LongTensor,
key: Optional[torch.Tensor],
value: Optional[torch.Tensor],
attn_bias: Optional[torch.Tensor],
key_padding_mask: Optional[torch.Tensor] = None,
need_weights: bool = True,
attn_mask: Optional[torch.Tensor] = None,
before_softmax: bool = False,
need_head_weights: bool = False,
) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
"""
Args:
key_padding_mask (Bytetorch.Tensor, optional): mask to exclude
keys that are pads, of shape `(batch, src_len)`, where padding elements are indicated by 1s.
need_weights (bool, optional): return the attention weights,
averaged over heads (default: False).
attn_mask (Bytetorch.Tensor, optional): typically used to
implement causal attention, where the mask prevents the attention from looking forward in time
(default: None).
before_softmax (bool, optional): return the raw attention
weights and values before the attention softmax.
need_head_weights (bool, optional): return the attention
weights for each head. Implies *need_weights*. Default: return the average attention weights over all
heads.
"""
if need_head_weights:
need_weights = True
tgt_len, bsz, embedding_dim = query.size()
src_len = tgt_len
if not (embedding_dim == self.embedding_dim):
raise AssertionError(
f"The query embedding dimension {embedding_dim} is not equal to the expected embedding_dim"
f" {self.embedding_dim}."
)
if not (list(query.size()) == [tgt_len, bsz, embedding_dim]):
raise AssertionError("Query size incorrect in Graphormer, compared to model dimensions.")
if key is not None:
src_len, key_bsz, _ = key.size()
if not torch.jit.is_scripting():
if (key_bsz != bsz) or (value is None) or not (src_len, bsz == value.shape[:2]):
raise AssertionError(
"The batch shape does not match the key or value shapes provided to the attention."
)
q = self.q_proj(query)
k = self.k_proj(query)
v = self.v_proj(query)
q *= self.scaling
q = q.contiguous().view(tgt_len, bsz * self.num_heads, self.head_dim).transpose(0, 1)
if k is not None:
k = k.contiguous().view(-1, bsz * self.num_heads, self.head_dim).transpose(0, 1)
if v is not None:
v = v.contiguous().view(-1, bsz * self.num_heads, self.head_dim).transpose(0, 1)
if (k is None) or not (k.size(1) == src_len):
raise AssertionError("The shape of the key generated in the attention is incorrect")
# This is part of a workaround to get around fork/join parallelism
# not supporting Optional types.
if key_padding_mask is not None and key_padding_mask.dim() == 0:
key_padding_mask = None
if key_padding_mask is not None:
if key_padding_mask.size(0) != bsz or key_padding_mask.size(1) != src_len:
raise AssertionError(
"The shape of the generated padding mask for the key does not match expected dimensions."
)
attn_weights = torch.bmm(q, k.transpose(1, 2))
attn_weights = self.apply_sparse_mask(attn_weights, tgt_len, src_len, bsz)
if list(attn_weights.size()) != [bsz * self.num_heads, tgt_len, src_len]:
raise AssertionError("The attention weights generated do not match the expected dimensions.")
if attn_bias is not None:
attn_weights += attn_bias.view(bsz * self.num_heads, tgt_len, src_len)
if attn_mask is not None:
attn_mask = attn_mask.unsqueeze(0)
attn_weights += attn_mask
if key_padding_mask is not None:
# don't attend to padding symbols
attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
attn_weights = attn_weights.masked_fill(
key_padding_mask.unsqueeze(1).unsqueeze(2).to(torch.bool), float("-inf")
)
attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
if before_softmax:
return attn_weights, v
attn_weights_float = torch.nn.functional.softmax(attn_weights, dim=-1)
attn_weights = attn_weights_float.type_as(attn_weights)
attn_probs = self.attention_dropout_module(attn_weights)
if v is None:
raise AssertionError("No value generated")
attn = torch.bmm(attn_probs, v)
if list(attn.size()) != [bsz * self.num_heads, tgt_len, self.head_dim]:
raise AssertionError("The attention generated do not match the expected dimensions.")
attn = attn.transpose(0, 1).contiguous().view(tgt_len, bsz, embedding_dim)
attn: torch.Tensor = self.out_proj(attn)
attn_weights = None
if need_weights:
attn_weights = attn_weights_float.contiguous().view(bsz, self.num_heads, tgt_len, src_len).transpose(1, 0)
if not need_head_weights:
# average attention weights over heads
attn_weights = attn_weights.mean(dim=0)
return attn, attn_weights
def apply_sparse_mask(self, attn_weights: torch.Tensor, tgt_len: int, src_len: int, bsz: int) -> torch.Tensor:
return attn_weights
class GraphormerGraphEncoderLayer(nn.Module):
def __init__(self, config: GraphormerConfig) -> None:
super().__init__()
# Initialize parameters
self.embedding_dim = config.embedding_dim
self.num_attention_heads = config.num_attention_heads
self.q_noise = config.q_noise
self.qn_block_size = config.qn_block_size
self.pre_layernorm = config.pre_layernorm
self.dropout_module = torch.nn.Dropout(p=config.dropout, inplace=False)
self.activation_dropout_module = torch.nn.Dropout(p=config.activation_dropout, inplace=False)
# Initialize blocks
self.activation_fn = ACT2FN[config.activation_fn]
self.self_attn = GraphormerMultiheadAttention(config)
# layer norm associated with the self attention layer
self.self_attn_layer_norm = nn.LayerNorm(self.embedding_dim)
self.fc1 = self.build_fc(
self.embedding_dim,
config.ffn_embedding_dim,
q_noise=config.q_noise,
qn_block_size=config.qn_block_size,
)
self.fc2 = self.build_fc(
config.ffn_embedding_dim,
self.embedding_dim,
q_noise=config.q_noise,
qn_block_size=config.qn_block_size,
)
# layer norm associated with the position wise feed-forward NN
self.final_layer_norm = nn.LayerNorm(self.embedding_dim)
def build_fc(
self, input_dim: int, output_dim: int, q_noise: float, qn_block_size: int
) -> Union[nn.Module, nn.Linear, nn.Embedding, nn.Conv2d]:
return quant_noise(nn.Linear(input_dim, output_dim), q_noise, qn_block_size)
def forward(
self,
input_nodes: torch.Tensor,
self_attn_bias: Optional[torch.Tensor] = None,
self_attn_mask: Optional[torch.Tensor] = None,
self_attn_padding_mask: Optional[torch.Tensor] = None,
) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
"""
nn.LayerNorm is applied either before or after the self-attention/ffn modules similar to the original
Transformer implementation.
"""
residual = input_nodes
if self.pre_layernorm:
input_nodes = self.self_attn_layer_norm(input_nodes)
input_nodes, attn = self.self_attn(
query=input_nodes,
key=input_nodes,
value=input_nodes,
attn_bias=self_attn_bias,
key_padding_mask=self_attn_padding_mask,
need_weights=False,
attn_mask=self_attn_mask,
)
input_nodes = self.dropout_module(input_nodes)
input_nodes = residual + input_nodes
if not self.pre_layernorm:
input_nodes = self.self_attn_layer_norm(input_nodes)
residual = input_nodes
if self.pre_layernorm:
input_nodes = self.final_layer_norm(input_nodes)
input_nodes = self.activation_fn(self.fc1(input_nodes))
input_nodes = self.activation_dropout_module(input_nodes)
input_nodes = self.fc2(input_nodes)
input_nodes = self.dropout_module(input_nodes)
input_nodes = residual + input_nodes
if not self.pre_layernorm:
input_nodes = self.final_layer_norm(input_nodes)
return input_nodes, attn
class GraphormerGraphEncoder(nn.Module):
def __init__(self, config: GraphormerConfig):
super().__init__()
self.dropout_module = torch.nn.Dropout(p=config.dropout, inplace=False)
self.layerdrop = config.layerdrop
self.embedding_dim = config.embedding_dim
self.apply_graphormer_init = config.apply_graphormer_init
self.traceable = config.traceable
self.graph_node_feature = GraphormerGraphNodeFeature(config)
self.graph_attn_bias = GraphormerGraphAttnBias(config)
self.embed_scale = config.embed_scale
if config.q_noise > 0:
self.quant_noise = quant_noise(
nn.Linear(self.embedding_dim, self.embedding_dim, bias=False),
config.q_noise,
config.qn_block_size,
)
else:
self.quant_noise = None
if config.encoder_normalize_before:
self.emb_layer_norm = nn.LayerNorm(self.embedding_dim)
else:
self.emb_layer_norm = None
if config.pre_layernorm:
self.final_layer_norm = nn.LayerNorm(self.embedding_dim)
if self.layerdrop > 0.0:
self.layers = LayerDropModuleList(p=self.layerdrop)
else:
self.layers = nn.ModuleList([])
self.layers.extend([GraphormerGraphEncoderLayer(config) for _ in range(config.num_hidden_layers)])
# Apply initialization of model params after building the model
if config.freeze_embeddings:
raise NotImplementedError("Freezing embeddings is not implemented yet.")
for layer in range(config.num_trans_layers_to_freeze):
m = self.layers[layer]
if m is not None:
for p in m.parameters():
p.requires_grad = False
def forward(
self,
input_nodes: torch.LongTensor,
input_edges: torch.LongTensor,
attn_bias: torch.Tensor,
in_degree: torch.LongTensor,
out_degree: torch.LongTensor,
spatial_pos: torch.LongTensor,
attn_edge_type: torch.LongTensor,
perturb=None,
last_state_only: bool = False,
token_embeddings: Optional[torch.Tensor] = None,
attn_mask: Optional[torch.Tensor] = None,
) -> tuple[Union[torch.Tensor, list[torch.LongTensor]], torch.Tensor]:
# compute padding mask. This is needed for multi-head attention
data_x = input_nodes
n_graph, n_node = data_x.size()[:2]
padding_mask = (data_x[:, :, 0]).eq(0)
padding_mask_cls = torch.zeros(n_graph, 1, device=padding_mask.device, dtype=padding_mask.dtype)
padding_mask = torch.cat((padding_mask_cls, padding_mask), dim=1)
attn_bias = self.graph_attn_bias(input_nodes, attn_bias, spatial_pos, input_edges, attn_edge_type)
if token_embeddings is not None:
input_nodes = token_embeddings
else:
input_nodes = self.graph_node_feature(input_nodes, in_degree, out_degree)
if perturb is not None:
input_nodes[:, 1:, :] += perturb
if self.embed_scale is not None:
input_nodes = input_nodes * self.embed_scale
if self.quant_noise is not None:
input_nodes = self.quant_noise(input_nodes)
if self.emb_layer_norm is not None:
input_nodes = self.emb_layer_norm(input_nodes)
input_nodes = self.dropout_module(input_nodes)
input_nodes = input_nodes.transpose(0, 1)
inner_states = []
if not last_state_only:
inner_states.append(input_nodes)
for layer in self.layers:
input_nodes, _ = layer(
input_nodes,
self_attn_padding_mask=padding_mask,
self_attn_mask=attn_mask,
self_attn_bias=attn_bias,
)
if not last_state_only:
inner_states.append(input_nodes)
graph_rep = input_nodes[0, :, :]
if last_state_only:
inner_states = [input_nodes]
if self.traceable:
return torch.stack(inner_states), graph_rep
else:
return inner_states, graph_rep
class GraphormerDecoderHead(nn.Module):
def __init__(self, embedding_dim: int, num_classes: int):
super().__init__()
"""num_classes should be 1 for regression, or the number of classes for classification"""
self.lm_output_learned_bias = nn.Parameter(torch.zeros(1))
self.classifier = nn.Linear(embedding_dim, num_classes, bias=False)
self.num_classes = num_classes
def forward(self, input_nodes: torch.Tensor, **unused) -> torch.Tensor:
input_nodes = self.classifier(input_nodes)
input_nodes = input_nodes + self.lm_output_learned_bias
return input_nodes
class GraphormerPreTrainedModel(PreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
config: GraphormerConfig
base_model_prefix = "graphormer"
main_input_name_nodes = "input_nodes"
main_input_name_edges = "input_edges"
def init_graphormer_params(self, module: Union[nn.Linear, nn.Embedding, GraphormerMultiheadAttention]):
"""
Initialize the weights specific to the Graphormer Model.
"""
if isinstance(module, nn.Linear):
init.normal_(module.weight.data, mean=0.0, std=0.02)
if module.bias is not None:
init.zeros_(module.bias)
if isinstance(module, nn.Embedding):
init.normal_(module.weight.data, mean=0.0, std=0.02)
# Here we need the check explicitly, as we slice the weight in the `zeros_` call, so it looses the flag
if module.padding_idx is not None and not getattr(module.weight, "_is_hf_initialized", False):
init.zeros_(module.weight[module.padding_idx])
if isinstance(module, GraphormerMultiheadAttention):
init.normal_(module.q_proj.weight.data, mean=0.0, std=0.02)
init.normal_(module.k_proj.weight.data, mean=0.0, std=0.02)
init.normal_(module.v_proj.weight.data, mean=0.0, std=0.02)
@torch.no_grad()
def _init_weights(
self,
module: Union[
nn.Linear, nn.Conv2d, nn.Embedding, nn.LayerNorm, GraphormerMultiheadAttention, GraphormerGraphEncoder
],
):
"""
Initialize the weights
"""
super()._init_weights(module)
if isinstance(module, GraphormerMultiheadAttention):
init.normal_(module.q_proj.weight, mean=0.0, std=0.02)
init.normal_(module.k_proj.weight, mean=0.0, std=0.02)
init.normal_(module.v_proj.weight, mean=0.0, std=0.02)
module.reset_parameters()
elif isinstance(module, GraphormerGraphEncoder):
if module.apply_graphormer_init:
module.apply(self.init_graphormer_params)
class GraphormerModel(GraphormerPreTrainedModel):
"""The Graphormer model is a graph-encoder model.
It goes from a graph to its representation. If you want to use the model for a downstream classification task, use
GraphormerForGraphClassification instead. For any other downstream task, feel free to add a new class, or combine
this model with a downstream model of your choice, following the example in GraphormerForGraphClassification.
"""
def __init__(self, config: GraphormerConfig):
super().__init__(config)
self.max_nodes = config.max_nodes
self.graph_encoder = GraphormerGraphEncoder(config)
self.share_input_output_embed = config.share_input_output_embed
self.lm_output_learned_bias = None
# Remove head is set to true during fine-tuning
self.load_softmax = not getattr(config, "remove_head", False)
self.lm_head_transform_weight = nn.Linear(config.embedding_dim, config.embedding_dim)
self.activation_fn = ACT2FN[config.activation_fn]
self.layer_norm = nn.LayerNorm(config.embedding_dim)
self.post_init()
def reset_output_layer_parameters(self):
self.lm_output_learned_bias = nn.Parameter(torch.zeros(1))
def forward(
self,
input_nodes: torch.LongTensor,
input_edges: torch.LongTensor,
attn_bias: torch.Tensor,
in_degree: torch.LongTensor,
out_degree: torch.LongTensor,
spatial_pos: torch.LongTensor,
attn_edge_type: torch.LongTensor,
perturb: Optional[torch.FloatTensor] = None,
masked_tokens: None = None,
return_dict: Optional[bool] = None,
**unused,
) -> Union[tuple[torch.LongTensor], BaseModelOutputWithNoAttention]:
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
inner_states, graph_rep = self.graph_encoder(
input_nodes, input_edges, attn_bias, in_degree, out_degree, spatial_pos, attn_edge_type, perturb=perturb
)
# last inner state, then revert Batch and Graph len
input_nodes = inner_states[-1].transpose(0, 1)
# project masked tokens only
if masked_tokens is not None:
raise NotImplementedError
input_nodes = self.layer_norm(self.activation_fn(self.lm_head_transform_weight(input_nodes)))
# project back to size of vocabulary
if self.share_input_output_embed and hasattr(self.graph_encoder.embed_tokens, "weight"):
input_nodes = torch.nn.functional.linear(input_nodes, self.graph_encoder.embed_tokens.weight)
if not return_dict:
return tuple(x for x in [input_nodes, inner_states] if x is not None)
return BaseModelOutputWithNoAttention(last_hidden_state=input_nodes, hidden_states=inner_states)
def max_nodes(self):
"""Maximum output length supported by the encoder."""
return self.max_nodes
class GraphormerForGraphClassification(GraphormerPreTrainedModel):
"""
This model can be used for graph-level classification or regression tasks.
It can be trained on
- regression (by setting config.num_classes to 1); there should be one float-type label per graph
- one task classification (by setting config.num_classes to the number of classes); there should be one integer
label per graph
- binary multi-task classification (by setting config.num_classes to the number of labels); there should be a list
of integer labels for each graph.
"""
def __init__(self, config: GraphormerConfig):
super().__init__(config)
self.encoder = GraphormerModel(config)
self.embedding_dim = config.embedding_dim
self.num_classes = config.num_classes
self.classifier = GraphormerDecoderHead(self.embedding_dim, self.num_classes)
self.is_encoder_decoder = True
# Initialize weights and apply final processing
self.post_init()
def forward(
self,
input_nodes: torch.LongTensor,
input_edges: torch.LongTensor,
attn_bias: torch.Tensor,
in_degree: torch.LongTensor,
out_degree: torch.LongTensor,
spatial_pos: torch.LongTensor,
attn_edge_type: torch.LongTensor,
labels: Optional[torch.LongTensor] = None,
return_dict: Optional[bool] = None,
**unused,
) -> Union[tuple[torch.Tensor], SequenceClassifierOutput]:
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
encoder_outputs = self.encoder(
input_nodes,
input_edges,
attn_bias,
in_degree,
out_degree,
spatial_pos,
attn_edge_type,
return_dict=True,
)
outputs, hidden_states = encoder_outputs["last_hidden_state"], encoder_outputs["hidden_states"]
head_outputs = self.classifier(outputs)
logits = head_outputs[:, 0, :].contiguous()
loss = None
if labels is not None:
mask = ~torch.isnan(labels)
if self.num_classes == 1: # regression
loss_fct = MSELoss()
loss = loss_fct(logits[mask].squeeze(), labels[mask].squeeze().float())
elif self.num_classes > 1 and len(labels.shape) == 1: # One task classification
loss_fct = CrossEntropyLoss()
loss = loss_fct(logits[mask].view(-1, self.num_classes), labels[mask].view(-1))
else: # Binary multi-task classification
loss_fct = BCEWithLogitsLoss(reduction="sum")
loss = loss_fct(logits[mask], labels[mask])
if not return_dict:
return tuple(x for x in [loss, logits, hidden_states] if x is not None)
return SequenceClassifierOutput(loss=loss, logits=logits, hidden_states=hidden_states, attentions=None)
__all__ = ["GraphormerForGraphClassification", "GraphormerModel", "GraphormerPreTrainedModel"]

View File

@ -0,0 +1,28 @@
# Copyright 2022 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import TYPE_CHECKING
from ....utils import _LazyModule
from ....utils.import_utils import define_import_structure
if TYPE_CHECKING:
from .configuration_jukebox import *
from .modeling_jukebox import *
from .tokenization_jukebox import *
else:
import sys
_file = globals()["__file__"]
sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)

View File

@ -0,0 +1,599 @@
# coding=utf-8
# Copyright 2022 The OpenAI Team Authors and HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Jukebox configuration"""
import os
from typing import Union
from ....configuration_utils import PreTrainedConfig
from ....utils import logging
logger = logging.get_logger(__name__)
_LARGE_ATTENTION = [
"block_attn",
"transpose_block_attn",
"prev_block_attn",
"block_attn",
"transpose_block_attn",
"prev_block_attn",
"block_attn",
"transpose_block_attn",
"prev_block_attn",
"block_attn",
"transpose_block_attn",
"prev_block_attn",
"block_attn",
"transpose_block_attn",
"prev_block_attn",
"block_attn",
"transpose_block_attn",
"prev_block_attn",
"cross_attention",
"block_attn",
"transpose_block_attn",
"prev_block_attn",
"block_attn",
"transpose_block_attn",
"prev_block_attn",
"block_attn",
"transpose_block_attn",
"prev_block_attn",
"cross_attention",
"block_attn",
"transpose_block_attn",
"prev_block_attn",
"block_attn",
"transpose_block_attn",
"prev_block_attn",
"block_attn",
"transpose_block_attn",
"prev_block_attn",
"cross_attention",
"block_attn",
"transpose_block_attn",
"prev_block_attn",
"block_attn",
"transpose_block_attn",
"prev_block_attn",
"block_attn",
"transpose_block_attn",
"prev_block_attn",
"cross_attention",
"block_attn",
"transpose_block_attn",
"prev_block_attn",
"block_attn",
"transpose_block_attn",
"prev_block_attn",
"block_attn",
"transpose_block_attn",
"prev_block_attn",
"cross_attention",
"block_attn",
"transpose_block_attn",
"prev_block_attn",
"block_attn",
"transpose_block_attn",
"prev_block_attn",
"block_attn",
"transpose_block_attn",
"prev_block_attn",
"cross_attention",
"block_attn",
"transpose_block_attn",
"prev_block_attn",
"block_attn",
"transpose_block_attn",
"prev_block_attn",
"block_attn",
"transpose_block_attn",
"prev_block_attn",
"cross_attention",
]
_RawColumnPreviousRowAttention = ["block_attn", "transpose_block_attn", "prev_block_attn"]
_FullDenseAttention = ["dense_attention"]
_PrimePrimeDenseAttention = ["prime_attn", "prime_attn", "dense_attn"]
def full_dense_attention(layer):
return _FullDenseAttention[0]
def raw_column_previous_row_attention(layer):
return _RawColumnPreviousRowAttention[layer % 3]
def large_separated_enc_dec_w_lyrics(layer):
return _LARGE_ATTENTION[layer % 79]
def enc_dec_with_lyrics(layer):
if layer % 16 == 15:
return _PrimePrimeDenseAttention[layer % 3]
return _RawColumnPreviousRowAttention[layer % 3]
ATTENTION_PATTERNS = {
"full_dense_attention": full_dense_attention,
"raw_column_previous_row_attention": raw_column_previous_row_attention, # Alternate row, column and previous row attn
"large_separated_enc_dec_w_lyrics": large_separated_enc_dec_w_lyrics, # Used by large separated_enc_dec model with lyrics
"enc_dec_with_lyrics": enc_dec_with_lyrics, # Used by encoder_decoder model with lyrics
}
class JukeboxPriorConfig(PreTrainedConfig):
"""
This is the configuration class to store the configuration of a [`JukeboxPrior`]. It is used to instantiate a
`JukeboxPrior` according to the specified arguments, defining the model architecture. Instantiating a
configuration with the defaults will yield a similar configuration to that of the top level prior from the
[openai/jukebox-1b-lyrics](https://huggingface.co/openai/jukebox
-1b-lyrics) architecture.
Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PreTrainedConfig`] for more information.
Args:
act_fn (`str`, *optional*, defaults to `"quick_gelu"`):
Activation function.
alignment_head (`int`, *optional*, defaults to 2):
Head that is responsible of the alignment between lyrics and music. Only used to compute the lyric to audio
alignment
alignment_layer (`int`, *optional*, defaults to 68):
Index of the layer that is responsible of the alignment between lyrics and music. Only used to compute the
lyric to audio alignment
attention_multiplier (`float`, *optional*, defaults to 0.25):
Multiplier coefficient used to define the hidden dimension of the attention layers. 0.25 means that
0.25*width of the model will be used.
attention_pattern (`str`, *optional*, defaults to `"enc_dec_with_lyrics"`):
Which attention pattern to use for the decoder/
attn_dropout (`int`, *optional*, defaults to 0):
Dropout probability for the post-attention layer dropout in the decoder.
attn_res_scale (`bool`, *optional*, defaults to `False`):
Whether or not to scale the residuals in the attention conditioner block.
blocks (`int`, *optional*, defaults to 64):
Number of blocks used in the `block_attn`. A sequence of length seq_len is factored as `[blocks, seq_len //
blocks]` in the `JukeboxAttention` layer.
conv_res_scale (`int`, *optional*):
Whether or not to scale the residuals in the conditioner block. Since the top level prior does not have a
conditioner, the default value is to None and should not be modified.
num_layers (`int`, *optional*, defaults to 72):
Number of layers of the transformer architecture.
emb_dropout (`int`, *optional*, defaults to 0):
Embedding dropout used in the lyric decoder.
encoder_config (`JukeboxPriorConfig`, *optional*) :
Configuration of the encoder which models the prior on the lyrics.
encoder_loss_fraction (`float`, *optional*, defaults to 0.4):
Multiplication factor used in front of the lyric encoder loss.
hidden_size (`int`, *optional*, defaults to 2048):
Hidden dimension of the attention layers.
init_scale (`float`, *optional*, defaults to 0.2):
Initialization scales for the prior modules.
is_encoder_decoder (`bool`, *optional*, defaults to `True`):
Whether or not the prior is an encoder-decoder model. In case it is not, and `nb_relevant_lyric_tokens` is
greater than 0, the `encoder` args should be specified for the lyric encoding.
mask (`bool`, *optional*, defaults to `False`):
Whether or not to mask the previous positions in the attention.
max_duration (`int`, *optional*, defaults to 600):
Maximum supported duration of the generated song in seconds.
max_nb_genres (`int`, *optional*, defaults to 1):
Maximum number of genres that can be used to condition the model.
merged_decoder (`bool`, *optional*, defaults to `True`):
Whether or not the decoder and the encoder inputs are merged. This is used for the separated
encoder-decoder architecture
metadata_conditioning (`bool`, *optional*, defaults to `True)`:
Whether or not to condition on the artist and genre metadata.
metadata_dims (`List[int]`, *optional*, defaults to `[604, 7898]`):
Number of genres and the number of artists that were used to train the embedding layers of the prior
models.
min_duration (`int`, *optional*, defaults to 0):
Minimum duration of the generated audio on which the model was trained.
mlp_multiplier (`float`, *optional*, defaults to 1.0):
Multiplier coefficient used to define the hidden dimension of the MLP layers. 0.25 means that 0.25*width of
the model will be used.
music_vocab_size (`int`, *optional*, defaults to 2048):
Number of different music tokens. Should be similar to the `JukeboxVQVAEConfig.nb_discrete_codes`.
n_ctx (`int`, *optional*, defaults to 6144):
Number of context tokens for each prior. The context tokens are the music tokens that are attended to when
generating music tokens.
n_heads (`int`, *optional*, defaults to 2):
Number of attention heads.
nb_relevant_lyric_tokens (`int`, *optional*, defaults to 384):
Number of lyric tokens that are used when sampling a single window of length `n_ctx`
res_conv_depth (`int`, *optional*, defaults to 3):
Depth of the `JukeboxDecoderConvBock` used to upsample the previously sampled audio in the
`JukeboxMusicTokenConditioner`.
res_conv_width (`int`, *optional*, defaults to 128):
Width of the `JukeboxDecoderConvBock` used to upsample the previously sampled audio in the
`JukeboxMusicTokenConditioner`.
res_convolution_multiplier (`int`, *optional*, defaults to 1):
Multiplier used to scale the `hidden_dim` of the `JukeboxResConv1DBlock`.
res_dilation_cycle (`int`, *optional*):
Dilation cycle used to define the `JukeboxMusicTokenConditioner`. Usually similar to the ones used in the
corresponding level of the VQVAE. The first prior does not use it as it is not conditioned on upper level
tokens.
res_dilation_growth_rate (`int`, *optional*, defaults to 1):
Dilation grow rate used between each convolutionnal block of the `JukeboxMusicTokenConditioner`
res_downs_t (`List[int]`, *optional*, defaults to `[3, 2, 2]`):
Downsampling rates used in the audio conditioning network
res_strides_t (`List[int]`, *optional*, defaults to `[2, 2, 2]`):
Striding used in the audio conditioning network
resid_dropout (`int`, *optional*, defaults to 0):
Residual dropout used in the attention pattern.
sampling_rate (`int`, *optional*, defaults to 44100):
Sampling rate used for training.
spread (`int`, *optional*):
Spread used in the `summary_spread_attention` pattern
timing_dims (`int`, *optional*, defaults to 64):
Dimension of the timing embedding.
zero_out (`bool`, *optional*, defaults to `False`):
Whether or not to zero out convolution weights when initializing.
"""
model_type = "jukebox_prior"
attribute_map = {
"max_position_embeddings": "n_positions",
"num_attention_heads": "n_head",
}
def __init__(
self,
act_fn="quick_gelu",
level=0,
alignment_head=2,
alignment_layer=68,
attention_multiplier=0.25,
attention_pattern="enc_dec_with_lyrics",
attn_dropout=0,
attn_res_scale=False,
blocks=64,
conv_res_scale=None,
num_layers=72,
emb_dropout=0,
encoder_config=None,
encoder_loss_fraction=0.4,
hidden_size=2048,
init_scale=0.2,
is_encoder_decoder=True,
lyric_vocab_size=80,
mask=False,
max_duration=600,
max_nb_genres=1,
merged_decoder=True,
metadata_conditioning=True,
metadata_dims=[604, 7898],
min_duration=0,
mlp_multiplier=1.0,
music_vocab_size=2048,
n_ctx=6144,
n_heads=2,
nb_relevant_lyric_tokens=384,
res_conv_depth=3,
res_conv_width=128,
res_convolution_multiplier=1,
res_dilation_cycle=None,
res_dilation_growth_rate=1,
res_downs_t=[3, 2, 2],
res_strides_t=[2, 2, 2],
resid_dropout=0,
sampling_rate=44100,
spread=None,
timing_dims=64,
zero_out=False,
**kwargs,
):
super().__init__(**kwargs)
self.act_fn = act_fn
self.alignment_head = alignment_head
self.alignment_layer = alignment_layer
self.attention_multiplier = attention_multiplier
self.attention_pattern = attention_pattern
self.attn_dropout = attn_dropout
self.attn_res_scale = attn_res_scale
self.blocks = blocks
self.conv_res_scale = conv_res_scale
self.num_layers = num_layers
self.emb_dropout = emb_dropout
self.music_vocab_size = music_vocab_size
if encoder_config is not None:
self.encoder_config = JukeboxPriorConfig(**encoder_config)
else:
self.encoder_config = None
self.encoder_loss_fraction = encoder_loss_fraction
self.init_scale = init_scale
self.is_encoder_decoder = is_encoder_decoder
self.lyric_vocab_size = lyric_vocab_size
self.level = level
self.mask = mask
self.max_duration = max_duration
self.max_nb_genres = max_nb_genres
self.merged_decoder = merged_decoder
self.metadata_conditioning = metadata_conditioning
self.metadata_dims = metadata_dims
self.min_duration = min_duration
self.mlp_multiplier = mlp_multiplier
self.n_ctx = n_ctx
self.n_heads = n_heads
self.nb_relevant_lyric_tokens = nb_relevant_lyric_tokens
self.res_conv_depth = res_conv_depth
self.res_conv_width = res_conv_width
self.res_convolution_multiplier = res_convolution_multiplier
self.res_dilation_cycle = res_dilation_cycle
self.res_dilation_growth_rate = res_dilation_growth_rate
self.res_downs_t = res_downs_t
self.res_strides_t = res_strides_t
self.resid_dropout = resid_dropout
self.sampling_rate = sampling_rate
self.spread = spread
self.timing_dims = timing_dims
self.hidden_size = hidden_size
self.zero_out = zero_out
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], level=0, **kwargs):
config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
# get the prior config dict if we are loading from JukeboxConfig
if config_dict.get("model_type") == "jukebox":
config_dict = config_dict[f"prior_{level}"]
if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
logger.warning(
f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
)
return cls.from_dict(config_dict, **kwargs)
class JukeboxVQVAEConfig(PreTrainedConfig):
"""
This is the configuration class to store the configuration of a [`JukeboxVQVAE`]. It is used to instantiate a
`JukeboxVQVAE` according to the specified arguments, defining the model architecture. Instantiating a configuration
with the defaults will yield a similar configuration to that of the VQVAE from
[openai/jukebox-1b-lyrics](https://huggingface.co/openai/jukebox-1b-lyrics) architecture.
Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PreTrainedConfig`] for more information.
Args:
act_fn (`str`, *optional*, defaults to `"relu"`):
Activation function of the model.
nb_discrete_codes (`int`, *optional*, defaults to 2048):
Number of codes of the VQVAE.
commit (`float`, *optional*, defaults to 0.02):
Commit loss multiplier.
conv_input_shape (`int`, *optional*, defaults to 1):
Number of audio channels.
conv_res_scale (`bool`, *optional*, defaults to `False`):
Whether or not to scale the residuals of the `JukeboxResConv1DBlock`.
embed_dim (`int`, *optional*, defaults to 64):
Embedding dimension of the codebook vectors.
hop_fraction (`List[int]`, *optional*, defaults to `[0.125, 0.5, 0.5]`):
Fraction of non-intersecting window used when continuing the sampling process.
levels (`int`, *optional*, defaults to 3):
Number of hierarchical levels that used in the VQVAE.
lmu (`float`, *optional*, defaults to 0.99):
Used in the codebook update, exponential moving average coefficient. For more detail refer to Appendix A.1
of the original [VQVAE paper](https://huggingface.co/papers/1711.00937v2.pdf)
multipliers (`List[int]`, *optional*, defaults to `[2, 1, 1]`):
Depth and width multipliers used for each level. Used on the `res_conv_width` and `res_conv_depth`
res_conv_depth (`int`, *optional*, defaults to 4):
Depth of the encoder and decoder block. If no `multipliers` are used, this is the same for each level.
res_conv_width (`int`, *optional*, defaults to 32):
Width of the encoder and decoder block. If no `multipliers` are used, this is the same for each level.
res_convolution_multiplier (`int`, *optional*, defaults to 1):
Scaling factor of the hidden dimension used in the `JukeboxResConv1DBlock`.
res_dilation_cycle (`int`, *optional*):
Dilation cycle value used in the `JukeboxResnet`. If an int is used, each new Conv1 block will have a depth
reduced by a power of `res_dilation_cycle`.
res_dilation_growth_rate (`int`, *optional*, defaults to 3):
Resnet dilation growth rate used in the VQVAE (dilation_growth_rate ** depth)
res_downs_t (`List[int]`, *optional*, defaults to `[3, 2, 2]`):
Downsampling rate for each level of the hierarchical VQ-VAE.
res_strides_t (`List[int]`, *optional*, defaults to `[2, 2, 2]`):
Stride used for each level of the hierarchical VQ-VAE.
sample_length (`int`, *optional*, defaults to 1058304):
Provides the max input shape of the VQVAE. Is used to compute the input shape of each level.
init_scale (`float`, *optional*, defaults to 0.2):
Initialization scale.
zero_out (`bool`, *optional*, defaults to `False`):
Whether or not to zero out convolution weights when initializing.
"""
model_type = "jukebox_vqvae"
def __init__(
self,
act_fn="relu",
nb_discrete_codes=2048,
commit=0.02,
conv_input_shape=1,
conv_res_scale=False,
embed_dim=64,
hop_fraction=[0.125, 0.5, 0.5],
levels=3,
lmu=0.99,
multipliers=[2, 1, 1],
res_conv_depth=4,
res_conv_width=32,
res_convolution_multiplier=1,
res_dilation_cycle=None,
res_dilation_growth_rate=3,
res_downs_t=[3, 2, 2],
res_strides_t=[2, 2, 2],
sample_length=1058304,
init_scale=0.2,
zero_out=False,
**kwargs,
):
super().__init__(**kwargs)
self.hop_fraction = hop_fraction
self.conv_input_shape = conv_input_shape
self.sample_length = sample_length
# VQVAE parameters (all used)
self.levels = levels
self.embed_dim = embed_dim
self.nb_discrete_codes = nb_discrete_codes
self.res_conv_width = res_conv_width
self.res_conv_depth = res_conv_depth
self.res_convolution_multiplier = res_convolution_multiplier
self.res_dilation_growth_rate = res_dilation_growth_rate
self.res_dilation_cycle = res_dilation_cycle
self.multipliers = multipliers
self.res_downs_t = res_downs_t
self.res_strides_t = res_strides_t
self.lmu = lmu
self.commit = commit
self.conv_res_scale = conv_res_scale
self.act_fn = act_fn
self.init_scale = init_scale
self.zero_out = zero_out
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs):
config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
# get the text config dict if we are loading from CLIPConfig
if config_dict.get("model_type") == "jukebox":
config_dict = config_dict["vqvae_config"]
if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
logger.warning(
f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
)
return cls.from_dict(config_dict, **kwargs)
class JukeboxConfig(PreTrainedConfig):
"""
This is the configuration class to store the configuration of a [`JukeboxModel`].
Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PreTrainedConfig`] for more information. Instantiating a configuration with the defaults will
yield a similar configuration to that of
[openai/jukebox-1b-lyrics](https://huggingface.co/openai/jukebox-1b-lyrics) architecture.
The downsampling and stride are used to determine downsampling of the input sequence. For example, downsampling =
(5,3), and strides = (2, 2) will downsample the audio by 2^5 = 32 to get the first level of codes, and 2**8 = 256
to get the second level codes. This is mostly true for training the top level prior and the upsamplers.
Args:
vqvae_config (`JukeboxVQVAEConfig`, *optional*):
Configuration for the `JukeboxVQVAE` model.
prior_config_list (`List[JukeboxPriorConfig]`, *optional*):
List of the configs for each of the `JukeboxPrior` of the model. The original architecture uses 3 priors.
nb_priors (`int`, *optional*, defaults to 3):
Number of prior models that will sequentially sample tokens. Each prior is conditional auto regressive
(decoder) model, apart from the top prior, which can include a lyric encoder. The available models were
trained using a top prior and 2 upsampler priors.
sampling_rate (`int`, *optional*, defaults to 44100):
Sampling rate of the raw audio.
timing_dims (`int`, *optional*, defaults to 64):
Dimensions of the JukeboxRangeEmbedding layer which is equivalent to traditional positional embedding
layer. The timing embedding layer converts the absolute and relative position in the currently sampled
audio to a tensor of length `timing_dims` that will be added to the music tokens.
min_duration (`int`, *optional*, defaults to 0):
Minimum duration of the audios to generate
max_duration (`float`, *optional*, defaults to 600.0):
Maximum duration of the audios to generate
max_nb_genres (`int`, *optional*, defaults to 5):
Maximum number of genres that can be used to condition a single sample.
metadata_conditioning (`bool`, *optional*, defaults to `True`):
Whether or not to use metadata conditioning, corresponding to the artist, the genre and the min/maximum
duration.
Example:
```python
>>> from transformers import JukeboxModel, JukeboxConfig
>>> # Initializing a Jukebox configuration
>>> configuration = JukeboxConfig()
>>> # Initializing a model from the configuration
>>> model = JukeboxModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```
"""
model_type = "jukebox"
def __init__(
self,
vqvae_config=None,
prior_config_list=None,
nb_priors=3,
sampling_rate=44100,
timing_dims=64,
min_duration=0,
max_duration=600.0,
max_nb_genres=5,
metadata_conditioning=True,
**kwargs,
):
if vqvae_config is None:
vqvae_config = JukeboxVQVAEConfig()
logger.info("vqvae_config is None. initializing the JukeboxVQVAE with default values.")
elif isinstance(vqvae_config, dict):
vqvae_config = JukeboxVQVAEConfig(**vqvae_config)
self.vqvae_config = vqvae_config
if prior_config_list is not None and isinstance(prior_config_list[0], dict):
prior_configs = [JukeboxPriorConfig(**prior_config) for prior_config in prior_config_list]
elif prior_config_list is None:
prior_configs = []
for prior_idx in range(nb_priors):
prior_config = kwargs.pop(f"prior_{prior_idx}", None)
if prior_config is None:
prior_config = {}
logger.info(
f"prior_{prior_idx}'s config is None. Initializing the JukeboxPriorConfig list with default"
" values."
)
prior_configs.append(JukeboxPriorConfig(**prior_config))
self.prior_configs = prior_configs
self.hop_fraction = self.vqvae_config.hop_fraction
self.nb_priors = nb_priors
# Metadata conditioning
self.max_nb_genres = max_nb_genres
self.sampling_rate = sampling_rate
self.timing_dims = timing_dims
self.min_duration = min_duration
self.max_duration = max_duration
self.metadata_conditioning = metadata_conditioning
super().__init__(**kwargs)
def to_dict(self):
# Override the default to_dict to apply to_dict to the list of prior configs.
result = super().to_dict()
result["prior_config_list"] = [config.to_dict() for config in result.pop("prior_configs")]
return result
__all__ = ["JukeboxConfig", "JukeboxPriorConfig", "JukeboxVQVAEConfig"]

View File

@ -0,0 +1,279 @@
# coding=utf-8
# Copyright 2022 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Convert Jukebox checkpoints"""
import argparse
import json
import os
from pathlib import Path
import requests
import torch
from transformers import JukeboxConfig, JukeboxModel
from transformers.utils import logging
logging.set_verbosity_info()
logger = logging.get_logger(__name__)
PREFIX = "https://openaipublic.azureedge.net/jukebox/models/"
MODEL_MAPPING = {
"jukebox-1b-lyrics": [
"5b/vqvae.pth.tar",
"5b/prior_level_0.pth.tar",
"5b/prior_level_1.pth.tar",
"1b_lyrics/prior_level_2.pth.tar",
],
"jukebox-5b-lyrics": [
"5b/vqvae.pth.tar",
"5b/prior_level_0.pth.tar",
"5b/prior_level_1.pth.tar",
"5b_lyrics/prior_level_2.pth.tar",
],
}
def replace_key(key):
if key.endswith(".model.1.bias") and len(key.split(".")) > 10:
key = key.replace(".model.1.bias", ".conv1d_1.bias")
elif key.endswith(".model.1.weight") and len(key.split(".")) > 10:
key = key.replace(".model.1.weight", ".conv1d_1.weight")
elif key.endswith(".model.3.bias") and len(key.split(".")) > 10:
key = key.replace(".model.3.bias", ".conv1d_2.bias")
elif key.endswith(".model.3.weight") and len(key.split(".")) > 10:
key = key.replace(".model.3.weight", ".conv1d_2.weight")
if "conditioner_blocks.0." in key:
key = key.replace("conditioner_blocks.0", "conditioner_blocks")
if "prime_prior" in key:
key = key.replace("prime_prior", "encoder")
if ".emb." in key and "total" not in key and "absolute" not in key and "relative" not in key:
key = key.replace(".emb.", ".")
if key.endswith("k"): # replace vqvae.X.k with vqvae.X.codebook
return key.replace(".k", ".codebook")
if "y_emb." in key:
return key.replace("y_emb.", "metadata_embedding.")
if "x_emb.emb." in key:
key = key.replace("0.x_emb.emb", "embed_tokens")
if "prime_state_ln" in key:
return key.replace("prime_state_ln", "encoder.final_layer_norm")
if ".ln" in key:
return key.replace(".ln", ".layer_norm")
if "_ln" in key:
return key.replace("_ln", "_layer_norm")
if "prime_state_proj" in key:
return key.replace("prime_state_proj", "encoder.proj_in")
if "prime_x_out" in key:
return key.replace("prime_x_out", "encoder.lm_head")
if "prior.x_out" in key:
return key.replace("x_out", "fc_proj_out")
if "x_emb" in key:
return key.replace("x_emb", "embed_tokens")
return key
def fix_jukebox_keys(state_dict, model_state_dict, key_prefix, mapping):
new_dict = {}
import re
re_encoder_block_conv_in = re.compile(r"encoders.(\d*).level_blocks.(\d*).model.(\d*).(\d).(bias|weight)")
re_encoder_block_resnet = re.compile(
r"encoders.(\d*).level_blocks.(\d*).model.(\d*).(\d).model.(\d*).model.(\d*).(bias|weight)"
)
re_encoder_block_proj_out = re.compile(r"encoders.(\d*).level_blocks.(\d*).model.(\d*).(bias|weight)")
re_decoder_block_conv_out = re.compile(r"decoders.(\d*).level_blocks.(\d*).model.(\d*).(\d).(bias|weight)")
re_decoder_block_resnet = re.compile(
r"decoders.(\d*).level_blocks.(\d*).model.(\d*).(\d).model.(\d*).model.(\d*).(bias|weight)"
)
re_decoder_block_proj_in = re.compile(r"decoders.(\d*).level_blocks.(\d*).model.(\d*).(bias|weight)")
re_prior_cond_conv_out = re.compile(r"conditioner_blocks.(\d*).cond.model.(\d*).(\d).(bias|weight)")
re_prior_cond_resnet = re.compile(
r"conditioner_blocks.(\d*).cond.model.(\d*).(\d).model.(\d*).model.(\d*).(bias|weight)"
)
re_prior_cond_proj_in = re.compile(r"conditioner_blocks.(\d*).cond.model.(\d*).(bias|weight)")
for original_key, value in state_dict.items():
# rename vqvae.encoder keys
if re_encoder_block_conv_in.fullmatch(original_key):
regex_match = re_encoder_block_conv_in.match(original_key)
groups = regex_match.groups()
block_index = int(groups[2]) * 2 + int(groups[3])
re_new_key = f"encoders.{groups[0]}.level_blocks.{groups[1]}.downsample_block.{block_index}.{groups[-1]}"
key = re_encoder_block_conv_in.sub(re_new_key, original_key)
elif re_encoder_block_resnet.fullmatch(original_key):
regex_match = re_encoder_block_resnet.match(original_key)
groups = regex_match.groups()
block_index = int(groups[2]) * 2 + int(groups[3])
conv_index = {"1": 1, "3": 2}[groups[-2]]
prefix = f"encoders.{groups[0]}.level_blocks.{groups[1]}.downsample_block.{block_index}."
resnet_block = f"resnet_block.{groups[-3]}.conv1d_{conv_index}.{groups[-1]}"
re_new_key = prefix + resnet_block
key = re_encoder_block_resnet.sub(re_new_key, original_key)
elif re_encoder_block_proj_out.fullmatch(original_key):
regex_match = re_encoder_block_proj_out.match(original_key)
groups = regex_match.groups()
re_new_key = f"encoders.{groups[0]}.level_blocks.{groups[1]}.proj_out.{groups[-1]}"
key = re_encoder_block_proj_out.sub(re_new_key, original_key)
# rename vqvae.decoder keys
elif re_decoder_block_conv_out.fullmatch(original_key):
regex_match = re_decoder_block_conv_out.match(original_key)
groups = regex_match.groups()
block_index = int(groups[2]) * 2 + int(groups[3]) - 2
re_new_key = f"decoders.{groups[0]}.level_blocks.{groups[1]}.upsample_block.{block_index}.{groups[-1]}"
key = re_decoder_block_conv_out.sub(re_new_key, original_key)
elif re_decoder_block_resnet.fullmatch(original_key):
regex_match = re_decoder_block_resnet.match(original_key)
groups = regex_match.groups()
block_index = int(groups[2]) * 2 + int(groups[3]) - 2
conv_index = {"1": 1, "3": 2}[groups[-2]]
prefix = f"decoders.{groups[0]}.level_blocks.{groups[1]}.upsample_block.{block_index}."
resnet_block = f"resnet_block.{groups[-3]}.conv1d_{conv_index}.{groups[-1]}"
re_new_key = prefix + resnet_block
key = re_decoder_block_resnet.sub(re_new_key, original_key)
elif re_decoder_block_proj_in.fullmatch(original_key):
regex_match = re_decoder_block_proj_in.match(original_key)
groups = regex_match.groups()
re_new_key = f"decoders.{groups[0]}.level_blocks.{groups[1]}.proj_in.{groups[-1]}"
key = re_decoder_block_proj_in.sub(re_new_key, original_key)
# rename prior cond.model to upsampler.upsample_block and resnet
elif re_prior_cond_conv_out.fullmatch(original_key):
regex_match = re_prior_cond_conv_out.match(original_key)
groups = regex_match.groups()
block_index = int(groups[1]) * 2 + int(groups[2]) - 2
re_new_key = f"conditioner_blocks.upsampler.upsample_block.{block_index}.{groups[-1]}"
key = re_prior_cond_conv_out.sub(re_new_key, original_key)
elif re_prior_cond_resnet.fullmatch(original_key):
regex_match = re_prior_cond_resnet.match(original_key)
groups = regex_match.groups()
block_index = int(groups[1]) * 2 + int(groups[2]) - 2
conv_index = {"1": 1, "3": 2}[groups[-2]]
prefix = f"conditioner_blocks.upsampler.upsample_block.{block_index}."
resnet_block = f"resnet_block.{groups[-3]}.conv1d_{conv_index}.{groups[-1]}"
re_new_key = prefix + resnet_block
key = re_prior_cond_resnet.sub(re_new_key, original_key)
elif re_prior_cond_proj_in.fullmatch(original_key):
regex_match = re_prior_cond_proj_in.match(original_key)
groups = regex_match.groups()
re_new_key = f"conditioner_blocks.upsampler.proj_in.{groups[-1]}"
key = re_prior_cond_proj_in.sub(re_new_key, original_key)
# keep original key
else:
key = original_key
key = replace_key(key)
if f"{key_prefix}.{key}" not in model_state_dict or key is None:
print(f"failed converting {original_key} to {key}, does not match")
# handle mismatched shape
elif value.shape != model_state_dict[f"{key_prefix}.{key}"].shape:
val = model_state_dict[f"{key_prefix}.{key}"]
print(f"{original_key}-> {key} : \nshape {val.shape} and {value.shape}, do not match")
key = original_key
mapping[key] = original_key
new_dict[key] = value
return new_dict
@torch.no_grad()
def convert_openai_checkpoint(model_name=None, pytorch_dump_folder_path=None):
"""
Copy/paste/tweak model's weights to our Jukebox structure.
"""
for file in MODEL_MAPPING[model_name]:
if not os.path.isfile(f"{pytorch_dump_folder_path}/{file.split('/')[-1]}"):
r = requests.get(f"{PREFIX}{file}", allow_redirects=True)
os.makedirs(f"{pytorch_dump_folder_path}/", exist_ok=True)
open(f"{pytorch_dump_folder_path}/{file.split('/')[-1]}", "wb").write(r.content)
model_to_convert = MODEL_MAPPING[model_name.split("/")[-1]]
config = JukeboxConfig.from_pretrained(model_name)
model = JukeboxModel(config)
weight_dict = []
mapping = {}
for i, dict_name in enumerate(model_to_convert):
old_dic = torch.load(f"{pytorch_dump_folder_path}/{dict_name.split('/')[-1]}", weights_only=True)["model"]
new_dic = {}
for k in old_dic:
if k.endswith(".b"):
new_dic[k.replace("b", "bias")] = old_dic[k]
elif k.endswith(".w"):
new_dic[k.replace("w", "weight")] = old_dic[k]
elif "level_2" not in dict_name and "cond.model." in k:
new_dic[k.replace(".blocks.", ".model.")] = old_dic[k]
else:
new_dic[k] = old_dic[k]
key_prefix = "vqvae" if i == 0 else f"priors.{3 - i}"
new_dic = fix_jukebox_keys(new_dic, model.state_dict(), key_prefix, mapping)
weight_dict.append(new_dic)
vqvae_state_dict = weight_dict.pop(0)
model.vqvae.load_state_dict(vqvae_state_dict)
for i in range(len(weight_dict)):
model.priors[i].load_state_dict(weight_dict[2 - i])
Path(pytorch_dump_folder_path).mkdir(exist_ok=True)
with open(f"{pytorch_dump_folder_path}/mapping.json", "w") as txtfile:
json.dump(mapping, txtfile)
print(f"Saving model {model_name} to {pytorch_dump_folder_path}")
model.save_pretrained(pytorch_dump_folder_path)
return weight_dict
if __name__ == "__main__":
parser = argparse.ArgumentParser()
# Required parameters
parser.add_argument(
"--model_name",
default="jukebox-5b-lyrics",
type=str,
help="Name of the model you'd like to convert.",
)
parser.add_argument(
"--pytorch_dump_folder_path",
default="jukebox-5b-lyrics-converted",
type=str,
help="Path to the output PyTorch model directory.",
)
args = parser.parse_args()
convert_openai_checkpoint(args.model_name, args.pytorch_dump_folder_path)

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,390 @@
# coding=utf-8
# Copyright 2022 The Open AI Team Authors and The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tokenization classes for OpenAI Jukebox."""
import json
import os
import re
import unicodedata
from json.encoder import INFINITY
from typing import Any, Optional, Union
import numpy as np
import regex
from ....tokenization_utils import AddedToken, PreTrainedTokenizer
from ....tokenization_utils_base import BatchEncoding
from ....utils import TensorType, is_torch_available, logging
from ....utils.generic import is_numpy_array
logger = logging.get_logger(__name__)
VOCAB_FILES_NAMES = {
"artists_file": "artists.json",
"lyrics_file": "lyrics.json",
"genres_file": "genres.json",
}
class JukeboxTokenizer(PreTrainedTokenizer):
"""
Constructs a Jukebox tokenizer. Jukebox can be conditioned on 3 different inputs :
- Artists, unique ids are associated to each artist from the provided dictionary.
- Genres, unique ids are associated to each genre from the provided dictionary.
- Lyrics, character based tokenization. Must be initialized with the list of characters that are inside the
vocabulary.
This tokenizer does not require training. It should be able to process a different number of inputs:
as the conditioning of the model can be done on the three different queries. If None is provided, defaults values will be used.:
Depending on the number of genres on which the model should be conditioned (`n_genres`).
```python
>>> from transformers import JukeboxTokenizer
>>> tokenizer = JukeboxTokenizer.from_pretrained("openai/jukebox-1b-lyrics")
>>> tokenizer("Alan Jackson", "Country Rock", "old town road")["input_ids"]
[tensor([[ 0, 0, 0, 6785, 546, 41, 38, 30, 76, 46, 41, 49,
40, 76, 44, 41, 27, 30]]), tensor([[ 0, 0, 0, 145, 0]]), tensor([[ 0, 0, 0, 145, 0]])]
```
You can get around that behavior by passing `add_prefix_space=True` when instantiating this tokenizer or when you
call it on some text, but since the model was not pretrained this way, it might yield a decrease in performance.
<Tip>
If nothing is provided, the genres and the artist will either be selected randomly or set to None
</Tip>
This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to:
this superclass for more information regarding those methods.
However the code does not allow that and only supports composing from various genres.
Args:
artists_file (`str`):
Path to the vocabulary file which contains a mapping between artists and ids. The default file supports
both "v2" and "v3"
genres_file (`str`):
Path to the vocabulary file which contain a mapping between genres and ids.
lyrics_file (`str`):
Path to the vocabulary file which contains the accepted characters for the lyrics tokenization.
version (`list[str]`, `optional`, default to `["v3", "v2", "v2"]`) :
List of the tokenizer versions. The `5b-lyrics`'s top level prior model was trained using `v3` instead of
`v2`.
n_genres (`int`, `optional`, defaults to 1):
Maximum number of genres to use for composition.
max_n_lyric_tokens (`int`, `optional`, defaults to 512):
Maximum number of lyric tokens to keep.
unk_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
"""
vocab_files_names = VOCAB_FILES_NAMES
model_input_names = ["input_ids", "attention_mask"]
def __init__(
self,
artists_file,
genres_file,
lyrics_file,
version=["v3", "v2", "v2"],
max_n_lyric_tokens=512,
n_genres=5,
unk_token="<|endoftext|>",
**kwargs,
):
unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token
self.version = version
self.max_n_lyric_tokens = max_n_lyric_tokens
self.n_genres = n_genres
self._added_tokens_decoder = {0: unk_token}
with open(artists_file, encoding="utf-8") as vocab_handle:
self.artists_encoder = json.load(vocab_handle)
with open(genres_file, encoding="utf-8") as vocab_handle:
self.genres_encoder = json.load(vocab_handle)
with open(lyrics_file, encoding="utf-8") as vocab_handle:
self.lyrics_encoder = json.load(vocab_handle)
oov = r"[^A-Za-z0-9.,:;!?\-'\"()\[\] \t\n]+"
# In v2, we had a n_vocab=80 and in v3 we missed + and so n_vocab=79 of characters.
if len(self.lyrics_encoder) == 79:
oov = oov.replace(r"\-'", r"\-+'")
self.out_of_vocab = regex.compile(oov)
self.artists_decoder = {v: k for k, v in self.artists_encoder.items()}
self.genres_decoder = {v: k for k, v in self.genres_encoder.items()}
self.lyrics_decoder = {v: k for k, v in self.lyrics_encoder.items()}
super().__init__(
unk_token=unk_token,
n_genres=n_genres,
version=version,
max_n_lyric_tokens=max_n_lyric_tokens,
**kwargs,
)
@property
def vocab_size(self):
return len(self.artists_encoder) + len(self.genres_encoder) + len(self.lyrics_encoder)
def get_vocab(self):
return {
"artists_encoder": self.artists_encoder,
"genres_encoder": self.genres_encoder,
"lyrics_encoder": self.lyrics_encoder,
}
def _convert_token_to_id(self, list_artists, list_genres, list_lyrics):
"""Converts the artist, genre and lyrics tokens to their index using the vocabulary.
The total_length, offset and duration have to be provided in order to select relevant lyrics and add padding to
the lyrics token sequence.
"""
artists_id = [self.artists_encoder.get(artist, 0) for artist in list_artists]
for genres in range(len(list_genres)):
list_genres[genres] = [self.genres_encoder.get(genre, 0) for genre in list_genres[genres]]
list_genres[genres] = list_genres[genres] + [-1] * (self.n_genres - len(list_genres[genres]))
lyric_ids = [[self.lyrics_encoder.get(character, 0) for character in list_lyrics[0]], [], []]
return artists_id, list_genres, lyric_ids
def _tokenize(self, lyrics):
"""
Converts a string into a sequence of tokens (string), using the tokenizer. Split in words for word-based
vocabulary or sub-words for sub-word-based vocabularies (BPE/SentencePieces/WordPieces).
Do NOT take care of added tokens. Only the lyrics are split into character for the character-based vocabulary.
"""
# only lyrics are not tokenized, but character based is easily handled
return list(lyrics)
def tokenize(self, artist, genre, lyrics, **kwargs):
"""
Converts three strings in a 3 sequence of tokens using the tokenizer
"""
artist, genre, lyrics = self.prepare_for_tokenization(artist, genre, lyrics)
lyrics = self._tokenize(lyrics)
return artist, genre, lyrics
def prepare_for_tokenization(
self, artists: str, genres: str, lyrics: str, is_split_into_words: bool = False
) -> tuple[str, str, str, dict[str, Any]]:
"""
Performs any necessary transformations before tokenization.
Args:
artist (`str`):
The artist name to prepare. This will mostly lower the string
genres (`str`):
The genre name to prepare. This will mostly lower the string.
lyrics (`str`):
The lyrics to prepare.
is_split_into_words (`bool`, *optional*, defaults to `False`):
Whether or not the input is already pre-tokenized (e.g., split into words). If set to `True`, the
tokenizer assumes the input is already split into words (for instance, by splitting it on whitespace)
which it will tokenize. This is useful for NER or token classification.
"""
for idx in range(len(self.version)):
if self.version[idx] == "v3":
artists[idx] = artists[idx].lower()
genres[idx] = [genres[idx].lower()]
else:
artists[idx] = self._normalize(artists[idx]) + ".v2"
genres[idx] = [
self._normalize(genre) + ".v2" for genre in genres[idx].split("_")
] # split is for the full dictionary with combined genres
if self.version[0] == "v2":
self.out_of_vocab = regex.compile(r"[^A-Za-z0-9.,:;!?\-'\"()\[\] \t\n]+")
vocab = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789.,:;!?-+'\"()[] \t\n"
self.vocab = {vocab[index]: index + 1 for index in range(len(vocab))}
self.vocab["<unk>"] = 0
self.n_vocab = len(vocab) + 1
self.lyrics_encoder = self.vocab
self.lyrics_decoder = {v: k for k, v in self.vocab.items()}
self.lyrics_decoder[0] = ""
else:
self.out_of_vocab = regex.compile(r"[^A-Za-z0-9.,:;!?\-+'\"()\[\] \t\n]+")
lyrics = self._run_strip_accents(lyrics)
lyrics = lyrics.replace("\\", "\n")
lyrics = self.out_of_vocab.sub("", lyrics), [], []
return artists, genres, lyrics
def _run_strip_accents(self, text):
"""Strips accents from a piece of text."""
text = unicodedata.normalize("NFD", text)
output = []
for char in text:
cat = unicodedata.category(char)
if cat == "Mn":
continue
output.append(char)
return "".join(output)
def _normalize(self, text: str) -> str:
"""
Normalizes the input text. This process is for the genres and the artist
Args:
text (`str`):
Artist or Genre string to normalize
"""
accepted = (
[chr(i) for i in range(ord("a"), ord("z") + 1)]
+ [chr(i) for i in range(ord("A"), ord("Z") + 1)]
+ [chr(i) for i in range(ord("0"), ord("9") + 1)]
+ ["."]
)
accepted = frozenset(accepted)
pattern = re.compile(r"_+")
text = "".join([c if c in accepted else "_" for c in text.lower()])
text = pattern.sub("_", text).strip("_")
return text
def convert_lyric_tokens_to_string(self, lyrics: list[str]) -> str:
return " ".join(lyrics)
def convert_to_tensors(
self, inputs, tensor_type: Optional[Union[str, TensorType]] = None, prepend_batch_axis: bool = False
):
"""
Convert the inner content to tensors.
Args:
tensor_type (`str` or [`~utils.TensorType`], *optional*):
The type of tensors to use. If `str`, should be one of the values of the enum [`~utils.TensorType`]. If
unset, no modification is done.
prepend_batch_axis (`int`, *optional*, defaults to `False`):
Whether or not to add the batch dimension during the conversion.
"""
# Convert to TensorType
if not isinstance(tensor_type, TensorType):
tensor_type = TensorType(tensor_type)
if tensor_type == TensorType.PYTORCH:
if not is_torch_available():
raise ImportError("Unable to convert output to PyTorch tensors format, PyTorch is not installed.")
import torch
as_tensor = torch.tensor
is_tensor = torch.is_tensor
else:
as_tensor = np.asarray
is_tensor = is_numpy_array
# Do the tensor conversion in batch
try:
if prepend_batch_axis:
inputs = [inputs]
if not is_tensor(inputs):
inputs = as_tensor(inputs)
except: # noqa E722
raise ValueError(
"Unable to create tensor, you should probably activate truncation and/or padding "
"with 'padding=True' 'truncation=True' to have batched tensors with the same length."
)
return inputs
def __call__(self, artist, genres, lyrics="", return_tensors="pt") -> BatchEncoding:
"""Convert the raw string to a list of token ids
Args:
artist (`str`):
Name of the artist.
genres (`str`):
List of genres that will be mixed to condition the audio
lyrics (`str`, *optional*, defaults to `""`):
Lyrics used to condition the generation
"""
input_ids = [0, 0, 0]
artist = [artist] * len(self.version)
genres = [genres] * len(self.version)
artists_tokens, genres_tokens, lyrics_tokens = self.tokenize(artist, genres, lyrics)
artists_id, genres_ids, full_tokens = self._convert_token_to_id(artists_tokens, genres_tokens, lyrics_tokens)
attention_masks = [-INFINITY] * len(full_tokens[-1])
input_ids = [
self.convert_to_tensors(
[input_ids + [artists_id[i]] + genres_ids[i] + full_tokens[i]], tensor_type=return_tensors
)
for i in range(len(self.version))
]
return BatchEncoding({"input_ids": input_ids, "attention_masks": attention_masks})
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
"""
Saves the tokenizer's vocabulary dictionary to the provided save_directory.
Args:
save_directory (`str`):
A path to the directory where to saved. It will be created if it doesn't exist.
filename_prefix (`Optional[str]`, *optional*):
A prefix to add to the names of the files saved by the tokenizer.
"""
if not os.path.isdir(save_directory):
logger.error(f"Vocabulary path ({save_directory}) should be a directory")
return
artists_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["artists_file"]
)
with open(artists_file, "w", encoding="utf-8") as f:
f.write(json.dumps(self.artists_encoder, ensure_ascii=False))
genres_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["genres_file"]
)
with open(genres_file, "w", encoding="utf-8") as f:
f.write(json.dumps(self.genres_encoder, ensure_ascii=False))
lyrics_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["lyrics_file"]
)
with open(lyrics_file, "w", encoding="utf-8") as f:
f.write(json.dumps(self.lyrics_encoder, ensure_ascii=False))
return (artists_file, genres_file, lyrics_file)
def _convert_id_to_token(self, artists_index, genres_index, lyric_index):
"""
Converts an index (integer) in a token (str) using the vocab.
Args:
artists_index (`int`):
Index of the artist in its corresponding dictionary.
genres_index (`Union[list[int], int]`):
Index of the genre in its corresponding dictionary.
lyric_index (`list[int]`):
List of character indices, which each correspond to a character.
"""
artist = self.artists_decoder.get(artists_index)
genres = [self.genres_decoder.get(genre) for genre in genres_index]
lyrics = [self.lyrics_decoder.get(character) for character in lyric_index]
return artist, genres, lyrics
__all__ = ["JukeboxTokenizer"]

View File

@ -0,0 +1,29 @@
# Copyright 2022 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import TYPE_CHECKING
from ....utils import _LazyModule
from ....utils.import_utils import define_import_structure
if TYPE_CHECKING:
from .configuration_mctct import *
from .feature_extraction_mctct import *
from .modeling_mctct import *
from .processing_mctct import *
else:
import sys
_file = globals()["__file__"]
sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)

View File

@ -0,0 +1,184 @@
# coding=utf-8
# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""M-CTC-T model configuration"""
from ....configuration_utils import PreTrainedConfig
from ....utils import logging
logger = logging.get_logger(__name__)
class MCTCTConfig(PreTrainedConfig):
r"""
This is the configuration class to store the configuration of a [`MCTCTModel`]. It is used to instantiate an
M-CTC-T model according to the specified arguments, defining the model architecture. Instantiating a configuration
with the defaults will yield a similar configuration to that of the M-CTC-T
[speechbrain/m-ctc-t-large](https://huggingface.co/speechbrain/m-ctc-t-large) architecture.
Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PreTrainedConfig`] for more information.
Args:
vocab_size (`int`, *optional*, defaults to 8065):
Vocabulary size of the M-CTC-T model. Defines the number of different tokens that can be represented by the
`inputs_ids` passed when calling [`MCTCTModel`].
hidden_size (`int`, *optional*, defaults to 1536):
Dimension of the encoder layers and the pooler layer.
num_hidden_layers (`int`, *optional*, defaults to 36):
Number of hidden layers in the Transformer encoder.
intermediate_size (`int`, *optional*, defaults to 6144):
Dimension of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
num_attention_heads (`int`, *optional*, defaults to 4):
Number of attention heads for each attention layer in the Transformer encoder.
attention_head_dim (`int`, *optional*, defaults to 384):
Dimensions of each attention head for each attention layer in the Transformer encoder.
max_position_embeddings (`int`, *optional*, defaults to 920):
The maximum sequence length that this model might ever be used with (after log-mel spectrogram extraction).
layer_norm_eps (`float`, *optional*, defaults to 1e-05):
The epsilon used by the layer normalization layers.
layerdrop (`float`, *optional*, defaults to 0.3):
The probability of dropping an encoder layer during training. The default 0.3 value is used in the original
implementation.
hidden_act (`str` or `function`, *optional*, defaults to `"relu"`):
The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
`"relu"`, `"selu"` and `"gelu_new"` are supported.
initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
hidden_dropout_prob (`float`, *optional*, defaults to 0.3):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
attention_probs_dropout_prob (`float`, *optional*, defaults to 0.3):
The dropout ratio for the attention probabilities.
pad_token_id (`int`, *optional*, defaults to 1):
The tokenizer index of the pad token.
bos_token_id (`int`, *optional*, defaults to 0):
The tokenizer index of the bos token.
eos_token_id (`int`, *optional*, defaults to 2):
The tokenizer index of the eos token.
conv_glu_dim (`int`, *optional*, defaults to 1):
The dimension of the output of the `Conv1dSubsampler` layer in which GLU is applied on. Though the original
Flashlight code uses the value of 2, here it's adapted to 1 due to transposition differences.
conv_dropout (`int`, *optional*, defaults to 0.3):
The probability of randomly dropping the `Conv1dSubsampler` layer during training.
num_conv_layers (`int`, *optional*, defaults to 1):
Number of convolution layers before applying transformer encoder layers.
conv_kernel (`Sequence[int]`, *optional*, defaults to `(7,)`):
The kernel size of the 1D convolution applied before transformer layers. `len(conv_kernel)` must be equal
to `num_conv_layers`.
conv_stride (`Sequence[int]`, *optional*, defaults to `(3,)`):
The stride length of the 1D convolution applied before transformer layers. `len(conv_stride)` must be equal
to `num_conv_layers`.
input_feat_per_channel (`int`, *optional*, defaults to 80):
Feature dimensions of the channels of the input to the Conv1D layer.
input_channels (`int`, *optional*, defaults to 1):
Number of input channels of the input to the Conv1D layer.
conv_channels (`list[int]`, *optional*):
Channel sizes of intermediate Conv1D layers.
ctc_loss_reduction (`str`, *optional*, defaults to `"sum"`):
Specifies the reduction to apply to the output of `torch.nn.CTCLoss`. Only relevant when training an
instance of [`MCTCTForCTC`].
ctc_zero_infinity (`bool`, *optional*, defaults to `False`):
Whether to zero infinite losses and the associated gradients of `torch.nn.CTCLoss`. Infinite losses mainly
occur when the inputs are too short to be aligned to the targets. Only relevant when training an instance
of [`MCTCTForCTC`].
Example:
```python
>>> from transformers import MCTCTConfig, MCTCTModel
>>> # Initializing a M-CTC-T mctct-large style configuration
>>> configuration = MCTCTConfig()
>>> # Initializing a model (with random weights) from the mctct-large style configuration
>>> model = MCTCTModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```"""
model_type = "mctct"
def __init__(
self,
vocab_size=8065,
hidden_size=1536,
num_hidden_layers=36,
intermediate_size=6144,
num_attention_heads=4,
attention_head_dim=384,
max_position_embeddings=920,
layer_norm_eps=1e-5,
layerdrop=0.3,
hidden_act="relu",
initializer_range=0.02,
hidden_dropout_prob=0.3,
attention_probs_dropout_prob=0.3,
pad_token_id=1,
bos_token_id=0,
eos_token_id=2,
conv_glu_dim=1,
conv_dropout=0.3,
num_conv_layers=1,
conv_kernel=(7,),
conv_stride=(3,),
input_feat_per_channel=80,
input_channels=1,
conv_channels=None,
ctc_loss_reduction="sum",
ctc_zero_infinity=False,
**kwargs,
):
super().__init__(**kwargs, pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id)
self.vocab_size = vocab_size
self.hidden_size = hidden_size
self.num_hidden_layers = num_hidden_layers
self.intermediate_size = intermediate_size
self.num_attention_heads = num_attention_heads
self.attention_head_dim = attention_head_dim
self.max_position_embeddings = max_position_embeddings
self.layer_norm_eps = layer_norm_eps
self.layerdrop = layerdrop
self.hidden_act = hidden_act
self.initializer_range = initializer_range
self.hidden_dropout_prob = hidden_dropout_prob
self.attention_probs_dropout_prob = attention_probs_dropout_prob
self.pad_token_id = pad_token_id
self.bos_token_id = bos_token_id
self.eos_token_id = eos_token_id
self.conv_glu_dim = conv_glu_dim
self.conv_dropout = conv_dropout
self.num_conv_layers = num_conv_layers
self.input_feat_per_channel = input_feat_per_channel
self.input_channels = input_channels
self.conv_channels = conv_channels
self.ctc_loss_reduction = ctc_loss_reduction
self.ctc_zero_infinity = ctc_zero_infinity
# prevents config testing fail with exporting to json
self.conv_kernel = list(conv_kernel)
self.conv_stride = list(conv_stride)
if len(self.conv_kernel) != self.num_conv_layers:
raise ValueError(
"Configuration for convolutional module is incorrect. "
"It is required that `len(config.conv_kernel)` == `config.num_conv_layers` "
f"but is `len(config.conv_kernel) = {len(self.conv_kernel)}`, "
f"`config.num_conv_layers = {self.num_conv_layers}`."
)
__all__ = ["MCTCTConfig"]

View File

@ -0,0 +1,290 @@
# coding=utf-8
# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Feature extractor class for M-CTC-T
"""
from typing import Optional, Union
import numpy as np
from ....audio_utils import mel_filter_bank, optimal_fft_length, spectrogram, window_function
from ....feature_extraction_sequence_utils import SequenceFeatureExtractor
from ....feature_extraction_utils import BatchFeature
from ....file_utils import PaddingStrategy, TensorType
from ....utils import logging
logger = logging.get_logger(__name__)
class MCTCTFeatureExtractor(SequenceFeatureExtractor):
r"""
Constructs a M-CTC-T feature extractor.
This feature extractor inherits from [`~feature_extraction_sequence_utils.SequenceFeatureExtractor`] which contains
most of the main methods. Users should refer to this superclass for more information regarding those methods. This
code has been adapted from Flashlight's C++ code. For more information about the implementation, one can refer to
this [notebook](https://colab.research.google.com/drive/1GLtINkkhzms-IsdcGy_-tVCkv0qNF-Gt#scrollTo=pMCRGMmUC_an)
that takes the user step-by-step in the implementation.
Args:
feature_size (`int`, defaults to 80):
The feature dimension of the extracted features. This is the number of mel_frequency
sampling_rate (`int`, defaults to 16000):
The sampling rate at which the audio files should be digitalized expressed in hertz (Hz).
padding_value (`float`, defaults to 0.0):
The value that is used to fill the padding values.
hop_length (`int`, defaults to 10):
Number of audio samples between windows. Otherwise referred to as "shift" in many papers.
win_length (`int`, defaults to 25):
Number of ms per window
win_function (`str`, defaults to `"hamming_window"`):
Name for the window function used for windowing, must be accessible via `torch.{win_function}`
frame_signal_scale (`float`, defaults to 32768.0):
Constant multiplied in creating the frames before applying DFT.
preemphasis_coeff (`float`, defaults to 0.97):
Constant multiplied in applying Pre-emphasis before DFT.
mel_floor (`float` defaults to 1.0):
Minimum value of mel frequency banks.
normalize_means (`bool`, *optional*, defaults to `True`):
Whether or not to zero-mean normalize the extracted features.
normalize_vars (`bool`, *optional*, defaults to `True`):
Whether or not to unit-variance normalize the extracted features.
"""
model_input_names = ["input_features", "attention_mask"]
def __init__(
self,
feature_size=80,
sampling_rate=16000,
padding_value=0.0,
hop_length=10,
win_length=25,
win_function="hamming_window",
frame_signal_scale=32768.0,
preemphasis_coeff=0.97,
mel_floor=1.0,
normalize_means=True,
normalize_vars=True,
return_attention_mask=False,
**kwargs,
):
super().__init__(feature_size=feature_size, sampling_rate=sampling_rate, padding_value=padding_value, **kwargs)
self.feature_size = feature_size
self.sampling_rate = sampling_rate
self.padding_value = padding_value
self.hop_length = hop_length
self.win_length = win_length
self.frame_signal_scale = frame_signal_scale
self.preemphasis_coeff = preemphasis_coeff
self.mel_floor = mel_floor
self.normalize_means = normalize_means
self.normalize_vars = normalize_vars
self.win_function = win_function
self.return_attention_mask = return_attention_mask
self.sample_size = win_length * sampling_rate // 1000
self.sample_stride = hop_length * sampling_rate // 1000
self.n_fft = optimal_fft_length(self.sample_size)
self.n_freqs = (self.n_fft // 2) + 1
def _extract_mfsc_features(self, one_waveform: np.ndarray) -> np.ndarray:
"""
Extracts MFSC Features for one waveform vector (unbatched). Adapted from Flashlight's C++ MFSC code.
"""
if self.win_function == "hamming_window":
window = window_function(window_length=self.sample_size, name=self.win_function, periodic=False)
else:
window = window_function(window_length=self.sample_size, name=self.win_function)
fbanks = mel_filter_bank(
num_frequency_bins=self.n_freqs,
num_mel_filters=self.feature_size,
min_frequency=0.0,
max_frequency=self.sampling_rate / 2.0,
sampling_rate=self.sampling_rate,
)
msfc_features = spectrogram(
one_waveform * self.frame_signal_scale,
window=window,
frame_length=self.sample_size,
hop_length=self.sample_stride,
fft_length=self.n_fft,
center=False,
preemphasis=self.preemphasis_coeff,
mel_filters=fbanks,
mel_floor=self.mel_floor,
log_mel="log",
)
return msfc_features.T
def _normalize_one(self, x, input_length, padding_value):
# make sure we normalize float32 arrays
if self.normalize_means:
mean = x[:input_length].mean(axis=0)
x = np.subtract(x, mean)
if self.normalize_vars:
std = x[:input_length].std(axis=0)
x = np.divide(x, std)
if input_length < x.shape[0]:
x[input_length:] = padding_value
# make sure array is in float32
x = x.astype(np.float32)
return x
def normalize(
self, input_features: list[np.ndarray], attention_mask: Optional[np.ndarray] = None
) -> list[np.ndarray]:
lengths = attention_mask.sum(-1) if attention_mask is not None else [x.shape[0] for x in input_features]
return [self._normalize_one(x, n, self.padding_value) for x, n in zip(input_features, lengths)]
def __call__(
self,
raw_speech: Union[np.ndarray, list[float], list[np.ndarray], list[list[float]]],
padding: Union[bool, str, PaddingStrategy] = False,
max_length: Optional[int] = None,
truncation: bool = False,
pad_to_multiple_of: Optional[int] = None,
return_attention_mask: Optional[bool] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
sampling_rate: Optional[int] = None,
**kwargs,
) -> BatchFeature:
"""
Main method to featurize and prepare for the model one or several sequence(s). sequences. It returns the
log-mel spectrogram of the input audio, as implemented in the original Flashlight MFSC feature extraction code.
Args:
raw_speech (`torch.Tensor`, `np.ndarray`, `list[float]`, `list[torch.Tensor]`, `list[np.ndarray]`, `list[list[float]]`):
The sequence or batch of sequences to be padded. Each sequence can be a tensor, a numpy array, a list
of float values, a list of tensors, a list of numpy arrays or a list of list of float values. Must be
mono channel audio, not stereo, i.e. single float per timestep.
padding (`bool`, `str` or [`~file_utils.PaddingStrategy`], *optional*, defaults to `False`):
Select a strategy to pad the returned sequences (according to the model's padding side and padding
index) among:
- `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
sequence if provided).
- `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
acceptable input length for the model if that argument is not provided.
- `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different
lengths).
max_length (`int`, *optional*):
Maximum length of the returned list and optionally padding length (see above).
truncation (`bool`):
Activates truncation to cut input sequences longer than *max_length* to *max_length*.
pad_to_multiple_of (`int`, *optional*):
If set will pad the sequence to a multiple of the provided value.
This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability
`>= 7.5` (Volta), or on TPUs which benefit from having sequence lengths be a multiple of 128.
return_attention_mask (`bool`, *optional*):
Whether to return the attention mask. If left to the default, will return the attention mask according
to the specific feature_extractor's default.
[What are attention masks?](../glossary#attention-mask)
return_tensors (`str` or [`~file_utils.TensorType`], *optional*):
If set, will return tensors instead of list of python integers. Acceptable values are:
- `'pt'`: Return PyTorch `torch.Tensor` objects.
- `'np'`: Return Numpy `np.ndarray` objects.
sampling_rate (`int`, *optional*):
The sampling rate at which the `raw_speech` input was sampled. It is strongly recommended to pass
`sampling_rate` at the forward call to prevent silent errors.
padding_value (`float`, defaults to 0.0):
"""
if sampling_rate is not None:
if sampling_rate != self.sampling_rate:
raise ValueError(
f"The model corresponding to this feature extractor: {self} was trained using a sampling rate of"
f" {self.sampling_rate}. Please make sure that the provided `raw_speech` input was sampled with"
f" {self.sampling_rate} and not {sampling_rate}."
)
else:
logger.warning(
"It is strongly recommended to pass the ``sampling_rate`` argument to this function. "
"Failing to do so can result in silent errors that might be hard to debug."
)
is_batched_numpy = isinstance(raw_speech, np.ndarray) and len(raw_speech.shape) > 1
if is_batched_numpy and len(raw_speech.shape) > 2:
raise ValueError(f"Only mono-channel audio is supported for input to {self}")
is_batched = is_batched_numpy or (
isinstance(raw_speech, (list, tuple)) and (isinstance(raw_speech[0], (np.ndarray, tuple, list)))
)
if is_batched:
raw_speech = [np.asarray(speech, dtype=np.float32) for speech in raw_speech]
elif not is_batched and not isinstance(raw_speech, np.ndarray):
raw_speech = np.asarray(raw_speech, dtype=np.float32)
elif isinstance(raw_speech, np.ndarray) and raw_speech.dtype is np.dtype(np.float64):
raw_speech = raw_speech.astype(np.float32)
# always return batch
if not is_batched:
raw_speech = [raw_speech]
# extract fbank features
features = [self._extract_mfsc_features(one_waveform) for one_waveform in raw_speech]
# convert into correct format for padding
encoded_inputs = BatchFeature({"input_features": features})
padded_inputs = self.pad(
encoded_inputs,
padding=padding,
max_length=max_length,
truncation=truncation,
pad_to_multiple_of=pad_to_multiple_of,
return_attention_mask=True,
**kwargs,
)
# make sure list is in array format
input_features = padded_inputs.get("input_features")
if isinstance(input_features[0], list):
padded_inputs["input_features"] = [np.asarray(feature, dtype=np.float32) for feature in input_features]
attention_mask = padded_inputs.get("attention_mask")
if attention_mask is not None:
padded_inputs["attention_mask"] = [np.asarray(array, dtype=np.int32) for array in attention_mask]
if self.normalize_means or self.normalize_vars:
attention_mask = (
np.array(attention_mask, dtype=np.int32)
if self._get_padding_strategies(padding, max_length=max_length) is not PaddingStrategy.DO_NOT_PAD
and padding
else None
)
padded_inputs["input_features"] = self.normalize(
padded_inputs["input_features"], attention_mask=attention_mask
)
if return_tensors is not None:
padded_inputs = padded_inputs.convert_to_tensors(return_tensors)
return padded_inputs
__all__ = ["MCTCTFeatureExtractor"]

View File

@ -0,0 +1,714 @@
# coding=utf-8
# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""PyTorch M-CTC-T model."""
import math
from typing import Optional, Union
import torch
from torch import nn
from .... import initialization as init
from ....activations import ACT2FN
from ....file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward
from ....integrations.deepspeed import is_deepspeed_zero3_enabled
from ....integrations.fsdp import is_fsdp_managed_module
from ....modeling_attn_mask_utils import _prepare_4d_attention_mask
from ....modeling_layers import GradientCheckpointingLayer
from ....modeling_outputs import BaseModelOutput, CausalLMOutput
from ....modeling_utils import PreTrainedModel
from ....pytorch_utils import apply_chunking_to_forward
from ....utils import logging
from .configuration_mctct import MCTCTConfig
logger = logging.get_logger(__name__)
_HIDDEN_STATES_START_POSITION = 1
_CONFIG_FOR_DOC = "MCTCTConfig"
# Base docstring
_CHECKPOINT_FOR_DOC = "speechbrain/m-ctc-t-large"
_EXPECTED_OUTPUT_SHAPE = [1, 195, 1536]
# CTC docstring
_CTC_EXPECTED_OUTPUT = '"Mr. Quilter is the apostle of the middle classes, and we\'re glad to welcome his gospel."'
_CTC_EXPECTED_LOSS = 1885.65
class MCTCTConv1dSubsampler(nn.Module):
"""
Convolutional subsampler: a stack of 1D convolution (along temporal dimension) followed by non-linear activation
via gated linear units (https://huggingface.co/papers/1911.08460)
"""
def __init__(self, config):
super().__init__()
self.config = config
self.glu_dim = config.conv_glu_dim
self.dropout = nn.Dropout(config.conv_dropout)
self.num_layers = config.num_conv_layers
self.in_channels = config.input_feat_per_channel * config.input_channels
if self.num_layers > 1:
if config.conv_channels is None:
raise ValueError(
"Need to specify `conv_channels` configuration in `MCTCTConfig` to use multiple convolution"
" layers."
)
self.mid_channels = config.conv_channels
else:
self.mid_channels = None
self.out_channels = config.hidden_size * 2 # considering GLU halving
self.kernel_size = config.conv_kernel
self.stride = config.conv_stride
# NOTE: MCTCT by construction only uses one convolution kernel. I've made this flexible to allow for
# multiple layers of convolutions, but not sure if this model definition should just restrict it
# to one layer. This becomes especially relevant when considering the padding like line 1 of forward().
self.conv_layers = nn.ModuleList(
nn.Conv1d(
self.in_channels if i == 0 else self.mid_channels[i],
self.mid_channels[i] if i < self.num_layers - 1 else self.out_channels,
kernel_size=k,
stride=self.stride[i],
padding="valid",
)
for i, k in enumerate(self.kernel_size)
)
def forward(self, input_features):
# NOTE: in reference to the NOTE in __init__, right now it just calculates padding as if
# there will be just one conv layer.
padding = sum(size // 2 for size in self.kernel_size) # (7, 7) -> (3, 3)
input_features = torch.nn.functional.pad(input_features, (0, 0, padding, padding), "constant", 0)
hidden_states = input_features.transpose(1, 2).contiguous() # -> Batch x Frame x Time
for conv in self.conv_layers:
hidden_states = conv(hidden_states)
hidden_states = nn.functional.glu(hidden_states, dim=self.glu_dim)
hidden_states = self.dropout(hidden_states)
hidden_states = hidden_states.transpose(1, 2).contiguous() # -> Batch x Time x Frame
return hidden_states
class MCTCTEmbeddings(nn.Module):
"""Construct the embeddings from word, position and token_type embeddings."""
def __init__(self, config):
super().__init__()
self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)
# self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.LayerNorm = MCTCTLayerNorm()
self.dropout = nn.Dropout(config.hidden_dropout_prob)
# position_ids (1, len position emb) is contiguous in memory and exported when serialized
self.register_buffer(
"position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
)
self.register_buffer(
"token_type_ids",
torch.zeros(self.position_ids.size(), dtype=torch.long, device=self.position_ids.device),
persistent=False,
)
def forward(
self, input_features=None, token_type_ids=None, position_ids=None, inputs_embeds=None, past_key_values_length=0
):
input_shape = input_features.size() if input_features is not None else inputs_embeds.size()[:-1]
seq_length = input_shape[1]
if position_ids is None:
position_ids = self.position_ids[:, past_key_values_length : seq_length + past_key_values_length]
# Setting the token_type_ids to the registered buffer in constructor where it is all zeros, which usually occurs
# when its auto-generated, registered buffer helps users when tracing the model without passing token_type_ids, solves
# issue #5664
if token_type_ids is None:
if hasattr(self, "token_type_ids"):
buffered_token_type_ids = self.token_type_ids[:, :seq_length]
buffered_token_type_ids_expanded = buffered_token_type_ids.expand(input_shape[0], seq_length)
token_type_ids = buffered_token_type_ids_expanded
else:
token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device)
if inputs_embeds is None:
inputs_embeds = self.word_embeddings(input_features)
token_type_embeddings = self.token_type_embeddings(token_type_ids)
embeddings = inputs_embeds + token_type_embeddings
embeddings = self.LayerNorm(embeddings)
embeddings = self.dropout(embeddings)
return embeddings
class MCTCTSelfAttention(nn.Module):
def __init__(self, config):
super().__init__()
if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
raise ValueError(
f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
f"heads ({config.num_attention_heads})"
)
self.num_attention_heads = config.num_attention_heads
self.attention_head_size = config.attention_head_dim
self.all_head_size = self.num_attention_heads * self.attention_head_size
self.query = nn.Linear(config.hidden_size, self.all_head_size, bias=False)
self.key = nn.Linear(config.hidden_size, self.all_head_size, bias=False)
self.value = nn.Linear(config.hidden_size, self.all_head_size, bias=False)
self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
self.max_position_embeddings = config.max_position_embeddings
self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size)
self.is_decoder = config.is_decoder
def transpose_for_scores(self, x):
new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
x = x.view(*new_x_shape)
return x.permute(0, 2, 1, 3)
def reshape_fortran(self, x, shape):
if len(x.shape) > 0:
x = x.permute(*reversed(range(len(x.shape))))
return x.reshape(*reversed(shape)).permute(*reversed(range(len(shape))))
def relative_position_embedding_rotate(self, scores):
# NOTE: should re-evaluate whether this re-implementation was truly necessary
# or the reason why my complete re-haul worked was due to some other part
# of the code. Adding this and the reshape fortrain code seems very undesirable.
scores = scores.permute(0, 2, 3, 1) # e.g. [10, 1839, 14, 4]
batch, hidden_state, seq_len, heads = scores.shape
# e.g. [10, 1853, 14, 4]
scores = torch.cat((scores, torch.zeros((batch, seq_len, seq_len, heads), device=scores.device)), dim=1)
# e.g. [10, 25942, 1, 4]
scores = self.reshape_fortran(scores, [batch, (hidden_state + seq_len) * seq_len, 1, heads])
# e.g. [10, 25928, 1, 4]
scores = scores[:, : (seq_len + hidden_state - 1) * seq_len]
# e.g. [10, 1852, 14, 4]
scores = self.reshape_fortran(scores, [batch, hidden_state + seq_len - 1, seq_len, heads])
halfpoint = hidden_state // 2
scores = scores[:, halfpoint : halfpoint + seq_len].transpose(1, 2) # e.g. [10, 14, 14, 4]
return scores.permute(0, 3, 1, 2)
def forward(
self,
hidden_states,
attention_mask=None,
output_attentions=False,
):
mixed_query_layer = self.query(hidden_states)
mixed_query_layer = mixed_query_layer / math.sqrt(self.attention_head_size)
key_layer = self.transpose_for_scores(self.key(hidden_states))
value_layer = self.transpose_for_scores(self.value(hidden_states))
query_layer = self.transpose_for_scores(mixed_query_layer)
# Take the dot product between "query" and "key" to get the raw attention scores.
attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
# relative key position embeddings
positional_embedding = self.distance_embedding.weight
relative_position_scores = torch.einsum("lh, bche -> bcle", positional_embedding, query_layer.transpose(2, 3))
relative_position_scores = self.relative_position_embedding_rotate(relative_position_scores)
attention_scores = attention_scores + relative_position_scores
if attention_mask is not None:
# Apply the attention mask is (precomputed for all layers in MCTCTModel forward() function)
attention_scores = attention_scores + attention_mask
# Normalize the attention scores to probabilities.
attention_probs = nn.functional.softmax(attention_scores, dim=-1)
# This is actually dropping out entire tokens to attend to, which might
# seem a bit unusual, but is taken from the original Transformer paper.
attention_probs = self.dropout(attention_probs)
context_layer = torch.matmul(attention_probs, value_layer)
context_layer = context_layer.permute(0, 2, 1, 3).flatten(start_dim=-2)
outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)
return outputs
class MCTCTLayerNorm(nn.Module):
def __init__(self):
super().__init__()
self.singleton_weight = nn.Parameter(torch.ones(1))
self.singleton_bias = nn.Parameter(torch.zeros(1))
def forward(self, hidden_states):
return (hidden_states * self.singleton_weight) + self.singleton_bias
class MCTCTSelfOutput(nn.Module):
def __init__(self, config):
super().__init__()
self.config = config
self.dense = nn.Linear(config.hidden_size, config.hidden_size, bias=False)
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
def forward(self, hidden_states, input_tensor):
hidden_states = self.dense(hidden_states)
hidden_states = self.dropout(hidden_states)
hidden_states = self.LayerNorm(hidden_states + input_tensor)
return hidden_states
class MCTCTAttention(nn.Module):
def __init__(self, config):
super().__init__()
self.self = MCTCTSelfAttention(config)
self.output = MCTCTSelfOutput(config)
def forward(
self,
hidden_states,
attention_mask=None,
output_attentions=False,
):
self_outputs = self.self(
hidden_states,
attention_mask,
output_attentions,
)
attention_output = self.output(self_outputs[0], hidden_states)
outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them
return outputs
class MCTCTIntermediate(nn.Module):
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.intermediate_size, bias=False)
if isinstance(config.hidden_act, str):
self.intermediate_act_fn = ACT2FN[config.hidden_act]
else:
self.intermediate_act_fn = config.hidden_act
def forward(self, hidden_states):
hidden_states = self.dense(hidden_states)
hidden_states = self.intermediate_act_fn(hidden_states)
return hidden_states
class MCTCTOutput(nn.Module):
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.intermediate_size, config.hidden_size, bias=False)
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
def forward(self, hidden_states, input_tensor):
hidden_states = self.dense(hidden_states)
hidden_states = self.dropout(hidden_states)
hidden_states = self.LayerNorm(hidden_states + input_tensor)
return hidden_states
class MCTCTLayer(GradientCheckpointingLayer):
def __init__(self, config: MCTCTConfig):
super().__init__()
self.seq_len_dim = 1
self.chunk_size_feed_forward = config.chunk_size_feed_forward
self.intermediate = MCTCTIntermediate(config)
self.attention = MCTCTAttention(config)
self.is_decoder = config.is_decoder
self.output = MCTCTOutput(config)
def forward(
self,
hidden_states,
attention_mask=None,
output_attentions=False,
):
self_attention_outputs = self.attention(hidden_states, attention_mask, output_attentions=output_attentions)
attention_output = self_attention_outputs[0]
outputs = self_attention_outputs[1:] # add self attentions if we output attention weights
layer_output = apply_chunking_to_forward(
self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output
)
outputs = (layer_output,) + outputs
return outputs
def feed_forward_chunk(self, attention_output):
intermediate_output = self.intermediate(attention_output)
layer_output = self.output(intermediate_output, attention_output)
return layer_output
class MCTCTPreTrainedModel(PreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
config: MCTCTConfig
base_model_prefix = "mctct"
main_input_name = "input_features"
supports_gradient_checkpointing = True
@torch.no_grad()
def _init_weights(self, module):
"""Initialize the weights"""
super()._init_weights(module)
if isinstance(module, MCTCTLayerNorm):
init.ones_(module.singleton_weight)
init.zeros_(module.singleton_bias)
def _get_feat_extract_output_lengths(self, input_lengths: torch.LongTensor):
"""
Computes the output length of the convolutional layers
"""
dilation = 1
for _, kernel_sz, stride in zip(
range(self.config.num_conv_layers), self.config.conv_kernel, self.config.conv_stride
):
padding = kernel_sz // 2
input_lengths = input_lengths + 2 * padding - dilation * (kernel_sz - 1) - 1
input_lengths = torch.div(input_lengths, stride, rounding_mode="trunc") + 1
return input_lengths
def _get_feature_vector_attention_mask(self, feature_vector_length, attention_mask):
# generate creates 3D attention mask, because of the shape of input_features
# convert it to 2D if that's the case
if len(attention_mask.shape) > 2:
attention_mask = attention_mask[:, :, -1]
# subsampled_lengths = attention_mask.sum(-1)
subsampled_lengths = self._get_feat_extract_output_lengths(attention_mask.sum(-1))
bsz = attention_mask.size()[0]
attention_mask = torch.zeros(
(bsz, feature_vector_length), dtype=attention_mask.dtype, device=attention_mask.device
)
# these two operations makes sure that all values
# before the output lengths indices are attended to
attention_mask[(torch.arange(bsz, device=attention_mask.device), subsampled_lengths - 1)] = 1
attention_mask = attention_mask.flip([-1]).cumsum(-1).flip([-1]).long()
return attention_mask
MCTCT_START_DOCSTRING = r"""
This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use
it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
behavior.
Parameters:
config ([`MCTCTConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
MCTCT_INPUTS_DOCSTRING = r"""
Args:
input_features (`torch.LongTensor` of shape `({0})`):
Indices of input sequence tokens in the vocabulary.
Indices can be obtained using [`Wav2Vec2CTCTokenizer`]. See [`PreTrainedTokenizer.encode`] and
[`PreTrainedTokenizer.__call__`] for details.
[What are input IDs?](../glossary#input-ids)
attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*):
Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
[What are attention masks?](../glossary#attention-mask)
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
"""
class MCTCTEncoder(MCTCTPreTrainedModel):
def __init__(self, config: MCTCTConfig):
super().__init__(config)
self.hidden_dropout_prob = config.hidden_dropout_prob
self.layer_norm = MCTCTLayerNorm()
self.conv = MCTCTConv1dSubsampler(config)
self.layers = nn.ModuleList([MCTCTLayer(config) for _ in range(config.num_hidden_layers)])
self.gradient_checkpointing = False
def forward(
self,
input_features: torch.Tensor,
attention_mask: torch.Tensor,
output_attentions: bool = False,
output_hidden_states: bool = False,
return_dict: bool = True,
) -> Union[tuple, BaseModelOutput]:
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
input_features = self.layer_norm(input_features)
inputs_embeds = self.conv(input_features)
# subsample attention mask if necessary
if attention_mask is not None:
attention_mask = self._get_feature_vector_attention_mask(inputs_embeds.shape[1], attention_mask)
hidden_states = nn.functional.dropout(inputs_embeds, p=self.hidden_dropout_prob, training=self.training)
# expand attention_mask
if attention_mask is not None:
# [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
attention_mask = _prepare_4d_attention_mask(attention_mask, inputs_embeds.dtype)
encoder_states = () if output_hidden_states else None
all_attentions = () if output_attentions else None
synced_gpus = is_deepspeed_zero3_enabled() or is_fsdp_managed_module(self)
for idx, encoder_layer in enumerate(self.layers):
if output_hidden_states:
encoder_states = encoder_states + (hidden_states,)
# add LayerDrop (see https://huggingface.co/papers/1909.11556 for description)
dropout_probability = torch.rand([])
skip_the_layer = self.training and dropout_probability < self.config.layerdrop
if not skip_the_layer or synced_gpus:
# under fsdp or deepspeed zero3 all gpus must run in sync
layer_outputs = encoder_layer(
hidden_states=hidden_states,
attention_mask=attention_mask,
output_attentions=output_attentions,
)
hidden_states = layer_outputs[0]
if skip_the_layer:
layer_outputs = (None, None)
if output_attentions:
all_attentions = all_attentions + (layer_outputs[1],)
if output_hidden_states:
encoder_states = encoder_states + (hidden_states,)
if not return_dict:
return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None)
return BaseModelOutput(
last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
)
@add_start_docstrings(
"The bare M-CTC-T Model transformer outputting raw hidden-states without any specific head on top.",
MCTCT_START_DOCSTRING,
)
class MCTCTModel(MCTCTPreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.config = config
self.encoder = MCTCTEncoder(config)
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(MCTCT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=BaseModelOutput,
config_class=_CONFIG_FOR_DOC,
modality="audio",
expected_output=_EXPECTED_OUTPUT_SHAPE,
)
def forward(
self,
input_features: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[tuple, BaseModelOutput]:
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
if input_features is None:
raise ValueError("You have to specify input_features.")
encoder_outputs = self.encoder(
input_features,
attention_mask=attention_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
sequence_output = encoder_outputs[0]
if not return_dict:
return (sequence_output,) + encoder_outputs[1:]
return BaseModelOutput(
last_hidden_state=sequence_output,
hidden_states=encoder_outputs.hidden_states,
attentions=encoder_outputs.attentions,
)
@add_start_docstrings(
"""MCTCT Model with a `language modeling` head on top for Connectionist Temporal Classification (CTC).""",
MCTCT_START_DOCSTRING,
)
class MCTCTForCTC(MCTCTPreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.mctct = MCTCTModel(config)
if config.vocab_size is None:
raise ValueError(
f"You are trying to instantiate {self.__class__} with a configuration that "
"does not define the vocabulary size of the language model head. Please "
"instantiate the model as follows: `MCTCTForCTC.from_pretrained(..., vocab_size=vocab_size)`. "
"or define `vocab_size` of your model's configuration."
)
output_hidden_size = config.hidden_size
self.ctc_head = nn.Linear(output_hidden_size, config.vocab_size)
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(MCTCT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=CausalLMOutput,
config_class=_CONFIG_FOR_DOC,
expected_output=_CTC_EXPECTED_OUTPUT,
expected_loss=_CTC_EXPECTED_LOSS,
)
def forward(
self,
input_features: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
labels: Optional[torch.LongTensor] = None,
) -> Union[tuple, CausalLMOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size, target_length)`, *optional*):
Labels for connectionist temporal classification. Note that `target_length` has to be smaller or equal to
the sequence length of the output logits. Indices are selected in `[-100, 0, ..., config.vocab_size - 1]`.
All labels set to `-100` are ignored (masked), the loss is only computed for labels in `[0, ...,
config.vocab_size - 1]`.
"""
if labels is not None and labels.max() >= self.config.vocab_size:
raise ValueError(f"Label values must be <= vocab_size: {self.config.vocab_size}")
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.mctct(
input_features,
attention_mask=attention_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
hidden_states = outputs[0]
logits = self.ctc_head(hidden_states)
loss = None
if labels is not None:
# retrieve loss input_lengths from attention_mask
attention_mask = (
attention_mask
if attention_mask is not None
else torch.ones(input_features.shape[:-1], dtype=torch.long)
)
input_lengths = self._get_feat_extract_output_lengths(attention_mask.sum(-1)).to(torch.long)
# assuming that padded tokens are filled with -100
# when not being attended to
labels_mask = labels >= 0
target_lengths = labels_mask.sum(-1)
flattened_targets = labels.masked_select(labels_mask)
# ctc_loss doesn't support fp16
log_probs = nn.functional.log_softmax(logits, dim=-1, dtype=torch.float32).transpose(0, 1)
with torch.backends.cudnn.flags(enabled=False):
loss = nn.functional.ctc_loss(
log_probs,
flattened_targets,
input_lengths,
target_lengths,
blank=self.config.pad_token_id,
reduction=self.config.ctc_loss_reduction,
zero_infinity=self.config.ctc_zero_infinity,
)
if not return_dict:
output = (logits,) + outputs[_HIDDEN_STATES_START_POSITION:]
return ((loss,) + output) if loss is not None else output
return CausalLMOutput(
loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions
)
__all__ = ["MCTCTForCTC", "MCTCTModel", "MCTCTPreTrainedModel"]

View File

@ -0,0 +1,139 @@
# coding=utf-8
# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Speech processor class for M-CTC-T
"""
import warnings
from contextlib import contextmanager
from ....processing_utils import ProcessorMixin
class MCTCTProcessor(ProcessorMixin):
r"""
Constructs a MCTCT processor which wraps a MCTCT feature extractor and a MCTCT tokenizer into a single processor.
[`MCTCTProcessor`] offers all the functionalities of [`MCTCTFeatureExtractor`] and [`AutoTokenizer`]. See the
[`~MCTCTProcessor.__call__`] and [`~MCTCTProcessor.decode`] for more information.
Args:
feature_extractor (`MCTCTFeatureExtractor`):
An instance of [`MCTCTFeatureExtractor`]. The feature extractor is a required input.
tokenizer (`AutoTokenizer`):
An instance of [`AutoTokenizer`]. The tokenizer is a required input.
"""
feature_extractor_class = "MCTCTFeatureExtractor"
tokenizer_class = "AutoTokenizer"
def __init__(self, feature_extractor, tokenizer):
super().__init__(feature_extractor, tokenizer)
self.current_processor = self.feature_extractor
self._in_target_context_manager = False
def __call__(self, *args, **kwargs):
"""
When used in normal mode, this method forwards all its arguments to MCTCTFeatureExtractor's
[`~MCTCTFeatureExtractor.__call__`] and returns its output. If used in the context
[`~MCTCTProcessor.as_target_processor`] this method forwards all its arguments to AutoTokenizer's
[`~AutoTokenizer.__call__`]. Please refer to the docstring of the above two methods for more information.
"""
# For backward compatibility
if self._in_target_context_manager:
return self.current_processor(*args, **kwargs)
if "raw_speech" in kwargs:
warnings.warn("Using `raw_speech` as a keyword argument is deprecated. Use `audio` instead.")
audio = kwargs.pop("raw_speech")
else:
audio = kwargs.pop("audio", None)
sampling_rate = kwargs.pop("sampling_rate", None)
text = kwargs.pop("text", None)
if len(args) > 0:
audio = args[0]
args = args[1:]
if audio is None and text is None:
raise ValueError("You need to specify either an `audio` or `text` input to process.")
if audio is not None:
inputs = self.feature_extractor(audio, *args, sampling_rate=sampling_rate, **kwargs)
if text is not None:
encodings = self.tokenizer(text, **kwargs)
if text is None:
return inputs
elif audio is None:
return encodings
else:
inputs["labels"] = encodings["input_ids"]
return inputs
def pad(self, *args, **kwargs):
"""
When used in normal mode, this method forwards all its arguments to MCTCTFeatureExtractor's
[`~MCTCTFeatureExtractor.pad`] and returns its output. If used in the context
[`~MCTCTProcessor.as_target_processor`] this method forwards all its arguments to PreTrainedTokenizer's
[`~PreTrainedTokenizer.pad`]. Please refer to the docstring of the above two methods for more information.
"""
# For backward compatibility
if self._in_target_context_manager:
return self.current_processor.pad(*args, **kwargs)
input_features = kwargs.pop("input_features", None)
labels = kwargs.pop("labels", None)
if len(args) > 0:
input_features = args[0]
args = args[1:]
if input_features is not None:
input_features = self.feature_extractor.pad(input_features, *args, **kwargs)
if labels is not None:
labels = self.tokenizer.pad(labels, **kwargs)
if labels is None:
return input_features
elif input_features is None:
return labels
else:
input_features["labels"] = labels["input_ids"]
return input_features
def decode(self, *args, **kwargs):
"""
This method forwards all its arguments to AutoTokenizer's [`~PreTrainedTokenizer.decode`]. Please refer to the
docstring of this method for more information.
"""
return self.tokenizer.decode(*args, **kwargs)
@contextmanager
def as_target_processor(self):
"""
Temporarily sets the tokenizer for processing the input. Useful for encoding the labels when fine-tuning MCTCT.
"""
warnings.warn(
"`as_target_processor` is deprecated and will be removed in v5 of Transformers. You can process your "
"labels by using the argument `text` of the regular `__call__` method (either in the same call as "
"your audio inputs, or in a separate call."
)
self._in_target_context_manager = True
self.current_processor = self.tokenizer
yield
self.current_processor = self.feature_extractor
self._in_target_context_manager = False
__all__ = ["MCTCTProcessor"]

View File

@ -0,0 +1,27 @@
# Copyright 2023 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import TYPE_CHECKING
from ....utils import _LazyModule
from ....utils.import_utils import define_import_structure
if TYPE_CHECKING:
from .configuration_mega import *
from .modeling_mega import *
else:
import sys
_file = globals()["__file__"]
sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)

View File

@ -0,0 +1,224 @@
# coding=utf-8
# Copyright 2023 The Mega Authors and The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""MEGA configuration"""
from ....configuration_utils import PreTrainedConfig
from ....utils import logging
logger = logging.get_logger(__name__)
class MegaConfig(PreTrainedConfig):
r"""
This is the configuration class to store the configuration of a [`MegaModel`]. It is used to instantiate a Mega
model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
defaults will yield a similar configuration to that of the Mega
[mnaylor/mega-base-wikitext](https://huggingface.co/mnaylor/mega-base-wikitext) architecture.
Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PreTrainedConfig`] for more information.
Args:
vocab_size (`int`, *optional*, defaults to 30522):
Vocabulary size of the Mega model. Defines the number of different tokens that can be represented by the
`inputs_ids` passed when calling [`MegaModel`].
hidden_size (`int`, *optional*, defaults to 128):
Dimensionality of the encoder layers and the pooler layer.
num_hidden_layers (`int`, *optional*, defaults to 4):
Number of hidden layers in the Mega encoder.
intermediate_size (`int`, *optional*, defaults to 256):
Dimensionality of the hidden size (self-attention value projection) within the Mega encoder
ema_projection_size (`int`, *optional*, defaults to 16):
Dimensionality of the MegaMultiDimensionDampedEma
bidirectional (`bool`, *optional*, defaults to `True`):
Whether the MegaMultiDimensionDampedEma used in Mega's self-attention should work bidirectionally (`True`)
or unidirectionally (`False`). Bidirectional EMA is incompatible with causal decoding, so this should be
False if you intend to use the model as a decoder.
shared_representation_size (`int`, *optional*, defaults to 64):
Dimensionality of the linear projection for shared representation of self-attention queries and keys
use_chunking (`bool`, *optional*, defaults to `False`):
Whether to chunk inputs for linear self-attention complexity (described as Mega-chunk in the paper)
chunk_size (`int`, *optional*, defaults to -1):
If `use_chunking` is set to `True`, determines the size of the chunks to apply to the input sequence. If
chunking is used, input sequences must be padded to a multiple of `chunk_size`
truncation (`int`, *optional*):
If specified, the sequence length for which to truncate MegaMultiDimensionDampedEma
normalize_before_mega (`bool`, *optional*, defaults to `True`):
Whether to normalize before (`True`) or after (`False`) passing through Mega encoder blocks
normalization_type (`str`, *optional*, defaults to `"scalenorm"`):
Type of normalization to use in Mega encoder blocks. Choose one of `"scalenorm"`, `"layernorm"`,
`"rmsnorm"`, `"batchnorm"`, or `"syncbatchnorm"` (GPU required for syncbatchnorm)
norm_affine (`bool`, *optional*, defaults to `True`):
If `True`, applies a parameterized affine transformation to inputs during normalization
activation (`str`, *optional*, defaults to `"silu"`):
Activation function to apply within Mega encoder blocks. Choose one of `"silu"`, `"relu"`, `"linear"`,
`"gelu"`, or `"gelu_accurate"`
attention_activation (`str`, *optional*, defaults to `"softmax"`):
Activation function to apply for single-headed self-attention (a la Transformer). Choose one of
`"softmax"`, `"laplace"`, or `"relu2"`
dropout_prob (`float`, *optional*, defaults to 0.1):
The dropout probability for EMA self-attention
hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
The dropout ratio for the attention probabilities.
use_feature_dropout (`bool`, *optional*, defaults to `False`):
Whether to use feature-based (`True`) or standard dropout (`False`)
use_normalized_ffn (`bool`, *optional*, defaults to `True`):
Whether to use the normalized feed-forward sub-layer in Mega blocks (`True`) or pass Mega encoder output
as-is (`False`)
nffn_hidden_size (`int`, *optional*, defaults to 256):
If using the normalized feed-forward network (NFFN) layer within Mega (`use_normalized_ffn = True`), this
is the hidden size of the NFFN
normalize_before_ffn (`bool`, *optional*, defaults to `True`):
Whether to normalize before (`True`) or after (`False`) the feed-forward portion of NFFN
nffn_activation_dropout_prob (`float`, *optional*, defaults to 0.1):
The dropout ratio for the NFFN component.
max_positions (`int`, *optional*, defaults to 2048):
The maximum sequence length to use for positional representations. For `"simple"` relative positional bias,
this is a hard limit on input length; `"rotary"` relative positional bias will extrapolate to longer
sequences
add_token_type_embeddings (`bool`, *optional*, defaults to `True`):
Whether to account for token types in embeddings. Left as optional to maintain compatibility with original
implementation while adding support for token types.
type_vocab_size (`int`, *optional*, defaults to 2):
The vocabulary size of the `token_type_ids` passed when calling [`MegaModel`]. Only used if
`add_token_type_embeddings = True`
initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
ema_delta_alpha_range (`float`, *optional*, defaults to 0.2):
The standard deviation for initializing the delta (damping factor) and alpha (decay factor) parameters in
MegaMultiDimensionDampedEma.
ema_beta_range (`float`, *optional*, defaults to 0.02):
The standard deviation for initializing the beta parameter (expansion matrix) in
MegaMultiDimensionDampedEma.
ema_gamma_omega_range (`float`, *optional*, defaults to 1.0):
The standard deviation for initializing the gamma (projection matrix) and omega (residual weight)
parameters in MultiDimensionEMA.
relative_positional_bias (`str`, *optional*, defaults to `"rotary"`):
Type of relative positional encoding. Choose one of `"rotary"` or `"simple"`. If `"simple"` is selected,
`max_positions` is used as a limit on input size, while `"rotary"` extrapolates beyond `max_positions`.
is_decoder (`bool`, *optional*, defaults to `False`):
Whether the model is used as a decoder or not. If `False`, the model is used as an encoder.
use_cache (`bool`, *optional*, defaults to `True`):
Whether or not the model should return the last key/values attentions (not used by all models). Only
relevant if `config.is_decoder=True`.
classifier_dropout (`float`, *optional*):
The dropout ratio for the classification head.
add_lm_hidden_dense_layer (`bool`, *optional*, defaults to `True`):
Whether to include a hidden layer for projection between encoder outputs and LM heads (`True`) or pass
hidden states directly to LM head (`False`). Remains optional for compatibility with original
implementation
Examples:
```python
>>> from transformers import MegaConfig, MegaModel
>>> # Initializing a Mega configuration
>>> configuration = MegaConfig()
>>> # Initializing a model (with random weights) from the configuration
>>> model = MegaModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```"""
model_type = "mega"
def __init__(
self,
vocab_size=30522,
hidden_size=128,
num_hidden_layers=4,
intermediate_size=256,
ema_projection_size=16,
bidirectional=True,
shared_representation_size=64,
use_chunking=False,
chunk_size=-1,
truncation=None,
normalize_before_mega=True,
normalization_type="scalenorm",
norm_affine=True,
activation="silu",
attention_activation="softmax",
dropout_prob=0.1,
hidden_dropout_prob=0.1,
attention_probs_dropout_prob=0.1,
use_feature_dropout=False,
use_normalized_ffn=True,
nffn_hidden_size=256,
normalize_before_ffn=True,
nffn_activation_dropout_prob=0.1,
max_positions=2048,
add_token_type_embeddings=False,
type_vocab_size=2,
initializer_range=0.02,
ema_delta_alpha_range=0.2,
ema_beta_range=0.02,
ema_gamma_omega_range=1.0,
pad_token_id=1,
bos_token_id=0,
eos_token_id=2,
relative_positional_bias="rotary",
classifier_dropout=None,
use_cache=True,
add_lm_hidden_dense_layer=True,
**kwargs,
):
super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
self.vocab_size = vocab_size
self.hidden_size = hidden_size
self.num_hidden_layers = num_hidden_layers
self.activation = activation
self.attention_activation = attention_activation
self.intermediate_size = intermediate_size
self.ema_projection_size = ema_projection_size
self.bidirectional = bidirectional
self.shared_representation_size = shared_representation_size
self.use_chunking = use_chunking
self.chunk_size = chunk_size
self.truncation = truncation
self.normalize_before_mega = normalize_before_mega
self.normalization_type = normalization_type
self.norm_affine = norm_affine
self.dropout_prob = dropout_prob
self.hidden_dropout_prob = hidden_dropout_prob
self.attention_probs_dropout_prob = attention_probs_dropout_prob
self.use_feature_dropout = use_feature_dropout
self.use_normalized_ffn = use_normalized_ffn
self.nffn_hidden_size = nffn_hidden_size
self.normalize_before_ffn = normalize_before_ffn
self.nffn_activation_dropout_prob = nffn_activation_dropout_prob
self.max_positions = max_positions
self.add_token_type_embeddings = add_token_type_embeddings
self.type_vocab_size = type_vocab_size
self.initializer_range = initializer_range
self.ema_delta_alpha_range = ema_delta_alpha_range
self.ema_beta_range = ema_beta_range
self.ema_gamma_omega_range = ema_gamma_omega_range
self.relative_positional_bias = relative_positional_bias
self.use_cache = use_cache
self.classifier_dropout = classifier_dropout
self.add_lm_hidden_dense_layer = add_lm_hidden_dense_layer
self.num_attention_heads = 1 # not used but required by Hugging Face
__all__ = ["MegaConfig"]

View File

@ -0,0 +1,307 @@
# coding=utf-8
# Copyright 2023 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Convert Mega pretrained checkpoint. Built to convert the Masked LM checkpoint located at
https://huggingface.co/mnaylor/mega-wikitext-103
Requirements:
- clone the Mega repo and install fairseq from there
1. git clone https://github.com/facebookresearch/mega.git
2. cd mega && pip install -e
- clone the pretrained weights for the original implementation from the hugging face repo
* use this location as the path for pretrained weights
"""
import argparse
# utilities to import the model weights and config file
import os
import pickle
# PyTorch + new model classes
import torch
from torch import nn
from transformers import AutoTokenizer, MegaConfig, MegaForMaskedLM
from ....utils import strtobool
# import the EncoderLayer class used to pretrain
# !! NOTE !! this requires the version of fairseq that is built when you install the Mega source
try:
from fairseq.modules.mega_layer import MegaEncoderLayer
except ImportError:
raise ImportError("You need to install the version of fairseq from the Mega repo!")
# define the wrapper classes used to train the MLM (see colab notebook below)
# https://colab.research.google.com/drive/1qfUO6o5HRdxBblWlw058HVyvaEPhPpH8?usp=sharing
# MegaLM outputs hidden states
class MegaLM(nn.Module):
"The base class for our Mega encoder - given input IDs, embed text and return encoder output"
def __init__(self, mega_args, depth, vocab_size):
super().__init__()
self.mega_args = mega_args
self.embedding_layer = nn.Embedding(vocab_size, self.mega_args.encoder_embed_dim)
self.encoders = nn.ModuleList([MegaEncoderLayer(self.mega_args) for _ in range(depth)])
self.depth = depth
def forward(self, input_ids, attention_mask, batch_first=True, ignore_mask_value=0):
"""
Code for a forward pass - expects input_ids and attention_mask to come from a Hugging Face tokenizer as PyTorch
tensors, and returns a tensor of size (batch, n_classes) containing classification logits
Other options:
- batch_first: boolean indicating whether the batch dimension is first in input_ids (default: True, which
aligns with the HF tokenizer behavior)
- ignore_mask_value: the value in attention_mask that identifies tokens that should be ignored (default: 0,
which aligns with HF tokenizer)
"""
# Mega expects embeddings to be (time, batch, embedding size), but
# Hugging Face returns tokens as (batch, time)
if batch_first:
input_ids = input_ids.T
# to make things more confusing, Mega expects the attention mask to
# be (batch, time), but with values of 0 (normal token) and 1 (ignore token)
# which is the opposite of what HF returns
if ignore_mask_value == 0:
attention_mask = 1 - attention_mask
# get token embeddings from IDs
embeds = self.embedding_layer(input_ids)
# pass through the Mega layers
# input is (time, batch, encoder dim) and output is the same
for encoder in self.encoders:
embeds = encoder(embeds, attention_mask)
# return according to the shape specified
if batch_first:
# (T, B, H) --> (B, T, H)
return torch.transpose(embeds, 0, 1)
else:
return embeds
# renamed from MegaForMaskedLM to avoid confusion with new module
class OriginalMegaForMaskedLM(nn.Module):
"A wrapper class for doing masked language modeling with Mega"
def __init__(self, mega_args, depth, vocab_size):
super().__init__()
self.mega = MegaLM(mega_args, depth, vocab_size)
self.mlm_head = nn.Linear(mega_args.encoder_embed_dim, vocab_size)
self.dropout = nn.Dropout(p=0.1)
def forward(self, input_ids, attention_mask, batch_first=True, ignore_mask_value=0):
"""
Perform a forward pass through the Mega encoder and the masked LM head. Returns logits for each vocabulary
entry.
If `batch_first` (default to align with Hugging Face tokenizer behavior), output will have the shape (Batch
size, Sequence length, Vocab size); otherwise (S, B, V)
"""
encoder_output = self.mega(input_ids, attention_mask, batch_first, ignore_mask_value)
return self.mlm_head(self.dropout(encoder_output))
# code to convert the checkpoint located in the user-specified location
def convert_checkpoint_to_huggingface(pretrained_checkpoint_path, output_path, includes_tokenizer):
if not strtobool(os.environ.get("TRUST_REMOTE_CODE", "False")):
raise ValueError(
"This part uses `pickle.load` which is insecure and will execute arbitrary code that is potentially "
"malicious. It's recommended to never unpickle data that could have come from an untrusted source, or "
"that could have been tampered with. If you already verified the pickle data and decided to use it, "
"you can set the environment variable `TRUST_REMOTE_CODE` to `True` to allow it."
)
with open(os.path.join(pretrained_checkpoint_path, "model_args.pkl"), "rb") as f:
mega_original_args = pickle.load(f)
# load the original encoder
original_mlm = OriginalMegaForMaskedLM(**mega_original_args).eval()
# load its weights
print(
"Original Mega encoder:",
original_mlm.mega.load_state_dict(
torch.load(
os.path.join(pretrained_checkpoint_path, "encoder_weights.pt"), map_location="cpu", weights_only=True
)
),
)
print(
"Original Mega MLM layer:",
original_mlm.mlm_head.load_state_dict(
torch.load(
os.path.join(pretrained_checkpoint_path, "mlm_head_weights.pt"), map_location="cpu", weights_only=True
)
),
)
# create a new config from the old one
hf_config = MegaConfig(
num_hidden_layers=mega_original_args["depth"],
vocab_size=mega_original_args["vocab_size"],
hidden_size=mega_original_args["mega_args"].encoder_embed_dim,
shared_representation_size=mega_original_args["mega_args"].encoder_z_dim,
intermediate_size=mega_original_args["mega_args"].encoder_hidden_dim,
ema_projection_size=mega_original_args["mega_args"].encoder_n_dim,
dropout_prob=mega_original_args["mega_args"].dropout,
attention_probs_dropout_prob=mega_original_args["mega_args"].attention_dropout,
hidden_dropout_prob=mega_original_args["mega_args"].hidden_dropout,
activation=mega_original_args["mega_args"].activation_fn,
attention_activation=mega_original_args["mega_args"].attention_activation_fn,
bidirectional=mega_original_args["mega_args"].bidirectional,
use_chunking=mega_original_args["mega_args"].encoder_chunk_size > 0,
chunk_size=mega_original_args["mega_args"].encoder_chunk_size,
truncation=mega_original_args["mega_args"].truncation_length,
normalization_type=mega_original_args["mega_args"].normalization_type,
normalize_before_mega=True,
norm_affine=True,
use_feature_dropout=mega_original_args["mega_args"].feature_dropout,
relative_positional_bias=mega_original_args["mega_args"].rel_pos_bias,
max_positions=mega_original_args["mega_args"].max_source_positions,
nffn_hidden_size=mega_original_args["mega_args"].encoder_ffn_embed_dim,
normalize_before_ffn=mega_original_args["mega_args"].normalize_before,
# new arguments added for HF implementation
nffn_activation_dropout_prob=0.0,
add_token_type_embeddings=False,
add_lm_hidden_dense_layer=False,
)
hf_mlm = MegaForMaskedLM(hf_config).eval()
# the original checkpoint just uses nn.Embedding for the word embeddings
# we use a wrapper module for embeddings to add support for positional embeddings
hf_mlm.mega.embedding_layer.word_embeddings.weight = original_mlm.mega.embedding_layer.weight
# modify the state dictionary of the original checkpoint to account for naming issues in the Hugging Face
# ecosystem -- any names containing "beta" or "gamma" aren't safe to use and are renamed upon _load_pretrained,
# also renaming previously confusing parameter names
original_state_dict = original_mlm.mega.encoders.state_dict()
updated_keys = {}
for module_name in original_state_dict:
new_module_name = None
# have to handle gamma, beta, and alpha differently due to their use
# in multiple modules within the original repository;
# beta is used in EMA, MovingAverageGatedAttention, and RotaryRelativePositionalBias, and must be renamed due to flax/tf weights
# the EMA sublayer was renamed from "move" to "ema_gate" for readability, so that is also done here
if "beta" in module_name:
# EMA sub-layers were always called "move" in the original repo
if "move.beta" in module_name:
new_module_name = module_name.replace("move.beta", "ema_gate.ema_expansion_matrix")
elif "mega_layer.beta" in module_name:
new_module_name = module_name.replace("beta", "qk_bias")
else:
new_module_name = module_name.replace("beta", "b_param")
# beta is used in EMA and MovingAverageGatedAttention, and must be renamed due to flax/tf weights
elif "gamma" in module_name:
if "move.gamma" in module_name:
new_module_name = module_name.replace("move.gamma", "ema_gate.kernel_projection_matrix")
elif "mega_layer.gamma" in module_name:
new_module_name = module_name.replace("gamma", "qk_weight")
else:
new_module_name = module_name.replace("gamma", "g_param")
# alpha is used in EMA and positional bias; renaming to improve readability
elif "move.alpha" in module_name:
new_module_name = module_name.replace("move.alpha", "ema_gate.decay_factor")
# delta is only used in EMA; renaming to improve readability
elif "move.delta" in module_name:
new_module_name = module_name.replace("move.delta", "ema_gate.damping_factor")
# omega is only used in EMA; renaming to improve readability
elif "omega" in module_name:
new_module_name = module_name.replace("move.omega", "ema_gate.residual_weight")
if new_module_name:
updated_keys[module_name] = new_module_name
if len(updated_keys) != 0:
print(f"Renaming these keys: {updated_keys.keys()}")
else:
print("No need to rename state dict entries")
for old, new in updated_keys.items():
original_state_dict[new] = original_state_dict.pop(old)
# now attempt to load the state dictionary with updated names
# note that we now call it `mega.layers` instead of `mega.encoders` due to hugging face style
print("HF Mega encoder:", hf_mlm.mega.layers.load_state_dict(original_state_dict))
# load the MLM head weights directly
print(
"HF Mega MLM layer:",
hf_mlm.mlm_head.load_state_dict(
torch.load(
os.path.join(pretrained_checkpoint_path, "mlm_head_weights.pt"), map_location="cpu", weights_only=True
)
),
)
# test on a randomly generated input sequence
input_ids = torch.randint(0, hf_config.vocab_size, size=(4, 256))
input_mask = torch.ones_like(input_ids)
# mask a few tokens to make sure masking is applied appropriately :)
input_mask[:, -10:] = 0
# run forward passes
original_output = original_mlm(input_ids, input_mask, batch_first=True, ignore_mask_value=0)
hf_output = hf_mlm(input_ids, input_mask)[0]
# print shapes and diff
print(f"original output {original_output.shape}")
print(f"hf output {hf_output.shape}")
print(f"max diff: {(original_output - hf_output).max()}") # 0.0
success = torch.allclose(original_output, hf_output, atol=1e-3)
if success:
print("Yay!")
hf_mlm.save_pretrained(output_path)
else:
raise RuntimeError(f"Something's broken :(\nOriginal:\n{original_output}\n\nHF\n{hf_output}\n{hf_mlm}")
if includes_tokenizer:
print("Transferring tokenizer")
tokenizer = AutoTokenizer.from_pretrained(pretrained_checkpoint_path)
tokenizer.save_pretrained(output_path)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--pretrained_checkpoint_path",
default=None,
type=str,
required=True,
help="Point to the directory containing your model weights using the official Mega repo",
)
parser.add_argument(
"--output_path", default=None, type=str, required=True, help="Location to save the Hugging Face version"
)
parser.add_argument(
"--includes_tokenizer",
action="store_true",
help="Use this flag if there is a Hugging Face tokenizer in the original checkpoint repo",
)
args = parser.parse_args()
convert_checkpoint_to_huggingface(args.pretrained_checkpoint_path, args.output_path, args.includes_tokenizer)

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,27 @@
# Copyright 2020 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import TYPE_CHECKING
from ....utils import _LazyModule
from ....utils.import_utils import define_import_structure
if TYPE_CHECKING:
from .configuration_mmbt import *
from .modeling_mmbt import *
else:
import sys
_file = globals()["__file__"]
sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)

View File

@ -0,0 +1,45 @@
# coding=utf-8
# Copyright (c) Facebook, Inc. and its affiliates.
# Copyright (c) HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""MMBT configuration"""
from ....utils import logging
logger = logging.get_logger(__name__)
class MMBTConfig:
"""
This is the configuration class to store the configuration of a [`MMBTModel`]. It is used to instantiate a MMBT
model according to the specified arguments, defining the model architecture.
Args:
config ([`PreTrainedConfig`]):
Config of the underlying Transformer models. Its values are copied over to use a single config.
num_labels (`int`, *optional*):
Size of final Linear layer for classification.
modal_hidden_size (`int`, *optional*, defaults to 2048):
Embedding dimension of the non-text modality encoder.
"""
def __init__(self, config, num_labels=None, modal_hidden_size=2048):
self.__dict__ = config.__dict__
self.modal_hidden_size = modal_hidden_size
if num_labels:
self.num_labels = num_labels
__all__ = ["MMBTConfig"]

View File

@ -0,0 +1,399 @@
# coding=utf-8
# Copyright (c) Facebook, Inc. and its affiliates.
# Copyright (c) HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""PyTorch MMBT model."""
import torch
from torch import nn
from torch.nn import CrossEntropyLoss, MSELoss
from ....modeling_outputs import BaseModelOutputWithPooling, SequenceClassifierOutput
from ....modeling_utils import ModuleUtilsMixin
from ....utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings
logger = logging.get_logger(__name__)
_CONFIG_FOR_DOC = "MMBTConfig"
class ModalEmbeddings(nn.Module):
"""Generic Modal Embeddings which takes in an encoder, and a transformer embedding."""
def __init__(self, config, encoder, embeddings):
super().__init__()
self.config = config
self.encoder = encoder
self.proj_embeddings = nn.Linear(config.modal_hidden_size, config.hidden_size)
self.position_embeddings = embeddings.position_embeddings
self.token_type_embeddings = embeddings.token_type_embeddings
self.word_embeddings = embeddings.word_embeddings
self.LayerNorm = embeddings.LayerNorm
self.dropout = nn.Dropout(p=config.hidden_dropout_prob)
def forward(self, input_modal, start_token=None, end_token=None, position_ids=None, token_type_ids=None):
token_embeddings = self.proj_embeddings(self.encoder(input_modal))
seq_length = token_embeddings.size(1)
if start_token is not None:
start_token_embeds = self.word_embeddings(start_token)
seq_length += 1
token_embeddings = torch.cat([start_token_embeds.unsqueeze(1), token_embeddings], dim=1)
if end_token is not None:
end_token_embeds = self.word_embeddings(end_token)
seq_length += 1
token_embeddings = torch.cat([token_embeddings, end_token_embeds.unsqueeze(1)], dim=1)
if position_ids is None:
position_ids = torch.arange(seq_length, dtype=torch.long, device=input_modal.device)
position_ids = position_ids.unsqueeze(0).expand(input_modal.size(0), seq_length)
if token_type_ids is None:
token_type_ids = torch.zeros(
(input_modal.size(0), seq_length), dtype=torch.long, device=input_modal.device
)
position_embeddings = self.position_embeddings(position_ids)
token_type_embeddings = self.token_type_embeddings(token_type_ids)
embeddings = token_embeddings + position_embeddings + token_type_embeddings
embeddings = self.LayerNorm(embeddings)
embeddings = self.dropout(embeddings)
return embeddings
MMBT_START_DOCSTRING = r"""
MMBT model was proposed in [Supervised Multimodal Bitransformers for Classifying Images and
Text](https://github.com/facebookresearch/mmbt) by Douwe Kiela, Suvrat Bhooshan, Hamed Firooz, Davide Testuggine.
It's a supervised multimodal bitransformer model that fuses information from text and other image encoders, and
obtain state-of-the-art performance on various multimodal classification benchmark tasks.
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
and behavior.
Parameters:
config ([`MMBTConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration.
transformer (`nn.Module`): A text transformer that is used by MMBT.
It should have embeddings, encoder, and pooler attributes.
encoder (`nn.Module`): Encoder for the second modality.
It should take in a batch of modal inputs and return k, n dimension embeddings.
"""
MMBT_INPUTS_DOCSTRING = r"""
Args:
input_modal (`torch.FloatTensor` of shape `(batch_size, ***)`):
The other modality data. It will be the shape that the encoder for that type expects. e.g. With an Image
Encoder, the shape would be (batch_size, channels, height, width)
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary. It does not expect [CLS] token to be added as it's
appended to the end of other modality embeddings. Indices can be obtained using [`AutoTokenizer`]. See
[`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details.
[What are input IDs?](../glossary#input-ids)
modal_start_tokens (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Optional start token to be added to Other Modality Embedding. [CLS] Most commonly used for classification
tasks.
modal_end_tokens (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Optional end token to be added to Other Modality Embedding. [SEP] Most commonly used.
attention_mask (*optional*) `torch.FloatTensor` of shape `(batch_size, sequence_length)`:
Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
[What are attention masks?](../glossary#attention-mask)
token_type_ids (*optional*) `torch.LongTensor` of shape `(batch_size, sequence_length)`:
Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
1]`:
- 0 corresponds to a *sentence A* token,
- 1 corresponds to a *sentence B* token.
[What are token type IDs?](../glossary#token-type-ids)
modal_token_type_ids (*optional*) `torch.LongTensor` of shape `(batch_size, modal_sequence_length)`:
Segment token indices to indicate different portions of the non-text modality. The embeddings from these
tokens will be summed with the respective token embeddings for the non-text modality.
position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
config.max_position_embeddings - 1]`.
[What are position IDs?](../glossary#position-ids)
modal_position_ids (`torch.LongTensor` of shape `(batch_size, modal_sequence_length)`, *optional*):
Indices of positions of each input sequence tokens in the position embeddings for the non-text modality.
Selected in the range `[0, config.max_position_embeddings - 1]`.
[What are position IDs?](../glossary#position-ids)
inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, embedding_dim)`, *optional*):
Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
model's internal embedding lookup matrix.
encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
the model is configured as a decoder.
encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
@add_start_docstrings(
"The bare MMBT Model outputting raw hidden-states without any specific head on top.",
MMBT_START_DOCSTRING,
)
class MMBTModel(nn.Module, ModuleUtilsMixin):
def __init__(self, config, transformer, encoder):
super().__init__()
self.config = config
self.transformer = transformer
self.modal_encoder = ModalEmbeddings(config, encoder, transformer.embeddings)
@add_start_docstrings_to_model_forward(MMBT_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=_CONFIG_FOR_DOC)
def forward(
self,
input_modal,
input_ids=None,
modal_start_tokens=None,
modal_end_tokens=None,
attention_mask=None,
token_type_ids=None,
modal_token_type_ids=None,
position_ids=None,
modal_position_ids=None,
inputs_embeds=None,
encoder_hidden_states=None,
encoder_attention_mask=None,
output_attentions=None,
output_hidden_states=None,
return_dict=None,
):
r"""
Returns:
Examples:
```python
# For example purposes. Not runnable.
transformer = BertModel.from_pretrained("google-bert/bert-base-uncased")
encoder = ImageEncoder(args)
mmbt = MMBTModel(config, transformer, encoder)
```"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
if input_ids is not None and inputs_embeds is not None:
raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
elif input_ids is not None:
input_txt_shape = input_ids.size()
elif inputs_embeds is not None:
input_txt_shape = inputs_embeds.size()[:-1]
else:
raise ValueError("You have to specify either input_ids or inputs_embeds")
device = input_ids.device if input_ids is not None else inputs_embeds.device
modal_embeddings = self.modal_encoder(
input_modal,
start_token=modal_start_tokens,
end_token=modal_end_tokens,
position_ids=modal_position_ids,
token_type_ids=modal_token_type_ids,
)
input_modal_shape = modal_embeddings.size()[:-1]
if token_type_ids is None:
token_type_ids = torch.ones(input_txt_shape, dtype=torch.long, device=device)
txt_embeddings = self.transformer.embeddings(
input_ids=input_ids, position_ids=position_ids, token_type_ids=token_type_ids, inputs_embeds=inputs_embeds
)
embedding_output = torch.cat([modal_embeddings, txt_embeddings], 1)
input_shape = embedding_output.size()[:-1]
if attention_mask is None:
attention_mask = torch.ones(input_shape, device=device)
else:
attention_mask = torch.cat(
[torch.ones(input_modal_shape, device=device, dtype=torch.long), attention_mask], dim=1
)
if encoder_attention_mask is None:
encoder_attention_mask = torch.ones(input_shape, device=device)
else:
encoder_attention_mask = torch.cat(
[torch.ones(input_modal_shape, device=device), encoder_attention_mask], dim=1
)
extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape)
encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask)
encoder_outputs = self.transformer.encoder(
embedding_output,
attention_mask=extended_attention_mask,
encoder_hidden_states=encoder_hidden_states,
encoder_attention_mask=encoder_extended_attention_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
sequence_output = encoder_outputs[0]
pooled_output = self.transformer.pooler(sequence_output)
if not return_dict:
return (sequence_output, pooled_output) + encoder_outputs[1:]
return BaseModelOutputWithPooling(
last_hidden_state=sequence_output,
pooler_output=pooled_output,
hidden_states=encoder_outputs.hidden_states,
attentions=encoder_outputs.attentions,
)
def get_input_embeddings(self):
return self.embeddings.word_embeddings
def set_input_embeddings(self, value):
self.embeddings.word_embeddings = value
@add_start_docstrings(
"""
MMBT Model with a sequence classification/regression head on top (a linear layer on top of the pooled output)
""",
MMBT_START_DOCSTRING,
MMBT_INPUTS_DOCSTRING,
)
class MMBTForClassification(nn.Module):
r"""
**labels**: (*optional*) `torch.LongTensor` of shape `(batch_size,)`:
Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
Returns: *Tuple* comprising various elements depending on the configuration (config) and inputs: **loss**:
(*optional*, returned when `labels` is provided) `torch.FloatTensor` of shape `(1,)`: Classification (or
regression if config.num_labels==1) loss. **logits**:
`torch.FloatTensor` of shape `(batch_size, config.num_labels)` Classification (or regression if
config.num_labels==1) scores (before SoftMax).
**hidden_states**: (*optional*, returned when `output_hidden_states=True`) list of `torch.FloatTensor` (one for
the output of each layer + the output of the embeddings) of shape `(batch_size, sequence_length, hidden_size)`:
Hidden-states of the model at the output of each layer plus the initial embedding outputs. **attentions**:
(*optional*, returned when `output_attentions=True`) list of `torch.FloatTensor` (one for each layer) of shape
`(batch_size, num_heads, sequence_length, sequence_length)`: Attentions weights after the attention softmax, used
to compute the weighted average in the self-attention heads.
Examples:
```python
# For example purposes. Not runnable.
transformer = BertModel.from_pretrained("google-bert/bert-base-uncased")
encoder = ImageEncoder(args)
model = MMBTForClassification(config, transformer, encoder)
outputs = model(input_modal, input_ids, labels=labels)
loss, logits = outputs[:2]
```"""
def __init__(self, config, transformer, encoder):
super().__init__()
self.num_labels = config.num_labels
self.mmbt = MMBTModel(config, transformer, encoder)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
self.classifier = nn.Linear(config.hidden_size, config.num_labels)
def forward(
self,
input_modal,
input_ids=None,
modal_start_tokens=None,
modal_end_tokens=None,
attention_mask=None,
token_type_ids=None,
modal_token_type_ids=None,
position_ids=None,
modal_position_ids=None,
inputs_embeds=None,
labels=None,
return_dict=None,
):
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.mmbt(
input_modal=input_modal,
input_ids=input_ids,
modal_start_tokens=modal_start_tokens,
modal_end_tokens=modal_end_tokens,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
modal_token_type_ids=modal_token_type_ids,
position_ids=position_ids,
modal_position_ids=modal_position_ids,
inputs_embeds=inputs_embeds,
return_dict=return_dict,
)
pooled_output = outputs[1]
pooled_output = self.dropout(pooled_output)
logits = self.classifier(pooled_output)
loss = None
if labels is not None:
if self.num_labels == 1:
# We are doing regression
loss_fct = MSELoss()
loss = loss_fct(logits.view(-1), labels.view(-1))
else:
loss_fct = CrossEntropyLoss()
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
if not return_dict:
output = (logits,) + outputs[2:]
return ((loss,) + output) if loss is not None else output
return SequenceClassifierOutput(
loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
__all__ = ["MMBTForClassification", "MMBTModel", "ModalEmbeddings"]

View File

@ -0,0 +1,27 @@
# Copyright 2022 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import TYPE_CHECKING
from ....utils import _LazyModule
from ....utils.import_utils import define_import_structure
if TYPE_CHECKING:
from .configuration_nat import *
from .modeling_nat import *
else:
import sys
_file = globals()["__file__"]
sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)

View File

@ -0,0 +1,148 @@
# coding=utf-8
# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Neighborhood Attention Transformer model configuration"""
from ....configuration_utils import PreTrainedConfig
from ....utils import logging
from ....utils.backbone_utils import BackboneConfigMixin, get_aligned_output_features_output_indices
logger = logging.get_logger(__name__)
class NatConfig(BackboneConfigMixin, PreTrainedConfig):
r"""
This is the configuration class to store the configuration of a [`NatModel`]. It is used to instantiate a Nat model
according to the specified arguments, defining the model architecture. Instantiating a configuration with the
defaults will yield a similar configuration to that of the Nat
[shi-labs/nat-mini-in1k-224](https://huggingface.co/shi-labs/nat-mini-in1k-224) architecture.
Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PreTrainedConfig`] for more information.
Args:
patch_size (`int`, *optional*, defaults to 4):
The size (resolution) of each patch. NOTE: Only patch size of 4 is supported at the moment.
num_channels (`int`, *optional*, defaults to 3):
The number of input channels.
embed_dim (`int`, *optional*, defaults to 64):
Dimensionality of patch embedding.
depths (`list[int]`, *optional*, defaults to `[3, 4, 6, 5]`):
Number of layers in each level of the encoder.
num_heads (`list[int]`, *optional*, defaults to `[2, 4, 8, 16]`):
Number of attention heads in each layer of the Transformer encoder.
kernel_size (`int`, *optional*, defaults to 7):
Neighborhood Attention kernel size.
mlp_ratio (`float`, *optional*, defaults to 3.0):
Ratio of MLP hidden dimensionality to embedding dimensionality.
qkv_bias (`bool`, *optional*, defaults to `True`):
Whether or not a learnable bias should be added to the queries, keys and values.
hidden_dropout_prob (`float`, *optional*, defaults to 0.0):
The dropout probability for all fully connected layers in the embeddings and encoder.
attention_probs_dropout_prob (`float`, *optional*, defaults to 0.0):
The dropout ratio for the attention probabilities.
drop_path_rate (`float`, *optional*, defaults to 0.1):
Stochastic depth rate.
hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
The non-linear activation function (function or string) in the encoder. If string, `"gelu"`, `"relu"`,
`"selu"` and `"gelu_new"` are supported.
initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
layer_norm_eps (`float`, *optional*, defaults to 1e-05):
The epsilon used by the layer normalization layers.
layer_scale_init_value (`float`, *optional*, defaults to 0.0):
The initial value for the layer scale. Disabled if <=0.
out_features (`list[str]`, *optional*):
If used as backbone, list of features to output. Can be any of `"stem"`, `"stage1"`, `"stage2"`, etc.
(depending on how many stages the model has). If unset and `out_indices` is set, will default to the
corresponding stages. If unset and `out_indices` is unset, will default to the last stage. Must be in the
same order as defined in the `stage_names` attribute.
out_indices (`list[int]`, *optional*):
If used as backbone, list of indices of features to output. Can be any of 0, 1, 2, etc. (depending on how
many stages the model has). If unset and `out_features` is set, will default to the corresponding stages.
If unset and `out_features` is unset, will default to the last stage. Must be in the
same order as defined in the `stage_names` attribute.
Example:
```python
>>> from transformers import NatConfig, NatModel
>>> # Initializing a Nat shi-labs/nat-mini-in1k-224 style configuration
>>> configuration = NatConfig()
>>> # Initializing a model (with random weights) from the shi-labs/nat-mini-in1k-224 style configuration
>>> model = NatModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```"""
model_type = "nat"
attribute_map = {
"num_attention_heads": "num_heads",
"num_hidden_layers": "num_layers",
}
def __init__(
self,
patch_size=4,
num_channels=3,
embed_dim=64,
depths=[3, 4, 6, 5],
num_heads=[2, 4, 8, 16],
kernel_size=7,
mlp_ratio=3.0,
qkv_bias=True,
hidden_dropout_prob=0.0,
attention_probs_dropout_prob=0.0,
drop_path_rate=0.1,
hidden_act="gelu",
initializer_range=0.02,
layer_norm_eps=1e-5,
layer_scale_init_value=0.0,
out_features=None,
out_indices=None,
**kwargs,
):
super().__init__(**kwargs)
self.patch_size = patch_size
self.num_channels = num_channels
self.embed_dim = embed_dim
self.depths = depths
self.num_layers = len(depths)
self.num_heads = num_heads
self.kernel_size = kernel_size
self.mlp_ratio = mlp_ratio
self.qkv_bias = qkv_bias
self.hidden_dropout_prob = hidden_dropout_prob
self.attention_probs_dropout_prob = attention_probs_dropout_prob
self.drop_path_rate = drop_path_rate
self.hidden_act = hidden_act
self.layer_norm_eps = layer_norm_eps
self.initializer_range = initializer_range
# we set the hidden_size attribute in order to make Nat work with VisionEncoderDecoderModel
# this indicates the channel dimension after the last stage of the model
self.hidden_size = int(embed_dim * 2 ** (len(depths) - 1))
self.layer_scale_init_value = layer_scale_init_value
self.stage_names = ["stem"] + [f"stage{idx}" for idx in range(1, len(depths) + 1)]
self._out_features, self._out_indices = get_aligned_output_features_output_indices(
out_features=out_features, out_indices=out_indices, stage_names=self.stage_names
)
__all__ = ["NatConfig"]

Some files were not shown because too many files have changed in this diff Show More