Mirror of https://github.com/huggingface/transformers.git (synced 2025-11-20 02:04:43 +08:00)

Compare commits: 28 commits, reverse-co...feat/use_c
| SHA1 |
|---|
| f829d8723f |
| 239f094d45 |
| bffa27b693 |
| 7657b805df |
| 6ccacf3ab9 |
| 5804c1fa47 |
| 15b73034d7 |
| f0d026f033 |
| 383018e7b9 |
| 95d75eb7b1 |
| 6db2dd319c |
| e2fb8d6062 |
| a5c903f877 |
| 67302b043e |
| 9f31104786 |
| d372b82754 |
| b2feaa215f |
| 1acbd0b327 |
| c40b370bd0 |
| b1bdf9cb39 |
| cd416f3c5c |
| 1742d1198d |
| 16924cd33a |
| 266d3b0568 |
| 8d6c4583bf |
| 2cc9152da0 |
| 8637f6e7ae |
| 0e74a71c03 |
@@ -420,8 +420,6 @@
  title: BLOOM
- local: model_doc/blt
  title: BLT
- local: model_doc/bort
  title: BORT
- local: model_doc/byt5
  title: ByT5
- local: model_doc/camembert
@@ -476,8 +474,6 @@
  title: Ernie4_5
- local: model_doc/ernie4_5_moe
  title: Ernie4_5_MoE
- local: model_doc/ernie_m
  title: ErnieM
- local: model_doc/esm
  title: ESM
- local: model_doc/exaone4
@@ -532,8 +528,6 @@
  title: GPTBigCode
- local: model_doc/gpt_oss
  title: GptOss
- local: model_doc/gptsan-japanese
  title: GPTSAN Japanese
- local: model_doc/gpt-sw3
  title: GPTSw3
- local: model_doc/granite
@@ -558,8 +552,6 @@
  title: Jamba
- local: model_doc/jetmoe
  title: JetMoe
- local: model_doc/jukebox
  title: Jukebox
- local: model_doc/led
  title: LED
- local: model_doc/lfm2
@@ -594,8 +586,6 @@
  title: MarkupLM
- local: model_doc/mbart
  title: MBart and MBart-50
- local: model_doc/mega
  title: MEGA
- local: model_doc/megatron-bert
  title: MegatronBERT
- local: model_doc/megatron_gpt2
@@ -630,8 +620,6 @@
  title: myt5
- local: model_doc/nemotron
  title: Nemotron
- local: model_doc/nezha
  title: NEZHA
- local: model_doc/nllb
  title: NLLB
- local: model_doc/nllb-moe
@@ -646,8 +634,6 @@
  title: Olmo3
- local: model_doc/olmoe
  title: OLMoE
- local: model_doc/open-llama
  title: Open-Llama
- local: model_doc/opt
  title: OPT
- local: model_doc/pegasus
@@ -668,8 +654,6 @@
  title: PLBart
- local: model_doc/prophetnet
  title: ProphetNet
- local: model_doc/qdqbert
  title: QDQBert
- local: model_doc/qwen2
  title: Qwen2
- local: model_doc/qwen2_moe
@@ -682,16 +666,12 @@
  title: Qwen3Next
- local: model_doc/rag
  title: RAG
- local: model_doc/realm
  title: REALM
- local: model_doc/recurrent_gemma
  title: RecurrentGemma
- local: model_doc/reformer
  title: Reformer
- local: model_doc/rembert
  title: RemBERT
- local: model_doc/retribert
  title: RetriBERT
- local: model_doc/roberta
  title: RoBERTa
- local: model_doc/roberta-prelayernorm
@@ -720,10 +700,6 @@
  title: T5Gemma
- local: model_doc/t5v1.1
  title: T5v1.1
- local: model_doc/tapex
  title: TAPEX
- local: model_doc/transfo-xl
  title: Transformer XL
- local: model_doc/ul2
  title: UL2
- local: model_doc/umt5
@@ -736,8 +712,6 @@
  title: XGLM
- local: model_doc/xlm
  title: XLM
- local: model_doc/xlm-prophetnet
  title: XLM-ProphetNet
- local: model_doc/xlm-roberta
  title: XLM-RoBERTa
- local: model_doc/xlm-roberta-xl
@@ -784,8 +758,6 @@
  title: Depth Anything V2
- local: model_doc/depth_pro
  title: DepthPro
- local: model_doc/deta
  title: DETA
- local: model_doc/detr
  title: DETR
- local: model_doc/dinat
@@ -800,8 +772,6 @@
  title: DiT
- local: model_doc/dpt
  title: DPT
- local: model_doc/efficientformer
  title: EfficientFormer
- local: model_doc/efficientloftr
  title: EfficientLoFTR
- local: model_doc/efficientnet
@@ -838,8 +808,6 @@
  title: MobileViT
- local: model_doc/mobilevitv2
  title: MobileViTV2
- local: model_doc/nat
  title: NAT
- local: model_doc/poolformer
  title: PoolFormer
- local: model_doc/prompt_depth_anything
@@ -886,12 +854,8 @@
  title: Timm Wrapper
- local: model_doc/upernet
  title: UperNet
- local: model_doc/van
  title: VAN
- local: model_doc/vit
  title: Vision Transformer (ViT)
- local: model_doc/vit_hybrid
  title: ViT Hybrid
- local: model_doc/vitdet
  title: ViTDet
- local: model_doc/vit_mae
@@ -930,8 +894,6 @@
  title: Hubert
- local: model_doc/kyutai_speech_to_text
  title: Kyutai Speech-To-Text
- local: model_doc/mctct
  title: MCTCT
- local: model_doc/mimi
  title: Mimi
- local: model_doc/mms
@@ -958,8 +920,6 @@
  title: SEW-D
- local: model_doc/speech_to_text
  title: Speech2Text
- local: model_doc/speech_to_text_2
  title: Speech2Text2
- local: model_doc/speecht5
  title: SpeechT5
- local: model_doc/unispeech
@@ -1188,8 +1148,6 @@
  title: TAPAS
- local: model_doc/trocr
  title: TrOCR
- local: model_doc/tvlt
  title: TVLT
- local: model_doc/tvp
  title: TVP
- local: model_doc/udop
@@ -1216,8 +1174,6 @@
- sections:
- local: model_doc/decision_transformer
  title: Decision Transformer
- local: model_doc/trajectory_transformer
  title: Trajectory Transformer
  title: Reinforcement learning models
- sections:
- local: model_doc/autoformer
@@ -1233,10 +1189,6 @@
- local: model_doc/timesfm
  title: TimesFM
  title: Time series models
- sections:
- local: model_doc/graphormer
  title: Graphormer
  title: Graph models
  title: Models
- sections:
- local: internal/modeling_utils
@@ -1,60 +0,0 @@
<!--Copyright 2020 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.

⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.

-->
*This model was released on 2020-10-20 and added to Hugging Face Transformers on 2023-06-20.*

# BORT

<div class="flex flex-wrap space-x-1">
<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
</div>

<Tip warning={true}>

This model is in maintenance mode only; we do not accept any new PRs changing its code.

If you run into any issues running this model, please reinstall the last version that supported this model: v4.30.0.
You can do so by running the following command: `pip install -U transformers==4.30.0`.

</Tip>

## Overview

The BORT model was proposed in [Optimal Subarchitecture Extraction for BERT](https://huggingface.co/papers/2010.10499) by
Adrian de Wynter and Daniel J. Perry. It is an optimal subset of architectural parameters for BERT, which the
authors refer to as "Bort".

The abstract from the paper is the following:

*We extract an optimal subset of architectural parameters for the BERT architecture from Devlin et al. (2018) by
applying recent breakthroughs in algorithms for neural architecture search. This optimal subset, which we refer to as
"Bort", is demonstrably smaller, having an effective (that is, not counting the embedding layer) size of 5.5% the
original BERT-large architecture, and 16% of the net size. Bort is also able to be pretrained in 288 GPU hours, which
is 1.2% of the time required to pretrain the highest-performing BERT parametric architectural variant, RoBERTa-large
(Liu et al., 2019), and about 33% of that of the world-record, in GPU hours, required to train BERT-large on the same
hardware. It is also 7.9x faster on a CPU, as well as being better performing than other compressed variants of the
architecture, and some of the non-compressed variants: it obtains performance improvements of between 0.3% and 31%,
absolute, with respect to BERT-large, on multiple public natural language understanding (NLU) benchmarks.*

This model was contributed by [stefan-it](https://huggingface.co/stefan-it). The original code can be found [here](https://github.com/alexa/bort/).

## Usage tips

- BORT's model architecture is based on BERT; refer to [BERT's documentation page](bert) for the
  model's API reference as well as usage examples.
- BORT uses the RoBERTa tokenizer instead of the BERT tokenizer; refer to [RoBERTa's documentation page](roberta) for the tokenizer's API reference as well as usage examples. The combination is sketched below.
- BORT requires a specific fine-tuning algorithm, called [Agora](https://adewynter.github.io/notes/bort_algorithms_and_applications.html#fine-tuning-with-algebraic-topology),
  that is sadly not open-sourced yet. It would be very useful for the community if someone implemented the
  algorithm to make BORT fine-tuning work.
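The first two tips can be combined into a minimal loading sketch. This is not an official example; it assumes the `amazon/bort` checkpoint on the Hub and `transformers==4.30.0` (the last version that supported BORT):

```python
# Hedged sketch: BORT weights use a BERT-style architecture together with the RoBERTa tokenizer.
import torch
from transformers import BertModel, RobertaTokenizer

tokenizer = RobertaTokenizer.from_pretrained("amazon/bort")
model = BertModel.from_pretrained("amazon/bort")

inputs = tokenizer("Bort is a compressed variant of BERT-large.", return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)

print(outputs.last_hidden_state.shape)  # (batch_size, sequence_length, hidden_size)
```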
@@ -1,78 +0,0 @@
<!--Copyright 2022 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.

⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.

-->
*This model was released on 2022-12-12 and added to Hugging Face Transformers on 2023-06-20.*

# DETA

<div class="flex flex-wrap space-x-1">
<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
</div>

<Tip warning={true}>

This model is in maintenance mode only; we don't accept any new PRs changing its code.
If you run into any issues running this model, please reinstall the last version that supported this model: v4.40.2.
You can do so by running the following command: `pip install -U transformers==4.40.2`.

</Tip>

## Overview

The DETA model was proposed in [NMS Strikes Back](https://huggingface.co/papers/2212.06137) by Jeffrey Ouyang-Zhang, Jang Hyun Cho, Xingyi Zhou, Philipp Krähenbühl.
DETA (short for Detection Transformers with Assignment) improves [Deformable DETR](deformable_detr) by replacing the one-to-one bipartite Hungarian matching loss
with one-to-many label assignments used in traditional detectors with non-maximum suppression (NMS). This leads to significant gains of up to 2.5 mAP.

The abstract from the paper is the following:

*Detection Transformer (DETR) directly transforms queries to unique objects by using one-to-one bipartite matching during training and enables end-to-end object detection. Recently, these models have surpassed traditional detectors on COCO with undeniable elegance. However, they differ from traditional detectors in multiple designs, including model architecture and training schedules, and thus the effectiveness of one-to-one matching is not fully understood. In this work, we conduct a strict comparison between the one-to-one Hungarian matching in DETRs and the one-to-many label assignments in traditional detectors with non-maximum supervision (NMS). Surprisingly, we observe one-to-many assignments with NMS consistently outperform standard one-to-one matching under the same setting, with a significant gain of up to 2.5 mAP. Our detector that trains Deformable-DETR with traditional IoU-based label assignment achieved 50.2 COCO mAP within 12 epochs (1x schedule) with ResNet50 backbone, outperforming all existing traditional or transformer-based detectors in this setting. On multiple datasets, schedules, and architectures, we consistently show bipartite matching is unnecessary for performant detection transformers. Furthermore, we attribute the success of detection transformers to their expressive transformer architecture.*

<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/model_doc/deta_architecture.jpg"
alt="drawing" width="600"/>

<small> DETA overview. Taken from the <a href="https://huggingface.co/papers/2212.06137">original paper</a>. </small>

This model was contributed by [nielsr](https://huggingface.co/nielsr).
The original code can be found [here](https://github.com/jozhang97/DETA).

## Resources

A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with DETA.

- Demo notebooks for DETA can be found [here](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/DETA).
- Scripts for finetuning [`DetaForObjectDetection`] with [`Trainer`] or [Accelerate](https://huggingface.co/docs/accelerate/index) can be found [here](https://github.com/huggingface/transformers/tree/main/examples/pytorch/object-detection).
- See also: [Object detection task guide](../tasks/object_detection).

If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! The resource should ideally demonstrate something new instead of duplicating an existing resource.
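A minimal inference sketch is shown below. It assumes `transformers==4.40.2` and the `jozhang97/deta-swin-large` checkpoint:

```python
# Hedged sketch: detect objects in an image and post-process the raw outputs.
import torch
import requests
from PIL import Image
from transformers import AutoImageProcessor, DetaForObjectDetection

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)

processor = AutoImageProcessor.from_pretrained("jozhang97/deta-swin-large")
model = DetaForObjectDetection.from_pretrained("jozhang97/deta-swin-large")

inputs = processor(images=image, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)

# Convert raw outputs into (score, label, box) triples in absolute pixel coordinates.
target_sizes = torch.tensor([image.size[::-1]])
results = processor.post_process_object_detection(outputs, threshold=0.5, target_sizes=target_sizes)[0]
for score, label, box in zip(results["scores"], results["labels"], results["boxes"]):
    print(model.config.id2label[label.item()], round(score.item(), 3), box.tolist())
```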
## DetaConfig

[[autodoc]] DetaConfig

## DetaImageProcessor

[[autodoc]] DetaImageProcessor
    - preprocess
    - post_process_object_detection

## DetaModel

[[autodoc]] DetaModel
    - forward

## DetaForObjectDetection

[[autodoc]] DetaForObjectDetection
    - forward
@@ -1,85 +0,0 @@
<!--Copyright 2022 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.

⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.

-->
*This model was released on 2022-06-02 and added to Hugging Face Transformers on 2023-06-20.*

# EfficientFormer

<div class="flex flex-wrap space-x-1">
<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
</div>

<Tip warning={true}>

This model is in maintenance mode only; we don't accept any new PRs changing its code.
If you run into any issues running this model, please reinstall the last version that supported this model: v4.40.2.
You can do so by running the following command: `pip install -U transformers==4.40.2`.

</Tip>

## Overview

The EfficientFormer model was proposed in [EfficientFormer: Vision Transformers at MobileNet Speed](https://huggingface.co/papers/2206.01191)
by Yanyu Li, Geng Yuan, Yang Wen, Eric Hu, Georgios Evangelidis, Sergey Tulyakov, Yanzhi Wang, Jian Ren. EfficientFormer proposes a
dimension-consistent pure transformer that can be run on mobile devices for dense prediction tasks like image classification, object
detection and semantic segmentation.

The abstract from the paper is the following:

*Vision Transformers (ViT) have shown rapid progress in computer vision tasks, achieving promising results on various benchmarks.
However, due to the massive number of parameters and model design, e.g., attention mechanism, ViT-based models are generally
times slower than lightweight convolutional networks. Therefore, the deployment of ViT for real-time applications is particularly
challenging, especially on resource-constrained hardware such as mobile devices. Recent efforts try to reduce the computation
complexity of ViT through network architecture search or hybrid design with MobileNet block, yet the inference speed is still
unsatisfactory. This leads to an important question: can transformers run as fast as MobileNet while obtaining high performance?
To answer this, we first revisit the network architecture and operators used in ViT-based models and identify inefficient designs.
Then we introduce a dimension-consistent pure transformer (without MobileNet blocks) as a design paradigm.
Finally, we perform latency-driven slimming to get a series of final models dubbed EfficientFormer.
Extensive experiments show the superiority of EfficientFormer in performance and speed on mobile devices.
Our fastest model, EfficientFormer-L1, achieves 79.2% top-1 accuracy on ImageNet-1K with only 1.6 ms inference latency on
iPhone 12 (compiled with CoreML), which runs as fast as MobileNetV2×1.4 (1.6 ms, 74.7% top-1), and our largest model,
EfficientFormer-L7, obtains 83.3% accuracy with only 7.0 ms latency. Our work proves that properly designed transformers can
reach extremely low latency on mobile devices while maintaining high performance.*

This model was contributed by [novice03](https://huggingface.co/novice03) and [Bearnardd](https://huggingface.co/Bearnardd).
The original code can be found [here](https://github.com/snap-research/EfficientFormer).

## Documentation resources

- [Image classification task guide](../tasks/image_classification)
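A minimal image-classification sketch is shown below. It assumes `transformers==4.40.2` and the `snap-research/efficientformer-l1-300` checkpoint:

```python
# Hedged sketch: classify an image with the EfficientFormer-L1 ImageNet checkpoint.
import torch
import requests
from PIL import Image
from transformers import EfficientFormerImageProcessor, EfficientFormerForImageClassification

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)

processor = EfficientFormerImageProcessor.from_pretrained("snap-research/efficientformer-l1-300")
model = EfficientFormerForImageClassification.from_pretrained("snap-research/efficientformer-l1-300")

inputs = processor(images=image, return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits

print(model.config.id2label[logits.argmax(-1).item()])
```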
## EfficientFormerConfig

[[autodoc]] EfficientFormerConfig

## EfficientFormerImageProcessor

[[autodoc]] EfficientFormerImageProcessor
    - preprocess

## EfficientFormerModel

[[autodoc]] EfficientFormerModel
    - forward

## EfficientFormerForImageClassification

[[autodoc]] EfficientFormerForImageClassification
    - forward

## EfficientFormerForImageClassificationWithTeacher

[[autodoc]] EfficientFormerForImageClassificationWithTeacher
    - forward
@@ -1,97 +0,0 @@
<!--Copyright 2023 The HuggingFace and Baidu Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.

⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.

-->
*This model was released on 2020-12-31 and added to Hugging Face Transformers on 2023-06-20.*

# ErnieM

<div class="flex flex-wrap space-x-1">
<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
</div>

<Tip warning={true}>

This model is in maintenance mode only; we don't accept any new PRs changing its code.
If you run into any issues running this model, please reinstall the last version that supported this model: v4.40.2.
You can do so by running the following command: `pip install -U transformers==4.40.2`.

</Tip>

## Overview

The ErnieM model was proposed in [ERNIE-M: Enhanced Multilingual Representation by Aligning
Cross-lingual Semantics with Monolingual Corpora](https://huggingface.co/papers/2012.15674) by Xuan Ouyang, Shuohuan Wang, Chao Pang, Yu Sun,
Hao Tian, Hua Wu, Haifeng Wang.

The abstract from the paper is the following:

*Recent studies have demonstrated that pre-trained cross-lingual models achieve impressive performance in downstream cross-lingual tasks. This improvement benefits from learning a large amount of monolingual and parallel corpora. Although it is generally acknowledged that parallel corpora are critical for improving the model performance, existing methods are often constrained by the size of parallel corpora, especially for lowresource languages. In this paper, we propose ERNIE-M, a new training method that encourages the model to align the representation of multiple languages with monolingual corpora, to overcome the constraint that the parallel corpus size places on the model performance. Our key insight is to integrate back-translation into the pre-training process. We generate pseudo-parallel sentence pairs on a monolingual corpus to enable the learning of semantic alignments between different languages, thereby enhancing the semantic modeling of cross-lingual models. Experimental results show that ERNIE-M outperforms existing cross-lingual models and delivers new state-of-the-art results in various cross-lingual downstream tasks.*

This model was contributed by [Susnato Dhar](https://huggingface.co/susnato). The original code can be found [here](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/paddlenlp/transformers/ernie_m).

## Usage tips

- Ernie-M is a BERT-like model, i.e. a stacked Transformer encoder.
- Instead of using MaskedLM for pretraining (like BERT), the authors used two novel techniques: `Cross-attention Masked Language Modeling` and `Back-translation Masked Language Modeling`. For now, these two LMHead objectives are not implemented here.
- It is a multilingual language model.
- Next Sentence Prediction was not used in the pretraining process.

## Resources

- [Text classification task guide](../tasks/sequence_classification)
- [Token classification task guide](../tasks/token_classification)
- [Question answering task guide](../tasks/question_answering)
- [Multiple choice task guide](../tasks/multiple_choice)
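A minimal feature-extraction sketch is shown below. It assumes `transformers==4.40.2` (plus `sentencepiece`) and the `susnato/ernie-m-base_pytorch` checkpoint:

```python
# Hedged sketch: encode a sentence with the multilingual ERNIE-M encoder.
import torch
from transformers import ErnieMTokenizer, ErnieMModel

tokenizer = ErnieMTokenizer.from_pretrained("susnato/ernie-m-base_pytorch")
model = ErnieMModel.from_pretrained("susnato/ernie-m-base_pytorch")

# The same checkpoint handles many languages, so no language code is needed.
inputs = tokenizer("ERNIE-M is a multilingual encoder.", return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)

print(outputs.last_hidden_state.shape)  # (batch_size, sequence_length, hidden_size)
```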
## ErnieMConfig

[[autodoc]] ErnieMConfig

## ErnieMTokenizer

[[autodoc]] ErnieMTokenizer
    - build_inputs_with_special_tokens
    - get_special_tokens_mask
    - create_token_type_ids_from_sequences
    - save_vocabulary

## ErnieMModel

[[autodoc]] ErnieMModel
    - forward

## ErnieMForSequenceClassification

[[autodoc]] ErnieMForSequenceClassification
    - forward

## ErnieMForMultipleChoice

[[autodoc]] ErnieMForMultipleChoice
    - forward

## ErnieMForTokenClassification

[[autodoc]] ErnieMForTokenClassification
    - forward

## ErnieMForQuestionAnswering

[[autodoc]] ErnieMForQuestionAnswering
    - forward

## ErnieMForInformationExtraction

[[autodoc]] ErnieMForInformationExtraction
    - forward
@@ -1,145 +0,0 @@
<!--Copyright 2023 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.

⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.

-->
*This model was released on 2023-02-07 and added to Hugging Face Transformers on 2023-06-20.*

# GPTSAN-japanese

<div class="flex flex-wrap space-x-1">
<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
</div>

<Tip warning={true}>

This model is in maintenance mode only; we don't accept any new PRs changing its code.
If you run into any issues running this model, please reinstall the last version that supported this model: v4.40.2.
You can do so by running the following command: `pip install -U transformers==4.40.2`.

</Tip>

## Overview

The [GPTSAN-japanese](https://huggingface.co/Tanrei/GPTSAN-japanese) model was released in the repository by Toshiyuki Sakamoto (tanreinama).

GPTSAN is a Japanese language model using Switch Transformer. It has the same structure as the model introduced as Prefix LM
in the T5 paper, and supports both text generation and masked language modeling tasks. These basic tasks can similarly be
fine-tuned for translation or summarization.

### Usage example

The `generate()` method can be used to generate text with the GPTSAN-japanese model.

```python
>>> from transformers import AutoModel, AutoTokenizer
>>> from accelerate import Accelerator
>>> import torch

>>> device = Accelerator().device
>>> tokenizer = AutoTokenizer.from_pretrained("Tanrei/GPTSAN-japanese")
>>> model = AutoModel.from_pretrained("Tanrei/GPTSAN-japanese").to(device)
>>> x_tok = tokenizer("は、", prefix_text="織田信長", return_tensors="pt")
>>> torch.manual_seed(0)
>>> gen_tok = model.generate(x_tok.input_ids.to(model.device), token_type_ids=x_tok.token_type_ids.to(model.device), max_new_tokens=20)
>>> tokenizer.decode(gen_tok[0])
'織田信長は、2004年に『戦国BASARA』のために、豊臣秀吉'
```

## GPTSAN Features

GPTSAN has some unique features. It has a Prefix-LM model structure: it works as a shifted masked language model for prefix input tokens, while un-prefixed inputs behave like normal generative models.
The Spout vector is a GPTSAN-specific input. Spout is pre-trained with random inputs, but you can specify a class of text or an arbitrary vector during fine-tuning. This allows you to indicate the tendency of the generated text.
GPTSAN has a sparse feed-forward layer based on Switch Transformer. You can also add other layers and train them partially. See the original GPTSAN repository for details.

### Prefix-LM Model

GPTSAN has the structure of the model named Prefix-LM in the `T5` paper (the original GPTSAN repository calls it `hybrid`).
In GPTSAN, the `Prefix` part of Prefix-LM, that is, the input positions that can be attended to from both directions, can be specified with any length.
Arbitrary lengths can also be specified differently for each batch.
This length applies to the text entered in `prefix_text` for the tokenizer.
The tokenizer returns the mask of the `Prefix` part of Prefix-LM as `token_type_ids`.
The model treats the part where `token_type_ids` is 1 as a `Prefix` part, that is, the input can attend to tokens both before and after it.

## Usage tips

Specifying the Prefix part is done with a mask passed to self-attention.
When `token_type_ids=None` or all zero, it is equivalent to a regular causal mask.

For example:

`>>> x_token = tokenizer("アイウエ")`

```text
input_ids: | SOT | SEG | ア | イ | ウ | エ |
token_type_ids: | 1 | 0 | 0 | 0 | 0 | 0 |
prefix_lm_mask:
SOT | 1 0 0 0 0 0 |
SEG | 1 1 0 0 0 0 |
ア | 1 1 1 0 0 0 |
イ | 1 1 1 1 0 0 |
ウ | 1 1 1 1 1 0 |
エ | 1 1 1 1 1 1 |
```

`>>> x_token = tokenizer("", prefix_text="アイウエ")`

```text
input_ids: | SOT | ア | イ | ウ | エ | SEG |
token_type_ids: | 1 | 1 | 1 | 1 | 1 | 0 |
prefix_lm_mask:
SOT | 1 1 1 1 1 0 |
ア | 1 1 1 1 1 0 |
イ | 1 1 1 1 1 0 |
ウ | 1 1 1 1 1 0 |
エ | 1 1 1 1 1 0 |
SEG | 1 1 1 1 1 1 |
```

`>>> x_token = tokenizer("ウエ", prefix_text="アイ")`

```text
input_ids: | SOT | ア | イ | SEG | ウ | エ |
token_type_ids: | 1 | 1 | 1 | 0 | 0 | 0 |
prefix_lm_mask:
SOT | 1 1 1 0 0 0 |
ア | 1 1 1 0 0 0 |
イ | 1 1 1 0 0 0 |
SEG | 1 1 1 1 0 0 |
ウ | 1 1 1 1 1 0 |
エ | 1 1 1 1 1 1 |
```

### Spout Vector

A Spout Vector is a special vector for controlling text generation.
This vector is treated as the first embedding in self-attention to bring extraneous attention to the generated tokens.
In the pre-trained model published as `Tanrei/GPTSAN-japanese`, the Spout Vector is a 128-dimensional vector that passes through 8 fully connected layers in the model and is projected into the space acting as external attention.
The Spout Vector projected by the fully connected layers is split and passed to all self-attention layers.

## GPTSanJapaneseConfig

[[autodoc]] GPTSanJapaneseConfig

## GPTSanJapaneseTokenizer

[[autodoc]] GPTSanJapaneseTokenizer

## GPTSanJapaneseModel

[[autodoc]] GPTSanJapaneseModel

## GPTSanJapaneseForConditionalGeneration

[[autodoc]] GPTSanJapaneseForConditionalGeneration
    - forward
@@ -1,60 +0,0 @@
<!--Copyright 2022 The HuggingFace Team and Microsoft. All rights reserved.

Licensed under the MIT License; you may not use this file except in compliance with
the License.

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.

⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.

-->
*This model was released on 2021-06-09 and added to Hugging Face Transformers on 2023-06-20.*

# Graphormer

<div class="flex flex-wrap space-x-1">
<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
</div>

<Tip warning={true}>

This model is in maintenance mode only; we don't accept any new PRs changing its code.
If you run into any issues running this model, please reinstall the last version that supported this model: v4.40.2.
You can do so by running the following command: `pip install -U transformers==4.40.2`.

</Tip>

## Overview

The Graphormer model was proposed in [Do Transformers Really Perform Bad for Graph Representation?](https://huggingface.co/papers/2106.05234) by
Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen and Tie-Yan Liu. It is a Graph Transformer model, modified to allow computations on graphs instead of text sequences by generating embeddings and features of interest during preprocessing and collation, then using a modified attention.

The abstract from the paper is the following:

*The Transformer architecture has become a dominant choice in many domains, such as natural language processing and computer vision. Yet, it has not achieved competitive performance on popular leaderboards of graph-level prediction compared to mainstream GNN variants. Therefore, it remains a mystery how Transformers could perform well for graph representation learning. In this paper, we solve this mystery by presenting Graphormer, which is built upon the standard Transformer architecture, and could attain excellent results on a broad range of graph representation learning tasks, especially on the recent OGB Large-Scale Challenge. Our key insight to utilizing Transformer in the graph is the necessity of effectively encoding the structural information of a graph into the model. To this end, we propose several simple yet effective structural encoding methods to help Graphormer better model graph-structured data. Besides, we mathematically characterize the expressive power of Graphormer and exhibit that with our ways of encoding the structural information of graphs, many popular GNN variants could be covered as the special cases of Graphormer.*

This model was contributed by [clefourrier](https://huggingface.co/clefourrier). The original code can be found [here](https://github.com/microsoft/Graphormer).

## Usage tips

This model will not work well on large graphs (more than 100 nodes/edges), as memory usage will explode.
You can reduce the batch size, increase your RAM, or decrease the `UNREACHABLE_NODE_DISTANCE` parameter in `algos_graphormer.pyx`, but it will be hard to go above 700 nodes/edges.

This model does not use a tokenizer, but instead a special collator during training.
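A minimal sketch for adapting the pretrained model to a new graph-classification task is shown below. It assumes `transformers==4.40.2` and the `clefourrier/graphormer-base-pcqm4mv2` checkpoint:

```python
# Hedged sketch: load Graphormer with a freshly sized classification head.
from transformers import GraphormerForGraphClassification

model = GraphormerForGraphClassification.from_pretrained(
    "clefourrier/graphormer-base-pcqm4mv2",
    num_classes=2,                 # number of target classes in your dataset
    ignore_mismatched_sizes=True,  # the pretrained head was trained for a different task
)
# Training batches are built with the Graphormer data collator rather than a tokenizer,
# which precomputes the structural encodings during preprocessing and collation.
```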
## GraphormerConfig

[[autodoc]] GraphormerConfig

## GraphormerModel

[[autodoc]] GraphormerModel
    - forward

## GraphormerForGraphClassification

[[autodoc]] GraphormerForGraphClassification
    - forward
@@ -1,99 +0,0 @@
<!--Copyright 2022 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.

⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.

-->
*This model was released on 2020-04-30 and added to Hugging Face Transformers on 2023-06-20.*

# Jukebox

<div class="flex flex-wrap space-x-1">
<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
</div>

<Tip warning={true}>

This model is in maintenance mode only; we don't accept any new PRs changing its code.
If you run into any issues running this model, please reinstall the last version that supported this model: v4.40.2.
You can do so by running the following command: `pip install -U transformers==4.40.2`.

</Tip>

## Overview

The Jukebox model was proposed in [Jukebox: A generative model for music](https://huggingface.co/papers/2005.00341)
by Prafulla Dhariwal, Heewoo Jun, Christine Payne, Jong Wook Kim, Alec Radford,
Ilya Sutskever. It introduces a generative music model which can produce minute-long samples that can be conditioned on
an artist, genres and lyrics.

The abstract from the paper is the following:

*We introduce Jukebox, a model that generates music with singing in the raw audio domain. We tackle the long context of raw audio using a multiscale VQ-VAE to compress it to discrete codes, and modeling those using autoregressive Transformers. We show that the combined model at scale can generate high-fidelity and diverse songs with coherence up to multiple minutes. We can condition on artist and genre to steer the musical and vocal style, and on unaligned lyrics to make the singing more controllable. We are releasing thousands of non cherry-picked samples, along with model weights and code.*

As shown in the following figure, Jukebox is made of 3 `priors`, which are decoder-only models. They follow the architecture described in [Generating Long Sequences with Sparse Transformers](https://huggingface.co/papers/1904.10509), modified to support longer context length.
First, an autoencoder is used to encode the text lyrics. Next, the first prior (also called `top_prior`) attends to the last hidden states extracted from the lyrics encoder. The priors are linked to the previous priors respectively via an `AudioConditioner` module. The `AudioConditioner` upsamples the outputs of the previous prior to raw tokens at a certain audio frames-per-second resolution.
The metadata such as *artist, genre and timing* are passed to each prior, in the form of a start token and a positional embedding for the timing data. The hidden states are mapped to the closest codebook vector from the VQVAE in order to convert them to raw audio.



This model was contributed by [Arthur Zucker](https://huggingface.co/ArthurZ).
The original code can be found [here](https://github.com/openai/jukebox).

## Usage tips

- This model only supports inference. This is for a few reasons, mostly because it requires a crazy amount of memory to train. Feel free to open a PR and add what's missing to have a full integration with the Hugging Face Trainer!
- This model is very slow, and takes 8 hours to generate one minute of audio using the 5b top prior on a V100 GPU. To automatically handle the device on which the model should execute, use `accelerate`.
- Contrary to the paper, the order of the priors goes from `0` to `1` as it felt more intuitive: we sample starting from `0`.
- Primed sampling (conditioning the sampling on raw audio) requires more memory than ancestral sampling and should be used with `fp16` set to `True`.
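A minimal conditioned-sampling sketch is shown below. It assumes `transformers==4.40.2`, the `openai/jukebox-1b-lyrics` checkpoint, and sampling keyword arguments that should be checked against the [`JukeboxModel`] API reference below:

```python
# Hedged sketch of ancestral sampling; extremely slow and memory-hungry even for short clips.
from transformers import JukeboxModel, JukeboxTokenizer

model = JukeboxModel.from_pretrained("openai/jukebox-1b-lyrics", min_duration=0).eval()
tokenizer = JukeboxTokenizer.from_pretrained("openai/jukebox-1b-lyrics")

# Artist, genre and lyrics conditioning is tokenized into one label tensor per prior.
metas = dict(artist="Zac Brown Band", genres="Country", lyrics="I met a traveller from an antique land")
labels = tokenizer(**metas)["input_ids"]

# Ancestral sampling starts from prior 0 (the top prior), then upsamples with the lower priors.
music_tokens = model.ancestral_sample(labels, sample_length_in_seconds=1)
```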
## JukeboxConfig

[[autodoc]] JukeboxConfig

## JukeboxPriorConfig

[[autodoc]] JukeboxPriorConfig

## JukeboxVQVAEConfig

[[autodoc]] JukeboxVQVAEConfig

## JukeboxTokenizer

[[autodoc]] JukeboxTokenizer
    - save_vocabulary

## JukeboxModel

[[autodoc]] JukeboxModel
    - ancestral_sample
    - primed_sample
    - continue_sample
    - upsample
    - _sample

## JukeboxPrior

[[autodoc]] JukeboxPrior
    - sample
    - forward

## JukeboxVQVAE

[[autodoc]] JukeboxVQVAE
    - forward
    - encode
    - decode
@@ -1,84 +0,0 @@
<!--Copyright 2022 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.

⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.

-->
*This model was released on 2021-10-30 and added to Hugging Face Transformers on 2023-06-20.*

# M-CTC-T

<div class="flex flex-wrap space-x-1">
<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
</div>

<Tip warning={true}>

This model is in maintenance mode only, so we won't accept any new PRs changing its code.

If you run into any issues running this model, please reinstall the last version that supported this model: v4.30.0.
You can do so by running the following command: `pip install -U transformers==4.30.0`.

</Tip>

## Overview

The M-CTC-T model was proposed in [Pseudo-Labeling For Massively Multilingual Speech Recognition](https://huggingface.co/papers/2111.00161) by Loren Lugosch, Tatiana Likhomanenko, Gabriel Synnaeve, and Ronan Collobert. The model is a 1B-param transformer encoder, with a CTC head over 8065 character labels and a language identification head over 60 language ID labels. It is trained on Common Voice (version 6.1, December 2020 release) and VoxPopuli. After training on Common Voice and VoxPopuli, the model is trained on Common Voice only. The labels are unnormalized character-level transcripts (punctuation and capitalization are not removed). The model takes as input Mel filterbank features from a 16 kHz audio signal.

The abstract from the paper is the following:

*Semi-supervised learning through pseudo-labeling has become a staple of state-of-the-art monolingual
speech recognition systems. In this work, we extend pseudo-labeling to massively multilingual speech
recognition with 60 languages. We propose a simple pseudo-labeling recipe that works well even
with low-resource languages: train a supervised multilingual model, fine-tune it with semi-supervised
learning on a target language, generate pseudo-labels for that language, and train a final model using
pseudo-labels for all languages, either from scratch or by fine-tuning. Experiments on the labeled
Common Voice and unlabeled VoxPopuli datasets show that our recipe can yield a model with better
performance for many languages that also transfers well to LibriSpeech.*

This model was contributed by [cwkeam](https://huggingface.co/cwkeam). The original code can be found [here](https://github.com/flashlight/wav2letter/tree/main/recipes/mling_pl).

## Usage tips

The PyTorch version of this model is only available in torch 1.9 and higher.

## Resources

- [Automatic speech recognition task guide](../tasks/asr)
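A minimal CTC transcription sketch is shown below. It assumes `transformers==4.30.0`, the `speechbrain/m-ctc-t-large` checkpoint, and the 🤗 Datasets dummy ASR set as a stand-in for any 16 kHz mono waveform:

```python
# Hedged sketch: transcribe a 16 kHz waveform with greedy CTC decoding.
import torch
from datasets import load_dataset
from transformers import MCTCTForCTC, MCTCTProcessor

processor = MCTCTProcessor.from_pretrained("speechbrain/m-ctc-t-large")
model = MCTCTForCTC.from_pretrained("speechbrain/m-ctc-t-large")

ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
audio = ds[0]["audio"]["array"]

# The processor turns the raw waveform into Mel filterbank features.
inputs = processor(audio, sampling_rate=16000, return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits

predicted_ids = torch.argmax(logits, dim=-1)
print(processor.batch_decode(predicted_ids)[0])
```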
## MCTCTConfig

[[autodoc]] MCTCTConfig

## MCTCTFeatureExtractor

[[autodoc]] MCTCTFeatureExtractor
    - __call__

## MCTCTProcessor

[[autodoc]] MCTCTProcessor
    - __call__
    - from_pretrained
    - save_pretrained
    - batch_decode
    - decode

## MCTCTModel

[[autodoc]] MCTCTModel
    - forward

## MCTCTForCTC

[[autodoc]] MCTCTForCTC
    - forward
@@ -1,94 +0,0 @@
<!--Copyright 2023 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.

⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.

-->
*This model was released on 2022-09-21 and added to Hugging Face Transformers on 2023-06-20.*

# MEGA

<div class="flex flex-wrap space-x-1">
<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
</div>

<Tip warning={true}>

This model is in maintenance mode only; we don't accept any new PRs changing its code.
If you run into any issues running this model, please reinstall the last version that supported this model: v4.40.2.
You can do so by running the following command: `pip install -U transformers==4.40.2`.

</Tip>

## Overview

The MEGA model was proposed in [Mega: Moving Average Equipped Gated Attention](https://huggingface.co/papers/2209.10655) by Xuezhe Ma, Chunting Zhou, Xiang Kong, Junxian He, Liangke Gui, Graham Neubig, Jonathan May, and Luke Zettlemoyer.
MEGA proposes a new approach to self-attention with each encoder layer having a multi-headed exponential moving average in addition to a single head of standard dot-product attention, giving the attention mechanism
stronger positional biases. This allows MEGA to perform competitively with Transformers on standard benchmarks including LRA
while also having significantly fewer parameters. MEGA's compute efficiency allows it to scale to very long sequences, making it an
attractive option for long-document NLP tasks.

The abstract from the paper is the following:

*The design choices in the Transformer attention mechanism, including weak inductive bias and quadratic computational complexity, have limited its application for modeling long sequences. In this paper, we introduce Mega, a simple, theoretically grounded, single-head gated attention mechanism equipped with (exponential) moving average to incorporate inductive bias of position-aware local dependencies into the position-agnostic attention mechanism. We further propose a variant of Mega that offers linear time and space complexity yet yields only minimal quality loss, by efficiently splitting the whole sequence into multiple chunks with fixed length. Extensive experiments on a wide range of sequence modeling benchmarks, including the Long Range Arena, neural machine translation, auto-regressive language modeling, and image and speech classification, show that Mega achieves significant improvements over other sequence models, including variants of Transformers and recent state space models.*

This model was contributed by [mnaylor](https://huggingface.co/mnaylor).
The original code can be found [here](https://github.com/facebookresearch/mega).

## Usage tips

- MEGA can perform quite well with relatively few parameters. See Appendix D in the MEGA paper for examples of architectural specs which perform well in various settings. If using MEGA as a decoder, be sure to set `bidirectional=False` to avoid errors with the default bidirectional setting.
- Mega-chunk is a variant of MEGA that reduces time and space complexity from quadratic to linear. Enable chunking with `MegaConfig.use_chunking` and control the chunk size with `MegaConfig.chunk_size`; a configuration sketch is shown after the implementation notes below.

## Implementation Notes

- The original implementation of MEGA had an inconsistent expectation of attention masks for padding and causal self-attention between the softmax attention and Laplace/squared ReLU method. This implementation addresses that inconsistency.
- The original implementation did not include token type embeddings; this implementation adds support for these, with the option controlled by `MegaConfig.add_token_type_embeddings`.
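The configuration sketch below combines the tips above. It assumes `transformers==4.40.2`; the values are illustrative, not tuned:

```python
# Hedged sketch: build the chunked (linear-complexity) variant for decoder-style use.
from transformers import MegaConfig, MegaModel

config = MegaConfig(
    use_chunking=True,    # Mega-chunk: linear time/space complexity
    chunk_size=128,       # fixed-length chunks the sequence is split into
    bidirectional=False,  # required when using MEGA as a decoder
)
model = MegaModel(config)
print(model.config.chunk_size)
```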
## MegaConfig

[[autodoc]] MegaConfig

## MegaModel

[[autodoc]] MegaModel
    - forward

## MegaForCausalLM

[[autodoc]] MegaForCausalLM
    - forward

## MegaForMaskedLM

[[autodoc]] MegaForMaskedLM
    - forward

## MegaForSequenceClassification

[[autodoc]] MegaForSequenceClassification
    - forward

## MegaForMultipleChoice

[[autodoc]] MegaForMultipleChoice
    - forward

## MegaForTokenClassification

[[autodoc]] MegaForTokenClassification
    - forward

## MegaForQuestionAnswering

[[autodoc]] MegaForQuestionAnswering
    - forward
@@ -1,101 +0,0 @@
<!--Copyright 2022 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.

⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.

-->
*This model was released on 2022-04-14 and added to Hugging Face Transformers on 2023-06-20.*

# Neighborhood Attention Transformer

<div class="flex flex-wrap space-x-1">
<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
</div>

<Tip warning={true}>

This model is in maintenance mode only; we don't accept any new PRs changing its code.
If you run into any issues running this model, please reinstall the last version that supported this model: v4.40.2.
You can do so by running the following command: `pip install -U transformers==4.40.2`.

</Tip>

## Overview

NAT was proposed in [Neighborhood Attention Transformer](https://huggingface.co/papers/2204.07143)
by Ali Hassani, Steven Walton, Jiachen Li, Shen Li, and Humphrey Shi.

It is a hierarchical vision transformer based on Neighborhood Attention, a sliding-window self-attention pattern.

The abstract from the paper is the following:

*We present Neighborhood Attention (NA), the first efficient and scalable sliding-window attention mechanism for vision.
NA is a pixel-wise operation, localizing self attention (SA) to the nearest neighboring pixels, and therefore enjoys a
linear time and space complexity compared to the quadratic complexity of SA. The sliding-window pattern allows NA's
receptive field to grow without needing extra pixel shifts, and preserves translational equivariance, unlike
Swin Transformer's Window Self Attention (WSA). We develop NATTEN (Neighborhood Attention Extension), a Python package
with efficient C++ and CUDA kernels, which allows NA to run up to 40% faster than Swin's WSA while using up to 25% less
memory. We further present Neighborhood Attention Transformer (NAT), a new hierarchical transformer design based on NA
that boosts image classification and downstream vision performance. Experimental results on NAT are competitive;
NAT-Tiny reaches 83.2% top-1 accuracy on ImageNet, 51.4% mAP on MS-COCO and 48.4% mIoU on ADE20K, which is 1.9%
ImageNet accuracy, 1.0% COCO mAP, and 2.6% ADE20K mIoU improvement over a Swin model with similar size.*

<img
src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/neighborhood-attention-pattern.jpg"
alt="drawing" width="600"/>

<small> Neighborhood Attention compared to other attention patterns.
Taken from the <a href="https://huggingface.co/papers/2204.07143">original paper</a>.</small>

This model was contributed by [Ali Hassani](https://huggingface.co/alihassanijr).
The original code can be found [here](https://github.com/SHI-Labs/Neighborhood-Attention-Transformer).

## Usage tips

- One can use the [`AutoImageProcessor`] API to prepare images for the model.
- NAT can be used as a *backbone*. When `output_hidden_states = True`,
  it will output both `hidden_states` and `reshaped_hidden_states`.
  The `reshaped_hidden_states` have a shape of `(batch, num_channels, height, width)` rather than
  `(batch_size, height, width, num_channels)`.

Notes:

- NAT depends on [NATTEN](https://github.com/SHI-Labs/NATTEN/)'s implementation of Neighborhood Attention.
  You can install it with pre-built wheels for Linux by referring to [shi-labs.com/natten](https://shi-labs.com/natten),
  or build on your system by running `pip install natten`.
  Note that the latter will likely take time to compile. NATTEN does not support Windows devices yet.
- Only a patch size of 4 is supported at the moment.

## Resources

A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with NAT.

<PipelineTag pipeline="image-classification"/>

- [`NatForImageClassification`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/pytorch/image-classification) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/image_classification.ipynb).
- See also: [Image classification task guide](../tasks/image_classification)

If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! The resource should ideally demonstrate something new instead of duplicating an existing resource.
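A minimal image-classification sketch is shown below. It assumes `transformers==4.40.2`, an installed `natten` package, and the `shi-labs/nat-mini-in1k-224` checkpoint:

```python
# Hedged sketch: classify an image with a NAT checkpoint fine-tuned on ImageNet-1k.
import torch
import requests
from PIL import Image
from transformers import AutoImageProcessor, NatForImageClassification

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)

processor = AutoImageProcessor.from_pretrained("shi-labs/nat-mini-in1k-224")
model = NatForImageClassification.from_pretrained("shi-labs/nat-mini-in1k-224")

inputs = processor(images=image, return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits

print(model.config.id2label[logits.argmax(-1).item()])
```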
## NatConfig
|
||||
|
||||
[[autodoc]] NatConfig
|
||||
|
||||
## NatModel
|
||||
|
||||
[[autodoc]] NatModel
|
||||
- forward
|
||||
|
||||
## NatForImageClassification
|
||||
|
||||
[[autodoc]] NatForImageClassification
|
||||
- forward

*This model was released on 2019-08-31 and added to Hugging Face Transformers on 2023-06-20.*

# Nezha

<div class="flex flex-wrap space-x-1">
<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
</div>

<Tip warning={true}>

This model is in maintenance mode only, we don't accept any new PRs changing its code.
If you run into any issues running this model, please reinstall the last version that supported this model: v4.40.2.
You can do so by running the following command: `pip install -U transformers==4.40.2`.

</Tip>

## Overview

The Nezha model was proposed in [NEZHA: Neural Contextualized Representation for Chinese Language Understanding](https://huggingface.co/papers/1909.00204) by Junqiu Wei et al.

The abstract from the paper is the following:

*The pre-trained language models have achieved great successes in various natural language understanding (NLU) tasks
due to its capacity to capture the deep contextualized information in text by pre-training on large-scale corpora.
In this technical report, we present our practice of pre-training language models named NEZHA (NEural contextualiZed
representation for CHinese lAnguage understanding) on Chinese corpora and finetuning for the Chinese NLU tasks.
The current version of NEZHA is based on BERT with a collection of proven improvements, which include Functional
Relative Positional Encoding as an effective positional encoding scheme, Whole Word Masking strategy,
Mixed Precision Training and the LAMB Optimizer in training the models. The experimental results show that NEZHA
achieves the state-of-the-art performances when finetuned on several representative Chinese tasks, including
named entity recognition (People's Daily NER), sentence matching (LCQMC), Chinese sentiment classification (ChnSenti)
and natural language inference (XNLI).*

This model was contributed by [sijunhe](https://huggingface.co/sijunhe). The original code can be found [here](https://github.com/huawei-noah/Pretrained-Language-Model/tree/master/NEZHA-PyTorch).
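
Since NEZHA is a BERT-style encoder, masked language modeling is a natural smoke test. The sketch below assumes the publicly hosted `sijunhe/nezha-cn-base` checkpoint and an illustrative Chinese sentence; both are assumptions rather than part of the original documentation, and it must be run with `transformers==4.40.2` or earlier as noted above.

```python
import torch
from transformers import AutoTokenizer, NezhaForMaskedLM

# assumed checkpoint from the contributor's Hub page
tokenizer = AutoTokenizer.from_pretrained("sijunhe/nezha-cn-base")
model = NezhaForMaskedLM.from_pretrained("sijunhe/nezha-cn-base")

# predict the masked character in a Chinese sentence
inputs = tokenizer("我喜欢自然语言[MASK]理。", return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits

mask_positions = (inputs.input_ids == tokenizer.mask_token_id)[0].nonzero(as_tuple=True)[0]
predicted_ids = logits[0, mask_positions].argmax(-1)
print(tokenizer.decode(predicted_ids))
```
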
## Resources

- [Text classification task guide](../tasks/sequence_classification)
- [Token classification task guide](../tasks/token_classification)
- [Question answering task guide](../tasks/question_answering)
- [Masked language modeling task guide](../tasks/masked_language_modeling)
- [Multiple choice task guide](../tasks/multiple_choice)

## NezhaConfig

[[autodoc]] NezhaConfig

## NezhaModel

[[autodoc]] NezhaModel
- forward

## NezhaForPreTraining

[[autodoc]] NezhaForPreTraining
- forward

## NezhaForMaskedLM

[[autodoc]] NezhaForMaskedLM
- forward

## NezhaForNextSentencePrediction

[[autodoc]] NezhaForNextSentencePrediction
- forward

## NezhaForSequenceClassification

[[autodoc]] NezhaForSequenceClassification
- forward

## NezhaForMultipleChoice

[[autodoc]] NezhaForMultipleChoice
- forward

## NezhaForTokenClassification

[[autodoc]] NezhaForTokenClassification
- forward

## NezhaForQuestionAnswering

[[autodoc]] NezhaForQuestionAnswering
- forward

*This model was released on 2023-04-16 and added to Hugging Face Transformers on 2023-06-20.*

# Open-Llama

<div class="flex flex-wrap space-x-1">
<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
</div>

<Tip warning={true}>

This model is in maintenance mode only, we don't accept any new PRs changing its code.

If you run into any issues running this model, please reinstall the last version that supported this model: v4.31.0.
You can do so by running the following command: `pip install -U transformers==4.31.0`.

</Tip>

<Tip warning={true}>

This model differs from the [OpenLLaMA models](https://huggingface.co/models?search=openllama) on the Hugging Face Hub, which primarily use the [LLaMA](llama) architecture.

</Tip>

## Overview

The Open-Llama model was proposed in the open source Open-Llama project by community developer s-JoL.

The model is mainly based on LLaMA with some modifications, incorporating memory-efficient attention from Xformers, stable embedding from Bloom, and shared input-output embedding from PaLM.
The model is also pre-trained on both Chinese and English, which gives it better performance on Chinese-language tasks.

This model was contributed by [s-JoL](https://huggingface.co/s-JoL).
The original code was released on GitHub by [s-JoL](https://github.com/s-JoL), but has since been removed.

## OpenLlamaConfig

[[autodoc]] OpenLlamaConfig

## OpenLlamaModel

[[autodoc]] OpenLlamaModel
- forward

## OpenLlamaForCausalLM

[[autodoc]] OpenLlamaForCausalLM
- forward

## OpenLlamaForSequenceClassification

[[autodoc]] OpenLlamaForSequenceClassification
- forward

*This model was released on 2020-04-20 and added to Hugging Face Transformers on 2023-06-20.*

# QDQBERT

<div class="flex flex-wrap space-x-1">
<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
</div>

<Tip warning={true}>

This model is in maintenance mode only, we don't accept any new PRs changing its code.
If you run into any issues running this model, please reinstall the last version that supported this model: v4.40.2.
You can do so by running the following command: `pip install -U transformers==4.40.2`.

</Tip>

## Overview

The QDQBERT model can be referenced in [Integer Quantization for Deep Learning Inference: Principles and Empirical
Evaluation](https://huggingface.co/papers/2004.09602) by Hao Wu, Patrick Judd, Xiaojie Zhang, Mikhail Isaev and Paulius
Micikevicius.

The abstract from the paper is the following:

*Quantization techniques can reduce the size of Deep Neural Networks and improve inference latency and throughput by
taking advantage of high throughput integer instructions. In this paper we review the mathematical aspects of
quantization parameters and evaluate their choices on a wide range of neural network models for different application
domains, including vision, speech, and language. We focus on quantization techniques that are amenable to acceleration
by processors with high-throughput integer math pipelines. We also present a workflow for 8-bit quantization that is
able to maintain accuracy within 1% of the floating-point baseline on all networks studied, including models that are
more difficult to quantize, such as MobileNets and BERT-large.*

This model was contributed by [shangz](https://huggingface.co/shangz).

## Usage tips

- The QDQBERT model adds fake quantization operations (pairs of QuantizeLinear/DequantizeLinear ops) to (i) linear layer
inputs and weights, (ii) matmul inputs, and (iii) residual add inputs in the BERT model.
- QDQBERT requires the [Pytorch Quantization Toolkit](https://github.com/NVIDIA/TensorRT/tree/master/tools/pytorch-quantization). To install it, run `pip install pytorch-quantization --extra-index-url https://pypi.ngc.nvidia.com`.
- The QDQBERT model can be loaded from any HuggingFace BERT checkpoint (for example *google-bert/bert-base-uncased*) and
used to perform quantization-aware training or post-training quantization, as sketched below.
- A complete example of using QDQBERT to perform quantization-aware training and post-training quantization for
the SQuAD task can be found at https://github.com/huggingface/transformers-research-projects/tree/main/quantization-qdqbert.
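
The sketch below shows the basic loading step, assuming the Pytorch Quantization Toolkit is installed and a transformers version that still ships QDQBERT (v4.40.2 or earlier); the default quantizers described in the next section apply unless you override them first.

```python
import torch
from transformers import AutoTokenizer, QDQBertForSequenceClassification

# loading BERT weights into QDQBERT inserts QuantizeLinear/DequantizeLinear pairs
# around linear layers, matmuls and residual additions
tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
model = QDQBertForSequenceClassification.from_pretrained("google-bert/bert-base-uncased")

inputs = tokenizer("QDQBERT inserts fake quantization nodes into BERT.", return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits
```
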
### Set default quantizers

The QDQBERT model adds fake quantization operations (pairs of QuantizeLinear/DequantizeLinear ops) to BERT via
`TensorQuantizer` in the [Pytorch Quantization Toolkit](https://github.com/NVIDIA/TensorRT/tree/master/tools/pytorch-quantization). `TensorQuantizer` is the module
for quantizing tensors, with `QuantDescriptor` defining how the tensor should be quantized. Refer to the [Pytorch
Quantization Toolkit userguide](https://docs.nvidia.com/deeplearning/tensorrt/pytorch-quantization-toolkit/docs/userguide.html) for more details.

Before creating the QDQBERT model, one has to set the default `QuantDescriptor` defining the default tensor quantizers.

Example:

```python
>>> import pytorch_quantization.nn as quant_nn
>>> from pytorch_quantization.tensor_quant import QuantDescriptor

>>> # The default tensor quantizer is set to use Max calibration method
>>> input_desc = QuantDescriptor(num_bits=8, calib_method="max")
>>> # The default tensor quantizer is set to be per-channel quantization for weights
>>> weight_desc = QuantDescriptor(num_bits=8, axis=((0,)))
>>> quant_nn.QuantLinear.set_default_quant_desc_input(input_desc)
>>> quant_nn.QuantLinear.set_default_quant_desc_weight(weight_desc)
```

### Calibration

Calibration is the process of passing data samples to the quantizer and deciding the best scaling factors for
the tensors. After setting up the tensor quantizers, one can use the following example to calibrate the model:

```python
>>> # Find the TensorQuantizer and enable calibration
>>> for name, module in model.named_modules():
...     if name.endswith("_input_quantizer"):
...         module.enable_calib()
...         module.disable_quant()  # Use full precision data to calibrate

>>> # Feeding data samples
>>> model(x)
>>> # ...

>>> # Finalize calibration
>>> for name, module in model.named_modules():
...     if name.endswith("_input_quantizer"):
...         module.load_calib_amax()
...         module.enable_quant()

>>> # If running on accelerator, it needs to call `.to(xx)` again because new tensors will be created by calibration process
>>> from accelerate import Accelerator
>>> device = Accelerator().device
>>> model.to(device)

>>> # Keep running the quantized model
>>> # ...
```

### Export to ONNX

The goal of exporting to ONNX is to deploy inference with [TensorRT](https://developer.nvidia.com/tensorrt). Fake
quantization will be broken into a pair of QuantizeLinear/DequantizeLinear ONNX ops. After setting the static member of
`TensorQuantizer` to use Pytorch's own fake quantization functions, the fake quantized model can be exported to ONNX by following
the instructions in [torch.onnx](https://pytorch.org/docs/stable/onnx.html). Example:

```python
>>> import torch
>>> from pytorch_quantization.nn import TensorQuantizer

>>> TensorQuantizer.use_fb_fake_quant = True

>>> # Load the calibrated model
>>> ...
>>> # ONNX export
>>> torch.onnx.export(...)
```

## Resources

- [Text classification task guide](../tasks/sequence_classification)
- [Token classification task guide](../tasks/token_classification)
- [Question answering task guide](../tasks/question_answering)
- [Causal language modeling task guide](../tasks/language_modeling)
- [Masked language modeling task guide](../tasks/masked_language_modeling)
- [Multiple choice task guide](../tasks/multiple_choice)

## QDQBertConfig

[[autodoc]] QDQBertConfig

## QDQBertModel

[[autodoc]] QDQBertModel
- forward

## QDQBertLMHeadModel

[[autodoc]] QDQBertLMHeadModel
- forward

## QDQBertForMaskedLM

[[autodoc]] QDQBertForMaskedLM
- forward

## QDQBertForSequenceClassification

[[autodoc]] QDQBertForSequenceClassification
- forward

## QDQBertForNextSentencePrediction

[[autodoc]] QDQBertForNextSentencePrediction
- forward

## QDQBertForMultipleChoice

[[autodoc]] QDQBertForMultipleChoice
- forward

## QDQBertForTokenClassification

[[autodoc]] QDQBertForTokenClassification
- forward

## QDQBertForQuestionAnswering

[[autodoc]] QDQBertForQuestionAnswering
- forward

*This model was released on 2020-02-10 and added to Hugging Face Transformers on 2023-06-20.*

# REALM

<div class="flex flex-wrap space-x-1">
<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
</div>

<Tip warning={true}>

This model is in maintenance mode only, we don't accept any new PRs changing its code.
If you run into any issues running this model, please reinstall the last version that supported this model: v4.40.2.
You can do so by running the following command: `pip install -U transformers==4.40.2`.

</Tip>

## Overview

The REALM model was proposed in [REALM: Retrieval-Augmented Language Model Pre-Training](https://huggingface.co/papers/2002.08909) by Kelvin Guu, Kenton Lee, Zora Tung, Panupong Pasupat and Ming-Wei Chang. It's a
retrieval-augmented language model that first retrieves documents from a textual knowledge corpus and then
uses the retrieved documents to perform question answering.

The abstract from the paper is the following:

*Language model pre-training has been shown to capture a surprising amount of world knowledge, crucial for NLP tasks
such as question answering. However, this knowledge is stored implicitly in the parameters of a neural network,
requiring ever-larger networks to cover more facts. To capture knowledge in a more modular and interpretable way, we
augment language model pre-training with a latent knowledge retriever, which allows the model to retrieve and attend
over documents from a large corpus such as Wikipedia, used during pre-training, fine-tuning and inference. For the
first time, we show how to pre-train such a knowledge retriever in an unsupervised manner, using masked language
modeling as the learning signal and backpropagating through a retrieval step that considers millions of documents. We
demonstrate the effectiveness of Retrieval-Augmented Language Model pre-training (REALM) by fine-tuning on the
challenging task of Open-domain Question Answering (Open-QA). We compare against state-of-the-art models for both
explicit and implicit knowledge storage on three popular Open-QA benchmarks, and find that we outperform all previous
methods by a significant margin (4-16% absolute accuracy), while also providing qualitative benefits such as
interpretability and modularity.*

This model was contributed by [qqaatw](https://huggingface.co/qqaatw). The original code can be found
[here](https://github.com/google-research/language/tree/master/language/realm).
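
The end-to-end open-domain QA head can be exercised as below. This is a minimal sketch adapted from the [`RealmForOpenQA`] docstring and assumes the `google/realm-orqa-nq-openqa` checkpoint is available on the Hub; adjust the checkpoint and question to your use case, and run with a transformers version that still ships REALM (v4.40.2 or earlier).

```python
import torch
from transformers import RealmForOpenQA, RealmRetriever, RealmTokenizer

checkpoint = "google/realm-orqa-nq-openqa"
retriever = RealmRetriever.from_pretrained(checkpoint)
tokenizer = RealmTokenizer.from_pretrained(checkpoint)
model = RealmForOpenQA.from_pretrained(checkpoint, retriever=retriever)

question = "Who is the pioneer in modern computer science?"
question_ids = tokenizer([question], return_tensors="pt")
# gold answer ids are optional and only needed to compute a loss
answer_ids = tokenizer(
    ["alan mathison turing"],
    add_special_tokens=False,
    return_token_type_ids=False,
    return_attention_mask=False,
).input_ids

with torch.no_grad():
    reader_output, predicted_answer_ids = model(**question_ids, answer_ids=answer_ids, return_dict=False)

print(tokenizer.decode(predicted_answer_ids))
```
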
## RealmConfig

[[autodoc]] RealmConfig

## RealmTokenizer

[[autodoc]] RealmTokenizer
- build_inputs_with_special_tokens
- get_special_tokens_mask
- create_token_type_ids_from_sequences
- save_vocabulary
- batch_encode_candidates

## RealmTokenizerFast

[[autodoc]] RealmTokenizerFast
- batch_encode_candidates

## RealmRetriever

[[autodoc]] RealmRetriever

## RealmEmbedder

[[autodoc]] RealmEmbedder
- forward

## RealmScorer

[[autodoc]] RealmScorer
- forward

## RealmKnowledgeAugEncoder

[[autodoc]] RealmKnowledgeAugEncoder
- forward

## RealmReader

[[autodoc]] RealmReader
- forward

## RealmForOpenQA

[[autodoc]] RealmForOpenQA
- block_embedding_to
- forward

*This model was released on 2020-06-12 and added to Hugging Face Transformers on 2023-06-20.*

# RetriBERT

<div class="flex flex-wrap space-x-1">
<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
</div>

<Tip warning={true}>

This model is in maintenance mode only, so we won't accept any new PRs changing its code.

If you run into any issues running this model, please reinstall the last version that supported this model: v4.30.0.
You can do so by running the following command: `pip install -U transformers==4.30.0`.

</Tip>

## Overview

The [RetriBERT](https://huggingface.co/yjernite/retribert-base-uncased/tree/main) model was proposed in the blog post [Explain Anything Like I'm Five: A Model for Open Domain Long Form
Question Answering](https://yjernite.github.io/lfqa.html). RetriBERT is a small model that uses either a single BERT encoder or
a pair of BERT encoders with a lower-dimension projection for dense semantic indexing of text.

This model was contributed by [yjernite](https://huggingface.co/yjernite). Code to train and use the model can be
found [here](https://github.com/huggingface/transformers/tree/main/examples/research-projects/distillation).

## RetriBertConfig

[[autodoc]] RetriBertConfig

## RetriBertTokenizer

[[autodoc]] RetriBertTokenizer

## RetriBertTokenizerFast

[[autodoc]] RetriBertTokenizerFast

## RetriBertModel

[[autodoc]] RetriBertModel
- forward

*This model was released on 2021-04-14 and added to Hugging Face Transformers on 2023-06-20.*

# Speech2Text2

<Tip warning={true}>

This model is in maintenance mode only, we don't accept any new PRs changing its code.
If you run into any issues running this model, please reinstall the last version that supported this model: v4.40.2.
You can do so by running the following command: `pip install -U transformers==4.40.2`.

</Tip>

## Overview

The Speech2Text2 model is used together with [Wav2Vec2](wav2vec2) for Speech Translation models proposed in
[Large-Scale Self- and Semi-Supervised Learning for Speech Translation](https://huggingface.co/papers/2104.06678) by
Changhan Wang, Anne Wu, Juan Pino, Alexei Baevski, Michael Auli, Alexis Conneau.

Speech2Text2 is a *decoder-only* transformer model that can be used with any speech *encoder-only* model, such as
[Wav2Vec2](wav2vec2) or [HuBERT](hubert), for Speech-to-Text tasks. Please refer to the
[SpeechEncoderDecoder](speech-encoder-decoder) class for how to combine Speech2Text2 with any speech *encoder-only*
model.

This model was contributed by [Patrick von Platen](https://huggingface.co/patrickvonplaten).

The original code can be found [here](https://github.com/pytorch/fairseq/blob/1f7ef9ed1e1061f8c7f88f8b94c7186834398690/fairseq/models/wav2vec/wav2vec2_asr.py#L266).

## Usage tips

- Speech2Text2 achieves state-of-the-art results on the CoVoST Speech Translation dataset. For more information, see
the [official models](https://huggingface.co/models?other=speech2text2).
- Speech2Text2 is always used within the [SpeechEncoderDecoder](speech-encoder-decoder) framework.
- Speech2Text2's tokenizer is based on [fastBPE](https://github.com/glample/fastBPE).

## Inference

Speech2Text2's [`SpeechEncoderDecoderModel`] model accepts raw waveform input values from speech and
makes use of [`~generation.GenerationMixin.generate`] to translate the input speech
autoregressively to the target language.

The [`Wav2Vec2FeatureExtractor`] class is responsible for preprocessing the input speech and
[`Speech2Text2Tokenizer`] decodes the generated target tokens to the target string. The
[`Speech2Text2Processor`] wraps [`Wav2Vec2FeatureExtractor`] and
[`Speech2Text2Tokenizer`] into a single instance to both extract the input features and decode the
predicted token ids.

- Step-by-step Speech Translation

```python
>>> from transformers import Speech2Text2Processor, SpeechEncoderDecoderModel
>>> from datasets import load_dataset

>>> model = SpeechEncoderDecoderModel.from_pretrained("facebook/s2t-wav2vec2-large-en-de")
>>> processor = Speech2Text2Processor.from_pretrained("facebook/s2t-wav2vec2-large-en-de")


>>> def map_to_array(example):
...     example["speech"] = example["audio"]["array"]
...     return example


>>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
>>> ds = ds.map(map_to_array)

>>> inputs = processor(ds["speech"][0], sampling_rate=16_000, return_tensors="pt")
>>> generated_ids = model.generate(inputs=inputs["input_values"], attention_mask=inputs["attention_mask"])

>>> transcription = processor.batch_decode(generated_ids)
```

- Speech Translation via Pipelines

The automatic speech recognition pipeline can also be used to translate speech in just a couple of lines of code:

```python
>>> from datasets import load_dataset
>>> from transformers import pipeline

>>> librispeech_en = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
>>> asr = pipeline(
...     "automatic-speech-recognition",
...     model="facebook/s2t-wav2vec2-large-en-de",
...     feature_extractor="facebook/s2t-wav2vec2-large-en-de",
... )

>>> translation_de = asr(librispeech_en[0]["file"])
```

See the [model hub](https://huggingface.co/models?filter=speech2text2) to look for Speech2Text2 checkpoints.

## Resources

- [Causal language modeling task guide](../tasks/language_modeling)

## Speech2Text2Config

[[autodoc]] Speech2Text2Config

## Speech2Text2Tokenizer

[[autodoc]] Speech2Text2Tokenizer
- batch_decode
- decode
- save_vocabulary

## Speech2Text2Processor

[[autodoc]] Speech2Text2Processor
- __call__
- from_pretrained
- save_pretrained
- batch_decode
- decode

## Speech2Text2ForCausalLM

[[autodoc]] Speech2Text2ForCausalLM
- forward

*This model was released on 2021-07-16 and added to Hugging Face Transformers on 2023-06-20.*

# TAPEX

<div class="flex flex-wrap space-x-1">
<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
</div>

<Tip warning={true}>

This model is in maintenance mode only, we don't accept any new PRs changing its code.

If you run into any issues running this model, please reinstall the last version that supported this model: v4.30.0.
You can do so by running the following command: `pip install -U transformers==4.30.0`.

</Tip>

## Overview

The TAPEX model was proposed in [TAPEX: Table Pre-training via Learning a Neural SQL Executor](https://huggingface.co/papers/2107.07653) by Qian Liu,
Bei Chen, Jiaqi Guo, Morteza Ziyadi, Zeqi Lin, Weizhu Chen, Jian-Guang Lou. TAPEX pre-trains a BART model to solve synthetic SQL queries, after
which it can be fine-tuned to answer natural language questions related to tabular data, as well as to perform table fact checking.

TAPEX has been fine-tuned on several datasets:

- [SQA](https://www.microsoft.com/en-us/download/details.aspx?id=54253) (Sequential Question Answering by Microsoft)
- [WTQ](https://github.com/ppasupat/WikiTableQuestions) (Wiki Table Questions by Stanford University)
- [WikiSQL](https://github.com/salesforce/WikiSQL) (by Salesforce)
- [TabFact](https://tabfact.github.io/) (by the UCSB NLP Lab).

The abstract from the paper is the following:

*Recent progress in language model pre-training has achieved a great success via leveraging large-scale unstructured textual data. However, it is
still a challenge to apply pre-training on structured tabular data due to the absence of large-scale high-quality tabular data. In this paper, we
propose TAPEX to show that table pre-training can be achieved by learning a neural SQL executor over a synthetic corpus, which is obtained by automatically
synthesizing executable SQL queries and their execution outputs. TAPEX addresses the data scarcity challenge via guiding the language model to mimic a SQL
executor on the diverse, large-scale and high-quality synthetic corpus. We evaluate TAPEX on four benchmark datasets. Experimental results demonstrate that
TAPEX outperforms previous table pre-training approaches by a large margin and achieves new state-of-the-art results on all of them. This includes improvements
on the weakly-supervised WikiSQL denotation accuracy to 89.5% (+2.3%), the WikiTableQuestions denotation accuracy to 57.5% (+4.8%), the SQA denotation accuracy
to 74.5% (+3.5%), and the TabFact accuracy to 84.2% (+3.2%). To our knowledge, this is the first work to exploit table pre-training via synthetic executable programs
and to achieve new state-of-the-art results on various downstream tasks.*

## Usage tips

- TAPEX is a generative (seq2seq) model. One can directly plug the weights of TAPEX into a BART model.
- TAPEX has checkpoints on the hub that are either pre-trained only, or fine-tuned on WTQ, SQA, WikiSQL and TabFact.
- Sentences + tables are presented to the model as `sentence + " " + linearized table`. The linearized table has the following format:
`col: col1 | col2 | col 3 row 1 : val1 | val2 | val3 row 2 : ...`.
- TAPEX has its own tokenizer, which allows one to easily prepare all data for the model. One can pass Pandas DataFrames and strings to the tokenizer,
and it will automatically create the `input_ids` and `attention_mask` (as shown in the usage examples below).

### Usage: inference

Below, we illustrate how to use TAPEX for table question answering. As one can see, one can directly plug the weights of TAPEX into a BART model.
We use the [Auto API](auto), which will automatically instantiate the appropriate tokenizer ([`TapexTokenizer`]) and model ([`BartForConditionalGeneration`]) for us,
based on the configuration file of the checkpoint on the hub.

```python
>>> from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
>>> import pandas as pd

>>> tokenizer = AutoTokenizer.from_pretrained("microsoft/tapex-large-finetuned-wtq")
>>> model = AutoModelForSeq2SeqLM.from_pretrained("microsoft/tapex-large-finetuned-wtq")

>>> # prepare table + question
>>> data = {"Actors": ["Brad Pitt", "Leonardo Di Caprio", "George Clooney"], "Number of movies": ["87", "53", "69"]}
>>> table = pd.DataFrame.from_dict(data)
>>> question = "how many movies does Leonardo Di Caprio have?"

>>> encoding = tokenizer(table, question, return_tensors="pt")

>>> # let the model generate an answer autoregressively
>>> outputs = model.generate(**encoding)

>>> # decode back to text
>>> predicted_answer = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
>>> print(predicted_answer)
53
```

Note that [`TapexTokenizer`] also supports batched inference. Hence, one can provide a batch of different tables/questions, or a batch of a single table
and multiple questions, or a batch of a single query and multiple tables. Let's illustrate this:

```python
>>> # prepare table + question
>>> data = {"Actors": ["Brad Pitt", "Leonardo Di Caprio", "George Clooney"], "Number of movies": ["87", "53", "69"]}
>>> table = pd.DataFrame.from_dict(data)
>>> questions = [
...     "how many movies does Leonardo Di Caprio have?",
...     "which actor has 69 movies?",
...     "what's the first name of the actor who has 87 movies?",
... ]
>>> encoding = tokenizer(table, questions, padding=True, return_tensors="pt")

>>> # let the model generate an answer autoregressively
>>> outputs = model.generate(**encoding)

>>> # decode back to text
>>> tokenizer.batch_decode(outputs, skip_special_tokens=True)
[' 53', ' george clooney', ' brad pitt']
```

In case one wants to do table verification (i.e. the task of determining whether a given sentence is supported or refuted by the contents
of a table), one can instantiate a [`BartForSequenceClassification`] model. TAPEX has checkpoints on the hub fine-tuned on TabFact, an important
benchmark for table fact checking (it achieves 84% accuracy). The code example below again leverages the [Auto API](auto).

```python
>>> from transformers import AutoTokenizer, AutoModelForSequenceClassification

>>> tokenizer = AutoTokenizer.from_pretrained("microsoft/tapex-large-finetuned-tabfact")
>>> model = AutoModelForSequenceClassification.from_pretrained("microsoft/tapex-large-finetuned-tabfact")

>>> # prepare table + sentence
>>> data = {"Actors": ["Brad Pitt", "Leonardo Di Caprio", "George Clooney"], "Number of movies": ["87", "53", "69"]}
>>> table = pd.DataFrame.from_dict(data)
>>> sentence = "George Clooney has 30 movies"

>>> encoding = tokenizer(table, sentence, return_tensors="pt")

>>> # forward pass
>>> outputs = model(**encoding)

>>> # print prediction
>>> predicted_class_idx = outputs.logits[0].argmax(dim=0).item()
>>> print(model.config.id2label[predicted_class_idx])
Refused
```

<Tip>

The TAPEX architecture is the same as BART, except for tokenization. Refer to the [BART documentation](bart) for information on
configuration classes and their parameters. The TAPEX-specific tokenizer is documented below.

</Tip>

## TapexTokenizer

[[autodoc]] TapexTokenizer
- __call__
- save_vocabulary

*This model was released on 2021-06-03 and added to Hugging Face Transformers on 2023-06-20.*

# Trajectory Transformer

<div class="flex flex-wrap space-x-1">
<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
</div>

<Tip warning={true}>

This model is in maintenance mode only, so we won't accept any new PRs changing its code.

If you run into any issues running this model, please reinstall the last version that supported this model: v4.30.0.
You can do so by running the following command: `pip install -U transformers==4.30.0`.

</Tip>

## Overview

The Trajectory Transformer model was proposed in [Offline Reinforcement Learning as One Big Sequence Modeling Problem](https://huggingface.co/papers/2106.02039) by Michael Janner, Qiyang Li, Sergey Levine.

The abstract from the paper is the following:

*Reinforcement learning (RL) is typically concerned with estimating stationary policies or single-step models,
leveraging the Markov property to factorize problems in time. However, we can also view RL as a generic sequence
modeling problem, with the goal being to produce a sequence of actions that leads to a sequence of high rewards.
Viewed in this way, it is tempting to consider whether high-capacity sequence prediction models that work well
in other domains, such as natural-language processing, can also provide effective solutions to the RL problem.
To this end, we explore how RL can be tackled with the tools of sequence modeling, using a Transformer architecture
to model distributions over trajectories and repurposing beam search as a planning algorithm. Framing RL as sequence
modeling problem simplifies a range of design decisions, allowing us to dispense with many of the components common
in offline RL algorithms. We demonstrate the flexibility of this approach across long-horizon dynamics prediction,
imitation learning, goal-conditioned RL, and offline RL. Further, we show that this approach can be combined with
existing model-free algorithms to yield a state-of-the-art planner in sparse-reward, long-horizon tasks.*

This model was contributed by [CarlCochet](https://huggingface.co/CarlCochet). The original code can be found [here](https://github.com/jannerm/trajectory-transformer).

## Usage tips

This Transformer is used for deep reinforcement learning. To use it, you need to create sequences from
actions, states and rewards from all previous timesteps. This model will treat all these elements together
as one big sequence (a trajectory).
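
A minimal forward pass might look as follows. This is a hedged sketch: the checkpoint name comes from the contributor's Hub profile, the input is random token ids standing in for discretized (state, action, reward) triples, and it requires a transformers version that still ships this model (v4.30.0 or earlier), so treat the names and shapes as assumptions rather than reference usage.

```python
import torch
from transformers import TrajectoryTransformerModel

# assumed checkpoint for the HalfCheetah environment
model = TrajectoryTransformerModel.from_pretrained("CarlCochet/trajectory-transformer-halfcheetah-medium-v2")
model.eval()

# HalfCheetah: 17 observation dims + 6 action dims + 1 reward slot per timestep
batch_size, seq_length = 2, 17 + 6 + 1
# random discretized tokens as placeholders for real (state, action, reward) bins
trajectories = torch.randint(0, model.config.vocab_size, (batch_size, seq_length))

with torch.no_grad():
    outputs = model(trajectories, use_cache=True)

# logits over the discretized vocabulary for each position in the trajectory
logits = outputs.logits
```
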
## TrajectoryTransformerConfig

[[autodoc]] TrajectoryTransformerConfig

## TrajectoryTransformerModel

[[autodoc]] TrajectoryTransformerModel
- forward

*This model was released on 2019-01-09 and added to Hugging Face Transformers on 2023-06-20.*

# Transformer XL

<div class="flex flex-wrap space-x-1">
<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
</div>

<Tip warning={true}>

This model is in maintenance mode only, so we won't accept any new PRs changing its code. This model was deprecated due to security issues linked to `pickle.load`.

We recommend switching to more recent models for improved security.

In case you would still like to use `TransfoXL` in your experiments, we recommend using the [Hub checkpoint](https://huggingface.co/transfo-xl/transfo-xl-wt103) with a specific revision to ensure you are downloading safe files from the Hub.

You will need to set the environment variable `TRUST_REMOTE_CODE` to `True` in order to allow the
usage of `pickle.load()`:

```python
import os
from transformers import TransfoXLTokenizer, TransfoXLLMHeadModel

os.environ["TRUST_REMOTE_CODE"] = "True"

checkpoint = 'transfo-xl/transfo-xl-wt103'
revision = '40a186da79458c9f9de846edfaea79c412137f97'

tokenizer = TransfoXLTokenizer.from_pretrained(checkpoint, revision=revision)
model = TransfoXLLMHeadModel.from_pretrained(checkpoint, revision=revision)
```

If you run into any issues running this model, please reinstall the last version that supported this model: v4.35.0.
You can do so by running the following command: `pip install -U transformers==4.35.0`.

</Tip>

<div class="flex flex-wrap space-x-1">
<a href="https://huggingface.co/models?filter=transfo-xl">
<img alt="Models" src="https://img.shields.io/badge/All_model_pages-transfo--xl-blueviolet">
</a>
<a href="https://huggingface.co/spaces/docs-demos/transfo-xl-wt103">
<img alt="Spaces" src="https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Spaces-blue">
</a>
</div>

## Overview

The Transformer-XL model was proposed in [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://huggingface.co/papers/1901.02860) by Zihang Dai, Zhilin Yang, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan
Salakhutdinov. It's a causal (uni-directional) transformer with relative positioning (sinusoidal) embeddings which can
reuse previously computed hidden states to attend to a longer context (memory). This model also uses adaptive softmax
inputs and outputs (tied).

The abstract from the paper is the following:

*Transformers have a potential of learning longer-term dependency, but are limited by a fixed-length context in the
setting of language modeling. We propose a novel neural architecture Transformer-XL that enables learning dependency
beyond a fixed length without disrupting temporal coherence. It consists of a segment-level recurrence mechanism and a
novel positional encoding scheme. Our method not only enables capturing longer-term dependency, but also resolves the
context fragmentation problem. As a result, Transformer-XL learns dependency that is 80% longer than RNNs and 450%
longer than vanilla Transformers, achieves better performance on both short and long sequences, and is up to 1,800+
times faster than vanilla Transformers during evaluation. Notably, we improve the state-of-the-art results of
bpc/perplexity to 0.99 on enwiki8, 1.08 on text8, 18.3 on WikiText-103, 21.8 on One Billion Word, and 54.5 on Penn
Treebank (without finetuning). When trained only on WikiText-103, Transformer-XL manages to generate reasonably
coherent, novel text articles with thousands of tokens.*

This model was contributed by [thomwolf](https://huggingface.co/thomwolf). The original code can be found [here](https://github.com/kimiyoung/transformer-xl).

## Usage tips

- Transformer-XL uses relative sinusoidal positional embeddings. Padding can be done on the left or on the right. The
original implementation trains on SQuAD with padding on the left, therefore the padding defaults are set to left.
- Transformer-XL is one of the few models that has no sequence length limit.
- It works like a regular GPT model, but introduces a recurrence mechanism for two consecutive segments (similar to a regular RNN with two consecutive inputs). In this context, a segment is a number of consecutive tokens (for instance 512) that may span across multiple documents, and segments are fed in order to the model.
- Basically, the hidden states of the previous segment are concatenated to the current input to compute the attention scores. This allows the model to pay attention to information that was in the previous segment as well as the current one. By stacking multiple attention layers, the receptive field can be increased to multiple previous segments (see the sketch below).
- This changes the positional embeddings to relative positional embeddings (as the regular positional embeddings would give the same results for the current input and the current hidden state at a given position) and requires some adjustments in the way attention scores are computed.
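
The segment-level recurrence can be driven explicitly through the `mems` output. The sketch below reuses the pinned revision from the warning above; the example text and the way the sequence is split into two segments are illustrative assumptions.

```python
import os
import torch
from transformers import TransfoXLTokenizer, TransfoXLLMHeadModel

os.environ["TRUST_REMOTE_CODE"] = "True"  # required for the pickle-based checkpoint, see the warning above

checkpoint = "transfo-xl/transfo-xl-wt103"
revision = "40a186da79458c9f9de846edfaea79c412137f97"
tokenizer = TransfoXLTokenizer.from_pretrained(checkpoint, revision=revision)
model = TransfoXLLMHeadModel.from_pretrained(checkpoint, revision=revision)

input_ids = tokenizer("Transformer-XL caches the hidden states of previous segments as memory", return_tensors="pt").input_ids
# split the token sequence into two consecutive segments
half = input_ids.shape[1] // 2
first_segment, second_segment = input_ids[:, :half], input_ids[:, half:]

with torch.no_grad():
    # the first pass returns `mems`, the cached hidden states of this segment
    outputs = model(first_segment)
    # feeding `mems` back in lets the second segment attend to the first one
    outputs = model(second_segment, mems=outputs.mems)

# language-modeling scores for the last position of the second segment
next_token_scores = outputs.prediction_scores[:, -1, :]
```
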
<Tip warning={true}>

TransformerXL does **not** work with *torch.nn.DataParallel* due to a bug in PyTorch, see [issue #36035](https://github.com/pytorch/pytorch/issues/36035)

</Tip>

## Resources

- [Text classification task guide](../tasks/sequence_classification)
- [Causal language modeling task guide](../tasks/language_modeling)

## TransfoXLConfig

[[autodoc]] TransfoXLConfig

## TransfoXLTokenizer

[[autodoc]] TransfoXLTokenizer
- save_vocabulary

## TransfoXL specific outputs

[[autodoc]] models.deprecated.transfo_xl.modeling_transfo_xl.TransfoXLModelOutput

[[autodoc]] models.deprecated.transfo_xl.modeling_transfo_xl.TransfoXLLMHeadModelOutput

## TransfoXLModel

[[autodoc]] TransfoXLModel
- forward

## TransfoXLLMHeadModel

[[autodoc]] TransfoXLLMHeadModel
- forward

## TransfoXLForSequenceClassification

[[autodoc]] TransfoXLForSequenceClassification
- forward

## Internal Layers

[[autodoc]] AdaptiveEmbedding

*This model was released on 2022-09-28 and added to Hugging Face Transformers on 2023-06-20.*

# TVLT

<div class="flex flex-wrap space-x-1">
<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
</div>

<Tip warning={true}>

This model is in maintenance mode only, we don't accept any new PRs changing its code.
If you run into any issues running this model, please reinstall the last version that supported this model: v4.40.2.
You can do so by running the following command: `pip install -U transformers==4.40.2`.

</Tip>

## Overview

The TVLT model was proposed in [TVLT: Textless Vision-Language Transformer](https://huggingface.co/papers/2209.14156)
by Zineng Tang, Jaemin Cho, Yixin Nie, Mohit Bansal (the first three authors contributed equally). The Textless Vision-Language Transformer (TVLT) is a model that uses raw visual and audio inputs for vision-and-language representation learning, without using text-specific modules such as tokenization or automatic speech recognition (ASR). It can perform various audiovisual and vision-language tasks like retrieval, question answering, etc.

The abstract from the paper is the following:

*In this work, we present the Textless Vision-Language Transformer (TVLT), where homogeneous transformer blocks take raw visual and audio inputs for vision-and-language representation learning with minimal modality-specific design, and do not use text-specific modules such as tokenization or automatic speech recognition (ASR). TVLT is trained by reconstructing masked patches of continuous video frames and audio spectrograms (masked autoencoding) and contrastive modeling to align video and audio. TVLT attains performance comparable to its text-based counterpart on various multimodal tasks, such as visual question answering, image retrieval, video retrieval, and multimodal sentiment analysis, with 28x faster inference speed and only 1/3 of the parameters. Our findings suggest the possibility of learning compact and efficient visual-linguistic representations from low-level visual and audio signals without assuming the prior existence of text.*

<p align="center">
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/model_doc/tvlt_architecture.png"
alt="drawing" width="600"/>
</p>

<small> TVLT architecture. Taken from the <a href="https://huggingface.co/papers/2209.14156">original paper</a>. </small>

The original code can be found [here](https://github.com/zinengtang/TVLT). This model was contributed by [Zineng Tang](https://huggingface.co/ZinengTang).

## Usage tips

- TVLT is a model that takes both `pixel_values` and `audio_values` as input. One can use [`TvltProcessor`] to prepare data for the model (see the sketch below).
This processor wraps an image processor (for the image/video modality) and an audio feature extractor (for the audio modality) into one.
- TVLT is trained with images/videos and audio of various sizes: the authors resize and crop the input images/videos to 224 and limit the length of the audio spectrogram to 2048. To make batching of videos and audio possible, the authors use a `pixel_mask` that indicates which pixels are real/padding and an `audio_mask` that indicates which audio values are real/padding.
- The design of TVLT is very similar to that of a standard Vision Transformer (ViT) and masked autoencoder (MAE) as in [ViTMAE](vitmae). The difference is that the model includes embedding layers for the audio modality.
- The PyTorch version of this model is only available in torch 1.10 and higher.
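
The sketch below feeds random video frames and a random waveform through the base model. The `ZinengTang/tvlt-base` checkpoint name, the number of frames, and the sampling rate are assumptions based on the contributor's Hub page rather than guarantees, and a transformers version with TVLT (v4.40.2 or earlier) is required.

```python
import numpy as np
import torch
from transformers import TvltProcessor, TvltModel

processor = TvltProcessor.from_pretrained("ZinengTang/tvlt-base")
model = TvltModel.from_pretrained("ZinengTang/tvlt-base")

# 8 random video frames (channels-first) and a random mono waveform as stand-ins for real data
num_frames = 8
video = list(np.random.rand(num_frames, 3, 224, 224))
audio = list(np.random.rand(10000))

# the processor produces `pixel_values`/`pixel_mask` and `audio_values`/`audio_mask`
inputs = processor(video, audio, sampling_rate=44100, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)

last_hidden_state = outputs.last_hidden_state
```
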
## TvltConfig

[[autodoc]] TvltConfig

## TvltProcessor

[[autodoc]] TvltProcessor
- __call__

## TvltFeatureExtractor

[[autodoc]] TvltFeatureExtractor
- __call__

## TvltImageProcessor

[[autodoc]] TvltImageProcessor
- preprocess

## TvltModel

[[autodoc]] TvltModel
- forward

## TvltForPreTraining

[[autodoc]] TvltForPreTraining
- forward

## TvltForAudioVisualClassification

[[autodoc]] TvltForAudioVisualClassification
- forward
*This model was released on 2022-02-20 and added to Hugging Face Transformers on 2023-06-20.*
|
||||
|
||||
# VAN
|
||||
|
||||
<div class="flex flex-wrap space-x-1">
|
||||
<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
|
||||
</div>
|
||||
|
||||
<Tip warning={true}>
|
||||
|
||||
This model is in maintenance mode only, we don't accept any new PRs changing its code.
|
||||
|
||||
If you run into any issues running this model, please reinstall the last version that supported this model: v4.30.0.
|
||||
You can do so by running the following command: `pip install -U transformers==4.30.0`.
|
||||
|
||||
</Tip>
|
||||
|
||||
## Overview
|
||||
|
||||
The VAN model was proposed in [Visual Attention Network](https://huggingface.co/papers/2202.09741) by Meng-Hao Guo, Cheng-Ze Lu, Zheng-Ning Liu, Ming-Ming Cheng, Shi-Min Hu.
|
||||
|
||||
This paper introduces a new attention layer based on convolution operations able to capture both local and distant relationships. This is done by combining normal and large kernel convolution layers. The latter uses a dilated convolution to capture distant correlations.
|
||||
|
||||
The abstract from the paper is the following:
|
||||
|
||||
*While originally designed for natural language processing tasks, the self-attention mechanism has recently taken various computer vision areas by storm. However, the 2D nature of images brings three challenges for applying self-attention in computer vision. (1) Treating images as 1D sequences neglects their 2D structures. (2) The quadratic complexity is too expensive for high-resolution images. (3) It only captures spatial adaptability but ignores channel adaptability. In this paper, we propose a novel large kernel attention (LKA) module to enable self-adaptive and long-range correlations in self-attention while avoiding the above issues. We further introduce a novel neural network based on LKA, namely Visual Attention Network (VAN). While extremely simple, VAN outperforms the state-of-the-art vision transformers and convolutional neural networks with a large margin in extensive experiments, including image classification, object detection, semantic segmentation, instance segmentation, etc. Code is available at [this https URL](https://github.com/Visual-Attention-Network/VAN-Classification).*
|
||||
|
||||
Tips:
|
||||
|
||||
- VAN does not have an embedding layer, thus the `hidden_states` will have a length equal to the number of stages.
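For example, the following minimal sketch (assuming the `Visual-Attention-Network/van-base` checkpoint published with the original integration) shows that the number of returned hidden states matches the number of stages:

```py
import numpy as np
from transformers import AutoImageProcessor, VanModel

processor = AutoImageProcessor.from_pretrained("Visual-Attention-Network/van-base")
model = VanModel.from_pretrained("Visual-Attention-Network/van-base")

image = np.zeros((224, 224, 3), dtype=np.uint8)  # dummy RGB image
inputs = processor(images=image, return_tensors="pt")
outputs = model(**inputs, output_hidden_states=True)

# one hidden state per stage, since there is no separate embedding output
print(len(outputs.hidden_states))
```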
|
||||
|
||||
The figure below illustrates the architecture of a Visual Attention Layer. Taken from the [original paper](https://huggingface.co/papers/2202.09741).
|
||||
|
||||
<img width="600" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/van_architecture.png"/>
|
||||
|
||||
This model was contributed by [Francesco](https://huggingface.co/Francesco). The original code can be found [here](https://github.com/Visual-Attention-Network/VAN-Classification).
|
||||
|
||||
## Resources
|
||||
|
||||
A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with VAN.
|
||||
|
||||
<PipelineTag pipeline="image-classification"/>
|
||||
|
||||
- [`VanForImageClassification`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/pytorch/image-classification) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/image_classification.ipynb).
|
||||
- See also: [Image classification task guide](../tasks/image_classification)
|
||||
|
||||
If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! The resource should ideally demonstrate something new instead of duplicating an existing resource.
|
||||
|
||||
## VanConfig
|
||||
|
||||
[[autodoc]] VanConfig
|
||||
|
||||
## VanModel
|
||||
|
||||
[[autodoc]] VanModel
|
||||
- forward
|
||||
|
||||
## VanForImageClassification
|
||||
|
||||
[[autodoc]] VanForImageClassification
|
||||
- forward
|
||||
@ -1,112 +0,0 @@
|
||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
|
||||
⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
|
||||
rendered properly in your Markdown viewer.
|
||||
|
||||
-->
|
||||
*This model was released on 2020-10-22 and added to Hugging Face Transformers on 2023-06-20.*
|
||||
|
||||
# Hybrid Vision Transformer (ViT Hybrid)
|
||||
|
||||
<div class="flex flex-wrap space-x-1">
|
||||
<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
|
||||
<img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
|
||||
</div>
|
||||
|
||||
<Tip warning={true}>
|
||||
|
||||
This model is in maintenance mode only, we don't accept any new PRs changing its code.
|
||||
If you run into any issues running this model, please reinstall the last version that supported this model: v4.40.2.
|
||||
You can do so by running the following command: `pip install -U transformers==4.40.2`.
|
||||
|
||||
</Tip>
|
||||
|
||||
## Overview
|
||||
|
||||
The hybrid Vision Transformer (ViT) model was proposed in [An Image is Worth 16x16 Words: Transformers for Image Recognition
|
||||
at Scale](https://huggingface.co/papers/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk
|
||||
Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob
|
||||
Uszkoreit, Neil Houlsby. It's the first paper that successfully trains a Transformer encoder on ImageNet, attaining
|
||||
very good results compared to familiar convolutional architectures. ViT hybrid is a slight variant of the [plain Vision Transformer](vit),
|
||||
by leveraging a convolutional backbone (specifically, [BiT](bit)) whose features are used as initial "tokens" for the Transformer.
|
||||
|
||||
The abstract from the paper is the following:
|
||||
|
||||
*While the Transformer architecture has become the de-facto standard for natural language processing tasks, its
|
||||
applications to computer vision remain limited. In vision, attention is either applied in conjunction with
|
||||
convolutional networks, or used to replace certain components of convolutional networks while keeping their overall
|
||||
structure in place. We show that this reliance on CNNs is not necessary and a pure transformer applied directly to
|
||||
sequences of image patches can perform very well on image classification tasks. When pre-trained on large amounts of
|
||||
data and transferred to multiple mid-sized or small image recognition benchmarks (ImageNet, CIFAR-100, VTAB, etc.),
|
||||
Vision Transformer (ViT) attains excellent results compared to state-of-the-art convolutional networks while requiring
|
||||
substantially fewer computational resources to train.*
|
||||
|
||||
This model was contributed by [nielsr](https://huggingface.co/nielsr). The original code (written in JAX) can be
|
||||
found [here](https://github.com/google-research/vision_transformer).
|
||||
|
||||
## Using Scaled Dot Product Attention (SDPA)
|
||||
|
||||
PyTorch includes a native scaled dot-product attention (SDPA) operator as part of `torch.nn.functional`. This function
|
||||
encompasses several implementations that can be applied depending on the inputs and the hardware in use. See the
|
||||
[official documentation](https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html)
|
||||
or the [GPU Inference](https://huggingface.co/docs/transformers/main/en/perf_infer_gpu_one#pytorch-scaled-dot-product-attention)
|
||||
page for more information.
|
||||
|
||||
SDPA is used by default for `torch>=2.1.1` when an implementation is available, but you may also set
|
||||
`attn_implementation="sdpa"` in `from_pretrained()` to explicitly request SDPA to be used.
|
||||
|
||||
```py
|
||||
import torch
from transformers import ViTHybridForImageClassification

model = ViTHybridForImageClassification.from_pretrained("google/vit-hybrid-base-bit-384", attn_implementation="sdpa", dtype=torch.float16)
|
||||
...
|
||||
```
|
||||
|
||||
For the best speedups, we recommend loading the model in half-precision (e.g. `torch.float16` or `torch.bfloat16`).
|
||||
|
||||
On a local benchmark (A100-40GB, PyTorch 2.3.0, OS Ubuntu 22.04) with `float32` and `google/vit-hybrid-base-bit-384` model, we saw the following speedups during inference.
|
||||
|
||||
| Batch size | Average inference time (ms), eager mode | Average inference time (ms), SDPA | Speedup, SDPA / eager (x) |
|
||||
|--------------|-------------------------------------------|-------------------------------------------|------------------------------|
|
||||
| 1 | 29 | 18 | 1.61 |
|
||||
| 2 | 26 | 18 | 1.44 |
|
||||
| 4 | 25 | 18 | 1.39 |
|
||||
| 8 | 34 | 24 | 1.42 |
|
||||
|
||||
## Resources
|
||||
|
||||
A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with ViT Hybrid.
|
||||
|
||||
<PipelineTag pipeline="image-classification"/>
|
||||
|
||||
- [`ViTHybridForImageClassification`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/pytorch/image-classification) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/image_classification.ipynb).
|
||||
- See also: [Image classification task guide](../tasks/image_classification)
|
||||
|
||||
If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! The resource should ideally demonstrate something new instead of duplicating an existing resource.
|
||||
|
||||
## ViTHybridConfig
|
||||
|
||||
[[autodoc]] ViTHybridConfig
|
||||
|
||||
## ViTHybridImageProcessor
|
||||
|
||||
[[autodoc]] ViTHybridImageProcessor
|
||||
- preprocess
|
||||
|
||||
## ViTHybridModel
|
||||
|
||||
[[autodoc]] ViTHybridModel
|
||||
- forward
|
||||
|
||||
## ViTHybridForImageClassification
|
||||
|
||||
[[autodoc]] ViTHybridForImageClassification
|
||||
- forward
|
||||
@ -1,99 +0,0 @@
|
||||
<!--Copyright 2020 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
|
||||
⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
|
||||
rendered properly in your Markdown viewer.
|
||||
|
||||
-->
|
||||
*This model was released on 2020-01-13 and added to Hugging Face Transformers on 2023-06-20.*
|
||||
|
||||
# XLM-ProphetNet
|
||||
|
||||
<div class="flex flex-wrap space-x-1">
|
||||
<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
|
||||
</div>
|
||||
|
||||
<Tip warning={true}>
|
||||
|
||||
This model is in maintenance mode only, we don't accept any new PRs changing its code.
|
||||
If you run into any issues running this model, please reinstall the last version that supported this model: v4.40.2.
|
||||
You can do so by running the following command: `pip install -U transformers==4.40.2`.
|
||||
|
||||
</Tip>
|
||||
|
||||
<div class="flex flex-wrap space-x-1">
|
||||
<a href="https://huggingface.co/models?filter=xprophetnet">
|
||||
<img alt="Models" src="https://img.shields.io/badge/All_model_pages-xprophetnet-blueviolet">
|
||||
</a>
|
||||
<a href="https://huggingface.co/spaces/docs-demos/xprophetnet-large-wiki100-cased-xglue-ntg">
|
||||
<img alt="Spaces" src="https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Spaces-blue">
|
||||
</a>
|
||||
</div>
|
||||
|
||||
**DISCLAIMER:** If you see something strange, file a [Github Issue](https://github.com/huggingface/transformers/issues/new?assignees=&labels=&template=bug-report.md&title) and assign
|
||||
@patrickvonplaten
|
||||
|
||||
## Overview
|
||||
|
||||
The XLM-ProphetNet model was proposed in [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://huggingface.co/papers/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei
|
||||
Zhang, Ming Zhou on 13 Jan, 2020.
|
||||
|
||||
XLM-ProphetNet is an encoder-decoder model and can predict n-future tokens for "ngram" language modeling instead of
|
||||
just the next token. Its architecture is identical to ProphetNet, but the model was trained on the multi-lingual
|
||||
"wiki100" Wikipedia dump. XLM-ProphetNet's model architecture and pretraining objective is same as ProphetNet, but XLM-ProphetNet was pre-trained on the cross-lingual dataset XGLUE.
|
||||
|
||||
The abstract from the paper is the following:
|
||||
|
||||
*In this paper, we present a new sequence-to-sequence pretraining model called ProphetNet, which introduces a novel
|
||||
self-supervised objective named future n-gram prediction and the proposed n-stream self-attention mechanism. Instead of
|
||||
the optimization of one-step ahead prediction in traditional sequence-to-sequence model, the ProphetNet is optimized by
|
||||
n-step ahead prediction which predicts the next n tokens simultaneously based on previous context tokens at each time
|
||||
step. The future n-gram prediction explicitly encourages the model to plan for the future tokens and prevent
|
||||
overfitting on strong local correlations. We pre-train ProphetNet using a base scale dataset (16GB) and a large scale
|
||||
dataset (160GB) respectively. Then we conduct experiments on CNN/DailyMail, Gigaword, and SQuAD 1.1 benchmarks for
|
||||
abstractive summarization and question generation tasks. Experimental results show that ProphetNet achieves new
|
||||
state-of-the-art results on all these datasets compared to the models using the same scale pretraining corpus.*
|
||||
|
||||
The Authors' code can be found [here](https://github.com/microsoft/ProphetNet).
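Since XLM-ProphetNet is a standard encoder-decoder model, generation works the same way as for ProphetNet. The sketch below runs headline generation with the XGLUE NTG checkpoint shown in the badges above; treat it as a minimal illustration, not a tuned recipe:

```py
from transformers import XLMProphetNetForConditionalGeneration, XLMProphetNetTokenizer

checkpoint = "microsoft/xprophetnet-large-wiki100-cased-xglue-ntg"
tokenizer = XLMProphetNetTokenizer.from_pretrained(checkpoint)
model = XLMProphetNetForConditionalGeneration.from_pretrained(checkpoint)

article = "Microsoft Corporation intends to officially end free support for the Windows 7 operating system after January 14, 2020."
inputs = tokenizer(article, return_tensors="pt")
summary_ids = model.generate(**inputs, num_beams=4, max_length=30)
print(tokenizer.batch_decode(summary_ids, skip_special_tokens=True))
```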
|
||||
|
||||
## Resources
|
||||
|
||||
- [Causal language modeling task guide](../tasks/language_modeling)
|
||||
- [Translation task guide](../tasks/translation)
|
||||
- [Summarization task guide](../tasks/summarization)
|
||||
|
||||
## XLMProphetNetConfig
|
||||
|
||||
[[autodoc]] XLMProphetNetConfig
|
||||
|
||||
## XLMProphetNetTokenizer
|
||||
|
||||
[[autodoc]] XLMProphetNetTokenizer
|
||||
|
||||
## XLMProphetNetModel
|
||||
|
||||
[[autodoc]] XLMProphetNetModel
|
||||
|
||||
## XLMProphetNetEncoder
|
||||
|
||||
[[autodoc]] XLMProphetNetEncoder
|
||||
|
||||
## XLMProphetNetDecoder
|
||||
|
||||
[[autodoc]] XLMProphetNetDecoder
|
||||
|
||||
## XLMProphetNetForConditionalGeneration
|
||||
|
||||
[[autodoc]] XLMProphetNetForConditionalGeneration
|
||||
|
||||
## XLMProphetNetForCausalLM
|
||||
|
||||
[[autodoc]] XLMProphetNetForCausalLM
|
||||
@ -252,8 +252,6 @@
|
||||
title: Blenderbot Small
|
||||
- local: model_doc/bloom
|
||||
title: BLOOM
|
||||
- local: model_doc/bort
|
||||
title: BORT
|
||||
- local: model_doc/byt5
|
||||
title: ByT5
|
||||
- local: model_doc/camembert
|
||||
@ -297,8 +295,6 @@
|
||||
title: Deformable DETR
|
||||
- local: model_doc/deit
|
||||
title: DeiT
|
||||
- local: model_doc/deta
|
||||
title: DETA
|
||||
- local: model_doc/detr
|
||||
title: DETR
|
||||
- local: model_doc/dinat
|
||||
|
||||
@ -1,55 +0,0 @@
|
||||
<!--Copyright 2020 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
|
||||
⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
|
||||
rendered properly in your Markdown viewer.
|
||||
|
||||
-->
|
||||
|
||||
# BORT
|
||||
|
||||
<Tip warning={true}>
|
||||
|
||||
This model is in maintenance mode only, we don't accept any new PRs changing its code.
|
||||
|
||||
If you run into any issues running this model, please reinstall the last version that supported this model: v4.30.0.
You can do so by running the following command: `pip install -U transformers==4.30.0`.
|
||||
|
||||
</Tip>
|
||||
|
||||
## Overview
|
||||
|
||||
The BORT model was proposed in [Optimal Subarchitecture Extraction for BERT](https://huggingface.co/papers/2010.10499) by
Adrian de Wynter and Daniel J. Perry. It is an optimal subset of architectural parameters for BERT, which the authors
refer to as "Bort".
|
||||
|
||||
The abstract from the paper is the following:
|
||||
|
||||
*We extract an optimal subset of architectural parameters for the BERT architecture from Devlin et al. (2018) by
applying recent breakthroughs in algorithms for neural architecture search. This optimal subset, which we refer to as
"Bort", is demonstrably smaller, having an effective (that is, not counting the embedding layer) size of 5.5% the
original BERT-large architecture, and 16% of the net size. Bort is also able to be pretrained in 288 GPU hours, which
is 1.2% of the time required to pretrain the highest-performing BERT parametric architectural variant, RoBERTa-large
(Liu et al., 2019), and about 33% of that of the world-record, in GPU hours, required to train BERT-large on the same
hardware. It is also 7.9x faster on a CPU, as well as being better performing than other compressed variants of the
architecture, and some of the non-compressed variants: it obtains performance improvements of between 0.3% and 31%,
absolute, with respect to BERT-large, on multiple public natural language understanding (NLU) benchmarks.*
|
||||
|
||||
This model was contributed by [stefan-it](https://huggingface.co/stefan-it). The original code can be found [here](https://github.com/alexa/bort/).
|
||||
|
||||
## Usage tips
|
||||
|
||||
- BORT's model architecture is based on BERT. Refer to [BERT's documentation page](bert) for the model's API reference
  as well as usage examples.
- BORT uses the RoBERTa tokenizer instead of the BERT tokenizer. Refer to [RoBERTa's documentation page](roberta) for the
  tokenizer's API reference as well as usage examples; a minimal loading sketch is also shown below.
- BORT requires a specific fine-tuning algorithm, called [Agora](https://adewynter.github.io/notes/bort_algorithms_and_applications.html#fine-tuning-with-algebraic-topology),
  which is sadly not open-sourced yet. It would be very useful for the community if someone tried to implement the
  algorithm in order to make BORT fine-tuning work.
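The sketch below illustrates the first two tips, assuming the `amazon/bort` checkpoint name from the original release is still available on the Hub; `AutoTokenizer` resolves the RoBERTa-style tokenizer that BORT ships with.

```py
from transformers import AutoTokenizer, AutoModel

tokenizer = AutoTokenizer.from_pretrained("amazon/bort")  # RoBERTa-style tokenizer
model = AutoModel.from_pretrained("amazon/bort")          # BERT-style architecture

inputs = tokenizer("BORT is a compressed variant of BERT.", return_tensors="pt")
outputs = model(**inputs)
print(outputs.last_hidden_state.shape)
```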
|
||||
@ -1,64 +0,0 @@
|
||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
|
||||
⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
|
||||
rendered properly in your Markdown viewer.
|
||||
|
||||
-->
|
||||
|
||||
# DETA
|
||||
|
||||
## Overview
|
||||
|
||||
The DETA model was proposed in [NMS Strikes Back](https://huggingface.co/papers/2212.06137) by Jeffrey Ouyang-Zhang, Jang Hyun Cho, Xingyi Zhou and Philipp Krähenbühl.
DETA (short for Detection Transformers with Assignment) improves [Deformable DETR](deformable_detr) by replacing the one-to-one bipartite Hungarian matching loss
with the one-to-many label assignments used in traditional detectors with non-maximum suppression (NMS). This leads to significant gains of up to 2.5 mAP.
|
||||
|
||||
The abstract from the paper is the following:
|
||||
|
||||
*Detection Transformer (DETR) directly transforms queries to unique objects by using one-to-one bipartite matching during training and enables end-to-end object detection. Recently, these models have surpassed traditional detectors on COCO with undeniable elegance. However, they differ from traditional detectors in multiple designs, including model architecture and training schedules, and so the effectiveness of one-to-one matching is not fully understood. In this work, we conduct a strict comparison between the one-to-one Hungarian matching in DETRs and the one-to-many label assignments in traditional detectors with non-maximum supervision (NMS). Surprisingly, we observe one-to-many assignments with NMS consistently outperform standard one-to-one matching under the same setting, with a significant gain of up to 2.5 mAP. Our detector that trains Deformable-DETR with traditional IoU-based label assignment achieves 50.2 COCO mAP within 12 epochs (1x schedule) with a ResNet50 backbone, outperforming all existing traditional or transformer-based detectors in this setting. On multiple datasets, schedules, and architectures, we consistently show that bipartite matching is unnecessary for performant detection transformers. Furthermore, we attribute the success of detection transformers to their expressive transformer architecture.*
|
||||
|
||||
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/model_doc/deta_architecture.jpg"
|
||||
alt="drawing" width="600"/>
|
||||
|
||||
<small> DETA overview. Taken from the <a href="https://huggingface.co/papers/2212.06137">original paper</a>. </small>
|
||||
|
||||
This model was contributed by [nielsr](https://huggingface.co/nielsr).
The original code can be found [here](https://github.com/jozhang97/DETA).
|
||||
|
||||
## Resources
|
||||
|
||||
A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with DETA.
|
||||
|
||||
- Demo notebooks for DETA can be found [here](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/DETA).
- See also: [Object detection task guide](../tasks/object_detection)
|
||||
|
||||
If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! The resource should ideally demonstrate something new instead of duplicating an existing resource.
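A minimal inference sketch follows; it assumes the `jozhang97/deta-resnet-50` checkpoint from the original authors, but any DETA checkpoint should work the same way.

```py
import torch
import requests
from PIL import Image
from transformers import AutoImageProcessor, DetaForObjectDetection

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)

processor = AutoImageProcessor.from_pretrained("jozhang97/deta-resnet-50")
model = DetaForObjectDetection.from_pretrained("jozhang97/deta-resnet-50")

inputs = processor(images=image, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)

# post-processing applies the NMS-style one-to-many assignment described above
results = processor.post_process_object_detection(outputs, target_sizes=[image.size[::-1]], threshold=0.5)[0]
for score, label in zip(results["scores"], results["labels"]):
    print(model.config.id2label[label.item()], round(score.item(), 3))
```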
|
||||
|
||||
## DetaConfig
|
||||
|
||||
[[autodoc]] DetaConfig
|
||||
|
||||
## DetaImageProcessor
|
||||
|
||||
[[autodoc]] DetaImageProcessor
|
||||
- preprocess
|
||||
- post_process_object_detection
|
||||
|
||||
## DetaModel
|
||||
|
||||
[[autodoc]] DetaModel
|
||||
- forward
|
||||
|
||||
## DetaForObjectDetection
|
||||
|
||||
[[autodoc]] DetaForObjectDetection
|
||||
- forward
|
||||
@ -1193,8 +1193,6 @@
|
||||
- sections:
|
||||
- local: in_translation
|
||||
title: Decision Transformer
|
||||
- local: model_doc/trajectory_transformer
|
||||
title: Trajectory Transformer
|
||||
title: Reinforcement learning models
|
||||
- sections:
|
||||
- local: model_doc/autoformer
|
||||
@ -1210,10 +1208,6 @@
|
||||
- local: in_translation
|
||||
title: TimesFM
|
||||
title: Time series models
|
||||
- sections:
|
||||
- local: model_doc/graphormer
|
||||
title: Graphormer
|
||||
title: Graph models
|
||||
title: Models
|
||||
- sections:
|
||||
- local: internal/modeling_utils
|
||||
|
||||
@ -1,52 +0,0 @@
|
||||
<!--Copyright 2022 The HuggingFace Team and Microsoft. All rights reserved.
|
||||
|
||||
Licensed under the MIT License; you may not use this file except in compliance with
|
||||
the License.
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
|
||||
⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
|
||||
rendered properly in your Markdown viewer.
|
||||
|
||||
-->
|
||||
|
||||
# Graphormer[[graphormer]]
|
||||
|
||||
<Tip warning={true}>
|
||||
|
||||
This model is in maintenance mode only, we don't accept any new PRs changing its code.
If you run into any issues running this model, please reinstall the last version that supported this model: v4.40.2. You can do so by running the following command: `pip install -U transformers==4.40.2`.
|
||||
|
||||
</Tip>
|
||||
|
||||
## Overview[[overview]]
|
||||
|
||||
The Graphormer model was proposed in [Do Transformers Really Perform Bad for Graph Representation?](https://huggingface.co/papers/2106.05234) by Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen and Tie-Yan Liu. It is a Graph Transformer model, modified to allow computations on graphs instead of text sequences by generating embeddings and features of interest during preprocessing and collation, then using a modified attention.
|
||||
|
||||
The abstract from the paper is the following:
|
||||
|
||||
*The Transformer architecture has become a dominant choice in many domains, such as natural language processing and computer vision. Yet, it has not achieved competitive performance on popular leaderboards of graph-level prediction compared to mainstream GNN variants. Therefore, it remains a mystery how Transformers could perform well for graph representation learning. In this paper, we solve this mystery by presenting Graphormer, which is built upon the standard Transformer architecture, and could attain excellent results on a broad range of graph representation learning tasks, especially on the recent OGB Large-Scale Challenge. Our key insight to utilizing Transformer in the graph is the necessity of effectively encoding the structural information of a graph into the model. To this end, we propose several simple yet effective structural encoding methods to help Graphormer better model graph-structured data. Besides, we mathematically characterize the expressive power of Graphormer and exhibit that with our ways of encoding the structural information of graphs, many popular GNN variants could be covered as special cases of Graphormer.*
|
||||
|
||||
This model was contributed by [clefourrier](https://huggingface.co/clefourrier). The original code can be found [here](https://github.com/microsoft/Graphormer).
|
||||
|
||||
## Usage tips[[usage-tips]]
|
||||
|
||||
This model will not work well on large graphs (more than 100 nodes/edges), as it will make the memory explode. You can reduce the batch size, increase your RAM, or decrease the `UNREACHABLE_NODE_DISTANCE` parameter in algos_graphormer.pyx, but it will be hard to go above 700 nodes/edges anyway.
|
||||
|
||||
This model does not use a tokenizer, but a special collator instead during training.
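A minimal loading sketch for graph classification is shown below. The `clefourrier/graphormer-base-pcqm4mv2` checkpoint name comes from the original integration and is an assumption here; the graph-preprocessing collator is left out because its import path changed across versions.

```py
from transformers import GraphormerForGraphClassification

model = GraphormerForGraphClassification.from_pretrained("clefourrier/graphormer-base-pcqm4mv2")
print(sum(p.numel() for p in model.parameters()))  # parameter count of the pretrained graph model
```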
|
||||
|
||||
## GraphormerConfig[[transformers.GraphormerConfig]]
|
||||
|
||||
[[autodoc]] GraphormerConfig
|
||||
|
||||
## GraphormerModel[[transformers.GraphormerModel]]
|
||||
|
||||
[[autodoc]] GraphormerModel
|
||||
- forward
|
||||
|
||||
## GraphormerForGraphClassification[[transformers.GraphormerForGraphClassification]]
|
||||
|
||||
[[autodoc]] GraphormerForGraphClassification
|
||||
- forward
|
||||
@ -1,49 +0,0 @@
|
||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
|
||||
⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
|
||||
rendered properly in your Markdown viewer.
|
||||
|
||||
-->
|
||||
|
||||
# Trajectory Transformer[[trajectory-transformer]]
|
||||
|
||||
<Tip warning={true}>
|
||||
|
||||
|
||||
This model is in maintenance mode only, we don't accept any new PRs changing its code.
If you run into any issues running this model, please reinstall the last version that supported this model: v4.30.0. You can do so by running the following command: `pip install -U transformers==4.30.0`.
|
||||
|
||||
</Tip>
|
||||
|
||||
## Overview[[overview]]
|
||||
|
||||
The Trajectory Transformer model was proposed in [Offline Reinforcement Learning as One Big Sequence Modeling Problem](https://huggingface.co/papers/2106.02039) by Michael Janner, Qiyang Li and Sergey Levine.
|
||||
|
||||
The abstract from the paper is the following:
|
||||
|
||||
*Reinforcement learning (RL) is typically concerned with estimating stationary policies or single-step models, leveraging the Markov property to factorize problems in time. However, we can also view RL as a generic sequence modeling problem, with the goal being to produce a sequence of actions that leads to a sequence of high rewards. Viewed in this way, it is tempting to consider whether high-capacity sequence prediction models that work well in other domains, such as natural language processing, can also provide effective solutions to the RL problem. To this end, we explore how RL can be tackled with the tools of sequence modeling, using a Transformer architecture to model distributions over trajectories and repurposing beam search as a planning algorithm. Framing RL as sequence modeling simplifies a range of design decisions, allowing us to dispense with many of the components common in offline RL algorithms. We demonstrate the flexibility of this approach across long-horizon dynamics prediction, imitation learning, goal-conditioned RL, and offline RL. Further, we show that this approach can be combined with existing model-free algorithms to yield a state-of-the-art planner in sparse-reward, long-horizon tasks.*
|
||||
|
||||
This model was contributed by [CarlCochet](https://huggingface.co/CarlCochet).
The original code can be found [here](https://github.com/jannerm/trajectory-transformer).
|
||||
|
||||
## Usage tips[[usage-tips]]
|
||||
|
||||
This Transformer is used for deep reinforcement learning. To use it, you need to create sequences from actions, states and rewards from all previous timesteps. The model treats all these elements together as one big sequence (a trajectory).
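A minimal sketch of that input convention is shown below. Both the checkpoint name and the `trajectories` keyword follow the original integration and are assumptions here, so double-check them against the API reference further down.

```py
import torch
from transformers import TrajectoryTransformerModel

model = TrajectoryTransformerModel.from_pretrained(
    "CarlCochet/trajectory-transformer-halfcheetah-medium-v2"
)
model.eval()

# one trajectory: discretized (state, action, reward) tokens flattened into a single sequence
trajectories = torch.randint(0, model.config.vocab_size, (1, 24))
with torch.no_grad():
    outputs = model(trajectories=trajectories)
print(outputs.logits.shape)
```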
|
||||
|
||||
## TrajectoryTransformerConfig[[transformers.TrajectoryTransformerConfig]]
|
||||
|
||||
[[autodoc]] TrajectoryTransformerConfig
|
||||
|
||||
## TrajectoryTransformerModel[[transformers.TrajectoryTransformerModel]]
|
||||
|
||||
[[autodoc]] TrajectoryTransformerModel
|
||||
- forward
|
||||
@ -302,10 +302,9 @@ class PreTrainedConfig(PushToHubMixin):
|
||||
self.sep_token_id = sep_token_id
|
||||
self.decoder_start_token_id = decoder_start_token_id
|
||||
|
||||
# Retrocompatibility: Parameters for sequence generation. While we will keep the ability to load these
|
||||
# parameters, saving them will be deprecated. In a distant future, we won't need to load them.
|
||||
for parameter_name, default_value in self._get_global_generation_defaults().items():
|
||||
setattr(self, parameter_name, kwargs.pop(parameter_name, default_value))
|
||||
# Parameters for sequence generation saved in the config are popped instead of loading them.
|
||||
for parameter_name in self._get_global_generation_defaults().keys():
|
||||
kwargs.pop(parameter_name, None)
|
||||
|
||||
# Name or path to the pretrained checkpoint
|
||||
self._name_or_path = str(kwargs.pop("name_or_path", ""))
|
||||
@ -445,14 +444,11 @@ class PreTrainedConfig(PushToHubMixin):
|
||||
|
||||
non_default_generation_parameters = self._get_non_default_generation_parameters()
|
||||
if len(non_default_generation_parameters) > 0:
|
||||
# TODO (joao): this should be an exception if the user has modified the loaded config. See #33886
|
||||
warnings.warn(
|
||||
raise ValueError(
|
||||
"Some non-default generation parameters are set in the model config. These should go into either a) "
|
||||
"`model.generation_config` (as opposed to `model.config`); OR b) a GenerationConfig file "
|
||||
"(https://huggingface.co/docs/transformers/generation_strategies#save-a-custom-decoding-strategy-with-your-model)."
|
||||
"This warning will become an exception in the future."
|
||||
f"\nNon-default generation parameters: {str(non_default_generation_parameters)}",
|
||||
UserWarning,
|
||||
)
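# A minimal sketch of the intended usage after this change (illustrative attribute only):
#     model.generation_config.max_new_tokens = 128   # generation settings belong here
#     # model.config.max_new_tokens = 128            # would now make save_pretrained() raise the error above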
|
||||
|
||||
os.makedirs(save_directory, exist_ok=True)
|
||||
@ -1101,40 +1097,18 @@ class PreTrainedConfig(PushToHubMixin):
|
||||
non_default_generation_parameters = {}
|
||||
decoder_attribute_name = None
|
||||
|
||||
# Some composite models don't have a default config, use their decoder config as a fallback for default values
|
||||
# If no known pattern is matched, then `default_config = None` -> check against the global generation defaults
|
||||
if not self.has_no_defaults_at_init:
|
||||
default_config = self.__class__()
|
||||
else:
|
||||
decoder_config = self.get_text_config(decoder=True)
|
||||
if decoder_config is not self:
|
||||
default_config = decoder_config.__class__()
|
||||
else:
|
||||
default_config = None
|
||||
|
||||
# If it is a composite model, we want to check the subconfig that will be used for generation
|
||||
self_decoder_config = self if decoder_attribute_name is None else getattr(self, decoder_attribute_name)
|
||||
|
||||
for parameter_name, default_global_value in self._get_global_generation_defaults().items():
|
||||
if hasattr(self_decoder_config, parameter_name):
|
||||
is_default_in_config = is_default_generation_value = None
|
||||
parameter_value = getattr(self_decoder_config, parameter_name)
|
||||
# Three cases in which is okay for the model config to hold generation config parameters:
|
||||
parameter_value = getattr(self_decoder_config, parameter_name, None)
|
||||
# Two cases in which is okay for the model config to hold generation config parameters:
|
||||
# 1. The parameter is set to `None`, effectively delegating its value to the generation config
|
||||
if parameter_value is None:
|
||||
# 2. The parameter is set the global generation defaults
|
||||
if parameter_value is None or parameter_value == default_global_value:
|
||||
continue
|
||||
# 2. If we have a default config, then the instance should hold the same generation defaults
|
||||
if default_config is not None:
|
||||
is_default_in_config = parameter_value == getattr(default_config, parameter_name)
|
||||
# 3. if we don't have a default config, then the instance should hold the global generation defaults
|
||||
else:
|
||||
is_default_generation_value = parameter_value == default_global_value
|
||||
|
||||
is_non_default = (is_default_in_config is False) or (
|
||||
is_default_in_config is None and is_default_generation_value is False
|
||||
)
|
||||
if is_non_default:
|
||||
non_default_generation_parameters[parameter_name] = getattr(self_decoder_config, parameter_name)
|
||||
non_default_generation_parameters[parameter_name] = getattr(self_decoder_config, parameter_name)
|
||||
|
||||
return non_default_generation_parameters
|
||||
|
||||
|
||||
@ -1731,10 +1731,8 @@ SLOW_TO_FAST_CONVERTERS = {
|
||||
"OpenAIGPTTokenizer": OpenAIGPTConverter,
|
||||
"PegasusTokenizer": PegasusConverter,
|
||||
"Qwen2Tokenizer": Qwen2Converter,
|
||||
"RealmTokenizer": BertConverter,
|
||||
"ReformerTokenizer": ReformerConverter,
|
||||
"RemBertTokenizer": RemBertConverter,
|
||||
"RetriBertTokenizer": BertConverter,
|
||||
"RobertaTokenizer": RobertaConverter,
|
||||
"RoFormerTokenizer": RoFormerConverter,
|
||||
"SeamlessM4TTokenizer": SeamlessM4TConverter,
|
||||
|
||||
@ -46,22 +46,6 @@ if TYPE_CHECKING:
|
||||
|
||||
logger = logging.get_logger(__name__)
|
||||
|
||||
str_to_torch_dtype = {
|
||||
"BOOL": torch.bool,
|
||||
"U8": torch.uint8,
|
||||
"I8": torch.int8,
|
||||
"I16": torch.int16,
|
||||
"F16": torch.float16,
|
||||
"BF16": torch.bfloat16,
|
||||
"I32": torch.int32,
|
||||
"F32": torch.float32,
|
||||
"F64": torch.float64,
|
||||
"I64": torch.int64,
|
||||
"F8_E4M3": torch.float8_e4m3fn,
|
||||
"F8_E5M2": torch.float8_e5m2,
|
||||
}
|
||||
|
||||
|
||||
logger = logging.get_logger(__name__)
|
||||
|
||||
|
||||
@ -148,34 +132,33 @@ class ConversionOps:
|
||||
class Chunk(ConversionOps):
|
||||
"""Split a tensor along ``dim`` into equally sized chunks or using explicit ``sizes``."""
|
||||
|
||||
reverse_op: type[ConversionOps]
|
||||
|
||||
def __init__(self, dim: int = 0, chunks: Optional[int] = None, sizes: Optional[Sequence[int]] = None):
|
||||
if chunks is None and sizes is None:
|
||||
raise ValueError("`chunks` or `sizes` must be provided for Chunk operations.")
|
||||
if chunks is not None and chunks <= 0:
|
||||
raise ValueError("`chunks` must be a strictly positive integer.")
|
||||
self.dim = dim
|
||||
self.chunks = chunks
|
||||
self.sizes = list(sizes) if sizes is not None else None
|
||||
self.reverse_op = Concatenate
|
||||
|
||||
@property
|
||||
def reverse_op(self) -> ConversionOps:
|
||||
return Concatenate(self.dim)
|
||||
|
||||
def convert(self, value: torch.Tensor, concrete_target_keys=None, *args, **kwargs) -> list[torch.Tensor]:
|
||||
# chunk requires a single tensor input (maybe not when saving actually!)
|
||||
update_ = []
|
||||
if concrete_target_keys is not None: # when saving we have multiple tensors
|
||||
for layer in value:
|
||||
for tensors in layer:
|
||||
chunk_size = len(concrete_target_keys)
|
||||
update_ += [dict(zip(concrete_target_keys, torch.chunk(tensors, chunks=chunk_size, dim=self.dim)))]
|
||||
return update_
|
||||
def convert(self, value: torch.Tensor, *args, **kwargs) -> list[torch.Tensor]:
|
||||
# chunk requires a single tensor input
|
||||
if len(value) != 1 or len(value[0]) != 1:
|
||||
raise ValueError("Chunk operation requires a single tensor input.")
|
||||
return list(torch.chunk(value[0][0], self.chunks, dim=self.dim))
|
||||
|
||||
|
||||
class Concatenate(ConversionOps):
|
||||
"""Concatenate tensors along `dim` using a reusable buffer."""
|
||||
|
||||
reverse_op: type[ConversionOps]
|
||||
|
||||
def __init__(self, dim: int = 0):
|
||||
self.dim = dim
|
||||
|
||||
@property
|
||||
def reverse_op(self) -> ConversionOps:
|
||||
return Chunk(self.dim)
|
||||
self.reverse_op = Chunk
|
||||
|
||||
@torch.no_grad
|
||||
def convert(self, value: Sequence[torch.Tensor], *args, **kwargs) -> torch.Tensor:
|
||||
@ -188,7 +171,7 @@ class Concatenate(ConversionOps):
|
||||
return torch.cat(tuple(tensors), dim=self.dim)
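# A minimal torch sketch (not part of the classes above) of why Chunk and Concatenate are declared as each
# other's reverse op: splitting along a dim and concatenating back along the same dim round-trips the tensor.
import torch
fused = torch.arange(12.0).reshape(4, 3)                      # e.g. a fused projection weight
first, second = torch.chunk(fused, chunks=2, dim=0)           # what Chunk(dim=0, chunks=2) performs
assert torch.equal(torch.cat((first, second), dim=0), fused)  # what Concatenate(dim=0) undoes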
|
||||
|
||||
|
||||
class MergeModulelist(ConversionOps):
|
||||
class MergeModulelist(Concatenate):
|
||||
"""
|
||||
Merge a list of tensors into a single tensor along the first dimension.
|
||||
We explicitly define this because for EP or TP you want to make sure you know what you are doing!
|
||||
@ -196,11 +179,8 @@ class MergeModulelist(ConversionOps):
|
||||
"""
|
||||
|
||||
def __init__(self, dim: int = 0):
|
||||
self.dim = dim
|
||||
|
||||
@property
|
||||
def reverse_op(self) -> ConversionOps:
|
||||
return SplitModulelist(self.dim)
|
||||
super().__init__(dim=dim)
|
||||
self.reverse_op = SplitModulelist
|
||||
|
||||
@torch.no_grad
|
||||
def convert(self, value: Sequence[torch.Tensor], *args, **kwargs) -> list[torch.Tensor]:
|
||||
@ -216,24 +196,26 @@ class MergeModulelist(ConversionOps):
|
||||
class SplitModulelist(ConversionOps):
|
||||
"""Inverse of :class:`MergeModulelist` using explicit split sizes per group."""
|
||||
|
||||
def __init__(self, dim: int = 0):
|
||||
self.dim = dim
|
||||
|
||||
@property
|
||||
def reverse_op(self) -> ConversionOps:
|
||||
return MergeModulelist(self.dim)
|
||||
def __init__(self, sizes: Sequence[Sequence[int]], dim: int = 0):
|
||||
if not isinstance(sizes, Sequence) or not all(isinstance(sub, Sequence) and sub for sub in sizes):
|
||||
raise ValueError("`sizes` must be a sequence of non-empty sequences of integers.")
|
||||
self.sizes = [list(sub) for sub in sizes]
|
||||
self.dim = dim
|
||||
self.reverse_op = MergeModulelist
|
||||
|
||||
@torch.no_grad
|
||||
def convert(self, value: Sequence[torch.Tensor], concrete_target_keys=None, config=None, *args, **kwargs) -> list[list[torch.Tensor]]:
|
||||
result = []
|
||||
for i, layers in enumerate(value):
|
||||
tmp = {}
|
||||
if not isinstance(layers, dict):
|
||||
layers = {concrete_target_keys[i]: layers[i] for i in range(len(layers))}
|
||||
for k, v in layers.items():
|
||||
splits = torch.chunk(v, config.num_experts, dim=self.dim)
|
||||
tmp.update({k.replace("*", str(i)): v for i, v in enumerate(splits)})
|
||||
result.append(tmp)
|
||||
def convert(self, value: Sequence[torch.Tensor], *, context: dict[str, Any]) -> list[list[torch.Tensor]]:
|
||||
if not isinstance(value, Sequence):
|
||||
raise TypeError("SplitModulelist expects a sequence of tensors.")
|
||||
if len(value) != len(self.sizes):
|
||||
raise ValueError("Number of tensors does not match the provided split specifications.")
|
||||
|
||||
result: list[list[torch.Tensor]] = []
|
||||
for tensor, split_sizes in zip(value, self.sizes):
|
||||
if not isinstance(tensor, torch.Tensor):
|
||||
raise TypeError("SplitModulelist can only split torch.Tensor instances.")
|
||||
splits = torch.split(tensor, split_sizes, dim=self.dim)
|
||||
result.append(list(splits))
|
||||
return result
|
||||
|
||||
|
||||
@ -314,21 +296,16 @@ class ConversionEntry:
|
||||
GLOBAL_WORKERS = min(16, (os.cpu_count() or 8) * 2) # NVMe: 8-16; HDD/NFS: 2-4
|
||||
|
||||
|
||||
def _materialize_copy(tensor, dtype=None):
|
||||
def _materialize_copy(tensor, device=None, dtype=None):
|
||||
tensor = tensor[...]
|
||||
if dtype is not None:
|
||||
tensor = tensor.to(dtype)
|
||||
if dtype is not None or device is not None:
|
||||
tensor = tensor.to(device=device, dtype=dtype)
|
||||
return tensor
|
||||
|
||||
def spawn_dematerialize(thread_pool, tensor, dtype=None) -> Future:
|
||||
def _job():
|
||||
return tensor.detach() #.cpu()
|
||||
|
||||
return thread_pool.submit(_job)
|
||||
|
||||
def spawn_materialize(thread_pool, tensor, dtype=None) -> Future:
|
||||
def spawn_materialize(thread_pool, tensor, device=None, dtype=None) -> Future:
|
||||
def _job():
|
||||
return _materialize_copy(tensor, dtype)
|
||||
return _materialize_copy(tensor, device, dtype)
|
||||
|
||||
return thread_pool.submit(_job)
|
||||
|
||||
@ -376,7 +353,7 @@ def log_to_misc(
|
||||
values, target_keys = extras
|
||||
descriptor = f"{op_name} " if op_name else ""
|
||||
misc[layer_name] = (
|
||||
f"{e}\nError: {descriptor}on tensors destined for {target_keys}. Ckpt contains: {values}"
|
||||
f"{e}\nError: {descriptor}on tensors destined for {target_keys}. Ckpt contains: {len(values[0])}"
|
||||
)
|
||||
elif isinstance(extras, str):
|
||||
suffix = f" via {op_name}" if op_name else ""
|
||||
@ -396,11 +373,15 @@ def set_param_for_module(
|
||||
missing_keys: MutableSet[str],
|
||||
misc: MutableMapping[str, Any],
|
||||
distributed_operation: Optional[TensorParallelLayer],
|
||||
hf_quantizer: HfQuantizer,
|
||||
):
|
||||
with log_to_misc(layer_name, misc, layer_name):
|
||||
module_path, _, param_name = layer_name.rpartition(".")
|
||||
module_obj = model.get_submodule(module_path) if module_path else model
|
||||
param_value = param_value[0] if isinstance(param_value, list) else param_value[...]
|
||||
if isinstance(param_value, list):
|
||||
param_value = param_value[0]
|
||||
elif not isinstance(param_value, torch.nn.Parameter):
|
||||
param_value = param_value[...]
|
||||
ref = getattr(module_obj, param_name)
|
||||
|
||||
use_dtensor = hasattr(distributed_operation, "use_dtensor") and distributed_operation.use_dtensor
|
||||
@ -422,7 +403,7 @@ def set_param_for_module(
|
||||
|
||||
# Remove from missing keys (it's either mismatched, or all good)
|
||||
missing_keys.discard(layer_name)
|
||||
if ref is not None and ref.shape != param_value.shape:
|
||||
if ref is not None and ref.shape != param_value.shape and hf_quantizer is None:
|
||||
mismatch_keys.add((layer_name, param_value.shape, ref.shape))
|
||||
module_obj.param_name._is_hf_initialized = False # Needs to be initialized
|
||||
else:
|
||||
@ -441,7 +422,7 @@ def convert_and_load_state_dict_in_model(
|
||||
state_dict: dict[str, Any],
|
||||
weight_mapping: dict[str, WeightConverter] | None,
|
||||
tp_plan: dict[str, str] | None,
|
||||
quantizer: HfQuantizer | None,
|
||||
hf_quantizer: HfQuantizer | None,
|
||||
dtype: torch.dtype | None = None,
|
||||
device_map: dict | None = None,
|
||||
dtype_plan: dict | None = None,
|
||||
@ -454,7 +435,10 @@ def convert_and_load_state_dict_in_model(
|
||||
|
||||
prefix = model.base_model_prefix
|
||||
tp_plan = tp_plan or {} # {glob_pattern: plan_obj_or_key}
|
||||
device_map = device_map or {} # {exact_target_key: device}
|
||||
device_map = device_map or {"": "cpu"} # {exact_target_key: device}
|
||||
device_map_regex = re.compile(
|
||||
"|".join(rf"({k})" for k in sorted(device_map.keys(), key=lambda x: x.count("."), reverse=True))
|
||||
)
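# A minimal sketch (hypothetical keys, not part of the function) of how the regex above resolves a device:
# the most specific device_map key that prefixes the parameter name wins, with "" acting as the fallback.
#     example_map = {"model.layers.0": "cuda:0", "": "cpu"}
#     example_regex = re.compile("|".join(rf"({k})" for k in sorted(example_map, key=lambda x: x.count("."), reverse=True)))
#     example_regex.match("model.layers.0.self_attn.q_proj.weight").group()  # -> "model.layers.0", i.e. "cuda:0"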
|
||||
dtype_plan = dtype_plan or {} # {glob_pattern: dtype}
|
||||
weight_mapping = weight_mapping or {} # {glob_pattern: WeightConverter}
|
||||
meta_model_state_dict = model.state_dict()
|
||||
@ -503,20 +487,14 @@ def convert_and_load_state_dict_in_model(
|
||||
unexpected_keys.add(t)
|
||||
continue
|
||||
|
||||
if quantizer is not None and quantizer.param_needs_quantization(model, t):
|
||||
if quantizer.__class__.__name__ == "FineGrainedFP8HfQuantizer":
|
||||
from .integrations.finegrained_fp8 import Fp8Quantize
|
||||
|
||||
converter.quantization_operation = Fp8Quantize() # TODO support other methods
|
||||
else:
|
||||
raise ValueError("This quantization method is gonna be supported SOOOON")
|
||||
else:
|
||||
_dtype = dtype
|
||||
matched_dtype_pattern = match_glob(t, dtype_policy_alt, dtype_policy_by_group_name)
|
||||
if matched_dtype_pattern is not None:
|
||||
_dtype = dtype_plan[matched_dtype_pattern]
|
||||
elif empty_param.dtype != _dtype:
|
||||
_dtype = empty_param.dtype
|
||||
if hf_quantizer is not None and hf_quantizer.param_needs_quantization(model, t):
|
||||
converter.quantization_operation = hf_quantizer.get_quantize_ops()
|
||||
_dtype = dtype
|
||||
matched_dtype_pattern = match_glob(t, dtype_policy_alt, dtype_policy_by_group_name)
|
||||
if matched_dtype_pattern is not None:
|
||||
_dtype = dtype_plan[matched_dtype_pattern]
|
||||
elif empty_param.dtype != _dtype:
|
||||
_dtype = empty_param.dtype
|
||||
|
||||
first_target_key = new_target_key[0]
|
||||
target_key = "|".join(new_target_key)
|
||||
@ -541,7 +519,9 @@ def convert_and_load_state_dict_in_model(
|
||||
)
|
||||
|
||||
if future is None: # If not TP, async materialize the tensors. TODO handle disk offload?
|
||||
future = spawn_materialize(thread_pool, tensor, _dtype)
|
||||
device_match = device_map_regex.match(first_target_key)
|
||||
param_device = device_map[device_match.group()] if device_match else device_map.get("", "cpu")
|
||||
future = spawn_materialize(thread_pool, tensor, param_device, _dtype)
|
||||
entry.collected_tensors[target_key].setdefault(converter_key, []).append(future)
|
||||
|
||||
# 2. Actually convert the ckpt
|
||||
@ -577,9 +557,7 @@ def convert_and_load_state_dict_in_model(
|
||||
if op := converter.quantization_operation:
|
||||
with log_to_misc(layer_name, misc, op=op):
|
||||
realized_value.update(
|
||||
op.convert(
|
||||
{k: realized_value.pop(k)}, quant_config=quantizer.quantization_config
|
||||
)
|
||||
op.convert({k: realized_value.pop(k)}, model=model, missing_keys=missing_keys)
|
||||
)
|
||||
|
||||
for k, output_value in realized_value.items():
|
||||
@ -593,6 +571,7 @@ def convert_and_load_state_dict_in_model(
|
||||
missing_keys,
|
||||
misc,
|
||||
converter.distributed_operation,
|
||||
hf_quantizer,
|
||||
)
|
||||
|
||||
except SkipLayer:
|
||||
@ -605,116 +584,19 @@ def convert_and_load_state_dict_in_model(
|
||||
|
||||
|
||||
# TODO this is not done yet!
|
||||
def revert_weight_conversion(model, state_dict, weight_mapping):
|
||||
def revert_weight_conversion(model, state_dict):
|
||||
mapping = getattr(model, "_checkpoint_conversion_mapping", {}) # IDK why but setting this will fail all llava.
|
||||
reverse_key_mapping = [(v, k) for k, v in mapping.items()] # todo also take it into account
|
||||
|
||||
tp_plan = model.tp_plan or {} # {glob_pattern: plan_obj_or_key}
|
||||
weight_mapping = weight_mapping or {} # {glob_pattern: WeightConverter}
|
||||
|
||||
misc = {}
|
||||
final_state_dict = {}
|
||||
# Global thread_pool
|
||||
thread_pool = ThreadPoolExecutor(max_workers=GLOBAL_WORKERS)
|
||||
|
||||
_patterns = list(itertools.chain.from_iterable([k.target_keys for k in weight_mapping]))
|
||||
target_to_source = {sk: k for k in weight_mapping for sk in k.target_keys}
|
||||
weight_pattern_alt, weight_pattern_by_group_name = build_glob_alt(_patterns)
|
||||
tp_plan_alt, tp_plan_by_group_name = build_glob_alt(list(tp_plan.keys()))
|
||||
|
||||
state_dict = sorted(state_dict.items(), key=lambda kv: dot_natural_key(kv[0]))
|
||||
# 1. Create the conversion entries
|
||||
by_conversion_pattern: dict[str, ConversionEntry] = {}
|
||||
for original_key, tensor in state_dict:
|
||||
matched_pattern = match_glob(original_key, weight_pattern_alt, weight_pattern_by_group_name)
|
||||
if matched_pattern is not None:
|
||||
converter = target_to_source[matched_pattern] # TODO make sure it's the ref
|
||||
sub_with_extractor = partial(re.sub, matched_pattern.replace("*", r"(\d+)"), string=original_key)
|
||||
entry_key = "|".join(map(sub_with_extractor, converter.source_keys))
|
||||
target_key = entry_key # at this point we don't know how many we'll collect :)
|
||||
entry: ConversionEntry = by_conversion_pattern.setdefault(entry_key, ConversionEntry(converter))
|
||||
converter_key = sub_with_extractor(matched_pattern)
|
||||
else:
|
||||
converter = WeightConverter(original_key)
|
||||
converter_key = entry_key = target_key = original_key
|
||||
entry = by_conversion_pattern.setdefault(converter_key, ConversionEntry(converter))
|
||||
|
||||
if False and quantizer is not None and quantizer.param_needs_quantization(model, t):
|
||||
if quantizer.__class__.__name__ == "FineGrainedFP8HfQuantizer":
|
||||
from .integrations.finegrained_fp8 import Fp8Quantize
|
||||
|
||||
converter.quantization_operation = Fp8Quantize() # TODO support other methods
|
||||
else:
|
||||
raise ValueError("This quantization method is gonna be supported SOOOON")
|
||||
|
||||
future = None
|
||||
# if device_mesh:
|
||||
# if matched_tp_pattern := match_glob(first_target_key, tp_plan_alt, tp_plan_by_group_name):
|
||||
# if getattr(converter, "distributed_operation", {}) is None:
|
||||
# tp_layer = ALL_PARALLEL_STYLES[model.tp_plan[matched_tp_pattern]].__class__
|
||||
# converter.distributed_operation = tp_layer(
|
||||
# device_mesh=device_mesh, rank=device_map[""].index, empty_param=empty_param.clone()
|
||||
# )
|
||||
# # VERY IMPORTANT: this tells us whether we collected stuff or not.
|
||||
# shard_index = len(entry.collected_tensors[target_key].get(converter_key, []))
|
||||
# future = spawn_tp_dematerialize(
|
||||
# thread_pool,
|
||||
# tensor,
|
||||
# converter.distributed_operation,
|
||||
# shard_index,
|
||||
# )
|
||||
|
||||
if future is None: # If not TP, async materialize the tensors. TODO handle disk offload?
|
||||
future = spawn_dematerialize(thread_pool, tensor) # -> should we put it to CPU always?
|
||||
entry.collected_tensors[target_key].setdefault(converter_key, []).append(future)
|
||||
|
||||
# 2. Actually convert the ckpt
|
||||
keys = list(by_conversion_pattern.keys())
|
||||
|
||||
|
||||
with logging.tqdm(total=len(keys), desc="saving weights") as pbar:
|
||||
for key in keys[::-1]: # revert to process simple keys first
|
||||
group = by_conversion_pattern.pop(key)
|
||||
converter = group.weight_converter
|
||||
operations = converter.operations if isinstance(converter.operations, list) else [converter.operations]
|
||||
for layer_name, tensors_for_this_layer in group.collected_tensors.items():
|
||||
pbar.update(1)
|
||||
pbar.set_postfix({"Materializing param": layer_name})
|
||||
pbar.refresh()
|
||||
concrete_target_keys = layer_name.split("|")
|
||||
try:
|
||||
with log_to_misc(layer_name, misc):
|
||||
values = [[k.result() for k in inner] for inner in tensors_for_this_layer.values()]
|
||||
|
||||
for op in operations[::-1]:
|
||||
with log_to_misc(layer_name, misc, (values, concrete_target_keys), operations):
|
||||
reverse_op = op.reverse_op
|
||||
values = reverse_op.convert(values, concrete_target_keys, config=model.config)
|
||||
|
||||
values = [values] if not isinstance(values, list) else values
|
||||
with log_to_misc(layer_name, misc, (values, concrete_target_keys), operations):
|
||||
if len(values) == 1 and isinstance(values[0], dict):
|
||||
realized_value = values[0]
|
||||
else:
|
||||
realized_value = dict(zip(concrete_target_keys, values))
|
||||
|
||||
for k in list(realized_value.keys()).copy():
|
||||
if op := converter.quantization_operation: # dequantize
|
||||
with log_to_misc(layer_name, misc, op=op):
|
||||
realized_value.update(
|
||||
op.convert(
|
||||
{k: realized_value.pop(k)}, # quant_config=quantizer.quantization_config
|
||||
)
|
||||
)
|
||||
|
||||
for k, output_value in realized_value.items():
|
||||
final_state_dict[k] = output_value[0] if isinstance(output_value, list) else output_value
|
||||
|
||||
# TODO @Cyrilvallez handle scheduled saving, gather and etc
|
||||
# schedule the saving of the weights using the threadpool. `save_file`
|
||||
|
||||
except SkipLayer:
|
||||
continue
|
||||
del group
|
||||
print(misc)
|
||||
return final_state_dict
|
||||
reverse_key_mapping = [(v, k) for k, v in mapping.items()]
|
||||
original_state_dict = {}
|
||||
for key, value in state_dict.items():
|
||||
for pattern, inverse_converter in reverse_key_mapping:
|
||||
# TODO FIXME you name it
|
||||
replacement = inverse_converter.lstrip("^") # strip off un-needed chars and patterns
|
||||
replacement = re.sub(r"\(.*\)", "", replacement)
|
||||
key, n_replace = re.subn(pattern, replacement, key)
|
||||
# Early exit of the loop
|
||||
if n_replace > 0:
|
||||
break
|
||||
original_state_dict[key] = value
|
||||
state_dict = original_state_dict
|
||||
return state_dict
|
||||
|
||||
@ -918,7 +918,9 @@ class GenerationConfig(PushToHubMixin):
|
||||
else:
|
||||
logger.info(f"loading configuration file {configuration_file} from cache at {resolved_config_file}")
|
||||
|
||||
if kwargs.get("return_unused_kwargs") is True:
|
||||
if kwargs.get("_from_model_config", False):
|
||||
return cls.from_model_config(config_dict)
|
||||
elif kwargs.get("return_unused_kwargs") is True:
|
||||
config, unused_kwargs = cls.from_dict(config_dict, **kwargs)
|
||||
config._original_object_hash = hash(config) # Hash to detect whether the instance was modified
|
||||
return config, unused_kwargs
|
||||
@ -1084,19 +1086,19 @@ class GenerationConfig(PushToHubMixin):
|
||||
writer.write(self.to_json_string(use_diff=use_diff))
|
||||
|
||||
@classmethod
|
||||
def from_model_config(cls, model_config: PreTrainedConfig) -> "GenerationConfig":
|
||||
def from_model_config(cls, model_config: PreTrainedConfig | dict) -> "GenerationConfig":
|
||||
"""
|
||||
Instantiates a [`GenerationConfig`] from a [`PreTrainedConfig`]. This function is useful to convert legacy
|
||||
[`PreTrainedConfig`] objects, which may contain generation parameters, into a stand-alone [`GenerationConfig`].
|
||||
|
||||
Args:
|
||||
model_config (`PreTrainedConfig`):
|
||||
model_config (`PreTrainedConfig | dict`):
|
||||
The model config that will be used to instantiate the generation config.
|
||||
|
||||
Returns:
|
||||
[`GenerationConfig`]: The configuration object instantiated from those parameters.
|
||||
"""
|
||||
config_dict = model_config.to_dict()
|
||||
config_dict = model_config.to_dict() if not isinstance(model_config, dict) else model_config
|
||||
config_dict.pop("_from_model_config", None)
|
||||
|
||||
# Removes all `None` from the model config dict -- this lets the generation config defaults to take hold
|
||||
@ -1106,14 +1108,15 @@ class GenerationConfig(PushToHubMixin):
|
||||
|
||||
# Special case: some models have generation attributes set in the decoder. Use them if still unset in the
|
||||
# generation config (which in turn is defined from the outer attributes of model config).
|
||||
decoder_config = model_config.get_text_config(decoder=True)
|
||||
if decoder_config is not model_config:
|
||||
default_generation_config = GenerationConfig()
|
||||
decoder_config_dict = decoder_config.to_dict()
|
||||
for attr in generation_config.to_dict():
|
||||
is_unset = getattr(generation_config, attr) == getattr(default_generation_config, attr)
|
||||
if attr in decoder_config_dict and is_unset:
|
||||
setattr(generation_config, attr, decoder_config_dict[attr])
|
||||
if not isinstance(model_config, dict):
|
||||
decoder_config = model_config.get_text_config(decoder=True)
|
||||
if decoder_config is not model_config:
|
||||
default_generation_config = GenerationConfig()
|
||||
decoder_config_dict = decoder_config.to_dict()
|
||||
for attr in generation_config.to_dict():
|
||||
is_unset = getattr(generation_config, attr) == getattr(default_generation_config, attr)
|
||||
if attr in decoder_config_dict and is_unset:
|
||||
setattr(generation_config, attr, decoder_config_dict[attr])
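# A minimal sketch of the new dict path accepted above (illustrative values only):
#     GenerationConfig.from_model_config({"max_length": 128, "temperature": 0.7})
# a plain model-config dict is now handled in addition to a PreTrainedConfig instance.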
|
||||
|
||||
# If any `output_...` flag is set to `True`, we ensure `return_dict_in_generate` is set to `True`.
|
||||
if generation_config.return_dict_in_generate is False:
|
||||
|
||||
@ -27,6 +27,7 @@ from typing import Optional
|
||||
import torch
|
||||
from torch import nn
|
||||
from tqdm import tqdm
|
||||
from tqdm.contrib.logging import logging_redirect_tqdm
|
||||
|
||||
from ...configuration_utils import PretrainedConfig
|
||||
from ...generation.configuration_utils import GenerationConfig
|
||||
@ -813,6 +814,7 @@ class ContinuousBatchingManager:
|
||||
"""Check if the background generation thread is running."""
|
||||
return self._generation_thread is not None and self._generation_thread.is_alive()
|
||||
|
||||
# NOTE: don't forget to update `continuous_batching_context_manager` when changing this method's definition
|
||||
def stop(self, block: bool = True, timeout: Optional[float] = None) -> None:
|
||||
"""Signal the background thread to stop.
|
||||
|
||||
@ -1063,14 +1065,35 @@ class ContinuousMixin:
|
||||
"""Mixin class for models to add continuous batching capabilities."""
|
||||
|
||||
@contextmanager
|
||||
def continuous_batching_context_manager(self, **kwargs) -> Generator[ContinuousBatchingManager]:
|
||||
manager = self.init_continuous_batching(**kwargs)
|
||||
def continuous_batching_context_manager(
|
||||
self,
|
||||
generation_config: GenerationConfig | None = None,
|
||||
manual_eviction: bool = False,
|
||||
max_queue_size: int = 0,
|
||||
num_q_cuda_graphs: int = 0,
|
||||
num_kv_cuda_graphs: int = 0,
|
||||
allow_prefix_sharing: bool = True,
|
||||
block: bool = True,
|
||||
timeout: Optional[float] = None,
|
||||
) -> Generator[ContinuousBatchingManager]:
|
||||
manager = self.init_continuous_batching(
|
||||
generation_config,
|
||||
manual_eviction,
|
||||
max_queue_size,
|
||||
num_q_cuda_graphs,
|
||||
num_kv_cuda_graphs,
|
||||
allow_prefix_sharing,
|
||||
)
|
||||
manager.start()
|
||||
try:
|
||||
yield manager
|
||||
finally:
|
||||
manager.stop(block=True)
|
||||
logger.debug(
|
||||
"Continuous batching loop finished"
|
||||
) # a dummy log needed for the logs of stop to show. Won't show
|
||||
manager.stop(block=block, timeout=timeout)
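# A minimal usage sketch of the context manager above (argument names taken from this signature;
# `batch_inputs` is a placeholder):
#     with model.continuous_batching_context_manager(max_queue_size=8, block=True, timeout=5) as manager:
#         manager.add_requests(inputs=batch_inputs, max_new_tokens=32)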
|
||||
|
||||
# NOTE: don't forget to update `continuous_batching_context_manager` when changing this method's definition
|
||||
def init_continuous_batching(
|
||||
self,
|
||||
generation_config: Optional[GenerationConfig] = None,
|
||||
@@ -1147,42 +1170,40 @@ class ContinuousMixin:
            progress_bar = False

        # Initialize manager with the batch inputs
        manager = self.init_continuous_batching(
            generation_config=generation_config,
            num_q_cuda_graphs=num_q_cuda_graphs,
            num_kv_cuda_graphs=num_kv_cuda_graphs,
        )
        manager.start()
        results = {}
        num_requests = len(inputs)
        try:
            from tqdm.contrib.logging import logging_redirect_tqdm
            with (
                self.continuous_batching_context_manager(
                    generation_config=generation_config,
                    num_q_cuda_graphs=num_q_cuda_graphs,
                    num_kv_cuda_graphs=num_kv_cuda_graphs,
                    block=True,
                    timeout=5,
                ) as manager,
                logging_redirect_tqdm([logger]),
                tqdm(
                    total=num_requests,
                    disable=(not progress_bar),
                    desc=f"Solving {num_requests} requests",
                    unit="request",
                ) as pbar,
            ):
                try:
                    manager.add_requests(inputs=inputs, max_new_tokens=kwargs.get("max_new_tokens"))
                    finished_count = 0
                    while finished_count < num_requests:
                        result = manager.get_result(timeout=1)
                        if result:
                            req_id = result.request_id
                            if result.is_finished():
                                results[req_id] = result
                                finished_count += 1
                                pbar.update(1)
                        else:
                            if not manager.is_running():
                                logger.error("Generation thread terminated unexpectedly.")
                                break

            with logging_redirect_tqdm([logger]):
                with tqdm(
                    total=num_requests,
                    disable=(not progress_bar),
                    desc=f"Solving {num_requests} requests",
                    unit="request",
                ) as pbar:
                    manager.add_requests(inputs=inputs, max_new_tokens=kwargs.get("max_new_tokens"))
                    finished_count = 0
                    while finished_count < num_requests:
                        result = manager.get_result(timeout=1)
                        if result:
                            req_id = result.request_id
                            if result.is_finished():
                                results[req_id] = result
                                finished_count += 1
                                pbar.update(1)
                        else:
                            if not manager.is_running():
                                logger.error("Generation thread terminated unexpectedly.")
                                break

                except Exception as e:
                    logger.error(f"Error during batch generation: {e}", exc_info=True)
        finally:
            logger.debug("Generate batch is finished.")  # a dummy log needed for the logs of stop to show. Won't show.
            manager.stop(block=True, timeout=5.0)
        except Exception as e:
            logger.error(f"Error during batch generation: {e}", exc_info=True)
        return results
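For comparison, a hedged sketch of calling the `generate_batch` helper whose body is shown above, reusing `model` and `requests` from the previous sketch; any argument not visible in this hunk is an assumption.

```python
# Hedged sketch: arguments other than those visible in the hunk above are assumptions.
results = model.generate_batch(
    inputs=requests,
    num_q_cuda_graphs=0,
    num_kv_cuda_graphs=0,
    max_new_tokens=16,
)
for request_id, request_state in results.items():
    print(request_id, request_state.is_finished())
```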
@ -410,6 +410,14 @@ class GenerationMixin(ContinuousMixin):
|
||||
logger.info(
|
||||
"Generation config file not found, using a generation config created from the model config."
|
||||
)
|
||||
self.generation_config = GenerationConfig.from_pretrained(
|
||||
pretrained_model_name_or_path,
|
||||
config_file_name="config.json",
|
||||
_from_auto=from_auto_class,
|
||||
_from_pipeline=from_pipeline,
|
||||
_from_model_config=True,
|
||||
**repo_loading_kwargs,
|
||||
)
|
||||
# Load custom generate function if `pretrained_model_name_or_path` defines it (and override `generate`)
|
||||
if hasattr(self, "load_custom_generate") and trust_remote_code:
|
||||
try:
|
||||
@ -1778,14 +1786,12 @@ class GenerationMixin(ContinuousMixin):
|
||||
):
|
||||
new_generation_config = GenerationConfig.from_model_config(self.config)
|
||||
if new_generation_config != self.generation_config: # 4)
|
||||
warnings.warn(
|
||||
"You have modified the pretrained model configuration to control generation. This is a"
|
||||
" deprecated strategy to control generation and will be removed in v5."
|
||||
raise ValueError(
|
||||
"You have modified the pretrained model configuration to control generation."
|
||||
" This strategy to control generation is not supported anymore. "
|
||||
" Please use and modify the model generation configuration (see"
|
||||
" https://huggingface.co/docs/transformers/generation_strategies#default-text-generation-configuration )",
|
||||
UserWarning,
|
||||
)
|
||||
self.generation_config = new_generation_config
|
||||
|
||||
generation_config = self.generation_config
|
||||
using_model_generation_config = True
|
||||
|
||||
@@ -162,6 +162,25 @@ def copy_(tensor: torch.Tensor, other: torch.Tensor) -> torch.Tensor:
    return tensor


# Here, we need to check several modules imported, and hot patch all of them, as sometimes torch does
# something like `from torch.nn.init import xavier_uniform_` in their internals (e.g. in torch.nn.modules.activation,
# where MultiHeadAttention lives), so the function name is bound at import time and just doing
# `setattr(torch.nn.init, name, globals()[name])` is thus not enough.
# The following list should be enough for all torch versions we work with
TORCH_MODULES_TO_PATCH = (
    "torch.nn.init",
    "torch.nn.modules.activation",
    "torch.nn.modules.transformer",
    "torch.nn.modules.linear",
    "torch.nn.modules.loss",
    "torch.nn.modules.batchnorm",
    "torch.nn.modules.conv",
    "torch.nn.modules.normalization",
    "torch.nn.modules.rnn",
    "torch.nn.modules.sparse",
)


@contextmanager
def guard_torch_init_functions():
    """
@@ -174,18 +193,16 @@ def guard_torch_init_functions():
    originals = defaultdict(dict)
    try:
        # Replace all torch funcs by the ones in this file
        for name in TORCH_INIT_FUNCTIONS.keys():
            # Here, we need to check all modules imported, and hot patch all of them, as usually torch does
            # something like `from torch.nn.init import xavier_uniform_` in their internals (e.g. in torch.nn.modules,
            # where MultiHeadAttention lives), so the function name is bound at import time and just doing
            # `setattr(torch.nn.init, name, globals()[name])` is thus not enough
            for module in sys.modules.copy().values():
                if module and hasattr(module, name):
                    originals[module][name] = getattr(module, name)
                    setattr(module, name, globals()[name])
        for module_name in TORCH_MODULES_TO_PATCH:
            if module_name in sys.modules:
                module = sys.modules[module_name]
                for func_name in TORCH_INIT_FUNCTIONS.keys():
                    if hasattr(module, func_name):
                        originals[module][func_name] = getattr(module, func_name)
                        setattr(module, func_name, globals()[func_name])
        yield
    finally:
        # Set back the original functions on all modules
        for module, functions in originals.items():
            for name, func in functions.items():
                setattr(module, name, func)
            for func_name, func in functions.items():
                setattr(module, func_name, func)
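The comment above is the key subtlety: `from torch.nn.init import xavier_uniform_` freezes a reference in the importing module at import time, so patching only `torch.nn.init` is not enough. A tiny standalone illustration (plain Python, no torch needed):

```python
import sys
import types

# Build two toy modules: `defs` defines a function, `user` grabbed it via `from defs import greet`.
defs = types.ModuleType("defs")
defs.greet = lambda: "original"
user = types.ModuleType("user")
user.greet = defs.greet  # this is what `from defs import greet` does at import time
sys.modules["defs"], sys.modules["user"] = defs, user

# Patching only the defining module leaves the imported reference untouched...
defs.greet = lambda: "patched"
assert user.greet() == "original"

# ...so, like guard_torch_init_functions, we must patch every module that holds the name.
for mod_name in ("defs", "user"):
    mod = sys.modules[mod_name]
    if hasattr(mod, "greet"):
        setattr(mod, "greet", lambda: "patched")
assert user.greet() == "patched"
```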
@ -32,8 +32,8 @@ _import_structure = {
|
||||
"unpack_weights",
|
||||
],
|
||||
"bitsandbytes": [
|
||||
"Bnb4bitQuantize",
|
||||
"dequantize_and_replace",
|
||||
"get_keys_to_not_convert",
|
||||
"replace_with_bnb_linear",
|
||||
"validate_bnb_backend_availability",
|
||||
],
|
||||
@ -177,8 +177,8 @@ if TYPE_CHECKING:
|
||||
unpack_weights,
|
||||
)
|
||||
from .bitsandbytes import (
|
||||
Bnb4bitQuantize,
|
||||
dequantize_and_replace,
|
||||
get_keys_to_not_convert,
|
||||
replace_with_bnb_linear,
|
||||
validate_bnb_backend_availability,
|
||||
)
|
||||
|
||||
@ -19,9 +19,9 @@ and simplicity/ease of use.
|
||||
import copy
|
||||
import inspect
|
||||
import os
|
||||
from collections import defaultdict
|
||||
from collections import OrderedDict, defaultdict
|
||||
from contextlib import contextmanager
|
||||
from typing import TYPE_CHECKING
|
||||
from typing import TYPE_CHECKING, Optional, Union
|
||||
|
||||
from ..utils import (
|
||||
is_accelerate_available,
|
||||
@ -39,8 +39,9 @@ if is_torch_available():
|
||||
import torch.nn as nn
|
||||
|
||||
if is_accelerate_available():
|
||||
from accelerate import dispatch_model, infer_auto_device_map
|
||||
from accelerate.utils import check_tied_parameters_on_same_device, get_max_memory
|
||||
from accelerate import dispatch_model
|
||||
from accelerate.utils import get_max_memory
|
||||
from accelerate.utils.modeling import clean_device_map, get_max_layer_size, get_module_size_with_ties
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from ..modeling_utils import PreTrainedModel
|
||||
@ -258,7 +259,7 @@ def check_and_set_device_map(device_map: "torch.device | int | str | dict | None
|
||||
|
||||
|
||||
def compute_module_sizes(
|
||||
model: "PreTrainedModel", hf_quantizer: "HfQuantizer | None"
|
||||
model: "PreTrainedModel", hf_quantizer: "HfQuantizer | None" = None, buffers_only: bool = False
|
||||
) -> tuple[dict[str, int], dict[str, int]]:
|
||||
"""
|
||||
Compute the size of each submodule of a given model (in bytes).
|
||||
@ -268,7 +269,13 @@ def compute_module_sizes(
|
||||
"""
|
||||
all_module_sizes = defaultdict(int)
|
||||
leaves_module_sizes = defaultdict(int)
|
||||
for name, param in model.state_dict().items():
|
||||
|
||||
if buffers_only:
|
||||
named_tensors = model.named_buffers(recurse=True)
|
||||
else:
|
||||
named_tensors = model.state_dict().items()
|
||||
|
||||
for name, param in named_tensors:
|
||||
if hf_quantizer is not None:
|
||||
dtype_size = hf_quantizer.param_element_size(model, name)
|
||||
else:
|
||||
@ -283,6 +290,14 @@ def compute_module_sizes(
|
||||
return all_module_sizes, leaves_module_sizes
|
||||
|
||||
|
||||
def compute_module_total_buffer_size(model: nn.Module, hf_quantizer: "HfQuantizer | None" = None):
|
||||
"""
|
||||
Compute the total size of buffers in each submodule of a given model.
|
||||
"""
|
||||
module_sizes, _ = compute_module_sizes(model, hf_quantizer, buffers_only=True)
|
||||
return module_sizes.get("", 0)
|
||||
|
||||
|
||||
def get_balanced_memory(
|
||||
model: "PreTrainedModel",
|
||||
max_memory: dict[int | str, int | str] | None = None,
|
||||
@ -393,20 +408,11 @@ def _get_device_map(
|
||||
device_map: dict | str | None,
|
||||
max_memory: dict | None,
|
||||
hf_quantizer: "HfQuantizer | None",
|
||||
dtype: torch.dtype | None,
|
||||
) -> dict:
|
||||
"""Compute the final `device_map` to use if we passed a value in ['auto', 'balanced', 'balanced_low_0', 'sequential'].
|
||||
Otherwise, we check for any device inconsistencies in the device_map.
|
||||
"""
|
||||
if isinstance(device_map, str):
|
||||
special_dtypes = {}
|
||||
if hf_quantizer is not None:
|
||||
special_dtypes = hf_quantizer.get_special_dtypes_update(model, dtype)
|
||||
|
||||
target_dtype = dtype
|
||||
if hf_quantizer is not None:
|
||||
target_dtype = hf_quantizer.adjust_target_dtype(target_dtype)
|
||||
|
||||
no_split_modules = model._get_no_split_modules(device_map)
|
||||
|
||||
if device_map != "sequential":
|
||||
@ -438,19 +444,13 @@ def _get_device_map(
|
||||
device_map = infer_auto_device_map(
|
||||
model,
|
||||
max_memory=inferred_max_memory,
|
||||
dtype=target_dtype,
|
||||
no_split_module_classes=no_split_modules,
|
||||
special_dtypes=special_dtypes,
|
||||
hf_quantizer=hf_quantizer,
|
||||
)
|
||||
|
||||
if hf_quantizer is not None:
|
||||
hf_quantizer.validate_environment(device_map=device_map)
|
||||
|
||||
elif device_map is not None:
|
||||
tied_params = find_tied_parameters(model)
|
||||
# check if we don't have tied param in different devices
|
||||
check_tied_parameters_on_same_device(tied_params, device_map)
|
||||
|
||||
return device_map
|
||||
|
||||
|
||||
@ -547,3 +547,383 @@ def accelerate_disk_offload(
|
||||
else:
|
||||
disk_offload_index = {}
|
||||
return disk_offload_index, disk_only_shard_files, is_offloaded_safetensors
|
||||
|
||||
|
||||
def _init_infer_auto_device_map(
|
||||
model: nn.Module,
|
||||
max_memory: Optional[dict[Union[int, str], Union[int, str]]] = None,
|
||||
no_split_module_classes: Optional[list[str]] = None,
|
||||
tied_parameters: Optional[list[list[str]]] = None,
|
||||
hf_quantizer: "HfQuantizer | None" = None,
|
||||
) -> tuple[
|
||||
list[Union[int, str]],
|
||||
dict[Union[int, str], Union[int, str]],
|
||||
list[Union[int, str]],
|
||||
list[int],
|
||||
dict[str, int],
|
||||
list[list[str]],
|
||||
list[str],
|
||||
list[tuple[str, nn.Module]],
|
||||
]:
|
||||
"""
|
||||
Initialize variables required for computing the device map for model allocation.
|
||||
"""
|
||||
max_memory = get_max_memory(max_memory)
|
||||
if no_split_module_classes is None:
|
||||
no_split_module_classes = []
|
||||
elif not isinstance(no_split_module_classes, (list, tuple)):
|
||||
no_split_module_classes = [no_split_module_classes]
|
||||
|
||||
devices = list(max_memory.keys())
|
||||
if "disk" not in devices:
|
||||
devices.append("disk")
|
||||
gpus = [device for device in devices if device not in ["cpu", "disk"]]
|
||||
|
||||
# Devices that need to keep space for a potential offloaded layer.
|
||||
if "mps" in gpus:
|
||||
main_devices = ["mps"]
|
||||
elif len(gpus) > 0:
|
||||
main_devices = [gpus[0], "cpu"]
|
||||
else:
|
||||
main_devices = ["cpu"]
|
||||
|
||||
module_sizes, _ = compute_module_sizes(model, hf_quantizer)
|
||||
|
||||
if tied_parameters is None:
|
||||
if len(model.all_tied_weights_keys) > 0:
|
||||
# create a list of list of tied params
|
||||
tied_parameters = [list(t) for t in model.all_tied_weights_keys.items()]
|
||||
else:
|
||||
tied_parameters = [[]]
|
||||
|
||||
# Direct submodules and parameters
|
||||
modules_to_treat = (
|
||||
list(model.named_parameters(recurse=False))
|
||||
+ list(model.named_children())
|
||||
+ list(model.named_buffers(recurse=False))
|
||||
)
|
||||
|
||||
return (
|
||||
devices,
|
||||
max_memory,
|
||||
main_devices,
|
||||
gpus,
|
||||
module_sizes,
|
||||
tied_parameters,
|
||||
no_split_module_classes,
|
||||
modules_to_treat,
|
||||
)
|
||||
|
||||
|
||||
def infer_auto_device_map(
|
||||
model: nn.Module,
|
||||
max_memory: Optional[dict[Union[int, str], Union[int, str]]] = None,
|
||||
no_split_module_classes: Optional[list[str]] = None,
|
||||
verbose: bool = False,
|
||||
clean_result: bool = True,
|
||||
offload_buffers: bool = False,
|
||||
tied_parameters: Optional[list[list[str]]] = None,
|
||||
hf_quantizer: "HfQuantizer | None" = None,
|
||||
):
|
||||
"""
|
||||
Compute a device map for a given model, giving priority to GPUs, then offloading to CPU and finally offloading to disk,
such that:
- we don't exceed the memory available on any of the GPUs.
- if offloading to the CPU is needed, there is always room left on GPU 0 to put back the layer offloaded on CPU that
  has the largest size.
- if offloading to the CPU is needed, we don't exceed the RAM available on the CPU.
- if offloading to the disk is needed, there is always room left on the CPU to put back the layer offloaded on disk
  that has the largest size.
|
||||
|
||||
<Tip>
|
||||
|
||||
All computation is done analyzing sizes and dtypes of the model parameters. As a result, the model can be on the
|
||||
meta device (as it would if initialized within the `init_empty_weights` context manager).
|
||||
|
||||
</Tip>
|
||||
|
||||
Args:
|
||||
model (`torch.nn.Module`):
|
||||
The model to analyze.
|
||||
max_memory (`Dict`, *optional*):
|
||||
A dictionary mapping device identifiers to maximum memory. Will default to the maximum memory available if unset.
|
||||
Example: `max_memory={0: "1GB"}`.
|
||||
no_split_module_classes (`List[str]`, *optional*):
|
||||
A list of layer class names that should never be split across devices (for instance any layer that has a
|
||||
residual connection).
|
||||
verbose (`bool`, *optional*, defaults to `False`):
|
||||
Whether or not to provide debugging statements as the function builds the device_map.
|
||||
clean_result (`bool`, *optional*, defaults to `True`):
|
||||
Clean the resulting device_map by grouping all submodules that go on the same device together.
|
||||
offload_buffers (`bool`, *optional*, defaults to `False`):
|
||||
In the layers that are offloaded on the CPU or the hard drive, whether or not to offload the buffers as
|
||||
well as the parameters.
|
||||
"""
|
||||
|
||||
# Initialize the variables
|
||||
(
|
||||
devices,
|
||||
max_memory,
|
||||
main_devices,
|
||||
gpus,
|
||||
module_sizes,
|
||||
tied_parameters,
|
||||
no_split_module_classes,
|
||||
modules_to_treat,
|
||||
) = _init_infer_auto_device_map(model, max_memory, no_split_module_classes, tied_parameters, hf_quantizer)
|
||||
|
||||
device_map = OrderedDict()
|
||||
current_device = 0
|
||||
device_memory_used = dict.fromkeys(devices, 0)
|
||||
device_buffer_sizes = {}
|
||||
device_minimum_assignment_memory = {}
|
||||
|
||||
# Initialize maximum largest layer, to know which space to keep in memory
|
||||
max_layer_size, max_layer_names = get_max_layer_size(modules_to_treat, module_sizes, no_split_module_classes)
|
||||
|
||||
# Ready ? This is going to be a bit messy.
|
||||
while len(modules_to_treat) > 0:
|
||||
name, module = modules_to_treat.pop(0)
|
||||
if verbose:
|
||||
print(f"\nTreating module {name}.")
|
||||
# Max size in the remaining layers may have changed since we took one, so we maybe update it.
|
||||
max_layer_names = [n for n in max_layer_names if n != name and not n.startswith(name + ".")]
|
||||
if len(max_layer_names) == 0:
|
||||
max_layer_size, max_layer_names = get_max_layer_size(
|
||||
[(n, m) for n, m in modules_to_treat if isinstance(m, torch.nn.Module)],
|
||||
module_sizes,
|
||||
no_split_module_classes,
|
||||
)
|
||||
# Assess size needed
|
||||
module_size = module_sizes[name]
|
||||
|
||||
# We keep relevant tied parameters only: one of the tied parameters in the group is inside the current module
|
||||
# and the other is not.
|
||||
# Note: If we are currently processing the name `compute.weight`, another parameter named
# e.g. `compute.weight_submodule.parameter`
# needs to be considered outside the current module, hence the check with additional dots.
|
||||
tied_param_groups = [
|
||||
tied_group
|
||||
for tied_group in tied_parameters
|
||||
if any(name + "." in k + "." for k in tied_group) and not all(name + "." in k + "." for k in tied_group)
|
||||
]
|
||||
|
||||
if verbose and len(tied_param_groups) > 0:
|
||||
print(f" Found the relevant tied param groups {tied_param_groups}")
|
||||
|
||||
# Then we keep track of all the parameters that are tied to the current module, but not in the current module
|
||||
tied_params = sum(
|
||||
[[p for p in tied_group if name + "." not in p + "."] for tied_group in tied_param_groups], []
|
||||
)
|
||||
|
||||
if verbose and len(tied_params) > 0:
|
||||
print(f" So those parameters need to be taken into account {tied_params}")
|
||||
|
||||
device = devices[current_device]
|
||||
current_max_size = max_memory[device] if device != "disk" else None
|
||||
current_memory_reserved = 0
|
||||
# Reduce max size available by the largest layer.
|
||||
if devices[current_device] in main_devices:
|
||||
current_max_size = current_max_size - max_layer_size
|
||||
current_memory_reserved = max_layer_size
|
||||
|
||||
module_size_with_ties, tied_module_names, tied_modules = get_module_size_with_ties(
|
||||
tied_params, module_size, module_sizes, modules_to_treat
|
||||
)
|
||||
|
||||
# The module and its tied modules fit on the current device.
|
||||
if current_max_size is None or device_memory_used[device] + module_size_with_ties <= current_max_size:
|
||||
if verbose:
|
||||
output = f"Putting {name}"
|
||||
|
||||
if tied_module_names:
|
||||
output += f" and {tied_module_names}"
|
||||
else:
|
||||
output += f" (size={module_size})"
|
||||
|
||||
if current_max_size is not None:
|
||||
output += f" (available={current_max_size - device_memory_used[device]})"
|
||||
|
||||
output += f" on {device}."
|
||||
print(output)
|
||||
|
||||
device_memory_used[device] += module_size_with_ties
|
||||
|
||||
# Assign the primary module to the device.
|
||||
device_map[name] = device
|
||||
|
||||
# Assign tied modules if any.
|
||||
for tied_module_name in tied_module_names:
|
||||
if tied_module_name in [m[0] for m in modules_to_treat]:
|
||||
# Find the index of the tied module in the list
|
||||
tied_module_index = next(i for i, (n, _) in enumerate(modules_to_treat) if n == tied_module_name)
|
||||
# Remove the tied module from the list to prevent reprocessing
|
||||
modules_to_treat.pop(tied_module_index)
|
||||
|
||||
# Assign the tied module to the device
|
||||
device_map[tied_module_name] = device
|
||||
|
||||
# Buffer Handling
|
||||
if not offload_buffers and isinstance(module, nn.Module):
|
||||
# Compute the total buffer size for the module
|
||||
current_buffer_size = compute_module_total_buffer_size(module, hf_quantizer)
|
||||
# Update the buffer size on the device
|
||||
device_buffer_sizes[device] = device_buffer_sizes.get(device, 0) + current_buffer_size
|
||||
|
||||
continue
|
||||
|
||||
# The current module itself fits, so we try to split the tied modules.
|
||||
if len(tied_params) > 0 and device_memory_used[device] + module_size <= current_max_size:
|
||||
# can we split one of the tied modules to make it smaller or do we need to go on the next device?
|
||||
if verbose:
|
||||
print(
|
||||
f"Not enough space on {devices[current_device]} to put {name} and {tied_module_names} (space "
|
||||
f"available {current_max_size - device_memory_used[device]}, needed size {module_size_with_ties})."
|
||||
)
|
||||
split_happened = False
|
||||
for tied_module_name, tied_module in zip(tied_module_names, tied_modules):
|
||||
tied_module_children = list(tied_module.named_children())
|
||||
if len(tied_module_children) == 0 or tied_module.__class__.__name__ in no_split_module_classes:
|
||||
# can't break this one.
|
||||
continue
|
||||
|
||||
if verbose:
|
||||
print(f"Splitting {tied_module_name}.")
|
||||
tied_module_children = list(tied_module.named_parameters(recurse=False)) + tied_module_children
|
||||
tied_module_children = [(f"{tied_module_name}.{n}", v) for n, v in tied_module_children]
|
||||
tied_module_index = [i for i, (n, _) in enumerate(modules_to_treat) if n == tied_module_name][0]
|
||||
|
||||
modules_to_treat = (
|
||||
[(name, module)]
|
||||
+ modules_to_treat[:tied_module_index]
|
||||
+ tied_module_children
|
||||
+ modules_to_treat[tied_module_index + 1 :]
|
||||
)
|
||||
# Update the max layer size.
|
||||
max_layer_size, max_layer_names = get_max_layer_size(
|
||||
[(n, m) for n, m in modules_to_treat if isinstance(m, torch.nn.Module)],
|
||||
module_sizes,
|
||||
no_split_module_classes,
|
||||
)
|
||||
split_happened = True
|
||||
break
|
||||
|
||||
if split_happened:
|
||||
continue
|
||||
|
||||
# If the tied module is not split, we go to the next device
|
||||
if verbose:
|
||||
print("None of the tied module can be split, going to the next device.")
|
||||
|
||||
# The current module itself doesn't fit, so we have to split it or go to the next device.
|
||||
if device_memory_used[device] + module_size >= current_max_size:
|
||||
# Split or not split?
|
||||
modules_children = (
|
||||
[]
|
||||
if isinstance(module, nn.Parameter) or isinstance(module, torch.Tensor)
|
||||
else list(module.named_children())
|
||||
)
|
||||
if verbose:
|
||||
print(
|
||||
f"Not enough space on {devices[current_device]} to put {name} (space available "
|
||||
f"{current_max_size - device_memory_used[device]}, module size {module_size})."
|
||||
)
|
||||
if len(modules_children) == 0 or module.__class__.__name__ in no_split_module_classes:
|
||||
# -> no split, we go to the next device
|
||||
if verbose:
|
||||
print("This module cannot be split, going to the next device.")
|
||||
|
||||
else:
|
||||
# -> split, we replace the module studied by its children + parameters
|
||||
if verbose:
|
||||
print(f"Splitting {name}.")
|
||||
modules_children = list(module.named_parameters(recurse=False)) + modules_children
|
||||
modules_to_treat = [(f"{name}.{n}", v) for n, v in modules_children] + modules_to_treat
|
||||
# Update the max layer size.
|
||||
max_layer_size, max_layer_names = get_max_layer_size(
|
||||
[(n, m) for n, m in modules_to_treat if isinstance(m, torch.nn.Module)],
|
||||
module_sizes,
|
||||
no_split_module_classes,
|
||||
)
|
||||
continue
|
||||
|
||||
if device_memory_used[device] == 0:
|
||||
device_minimum_assignment_memory[device] = module_size_with_ties + current_memory_reserved
|
||||
|
||||
# Neither the current module nor any tied modules can be split, so we move to the next device.
|
||||
device_memory_used[device] = device_memory_used[device] + current_memory_reserved
|
||||
current_device += 1
|
||||
modules_to_treat = [(name, module)] + modules_to_treat
|
||||
|
||||
device_memory_used = {device: mem for device, mem in device_memory_used.items() if mem > 0}
|
||||
|
||||
if clean_result:
|
||||
device_map = clean_device_map(device_map)
|
||||
|
||||
non_gpu_buffer_size = device_buffer_sizes.get("cpu", 0) + device_buffer_sizes.get("disk", 0)
|
||||
if non_gpu_buffer_size > 0 and not offload_buffers:
|
||||
is_buffer_fit_any_gpu = False
|
||||
for gpu_device, gpu_max_memory in max_memory.items():
|
||||
if gpu_device == "cpu" or gpu_device == "disk":
|
||||
continue
|
||||
|
||||
if not is_buffer_fit_any_gpu:
|
||||
gpu_memory_used = device_memory_used.get(gpu_device, 0)
|
||||
|
||||
if gpu_max_memory >= non_gpu_buffer_size + gpu_memory_used:
|
||||
is_buffer_fit_any_gpu = True
|
||||
|
||||
if len(gpus) > 0 and not is_buffer_fit_any_gpu:
|
||||
logger.warning(
|
||||
f"Current model requires {non_gpu_buffer_size} bytes of buffer for offloaded layers, which seems does "
|
||||
f"not fit any GPU's remaining memory. If you are experiencing a OOM later, please consider using "
|
||||
f"offload_buffers=True."
|
||||
)
|
||||
|
||||
if device_minimum_assignment_memory:
|
||||
devices_info = "\n".join(
|
||||
f" - {device}: {mem} bytes required" for device, mem in device_minimum_assignment_memory.items()
|
||||
)
|
||||
logger.info(
|
||||
f"Based on the current allocation process, no modules could be assigned to the following devices due to "
|
||||
f"insufficient memory:\n"
|
||||
f"{devices_info}\n"
|
||||
f"These minimum requirements are specific to this allocation attempt and may vary. Consider increasing "
|
||||
f"the available memory for these devices to at least the specified minimum, or adjusting the model config."
|
||||
)
|
||||
|
||||
check_tied_parameters_on_same_device(tied_parameters, device_map)
|
||||
return device_map
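A usage sketch of this in-repo `infer_auto_device_map` (memory budgets and the checkpoint are placeholders; `init_empty_weights` comes from `accelerate`, and the function itself would be imported from this integration module):

```python
from accelerate import init_empty_weights
from transformers import AutoConfig, AutoModelForCausalLM

# Hedged sketch: the model can live on the meta device, since only sizes and dtypes are analyzed.
config = AutoConfig.from_pretrained("gpt2")  # placeholder checkpoint
with init_empty_weights():
    model = AutoModelForCausalLM.from_config(config)

device_map = infer_auto_device_map(model, max_memory={0: "1GiB", "cpu": "4GiB"})
print(device_map)  # e.g. {"transformer.wte": 0, ..., "lm_head": "cpu"}
```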
|
||||
|
||||
|
||||
def _get_param_device(param, device_map):
|
||||
if param in device_map:
|
||||
return device_map[param]
|
||||
parent_param = ".".join(param.split(".")[:-1])
|
||||
if parent_param == param:
|
||||
raise ValueError(f"The `device_map` does not contain the module {param}.")
|
||||
else:
|
||||
return _get_param_device(parent_param, device_map)
|
||||
|
||||
|
||||
def check_tied_parameters_on_same_device(tied_params, device_map):
|
||||
"""
|
||||
Check if tied parameters are on the same device
|
||||
|
||||
Args:
|
||||
tied_params (`List[List[str]]`):
|
||||
A list of lists of parameter names being all tied together.
|
||||
|
||||
device_map (`Dict[str, Union[int, str, torch.device]]`):
|
||||
A map that specifies where each submodule should go.
|
||||
|
||||
"""
|
||||
for tie_param in tied_params:
|
||||
tie_param_devices = {}
|
||||
for param in tie_param:
|
||||
tie_param_devices[param] = _get_param_device(param, device_map)
|
||||
if len(set(tie_param_devices.values())) > 1:
|
||||
logger.warning(
|
||||
f"Tied parameters are on different devices: {tie_param_devices}. "
|
||||
"Please modify your custom device map or set `device_map='auto'`. "
|
||||
)
|
||||
|
||||
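A toy illustration of the check above (hypothetical parameter names and device map; assumes the two helpers are importable from this module):

```python
# Tied weights resolved to different devices trigger the warning.
tied = [["lm_head.weight", "model.embed_tokens.weight"]]
device_map = {"model.embed_tokens": 0, "lm_head": "cpu"}
check_tied_parameters_on_same_device(tied, device_map)
# -> warns that the tied pair resolved to {'lm_head.weight': 'cpu', 'model.embed_tokens.weight': 0}
```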
@ -1,6 +1,10 @@
|
||||
import inspect
|
||||
from collections import defaultdict
|
||||
from inspect import signature
|
||||
from typing import Optional
|
||||
|
||||
from ..core_model_loading import ConversionOps
|
||||
from ..quantizers.quantizers_utils import get_module_from_name
|
||||
from ..utils import (
|
||||
get_available_devices,
|
||||
is_accelerate_available,
|
||||
@ -27,12 +31,112 @@ if is_accelerate_available():
|
||||
logger = logging.get_logger(__name__)
|
||||
|
||||
|
||||
class Bnb4bitQuantize(ConversionOps):
|
||||
def __init__(self, hf_quantizer):
|
||||
self.hf_quantizer = hf_quantizer
|
||||
|
||||
def convert(
|
||||
self, input_dict: torch.Tensor, model: Optional[torch.nn.Module] = None, missing_keys=None, **kwargs
|
||||
) -> dict[str, torch.Tensor]:
|
||||
"""
|
||||
We need to store some parameters to create the quantized weight. For example, bnb requires 6 values that are stored in the checkpoint to recover the quantized weight, so we store them in a dict kept on the hf_quantizer for now, as we can't save it in the op since we create one op per tensor.
|
||||
"""
|
||||
target_key, value = tuple(input_dict.items())[0]
|
||||
value = value[0] if isinstance(value, list) else value
|
||||
|
||||
full_name = target_key
|
||||
# update param name to get the weights instead of the quantized stats
|
||||
target_key = self.hf_quantizer.get_param_name(target_key)
|
||||
module, _ = get_module_from_name(model, target_key)
|
||||
|
||||
if not self.hf_quantizer.pre_quantized:
|
||||
# Support models using `Conv1D` in place of `nn.Linear` (e.g. openai-community/gpt2) by transposing the weight matrix prior to quantization.
|
||||
# Since weights are saved in the correct "orientation", we skip transposing when loading.
|
||||
if issubclass(module.source_cls, Conv1D):
|
||||
value = value.T
|
||||
old_value = model.get_parameter_or_buffer(target_key)
|
||||
new_value = bnb.nn.Params4bit(value, requires_grad=False, **old_value.__dict__).to(value.device)
|
||||
# remove missing keys that were created when initializing Params4bit
|
||||
for key in new_value.quant_state.as_dict(packed=True).keys():
|
||||
missing_keys.discard(f"{full_name}.{key}")
|
||||
return {target_key: new_value}
|
||||
else:
|
||||
module_name = target_key.rsplit(".", 1)[0]
|
||||
# Save the states for later quantization when they are all gathered
|
||||
if not hasattr(self.hf_quantizer, "param_quant_stats"):
|
||||
self.hf_quantizer.param_quant_stats = defaultdict(dict)
|
||||
self.hf_quantizer.param_quant_stats[module_name].update({full_name: value})
|
||||
missing_keys.discard(full_name)
|
||||
# We are ready for quantization in this case (note, the +1 is for the weight itself)
|
||||
if len(self.hf_quantizer.param_quant_stats[module_name]) == len(self.hf_quantizer.bnb_keys) + 1:
|
||||
weight = self.hf_quantizer.param_quant_stats[module_name].pop(f"{module_name}.weight")
|
||||
new_value = bnb.nn.Params4bit.from_prequantized(
|
||||
data=weight,
|
||||
quantized_stats=self.hf_quantizer.param_quant_stats[module_name],
|
||||
requires_grad=False,
|
||||
device=value.device,
|
||||
module=module,
|
||||
)
|
||||
del self.hf_quantizer.param_quant_stats[module_name]
|
||||
return {target_key: new_value}
|
||||
return {}
|
||||
|
||||
|
||||
class Bnb8bitQuantize(ConversionOps):
|
||||
def __init__(self, hf_quantizer):
|
||||
self.hf_quantizer = hf_quantizer
|
||||
|
||||
def convert(
|
||||
self, input_dict: torch.Tensor, model: Optional[torch.nn.Module] = None, missing_keys=None, **kwargs
|
||||
) -> dict[str, torch.Tensor]:
|
||||
target_key, value = tuple(input_dict.items())[0]
|
||||
value = value[0] if isinstance(value, list) else value
|
||||
|
||||
module, tensor_name = get_module_from_name(model, target_key)
|
||||
module_name = target_key.rsplit(".", 1)[0]
|
||||
|
||||
if not self.hf_quantizer.pre_quantized:
|
||||
# Support models using `Conv1D` in place of `nn.Linear` (e.g. openai-community/gpt2) by transposing the weight matrix prior to quantization.
|
||||
# Since weights are saved in the correct "orientation", we skip transposing when loading.
|
||||
if issubclass(module.source_cls, Conv1D):
|
||||
value = value.T
|
||||
value_device = value.device
|
||||
kwargs = getattr(module, tensor_name).__dict__
|
||||
kwargs.pop("SCB", None)
|
||||
new_value = bnb.nn.Int8Params(value.to("cpu"), requires_grad=False, **kwargs).to(value_device)
|
||||
missing_keys.discard(f"{module_name}.weight_format")
|
||||
missing_keys.discard(f"{module_name}.SCB")
|
||||
return {target_key: new_value}
|
||||
else:
|
||||
missing_keys.discard(target_key)
|
||||
# useless key that gets saved for no reason
|
||||
if tensor_name.endswith("weight_format"):
|
||||
return {}
|
||||
# Save the states for later quantization when they are all gathered
|
||||
if not hasattr(self.hf_quantizer, "param_quant_stats"):
|
||||
self.hf_quantizer.param_quant_stats = defaultdict(dict)
|
||||
self.hf_quantizer.param_quant_stats[module_name].update({target_key: value})
|
||||
# We are ready for quantization in this case (SCB and the weight)
|
||||
if len(self.hf_quantizer.param_quant_stats[module_name]) == 2:
|
||||
weight = self.hf_quantizer.param_quant_stats[module_name].pop(f"{module_name}.weight")
|
||||
kwargs = getattr(module, "weight").__dict__
|
||||
weight_device = weight.device
|
||||
new_value = bnb.nn.Int8Params(weight.to("cpu"), requires_grad=False, **kwargs).to(weight_device)
|
||||
setattr(new_value, "SCB", self.hf_quantizer.param_quant_stats[module_name][f"{module_name}.SCB"])
|
||||
del self.hf_quantizer.param_quant_stats[module_name]
|
||||
# sometimes, weight_format is not saved so we need to remove it from missing keys ...
|
||||
missing_keys.discard(f"{module_name}.weight_format")
|
||||
return {f"{module_name}.weight": new_value}
|
||||
return {}
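These conversion ops are exercised through the regular quantized-loading path rather than called directly; a hedged end-user sketch (placeholder checkpoint, on-the-fly quantization, i.e. the `pre_quantized=False` branch above):

```python
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

# On-the-fly 4-bit quantization; each weight tensor goes through Bnb4bitQuantize.convert on load.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)
model = AutoModelForCausalLM.from_pretrained(
    "facebook/opt-350m",  # placeholder checkpoint
    quantization_config=bnb_config,
    device_map="auto",
)
```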
|
||||
|
||||
|
||||
def _replace_with_bnb_linear(
|
||||
model,
|
||||
modules_to_not_convert=None,
|
||||
current_key_name=None,
|
||||
quantization_config=None,
|
||||
has_been_replaced=False,
|
||||
pre_quantized=False,
|
||||
):
|
||||
"""
|
||||
Private method that wraps the recursion for module replacement.
|
||||
@ -58,13 +162,18 @@ def _replace_with_bnb_linear(
|
||||
out_features = module.out_features
|
||||
|
||||
if quantization_config.quantization_method() == "llm_int8":
|
||||
model._modules[name] = bnb.nn.Linear8bitLt(
|
||||
new_module = bnb.nn.Linear8bitLt(
|
||||
in_features,
|
||||
out_features,
|
||||
module.bias is not None,
|
||||
has_fp16_weights=quantization_config.llm_int8_has_fp16_weight,
|
||||
threshold=quantization_config.llm_int8_threshold,
|
||||
)
|
||||
# hack to create the correct keys in the state dict with the right dtype
|
||||
new_module.weight.SCB = torch.empty(1, dtype=torch.float32)
|
||||
if pre_quantized:
|
||||
new_module.weight.data = new_module.weight.data.to(dtype=torch.int8)
|
||||
model._modules[name] = new_module
|
||||
has_been_replaced = True
|
||||
else:
|
||||
if (
|
||||
@ -78,7 +187,7 @@ def _replace_with_bnb_linear(
|
||||
if "quant_storage" in list(signature(bnb.nn.Linear4bit).parameters)
|
||||
else {}
|
||||
)
|
||||
model._modules[name] = bnb.nn.Linear4bit(
|
||||
new_module = bnb.nn.Linear4bit(
|
||||
in_features,
|
||||
out_features,
|
||||
module.bias is not None,
|
||||
@ -87,6 +196,30 @@ def _replace_with_bnb_linear(
|
||||
quant_type=quantization_config.bnb_4bit_quant_type,
|
||||
**extra_kwargs,
|
||||
)
|
||||
from bitsandbytes.functional import QuantState
|
||||
|
||||
# hack to create the correct keys in the state dict with the right dtype
|
||||
absmax_dtype = (
|
||||
torch.uint8 if quantization_config.bnb_4bit_use_double_quant else torch.float32
|
||||
)
|
||||
new_module.weight.quant_state = QuantState(
|
||||
absmax=torch.empty(1, dtype=absmax_dtype),
|
||||
code=torch.empty(1, dtype=torch.float32),
|
||||
shape=(1,),
|
||||
offset=torch.empty(1),
|
||||
quant_type=quantization_config.bnb_4bit_quant_type,
|
||||
state2=QuantState(
|
||||
absmax=torch.empty(1, dtype=torch.float32),
|
||||
code=torch.empty(1, dtype=torch.float32),
|
||||
)
|
||||
if quantization_config.bnb_4bit_use_double_quant
|
||||
else None,
|
||||
)
|
||||
if pre_quantized:
|
||||
# this is kind of an edge case when supporting both loading and quantization ...
|
||||
# we need to set the right dtype as we cast the checkpoint with the dtype of the meta model
|
||||
new_module.weight.data = new_module.weight.data.to(dtype=torch.uint8)
|
||||
model._modules[name] = new_module
|
||||
has_been_replaced = True
|
||||
# Store the module class in case we need to transpose the weight later
|
||||
model._modules[name].source_cls = type(module)
|
||||
@ -99,13 +232,16 @@ def _replace_with_bnb_linear(
|
||||
current_key_name,
|
||||
quantization_config,
|
||||
has_been_replaced=has_been_replaced,
|
||||
pre_quantized=pre_quantized,
|
||||
)
|
||||
# Remove the last key for recursion
|
||||
current_key_name.pop(-1)
|
||||
return model, has_been_replaced
|
||||
|
||||
|
||||
def replace_with_bnb_linear(model, modules_to_not_convert=None, current_key_name=None, quantization_config=None):
|
||||
def replace_with_bnb_linear(
|
||||
model, modules_to_not_convert=None, current_key_name=None, quantization_config=None, pre_quantized=False
|
||||
):
|
||||
"""
|
||||
A helper function to replace all `torch.nn.Linear` modules by `bnb.nn.Linear8bit` modules from the `bitsandbytes`
|
||||
library. This will enable running your models using mixed int8 precision as described by the paper `LLM.int8():
|
||||
@ -137,7 +273,7 @@ def replace_with_bnb_linear(model, modules_to_not_convert=None, current_key_name
|
||||
"""
|
||||
modules_to_not_convert = ["lm_head"] if modules_to_not_convert is None else modules_to_not_convert
|
||||
model, has_been_replaced = _replace_with_bnb_linear(
|
||||
model, modules_to_not_convert, current_key_name, quantization_config
|
||||
model, modules_to_not_convert, current_key_name, quantization_config, pre_quantized=pre_quantized
|
||||
)
|
||||
|
||||
if not has_been_replaced:
|
||||
|
||||
@ -549,6 +549,8 @@ def replace_with_fp8_linear(
|
||||
quantization_config=None,
|
||||
):
|
||||
"""Helper function to replace model layers with FP8 versions."""
|
||||
if modules_to_not_convert is None:
|
||||
modules_to_not_convert = []
|
||||
modules_to_not_convert += ["lm_head"]
|
||||
|
||||
if quantization_config.modules_to_not_convert is not None:
|
||||
@ -570,35 +572,29 @@ def replace_with_fp8_linear(
|
||||
return model
|
||||
|
||||
|
||||
class QuantizationOp(ConversionOps):
|
||||
"""Base class for quantization operations."""
|
||||
|
||||
pass
|
||||
|
||||
|
||||
class Fp8Quantize(QuantizationOp):
|
||||
class Fp8Quantize(ConversionOps):
|
||||
"""
|
||||
A quantization operation that creates two tensors, weight and scale out of a weight.
|
||||
"""
|
||||
|
||||
reverse_op: type[ConversionOps]
|
||||
|
||||
def __init__(self, block_size: Optional[tuple[int, int]] = None):
|
||||
self.block_size = block_size
|
||||
def __init__(self, hf_quantizer):
|
||||
self.hf_quantizer = hf_quantizer
|
||||
self.reverse_op = Fp8Dequantize
|
||||
|
||||
def convert(self, input_dict: torch.Tensor, *, quant_config: dict[str, Any]) -> dict[str, torch.Tensor]:
|
||||
def convert(self, input_dict: torch.Tensor, **kwargs) -> dict[str, torch.Tensor]:
|
||||
# Unpack single key/value (value may be wrapped in a list)
|
||||
target_keys, value = tuple(input_dict.items())[0]
|
||||
value = value[0] if isinstance(value, list) else value
|
||||
|
||||
# Resolve block size (support dict-like or attr-like quant_config)
|
||||
block_size = None
|
||||
if quant_config is not None:
|
||||
if isinstance(quant_config, dict):
|
||||
block_size = quant_config.get("weight_block_size")
|
||||
if self.hf_quantizer.quantization_config is not None:
|
||||
if isinstance(self.hf_quantizer.quantization_config, dict):
|
||||
block_size = self.hf_quantizer.quantization_config.get("weight_block_size")
|
||||
else:
|
||||
block_size = getattr(quant_config, "weight_block_size", None)
|
||||
block_size = getattr(self.hf_quantizer.quantization_config, "weight_block_size", None)
|
||||
if block_size is None:
|
||||
block_size = (value.shape[-2], value.shape[-1])
|
||||
|
||||
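An illustrative sketch of the per-block scaling idea behind `Fp8Quantize` (not the exact implementation; the tile layout and the `weight_scale_inv` name are assumptions):

```python
import torch

# Split the weight into (block_m, block_n) tiles and compute one scale per tile so that the
# tile's max magnitude maps to the float8_e4m3fn maximum (448.0).
weight = torch.randn(256, 512)
block_m, block_n = 128, 128  # stand-in for quantization_config.weight_block_size
fp8_max = torch.finfo(torch.float8_e4m3fn).max

tiles = weight.reshape(256 // block_m, block_m, 512 // block_n, block_n)
amax = tiles.abs().amax(dim=(1, 3), keepdim=True).clamp(min=1e-12)
scale = fp8_max / amax
quantized = (tiles * scale).clamp(-fp8_max, fp8_max).to(torch.float8_e4m3fn).reshape(256, 512)
inv_scale = (1.0 / scale).squeeze()  # stored alongside the weight, e.g. as `weight_scale_inv` (name assumed)
```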
@ -656,7 +652,7 @@ class Fp8Quantize(QuantizationOp):
|
||||
}
|
||||
|
||||
|
||||
class Fp8Dequantize(QuantizationOp):
|
||||
class Fp8Dequantize(ConversionOps):
|
||||
"""Inverse operation of :class:`Fp8Quantize`. Takes a pair (weight, scale) and reconstructs the fp32 tensor."""
|
||||
|
||||
def __init__(self, block_size: Optional[tuple[int, int]] = None):
|
||||
|
||||
@ -654,17 +654,18 @@ def maybe_load_adapters(
|
||||
token_from_adapter_kwargs = adapter_kwargs.pop("token", None)
|
||||
|
||||
if _adapter_model_path is None:
|
||||
peft_kwargs = adapter_kwargs.copy()
|
||||
for arg_name in ("cache_dir", "proxies", "subfolder"): # don't override revision
|
||||
if (arg_name not in peft_kwargs) and (arg_name in download_kwargs):
|
||||
peft_kwargs[arg_name] = download_kwargs[arg_name]
|
||||
if "commit_hash" in download_kwargs:
|
||||
peft_kwargs["_commit_hash"] = download_kwargs["commit_hash"]
|
||||
peft_kwargs["force_download"] = bool(download_kwargs.get("force_download", False))
|
||||
peft_kwargs["local_files_only"] = bool(download_kwargs.get("local_files_only", False))
|
||||
peft_kwargs["token"] = token or token_from_adapter_kwargs
|
||||
_adapter_model_path = find_adapter_config_file(
|
||||
pretrained_model_name_or_path,
|
||||
cache_dir=download_kwargs.get("cache_dir"),
|
||||
force_download=bool(download_kwargs.get("force_download", False)),
|
||||
proxies=download_kwargs.get("proxies"),
|
||||
token=token or token_from_adapter_kwargs,
|
||||
revision=download_kwargs.get("revision"),
|
||||
local_files_only=bool(download_kwargs.get("local_files_only", False)),
|
||||
subfolder=download_kwargs.get("subfolder", ""),
|
||||
_commit_hash=download_kwargs.get("commit_hash"),
|
||||
**adapter_kwargs,
|
||||
**peft_kwargs,
|
||||
)
|
||||
|
||||
if _adapter_model_path is not None and os.path.isfile(_adapter_model_path):
|
||||
|
||||
@ -447,6 +447,7 @@ def RTDetrForObjectDetectionLoss(
|
||||
outputs_loss = {}
|
||||
outputs_loss["logits"] = logits
|
||||
outputs_loss["pred_boxes"] = pred_boxes
|
||||
auxiliary_outputs = None
|
||||
if config.auxiliary_loss:
|
||||
if denoising_meta_values is not None:
|
||||
dn_out_coord, outputs_coord = torch.split(outputs_coord, denoising_meta_values["dn_num_split"], dim=2)
|
||||
|
||||
@ -408,7 +408,7 @@ def _prepare_4d_causal_attention_mask_for_sdpa(
|
||||
# Attend to all tokens in masked rows from the causal_mask, for example the relevant first rows when
|
||||
# using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path.
|
||||
# Details: https://github.com/pytorch/pytorch/issues/110213
|
||||
if not is_tracing_ and expanded_4d_mask.device.type == "cuda":
|
||||
if not is_tracing_ and expanded_4d_mask.device.type in ["cuda", "xpu"]:
|
||||
expanded_4d_mask = AttentionMaskConverter._unmask_unattended(
|
||||
expanded_4d_mask, min_dtype=torch.finfo(inputs_embeds.dtype).min
|
||||
)
|
||||
|
||||
@ -47,7 +47,11 @@ from torch.utils.checkpoint import checkpoint
|
||||
from . import initialization as init
|
||||
from .configuration_utils import PreTrainedConfig
|
||||
from .conversion_mapping import get_checkpoint_conversion_mapping
|
||||
from .core_model_loading import WeightConverter, convert_and_load_state_dict_in_model, revert_weight_conversion
|
||||
from .core_model_loading import (
|
||||
WeightConverter,
|
||||
convert_and_load_state_dict_in_model,
|
||||
revert_weight_conversion,
|
||||
)
|
||||
from .distributed import DistributedConfig
|
||||
from .dynamic_module_utils import custom_object_save
|
||||
from .generation import CompileConfig, GenerationConfig
|
||||
@ -1456,6 +1460,8 @@ class PreTrainedModel(nn.Module, EmbeddingAccessMixin, ModuleUtilsMixin, PushToH
|
||||
self.config._attn_implementation_internal = self._check_and_adjust_attn_implementation(
|
||||
self.config._attn_implementation, is_init_check=True
|
||||
)
|
||||
if self.can_generate():
|
||||
self.generation_config = GenerationConfig.from_model_config(config)
|
||||
|
||||
# for initialization of the loss
|
||||
loss_type = self.__class__.__name__
|
||||
@ -1470,8 +1476,6 @@ class PreTrainedModel(nn.Module, EmbeddingAccessMixin, ModuleUtilsMixin, PushToH
|
||||
|
||||
self.name_or_path = config.name_or_path
|
||||
self.warnings_issued = {}
|
||||
self.generation_config = GenerationConfig.from_model_config(config) if self.can_generate() else None
|
||||
|
||||
# Overwrite the class attribute to make it an instance attribute, so models like
|
||||
# `InstructBlipForConditionalGeneration` can dynamically update it without modifying the class attribute
|
||||
# when a different component (e.g. language_model) is used.
|
||||
@ -2201,26 +2205,78 @@ class PreTrainedModel(nn.Module, EmbeddingAccessMixin, ModuleUtilsMixin, PushToH
|
||||
"""
|
||||
self._require_grads_hook.remove()
|
||||
|
||||
    def get_encoder(self, modality: Optional[str] = None):
        """
        Best-effort lookup of the *encoder* module. If provided with the `modality` argument,
        it looks for a modality-specific encoder in multimodal models (e.g. "image_encoder").
        By default the function returns the model's text encoder if any, and otherwise returns `self`.

        Possible `modality` values are "image", "video" and "audio".
        """
        # NOTE: new models need to use existing names for layers if possible, so this list doesn't grow infinitely
        if modality in ["image", "video"]:
            possible_module_names = ["vision_tower", "visual", "vision_model", "vision_encoder", "image_tower"]
        elif modality == "audio":
            possible_module_names = ["audio_tower", "audio_encoder", "speech_encoder"]
        elif modality is None:
            possible_module_names = ["text_encoder", "encoder"]
        else:
            raise ValueError(f'Unrecognized modality, has to be "image", "video" or "audio" but found {modality}')

        for name in possible_module_names:
            if hasattr(self, name):
                return getattr(self, name)

        if self.base_model is not self and hasattr(self.base_model, "get_encoder"):
            return self.base_model.get_encoder(modality=modality)

        # If this is a base transformer model (no encoder/model attributes), return self
        return self

def set_encoder(self, encoder, modality: Optional[str] = None):
|
||||
"""
|
||||
Symmetric setter. Mirrors the lookup logic used in `get_encoder`.
|
||||
"""
|
||||
|
||||
# NOTE: new models need to use existing names for layers if possible, so this list doesn't grow infinitely
|
||||
if modality in ["image", "video"]:
|
||||
possible_module_names = ["vision_tower", "visual", "vision_model", "vision_encoder", "image_tower"]
|
||||
if modality == "audio":
|
||||
possible_module_names = ["audio_tower", "audio_encoder"]
|
||||
elif modality is None:
|
||||
possible_module_names = ["text_encoder", "encoder"]
|
||||
else:
|
||||
raise ValueError(f'Unrecognized modality, has to be "image", "video" or "audio" but found {modality}')
|
||||
|
||||
for name in possible_module_names:
|
||||
if hasattr(self, name):
|
||||
setattr(self, name, encoder)
|
||||
return
|
||||
|
||||
if self.base_model is not self:
|
||||
if hasattr(self.base_model, "set_encoder"):
|
||||
self.base_model.set_encoder(encoder, modality=modality)
|
||||
else:
|
||||
self.model = encoder
|
||||
|
||||
def get_decoder(self):
|
||||
"""
|
||||
Best-effort lookup of the *decoder* module.
|
||||
|
||||
Order of attempts (covers ~85 % of current usages):
|
||||
|
||||
1. `self.decoder`
|
||||
2. `self.model` (many wrappers store the decoder here)
|
||||
3. `self.model.get_decoder()` (nested wrappers)
|
||||
1. `self.decoder/self.language_model/self.text_model`
|
||||
2. `self.base_model` (many wrappers store the decoder here)
|
||||
3. `self.base_model.get_decoder()` (nested wrappers)
|
||||
4. fallback: raise for the few exotic models that need a bespoke rule
|
||||
"""
|
||||
if hasattr(self, "decoder"):
|
||||
return self.decoder
|
||||
possible_module_names = ["language_model", "text_model", "decoder", "text_decoder"]
|
||||
for name in possible_module_names:
|
||||
if hasattr(self, name):
|
||||
return getattr(self, name)
|
||||
|
||||
if hasattr(self, "model"):
|
||||
inner = self.model
|
||||
# See: https://github.com/huggingface/transformers/issues/40815
|
||||
if hasattr(inner, "get_decoder") and type(inner) is not type(self):
|
||||
return inner.get_decoder()
|
||||
return inner
|
||||
if self.base_model is not self and hasattr(self.base_model, "get_decoder"):
|
||||
return self.base_model.get_decoder()
|
||||
|
||||
# If this is a base transformer model (no decoder/model attributes), return self
|
||||
# This handles cases like MistralModel which is itself the decoder
|
||||
@ -2231,19 +2287,18 @@ class PreTrainedModel(nn.Module, EmbeddingAccessMixin, ModuleUtilsMixin, PushToH
|
||||
Symmetric setter. Mirrors the lookup logic used in `get_decoder`.
|
||||
"""
|
||||
|
||||
if hasattr(self, "decoder"):
|
||||
self.decoder = decoder
|
||||
return
|
||||
possible_module_names = ["language_model", "text_model", "decoder"]
|
||||
for name in possible_module_names:
|
||||
if hasattr(self, name):
|
||||
print(name)
|
||||
setattr(self, name, decoder)
|
||||
return
|
||||
|
||||
if hasattr(self, "model"):
|
||||
inner = self.model
|
||||
if hasattr(inner, "set_decoder"):
|
||||
inner.set_decoder(decoder)
|
||||
if self.base_model is not self:
|
||||
if hasattr(self.base_model, "set_decoder"):
|
||||
self.base_model.set_decoder(decoder)
|
||||
else:
|
||||
self.model = decoder
|
||||
return
|
||||
|
||||
return
|
||||
|
||||
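A hedged usage sketch of these accessors on a multimodal checkpoint (placeholder model; which inner module is returned depends on the attribute names that particular model defines):

```python
from transformers import AutoModelForImageTextToText

# Placeholder checkpoint: any vision-language model with a vision_tower / language_model layout.
model = AutoModelForImageTextToText.from_pretrained("llava-hf/llava-1.5-7b-hf")

vision_encoder = model.get_encoder(modality="image")  # resolves e.g. `vision_tower`
text_decoder = model.get_decoder()                    # resolves e.g. `language_model`

# The setters mirror the same lookup, so a swapped-in module lands on the matching attribute.
model.set_decoder(text_decoder)
```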
@torch.no_grad()
|
||||
def _init_weights(self, module):
|
||||
@ -3049,7 +3104,7 @@ class PreTrainedModel(nn.Module, EmbeddingAccessMixin, ModuleUtilsMixin, PushToH
|
||||
variant: Optional[str] = None,
|
||||
token: Optional[Union[str, bool]] = None,
|
||||
save_peft_format: bool = True,
|
||||
save_original_format: bool = True, # TODO next PR will make it go to True
|
||||
save_original_format: bool = False, # TODO next PR will make it go to True
|
||||
**kwargs,
|
||||
):
|
||||
"""
|
||||
@ -3174,19 +3229,6 @@ class PreTrainedModel(nn.Module, EmbeddingAccessMixin, ModuleUtilsMixin, PushToH
|
||||
# Save the config
|
||||
if is_main_process:
|
||||
if not _hf_peft_config_loaded:
|
||||
# If the model config has set attributes that should be in the generation config, move them there.
|
||||
misplaced_generation_parameters = model_to_save.config._get_non_default_generation_parameters()
|
||||
if self.can_generate() and len(misplaced_generation_parameters) > 0:
|
||||
warnings.warn(
|
||||
"Moving the following attributes in the config to the generation config: "
|
||||
f"{misplaced_generation_parameters}. You are seeing this warning because you've set "
|
||||
"generation parameters in the model config, as opposed to in the generation config.",
|
||||
UserWarning,
|
||||
)
|
||||
for param_name, param_value in misplaced_generation_parameters.items():
|
||||
setattr(model_to_save.generation_config, param_name, param_value)
|
||||
setattr(model_to_save.config, param_name, None)
|
||||
|
||||
model_to_save.config.save_pretrained(save_directory)
|
||||
if self.can_generate():
|
||||
model_to_save.generation_config.save_pretrained(save_directory)
|
||||
@ -3341,8 +3383,7 @@ class PreTrainedModel(nn.Module, EmbeddingAccessMixin, ModuleUtilsMixin, PushToH
|
||||
# MEGA BIG TODO HERE: self._conversion_ops needs to be used to save the final ckpt
|
||||
# using what was loaded. Actually self._conversion_ops wont work because we need it
|
||||
# even if the files are not legacy -> thus no conversion happened
|
||||
weight_mapping = get_checkpoint_conversion_mapping(self.config.model_type)
|
||||
state_dict = revert_weight_conversion(self, state_dict, weight_mapping)
|
||||
state_dict = revert_weight_conversion(self, state_dict)
|
||||
|
||||
# Shard the model if it is too big.
|
||||
if not _hf_peft_config_loaded:
|
||||
@ -3882,7 +3923,7 @@ class PreTrainedModel(nn.Module, EmbeddingAccessMixin, ModuleUtilsMixin, PushToH
|
||||
subfolder = kwargs.pop("subfolder", "")
|
||||
commit_hash = kwargs.pop("_commit_hash", None)
|
||||
variant = kwargs.pop("variant", None)
|
||||
adapter_kwargs = kwargs.pop("adapter_kwargs", {})
|
||||
adapter_kwargs = (kwargs.pop("adapter_kwargs", {}) or {}).copy()
|
||||
adapter_name = kwargs.pop("adapter_name", "default")
|
||||
generation_config = kwargs.pop("generation_config", None)
|
||||
gguf_file = kwargs.pop("gguf_file", None)
|
||||
@ -4073,7 +4114,7 @@ class PreTrainedModel(nn.Module, EmbeddingAccessMixin, ModuleUtilsMixin, PushToH
|
||||
|
||||
# Prepare the full device map
|
||||
if device_map is not None:
|
||||
device_map = _get_device_map(model, device_map, max_memory, hf_quantizer, dtype)
|
||||
device_map = _get_device_map(model, device_map, max_memory, hf_quantizer)
|
||||
|
||||
# restore default dtype
|
||||
if dtype_orig is not None:
|
||||
@ -4187,9 +4228,6 @@ class PreTrainedModel(nn.Module, EmbeddingAccessMixin, ModuleUtilsMixin, PushToH
|
||||
expanded_device_map = expand_device_map(device_map, expected_keys)
|
||||
caching_allocator_warmup(model, expanded_device_map, hf_quantizer)
|
||||
|
||||
if device_map is None:
|
||||
device_map = {"": "cpu"}
|
||||
keys = sorted(device_map.keys(), key=len, reverse=True)
|
||||
tp_plan = getattr(model, "_tp_plan", None)
|
||||
error_msgs = []
|
||||
|
||||
@ -4212,33 +4250,18 @@ class PreTrainedModel(nn.Module, EmbeddingAccessMixin, ModuleUtilsMixin, PushToH
|
||||
missing_keys, unexpected_keys, mismatched_keys, misc = set(), set(), set(), set()
|
||||
else:
|
||||
all_pointer = set()
|
||||
# Checkpoints are safetensors
|
||||
if checkpoint_files is not None and checkpoint_files[0].endswith(".safetensors"):
|
||||
pattern = re.compile(r"(" + "|".join(map(re.escape, keys)) + r")")
|
||||
if sharded_metadata is None:
|
||||
k_v_iterator = dict.fromkeys(
|
||||
safe_open(checkpoint_files[0], framework="pt").keys(), checkpoint_files[0].rsplit("/", 1)[1]
|
||||
).items()
|
||||
else:
|
||||
k_v_iterator = sharded_metadata["weight_map"].items()
|
||||
|
||||
merged_state_dict = {}
|
||||
for k, v in k_v_iterator:
|
||||
match = pattern.match(k)
|
||||
if match and match.group(1) != "":
|
||||
device = device_map[match.group(1)]
|
||||
else:
|
||||
device = device_map.get("", "cpu")
|
||||
if isinstance(device, torch.device):
|
||||
device = device.index # safetensors only
|
||||
if device == "disk":
|
||||
device = "cpu" # we read to cpu to then write to disk
|
||||
file_pointer = safe_open(
|
||||
os.path.join(checkpoint_files[0].rsplit("/", 1)[0], v), framework="pt", device=device
|
||||
)
|
||||
for file in checkpoint_files:
|
||||
file_pointer = safe_open(file, framework="pt", device="cpu")
|
||||
all_pointer.add(file_pointer)
|
||||
merged_state_dict[k] = file_pointer.get_slice(k) # don't materialize yet
|
||||
for k in file_pointer.keys():
|
||||
merged_state_dict[k] = file_pointer.get_slice(k) # don't materialize yet
|
||||
# User passed an explicit state_dict
|
||||
elif state_dict is not None:
|
||||
merged_state_dict = state_dict
|
||||
# Checkpoints are .bin
|
||||
elif checkpoint_files is not None:
|
||||
merged_state_dict = {}
|
||||
for ckpt_file in checkpoint_files:
|
||||
@ -4710,7 +4733,7 @@ def caching_allocator_warmup(model: PreTrainedModel, expanded_device_map: dict,
|
||||
else None
|
||||
)
|
||||
total_byte_count = defaultdict(lambda: 0)
|
||||
tied_param_names = _get_tied_weight_keys(model)
|
||||
tied_param_names = model.all_tied_weights_keys.keys()
|
||||
for param_name, device in accelerator_device_map.items():
|
||||
# Skip if the parameter has already been accounted for (tied weights)
|
||||
if param_name in tied_param_names:
|
||||
@ -4724,6 +4747,9 @@ def caching_allocator_warmup(model: PreTrainedModel, expanded_device_map: dict,
|
||||
try:
|
||||
param = model.get_parameter_or_buffer(param_name)
|
||||
except AttributeError:
|
||||
# TODO: for now let's skip if we can't find the parameters
|
||||
if hf_quantizer is not None:
|
||||
continue
|
||||
raise AttributeError(f"Parameter {param_name} not found in model")
|
||||
|
||||
# The dtype of different parameters may be different with composite models or `keep_in_fp32_modules`
|
||||
|
||||
@ -596,7 +596,7 @@ class AriaTextPreTrainedModel(PreTrainedModel):
|
||||
@auto_docstring
|
||||
class AriaPreTrainedModel(PreTrainedModel):
|
||||
config: AriaConfig
|
||||
base_model_prefix = ""
|
||||
base_model_prefix = "model"
|
||||
supports_gradient_checkpointing = True
|
||||
_no_split_modules = ["AriaDecoderLayer"]
|
||||
_skip_keys_device_placement = ["past_key_values"]
|
||||
@ -893,6 +893,10 @@ class AriaModelOutputWithPast(BaseModelOutputWithPast):
|
||||
"""
|
||||
)
|
||||
class AriaModel(AriaPreTrainedModel):
|
||||
_checkpoint_conversion_mapping = {
|
||||
r"^language_model.model": "language_model",
|
||||
}
|
||||
|
||||
def __init__(self, config: AriaConfig):
|
||||
super().__init__(config)
|
||||
self.vision_tower = AutoModel.from_config(config.vision_config)
|
||||
@ -906,12 +910,6 @@ class AriaModel(AriaPreTrainedModel):
|
||||
def set_input_embeddings(self, value):
|
||||
self.language_model.set_input_embeddings(value)
|
||||
|
||||
def set_decoder(self, decoder):
|
||||
self.language_model = decoder
|
||||
|
||||
def get_decoder(self):
|
||||
return self.language_model
|
||||
|
||||
def get_image_features(
|
||||
self,
|
||||
pixel_values: torch.FloatTensor,
|
||||
@ -1071,12 +1069,6 @@ class AriaForConditionalGeneration(AriaPreTrainedModel, GenerationMixin):
|
||||
def get_output_embeddings(self) -> nn.Module:
|
||||
return self.lm_head
|
||||
|
||||
def set_decoder(self, decoder):
|
||||
self.model.set_decoder(decoder)
|
||||
|
||||
def get_decoder(self):
|
||||
return self.model.get_decoder()
|
||||
|
||||
def get_image_features(
|
||||
self,
|
||||
pixel_values: torch.FloatTensor,
|
||||
@ -1089,19 +1081,6 @@ class AriaForConditionalGeneration(AriaPreTrainedModel, GenerationMixin):
|
||||
vision_feature_layer=vision_feature_layer,
|
||||
)
|
||||
|
||||
# Make modules available through conditional class for BC
|
||||
@property
|
||||
def language_model(self):
|
||||
return self.model.language_model
|
||||
|
||||
@property
|
||||
def vision_tower(self):
|
||||
return self.model.vision_tower
|
||||
|
||||
@property
|
||||
def multi_modal_projector(self):
|
||||
return self.model.multi_modal_projector
|
||||
|
||||
@can_return_tuple
|
||||
@auto_docstring
|
||||
def forward(
|
||||
|
||||
@ -1206,7 +1206,7 @@ class AriaTextPreTrainedModel(PreTrainedModel):
|
||||
|
||||
class AriaPreTrainedModel(LlamaPreTrainedModel):
|
||||
config: AriaConfig
|
||||
base_model_prefix = ""
|
||||
base_model_prefix = "model"
|
||||
_can_compile_fullgraph = False # MoE models don't work with torch.compile (dynamic slicing)
|
||||
_supports_attention_backend = True
|
||||
|
||||
|
||||
@ -30,7 +30,7 @@ from torch import nn
|
||||
from ...activations import ACT2FN
|
||||
from ...cache_utils import Cache, EncoderDecoderCache
|
||||
from ...generation import GenerationMixin
|
||||
from ...masking_utils import eager_mask, padding_mask_function
|
||||
from ...masking_utils import create_bidirectional_mask
|
||||
from ...modeling_flash_attention_utils import FlashAttentionKwargs
|
||||
from ...modeling_layers import GradientCheckpointingLayer
|
||||
from ...modeling_outputs import BaseModelOutput, CausalLMOutputWithPast
|
||||
@ -336,20 +336,10 @@ class AudioFlamingo3Encoder(AudioFlamingo3PreTrainedModel):
|
||||
- 0 for tokens that are **masked**.
|
||||
"""
|
||||
|
||||
# Prepare attention mask for transformer layers
|
||||
batch_size = input_features.shape[0]
|
||||
seq_len = (input_features.shape[-1] - 1) // 2 + 1 # After conv2 downsampling
|
||||
|
||||
input_features_lengths = input_features_mask.sum(-1)
|
||||
input_features_lengths = (input_features_lengths - 1) // 2 + 1 # conv2 downsampling
|
||||
input_features_mask = torch.arange(seq_len, device=input_features.device) < input_features_lengths[:, None]
|
||||
attention_mask = eager_mask(
|
||||
batch_size=batch_size,
|
||||
cache_position=torch.arange(seq_len, device=input_features.device),
|
||||
kv_length=seq_len,
|
||||
mask_function=padding_mask_function(input_features_mask),
|
||||
dtype=self.conv1.weight.dtype,
|
||||
)
|
||||
|
||||
# Conv front-end
|
||||
inputs_embeds = nn.functional.gelu(self.conv1(input_features))
|
||||
@ -360,6 +350,12 @@ class AudioFlamingo3Encoder(AudioFlamingo3PreTrainedModel):
|
||||
hidden_states = inputs_embeds + self.embed_positions.weight
|
||||
hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
|
||||
|
||||
attention_mask = create_bidirectional_mask(
|
||||
config=self.config,
|
||||
input_embeds=hidden_states,
|
||||
attention_mask=input_features_mask,
|
||||
)
|
||||
|
||||
# Transformer stack
|
||||
for layer in self.layers:
|
||||
drop = self.training and torch.rand([]) < self.layerdrop
|
||||
|
||||
@ -21,7 +21,7 @@ from torch import nn
|
||||
|
||||
from ...activations import ACT2FN
|
||||
from ...cache_utils import Cache
|
||||
from ...masking_utils import eager_mask, padding_mask_function
|
||||
from ...masking_utils import create_bidirectional_mask
|
||||
from ...modeling_outputs import BaseModelOutput, CausalLMOutputWithPast
|
||||
from ...processing_utils import Unpack
|
||||
from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, logging
|
||||
@ -73,20 +73,10 @@ class AudioFlamingo3Encoder(Qwen2AudioEncoder):
|
||||
- 0 for tokens that are **masked**.
|
||||
"""
|
||||
|
||||
# Prepare attention mask for transformer layers
|
||||
batch_size = input_features.shape[0]
|
||||
seq_len = (input_features.shape[-1] - 1) // 2 + 1 # After conv2 downsampling
|
||||
|
||||
input_features_lengths = input_features_mask.sum(-1)
|
||||
input_features_lengths = (input_features_lengths - 1) // 2 + 1 # conv2 downsampling
|
||||
input_features_mask = torch.arange(seq_len, device=input_features.device) < input_features_lengths[:, None]
|
||||
attention_mask = eager_mask(
|
||||
batch_size=batch_size,
|
||||
cache_position=torch.arange(seq_len, device=input_features.device),
|
||||
kv_length=seq_len,
|
||||
mask_function=padding_mask_function(input_features_mask),
|
||||
dtype=self.conv1.weight.dtype,
|
||||
)
|
||||
|
||||
# Conv front-end
|
||||
inputs_embeds = nn.functional.gelu(self.conv1(input_features))
|
||||
@ -97,6 +87,12 @@ class AudioFlamingo3Encoder(Qwen2AudioEncoder):
|
||||
hidden_states = inputs_embeds + self.embed_positions.weight
|
||||
hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
|
||||
|
||||
attention_mask = create_bidirectional_mask(
|
||||
config=self.config,
|
||||
input_embeds=hidden_states,
|
||||
attention_mask=input_features_mask,
|
||||
)
|
||||
|
||||
# Transformer stack
|
||||
for layer in self.layers:
|
||||
drop = self.training and torch.rand([]) < self.layerdrop
|
||||
|
||||
@ -288,8 +288,9 @@ class _BaseAutoModelClass:
|
||||
if is_peft_available():
|
||||
if adapter_kwargs is None:
|
||||
adapter_kwargs = {}
|
||||
if token is not None:
|
||||
adapter_kwargs["token"] = token
|
||||
adapter_kwargs = adapter_kwargs.copy() # avoid mutating original
|
||||
if token is not None:
|
||||
adapter_kwargs["token"] = token
|
||||
|
||||
maybe_adapter_path = find_adapter_config_file(
|
||||
pretrained_model_name_or_path, _commit_hash=commit_hash, **adapter_kwargs
|
||||
|
||||
@ -114,7 +114,6 @@ CONFIG_MAPPING_NAMES = OrderedDict[str, str](
|
||||
("deit", "DeiTConfig"),
|
||||
("depth_anything", "DepthAnythingConfig"),
|
||||
("depth_pro", "DepthProConfig"),
|
||||
("deta", "DetaConfig"),
|
||||
("detr", "DetrConfig"),
|
||||
("dia", "DiaConfig"),
|
||||
("diffllama", "DiffLlamaConfig"),
|
||||
@ -132,7 +131,6 @@ CONFIG_MAPPING_NAMES = OrderedDict[str, str](
|
||||
("edgetam", "EdgeTamConfig"),
|
||||
("edgetam_video", "EdgeTamVideoConfig"),
|
||||
("edgetam_vision_model", "EdgeTamVisionConfig"),
|
||||
("efficientformer", "EfficientFormerConfig"),
|
||||
("efficientloftr", "EfficientLoFTRConfig"),
|
||||
("efficientnet", "EfficientNetConfig"),
|
||||
("electra", "ElectraConfig"),
|
||||
@ -143,7 +141,6 @@ CONFIG_MAPPING_NAMES = OrderedDict[str, str](
|
||||
("ernie", "ErnieConfig"),
|
||||
("ernie4_5", "Ernie4_5Config"),
|
||||
("ernie4_5_moe", "Ernie4_5_MoeConfig"),
|
||||
("ernie_m", "ErnieMConfig"),
|
||||
("esm", "EsmConfig"),
|
||||
("evolla", "EvollaConfig"),
|
||||
("exaone4", "Exaone4Config"),
|
||||
@ -190,14 +187,12 @@ CONFIG_MAPPING_NAMES = OrderedDict[str, str](
|
||||
("gpt_neox_japanese", "GPTNeoXJapaneseConfig"),
|
||||
("gpt_oss", "GptOssConfig"),
|
||||
("gptj", "GPTJConfig"),
|
||||
("gptsan-japanese", "GPTSanJapaneseConfig"),
|
||||
("granite", "GraniteConfig"),
|
||||
("granite_speech", "GraniteSpeechConfig"),
|
||||
("granitemoe", "GraniteMoeConfig"),
|
||||
("granitemoehybrid", "GraniteMoeHybridConfig"),
|
||||
("granitemoeshared", "GraniteMoeSharedConfig"),
|
||||
("granitevision", "LlavaNextConfig"),
|
||||
("graphormer", "GraphormerConfig"),
|
||||
("grounding-dino", "GroundingDinoConfig"),
|
||||
("groupvit", "GroupViTConfig"),
|
||||
("helium", "HeliumConfig"),
|
||||
@ -221,7 +216,6 @@ CONFIG_MAPPING_NAMES = OrderedDict[str, str](
|
||||
("jamba", "JambaConfig"),
|
||||
("janus", "JanusConfig"),
|
||||
("jetmoe", "JetMoeConfig"),
|
||||
("jukebox", "JukeboxConfig"),
|
||||
("kosmos-2", "Kosmos2Config"),
|
||||
("kosmos-2.5", "Kosmos2_5Config"),
|
||||
("kyutai_speech_to_text", "KyutaiSpeechToTextConfig"),
|
||||
@ -257,8 +251,6 @@ CONFIG_MAPPING_NAMES = OrderedDict[str, str](
|
||||
("maskformer", "MaskFormerConfig"),
|
||||
("maskformer-swin", "MaskFormerSwinConfig"),
|
||||
("mbart", "MBartConfig"),
|
||||
("mctct", "MCTCTConfig"),
|
||||
("mega", "MegaConfig"),
|
||||
("megatron-bert", "MegatronBertConfig"),
|
||||
("metaclip_2", "MetaClip2Config"),
|
||||
("mgp-str", "MgpstrConfig"),
|
||||
@ -287,9 +279,7 @@ CONFIG_MAPPING_NAMES = OrderedDict[str, str](
|
||||
("musicgen", "MusicgenConfig"),
|
||||
("musicgen_melody", "MusicgenMelodyConfig"),
|
||||
("mvp", "MvpConfig"),
|
||||
("nat", "NatConfig"),
|
||||
("nemotron", "NemotronConfig"),
|
||||
("nezha", "NezhaConfig"),
|
||||
("nllb-moe", "NllbMoeConfig"),
|
||||
("nougat", "VisionEncoderDecoderConfig"),
|
||||
("nystromformer", "NystromformerConfig"),
|
||||
@ -299,7 +289,6 @@ CONFIG_MAPPING_NAMES = OrderedDict[str, str](
|
||||
("olmoe", "OlmoeConfig"),
|
||||
("omdet-turbo", "OmDetTurboConfig"),
|
||||
("oneformer", "OneFormerConfig"),
|
||||
("open-llama", "OpenLlamaConfig"),
|
||||
("openai-gpt", "OpenAIGPTConfig"),
|
||||
("opt", "OPTConfig"),
|
||||
("ovis2", "Ovis2Config"),
|
||||
@ -328,7 +317,6 @@ CONFIG_MAPPING_NAMES = OrderedDict[str, str](
|
||||
("prophetnet", "ProphetNetConfig"),
|
||||
("pvt", "PvtConfig"),
|
||||
("pvt_v2", "PvtV2Config"),
|
||||
("qdqbert", "QDQBertConfig"),
|
||||
("qwen2", "Qwen2Config"),
|
||||
("qwen2_5_omni", "Qwen2_5OmniConfig"),
|
||||
("qwen2_5_vl", "Qwen2_5_VLConfig"),
|
||||
@ -347,13 +335,11 @@ CONFIG_MAPPING_NAMES = OrderedDict[str, str](
|
||||
("qwen3_vl_moe_text", "Qwen3VLMoeTextConfig"),
|
||||
("qwen3_vl_text", "Qwen3VLTextConfig"),
|
||||
("rag", "RagConfig"),
|
||||
("realm", "RealmConfig"),
|
||||
("recurrent_gemma", "RecurrentGemmaConfig"),
|
||||
("reformer", "ReformerConfig"),
|
||||
("regnet", "RegNetConfig"),
|
||||
("rembert", "RemBertConfig"),
|
||||
("resnet", "ResNetConfig"),
|
||||
("retribert", "RetriBertConfig"),
|
||||
("roberta", "RobertaConfig"),
|
||||
("roberta-prelayernorm", "RobertaPreLayerNormConfig"),
|
||||
("roc_bert", "RoCBertConfig"),
|
||||
@ -387,7 +373,6 @@ CONFIG_MAPPING_NAMES = OrderedDict[str, str](
|
||||
("smolvlm_vision", "SmolVLMVisionConfig"),
|
||||
("speech-encoder-decoder", "SpeechEncoderDecoderConfig"),
|
||||
("speech_to_text", "Speech2TextConfig"),
|
||||
("speech_to_text_2", "Speech2Text2Config"),
|
||||
("speecht5", "SpeechT5Config"),
|
||||
("splinter", "SplinterConfig"),
|
||||
("squeezebert", "SqueezeBertConfig"),
|
||||
@ -410,10 +395,7 @@ CONFIG_MAPPING_NAMES = OrderedDict[str, str](
|
||||
("timesformer", "TimesformerConfig"),
|
||||
("timm_backbone", "TimmBackboneConfig"),
|
||||
("timm_wrapper", "TimmWrapperConfig"),
|
||||
("trajectory_transformer", "TrajectoryTransformerConfig"),
|
||||
("transfo-xl", "TransfoXLConfig"),
|
||||
("trocr", "TrOCRConfig"),
|
||||
("tvlt", "TvltConfig"),
|
||||
("tvp", "TvpConfig"),
|
||||
("udop", "UdopConfig"),
|
||||
("umt5", "UMT5Config"),
|
||||
@ -421,7 +403,6 @@ CONFIG_MAPPING_NAMES = OrderedDict[str, str](
|
||||
("unispeech-sat", "UniSpeechSatConfig"),
|
||||
("univnet", "UnivNetConfig"),
|
||||
("upernet", "UperNetConfig"),
|
||||
("van", "VanConfig"),
|
||||
("vaultgemma", "VaultGemmaConfig"),
|
||||
("video_llama_3", "VideoLlama3Config"),
|
||||
("video_llama_3_vision", "VideoLlama3VisionConfig"),
|
||||
@ -433,7 +414,6 @@ CONFIG_MAPPING_NAMES = OrderedDict[str, str](
|
||||
("vision-text-dual-encoder", "VisionTextDualEncoderConfig"),
|
||||
("visual_bert", "VisualBertConfig"),
|
||||
("vit", "ViTConfig"),
|
||||
("vit_hybrid", "ViTHybridConfig"),
|
||||
("vit_mae", "ViTMAEConfig"),
|
||||
("vit_msn", "ViTMSNConfig"),
|
||||
("vitdet", "VitDetConfig"),
|
||||
@ -454,7 +434,6 @@ CONFIG_MAPPING_NAMES = OrderedDict[str, str](
|
||||
("xcodec", "XcodecConfig"),
|
||||
("xglm", "XGLMConfig"),
|
||||
("xlm", "XLMConfig"),
|
||||
("xlm-prophetnet", "XLMProphetNetConfig"),
|
||||
("xlm-roberta", "XLMRobertaConfig"),
|
||||
("xlm-roberta-xl", "XLMRobertaXLConfig"),
|
||||
("xlnet", "XLNetConfig"),
|
||||
@ -508,7 +487,6 @@ MODEL_NAMES_MAPPING = OrderedDict[str, str](
|
||||
("blip_2_qformer", "BLIP-2 QFormer"),
|
||||
("bloom", "BLOOM"),
|
||||
("blt", "Blt"),
|
||||
("bort", "BORT"),
|
||||
("bridgetower", "BridgeTower"),
|
||||
("bros", "BROS"),
|
||||
("byt5", "ByT5"),
|
||||
@ -560,7 +538,6 @@ MODEL_NAMES_MAPPING = OrderedDict[str, str](
|
||||
("depth_anything", "Depth Anything"),
|
||||
("depth_anything_v2", "Depth Anything V2"),
|
||||
("depth_pro", "DepthPro"),
|
||||
("deta", "DETA"),
|
||||
("detr", "DETR"),
|
||||
("dia", "Dia"),
|
||||
("dialogpt", "DialoGPT"),
|
||||
@ -580,7 +557,6 @@ MODEL_NAMES_MAPPING = OrderedDict[str, str](
|
||||
("edgetam", "EdgeTAM"),
|
||||
("edgetam_video", "EdgeTamVideo"),
|
||||
("edgetam_vision_model", "EdgeTamVisionModel"),
|
||||
("efficientformer", "EfficientFormer"),
|
||||
("efficientloftr", "EfficientLoFTR"),
|
||||
("efficientnet", "EfficientNet"),
|
||||
("electra", "ELECTRA"),
|
||||
@ -591,7 +567,6 @@ MODEL_NAMES_MAPPING = OrderedDict[str, str](
|
||||
("ernie", "ERNIE"),
|
||||
("ernie4_5", "Ernie4_5"),
|
||||
("ernie4_5_moe", "Ernie4_5_MoE"),
|
||||
("ernie_m", "ErnieM"),
|
||||
("esm", "ESM"),
|
||||
("evolla", "Evolla"),
|
||||
("exaone4", "EXAONE-4.0"),
|
||||
@ -641,14 +616,12 @@ MODEL_NAMES_MAPPING = OrderedDict[str, str](
|
||||
("gpt_neox_japanese", "GPT NeoX Japanese"),
|
||||
("gpt_oss", "GptOss"),
|
||||
("gptj", "GPT-J"),
|
||||
("gptsan-japanese", "GPTSAN-japanese"),
|
||||
("granite", "Granite"),
|
||||
("granite_speech", "GraniteSpeech"),
|
||||
("granitemoe", "GraniteMoeMoe"),
|
||||
("granitemoehybrid", "GraniteMoeHybrid"),
|
||||
("granitemoeshared", "GraniteMoeSharedMoe"),
|
||||
("granitevision", "LLaVA-NeXT"),
|
||||
("graphormer", "Graphormer"),
|
||||
("grounding-dino", "Grounding DINO"),
|
||||
("groupvit", "GroupViT"),
|
||||
("helium", "Helium"),
|
||||
@ -673,7 +646,6 @@ MODEL_NAMES_MAPPING = OrderedDict[str, str](
|
||||
("jamba", "Jamba"),
|
||||
("janus", "Janus"),
|
||||
("jetmoe", "JetMoe"),
|
||||
("jukebox", "Jukebox"),
|
||||
("kosmos-2", "KOSMOS-2"),
|
||||
("kosmos-2.5", "KOSMOS-2.5"),
|
||||
("kyutai_speech_to_text", "KyutaiSpeechToText"),
|
||||
@ -714,8 +686,6 @@ MODEL_NAMES_MAPPING = OrderedDict[str, str](
|
||||
("matcha", "MatCha"),
|
||||
("mbart", "mBART"),
|
||||
("mbart50", "mBART-50"),
|
||||
("mctct", "M-CTC-T"),
|
||||
("mega", "MEGA"),
|
||||
("megatron-bert", "Megatron-BERT"),
|
||||
("megatron_gpt2", "Megatron-GPT2"),
|
||||
("metaclip_2", "MetaCLIP 2"),
|
||||
@ -748,9 +718,7 @@ MODEL_NAMES_MAPPING = OrderedDict[str, str](
|
||||
("musicgen_melody", "MusicGen Melody"),
|
||||
("mvp", "MVP"),
|
||||
("myt5", "myt5"),
|
||||
("nat", "NAT"),
|
||||
("nemotron", "Nemotron"),
|
||||
("nezha", "Nezha"),
|
||||
("nllb", "NLLB"),
|
||||
("nllb-moe", "NLLB-MOE"),
|
||||
("nougat", "Nougat"),
|
||||
@ -761,7 +729,6 @@ MODEL_NAMES_MAPPING = OrderedDict[str, str](
|
||||
("olmoe", "OLMoE"),
|
||||
("omdet-turbo", "OmDet-Turbo"),
|
||||
("oneformer", "OneFormer"),
|
||||
("open-llama", "OpenLlama"),
|
||||
("openai-gpt", "OpenAI GPT"),
|
||||
("opt", "OPT"),
|
||||
("ovis2", "Ovis2"),
|
||||
@ -792,7 +759,6 @@ MODEL_NAMES_MAPPING = OrderedDict[str, str](
|
||||
("prophetnet", "ProphetNet"),
|
||||
("pvt", "PVT"),
|
||||
("pvt_v2", "PVTv2"),
|
||||
("qdqbert", "QDQBert"),
|
||||
("qwen2", "Qwen2"),
|
||||
("qwen2_5_omni", "Qwen2_5Omni"),
|
||||
("qwen2_5_vl", "Qwen2_5_VL"),
|
||||
@ -811,13 +777,11 @@ MODEL_NAMES_MAPPING = OrderedDict[str, str](
|
||||
("qwen3_vl_moe_text", "Qwen3VLMoe"),
|
||||
("qwen3_vl_text", "Qwen3VL"),
|
||||
("rag", "RAG"),
|
||||
("realm", "REALM"),
|
||||
("recurrent_gemma", "RecurrentGemma"),
|
||||
("reformer", "Reformer"),
|
||||
("regnet", "RegNet"),
|
||||
("rembert", "RemBERT"),
|
||||
("resnet", "ResNet"),
|
||||
("retribert", "RetriBERT"),
|
||||
("roberta", "RoBERTa"),
|
||||
("roberta-prelayernorm", "RoBERTa-PreLayerNorm"),
|
||||
("roc_bert", "RoCBert"),
|
||||
@ -851,7 +815,6 @@ MODEL_NAMES_MAPPING = OrderedDict[str, str](
|
||||
("smolvlm_vision", "SmolVLMVisionTransformer"),
|
||||
("speech-encoder-decoder", "Speech Encoder decoder"),
|
||||
("speech_to_text", "Speech2Text"),
|
||||
("speech_to_text_2", "Speech2Text2"),
|
||||
("speecht5", "SpeechT5"),
|
||||
("splinter", "Splinter"),
|
||||
("squeezebert", "SqueezeBERT"),
|
||||
@ -869,17 +832,13 @@ MODEL_NAMES_MAPPING = OrderedDict[str, str](
|
||||
("t5v1.1", "T5v1.1"),
|
||||
("table-transformer", "Table Transformer"),
|
||||
("tapas", "TAPAS"),
|
||||
("tapex", "TAPEX"),
|
||||
("textnet", "TextNet"),
|
||||
("time_series_transformer", "Time Series Transformer"),
|
||||
("timesfm", "TimesFm"),
|
||||
("timesformer", "TimeSformer"),
|
||||
("timm_backbone", "TimmBackbone"),
|
||||
("timm_wrapper", "TimmWrapperModel"),
|
||||
("trajectory_transformer", "Trajectory Transformer"),
|
||||
("transfo-xl", "Transformer-XL"),
|
||||
("trocr", "TrOCR"),
|
||||
("tvlt", "TVLT"),
|
||||
("tvp", "TVP"),
|
||||
("udop", "UDOP"),
|
||||
("ul2", "UL2"),
|
||||
@ -888,7 +847,6 @@ MODEL_NAMES_MAPPING = OrderedDict[str, str](
|
||||
("unispeech-sat", "UniSpeechSat"),
|
||||
("univnet", "UnivNet"),
|
||||
("upernet", "UPerNet"),
|
||||
("van", "VAN"),
|
||||
("vaultgemma", "VaultGemma"),
|
||||
("video_llama_3", "VideoLlama3"),
|
||||
("video_llama_3_vision", "VideoLlama3Vision"),
|
||||
@ -900,7 +858,6 @@ MODEL_NAMES_MAPPING = OrderedDict[str, str](
|
||||
("vision-text-dual-encoder", "VisionTextDualEncoder"),
|
||||
("visual_bert", "VisualBERT"),
|
||||
("vit", "ViT"),
|
||||
("vit_hybrid", "ViT Hybrid"),
|
||||
("vit_mae", "ViTMAE"),
|
||||
("vit_msn", "ViTMSN"),
|
||||
("vitdet", "VitDet"),
|
||||
@ -922,7 +879,6 @@ MODEL_NAMES_MAPPING = OrderedDict[str, str](
|
||||
("xcodec", "X-CODEC"),
|
||||
("xglm", "XGLM"),
|
||||
("xlm", "XLM"),
|
||||
("xlm-prophetnet", "XLM-ProphetNet"),
|
||||
("xlm-roberta", "XLM-RoBERTa"),
|
||||
("xlm-roberta-xl", "XLM-RoBERTa-XL"),
|
||||
("xlm-v", "XLM-V"),
|
||||
@ -941,32 +897,7 @@ MODEL_NAMES_MAPPING = OrderedDict[str, str](
|
||||
|
||||
# This is tied to the processing `-` -> `_` in `model_type_to_module_name`. For example, instead of putting
|
||||
# `transfo-xl` (as in `CONFIG_MAPPING_NAMES`), we should use `transfo_xl`.
|
||||
DEPRECATED_MODELS = [
|
||||
"bort",
|
||||
"deta",
|
||||
"efficientformer",
|
||||
"ernie_m",
|
||||
"gptsan_japanese",
|
||||
"graphormer",
|
||||
"jukebox",
|
||||
"mctct",
|
||||
"mega",
|
||||
"mmbt",
|
||||
"nat",
|
||||
"nezha",
|
||||
"open_llama",
|
||||
"qdqbert",
|
||||
"realm",
|
||||
"retribert",
|
||||
"speech_to_text_2",
|
||||
"tapex",
|
||||
"trajectory_transformer",
|
||||
"transfo_xl",
|
||||
"tvlt",
|
||||
"van",
|
||||
"vit_hybrid",
|
||||
"xlm_prophetnet",
|
||||
]
|
||||
DEPRECATED_MODELS = []
|
||||
|
||||
SPECIAL_MODEL_TYPE_TO_MODULE_NAME = OrderedDict[str, str](
|
||||
[
|
||||
|
||||
@ -50,7 +50,6 @@ FEATURE_EXTRACTOR_MAPPING_NAMES = OrderedDict(
|
||||
("hubert", "Wav2Vec2FeatureExtractor"),
|
||||
("kyutai_speech_to_text", "KyutaiSpeechToTextFeatureExtractor"),
|
||||
("markuplm", "MarkupLMFeatureExtractor"),
|
||||
("mctct", "MCTCTFeatureExtractor"),
|
||||
("mimi", "EncodecFeatureExtractor"),
|
||||
("moonshine", "Wav2Vec2FeatureExtractor"),
|
||||
("moshi", "EncodecFeatureExtractor"),
|
||||
|
||||
@ -89,7 +89,6 @@ else:
|
||||
("deit", ("DeiTImageProcessor", "DeiTImageProcessorFast")),
|
||||
("depth_anything", ("DPTImageProcessor", "DPTImageProcessorFast")),
|
||||
("depth_pro", ("DepthProImageProcessor", "DepthProImageProcessorFast")),
|
||||
("deta", ("DetaImageProcessor", None)),
|
||||
("detr", ("DetrImageProcessor", "DetrImageProcessorFast")),
|
||||
("dinat", ("ViTImageProcessor", "ViTImageProcessorFast")),
|
||||
("dinov2", ("BitImageProcessor", "BitImageProcessorFast")),
|
||||
@ -97,7 +96,6 @@ else:
|
||||
("donut-swin", ("DonutImageProcessor", "DonutImageProcessorFast")),
|
||||
("dpt", ("DPTImageProcessor", "DPTImageProcessorFast")),
|
||||
("edgetam", (None, "Sam2ImageProcessorFast")),
|
||||
("efficientformer", ("EfficientFormerImageProcessor", None)),
|
||||
("efficientloftr", ("EfficientLoFTRImageProcessor", "EfficientLoFTRImageProcessorFast")),
|
||||
("efficientnet", ("EfficientNetImageProcessor", "EfficientNetImageProcessorFast")),
|
||||
("emu3", ("Emu3ImageProcessor", None)),
|
||||
@ -149,7 +147,6 @@ else:
|
||||
("mobilenet_v2", ("MobileNetV2ImageProcessor", "MobileNetV2ImageProcessorFast")),
|
||||
("mobilevit", ("MobileViTImageProcessor", "MobileViTImageProcessorFast")),
|
||||
("mobilevitv2", ("MobileViTImageProcessor", "MobileViTImageProcessorFast")),
|
||||
("nat", ("ViTImageProcessor", "ViTImageProcessorFast")),
|
||||
("nougat", ("NougatImageProcessor", "NougatImageProcessorFast")),
|
||||
("omdet-turbo", ("DetrImageProcessor", "DetrImageProcessorFast")),
|
||||
("oneformer", ("OneFormerImageProcessor", "OneFormerImageProcessorFast")),
|
||||
@ -195,18 +192,15 @@ else:
|
||||
("timesformer", ("VideoMAEImageProcessor", None)),
|
||||
("timm_wrapper", ("TimmWrapperImageProcessor", None)),
|
||||
("trocr", ("ViTImageProcessor", "ViTImageProcessorFast")),
|
||||
("tvlt", ("TvltImageProcessor", None)),
|
||||
("tvp", ("TvpImageProcessor", "TvpImageProcessorFast")),
|
||||
("udop", ("LayoutLMv3ImageProcessor", "LayoutLMv3ImageProcessorFast")),
|
||||
("upernet", ("SegformerImageProcessor", "SegformerImageProcessorFast")),
|
||||
("van", ("ConvNextImageProcessor", "ConvNextImageProcessorFast")),
|
||||
("video_llama_3", ("VideoLlama3ImageProcessor", "VideoLlama3ImageProcessorFast")),
|
||||
("video_llava", ("VideoLlavaImageProcessor", None)),
|
||||
("videomae", ("VideoMAEImageProcessor", None)),
|
||||
("vilt", ("ViltImageProcessor", "ViltImageProcessorFast")),
|
||||
("vipllava", ("CLIPImageProcessor", "CLIPImageProcessorFast")),
|
||||
("vit", ("ViTImageProcessor", "ViTImageProcessorFast")),
|
||||
("vit_hybrid", ("ViTHybridImageProcessor", None)),
|
||||
("vit_mae", ("ViTImageProcessor", "ViTImageProcessorFast")),
|
||||
("vit_msn", ("ViTImageProcessor", "ViTImageProcessorFast")),
|
||||
("vitmatte", ("VitMatteImageProcessor", "VitMatteImageProcessorFast")),
|
||||
|
||||
@ -119,7 +119,6 @@ MODEL_MAPPING_NAMES = OrderedDict(
|
||||
("deformable_detr", "DeformableDetrModel"),
|
||||
("deit", "DeiTModel"),
|
||||
("depth_pro", "DepthProModel"),
|
||||
("deta", "DetaModel"),
|
||||
("detr", "DetrModel"),
|
||||
("dia", "DiaModel"),
|
||||
("diffllama", "DiffLlamaModel"),
|
||||
@ -137,7 +136,6 @@ MODEL_MAPPING_NAMES = OrderedDict(
|
||||
("edgetam", "EdgeTamModel"),
|
||||
("edgetam_video", "EdgeTamVideoModel"),
|
||||
("edgetam_vision_model", "EdgeTamVisionModel"),
|
||||
("efficientformer", "EfficientFormerModel"),
|
||||
("efficientloftr", "EfficientLoFTRModel"),
|
||||
("efficientnet", "EfficientNetModel"),
|
||||
("electra", "ElectraModel"),
|
||||
@ -146,7 +144,6 @@ MODEL_MAPPING_NAMES = OrderedDict(
|
||||
("ernie", "ErnieModel"),
|
||||
("ernie4_5", "Ernie4_5Model"),
|
||||
("ernie4_5_moe", "Ernie4_5_MoeModel"),
|
||||
("ernie_m", "ErnieMModel"),
|
||||
("esm", "EsmModel"),
|
||||
("evolla", "EvollaModel"),
|
||||
("exaone4", "Exaone4Model"),
|
||||
@ -193,12 +190,10 @@ MODEL_MAPPING_NAMES = OrderedDict(
|
||||
("gpt_neox_japanese", "GPTNeoXJapaneseModel"),
|
||||
("gpt_oss", "GptOssModel"),
|
||||
("gptj", "GPTJModel"),
|
||||
("gptsan-japanese", "GPTSanJapaneseForConditionalGeneration"),
|
||||
("granite", "GraniteModel"),
|
||||
("granitemoe", "GraniteMoeModel"),
|
||||
("granitemoehybrid", "GraniteMoeHybridModel"),
|
||||
("granitemoeshared", "GraniteMoeSharedModel"),
|
||||
("graphormer", "GraphormerModel"),
|
||||
("grounding-dino", "GroundingDinoModel"),
|
||||
("groupvit", "GroupViTModel"),
|
||||
("helium", "HeliumModel"),
|
||||
@ -222,7 +217,6 @@ MODEL_MAPPING_NAMES = OrderedDict(
|
||||
("jamba", "JambaModel"),
|
||||
("janus", "JanusModel"),
|
||||
("jetmoe", "JetMoeModel"),
|
||||
("jukebox", "JukeboxModel"),
|
||||
("kosmos-2", "Kosmos2Model"),
|
||||
("kosmos-2.5", "Kosmos2_5Model"),
|
||||
("kyutai_speech_to_text", "KyutaiSpeechToTextModel"),
|
||||
@ -257,8 +251,6 @@ MODEL_MAPPING_NAMES = OrderedDict(
|
||||
("maskformer", "MaskFormerModel"),
|
||||
("maskformer-swin", "MaskFormerSwinModel"),
|
||||
("mbart", "MBartModel"),
|
||||
("mctct", "MCTCTModel"),
|
||||
("mega", "MegaModel"),
|
||||
("megatron-bert", "MegatronBertModel"),
|
||||
("metaclip_2", "MetaClip2Model"),
|
||||
("mgp-str", "MgpstrForSceneTextRecognition"),
|
||||
@ -287,9 +279,7 @@ MODEL_MAPPING_NAMES = OrderedDict(
|
||||
("musicgen", "MusicgenModel"),
|
||||
("musicgen_melody", "MusicgenMelodyModel"),
|
||||
("mvp", "MvpModel"),
|
||||
("nat", "NatModel"),
|
||||
("nemotron", "NemotronModel"),
|
||||
("nezha", "NezhaModel"),
|
||||
("nllb-moe", "NllbMoeModel"),
|
||||
("nystromformer", "NystromformerModel"),
|
||||
("olmo", "OlmoModel"),
|
||||
@ -298,7 +288,6 @@ MODEL_MAPPING_NAMES = OrderedDict(
|
||||
("olmoe", "OlmoeModel"),
|
||||
("omdet-turbo", "OmDetTurboForObjectDetection"),
|
||||
("oneformer", "OneFormerModel"),
|
||||
("open-llama", "OpenLlamaModel"),
|
||||
("openai-gpt", "OpenAIGPTModel"),
|
||||
("opt", "OPTModel"),
|
||||
("ovis2", "Ovis2Model"),
|
||||
@ -324,7 +313,6 @@ MODEL_MAPPING_NAMES = OrderedDict(
|
||||
("prophetnet", "ProphetNetModel"),
|
||||
("pvt", "PvtModel"),
|
||||
("pvt_v2", "PvtV2Model"),
|
||||
("qdqbert", "QDQBertModel"),
|
||||
("qwen2", "Qwen2Model"),
|
||||
("qwen2_5_vl", "Qwen2_5_VLModel"),
|
||||
("qwen2_5_vl_text", "Qwen2_5_VLTextModel"),
|
||||
@ -344,7 +332,6 @@ MODEL_MAPPING_NAMES = OrderedDict(
|
||||
("regnet", "RegNetModel"),
|
||||
("rembert", "RemBertModel"),
|
||||
("resnet", "ResNetModel"),
|
||||
("retribert", "RetriBertModel"),
|
||||
("roberta", "RobertaModel"),
|
||||
("roberta-prelayernorm", "RobertaPreLayerNormModel"),
|
||||
("roc_bert", "RoCBertModel"),
|
||||
@ -395,16 +382,12 @@ MODEL_MAPPING_NAMES = OrderedDict(
|
||||
("timesformer", "TimesformerModel"),
|
||||
("timm_backbone", "TimmBackbone"),
|
||||
("timm_wrapper", "TimmWrapperModel"),
|
||||
("trajectory_transformer", "TrajectoryTransformerModel"),
|
||||
("transfo-xl", "TransfoXLModel"),
|
||||
("tvlt", "TvltModel"),
|
||||
("tvp", "TvpModel"),
|
||||
("udop", "UdopModel"),
|
||||
("umt5", "UMT5Model"),
|
||||
("unispeech", "UniSpeechModel"),
|
||||
("unispeech-sat", "UniSpeechSatModel"),
|
||||
("univnet", "UnivNetModel"),
|
||||
("van", "VanModel"),
|
||||
("vaultgemma", "VaultGemmaModel"),
|
||||
("video_llama_3", "VideoLlama3Model"),
|
||||
("video_llama_3_vision", "VideoLlama3VisionModel"),
|
||||
@ -415,7 +398,6 @@ MODEL_MAPPING_NAMES = OrderedDict(
|
||||
("vision-text-dual-encoder", "VisionTextDualEncoderModel"),
|
||||
("visual_bert", "VisualBertModel"),
|
||||
("vit", "ViTModel"),
|
||||
("vit_hybrid", "ViTHybridModel"),
|
||||
("vit_mae", "ViTMAEModel"),
|
||||
("vit_msn", "ViTMSNModel"),
|
||||
("vitdet", "VitDetModel"),
|
||||
@ -433,7 +415,6 @@ MODEL_MAPPING_NAMES = OrderedDict(
|
||||
("xcodec", "XcodecModel"),
|
||||
("xglm", "XGLMModel"),
|
||||
("xlm", "XLMModel"),
|
||||
("xlm-prophetnet", "XLMProphetNetModel"),
|
||||
("xlm-roberta", "XLMRobertaModel"),
|
||||
("xlm-roberta-xl", "XLMRobertaXLModel"),
|
||||
("xlnet", "XLNetModel"),
|
||||
@ -478,7 +459,6 @@ MODEL_FOR_PRETRAINING_MAPPING_NAMES = OrderedDict(
|
||||
("gpt-sw3", "GPT2LMHeadModel"),
|
||||
("gpt2", "GPT2LMHeadModel"),
|
||||
("gpt_bigcode", "GPTBigCodeForCausalLM"),
|
||||
("gptsan-japanese", "GPTSanJapaneseForConditionalGeneration"),
|
||||
("hiera", "HieraForPreTraining"),
|
||||
("ibert", "IBertForMaskedLM"),
|
||||
("idefics", "IdeficsForVisionText2Text"),
|
||||
@ -495,7 +475,6 @@ MODEL_FOR_PRETRAINING_MAPPING_NAMES = OrderedDict(
|
||||
("lxmert", "LxmertForPreTraining"),
|
||||
("mamba", "MambaForCausalLM"),
|
||||
("mamba2", "Mamba2ForCausalLM"),
|
||||
("mega", "MegaForMaskedLM"),
|
||||
("megatron-bert", "MegatronBertForPreTraining"),
|
||||
("mistral3", "Mistral3ForConditionalGeneration"),
|
||||
("mllama", "MllamaForConditionalGeneration"),
|
||||
@ -504,12 +483,10 @@ MODEL_FOR_PRETRAINING_MAPPING_NAMES = OrderedDict(
|
||||
("mpt", "MptForCausalLM"),
|
||||
("mra", "MraForMaskedLM"),
|
||||
("mvp", "MvpForConditionalGeneration"),
|
||||
("nezha", "NezhaForPreTraining"),
|
||||
("nllb-moe", "NllbMoeForConditionalGeneration"),
|
||||
("openai-gpt", "OpenAIGPTLMHeadModel"),
|
||||
("paligemma", "PaliGemmaForConditionalGeneration"),
|
||||
("qwen2_audio", "Qwen2AudioForConditionalGeneration"),
|
||||
("retribert", "RetriBertModel"),
|
||||
("roberta", "RobertaForMaskedLM"),
|
||||
("roberta-prelayernorm", "RobertaPreLayerNormForMaskedLM"),
|
||||
("roc_bert", "RoCBertForPreTraining"),
|
||||
@ -520,8 +497,6 @@ MODEL_FOR_PRETRAINING_MAPPING_NAMES = OrderedDict(
|
||||
("t5", "T5ForConditionalGeneration"),
|
||||
("t5gemma", "T5GemmaForConditionalGeneration"),
|
||||
("tapas", "TapasForMaskedLM"),
|
||||
("transfo-xl", "TransfoXLLMHeadModel"),
|
||||
("tvlt", "TvltForPreTraining"),
|
||||
("unispeech", "UniSpeechForPreTraining"),
|
||||
("unispeech-sat", "UniSpeechSatForPreTraining"),
|
||||
("video_llava", "VideoLlavaForConditionalGeneration"),
|
||||
@ -579,7 +554,6 @@ MODEL_WITH_LM_HEAD_MAPPING_NAMES = OrderedDict(
|
||||
("gpt_neox", "GPTNeoXForCausalLM"),
|
||||
("gpt_neox_japanese", "GPTNeoXJapaneseForCausalLM"),
|
||||
("gptj", "GPTJForCausalLM"),
|
||||
("gptsan-japanese", "GPTSanJapaneseForConditionalGeneration"),
|
||||
("ibert", "IBertForMaskedLM"),
|
||||
("layoutlm", "LayoutLMForMaskedLM"),
|
||||
("led", "LEDForConditionalGeneration"),
|
||||
@ -590,7 +564,6 @@ MODEL_WITH_LM_HEAD_MAPPING_NAMES = OrderedDict(
|
||||
("mamba", "MambaForCausalLM"),
|
||||
("mamba2", "Mamba2ForCausalLM"),
|
||||
("marian", "MarianMTModel"),
|
||||
("mega", "MegaForMaskedLM"),
|
||||
("megatron-bert", "MegatronBertForCausalLM"),
|
||||
("mobilebert", "MobileBertForMaskedLM"),
|
||||
("moonshine", "MoonshineForConditionalGeneration"),
|
||||
@ -598,14 +571,12 @@ MODEL_WITH_LM_HEAD_MAPPING_NAMES = OrderedDict(
|
||||
("mpt", "MptForCausalLM"),
|
||||
("mra", "MraForMaskedLM"),
|
||||
("mvp", "MvpForConditionalGeneration"),
|
||||
("nezha", "NezhaForMaskedLM"),
|
||||
("nllb-moe", "NllbMoeForConditionalGeneration"),
|
||||
("nystromformer", "NystromformerForMaskedLM"),
|
||||
("openai-gpt", "OpenAIGPTLMHeadModel"),
|
||||
("pegasus_x", "PegasusXForConditionalGeneration"),
|
||||
("plbart", "PLBartForConditionalGeneration"),
|
||||
("pop2piano", "Pop2PianoForConditionalGeneration"),
|
||||
("qdqbert", "QDQBertForMaskedLM"),
|
||||
("reformer", "ReformerModelWithLMHead"),
|
||||
("rembert", "RemBertForMaskedLM"),
|
||||
("roberta", "RobertaForMaskedLM"),
|
||||
@ -619,7 +590,6 @@ MODEL_WITH_LM_HEAD_MAPPING_NAMES = OrderedDict(
|
||||
("t5", "T5ForConditionalGeneration"),
|
||||
("t5gemma", "T5GemmaForConditionalGeneration"),
|
||||
("tapas", "TapasForMaskedLM"),
|
||||
("transfo-xl", "TransfoXLLMHeadModel"),
|
||||
("wav2vec2", "Wav2Vec2ForMaskedLM"),
|
||||
("whisper", "WhisperForConditionalGeneration"),
|
||||
("xlm", "XLMWithLMHeadModel"),
|
||||
@ -713,7 +683,6 @@ MODEL_FOR_CAUSAL_LM_MAPPING_NAMES = OrderedDict(
|
||||
("mamba2", "Mamba2ForCausalLM"),
|
||||
("marian", "MarianForCausalLM"),
|
||||
("mbart", "MBartForCausalLM"),
|
||||
("mega", "MegaForCausalLM"),
|
||||
("megatron-bert", "MegatronBertForCausalLM"),
|
||||
("minimax", "MiniMaxForCausalLM"),
|
||||
("ministral", "MinistralForCausalLM"),
|
||||
@ -731,7 +700,6 @@ MODEL_FOR_CAUSAL_LM_MAPPING_NAMES = OrderedDict(
|
||||
("olmo2", "Olmo2ForCausalLM"),
|
||||
("olmo3", "Olmo3ForCausalLM"),
|
||||
("olmoe", "OlmoeForCausalLM"),
|
||||
("open-llama", "OpenLlamaForCausalLM"),
|
||||
("openai-gpt", "OpenAIGPTLMHeadModel"),
|
||||
("opt", "OPTForCausalLM"),
|
||||
("pegasus", "PegasusForCausalLM"),
|
||||
@ -742,7 +710,6 @@ MODEL_FOR_CAUSAL_LM_MAPPING_NAMES = OrderedDict(
|
||||
("phimoe", "PhimoeForCausalLM"),
|
||||
("plbart", "PLBartForCausalLM"),
|
||||
("prophetnet", "ProphetNetForCausalLM"),
|
||||
("qdqbert", "QDQBertLMHeadModel"),
|
||||
("qwen2", "Qwen2ForCausalLM"),
|
||||
("qwen2_moe", "Qwen2MoeForCausalLM"),
|
||||
("qwen3", "Qwen3ForCausalLM"),
|
||||
@ -758,16 +725,13 @@ MODEL_FOR_CAUSAL_LM_MAPPING_NAMES = OrderedDict(
|
||||
("rwkv", "RwkvForCausalLM"),
|
||||
("seed_oss", "SeedOssForCausalLM"),
|
||||
("smollm3", "SmolLM3ForCausalLM"),
|
||||
("speech_to_text_2", "Speech2Text2ForCausalLM"),
|
||||
("stablelm", "StableLmForCausalLM"),
|
||||
("starcoder2", "Starcoder2ForCausalLM"),
|
||||
("transfo-xl", "TransfoXLLMHeadModel"),
|
||||
("trocr", "TrOCRForCausalLM"),
|
||||
("vaultgemma", "VaultGemmaForCausalLM"),
|
||||
("whisper", "WhisperForCausalLM"),
|
||||
("xglm", "XGLMForCausalLM"),
|
||||
("xlm", "XLMWithLMHeadModel"),
|
||||
("xlm-prophetnet", "XLMProphetNetForCausalLM"),
|
||||
("xlm-roberta", "XLMRobertaForCausalLM"),
|
||||
("xlm-roberta-xl", "XLMRobertaXLForCausalLM"),
|
||||
("xlnet", "XLNetLMHeadModel"),
|
||||
@ -793,7 +757,6 @@ MODEL_FOR_IMAGE_MAPPING_NAMES = OrderedDict(
|
||||
("deformable_detr", "DeformableDetrModel"),
|
||||
("deit", "DeiTModel"),
|
||||
("depth_pro", "DepthProModel"),
|
||||
("deta", "DetaModel"),
|
||||
("detr", "DetrModel"),
|
||||
("dinat", "DinatModel"),
|
||||
("dinov2", "Dinov2Model"),
|
||||
@ -801,7 +764,6 @@ MODEL_FOR_IMAGE_MAPPING_NAMES = OrderedDict(
|
||||
("dinov3_convnext", "DINOv3ConvNextModel"),
|
||||
("dinov3_vit", "DINOv3ViTModel"),
|
||||
("dpt", "DPTModel"),
|
||||
("efficientformer", "EfficientFormerModel"),
|
||||
("efficientnet", "EfficientNetModel"),
|
||||
("focalnet", "FocalNetModel"),
|
||||
("glpn", "GLPNModel"),
|
||||
@ -816,7 +778,6 @@ MODEL_FOR_IMAGE_MAPPING_NAMES = OrderedDict(
|
||||
("mobilenet_v2", "MobileNetV2Model"),
|
||||
("mobilevit", "MobileViTModel"),
|
||||
("mobilevitv2", "MobileViTV2Model"),
|
||||
("nat", "NatModel"),
|
||||
("poolformer", "PoolFormerModel"),
|
||||
("pvt", "PvtModel"),
|
||||
("regnet", "RegNetModel"),
|
||||
@ -831,10 +792,8 @@ MODEL_FOR_IMAGE_MAPPING_NAMES = OrderedDict(
|
||||
("timesformer", "TimesformerModel"),
|
||||
("timm_backbone", "TimmBackbone"),
|
||||
("timm_wrapper", "TimmWrapperModel"),
|
||||
("van", "VanModel"),
|
||||
("videomae", "VideoMAEModel"),
|
||||
("vit", "ViTModel"),
|
||||
("vit_hybrid", "ViTHybridModel"),
|
||||
("vit_mae", "ViTMAEModel"),
|
||||
("vit_msn", "ViTMSNModel"),
|
||||
("vitdet", "VitDetModel"),
|
||||
@ -879,13 +838,6 @@ MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES = OrderedDict(
|
||||
("dinov2", "Dinov2ForImageClassification"),
|
||||
("dinov2_with_registers", "Dinov2WithRegistersForImageClassification"),
|
||||
("donut-swin", "DonutSwinForImageClassification"),
|
||||
(
|
||||
"efficientformer",
|
||||
(
|
||||
"EfficientFormerForImageClassification",
|
||||
"EfficientFormerForImageClassificationWithTeacher",
|
||||
),
|
||||
),
|
||||
("efficientnet", "EfficientNetForImageClassification"),
|
||||
("focalnet", "FocalNetForImageClassification"),
|
||||
("hgnet_v2", "HGNetV2ForImageClassification"),
|
||||
@ -901,7 +853,6 @@ MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES = OrderedDict(
|
||||
("mobilenet_v2", "MobileNetV2ForImageClassification"),
|
||||
("mobilevit", "MobileViTForImageClassification"),
|
||||
("mobilevitv2", "MobileViTV2ForImageClassification"),
|
||||
("nat", "NatForImageClassification"),
|
||||
(
|
||||
"perceiver",
|
||||
(
|
||||
@ -924,9 +875,7 @@ MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES = OrderedDict(
|
||||
("swinv2", "Swinv2ForImageClassification"),
|
||||
("textnet", "TextNetForImageClassification"),
|
||||
("timm_wrapper", "TimmWrapperForImageClassification"),
|
||||
("van", "VanForImageClassification"),
|
||||
("vit", "ViTForImageClassification"),
|
||||
("vit_hybrid", "ViTHybridForImageClassification"),
|
||||
("vit_msn", "ViTMSNForImageClassification"),
|
||||
]
|
||||
)
|
||||
@ -1097,17 +1046,14 @@ MODEL_FOR_MASKED_LM_MAPPING_NAMES = OrderedDict(
|
||||
("longformer", "LongformerForMaskedLM"),
|
||||
("luke", "LukeForMaskedLM"),
|
||||
("mbart", "MBartForConditionalGeneration"),
|
||||
("mega", "MegaForMaskedLM"),
|
||||
("megatron-bert", "MegatronBertForMaskedLM"),
|
||||
("mobilebert", "MobileBertForMaskedLM"),
|
||||
("modernbert", "ModernBertForMaskedLM"),
|
||||
("mpnet", "MPNetForMaskedLM"),
|
||||
("mra", "MraForMaskedLM"),
|
||||
("mvp", "MvpForConditionalGeneration"),
|
||||
("nezha", "NezhaForMaskedLM"),
|
||||
("nystromformer", "NystromformerForMaskedLM"),
|
||||
("perceiver", "PerceiverForMaskedLM"),
|
||||
("qdqbert", "QDQBertForMaskedLM"),
|
||||
("reformer", "ReformerForMaskedLM"),
|
||||
("rembert", "RemBertForMaskedLM"),
|
||||
("roberta", "RobertaForMaskedLM"),
|
||||
@ -1132,7 +1078,6 @@ MODEL_FOR_OBJECT_DETECTION_MAPPING_NAMES = OrderedDict(
|
||||
("d_fine", "DFineForObjectDetection"),
|
||||
("dab-detr", "DabDetrForObjectDetection"),
|
||||
("deformable_detr", "DeformableDetrForObjectDetection"),
|
||||
("deta", "DetaForObjectDetection"),
|
||||
("detr", "DetrForObjectDetection"),
|
||||
("rt_detr", "RTDetrForObjectDetection"),
|
||||
("rt_detr_v2", "RTDetrV2ForObjectDetection"),
|
||||
@ -1173,7 +1118,6 @@ MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES = OrderedDict(
|
||||
("blenderbot-small", "BlenderbotSmallForConditionalGeneration"),
|
||||
("encoder-decoder", "EncoderDecoderModel"),
|
||||
("fsmt", "FSMTForConditionalGeneration"),
|
||||
("gptsan-japanese", "GPTSanJapaneseForConditionalGeneration"),
|
||||
("granite_speech", "GraniteSpeechForConditionalGeneration"),
|
||||
("led", "LEDForConditionalGeneration"),
|
||||
("longt5", "LongT5ForConditionalGeneration"),
|
||||
@ -1195,7 +1139,6 @@ MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES = OrderedDict(
|
||||
("t5gemma", "T5GemmaForConditionalGeneration"),
|
||||
("umt5", "UMT5ForConditionalGeneration"),
|
||||
("voxtral", "VoxtralForConditionalGeneration"),
|
||||
("xlm-prophetnet", "XLMProphetNetForConditionalGeneration"),
|
||||
]
|
||||
)
|
||||
|
||||
@ -1241,7 +1184,6 @@ MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES = OrderedDict(
|
||||
("doge", "DogeForSequenceClassification"),
|
||||
("electra", "ElectraForSequenceClassification"),
|
||||
("ernie", "ErnieForSequenceClassification"),
|
||||
("ernie_m", "ErnieMForSequenceClassification"),
|
||||
("esm", "EsmForSequenceClassification"),
|
||||
("exaone4", "Exaone4ForSequenceClassification"),
|
||||
("falcon", "FalconForSequenceClassification"),
|
||||
@ -1277,7 +1219,6 @@ MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES = OrderedDict(
|
||||
("luke", "LukeForSequenceClassification"),
|
||||
("markuplm", "MarkupLMForSequenceClassification"),
|
||||
("mbart", "MBartForSequenceClassification"),
|
||||
("mega", "MegaForSequenceClassification"),
|
||||
("megatron-bert", "MegatronBertForSequenceClassification"),
|
||||
("minimax", "MiniMaxForSequenceClassification"),
|
||||
("ministral", "MinistralForSequenceClassification"),
|
||||
@ -1292,9 +1233,7 @@ MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES = OrderedDict(
|
||||
("mt5", "MT5ForSequenceClassification"),
|
||||
("mvp", "MvpForSequenceClassification"),
|
||||
("nemotron", "NemotronForSequenceClassification"),
|
||||
("nezha", "NezhaForSequenceClassification"),
|
||||
("nystromformer", "NystromformerForSequenceClassification"),
|
||||
("open-llama", "OpenLlamaForSequenceClassification"),
|
||||
("openai-gpt", "OpenAIGPTForSequenceClassification"),
|
||||
("opt", "OPTForSequenceClassification"),
|
||||
("perceiver", "PerceiverForSequenceClassification"),
|
||||
@ -1303,7 +1242,6 @@ MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES = OrderedDict(
|
||||
("phi3", "Phi3ForSequenceClassification"),
|
||||
("phimoe", "PhimoeForSequenceClassification"),
|
||||
("plbart", "PLBartForSequenceClassification"),
|
||||
("qdqbert", "QDQBertForSequenceClassification"),
|
||||
("qwen2", "Qwen2ForSequenceClassification"),
|
||||
("qwen2_moe", "Qwen2MoeForSequenceClassification"),
|
||||
("qwen3", "Qwen3ForSequenceClassification"),
|
||||
@ -1323,7 +1261,6 @@ MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES = OrderedDict(
|
||||
("t5", "T5ForSequenceClassification"),
|
||||
("t5gemma", "T5GemmaForSequenceClassification"),
|
||||
("tapas", "TapasForSequenceClassification"),
|
||||
("transfo-xl", "TransfoXLForSequenceClassification"),
|
||||
("umt5", "UMT5ForSequenceClassification"),
|
||||
("xlm", "XLMForSequenceClassification"),
|
||||
("xlm-roberta", "XLMRobertaForSequenceClassification"),
|
||||
@ -1356,7 +1293,6 @@ MODEL_FOR_QUESTION_ANSWERING_MAPPING_NAMES = OrderedDict(
|
||||
("distilbert", "DistilBertForQuestionAnswering"),
|
||||
("electra", "ElectraForQuestionAnswering"),
|
||||
("ernie", "ErnieForQuestionAnswering"),
|
||||
("ernie_m", "ErnieMForQuestionAnswering"),
|
||||
("exaone4", "Exaone4ForQuestionAnswering"),
|
||||
("falcon", "FalconForQuestionAnswering"),
|
||||
("flaubert", "FlaubertForQuestionAnsweringSimple"),
|
||||
@ -1377,7 +1313,6 @@ MODEL_FOR_QUESTION_ANSWERING_MAPPING_NAMES = OrderedDict(
|
||||
("lxmert", "LxmertForQuestionAnswering"),
|
||||
("markuplm", "MarkupLMForQuestionAnswering"),
|
||||
("mbart", "MBartForQuestionAnswering"),
|
||||
("mega", "MegaForQuestionAnswering"),
|
||||
("megatron-bert", "MegatronBertForQuestionAnswering"),
|
||||
("minimax", "MiniMaxForQuestionAnswering"),
|
||||
("ministral", "MinistralForQuestionAnswering"),
|
||||
@ -1391,10 +1326,8 @@ MODEL_FOR_QUESTION_ANSWERING_MAPPING_NAMES = OrderedDict(
|
||||
("mt5", "MT5ForQuestionAnswering"),
|
||||
("mvp", "MvpForQuestionAnswering"),
|
||||
("nemotron", "NemotronForQuestionAnswering"),
|
||||
("nezha", "NezhaForQuestionAnswering"),
|
||||
("nystromformer", "NystromformerForQuestionAnswering"),
|
||||
("opt", "OPTForQuestionAnswering"),
|
||||
("qdqbert", "QDQBertForQuestionAnswering"),
|
||||
("qwen2", "Qwen2ForQuestionAnswering"),
|
||||
("qwen2_moe", "Qwen2MoeForQuestionAnswering"),
|
||||
("qwen3", "Qwen3ForQuestionAnswering"),
|
||||
@ -1466,7 +1399,6 @@ MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES = OrderedDict(
|
||||
("distilbert", "DistilBertForTokenClassification"),
|
||||
("electra", "ElectraForTokenClassification"),
|
||||
("ernie", "ErnieForTokenClassification"),
|
||||
("ernie_m", "ErnieMForTokenClassification"),
|
||||
("esm", "EsmForTokenClassification"),
|
||||
("exaone4", "Exaone4ForTokenClassification"),
|
||||
("falcon", "FalconForTokenClassification"),
|
||||
@ -1493,7 +1425,6 @@ MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES = OrderedDict(
|
||||
("longformer", "LongformerForTokenClassification"),
|
||||
("luke", "LukeForTokenClassification"),
|
||||
("markuplm", "MarkupLMForTokenClassification"),
|
||||
("mega", "MegaForTokenClassification"),
|
||||
("megatron-bert", "MegatronBertForTokenClassification"),
|
||||
("minimax", "MiniMaxForTokenClassification"),
|
||||
("ministral", "MinistralForTokenClassification"),
|
||||
@ -1506,12 +1437,10 @@ MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES = OrderedDict(
|
||||
("mra", "MraForTokenClassification"),
|
||||
("mt5", "MT5ForTokenClassification"),
|
||||
("nemotron", "NemotronForTokenClassification"),
|
||||
("nezha", "NezhaForTokenClassification"),
|
||||
("nystromformer", "NystromformerForTokenClassification"),
|
||||
("persimmon", "PersimmonForTokenClassification"),
|
||||
("phi", "PhiForTokenClassification"),
|
||||
("phi3", "Phi3ForTokenClassification"),
|
||||
("qdqbert", "QDQBertForTokenClassification"),
|
||||
("qwen2", "Qwen2ForTokenClassification"),
|
||||
("qwen2_moe", "Qwen2MoeForTokenClassification"),
|
||||
("qwen3", "Qwen3ForTokenClassification"),
|
||||
@ -1553,22 +1482,18 @@ MODEL_FOR_MULTIPLE_CHOICE_MAPPING_NAMES = OrderedDict(
|
||||
("distilbert", "DistilBertForMultipleChoice"),
|
||||
("electra", "ElectraForMultipleChoice"),
|
||||
("ernie", "ErnieForMultipleChoice"),
|
||||
("ernie_m", "ErnieMForMultipleChoice"),
|
||||
("flaubert", "FlaubertForMultipleChoice"),
|
||||
("fnet", "FNetForMultipleChoice"),
|
||||
("funnel", "FunnelForMultipleChoice"),
|
||||
("ibert", "IBertForMultipleChoice"),
|
||||
("longformer", "LongformerForMultipleChoice"),
|
||||
("luke", "LukeForMultipleChoice"),
|
||||
("mega", "MegaForMultipleChoice"),
|
||||
("megatron-bert", "MegatronBertForMultipleChoice"),
|
||||
("mobilebert", "MobileBertForMultipleChoice"),
|
||||
("modernbert", "ModernBertForMultipleChoice"),
|
||||
("mpnet", "MPNetForMultipleChoice"),
|
||||
("mra", "MraForMultipleChoice"),
|
||||
("nezha", "NezhaForMultipleChoice"),
|
||||
("nystromformer", "NystromformerForMultipleChoice"),
|
||||
("qdqbert", "QDQBertForMultipleChoice"),
|
||||
("rembert", "RemBertForMultipleChoice"),
|
||||
("roberta", "RobertaForMultipleChoice"),
|
||||
("roberta-prelayernorm", "RobertaPreLayerNormForMultipleChoice"),
|
||||
@ -1591,8 +1516,6 @@ MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING_NAMES = OrderedDict(
|
||||
("fnet", "FNetForNextSentencePrediction"),
|
||||
("megatron-bert", "MegatronBertForNextSentencePrediction"),
|
||||
("mobilebert", "MobileBertForNextSentencePrediction"),
|
||||
("nezha", "NezhaForNextSentencePrediction"),
|
||||
("qdqbert", "QDQBertForNextSentencePrediction"),
|
||||
]
|
||||
)
|
||||
|
||||
@ -1619,7 +1542,6 @@ MODEL_FOR_CTC_MAPPING_NAMES = OrderedDict(
|
||||
# Model for Connectionist temporal classification (CTC) mapping
|
||||
("data2vec-audio", "Data2VecAudioForCTC"),
|
||||
("hubert", "HubertForCTC"),
|
||||
("mctct", "MCTCTForCTC"),
|
||||
("parakeet_ctc", "ParakeetForCTC"),
|
||||
("sew", "SEWForCTC"),
|
||||
("sew-d", "SEWDForCTC"),
|
||||
@ -1713,7 +1635,6 @@ MODEL_FOR_BACKBONE_MAPPING_NAMES = OrderedDict(
|
||||
("hgnet_v2", "HGNetV2Backbone"),
|
||||
("hiera", "HieraBackbone"),
|
||||
("maskformer-swin", "MaskFormerSwinBackbone"),
|
||||
("nat", "NatBackbone"),
|
||||
("pvt_v2", "PvtV2Backbone"),
|
||||
("resnet", "ResNetBackbone"),
|
||||
("rt_detr_resnet", "RTDetrResNetBackbone"),
|
||||
|
||||
@ -102,7 +102,6 @@ PROCESSOR_MAPPING_NAMES = OrderedDict(
|
||||
("llava_next_video", "LlavaNextVideoProcessor"),
|
||||
("llava_onevision", "LlavaOnevisionProcessor"),
|
||||
("markuplm", "MarkupLMProcessor"),
|
||||
("mctct", "MCTCTProcessor"),
|
||||
("metaclip_2", "CLIPProcessor"),
|
||||
("mgp-str", "MgpstrProcessor"),
|
||||
("mistral3", "PixtralProcessor"),
|
||||
@ -138,10 +137,8 @@ PROCESSOR_MAPPING_NAMES = OrderedDict(
|
||||
("siglip2", "Siglip2Processor"),
|
||||
("smolvlm", "SmolVLMProcessor"),
|
||||
("speech_to_text", "Speech2TextProcessor"),
|
||||
("speech_to_text_2", "Speech2Text2Processor"),
|
||||
("speecht5", "SpeechT5Processor"),
|
||||
("trocr", "TrOCRProcessor"),
|
||||
("tvlt", "TvltProcessor"),
|
||||
("tvp", "TvpProcessor"),
|
||||
("udop", "UdopProcessor"),
|
||||
("unispeech", "Wav2Vec2Processor"),
|
||||
|
||||
@ -239,7 +239,6 @@ TOKENIZER_MAPPING_NAMES = OrderedDict[str, tuple[Optional[str], Optional[str]]](
|
||||
("ernie", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)),
|
||||
("ernie4_5", (None, "LlamaTokenizerFast" if is_tokenizers_available() else None)),
|
||||
("ernie4_5_moe", (None, "LlamaTokenizerFast" if is_tokenizers_available() else None)),
|
||||
("ernie_m", ("ErnieMTokenizer" if is_sentencepiece_available() else None, None)),
|
||||
("esm", ("EsmTokenizer", None)),
|
||||
("evolla", (None, "LlamaTokenizerFast" if is_tokenizers_available() else None)),
|
||||
(
|
||||
@ -321,7 +320,6 @@ TOKENIZER_MAPPING_NAMES = OrderedDict[str, tuple[Optional[str], Optional[str]]](
|
||||
("gpt_neox_japanese", ("GPTNeoXJapaneseTokenizer", None)),
|
||||
("gpt_oss", (None, "PreTrainedTokenizerFast" if is_tokenizers_available() else None)),
|
||||
("gptj", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)),
|
||||
("gptsan-japanese", ("GPTSanJapaneseTokenizer", None)),
|
||||
("granite", ("GPT2Tokenizer", None)),
|
||||
("granite_speech", ("GPT2Tokenizer", None)),
|
||||
("granitemoe", ("GPT2Tokenizer", None)),
|
||||
@ -354,7 +352,6 @@ TOKENIZER_MAPPING_NAMES = OrderedDict[str, tuple[Optional[str], Optional[str]]](
|
||||
"LlamaTokenizerFast" if is_tokenizers_available() else None,
|
||||
),
|
||||
),
|
||||
("jukebox", ("JukeboxTokenizer", None)),
|
||||
(
|
||||
"kosmos-2",
|
||||
(
|
||||
@ -426,7 +423,6 @@ TOKENIZER_MAPPING_NAMES = OrderedDict[str, tuple[Optional[str], Optional[str]]](
|
||||
"MBart50TokenizerFast" if is_tokenizers_available() else None,
|
||||
),
|
||||
),
|
||||
("mega", ("RobertaTokenizer", "RobertaTokenizerFast" if is_tokenizers_available() else None)),
|
||||
("megatron-bert", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)),
|
||||
(
|
||||
"metaclip_2",
|
||||
@ -501,7 +497,6 @@ TOKENIZER_MAPPING_NAMES = OrderedDict[str, tuple[Optional[str], Optional[str]]](
|
||||
("mvp", ("MvpTokenizer", "MvpTokenizerFast" if is_tokenizers_available() else None)),
|
||||
("myt5", ("MyT5Tokenizer", None)),
|
||||
("nemotron", (None, "PreTrainedTokenizerFast" if is_tokenizers_available() else None)),
|
||||
("nezha", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)),
|
||||
(
|
||||
"nllb",
|
||||
(
|
||||
@ -590,7 +585,6 @@ TOKENIZER_MAPPING_NAMES = OrderedDict[str, tuple[Optional[str], Optional[str]]](
|
||||
("plbart", ("PLBartTokenizer" if is_sentencepiece_available() else None, None)),
|
||||
("pop2piano", ("Pop2PianoTokenizer", None)),
|
||||
("prophetnet", ("ProphetNetTokenizer", None)),
|
||||
("qdqbert", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)),
|
||||
(
|
||||
"qwen2",
|
||||
(
|
||||
@ -634,7 +628,6 @@ TOKENIZER_MAPPING_NAMES = OrderedDict[str, tuple[Optional[str], Optional[str]]](
|
||||
("qwen3_vl", ("Qwen2Tokenizer", "Qwen2TokenizerFast" if is_tokenizers_available() else None)),
|
||||
("qwen3_vl_moe", ("Qwen2Tokenizer", "Qwen2TokenizerFast" if is_tokenizers_available() else None)),
|
||||
("rag", ("RagTokenizer", None)),
|
||||
("realm", ("RealmTokenizer", "RealmTokenizerFast" if is_tokenizers_available() else None)),
|
||||
(
|
||||
"recurrent_gemma",
|
||||
(
|
||||
@ -656,7 +649,6 @@ TOKENIZER_MAPPING_NAMES = OrderedDict[str, tuple[Optional[str], Optional[str]]](
|
||||
"RemBertTokenizerFast" if is_tokenizers_available() else None,
|
||||
),
|
||||
),
|
||||
("retribert", ("RetriBertTokenizer", "RetriBertTokenizerFast" if is_tokenizers_available() else None)),
|
||||
("roberta", ("RobertaTokenizer", "RobertaTokenizerFast" if is_tokenizers_available() else None)),
|
||||
(
|
||||
"roberta-prelayernorm",
|
||||
@ -697,7 +689,6 @@ TOKENIZER_MAPPING_NAMES = OrderedDict[str, tuple[Optional[str], Optional[str]]](
|
||||
("smollm3", (None, "PreTrainedTokenizerFast" if is_tokenizers_available() else None)),
|
||||
("smolvlm", (None, "PreTrainedTokenizerFast" if is_tokenizers_available() else None)),
|
||||
("speech_to_text", ("Speech2TextTokenizer" if is_sentencepiece_available() else None, None)),
|
||||
("speech_to_text_2", ("Speech2Text2Tokenizer", None)),
|
||||
("speecht5", ("SpeechT5Tokenizer" if is_sentencepiece_available() else None, None)),
|
||||
("splinter", ("SplinterTokenizer", "SplinterTokenizerFast")),
|
||||
(
|
||||
@ -728,8 +719,6 @@ TOKENIZER_MAPPING_NAMES = OrderedDict[str, tuple[Optional[str], Optional[str]]](
|
||||
),
|
||||
),
|
||||
("tapas", ("TapasTokenizer", None)),
|
||||
("tapex", ("TapexTokenizer", None)),
|
||||
("transfo-xl", ("TransfoXLTokenizer", None)),
|
||||
("trocr", ("XLMRobertaTokenizer", "XLMRobertaTokenizerFast" if is_tokenizers_available() else None)),
|
||||
("tvp", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)),
|
||||
(
|
||||
@ -780,7 +769,6 @@ TOKENIZER_MAPPING_NAMES = OrderedDict[str, tuple[Optional[str], Optional[str]]](
|
||||
),
|
||||
),
|
||||
("xlm", ("XLMTokenizer", None)),
|
||||
("xlm-prophetnet", ("XLMProphetNetTokenizer" if is_sentencepiece_available() else None, None)),
|
||||
(
|
||||
"xlm-roberta",
|
||||
(
|
||||
|
||||
@ -1342,9 +1342,6 @@ class AutoformerModel(AutoformerPreTrainedModel):
|
||||
)
|
||||
return reshaped_lagged_sequence, features, loc, scale, static_feat
|
||||
|
||||
def get_encoder(self):
|
||||
return self.encoder
|
||||
|
||||
@auto_docstring
|
||||
def forward(
|
||||
self,
|
||||
@ -1588,12 +1585,6 @@ class AutoformerForPrediction(AutoformerPreTrainedModel):
|
||||
def output_params(self, decoder_output):
|
||||
return self.parameter_projection(decoder_output[:, -self.config.prediction_length :, :])
|
||||
|
||||
def get_encoder(self):
|
||||
return self.model.get_encoder()
|
||||
|
||||
def get_decoder(self):
|
||||
return self.model.get_decoder()
|
||||
|
||||
@torch.jit.ignore
|
||||
def output_distribution(self, params, loc=None, scale=None, trailing_n=None) -> torch.distributions.Distribution:
|
||||
sliced_params = params
|
||||
|
||||
@ -90,6 +90,7 @@ class AyaVisionMultiModalProjector(nn.Module):
|
||||
@auto_docstring
|
||||
class AyaVisionPreTrainedModel(PreTrainedModel):
|
||||
config: AyaVisionConfig
|
||||
base_model_prefix = "model"
|
||||
input_modalities = ["image", "text"]
|
||||
supports_gradient_checkpointing = True
|
||||
_skip_keys_device_placement = "past_key_values"
|
||||
@ -162,6 +163,10 @@ class AyaVisionModelOutputWithPast(BaseModelOutputWithPast):
|
||||
"""
|
||||
)
|
||||
class AyaVisionModel(AyaVisionPreTrainedModel):
|
||||
_checkpoint_conversion_mapping = {
|
||||
r"^language_model.model": "language_model",
|
||||
}
|
||||
|
||||
def __init__(self, config: AyaVisionConfig):
|
||||
super().__init__(config)
|
||||
self.vision_tower = AutoModel.from_config(config.vision_config)
|
||||
@ -176,12 +181,6 @@ class AyaVisionModel(AyaVisionPreTrainedModel):
|
||||
def set_input_embeddings(self, value):
|
||||
self.language_model.set_input_embeddings(value)
|
||||
|
||||
def set_decoder(self, decoder):
|
||||
self.language_model = decoder
|
||||
|
||||
def get_decoder(self):
|
||||
return self.language_model
|
||||
|
||||
def get_image_features(
|
||||
self,
|
||||
pixel_values: torch.FloatTensor,
|
||||
@ -352,12 +351,6 @@ class AyaVisionForConditionalGeneration(AyaVisionPreTrainedModel, GenerationMixi
|
||||
def get_output_embeddings(self) -> nn.Module:
|
||||
return self.lm_head
|
||||
|
||||
def set_decoder(self, decoder):
|
||||
self.model.set_decoder(decoder)
|
||||
|
||||
def get_decoder(self):
|
||||
return self.model.get_decoder()
|
||||
|
||||
def get_image_features(
|
||||
self,
|
||||
pixel_values: torch.FloatTensor,
|
||||
@ -372,19 +365,6 @@ class AyaVisionForConditionalGeneration(AyaVisionPreTrainedModel, GenerationMixi
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
# Make modules available through conditional class for BC
|
||||
@property
|
||||
def language_model(self):
|
||||
return self.model.language_model
|
||||
|
||||
@property
|
||||
def vision_tower(self):
|
||||
return self.model.vision_tower
|
||||
|
||||
@property
|
||||
def multi_modal_projector(self):
|
||||
return self.model.multi_modal_projector
|
||||
|
||||
@can_return_tuple
|
||||
@auto_docstring
|
||||
def forward(
|
||||
|
||||
@ -166,11 +166,10 @@ class BartConfig(PreTrainedConfig):
|
||||
)
|
||||
self.tie_encoder_decoder = True
|
||||
# ensure backward compatibility for BART CNN models
|
||||
if self.forced_bos_token_id is None and kwargs.get("force_bos_token_to_be_generated", False):
|
||||
if kwargs.get("force_bos_token_to_be_generated", False):
|
||||
self.forced_bos_token_id = self.bos_token_id
|
||||
warnings.warn(
|
||||
f"Please make sure the config includes `forced_bos_token_id={self.bos_token_id}` in future versions. "
|
||||
"The config can simply be saved and uploaded again to be fixed."
|
||||
f"Please make sure the generation config includes `forced_bos_token_id={self.bos_token_id}`. "
|
||||
)
|
||||
|
||||
|
||||
|
||||
@@ -905,9 +905,6 @@ class BartModel(BartPreTrainedModel):
        self.encoder.embed_tokens = self.shared
        self.decoder.embed_tokens = self.shared

    def get_encoder(self):
        return self.encoder

    @auto_docstring
    def forward(
        self,
@@ -1037,12 +1034,6 @@ class BartForConditionalGeneration(BartPreTrainedModel, GenerationMixin):
        # Initialize weights and apply final processing
        self.post_init()

    def get_encoder(self):
        return self.model.get_encoder()

    def get_decoder(self):
        return self.model.get_decoder()

    def resize_token_embeddings(
        self, new_num_tokens: int, pad_to_multiple_of: Optional[int] = None, mean_resizing: bool = True
    ) -> nn.Embedding:
@@ -1498,12 +1489,6 @@ class BartForCausalLM(BartPreTrainedModel, GenerationMixin):
    def set_input_embeddings(self, value):
        self.model.decoder.embed_tokens = value

    def set_decoder(self, decoder):
        self.model.decoder = decoder

    def get_decoder(self):
        return self.model.decoder

    @auto_docstring
    def forward(
        self,

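The context lines in the `BartModel` hunk show the shared input-embedding pattern: encoder and decoder both point at one `self.shared` embedding table, so replacing or resizing it in one place affects both. A minimal runnable sketch of that wiring, with illustrative class names rather than the real BART modules:

```python
# Sketch of the shared input-embedding pattern: one table, referenced by both
# the encoder and the decoder. Class names are illustrative.
import torch.nn as nn


class ToyEncoder(nn.Module):
    def __init__(self, embed_tokens):
        super().__init__()
        self.embed_tokens = embed_tokens


class ToyDecoder(nn.Module):
    def __init__(self, embed_tokens):
        super().__init__()
        self.embed_tokens = embed_tokens


class ToySeq2Seq(nn.Module):
    def __init__(self, vocab_size=100, d_model=16):
        super().__init__()
        self.shared = nn.Embedding(vocab_size, d_model)
        self.encoder = ToyEncoder(self.shared)
        self.decoder = ToyDecoder(self.shared)

    def set_input_embeddings(self, value):
        # Re-point every reference so the table stays shared after replacement.
        self.shared = value
        self.encoder.embed_tokens = self.shared
        self.decoder.embed_tokens = self.shared


model = ToySeq2Seq()
assert model.encoder.embed_tokens is model.decoder.embed_tokens
```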
@@ -22,7 +22,6 @@ if TYPE_CHECKING:
    from .modeling_bert import *
    from .tokenization_bert import *
    from .tokenization_bert_fast import *
    from .tokenization_bert_tf import *
else:
    import sys

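These `__init__.py` hunks all follow the library's lazy-import idiom: the star imports only run for type checkers, and at runtime the module object is swapped for a lazy proxy. The shape below is taken from the deleted `deta/__init__.py` shown further down in this diff; the relative import depth and submodule names depend on where the file lives in the package, so treat it as orientation rather than a drop-in file:

```python
# Lazy-import idiom used by these __init__.py files (pattern only; relative
# import paths are package-specific and assumed here).
from typing import TYPE_CHECKING

if TYPE_CHECKING:
    # Static analyzers and IDEs see the real symbols.
    from .modeling_bert import *  # noqa: F403
else:
    # At runtime, submodules are loaded lazily on first attribute access.
    import sys

    from ...utils import _LazyModule
    from ...utils.import_utils import define_import_structure

    _file = globals()["__file__"]
    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
```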
@@ -2083,9 +2083,6 @@ class BigBirdPegasusModel(BigBirdPegasusPreTrainedModel):
        self.encoder.embed_tokens = self.shared
        self.decoder.embed_tokens = self.shared

    def get_encoder(self):
        return self.encoder

    @auto_docstring
    def forward(
        self,
@@ -2205,12 +2202,6 @@ class BigBirdPegasusForConditionalGeneration(BigBirdPegasusPreTrainedModel, Gene
        # Initialize weights and apply final processing
        self.post_init()

    def get_encoder(self):
        return self.model.get_encoder()

    def get_decoder(self):
        return self.model.get_decoder()

    def resize_token_embeddings(
        self, new_num_tokens: int, pad_to_multiple_of: Optional[int] = None, mean_resizing: bool = True
    ) -> nn.Embedding:
@@ -2609,12 +2600,6 @@ class BigBirdPegasusForCausalLM(BigBirdPegasusPreTrainedModel, GenerationMixin):
    def set_input_embeddings(self, value):
        self.model.decoder.embed_tokens = value

    def set_decoder(self, decoder):
        self.model.decoder = decoder

    def get_decoder(self):
        return self.model.decoder

    @auto_docstring
    def forward(
        self,

@@ -869,9 +869,6 @@ class BlenderbotModel(BlenderbotPreTrainedModel):
        self.encoder.embed_tokens = self.shared
        self.decoder.embed_tokens = self.shared

    def get_encoder(self):
        return self.encoder

    @auto_docstring
    def forward(
        self,
@@ -1009,12 +1006,6 @@ class BlenderbotForConditionalGeneration(BlenderbotPreTrainedModel, GenerationMi

        return super().from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)

    def get_encoder(self):
        return self.model.get_encoder()

    def get_decoder(self):
        return self.model.get_decoder()

    def resize_token_embeddings(
        self, new_num_tokens: int, pad_to_multiple_of: Optional[int] = None, mean_resizing: bool = True
    ) -> nn.Embedding:
@@ -1189,12 +1180,6 @@ class BlenderbotForCausalLM(BlenderbotPreTrainedModel, GenerationMixin):
    def set_input_embeddings(self, value):
        self.model.decoder.embed_tokens = value

    def set_decoder(self, decoder):
        self.model.decoder = decoder

    def get_decoder(self):
        return self.model.decoder

    @auto_docstring
    def forward(
        self,

@@ -842,9 +842,6 @@ class BlenderbotSmallModel(BlenderbotSmallPreTrainedModel):
        self.encoder.embed_tokens = self.shared
        self.decoder.embed_tokens = self.shared

    def get_encoder(self):
        return self.encoder

    @auto_docstring
    def forward(
        self,
@@ -969,12 +966,6 @@ class BlenderbotSmallForConditionalGeneration(BlenderbotSmallPreTrainedModel, Ge
        # Initialize weights and apply final processing
        self.post_init()

    def get_encoder(self):
        return self.model.get_encoder()

    def get_decoder(self):
        return self.model.get_decoder()

    def resize_token_embeddings(
        self, new_num_tokens: int, pad_to_multiple_of: Optional[int] = None, mean_resizing: bool = True
    ) -> nn.Embedding:
@@ -1149,12 +1140,6 @@ class BlenderbotSmallForCausalLM(BlenderbotSmallPreTrainedModel, GenerationMixin
    def set_input_embeddings(self, value):
        self.model.decoder.embed_tokens = value

    def set_decoder(self, decoder):
        self.model.decoder = decoder

    def get_decoder(self):
        return self.model.decoder

    @auto_docstring
    def forward(
        self,

@@ -1058,11 +1058,11 @@ class Blip2Model(Blip2PreTrainedModel):
    def get_output_embeddings(self) -> nn.Module:
        return self.language_model.get_output_embeddings()

    def get_encoder(self):
        return self.language_model.get_encoder()

    def get_decoder(self):
        return self.language_model.get_decoder()
    def get_encoder(self, modality=None):
        if modality is None:
            return self.language_model.get_encoder()
        else:
            return super().get_encoder(modality=modality)

    @filter_out_non_signature_kwargs()
    @auto_docstring
@@ -1579,11 +1579,11 @@ class Blip2ForConditionalGeneration(Blip2PreTrainedModel, GenerationMixin):
    def get_output_embeddings(self) -> nn.Module:
        return self.language_model.get_output_embeddings()

    def get_encoder(self):
        return self.language_model.get_encoder()

    def get_decoder(self):
        return self.language_model.get_decoder()
    def get_encoder(self, modality=None):
        if modality is None:
            return self.language_model.get_encoder()
        else:
            return super().get_encoder(modality=modality)

    def _preprocess_accelerate(self):
        r"""

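Both BLIP-2 hunks replace the old parameterless `get_encoder` with a `get_encoder(modality=None)` override: with no argument the call keeps its previous behaviour (the language model's encoder), while an explicit modality defers to the parent class. A toy sketch of that dispatch, where the base class is a stand-in for the generic, modality-aware lookup assumed to live on the parent:

```python
# Toy sketch of the `get_encoder(modality=None)` dispatch added above.
# `ToyBase` stands in for the parent class; it is not the real PreTrainedModel.
class ToyBase:
    def get_encoder(self, modality=None):
        return f"encoder-for-{modality}"


class ToyMultimodal(ToyBase):
    def __init__(self):
        self.language_model_encoder = "text-encoder"

    def get_encoder(self, modality=None):
        # No modality requested: keep the legacy behaviour.
        if modality is None:
            return self.language_model_encoder
        # Otherwise defer to the generic, modality-aware lookup of the parent.
        return super().get_encoder(modality=modality)


m = ToyMultimodal()
assert m.get_encoder() == "text-encoder"
assert m.get_encoder(modality="image") == "encoder-for-image"
```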
@@ -430,6 +430,7 @@ class BltCrossAttention(nn.Module):
@auto_docstring
class BltPreTrainedModel(PreTrainedModel):
    config: BltConfig
    base_model_prefix = "model"
    input_modalities = ["image", "text"]
    supports_gradient_checkpointing = True
    _no_split_modules = ["BltTransformerLayer"]

@@ -778,7 +778,7 @@ class ClvpConditioningEncoder(nn.Module):
@auto_docstring
class ClvpPreTrainedModel(PreTrainedModel):
    config: ClvpConfig
    base_model_prefix = "clvp"
    base_model_prefix = "model"
    supports_gradient_checkpointing = True
    _skip_keys_device_placement = "past_key_values"

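The CLVP hunk switches `base_model_prefix` from `"clvp"` to `"model"`. This attribute names the sub-module under which the base model lives inside a task-specific wrapper, so shared utilities (checkpoint loading, `base_model` resolution) can find it generically. A small illustration of the lookup, with stand-in classes rather than the real CLVP implementation:

```python
# Illustration only: `base_model_prefix` resolves to the attribute holding
# the base model. Classes below are stand-ins.
import torch.nn as nn


class ToyPreTrained(nn.Module):
    base_model_prefix = "model"

    @property
    def base_model(self):
        # Generic resolution used by shared utilities.
        return getattr(self, self.base_model_prefix, self)


class ToyForSomething(ToyPreTrained):
    def __init__(self):
        super().__init__()
        self.model = nn.Linear(4, 4)  # the "base" model
        self.head = nn.Linear(4, 2)


wrapper = ToyForSomething()
assert wrapper.base_model is wrapper.model
```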
@@ -129,6 +129,7 @@ class Cohere2VisionCausalLMOutputWithPast(ModelOutput):
@auto_docstring
class Cohere2VisionPreTrainedModel(PreTrainedModel):
    config: Cohere2VisionConfig
    base_model_prefix = "model"
    input_modalities = ["image", "text"]
    supports_gradient_checkpointing = True
    _skip_keys_device_placement = "past_key_values"
@@ -142,7 +143,6 @@ class Cohere2VisionPreTrainedModel(PreTrainedModel):
        "hidden_states": "DecoderLayer",
        "attentions": "Attention",
    }
    base_model_prefix = "model"


@auto_docstring(
@@ -167,12 +167,6 @@ class Cohere2VisionModel(Cohere2VisionPreTrainedModel):
    def set_input_embeddings(self, value):
        self.language_model.set_input_embeddings(value)

    def set_decoder(self, decoder):
        self.language_model = decoder

    def get_decoder(self):
        return self.language_model

    def get_image_features(self, pixel_values: torch.FloatTensor):
        """
        Obtains image last hidden states from the vision tower and apply multimodal projection.
@@ -285,28 +279,9 @@ class Cohere2VisionForConditionalGeneration(Cohere2VisionPreTrainedModel, Genera
    def get_output_embeddings(self) -> nn.Module:
        return self.lm_head

    def set_decoder(self, decoder):
        self.model.set_decoder(decoder)

    def get_decoder(self):
        return self.model.get_decoder()

    def get_image_features(self, pixel_values: torch.FloatTensor):
        return self.model.get_image_features(pixel_values=pixel_values)

    # Make modules available through conditional class for BC
    @property
    def language_model(self):
        return self.model.language_model

    @property
    def vision_tower(self):
        return self.model.vision_tower

    @property
    def multi_modal_projector(self):
        return self.model.multi_modal_projector

    @check_model_inputs()
    @auto_docstring
    def forward(

@@ -171,10 +171,10 @@ class ColQwen2ForRetrieval(ColQwen2PreTrainedModel):

        # Custom data preparation to fix an issue with the gradient flow when training with multiple GPUs.
        if inputs_embeds is None:
            inputs_embeds = self.vlm.language_model.embed_tokens(input_ids)
            inputs_embeds = self.vlm.get_input_embeddings()(input_ids)

        if pixel_values is not None:
            image_embeds = self.vlm.visual(pixel_values, grid_thw=image_grid_thw)
            image_embeds = self.vlm.model.visual(pixel_values, grid_thw=image_grid_thw)
            image_mask = (
                (input_ids == self.config.vlm_config.image_token_id).unsqueeze(-1).expand_as(inputs_embeds)
            )

@@ -352,10 +352,10 @@ class ColQwen2ForRetrieval(ColPaliForRetrieval):

        # Custom data preparation to fix an issue with the gradient flow when training with multiple GPUs.
        if inputs_embeds is None:
            inputs_embeds = self.vlm.language_model.embed_tokens(input_ids)
            inputs_embeds = self.vlm.get_input_embeddings()(input_ids)

        if pixel_values is not None:
            image_embeds = self.vlm.visual(pixel_values, grid_thw=image_grid_thw)
            image_embeds = self.vlm.model.visual(pixel_values, grid_thw=image_grid_thw)
            image_mask = (
                (input_ids == self.config.vlm_config.image_token_id).unsqueeze(-1).expand_as(inputs_embeds)
            )

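The ColQwen2 hunks replace a hard-coded attribute path (`self.vlm.language_model.embed_tokens`) with the public accessor `self.vlm.get_input_embeddings()`, which keeps working when the wrapped model's internal layout changes (as it does here, with `visual` moving under `model`). A small runnable sketch of why the accessor is the safer choice; the classes are illustrative, not the real ColQwen2 or Qwen2-VL modules:

```python
# Sketch: the public accessor survives refactors that move the embedding
# table deeper into the wrapped model. Names are illustrative.
import torch
import torch.nn as nn


class ToyInnerLM(nn.Module):
    def __init__(self):
        super().__init__()
        self.embed_tokens = nn.Embedding(10, 8)


class ToyVLM(nn.Module):
    def __init__(self):
        super().__init__()
        self.model = ToyInnerLM()  # embeddings now live one level deeper

    def get_input_embeddings(self):
        return self.model.embed_tokens


vlm = ToyVLM()
input_ids = torch.tensor([[1, 2, 3]])
inputs_embeds = vlm.get_input_embeddings()(input_ids)  # robust to refactors
assert inputs_embeds.shape == (1, 3, 8)
```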
@@ -1324,9 +1324,6 @@ class ConditionalDetrModel(ConditionalDetrPreTrainedModel):
        # Initialize weights and apply final processing
        self.post_init()

    def get_encoder(self):
        return self.encoder

    def freeze_backbone(self):
        for name, param in self.backbone.conv_encoder.model.named_parameters():
            param.requires_grad_(False)

@@ -1198,9 +1198,6 @@ class DFineModel(DFinePreTrainedModel):

        self.post_init()

    def get_encoder(self):
        return self.encoder

    def freeze_backbone(self):
        for param in self.backbone.parameters():
            param.requires_grad_(False)

@@ -1202,9 +1202,6 @@ class DabDetrModel(DabDetrPreTrainedModel):
        # Initialize weights and apply final processing
        self.post_init()

    def get_encoder(self):
        return self.encoder

    def freeze_backbone(self):
        for name, param in self.backbone.conv_encoder.model.named_parameters():
            param.requires_grad_(False)

@@ -1359,9 +1359,6 @@ class DeformableDetrModel(DeformableDetrPreTrainedModel):

        self.post_init()

    def get_encoder(self):
        return self.encoder

    def freeze_backbone(self):
        for name, param in self.backbone.conv_encoder.model.named_parameters():
            param.requires_grad_(False)

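The DETR-family context lines all show the same `freeze_backbone` pattern: gradients are switched off parameter by parameter so the backbone stays fixed during fine-tuning while the detection head keeps training. A generic runnable sketch of that pattern, with an illustrative `ToyDetector` rather than any of the real DETR models:

```python
# Generic sketch of the freeze_backbone pattern: disable gradients on the
# backbone only. `ToyDetector` is illustrative.
import torch.nn as nn


class ToyDetector(nn.Module):
    def __init__(self):
        super().__init__()
        self.backbone = nn.Sequential(nn.Conv2d(3, 8, 3), nn.ReLU())
        self.head = nn.Linear(8, 4)

    def freeze_backbone(self):
        for param in self.backbone.parameters():
            param.requires_grad_(False)

    def unfreeze_backbone(self):
        for param in self.backbone.parameters():
            param.requires_grad_(True)


model = ToyDetector()
model.freeze_backbone()
assert all(not p.requires_grad for p in model.backbone.parameters())
assert all(p.requires_grad for p in model.head.parameters())
```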
@@ -18,30 +18,9 @@ from ...utils.import_utils import define_import_structure


if TYPE_CHECKING:
    from .bort import *
    from .deta import *
    from .efficientformer import *
    from .ernie_m import *
    from .gptsan_japanese import *
    from .graphormer import *
    from .jukebox import *
    from .mctct import *
    from .mega import *
    from .mmbt import *
    from .nat import *
    from .nezha import *
    from .open_llama import *
    from .qdqbert import *
    from .realm import *
    from .retribert import *
    from .speech_to_text_2 import *
    from .tapex import *
    from .trajectory_transformer import *
    from .transfo_xl import *
    from .tvlt import *
    from .van import *
    from .vit_hybrid import *
    from .xlm_prophetnet import *
    pass
    # Add models to deprecate like:
    # from .XXX import *
else:
    import sys

@ -1,318 +0,0 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2020, The HuggingFace Inc. team.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""Convert Bort checkpoint."""
|
||||
|
||||
import argparse
|
||||
import os
|
||||
|
||||
import gluonnlp as nlp
|
||||
import mxnet as mx
|
||||
import numpy as np
|
||||
import torch
|
||||
from gluonnlp.base import get_home_dir
|
||||
from gluonnlp.model.bert import BERTEncoder
|
||||
from gluonnlp.model.utils import _load_vocab
|
||||
from gluonnlp.vocab import Vocab
|
||||
from packaging import version
|
||||
from torch import nn
|
||||
|
||||
from transformers import BertConfig, BertForMaskedLM, BertModel, RobertaTokenizer
|
||||
from transformers.models.bert.modeling_bert import (
|
||||
BertIntermediate,
|
||||
BertLayer,
|
||||
BertOutput,
|
||||
BertSelfAttention,
|
||||
BertSelfOutput,
|
||||
)
|
||||
from transformers.utils import logging
|
||||
|
||||
|
||||
if version.parse(nlp.__version__) != version.parse("0.8.3"):
|
||||
raise Exception("requires gluonnlp == 0.8.3")
|
||||
|
||||
if version.parse(mx.__version__) != version.parse("1.5.0"):
|
||||
raise Exception("requires mxnet == 1.5.0")
|
||||
|
||||
logging.set_verbosity_info()
|
||||
logger = logging.get_logger(__name__)
|
||||
|
||||
SAMPLE_TEXT = "The Nymphenburg Palace is a beautiful palace in Munich!"
|
||||
|
||||
|
||||
def convert_bort_checkpoint_to_pytorch(bort_checkpoint_path: str, pytorch_dump_folder_path: str):
|
||||
"""
|
||||
Convert the original Bort checkpoint (based on MXNET and Gluonnlp) to our BERT structure-
|
||||
"""
|
||||
|
||||
# Original Bort configuration
|
||||
bort_4_8_768_1024_hparams = {
|
||||
"attention_cell": "multi_head",
|
||||
"num_layers": 4,
|
||||
"units": 1024,
|
||||
"hidden_size": 768,
|
||||
"max_length": 512,
|
||||
"num_heads": 8,
|
||||
"scaled": True,
|
||||
"dropout": 0.1,
|
||||
"use_residual": True,
|
||||
"embed_size": 1024,
|
||||
"embed_dropout": 0.1,
|
||||
"word_embed": None,
|
||||
"layer_norm_eps": 1e-5,
|
||||
"token_type_vocab_size": 2,
|
||||
}
|
||||
|
||||
predefined_args = bort_4_8_768_1024_hparams
|
||||
|
||||
# Let's construct the original Bort model here
|
||||
# Taken from official BERT implementation, see:
|
||||
# https://github.com/alexa/bort/blob/master/bort/bort.py
|
||||
encoder = BERTEncoder(
|
||||
attention_cell=predefined_args["attention_cell"],
|
||||
num_layers=predefined_args["num_layers"],
|
||||
units=predefined_args["units"],
|
||||
hidden_size=predefined_args["hidden_size"],
|
||||
max_length=predefined_args["max_length"],
|
||||
num_heads=predefined_args["num_heads"],
|
||||
scaled=predefined_args["scaled"],
|
||||
dropout=predefined_args["dropout"],
|
||||
output_attention=False,
|
||||
output_all_encodings=False,
|
||||
use_residual=predefined_args["use_residual"],
|
||||
activation=predefined_args.get("activation", "gelu"),
|
||||
layer_norm_eps=predefined_args.get("layer_norm_eps", None),
|
||||
)
|
||||
|
||||
# Vocab information needs to be fetched first
|
||||
# It's the same as RoBERTa, so RobertaTokenizer can be used later
|
||||
vocab_name = "openwebtext_ccnews_stories_books_cased"
|
||||
|
||||
# Specify download folder to Gluonnlp's vocab
|
||||
gluon_cache_dir = os.path.join(get_home_dir(), "models")
|
||||
bort_vocab = _load_vocab(vocab_name, None, gluon_cache_dir, cls=Vocab)
|
||||
|
||||
original_bort = nlp.model.BERTModel(
|
||||
encoder,
|
||||
len(bort_vocab),
|
||||
units=predefined_args["units"],
|
||||
embed_size=predefined_args["embed_size"],
|
||||
embed_dropout=predefined_args["embed_dropout"],
|
||||
word_embed=predefined_args["word_embed"],
|
||||
use_pooler=False,
|
||||
use_token_type_embed=False,
|
||||
token_type_vocab_size=predefined_args["token_type_vocab_size"],
|
||||
use_classifier=False,
|
||||
use_decoder=False,
|
||||
)
|
||||
|
||||
original_bort.load_parameters(bort_checkpoint_path, cast_dtype=True, ignore_extra=True)
|
||||
params = original_bort._collect_params_with_prefix()
|
||||
|
||||
# Build our config 🤗
|
||||
hf_bort_config_json = {
|
||||
"architectures": ["BertForMaskedLM"],
|
||||
"attention_probs_dropout_prob": predefined_args["dropout"],
|
||||
"hidden_act": "gelu",
|
||||
"hidden_dropout_prob": predefined_args["dropout"],
|
||||
"hidden_size": predefined_args["embed_size"],
|
||||
"initializer_range": 0.02,
|
||||
"intermediate_size": predefined_args["hidden_size"],
|
||||
"layer_norm_eps": predefined_args["layer_norm_eps"],
|
||||
"max_position_embeddings": predefined_args["max_length"],
|
||||
"model_type": "bort",
|
||||
"num_attention_heads": predefined_args["num_heads"],
|
||||
"num_hidden_layers": predefined_args["num_layers"],
|
||||
"pad_token_id": 1, # 2 = BERT, 1 = RoBERTa
|
||||
"type_vocab_size": 1, # 2 = BERT, 1 = RoBERTa
|
||||
"vocab_size": len(bort_vocab),
|
||||
}
|
||||
|
||||
hf_bort_config = BertConfig.from_dict(hf_bort_config_json)
|
||||
hf_bort_model = BertForMaskedLM(hf_bort_config)
|
||||
hf_bort_model.eval()
|
||||
|
||||
# Parameter mapping table (Gluonnlp to Transformers)
|
||||
# * denotes layer index
|
||||
#
|
||||
# | Gluon Parameter | Transformers Parameter
|
||||
# | -------------------------------------------------------------- | ----------------------
|
||||
# | `encoder.layer_norm.beta` | `bert.embeddings.LayerNorm.bias`
|
||||
# | `encoder.layer_norm.gamma` | `bert.embeddings.LayerNorm.weight`
|
||||
# | `encoder.position_weight` | `bert.embeddings.position_embeddings.weight`
|
||||
# | `word_embed.0.weight` | `bert.embeddings.word_embeddings.weight`
|
||||
# | `encoder.transformer_cells.*.attention_cell.proj_key.bias` | `bert.encoder.layer.*.attention.self.key.bias`
|
||||
# | `encoder.transformer_cells.*.attention_cell.proj_key.weight` | `bert.encoder.layer.*.attention.self.key.weight`
|
||||
# | `encoder.transformer_cells.*.attention_cell.proj_query.bias` | `bert.encoder.layer.*.attention.self.query.bias`
|
||||
# | `encoder.transformer_cells.*.attention_cell.proj_query.weight` | `bert.encoder.layer.*.attention.self.query.weight`
|
||||
# | `encoder.transformer_cells.*.attention_cell.proj_value.bias` | `bert.encoder.layer.*.attention.self.value.bias`
|
||||
# | `encoder.transformer_cells.*.attention_cell.proj_value.weight` | `bert.encoder.layer.*.attention.self.value.weight`
|
||||
# | `encoder.transformer_cells.*.ffn.ffn_2.bias` | `bert.encoder.layer.*.attention.output.dense.bias`
|
||||
# | `encoder.transformer_cells.*.ffn.ffn_2.weight` | `bert.encoder.layer.*.attention.output.dense.weight`
|
||||
# | `encoder.transformer_cells.*.layer_norm.beta` | `bert.encoder.layer.*.attention.output.LayerNorm.bias`
|
||||
# | `encoder.transformer_cells.*.layer_norm.gamma` | `bert.encoder.layer.*.attention.output.LayerNorm.weight`
|
||||
# | `encoder.transformer_cells.*.ffn.ffn_1.bias` | `bert.encoder.layer.*.intermediate.dense.bias`
|
||||
# | `encoder.transformer_cells.*.ffn.ffn_1.weight` | `bert.encoder.layer.*.intermediate.dense.weight`
|
||||
# | `encoder.transformer_cells.*.ffn.layer_norm.beta` | `bert.encoder.layer.*.output.LayerNorm.bias`
|
||||
# | `encoder.transformer_cells.*.ffn.layer_norm.gamma` | `bert.encoder.layer.*.output.LayerNorm.weight`
|
||||
# | `encoder.transformer_cells.*.proj.bias` | `bert.encoder.layer.*.output.dense.bias`
|
||||
# | `encoder.transformer_cells.*.proj.weight` | `bert.encoder.layer.*.output.dense.weight`
|
||||
|
||||
# Helper function to convert MXNET Arrays to PyTorch
|
||||
def to_torch(mx_array) -> nn.Parameter:
|
||||
return nn.Parameter(torch.FloatTensor(mx_array.data().asnumpy()))
|
||||
|
||||
# Check param shapes and map new HF param back
|
||||
def check_and_map_params(hf_param, gluon_param):
|
||||
shape_hf = hf_param.shape
|
||||
|
||||
gluon_param = to_torch(params[gluon_param])
|
||||
shape_gluon = gluon_param.shape
|
||||
|
||||
assert shape_hf == shape_gluon, (
|
||||
f"The gluon parameter {gluon_param} has shape {shape_gluon}, but expects shape {shape_hf} for Transformers"
|
||||
)
|
||||
|
||||
return gluon_param
|
||||
|
||||
hf_bort_model.bert.embeddings.word_embeddings.weight = check_and_map_params(
|
||||
hf_bort_model.bert.embeddings.word_embeddings.weight, "word_embed.0.weight"
|
||||
)
|
||||
hf_bort_model.bert.embeddings.position_embeddings.weight = check_and_map_params(
|
||||
hf_bort_model.bert.embeddings.position_embeddings.weight, "encoder.position_weight"
|
||||
)
|
||||
hf_bort_model.bert.embeddings.LayerNorm.bias = check_and_map_params(
|
||||
hf_bort_model.bert.embeddings.LayerNorm.bias, "encoder.layer_norm.beta"
|
||||
)
|
||||
hf_bort_model.bert.embeddings.LayerNorm.weight = check_and_map_params(
|
||||
hf_bort_model.bert.embeddings.LayerNorm.weight, "encoder.layer_norm.gamma"
|
||||
)
|
||||
|
||||
# Inspired by RoBERTa conversion script, we just zero them out (Bort does not use them)
|
||||
hf_bort_model.bert.embeddings.token_type_embeddings.weight.data = torch.zeros_like(
|
||||
hf_bort_model.bert.embeddings.token_type_embeddings.weight.data
|
||||
)
|
||||
|
||||
for i in range(hf_bort_config.num_hidden_layers):
|
||||
layer: BertLayer = hf_bort_model.bert.encoder.layer[i]
|
||||
|
||||
# self attention
|
||||
self_attn: BertSelfAttention = layer.attention.self
|
||||
|
||||
self_attn.key.bias.data = check_and_map_params(
|
||||
self_attn.key.bias.data, f"encoder.transformer_cells.{i}.attention_cell.proj_key.bias"
|
||||
)
|
||||
|
||||
self_attn.key.weight.data = check_and_map_params(
|
||||
self_attn.key.weight.data, f"encoder.transformer_cells.{i}.attention_cell.proj_key.weight"
|
||||
)
|
||||
self_attn.query.bias.data = check_and_map_params(
|
||||
self_attn.query.bias.data, f"encoder.transformer_cells.{i}.attention_cell.proj_query.bias"
|
||||
)
|
||||
self_attn.query.weight.data = check_and_map_params(
|
||||
self_attn.query.weight.data, f"encoder.transformer_cells.{i}.attention_cell.proj_query.weight"
|
||||
)
|
||||
self_attn.value.bias.data = check_and_map_params(
|
||||
self_attn.value.bias.data, f"encoder.transformer_cells.{i}.attention_cell.proj_value.bias"
|
||||
)
|
||||
self_attn.value.weight.data = check_and_map_params(
|
||||
self_attn.value.weight.data, f"encoder.transformer_cells.{i}.attention_cell.proj_value.weight"
|
||||
)
|
||||
|
||||
# self attention output
|
||||
self_output: BertSelfOutput = layer.attention.output
|
||||
|
||||
self_output.dense.bias = check_and_map_params(
|
||||
self_output.dense.bias, f"encoder.transformer_cells.{i}.proj.bias"
|
||||
)
|
||||
self_output.dense.weight = check_and_map_params(
|
||||
self_output.dense.weight, f"encoder.transformer_cells.{i}.proj.weight"
|
||||
)
|
||||
self_output.LayerNorm.bias = check_and_map_params(
|
||||
self_output.LayerNorm.bias, f"encoder.transformer_cells.{i}.layer_norm.beta"
|
||||
)
|
||||
self_output.LayerNorm.weight = check_and_map_params(
|
||||
self_output.LayerNorm.weight, f"encoder.transformer_cells.{i}.layer_norm.gamma"
|
||||
)
|
||||
|
||||
# intermediate
|
||||
intermediate: BertIntermediate = layer.intermediate
|
||||
|
||||
intermediate.dense.bias = check_and_map_params(
|
||||
intermediate.dense.bias, f"encoder.transformer_cells.{i}.ffn.ffn_1.bias"
|
||||
)
|
||||
intermediate.dense.weight = check_and_map_params(
|
||||
intermediate.dense.weight, f"encoder.transformer_cells.{i}.ffn.ffn_1.weight"
|
||||
)
|
||||
|
||||
# output
|
||||
bert_output: BertOutput = layer.output
|
||||
|
||||
bert_output.dense.bias = check_and_map_params(
|
||||
bert_output.dense.bias, f"encoder.transformer_cells.{i}.ffn.ffn_2.bias"
|
||||
)
|
||||
bert_output.dense.weight = check_and_map_params(
|
||||
bert_output.dense.weight, f"encoder.transformer_cells.{i}.ffn.ffn_2.weight"
|
||||
)
|
||||
bert_output.LayerNorm.bias = check_and_map_params(
|
||||
bert_output.LayerNorm.bias, f"encoder.transformer_cells.{i}.ffn.layer_norm.beta"
|
||||
)
|
||||
bert_output.LayerNorm.weight = check_and_map_params(
|
||||
bert_output.LayerNorm.weight, f"encoder.transformer_cells.{i}.ffn.layer_norm.gamma"
|
||||
)
|
||||
|
||||
# Save space and energy 🎄
|
||||
hf_bort_model.half()
|
||||
|
||||
# Compare output of both models
|
||||
tokenizer = RobertaTokenizer.from_pretrained("FacebookAI/roberta-base")
|
||||
|
||||
input_ids = tokenizer.encode_plus(SAMPLE_TEXT)["input_ids"]
|
||||
|
||||
# Get gluon output
|
||||
gluon_input_ids = mx.nd.array([input_ids])
|
||||
output_gluon = original_bort(inputs=gluon_input_ids, token_types=[])
|
||||
|
||||
# Get Transformer output (save and reload model again)
|
||||
hf_bort_model.save_pretrained(pytorch_dump_folder_path)
|
||||
hf_bort_model = BertModel.from_pretrained(pytorch_dump_folder_path)
|
||||
hf_bort_model.eval()
|
||||
|
||||
input_ids = tokenizer.encode_plus(SAMPLE_TEXT, return_tensors="pt")
|
||||
output_hf = hf_bort_model(**input_ids)[0]
|
||||
|
||||
gluon_layer = output_gluon[0].asnumpy()
|
||||
hf_layer = output_hf[0].detach().numpy()
|
||||
|
||||
max_absolute_diff = np.max(np.abs(hf_layer - gluon_layer)).item()
|
||||
success = np.allclose(gluon_layer, hf_layer, atol=1e-3)
|
||||
|
||||
if success:
|
||||
print("[SUCCESS] Both models do output the same tensors")
|
||||
else:
|
||||
print("[FAIL] Both models do **NOT** output the same tensors")
|
||||
print("Absolute difference is:", max_absolute_diff)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser()
|
||||
# Required parameters
|
||||
parser.add_argument(
|
||||
"--bort_checkpoint_path", default=None, type=str, required=True, help="Path the official Bort params file."
|
||||
)
|
||||
parser.add_argument(
|
||||
"--pytorch_dump_folder_path", default=None, type=str, required=True, help="Path to the output PyTorch model."
|
||||
)
|
||||
args = parser.parse_args()
|
||||
convert_bort_checkpoint_to_pytorch(args.bort_checkpoint_path, args.pytorch_dump_folder_path)
|
||||
@ -1,28 +0,0 @@
|
||||
# Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
from ....utils import _LazyModule
|
||||
from ....utils.import_utils import define_import_structure
|
||||
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from .configuration_deta import *
|
||||
from .image_processing_deta import *
|
||||
from .modeling_deta import *
|
||||
else:
|
||||
import sys
|
||||
|
||||
_file = globals()["__file__"]
|
||||
sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
|
||||
@ -1,278 +0,0 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2022 SenseTime and The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""DETA model configuration"""
|
||||
|
||||
from ....configuration_utils import PreTrainedConfig
|
||||
from ....utils import logging
|
||||
from ...auto import CONFIG_MAPPING
|
||||
|
||||
|
||||
logger = logging.get_logger(__name__)
|
||||
|
||||
|
||||
class DetaConfig(PreTrainedConfig):
|
||||
r"""
|
||||
This is the configuration class to store the configuration of a [`DetaModel`]. It is used to instantiate a DETA
|
||||
model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
|
||||
defaults will yield a similar configuration to that of the DETA
|
||||
[SenseTime/deformable-detr](https://huggingface.co/SenseTime/deformable-detr) architecture.
|
||||
|
||||
Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
|
||||
documentation from [`PreTrainedConfig`] for more information.
|
||||
|
||||
Args:
|
||||
backbone_config (`PreTrainedConfig` or `dict`, *optional*, defaults to `ResNetConfig()`):
|
||||
The configuration of the backbone model.
|
||||
backbone (`str`, *optional*):
|
||||
Name of backbone to use when `backbone_config` is `None`. If `use_pretrained_backbone` is `True`, this
|
||||
will load the corresponding pretrained weights from the timm or transformers library. If `use_pretrained_backbone`
|
||||
is `False`, this loads the backbone's config and uses that to initialize the backbone with random weights.
|
||||
use_pretrained_backbone (`bool`, *optional*, `False`):
|
||||
Whether to use pretrained weights for the backbone.
|
||||
use_timm_backbone (`bool`, *optional*, `False`):
|
||||
Whether to load `backbone` from the timm library. If `False`, the backbone is loaded from the transformers
|
||||
library.
|
||||
backbone_kwargs (`dict`, *optional*):
|
||||
Keyword arguments to be passed to AutoBackbone when loading from a checkpoint
|
||||
e.g. `{'out_indices': (0, 1, 2, 3)}`. Cannot be specified if `backbone_config` is set.
|
||||
num_queries (`int`, *optional*, defaults to 900):
|
||||
Number of object queries, i.e. detection slots. This is the maximal number of objects [`DetaModel`] can
|
||||
detect in a single image. In case `two_stage` is set to `True`, we use `two_stage_num_proposals` instead.
|
||||
d_model (`int`, *optional*, defaults to 256):
|
||||
Dimension of the layers.
|
||||
encoder_layers (`int`, *optional*, defaults to 6):
|
||||
Number of encoder layers.
|
||||
decoder_layers (`int`, *optional*, defaults to 6):
|
||||
Number of decoder layers.
|
||||
encoder_attention_heads (`int`, *optional*, defaults to 8):
|
||||
Number of attention heads for each attention layer in the Transformer encoder.
|
||||
decoder_attention_heads (`int`, *optional*, defaults to 8):
|
||||
Number of attention heads for each attention layer in the Transformer decoder.
|
||||
decoder_ffn_dim (`int`, *optional*, defaults to 2048):
|
||||
Dimension of the "intermediate" (often named feed-forward) layer in decoder.
|
||||
encoder_ffn_dim (`int`, *optional*, defaults to 2048):
|
||||
Dimension of the "intermediate" (often named feed-forward) layer in decoder.
|
||||
activation_function (`str` or `function`, *optional*, defaults to `"relu"`):
|
||||
The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
|
||||
`"relu"`, `"silu"` and `"gelu_new"` are supported.
|
||||
dropout (`float`, *optional*, defaults to 0.1):
|
||||
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
|
||||
attention_dropout (`float`, *optional*, defaults to 0.0):
|
||||
The dropout ratio for the attention probabilities.
|
||||
activation_dropout (`float`, *optional*, defaults to 0.0):
|
||||
The dropout ratio for activations inside the fully connected layer.
|
||||
init_std (`float`, *optional*, defaults to 0.02):
|
||||
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
|
||||
init_xavier_std (`float`, *optional*, defaults to 1):
|
||||
The scaling factor used for the Xavier initialization gain in the HM Attention map module.
|
||||
encoder_layerdrop (`float`, *optional*, defaults to 0.0):
|
||||
The LayerDrop probability for the encoder. See the [LayerDrop paper](see https://huggingface.co/papers/1909.11556)
|
||||
for more details.
|
||||
auxiliary_loss (`bool`, *optional*, defaults to `False`):
|
||||
Whether auxiliary decoding losses (loss at each decoder layer) are to be used.
|
||||
position_embedding_type (`str`, *optional*, defaults to `"sine"`):
|
||||
Type of position embeddings to be used on top of the image features. One of `"sine"` or `"learned"`.
|
||||
class_cost (`float`, *optional*, defaults to 1):
|
||||
Relative weight of the classification error in the Hungarian matching cost.
|
||||
bbox_cost (`float`, *optional*, defaults to 5):
|
||||
Relative weight of the L1 error of the bounding box coordinates in the Hungarian matching cost.
|
||||
giou_cost (`float`, *optional*, defaults to 2):
|
||||
Relative weight of the generalized IoU loss of the bounding box in the Hungarian matching cost.
|
||||
mask_loss_coefficient (`float`, *optional*, defaults to 1):
|
||||
Relative weight of the Focal loss in the panoptic segmentation loss.
|
||||
dice_loss_coefficient (`float`, *optional*, defaults to 1):
|
||||
Relative weight of the DICE/F-1 loss in the panoptic segmentation loss.
|
||||
bbox_loss_coefficient (`float`, *optional*, defaults to 5):
|
||||
Relative weight of the L1 bounding box loss in the object detection loss.
|
||||
giou_loss_coefficient (`float`, *optional*, defaults to 2):
|
||||
Relative weight of the generalized IoU loss in the object detection loss.
|
||||
eos_coefficient (`float`, *optional*, defaults to 0.1):
|
||||
Relative classification weight of the 'no-object' class in the object detection loss.
|
||||
num_feature_levels (`int`, *optional*, defaults to 5):
|
||||
The number of input feature levels.
|
||||
encoder_n_points (`int`, *optional*, defaults to 4):
|
||||
The number of sampled keys in each feature level for each attention head in the encoder.
|
||||
decoder_n_points (`int`, *optional*, defaults to 4):
|
||||
The number of sampled keys in each feature level for each attention head in the decoder.
|
||||
two_stage (`bool`, *optional*, defaults to `True`):
|
||||
Whether to apply a two-stage deformable DETR, where the region proposals are also generated by a variant of
|
||||
DETA, which are further fed into the decoder for iterative bounding box refinement.
|
||||
two_stage_num_proposals (`int`, *optional*, defaults to 300):
|
||||
The number of region proposals to be generated, in case `two_stage` is set to `True`.
|
||||
with_box_refine (`bool`, *optional*, defaults to `True`):
|
||||
Whether to apply iterative bounding box refinement, where each decoder layer refines the bounding boxes
|
||||
based on the predictions from the previous layer.
|
||||
focal_alpha (`float`, *optional*, defaults to 0.25):
|
||||
Alpha parameter in the focal loss.
|
||||
assign_first_stage (`bool`, *optional*, defaults to `True`):
|
||||
Whether to assign each prediction i to the highest overlapping ground truth object if the overlap is larger than a threshold 0.7.
|
||||
assign_second_stage (`bool`, *optional*, defaults to `True`):
|
||||
Whether to assign second assignment procedure in the second stage closely follows the first stage assignment procedure.
|
||||
disable_custom_kernels (`bool`, *optional*, defaults to `True`):
|
||||
Disable the use of custom CUDA and CPU kernels. This option is necessary for the ONNX export, as custom
|
||||
kernels are not supported by PyTorch ONNX export.
|
||||
|
||||
Examples:
|
||||
|
||||
```python
|
||||
>>> from transformers import DetaConfig, DetaModel
|
||||
|
||||
>>> # Initializing a DETA SenseTime/deformable-detr style configuration
|
||||
>>> configuration = DetaConfig()
|
||||
|
||||
>>> # Initializing a model (with random weights) from the SenseTime/deformable-detr style configuration
|
||||
>>> model = DetaModel(configuration)
|
||||
|
||||
>>> # Accessing the model configuration
|
||||
>>> configuration = model.config
|
||||
```"""
|
||||
|
||||
model_type = "deta"
|
||||
attribute_map = {
|
||||
"hidden_size": "d_model",
|
||||
"num_attention_heads": "encoder_attention_heads",
|
||||
}
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
backbone_config=None,
|
||||
backbone=None,
|
||||
use_pretrained_backbone=False,
|
||||
use_timm_backbone=False,
|
||||
backbone_kwargs=None,
|
||||
num_queries=900,
|
||||
max_position_embeddings=2048,
|
||||
encoder_layers=6,
|
||||
encoder_ffn_dim=2048,
|
||||
encoder_attention_heads=8,
|
||||
decoder_layers=6,
|
||||
decoder_ffn_dim=1024,
|
||||
decoder_attention_heads=8,
|
||||
encoder_layerdrop=0.0,
|
||||
is_encoder_decoder=True,
|
||||
activation_function="relu",
|
||||
d_model=256,
|
||||
dropout=0.1,
|
||||
attention_dropout=0.0,
|
||||
activation_dropout=0.0,
|
||||
init_std=0.02,
|
||||
init_xavier_std=1.0,
|
||||
return_intermediate=True,
|
||||
auxiliary_loss=False,
|
||||
position_embedding_type="sine",
|
||||
num_feature_levels=5,
|
||||
encoder_n_points=4,
|
||||
decoder_n_points=4,
|
||||
two_stage=True,
|
||||
two_stage_num_proposals=300,
|
||||
with_box_refine=True,
|
||||
assign_first_stage=True,
|
||||
assign_second_stage=True,
|
||||
class_cost=1,
|
||||
bbox_cost=5,
|
||||
giou_cost=2,
|
||||
mask_loss_coefficient=1,
|
||||
dice_loss_coefficient=1,
|
||||
bbox_loss_coefficient=5,
|
||||
giou_loss_coefficient=2,
|
||||
eos_coefficient=0.1,
|
||||
focal_alpha=0.25,
|
||||
disable_custom_kernels=True,
|
||||
**kwargs,
|
||||
):
|
||||
if use_pretrained_backbone:
|
||||
raise ValueError("Pretrained backbones are not supported yet.")
|
||||
|
||||
if backbone_config is not None and backbone is not None:
|
||||
raise ValueError("You can't specify both `backbone` and `backbone_config`.")
|
||||
|
||||
if backbone_config is None and backbone is None:
|
||||
logger.info("`backbone_config` is `None`. Initializing the config with the default `ResNet` backbone.")
|
||||
backbone_config = CONFIG_MAPPING["resnet"](out_features=["stage2", "stage3", "stage4"])
|
||||
else:
|
||||
if isinstance(backbone_config, dict):
|
||||
backbone_model_type = backbone_config.pop("model_type")
|
||||
config_class = CONFIG_MAPPING[backbone_model_type]
|
||||
backbone_config = config_class.from_dict(backbone_config)
|
||||
|
||||
if backbone_kwargs is not None and backbone_kwargs and backbone_config is not None:
|
||||
raise ValueError("You can't specify both `backbone_kwargs` and `backbone_config`.")
|
||||
|
||||
self.backbone_config = backbone_config
|
||||
self.backbone = backbone
|
||||
self.use_pretrained_backbone = use_pretrained_backbone
|
||||
self.use_timm_backbone = use_timm_backbone
|
||||
self.backbone_kwargs = backbone_kwargs
|
||||
self.num_queries = num_queries
|
||||
self.max_position_embeddings = max_position_embeddings
|
||||
self.d_model = d_model
|
||||
self.encoder_ffn_dim = encoder_ffn_dim
|
||||
self.encoder_layers = encoder_layers
|
||||
self.encoder_attention_heads = encoder_attention_heads
|
||||
self.decoder_ffn_dim = decoder_ffn_dim
|
||||
self.decoder_layers = decoder_layers
|
||||
self.decoder_attention_heads = decoder_attention_heads
|
||||
self.dropout = dropout
|
||||
self.attention_dropout = attention_dropout
|
||||
self.activation_dropout = activation_dropout
|
||||
self.activation_function = activation_function
|
||||
self.init_std = init_std
|
||||
self.init_xavier_std = init_xavier_std
|
||||
self.encoder_layerdrop = encoder_layerdrop
|
||||
self.auxiliary_loss = auxiliary_loss
|
||||
self.position_embedding_type = position_embedding_type
|
||||
# deformable attributes
|
||||
self.num_feature_levels = num_feature_levels
|
||||
self.encoder_n_points = encoder_n_points
|
||||
self.decoder_n_points = decoder_n_points
|
||||
self.two_stage = two_stage
|
||||
self.two_stage_num_proposals = two_stage_num_proposals
|
||||
self.with_box_refine = with_box_refine
|
||||
self.assign_first_stage = assign_first_stage
|
||||
self.assign_second_stage = assign_second_stage
|
||||
if two_stage is True and with_box_refine is False:
|
||||
raise ValueError("If two_stage is True, with_box_refine must be True.")
|
||||
# Hungarian matcher
|
||||
self.class_cost = class_cost
|
||||
self.bbox_cost = bbox_cost
|
||||
self.giou_cost = giou_cost
|
||||
# Loss coefficients
|
||||
self.mask_loss_coefficient = mask_loss_coefficient
|
||||
self.dice_loss_coefficient = dice_loss_coefficient
|
||||
self.bbox_loss_coefficient = bbox_loss_coefficient
|
||||
self.giou_loss_coefficient = giou_loss_coefficient
|
||||
self.eos_coefficient = eos_coefficient
|
||||
self.focal_alpha = focal_alpha
|
||||
self.disable_custom_kernels = disable_custom_kernels
|
||||
super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs)
|
||||
|
||||
@property
|
||||
def num_attention_heads(self) -> int:
|
||||
return self.encoder_attention_heads
|
||||
|
||||
@property
|
||||
def hidden_size(self) -> int:
|
||||
return self.d_model
|
||||
|
||||
@property
|
||||
def sub_configs(self):
|
||||
return (
|
||||
{"backbone_config": type(self.backbone_config)}
|
||||
if getattr(self, "backbone_config", None) is not None
|
||||
else {}
|
||||
)
|
||||
|
||||
|
||||
__all__ = ["DetaConfig"]
|
||||
@ -1,321 +0,0 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2022 The HuggingFace Inc. team.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""Convert DETA checkpoints from the original repository.
|
||||
|
||||
URL: https://github.com/jozhang97/DETA/tree/master"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
import requests
|
||||
import torch
|
||||
from huggingface_hub import hf_hub_download
|
||||
from PIL import Image
|
||||
|
||||
from transformers import DetaConfig, DetaForObjectDetection, DetaImageProcessor
|
||||
from transformers.utils import logging
|
||||
|
||||
|
||||
logging.set_verbosity_info()
|
||||
logger = logging.get_logger(__name__)
|
||||
|
||||
|
||||
def get_deta_config():
|
||||
config = DetaConfig(
|
||||
num_queries=900,
|
||||
encoder_ffn_dim=2048,
|
||||
decoder_ffn_dim=2048,
|
||||
num_feature_levels=5,
|
||||
assign_first_stage=True,
|
||||
with_box_refine=True,
|
||||
two_stage=True,
|
||||
)
|
||||
|
||||
# set labels
|
||||
config.num_labels = 91
|
||||
repo_id = "huggingface/label-files"
|
||||
filename = "coco-detection-id2label.json"
|
||||
id2label = json.loads(Path(hf_hub_download(repo_id, filename, repo_type="dataset")).read_text())
|
||||
id2label = {int(k): v for k, v in id2label.items()}
|
||||
config.id2label = id2label
|
||||
config.label2id = {v: k for k, v in id2label.items()}
|
||||
|
||||
return config
|
||||
|
||||
|
||||
# here we list all keys to be renamed (original name on the left, our name on the right)
|
||||
def create_rename_keys(config):
|
||||
rename_keys = []
|
||||
|
||||
# stem
|
||||
# fmt: off
|
||||
rename_keys.append(("backbone.0.body.conv1.weight", "model.backbone.model.embedder.embedder.convolution.weight"))
|
||||
rename_keys.append(("backbone.0.body.bn1.weight", "model.backbone.model.embedder.embedder.normalization.weight"))
|
||||
rename_keys.append(("backbone.0.body.bn1.bias", "model.backbone.model.embedder.embedder.normalization.bias"))
|
||||
rename_keys.append(("backbone.0.body.bn1.running_mean", "model.backbone.model.embedder.embedder.normalization.running_mean"))
|
||||
rename_keys.append(("backbone.0.body.bn1.running_var", "model.backbone.model.embedder.embedder.normalization.running_var"))
|
||||
# stages
|
||||
for stage_idx in range(len(config.backbone_config.depths)):
|
||||
for layer_idx in range(config.backbone_config.depths[stage_idx]):
|
||||
# shortcut
|
||||
if layer_idx == 0:
|
||||
rename_keys.append(
|
||||
(
|
||||
f"backbone.0.body.layer{stage_idx + 1}.{layer_idx}.downsample.0.weight",
|
||||
f"model.backbone.model.encoder.stages.{stage_idx}.layers.{layer_idx}.shortcut.convolution.weight",
|
||||
)
|
||||
)
|
||||
rename_keys.append(
|
||||
(
|
||||
f"backbone.0.body.layer{stage_idx + 1}.{layer_idx}.downsample.1.weight",
|
||||
f"model.backbone.model.encoder.stages.{stage_idx}.layers.{layer_idx}.shortcut.normalization.weight",
|
||||
)
|
||||
)
|
||||
rename_keys.append(
|
||||
(
|
||||
f"backbone.0.body.layer{stage_idx + 1}.{layer_idx}.downsample.1.bias",
|
||||
f"model.backbone.model.encoder.stages.{stage_idx}.layers.{layer_idx}.shortcut.normalization.bias",
|
||||
)
|
||||
)
|
||||
rename_keys.append(
|
||||
(
|
||||
f"backbone.0.body.layer{stage_idx + 1}.{layer_idx}.downsample.1.running_mean",
|
||||
f"model.backbone.model.encoder.stages.{stage_idx}.layers.{layer_idx}.shortcut.normalization.running_mean",
|
||||
)
|
||||
)
|
||||
rename_keys.append(
|
||||
(
|
||||
f"backbone.0.body.layer{stage_idx + 1}.{layer_idx}.downsample.1.running_var",
|
||||
f"model.backbone.model.encoder.stages.{stage_idx}.layers.{layer_idx}.shortcut.normalization.running_var",
|
||||
)
|
||||
)
|
||||
# 3 convs
|
||||
for i in range(3):
|
||||
rename_keys.append(
|
||||
(
|
||||
f"backbone.0.body.layer{stage_idx + 1}.{layer_idx}.conv{i+1}.weight",
|
||||
f"model.backbone.model.encoder.stages.{stage_idx}.layers.{layer_idx}.layer.{i}.convolution.weight",
|
||||
)
|
||||
)
|
||||
rename_keys.append(
|
||||
(
|
||||
f"backbone.0.body.layer{stage_idx + 1}.{layer_idx}.bn{i+1}.weight",
|
||||
f"model.backbone.model.encoder.stages.{stage_idx}.layers.{layer_idx}.layer.{i}.normalization.weight",
|
||||
)
|
||||
)
|
||||
rename_keys.append(
|
||||
(
|
||||
f"backbone.0.body.layer{stage_idx + 1}.{layer_idx}.bn{i+1}.bias",
|
||||
f"model.backbone.model.encoder.stages.{stage_idx}.layers.{layer_idx}.layer.{i}.normalization.bias",
|
||||
)
|
||||
)
|
||||
rename_keys.append(
|
||||
(
|
||||
f"backbone.0.body.layer{stage_idx + 1}.{layer_idx}.bn{i+1}.running_mean",
|
||||
f"model.backbone.model.encoder.stages.{stage_idx}.layers.{layer_idx}.layer.{i}.normalization.running_mean",
|
||||
)
|
||||
)
|
||||
rename_keys.append(
|
||||
(
|
||||
f"backbone.0.body.layer{stage_idx + 1}.{layer_idx}.bn{i+1}.running_var",
|
||||
f"model.backbone.model.encoder.stages.{stage_idx}.layers.{layer_idx}.layer.{i}.normalization.running_var",
|
||||
)
|
||||
)
|
||||
# transformer encoder
|
||||
for i in range(config.encoder_layers):
|
||||
rename_keys.append((f"transformer.encoder.layers.{i}.self_attn.sampling_offsets.weight", f"model.encoder.layers.{i}.self_attn.sampling_offsets.weight"))
|
||||
rename_keys.append((f"transformer.encoder.layers.{i}.self_attn.sampling_offsets.bias", f"model.encoder.layers.{i}.self_attn.sampling_offsets.bias"))
|
||||
rename_keys.append((f"transformer.encoder.layers.{i}.self_attn.attention_weights.weight", f"model.encoder.layers.{i}.self_attn.attention_weights.weight"))
|
||||
rename_keys.append((f"transformer.encoder.layers.{i}.self_attn.attention_weights.bias", f"model.encoder.layers.{i}.self_attn.attention_weights.bias"))
|
||||
rename_keys.append((f"transformer.encoder.layers.{i}.self_attn.value_proj.weight", f"model.encoder.layers.{i}.self_attn.value_proj.weight"))
|
||||
rename_keys.append((f"transformer.encoder.layers.{i}.self_attn.value_proj.bias", f"model.encoder.layers.{i}.self_attn.value_proj.bias"))
|
||||
rename_keys.append((f"transformer.encoder.layers.{i}.self_attn.output_proj.weight", f"model.encoder.layers.{i}.self_attn.output_proj.weight"))
|
||||
rename_keys.append((f"transformer.encoder.layers.{i}.self_attn.output_proj.bias", f"model.encoder.layers.{i}.self_attn.output_proj.bias"))
|
||||
rename_keys.append((f"transformer.encoder.layers.{i}.norm1.weight", f"model.encoder.layers.{i}.self_attn_layer_norm.weight"))
|
||||
rename_keys.append((f"transformer.encoder.layers.{i}.norm1.bias", f"model.encoder.layers.{i}.self_attn_layer_norm.bias"))
|
||||
rename_keys.append((f"transformer.encoder.layers.{i}.linear1.weight", f"model.encoder.layers.{i}.fc1.weight"))
|
||||
rename_keys.append((f"transformer.encoder.layers.{i}.linear1.bias", f"model.encoder.layers.{i}.fc1.bias"))
|
||||
rename_keys.append((f"transformer.encoder.layers.{i}.linear2.weight", f"model.encoder.layers.{i}.fc2.weight"))
|
||||
rename_keys.append((f"transformer.encoder.layers.{i}.linear2.bias", f"model.encoder.layers.{i}.fc2.bias"))
|
||||
rename_keys.append((f"transformer.encoder.layers.{i}.norm2.weight", f"model.encoder.layers.{i}.final_layer_norm.weight"))
|
||||
rename_keys.append((f"transformer.encoder.layers.{i}.norm2.bias", f"model.encoder.layers.{i}.final_layer_norm.bias"))
|
||||
|
||||
# transformer decoder
|
||||
for i in range(config.decoder_layers):
|
||||
rename_keys.append((f"transformer.decoder.layers.{i}.cross_attn.sampling_offsets.weight", f"model.decoder.layers.{i}.encoder_attn.sampling_offsets.weight"))
|
||||
rename_keys.append((f"transformer.decoder.layers.{i}.cross_attn.sampling_offsets.bias", f"model.decoder.layers.{i}.encoder_attn.sampling_offsets.bias"))
|
||||
rename_keys.append((f"transformer.decoder.layers.{i}.cross_attn.attention_weights.weight", f"model.decoder.layers.{i}.encoder_attn.attention_weights.weight"))
|
||||
rename_keys.append((f"transformer.decoder.layers.{i}.cross_attn.attention_weights.bias", f"model.decoder.layers.{i}.encoder_attn.attention_weights.bias"))
|
||||
rename_keys.append((f"transformer.decoder.layers.{i}.cross_attn.value_proj.weight", f"model.decoder.layers.{i}.encoder_attn.value_proj.weight"))
|
||||
rename_keys.append((f"transformer.decoder.layers.{i}.cross_attn.value_proj.bias", f"model.decoder.layers.{i}.encoder_attn.value_proj.bias"))
|
||||
rename_keys.append((f"transformer.decoder.layers.{i}.cross_attn.output_proj.weight", f"model.decoder.layers.{i}.encoder_attn.output_proj.weight"))
|
||||
rename_keys.append((f"transformer.decoder.layers.{i}.cross_attn.output_proj.bias", f"model.decoder.layers.{i}.encoder_attn.output_proj.bias"))
|
||||
rename_keys.append((f"transformer.decoder.layers.{i}.norm1.weight", f"model.decoder.layers.{i}.encoder_attn_layer_norm.weight"))
|
||||
rename_keys.append((f"transformer.decoder.layers.{i}.norm1.bias", f"model.decoder.layers.{i}.encoder_attn_layer_norm.bias"))
|
||||
rename_keys.append((f"transformer.decoder.layers.{i}.self_attn.out_proj.weight", f"model.decoder.layers.{i}.self_attn.out_proj.weight"))
|
||||
rename_keys.append((f"transformer.decoder.layers.{i}.self_attn.out_proj.bias", f"model.decoder.layers.{i}.self_attn.out_proj.bias"))
|
||||
rename_keys.append((f"transformer.decoder.layers.{i}.norm2.weight", f"model.decoder.layers.{i}.self_attn_layer_norm.weight"))
|
||||
rename_keys.append((f"transformer.decoder.layers.{i}.norm2.bias", f"model.decoder.layers.{i}.self_attn_layer_norm.bias"))
|
||||
rename_keys.append((f"transformer.decoder.layers.{i}.linear1.weight", f"model.decoder.layers.{i}.fc1.weight"))
|
||||
rename_keys.append((f"transformer.decoder.layers.{i}.linear1.bias", f"model.decoder.layers.{i}.fc1.bias"))
|
||||
rename_keys.append((f"transformer.decoder.layers.{i}.linear2.weight", f"model.decoder.layers.{i}.fc2.weight"))
|
||||
rename_keys.append((f"transformer.decoder.layers.{i}.linear2.bias", f"model.decoder.layers.{i}.fc2.bias"))
|
||||
rename_keys.append((f"transformer.decoder.layers.{i}.norm3.weight", f"model.decoder.layers.{i}.final_layer_norm.weight"))
|
||||
rename_keys.append((f"transformer.decoder.layers.{i}.norm3.bias", f"model.decoder.layers.{i}.final_layer_norm.bias"))
|
||||
|
||||
# fmt: on
|
||||
|
||||
return rename_keys
|
||||
|
||||
|
||||
def rename_key(dct, old, new):
|
||||
val = dct.pop(old)
|
||||
dct[new] = val
|
||||
|
||||
|
||||
def read_in_decoder_q_k_v(state_dict, config):
|
||||
# transformer decoder self-attention layers
|
||||
hidden_size = config.d_model
|
||||
for i in range(config.decoder_layers):
|
||||
# read in weights + bias of input projection layer of self-attention
|
||||
in_proj_weight = state_dict.pop(f"transformer.decoder.layers.{i}.self_attn.in_proj_weight")
|
||||
in_proj_bias = state_dict.pop(f"transformer.decoder.layers.{i}.self_attn.in_proj_bias")
|
||||
# next, add query, keys and values (in that order) to the state dict
|
||||
state_dict[f"model.decoder.layers.{i}.self_attn.q_proj.weight"] = in_proj_weight[:hidden_size, :]
|
||||
state_dict[f"model.decoder.layers.{i}.self_attn.q_proj.bias"] = in_proj_bias[:hidden_size]
|
||||
state_dict[f"model.decoder.layers.{i}.self_attn.k_proj.weight"] = in_proj_weight[
|
||||
hidden_size : hidden_size * 2, :
|
||||
]
|
||||
state_dict[f"model.decoder.layers.{i}.self_attn.k_proj.bias"] = in_proj_bias[hidden_size : hidden_size * 2]
|
||||
state_dict[f"model.decoder.layers.{i}.self_attn.v_proj.weight"] = in_proj_weight[-hidden_size:, :]
|
||||
state_dict[f"model.decoder.layers.{i}.self_attn.v_proj.bias"] = in_proj_bias[-hidden_size:]
|
||||
|
||||
|
||||
# We will verify our results on an image of cute cats
|
||||
def prepare_img():
|
||||
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
|
||||
im = Image.open(requests.get(url, stream=True).raw)
|
||||
|
||||
return im
|
||||
|
||||
|
||||
@torch.no_grad()
|
||||
def convert_deta_checkpoint(model_name, pytorch_dump_folder_path, push_to_hub):
|
||||
"""
|
||||
Copy/paste/tweak model's weights to our DETA structure.
|
||||
"""
|
||||
|
||||
# load config
|
||||
config = get_deta_config()
|
||||
|
||||
# load original state dict
|
||||
if model_name == "deta-resnet-50":
|
||||
filename = "adet_checkpoint0011.pth"
|
||||
elif model_name == "deta-resnet-50-24-epochs":
|
||||
filename = "adet_2x_checkpoint0023.pth"
|
||||
else:
|
||||
raise ValueError(f"Model name {model_name} not supported")
|
||||
checkpoint_path = hf_hub_download(repo_id="nielsr/deta-checkpoints", filename=filename)
|
||||
state_dict = torch.load(checkpoint_path, map_location="cpu", weights_only=True)["model"]
|
||||
|
||||
# rename keys
|
||||
rename_keys = create_rename_keys(config)
|
||||
for src, dest in rename_keys:
|
||||
rename_key(state_dict, src, dest)
|
||||
read_in_decoder_q_k_v(state_dict, config)
|
||||
|
||||
# fix some prefixes
|
||||
for key in state_dict.copy():
|
||||
if "transformer.decoder.class_embed" in key or "transformer.decoder.bbox_embed" in key:
|
||||
val = state_dict.pop(key)
|
||||
state_dict[key.replace("transformer.decoder", "model.decoder")] = val
|
||||
if "input_proj" in key:
|
||||
val = state_dict.pop(key)
|
||||
state_dict["model." + key] = val
|
||||
if "level_embed" in key or "pos_trans" in key or "pix_trans" in key or "enc_output" in key:
|
||||
val = state_dict.pop(key)
|
||||
state_dict[key.replace("transformer", "model")] = val
|
||||
|
||||
# finally, create HuggingFace model and load state dict
|
||||
model = DetaForObjectDetection(config)
|
||||
model.load_state_dict(state_dict)
|
||||
model.eval()
|
||||
|
||||
device = "cuda" if torch.cuda.is_available() else "cpu"
|
||||
model.to(device)
|
||||
|
||||
# load image processor
|
||||
processor = DetaImageProcessor(format="coco_detection")
|
||||
|
||||
# verify our conversion on image
|
||||
img = prepare_img()
|
||||
encoding = processor(images=img, return_tensors="pt")
|
||||
pixel_values = encoding["pixel_values"]
|
||||
outputs = model(pixel_values.to(device))
|
||||
|
||||
# verify logits
|
||||
if model_name == "deta-resnet-50":
|
||||
expected_logits = torch.tensor(
|
||||
[[-7.3978, -2.5406, -4.1668], [-8.2684, -3.9933, -3.8096], [-7.0515, -3.7973, -5.8516]]
|
||||
)
|
||||
expected_boxes = torch.tensor([[0.5043, 0.4973, 0.9998], [0.2542, 0.5489, 0.4748], [0.5490, 0.2765, 0.0570]])
|
||||
elif model_name == "deta-resnet-50-24-epochs":
|
||||
expected_logits = torch.tensor(
|
||||
[[-7.1688, -2.4857, -4.8669], [-7.8630, -3.8154, -4.2674], [-7.2730, -4.1865, -5.5323]]
|
||||
)
|
||||
expected_boxes = torch.tensor([[0.5021, 0.4971, 0.9994], [0.2546, 0.5486, 0.4731], [0.1686, 0.1986, 0.2142]])
|
||||
|
||||
assert torch.allclose(outputs.logits[0, :3, :3], expected_logits.to(device), atol=1e-4)
|
||||
assert torch.allclose(outputs.pred_boxes[0, :3, :3], expected_boxes.to(device), atol=1e-4)
|
||||
print("Everything ok!")
|
||||
|
||||
if pytorch_dump_folder_path:
|
||||
# Save model and processor
|
||||
logger.info(f"Saving PyTorch model and processor to {pytorch_dump_folder_path}...")
|
||||
Path(pytorch_dump_folder_path).mkdir(exist_ok=True)
|
||||
model.save_pretrained(pytorch_dump_folder_path)
|
||||
processor.save_pretrained(pytorch_dump_folder_path)
|
||||
|
||||
# Push to hub
|
||||
if push_to_hub:
|
||||
print("Pushing model and processor to hub...")
|
||||
model.push_to_hub(f"jozhang97/{model_name}")
|
||||
processor.push_to_hub(f"jozhang97/{model_name}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser()
|
||||
|
||||
parser.add_argument(
|
||||
"--model_name",
|
||||
type=str,
|
||||
default="deta-resnet-50",
|
||||
choices=["deta-resnet-50", "deta-resnet-50-24-epochs"],
|
||||
help="Name of the model you'd like to convert.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--pytorch_dump_folder_path",
|
||||
default=None,
|
||||
type=str,
|
||||
help="Path to the folder to output PyTorch model.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--push_to_hub",
|
||||
action="store_true",
|
||||
help="Whether or not to push the converted model to the Hugging Face hub.",
|
||||
)
|
||||
args = parser.parse_args()
|
||||
convert_deta_checkpoint(args.model_name, args.pytorch_dump_folder_path, args.push_to_hub)
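# Example invocation (illustrative only; the script filename and output folder are assumptions, not
# taken from the original repository):
#
#     python convert_deta_resnet_to_pytorch.py \
#         --model_name deta-resnet-50 \
#         --pytorch_dump_folder_path ./deta-resnet-50-converted
#
# Add --push_to_hub to also upload the converted model and processor under the jozhang97 namespace.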
|
||||
@ -1,328 +0,0 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2022 The HuggingFace Inc. team.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""Convert DETA checkpoints from the original repository.
|
||||
|
||||
URL: https://github.com/jozhang97/DETA/tree/master"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
import requests
|
||||
import torch
|
||||
from huggingface_hub import hf_hub_download
|
||||
from PIL import Image
|
||||
|
||||
from transformers import DetaConfig, DetaForObjectDetection, DetaImageProcessor, SwinConfig
|
||||
from transformers.utils import logging
|
||||
|
||||
|
||||
logging.set_verbosity_info()
|
||||
logger = logging.get_logger(__name__)
|
||||
|
||||
|
||||
def get_deta_config(model_name):
|
||||
backbone_config = SwinConfig(
|
||||
embed_dim=192,
|
||||
depths=(2, 2, 18, 2),
|
||||
num_heads=(6, 12, 24, 48),
|
||||
window_size=12,
|
||||
out_features=["stage2", "stage3", "stage4"],
|
||||
)
|
||||
|
||||
config = DetaConfig(
|
||||
backbone_config=backbone_config,
|
||||
num_queries=900,
|
||||
encoder_ffn_dim=2048,
|
||||
decoder_ffn_dim=2048,
|
||||
num_feature_levels=5,
|
||||
assign_first_stage=True,
|
||||
with_box_refine=True,
|
||||
two_stage=True,
|
||||
)
|
||||
|
||||
# set labels
|
||||
repo_id = "huggingface/label-files"
|
||||
if "o365" in model_name:
|
||||
num_labels = 366
|
||||
filename = "object365-id2label.json"
|
||||
else:
|
||||
num_labels = 91
|
||||
filename = "coco-detection-id2label.json"
|
||||
|
||||
config.num_labels = num_labels
|
||||
id2label = json.loads(Path(hf_hub_download(repo_id, filename, repo_type="dataset")).read_text())
|
||||
id2label = {int(k): v for k, v in id2label.items()}
|
||||
config.id2label = id2label
|
||||
config.label2id = {v: k for k, v in id2label.items()}
|
||||
|
||||
return config
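# Illustrative check (not part of the original script): id2label and label2id are built as inverse
# mappings, so a round trip through both is the identity. A toy sketch (the labels are made up):
#
#     id2label = {0: "person", 1: "bicycle"}
#     label2id = {v: k for k, v in id2label.items()}
#     assert all(id2label[label2id[name]] == name for name in id2label.values())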
|
||||
|
||||
|
||||
# here we list all keys to be renamed (original name on the left, our name on the right)
|
||||
def create_rename_keys(config):
|
||||
rename_keys = []
|
||||
|
||||
# stem
|
||||
# fmt: off
|
||||
rename_keys.append(("backbone.0.body.patch_embed.proj.weight", "model.backbone.model.embeddings.patch_embeddings.projection.weight"))
|
||||
rename_keys.append(("backbone.0.body.patch_embed.proj.bias", "model.backbone.model.embeddings.patch_embeddings.projection.bias"))
|
||||
rename_keys.append(("backbone.0.body.patch_embed.norm.weight", "model.backbone.model.embeddings.norm.weight"))
|
||||
rename_keys.append(("backbone.0.body.patch_embed.norm.bias", "model.backbone.model.embeddings.norm.bias"))
|
||||
# stages
|
||||
for i in range(len(config.backbone_config.depths)):
|
||||
for j in range(config.backbone_config.depths[i]):
|
||||
rename_keys.append((f"backbone.0.body.layers.{i}.blocks.{j}.norm1.weight", f"model.backbone.model.encoder.layers.{i}.blocks.{j}.layernorm_before.weight"))
|
||||
rename_keys.append((f"backbone.0.body.layers.{i}.blocks.{j}.norm1.bias", f"model.backbone.model.encoder.layers.{i}.blocks.{j}.layernorm_before.bias"))
|
||||
rename_keys.append((f"backbone.0.body.layers.{i}.blocks.{j}.attn.relative_position_bias_table", f"model.backbone.model.encoder.layers.{i}.blocks.{j}.attention.self.relative_position_bias_table"))
|
||||
rename_keys.append((f"backbone.0.body.layers.{i}.blocks.{j}.attn.relative_position_index", f"model.backbone.model.encoder.layers.{i}.blocks.{j}.attention.self.relative_position_index"))
|
||||
rename_keys.append((f"backbone.0.body.layers.{i}.blocks.{j}.attn.proj.weight", f"model.backbone.model.encoder.layers.{i}.blocks.{j}.attention.output.dense.weight"))
|
||||
rename_keys.append((f"backbone.0.body.layers.{i}.blocks.{j}.attn.proj.bias", f"model.backbone.model.encoder.layers.{i}.blocks.{j}.attention.output.dense.bias"))
|
||||
rename_keys.append((f"backbone.0.body.layers.{i}.blocks.{j}.norm2.weight", f"model.backbone.model.encoder.layers.{i}.blocks.{j}.layernorm_after.weight"))
|
||||
rename_keys.append((f"backbone.0.body.layers.{i}.blocks.{j}.norm2.bias", f"model.backbone.model.encoder.layers.{i}.blocks.{j}.layernorm_after.bias"))
|
||||
rename_keys.append((f"backbone.0.body.layers.{i}.blocks.{j}.mlp.fc1.weight", f"model.backbone.model.encoder.layers.{i}.blocks.{j}.intermediate.dense.weight"))
|
||||
rename_keys.append((f"backbone.0.body.layers.{i}.blocks.{j}.mlp.fc1.bias", f"model.backbone.model.encoder.layers.{i}.blocks.{j}.intermediate.dense.bias"))
|
||||
rename_keys.append((f"backbone.0.body.layers.{i}.blocks.{j}.mlp.fc2.weight", f"model.backbone.model.encoder.layers.{i}.blocks.{j}.output.dense.weight"))
|
||||
rename_keys.append((f"backbone.0.body.layers.{i}.blocks.{j}.mlp.fc2.bias", f"model.backbone.model.encoder.layers.{i}.blocks.{j}.output.dense.bias"))
|
||||
|
||||
if i < 3:
|
||||
rename_keys.append((f"backbone.0.body.layers.{i}.downsample.reduction.weight", f"model.backbone.model.encoder.layers.{i}.downsample.reduction.weight"))
|
||||
rename_keys.append((f"backbone.0.body.layers.{i}.downsample.norm.weight", f"model.backbone.model.encoder.layers.{i}.downsample.norm.weight"))
|
||||
rename_keys.append((f"backbone.0.body.layers.{i}.downsample.norm.bias", f"model.backbone.model.encoder.layers.{i}.downsample.norm.bias"))
|
||||
|
||||
rename_keys.append(("backbone.0.body.norm1.weight", "model.backbone.model.hidden_states_norms.stage2.weight"))
|
||||
rename_keys.append(("backbone.0.body.norm1.bias", "model.backbone.model.hidden_states_norms.stage2.bias"))
|
||||
rename_keys.append(("backbone.0.body.norm2.weight", "model.backbone.model.hidden_states_norms.stage3.weight"))
|
||||
rename_keys.append(("backbone.0.body.norm2.bias", "model.backbone.model.hidden_states_norms.stage3.bias"))
|
||||
rename_keys.append(("backbone.0.body.norm3.weight", "model.backbone.model.hidden_states_norms.stage4.weight"))
|
||||
rename_keys.append(("backbone.0.body.norm3.bias", "model.backbone.model.hidden_states_norms.stage4.bias"))
|
||||
|
||||
# transformer encoder
|
||||
for i in range(config.encoder_layers):
|
||||
rename_keys.append((f"transformer.encoder.layers.{i}.self_attn.sampling_offsets.weight", f"model.encoder.layers.{i}.self_attn.sampling_offsets.weight"))
|
||||
rename_keys.append((f"transformer.encoder.layers.{i}.self_attn.sampling_offsets.bias", f"model.encoder.layers.{i}.self_attn.sampling_offsets.bias"))
|
||||
rename_keys.append((f"transformer.encoder.layers.{i}.self_attn.attention_weights.weight", f"model.encoder.layers.{i}.self_attn.attention_weights.weight"))
|
||||
rename_keys.append((f"transformer.encoder.layers.{i}.self_attn.attention_weights.bias", f"model.encoder.layers.{i}.self_attn.attention_weights.bias"))
|
||||
rename_keys.append((f"transformer.encoder.layers.{i}.self_attn.value_proj.weight", f"model.encoder.layers.{i}.self_attn.value_proj.weight"))
|
||||
rename_keys.append((f"transformer.encoder.layers.{i}.self_attn.value_proj.bias", f"model.encoder.layers.{i}.self_attn.value_proj.bias"))
|
||||
rename_keys.append((f"transformer.encoder.layers.{i}.self_attn.output_proj.weight", f"model.encoder.layers.{i}.self_attn.output_proj.weight"))
|
||||
rename_keys.append((f"transformer.encoder.layers.{i}.self_attn.output_proj.bias", f"model.encoder.layers.{i}.self_attn.output_proj.bias"))
|
||||
rename_keys.append((f"transformer.encoder.layers.{i}.norm1.weight", f"model.encoder.layers.{i}.self_attn_layer_norm.weight"))
|
||||
rename_keys.append((f"transformer.encoder.layers.{i}.norm1.bias", f"model.encoder.layers.{i}.self_attn_layer_norm.bias"))
|
||||
rename_keys.append((f"transformer.encoder.layers.{i}.linear1.weight", f"model.encoder.layers.{i}.fc1.weight"))
|
||||
rename_keys.append((f"transformer.encoder.layers.{i}.linear1.bias", f"model.encoder.layers.{i}.fc1.bias"))
|
||||
rename_keys.append((f"transformer.encoder.layers.{i}.linear2.weight", f"model.encoder.layers.{i}.fc2.weight"))
|
||||
rename_keys.append((f"transformer.encoder.layers.{i}.linear2.bias", f"model.encoder.layers.{i}.fc2.bias"))
|
||||
rename_keys.append((f"transformer.encoder.layers.{i}.norm2.weight", f"model.encoder.layers.{i}.final_layer_norm.weight"))
|
||||
rename_keys.append((f"transformer.encoder.layers.{i}.norm2.bias", f"model.encoder.layers.{i}.final_layer_norm.bias"))
|
||||
|
||||
# transformer decoder
|
||||
for i in range(config.decoder_layers):
|
||||
rename_keys.append((f"transformer.decoder.layers.{i}.cross_attn.sampling_offsets.weight", f"model.decoder.layers.{i}.encoder_attn.sampling_offsets.weight"))
|
||||
rename_keys.append((f"transformer.decoder.layers.{i}.cross_attn.sampling_offsets.bias", f"model.decoder.layers.{i}.encoder_attn.sampling_offsets.bias"))
|
||||
rename_keys.append((f"transformer.decoder.layers.{i}.cross_attn.attention_weights.weight", f"model.decoder.layers.{i}.encoder_attn.attention_weights.weight"))
|
||||
rename_keys.append((f"transformer.decoder.layers.{i}.cross_attn.attention_weights.bias", f"model.decoder.layers.{i}.encoder_attn.attention_weights.bias"))
|
||||
rename_keys.append((f"transformer.decoder.layers.{i}.cross_attn.value_proj.weight", f"model.decoder.layers.{i}.encoder_attn.value_proj.weight"))
|
||||
rename_keys.append((f"transformer.decoder.layers.{i}.cross_attn.value_proj.bias", f"model.decoder.layers.{i}.encoder_attn.value_proj.bias"))
|
||||
rename_keys.append((f"transformer.decoder.layers.{i}.cross_attn.output_proj.weight", f"model.decoder.layers.{i}.encoder_attn.output_proj.weight"))
|
||||
rename_keys.append((f"transformer.decoder.layers.{i}.cross_attn.output_proj.bias", f"model.decoder.layers.{i}.encoder_attn.output_proj.bias"))
|
||||
rename_keys.append((f"transformer.decoder.layers.{i}.norm1.weight", f"model.decoder.layers.{i}.encoder_attn_layer_norm.weight"))
|
||||
rename_keys.append((f"transformer.decoder.layers.{i}.norm1.bias", f"model.decoder.layers.{i}.encoder_attn_layer_norm.bias"))
|
||||
rename_keys.append((f"transformer.decoder.layers.{i}.self_attn.out_proj.weight", f"model.decoder.layers.{i}.self_attn.out_proj.weight"))
|
||||
rename_keys.append((f"transformer.decoder.layers.{i}.self_attn.out_proj.bias", f"model.decoder.layers.{i}.self_attn.out_proj.bias"))
|
||||
rename_keys.append((f"transformer.decoder.layers.{i}.norm2.weight", f"model.decoder.layers.{i}.self_attn_layer_norm.weight"))
|
||||
rename_keys.append((f"transformer.decoder.layers.{i}.norm2.bias", f"model.decoder.layers.{i}.self_attn_layer_norm.bias"))
|
||||
rename_keys.append((f"transformer.decoder.layers.{i}.linear1.weight", f"model.decoder.layers.{i}.fc1.weight"))
|
||||
rename_keys.append((f"transformer.decoder.layers.{i}.linear1.bias", f"model.decoder.layers.{i}.fc1.bias"))
|
||||
rename_keys.append((f"transformer.decoder.layers.{i}.linear2.weight", f"model.decoder.layers.{i}.fc2.weight"))
|
||||
rename_keys.append((f"transformer.decoder.layers.{i}.linear2.bias", f"model.decoder.layers.{i}.fc2.bias"))
|
||||
rename_keys.append((f"transformer.decoder.layers.{i}.norm3.weight", f"model.decoder.layers.{i}.final_layer_norm.weight"))
|
||||
rename_keys.append((f"transformer.decoder.layers.{i}.norm3.bias", f"model.decoder.layers.{i}.final_layer_norm.bias"))
|
||||
|
||||
# fmt: on
|
||||
|
||||
return rename_keys
|
||||
|
||||
|
||||
def rename_key(dct, old, new):
    val = dct.pop(old)
    dct[new] = val


# we split up the matrix of each encoder layer into queries, keys and values
def read_in_swin_q_k_v(state_dict, backbone_config):
    num_features = [int(backbone_config.embed_dim * 2**i) for i in range(len(backbone_config.depths))]
    for i in range(len(backbone_config.depths)):
        dim = num_features[i]
        for j in range(backbone_config.depths[i]):
            # fmt: off
            # read in weights + bias of input projection layer (in original implementation, this is a single matrix + bias)
            in_proj_weight = state_dict.pop(f"backbone.0.body.layers.{i}.blocks.{j}.attn.qkv.weight")
            in_proj_bias = state_dict.pop(f"backbone.0.body.layers.{i}.blocks.{j}.attn.qkv.bias")
            # next, add query, keys and values (in that order) to the state dict
            state_dict[f"model.backbone.model.encoder.layers.{i}.blocks.{j}.attention.self.query.weight"] = in_proj_weight[:dim, :]
            state_dict[f"model.backbone.model.encoder.layers.{i}.blocks.{j}.attention.self.query.bias"] = in_proj_bias[:dim]
            state_dict[f"model.backbone.model.encoder.layers.{i}.blocks.{j}.attention.self.key.weight"] = in_proj_weight[dim : dim * 2, :]
            state_dict[f"model.backbone.model.encoder.layers.{i}.blocks.{j}.attention.self.key.bias"] = in_proj_bias[dim : dim * 2]
            state_dict[f"model.backbone.model.encoder.layers.{i}.blocks.{j}.attention.self.value.weight"] = in_proj_weight[-dim:, :]
            state_dict[f"model.backbone.model.encoder.layers.{i}.blocks.{j}.attention.self.value.bias"] = in_proj_bias[-dim:]
            # fmt: on


def read_in_decoder_q_k_v(state_dict, config):
    # transformer decoder self-attention layers
    hidden_size = config.d_model
    for i in range(config.decoder_layers):
        # read in weights + bias of input projection layer of self-attention
        in_proj_weight = state_dict.pop(f"transformer.decoder.layers.{i}.self_attn.in_proj_weight")
        in_proj_bias = state_dict.pop(f"transformer.decoder.layers.{i}.self_attn.in_proj_bias")
        # next, add query, keys and values (in that order) to the state dict
        state_dict[f"model.decoder.layers.{i}.self_attn.q_proj.weight"] = in_proj_weight[:hidden_size, :]
        state_dict[f"model.decoder.layers.{i}.self_attn.q_proj.bias"] = in_proj_bias[:hidden_size]
        state_dict[f"model.decoder.layers.{i}.self_attn.k_proj.weight"] = in_proj_weight[hidden_size : hidden_size * 2, :]
        state_dict[f"model.decoder.layers.{i}.self_attn.k_proj.bias"] = in_proj_bias[hidden_size : hidden_size * 2]
        state_dict[f"model.decoder.layers.{i}.self_attn.v_proj.weight"] = in_proj_weight[-hidden_size:, :]
        state_dict[f"model.decoder.layers.{i}.self_attn.v_proj.bias"] = in_proj_bias[-hidden_size:]

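# Illustrative note (not part of the original script): for the Swin backbone configured above
# (embed_dim=192, depths=(2, 2, 18, 2)), the per-stage attention dimension doubles at every stage,
# so the fused qkv weight popped at stage i has shape (3 * num_features[i], num_features[i]).
# A minimal sketch:
#
#     embed_dim, depths = 192, (2, 2, 18, 2)
#     num_features = [int(embed_dim * 2**i) for i in range(len(depths))]
#     assert num_features == [192, 384, 768, 1536]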
# We will verify our results on an image of cute cats
|
||||
def prepare_img():
|
||||
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
|
||||
im = Image.open(requests.get(url, stream=True).raw)
|
||||
|
||||
return im
|
||||
|
||||
|
||||
@torch.no_grad()
|
||||
def convert_deta_checkpoint(model_name, pytorch_dump_folder_path, push_to_hub):
|
||||
"""
|
||||
Copy/paste/tweak model's weights to our DETA structure.
|
||||
"""
|
||||
|
||||
# load config
|
||||
config = get_deta_config(model_name)
|
||||
|
||||
# load original state dict
|
||||
if model_name == "deta-swin-large":
|
||||
checkpoint_path = hf_hub_download(repo_id="nielsr/deta-checkpoints", filename="adet_swin_ft.pth")
|
||||
elif model_name == "deta-swin-large-o365":
|
||||
checkpoint_path = hf_hub_download(repo_id="jozhang97/deta-swin-l-o365", filename="deta_swin_pt_o365.pth")
|
||||
else:
|
||||
raise ValueError(f"Model name {model_name} not supported")
|
||||
|
||||
state_dict = torch.load(checkpoint_path, map_location="cpu", weights_only=True)["model"]
|
||||
|
||||
# print the keys and parameter shapes of the original state dict (useful for debugging)
|
||||
for name, param in state_dict.items():
|
||||
print(name, param.shape)
|
||||
|
||||
# rename keys
|
||||
rename_keys = create_rename_keys(config)
|
||||
for src, dest in rename_keys:
|
||||
rename_key(state_dict, src, dest)
|
||||
read_in_swin_q_k_v(state_dict, config.backbone_config)
|
||||
read_in_decoder_q_k_v(state_dict, config)
|
||||
|
||||
# fix some prefixes
|
||||
for key in state_dict.copy():
|
||||
if "transformer.decoder.class_embed" in key or "transformer.decoder.bbox_embed" in key:
|
||||
val = state_dict.pop(key)
|
||||
state_dict[key.replace("transformer.decoder", "model.decoder")] = val
|
||||
if "input_proj" in key:
|
||||
val = state_dict.pop(key)
|
||||
state_dict["model." + key] = val
|
||||
if "level_embed" in key or "pos_trans" in key or "pix_trans" in key or "enc_output" in key:
|
||||
val = state_dict.pop(key)
|
||||
state_dict[key.replace("transformer", "model")] = val
|
||||
|
||||
# finally, create HuggingFace model and load state dict
|
||||
model = DetaForObjectDetection(config)
|
||||
model.load_state_dict(state_dict)
|
||||
model.eval()
|
||||
|
||||
device = "cuda" if torch.cuda.is_available() else "cpu"
|
||||
model.to(device)
|
||||
|
||||
# load image processor
|
||||
processor = DetaImageProcessor(format="coco_detection")
|
||||
|
||||
# verify our conversion on image
|
||||
img = prepare_img()
|
||||
encoding = processor(images=img, return_tensors="pt")
|
||||
pixel_values = encoding["pixel_values"]
|
||||
outputs = model(pixel_values.to(device))
|
||||
|
||||
# verify logits
|
||||
print("Logits:", outputs.logits[0, :3, :3])
|
||||
print("Boxes:", outputs.pred_boxes[0, :3, :3])
|
||||
if model_name == "deta-swin-large":
|
||||
expected_logits = torch.tensor(
|
||||
[[-7.6308, -2.8485, -5.3737], [-7.2037, -4.5505, -4.8027], [-7.2943, -4.2611, -4.6617]]
|
||||
)
|
||||
expected_boxes = torch.tensor([[0.4987, 0.4969, 0.9999], [0.2549, 0.5498, 0.4805], [0.5498, 0.2757, 0.0569]])
|
||||
elif model_name == "deta-swin-large-o365":
|
||||
expected_logits = torch.tensor(
|
||||
[[-8.0122, -3.5720, -4.9717], [-8.1547, -3.6886, -4.6389], [-7.6610, -3.6194, -5.0134]]
|
||||
)
|
||||
expected_boxes = torch.tensor([[0.2523, 0.5549, 0.4881], [0.7715, 0.4149, 0.4601], [0.5503, 0.2753, 0.0575]])
|
||||
assert torch.allclose(outputs.logits[0, :3, :3], expected_logits.to(device), atol=1e-4)
|
||||
assert torch.allclose(outputs.pred_boxes[0, :3, :3], expected_boxes.to(device), atol=1e-4)
|
||||
print("Everything ok!")
|
||||
|
||||
if pytorch_dump_folder_path:
|
||||
# Save model and processor
|
||||
logger.info(f"Saving PyTorch model and processor to {pytorch_dump_folder_path}...")
|
||||
Path(pytorch_dump_folder_path).mkdir(exist_ok=True)
|
||||
model.save_pretrained(pytorch_dump_folder_path)
|
||||
processor.save_pretrained(pytorch_dump_folder_path)
|
||||
|
||||
# Push to hub
|
||||
if push_to_hub:
|
||||
print("Pushing model and processor to hub...")
|
||||
model.push_to_hub(f"jozhang97/{model_name}")
|
||||
processor.push_to_hub(f"jozhang97/{model_name}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser()
|
||||
|
||||
parser.add_argument(
|
||||
"--model_name",
|
||||
type=str,
|
||||
default="deta-swin-large",
|
||||
choices=["deta-swin-large", "deta-swin-large-o365"],
|
||||
help="Name of the model you'd like to convert.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--pytorch_dump_folder_path",
|
||||
default=None,
|
||||
type=str,
|
||||
help="Path to the folder to output PyTorch model.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--push_to_hub",
|
||||
action="store_true",
|
||||
help="Whether or not to push the converted model to the Hugging Face hub.",
|
||||
)
|
||||
args = parser.parse_args()
|
||||
convert_deta_checkpoint(args.model_name, args.pytorch_dump_folder_path, args.push_to_hub)
|
||||
File diff suppressed because it is too large
File diff suppressed because it is too large
@ -1,28 +0,0 @@
|
||||
# Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
from ....utils import _LazyModule
|
||||
from ....utils.import_utils import define_import_structure
|
||||
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from .configuration_efficientformer import *
|
||||
from .image_processing_efficientformer import *
|
||||
from .modeling_efficientformer import *
|
||||
else:
|
||||
import sys
|
||||
|
||||
_file = globals()["__file__"]
|
||||
sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
|
||||
@ -1,170 +0,0 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""EfficientFormer model configuration"""
|
||||
|
||||
from ....configuration_utils import PreTrainedConfig
|
||||
from ....utils import logging
|
||||
|
||||
|
||||
logger = logging.get_logger(__name__)
|
||||
|
||||
|
||||
class EfficientFormerConfig(PreTrainedConfig):
|
||||
r"""
|
||||
This is the configuration class to store the configuration of an [`EfficientFormerModel`]. It is used to
|
||||
instantiate an EfficientFormer model according to the specified arguments, defining the model architecture.
|
||||
Instantiating a configuration with the defaults will yield a similar configuration to that of the EfficientFormer
|
||||
[snap-research/efficientformer-l1](https://huggingface.co/snap-research/efficientformer-l1) architecture.
|
||||
|
||||
Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
|
||||
documentation from [`PreTrainedConfig`] for more information.
|
||||
|
||||
Args:
|
||||
depths (`list[int]`, *optional*, defaults to `[3, 2, 6, 4]`):
|
||||
Depth of each stage.
|
||||
hidden_sizes (`list[int]`, *optional*, defaults to `[48, 96, 224, 448]`):
|
||||
Dimensionality of each stage.
|
||||
downsamples (`list[bool]`, *optional*, defaults to `[True, True, True, True]`):
|
||||
Whether or not to downsample inputs between two stages.
|
||||
dim (`int`, *optional*, defaults to 448):
|
||||
Number of channels in Meta3D layers
|
||||
key_dim (`int`, *optional*, defaults to 32):
|
||||
The size of the key in meta3D block.
|
||||
attention_ratio (`int`, *optional*, defaults to 4):
|
||||
Ratio of the dimension of the query and value to the dimension of the key in MSHA block
|
||||
resolution (`int`, *optional*, defaults to 7):
|
||||
Size of each patch
|
||||
num_hidden_layers (`int`, *optional*, defaults to 5):
|
||||
Number of hidden layers in the Transformer encoder.
|
||||
num_attention_heads (`int`, *optional*, defaults to 8):
|
||||
Number of attention heads for each attention layer in the 3D MetaBlock.
|
||||
mlp_expansion_ratio (`int`, *optional*, defaults to 4):
|
||||
Ratio of size of the hidden dimensionality of an MLP to the dimensionality of its input.
|
||||
hidden_dropout_prob (`float`, *optional*, defaults to 0.0):
|
||||
The dropout probability for all fully connected layers in the embeddings and encoder.
|
||||
patch_size (`int`, *optional*, defaults to 16):
|
||||
The size (resolution) of each patch.
|
||||
num_channels (`int`, *optional*, defaults to 3):
|
||||
The number of input channels.
|
||||
pool_size (`int`, *optional*, defaults to 3):
|
||||
Kernel size of pooling layers.
|
||||
downsample_patch_size (`int`, *optional*, defaults to 3):
|
||||
The size of patches in downsampling layers.
|
||||
downsample_stride (`int`, *optional*, defaults to 2):
|
||||
The stride of convolution kernels in downsampling layers.
|
||||
downsample_pad (`int`, *optional*, defaults to 1):
|
||||
Padding in downsampling layers.
|
||||
drop_path_rate (`float`, *optional*, defaults to 0.0):
|
||||
Rate at which to increase dropout probability in DropPath.
|
||||
num_meta3d_blocks (`int`, *optional*, defaults to 1):
|
||||
The number of 3D MetaBlocks in the last stage.
|
||||
distillation (`bool`, *optional*, defaults to `True`):
|
||||
Whether to add a distillation head.
|
||||
use_layer_scale (`bool`, *optional*, defaults to `True`):
|
||||
Whether to scale outputs from token mixers.
|
||||
layer_scale_init_value (`float`, *optional*, defaults to 1e-5):
|
||||
Factor by which outputs from token mixers are scaled.
|
||||
hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
|
||||
The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
|
||||
`"relu"`, `"selu"` and `"gelu_new"` are supported.
|
||||
initializer_range (`float`, *optional*, defaults to 0.02):
|
||||
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
|
||||
layer_norm_eps (`float`, *optional*, defaults to 1e-12):
|
||||
The epsilon used by the layer normalization layers.
|
||||
image_size (`int`, *optional*, defaults to `224`):
|
||||
The size (resolution) of each image.
|
||||
|
||||
Example:
|
||||
|
||||
```python
|
||||
>>> from transformers import EfficientFormerConfig, EfficientFormerModel
|
||||
|
||||
>>> # Initializing an EfficientFormer efficientformer-l1 style configuration
|
||||
>>> configuration = EfficientFormerConfig()
|
||||
|
||||
>>> # Initializing an EfficientFormerModel (with random weights) from the efficientformer-l1 style configuration
|
||||
>>> model = EfficientFormerModel(configuration)
|
||||
|
||||
>>> # Accessing the model configuration
|
||||
>>> configuration = model.config
|
||||
```"""
|
||||
|
||||
model_type = "efficientformer"
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
depths: list[int] = [3, 2, 6, 4],
|
||||
hidden_sizes: list[int] = [48, 96, 224, 448],
|
||||
downsamples: list[bool] = [True, True, True, True],
|
||||
dim: int = 448,
|
||||
key_dim: int = 32,
|
||||
attention_ratio: int = 4,
|
||||
resolution: int = 7,
|
||||
num_hidden_layers: int = 5,
|
||||
num_attention_heads: int = 8,
|
||||
mlp_expansion_ratio: int = 4,
|
||||
hidden_dropout_prob: float = 0.0,
|
||||
patch_size: int = 16,
|
||||
num_channels: int = 3,
|
||||
pool_size: int = 3,
|
||||
downsample_patch_size: int = 3,
|
||||
downsample_stride: int = 2,
|
||||
downsample_pad: int = 1,
|
||||
drop_path_rate: float = 0.0,
|
||||
num_meta3d_blocks: int = 1,
|
||||
distillation: bool = True,
|
||||
use_layer_scale: bool = True,
|
||||
layer_scale_init_value: float = 1e-5,
|
||||
hidden_act: str = "gelu",
|
||||
initializer_range: float = 0.02,
|
||||
layer_norm_eps: float = 1e-12,
|
||||
image_size: int = 224,
|
||||
batch_norm_eps: float = 1e-05,
|
||||
**kwargs,
|
||||
) -> None:
|
||||
super().__init__(**kwargs)
|
||||
|
||||
self.hidden_act = hidden_act
|
||||
self.hidden_dropout_prob = hidden_dropout_prob
|
||||
self.hidden_sizes = hidden_sizes
|
||||
self.num_hidden_layers = num_hidden_layers
|
||||
self.num_attention_heads = num_attention_heads
|
||||
self.initializer_range = initializer_range
|
||||
self.layer_norm_eps = layer_norm_eps
|
||||
self.patch_size = patch_size
|
||||
self.num_channels = num_channels
|
||||
self.depths = depths
|
||||
self.mlp_expansion_ratio = mlp_expansion_ratio
|
||||
self.downsamples = downsamples
|
||||
self.dim = dim
|
||||
self.key_dim = key_dim
|
||||
self.attention_ratio = attention_ratio
|
||||
self.resolution = resolution
|
||||
self.pool_size = pool_size
|
||||
self.downsample_patch_size = downsample_patch_size
|
||||
self.downsample_stride = downsample_stride
|
||||
self.downsample_pad = downsample_pad
|
||||
self.drop_path_rate = drop_path_rate
|
||||
self.num_meta3d_blocks = num_meta3d_blocks
|
||||
self.distillation = distillation
|
||||
self.use_layer_scale = use_layer_scale
|
||||
self.layer_scale_init_value = layer_scale_init_value
|
||||
self.image_size = image_size
|
||||
self.batch_norm_eps = batch_norm_eps
|
||||
|
||||
|
||||
__all__ = [
|
||||
"EfficientFormerConfig",
|
||||
]
|
||||
@ -1,252 +0,0 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2022 The HuggingFace Inc. team.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
"""Convert EfficientFormer checkpoints from the original repository.
|
||||
|
||||
URL: https://github.com/snap-research/EfficientFormer
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import re
|
||||
from pathlib import Path
|
||||
|
||||
import requests
|
||||
import torch
|
||||
from PIL import Image
|
||||
from torchvision.transforms import CenterCrop, Compose, Normalize, Resize, ToTensor
|
||||
|
||||
from transformers import (
|
||||
EfficientFormerConfig,
|
||||
EfficientFormerForImageClassificationWithTeacher,
|
||||
EfficientFormerImageProcessor,
|
||||
)
|
||||
from transformers.image_utils import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD, PILImageResampling
|
||||
|
||||
|
||||
def rename_key(old_name, num_meta4D_last_stage):
|
||||
new_name = old_name
|
||||
|
||||
if "patch_embed" in old_name:
|
||||
_, layer, param = old_name.split(".")
|
||||
|
||||
if layer == "0":
|
||||
new_name = old_name.replace("0", "convolution1")
|
||||
elif layer == "1":
|
||||
new_name = old_name.replace("1", "batchnorm_before")
|
||||
elif layer == "3":
|
||||
new_name = old_name.replace("3", "convolution2")
|
||||
else:
|
||||
new_name = old_name.replace("4", "batchnorm_after")
|
||||
|
||||
if "network" in old_name and re.search(r"\d\.\d", old_name):
|
||||
two_digit_num = r"\b\d{2}\b"
|
||||
if bool(re.search(two_digit_num, old_name)):
|
||||
match = re.search(r"\d\.\d\d.", old_name).group()
|
||||
else:
|
||||
match = re.search(r"\d\.\d.", old_name).group()
|
||||
if int(match[0]) < 6:
|
||||
trimmed_name = old_name.replace(match, "")
|
||||
trimmed_name = trimmed_name.replace("network", match[0] + ".meta4D_layers.blocks." + match[2:-1])
|
||||
new_name = "intermediate_stages." + trimmed_name
|
||||
else:
|
||||
trimmed_name = old_name.replace(match, "")
|
||||
if int(match[2]) < num_meta4D_last_stage:
|
||||
trimmed_name = trimmed_name.replace("network", "meta4D_layers.blocks." + match[2])
|
||||
else:
|
||||
layer_index = str(int(match[2]) - num_meta4D_last_stage)
|
||||
trimmed_name = trimmed_name.replace("network", "meta3D_layers.blocks." + layer_index)
|
||||
if "norm1" in old_name:
|
||||
trimmed_name = trimmed_name.replace("norm1", "layernorm1")
|
||||
elif "norm2" in old_name:
|
||||
trimmed_name = trimmed_name.replace("norm2", "layernorm2")
|
||||
elif "fc1" in old_name:
|
||||
trimmed_name = trimmed_name.replace("fc1", "linear_in")
|
||||
elif "fc2" in old_name:
|
||||
trimmed_name = trimmed_name.replace("fc2", "linear_out")
|
||||
|
||||
new_name = "last_stage." + trimmed_name
|
||||
|
||||
elif "network" in old_name and re.search(r".\d.", old_name):
|
||||
new_name = old_name.replace("network", "intermediate_stages")
|
||||
|
||||
if "fc" in new_name:
|
||||
new_name = new_name.replace("fc", "convolution")
|
||||
elif ("norm1" in new_name) and ("layernorm1" not in new_name):
|
||||
new_name = new_name.replace("norm1", "batchnorm_before")
|
||||
elif ("norm2" in new_name) and ("layernorm2" not in new_name):
|
||||
new_name = new_name.replace("norm2", "batchnorm_after")
|
||||
if "proj" in new_name:
|
||||
new_name = new_name.replace("proj", "projection")
|
||||
if "dist_head" in new_name:
|
||||
new_name = new_name.replace("dist_head", "distillation_classifier")
|
||||
elif "head" in new_name:
|
||||
new_name = new_name.replace("head", "classifier")
|
||||
elif "patch_embed" in new_name:
|
||||
new_name = "efficientformer." + new_name
|
||||
elif new_name == "norm.weight" or new_name == "norm.bias":
|
||||
new_name = new_name.replace("norm", "layernorm")
|
||||
new_name = "efficientformer." + new_name
|
||||
else:
|
||||
new_name = "efficientformer.encoder." + new_name
|
||||
|
||||
return new_name
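# Illustrative example (not part of the original script) of how a stem key is remapped; the second
# argument is irrelevant for patch-embedding keys:
#
#     rename_key("patch_embed.0.weight", num_meta4D_last_stage=4)
#     # -> "efficientformer.patch_embed.convolution1.weight"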
|
||||
|
||||
|
||||
def convert_torch_checkpoint(checkpoint, num_meta4D_last_stage):
    for key in checkpoint.copy():
        val = checkpoint.pop(key)
        checkpoint[rename_key(key, num_meta4D_last_stage)] = val

    return checkpoint


# We will verify our results on a COCO image
def prepare_img():
    url = "http://images.cocodataset.org/val2017/000000039769.jpg"
    image = Image.open(requests.get(url, stream=True).raw)

    return image

def convert_efficientformer_checkpoint(
|
||||
checkpoint_path: Path, efficientformer_config_file: Path, pytorch_dump_path: Path, push_to_hub: bool
|
||||
):
|
||||
orig_state_dict = torch.load(checkpoint_path, map_location="cpu", weights_only=True)["model"]
|
||||
config = EfficientFormerConfig.from_json_file(efficientformer_config_file)
|
||||
model = EfficientFormerForImageClassificationWithTeacher(config)
|
||||
model_name = "_".join(checkpoint_path.split("/")[-1].split(".")[0].split("_")[:-1])
|
||||
|
||||
num_meta4D_last_stage = config.depths[-1] - config.num_meta3d_blocks + 1
|
||||
new_state_dict = convert_torch_checkpoint(orig_state_dict, num_meta4D_last_stage)
|
||||
|
||||
model.load_state_dict(new_state_dict)
|
||||
model.eval()
|
||||
|
||||
pillow_resamplings = {
|
||||
"bilinear": PILImageResampling.BILINEAR,
|
||||
"bicubic": PILImageResampling.BICUBIC,
|
||||
"nearest": PILImageResampling.NEAREST,
|
||||
}
|
||||
|
||||
# prepare image
|
||||
image = prepare_img()
|
||||
image_size = 256
|
||||
crop_size = 224
|
||||
processor = EfficientFormerImageProcessor(
|
||||
size={"shortest_edge": image_size},
|
||||
crop_size={"height": crop_size, "width": crop_size},
|
||||
resample=pillow_resamplings["bicubic"],
|
||||
)
|
||||
pixel_values = processor(images=image, return_tensors="pt").pixel_values
|
||||
|
||||
# original processing pipeline
|
||||
image_transforms = Compose(
|
||||
[
|
||||
Resize(image_size, interpolation=pillow_resamplings["bicubic"]),
|
||||
CenterCrop(crop_size),
|
||||
ToTensor(),
|
||||
Normalize(IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD),
|
||||
]
|
||||
)
|
||||
original_pixel_values = image_transforms(image).unsqueeze(0)
|
||||
|
||||
assert torch.allclose(original_pixel_values, pixel_values)
|
||||
|
||||
outputs = model(pixel_values)
|
||||
logits = outputs.logits
|
||||
|
||||
expected_shape = (1, 1000)
|
||||
|
||||
if "l1" in model_name:
|
||||
expected_logits = torch.Tensor(
|
||||
[-0.1312, 0.4353, -1.0499, -0.5124, 0.4183, -0.6793, -1.3777, -0.0893, -0.7358, -2.4328]
|
||||
)
|
||||
assert torch.allclose(logits[0, :10], expected_logits, atol=1e-3)
|
||||
assert logits.shape == expected_shape
|
||||
elif "l3" in model_name:
|
||||
expected_logits = torch.Tensor(
|
||||
[-1.3150, -1.5456, -1.2556, -0.8496, -0.7127, -0.7897, -0.9728, -0.3052, 0.3751, -0.3127]
|
||||
)
|
||||
assert torch.allclose(logits[0, :10], expected_logits, atol=1e-3)
|
||||
assert logits.shape == expected_shape
|
||||
elif "l7" in model_name:
|
||||
expected_logits = torch.Tensor(
|
||||
[-1.0283, -1.4131, -0.5644, -1.3115, -0.5785, -1.2049, -0.7528, 0.1992, -0.3822, -0.0878]
|
||||
)
|
||||
assert logits.shape == expected_shape
|
||||
else:
|
||||
raise ValueError(
|
||||
f"Unknown model checkpoint: {checkpoint_path}. Supported version of efficientformer are l1, l3 and l7"
|
||||
)
|
||||
|
||||
# Save Checkpoints
|
||||
Path(pytorch_dump_path).mkdir(exist_ok=True)
|
||||
model.save_pretrained(pytorch_dump_path)
|
||||
print(f"Checkpoint successfully converted. Model saved at {pytorch_dump_path}")
|
||||
processor.save_pretrained(pytorch_dump_path)
|
||||
print(f"Processor successfully saved at {pytorch_dump_path}")
|
||||
|
||||
if push_to_hub:
|
||||
print("Pushing model to the hub...")
|
||||
|
||||
model.push_to_hub(
|
||||
repo_id=f"Bearnardd/{pytorch_dump_path}",
|
||||
commit_message="Add model",
|
||||
use_temp_dir=True,
|
||||
)
|
||||
processor.push_to_hub(
|
||||
repo_id=f"Bearnardd/{pytorch_dump_path}",
|
||||
commit_message="Add image processor",
|
||||
use_temp_dir=True,
|
||||
)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser()
|
||||
# Required parameters
|
||||
parser.add_argument(
|
||||
"--pytorch_model_path",
|
||||
default=None,
|
||||
type=str,
|
||||
required=True,
|
||||
help="Path to EfficientFormer pytorch checkpoint.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--config_file",
|
||||
default=None,
|
||||
type=str,
|
||||
required=True,
|
||||
help="The json file for EfficientFormer model config.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model."
|
||||
)
|
||||
|
||||
parser.add_argument("--push_to_hub", action="store_true", help="Push model and image processor to the hub")
|
||||
parser.add_argument(
|
||||
"--no-push_to_hub",
|
||||
dest="push_to_hub",
|
||||
action="store_false",
|
||||
help="Do not push model and image processor to the hub",
|
||||
)
|
||||
parser.set_defaults(push_to_hub=True)
|
||||
|
||||
args = parser.parse_args()
|
||||
convert_efficientformer_checkpoint(
|
||||
checkpoint_path=args.pytorch_model_path,
|
||||
efficientformer_config_file=args.config_file,
|
||||
pytorch_dump_path=args.pytorch_dump_path,
|
||||
push_to_hub=args.push_to_hub,
|
||||
)
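# Example invocation (illustrative only; the script filename and the checkpoint, config and output
# paths are placeholders, not taken from the original repository):
#
#     python convert_efficientformer_original_pytorch_checkpoint_to_pytorch.py \
#         --pytorch_model_path ./efficientformer_l1_300d.pth \
#         --config_file ./efficientformer_l1_config.json \
#         --pytorch_dump_path ./efficientformer-l1-300 \
#         --no-push_to_hub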
|
||||
@ -1,319 +0,0 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""Image processor class for EfficientFormer."""
|
||||
|
||||
from typing import Optional, Union
|
||||
|
||||
import numpy as np
|
||||
|
||||
from ....image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict
|
||||
from ....image_transforms import (
|
||||
get_resize_output_image_size,
|
||||
resize,
|
||||
to_channel_dimension_format,
|
||||
)
|
||||
from ....image_utils import (
|
||||
IMAGENET_DEFAULT_MEAN,
|
||||
IMAGENET_DEFAULT_STD,
|
||||
ChannelDimension,
|
||||
ImageInput,
|
||||
PILImageResampling,
|
||||
infer_channel_dimension_format,
|
||||
is_batched,
|
||||
is_scaled_image,
|
||||
to_numpy_array,
|
||||
valid_images,
|
||||
validate_kwargs,
|
||||
validate_preprocess_arguments,
|
||||
)
|
||||
from ....utils import TensorType, logging
|
||||
|
||||
|
||||
logger = logging.get_logger(__name__)
|
||||
|
||||
|
||||
class EfficientFormerImageProcessor(BaseImageProcessor):
|
||||
r"""
|
||||
Constructs an EfficientFormer image processor.
|
||||
|
||||
Args:
|
||||
do_resize (`bool`, *optional*, defaults to `True`):
|
||||
Whether to resize the image's (height, width) dimensions to the specified `(size["height"],
|
||||
size["width"])`. Can be overridden by the `do_resize` parameter in the `preprocess` method.
|
||||
size (`dict`, *optional*, defaults to `{"height": 224, "width": 224}`):
|
||||
Size of the output image after resizing. Can be overridden by the `size` parameter in the `preprocess`
|
||||
method.
|
||||
resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`):
|
||||
Resampling filter to use if resizing the image. Can be overridden by the `resample` parameter in the
|
||||
`preprocess` method.
|
||||
do_center_crop (`bool`, *optional*, defaults to `True`):
|
||||
Whether to center crop the image to the specified `crop_size`. Can be overridden by `do_center_crop` in the
|
||||
`preprocess` method.
|
||||
crop_size (`dict[str, int]`, *optional*, defaults to `{"height": 224, "width": 224}`):
|
||||
Size of the output image after applying `center_crop`. Can be overridden by `crop_size` in the `preprocess`
|
||||
method.
|
||||
do_rescale (`bool`, *optional*, defaults to `True`):
|
||||
Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the `do_rescale`
|
||||
parameter in the `preprocess` method.
|
||||
rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
|
||||
Scale factor to use if rescaling the image. Can be overridden by the `rescale_factor` parameter in the
|
||||
`preprocess` method.
|
||||
do_normalize (`bool`, *optional*, defaults to `True`):
|
||||
Whether to normalize the image. Can be overridden by the `do_normalize` parameter in the `preprocess`
|
||||
method.
|
||||
image_mean (`float` or `list[float]`, *optional*, defaults to `IMAGENET_DEFAULT_MEAN`):
|
||||
Mean to use if normalizing the image. This is a float or list of floats the length of the number of
|
||||
channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method.
|
||||
image_std (`float` or `list[float]`, *optional*, defaults to `IMAGENET_DEFAULT_STD`):
|
||||
Standard deviation to use if normalizing the image. This is a float or list of floats the length of the
|
||||
number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method.
|
||||
"""
|
||||
|
||||
model_input_names = ["pixel_values"]
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
do_resize: bool = True,
|
||||
size: Optional[dict[str, int]] = None,
|
||||
resample: PILImageResampling = PILImageResampling.BICUBIC,
|
||||
do_center_crop: bool = True,
|
||||
do_rescale: bool = True,
|
||||
rescale_factor: Union[int, float] = 1 / 255,
|
||||
crop_size: Optional[dict[str, int]] = None,
|
||||
do_normalize: bool = True,
|
||||
image_mean: Optional[Union[float, list[float]]] = None,
|
||||
image_std: Optional[Union[float, list[float]]] = None,
|
||||
**kwargs,
|
||||
) -> None:
|
||||
super().__init__(**kwargs)
|
||||
size = size if size is not None else {"height": 224, "width": 224}
|
||||
size = get_size_dict(size)
|
||||
crop_size = crop_size if crop_size is not None else {"height": 224, "width": 224}
|
||||
crop_size = get_size_dict(crop_size, default_to_square=True, param_name="crop_size")
|
||||
|
||||
self.do_resize = do_resize
|
||||
self.do_rescale = do_rescale
|
||||
self.do_normalize = do_normalize
|
||||
self.do_center_crop = do_center_crop
|
||||
self.crop_size = crop_size
|
||||
self.size = size
|
||||
self.resample = resample
|
||||
self.rescale_factor = rescale_factor
|
||||
self.image_mean = image_mean if image_mean is not None else IMAGENET_DEFAULT_MEAN
|
||||
self.image_std = image_std if image_std is not None else IMAGENET_DEFAULT_STD
|
||||
self._valid_processor_keys = [
|
||||
"images",
|
||||
"do_resize",
|
||||
"size",
|
||||
"resample",
|
||||
"do_center_crop",
|
||||
"crop_size",
|
||||
"do_rescale",
|
||||
"rescale_factor",
|
||||
"do_normalize",
|
||||
"image_mean",
|
||||
"image_std",
|
||||
"return_tensors",
|
||||
"data_format",
|
||||
"input_data_format",
|
||||
]
|
||||
|
||||
def resize(
|
||||
self,
|
||||
image: np.ndarray,
|
||||
size: dict[str, int],
|
||||
resample: PILImageResampling = PILImageResampling.BILINEAR,
|
||||
data_format: Optional[Union[str, ChannelDimension]] = None,
|
||||
input_data_format: Optional[Union[str, ChannelDimension]] = None,
|
||||
**kwargs,
|
||||
) -> np.ndarray:
|
||||
"""
|
||||
Resize an image to `(size["height"], size["width"])`.
|
||||
|
||||
Args:
|
||||
image (`np.ndarray`):
|
||||
Image to resize.
|
||||
size (`dict[str, int]`):
|
||||
Dictionary in the format `{"height": int, "width": int}` specifying the size of the output image.
|
||||
resample:
|
||||
`PILImageResampling` filter to use when resizing the image e.g. `PILImageResampling.BILINEAR`.
|
||||
data_format (`ChannelDimension` or `str`, *optional*):
|
||||
The channel dimension format for the output image. If unset, the channel dimension format of the input
|
||||
image is used. Can be one of:
|
||||
- `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
|
||||
- `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
|
||||
input_data_format (`ChannelDimension` or `str`, *optional*):
|
||||
The channel dimension format of the input image. If not provided, it will be inferred.
|
||||
|
||||
Returns:
|
||||
`np.ndarray`: The resized image.
|
||||
"""
|
||||
size = get_size_dict(size)
|
||||
|
||||
if "shortest_edge" in size:
|
||||
size = get_resize_output_image_size(
|
||||
image, size=size["shortest_edge"], default_to_square=False, input_data_format=input_data_format
|
||||
)
|
||||
# size = get_resize_output_image_size(image, size["shortest_edge"], size["longest_edge"])
|
||||
elif "height" in size and "width" in size:
|
||||
size = (size["height"], size["width"])
|
||||
else:
|
||||
raise ValueError(f"Size must contain 'height' and 'width' keys or 'shortest_edge' key. Got {size.keys()}")
|
||||
return resize(
|
||||
image, size=size, resample=resample, data_format=data_format, input_data_format=input_data_format, **kwargs
|
||||
)
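# (illustrative note, not in the original code: when `size` contains "shortest_edge", the image is resized
# so that its shorter side matches that value and the longer side is scaled to preserve the aspect ratio;
# with an explicit {"height": ..., "width": ...} dict, the image is resized exactly to that resolution)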
|
||||
|
||||
def preprocess(
|
||||
self,
|
||||
images: ImageInput,
|
||||
do_resize: Optional[bool] = None,
|
||||
size: Optional[dict[str, int]] = None,
|
||||
resample: Optional[PILImageResampling] = None,
|
||||
do_center_crop: Optional[bool] = None,
|
||||
crop_size: Optional[int] = None,
|
||||
do_rescale: Optional[bool] = None,
|
||||
rescale_factor: Optional[float] = None,
|
||||
do_normalize: Optional[bool] = None,
|
||||
image_mean: Optional[Union[float, list[float]]] = None,
|
||||
image_std: Optional[Union[float, list[float]]] = None,
|
||||
return_tensors: Optional[Union[str, TensorType]] = None,
|
||||
data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST,
|
||||
input_data_format: Optional[Union[str, ChannelDimension]] = None,
|
||||
**kwargs,
|
||||
) -> BatchFeature:
|
||||
"""
|
||||
Preprocess an image or batch of images.
|
||||
|
||||
Args:
|
||||
images (`ImageInput`):
|
||||
Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If
|
||||
passing in images with pixel values between 0 and 1, set `do_rescale=False`.
|
||||
do_resize (`bool`, *optional*, defaults to `self.do_resize`):
|
||||
Whether to resize the image.
|
||||
size (`dict[str, int]`, *optional*, defaults to `self.size`):
|
||||
Dictionary in the format `{"height": h, "width": w}` specifying the size of the output image after
|
||||
resizing.
|
||||
resample (`PILImageResampling` filter, *optional*, defaults to `self.resample`):
|
||||
`PILImageResampling` filter to use if resizing the image e.g. `PILImageResampling.BILINEAR`. Only has
|
||||
an effect if `do_resize` is set to `True`.
|
||||
do_center_crop (`bool`, *optional*, defaults to `self.do_center_crop`):
|
||||
Whether to center crop the image.
|
||||
do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
|
||||
Whether to rescale the image values between [0 - 1].
|
||||
rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
|
||||
Rescale factor to rescale the image by if `do_rescale` is set to `True`.
|
||||
crop_size (`dict[str, int]`, *optional*, defaults to `self.crop_size`):
|
||||
Size of the center crop. Only has an effect if `do_center_crop` is set to `True`.
|
||||
do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
|
||||
Whether to normalize the image.
|
||||
image_mean (`float` or `list[float]`, *optional*, defaults to `self.image_mean`):
|
||||
Image mean to use if `do_normalize` is set to `True`.
|
||||
image_std (`float` or `list[float]`, *optional*, defaults to `self.image_std`):
|
||||
Image standard deviation to use if `do_normalize` is set to `True`.
|
||||
return_tensors (`str` or `TensorType`, *optional*):
|
||||
The type of tensors to return. Can be one of:
|
||||
- Unset: Return a list of `np.ndarray`.
|
||||
- `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
|
||||
- `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
|
||||
data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
|
||||
The channel dimension format for the output image. Can be one of:
|
||||
- `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
|
||||
- `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
|
||||
- Unset: Use the channel dimension format of the input image.
|
||||
input_data_format (`ChannelDimension` or `str`, *optional*):
|
||||
The channel dimension format for the input image. If unset, the channel dimension format is inferred
|
||||
from the input image. Can be one of:
|
||||
- `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
|
||||
- `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
|
||||
- `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
|
||||
"""
|
||||
do_resize = do_resize if do_resize is not None else self.do_resize
|
||||
do_rescale = do_rescale if do_rescale is not None else self.do_rescale
|
||||
do_normalize = do_normalize if do_normalize is not None else self.do_normalize
|
||||
do_center_crop = do_center_crop if do_center_crop is not None else self.do_center_crop
|
||||
crop_size = crop_size if crop_size is not None else self.crop_size
|
||||
crop_size = get_size_dict(crop_size, param_name="crop_size", default_to_square=True)
|
||||
resample = resample if resample is not None else self.resample
|
||||
rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
|
||||
image_mean = image_mean if image_mean is not None else self.image_mean
|
||||
image_std = image_std if image_std is not None else self.image_std
|
||||
|
||||
size = size if size is not None else self.size
|
||||
size_dict = get_size_dict(size)
|
||||
|
||||
validate_kwargs(captured_kwargs=kwargs.keys(), valid_processor_keys=self._valid_processor_keys)
|
||||
|
||||
if not is_batched(images):
|
||||
images = [images]
|
||||
|
||||
if not valid_images(images):
|
||||
raise ValueError("Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, or torch.Tensor")
|
||||
validate_preprocess_arguments(
|
||||
do_rescale=do_rescale,
|
||||
rescale_factor=rescale_factor,
|
||||
do_normalize=do_normalize,
|
||||
image_mean=image_mean,
|
||||
image_std=image_std,
|
||||
do_center_crop=do_center_crop,
|
||||
crop_size=crop_size,
|
||||
do_resize=do_resize,
|
||||
size=size,
|
||||
resample=resample,
|
||||
)
|
||||
# All transformations expect numpy arrays.
|
||||
images = [to_numpy_array(image) for image in images]
|
||||
|
||||
if do_rescale and is_scaled_image(images[0]):
|
||||
logger.warning_once(
|
||||
"It looks like you are trying to rescale already rescaled images. If the input"
|
||||
" images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
|
||||
)
|
||||
|
||||
if input_data_format is None:
|
||||
# We assume that all images have the same channel dimension format.
|
||||
input_data_format = infer_channel_dimension_format(images[0])
|
||||
|
||||
if do_resize:
|
||||
images = [
|
||||
self.resize(image=image, size=size_dict, resample=resample, input_data_format=input_data_format)
|
||||
for image in images
|
||||
]
|
||||
|
||||
if do_center_crop:
|
||||
images = [
|
||||
self.center_crop(image=image, size=crop_size, input_data_format=input_data_format) for image in images
|
||||
]
|
||||
|
||||
if do_rescale:
|
||||
images = [
|
||||
self.rescale(image=image, scale=rescale_factor, input_data_format=input_data_format)
|
||||
for image in images
|
||||
]
|
||||
|
||||
if do_normalize:
|
||||
images = [
|
||||
self.normalize(image=image, mean=image_mean, std=image_std, input_data_format=input_data_format)
|
||||
for image in images
|
||||
]
|
||||
|
||||
images = [
|
||||
to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) for image in images
|
||||
]
|
||||
|
||||
data = {"pixel_values": images}
|
||||
return BatchFeature(data=data, tensor_type=return_tensors)
|
||||
|
||||
|
||||
__all__ = ["EfficientFormerImageProcessor"]
|
||||
@ -1,771 +0,0 @@
|
||||
# coding=utf-8
# Copyright 2022 Snapchat Research and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""PyTorch EfficientFormer model."""

import itertools
from dataclasses import dataclass
from typing import Optional, Union

import torch
from torch import nn

from ....activations import ACT2FN
from ....modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling, ImageClassifierOutput
from ....modeling_utils import PreTrainedModel
from ....utils import (
    ModelOutput,
    add_code_sample_docstrings,
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    logging,
)
from .configuration_efficientformer import EfficientFormerConfig


logger = logging.get_logger(__name__)

# General docstring
_CONFIG_FOR_DOC = "EfficientFormerConfig"

# Base docstring
_CHECKPOINT_FOR_DOC = "snap-research/efficientformer-l1-300"
_EXPECTED_OUTPUT_SHAPE = [1, 49, 448]

# Image classification docstring
_IMAGE_CLASS_CHECKPOINT = "snap-research/efficientformer-l1-300"
_IMAGE_CLASS_EXPECTED_OUTPUT = "Egyptian cat"


class EfficientFormerPatchEmbeddings(nn.Module):
    """
    This class performs downsampling between two stages. For the input tensor with the shape [batch_size, num_channels,
    height, width] it produces output tensor with the shape [batch_size, num_channels, height/stride, width/stride]
    """

    def __init__(self, config: EfficientFormerConfig, num_channels: int, embed_dim: int, apply_norm: bool = True):
        super().__init__()
        self.num_channels = num_channels

        self.projection = nn.Conv2d(
            num_channels,
            embed_dim,
            kernel_size=config.downsample_patch_size,
            stride=config.downsample_stride,
            padding=config.downsample_pad,
        )
        self.norm = nn.BatchNorm2d(embed_dim, eps=config.batch_norm_eps) if apply_norm else nn.Identity()

    def forward(self, pixel_values: torch.Tensor) -> torch.Tensor:
        batch_size, num_channels, height, width = pixel_values.shape
        if num_channels != self.num_channels:
            raise ValueError(
                "Make sure that the channel dimension of the pixel values match with the one set in the configuration."
            )

        embeddings = self.projection(pixel_values)
        embeddings = self.norm(embeddings)

        return embeddings


class EfficientFormerSelfAttention(nn.Module):
    def __init__(self, dim: int, key_dim: int, num_heads: int, attention_ratio: int, resolution: int):
        super().__init__()

        self.num_heads = num_heads
        self.key_dim = key_dim
        self.attention_ratio = attention_ratio
        self.scale = key_dim**-0.5
        self.total_key_dim = key_dim * num_heads
        self.expanded_key_dim = int(attention_ratio * key_dim)
        self.total_expanded_key_dim = int(self.expanded_key_dim * num_heads)
        hidden_size = self.total_expanded_key_dim + self.total_key_dim * 2
        self.qkv = nn.Linear(dim, hidden_size)
        self.projection = nn.Linear(self.total_expanded_key_dim, dim)
        points = list(itertools.product(range(resolution), range(resolution)))
        num_points = len(points)
        attention_offsets = {}
        idxs = []
        for point_1 in points:
            for point_2 in points:
                offset = (abs(point_1[0] - point_2[0]), abs(point_1[1] - point_2[1]))
                if offset not in attention_offsets:
                    attention_offsets[offset] = len(attention_offsets)
                idxs.append(attention_offsets[offset])
        self.attention_biases = torch.nn.Parameter(torch.zeros(num_heads, len(attention_offsets)))
        self.register_buffer("attention_bias_idxs", torch.LongTensor(idxs).view(num_points, num_points))

    @torch.no_grad()
    def train(self, mode=True):
        super().train(mode)
        if mode and hasattr(self, "ab"):
            del self.ab
        else:
            self.ab = self.attention_biases[:, self.attention_bias_idxs]

    def forward(self, hidden_states: torch.Tensor, output_attentions: bool = False) -> tuple[torch.Tensor]:
        batch_size, sequence_length, num_channels = hidden_states.shape
        qkv = self.qkv(hidden_states)
        query_layer, key_layer, value_layer = qkv.reshape(batch_size, sequence_length, self.num_heads, -1).split(
            [self.key_dim, self.key_dim, self.expanded_key_dim], dim=3
        )
        query_layer = query_layer.permute(0, 2, 1, 3)
        key_layer = key_layer.permute(0, 2, 1, 3)
        value_layer = value_layer.permute(0, 2, 1, 3)

        # set `model.to(torch_device)` won't change `self.ab.device`, if there is no follow-up `train` or `eval` call.
        # Let's do it manually here, so users won't have to do this everytime.
        if not self.training:
            self.ab = self.ab.to(self.attention_biases.device)
        attention_probs = (torch.matmul(query_layer, key_layer.transpose(-2, -1))) * self.scale + (
            self.attention_biases[:, self.attention_bias_idxs] if self.training else self.ab
        )

        attention_probs = attention_probs.softmax(dim=-1)

        context_layer = torch.matmul(attention_probs, value_layer).transpose(1, 2)
        context_layer = context_layer.reshape(batch_size, sequence_length, self.total_expanded_key_dim)
        context_layer = self.projection(context_layer)

        outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)

        return outputs


class EfficientFormerConvStem(nn.Module):
|
||||
def __init__(self, config: EfficientFormerConfig, out_channels: int):
|
||||
super().__init__()
|
||||
|
||||
self.convolution1 = nn.Conv2d(config.num_channels, out_channels // 2, kernel_size=3, stride=2, padding=1)
|
||||
self.batchnorm_before = nn.BatchNorm2d(out_channels // 2, eps=config.batch_norm_eps)
|
||||
|
||||
self.convolution2 = nn.Conv2d(out_channels // 2, out_channels, kernel_size=3, stride=2, padding=1)
|
||||
self.batchnorm_after = nn.BatchNorm2d(out_channels, eps=config.batch_norm_eps)
|
||||
|
||||
self.activation = nn.ReLU()
|
||||
|
||||
def forward(self, pixel_values: torch.Tensor) -> torch.Tensor:
|
||||
features = self.batchnorm_before(self.convolution1(pixel_values))
|
||||
features = self.activation(features)
|
||||
features = self.batchnorm_after(self.convolution2(features))
|
||||
features = self.activation(features)
|
||||
|
||||
return features
|
||||
|
||||
|
||||
class EfficientFormerPooling(nn.Module):
|
||||
def __init__(self, pool_size: int):
|
||||
super().__init__()
|
||||
self.pool = nn.AvgPool2d(pool_size, stride=1, padding=pool_size // 2, count_include_pad=False)
|
||||
|
||||
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
|
||||
output = self.pool(hidden_states) - hidden_states
|
||||
return output
|
||||
|
||||
|
||||
class EfficientFormerDenseMlp(nn.Module):
|
||||
def __init__(
|
||||
self,
|
||||
config: EfficientFormerConfig,
|
||||
in_features: int,
|
||||
hidden_features: Optional[int] = None,
|
||||
out_features: Optional[int] = None,
|
||||
):
|
||||
super().__init__()
|
||||
out_features = out_features or in_features
|
||||
hidden_features = hidden_features or in_features
|
||||
|
||||
self.linear_in = nn.Linear(in_features, hidden_features)
|
||||
self.activation = ACT2FN[config.hidden_act]
|
||||
self.dropout = nn.Dropout(config.hidden_dropout_prob)
|
||||
self.linear_out = nn.Linear(hidden_features, out_features)
|
||||
|
||||
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
|
||||
hidden_states = self.linear_in(hidden_states)
|
||||
hidden_states = self.activation(hidden_states)
|
||||
hidden_states = self.dropout(hidden_states)
|
||||
hidden_states = self.linear_out(hidden_states)
|
||||
hidden_states = self.dropout(hidden_states)
|
||||
|
||||
return hidden_states
|
||||
|
||||
|
||||
class EfficientFormerConvMlp(nn.Module):
|
||||
def __init__(
|
||||
self,
|
||||
config: EfficientFormerConfig,
|
||||
in_features: int,
|
||||
hidden_features: Optional[int] = None,
|
||||
out_features: Optional[int] = None,
|
||||
drop: float = 0.0,
|
||||
):
|
||||
super().__init__()
|
||||
out_features = out_features or in_features
|
||||
hidden_features = hidden_features or in_features
|
||||
|
||||
self.convolution1 = nn.Conv2d(in_features, hidden_features, 1)
|
||||
self.activation = ACT2FN[config.hidden_act]
|
||||
self.convolution2 = nn.Conv2d(hidden_features, out_features, 1)
|
||||
self.dropout = nn.Dropout(drop)
|
||||
|
||||
self.batchnorm_before = nn.BatchNorm2d(hidden_features, eps=config.batch_norm_eps)
|
||||
self.batchnorm_after = nn.BatchNorm2d(out_features, eps=config.batch_norm_eps)
|
||||
|
||||
def forward(self, hidden_state: torch.Tensor) -> torch.Tensor:
|
||||
hidden_state = self.convolution1(hidden_state)
|
||||
hidden_state = self.batchnorm_before(hidden_state)
|
||||
|
||||
hidden_state = self.activation(hidden_state)
|
||||
hidden_state = self.dropout(hidden_state)
|
||||
hidden_state = self.convolution2(hidden_state)
|
||||
|
||||
hidden_state = self.batchnorm_after(hidden_state)
|
||||
hidden_state = self.dropout(hidden_state)
|
||||
|
||||
return hidden_state
|
||||
|
||||
|
||||
def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor:
|
||||
"""
|
||||
Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
|
||||
|
||||
"""
|
||||
if drop_prob == 0.0 or not training:
|
||||
return input
|
||||
keep_prob = 1 - drop_prob
|
||||
shape = (input.shape[0],) + (1,) * (input.ndim - 1) # work with diff dim tensors, not just 2D ConvNets
|
||||
random_tensor = keep_prob + torch.rand(shape, dtype=input.dtype, device=input.device)
|
||||
random_tensor.floor_() # binarize
|
||||
output = input.div(keep_prob) * random_tensor
|
||||
return output
|
||||
|
||||
|
||||
class EfficientFormerDropPath(nn.Module):
|
||||
"""Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""
|
||||
|
||||
def __init__(self, drop_prob: Optional[float] = None) -> None:
|
||||
super().__init__()
|
||||
self.drop_prob = drop_prob
|
||||
|
||||
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
|
||||
return drop_path(hidden_states, self.drop_prob, self.training)
|
||||
|
||||
def extra_repr(self) -> str:
|
||||
return f"p={self.drop_prob}"
|
||||
|
||||
|
||||
class EfficientFormerFlat(nn.Module):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
|
||||
def forward(self, hidden_states: torch.Tensor) -> tuple[torch.Tensor]:
|
||||
hidden_states = hidden_states.flatten(2).transpose(1, 2)
|
||||
return hidden_states
|
||||
|
||||
|
||||
class EfficientFormerMeta3D(nn.Module):
|
||||
def __init__(self, config: EfficientFormerConfig, dim: int, drop_path: float = 0.0):
|
||||
super().__init__()
|
||||
|
||||
self.token_mixer = EfficientFormerSelfAttention(
|
||||
dim=config.dim,
|
||||
key_dim=config.key_dim,
|
||||
num_heads=config.num_attention_heads,
|
||||
attention_ratio=config.attention_ratio,
|
||||
resolution=config.resolution,
|
||||
)
|
||||
|
||||
self.layernorm1 = nn.LayerNorm(dim, eps=config.layer_norm_eps)
|
||||
self.layernorm2 = nn.LayerNorm(dim, eps=config.layer_norm_eps)
|
||||
|
||||
mlp_hidden_dim = int(dim * config.mlp_expansion_ratio)
|
||||
self.mlp = EfficientFormerDenseMlp(config, in_features=dim, hidden_features=mlp_hidden_dim)
|
||||
|
||||
self.drop_path = EfficientFormerDropPath(drop_path) if drop_path > 0.0 else nn.Identity()
|
||||
self.use_layer_scale = config.use_layer_scale
|
||||
if config.use_layer_scale:
|
||||
self.layer_scale_1 = nn.Parameter(config.layer_scale_init_value * torch.ones(dim), requires_grad=True)
|
||||
self.layer_scale_2 = nn.Parameter(config.layer_scale_init_value * torch.ones(dim), requires_grad=True)
|
||||
|
||||
def forward(self, hidden_states: torch.Tensor, output_attentions: bool = False) -> tuple[torch.Tensor]:
|
||||
self_attention_outputs = self.token_mixer(self.layernorm1(hidden_states), output_attentions)
|
||||
attention_output = self_attention_outputs[0]
|
||||
outputs = self_attention_outputs[1:] # add self attentions if we output attention weights
|
||||
|
||||
if self.use_layer_scale:
|
||||
layer_output = hidden_states + self.drop_path(
|
||||
self.layer_scale_1.unsqueeze(0).unsqueeze(0) * attention_output
|
||||
)
|
||||
layer_output = layer_output + self.drop_path(
|
||||
self.layer_scale_2.unsqueeze(0).unsqueeze(0) * self.mlp(self.layernorm2(layer_output))
|
||||
)
|
||||
else:
|
||||
layer_output = hidden_states + self.drop_path(attention_output)
|
||||
layer_output = layer_output + self.drop_path(self.mlp(self.layernorm2(layer_output)))
|
||||
|
||||
outputs = (layer_output,) + outputs
|
||||
|
||||
return outputs
|
||||
|
||||
|
||||
class EfficientFormerMeta3DLayers(nn.Module):
|
||||
def __init__(self, config: EfficientFormerConfig):
|
||||
super().__init__()
|
||||
drop_paths = [
|
||||
config.drop_path_rate * (block_idx + sum(config.depths[:-1]))
|
||||
for block_idx in range(config.num_meta3d_blocks)
|
||||
]
|
||||
self.blocks = nn.ModuleList(
|
||||
[EfficientFormerMeta3D(config, config.hidden_sizes[-1], drop_path=drop_path) for drop_path in drop_paths]
|
||||
)
|
||||
|
||||
def forward(self, hidden_states: torch.Tensor, output_attentions: bool = False) -> tuple[torch.Tensor]:
|
||||
all_attention_outputs = () if output_attentions else None
|
||||
|
||||
for layer_module in self.blocks:
|
||||
if isinstance(hidden_states, tuple):
|
||||
hidden_states = hidden_states[0]
|
||||
|
||||
hidden_states = layer_module(hidden_states, output_attentions)
|
||||
|
||||
if output_attentions:
|
||||
all_attention_outputs = all_attention_outputs + (hidden_states[1],)
|
||||
|
||||
if output_attentions:
|
||||
outputs = (hidden_states[0],) + all_attention_outputs
|
||||
return outputs
|
||||
|
||||
return hidden_states
|
||||
|
||||
|
||||
class EfficientFormerMeta4D(nn.Module):
|
||||
def __init__(self, config: EfficientFormerConfig, dim: int, drop_path: float = 0.0):
|
||||
super().__init__()
|
||||
pool_size = config.pool_size if config.pool_size is not None else 3
|
||||
self.token_mixer = EfficientFormerPooling(pool_size=pool_size)
|
||||
mlp_hidden_dim = int(dim * config.mlp_expansion_ratio)
|
||||
self.mlp = EfficientFormerConvMlp(
|
||||
config, in_features=dim, hidden_features=mlp_hidden_dim, drop=config.hidden_dropout_prob
|
||||
)
|
||||
|
||||
self.drop_path = EfficientFormerDropPath(drop_path) if drop_path > 0.0 else nn.Identity()
|
||||
self.use_layer_scale = config.use_layer_scale
|
||||
if config.use_layer_scale:
|
||||
self.layer_scale_1 = nn.Parameter(config.layer_scale_init_value * torch.ones(dim), requires_grad=True)
|
||||
self.layer_scale_2 = nn.Parameter(config.layer_scale_init_value * torch.ones(dim), requires_grad=True)
|
||||
|
||||
def forward(self, hidden_states: torch.Tensor) -> tuple[torch.Tensor]:
|
||||
outputs = self.token_mixer(hidden_states)
|
||||
|
||||
if self.use_layer_scale:
|
||||
layer_output = hidden_states + self.drop_path(self.layer_scale_1.unsqueeze(-1).unsqueeze(-1) * outputs)
|
||||
|
||||
layer_output = layer_output + self.drop_path(
|
||||
self.layer_scale_2.unsqueeze(-1).unsqueeze(-1) * self.mlp(layer_output)
|
||||
)
|
||||
else:
|
||||
layer_output = hidden_states + self.drop_path(outputs)
|
||||
layer_output = layer_output + self.drop_path(self.mlp(layer_output))
|
||||
|
||||
return layer_output
|
||||
|
||||
|
||||
class EfficientFormerMeta4DLayers(nn.Module):
|
||||
def __init__(self, config: EfficientFormerConfig, stage_idx: int):
|
||||
super().__init__()
|
||||
num_layers = (
|
||||
config.depths[stage_idx] if stage_idx != -1 else config.depths[stage_idx] - config.num_meta3d_blocks
|
||||
)
|
||||
drop_paths = [
|
||||
config.drop_path_rate * (block_idx + sum(config.depths[:stage_idx])) for block_idx in range(num_layers)
|
||||
]
|
||||
|
||||
self.blocks = nn.ModuleList(
|
||||
[
|
||||
EfficientFormerMeta4D(config, config.hidden_sizes[stage_idx], drop_path=drop_path)
|
||||
for drop_path in drop_paths
|
||||
]
|
||||
)
|
||||
|
||||
def forward(self, hidden_states: torch.Tensor) -> tuple[torch.Tensor]:
|
||||
for layer_module in self.blocks:
|
||||
hidden_states = layer_module(hidden_states)
|
||||
return hidden_states
|
||||
|
||||
|
||||
class EfficientFormerIntermediateStage(nn.Module):
|
||||
def __init__(self, config: EfficientFormerConfig, index: int):
|
||||
super().__init__()
|
||||
self.meta4D_layers = EfficientFormerMeta4DLayers(config, index)
|
||||
|
||||
def forward(self, hidden_states: torch.Tensor) -> tuple[torch.Tensor]:
|
||||
hidden_states = self.meta4D_layers(hidden_states)
|
||||
return hidden_states
|
||||
|
||||
|
||||
class EfficientFormerLastStage(nn.Module):
|
||||
def __init__(self, config: EfficientFormerConfig):
|
||||
super().__init__()
|
||||
self.meta4D_layers = EfficientFormerMeta4DLayers(config, -1)
|
||||
self.flat = EfficientFormerFlat()
|
||||
self.meta3D_layers = EfficientFormerMeta3DLayers(config)
|
||||
|
||||
def forward(self, hidden_states: torch.Tensor, output_attentions: bool = False) -> tuple[torch.Tensor]:
|
||||
hidden_states = self.meta4D_layers(hidden_states)
|
||||
hidden_states = self.flat(hidden_states)
|
||||
hidden_states = self.meta3D_layers(hidden_states, output_attentions)
|
||||
|
||||
return hidden_states
|
||||
|
||||
|
||||
class EfficientFormerEncoder(nn.Module):
|
||||
def __init__(self, config: EfficientFormerConfig):
|
||||
super().__init__()
|
||||
self.config = config
|
||||
num_intermediate_stages = len(config.depths) - 1
|
||||
downsamples = [
|
||||
config.downsamples[i] or config.hidden_sizes[i] != config.hidden_sizes[i + 1]
|
||||
for i in range(num_intermediate_stages)
|
||||
]
|
||||
intermediate_stages = []
|
||||
|
||||
for i in range(num_intermediate_stages):
|
||||
intermediate_stages.append(EfficientFormerIntermediateStage(config, i))
|
||||
if downsamples[i]:
|
||||
intermediate_stages.append(
|
||||
EfficientFormerPatchEmbeddings(config, config.hidden_sizes[i], config.hidden_sizes[i + 1])
|
||||
)
|
||||
|
||||
self.intermediate_stages = nn.ModuleList(intermediate_stages)
|
||||
self.last_stage = EfficientFormerLastStage(config)
|
||||
|
||||
def forward(
|
||||
self,
|
||||
hidden_states: torch.Tensor,
|
||||
output_hidden_states: bool = False,
|
||||
output_attentions: bool = False,
|
||||
return_dict: bool = True,
|
||||
) -> BaseModelOutput:
|
||||
all_hidden_states = () if output_hidden_states else None
|
||||
all_self_attentions = () if output_attentions else None
|
||||
|
||||
if output_hidden_states:
|
||||
all_hidden_states = all_hidden_states + (hidden_states,)
|
||||
|
||||
for layer_module in self.intermediate_stages:
|
||||
hidden_states = layer_module(hidden_states)
|
||||
if output_hidden_states:
|
||||
all_hidden_states = all_hidden_states + (hidden_states,)
|
||||
|
||||
layer_output = self.last_stage(hidden_states, output_attentions=output_attentions)
|
||||
|
||||
if output_attentions:
|
||||
all_self_attentions = all_self_attentions + layer_output[1:]
|
||||
|
||||
if output_hidden_states:
|
||||
all_hidden_states = all_hidden_states + (layer_output[0],)
|
||||
|
||||
if not return_dict:
|
||||
return tuple(v for v in [layer_output[0], all_hidden_states, all_self_attentions] if v is not None)
|
||||
|
||||
return BaseModelOutput(
|
||||
last_hidden_state=layer_output[0],
|
||||
hidden_states=all_hidden_states,
|
||||
attentions=all_self_attentions,
|
||||
)
|
||||
|
||||
|
||||
class EfficientFormerPreTrainedModel(PreTrainedModel):
|
||||
"""
|
||||
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
|
||||
models.
|
||||
"""
|
||||
|
||||
config: EfficientFormerConfig
|
||||
base_model_prefix = "efficientformer"
|
||||
main_input_name = "pixel_values"
|
||||
supports_gradient_checkpointing = False
|
||||
|
||||
|
||||
EFFICIENTFORMER_START_DOCSTRING = r"""
|
||||
This model is a PyTorch [nn.Module](https://pytorch.org/docs/stable/nn.html#nn.Module) subclass. Use it as a
|
||||
regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and behavior.
|
||||
|
||||
Parameters:
|
||||
config ([`EfficientFormerConfig`]): Model configuration class with all the parameters of the model.
|
||||
Initializing with a config file does not load the weights associated with the model, only the
|
||||
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
|
||||
"""
|
||||
|
||||
EFFICIENTFORMER_INPUTS_DOCSTRING = r"""
|
||||
Args:
|
||||
pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
|
||||
Pixel values. Pixel values can be obtained using [`ViTImageProcessor`]. See
|
||||
[`ViTImageProcessor.preprocess`] for details.
|
||||
output_attentions (`bool`, *optional*):
|
||||
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
|
||||
tensors for more detail.
|
||||
output_hidden_states (`bool`, *optional*):
|
||||
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
|
||||
more detail.
|
||||
return_dict (`bool`, *optional*):
|
||||
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
|
||||
"""
|
||||
|
||||
|
||||
@add_start_docstrings(
|
||||
"The bare EfficientFormer Model transformer outputting raw hidden-states without any specific head on top.",
|
||||
EFFICIENTFORMER_START_DOCSTRING,
|
||||
)
|
||||
class EfficientFormerModel(EfficientFormerPreTrainedModel):
|
||||
def __init__(self, config: EfficientFormerConfig):
|
||||
super().__init__(config)
|
||||
self.config = config
|
||||
_no_split_modules = ["EfficientFormerMeta4D"]
|
||||
|
||||
self.patch_embed = EfficientFormerConvStem(config, config.hidden_sizes[0])
|
||||
self.encoder = EfficientFormerEncoder(config)
|
||||
self.layernorm = nn.LayerNorm(config.hidden_sizes[-1], eps=config.layer_norm_eps)
|
||||
|
||||
# Initialize weights and apply final processing
|
||||
self.post_init()
|
||||
|
||||
@add_start_docstrings_to_model_forward(EFFICIENTFORMER_INPUTS_DOCSTRING)
|
||||
@add_code_sample_docstrings(
|
||||
checkpoint=_CHECKPOINT_FOR_DOC,
|
||||
output_type=BaseModelOutputWithPooling,
|
||||
config_class=_CONFIG_FOR_DOC,
|
||||
modality="vision",
|
||||
expected_output=_EXPECTED_OUTPUT_SHAPE,
|
||||
)
|
||||
def forward(
|
||||
self,
|
||||
pixel_values: Optional[torch.Tensor] = None,
|
||||
output_attentions: Optional[bool] = None,
|
||||
output_hidden_states: Optional[bool] = None,
|
||||
return_dict: Optional[bool] = None,
|
||||
) -> Union[tuple, BaseModelOutput]:
|
||||
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
|
||||
output_hidden_states = (
|
||||
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
|
||||
)
|
||||
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
||||
|
||||
if pixel_values is None:
|
||||
raise ValueError("You have to specify pixel_values")
|
||||
|
||||
embedding_output = self.patch_embed(pixel_values)
|
||||
encoder_outputs = self.encoder(
|
||||
embedding_output, output_attentions=output_attentions, output_hidden_states=output_hidden_states
|
||||
)
|
||||
|
||||
sequence_output = encoder_outputs[0]
|
||||
sequence_output = self.layernorm(sequence_output)
|
||||
|
||||
if not return_dict:
|
||||
head_outputs = (sequence_output,)
|
||||
return head_outputs + encoder_outputs[1:]
|
||||
|
||||
return BaseModelOutput(
|
||||
last_hidden_state=sequence_output,
|
||||
hidden_states=encoder_outputs.hidden_states,
|
||||
attentions=encoder_outputs.attentions,
|
||||
)
|
||||
|
||||
|
||||
@add_start_docstrings(
|
||||
"""
|
||||
EfficientFormer Model transformer with an image classification head on top (a linear layer on top of the final
|
||||
hidden state of the [CLS] token) e.g. for ImageNet.
|
||||
""",
|
||||
EFFICIENTFORMER_START_DOCSTRING,
|
||||
)
|
||||
class EfficientFormerForImageClassification(EfficientFormerPreTrainedModel):
|
||||
def __init__(self, config: EfficientFormerConfig):
|
||||
super().__init__(config)
|
||||
|
||||
self.num_labels = config.num_labels
|
||||
self.efficientformer = EfficientFormerModel(config)
|
||||
|
||||
# Classifier head
|
||||
self.classifier = (
|
||||
nn.Linear(config.hidden_sizes[-1], config.num_labels) if config.num_labels > 0 else nn.Identity()
|
||||
)
|
||||
|
||||
# Initialize weights and apply final processing
|
||||
self.post_init()
|
||||
|
||||
@add_start_docstrings_to_model_forward(EFFICIENTFORMER_INPUTS_DOCSTRING)
|
||||
@add_code_sample_docstrings(
|
||||
checkpoint=_IMAGE_CLASS_CHECKPOINT,
|
||||
output_type=ImageClassifierOutput,
|
||||
config_class=_CONFIG_FOR_DOC,
|
||||
expected_output=_IMAGE_CLASS_EXPECTED_OUTPUT,
|
||||
)
|
||||
def forward(
|
||||
self,
|
||||
pixel_values: Optional[torch.Tensor] = None,
|
||||
labels: Optional[torch.Tensor] = None,
|
||||
output_attentions: Optional[bool] = None,
|
||||
output_hidden_states: Optional[bool] = None,
|
||||
return_dict: Optional[bool] = None,
|
||||
) -> Union[tuple, ImageClassifierOutput]:
|
||||
r"""
|
||||
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
|
||||
Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
|
||||
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
|
||||
`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
|
||||
"""
|
||||
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
||||
|
||||
outputs = self.efficientformer(
|
||||
pixel_values,
|
||||
output_attentions=output_attentions,
|
||||
output_hidden_states=output_hidden_states,
|
||||
return_dict=return_dict,
|
||||
)
|
||||
|
||||
sequence_output = outputs[0]
|
||||
|
||||
logits = self.classifier(sequence_output.mean(-2))
|
||||
|
||||
loss = None
|
||||
if labels is not None:
|
||||
loss = self.loss_function(labels, logits, self.config)
|
||||
|
||||
if not return_dict:
|
||||
output = (logits,) + outputs[1:]
|
||||
return ((loss,) + output) if loss is not None else output
|
||||
|
||||
return ImageClassifierOutput(
|
||||
loss=loss,
|
||||
logits=logits,
|
||||
hidden_states=outputs.hidden_states,
|
||||
attentions=outputs.attentions,
|
||||
)
|
||||
|
||||
|
||||
@dataclass
|
||||
class EfficientFormerForImageClassificationWithTeacherOutput(ModelOutput):
|
||||
"""
|
||||
Output type of [`EfficientFormerForImageClassificationWithTeacher`].
|
||||
|
||||
Args:
|
||||
logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
|
||||
Prediction scores as the average of the cls_logits and distillation logits.
|
||||
cls_logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
|
||||
Prediction scores of the classification head (i.e. the linear layer on top of the final hidden state of the
|
||||
class token).
|
||||
distillation_logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
|
||||
Prediction scores of the distillation head (i.e. the linear layer on top of the final hidden state of the
|
||||
distillation token).
|
||||
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
|
||||
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
|
||||
shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer
|
||||
plus the initial embedding outputs.
|
||||
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
|
||||
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
|
||||
sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in
|
||||
the self-attention heads.
|
||||
"""
|
||||
|
||||
logits: Optional[torch.FloatTensor] = None
|
||||
cls_logits: Optional[torch.FloatTensor] = None
|
||||
distillation_logits: Optional[torch.FloatTensor] = None
|
||||
hidden_states: Optional[tuple[torch.FloatTensor]] = None
|
||||
attentions: Optional[tuple[torch.FloatTensor]] = None
|
||||
|
||||
|
||||
@add_start_docstrings(
|
||||
"""
|
||||
EfficientFormer Model transformer with image classification heads on top (a linear layer on top of the final hidden
|
||||
state of the [CLS] token and a linear layer on top of the final hidden state of the distillation token) e.g. for
|
||||
ImageNet.
|
||||
|
||||
<Tip warning={true}>
|
||||
|
||||
This model supports inference-only. Fine-tuning with distillation (i.e. with a teacher) is not yet
|
||||
supported.
|
||||
|
||||
</Tip>
|
||||
""",
|
||||
EFFICIENTFORMER_START_DOCSTRING,
|
||||
)
|
||||
class EfficientFormerForImageClassificationWithTeacher(EfficientFormerPreTrainedModel):
|
||||
def __init__(self, config: EfficientFormerConfig):
|
||||
super().__init__(config)
|
||||
|
||||
self.num_labels = config.num_labels
|
||||
self.efficientformer = EfficientFormerModel(config)
|
||||
|
||||
# Classifier head
|
||||
self.classifier = nn.Linear(config.hidden_size, config.num_labels) if config.num_labels > 0 else nn.Identity()
|
||||
# Distillation head
|
||||
self.distillation_classifier = (
|
||||
nn.Linear(config.hidden_size, config.num_labels) if config.num_labels > 0 else nn.Identity()
|
||||
)
|
||||
|
||||
# Initialize weights and apply final processing
|
||||
self.post_init()
|
||||
|
||||
@add_start_docstrings_to_model_forward(EFFICIENTFORMER_INPUTS_DOCSTRING)
|
||||
@add_code_sample_docstrings(
|
||||
checkpoint=_IMAGE_CLASS_CHECKPOINT,
|
||||
output_type=EfficientFormerForImageClassificationWithTeacherOutput,
|
||||
config_class=_CONFIG_FOR_DOC,
|
||||
expected_output=_IMAGE_CLASS_EXPECTED_OUTPUT,
|
||||
)
|
||||
def forward(
|
||||
self,
|
||||
pixel_values: Optional[torch.Tensor] = None,
|
||||
output_attentions: Optional[bool] = None,
|
||||
output_hidden_states: Optional[bool] = None,
|
||||
return_dict: Optional[bool] = None,
|
||||
) -> Union[tuple, EfficientFormerForImageClassificationWithTeacherOutput]:
|
||||
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
||||
outputs = self.efficientformer(
|
||||
pixel_values,
|
||||
output_attentions=output_attentions,
|
||||
output_hidden_states=output_hidden_states,
|
||||
return_dict=return_dict,
|
||||
)
|
||||
|
||||
sequence_output = outputs[0]
|
||||
|
||||
cls_logits = self.classifier(sequence_output.mean(-2))
|
||||
distillation_logits = self.distillation_classifier(sequence_output.mean(-2))
|
||||
|
||||
# during inference, return the average of both classifier predictions
|
||||
logits = (cls_logits + distillation_logits) / 2
|
||||
|
||||
if not return_dict:
|
||||
output = (logits, cls_logits, distillation_logits) + outputs[1:]
|
||||
return output
|
||||
|
||||
return EfficientFormerForImageClassificationWithTeacherOutput(
|
||||
logits=logits,
|
||||
cls_logits=cls_logits,
|
||||
distillation_logits=distillation_logits,
|
||||
hidden_states=outputs.hidden_states,
|
||||
attentions=outputs.attentions,
|
||||
)
|
||||
|
||||
|
||||
__all__ = [
|
||||
"EfficientFormerForImageClassification",
|
||||
"EfficientFormerForImageClassificationWithTeacher",
|
||||
"EfficientFormerModel",
|
||||
"EfficientFormerPreTrainedModel",
|
||||
]
@@ -1,28 +0,0 @@
# Copyright 2023 The HuggingFace and Baidu Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import TYPE_CHECKING

from ....utils import _LazyModule
from ....utils.import_utils import define_import_structure


if TYPE_CHECKING:
    from .configuration_ernie_m import *
    from .modeling_ernie_m import *
    from .tokenization_ernie_m import *
else:
    import sys

    _file = globals()["__file__"]
    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
@@ -1,112 +0,0 @@
# coding=utf-8
# Copyright 2023 Xuan Ouyang, Shuohuan Wang, Chao Pang, Yu Sun, Hao Tian, Hua Wu, Haifeng Wang and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""ErnieM model configuration"""
# Adapted from original paddlenlp repository.(https://github.com/PaddlePaddle/PaddleNLP/blob/develop/paddlenlp/transformers/ernie_m/configuration.py)

from __future__ import annotations

from ....configuration_utils import PreTrainedConfig


class ErnieMConfig(PreTrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`ErnieMModel`]. It is used to instantiate a
    Ernie-M model according to the specified arguments, defining the model architecture. Instantiating a configuration
    with the defaults will yield a similar configuration to that of the `Ernie-M`
    [susnato/ernie-m-base_pytorch](https://huggingface.co/susnato/ernie-m-base_pytorch) architecture.


    Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PreTrainedConfig`] for more information.

    Args:
        vocab_size (`int`, *optional*, defaults to 250002):
            Vocabulary size of `inputs_ids` in [`ErnieMModel`]. Also is the vocab size of token embedding matrix.
            Defines the number of different tokens that can be represented by the `inputs_ids` passed when calling
            [`ErnieMModel`].
        hidden_size (`int`, *optional*, defaults to 768):
            Dimensionality of the embedding layer, encoder layers and pooler layer.
        num_hidden_layers (`int`, *optional*, defaults to 12):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 12):
            Number of attention heads for each attention layer in the Transformer encoder.
        intermediate_size (`int`, *optional*, defaults to 3072):
            Dimensionality of the feed-forward (ff) layer in the encoder. Input tensors to feed-forward layers are
            firstly projected from hidden_size to intermediate_size, and then projected back to hidden_size. Typically
            intermediate_size is larger than hidden_size.
        hidden_act (`str`, *optional*, defaults to `"gelu"`):
            The non-linear activation function in the feed-forward layer. `"gelu"`, `"relu"` and any other torch
            supported activation functions are supported.
        hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
            The dropout probability for all fully connected layers in the embeddings and encoder.
        attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
            The dropout probability used in `MultiHeadAttention` in all encoder layers to drop some attention target.
        max_position_embeddings (`int`, *optional*, defaults to 514):
            The maximum value of the dimensionality of position encoding, which dictates the maximum supported length
            of an input sequence.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the normal initializer for initializing all weight matrices. The index of padding
            token in the token vocabulary.
        pad_token_id (`int`, *optional*, defaults to 1):
            Padding token id.
        layer_norm_eps (`float`, *optional*, defaults to 1e-05):
            The epsilon used by the layer normalization layers.
        classifier_dropout (`float`, *optional*):
            The dropout ratio for the classification head.
        act_dropout (`float`, *optional*, defaults to 0.0):
            This dropout probability is used in `ErnieMEncoderLayer` after activation.

    A normal_initializer initializes weight matrices as normal distributions. See
    `ErnieMPretrainedModel._init_weights()` for how weights are initialized in `ErnieMModel`.
    """

    model_type = "ernie_m"
    attribute_map: dict[str, str] = {"dropout": "classifier_dropout", "num_classes": "num_labels"}

    def __init__(
        self,
        vocab_size: int = 250002,
        hidden_size: int = 768,
        num_hidden_layers: int = 12,
        num_attention_heads: int = 12,
        intermediate_size: int = 3072,
        hidden_act: str = "gelu",
        hidden_dropout_prob: float = 0.1,
        attention_probs_dropout_prob: float = 0.1,
        max_position_embeddings: int = 514,
        initializer_range: float = 0.02,
        pad_token_id: int = 1,
        layer_norm_eps: float = 1e-05,
        classifier_dropout=None,
        act_dropout=0.0,
        **kwargs,
    ):
        super().__init__(pad_token_id=pad_token_id, **kwargs)
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.intermediate_size = intermediate_size
        self.hidden_act = hidden_act
        self.hidden_dropout_prob = hidden_dropout_prob
        self.attention_probs_dropout_prob = attention_probs_dropout_prob
        self.max_position_embeddings = max_position_embeddings
        self.initializer_range = initializer_range
        self.layer_norm_eps = layer_norm_eps
        self.classifier_dropout = classifier_dropout
        self.act_dropout = act_dropout


__all__ = ["ErnieMConfig"]
@@ -1,987 +0,0 @@
# coding=utf-8
|
||||
# Copyright 2023 Xuan Ouyang, Shuohuan Wang, Chao Pang, Yu Sun, Hao Tian, Hua Wu, Haifeng Wang The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""PyTorch ErnieM model."""
|
||||
|
||||
import math
|
||||
from typing import Optional, Union
|
||||
|
||||
import torch
|
||||
from torch import nn, tensor
|
||||
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
|
||||
|
||||
from ....activations import ACT2FN
|
||||
from ....cache_utils import Cache
|
||||
from ....modeling_outputs import (
|
||||
BaseModelOutputWithPastAndCrossAttentions,
|
||||
BaseModelOutputWithPoolingAndCrossAttentions,
|
||||
MultipleChoiceModelOutput,
|
||||
QuestionAnsweringModelOutput,
|
||||
SequenceClassifierOutput,
|
||||
TokenClassifierOutput,
|
||||
)
|
||||
from ....modeling_utils import PreTrainedModel
|
||||
from ....utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, logging
|
||||
from .configuration_ernie_m import ErnieMConfig
|
||||
|
||||
|
||||
logger = logging.get_logger(__name__)
|
||||
|
||||
_CHECKPOINT_FOR_DOC = "susnato/ernie-m-base_pytorch"
|
||||
_CONFIG_FOR_DOC = "ErnieMConfig"
|
||||
_TOKENIZER_FOR_DOC = "ErnieMTokenizer"
|
||||
|
||||
|
||||
# Adapted from paddlenlp.transformers.ernie_m.modeling.ErnieEmbeddings
|
||||
class ErnieMEmbeddings(nn.Module):
|
||||
"""Construct the embeddings from word and position embeddings."""
|
||||
|
||||
def __init__(self, config):
|
||||
super().__init__()
|
||||
self.hidden_size = config.hidden_size
|
||||
self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
|
||||
self.position_embeddings = nn.Embedding(
|
||||
config.max_position_embeddings, config.hidden_size, padding_idx=config.pad_token_id
|
||||
)
|
||||
self.layer_norm = nn.LayerNorm(normalized_shape=config.hidden_size, eps=config.layer_norm_eps)
|
||||
self.dropout = nn.Dropout(p=config.hidden_dropout_prob)
|
||||
self.padding_idx = config.pad_token_id
|
||||
|
||||
def forward(
|
||||
self,
|
||||
input_ids: Optional[torch.LongTensor] = None,
|
||||
position_ids: Optional[torch.LongTensor] = None,
|
||||
inputs_embeds: Optional[torch.LongTensor] = None,
|
||||
past_key_values_length: int = 0,
|
||||
) -> torch.Tensor:
|
||||
if inputs_embeds is None:
|
||||
inputs_embeds = self.word_embeddings(input_ids)
|
||||
if position_ids is None:
|
||||
input_shape = inputs_embeds.size()[:-1]
|
||||
ones = torch.ones(input_shape, dtype=torch.int64, device=inputs_embeds.device)
|
||||
seq_length = torch.cumsum(ones, dim=1)
|
||||
position_ids = seq_length - ones
|
||||
|
||||
if past_key_values_length > 0:
|
||||
position_ids = position_ids + past_key_values_length
|
||||
# to mimic paddlenlp implementation
|
||||
position_ids += 2
|
||||
position_embeddings = self.position_embeddings(position_ids)
|
||||
embeddings = inputs_embeds + position_embeddings
|
||||
embeddings = self.layer_norm(embeddings)
|
||||
embeddings = self.dropout(embeddings)
|
||||
|
||||
return embeddings
|
||||
|
||||
|
||||
class ErnieMSelfAttention(nn.Module):
|
||||
def __init__(self, config, position_embedding_type=None):
|
||||
super().__init__()
|
||||
if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
|
||||
raise ValueError(
|
||||
f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
|
||||
f"heads ({config.num_attention_heads})"
|
||||
)
|
||||
|
||||
self.num_attention_heads = config.num_attention_heads
|
||||
self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
|
||||
self.all_head_size = self.num_attention_heads * self.attention_head_size
|
||||
|
||||
self.q_proj = nn.Linear(config.hidden_size, self.all_head_size)
|
||||
self.k_proj = nn.Linear(config.hidden_size, self.all_head_size)
|
||||
self.v_proj = nn.Linear(config.hidden_size, self.all_head_size)
|
||||
|
||||
self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
|
||||
self.position_embedding_type = position_embedding_type or getattr(
|
||||
config, "position_embedding_type", "absolute"
|
||||
)
|
||||
if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
|
||||
self.max_position_embeddings = config.max_position_embeddings
|
||||
self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size)
|
||||
|
||||
self.is_decoder = config.is_decoder
|
||||
|
||||
def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor:
|
||||
new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
|
||||
x = x.view(new_x_shape)
|
||||
return x.permute(0, 2, 1, 3)
|
||||
|
||||
def forward(
|
||||
self,
|
||||
hidden_states: torch.Tensor,
|
||||
attention_mask: Optional[torch.FloatTensor] = None,
|
||||
encoder_hidden_states: Optional[torch.FloatTensor] = None,
|
||||
encoder_attention_mask: Optional[torch.FloatTensor] = None,
|
||||
past_key_values: Optional[Cache] = None,
|
||||
output_attentions: Optional[bool] = False,
|
||||
) -> tuple[torch.Tensor]:
|
||||
mixed_query_layer = self.q_proj(hidden_states)
|
||||
|
||||
# If this is instantiated as a cross-attention module, the keys
|
||||
# and values come from an encoder; the attention mask needs to be
|
||||
# such that the encoder's padding tokens are not attended to.
|
||||
is_cross_attention = encoder_hidden_states is not None
|
||||
|
||||
if is_cross_attention and past_key_values is not None:
|
||||
# reuse k,v, cross_attentions
|
||||
key_layer = past_key_values[0]
|
||||
value_layer = past_key_values[1]
|
||||
attention_mask = encoder_attention_mask
|
||||
elif is_cross_attention:
|
||||
key_layer = self.transpose_for_scores(self.k_proj(encoder_hidden_states))
|
||||
value_layer = self.transpose_for_scores(self.v_proj(encoder_hidden_states))
|
||||
attention_mask = encoder_attention_mask
|
||||
elif past_key_values is not None:
|
||||
key_layer = self.transpose_for_scores(self.k_proj(hidden_states))
|
||||
value_layer = self.transpose_for_scores(self.v_proj(hidden_states))
|
||||
key_layer = torch.cat([past_key_values[0], key_layer], dim=2)
|
||||
value_layer = torch.cat([past_key_values[1], value_layer], dim=2)
|
||||
else:
|
||||
key_layer = self.transpose_for_scores(self.k_proj(hidden_states))
|
||||
value_layer = self.transpose_for_scores(self.v_proj(hidden_states))
|
||||
|
||||
query_layer = self.transpose_for_scores(mixed_query_layer)
|
||||
|
||||
use_cache = past_key_values is not None
|
||||
if self.is_decoder:
|
||||
# if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states.
|
||||
# Further calls to cross_attention layer can then reuse all cross-attention
|
||||
# key/value_states (first "if" case)
|
||||
# if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of
|
||||
# all previous decoder key/value_states. Further calls to uni-directional self-attention
|
||||
# can concat previous decoder key/value_states to current projected key/value_states (third "elif" case)
|
||||
# if encoder bi-directional self-attention `past_key_values` is always `None`
|
||||
past_key_values = (key_layer, value_layer)
|
||||
|
||||
# Take the dot product between "query" and "key" to get the raw attention scores.
|
||||
attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
|
||||
|
||||
if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
|
||||
query_length, key_length = query_layer.shape[2], key_layer.shape[2]
|
||||
if use_cache:
|
||||
position_ids_l = torch.tensor(key_length - 1, dtype=torch.long, device=hidden_states.device).view(
|
||||
-1, 1
|
||||
)
|
||||
else:
|
||||
position_ids_l = torch.arange(query_length, dtype=torch.long, device=hidden_states.device).view(-1, 1)
|
||||
position_ids_r = torch.arange(key_length, dtype=torch.long, device=hidden_states.device).view(1, -1)
|
||||
distance = position_ids_l - position_ids_r
|
||||
|
||||
positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1)
|
||||
positional_embedding = positional_embedding.to(dtype=query_layer.dtype) # fp16 compatibility
|
||||
|
||||
if self.position_embedding_type == "relative_key":
|
||||
relative_position_scores = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding)
|
||||
attention_scores = attention_scores + relative_position_scores
|
||||
elif self.position_embedding_type == "relative_key_query":
|
||||
relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding)
|
||||
relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding)
|
||||
attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key
|
||||
|
||||
attention_scores = attention_scores / math.sqrt(self.attention_head_size)
|
||||
if attention_mask is not None:
|
||||
# Apply the attention mask is (precomputed for all layers in ErnieMModel forward() function)
|
||||
attention_scores = attention_scores + attention_mask
|
||||
|
||||
# Normalize the attention scores to probabilities.
|
||||
attention_probs = nn.functional.softmax(attention_scores, dim=-1)
|
||||
|
||||
# This is actually dropping out entire tokens to attend to, which might
|
||||
# seem a bit unusual, but is taken from the original Transformer paper.
|
||||
attention_probs = self.dropout(attention_probs)
|
||||
|
||||
context_layer = torch.matmul(attention_probs, value_layer)
|
||||
|
||||
context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
|
||||
new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
|
||||
context_layer = context_layer.view(new_context_layer_shape)
|
||||
|
||||
outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)
|
||||
|
||||
if self.is_decoder:
|
||||
outputs = outputs + (past_key_values,)
|
||||
return outputs
|
||||
|
||||
|
||||
class ErnieMAttention(nn.Module):
|
||||
def __init__(self, config, position_embedding_type=None):
|
||||
super().__init__()
|
||||
self.self_attn = ErnieMSelfAttention(config, position_embedding_type=position_embedding_type)
|
||||
self.out_proj = nn.Linear(config.hidden_size, config.hidden_size)
|
||||
|
||||
def forward(
|
||||
self,
|
||||
hidden_states: torch.Tensor,
|
||||
attention_mask: Optional[torch.FloatTensor] = None,
|
||||
encoder_hidden_states: Optional[torch.FloatTensor] = None,
|
||||
encoder_attention_mask: Optional[torch.FloatTensor] = None,
|
||||
past_key_values: Optional[Cache] = None,
|
||||
output_attentions: Optional[bool] = False,
|
||||
) -> tuple[torch.Tensor]:
|
||||
self_outputs = self.self_attn(
|
||||
hidden_states,
|
||||
attention_mask,
|
||||
encoder_hidden_states,
|
||||
encoder_attention_mask,
|
||||
past_key_values,
|
||||
output_attentions,
|
||||
)
|
||||
attention_output = self.out_proj(self_outputs[0])
|
||||
outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them
|
||||
return outputs
|
||||
|
||||
|
||||
class ErnieMEncoderLayer(nn.Module):
|
||||
def __init__(self, config):
|
||||
super().__init__()
|
||||
# to mimic paddlenlp implementation
|
||||
dropout = 0.1 if config.hidden_dropout_prob is None else config.hidden_dropout_prob
|
||||
act_dropout = config.hidden_dropout_prob if config.act_dropout is None else config.act_dropout
|
||||
|
||||
self.self_attn = ErnieMAttention(config)
|
||||
self.linear1 = nn.Linear(config.hidden_size, config.intermediate_size)
|
||||
self.dropout = nn.Dropout(act_dropout)
|
||||
self.linear2 = nn.Linear(config.intermediate_size, config.hidden_size)
|
||||
self.norm1 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
|
||||
self.norm2 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
|
||||
self.dropout1 = nn.Dropout(dropout)
|
||||
self.dropout2 = nn.Dropout(dropout)
|
||||
if isinstance(config.hidden_act, str):
|
||||
self.activation = ACT2FN[config.hidden_act]
|
||||
else:
|
||||
self.activation = config.hidden_act
|
||||
|
||||
def forward(
|
||||
self,
|
||||
hidden_states: torch.Tensor,
|
||||
attention_mask: Optional[torch.FloatTensor] = None,
|
||||
past_key_values: Optional[Cache] = None,
|
||||
output_attentions: Optional[bool] = True,
|
||||
):
|
||||
residual = hidden_states
|
||||
if output_attentions:
|
||||
hidden_states, attention_opt_weights = self.self_attn(
|
||||
hidden_states=hidden_states,
|
||||
attention_mask=attention_mask,
|
||||
past_key_values=past_key_values,
|
||||
output_attentions=output_attentions,
|
||||
)
|
||||
|
||||
else:
|
||||
hidden_states = self.self_attn(
|
||||
hidden_states=hidden_states,
|
||||
attention_mask=attention_mask,
|
||||
past_key_values=past_key_values,
|
||||
output_attentions=output_attentions,
|
||||
)
|
||||
hidden_states = residual + self.dropout1(hidden_states)
|
||||
hidden_states = self.norm1(hidden_states)
|
||||
residual = hidden_states
|
||||
|
||||
hidden_states = self.linear1(hidden_states)
|
||||
hidden_states = self.activation(hidden_states)
|
||||
hidden_states = self.dropout(hidden_states)
|
||||
hidden_states = self.linear2(hidden_states)
|
||||
hidden_states = residual + self.dropout2(hidden_states)
|
||||
hidden_states = self.norm2(hidden_states)
|
||||
|
||||
if output_attentions:
|
||||
return hidden_states, attention_opt_weights
|
||||
else:
|
||||
return hidden_states
|
||||
|
||||
|
||||
class ErnieMEncoder(nn.Module):
|
||||
def __init__(self, config):
|
||||
super().__init__()
|
||||
self.config = config
|
||||
self.layers = nn.ModuleList([ErnieMEncoderLayer(config) for _ in range(config.num_hidden_layers)])
|
||||
|
||||
def forward(
|
||||
self,
|
||||
input_embeds: torch.Tensor,
|
||||
attention_mask: Optional[torch.FloatTensor] = None,
|
||||
past_key_values: Optional[Cache] = None,
|
||||
output_attentions: Optional[bool] = False,
|
||||
output_hidden_states: Optional[bool] = False,
|
||||
return_dict: Optional[bool] = True,
|
||||
) -> Union[tuple[torch.Tensor], BaseModelOutputWithPastAndCrossAttentions]:
|
||||
hidden_states = () if output_hidden_states else None
|
||||
attentions = () if output_attentions else None
|
||||
|
||||
        output = input_embeds
        if output_hidden_states:
            hidden_states = hidden_states + (output,)
        for i, layer in enumerate(self.layers):
            output, opt_attn_weights = layer(
                hidden_states=output,
                attention_mask=attention_mask,
                past_key_values=past_key_values[i] if past_key_values is not None else None,
            )

            if output_hidden_states:
                hidden_states = hidden_states + (output,)
            if output_attentions:
                attentions = attentions + (opt_attn_weights,)

        last_hidden_state = output
        if not return_dict:
            return tuple(v for v in [last_hidden_state, hidden_states, attentions] if v is not None)

        return BaseModelOutputWithPastAndCrossAttentions(
            last_hidden_state=last_hidden_state, hidden_states=hidden_states, attentions=attentions
        )


class ErnieMPooler(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.activation = nn.Tanh()

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        # We "pool" the model by simply taking the hidden state corresponding
        # to the first token.
        first_token_tensor = hidden_states[:, 0]
        pooled_output = self.dense(first_token_tensor)
        pooled_output = self.activation(pooled_output)
        return pooled_output


class ErnieMPreTrainedModel(PreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    config: ErnieMConfig
    base_model_prefix = "ernie_m"


ERNIE_M_START_DOCSTRING = r"""

    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
    etc.)

    This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use
    it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
    behavior.

    Parameters:
        config ([`ErnieMConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""

ERNIE_M_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (`torch.LongTensor` of shape `({0})`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`ErnieMTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        position_ids (`torch.LongTensor` of shape `({0})`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert *input_ids* indices into associated vectors than the
            model's internal embedding lookup matrix.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""


@add_start_docstrings(
    "The bare ErnieM Model transformer outputting raw hidden-states without any specific head on top.",
    ERNIE_M_START_DOCSTRING,
)
class ErnieMModel(ErnieMPreTrainedModel):
    def __init__(self, config, add_pooling_layer=True):
        super().__init__(config)
        self.initializer_range = config.initializer_range
        self.embeddings = ErnieMEmbeddings(config)
        self.encoder = ErnieMEncoder(config)
        self.pooler = ErnieMPooler(config) if add_pooling_layer else None
        self.post_init()

    def get_input_embeddings(self):
        return self.embeddings.word_embeddings

    def set_input_embeddings(self, value):
        self.embeddings.word_embeddings = value

    def _prune_heads(self, heads_to_prune):
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        """
        for layer, heads in heads_to_prune.items():
            self.encoder.layers[layer].self_attn.prune_heads(heads)

    @add_start_docstrings_to_model_forward(ERNIE_M_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        processor_class=_TOKENIZER_FOR_DOC,
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=BaseModelOutputWithPastAndCrossAttentions,
        config_class=_CONFIG_FOR_DOC,
    )
    def forward(
        self,
        input_ids: Optional[tensor] = None,
        position_ids: Optional[tensor] = None,
        attention_mask: Optional[tensor] = None,
        inputs_embeds: Optional[tensor] = None,
        past_key_values: Optional[tuple[tuple[tensor]]] = None,
        use_cache: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple[torch.FloatTensor], BaseModelOutputWithPoolingAndCrossAttentions]:
        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time.")

        # init the default bool value
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.return_dict

        past_key_values_length = 0
        if past_key_values is not None:
            past_key_values_length = past_key_values.get_seq_length()

        # Adapted from paddlenlp.transformers.ernie_m.ErnieMModel
        if attention_mask is None:
            attention_mask = (input_ids == self.config.pad_token_id).to(torch.float32)
            attention_mask *= torch.finfo(attention_mask.dtype).min
            if past_key_values is not None:
                batch_size = past_key_values[0][0].shape[0]
                past_mask = torch.zeros([batch_size, 1, 1, past_key_values_length], dtype=attention_mask.dtype)
                attention_mask = torch.concat([past_mask, attention_mask], dim=-1)
        # For 2D attention_mask from tokenizer
        elif attention_mask.ndim == 2:
            attention_mask = attention_mask.to(torch.float32)
            attention_mask = 1.0 - attention_mask
            attention_mask *= torch.finfo(attention_mask.dtype).min

        extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(1)

        embedding_output = self.embeddings(
            input_ids=input_ids,
            position_ids=position_ids,
            inputs_embeds=inputs_embeds,
            past_key_values_length=past_key_values_length,
        )
        encoder_outputs = self.encoder(
            embedding_output,
            attention_mask=extended_attention_mask,
            past_key_values=past_key_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        if not return_dict:
            sequence_output = encoder_outputs[0]
            pooler_output = self.pooler(sequence_output) if self.pooler is not None else None
            return (sequence_output, pooler_output) + encoder_outputs[1:]

        sequence_output = encoder_outputs["last_hidden_state"]
        pooler_output = self.pooler(sequence_output) if self.pooler is not None else None
        hidden_states = None if not output_hidden_states else encoder_outputs["hidden_states"]
        attentions = None if not output_attentions else encoder_outputs["attentions"]

        return BaseModelOutputWithPoolingAndCrossAttentions(
            last_hidden_state=sequence_output,
            pooler_output=pooler_output,
            hidden_states=hidden_states,
            attentions=attentions,
        )

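# --- Illustrative usage sketch (not part of the original file) ---------------------------------
# A minimal example of how the bare ErnieMModel defined above could be called, assuming a
# transformers version that still ships the deprecated ErnieM classes; "path/to/ernie-m-checkpoint"
# is a placeholder, not a real checkpoint identifier.
#
#     from transformers import ErnieMModel, ErnieMTokenizer
#
#     tokenizer = ErnieMTokenizer.from_pretrained("path/to/ernie-m-checkpoint")
#     model = ErnieMModel.from_pretrained("path/to/ernie-m-checkpoint")
#
#     inputs = tokenizer("ERNIE-M is a multilingual text encoder.", return_tensors="pt")
#     outputs = model(**inputs)
#     last_hidden_state = outputs.last_hidden_state  # (batch_size, sequence_length, hidden_size)
#     pooled_output = outputs.pooler_output          # tanh(dense(first-token state)), see ErnieMPooler
# ------------------------------------------------------------------------------------------------
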
@add_start_docstrings(
    """ErnieM Model transformer with a sequence classification/regression head on top (a linear layer on top of
    the pooled output) e.g. for GLUE tasks.""",
    ERNIE_M_START_DOCSTRING,
)
class ErnieMForSequenceClassification(ErnieMPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.config = config

        self.ernie_m = ErnieMModel(config)
        classifier_dropout = (
            config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
        )
        self.dropout = nn.Dropout(classifier_dropout)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(ERNIE_M_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        processor_class=_TOKENIZER_FOR_DOC,
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=SequenceClassifierOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        past_key_values: Optional[Cache] = None,
        use_cache: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        return_dict: Optional[bool] = True,
        labels: Optional[torch.Tensor] = None,
    ) -> Union[tuple[torch.FloatTensor], SequenceClassifierOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.ernie_m(
            input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            inputs_embeds=inputs_embeds,
            past_key_values=past_key_values,
            output_hidden_states=output_hidden_states,
            output_attentions=output_attentions,
            return_dict=return_dict,
        )

        pooled_output = outputs[1]

        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)

        loss = None
        if labels is not None:
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.num_labels == 1:
                    loss = loss_fct(logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(logits, labels)
        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

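# --- Illustrative sketch of the problem_type dispatch used above (not part of the original file) -
# ErnieMForSequenceClassification picks its loss from config.problem_type: MSE for regression,
# cross-entropy for single-label classification, BCE-with-logits for multi-label classification.
# The tensors below are dummy values chosen only to make the three branches runnable.
import torch
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

num_labels = 3
logits = torch.randn(2, num_labels)

# regression (num_labels == 1): squeezed logits against float targets
regression_loss = MSELoss()(torch.randn(2, 1).squeeze(), torch.randn(2))

# single-label classification: integer class indices, logits flattened to (batch, num_labels)
single_label_loss = CrossEntropyLoss()(logits.view(-1, num_labels), torch.tensor([0, 2]))

# multi-label classification: float multi-hot targets, one sigmoid per label
multi_label_loss = BCEWithLogitsLoss()(logits, torch.tensor([[1.0, 0.0, 1.0], [0.0, 1.0, 0.0]]))
# --------------------------------------------------------------------------------------------------
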
@add_start_docstrings(
    """ErnieM Model with a multiple choice classification head on top (a linear layer on top of
    the pooled output and a softmax) e.g. for RocStories/SWAG tasks.""",
    ERNIE_M_START_DOCSTRING,
)
class ErnieMForMultipleChoice(ErnieMPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)

        self.ernie_m = ErnieMModel(config)
        classifier_dropout = (
            config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
        )
        self.dropout = nn.Dropout(classifier_dropout)
        self.classifier = nn.Linear(config.hidden_size, 1)

        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(ERNIE_M_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=MultipleChoiceModelOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = True,
    ) -> Union[tuple[torch.FloatTensor], MultipleChoiceModelOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
            num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
            `input_ids` above)
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]

        input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None
        attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None
        position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None
        inputs_embeds = (
            inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1))
            if inputs_embeds is not None
            else None
        )

        outputs = self.ernie_m(
            input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        pooled_output = outputs[1]

        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        reshaped_logits = logits.view(-1, num_choices)

        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(reshaped_logits, labels)

        if not return_dict:
            output = (reshaped_logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return MultipleChoiceModelOutput(
            loss=loss,
            logits=reshaped_logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

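# --- Illustrative sketch of the multiple-choice reshaping above (not part of the original file) ---
# ErnieMForMultipleChoice receives (batch_size, num_choices, seq_len) inputs, flattens the choice
# dimension into the batch so every choice is encoded independently, scores each pooled output with
# a single linear unit, and folds the scores back to (batch_size, num_choices) before the
# cross-entropy over choices. Dummy shapes only:
import torch

batch_size, num_choices, seq_len, hidden_size = 2, 4, 7, 16
input_ids = torch.zeros(batch_size, num_choices, seq_len, dtype=torch.long)

flat_input_ids = input_ids.view(-1, input_ids.size(-1))              # (batch_size * num_choices, seq_len)
pooled_output = torch.randn(batch_size * num_choices, hidden_size)   # stand-in for the pooler output
logits = torch.nn.Linear(hidden_size, 1)(pooled_output)              # one score per (example, choice) pair
reshaped_logits = logits.view(-1, num_choices)                       # (batch_size, num_choices)

labels = torch.tensor([1, 3])                                        # index of the correct choice
loss = torch.nn.CrossEntropyLoss()(reshaped_logits, labels)
# ---------------------------------------------------------------------------------------------------
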
@add_start_docstrings(
    """ErnieM Model with a token classification head on top (a linear layer on top of
    the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks.""",
    ERNIE_M_START_DOCSTRING,
)
class ErnieMForTokenClassification(ErnieMPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels

        self.ernie_m = ErnieMModel(config, add_pooling_layer=False)
        classifier_dropout = (
            config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
        )
        self.dropout = nn.Dropout(classifier_dropout)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(ERNIE_M_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        processor_class=_TOKENIZER_FOR_DOC,
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=TokenClassifierOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        past_key_values: Optional[Cache] = None,
        output_hidden_states: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        return_dict: Optional[bool] = True,
        labels: Optional[torch.Tensor] = None,
    ) -> Union[tuple[torch.FloatTensor], TokenClassifierOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.ernie_m(
            input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            inputs_embeds=inputs_embeds,
            past_key_values=past_key_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]

        sequence_output = self.dropout(sequence_output)
        logits = self.classifier(sequence_output)

        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return TokenClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

@add_start_docstrings(
    """ErnieM Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
    layers on top of the hidden-states output to compute `span start logits` and `span end logits`).""",
    ERNIE_M_START_DOCSTRING,
)
class ErnieMForQuestionAnswering(ErnieMPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels

        self.ernie_m = ErnieMModel(config, add_pooling_layer=False)
        self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)

        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(ERNIE_M_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        processor_class=_TOKENIZER_FOR_DOC,
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=QuestionAnsweringModelOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        start_positions: Optional[torch.Tensor] = None,
        end_positions: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = True,
    ) -> Union[tuple[torch.FloatTensor], QuestionAnsweringModelOutput]:
        r"""
        start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the start of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
            are not taken into account for computing the loss.
        end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the end of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
            are not taken into account for computing the loss.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.ernie_m(
            input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]

        logits = self.qa_outputs(sequence_output)
        start_logits, end_logits = logits.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1).contiguous()
        end_logits = end_logits.squeeze(-1).contiguous()

        total_loss = None
        if start_positions is not None and end_positions is not None:
            # If we are on multi-GPU, split add a dimension
            if len(start_positions.size()) > 1:
                start_positions = start_positions.squeeze(-1)
            if len(end_positions.size()) > 1:
                end_positions = end_positions.squeeze(-1)
            # sometimes the start/end positions are outside our model inputs, we ignore these terms
            ignored_index = start_logits.size(1)
            start_positions = start_positions.clamp(0, ignored_index)
            end_positions = end_positions.clamp(0, ignored_index)

            loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
            start_loss = loss_fct(start_logits, start_positions)
            end_loss = loss_fct(end_logits, end_positions)
            total_loss = (start_loss + end_loss) / 2

        if not return_dict:
            output = (start_logits, end_logits) + outputs[2:]
            return ((total_loss,) + output) if total_loss is not None else output

        return QuestionAnsweringModelOutput(
            loss=total_loss,
            start_logits=start_logits,
            end_logits=end_logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

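# --- Illustrative sketch of the span-logit handling above (not part of the original file) ---------
# ErnieMForQuestionAnswering maps every token to two scores; split(1, dim=-1) separates them into
# start and end logits of shape (batch_size, seq_len), and label positions outside the sequence are
# clamped to an ignored index so they do not contribute to the loss. Dummy shapes only:
import torch

batch_size, seq_len = 2, 10
token_scores = torch.randn(batch_size, seq_len, 2)           # stand-in for self.qa_outputs(sequence_output)
start_logits, end_logits = token_scores.split(1, dim=-1)
start_logits = start_logits.squeeze(-1).contiguous()         # (batch_size, seq_len)
end_logits = end_logits.squeeze(-1).contiguous()

start_positions = torch.tensor([3, 99]).clamp(0, seq_len)    # 99 is out of range -> clamped to seq_len
loss_fct = torch.nn.CrossEntropyLoss(ignore_index=seq_len)   # clamped positions are ignored
start_loss = loss_fct(start_logits, start_positions)
# -----------------------------------------------------------------------------------------------------
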
@add_start_docstrings(
    """ErnieMForInformationExtraction is a Ernie-M Model with two linear layer on top of the hidden-states output to
    compute `start_prob` and `end_prob`, designed for Universal Information Extraction.""",
    ERNIE_M_START_DOCSTRING,
)
class ErnieMForInformationExtraction(ErnieMPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.ernie_m = ErnieMModel(config)
        self.linear_start = nn.Linear(config.hidden_size, 1)
        self.linear_end = nn.Linear(config.hidden_size, 1)
        self.sigmoid = nn.Sigmoid()
        self.post_init()

    @add_start_docstrings_to_model_forward(ERNIE_M_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length"))
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        start_positions: Optional[torch.Tensor] = None,
        end_positions: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = True,
    ) -> Union[tuple[torch.FloatTensor], QuestionAnsweringModelOutput]:
        r"""
        start_positions (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for position (index) for computing the start_positions loss. Position outside of the sequence are
            not taken into account for computing the loss.
        end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for position (index) for computing the end_positions loss. Position outside of the sequence are not
            taken into account for computing the loss.
        """

        result = self.ernie_m(
            input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        if return_dict:
            sequence_output = result.last_hidden_state
        elif not return_dict:
            sequence_output = result[0]

        start_logits = self.linear_start(sequence_output)
        start_logits = start_logits.squeeze(-1)
        end_logits = self.linear_end(sequence_output)
        end_logits = end_logits.squeeze(-1)

        total_loss = None
        if start_positions is not None and end_positions is not None:
            # If we are on multi-GPU, split add a dimension
            if len(start_positions.size()) > 1:
                start_positions = start_positions.squeeze(-1)
            if len(end_positions.size()) > 1:
                end_positions = end_positions.squeeze(-1)
            # sometimes the start/end positions are outside our model inputs, we ignore these terms
            ignored_index = start_logits.size(1)
            start_positions = start_positions.clamp(0, ignored_index)
            end_positions = end_positions.clamp(0, ignored_index)

            loss_fct = BCEWithLogitsLoss()
            start_loss = loss_fct(start_logits, start_positions)
            end_loss = loss_fct(end_logits, end_positions)
            total_loss = (start_loss + end_loss) / 2

        if not return_dict:
            return tuple(
                i
                for i in [total_loss, start_logits, end_logits, result.hidden_states, result.attentions]
                if i is not None
            )

        return QuestionAnsweringModelOutput(
            loss=total_loss,
            start_logits=start_logits,
            end_logits=end_logits,
            hidden_states=result.hidden_states,
            attentions=result.attentions,
        )

__all__ = [
    "ErnieMForMultipleChoice",
    "ErnieMForQuestionAnswering",
    "ErnieMForSequenceClassification",
    "ErnieMForTokenClassification",
    "ErnieMModel",
    "ErnieMPreTrainedModel",
    "ErnieMForInformationExtraction",
]
@@ -1,409 +0,0 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2023 Xuan Ouyang, Shuohuan Wang, Chao Pang, Yu Sun, Hao Tian, Hua Wu, Haifeng Wang and The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""Tokenization classes for Ernie-M."""
|
||||
|
||||
import os
|
||||
import unicodedata
|
||||
from typing import Any, Optional
|
||||
|
||||
import sentencepiece as spm
|
||||
|
||||
from ....tokenization_utils import PreTrainedTokenizer
|
||||
from ....utils import logging
|
||||
from ....utils.import_utils import requires
|
||||
|
||||
|
||||
logger = logging.get_logger(__name__)
|
||||
|
||||
SPIECE_UNDERLINE = "▁"
|
||||
|
||||
VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt", "sentencepiece_model_ckpt": "sentencepiece.bpe.model"}
|
||||
|
||||
RESOURCE_FILES_NAMES = {
|
||||
"sentencepiece_model_file": "sentencepiece.bpe.model",
|
||||
"vocab_file": "vocab.txt",
|
||||
}
|
||||
|
||||
|
||||
# Adapted from paddlenlp.transformers.ernie_m.tokenizer.ErnieMTokenizer
|
||||
@requires(backends=("sentencepiece",))
|
||||
class ErnieMTokenizer(PreTrainedTokenizer):
|
||||
r"""
|
||||
Constructs a Ernie-M tokenizer. It uses the `sentencepiece` tools to cut the words to sub-words.
|
||||
|
||||
Args:
|
||||
sentencepiece_model_file (`str`):
|
||||
The file path of sentencepiece model.
|
||||
vocab_file (`str`, *optional*):
|
||||
The file path of the vocabulary.
|
||||
do_lower_case (`str`, *optional*, defaults to `True`):
|
||||
Whether or not to lowercase the input when tokenizing.
|
||||
unk_token (`str`, *optional*, defaults to `"[UNK]"`):
|
||||
A special token representing the `unknown (out-of-vocabulary)` token. An unknown token is set to be
|
||||
`unk_token` inorder to be converted to an ID.
|
||||
sep_token (`str`, *optional*, defaults to `"[SEP]"`):
|
||||
A special token separating two different sentences in the same input.
|
||||
pad_token (`str`, *optional*, defaults to `"[PAD]"`):
|
||||
A special token used to make arrays of tokens the same size for batching purposes.
|
||||
cls_token (`str`, *optional*, defaults to `"[CLS]"`):
|
||||
A special token used for sequence classification. It is the last token of the sequence when built with
|
||||
special tokens.
|
||||
mask_token (`str`, *optional*, defaults to `"[MASK]"`):
|
||||
A special token representing a masked token. This is the token used in the masked language modeling task
|
||||
which the model tries to predict the original unmasked ones.
|
||||
"""
|
||||
|
||||
# Ernie-M model doesn't have token_type embedding.
|
||||
model_input_names: list[str] = ["input_ids"]
|
||||
|
||||
vocab_files_names = VOCAB_FILES_NAMES
|
||||
resource_files_names = RESOURCE_FILES_NAMES
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
sentencepiece_model_ckpt,
|
||||
vocab_file=None,
|
||||
do_lower_case=False,
|
||||
encoding="utf8",
|
||||
unk_token="[UNK]",
|
||||
sep_token="[SEP]",
|
||||
pad_token="[PAD]",
|
||||
cls_token="[CLS]",
|
||||
mask_token="[MASK]",
|
||||
sp_model_kwargs: Optional[dict[str, Any]] = None,
|
||||
**kwargs,
|
||||
) -> None:
|
||||
# Mask token behave like a normal word, i.e. include the space before it and
|
||||
# is included in the raw text, there should be a match in a non-normalized sentence.
|
||||
|
||||
self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
|
||||
|
||||
self.do_lower_case = do_lower_case
|
||||
self.sentencepiece_model_ckpt = sentencepiece_model_ckpt
|
||||
self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
|
||||
self.sp_model.Load(sentencepiece_model_ckpt)
|
||||
|
||||
# to mimic paddlenlp.transformers.ernie_m.tokenizer.ErnieMTokenizer functioning
|
||||
if vocab_file is not None:
|
||||
self.vocab = self.load_vocab(filepath=vocab_file)
|
||||
else:
|
||||
self.vocab = {self.sp_model.id_to_piece(id): id for id in range(self.sp_model.get_piece_size())}
|
||||
self.reverse_vocab = {v: k for k, v in self.vocab.items()}
|
||||
|
||||
super().__init__(
|
||||
do_lower_case=do_lower_case,
|
||||
unk_token=unk_token,
|
||||
sep_token=sep_token,
|
||||
pad_token=pad_token,
|
||||
cls_token=cls_token,
|
||||
mask_token=mask_token,
|
||||
vocab_file=vocab_file,
|
||||
encoding=encoding,
|
||||
sp_model_kwargs=self.sp_model_kwargs,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
def get_offset_mapping(self, text):
|
||||
if text is None:
|
||||
return None
|
||||
|
||||
split_tokens = self.tokenize(text)
|
||||
normalized_text, char_mapping = "", []
|
||||
|
||||
for i, ch in enumerate(text):
|
||||
if ch in self.SP_CHAR_MAPPING:
|
||||
ch = self.SP_CHAR_MAPPING.get(ch)
|
||||
else:
|
||||
ch = unicodedata.normalize("NFKC", ch)
|
||||
if self.is_whitespace(ch):
|
||||
continue
|
||||
normalized_text += ch
|
||||
char_mapping.extend([i] * len(ch))
|
||||
|
||||
text, token_mapping, offset = normalized_text, [], 0
|
||||
|
||||
if self.do_lower_case:
|
||||
text = text.lower()
|
||||
|
||||
for token in split_tokens:
|
||||
if token[:1] == "▁":
|
||||
token = token[1:]
|
||||
start = text[offset:].index(token) + offset
|
||||
end = start + len(token)
|
||||
|
||||
token_mapping.append((char_mapping[start], char_mapping[end - 1] + 1))
|
||||
offset = end
|
||||
return token_mapping
|
||||
|
||||
@property
|
||||
def vocab_size(self):
|
||||
return len(self.vocab)
|
||||
|
||||
def get_vocab(self):
|
||||
return dict(self.vocab, **self.added_tokens_encoder)
|
||||
|
||||
def __getstate__(self):
|
||||
state = self.__dict__.copy()
|
||||
state["sp_model"] = None
|
||||
return state
|
||||
|
||||
def __setstate__(self, d):
|
||||
self.__dict__ = d
|
||||
|
||||
# for backward compatibility
|
||||
if not hasattr(self, "sp_model_kwargs"):
|
||||
self.sp_model_kwargs = {}
|
||||
|
||||
self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
|
||||
self.sp_model.Load(self.sentencepiece_model_ckpt)
|
||||
|
||||
def clean_text(self, text):
|
||||
"""Performs invalid character removal and whitespace cleanup on text."""
|
||||
return "".join(self.SP_CHAR_MAPPING.get(c, c) for c in text)
|
||||
|
||||
def _tokenize(self, text, enable_sampling=False, nbest_size=64, alpha=0.1):
|
||||
"""Tokenize a string."""
|
||||
|
||||
if self.sp_model_kwargs.get("enable_sampling") is True:
|
||||
enable_sampling = True
|
||||
if self.sp_model_kwargs.get("alpha") is not None:
|
||||
alpha = self.sp_model_kwargs.get("alpha")
|
||||
if self.sp_model_kwargs.get("nbest_size") is not None:
|
||||
nbest_size = self.sp_model_kwargs.get("nbest_size")
|
||||
|
||||
if not enable_sampling:
|
||||
pieces = self.sp_model.EncodeAsPieces(text)
|
||||
else:
|
||||
pieces = self.sp_model.SampleEncodeAsPieces(text, nbest_size, alpha)
|
||||
new_pieces = []
|
||||
for pi, piece in enumerate(pieces):
|
||||
if piece == SPIECE_UNDERLINE:
|
||||
if not pieces[pi + 1].startswith(SPIECE_UNDERLINE) and pi != 0:
|
||||
new_pieces.append(SPIECE_UNDERLINE)
|
||||
continue
|
||||
else:
|
||||
continue
|
||||
lst_i = 0
|
||||
for i, chunk in enumerate(piece):
|
||||
if chunk == SPIECE_UNDERLINE:
|
||||
continue
|
||||
if self.is_ch_char(chunk) or self.is_punct(chunk):
|
||||
if i > lst_i and piece[lst_i:i] != SPIECE_UNDERLINE:
|
||||
new_pieces.append(piece[lst_i:i])
|
||||
new_pieces.append(chunk)
|
||||
lst_i = i + 1
|
||||
elif chunk.isdigit() and i > 0 and not piece[i - 1].isdigit():
|
||||
if i > lst_i and piece[lst_i:i] != SPIECE_UNDERLINE:
|
||||
new_pieces.append(piece[lst_i:i])
|
||||
lst_i = i
|
||||
elif not chunk.isdigit() and i > 0 and piece[i - 1].isdigit():
|
||||
if i > lst_i and piece[lst_i:i] != SPIECE_UNDERLINE:
|
||||
new_pieces.append(piece[lst_i:i])
|
||||
lst_i = i
|
||||
if len(piece) > lst_i:
|
||||
new_pieces.append(piece[lst_i:])
|
||||
return new_pieces
|
||||
|
||||
def convert_tokens_to_string(self, tokens):
|
||||
"""Converts a sequence of tokens (strings for sub-words) in a single string."""
|
||||
out_string = "".join(tokens).replace(SPIECE_UNDERLINE, " ").strip()
|
||||
return out_string
|
||||
|
||||
def convert_ids_to_string(self, ids):
|
||||
"""
|
||||
Converts a sequence of tokens (strings for sub-words) in a single string.
|
||||
"""
|
||||
tokens = self.convert_ids_to_tokens(ids)
|
||||
out_string = "".join(tokens).replace(SPIECE_UNDERLINE, " ").strip()
|
||||
return out_string
|
||||
|
||||
# to mimic paddlenlp.transformers.ernie_m.tokenizer.ErnieMTokenizer functioning
|
||||
def _convert_token_to_id(self, token):
|
||||
return self.vocab.get(token, self.vocab.get(self.unk_token))
|
||||
|
||||
# to mimic paddlenlp.transformers.ernie_m.tokenizer.ErnieMTokenizer functioning
|
||||
def _convert_id_to_token(self, index):
|
||||
"""Converts an index (integer) in a token (str) using the vocab."""
|
||||
return self.reverse_vocab.get(index, self.unk_token)
|
||||
|
||||
def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
|
||||
r"""
|
||||
Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
|
||||
adding special tokens. An ErnieM sequence has the following format:
|
||||
|
||||
- single sequence: `[CLS] X [SEP]`
|
||||
- pair of sequences: `[CLS] A [SEP] [SEP] B [SEP]`
|
||||
|
||||
Args:
|
||||
token_ids_0 (`list[int]`):
|
||||
List of IDs to which the special tokens will be added.
|
||||
token_ids_1 (`list[int]`, *optional*):
|
||||
Optional second list of IDs for sequence pairs.
|
||||
Returns:
|
||||
`list[int]`: List of input_id with the appropriate special tokens.
|
||||
"""
|
||||
if token_ids_1 is None:
|
||||
return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
|
||||
_cls = [self.cls_token_id]
|
||||
_sep = [self.sep_token_id]
|
||||
return _cls + token_ids_0 + _sep + _sep + token_ids_1 + _sep
|
||||
|
||||
def build_offset_mapping_with_special_tokens(self, offset_mapping_0, offset_mapping_1=None):
|
||||
r"""
|
||||
Build offset map from a pair of offset map by concatenating and adding offsets of special tokens. An Ernie-M
|
||||
offset_mapping has the following format:
|
||||
|
||||
- single sequence: `(0,0) X (0,0)`
|
||||
- pair of sequences: `(0,0) A (0,0) (0,0) B (0,0)`
|
||||
|
||||
Args:
|
||||
offset_mapping_ids_0 (`list[tuple]`):
|
||||
List of char offsets to which the special tokens will be added.
|
||||
offset_mapping_ids_1 (`list[tuple]`, *optional*):
|
||||
Optional second list of wordpiece offsets for offset mapping pairs.
|
||||
Returns:
|
||||
`list[tuple]`: List of wordpiece offsets with the appropriate offsets of special tokens.
|
||||
"""
|
||||
if offset_mapping_1 is None:
|
||||
return [(0, 0)] + offset_mapping_0 + [(0, 0)]
|
||||
|
||||
return [(0, 0)] + offset_mapping_0 + [(0, 0), (0, 0)] + offset_mapping_1 + [(0, 0)]
|
||||
|
||||
def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False):
|
||||
r"""
|
||||
Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
|
||||
special tokens using the tokenizer `encode` method.
|
||||
|
||||
Args:
|
||||
token_ids_0 (`list[int]`):
|
||||
List of ids of the first sequence.
|
||||
token_ids_1 (`list[int]`, *optional*):
|
||||
Optional second list of IDs for sequence pairs.
|
||||
already_has_special_tokens (`str`, *optional*, defaults to `False`):
|
||||
Whether or not the token list is already formatted with special tokens for the model.
|
||||
Returns:
|
||||
`list[int]`:
|
||||
The list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
|
||||
"""
|
||||
|
||||
if already_has_special_tokens:
|
||||
if token_ids_1 is not None:
|
||||
raise ValueError(
|
||||
"You should not supply a second sequence if the provided sequence of "
|
||||
"ids is already formatted with special tokens for the model."
|
||||
)
|
||||
return [1 if x in [self.sep_token_id, self.cls_token_id] else 0 for x in token_ids_0]
|
||||
|
||||
if token_ids_1 is not None:
|
||||
return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1]
|
||||
return [1] + ([0] * len(token_ids_0)) + [1]
|
||||
|
||||
def create_token_type_ids_from_sequences(
|
||||
self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
|
||||
) -> list[int]:
|
||||
"""
|
||||
Create the token type IDs corresponding to the sequences passed. [What are token type
|
||||
IDs?](../glossary#token-type-ids) Should be overridden in a subclass if the model has a special way of
|
||||
building: those.
|
||||
|
||||
Args:
|
||||
token_ids_0 (`list[int]`):
|
||||
The first tokenized sequence.
|
||||
token_ids_1 (`list[int]`, *optional*):
|
||||
The second tokenized sequence.
|
||||
Returns:
|
||||
`list[int]`: The token type ids.
|
||||
"""
|
||||
# called when `add_special_tokens` is True, so align with `build_inputs_with_special_tokens` method
|
||||
if token_ids_1 is None:
|
||||
# [CLS] X [SEP]
|
||||
return (len(token_ids_0) + 2) * [0]
|
||||
|
||||
# [CLS] A [SEP] [SEP] B [SEP]
|
||||
return [0] * (len(token_ids_0) + 1) + [1] * (len(token_ids_1) + 3)
|
||||
|
||||
def is_ch_char(self, char):
|
||||
"""
|
||||
is_ch_char
|
||||
"""
|
||||
if "\u4e00" <= char <= "\u9fff":
|
||||
return True
|
||||
return False
|
||||
|
||||
def is_alpha(self, char):
|
||||
"""
|
||||
is_alpha
|
||||
"""
|
||||
if ("a" <= char <= "z") or ("A" <= char <= "Z"):
|
||||
return True
|
||||
return False
|
||||
|
||||
def is_punct(self, char):
|
||||
"""
|
||||
is_punct
|
||||
"""
|
||||
if char in ",;:.?!~,;:。?!《》【】":
|
||||
return True
|
||||
return False
|
||||
|
||||
def is_whitespace(self, char):
|
||||
"""
|
||||
is whitespace
|
||||
"""
|
||||
if char == " " or char == "\t" or char == "\n" or char == "\r":
|
||||
return True
|
||||
if len(char) == 1:
|
||||
cat = unicodedata.category(char)
|
||||
if cat == "Zs":
|
||||
return True
|
||||
return False
|
||||
|
||||
def load_vocab(self, filepath):
|
||||
token_to_idx = {}
|
||||
with open(filepath, "r", encoding="utf-8") as f:
|
||||
for index, line in enumerate(f):
|
||||
token = line.rstrip("\n")
|
||||
token_to_idx[token] = int(index)
|
||||
|
||||
return token_to_idx
|
||||
|
||||
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
|
||||
index = 0
|
||||
if os.path.isdir(save_directory):
|
||||
vocab_file = os.path.join(
|
||||
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
|
||||
)
|
||||
else:
|
||||
vocab_file = (filename_prefix + "-" if filename_prefix else "") + save_directory
|
||||
with open(vocab_file, "w", encoding="utf-8") as writer:
|
||||
for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]):
|
||||
if index != token_index:
|
||||
logger.warning(
|
||||
f"Saving vocabulary to {vocab_file}: vocabulary indices are not consecutive."
|
||||
" Please check that the vocabulary is not corrupted!"
|
||||
)
|
||||
index = token_index
|
||||
writer.write(token + "\n")
|
||||
index += 1
|
||||
|
||||
tokenizer_model_file = os.path.join(save_directory, "sentencepiece.bpe.model")
|
||||
with open(tokenizer_model_file, "wb") as fi:
|
||||
content_spiece_model = self.sp_model.serialized_model_proto()
|
||||
fi.write(content_spiece_model)
|
||||
|
||||
return (vocab_file,)
|
||||
|
||||
|
||||
__all__ = ["ErnieMTokenizer"]
|
||||
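# --- Illustrative sketch of the special-token layout described above (not part of the original file)
# build_inputs_with_special_tokens produces `[CLS] X [SEP]` for a single sequence and
# `[CLS] A [SEP] [SEP] B [SEP]` for a pair. The ID values below are placeholders, not the real vocab.
cls_token_id, sep_token_id = 0, 2       # hypothetical special-token IDs
token_ids_a = [11, 12, 13]              # hypothetical tokenized sequence A
token_ids_b = [21, 22]                  # hypothetical tokenized sequence B

single = [cls_token_id] + token_ids_a + [sep_token_id]
pair = [cls_token_id] + token_ids_a + [sep_token_id] + [sep_token_id] + token_ids_b + [sep_token_id]

assert single == [0, 11, 12, 13, 2]
assert pair == [0, 11, 12, 13, 2, 2, 21, 22, 2]
# -----------------------------------------------------------------------------------------------------
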
@@ -1,28 +0,0 @@
|
||||
# Copyright 2022 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import TYPE_CHECKING

from ....utils import _LazyModule
from ....utils.import_utils import define_import_structure


if TYPE_CHECKING:
    from .configuration_gptsan_japanese import *
    from .modeling_gptsan_japanese import *
    from .tokenization_gptsan_japanese import *
else:
    import sys

    _file = globals()["__file__"]
    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
@@ -1,157 +0,0 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2023, HuggingFace Inc.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""GPTSAN-japanese model configuration"""
|
||||
|
||||
from ....configuration_utils import PreTrainedConfig
|
||||
from ....utils import logging
|
||||
|
||||
|
||||
logger = logging.get_logger(__name__)
|
||||
|
||||
|
||||
class GPTSanJapaneseConfig(PreTrainedConfig):
|
||||
r"""
|
||||
This is the configuration class to store the configuration of a [`GPTSanJapaneseModel`]. It is used to instantiate
|
||||
a GPTSANJapanese model according to the specified arguments, defining the model architecture. Instantiating a
|
||||
configuration with the defaults will yield a similar configuration to that of the GPTSANJapanese
|
||||
[Tanrei/GPTSAN-japanese](https://huggingface.co/Tanrei/GPTSAN-japanese) architecture.
|
||||
|
||||
Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
|
||||
documentation from [`PreTrainedConfig`] for more information.
|
||||
|
||||
Arguments:
|
||||
vocab_size (`int`, *optional*, defaults to 36000):
|
||||
Vocabulary size of the GPTSANJapanese model. Defines the number of different tokens that can be represented
|
||||
by the `inputs_ids` passed when calling [`GPTSanJapaneseModel`].
|
||||
max_position_embeddings (`int`, *optional*, defaults to 1280):
|
||||
The maximum sequence length that this model might ever be used with. Defaults set this to 1280.
|
||||
d_model (`int`, *optional*, defaults to 1024):
|
||||
Size of the encoder layers and the pooler layer.
|
||||
d_ff (`int`, *optional*, defaults to 8192):
|
||||
Size of the intermediate feed forward layer in each `SwitchTransformersBlock`.
|
||||
d_ext (`int`, *optional*, defaults to 4096):
|
||||
Size of the intermediate feed forward layer in each Extra-layers.
|
||||
d_spout (`int`, *optional*, defaults to 128):
|
||||
Size of the `spout` vector.
|
||||
num_switch_layers (`int`, *optional*, defaults to 10):
|
||||
Number of layers in the Switch Transformer layer.
|
||||
num_ext_layers (`int`, *optional*, defaults to 0):
|
||||
Number of layers in the Extra-layers.
|
||||
num_heads (`int`, *optional*, defaults to 16):
|
||||
Number of attention heads for each attention layer in the Transformer encoder.
|
||||
num_experts (`int`, *optional*, defaults to 16):
|
||||
Number of experts for each SwitchTransformer layer.
|
||||
expert_capacity (`int`, *optional*, defaults to 128):
|
||||
Number of tokens that can be stored in each expert. If set to 1, the model will behave like a regular
|
||||
Transformer.
|
||||
dropout_rate (`float`, *optional*, defaults to 0.0):
|
||||
The ratio for all dropout layers.
|
||||
layer_norm_eps (`float`, *optional*, defaults to 1e-5):
|
||||
The epsilon used by the layer normalization layers.
|
||||
router_bias (`bool`, *optional*, defaults to `False`):
|
||||
Whether to add a bias to the router.
|
||||
router_jitter_noise (`float`, *optional*, defaults to 0.0):
|
||||
Amount of noise to add to the router. Set it to 0.0 during prediction or set small value (usually 1e-2)
|
||||
during training.
|
||||
router_dtype (`str`, *optional*, default to `"float32"`):
|
||||
The `dtype` used for the routers. It is preferable to keep the `dtype` to `"float32"` as specified in the
|
||||
*selective precision* discussion in [the paper](https://huggingface.co/papers/2101.03961).
|
||||
router_ignore_padding_tokens (`bool`, *optional*, defaults to `False`):
|
||||
Whether to ignore padding tokens when routing.
|
||||
output_hidden_states (`bool`, *optional*, default to `False`):
|
||||
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
|
||||
more detail.
|
||||
output_attentions (`bool`, *optional*, defaults to `False`):
|
||||
Whether or not to return the attentions tensors of all attention layers.
|
||||
initializer_factor (`float`, *optional*, defaults to 0.002):
|
||||
A factor for initializing all weight matrices.
|
||||
output_router_logits (`bool`, *optional*, default to `False`):
|
||||
Whether or not to return the router logits of all experts.
|
||||
use_cache (`bool`, *optional*, defaults to `True`):
|
||||
Whether or not the model should return the last key/values attentions (not used by all models)
|
||||
"""
|
||||
|
||||
model_type = "gptsan-japanese"
|
||||
keys_to_ignore_at_inference = [
|
||||
"past_key_values",
|
||||
]
|
||||
attribute_map = {
|
||||
"hidden_size": "d_model",
|
||||
"num_attention_heads": "num_heads",
|
||||
"num_hidden_layers": "num_layers",
|
||||
}
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
vocab_size=36000,
|
||||
max_position_embeddings=1280,
|
||||
d_model=1024,
|
||||
d_ff=8192,
|
||||
d_ext=4096,
|
||||
d_spout=128,
|
||||
num_switch_layers=10,
|
||||
num_ext_layers=0,
|
||||
num_heads=16,
|
||||
num_experts=16,
|
||||
expert_capacity=128,
|
||||
dropout_rate=0.0,
|
||||
layer_norm_epsilon=1e-5,
|
||||
router_bias=False,
|
||||
router_jitter_noise=0.0,
|
||||
router_dtype="float32",
|
||||
router_ignore_padding_tokens=False,
|
||||
output_hidden_states=False,
|
||||
output_attentions=False,
|
||||
initializer_factor=0.002,
|
||||
output_router_logits=False,
|
||||
use_cache=True,
|
||||
separator_token_id=35998,
|
||||
pad_token_id=35995,
|
||||
eos_token_id=35999,
|
||||
**kwargs,
|
||||
):
|
||||
self.vocab_size = vocab_size
|
||||
self.max_position_embeddings = max_position_embeddings
|
||||
self.d_model = d_model
|
||||
self.d_ff = d_ff
|
||||
self.d_ext = d_ext
|
||||
self.d_spout = d_spout
|
||||
self.num_switch_layers = num_switch_layers
|
||||
self.num_ext_layers = num_ext_layers
|
||||
self.num_layers = num_switch_layers + num_ext_layers
|
||||
self.num_heads = num_heads
|
||||
self.num_experts = num_experts
|
||||
self.expert_capacity = expert_capacity
|
||||
self.dropout_rate = dropout_rate
|
||||
self.layer_norm_epsilon = layer_norm_epsilon
|
||||
self.router_bias = router_bias
|
||||
self.router_jitter_noise = router_jitter_noise
|
||||
self.router_dtype = router_dtype
|
||||
self.router_ignore_padding_tokens = router_ignore_padding_tokens
|
||||
self.output_hidden_states = output_hidden_states
|
||||
self.output_attentions = output_attentions
|
||||
self.initializer_factor = initializer_factor
|
||||
self.output_router_logits = output_router_logits
|
||||
self.use_cache = use_cache
|
||||
|
||||
super().__init__(
|
||||
separator_token_id=separator_token_id,
|
||||
pad_token_id=pad_token_id,
|
||||
eos_token_id=eos_token_id,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
|
||||
__all__ = ["GPTSanJapaneseConfig"]
|
||||
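# --- Illustrative sketch (not part of the original file) ------------------------------------------
# The configuration above derives num_layers as num_switch_layers + num_ext_layers and remaps
# hidden_size / num_attention_heads / num_hidden_layers onto d_model / num_heads / num_layers via
# attribute_map. A minimal instantiation with the documented defaults, assuming a transformers
# version that still ships the deprecated class:
config = GPTSanJapaneseConfig(d_model=1024, num_switch_layers=10, num_ext_layers=0, num_experts=16)
assert config.num_layers == 10            # num_switch_layers + num_ext_layers
assert config.hidden_size == 1024         # attribute_map: hidden_size -> d_model
assert config.num_attention_heads == 16   # attribute_map: num_attention_heads -> num_heads (default 16)
# -----------------------------------------------------------------------------------------------------
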
@@ -1,181 +0,0 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2023 The HuggingFace Inc. team.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
"""Convert GPTSANJapanese checkpoints from the original repository to pytorch model."""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
from collections import OrderedDict
|
||||
|
||||
import numpy as np
|
||||
import tensorflow as tf
|
||||
import torch
|
||||
|
||||
|
||||
def convert_tf_gptsan_to_pt(args):
|
||||
parameter_file = os.path.join(args.tf_model_dir, "parameters.json")
|
||||
params = json.loads(open(parameter_file).read())
|
||||
if not params:
|
||||
raise ValueError(
|
||||
f"It seems that the json file at {parameter_file} is empty. Make sure you have a correct json file."
|
||||
)
|
||||
if not args.output.endswith(".pt"):
|
||||
args.output = args.output + ".pt"
|
||||
new_state = OrderedDict()
|
||||
with tf.device("/CPU:0"):
|
||||
reader = tf.train.load_checkpoint(args.tf_model_dir)
|
||||
shapes = reader.get_variable_to_shape_map()
|
||||
for key_name in shapes:
|
||||
vnp = reader.get_tensor(key_name).astype(np.float16)
|
||||
if key_name.endswith("/adam_m") or key_name.endswith("/adam_v"):
|
||||
continue
|
||||
if key_name.startswith("pasts/"):
|
||||
if key_name.startswith("pasts/mlp"):
|
||||
player = int(key_name[9])
|
||||
elif key_name.startswith("pasts/out"):
|
||||
player = 8
|
||||
name = "model.sqout.%d.weight" % (player * 2) # enter to nn.Sequential with Tanh, so 2 at a time
|
||||
state = vnp.transpose([1, 0]).copy() # Mesh-Tensorflow is a diagonal matrix
|
||||
new_state[name] = torch.tensor(state)
|
||||
elif key_name.startswith("model/moe"):
|
||||
player = int(key_name[9:].split("/")[0])
|
||||
if key_name.endswith("/switch_gating/kernel"):
|
||||
name = "model.blocks.%d.feed_forward.mlp.router.classifier.weight" % player
|
||||
state = vnp.transpose([1, 0]).copy() # Mesh-Tensorflow is a diagonal matrix
|
||||
new_state[name] = torch.tensor(state)
|
||||
elif key_name.endswith("/softmlp/kernel"):
|
||||
name = "model.blocks.%d.feed_forward.soft_bypass_mlp.weight" % player
|
||||
state = vnp.transpose([1, 0]).copy() # Mesh-Tensorflow is a diagonal matrix
|
||||
new_state[name] = torch.tensor(state)
|
||||
elif key_name.endswith("/wo/kernel") or key_name.endswith("/wi/kernel"):
|
||||
nlayer = key_name[-9:-7]
|
||||
for i in range(16):
|
||||
name = "model.blocks.%d.feed_forward.mlp.experts.expert_%d.%s.weight" % (player, i, nlayer)
|
||||
state = (
|
||||
vnp[i].transpose([1, 0]).copy()
|
||||
) # In Mesh-Tensorflow, it is one array, so it is divided
|
||||
new_state[name] = torch.tensor(state)
|
||||
elif key_name.startswith("model/mlp"):
|
||||
player = int(key_name[9:].split("/")[0])
|
||||
if key_name.endswith("/p1/kernel"):
|
||||
name = "model.blocks.%d.feed_forward.mlp.wi.weight" % player
|
||||
state = vnp.transpose([1, 0]).copy() # Mesh-Tensorflow is a diagonal matrix
|
||||
new_state[name] = torch.tensor(state)
|
||||
elif key_name.endswith("/p1/bias"):
|
||||
name = "model.blocks.%d.feed_forward.mlp.wi.bias" % player
|
||||
state = vnp.copy() # same because it is one dimensional
|
||||
new_state[name] = torch.tensor(state)
|
||||
elif key_name.endswith("/p2/kernel"):
|
||||
name = "model.blocks.%d.feed_forward.mlp.wo.weight" % player
|
||||
state = vnp.transpose([1, 0]).copy() # Mesh-Tensorflow is a diagonal matrix
|
||||
new_state[name] = torch.tensor(state)
|
||||
elif key_name.endswith("/p2/bias"):
|
||||
name = "model.blocks.%d.feed_forward.mlp.wo.bias" % player
|
||||
state = vnp.copy() # same because it is one dimensional
|
||||
                new_state[name] = torch.tensor(state)
        elif key_name.startswith("model/ln"):
            player = int(key_name[8:].split("/")[0])
            if key_name.endswith("/b"):
                name = "model.blocks.%d.feed_forward.norm.bias" % player
                state = vnp.copy()  # same because it is one dimensional
                new_state[name] = torch.tensor(state)
            elif key_name.endswith("/g"):
                name = "model.blocks.%d.feed_forward.norm.weight" % player
                state = vnp.copy()  # same because it is one dimensional
                new_state[name] = torch.tensor(state)
        elif key_name.startswith("model/att"):
            player = int(key_name[9:].split("/")[0])
            if key_name.endswith("/qkv/kernel"):
                state = vnp.copy()  # Compute same dimension as Mesh-tensorflow using einsum
                state_q = state[:, 0, :, :]
                state_k = state[:, 1, :, :]
                state_v = state[:, 2, :, :]
                state_q = (
                    state_q.reshape([state_q.shape[0], state_q.shape[1] * state_q.shape[2]])
                    .transpose([1, 0])
                    .copy()
                )  # Mesh-Tensorflow is a diagonal matrix
                state_k = (
                    state_k.reshape([state_k.shape[0], state_k.shape[1] * state_k.shape[2]])
                    .transpose([1, 0])
                    .copy()
                )  # Mesh-Tensorflow is a diagonal matrix
                state_v = (
                    state_v.reshape([state_v.shape[0], state_v.shape[1] * state_v.shape[2]])
                    .transpose([1, 0])
                    .copy()
                )  # Mesh-Tensorflow is a diagonal matrix
                name = "model.blocks.%d.self_attn.self_attn.q_proj.weight" % player
                new_state[name] = torch.tensor(state_q)
                name = "model.blocks.%d.self_attn.self_attn.k_proj.weight" % player
                new_state[name] = torch.tensor(state_k)
                name = "model.blocks.%d.self_attn.self_attn.v_proj.weight" % player
                new_state[name] = torch.tensor(state_v)
            elif key_name.endswith("/o/kernel"):
                name = "model.blocks.%d.self_attn.self_attn.out_proj.weight" % player
                state = (
                    vnp.reshape([vnp.shape[0] * vnp.shape[1], vnp.shape[2]]).transpose([1, 0]).copy()
                )  # Mesh-Tensorflow is a diagonal matrix
                new_state[name] = torch.tensor(state)
        elif key_name.startswith("model/an"):
            player = int(key_name[8:].split("/")[0])
            if key_name.endswith("/b"):
                name = "model.blocks.%d.self_attn.norm.bias" % player
                state = vnp.copy()  # same because it is one dimensional
                new_state[name] = torch.tensor(state)
            elif key_name.endswith("/g"):
                name = "model.blocks.%d.self_attn.norm.weight" % player
                state = vnp.copy()  # same because it is one dimensional
                new_state[name] = torch.tensor(state)
        elif (
            key_name.startswith("model/wte")
            or key_name.startswith("model/wpe")
            or key_name.startswith("model/ete")
        ):
            nlayer = {"wte": "embed_tokens", "wpe": "position_embeddings", "ete": "extra_position_embeddings"}[
                key_name[-3:]
            ]
            name = "model.%s.weight" % nlayer
            state = vnp.copy()  # same in embedded
            new_state[name] = torch.tensor(state)
            if key_name.startswith("model/wte"):
                name = "lm_head.weight"
                state = vnp.copy()  # same in embedded
                new_state[name] = torch.tensor(state)
        elif key_name.startswith("model/wob"):
            name = "final_logits_bias"
            state = vnp.copy()  # same in embedded
            state = state.reshape((1, -1))
            new_state[name] = torch.tensor(state)
        elif key_name == "model/dense/kernel":
            name = "model.last_project.weight"
            state = vnp.transpose([1, 0]).copy()  # Mesh-Tensorflow is a diagonal matrix
            new_state[name] = torch.tensor(state)
        elif key_name == "model/dense_1/bias":
            name = "model.last_project.bias"
            state = vnp.copy()  # same because it is one dimensional
            new_state[name] = torch.tensor(state)
    torch.save(new_state, args.output)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="model converter.", formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    parser.add_argument("--tf_model_dir", metavar="PATH", type=str, required=True, help="import model")
    parser.add_argument("--output", metavar="PATH", type=str, required=True, help="output model")
    args = parser.parse_args()
    convert_tf_gptsan_to_pt(args)
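The converter above is driven entirely by `argparse`, so it can also be exercised programmatically. A minimal sketch, assuming the function has been imported from the conversion script and that both checkpoint paths below are placeholders:

```python
# Hypothetical driver for convert_tf_gptsan_to_pt; both paths are placeholders.
import argparse

args = argparse.Namespace(tf_model_dir="./gptsan_tf_checkpoint", output="./gptsan_japanese.pt")
convert_tf_gptsan_to_pt(args)  # remaps the Mesh-TensorFlow parameter names and saves a PyTorch state dict
```

Equivalently, from the command line (script filename assumed): `python convert_gptsan_tf_checkpoint_to_pytorch.py --tf_model_dir ./gptsan_tf_checkpoint --output ./gptsan_japanese.pt`.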
File diff suppressed because it is too large
@@ -1,518 +0,0 @@
# coding=utf-8
# Copyright 2023 HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tokenization classes for GPTSANJapanese."""

import collections
import json
import os
import re
import sys
from typing import Optional, Union

import numpy as np

from ....tokenization_utils import PreTrainedTokenizer
from ....tokenization_utils_base import (
    BatchEncoding,
    PreTokenizedInput,
    PreTokenizedInputPair,
    TextInput,
    TextInputPair,
    TruncationStrategy,
)
from ....utils import PaddingStrategy, logging


logger = logging.get_logger(__name__)

VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt", "emoji_file": "emoji.json"}


def load_vocab_and_emoji(vocab_file, emoji_file):
    """Loads a vocabulary file and emoji file into a dictionary."""
    with open(emoji_file, "r", encoding="utf-8") as f:
        emoji = json.loads(f.read())

    vocab = collections.OrderedDict()
    raw_vocab = collections.OrderedDict()
    ids_to_tokens = collections.OrderedDict()
    with open(vocab_file, "r", encoding="utf-8") as f:
        token = f.readlines()
    token = [[t.rstrip("\n")] if (t == ",\n" or "," not in t) else t.rstrip("\n").split(",") for t in token]
    for idx, b in enumerate(token):
        ids_to_tokens[idx] = b
        raw_vocab[",".join(b)] = idx
        for wd in b:
            vocab[wd] = idx

    return vocab, raw_vocab, ids_to_tokens, emoji


class GPTSanJapaneseTokenizer(PreTrainedTokenizer):
    """
    This tokenizer is based on GPTNeoXJapaneseTokenizer and has the following modifications:
    - Decoding byte0~byte255 tokens correctly
    - Added bagofword token handling
    - Return token_type_ids for Prefix-LM model
    The bagofword token represents a repetition of the previous token and is converted to 3 consecutive tokens when
    decoding. In addition, the original Japanese special Sub-Word-Encoding has been released in this repository
    (https://github.com/tanreinama/Japanese-BPEEncoder_V2). The token_type_ids is a mask indicating the prefix input
    position of the Prefix-LM model. To specify a prefix position, specify a prefix input for prefix_text, or specify a
    sentence of the prefix part and the part after it as a text pair of batch input.

    Example:

    ```python
    >>> from transformers import GPTSanJapaneseTokenizer

    >>> tokenizer = GPTSanJapaneseTokenizer.from_pretrained("Tanrei/GPTSAN-japanese")
    >>> # You can confirm both 慶応 and 慶應 are encoded to 17750
    >>> tokenizer("吾輩は猫である🐯。実は慶応(慶應)大学出身")["input_ids"]
    [35993, 35998, 34347, 31459, 30647, 31448, 25, 30659, 35729, 35676, 32417, 30647, 17750, 35589, 17750, 35590, 321, 1281]

    >>> # Both 慶応 and 慶應 are decoded to 慶応
    >>> tokenizer.decode(tokenizer("吾輩は猫である🐯。実は慶応(慶應)大学出身")["input_ids"])
    '吾輩は猫である🐯。実は慶応(慶応)大学出身'
    ```

    Example for Prefix-LM:

    ```python
    >>> from transformers import GPTSanJapaneseTokenizer

    >>> tokenizer = GPTSanJapaneseTokenizer.from_pretrained("Tanrei/GPTSAN-japanese")
    >>> tokenizer("実は慶応(慶應)大学出身", prefix_text="吾輩は猫である🐯。")["input_ids"]
    [35993, 34347, 31459, 30647, 31448, 25, 30659, 35729, 35676, 35998, 32417, 30647, 17750, 35589, 17750, 35590, 321, 1281]

    >>> # Mask for Prefix-LM inputs
    >>> tokenizer("実は慶応(慶應)大学出身", prefix_text="吾輩は猫である🐯。")["token_type_ids"]
    [1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]
    ```

    Example for batch encode:

    ```python
    >>> from transformers import GPTSanJapaneseTokenizer

    >>> tokenizer = GPTSanJapaneseTokenizer.from_pretrained("Tanrei/GPTSAN-japanese")
    >>> tokenizer([["武田信玄", "は、"], ["織田信長", "の配下の、"]], padding=True)["input_ids"]
    [[35993, 35998, 8640, 25948, 35993, 35998, 30647, 35675, 35999, 35999], [35993, 35998, 10382, 9868, 35993, 35998, 30646, 9459, 30646, 35675]]

    >>> # Mask for Prefix-LM inputs
    >>> tokenizer([["武田信玄", "は、"], ["織田信長", "の配下の、"]], padding=True)["token_type_ids"]
    [[1, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0, 0]]

    >>> # Mask for padding
    >>> tokenizer([["武田信玄", "は、"], ["織田信長", "の配下の、"]], padding=True)["attention_mask"]
    [[1, 1, 1, 1, 1, 1, 1, 1, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]
    ```

    Args:
        vocab_file (`str`):
            File containing the vocabulary.
        emoji_file (`str`):
            File containing the emoji.
        unk_token (`str`, *optional*, defaults to `"<|nottoken|>"`):
            The token used for unknown characters.
        pad_token (`str`, *optional*, defaults to `"<|separator|>"`):
            The token used for padding.
        bos_token (`str`, *optional*, defaults to `"<|startoftext|>"`):
            The beginning of sequence token.
        eos_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
            The end of sequence token.
        sep_token (`str`, *optional*, defaults to `"<|segmenter|>"`):
            A special token that separates the prefix part from the general input part.
        do_clean_text (`bool`, *optional*, defaults to `False`):
            Whether or not to clean text for URL, EMAIL, TEL, Japanese DATE and Japanese PRICE.
    """

    vocab_files_names = VOCAB_FILES_NAMES
    model_input_names = ["input_ids", "attention_mask", "token_type_ids"]

    def __init__(
        self,
        vocab_file,
        emoji_file,
        unk_token="<|nottoken|>",
        pad_token="<|separator|>",
        bos_token="<|startoftext|>",
        eos_token="<|endoftext|>",
        sep_token="<|segmenter|>",
        do_clean_text=False,
        **kwargs,
    ):
        if not os.path.isfile(vocab_file):
            raise ValueError(
                f"Can't find a vocabulary file at path '{vocab_file}'. To load the vocabulary from a Google pretrained"
                " model use `tokenizer = GPTSanJapaneseTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`"
            )
        if not os.path.isfile(emoji_file):
            raise ValueError(
                f"Can't find an emoji file at path '{emoji_file}'. To load the emoji information from a Google"
                " pretrained model use `tokenizer = GPTSanJapaneseTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`"
            )
        self.do_clean_text = do_clean_text
        self.vocab, self.raw_vocab, self.ids_to_tokens, self.emoji = load_vocab_and_emoji(vocab_file, emoji_file)
        self.subword_tokenizer = SubWordJapaneseTokenizer(
            vocab=self.vocab, ids_to_tokens=self.ids_to_tokens, emoji=self.emoji
        )

        super().__init__(
            unk_token=unk_token,
            pad_token=pad_token,
            bos_token=bos_token,
            eos_token=eos_token,
            sep_token=sep_token,
            do_clean_text=do_clean_text,
            **kwargs,
        )

    @property
    def vocab_size(self):
        # self.vocab contains support for character fluctuation unique to Japanese, and therefore has more entries
        # than self.raw_vocab
        return len(self.raw_vocab)

    def get_vocab(self):
        return dict(self.raw_vocab, **self.added_tokens_encoder)

    def _tokenize(self, text):
        return self.subword_tokenizer.tokenize(text, clean=self.do_clean_text)

    def _convert_token_to_id(self, token):
        """Converts a token (str) into an id using the vocab."""
        return self.vocab.get(token, self.vocab.get(self.unk_token))

    def _convert_id_to_token(self, index):
        """Converts an index (integer) into a token (str) using the vocab."""
        return self.subword_tokenizer.convert_id_to_token(index)

    def convert_tokens_to_string(self, tokens):
        """Converts a sequence of tokens (string) into a single string."""
        words = []
        byte_tokens = []
        for word in tokens:
            if word[:6] == "<|byte" and word[-2:] == "|>":
                byte_tokens.append(int(word[6:-2]))
            else:
                if len(byte_tokens) > 0:
                    words.append(bytearray(byte_tokens).decode("utf-8", errors="replace"))
                    byte_tokens = []
                if word[:7] == "<|emoji" and word[-2:] == "|>":
                    words.append(self.emoji["emoji_inv"][word])
                elif word == "<SP>":
                    words.append(" ")
                elif word == "<BR>":
                    words.append("\n")
                elif word == "<TAB>":
                    words.append("\t")
                elif word == "<BLOCK>":
                    words.append("▀")
                elif word == "<KIGOU>":
                    words.append("ǀ")
                elif word == "<U2000U2BFF>":
                    words.append("‖")
                elif word == "<|bagoftoken|>":
                    if len(words) > 0:
                        words.append(words[-1])
                        words.append(words[-1])
                        words.append(words[-1])
                elif word.startswith("<|") and word.endswith("|>"):
                    words.append("")
                else:
                    words.append(word)
        if len(byte_tokens) > 0:
            words.append(bytearray(byte_tokens).decode("utf-8", errors="replace"))
        text = "".join(words)
        return text

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
        index = 0
        if os.path.isdir(save_directory):
            vocab_file = os.path.join(
                save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
            )
            emoji_file = os.path.join(
                save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["emoji_file"]
            )
        else:
            vocab_file = (
                (filename_prefix + "-" if filename_prefix else "") + save_directory + VOCAB_FILES_NAMES["vocab_file"]
            )
            emoji_file = (
                (filename_prefix + "-" if filename_prefix else "") + save_directory + VOCAB_FILES_NAMES["emoji_file"]
            )
        with open(vocab_file, "w", encoding="utf-8") as writer:
            for token_index, token in self.ids_to_tokens.items():
                if index != token_index:
                    logger.warning(
                        f"Saving vocabulary to {vocab_file}: vocabulary indices are not consecutive."
                        " Please check that the vocabulary is not corrupted!"
                    )
                    index = token_index
                writer.write(",".join(token) + "\n")
                index += 1
        with open(emoji_file, "w", encoding="utf-8") as writer:
            json.dump(self.emoji, writer)
        return vocab_file, emoji_file

    def create_token_type_ids_from_sequences(
        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
    ) -> list[int]:
        # docstyle-ignore
        """
        The tokenizer returns token_type_ids as separators between the Prefix part and the rest.
        token_type_ids is 1 for the Prefix part and 0 for the rest of the token.

        Example:
        ```python
        >>> from transformers import GPTSanJapaneseTokenizer

        >>> tokenizer = GPTSanJapaneseTokenizer.from_pretrained("Tanrei/GPTSAN-japanese")
        >>> x_token = tokenizer("アイウエ")
        >>> # input_ids:      | SOT | SEG | ア | イ | ウ | エ |
        >>> # token_type_ids: | 1   | 0   | 0  | 0  | 0  | 0  |

        >>> x_token = tokenizer("", prefix_text="アイウエ")
        >>> # input_ids:      | SOT | ア | イ | ウ | エ | SEG |
        >>> # token_type_ids: | 1   | 1  | 1  | 1  | 1  | 0   |

        >>> x_token = tokenizer("ウエ", prefix_text="アイ")
        >>> # input_ids:      | SOT | ア | イ | SEG | ウ | エ |
        >>> # token_type_ids: | 1   | 1  | 1  | 0   | 0  | 0  |
        ```"""
        prefix_len = 0
        if self.sep_token in self.vocab:
            segid = self.vocab[self.sep_token]
            if segid in token_ids_0:
                prefix_len = token_ids_0.index(segid)
        if token_ids_1 is None:
            total_len = len(token_ids_0)
        else:
            total_len = len(token_ids_0 + token_ids_1)
        return prefix_len * [1] + (total_len - prefix_len) * [0]

    def prepare_for_tokenization(self, text, prefix_text=None, add_sep_token=None, **kwargs):
        # GPTSAN inserts extra SEP tokens in Prefix-LM in addition to SOT for text generation.
        # SOT at the beginning of the text, and SEP at the separator between the Prefix part and the rest.
        if add_sep_token is None:
            add_sep_token = self.sep_token not in text  # insert SEP unless the un-prefix position is already marked
        prepared = self.bos_token if self.bos_token in self.vocab else ""
        prepared += prefix_text if prefix_text is not None else ""
        if add_sep_token:
            prepared += self.sep_token if self.sep_token in self.vocab else ""
        prepared += text
        return (prepared, kwargs)

    def _batch_encode_plus(
        self,
        batch_text_or_text_pairs: Union[
            list[TextInput], list[TextInputPair], list[PreTokenizedInput], list[PreTokenizedInputPair]
        ],
        add_special_tokens: bool = True,
        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
        truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
        max_length: Optional[int] = None,
        stride: int = 0,
        is_split_into_words: bool = False,
        pad_to_multiple_of: Optional[int] = None,
        return_tensors: Optional[str] = None,
        return_token_type_ids: Optional[bool] = None,
        return_attention_mask: Optional[bool] = None,
        return_overflowing_tokens: bool = False,
        return_special_tokens_mask: bool = False,
        return_offsets_mapping: bool = False,
        return_length: bool = False,
        verbose: bool = True,
        **kwargs,
    ) -> BatchEncoding:
        # This tokenizer converts input text pairs into Prefix input and subsequent input
        if isinstance(batch_text_or_text_pairs[0], tuple) or isinstance(tuple(batch_text_or_text_pairs[0]), list):
            # As a single text with an explicit un-prefix position
            batch_prefix_texts = []
            for pref, txt in batch_text_or_text_pairs:
                batch_prefix_texts.append(pref + self.sep_token + txt)
            batch_text_or_text_pairs = batch_prefix_texts

        return super()._batch_encode_plus(
            batch_text_or_text_pairs,
            add_special_tokens,
            padding_strategy,
            truncation_strategy,
            max_length,
            stride,
            is_split_into_words,
            pad_to_multiple_of,
            return_tensors,
            return_token_type_ids,
            return_attention_mask,
            return_overflowing_tokens,
            return_special_tokens_mask,
            return_offsets_mapping,
            return_length,
            verbose,
            **kwargs,
        )


class SubWordJapaneseTokenizer:
    """
    This tokenizer is based on GPTNeoXJapaneseTokenizer and has the following modifications:
    - Decoding byte0~byte255 tokens correctly
    - Added bagofword token handling

    https://github.com/tanreinama/Japanese-BPEEncoder_V2 This tokenizer class is under MIT License according to the
    original repository.

    MIT License

    Copyright (c) 2020 tanreinama

    Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
    documentation files (the "Software"), to deal in the Software without restriction, including without limitation the
    rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to
    permit persons to whom the Software is furnished to do so, subject to the following conditions:

    The above copyright notice and this permission notice shall be included in all copies or substantial portions of
    the Software.

    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO
    THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
    AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
    TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
    SOFTWARE.
    """

    def __init__(self, vocab, ids_to_tokens, emoji):
        self.vocab = vocab  # same as swe
        self.ids_to_tokens = ids_to_tokens  # same as bpe
        self.emoji = emoji
        self.maxlen = np.max([len(w) for w in self.vocab])
        self.content_repatter1 = re.compile(r"(https?|ftp)(:\/\/[-_\.!~*\'()a-zA-Z0-9;\/?:\@&=\+$,%#]+)")
        self.content_repatter2 = re.compile(r"[A-Za-z0-9\._+]*@[\-_0-9A-Za-z]+(\.[A-Za-z]+)*")
        self.content_repatter3 = re.compile(r"[\(]{0,1}[0-9]{2,4}[\)\-\(]{0,1}[0-9]{2,4}[\)\-]{0,1}[0-9]{3,4}")
        self.content_repatter4 = re.compile(
            r"([12]\d{3}[/\-年])*(0?[1-9]|1[0-2])[/\-月]((0?[1-9]|[12][0-9]|3[01])日?)*(\d{1,2}|:|\d{1,2}時|\d{1,2}分|\(日\)|\(月\)|\(火\)|\(水\)|\(木\)|\(金\)|\(土\)|㈰|㈪|㈫|㈬|㈭|㈮|㈯)*"
        )
        self.content_repatter5 = re.compile(
            r"(明治|大正|昭和|平成|令和|㍾|㍽|㍼|㍻|\u32ff)\d{1,2}年(0?[1-9]|1[0-2])月(0?[1-9]|[12][0-9]|3[01])日(\d{1,2}|:|\d{1,2}時|\d{1,2}分|\(日\)|\(月\)|\(火\)|\(水\)|\(木\)|\(金\)|\(土\)|㈰|㈪|㈫|㈬|㈭|㈮|㈯)*"
        )
        # The original version of this regex displays catastrophic backtracking behaviour. We avoid this using
        # possessive quantifiers in Py >= 3.11. In versions below this, we avoid the vulnerability using a slightly
        # different regex that should generally have the same behaviour in most non-pathological cases.
        if sys.version_info >= (3, 11):
            self.content_repatter6 = re.compile(
                r"(?:\d,\d{3}|[\d億])*+"
                r"(?:\d,\d{3}|[\d万])*+"
                r"(?:\d,\d{3}|[\d千])*+"
                r"(?:千円|万円|千万円|円|千ドル|万ドル|千万ドル|ドル|千ユーロ|万ユーロ|千万ユーロ|ユーロ)+"
                r"(?:\(税込\)|\(税抜\)|\+tax)*"
            )
        else:
            self.content_repatter6 = re.compile(
                r"(?:\d,\d{3}|[\d億万千])*"
                r"(?:千円|万円|千万円|円|千ドル|万ドル|千万ドル|ドル|千ユーロ|万ユーロ|千万ユーロ|ユーロ)+"
                r"(?:\(税込\)|\(税抜\)|\+tax)*"
            )
        keisen = "─━│┃┄┅┆┇┈┉┊┋┌┍┎┏┐┑┒┓└┕┖┗┘┙┚┛├┝┞┟┠┡┢┣┤┥┦┧┨┩┪┫┬┭┮┯┰┱┲┳┴┵┶┷┸┹┺┻┼┽┾┿╀╁╂╃╄╅╆╇╈╉╊╋╌╍╎╏═║╒╓╔╕╖╗╘╙╚╛╜╝╞╟╠╡╢╣╤╥╦╧╨╩╪╫╬╭╮╯╰╱╲╳╴╵╶╷╸╹╺╻╼╽╾╿"
        blocks = "▀▁▂▃▄▅▆▇█▉▊▋▌▍▎▏▐░▒▓▔▕▖▗▘▙▚▛▜▝▞▟"
        self.content_trans1 = str.maketrans(dict.fromkeys(keisen + blocks, "<BLOCK>"))

    def __len__(self):
        return len(self.ids_to_tokens)

    def clean_text(self, content):
        content = self.content_repatter1.sub("<URL>", content)
        content = self.content_repatter2.sub("<EMAIL>", content)
        content = self.content_repatter3.sub("<TEL>", content)
        content = self.content_repatter4.sub("<DATE>", content)
        content = self.content_repatter5.sub("<DATE>", content)
        content = self.content_repatter6.sub("<PRICE>", content)
        content = content.translate(self.content_trans1)
        while "<BLOCK><BLOCK>" in content:
            content = content.replace("<BLOCK><BLOCK>", "<BLOCK>")
        return content

    def tokenize(self, text, clean=False):
        text = text.replace("　", "<SP>")
        text = text.replace(" ", "<SP>")
        text = text.replace("\r\n", "<BR>")
        text = text.replace("\n", "<BR>")
        text = text.replace("\r", "<BR>")
        text = text.replace("\t", "<TAB>")
        text = text.replace("—", "ー")
        text = text.replace("−", "ー")
        for k, v in self.emoji["emoji"].items():
            if k in text:
                text = text.replace(k, v)
        if clean:
            text = self.clean_text(text)

        def check_simbol(x):
            e = x.encode()
            if len(x) == 1 and len(e) == 2:
                c = (int(e[0]) << 8) + int(e[1])
                if (
                    (c >= 0xC2A1 and c <= 0xC2BF)
                    or (c >= 0xC780 and c <= 0xC783)
                    or (c >= 0xCAB9 and c <= 0xCBBF)
                    or (c >= 0xCC80 and c <= 0xCDA2)
                ):
                    return True
            return False

        def checku2e(x):
            e = x.encode()
            if len(x) == 1 and len(e) == 3:
                c = (int(e[0]) << 16) + (int(e[1]) << 8) + int(e[2])
                if c >= 0xE28080 and c <= 0xE2B07F:
                    return True
            return False

        pos = 0
        result = []
        while pos < len(text):
            end = min(len(text), pos + self.maxlen + 1) if text[pos] == "<" else pos + 3
            candidates = []  # (token_id, token, pos)
            for e in range(end, pos, -1):
                wd = text[pos:e]
                if wd in self.vocab:
                    if wd[0] == "<" and len(wd) > 2:
                        candidates = [(self.vocab[wd], wd, e)]
                        break
                    else:
                        candidates.append((self.vocab[wd], wd, e))
            if len(candidates) > 0:
                # the smallest token_id is adopted
                _, wd, e = min(candidates, key=lambda x: x[0])
                result.append(wd)
                pos = e
            else:
                end = pos + 1
                wd = text[pos:end]
                if check_simbol(wd):
                    result.append("<KIGOU>")
                elif checku2e(wd):
                    result.append("<U2000U2BFF>")
                else:
                    for i in wd.encode("utf-8"):
                        result.append("<|byte%d|>" % i)
                pos = end
        return result

    def convert_id_to_token(self, index):
        return self.ids_to_tokens[index][0]


__all__ = ["GPTSanJapaneseTokenizer"]
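The `<|byteN|>` handling in `convert_tokens_to_string` above is what makes byte0~byte255 tokens decode correctly: consecutive byte tokens are buffered and decoded together as one UTF-8 sequence. A minimal, self-contained sketch of just that mechanism (it does not use the tokenizer class itself):

```python
# Standalone illustration of the byte-fallback decoding used above.
byte_tokens = ["<|byte%d|>" % b for b in "あ".encode("utf-8")]  # ['<|byte227|>', '<|byte129|>', '<|byte130|>']
byte_values = [int(t[6:-2]) for t in byte_tokens]               # strip the '<|byte' prefix and '|>' suffix
assert bytearray(byte_values).decode("utf-8", errors="replace") == "あ"
```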
@@ -1,27 +0,0 @@
# Copyright 2020 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import TYPE_CHECKING

from ....utils import _LazyModule
from ....utils.import_utils import define_import_structure


if TYPE_CHECKING:
    from .configuration_graphormer import *
    from .modeling_graphormer import *
else:
    import sys

    _file = globals()["__file__"]
    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
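The `_LazyModule` indirection above keeps the top-level package import cheap: submodules are only imported the first time one of their attributes is accessed. A rough, illustrative analogue of that pattern (not the actual `_LazyModule` implementation, which also handles import structures and error reporting):

```python
# Minimal lazy-module sketch, for illustration only.
import importlib
import types


class TinyLazyModule(types.ModuleType):
    def __init__(self, name, submodules):
        super().__init__(name)
        self._submodules = set(submodules)

    def __getattr__(self, attr):
        # Import the submodule only on first access.
        if attr in self._submodules:
            return importlib.import_module(f"{self.__name__}.{attr}")
        raise AttributeError(f"module {self.__name__!r} has no attribute {attr!r}")
```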
@@ -1,107 +0,0 @@
# Copyright (c) Microsoft Corporation and HuggingFace
# Licensed under the MIT License.

import cython

cimport numpy
from cython.parallel cimport parallel, prange

import numpy as np


# Reduce this number if matrices are too big for large graphs
UNREACHABLE_NODE_DISTANCE = 510


def floyd_warshall(adjacency_matrix):
    """
    Applies the Floyd-Warshall algorithm to the adjacency matrix, to compute the
    shortest paths distance between all nodes, up to UNREACHABLE_NODE_DISTANCE.
    """
    (nrows, ncols) = adjacency_matrix.shape
    assert nrows == ncols
    cdef unsigned int n = nrows

    adj_mat_copy = adjacency_matrix.astype(np.int32, order='C', casting='safe', copy=True)
    assert adj_mat_copy.flags['C_CONTIGUOUS']
    cdef numpy.ndarray[numpy.int32_t, ndim=2, mode='c'] M = adj_mat_copy
    cdef numpy.ndarray[numpy.int32_t, ndim=2, mode='c'] path = -1 * np.ones([n, n], dtype=np.int32)

    cdef unsigned int i, j, k
    cdef numpy.int32_t M_ij, M_ik, cost_ikkj
    cdef numpy.int32_t* M_ptr = &M[0,0]
    cdef numpy.int32_t* M_i_ptr
    cdef numpy.int32_t* M_k_ptr

    # set unreachable nodes distance to UNREACHABLE_NODE_DISTANCE
    for i in range(n):
        for j in range(n):
            if i == j:
                M[i][j] = 0
            elif M[i][j] == 0:
                M[i][j] = UNREACHABLE_NODE_DISTANCE

    # Floyd-Warshall main loop
    for k in range(n):
        M_k_ptr = M_ptr + n*k
        for i in range(n):
            M_i_ptr = M_ptr + n*i
            M_ik = M_i_ptr[k]
            for j in range(n):
                cost_ikkj = M_ik + M_k_ptr[j]
                M_ij = M_i_ptr[j]
                if M_ij > cost_ikkj:
                    M_i_ptr[j] = cost_ikkj
                    path[i][j] = k

    # set unreachable path to UNREACHABLE_NODE_DISTANCE
    for i in range(n):
        for j in range(n):
            if M[i][j] >= UNREACHABLE_NODE_DISTANCE:
                path[i][j] = UNREACHABLE_NODE_DISTANCE
                M[i][j] = UNREACHABLE_NODE_DISTANCE

    return M, path


def get_all_edges(path, i, j):
    """
    Recursive function to reconstruct the intermediate nodes on the shortest path between two nodes,
    from the `path` matrix produced by floyd_warshall.
    """
    cdef int k = path[i][j]
    if k == -1:
        return []
    else:
        return get_all_edges(path, i, k) + [k] + get_all_edges(path, k, j)


def gen_edge_input(max_dist, path, edge_feat):
    """
    Generates the full edge feature and adjacency matrix.
    Shape: num_nodes * num_nodes * max_distance_between_nodes * num_edge_features
    Dim 1 is the input node, dim 2 the output node of the edge, dim 3 the depth of the edge, dim 4 the feature
    """
    (nrows, ncols) = path.shape
    assert nrows == ncols
    cdef unsigned int n = nrows
    cdef unsigned int max_dist_copy = max_dist

    path_copy = path.astype(int, order='C', casting='safe', copy=True)
    edge_feat_copy = edge_feat.astype(int, order='C', casting='safe', copy=True)
    assert path_copy.flags['C_CONTIGUOUS']
    assert edge_feat_copy.flags['C_CONTIGUOUS']

    cdef numpy.ndarray[numpy.int32_t, ndim=4, mode='c'] edge_fea_all = -1 * np.ones([n, n, max_dist_copy, edge_feat.shape[-1]], dtype=np.int32)
    cdef unsigned int i, j, k, num_path, cur

    for i in range(n):
        for j in range(n):
            if i == j:
                continue
            if path_copy[i][j] == UNREACHABLE_NODE_DISTANCE:
                continue
            path = [i] + get_all_edges(path_copy, i, j) + [j]
            num_path = len(path) - 1
            for k in range(num_path):
                edge_fea_all[i, j, k, :] = edge_feat_copy[path[k], path[k+1], :]

    return edge_fea_all
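For reference, the distance computation that `floyd_warshall` above performs can be restated in a few lines of plain NumPy on a toy graph (illustration only; the Cython version additionally fills the `path` matrix of intermediate nodes, which `gen_edge_input` then consumes):

```python
import numpy as np

UNREACHABLE = 510  # mirrors UNREACHABLE_NODE_DISTANCE above
adj = np.array([[0, 1, 0],
                [1, 0, 1],
                [0, 1, 0]], dtype=np.int32)  # 3-node path graph 0-1-2
dist = np.where(adj == 0, UNREACHABLE, adj).astype(np.int32)
np.fill_diagonal(dist, 0)
for k in range(dist.shape[0]):
    # Relax every pair (i, j) through intermediate node k.
    dist = np.minimum(dist, dist[:, k:k + 1] + dist[k:k + 1, :])
print(dist)  # dist[0, 2] == 2: two hops from node 0 to node 2
```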
@@ -1,135 +0,0 @@
# Copyright (c) Microsoft Corporation and HuggingFace
# Licensed under the MIT License.

from collections.abc import Mapping
from typing import Any

import numpy as np
import torch

from ....utils import is_cython_available, requires_backends


if is_cython_available():
    import pyximport

    pyximport.install(setup_args={"include_dirs": np.get_include()})
    from . import algos_graphormer


def convert_to_single_emb(x, offset: int = 512):
    feature_num = x.shape[1] if len(x.shape) > 1 else 1
    feature_offset = 1 + np.arange(0, feature_num * offset, offset, dtype=np.int64)
    x = x + feature_offset
    return x


def preprocess_item(item, keep_features=True):
    requires_backends(preprocess_item, ["cython"])

    if keep_features and "edge_attr" in item:  # edge_attr
        edge_attr = np.asarray(item["edge_attr"], dtype=np.int64)
    else:
        edge_attr = np.ones((len(item["edge_index"][0]), 1), dtype=np.int64)  # same embedding for all

    if keep_features and "node_feat" in item:  # input_nodes
        node_feature = np.asarray(item["node_feat"], dtype=np.int64)
    else:
        node_feature = np.ones((item["num_nodes"], 1), dtype=np.int64)  # same embedding for all

    edge_index = np.asarray(item["edge_index"], dtype=np.int64)

    input_nodes = convert_to_single_emb(node_feature) + 1
    num_nodes = item["num_nodes"]

    if len(edge_attr.shape) == 1:
        edge_attr = edge_attr[:, None]
    attn_edge_type = np.zeros([num_nodes, num_nodes, edge_attr.shape[-1]], dtype=np.int64)
    attn_edge_type[edge_index[0], edge_index[1]] = convert_to_single_emb(edge_attr) + 1

    # node adj matrix [num_nodes, num_nodes] bool
    adj = np.zeros([num_nodes, num_nodes], dtype=bool)
    adj[edge_index[0], edge_index[1]] = True

    shortest_path_result, path = algos_graphormer.floyd_warshall(adj)
    max_dist = np.amax(shortest_path_result)

    input_edges = algos_graphormer.gen_edge_input(max_dist, path, attn_edge_type)
    attn_bias = np.zeros([num_nodes + 1, num_nodes + 1], dtype=np.single)  # with graph token

    # combine
    item["input_nodes"] = input_nodes + 1  # we shift all indices by one for padding
    item["attn_bias"] = attn_bias
    item["attn_edge_type"] = attn_edge_type
    item["spatial_pos"] = shortest_path_result.astype(np.int64) + 1  # we shift all indices by one for padding
    item["in_degree"] = np.sum(adj, axis=1).reshape(-1) + 1  # we shift all indices by one for padding
    item["out_degree"] = item["in_degree"]  # for undirected graph
    item["input_edges"] = input_edges + 1  # we shift all indices by one for padding
    if "labels" not in item:
        item["labels"] = item["y"]

    return item


class GraphormerDataCollator:
    def __init__(self, spatial_pos_max=20, on_the_fly_processing=False):
        if not is_cython_available():
            raise ImportError("Graphormer preprocessing needs Cython (pyximport)")

        self.spatial_pos_max = spatial_pos_max
        self.on_the_fly_processing = on_the_fly_processing

    def __call__(self, features: list[dict]) -> dict[str, Any]:
        if self.on_the_fly_processing:
            features = [preprocess_item(i) for i in features]

        if not isinstance(features[0], Mapping):
            features = [vars(f) for f in features]
        batch = {}

        max_node_num = max(len(i["input_nodes"]) for i in features)
        node_feat_size = len(features[0]["input_nodes"][0])
        edge_feat_size = len(features[0]["attn_edge_type"][0][0])
        max_dist = max(len(i["input_edges"][0][0]) for i in features)
        edge_input_size = len(features[0]["input_edges"][0][0][0])
        batch_size = len(features)

        batch["attn_bias"] = torch.zeros(batch_size, max_node_num + 1, max_node_num + 1, dtype=torch.float)
        batch["attn_edge_type"] = torch.zeros(batch_size, max_node_num, max_node_num, edge_feat_size, dtype=torch.long)
        batch["spatial_pos"] = torch.zeros(batch_size, max_node_num, max_node_num, dtype=torch.long)
        batch["in_degree"] = torch.zeros(batch_size, max_node_num, dtype=torch.long)
        batch["input_nodes"] = torch.zeros(batch_size, max_node_num, node_feat_size, dtype=torch.long)
        batch["input_edges"] = torch.zeros(
            batch_size, max_node_num, max_node_num, max_dist, edge_input_size, dtype=torch.long
        )

        for ix, f in enumerate(features):
            for k in ["attn_bias", "attn_edge_type", "spatial_pos", "in_degree", "input_nodes", "input_edges"]:
                f[k] = torch.tensor(f[k])

            if len(f["attn_bias"][1:, 1:][f["spatial_pos"] >= self.spatial_pos_max]) > 0:
                f["attn_bias"][1:, 1:][f["spatial_pos"] >= self.spatial_pos_max] = float("-inf")

            batch["attn_bias"][ix, : f["attn_bias"].shape[0], : f["attn_bias"].shape[1]] = f["attn_bias"]
            batch["attn_edge_type"][ix, : f["attn_edge_type"].shape[0], : f["attn_edge_type"].shape[1], :] = f[
                "attn_edge_type"
            ]
            batch["spatial_pos"][ix, : f["spatial_pos"].shape[0], : f["spatial_pos"].shape[1]] = f["spatial_pos"]
            batch["in_degree"][ix, : f["in_degree"].shape[0]] = f["in_degree"]
            batch["input_nodes"][ix, : f["input_nodes"].shape[0], :] = f["input_nodes"]
            batch["input_edges"][
                ix, : f["input_edges"].shape[0], : f["input_edges"].shape[1], : f["input_edges"].shape[2], :
            ] = f["input_edges"]

        batch["out_degree"] = batch["in_degree"]

        sample = features[0]["labels"]
        if len(sample) == 1:  # one task
            if isinstance(sample[0], float):  # regression
                batch["labels"] = torch.from_numpy(np.concatenate([i["labels"] for i in features]))
            else:  # binary classification
                batch["labels"] = torch.from_numpy(np.concatenate([i["labels"] for i in features]))
        else:  # multi task classification, left to float to keep the NaNs
            batch["labels"] = torch.from_numpy(np.stack([i["labels"] for i in features], axis=0))

        return batch
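A hedged usage sketch of the collator above (it assumes the file's classes are importable and that a Cython build environment is available, since `preprocess_item` needs `algos_graphormer`; the toy feature values are made up):

```python
# Toy graph item in the schema preprocess_item expects: edge_index / node_feat / num_nodes / y.
item = {
    "edge_index": [[0, 1, 1, 2], [1, 0, 2, 1]],  # undirected 3-node path, both edge directions listed
    "node_feat": [[0], [1], [2]],
    "num_nodes": 3,
    "y": [1],
}
collator = GraphormerDataCollator(on_the_fly_processing=True)
batch = collator([item])  # preprocess_item runs on the fly, then every key is padded and batched
print(batch["input_nodes"].shape)  # torch.Size([1, 3, 1]): batch, padded node count, node features
```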
Some files were not shown because too many files have changed in this diff