mirror of
https://github.com/huggingface/transformers.git
synced 2025-10-20 17:13:56 +08:00
Compare commits
276 Commits
3c7552f733
...
temp-kosmo
Author | SHA1 | Date | |
---|---|---|---|
7c99fd066f | |||
a65a9b11d3 | |||
30f927a54a | |||
a3638eab9d | |||
876cb6b217 | |||
ce222a6990 | |||
9b29aacce3 | |||
bf14c4b95a | |||
db865dbeda | |||
7f0d26c55e | |||
0ec499a841 | |||
5f731a9aa9 | |||
6b2f7d79e2 | |||
2d4cbba164 | |||
83d600e5f8 | |||
55944fcf87 | |||
c356a36327 | |||
b2c3db2990 | |||
39dc6ef257 | |||
9ddc86b560 | |||
d5b834925c | |||
3a58742f92 | |||
d1c52f4ffc | |||
b688c4f564 | |||
c639eeb73e | |||
8a058d9d56 | |||
395a6365b0 | |||
2c47915424 | |||
9c8aff7cf9 | |||
00e324db90 | |||
90c4fcc29c | |||
ce3a6b0ab7 | |||
fbb3e592af | |||
f8c98d6173 | |||
b1db4f22b6 | |||
85da449436 | |||
e3802f4baa | |||
91fa38341a | |||
dcced48507 | |||
9a841add0a | |||
6ed504dbce | |||
925e14a0f5 | |||
0b9e5adaa0 | |||
52788cc66e | |||
ac94b571a3 | |||
0153a08af2 | |||
1c58c8f666 | |||
830671beb4 | |||
9a8479d57c | |||
f66c6ee998 | |||
fcc095fbbf | |||
f2dae0ddbc | |||
08e1cb0d2d | |||
6f2bd73f77 | |||
4b7bc957a5 | |||
142604dadf | |||
968b033343 | |||
94cc6d22f6 | |||
a6154db0e1 | |||
eab8e69fcf | |||
f8497ce0cc | |||
63603d65c6 | |||
40ff0155ee | |||
f5d4439bde | |||
d0bf57e8fc | |||
65490b4e5d | |||
9e620b6cd8 | |||
806ca1bbb1 | |||
c7050497ad | |||
d99934d666 | |||
55cb12dcd7 | |||
df9d3ad270 | |||
0ed8541df9 | |||
5e5a9e9fa8 | |||
54b1984c4d | |||
87ab93532d | |||
ab687f5f9b | |||
15feaeaa7e | |||
dd12798243 | |||
b5ebf09127 | |||
e5ffaee338 | |||
cc7d28f9e4 | |||
fe418d05e6 | |||
e1ab413b65 | |||
eb116abdd5 | |||
9a01f8f6fd | |||
ef6754ce8e | |||
0ae49e0859 | |||
f4c73b355a | |||
06cbb5ded7 | |||
1bd02b2278 | |||
9d7363fdd4 | |||
8ee2aa9726 | |||
720a8ab36c | |||
66598978c0 | |||
66af73d646 | |||
139e834e7c | |||
bb4c2470fc | |||
e583cd4407 | |||
1e175baf94 | |||
9822d00b51 | |||
e41b8759ae | |||
703ccfd101 | |||
e6fe2ae21a | |||
e62993ced5 | |||
4e709e50df | |||
210ccb1989 | |||
e09217e618 | |||
de6d842c20 | |||
7df300082c | |||
3681119b37 | |||
f2b61c2944 | |||
2cdb62aa73 | |||
6f8b2e6af1 | |||
1424e07256 | |||
70d85cd2e6 | |||
29d272b0ff | |||
ac1968bd6d | |||
2157f31685 | |||
18fa43b75d | |||
2ea4d4f6a7 | |||
267e1d669f | |||
2b2fe1c468 | |||
60240f2f98 | |||
d0e4fb74e8 | |||
7dfd1454fe | |||
7e5a91cb57 | |||
40dc555148 | |||
889d9da79a | |||
162f569f90 | |||
865fc2fd12 | |||
cc17791fe2 | |||
9e0c277a6f | |||
6cae0b6641 | |||
ab546cc85c | |||
cfaa28fc9f | |||
09d8b29c99 | |||
bd765554dd | |||
482e5e12dc | |||
6b82ce0d18 | |||
26fb9694e9 | |||
64f915e326 | |||
c027a98a3c | |||
ed50bbdd4e | |||
87ccbc73ad | |||
9fca9ca6fa | |||
32df418ae1 | |||
0d166ded63 | |||
9dcacfc453 | |||
0ddfe76426 | |||
da45edd39e | |||
2a782f0e1c | |||
99f0d99efc | |||
06c52aed56 | |||
28b58ff00e | |||
fbbf151d20 | |||
25e3260d2b | |||
5ba6d849be | |||
8b27f806cb | |||
e9e56d0dd5 | |||
5b3a6f796c | |||
54a632e4b4 | |||
c54f9a8c6b | |||
3cebe13ae0 | |||
188adbf95d | |||
1776f31939 | |||
2db6b886b4 | |||
4308a40e09 | |||
452b23d955 | |||
c23a8dd704 | |||
ca57f47dc9 | |||
6eb0683582 | |||
24961cdd2c | |||
253714074d | |||
d577c901a7 | |||
b7be077f04 | |||
93b291f11b | |||
d2c57cc9a7 | |||
b574b092b5 | |||
4eca23ced6 | |||
9c1539ab95 | |||
8066ee7a49 | |||
ec82032bd9 | |||
2fe1f9490b | |||
7e810e2f07 | |||
5d1d0953da | |||
2fba9ab5ad | |||
625fc05381 | |||
9b24a63836 | |||
a5c48d5275 | |||
607f65e7b1 | |||
c7c52a7c43 | |||
c5c4864090 | |||
8998e48003 | |||
d14ac7dd06 | |||
5c5dd54578 | |||
f518e5077a | |||
ca820d0385 | |||
630a40df94 | |||
63877c3223 | |||
7710f9a06e | |||
6fa6221758 | |||
eb2b93c727 | |||
e81b7fed5e | |||
d5ad9579d2 | |||
303e918af4 | |||
40b4e984b1 | |||
2e398f74a9 | |||
937945818e | |||
35ef6559df | |||
cd8ac6ed29 | |||
73dddc516c | |||
f19b06cbf8 | |||
f05e361fb3 | |||
b7d5ec9be1 | |||
c3063253d4 | |||
578acce08f | |||
916781aa85 | |||
b64e30045f | |||
9046ec5a84 | |||
42dd2ea83b | |||
29d7cff9c4 | |||
363180bc27 | |||
9c74c61406 | |||
ba8b3dd446 | |||
241b0bf9cb | |||
fe51247d0d | |||
589e9efed7 | |||
b72fe0a537 | |||
3a0cfaae48 | |||
9eece30c83 | |||
2de836d557 | |||
7d8783b0f6 | |||
05c99438ae | |||
234149a511 | |||
532b1e06fe | |||
6d797c6664 | |||
2e6cad8e24 | |||
8bde09f354 | |||
3cbca06bbb | |||
d9b23c4023 | |||
71d3275e3d | |||
477fd34502 | |||
73463dfafe | |||
4fe45f8eeb | |||
48924d6330 | |||
fcf17a6ee2 | |||
775bec3280 | |||
3ed0d66b3e | |||
b85d5d79f0 | |||
fdc28b7041 | |||
5f51a7d5a5 | |||
be5b0f924a | |||
0ec4d44a76 | |||
80c29c51d0 | |||
93a7dc3a27 | |||
c8aaa35edf | |||
4ee3d7eedb | |||
ef94db2166 | |||
b1d373bf8d | |||
0d7a2733cb | |||
fdc614267a | |||
ce839cc270 | |||
02f21a72f9 | |||
cd55891f1a | |||
c433374429 | |||
cab16ce8a6 | |||
4ceb5c8603 | |||
cedd7d3ca0 | |||
352e67821a | |||
f1be589aa3 | |||
d9cf29031d | |||
3aa802c7eb | |||
21b0ecc740 | |||
a6636c1aff | |||
661ea9c742 |
@ -860,6 +860,8 @@
|
||||
title: InstructBlipVideo
|
||||
- local: model_doc/kosmos-2
|
||||
title: KOSMOS-2
|
||||
- local: model_doc/kosmos-2.5
|
||||
title: KOSMOS-2.5
|
||||
- local: model_doc/layoutlm
|
||||
title: LayoutLM
|
||||
- local: model_doc/layoutlmv2
|
||||
|
@ -184,6 +184,7 @@ Flax), PyTorch, and/or TensorFlow.
|
||||
| [JetMoe](model_doc/jetmoe) | ✅ | ❌ | ❌ |
|
||||
| [Jukebox](model_doc/jukebox) | ✅ | ❌ | ❌ |
|
||||
| [KOSMOS-2](model_doc/kosmos-2) | ✅ | ❌ | ❌ |
|
||||
| [KOSMOS-2.5](model_doc/kosmos-2.5) | ✅ | ❌ | ❌ |
|
||||
| [LayoutLM](model_doc/layoutlm) | ✅ | ✅ | ❌ |
|
||||
| [LayoutLMv2](model_doc/layoutlmv2) | ✅ | ❌ | ❌ |
|
||||
| [LayoutLMv3](model_doc/layoutlmv3) | ✅ | ✅ | ❌ |
|
||||
|
63
docs/source/en/model_doc/kosmos-2.5.md
Normal file
63
docs/source/en/model_doc/kosmos-2.5.md
Normal file
@ -0,0 +1,63 @@
|
||||
<!--Copyright 2024 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
|
||||
⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
|
||||
rendered properly in your Markdown viewer.
|
||||
|
||||
-->
|
||||
|
||||
# KOSMOS-2.5
|
||||
|
||||
## Overview
|
||||
|
||||
Kosmos-2.5 is a multimodal literate model for machine reading of text-intensive images. Pre-trained on large-scale text-intensive images, Kosmos-2.5 excels in two distinct yet cooperative transcription tasks: (1) generating spatially-aware text blocks, where each block of text is assigned its spatial coordinates within the image, and (2) producing structured text output that captures styles and structures into the markdown format. This unified multimodal literate capability is achieved through a shared decoder-only auto-regressive Transformer architecture, task-specific prompts, and flexible text representations. We evaluate Kosmos-2.5 on end-to-end document-level text recognition and image-to-markdown text generation. Furthermore, the model can be readily adapted for any text-intensive image understanding task with different prompts through supervised fine-tuning, making it a general-purpose tool for real-world applications involving text-rich images. This work also paves the way for the future scaling of multimodal large language models.
|
||||
|
||||
The abstract from the paper is the following:
|
||||
|
||||
*We present Kosmos-2.5, a multimodal literate model for machine reading of text-intensive images. Pre-trained on large-scale text-intensive images, Kosmos-2.5 excels in two distinct yet cooperative transcription tasks: (1) generating spatially-aware text blocks, where each block of text is assigned its spatial coordinates within the image, and (2) producing structured text output that captures styles and structures into the markdown format. This unified multimodal literate capability is achieved through a shared Transformer architecture, task-specific prompts, and flexible text representations. We evaluate Kosmos-2.5 on end-to-end document-level text recognition and image-to-markdown text generation. Furthermore, the model can be readily adapted for any text-intensive image understanding task with different prompts through supervised fine-tuning, making it a general-purpose tool for real-world applications involving text-rich images. This work also paves the way for the future scaling of multimodal large language models.*
|
||||
|
||||
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/model_doc/kosmos2_5_ocr.png"
|
||||
alt="drawing" width="600"/>
|
||||
|
||||
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/model_doc/kosmos2_5_md.png"
|
||||
alt="drawing" width="600"/>
|
||||
|
||||
<small> Overview of tasks that KOSMOS-2.5 can handle. Taken from the <a href="https://arxiv.org/abs/2309.11419">original paper</a>. </small>
|
||||
|
||||
## Example
|
||||
**Markdown Task:** For usage instructions, please refer to [md.py](https://huggingface.co/microsoft/kosmos-2.5/blob/main/md.py).
|
||||
|
||||
**OCR Task:** For usage instructions, please refer to [ocr.py](https://huggingface.co/microsoft/kosmos-2.5/blob/main/ocr.py).
|
||||
|
||||
|
||||
|
||||
## Kosmos2_5Config
|
||||
|
||||
[[autodoc]] Kosmos2_5Config
|
||||
|
||||
## Kosmos2_5ImageProcessor
|
||||
|
||||
[[autodoc]] Kosmos2_5ImageProcessor
|
||||
|
||||
## Kosmos2_5Processor
|
||||
|
||||
[[autodoc]] Kosmos2_5Processor
|
||||
- __call__
|
||||
|
||||
## Kosmos2_5Model
|
||||
|
||||
[[autodoc]] Kosmos2_5Model
|
||||
- forward
|
||||
|
||||
## Kosmos2_5ForConditionalGeneration
|
||||
|
||||
[[autodoc]] Kosmos2_5ForConditionalGeneration
|
||||
- forward
|
@ -61,6 +61,7 @@ FlashAttention-2 is currently supported for the following architectures:
|
||||
* [Falcon](https://huggingface.co/docs/transformers/model_doc/falcon#transformers.FalconModel)
|
||||
* [JetMoe](https://huggingface.co/docs/transformers/model_doc/jetmoe#transformers.JetMoeModel)
|
||||
* [Jamba](https://huggingface.co/docs/transformers/model_doc/jamba#transformers.JambaModel)
|
||||
* [Kosmos-2.5](https://huggingface.co/docs/transformers/model_doc/kosmos-2.5#transformers.Kosmos2_5Model)
|
||||
* [Llama](https://huggingface.co/docs/transformers/model_doc/llama#transformers.LlamaModel)
|
||||
* [Llava](https://huggingface.co/docs/transformers/model_doc/llava)
|
||||
* [Llava-NeXT](https://huggingface.co/docs/transformers/model_doc/llava_next)
|
||||
@ -251,6 +252,7 @@ For now, Transformers supports SDPA inference and training for the following arc
|
||||
* [GraniteMoe](https://huggingface.co/docs/transformers/model_doc/granitemoe#transformers.GraniteMoeModel)
|
||||
* [JetMoe](https://huggingface.co/docs/transformers/model_doc/jetmoe#transformers.JetMoeModel)
|
||||
* [Jamba](https://huggingface.co/docs/transformers/model_doc/jamba#transformers.JambaModel)
|
||||
* [Kosmos-2.5](https://huggingface.co/docs/transformers/model_doc/kosmos-2.5#transformers.Kosmos2_5Model)
|
||||
* [Llama](https://huggingface.co/docs/transformers/model_doc/llama#transformers.LlamaModel)
|
||||
* [Llava](https://huggingface.co/docs/transformers/model_doc/llava)
|
||||
* [Llava-NeXT](https://huggingface.co/docs/transformers/model_doc/llava_next)
|
||||
|
@ -512,6 +512,10 @@ _import_structure = {
|
||||
"Kosmos2Config",
|
||||
"Kosmos2Processor",
|
||||
],
|
||||
"models.kosmos2_5": [
|
||||
"Kosmos2_5Config",
|
||||
"Kosmos2_5Processor",
|
||||
],
|
||||
"models.layoutlm": [
|
||||
"LayoutLMConfig",
|
||||
"LayoutLMTokenizer",
|
||||
@ -1216,6 +1220,7 @@ else:
|
||||
_import_structure["models.idefics3"].extend(["Idefics3ImageProcessor"])
|
||||
_import_structure["models.imagegpt"].extend(["ImageGPTFeatureExtractor", "ImageGPTImageProcessor"])
|
||||
_import_structure["models.instructblipvideo"].extend(["InstructBlipVideoImageProcessor"])
|
||||
_import_structure["models.kosmos2_5"].extend(["Kosmos2_5ImageProcessor"])
|
||||
_import_structure["models.layoutlmv2"].extend(["LayoutLMv2FeatureExtractor", "LayoutLMv2ImageProcessor"])
|
||||
_import_structure["models.layoutlmv3"].extend(["LayoutLMv3FeatureExtractor", "LayoutLMv3ImageProcessor"])
|
||||
_import_structure["models.levit"].extend(["LevitFeatureExtractor", "LevitImageProcessor"])
|
||||
@ -2557,6 +2562,13 @@ else:
|
||||
"Kosmos2PreTrainedModel",
|
||||
]
|
||||
)
|
||||
_import_structure["models.kosmos2_5"].extend(
|
||||
[
|
||||
"Kosmos2_5ForConditionalGeneration",
|
||||
"Kosmos2_5Model",
|
||||
"Kosmos2_5PreTrainedModel",
|
||||
]
|
||||
)
|
||||
_import_structure["models.layoutlm"].extend(
|
||||
[
|
||||
"LayoutLMForMaskedLM",
|
||||
@ -5438,6 +5450,10 @@ if TYPE_CHECKING:
|
||||
Kosmos2Config,
|
||||
Kosmos2Processor,
|
||||
)
|
||||
from .models.kosmos2_5 import (
|
||||
Kosmos2_5Config,
|
||||
Kosmos2_5Processor,
|
||||
)
|
||||
from .models.layoutlm import (
|
||||
LayoutLMConfig,
|
||||
LayoutLMTokenizer,
|
||||
@ -6177,6 +6193,7 @@ if TYPE_CHECKING:
|
||||
from .models.idefics3 import Idefics3ImageProcessor
|
||||
from .models.imagegpt import ImageGPTFeatureExtractor, ImageGPTImageProcessor
|
||||
from .models.instructblipvideo import InstructBlipVideoImageProcessor
|
||||
from .models.kosmos2_5 import Kosmos2_5ImageProcessor
|
||||
from .models.layoutlmv2 import (
|
||||
LayoutLMv2FeatureExtractor,
|
||||
LayoutLMv2ImageProcessor,
|
||||
@ -7301,6 +7318,11 @@ if TYPE_CHECKING:
|
||||
Kosmos2Model,
|
||||
Kosmos2PreTrainedModel,
|
||||
)
|
||||
from .models.kosmos2_5 import (
|
||||
Kosmos2_5ForConditionalGeneration,
|
||||
Kosmos2_5Model,
|
||||
Kosmos2_5PreTrainedModel,
|
||||
)
|
||||
from .models.layoutlm import (
|
||||
LayoutLMForMaskedLM,
|
||||
LayoutLMForQuestionAnswering,
|
||||
|
@ -127,6 +127,7 @@ from . import (
|
||||
jamba,
|
||||
jetmoe,
|
||||
kosmos2,
|
||||
kosmos2_5,
|
||||
layoutlm,
|
||||
layoutlmv2,
|
||||
layoutlmv3,
|
||||
|
@ -148,6 +148,7 @@ CONFIG_MAPPING_NAMES = OrderedDict(
|
||||
("jetmoe", "JetMoeConfig"),
|
||||
("jukebox", "JukeboxConfig"),
|
||||
("kosmos-2", "Kosmos2Config"),
|
||||
("kosmos-2.5", "Kosmos2_5Config"),
|
||||
("layoutlm", "LayoutLMConfig"),
|
||||
("layoutlmv2", "LayoutLMv2Config"),
|
||||
("layoutlmv3", "LayoutLMv3Config"),
|
||||
@ -459,6 +460,7 @@ MODEL_NAMES_MAPPING = OrderedDict(
|
||||
("jetmoe", "JetMoe"),
|
||||
("jukebox", "Jukebox"),
|
||||
("kosmos-2", "KOSMOS-2"),
|
||||
("kosmos-2.5", "KOSMOS-2.5"),
|
||||
("layoutlm", "LayoutLM"),
|
||||
("layoutlmv2", "LayoutLMv2"),
|
||||
("layoutlmv3", "LayoutLMv3"),
|
||||
@ -692,6 +694,7 @@ SPECIAL_MODEL_TYPE_TO_MODULE_NAME = OrderedDict(
|
||||
("data2vec-vision", "data2vec"),
|
||||
("donut-swin", "donut"),
|
||||
("kosmos-2", "kosmos2"),
|
||||
("kosmos-2.5", "kosmos2_5"),
|
||||
("maskformer-swin", "maskformer"),
|
||||
("xclip", "x_clip"),
|
||||
("clip_vision_model", "clip"),
|
||||
|
@ -98,6 +98,7 @@ else:
|
||||
("instructblip", ("BlipImageProcessor",)),
|
||||
("instructblipvideo", ("InstructBlipVideoImageProcessor",)),
|
||||
("kosmos-2", ("CLIPImageProcessor",)),
|
||||
("kosmos-2.5", ("Kosmos2_5ImageProcessor",)),
|
||||
("layoutlmv2", ("LayoutLMv2ImageProcessor",)),
|
||||
("layoutlmv3", ("LayoutLMv3ImageProcessor",)),
|
||||
("levit", ("LevitImageProcessor",)),
|
||||
|
@ -143,6 +143,7 @@ MODEL_MAPPING_NAMES = OrderedDict(
|
||||
("jetmoe", "JetMoeModel"),
|
||||
("jukebox", "JukeboxModel"),
|
||||
("kosmos-2", "Kosmos2Model"),
|
||||
("kosmos-2.5", "Kosmos2_5Model"),
|
||||
("layoutlm", "LayoutLMModel"),
|
||||
("layoutlmv2", "LayoutLMv2Model"),
|
||||
("layoutlmv3", "LayoutLMv3Model"),
|
||||
@ -761,6 +762,7 @@ MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES = OrderedDict(
|
||||
("instructblip", "InstructBlipForConditionalGeneration"),
|
||||
("instructblipvideo", "InstructBlipVideoForConditionalGeneration"),
|
||||
("kosmos-2", "Kosmos2ForConditionalGeneration"),
|
||||
("kosmos-2.5", "Kosmos2_5ForConditionalGeneration"),
|
||||
("llava", "LlavaForConditionalGeneration"),
|
||||
("llava_next", "LlavaNextForConditionalGeneration"),
|
||||
("llava_next_video", "LlavaNextVideoForConditionalGeneration"),
|
||||
@ -788,6 +790,7 @@ MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES = OrderedDict(
|
||||
("idefics3", "Idefics3ForConditionalGeneration"),
|
||||
("instructblip", "InstructBlipForConditionalGeneration"),
|
||||
("kosmos-2", "Kosmos2ForConditionalGeneration"),
|
||||
("kosmos-2.5", "Kosmos2_5ForConditionalGeneration"),
|
||||
("llava", "LlavaForConditionalGeneration"),
|
||||
("llava_next", "LlavaNextForConditionalGeneration"),
|
||||
("llava_onevision", "LlavaOnevisionForConditionalGeneration"),
|
||||
|
@ -70,6 +70,7 @@ PROCESSOR_MAPPING_NAMES = OrderedDict(
|
||||
("instructblip", "InstructBlipProcessor"),
|
||||
("instructblipvideo", "InstructBlipVideoProcessor"),
|
||||
("kosmos-2", "Kosmos2Processor"),
|
||||
("kosmos-2.5", "Kosmos2_5Processor"),
|
||||
("layoutlmv2", "LayoutLMv2Processor"),
|
||||
("layoutlmv3", "LayoutLMv3Processor"),
|
||||
("llava", "LlavaProcessor"),
|
||||
|
@ -247,6 +247,7 @@ else:
|
||||
"XLMRobertaTokenizerFast" if is_tokenizers_available() else None,
|
||||
),
|
||||
),
|
||||
("kosmos-2.5", (None, "PreTrainedTokenizerFast" if is_tokenizers_available() else None)),
|
||||
("layoutlm", ("LayoutLMTokenizer", "LayoutLMTokenizerFast" if is_tokenizers_available() else None)),
|
||||
("layoutlmv2", ("LayoutLMv2Tokenizer", "LayoutLMv2TokenizerFast" if is_tokenizers_available() else None)),
|
||||
("layoutlmv3", ("LayoutLMv3Tokenizer", "LayoutLMv3TokenizerFast" if is_tokenizers_available() else None)),
|
||||
|
@ -2073,6 +2073,7 @@ class Kosmos2ForConditionalGeneration(Kosmos2PreTrainedModel, GenerationMixin):
|
||||
vision_model_output=vision_model_output,
|
||||
)
|
||||
|
||||
@torch.no_grad()
|
||||
def generate(
|
||||
self,
|
||||
pixel_values: Optional[torch.Tensor] = None,
|
||||
|
30
src/transformers/models/kosmos2_5/__init__.py
Normal file
30
src/transformers/models/kosmos2_5/__init__.py
Normal file
@ -0,0 +1,30 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2024 Microsoft Research and The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
from ...utils import _LazyModule
|
||||
from ...utils.import_utils import define_import_structure
|
||||
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from .configuration_kosmos2_5 import *
|
||||
from .image_processing_kosmos2_5 import *
|
||||
from .modeling_kosmos2_5 import *
|
||||
from .processing_kosmos2_5 import *
|
||||
else:
|
||||
import sys
|
||||
|
||||
_file = globals()["__file__"]
|
||||
sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
|
278
src/transformers/models/kosmos2_5/configuration_kosmos2_5.py
Normal file
278
src/transformers/models/kosmos2_5/configuration_kosmos2_5.py
Normal file
@ -0,0 +1,278 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2024 Microsoft Research and The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""KOSMOS-2.5 model configuration"""
|
||||
|
||||
import os
|
||||
from typing import Union
|
||||
|
||||
from ...configuration_utils import PretrainedConfig
|
||||
from ...utils import logging
|
||||
|
||||
|
||||
logger = logging.get_logger(__name__)
|
||||
|
||||
|
||||
class Kosmos2_5TextConfig(PretrainedConfig):
|
||||
r"""
|
||||
This is the configuration class to store the configuration of a [`Kosmos2_5TextModel`]. It is used to instantiate a
|
||||
KOSMOS-2.5 text decoder according to the specified arguments, defining the model architecture. Instantiating a
|
||||
configuration with the defaults will yield a similar configuration to that of the text decoder of the KOSMOS-2.5
|
||||
[microsoft/kosmos-2.5](https://huggingface.co/microsoft/kosmos-2.5) architecture.
|
||||
|
||||
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
|
||||
documentation from [`PretrainedConfig`] for more information.
|
||||
|
||||
Args:
|
||||
vocab_size (`int`, *optional*, defaults to 108481):
|
||||
Vocabulary size of the Kosmos2_5 model. Defines the number of different tokens that can be represented by the
|
||||
`input_ids` passed when calling [`Kosmos2_5Model`].
|
||||
max_position_embeddings (`int`, *optional*, defaults to 4096):
|
||||
The maximum sequence length that this model might ever be used with. Typically set this to something large
|
||||
just in case (e.g., 512 or 1024 or 2048).
|
||||
embed_dim (`int`, *optional*, defaults to 1536):
|
||||
Dimensionality of the layers and the pooler layer.
|
||||
layers (`int`, *optional*, defaults to 24):
|
||||
Number of hidden layers in the Transformer decoder.
|
||||
ffn_dim (`int`, *optional*, defaults to 6144):
|
||||
Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer decoder.
|
||||
attention_heads (`int`, *optional*, defaults to 16):
|
||||
Number of attention heads for each attention layer in the Transformer decoder.
|
||||
activation_function (`str` or `function`, *optional*, defaults to `"gelu"`):
|
||||
The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
|
||||
`"relu"`, `"silu"` and `"gelu_new"` are supported.
|
||||
dropout (`float`, *optional*, defaults to 0.1):
|
||||
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
|
||||
attention_dropout (`float`, *optional*, defaults to 0):
|
||||
The dropout ratio for the attention probabilities.
|
||||
activation_dropout (`float`, *optional*, defaults to 0.0):
|
||||
The dropout ratio for activations inside the fully connected layer.
|
||||
layerdrop (`float`, *optional*, defaults to 0.0):
|
||||
The LayerDrop probability for the decoder. See the [LayerDrop paper](https://arxiv.org/abs/1909.11556)
|
||||
for more details.
|
||||
layer_norm_eps (`float`, *optional*, defaults to 1e-5):
|
||||
The epsilon used by the layer normalization layers.
|
||||
init_std (`float`, *optional*, defaults to 0.02):
|
||||
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
|
||||
scale_embedding (`bool`, *optional*, defaults to `True`):
|
||||
Scale embeddings by dividing by sqrt(embed_dim).
|
||||
use_cache (`bool`, *optional*, defaults to `True`):
|
||||
Whether or not the model should return the last key/values attentions (not used by all models).
|
||||
"""
|
||||
|
||||
model_type = "kosmos_2_5_text_model"
|
||||
base_config_key = "text_config"
|
||||
keys_to_ignore_at_inference = ["past_key_values"]
|
||||
attribute_map = {
|
||||
"num_attention_heads": "attention_heads",
|
||||
"hidden_size": "embed_dim",
|
||||
"num_hidden_layers": "layers",
|
||||
}
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
vocab_size=108481,
|
||||
max_position_embeddings=4096,
|
||||
embed_dim=1536,
|
||||
layers=24,
|
||||
ffn_dim=6144,
|
||||
attention_heads=16,
|
||||
activation_function="gelu",
|
||||
dropout=0.1,
|
||||
attention_dropout=0,
|
||||
activation_dropout=0.0,
|
||||
layerdrop=0.0,
|
||||
layer_norm_eps=1e-5,
|
||||
init_std=0.02,
|
||||
scale_embedding=True,
|
||||
use_cache=True,
|
||||
pad_token_id=1,
|
||||
bos_token_id=0,
|
||||
eos_token_id=2,
|
||||
**kwargs,
|
||||
):
|
||||
super().__init__(
|
||||
pad_token_id=pad_token_id,
|
||||
bos_token_id=bos_token_id,
|
||||
eos_token_id=eos_token_id,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
self.vocab_size = vocab_size
|
||||
self.max_position_embeddings = max_position_embeddings
|
||||
self.embed_dim = embed_dim
|
||||
self.layers = layers
|
||||
self.ffn_dim = ffn_dim
|
||||
self.attention_heads = attention_heads
|
||||
self.activation_function = activation_function
|
||||
self.dropout = dropout
|
||||
self.attention_dropout = attention_dropout
|
||||
self.activation_dropout = activation_dropout
|
||||
self.layerdrop = layerdrop
|
||||
self.layer_norm_eps = layer_norm_eps
|
||||
self.init_std = init_std
|
||||
self.scale_embedding = scale_embedding
|
||||
self.use_cache = use_cache
|
||||
|
||||
|
||||
class Kosmos2_5VisionConfig(PretrainedConfig):
|
||||
r"""
|
||||
This is the configuration class to store the configuration of a [`Kosmos2_5VisionModel`]. It is used to
|
||||
instantiate a KOSMOS-2.5 vision encoder according to the specified arguments, defining the model architecture.
|
||||
Instantiating a configuration with the defaults will yield a similar configuration to that of the vision encoder of the KOSMOS-2.5
|
||||
[microsoft/kosmos-2.5](https://huggingface.co/microsoft/kosmos-2.5) architecture.
|
||||
|
||||
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
|
||||
documentation from [`PretrainedConfig`] for more information.
|
||||
|
||||
Args:
|
||||
hidden_size (`int`, *optional*, defaults to 1536):
|
||||
Dimensionality of the encoder layers and the pooler layer.
|
||||
patch_embed_hidden_size (`int`, *optional*, defaults to 768):
|
||||
Dimensionality of the input patch_embedding layer in the Transformer encoder.
|
||||
d_ff (`int`, *optional*, defaults to 3968):
|
||||
Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
|
||||
d_kv (`int`, *optional*, defaults to 64):
|
||||
Dimensionality of the key, query, value projections per attention head.
|
||||
num_hidden_layers (`int`, *optional*, defaults to 18):
|
||||
Number of hidden layers in the Transformer encoder.
|
||||
num_attention_heads (`int`, *optional*, defaults to 24):
|
||||
Number of attention heads for each attention layer in the Transformer encoder.
|
||||
dense_act_fn (`str` or `function`, *optional*, defaults to `"gelu_new"`):
|
||||
The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
|
||||
`"relu"`, `"selu"` and `"gelu_new"` are supported.
|
||||
layer_norm_eps (`float`, *optional*, defaults to 1e-06):
|
||||
The epsilon used by the layer normalization layers.
|
||||
dropout_rate (`float`, *optional*, defaults to 0.0):
|
||||
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
|
||||
attention_dropout (`float`, *optional*, defaults to 0.0):
|
||||
The dropout ratio for the attention probabilities.
|
||||
seq_len (`int`, *optional*, defaults to 4096):
|
||||
Maximum sequence length (here number of patches) supported by the model.
|
||||
Example:
|
||||
|
||||
```python
|
||||
>>> from transformers import Kosmos2_5VisionConfig, Kosmos2_5VisionModel
|
||||
|
||||
>>> # Initializing a Kosmos2_5VisionConfig with microsoft/kosmos-2.5 style configuration
|
||||
>>> configuration = Kosmos2_5VisionConfig()
|
||||
|
||||
>>> # Initializing a Kosmos2_5VisionModel (with random weights) from the microsoft/kosmos-2.5 style configuration
|
||||
>>> model = Kosmos2_5VisionModel(configuration)
|
||||
|
||||
>>> # Accessing the model configuration
|
||||
>>> configuration = model.config
|
||||
```"""
|
||||
|
||||
model_type = "kosmos_2_5_vision_model"
|
||||
base_config_key = "vision_config"
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
hidden_size=1536,
|
||||
patch_embed_hidden_size=768,
|
||||
d_ff=3968,
|
||||
d_kv=64,
|
||||
num_hidden_layers=18,
|
||||
num_attention_heads=24,
|
||||
dense_act_fn="gelu_new",
|
||||
layer_norm_eps=1e-6,
|
||||
dropout_rate=0.0,
|
||||
attention_dropout=0.0,
|
||||
seq_len=4096,
|
||||
initializer_factor=1.0,
|
||||
initializer_range=0.02,
|
||||
**kwargs,
|
||||
):
|
||||
super().__init__(**kwargs)
|
||||
|
||||
self.hidden_size = hidden_size
|
||||
self.patch_embed_hidden_size = patch_embed_hidden_size
|
||||
self.d_ff = d_ff
|
||||
self.dropout_rate = dropout_rate
|
||||
self.num_hidden_layers = num_hidden_layers
|
||||
self.num_attention_heads = num_attention_heads
|
||||
self.attention_dropout = attention_dropout
|
||||
self.layer_norm_eps = layer_norm_eps
|
||||
self.dense_act_fn = dense_act_fn
|
||||
self.seq_len = seq_len
|
||||
self.d_kv = d_kv
|
||||
self.initializer_factor = initializer_factor
|
||||
self.initializer_range = initializer_range
|
||||
|
||||
|
||||
class Kosmos2_5Config(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`Kosmos2_5Model`]. It is used to instantiate a
    KOSMOS-2.5 model according to the specified arguments, defining the model architecture. Instantiating a configuration
    with the defaults will yield a similar configuration to that of the KOSMOS-2.5
    [microsoft/kosmos-2.5](https://huggingface.co/microsoft/kosmos-2.5) architecture.

    Args:
        text_config (`dict` or [`Kosmos2_5TextConfig`], *optional*):
            Dictionary of configuration options used to initialize [`Kosmos2_5TextConfig`], or an already constructed
            [`Kosmos2_5TextConfig`] (stored as is). Defaults are used when `None`.
        vision_config (`dict` or [`Kosmos2_5VisionConfig`], *optional*):
            Dictionary of configuration options used to initialize [`Kosmos2_5VisionConfig`], or an already
            constructed [`Kosmos2_5VisionConfig`] (stored as is). Defaults are used when `None`.
        latent_query_num (`int`, *optional*, defaults to 2048):
            The number of latent query tokens that represent the image features used in the text decoder component.
        kwargs (*optional*):
            Dictionary of keyword arguments.
    """

    model_type = "kosmos-2.5"
    sub_configs = {"text_config": Kosmos2_5TextConfig, "vision_config": Kosmos2_5VisionConfig}

    def __init__(
        self,
        text_config=None,
        vision_config=None,
        latent_query_num=2048,
        **kwargs,
    ):
        super().__init__(**kwargs)

        if text_config is None:
            text_config = {}
            logger.info("text_config is None. Initializing the Kosmos2_5TextConfig with default values.")
        if vision_config is None:
            vision_config = {}
            logger.info("vision_config is None. Initializing the Kosmos2_5VisionConfig with default values.")

        # Sub-configs may arrive either as plain dictionaries or as already-built config objects.
        # Unpacking a config object with `**` would raise a TypeError, so only dictionaries are expanded.
        if not isinstance(text_config, Kosmos2_5TextConfig):
            text_config = Kosmos2_5TextConfig(**text_config)
        if not isinstance(vision_config, Kosmos2_5VisionConfig):
            vision_config = Kosmos2_5VisionConfig(**vision_config)

        self.text_config = text_config
        self.vision_config = vision_config

        self.latent_query_num = latent_query_num

    @classmethod
    def from_text_vision_configs(
        cls,
        text_config: Kosmos2_5TextConfig,
        vision_config: Kosmos2_5VisionConfig,
        **kwargs,
    ):
        r"""
        Instantiate a [`Kosmos2_5Config`] (or a derived class) from Kosmos2_5 text model configuration and Kosmos2_5
        vision model configuration.

        Args:
            text_config ([`Kosmos2_5TextConfig`]):
                Text sub-configuration; it is serialized with `to_dict()` before being passed to the constructor.
            vision_config ([`Kosmos2_5VisionConfig`]):
                Vision sub-configuration; it is serialized with `to_dict()` before being passed to the constructor.
            kwargs (*optional*):
                Additional keyword arguments forwarded to the constructor.

        Returns:
            [`Kosmos2_5Config`]: An instance of a configuration object
        """

        return cls(
            text_config=text_config.to_dict(),
            vision_config=vision_config.to_dict(),
            **kwargs,
        )
|
||||
|
||||
|
||||
__all__ = ["Kosmos2_5Config"]
|
87
src/transformers/models/kosmos2_5/convert_kosmos2_5.py
Normal file
87
src/transformers/models/kosmos2_5/convert_kosmos2_5.py
Normal file
@ -0,0 +1,87 @@
|
||||
import argparse
|
||||
|
||||
from fairseq.checkpoint_utils import load_checkpoint_to_cpu
|
||||
|
||||
from transformers import Kosmos2_5Config, Kosmos2_5ForConditionalGeneration
|
||||
|
||||
|
||||
# Substring rewrites applied to every checkpoint key, in insertion order. The more specific
# fragments (e.g. "gpt_model.decoder.output_projection") are listed before the shorter fragments
# they contain (e.g. "gpt_model.decoder") so that they are rewritten first.
KEYS_TO_MODIFY_MAPPING = {
    "gpt_model.decoder.output_projection": "text_model.lm_head",
    "gpt_model.decoder": "text_model.model",
    "img_connector": "image_to_text_projection",
    "img_model.embeddings": "vision_model.embeddings",
    "img_model.encoder": "vision_model.encoder",
    "img_model.layernorm": "vision_model.layernorm",
    "img_model": "vision_model",
    "ln_pre": "pre_layrnorm",
    "ln_post": "post_layernorm",
    "transformer.resblocks": "encoder.layers",
    "ts_attn": "self_attn",
    "ln_1": "layer_norm1",
    "ln_2": "layer_norm2",
    "c_fc": "fc1",
    "c_proj": "fc2",
}


# Checkpoint entries that are dropped instead of being converted.
KEYS_TO_IGNORE = [
    # this buffer in the original code is only used to send weights to the desired device
    "gpt_model.decoder.embed_positions._float_tensor",
    # this weight is never used in the forward pass of the original KOSMOS-2.5
    "gpt_model.decoder.self_attn_sope.scale",
]


def rename_key(key):
    """
    Translate one original checkpoint key into its converted name.

    Every entry of `KEYS_TO_MODIFY_MAPPING` is applied in turn as a substring replacement, so several fragments of
    the same key may be rewritten. Keys containing none of the fragments are returned unchanged.

    Args:
        key (`str`):
            Parameter name as found in the original checkpoint.

    Returns:
        `str`: The renamed key.
    """
    renamed = key
    for old_fragment, new_fragment in KEYS_TO_MODIFY_MAPPING.items():
        # `str.replace` returns the string untouched when the fragment is absent.
        renamed = renamed.replace(old_fragment, new_fragment)
    return renamed
|
||||
|
||||
|
||||
def convert_kosmos2_5_checkpoint_to_pytorch(checkpoint_path, pytorch_dump_folder_path):
    """
    Convert an original KOSMOS-2.5 checkpoint and save it with `save_pretrained`.

    Args:
        checkpoint_path (`str`):
            Path of the original checkpoint, read with fairseq's `load_checkpoint_to_cpu`.
        pytorch_dump_folder_path (`str`):
            Folder the converted model is written to.
    """
    original_weights = load_checkpoint_to_cpu(checkpoint_path)["model"]

    config = Kosmos2_5Config()
    # This is necessary to match the results given by the original demo
    config.text_config.no_repeat_ngram_size = 3
    model = Kosmos2_5ForConditionalGeneration(config)

    # Conversion is purely a renaming of keys; ignored entries are dropped.
    converted_state_dict = {
        rename_key(name): tensor for name, tensor in original_weights.items() if name not in KEYS_TO_IGNORE
    }

    # `strict=True` makes the load fail on any missing or unexpected key, which doubles as the conversion check.
    model.load_state_dict(converted_state_dict, strict=True)
    model.save_pretrained(pytorch_dump_folder_path)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    parser = argparse.ArgumentParser()

    # (flag, default, help) for each command-line option; both are optional string paths.
    cli_options = (
        ("--kosmos2_5_checkpoint_path", "ckpt.pt", "Path the official PyTorch dump."),
        ("--pytorch_dump_folder_path", "ckpt", "Path to the output PyTorch model."),
    )
    for flag, default_value, help_text in cli_options:
        parser.add_argument(flag, default=default_value, type=str, required=False, help=help_text)

    args = parser.parse_args()
    convert_kosmos2_5_checkpoint_to_pytorch(args.kosmos2_5_checkpoint_path, args.pytorch_dump_folder_path)
|
342
src/transformers/models/kosmos2_5/image_processing_kosmos2_5.py
Normal file
342
src/transformers/models/kosmos2_5/image_processing_kosmos2_5.py
Normal file
@ -0,0 +1,342 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2024 Microsoft Research and The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""Image processor class for Kosmos2_5."""
|
||||
|
||||
import math
|
||||
from typing import Dict, Optional, Union
|
||||
|
||||
import numpy as np
|
||||
|
||||
from ...image_processing_utils import BaseImageProcessor, BatchFeature
|
||||
from ...image_transforms import (
|
||||
convert_to_rgb,
|
||||
normalize,
|
||||
to_channel_dimension_format,
|
||||
)
|
||||
from ...image_utils import (
|
||||
ChannelDimension,
|
||||
ImageInput,
|
||||
get_image_size,
|
||||
infer_channel_dimension_format,
|
||||
make_list_of_images,
|
||||
to_numpy_array,
|
||||
valid_images,
|
||||
)
|
||||
from ...utils import TensorType, is_torch_available, logging
|
||||
from ...utils.import_utils import requires_backends
|
||||
|
||||
|
||||
if is_torch_available():
|
||||
import torch
|
||||
|
||||
logger = logging.get_logger(__name__)
|
||||
DEFAULT_FONT_PATH = "ybelkada/fonts"
|
||||
|
||||
|
||||
# Same logic as transformers.models.pix2struct.image_processing_pix2struct.torch_extract_patches; only the
# documentation differs, so this is no longer a verbatim copy.
def torch_extract_patches(image_tensor, patch_height, patch_width):
    """
    Utility function to extract non-overlapping patches from a given image tensor. Returns a tensor of shape (1,
    `rows`, `columns`, `num_channels` x `patch_height` x `patch_width`), where `rows = height // patch_height` and
    `columns = width // patch_width`.

    Args:
        image_tensor (torch.Tensor):
            The image tensor to extract patches from. A batch dimension is added internally and the sizes at
            positions 1, 2 and 3 of the batched tensor are used as channels, height and width respectively.
        patch_height (int):
            The height of the patches to extract.
        patch_width (int):
            The width of the patches to extract.

    Returns:
        torch.Tensor: The extracted patches, one flattened patch per (row, column) position.
    """
    requires_backends(torch_extract_patches, ["torch"])

    # `unfold` expects a batched input.
    image_tensor = image_tensor.unsqueeze(0)
    # Stride equal to the kernel size yields non-overlapping patches.
    patches = torch.nn.functional.unfold(image_tensor, (patch_height, patch_width), stride=(patch_height, patch_width))
    patches = patches.reshape(image_tensor.size(0), image_tensor.size(1), patch_height, patch_width, -1)
    # Move the patch index in front of the per-patch values, then lay the patches out on a (rows, columns) grid.
    patches = patches.permute(0, 4, 2, 3, 1).reshape(
        image_tensor.size(2) // patch_height,
        image_tensor.size(3) // patch_width,
        image_tensor.size(1) * patch_height * patch_width,
    )
    return patches.unsqueeze(0)
|
||||
|
||||
|
||||
# Similar to transformers.models.pix2struct.image_processing_pix2struct.Pix2StructImageProcessor, but without `is_vqa` and additionally returning the width and height after resizing
|
||||
class Kosmos2_5ImageProcessor(BaseImageProcessor):
    r"""
    Constructs a Kosmos2_5 image processor.

    Args:
        do_convert_rgb (`bool`, *optional*, defaults to `True`):
            Whether to convert the image to RGB.
        do_normalize (`bool`, *optional*, defaults to `True`):
            Whether to normalize the image. Can be overridden by the `do_normalize` parameter in the `preprocess`
            method. According to Kosmos2_5 paper and code, the image is normalized with its own mean and standard
            deviation.
        patch_size (`Dict[str, int]`, *optional*, defaults to `{"height": 16, "width": 16}`):
            The patch size to use for the image. According to Kosmos2_5 paper and code, the patch size is 16x16.
        max_patches (`int`, *optional*, defaults to 4096):
            The maximum number of patches to extract from the image as per the [Kosmos2_5
            paper](https://arxiv.org/pdf/2309.11419).
    """

    model_input_names = ["flattened_patches"]

    def __init__(
        self,
        do_convert_rgb: bool = True,
        do_normalize: bool = True,
        patch_size: Optional[Dict[str, int]] = None,
        max_patches: int = 4096,
        **kwargs,
    ) -> None:
        super().__init__(**kwargs)
        self.patch_size = patch_size if patch_size is not None else {"height": 16, "width": 16}
        self.do_normalize = do_normalize
        self.do_convert_rgb = do_convert_rgb
        self.max_patches = max_patches

    def extract_flattened_patches(
        self,
        image: np.ndarray,
        max_patches: int,
        patch_size: dict,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
        **kwargs,
    ) -> "tuple[np.ndarray, int, int, int, int]":
        """
        Extract flattened patches from an image.

        The image is resized so that it is covered by a whole number of patches whose count does not exceed
        `max_patches`, then cut into patches. Each patch is flattened and prefixed with its 1-based row and column
        index, and the sequence is zero-padded up to `max_patches` entries.

        Args:
            image (`np.ndarray`):
                Image to extract flattened patches from.
            max_patches (`int`):
                Maximum number of patches to extract.
            patch_size (`dict`):
                Dictionary containing the patch height and width.
            input_data_format (`str` or `ChannelDimension`, *optional*):
                The channel dimension format of the input image.

        Returns:
            `tuple`: `(result, resized_width, resized_height, rows, columns)` where `result` is a `np.ndarray`
            holding a sequence of `max_patches` flattened patches, `resized_width` / `resized_height` are the image
            dimensions after resizing, and `rows` / `columns` are the dimensions of the patch grid.
        """
        requires_backends(self.extract_flattened_patches, "torch")

        # convert to torch
        image = to_channel_dimension_format(image, ChannelDimension.FIRST, input_data_format)
        image = torch.from_numpy(image)

        patch_height, patch_width = patch_size["height"], patch_size["width"]
        image_height, image_width = get_image_size(image, ChannelDimension.FIRST)

        # Largest uniform scale such that rows * columns <= max_patches after flooring.
        scale = math.sqrt(max_patches * (patch_height / image_height) * (patch_width / image_width))
        num_feasible_rows = max(min(math.floor(scale * image_height / patch_height), max_patches), 1)
        num_feasible_cols = max(min(math.floor(scale * image_width / patch_width), max_patches), 1)
        resized_height = max(num_feasible_rows * patch_height, 1)
        resized_width = max(num_feasible_cols * patch_width, 1)

        image = torch.nn.functional.interpolate(
            image.unsqueeze(0),
            size=(resized_height, resized_width),
            mode="bilinear",
            align_corners=False,
            antialias=True,
        ).squeeze(0)

        # [1, rows, columns, patch_height * patch_width * image_channels]
        patches = torch_extract_patches(image, patch_height, patch_width)

        patches_shape = patches.shape
        rows = patches_shape[1]
        columns = patches_shape[2]
        depth = patches_shape[3]

        # [rows * columns, patch_height * patch_width * image_channels]
        patches = patches.reshape([rows * columns, depth])

        # [rows * columns, 1]
        row_ids = torch.arange(rows).reshape([rows, 1]).repeat(1, columns).reshape([rows * columns, 1])
        col_ids = torch.arange(columns).reshape([1, columns]).repeat(rows, 1).reshape([rows * columns, 1])

        # Offset by 1 so the ids do not contain zeros, which represent padding.
        row_ids += 1
        col_ids += 1

        # Prepare additional patch features.
        # [rows * columns, 1]
        row_ids = row_ids.to(torch.float32)
        col_ids = col_ids.to(torch.float32)

        # [rows * columns, 2 + patch_height * patch_width * image_channels]
        result = torch.cat([row_ids, col_ids, patches], -1)

        # [max_patches, 2 + patch_height * patch_width * image_channels]
        result = torch.nn.functional.pad(result, [0, 0, 0, max_patches - (rows * columns)]).float()

        result = to_numpy_array(result)

        return result, resized_width, resized_height, rows, columns

    def normalize(
        self,
        image: np.ndarray,
        data_format: Optional[Union[str, ChannelDimension]] = None,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
        **kwargs,
    ) -> np.ndarray:
        """
        Normalize an image. image = (image - image_mean) / image_std.

        The image std is to mimic the tensorflow implementation of the `per_image_standardization`:
        https://www.tensorflow.org/api_docs/python/tf/image/per_image_standardization

        Args:
            image (`np.ndarray`):
                Image to normalize.
            data_format (`str` or `ChannelDimension`, *optional*):
                The channel dimension format for the output image. If unset, the channel dimension format of the input
                image is used.
            input_data_format (`str` or `ChannelDimension`, *optional*):
                The channel dimension format of the input image. If not provided, it will be inferred.

        Returns:
            `np.ndarray`: The normalized image.
        """
        if image.dtype == np.uint8:
            image = image.astype(np.float32)

        # take mean across the whole `image`
        mean = np.mean(image)
        std = np.std(image)
        # Lower bound on the std so that a constant image does not cause a division by zero.
        adjusted_stddev = max(std, 1.0 / math.sqrt(np.prod(image.shape)))

        return normalize(
            image,
            mean=mean,
            std=adjusted_stddev,
            data_format=data_format,
            input_data_format=input_data_format,
            **kwargs,
        )

    def preprocess(
        self,
        images: ImageInput,
        do_convert_rgb: Optional[bool] = None,
        do_normalize: Optional[bool] = None,
        max_patches: Optional[int] = None,
        patch_size: Optional[Dict[str, int]] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        data_format: ChannelDimension = ChannelDimension.FIRST,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
        **kwargs,
    ) -> BatchFeature:
        """
        Preprocess an image or batch of images. The processor first computes the maximum possible number of
        aspect-ratio preserving patches of size `patch_size` that can be extracted from the image and resizes the
        image accordingly. The resulting sequence of flattened patches is then padded with zeros up to `max_patches`
        entries. Before extracting the patches the images are standardized following the tensorflow implementation
        of `per_image_standardization`
        (https://www.tensorflow.org/api_docs/python/tf/image/per_image_standardization).

        Args:
            images (`ImageInput`):
                Image to preprocess. Expects a single or batch of images.
            do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
                Whether to convert the image to RGB.
            do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
                Whether to normalize the image.
            max_patches (`int`, *optional*, defaults to `self.max_patches`):
                Maximum number of patches to extract.
            patch_size (`dict`, *optional*, defaults to `self.patch_size`):
                Dictionary containing the patch height and width.
            return_tensors (`str` or `TensorType`, *optional*):
                The type of tensors to return. Can be one of:
                    - Unset: Return a list of `np.ndarray`.
                    - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
                    - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
                    - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
                    - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
            data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
                Accepted for signature compatibility only. It is not used by this method: the outputs are flattened
                patches, not channel-ordered images.
            input_data_format (`ChannelDimension` or `str`, *optional*):
                The channel dimension format for the input image. If unset, the channel dimension format is inferred
                from the input image. Can be one of:
                    - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                    - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                    - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.

        Returns:
            [`BatchFeature`]: A batch with the keys `flattened_patches`, `attention_mask`, `width`, `height`, `rows`
            and `cols`, each holding one entry per input image.
        """
        do_normalize = do_normalize if do_normalize is not None else self.do_normalize
        do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb
        patch_size = patch_size if patch_size is not None else self.patch_size
        max_patches = max_patches if max_patches is not None else self.max_patches

        # NOTE: `data_format` is a named parameter, so it can never show up in `kwargs`; the former
        # `kwargs.get("data_format")` guard was unreachable and has been dropped.

        images = make_list_of_images(images)

        if not valid_images(images):
            raise ValueError(
                "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
                "torch.Tensor, tf.Tensor or jax.ndarray."
            )

        # PIL RGBA images are converted to RGB
        if do_convert_rgb:
            images = [convert_to_rgb(image) for image in images]

        # All transformations expect numpy arrays.
        images = [to_numpy_array(image) for image in images]

        if input_data_format is None:
            # We assume that all images have the same channel dimension format.
            input_data_format = infer_channel_dimension_format(images[0])

        flattened_patches, width, height, rows, cols, attention_masks = [], [], [], [], [], []
        for image in images:
            if do_normalize:
                image = self.normalize(image=image, input_data_format=input_data_format)

            # Resize, cut into patches and pad the patch sequence to `max_patches`.
            f, w, h, r, c = self.extract_flattened_patches(
                image=image,
                max_patches=max_patches,
                patch_size=patch_size,
                input_data_format=input_data_format,
            )
            flattened_patches.append(f)
            width.append(w)
            height.append(h)
            rows.append(r)
            cols.append(c)
            # Entries of the patch sequence whose values sum to zero are treated as padding.
            attention_masks.append((f.sum(axis=-1) != 0).astype(np.float32))

        encoded_outputs = BatchFeature(
            data={
                "flattened_patches": flattened_patches,
                "attention_mask": attention_masks,
                "width": width,
                "height": height,
                "rows": rows,
                "cols": cols,
            },
            tensor_type=return_tensors,
        )

        return encoded_outputs
|
||||
|
||||
|
||||
__all__ = ["Kosmos2_5ImageProcessor"]
|
2253
src/transformers/models/kosmos2_5/modeling_kosmos2_5.py
Normal file
2253
src/transformers/models/kosmos2_5/modeling_kosmos2_5.py
Normal file
File diff suppressed because it is too large
Load Diff
164
src/transformers/models/kosmos2_5/processing_kosmos2_5.py
Normal file
164
src/transformers/models/kosmos2_5/processing_kosmos2_5.py
Normal file
@ -0,0 +1,164 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2024 Microsoft Research and The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""
|
||||
Processor class for Kosmos2_5.
|
||||
"""
|
||||
|
||||
from typing import List, Optional, Union
|
||||
|
||||
from ...image_processing_utils import BatchFeature
|
||||
from ...image_utils import ImageInput
|
||||
from ...processing_utils import ImagesKwargs, ProcessingKwargs, ProcessorMixin, TextKwargs, Unpack
|
||||
from ...tokenization_utils_base import TextInput
|
||||
from ...utils import is_torch_available
|
||||
|
||||
|
||||
if is_torch_available():
|
||||
import torch
|
||||
|
||||
|
||||
# Image-related keyword arguments understood by `Kosmos2_5Processor`, on top of those declared by `ImagesKwargs`.
class Kosmos2_5ImagesKwargs(ImagesKwargs, total=False):
    # Upper bound on the number of patches extracted from each image.
    max_patches: Optional[int]
    # Number of `<s>` placeholder tokens inserted between `<image>` and `</image>` in the text prompt.
    num_image_tokens: Optional[int]
|
||||
|
||||
|
||||
# Keyword-argument schema passed to `_merge_kwargs` by `Kosmos2_5Processor.__call__`.
class Kosmos2_5ProcessorKwargs(ProcessingKwargs, total=False):
    text_kwargs: TextKwargs
    images_kwargs: Kosmos2_5ImagesKwargs
    # Default values, grouped by kwargs family.
    # NOTE(review): presumably applied by `_merge_kwargs` when the caller does not override them - confirm.
    _defaults = {
        "text_kwargs": {
            "padding": True,
            "truncation": True,
            "max_length": None,
            "stride": 0,
            "pad_to_multiple_of": None,
            "return_attention_mask": None,
        },
        "images_kwargs": {
            "max_patches": 4096,
            "num_image_tokens": 2048,
        },
        "common_kwargs": {"return_tensors": "pt"},
    }
|
||||
|
||||
|
||||
class Kosmos2_5Processor(ProcessorMixin):
    r"""
    Constructs a Kosmos2_5 processor which wraps a PreTrainedTokenizerFast and Kosmos2_5 image processor into a single
    processor.

    [`Kosmos2_5Processor`] offers all the functionalities of [`Kosmos2_5ImageProcessor`] and [`PreTrainedTokenizerFast`]. See
    the docstring of [`~Kosmos2_5Processor.__call__`] and [`~Kosmos2_5Processor.decode`] for more information.

    Args:
        image_processor (`Kosmos2_5ImageProcessor`):
            An instance of [`Kosmos2_5ImageProcessor`]. The image processor is a required input.
        tokenizer (`PreTrainedTokenizerFast`):
            An instance of [`PreTrainedTokenizerFast`]. The tokenizer is a required input.
    """

    attributes = ["image_processor", "tokenizer"]
    image_processor_class = "Kosmos2_5ImageProcessor"
    tokenizer_class = "PreTrainedTokenizerFast"

    def __init__(self, image_processor, tokenizer):
        tokenizer.return_token_type_ids = False
        super().__init__(image_processor, tokenizer)

    def __call__(
        self,
        images: Optional[ImageInput] = None,
        text: Optional[Union[TextInput, List[TextInput]]] = None,
        audio=None,
        videos=None,
        **kwargs: Unpack[Kosmos2_5ProcessorKwargs],
    ) -> BatchFeature:
        """
        This method uses [`Kosmos2_5ImageProcessor.preprocess`] method to prepare image(s) for the model, and
        [`PreTrainedTokenizerFast.__call__`] to prepare text for the model.

        Please refer to the docstring of the above two methods for more information.

        The rest of this documentation shows the arguments specific to `Kosmos2_5Processor`.

        Args:
            images (`ImageInput`):
                Image(s) to process. Required: a `ValueError` is raised when it is `None`.
            text (`str` or `List[str]`, *optional*):
                Text prompt(s). Each one is prefixed with `<s><image>`, `num_image_tokens` `<s>` placeholders and
                `</image>` before being tokenized.
            audio, videos:
                Not used by this processor.

        Returns:
            [`BatchFeature`]: The image processor outputs without `rows` and `cols`. When `text` is given, it also
            holds `input_ids`, `attention_mask` and `image_embeds_position_mask`.
        """
        if images is None and text is None:
            raise ValueError("You have to specify either images or text.")

        if images is None:
            raise ValueError("Kosmos2_5Processor requires images to be passed.")

        output_kwargs = self._merge_kwargs(
            Kosmos2_5ProcessorKwargs,
            tokenizer_init_kwargs=self.tokenizer.init_kwargs,
            **kwargs,
        )
        num_image_tokens = output_kwargs["images_kwargs"].setdefault("num_image_tokens", None)

        encoding = BatchFeature()

        # `images` is guaranteed to be set at this point (see the checks above).
        image_encoding = self.image_processor(images, **output_kwargs["images_kwargs"])
        image_encoding.pop("rows")
        image_encoding.pop("cols")
        encoding.update(image_encoding)

        if text is None:
            return encoding

        prompt = "<s><image>" + "<s>" * num_image_tokens + "</image>"
        if isinstance(text, str):
            text = [prompt + text]
        else:
            text = [prompt + t for t in text]
        text_encoding = self.tokenizer(text, **output_kwargs["text_kwargs"])

        # NOTE(review): `.shape` and the tensor built below assume the tokenizer returned PyTorch tensors - confirm
        # this holds for every supported `return_tensors` value.
        batch_size, seq_len = text_encoding.input_ids.shape
        # One value per prompt token: 0 for the leading `<s>`, -1 for `<image>` / `</image>`, 1 for each image
        # placeholder, then 0 for the remaining positions.
        # NOTE(review): assumes each of these special tokens maps to exactly one input id - confirm.
        image_embeds_position_mask = [0, -1] + [1] * num_image_tokens + [-1]
        image_embeds_position_mask += [0] * (seq_len - len(image_embeds_position_mask))
        image_embeds_position_mask = torch.LongTensor(image_embeds_position_mask).unsqueeze(0).repeat(batch_size, 1)

        # NOTE(review): "attention_mask" replaces the entry of the same name produced by the image processor -
        # confirm the patch-level mask is not needed downstream.
        encoding.update(
            {
                "input_ids": text_encoding.input_ids,
                "attention_mask": text_encoding.attention_mask,
                "image_embeds_position_mask": image_embeds_position_mask,
            }
        )

        return encoding

    def batch_decode(self, *args, **kwargs):
        """
        This method forwards all its arguments to the tokenizer's [`~PreTrainedTokenizer.batch_decode`].
        Please refer to the docstring of this method for more information.
        """
        return self.tokenizer.batch_decode(*args, **kwargs)

    def decode(self, *args, **kwargs):
        """
        This method forwards all its arguments to the tokenizer's [`~PreTrainedTokenizer.decode`]. Please
        refer to the docstring of this method for more information.
        """
        return self.tokenizer.decode(*args, **kwargs)

    @property
    def model_input_names(self):
        """Tokenizer input names followed by image processor input names, without duplicates."""
        tokenizer_input_names = self.tokenizer.model_input_names
        image_processor_input_names = self.image_processor.model_input_names
        # `dict.fromkeys` removes duplicates while keeping first-seen order.
        return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
|
||||
|
||||
|
||||
__all__ = ["Kosmos2_5Processor"]
|
@ -302,7 +302,7 @@ class Pix2StructVisionLayer(nn.Module):
|
||||
|
||||
|
||||
class Pix2StructVisionEncoder(nn.Module):
|
||||
def __init__(self, config: Pix2StructConfig) -> None:
|
||||
def __init__(self, config: Pix2StructVisionConfig) -> None:
|
||||
super().__init__()
|
||||
self.config = config
|
||||
self.layer = nn.ModuleList([Pix2StructVisionLayer(config) for _ in range(config.num_hidden_layers)])
|
||||
@ -531,7 +531,7 @@ class Pix2StructVisionModel(Pix2StructPreTrainedModel):
|
||||
supports_gradient_checkpointing = True
|
||||
_no_split_modules = ["Pix2StructVisionLayer"]
|
||||
|
||||
def __init__(self, config: Pix2StructConfig):
|
||||
def __init__(self, config: Pix2StructVisionConfig):
|
||||
super().__init__(config)
|
||||
self.config = config
|
||||
|
||||
|
@ -5255,6 +5255,27 @@ class Kosmos2PreTrainedModel(metaclass=DummyObject):
|
||||
requires_backends(self, ["torch"])
|
||||
|
||||
|
||||
class Kosmos2_5ForConditionalGeneration(metaclass=DummyObject):
    # Placeholder class: instantiating it calls `requires_backends` with the backends listed below.
    _backends = ["torch"]

    def __init__(self, *args, **kwargs):
        requires_backends(self, self._backends)
|
||||
|
||||
|
||||
class Kosmos2_5Model(metaclass=DummyObject):
    # Placeholder class: instantiating it calls `requires_backends` with the backends listed below.
    _backends = ["torch"]

    def __init__(self, *args, **kwargs):
        requires_backends(self, self._backends)
|
||||
|
||||
|
||||
class Kosmos2_5PreTrainedModel(metaclass=DummyObject):
    # Placeholder class: instantiating it calls `requires_backends` with the backends listed below.
    _backends = ["torch"]

    def __init__(self, *args, **kwargs):
        requires_backends(self, self._backends)
|
||||
|
||||
|
||||
class LayoutLMForMaskedLM(metaclass=DummyObject):
|
||||
_backends = ["torch"]
|
||||
|
||||
|
@ -331,6 +331,13 @@ class InstructBlipVideoImageProcessor(metaclass=DummyObject):
|
||||
requires_backends(self, ["vision"])
|
||||
|
||||
|
||||
class Kosmos2_5ImageProcessor(metaclass=DummyObject):
    # Placeholder class: instantiating it calls `requires_backends` with the backends listed below.
    _backends = ["vision"]

    def __init__(self, *args, **kwargs):
        requires_backends(self, self._backends)
|
||||
|
||||
|
||||
class LayoutLMv2FeatureExtractor(metaclass=DummyObject):
|
||||
_backends = ["vision"]
|
||||
|
||||
|
0
tests/models/kosmos2_5/__init__.py
Normal file
0
tests/models/kosmos2_5/__init__.py
Normal file
308
tests/models/kosmos2_5/test_image_processing_kosmos2_5.py
Normal file
308
tests/models/kosmos2_5/test_image_processing_kosmos2_5.py
Normal file
@ -0,0 +1,308 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2024 HuggingFace Inc.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
|
||||
import unittest
|
||||
|
||||
import numpy as np
|
||||
import requests
|
||||
|
||||
from transformers.testing_utils import require_torch, require_vision
|
||||
from transformers.utils import is_torch_available, is_vision_available
|
||||
|
||||
from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs
|
||||
|
||||
|
||||
if is_torch_available():
|
||||
import torch
|
||||
|
||||
if is_vision_available():
|
||||
from PIL import Image
|
||||
|
||||
from transformers import Kosmos2_5ImageProcessor
|
||||
|
||||
|
||||
class Kosmos2_5ImageProcessingTester:
    """
    Holds the settings shared by the Kosmos2_5 image processing tests and builds their inputs.

    Args:
        parent: The test case that owns this helper.
        batch_size, num_channels, min_resolution, max_resolution: Forwarded to `prepare_image_inputs`.
        image_size, size: Stored on the instance; `size` defaults to `{"height": 20, "width": 20}`.
        do_normalize, do_convert_rgb: Values placed in the image processor kwargs.
        patch_size: Patch dimensions, defaults to `{"height": 16, "width": 16}`.
    """

    def __init__(
        self,
        parent,
        batch_size=7,
        num_channels=3,
        image_size=18,
        min_resolution=30,
        max_resolution=400,
        size=None,
        do_normalize=True,
        do_convert_rgb=True,
        patch_size=None,
    ):
        size = size if size is not None else {"height": 20, "width": 20}
        self.parent = parent
        self.batch_size = batch_size
        self.num_channels = num_channels
        self.image_size = image_size
        self.min_resolution = min_resolution
        self.max_resolution = max_resolution
        self.size = size
        self.do_normalize = do_normalize
        self.do_convert_rgb = do_convert_rgb
        # Patch budgets the tests iterate over.
        self.max_patches = [512, 1024, 2048, 4096]
        self.patch_size = patch_size if patch_size is not None else {"height": 16, "width": 16}

    def prepare_image_processor_dict(self):
        """Return the keyword arguments used to build the image processor under test."""
        return {"do_normalize": self.do_normalize, "do_convert_rgb": self.do_convert_rgb}

    def prepare_dummy_image(self):
        """Download the reference test image and return it as an RGB PIL image."""
        img_url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/australia.jpg"
        # A timeout keeps the test run from hanging on an unresponsive server, and the status check reports an
        # HTTP failure directly instead of handing an error page to PIL.
        response = requests.get(img_url, stream=True, timeout=30)
        response.raise_for_status()
        raw_image = Image.open(response.raw).convert("RGB")
        return raw_image

    def prepare_image_inputs(self, equal_resolution=False, numpify=False, torchify=False):
        """Build a batch of test images with the shared helper, using this tester's settings."""
        return prepare_image_inputs(
            batch_size=self.batch_size,
            num_channels=self.num_channels,
            min_resolution=self.min_resolution,
            max_resolution=self.max_resolution,
            equal_resolution=equal_resolution,
            numpify=numpify,
            torchify=torchify,
        )
|
||||
|
||||
|
||||
@require_torch
@require_vision
class Kosmos2_5ImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
    """Shape and value checks for `Kosmos2_5ImageProcessor` on PIL, NumPy and PyTorch inputs."""

    image_processing_class = Kosmos2_5ImageProcessor if is_vision_available() else None

    def setUp(self):
        super().setUp()
        self.image_processor_tester = Kosmos2_5ImageProcessingTester(self)

    @property
    def image_processor_dict(self):
        """Keyword arguments used to instantiate the image processor under test."""
        return self.image_processor_tester.prepare_image_processor_dict()

    def _check_flattened_patches_shape(self, image_processor, image_inputs, **processor_kwargs):
        """Assert the `flattened_patches` shape for a single image and for the whole batch.

        For every value in `self.image_processor_tester.max_patches` the processor is called once on
        `image_inputs[0]` and once on the full list. The output must have shape
        `(n_images, max_patch, patch_height * patch_width * num_channels + 2)`.

        Args:
            image_processor: The image processor instance under test.
            image_inputs: Images produced by `self.image_processor_tester.prepare_image_inputs`.
            **processor_kwargs: Extra keyword arguments forwarded to every processor call
                (e.g. `input_data_format`).
        """
        tester = self.image_processor_tester
        # NOTE(review): the "+ 2" presumably accounts for two positional features per patch - confirm
        # against the image processor implementation.
        expected_hidden_dim = tester.patch_size["height"] * tester.patch_size["width"] * tester.num_channels + 2

        for max_patch in tester.max_patches:
            # Not batched input
            single_patches = image_processor(
                image_inputs[0], return_tensors="pt", max_patches=max_patch, **processor_kwargs
            ).flattened_patches
            self.assertEqual(single_patches.shape, (1, max_patch, expected_hidden_dim))

            # Batched input
            batched_patches = image_processor(
                image_inputs, return_tensors="pt", max_patches=max_patch, **processor_kwargs
            ).flattened_patches
            self.assertEqual(batched_patches.shape, (tester.batch_size, max_patch, expected_hidden_dim))

    def test_image_processor_properties(self):
        """The processor must expose the `do_normalize` and `do_convert_rgb` attributes."""
        image_processor = self.image_processing_class(**self.image_processor_dict)
        self.assertTrue(hasattr(image_processor, "do_normalize"))
        self.assertTrue(hasattr(image_processor, "do_convert_rgb"))

    def test_expected_patches(self):
        """The mean of the flattened patches of the reference image must match the recorded value."""
        dummy_image = self.image_processor_tester.prepare_dummy_image()

        image_processor = self.image_processing_class(**self.image_processor_dict)
        max_patch = 2048

        inputs = image_processor(dummy_image, return_tensors="pt", max_patches=max_patch)
        self.assertTrue(torch.allclose(inputs.flattened_patches.mean(), torch.tensor(0.0606), atol=1e-3, rtol=1e-3))

    def test_call_pil(self):
        """PIL images of unequal resolution produce correctly shaped flattened patches."""
        image_processor = self.image_processing_class(**self.image_processor_dict)
        image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False)
        for image in image_inputs:
            self.assertIsInstance(image, Image.Image)

        self._check_flattened_patches_shape(image_processor, image_inputs)

    def test_call_numpy(self):
        """NumPy arrays of unequal resolution produce correctly shaped flattened patches."""
        image_processor = self.image_processing_class(**self.image_processor_dict)
        image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, numpify=True)
        for image in image_inputs:
            self.assertIsInstance(image, np.ndarray)

        self._check_flattened_patches_shape(image_processor, image_inputs)

    def test_call_numpy_4_channels(self):
        """4-channel NumPy arrays are accepted when `input_data_format="channels_last"` is given."""
        image_processor = self.image_processor_dict and self.image_processing_class(**self.image_processor_dict)
        tester = self.image_processor_tester
        previous_num_channels = tester.num_channels
        tester.num_channels = 4
        try:
            image_inputs = tester.prepare_image_inputs(equal_resolution=False, numpify=True)
            for image in image_inputs:
                self.assertIsInstance(image, np.ndarray)

            self._check_flattened_patches_shape(image_processor, image_inputs, input_data_format="channels_last")
        finally:
            # Restore the tester even when an assertion above fails, so the temporary override
            # can never leak into later use of this tester.
            tester.num_channels = previous_num_channels

    def test_call_pytorch(self):
        """PyTorch tensors of unequal resolution produce correctly shaped flattened patches."""
        image_processor = self.image_processing_class(**self.image_processor_dict)
        image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, torchify=True)
        for image in image_inputs:
            self.assertIsInstance(image, torch.Tensor)

        self._check_flattened_patches_shape(image_processor, image_inputs)
|
||||
|
||||
|
||||
@require_torch
@require_vision
class Kosmos2_5ImageProcessingTestFourChannels(ImageProcessingTestMixin, unittest.TestCase):
    """Checks for `Kosmos2_5ImageProcessor` when the tester is configured with 4-channel images."""

    image_processing_class = Kosmos2_5ImageProcessor if is_vision_available() else None

    def setUp(self):
        super().setUp()
        self.image_processor_tester = Kosmos2_5ImageProcessingTester(self, num_channels=4)
        self.expected_encoded_image_num_channels = 3

    @property
    def image_processor_dict(self):
        """Keyword arguments used to instantiate the image processor under test."""
        return self.image_processor_tester.prepare_image_processor_dict()

    def test_image_processor_properties(self):
        """The processor must expose the `do_normalize` and `do_convert_rgb` attributes."""
        image_processor = self.image_processing_class(**self.image_processor_dict)
        self.assertTrue(hasattr(image_processor, "do_normalize"))
        self.assertTrue(hasattr(image_processor, "do_convert_rgb"))

    def test_call_pil(self):
        """4-channel PIL inputs must be encoded with one channel fewer than the tester generates."""
        image_processor = self.image_processing_class(**self.image_processor_dict)
        tester = self.image_processor_tester

        image_inputs = tester.prepare_image_inputs(equal_resolution=False)
        for image in image_inputs:
            self.assertIsInstance(image, Image.Image)

        # One channel fewer than the tester generates (4 -> 3).
        # NOTE(review): presumably the alpha channel is dropped by the RGB conversion - confirm.
        expected_hidden_dim = tester.patch_size["height"] * tester.patch_size["width"] * (tester.num_channels - 1) + 2

        for max_patch in tester.max_patches:
            # Not batched input
            encoded_images = image_processor(
                image_inputs[0], return_tensors="pt", max_patches=max_patch
            ).flattened_patches
            self.assertEqual(encoded_images.shape, (1, max_patch, expected_hidden_dim))

            # Batched input
            encoded_images = image_processor(
                image_inputs, return_tensors="pt", max_patches=max_patch
            ).flattened_patches
            self.assertEqual(encoded_images.shape, (tester.batch_size, max_patch, expected_hidden_dim))

    @unittest.skip(reason="Kosmos2_5ImageProcessor does not support 4 channels yet")  # FIXME Amy
    def test_call_numpy(self):
        return super().test_call_numpy()

    # Each skipped override delegates to the test of the same name, so removing the skip
    # decorator re-enables the matching inherited test.
    @unittest.skip(reason="Kosmos2_5ImageProcessor does not support 4 channels yet")  # FIXME Amy
    def test_call_pytorch(self):
        return super().test_call_pytorch()

    @unittest.skip(
        reason="Kosmos2_5ImageProcessor does not treat numpy and PIL 4 channel images consistently"
    )  # FIXME Amy
    def test_call_numpy_4_channels(self):
        return super().test_call_numpy_4_channels()
|
848
tests/models/kosmos2_5/test_modeling_kosmos2_5.py
Normal file
848
tests/models/kosmos2_5/test_modeling_kosmos2_5.py
Normal file
@ -0,0 +1,848 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2024 Microsoft Research and The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""Testing suite for the PyTorch KOSMOS-2.5 model."""
|
||||
|
||||
import copy
|
||||
import inspect
|
||||
import os
|
||||
import tempfile
|
||||
import unittest
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
import requests
|
||||
from parameterized import parameterized
|
||||
|
||||
from transformers import AutoProcessor, Kosmos2_5Config
|
||||
from transformers.models.kosmos2_5.configuration_kosmos2_5 import (
|
||||
Kosmos2_5TextConfig,
|
||||
Kosmos2_5VisionConfig,
|
||||
)
|
||||
from transformers.testing_utils import (
|
||||
require_flash_attn,
|
||||
require_torch,
|
||||
require_torch_gpu,
|
||||
require_torch_sdpa,
|
||||
require_vision,
|
||||
slow,
|
||||
torch_device,
|
||||
)
|
||||
from transformers.utils import is_torch_available, is_vision_available
|
||||
|
||||
from ...generation.test_utils import GenerationTesterMixin
|
||||
from ...test_configuration_common import ConfigTester
|
||||
from ...test_modeling_common import (
|
||||
ModelTesterMixin,
|
||||
_config_zero_init,
|
||||
floats_tensor,
|
||||
ids_tensor,
|
||||
random_attention_mask,
|
||||
)
|
||||
from ...test_pipeline_mixin import PipelineTesterMixin
|
||||
|
||||
|
||||
if is_torch_available():
|
||||
import torch
|
||||
|
||||
from transformers import Kosmos2_5ForConditionalGeneration, Kosmos2_5Model
|
||||
|
||||
|
||||
if is_vision_available():
|
||||
from PIL import Image
|
||||
|
||||
|
||||
class Kosmos2_5VisionModelTester:
    """Creates small `Kosmos2_5VisionConfig` objects and matching random `flattened_patches` inputs."""

    def __init__(
        self,
        parent,
        batch_size=6,
        image_size=32,
        patch_size=4,
        num_channels=3,
        is_training=True,
        hidden_size=32,
        d_ff=64,
        num_hidden_layers=2,
        num_attention_heads=4,
        dropout=0,
        attention_dropout=0,
        scope=None,
    ):
        # Test harness bookkeeping.
        self.parent = parent
        self.scope = scope
        self.is_training = is_training
        self.batch_size = batch_size

        # Image geometry.
        self.image_size = image_size
        self.patch_size = patch_size
        self.num_channels = num_channels
        # Size of one flattened patch: patch_size x patch_size pixels over all channels.
        self.patch_embed_hidden_size = num_channels * patch_size * patch_size

        # Model dimensions.
        self.hidden_size = hidden_size
        self.d_ff = d_ff
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.dropout = dropout
        self.attention_dropout = attention_dropout

        # Sequence length is the number of square patches per image plus one
        # (the original test follows the ViT "+1 for [CLS]" convention).
        patches_per_side = image_size // patch_size
        self.seq_length = patches_per_side**2 + 1

    def prepare_config_and_inputs(self):
        """Return `(config, flattened_patches)` with random patches of width `patch_embed_hidden_size + 2`."""
        patches_shape = [self.batch_size, self.seq_length, self.patch_embed_hidden_size + 2]
        flattened_patches = floats_tensor(patches_shape)
        return self.get_config(), flattened_patches

    def get_config(self):
        """Build a `Kosmos2_5VisionConfig` from the values stored on this tester."""
        config_kwargs = {
            "image_size": self.image_size,
            "patch_size": self.patch_size,
            "num_channels": self.num_channels,
            "hidden_size": self.hidden_size,
            "d_ff": self.d_ff,
            "num_hidden_layers": self.num_hidden_layers,
            "num_attention_heads": self.num_attention_heads,
            "patch_embed_hidden_size": self.patch_embed_hidden_size,
            "dropout": self.dropout,
            "attention_dropout": self.attention_dropout,
        }
        return Kosmos2_5VisionConfig(**config_kwargs)

    def prepare_config_and_inputs_for_common(self):
        """Return `(config, inputs_dict)` where `inputs_dict` holds the `flattened_patches` tensor."""
        config, flattened_patches = self.prepare_config_and_inputs()
        return config, {"flattened_patches": flattened_patches}
|
||||
|
||||
|
||||
class Kosmos2_5TextModelTester:
    """Creates small `Kosmos2_5TextConfig` objects and matching random token inputs."""

    def __init__(
        self,
        parent,
        batch_size=6,
        seq_length=7,
        is_training=True,
        use_input_mask=True,
        use_labels=True,
        vocab_size=99,
        hidden_size=32,
        ffn_dim=64,
        num_hidden_layers=2,
        num_attention_heads=4,
        dropout=0,
        attention_dropout=0,
        max_position_embeddings=512,
        scope=None,
    ):
        # Test harness bookkeeping.
        self.parent = parent
        self.scope = scope
        self.is_training = is_training
        self.use_input_mask = use_input_mask
        self.use_labels = use_labels

        # Input geometry.
        self.batch_size = batch_size
        self.seq_length = seq_length
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings

        # Model dimensions.
        self.hidden_size = hidden_size
        self.ffn_dim = ffn_dim
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.dropout = dropout
        self.attention_dropout = attention_dropout

    def prepare_config_and_inputs(self):
        """Return `(config, input_ids, input_mask)`.

        `input_mask` is `None` when `use_input_mask` is false. Otherwise each row is overwritten so that
        it consists of ones up to a random cut point and zeros from there on.
        """
        input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)

        input_mask = random_attention_mask([self.batch_size, self.seq_length]) if self.use_input_mask else None

        if input_mask is not None:
            num_rows, num_cols = input_mask.shape
            cut_points = np.random.randint(1, num_cols - 1, size=(num_rows,))
            for row, cut in enumerate(cut_points):
                input_mask[row, :cut] = 1
                input_mask[row, cut:] = 0

        return self.get_config(), input_ids, input_mask

    def get_config(self):
        """Build a `Kosmos2_5TextConfig` from the values stored on this tester."""
        config_kwargs = {
            "vocab_size": self.vocab_size,
            "embed_dim": self.hidden_size,
            "ffn_dim": self.ffn_dim,
            "layers": self.num_hidden_layers,
            "attention_heads": self.num_attention_heads,
            "dropout": self.dropout,
            "attention_dropout": self.attention_dropout,
            "max_position_embeddings": self.max_position_embeddings,
        }
        return Kosmos2_5TextConfig(**config_kwargs)

    def prepare_config_and_inputs_for_common(self):
        """Return `(config, inputs_dict)` with `input_ids` and `attention_mask` entries."""
        config, input_ids, input_mask = self.prepare_config_and_inputs()
        return config, {"input_ids": input_ids, "attention_mask": input_mask}
|
||||
|
||||
|
||||
class Kosmos2_5ModelTester:
    """Combines the text and vision testers to build full `Kosmos2_5Config` objects and model inputs."""

    def __init__(
        self,
        parent,
        text_kwargs=None,
        vision_kwargs=None,
        latent_query_num=3,
        is_training=True,
    ):
        text_kwargs = {} if text_kwargs is None else text_kwargs
        vision_kwargs = {} if vision_kwargs is None else vision_kwargs

        self.parent = parent
        self.text_model_tester = Kosmos2_5TextModelTester(parent, **text_kwargs)
        self.vision_model_tester = Kosmos2_5VisionModelTester(parent, **vision_kwargs)
        # The batch size is mirrored here because the batching_equivalence test reads it from this tester.
        self.batch_size = self.text_model_tester.batch_size
        self.seq_length = self.text_model_tester.seq_length
        self.latent_query_num = latent_query_num
        self.is_training = is_training

    def prepare_config_and_inputs(self):
        """Return `(config, input_ids, attention_mask, image_embeds_position_mask, flattened_patches)`."""
        # The sub-configs returned by the sub-testers are discarded; `get_config` rebuilds them.
        _, input_ids, attention_mask = self.text_model_tester.prepare_config_and_inputs()
        _, flattened_patches = self.vision_model_tester.prepare_config_and_inputs()

        # Mark positions 1 .. latent_query_num (inclusive) of every row with ones.
        image_embeds_position_mask = torch.zeros_like(input_ids)
        first_image_position = 1
        image_embeds_position_mask[:, first_image_position : first_image_position + self.latent_query_num] = 1

        return (
            self.get_config(),
            input_ids,
            attention_mask,
            image_embeds_position_mask,
            flattened_patches,
        )

    def get_config(self):
        """Build a `Kosmos2_5Config` from the text and vision sub-configs, passed as plain dicts."""
        text_config_dict = self.text_model_tester.get_config().to_dict()
        vision_config_dict = self.vision_model_tester.get_config().to_dict()
        return Kosmos2_5Config(text_config_dict, vision_config_dict, latent_query_num=self.latent_query_num)

    def create_and_check_model(
        self,
        config,
        input_ids,
        attention_mask,
        image_embeds_position_mask,
        flattened_patches,
    ):
        """Run `Kosmos2_5Model` once and check the `last_hidden_state` and `image_embeds` shapes."""
        model = Kosmos2_5Model(config).to(torch_device).eval()
        with torch.no_grad():
            result = model(input_ids, flattened_patches, image_embeds_position_mask, attention_mask)

        text_tester = self.text_model_tester
        expected_text_shape = (text_tester.batch_size, text_tester.seq_length, text_tester.hidden_size)
        expected_image_shape = (text_tester.batch_size, self.latent_query_num, text_tester.hidden_size)
        self.parent.assertEqual(result.last_hidden_state.shape, expected_text_shape)
        self.parent.assertEqual(result.image_embeds.shape, expected_image_shape)

    def prepare_config_and_inputs_for_common(self):
        """Return `(config, inputs_dict)` with every model input stored under its keyword name."""
        (
            config,
            input_ids,
            attention_mask,
            image_embeds_position_mask,
            flattened_patches,
        ) = self.prepare_config_and_inputs()
        inputs_dict = {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "image_embeds_position_mask": image_embeds_position_mask,
            "flattened_patches": flattened_patches,
        }
        return config, inputs_dict
|
||||
|
||||
|
||||
@require_torch
|
||||
class Kosmos2_5ModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase):
|
||||
all_model_classes = (Kosmos2_5Model, Kosmos2_5ForConditionalGeneration) if is_torch_available() else ()
|
||||
all_generative_model_classes = (Kosmos2_5ForConditionalGeneration,) if is_torch_available() else ()
|
||||
pipeline_model_mapping = (
|
||||
{
|
||||
"feature-extraction": Kosmos2_5Model,
|
||||
"image-to-text": Kosmos2_5ForConditionalGeneration,
|
||||
}
|
||||
if is_torch_available()
|
||||
else {}
|
||||
)
|
||||
fx_compatible = False
|
||||
test_head_masking = False
|
||||
test_pruning = False
|
||||
test_resize_embeddings = False
|
||||
test_attention_outputs = False
|
||||
_is_composite = True
|
||||
|
||||
# TODO: `image-to-text` pipeline for this model needs Processor.
|
||||
def is_pipeline_test_to_skip(
    self,
    pipeline_test_casse_name,
    config_class,
    model_architecture,
    tokenizer_name,
    processor_name,
):
    """Tell the pipeline test mixin whether a pipeline test case must be skipped for this model.

    Only the test case name is inspected; the other arguments are ignored.

    Returns:
        `True` for `"ImageToTextPipelineTests"`, `False` for every other name.
    """
    skipped_pipeline_tests = ("ImageToTextPipelineTests",)
    return pipeline_test_casse_name in skipped_pipeline_tests
|
||||
|
||||
def _prepare_for_class(self, inputs_dict, model_class, return_labels=False):
    """Return a deep copy of `inputs_dict` adapted to `model_class`.

    For `Kosmos2_5ForConditionalGeneration` with `return_labels=True`, all-zero `labels` are added.
    For both Kosmos-2.5 model classes the text inputs are replaced by deterministic ones:
    `input_ids` become `0 .. seq_length - 1` modulo the vocabulary size, `attention_mask` is all
    ones, and `image_embeds_position_mask` marks the first `latent_query_num` positions.
    """
    prepared = copy.deepcopy(inputs_dict)
    text_tester = self.model_tester.text_model_tester
    class_name = model_class.__name__

    if return_labels and class_name == "Kosmos2_5ForConditionalGeneration":
        labels_shape = (text_tester.batch_size, text_tester.seq_length)
        prepared["labels"] = torch.zeros(labels_shape, dtype=torch.long, device=torch_device)

    if class_name in ("Kosmos2_5Model", "Kosmos2_5ForConditionalGeneration"):
        batch_size, _ = prepared["input_ids"].shape
        seq_len = text_tester.seq_length

        token_ids = torch.arange(seq_len, device=torch_device).unsqueeze(0).expand(batch_size, seq_len)
        prepared["input_ids"] = token_ids % text_tester.vocab_size
        prepared["attention_mask"] = torch.ones((batch_size, seq_len), device=torch_device)

        position_mask = torch.zeros((batch_size, seq_len), device=torch_device)
        position_mask[:, : self.model_tester.latent_query_num] = 1
        prepared["image_embeds_position_mask"] = position_mask

    return prepared
|
||||
|
||||
def setUp(self):
    """Create the model tester and the config tester used by the tests of this class."""
    model_tester = Kosmos2_5ModelTester(self)
    self.model_tester = model_tester
    self.config_tester = ConfigTester(self, config_class=Kosmos2_5Config, hidden_size=37)
|
||||
|
||||
# overwrite from common to skip `image_to_text_projection.latent_query`
|
||||
def test_initialization(self):
    """With a zero-init config, every trainable parameter must have a mean of exactly 0.0 or 1.0.

    `image_to_text_projection.latent_query` is excluded from the check.
    """
    config, _ = self.model_tester.prepare_config_and_inputs_for_common()
    zero_init_config = _config_zero_init(config)
    # The original code use `nn.Parameter(torch.randn(...))` for this one, so the check cannot pass.
    excluded_parameter = "image_to_text_projection.latent_query"

    for model_class in self.all_model_classes:
        model = model_class(config=zero_init_config)
        for name, param in model.named_parameters():
            if not param.requires_grad or name == excluded_parameter:
                continue
            # Round to nine decimals so tiny floating point noise does not break the membership test.
            rounded_mean = ((param.data.mean() * 1e9).round() / 1e9).item()
            self.assertIn(
                rounded_mean,
                [0.0, 1.0],
                msg=f"Parameter {name} of model {model_class} seems not properly initialized",
            )
|
||||
|
||||
def test_model(self):
    """Run the model tester's forward-pass shape check on freshly prepared inputs."""
    tester = self.model_tester
    tester.create_and_check_model(*tester.prepare_config_and_inputs())
|
||||
|
||||
def test_forward_signature(self):
    """The first parameter of every model's `forward` must be `input_ids`."""
    config, _ = self.model_tester.prepare_config_and_inputs_for_common()

    for model_class in self.all_model_classes:
        model = model_class(config)
        # `signature.parameters` is ordered, so the position of each argument name is deterministic.
        forward_parameters = inspect.signature(model.forward).parameters
        leading_arg_names = list(forward_parameters)[:1]
        self.assertListEqual(leading_arg_names, ["input_ids"])
|
||||
|
||||
def test_load_save_without_tied_weights(self):
    """Saving and reloading with untied word embeddings must round-trip every tensor with no missing keys."""
    config, _ = self.model_tester.prepare_config_and_inputs_for_common()
    config.text_config.tie_word_embeddings = False

    for model_class in self.all_model_classes:
        model = model_class(config)
        with tempfile.TemporaryDirectory() as save_dir:
            model.save_pretrained(save_dir)
            reloaded_model, loading_info = model_class.from_pretrained(save_dir, output_loading_info=True)

            # Every saved tensor must come back under the same key with the same values.
            reloaded_state = reloaded_model.state_dict()
            for key, tensor in model.state_dict().items():
                self.assertIn(key, reloaded_state, f"Key {key} is missing from reloaded")
                torch.testing.assert_close(
                    tensor,
                    reloaded_state[key],
                    msg=lambda details: f"{model_class.__name__}: Tensor {key}: {details}",
                )

            # Loading must not have reported any missing weights.
            self.assertEqual(loading_info["missing_keys"], [])
|
||||
|
||||
# overwrite from common in order to use `self.model_tester.text_model_tester.num_hidden_layers`
|
||||
def test_hidden_states_output(self):
    """Hidden states must be returned whether requested through the inputs or through the config.

    The expected layer count and shapes are taken from the text model tester.
    """
    text_tester = self.model_tester.text_model_tester

    def _run_and_check(inputs, model_config, model_cls):
        model = model_cls(model_config)
        model.to(torch_device)
        model.eval()

        with torch.no_grad():
            outputs = model(**self._prepare_for_class(inputs, model_cls))
        hidden_states = outputs.hidden_states

        # Default expectation: one entry per text layer plus one extra.
        expected_num_layers = getattr(
            self.model_tester,
            "expected_num_hidden_layers",
            text_tester.num_hidden_layers + 1,
        )
        self.assertEqual(len(hidden_states), expected_num_layers)

        trailing_dims = list(hidden_states[0].shape[-2:])
        self.assertListEqual(trailing_dims, [text_tester.seq_length, text_tester.hidden_size])

    config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

    for model_class in self.all_model_classes:
        # First request the hidden states through the call arguments ...
        inputs_dict["output_hidden_states"] = True
        _run_and_check(inputs_dict, config, model_class)

        # ... then check that requesting them through the config works as well.
        del inputs_dict["output_hidden_states"]
        config.output_hidden_states = True
        _run_and_check(inputs_dict, config, model_class)
|
||||
|
||||
# overwrite from common in order to use `config.text_config.vocab_size` instead of `config.vocab_size`
|
||||
def test_tie_model_weights(self):
    """Resizing the token embeddings of a tied model must not change its number of parameters.

    Overridden from the common tests in order to use `config.text_config.vocab_size` instead of
    `config.vocab_size`. Only the parameter count is compared; the values of the embedding and
    decoding weights are not compared here.
    """
    if not self.test_torchscript:
        self.skipTest(reason="test_torchscript is set to False")

    config, _ = self.model_tester.prepare_config_and_inputs_for_common()

    for model_class in self.all_model_classes:
        # Models without output embeddings are skipped; the probe model is built with torchscript on.
        config.torchscript = True
        model_not_tied = model_class(config)
        if model_not_tied.get_output_embeddings() is None:
            continue

        config_tied = copy.deepcopy(config)
        config_tied.torchscript = False
        model_tied = model_class(config_tied)
        params_tied = list(model_tied.parameters())

        # Check that after resize they remain tied.
        model_tied.resize_token_embeddings(config.text_config.vocab_size + 10)
        params_tied_2 = list(model_tied.parameters())
        self.assertEqual(len(params_tied_2), len(params_tied))
|
||||
|
||||
@slow
def test_model_from_pretrained(self):
    """The `microsoft/kosmos-2.5` checkpoint must load into `Kosmos2_5Model`."""
    checkpoint = "microsoft/kosmos-2.5"
    self.assertIsNotNone(Kosmos2_5Model.from_pretrained(checkpoint))
|
||||
|
||||
@unittest.skip(reason="Does not work on the tiny model as we keep hitting edge cases.")
|
||||
def test_model_parallelism(self):
|
||||
super().test_model_parallelism()
|
||||
|
||||
# TODO: ydshieh
|
||||
@require_torch_gpu
|
||||
@pytest.mark.flash_attn_test
|
||||
@slow
|
||||
@unittest.skip(reason="kosmos-2.5 flash attention does not support right padding")
|
||||
def test_flash_attn_2_inference_equivalence_right_padding(self):
|
||||
pass
|
||||
|
||||
# TODO: ydshieh
|
||||
@require_torch_gpu
|
||||
@pytest.mark.flash_attn_test
|
||||
@slow
|
||||
@unittest.skip(reason="kosmos-2.5 test : the dummy inputs should be tweaked: dummy_input = inputs_dict")
|
||||
def test_flash_attn_2_inference_equivalence(self):
|
||||
pass
|
||||
|
||||
# TODO: ydshieh
|
||||
@require_torch_sdpa
|
||||
@require_torch_gpu
|
||||
@slow
|
||||
@unittest.skip(reason="_update_causal_mask is not implemented yet which fails this test")
|
||||
def test_sdpa_can_dispatch_on_flash(self):
|
||||
pass
|
||||
|
||||
# TODO: ydshieh
|
||||
@unittest.skip(reason="doesn't support padding yet")
|
||||
def test_eager_matches_sdpa_inference_1_bfloat16(self):
|
||||
pass
|
||||
|
||||
# TODO: ydshieh
|
||||
@unittest.skip(reason=" the model hasn't been added to auto class")
|
||||
def test_flash_attn_2_from_config(self):
|
||||
pass
|
||||
|
||||
@unittest.skip("This test is currently not well designed for multimodal model (float type as an input).")
|
||||
def test_flash_attn_2_fp32_ln(self):
|
||||
pass
|
||||
|
||||
@unittest.skip("This test is currently not well designed for multimodal model (float type as an input).")
|
||||
def test_flash_attention_2_padding_matches_padding_free_with_position_ids(self):
|
||||
pass
|
||||
|
||||
@unittest.skip("Kosmos 2.5 is multimodel and has specific input shapes.")
|
||||
def test_flash_attn_2_generate_reuse_cache(self):
|
||||
pass
|
||||
|
||||
@pytest.mark.generate
|
||||
@parameterized.expand([("greedy", 1), ("beam search", 2)])
|
||||
@unittest.skip(
|
||||
"KOSMOS-2.5 doesn't support inputs embeds. The test isn't skipped by checking input args because KOSMOS-2 has `generate()` overwritten",
|
||||
)
|
||||
def test_generate_from_inputs_embeds(self):
|
||||
pass
|
||||
|
||||
# TODO: ydshieh
|
||||
@pytest.mark.generate
|
||||
@unittest.skip(
|
||||
"Kosmos2_5ForConditionalGeneration returns `vision_model_output` which is currently not working with `stack_model_outputs`",
|
||||
)
|
||||
def test_beam_search_low_memory(self):
|
||||
pass
|
||||
|
||||
@pytest.mark.generate
def test_left_padding_compatibility(self):
    """Left-padding the text inputs must not change the logits of the last position.

    Overridden from the common generation tests because Kosmos-2.5 also needs `flattened_patches`
    and an `image_embeds_position_mask` that is shifted by the amount of left padding.
    """
    # Number of pad tokens prepended to every row. It drives both the padding tensor and the
    # shift of the image position mask, so it is defined once.
    left_pad_length = 32

    def _prepare_model_kwargs(input_ids, attention_mask, pad_size, signature):
        """Build the forward kwargs the model accepts, shifting image positions by `pad_size`."""
        model_kwargs = {"input_ids": input_ids, "attention_mask": attention_mask}
        if "position_ids" in signature:
            position_ids = torch.cumsum(attention_mask, dim=-1) - 1
            position_ids.masked_fill_(attention_mask == 0, 1)
            model_kwargs["position_ids"] = position_ids
        if "cache_position" in signature:
            model_kwargs["cache_position"] = torch.arange(input_ids.shape[-1], device=torch_device)
        if "image_embeds_position_mask" in signature:
            image_embeds_position_mask = torch.zeros_like(input_ids)
            first_image_position = pad_size + 1
            image_embeds_position_mask[
                :, first_image_position : first_image_position + self.model_tester.latent_query_num
            ] = 1
            model_kwargs["image_embeds_position_mask"] = image_embeds_position_mask
        return model_kwargs

    for model_class in self.all_generative_model_classes:
        config, inputs_dict = self.prepare_config_and_inputs_for_generate()
        input_ids = inputs_dict["input_ids"]
        flattened_patches = inputs_dict["flattened_patches"]
        attention_mask = inputs_dict.get("attention_mask")
        if attention_mask is None:
            attention_mask = torch.ones_like(input_ids)

        model = model_class(config).to(torch_device).eval()
        signature = inspect.signature(model.forward).parameters.keys()

        # no cache as some models require special cache classes to be init outside forward
        model.generation_config.use_cache = False

        # Without padding
        model_kwargs = _prepare_model_kwargs(input_ids, attention_mask, pad_size=0, signature=signature)
        next_logits_wo_padding = model(**model_kwargs, flattened_patches=flattened_patches).logits[:, -1, :]

        # With left-padding
        # can hardcode pad_token to be 0 as we'll do attn masking anyway
        text_config = config.get_text_config()
        pad_token_id = text_config.pad_token_id if text_config.pad_token_id is not None else 0
        padding_shape = (input_ids.shape[0], left_pad_length)
        padding = torch.ones(padding_shape, dtype=input_ids.dtype, device=torch_device) * pad_token_id
        padded_input_ids = torch.cat((padding, input_ids), dim=1)
        padded_attention_mask = torch.cat((torch.zeros_like(padding), attention_mask), dim=1)
        model_kwargs = _prepare_model_kwargs(
            padded_input_ids, padded_attention_mask, pad_size=left_pad_length, signature=signature
        )
        next_logits_with_padding = model(**model_kwargs, flattened_patches=flattened_patches).logits[:, -1, :]

        # They should result in very similar logits. Same tolerances as the former
        # `torch.allclose(..., atol=1e-3)` call (default rtol of 1e-5), but a failure now
        # reports how far apart the tensors are.
        torch.testing.assert_close(next_logits_wo_padding, next_logits_with_padding, atol=1e-3, rtol=1e-5)
|
||||
|
||||
def _create_and_check_torchscript(self, config, inputs_dict):
|
||||
if not self.test_torchscript:
|
||||
self.skipTest(reason="test_torchscript is set to False")
|
||||
|
||||
configs_no_init = _config_zero_init(config) # To be sure we have no Nan
|
||||
configs_no_init.torchscript = True
|
||||
for model_class in self.all_model_classes:
|
||||
model = model_class(config=configs_no_init)
|
||||
model.to(torch_device)
|
||||
model.eval()
|
||||
inputs = self._prepare_for_class(inputs_dict, model_class)
|
||||
|
||||
main_input_name = model_class.main_input_name
|
||||
|
||||
try:
|
||||
main_input = inputs[main_input_name]
|
||||
model(
|
||||
main_input,
|
||||
inputs["flattened_patches"],
|
||||
inputs["image_embeds_position_mask"],
|
||||
)
|
||||
traced_model = torch.jit.trace(
|
||||
model,
|
||||
(
|
||||
main_input,
|
||||
inputs["flattened_patches"],
|
||||
inputs["image_embeds_position_mask"],
|
||||
),
|
||||
)
|
||||
except RuntimeError:
|
||||
self.fail("Couldn't trace module.")
|
||||
|
||||
with tempfile.TemporaryDirectory() as tmp_dir_name:
|
||||
pt_file_name = os.path.join(tmp_dir_name, "traced_model.pt")
|
||||
|
||||
try:
|
||||
torch.jit.save(traced_model, pt_file_name)
|
||||
except Exception:
|
||||
self.fail("Couldn't save module.")
|
||||
|
||||
try:
|
||||
loaded_model = torch.jit.load(pt_file_name)
|
||||
except Exception:
|
||||
self.fail("Couldn't load module.")
|
||||
|
||||
model.to(torch_device)
|
||||
model.eval()
|
||||
|
||||
loaded_model.to(torch_device)
|
||||
loaded_model.eval()
|
||||
|
||||
model_state_dict = model.state_dict()
|
||||
loaded_model_state_dict = loaded_model.state_dict()
|
||||
|
||||
non_persistent_buffers = {}
|
||||
for key in loaded_model_state_dict.keys():
|
||||
if key not in model_state_dict.keys():
|
||||
non_persistent_buffers[key] = loaded_model_state_dict[key]
|
||||
|
||||
loaded_model_state_dict = {
|
||||
key: value for key, value in loaded_model_state_dict.items() if key not in non_persistent_buffers
|
||||
}
|
||||
|
||||
self.assertEqual(set(model_state_dict.keys()), set(loaded_model_state_dict.keys()))
|
||||
|
||||
model_buffers = list(model.buffers())
|
||||
for non_persistent_buffer in non_persistent_buffers.values():
|
||||
found_buffer = False
|
||||
for i, model_buffer in enumerate(model_buffers):
|
||||
if torch.equal(non_persistent_buffer, model_buffer):
|
||||
found_buffer = True
|
||||
break
|
||||
|
||||
self.assertTrue(found_buffer)
|
||||
model_buffers.pop(i)
|
||||
|
||||
models_equal = True
|
||||
for layer_name, p1 in model_state_dict.items():
|
||||
if layer_name in loaded_model_state_dict:
|
||||
p2 = loaded_model_state_dict[layer_name]
|
||||
if p1.data.ne(p2.data).sum() > 0:
|
||||
models_equal = False
|
||||
|
||||
self.assertTrue(models_equal)
|
||||
|
||||
# Avoid memory leak. Without this, each call increase RAM usage by ~20MB.
|
||||
# (Even with this call, there are still memory leak by ~0.04MB)
|
||||
self.clear_torch_jit_class_registry()
|
||||
|
||||
|
||||
@require_vision
@require_torch
@slow
class Kosmos2_5ModelIntegrationTest(unittest.TestCase):
    """End-to-end generation checks for the `microsoft/kosmos-2.5` checkpoint.

    Each test loads the checkpoint with one attention implementation, runs the `<ocr>` and
    `<md>` prompts on a receipt image and compares the decoded text with recorded outputs.
    """

    # This variable is used to determine which CUDA device are we using for our runners (A10 or T4)
    # Depending on the hardware we get different logits / generations
    cuda_compute_capability_major_version = None

    # Checkpoint and sample image shared by all tests of this class.
    _REPO = "microsoft/kosmos-2.5"
    _IMAGE_URL = "https://huggingface.co/microsoft/kosmos-2.5/resolve/main/receipt_00008.png"
    # Seconds to wait for the image download before giving up instead of hanging the run.
    _DOWNLOAD_TIMEOUT = 60

    @classmethod
    def setUpClass(cls):
        if is_torch_available() and torch.cuda.is_available():
            # 8 is for A100 / A10 and 7 for T4
            cls.cuda_compute_capability_major_version = torch.cuda.get_device_capability()[0]

    def _load_image(self):
        """Download the sample receipt image and open it with PIL."""
        response = requests.get(self._IMAGE_URL, stream=True, timeout=self._DOWNLOAD_TIMEOUT)
        # Fail with the HTTP error rather than with an obscure PIL error on an error page.
        response.raise_for_status()
        return Image.open(response.raw)

    def _load_model_and_processor(self, attn_implementation):
        """Load the bfloat16 checkpoint with the given attention implementation, plus its processor."""
        model = Kosmos2_5ForConditionalGeneration.from_pretrained(
            self._REPO,
            device_map=torch_device,
            torch_dtype=torch.bfloat16,
            attn_implementation=attn_implementation,
        )
        processor = AutoProcessor.from_pretrained(self._REPO)
        return model, processor

    def _expected_for_device(self, expectations):
        """Return the entry of `expectations` recorded for the current CUDA major version.

        Skips the test when nothing was recorded for this device (for instance when CUDA is
        unavailable and the version is `None`) instead of failing with a `KeyError`.
        """
        major_version = self.cuda_compute_capability_major_version
        if major_version not in expectations:
            self.skipTest(
                f"No expected output recorded for CUDA compute capability major version {major_version!r}"
            )
        return expectations[major_version]

    def run_example(self, prompt, image, model, processor):
        """Run `model.generate` on one prompt/image pair.

        Returns:
            A `(generated_ids, generated_text)` tuple, where `generated_text` is the output of
            `processor.batch_decode` with special tokens skipped.
        """
        inputs = processor(text=prompt, images=image, return_tensors="pt")
        # `height` and `width` are dropped before the remaining entries are forwarded to `generate`.
        inputs.pop("height")
        inputs.pop("width")
        inputs = {k: v.to(torch_device) if v is not None else None for k, v in inputs.items()}
        inputs["flattened_patches"] = inputs["flattened_patches"].to(model.dtype)

        generated_ids = model.generate(
            **inputs,
            max_new_tokens=1024,
        )
        generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)

        return generated_ids, generated_text

    def test_eager(self):
        EXPECTED_OCR_TEXT = {
            7: [
                "<bbox><x_53><y_573><x_69><y_606></bbox>1\n<bbox><x_79><y_573><x_464><y_611></bbox>[REG] BLACK SAKURA\n<bbox><x_690><y_569><x_810><y_606></bbox>45,455\n<bbox><x_53><y_614><x_69><y_648></bbox>1\n<bbox><x_79><y_614><x_468><y_651></bbox>COOKIE DOH SAUCES\n<bbox><x_788><y_609><x_812><y_642></bbox>0\n<bbox><x_50><y_658><x_69><y_693></bbox>1\n<bbox><x_79><y_658><x_358><y_693></bbox>NATA DE COCO\n<bbox><x_790><y_652><x_814><y_683></bbox>0\n<bbox><x_31><y_742><x_820><y_781></bbox>Sub Total 45,455\n<bbox><x_27><y_781><x_822><y_827></bbox>PB1 (10%) 4,545\n<bbox><x_27><y_826><x_824><y_872></bbox>Rounding 0\n<bbox><x_24><y_872><x_827><y_921></bbox>Total 50,000\n<bbox><x_17><y_1056><x_836><y_1108></bbox>Card Payment 50,000\n"
            ],
            8: [
                "<bbox><x_53><y_573><x_69><y_606></bbox>1\n<bbox><x_79><y_573><x_464><y_611></bbox>[REG] BLACK SAKURA\n<bbox><x_690><y_569><x_810><y_606></bbox>45,455\n<bbox><x_53><y_614><x_69><y_648></bbox>1\n<bbox><x_79><y_614><x_468><y_650></bbox>COOKIE DOH SAUCES\n<bbox><x_788><y_609><x_812><y_644></bbox>0\n<bbox><x_50><y_658><x_69><y_693></bbox>1\n<bbox><x_79><y_658><x_358><y_693></bbox>NATA DE COCO\n<bbox><x_790><y_652><x_814><y_687></bbox>0\n<bbox><x_31><y_742><x_820><y_781></bbox>Sub Total 45,455\n<bbox><x_27><y_781><x_822><y_827></bbox>PB1 (10%) 4,545\n<bbox><x_27><y_826><x_824><y_872></bbox>Rounding 0\n<bbox><x_24><y_872><x_827><y_921></bbox>Total 50,000\n<bbox><x_17><y_1056><x_836><y_1108></bbox>Card Payment 50,000\n"
            ],
        }
        EXPECTED_MD_TEXT = {
            7: [
                "- **1 \\[REG\\] BLACK SAKURA** 45,455\n- **1 COOKIE DOH SAUCES** 0\n- **1 NATA DE COCO** 0\n- **Sub Total** 45,455\n- **PB1 (10%)** 4,545\n- **Rounding** 0\n- **Total** **50,000**\n\nCard Payment 50,000"
            ],
            8: [
                "- **1 \\[REG\\] BLACK SAKURA** 45,455\n- **1 COOKIE DOH SAUCES** 0\n- **1 NATA DE COCO** 0\n- **Sub Total** 45,455\n- **PB1 (10%)** 4,545\n- **Rounding** 0\n- **Total** **50,000**\n\nCard Payment 50,000"
            ],
        }
        # Resolve the expectations first so unsupported devices skip before loading the checkpoint.
        expected_ocr = self._expected_for_device(EXPECTED_OCR_TEXT)
        expected_md = self._expected_for_device(EXPECTED_MD_TEXT)

        image = self._load_image()
        model, processor = self._load_model_and_processor("eager")

        _, generated_text = self.run_example("<ocr>", image, model, processor)
        self.assertListEqual(generated_text, expected_ocr)

        _, generated_text = self.run_example("<md>", image, model, processor)
        self.assertListEqual(generated_text, expected_md)

    def test_sdpa(self):
        EXPECTED_OCR_TEXT = {
            7: [
                "<bbox><x_53><y_573><x_69><y_606></bbox>1\n<bbox><x_79><y_573><x_464><y_611></bbox>[REG] BLACK SAKURA\n<bbox><x_690><y_569><x_810><y_606></bbox>45,455\n<bbox><x_53><y_614><x_69><y_648></bbox>1\n<bbox><x_79><y_614><x_468><y_651></bbox>COOKIE DOH SAUCES\n<bbox><x_788><y_609><x_812><y_642></bbox>0\n<bbox><x_50><y_658><x_69><y_693></bbox>1\n<bbox><x_79><y_658><x_358><y_693></bbox>NATA DE COCO\n<bbox><x_790><y_652><x_814><y_683></bbox>0\n<bbox><x_31><y_742><x_820><y_781></bbox>Sub Total 45,455\n<bbox><x_27><y_781><x_822><y_827></bbox>PB1 (10%) 4,545\n<bbox><x_27><y_826><x_824><y_872></bbox>Rounding 0\n<bbox><x_24><y_872><x_827><y_921></bbox>Total 50,000\n<bbox><x_17><y_1056><x_836><y_1108></bbox>Card Payment 50,000\n",
            ],
            8: [
                "<bbox><x_53><y_573><x_69><y_606></bbox>1\n<bbox><x_79><y_573><x_464><y_611></bbox>[REG] BLACK SAKURA\n<bbox><x_690><y_569><x_810><y_606></bbox>45,455\n<bbox><x_53><y_614><x_69><y_648></bbox>1\n<bbox><x_79><y_614><x_468><y_651></bbox>COOKIE DOH SAUCES\n<bbox><x_788><y_609><x_812><y_642></bbox>0\n<bbox><x_50><y_658><x_69><y_693></bbox>1\n<bbox><x_79><y_658><x_358><y_693></bbox>NATA DE COCO\n<bbox><x_790><y_652><x_814><y_683></bbox>0\n<bbox><x_31><y_742><x_820><y_781></bbox>Sub Total 45,455\n<bbox><x_27><y_781><x_822><y_827></bbox>PB1 (10%) 4,545\n<bbox><x_27><y_826><x_824><y_872></bbox>Rounding 0\n<bbox><x_24><y_872><x_827><y_921></bbox>Total 50,000\n<bbox><x_17><y_1056><x_836><y_1108></bbox>Card Payment 50,000\n"
            ],
        }
        EXPECTED_MD_TEXT = {
            7: [
                "- **1 \\[REG\\] BLACK SAKURA** 45,455\n- **1 COOKIE DOH SAUCES** 0\n- **1 NATA DE COCO** 0\n- **Sub Total** 45,455\n- **PB1 (10%)** 4,545\n- **Rounding** 0\n- **Total** **50,000**\n\nCard Payment 50,000"
            ],
            8: [
                "- **1 \\[REG\\] BLACK SAKURA** 45,455\n- **1 COOKIE DOH SAUCES** 0\n- **1 NATA DE COCO** 0\n- **Sub Total** 45,455\n- **PB1 (10%)** 4,545\n- **Rounding** 0\n- **Total** **50,000**\n\nCard Payment 50,000"
            ],
        }
        # Resolve the expectations first so unsupported devices skip before loading the checkpoint.
        expected_ocr = self._expected_for_device(EXPECTED_OCR_TEXT)
        expected_md = self._expected_for_device(EXPECTED_MD_TEXT)

        image = self._load_image()
        model, processor = self._load_model_and_processor("sdpa")

        _, generated_text = self.run_example("<ocr>", image, model, processor)
        self.assertListEqual(generated_text, expected_ocr)

        _, generated_text = self.run_example("<md>", image, model, processor)
        self.assertListEqual(generated_text, expected_md)

    @require_flash_attn
    @require_torch_gpu
    @pytest.mark.flash_attn_test
    @slow
    def test_FA2(self):
        image = self._load_image()
        model, processor = self._load_model_and_processor("flash_attention_2")

        _, generated_text = self.run_example("<ocr>", image, model, processor)
        EXPECTED_TEXT = [
            "<bbox><x_53><y_573><x_69><y_606></bbox>1\n<bbox><x_79><y_573><x_464><y_612></bbox>[REG] BLACK SAKURA\n<bbox><x_690><y_569><x_812><y_606></bbox>45,455\n<bbox><x_53><y_614><x_69><y_650></bbox>1\n<bbox><x_79><y_614><x_468><y_650></bbox>COOKIE DOH SAUCES\n<bbox><x_788><y_610><x_813><y_644></bbox>0\n<bbox><x_50><y_658><x_65><y_693></bbox>1\n<bbox><x_76><y_658><x_358><y_693></bbox>NATA DE COCO\n<bbox><x_790><y_652><x_815><y_687></bbox>0\n<bbox><x_31><y_742><x_822><y_781></bbox>Sub Total 45,455\n<bbox><x_27><y_780><x_822><y_827></bbox>PB1 (10%) 4,545\n<bbox><x_27><y_826><x_824><y_874></bbox>Rounding 0\n<bbox><x_24><y_872><x_827><y_921></bbox>Total 50,000\n<bbox><x_17><y_1056><x_835><y_1108></bbox>Card Payment 50,000\n"
        ]

        self.assertListEqual(generated_text, EXPECTED_TEXT)

        _, generated_text = self.run_example("<md>", image, model, processor)
        # A10 gives the 1st one, but A100 gives the 2nd one
        EXPECTED_TEXT = [
            "- **1 \\[REG\\] BLACK SAKURA** 45,455\n- **1 COOKIE DOH SAUCES** 0\n- **1 NATA DE COCO** 0\n\n<table>\n<thead>\n<tr>\n<th>\nSub Total\n</th>\n<th>\n45,455\n</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>\nPB1 (10%)\n</td>\n<td>\n4,545\n</td>\n</tr>\n<tr>\n<td>\nRounding\n</td>\n<td>\n0\n</td>\n</tr>\n<tr>\n<td>\n<strong>\nTotal\n</strong>\n</td>\n<td>\n<strong>\n50,000\n</strong>\n</td>\n</tr>\n</tbody>\n</table>\n\nCard Payment 50,000",
            "- **1 \\[REG\\] BLACK SAKURA** 45,455\n- **1 COOKIE DOH SAUCES** 0\n- **1 NATA DE COCO** 0\n- **Sub Total** 45,455\n- **PB1 (10%)** 4,545\n- **Rounding** 0\n- **Total** **50,000**\n",
        ]
        self.assertIn(generated_text[0], EXPECTED_TEXT)
|
391
tests/models/kosmos2_5/test_processor_kosmos2_5.py
Normal file
391
tests/models/kosmos2_5/test_processor_kosmos2_5.py
Normal file
@ -0,0 +1,391 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2024 Microsoft Research and The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import os
|
||||
import shutil
|
||||
import tempfile
|
||||
import unittest
|
||||
from tempfile import TemporaryDirectory
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
import requests
|
||||
|
||||
from transformers.testing_utils import (
|
||||
require_torch,
|
||||
require_vision,
|
||||
)
|
||||
from transformers.utils import is_vision_available
|
||||
|
||||
from ...test_processing_common import ProcessorTesterMixin
|
||||
|
||||
|
||||
if is_vision_available():
|
||||
from PIL import Image
|
||||
|
||||
from transformers import (
|
||||
AutoProcessor,
|
||||
AutoTokenizer,
|
||||
Kosmos2_5ImageProcessor,
|
||||
Kosmos2_5Processor,
|
||||
PreTrainedTokenizerFast,
|
||||
)
|
||||
|
||||
|
||||
@require_vision
class Kosmos2_5ProcessorTest(ProcessorTesterMixin, unittest.TestCase):
    """Tests for `Kosmos2_5Processor`, which pairs `Kosmos2_5ImageProcessor` with a tokenizer.

    Several tests are rewritten here because the processor returns `flattened_patches`
    instead of `pixel_values` and does not use `rescale_factor`.
    """

    processor_class = Kosmos2_5Processor
    images_input_name = "flattened_patches"

    def setUp(self):
        self.tmpdirname = tempfile.mkdtemp()
        image_processor = Kosmos2_5ImageProcessor()
        tokenizer = AutoTokenizer.from_pretrained("microsoft/kosmos-2.5")
        processor = Kosmos2_5Processor(image_processor, tokenizer)
        processor.save_pretrained(self.tmpdirname)

    def get_tokenizer(self, **kwargs):
        return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).tokenizer

    def get_image_processor(self, **kwargs):
        return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).image_processor

    def tearDown(self):
        shutil.rmtree(self.tmpdirname)

    def _build_typed_kwargs_processor(self, image_processor_kwargs=None, tokenizer_kwargs=None):
        """Build a processor from `get_component` parts for the kwargs tests.

        Skips the calling test when the processor class has no `image_processor` attribute
        or when `skip_processor_without_typed_kwargs` decides to skip it.
        """
        if "image_processor" not in self.processor_class.attributes:
            self.skipTest(f"image_processor attribute not present in {self.processor_class}")
        image_processor = self.get_component("image_processor", **(image_processor_kwargs or {}))
        tokenizer = self.get_component("tokenizer", **(tokenizer_kwargs or {}))

        processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
        self.skip_processor_without_typed_kwargs(processor)
        return processor

    def _assert_nested_kwargs_applied(self):
        """Call the processor with per-modality kwargs and check they reach both components."""
        processor = self._build_typed_kwargs_processor()

        input_str = self.prepare_text_inputs()
        image_input = self.prepare_image_inputs()

        # Define the kwargs for each modality
        all_kwargs = {
            "common_kwargs": {"return_tensors": "pt"},
            "images_kwargs": {"max_patches": 1024},
            "text_kwargs": {"padding": "max_length", "max_length": 76},
        }

        inputs = processor(text=input_str, images=image_input, **all_kwargs)

        self.assertEqual(inputs["flattened_patches"].shape[1], 1024)
        self.assertEqual(len(inputs["input_ids"][0]), 76)

    def test_image_procesor_load_save_reload(self):
        # make sure load from Hub repo. -> save -> reload locally work
        image_processor = Kosmos2_5ImageProcessor.from_pretrained("microsoft/kosmos-2.5")
        with TemporaryDirectory() as tmp_dir:
            image_processor.save_pretrained(tmp_dir)
            reloaded_image_processor = Kosmos2_5ImageProcessor.from_pretrained(tmp_dir)
            # unittest assertions (not bare `assert`) so the checks survive `python -O`
            # and report a diff on failure.
            self.assertEqual(image_processor.to_dict(), reloaded_image_processor.to_dict())
            self.assertEqual(image_processor.to_json_string(), reloaded_image_processor.to_json_string())

    def test_save_load_pretrained_additional_features(self):
        processor = Kosmos2_5Processor(tokenizer=self.get_tokenizer(), image_processor=self.get_image_processor())
        processor.save_pretrained(self.tmpdirname)

        tokenizer_add_kwargs = self.get_tokenizer(bos_token="(BOS)", eos_token="(EOS)")
        image_processor_add_kwargs = self.get_image_processor(do_normalize=False, padding_value=1.0)

        processor = Kosmos2_5Processor.from_pretrained(
            self.tmpdirname,
            bos_token="(BOS)",
            eos_token="(EOS)",
            do_normalize=False,
            padding_value=1.0,
        )

        self.assertEqual(processor.tokenizer.get_vocab(), tokenizer_add_kwargs.get_vocab())
        self.assertIsInstance(processor.tokenizer, PreTrainedTokenizerFast)

        self.assertEqual(
            processor.image_processor.to_json_string(),
            image_processor_add_kwargs.to_json_string(),
        )
        self.assertIsInstance(processor.image_processor, Kosmos2_5ImageProcessor)

    @unittest.skip(reason="kosmos-2.5 must have both image and text")
    def test_image_processor(self):
        pass

    @unittest.skip(reason="kosmos-2.5 must have both image and text")
    def test_tokenizer(self):
        pass

    def test_tokenizer_decode(self):
        image_processor = self.get_image_processor()
        tokenizer = self.get_tokenizer()

        processor = Kosmos2_5Processor(tokenizer=tokenizer, image_processor=image_processor)

        predicted_ids = [[1, 4, 5, 8, 1, 0, 8], [3, 4, 3, 1, 1, 8, 9]]

        decoded_processor = processor.batch_decode(predicted_ids)
        decoded_tok = tokenizer.batch_decode(predicted_ids)

        self.assertListEqual(decoded_tok, decoded_processor)

    def test_can_load_various_tokenizers(self):
        for checkpoint in ["microsoft/kosmos-2.5", "kirp/kosmos2_5"]:
            processor = AutoProcessor.from_pretrained(checkpoint)
            tokenizer = AutoTokenizer.from_pretrained(checkpoint)
            self.assertEqual(processor.tokenizer.__class__, tokenizer.__class__)

    @require_torch
    def test_model_input_names(self):
        image_processor = self.get_image_processor()
        tokenizer = self.get_tokenizer()

        processor = Kosmos2_5Processor(tokenizer=tokenizer, image_processor=image_processor)

        input_str = "This is a test"
        image_input = self.prepare_image_inputs()

        # both image and text
        inputs = processor(text=input_str, images=image_input)
        self.assertListEqual(
            list(inputs.keys()),
            [
                "flattened_patches",
                "attention_mask",
                "width",
                "height",
                "input_ids",
                "image_embeds_position_mask",
            ],
        )
        # test if it raises when no input is passed
        with pytest.raises(ValueError):
            processor()

    @require_torch
    @require_vision
    def test_image_processor_defaults_preserved_by_image_kwargs(self):
        # Rewrite as KOSMOS-2.5 processor return "flattened_patches" and not "pixel_values"
        processor = self._build_typed_kwargs_processor(
            image_processor_kwargs={"max_patches": 1024, "patch_size": {"height": 8, "width": 8}},
            tokenizer_kwargs={"max_length": 117, "padding": "max_length"},
        )

        input_str = self.prepare_text_inputs()
        image_input = self.prepare_image_inputs()

        inputs = processor(text=input_str, images=image_input)
        self.assertEqual(len(inputs["flattened_patches"][0][0]), 194)

    @require_torch
    @require_vision
    def test_kwargs_overrides_default_image_processor_kwargs(self):
        # Rewrite as KOSMOS-2.5 processor return "flattened_patches" and not "pixel_values"
        processor = self._build_typed_kwargs_processor(
            image_processor_kwargs={"max_patches": 4096},
            tokenizer_kwargs={"max_length": 117, "padding": "max_length"},
        )

        input_str = self.prepare_text_inputs()
        image_input = self.prepare_image_inputs()

        # The call-time `max_patches` must win over the value the image processor was built with.
        inputs = processor(text=input_str, images=image_input, max_patches=1024)
        self.assertEqual(len(inputs["flattened_patches"][0]), 1024)

    @require_torch
    @require_vision
    def test_unstructured_kwargs(self):
        # Rewrite as KOSMOS-2.5 processor doesn't use `rescale_factor`
        processor = self._build_typed_kwargs_processor()

        input_str = self.prepare_text_inputs()
        image_input = self.prepare_image_inputs()
        inputs = processor(
            text=input_str,
            images=image_input,
            return_tensors="pt",
            max_patches=1024,
            padding="max_length",
            max_length=76,
        )

        self.assertEqual(inputs["flattened_patches"].shape[1], 1024)
        self.assertEqual(len(inputs["input_ids"][0]), 76)

    @require_torch
    @require_vision
    def test_unstructured_kwargs_batched(self):
        # Rewrite as KOSMOS-2.5 processor doesn't use `rescale_factor`
        processor = self._build_typed_kwargs_processor()

        input_str = self.prepare_text_inputs(batch_size=2)
        image_input = self.prepare_image_inputs(batch_size=2)
        inputs = processor(
            text=input_str,
            images=image_input,
            return_tensors="pt",
            max_patches=1024,
            padding="longest",
            max_length=76,
        )

        self.assertEqual(inputs["flattened_patches"].shape[1], 1024)

        self.assertEqual(len(inputs["input_ids"][0]), 76)

    @require_torch
    @require_vision
    def test_structured_kwargs_nested(self):
        # Rewrite as KOSMOS-2.5 processor doesn't use `rescale_factor`
        self._assert_nested_kwargs_applied()

    @require_torch
    @require_vision
    def test_structured_kwargs_nested_from_dict(self):
        # Rewrite as KOSMOS-2.5 processor doesn't use `rescale_factor`
        self._assert_nested_kwargs_applied()

    @require_torch
    def test_full_processor(self):
        url = "https://huggingface.co/kirp/kosmos2_5/resolve/main/receipt_00008.png"
        processor = AutoProcessor.from_pretrained("microsoft/kosmos-2.5")
        texts = ["<md>", "<ocr>"]
        expected_input_ids = [
            [100288],
            [100282],
        ]
        expected_attention_mask = [[1], [1]]

        # Bounded wait and explicit status check so a network problem surfaces as such.
        response = requests.get(url, stream=True, timeout=60)
        response.raise_for_status()
        image = Image.open(response.raw)
        # Save and re-open the image as PNG to match the official (microsoft) demo from which
        # the expected values here are grabbed
        image_path = os.path.join(self.tmpdirname, "image.png")
        image.save(image_path)
        image = Image.open(image_path)

        EXPECTED_FP_1 = [
            1.0,
            2.0,
            -2.9527735710144043,
            -2.672085762023926,
            -2.9933173656463623,
            -2.905944585800171,
            -2.5891761779785156,
            -2.8751866817474365,
            -2.962153434753418,
            -2.588062047958374,
        ]
        EXPECTED_FP_200 = [
            4.0,
            45.0,
            1.5713728666305542,
            1.584628939628601,
            1.3589054346084595,
            1.6515952348709106,
            1.7014952898025513,
            1.3731343746185303,
            1.6010395288467407,
            1.6607422828674316,
        ]

        def check_sample(outputs, index, batch_size):
            """Check sample `index` of `outputs` against the values recorded for `texts[index]`."""
            self.assertListEqual(
                outputs.input_ids[index].numpy().tolist(),
                [0, 100283] + [0] * 2048 + [100284] + expected_input_ids[index],
            )
            self.assertListEqual(
                outputs.image_embeds_position_mask[index].numpy().tolist(),
                [0, -1] + [1] * 2048 + [-1] + [0] * (len(expected_input_ids[index])),
            )
            self.assertListEqual(
                outputs.attention_mask[index].numpy().tolist(),
                [1, 1] + [1] * 2048 + [1] + expected_attention_mask[index],
            )
            self.assertTupleEqual(outputs.flattened_patches.shape, (batch_size, 4096, 770))
            np.testing.assert_allclose(
                outputs.flattened_patches[index][1][:10].numpy().tolist(),
                EXPECTED_FP_1,
                atol=1e-9,
            )
            np.testing.assert_allclose(
                outputs.flattened_patches[index][200][:10].numpy().tolist(),
                EXPECTED_FP_200,
                atol=1e-9,
            )

        # test single image
        outputs = processor(images=image, text=texts[0])
        check_sample(outputs, index=0, batch_size=1)

        # test a batch of images and texts, right padding
        outputs = processor(images=[image, image], text=texts)
        check_sample(outputs, index=1, batch_size=2)
|
@ -85,6 +85,9 @@ PRIVATE_MODELS = [
|
||||
"Idefics2PerceiverResampler",
|
||||
"Idefics2VisionTransformer",
|
||||
"Idefics3VisionTransformer",
|
||||
"Kosmos2_5TextModel",
|
||||
"Kosmos2_5TextForCausalLM",
|
||||
"Kosmos2_5VisionModel",
|
||||
"AriaTextForCausalLM",
|
||||
"AriaTextModel",
|
||||
]
|
||||
|
Reference in New Issue
Block a user