Compare commits

...

276 Commits

Author SHA1 Message Date
7c99fd066f temp 2024-12-17 18:17:35 +01:00
a65a9b11d3 temp 2024-12-17 17:49:49 +01:00
30f927a54a temp 2024-12-17 17:38:31 +01:00
a3638eab9d temp 2024-12-17 17:25:54 +01:00
876cb6b217 temp 2024-12-17 16:36:00 +01:00
ce222a6990 temp 2024-12-17 16:33:05 +01:00
9b29aacce3 temp 2024-12-17 16:18:37 +01:00
bf14c4b95a temp 2024-12-17 16:14:37 +01:00
db865dbeda temp 2024-12-17 16:12:47 +01:00
7f0d26c55e temp 2024-12-17 16:06:31 +01:00
0ec499a841 temp 2024-12-17 15:55:43 +01:00
5f731a9aa9 temp 2024-12-17 15:45:43 +01:00
6b2f7d79e2 temp 2024-12-17 15:28:45 +01:00
2d4cbba164 temp 2024-12-17 15:26:01 +01:00
83d600e5f8 temp 2024-12-17 15:22:53 +01:00
55944fcf87 temp 2024-12-17 15:19:54 +01:00
c356a36327 temp 2024-12-17 15:14:47 +01:00
b2c3db2990 temp 2024-12-17 14:51:26 +01:00
39dc6ef257 temp 2024-12-17 14:47:48 +01:00
9ddc86b560 temp 2024-12-17 13:30:32 +01:00
d5b834925c temp 2024-12-17 13:23:06 +01:00
3a58742f92 temp 2024-12-17 12:52:54 +01:00
d1c52f4ffc temp 2024-12-17 12:38:01 +01:00
b688c4f564 Merge branch 'ca03842c' into kosmos25 2024-12-16 17:02:08 +01:00
c639eeb73e it's Monday let's go 2024-12-16 17:00:01 +01:00
8a058d9d56 it's Monday let's go 2024-12-16 16:35:17 +01:00
395a6365b0 it's Monday let's go 2024-12-16 16:23:18 +01:00
2c47915424 it's Friday night, let cross finger 2024-12-13 19:39:05 +01:00
9c8aff7cf9 it's Friday night, let cross finger 2024-12-13 19:06:30 +01:00
00e324db90 it's Friday night, let cross finger 2024-12-13 18:35:53 +01:00
90c4fcc29c it's Friday night, let cross finger 2024-12-13 18:29:22 +01:00
ce3a6b0ab7 it's Friday night, let cross finger 2024-12-13 18:28:20 +01:00
fbb3e592af it's Friday night, let cross finger 2024-12-13 18:20:21 +01:00
f8c98d6173 it's Friday night, let cross finger 2024-12-13 17:58:05 +01:00
b1db4f22b6 fix 2024-12-13 16:56:06 +01:00
85da449436 fix 2024-12-13 16:37:28 +01:00
e3802f4baa fix 2024-12-13 15:56:08 +01:00
91fa38341a fix 2024-12-13 15:35:00 +01:00
dcced48507 fix 2024-12-13 15:09:42 +01:00
9a841add0a fix 2024-12-06 17:13:32 +01:00
6ed504dbce fix 2024-12-06 16:19:28 +01:00
925e14a0f5 Merge branch 'main' into main 2024-12-06 15:30:02 +01:00
0b9e5adaa0 Merge branch 'main' into kosmos25 2024-12-06 10:42:13 +01:00
52788cc66e [ydshieh] add GenerationTesterMixin 2024-10-31 14:36:00 +01:00
ac94b571a3 [ydshieh] add ProcessorTesterMixin 2024-10-31 14:34:31 +01:00
0153a08af2 [ydshieh] remove 2024-10-31 14:25:37 +01:00
1c58c8f666 [ydshieh] fix 2024-10-29 15:03:18 +01:00
830671beb4 [ydshieh] new init 2024-10-29 15:00:52 +01:00
9a8479d57c [ydshieh] Add to MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES 2024-10-29 13:23:57 +01:00
f66c6ee998 [ydshieh] remove 2024-10-29 12:14:54 +01:00
fcc095fbbf [ydshieh] fix copie 2024-10-29 11:17:23 +01:00
f2dae0ddbc Merge branch 'main' into kosmos25 2024-10-29 10:56:47 +01:00
08e1cb0d2d [ydshieh] move 2024-10-29 10:52:45 +01:00
6f2bd73f77 [ydshieh] skip 2024-10-25 17:47:45 +02:00
4b7bc957a5 [ydshieh] remove copied from 2024-10-25 17:19:46 +02:00
142604dadf [ydshieh] tokenizer class 2024-10-25 17:02:40 +02:00
968b033343 [ydshieh] remove duplication in init file 2024-10-25 16:59:17 +02:00
94cc6d22f6 [ydshieh] update loop 2024-10-22 11:32:21 +02:00
a6154db0e1 [run-slow] kosmos2_5 2024-10-10 20:41:48 -04:00
eab8e69fcf [run-slow] kosmos2_5 2024-10-10 14:58:16 -04:00
f8497ce0cc [run-slow] kosmos2_5 2024-10-10 14:37:20 -04:00
63603d65c6 [kirp] remove cross_attn in textblock 2024-10-10 14:31:21 -04:00
40ff0155ee [run-slow] kosmos2_5 2024-10-02 01:04:48 +00:00
f5d4439bde [run-slow] kosmos2_5 2024-10-02 00:48:01 +00:00
d0bf57e8fc Merge remote-tracking branch 'upstream/main' into main 2024-10-02 00:39:43 +00:00
65490b4e5d [run-slow] fix checkpoint bug 2024-09-30 21:55:00 +00:00
9e620b6cd8 [run-slow] fix checkpoint bug 2024-09-30 21:44:36 +00:00
806ca1bbb1 [run-slow] kosmos2_5 2024-09-30 18:17:45 +00:00
c7050497ad [kirp] make style 2024-09-30 18:01:22 +00:00
d99934d666 [kirp] regroup the init 2024-09-30 17:59:50 +00:00
55cb12dcd7 [kirp] check copies 2024-09-30 07:15:37 +00:00
df9d3ad270 [kirp] use update_causal_mask 2024-09-30 07:04:50 +00:00
0ed8541df9 [kirp] fix format 2024-09-30 06:41:10 +00:00
5e5a9e9fa8 [kirp] fix format 2024-09-30 06:39:10 +00:00
54b1984c4d [kirp] cache sdpa and format 2024-09-30 06:31:57 +00:00
87ab93532d [kirp] move attention_mask maker to vision encoder 2024-09-30 06:16:58 +00:00
ab687f5f9b [kirp] sdpa cache 2024-09-30 06:11:10 +00:00
15feaeaa7e [kirp] cache for eager 2024-09-30 05:09:39 +00:00
dd12798243 [kirp] remove test file 2024-09-03 15:53:21 +00:00
b5ebf09127 [kirp] remove head mask 2024-09-03 13:34:53 +00:00
e5ffaee338 [kirp] fix typo in processor 2024-09-03 13:24:37 +00:00
cc7d28f9e4 Revert "[kirp] remove creating mask in the layer"
This reverts commit e1ab413b658b03719696ff3fe7e0acf5ed23baf8.
2024-09-02 04:47:03 +00:00
fe418d05e6 [kirp] remove cache 2024-09-02 03:44:02 +00:00
e1ab413b65 [kirp] remove creating mask in the layer 2024-09-02 03:39:25 +00:00
eb116abdd5 [kirp] use string 2024-09-02 02:57:35 +00:00
9a01f8f6fd [kirp] reformat 2024-09-02 02:41:09 +00:00
ef6754ce8e [kirp] remove cross attention 2024-09-02 02:31:18 +00:00
0ae49e0859 [kirp] iterate over the images only once 2024-09-02 02:29:00 +00:00
f4c73b355a [kirp] remove zero bias 2024-09-02 02:19:23 +00:00
06cbb5ded7 [kirp] update the example part in readme 2024-08-27 14:15:06 +00:00
1bd02b2278 [ydshieh] add to toctree 2024-08-21 16:34:27 +02:00
9d7363fdd4 [ydshieh] update value 2024-08-21 16:12:28 +02:00
8ee2aa9726 [ydshieh] fix 2024-08-14 15:51:41 +02:00
720a8ab36c [ydshieh] fix 2024-08-14 15:49:11 +02:00
66598978c0 [ydshieh] fix 2024-08-14 15:46:51 +02:00
66af73d646 remove old url 2024-08-14 04:09:32 +00:00
139e834e7c [ydshieh] revert 2024-08-13 16:07:21 +02:00
bb4c2470fc [ydshieh] stye 2024-08-13 15:56:32 +02:00
e583cd4407 [ydshieh] tests 2024-08-13 15:55:01 +02:00
1e175baf94 [ydshieh] tests 2024-08-13 15:48:28 +02:00
9822d00b51 [ydshieh] tests 2024-08-13 15:46:12 +02:00
e41b8759ae [ydshieh] tests 2024-08-13 15:22:51 +02:00
703ccfd101 [ydshieh] tiny tweak 2024-08-13 14:53:36 +02:00
e6fe2ae21a [ydshieh] Kosmos2TextForCausalLM 2024-08-13 12:20:16 +02:00
e62993ced5 [ydshieh] copied statement for Kosmos2_5TextModel 2024-08-13 11:50:45 +02:00
4e709e50df [ydshieh] update _init_weights 2024-08-13 11:41:41 +02:00
210ccb1989 [ydshieh] update _init_weights 2024-08-13 10:32:00 +02:00
e09217e618 [ydshieh] update _init_weights 2024-08-13 10:11:15 +02:00
de6d842c20 [ydshieh] copied statement for vision model 2024-08-12 22:27:11 +02:00
7df300082c [ydshieh] update vision model class inheritance 2024-08-12 22:06:52 +02:00
3681119b37 [yilinjia] fix doc in config 2024-08-07 08:42:21 +00:00
f2b61c2944 [ydshieh] _init_weights 2024-08-05 23:52:22 +02:00
2cdb62aa73 [ydshieh] _init_weights 2024-08-05 22:58:31 +02:00
6f8b2e6af1 [ydshieh] _init_weights 2024-08-05 22:54:08 +02:00
1424e07256 [ydshieh] copied 2024-08-05 22:53:47 +02:00
70d85cd2e6 [ydshieh] copied 2024-08-05 22:19:24 +02:00
29d272b0ff [kirp] make style 2024-08-03 08:16:20 +00:00
ac1968bd6d fix bug 2024-08-03 08:04:11 +00:00
2157f31685 [ydshieh] fix 2024-08-03 00:28:37 +02:00
18fa43b75d [ydshieh] fix 2024-08-03 00:04:49 +02:00
2ea4d4f6a7 [ydshieh] fix 2024-08-02 23:49:36 +02:00
267e1d669f [ydshieh] copied 2024-08-02 23:00:17 +02:00
2b2fe1c468 [ydshieh] copied 2024-08-02 19:17:02 +02:00
60240f2f98 [ydshieh] copied 2024-07-31 00:26:49 +02:00
d0e4fb74e8 [ydshieh] copied 2024-07-30 23:54:54 +02:00
7dfd1454fe [ydshieh] copied 2024-07-30 22:40:54 +02:00
7e5a91cb57 [ydshieh] copied 2024-07-30 22:24:29 +02:00
40dc555148 [ydshieh] copied 2024-07-30 21:51:58 +02:00
889d9da79a [ydshieh] copied 2024-07-30 20:47:43 +02:00
162f569f90 [ydshieh] copied 2024-07-30 20:43:52 +02:00
865fc2fd12 [ydshieh] docstring 2024-07-30 18:58:00 +02:00
cc17791fe2 [ydshieh] Kosmos2_5ForConditionalGeneration 2024-07-30 18:25:32 +02:00
9e0c277a6f [ydshieh] style 2024-07-30 18:22:05 +02:00
6cae0b6641 [ydshieh] add copied 2024-07-30 18:12:09 +02:00
ab546cc85c [ydshieh] remove 2024-07-30 18:06:05 +02:00
cfaa28fc9f test_model_input_names need torch 2024-07-30 15:25:00 +00:00
09d8b29c99 make style 2024-07-30 15:07:43 +00:00
bd765554dd fix format 2024-07-30 14:40:00 +00:00
482e5e12dc [ydshieh] better skip 2024-07-30 14:57:07 +02:00
6b82ce0d18 [ydshieh] better skip 2024-07-30 14:55:08 +02:00
26fb9694e9 [ydshieh] test_sdpa 2024-07-30 13:49:13 +02:00
64f915e326 fix ans 2024-07-30 10:19:19 +00:00
c027a98a3c fix error 2024-07-30 10:05:57 +00:00
ed50bbdd4e refractor FA2 2024-07-30 09:59:47 +00:00
87ccbc73ad Merge remote-tracking branch 'upstream/main' into main 2024-07-30 09:59:08 +00:00
9fca9ca6fa [ydshieh] num_image_tokens 2024-07-30 11:24:53 +02:00
32df418ae1 [ydshieh] better skip 2024-07-30 11:09:28 +02:00
0d166ded63 [ydshieh] 2024 2024-07-30 10:49:03 +02:00
9dcacfc453 fix style 2024-07-30 08:07:53 +00:00
0ddfe76426 add more ks25 processor test 2024-07-30 07:55:28 +00:00
da45edd39e ks25 image processor test added 2024-07-30 03:26:34 +00:00
2a782f0e1c Merge branch 'main' of https://github.com/tic-top/transformers into main 2024-07-29 09:19:27 +00:00
99f0d99efc add meaningful comment 2024-07-29 09:19:19 +00:00
06c52aed56 copied from comment added 2024-07-29 09:18:58 +00:00
28b58ff00e remove unnecessary comment 2024-07-29 09:18:33 +00:00
fbbf151d20 [ydshieh] images, width, height, rows, cols = ... 2024-07-29 09:54:30 +02:00
25e3260d2b [ydshieh] update repo name in doc 2024-07-29 09:48:39 +02:00
5ba6d849be simplify ks25 image processor 2024-07-28 09:20:41 +00:00
8b27f806cb Merge branch 'main' of https://github.com/tic-top/transformers into main 2024-07-28 09:13:40 +00:00
e9e56d0dd5 simplify ks25 image procrssor 2024-07-28 09:13:33 +00:00
5b3a6f796c fix doc in ks25 cfg 2024-07-28 09:13:11 +00:00
54a632e4b4 Update src/transformers/models/kosmos2_5/convert_kosmos2_5.py
Co-authored-by: Yih-Dar <2521628+ydshieh@users.noreply.github.com>
2024-07-28 17:08:11 +08:00
c54f9a8c6b Update src/transformers/models/kosmos2_5/convert_kosmos2_5.py
Co-authored-by: Yih-Dar <2521628+ydshieh@users.noreply.github.com>
2024-07-28 17:07:23 +08:00
3cebe13ae0 update copyright 2024-07-28 08:13:11 +00:00
188adbf95d add comment to ks25 image processor 2024-07-28 07:01:48 +00:00
1776f31939 fix foc in ks25 processor 2024-07-28 07:01:36 +00:00
2db6b886b4 Merge branch 'main' of https://github.com/tic-top/transformers into main 2024-07-28 07:01:12 +00:00
4308a40e09 fix document in ks25 config 2024-07-28 07:01:04 +00:00
452b23d955 add batch test 2024-07-28 07:00:40 +00:00
c23a8dd704 Update src/transformers/models/kosmos2_5/configuration_kosmos2_5.py
Co-authored-by: Yih-Dar <2521628+ydshieh@users.noreply.github.com>
2024-07-27 14:17:06 +08:00
ca57f47dc9 Update src/transformers/models/kosmos2_5/configuration_kosmos2_5.py
Co-authored-by: Yih-Dar <2521628+ydshieh@users.noreply.github.com>
2024-07-26 23:02:15 +08:00
6eb0683582 Update src/transformers/models/kosmos2_5/convert_kosmos2_5.py
Co-authored-by: Yih-Dar <2521628+ydshieh@users.noreply.github.com>
2024-07-26 22:01:02 +08:00
24961cdd2c Update src/transformers/models/kosmos2_5/configuration_kosmos2_5.py
Co-authored-by: Yih-Dar <2521628+ydshieh@users.noreply.github.com>
2024-07-26 21:58:34 +08:00
253714074d without grad when generating 2024-07-26 13:30:03 +00:00
d577c901a7 remove add_special_tokens 2024-07-26 12:41:26 +00:00
b7be077f04 [ydshieh] fix 2024-07-25 20:17:48 +02:00
93b291f11b [ydshieh] update FA2 md expected outputs 2024-07-25 20:13:01 +02:00
d2c57cc9a7 [ydshieh] [ydshieh] update eager ocr expected outputs 2024-07-25 20:02:39 +02:00
b574b092b5 [ydshieh] fix FA2 deco 2024-07-25 19:56:12 +02:00
4eca23ced6 [ydshieh] cuda_compute_capability_major_version 2024-07-25 19:50:19 +02:00
9c1539ab95 [ydshieh] no need eval() 2024-07-25 19:44:09 +02:00
8066ee7a49 [ydshieh] require_flash_attn 2024-07-25 19:38:21 +02:00
ec82032bd9 [ydshieh] update FA2 ocr expected outputs 2024-07-25 19:36:30 +02:00
2fe1f9490b [ydshieh] update eager/sdpa ocr expected outputs 2024-07-25 19:27:51 +02:00
7e810e2f07 upload doc images 2024-07-25 17:31:58 +02:00
5d1d0953da fix format 2024-07-23 00:45:45 +00:00
2fba9ab5ad sdpa, eager, fa2 modeling test 2024-07-22 15:29:34 +00:00
625fc05381 test for ks25 processor 2024-07-22 14:36:35 +00:00
9b24a63836 Merge branch 'main' of https://github.com/tic-top/transformers into main 2024-07-21 02:44:43 +00:00
a5c48d5275 fix copyright 2024-07-21 02:44:33 +00:00
607f65e7b1 Update create_circleci_config.py 2024-07-21 10:31:15 +08:00
c7c52a7c43 Fix copyright and add arvix link 2024-07-21 02:25:50 +00:00
c5c4864090 Revert "format"
This reverts commit 9eece30c83d1c867ef74484964df9a66b01e05bf.
2024-07-21 02:24:38 +00:00
8998e48003 Revert "Revert "fix format""
This reverts commit 5c5dd5457852b43dc590f3866782db281e976519.
2024-07-21 02:23:13 +00:00
d14ac7dd06 Merge branch 'main' of https://github.com/tic-top/transformers into main 2024-07-21 02:19:03 +00:00
5c5dd54578 Revert "fix format"
This reverts commit 303e918af4c2ba68cd90ad71e63fde67aba84159.
2024-07-21 02:18:46 +00:00
f518e5077a Update create_circleci_config.py 2024-07-21 10:05:59 +08:00
ca820d0385 Update create_circleci_config.py
revert
2024-07-21 10:04:31 +08:00
630a40df94 fix copyright 2024-07-17 07:12:26 +00:00
63877c3223 [run-slow] kosmos2_5 2024-07-10 21:05:27 +02:00
7710f9a06e [run-slow] kosmos2_5 on A10 2024-07-10 19:58:53 +02:00
6fa6221758 [run-slow] kosmos2_5 2024-07-10 17:34:07 +02:00
eb2b93c727 Merge remote-tracking branch 'upstream/main' into main 2024-07-04 06:24:03 +00:00
e81b7fed5e update readme 2024-07-03 15:57:39 +00:00
d5ad9579d2 initialization test passed 2024-07-03 15:30:54 +00:00
303e918af4 fix format 2024-07-03 15:23:04 +00:00
40b4e984b1 init test 2024-07-03 15:11:58 +00:00
2e398f74a9 hi 2024-07-03 03:24:47 +00:00
937945818e remove tmp img 2024-07-03 02:55:00 +00:00
35ef6559df fixup 2024-07-03 02:53:13 +00:00
cd8ac6ed29 std 2024-07-03 02:47:26 +00:00
73dddc516c add mean 2024-07-02 17:34:10 +00:00
f19b06cbf8 duplicate import 2024-07-02 17:33:44 +00:00
f05e361fb3 test finish 2024-07-02 17:04:08 +00:00
b7d5ec9be1 skip sdpa test 2024-07-02 16:44:31 +00:00
c3063253d4 run slow-prepare some test 2024-07-02 16:31:39 +00:00
578acce08f processor test 2024-07-02 14:08:51 +00:00
916781aa85 load from the config 2024-07-02 05:31:00 +00:00
b64e30045f restore ks2_test; update ks25 test 2024-07-02 05:29:47 +00:00
9046ec5a84 model test 2024-07-02 05:18:37 +00:00
42dd2ea83b better initilization 2024-07-02 05:01:14 +00:00
29d7cff9c4 model test 2024-07-01 17:39:14 +00:00
363180bc27 modeling_test in progress 2024-07-01 09:31:36 +00:00
9c74c61406 init weight 2024-07-01 08:01:50 +00:00
ba8b3dd446 init test 2024-06-30 16:00:30 +00:00
241b0bf9cb fixup 2024-06-30 15:22:06 +00:00
fe51247d0d Merge remote-tracking branch 'upstream/main' into main 2024-06-30 15:17:17 +00:00
589e9efed7 reformat 2024-06-30 10:24:03 +00:00
b72fe0a537 reformat 2024-06-30 10:20:22 +00:00
3a0cfaae48 reformat 2024-06-30 10:19:35 +00:00
9eece30c83 format 2024-06-30 10:15:54 +00:00
2de836d557 format 2024-06-30 10:11:11 +00:00
7d8783b0f6 . 2024-06-30 10:06:44 +00:00
05c99438ae import sort 2024-06-30 09:47:56 +00:00
234149a511 . 2024-06-30 09:34:23 +00:00
532b1e06fe . 2024-06-30 09:22:25 +00:00
6d797c6664 init weight 2024-06-30 09:20:28 +00:00
2e6cad8e24 add procesor 2024-06-30 08:40:08 +00:00
8bde09f354 format 2024-06-29 16:42:52 +00:00
3cbca06bbb . 2024-06-29 16:39:27 +00:00
d9b23c4023 format 2024-06-29 16:25:41 +00:00
71d3275e3d format 2024-06-29 16:24:20 +00:00
477fd34502 add torch required 2024-06-29 16:21:13 +00:00
73463dfafe fix name 2024-06-29 16:10:19 +00:00
4fe45f8eeb sample code 2024-06-29 12:49:59 +00:00
48924d6330 fix some comment 2024-06-29 10:26:28 +00:00
fcf17a6ee2 default configuration become eager 2024-06-29 10:26:11 +00:00
775bec3280 copyright 2024-06-29 10:25:50 +00:00
3ed0d66b3e new processor 2024-06-29 10:21:05 +00:00
b85d5d79f0 textspda 2024-06-29 03:47:22 +00:00
fdc28b7041 support sdpa 2024-06-28 14:38:42 +00:00
5f51a7d5a5 add some document 2024-06-27 02:21:25 +00:00
be5b0f924a remove ckpt, default to flash_attn 2024-06-26 03:40:02 +00:00
0ec4d44a76 segment_emb is needed 2024-06-26 03:38:04 +00:00
80c29c51d0 new config 2024-06-25 08:53:42 +00:00
93a7dc3a27 v1 2024-06-24 05:51:17 +00:00
c8aaa35edf . 2024-06-22 11:18:13 +00:00
4ee3d7eedb new configuration 2024-06-21 03:42:37 +00:00
ef94db2166 sdpa/flash_attn2/eager supported 2024-06-20 16:13:20 +00:00
b1d373bf8d flash2 & spa 2024-06-20 09:25:48 +00:00
0d7a2733cb remove cache and inference code 2024-06-20 08:10:55 +00:00
fdc614267a sdpa, flash attn2 supported 2024-06-20 08:08:57 +00:00
ce839cc270 remove hardcode dtype 2024-06-19 15:41:36 +00:00
02f21a72f9 remove the hardcode dtype 2024-06-19 15:39:48 +00:00
cd55891f1a eager attention supported, flash_attn2 is not completed 2024-06-19 14:29:48 +00:00
c433374429 . 2024-06-16 17:34:32 +00:00
cab16ce8a6 v1 2024-06-16 17:04:19 +00:00
4ceb5c8603 init 2024-06-16 16:52:04 +00:00
cedd7d3ca0 image processor 2024-06-16 15:49:30 +00:00
352e67821a . 2024-06-16 15:14:05 +00:00
f1be589aa3 . 2024-06-16 15:04:06 +00:00
d9cf29031d . 2024-06-16 15:03:07 +00:00
3aa802c7eb . 2024-06-16 14:56:44 +00:00
21b0ecc740 . 2024-06-16 14:53:31 +00:00
a6636c1aff . 2024-06-16 14:52:11 +00:00
661ea9c742 kosmos2_5 basic 2024-06-16 14:22:18 +00:00
26 changed files with 4835 additions and 2 deletions

View File

@ -860,6 +860,8 @@
title: InstructBlipVideo
- local: model_doc/kosmos-2
title: KOSMOS-2
- local: model_doc/kosmos-2.5
title: KOSMOS-2.5
- local: model_doc/layoutlm
title: LayoutLM
- local: model_doc/layoutlmv2

View File

@ -184,6 +184,7 @@ Flax), PyTorch, and/or TensorFlow.
| [JetMoe](model_doc/jetmoe) | ✅ | ❌ | ❌ |
| [Jukebox](model_doc/jukebox) | ✅ | ❌ | ❌ |
| [KOSMOS-2](model_doc/kosmos-2) | ✅ | ❌ | ❌ |
| [KOSMOS-2.5](model_doc/kosmos-2.5) | ✅ | ❌ | ❌ |
| [LayoutLM](model_doc/layoutlm) | ✅ | ✅ | ❌ |
| [LayoutLMv2](model_doc/layoutlmv2) | ✅ | ❌ | ❌ |
| [LayoutLMv3](model_doc/layoutlmv3) | ✅ | ✅ | ❌ |

View File

@ -0,0 +1,63 @@
<!--Copyright 2024 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.
-->
# KOSMOS-2.5
## Overview
Kosmos-2.5 is a multimodal literate model for machine reading of text-intensive images. Pre-trained on large-scale text-intensive images, Kosmos-2.5 excels in two distinct yet cooperative transcription tasks: (1) generating spatially-aware text blocks, where each block of text is assigned its spatial coordinates within the image, and (2) producing structured text output that captures styles and structures into the markdown format. This unified multimodal literate capability is achieved through a shared decoder-only auto-regressive Transformer architecture, task-specific prompts, and flexible text representations. We evaluate Kosmos-2.5 on end-to-end document-level text recognition and image-to-markdown text generation. Furthermore, the model can be readily adapted for any text-intensive image understanding task with different prompts through supervised fine-tuning, making it a general-purpose tool for real-world applications involving text-rich images. This work also paves the way for the future scaling of multimodal large language models.
The abstract from the paper is the following:
*We present Kosmos-2.5, a multimodal literate model for machine reading of text-intensive images. Pre-trained on large-scale text-intensive images, Kosmos-2.5 excels in two distinct yet cooperative transcription tasks: (1) generating spatially-aware text blocks, where each block of text is assigned its spatial coordinates within the image, and (2) producing structured text output that captures styles and structures into the markdown format. This unified multimodal literate capability is achieved through a shared Transformer architecture, task-specific prompts, and flexible text representations. We evaluate Kosmos-2.5 on end-to-end document-level text recognition and image-to-markdown text generation. Furthermore, the model can be readily adapted for any text-intensive image understanding task with different prompts through supervised fine-tuning, making it a general-purpose tool for real-world applications involving text-rich images. This work also paves the way for the future scaling of multimodal large language models.*
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/model_doc/kosmos2_5_ocr.png"
alt="drawing" width="600"/>
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/model_doc/kosmos2_5_md.png"
alt="drawing" width="600"/>
<small> Overview of tasks that KOSMOS-2.5 can handle. Taken from the <a href="https://arxiv.org/abs/2309.11419">original paper</a>. </small>
## Example
**Markdown Task:** For usage instructions, please refer to [md.py](https://huggingface.co/microsoft/kosmos-2.5/blob/main/md.py).
**OCR Task:** For usage instructions, please refer to [ocr.py](https://huggingface.co/microsoft/kosmos-2.5/blob/main/ocr.py).
## Kosmos2_5Config
[[autodoc]] Kosmos2_5Config
## Kosmos2_5ImageProcessor
[[autodoc]] Kosmos2_5ImageProcessor
## Kosmos2_5Processor
[[autodoc]] Kosmos2_5Processor
- __call__
## Kosmos2_5Model
[[autodoc]] Kosmos2_5Model
- forward
## Kosmos2_5ForConditionalGeneration
[[autodoc]] Kosmos2_5ForConditionalGeneration
- forward

View File

@ -61,6 +61,7 @@ FlashAttention-2 is currently supported for the following architectures:
* [Falcon](https://huggingface.co/docs/transformers/model_doc/falcon#transformers.FalconModel)
* [JetMoe](https://huggingface.co/docs/transformers/model_doc/jetmoe#transformers.JetMoeModel)
* [Jamba](https://huggingface.co/docs/transformers/model_doc/jamba#transformers.JambaModel)
* [Kosmos-2.5](https://huggingface.co/docs/transformers/model_doc/kosmos-2.5#transformers.Kosmos2_5Model)
* [Llama](https://huggingface.co/docs/transformers/model_doc/llama#transformers.LlamaModel)
* [Llava](https://huggingface.co/docs/transformers/model_doc/llava)
* [Llava-NeXT](https://huggingface.co/docs/transformers/model_doc/llava_next)
@ -251,6 +252,7 @@ For now, Transformers supports SDPA inference and training for the following arc
* [GraniteMoe](https://huggingface.co/docs/transformers/model_doc/granitemoe#transformers.GraniteMoeModel)
* [JetMoe](https://huggingface.co/docs/transformers/model_doc/jetmoe#transformers.JetMoeModel)
* [Jamba](https://huggingface.co/docs/transformers/model_doc/jamba#transformers.JambaModel)
* [Kosmos-2.5](https://huggingface.co/docs/transformers/model_doc/kosmos-2.5#transformers.Kosmos2_5Model)
* [Llama](https://huggingface.co/docs/transformers/model_doc/llama#transformers.LlamaModel)
* [Llava](https://huggingface.co/docs/transformers/model_doc/llava)
* [Llava-NeXT](https://huggingface.co/docs/transformers/model_doc/llava_next)

View File

@ -512,6 +512,10 @@ _import_structure = {
"Kosmos2Config",
"Kosmos2Processor",
],
"models.kosmos2_5": [
"Kosmos2_5Config",
"Kosmos2_5Processor",
],
"models.layoutlm": [
"LayoutLMConfig",
"LayoutLMTokenizer",
@ -1216,6 +1220,7 @@ else:
_import_structure["models.idefics3"].extend(["Idefics3ImageProcessor"])
_import_structure["models.imagegpt"].extend(["ImageGPTFeatureExtractor", "ImageGPTImageProcessor"])
_import_structure["models.instructblipvideo"].extend(["InstructBlipVideoImageProcessor"])
_import_structure["models.kosmos2_5"].extend(["Kosmos2_5ImageProcessor"])
_import_structure["models.layoutlmv2"].extend(["LayoutLMv2FeatureExtractor", "LayoutLMv2ImageProcessor"])
_import_structure["models.layoutlmv3"].extend(["LayoutLMv3FeatureExtractor", "LayoutLMv3ImageProcessor"])
_import_structure["models.levit"].extend(["LevitFeatureExtractor", "LevitImageProcessor"])
@ -2557,6 +2562,13 @@ else:
"Kosmos2PreTrainedModel",
]
)
_import_structure["models.kosmos2_5"].extend(
[
"Kosmos2_5ForConditionalGeneration",
"Kosmos2_5Model",
"Kosmos2_5PreTrainedModel",
]
)
_import_structure["models.layoutlm"].extend(
[
"LayoutLMForMaskedLM",
@ -5438,6 +5450,10 @@ if TYPE_CHECKING:
Kosmos2Config,
Kosmos2Processor,
)
from .models.kosmos2_5 import (
Kosmos2_5Config,
Kosmos2_5Processor,
)
from .models.layoutlm import (
LayoutLMConfig,
LayoutLMTokenizer,
@ -6177,6 +6193,7 @@ if TYPE_CHECKING:
from .models.idefics3 import Idefics3ImageProcessor
from .models.imagegpt import ImageGPTFeatureExtractor, ImageGPTImageProcessor
from .models.instructblipvideo import InstructBlipVideoImageProcessor
from .models.kosmos2_5 import Kosmos2_5ImageProcessor
from .models.layoutlmv2 import (
LayoutLMv2FeatureExtractor,
LayoutLMv2ImageProcessor,
@ -7301,6 +7318,11 @@ if TYPE_CHECKING:
Kosmos2Model,
Kosmos2PreTrainedModel,
)
from .models.kosmos2_5 import (
Kosmos2_5ForConditionalGeneration,
Kosmos2_5Model,
Kosmos2_5PreTrainedModel,
)
from .models.layoutlm import (
LayoutLMForMaskedLM,
LayoutLMForQuestionAnswering,

View File

@ -127,6 +127,7 @@ from . import (
jamba,
jetmoe,
kosmos2,
kosmos2_5,
layoutlm,
layoutlmv2,
layoutlmv3,

View File

@ -148,6 +148,7 @@ CONFIG_MAPPING_NAMES = OrderedDict(
("jetmoe", "JetMoeConfig"),
("jukebox", "JukeboxConfig"),
("kosmos-2", "Kosmos2Config"),
("kosmos-2.5", "Kosmos2_5Config"),
("layoutlm", "LayoutLMConfig"),
("layoutlmv2", "LayoutLMv2Config"),
("layoutlmv3", "LayoutLMv3Config"),
@ -459,6 +460,7 @@ MODEL_NAMES_MAPPING = OrderedDict(
("jetmoe", "JetMoe"),
("jukebox", "Jukebox"),
("kosmos-2", "KOSMOS-2"),
("kosmos-2.5", "KOSMOS-2.5"),
("layoutlm", "LayoutLM"),
("layoutlmv2", "LayoutLMv2"),
("layoutlmv3", "LayoutLMv3"),
@ -692,6 +694,7 @@ SPECIAL_MODEL_TYPE_TO_MODULE_NAME = OrderedDict(
("data2vec-vision", "data2vec"),
("donut-swin", "donut"),
("kosmos-2", "kosmos2"),
("kosmos-2.5", "kosmos2_5"),
("maskformer-swin", "maskformer"),
("xclip", "x_clip"),
("clip_vision_model", "clip"),

View File

@ -98,6 +98,7 @@ else:
("instructblip", ("BlipImageProcessor",)),
("instructblipvideo", ("InstructBlipVideoImageProcessor",)),
("kosmos-2", ("CLIPImageProcessor",)),
("kosmos-2.5", ("Kosmos2_5ImageProcessor",)),
("layoutlmv2", ("LayoutLMv2ImageProcessor",)),
("layoutlmv3", ("LayoutLMv3ImageProcessor",)),
("levit", ("LevitImageProcessor",)),

View File

@ -143,6 +143,7 @@ MODEL_MAPPING_NAMES = OrderedDict(
("jetmoe", "JetMoeModel"),
("jukebox", "JukeboxModel"),
("kosmos-2", "Kosmos2Model"),
("kosmos-2.5", "Kosmos2_5Model"),
("layoutlm", "LayoutLMModel"),
("layoutlmv2", "LayoutLMv2Model"),
("layoutlmv3", "LayoutLMv3Model"),
@ -761,6 +762,7 @@ MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES = OrderedDict(
("instructblip", "InstructBlipForConditionalGeneration"),
("instructblipvideo", "InstructBlipVideoForConditionalGeneration"),
("kosmos-2", "Kosmos2ForConditionalGeneration"),
("kosmos-2.5", "Kosmos2_5ForConditionalGeneration"),
("llava", "LlavaForConditionalGeneration"),
("llava_next", "LlavaNextForConditionalGeneration"),
("llava_next_video", "LlavaNextVideoForConditionalGeneration"),
@ -788,6 +790,7 @@ MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES = OrderedDict(
("idefics3", "Idefics3ForConditionalGeneration"),
("instructblip", "InstructBlipForConditionalGeneration"),
("kosmos-2", "Kosmos2ForConditionalGeneration"),
("kosmos-2.5", "Kosmos2_5ForConditionalGeneration"),
("llava", "LlavaForConditionalGeneration"),
("llava_next", "LlavaNextForConditionalGeneration"),
("llava_onevision", "LlavaOnevisionForConditionalGeneration"),

View File

@ -70,6 +70,7 @@ PROCESSOR_MAPPING_NAMES = OrderedDict(
("instructblip", "InstructBlipProcessor"),
("instructblipvideo", "InstructBlipVideoProcessor"),
("kosmos-2", "Kosmos2Processor"),
("kosmos-2.5", "Kosmos2_5Processor"),
("layoutlmv2", "LayoutLMv2Processor"),
("layoutlmv3", "LayoutLMv3Processor"),
("llava", "LlavaProcessor"),

View File

@ -247,6 +247,7 @@ else:
"XLMRobertaTokenizerFast" if is_tokenizers_available() else None,
),
),
("kosmos-2.5", (None, "PreTrainedTokenizerFast" if is_tokenizers_available() else None)),
("layoutlm", ("LayoutLMTokenizer", "LayoutLMTokenizerFast" if is_tokenizers_available() else None)),
("layoutlmv2", ("LayoutLMv2Tokenizer", "LayoutLMv2TokenizerFast" if is_tokenizers_available() else None)),
("layoutlmv3", ("LayoutLMv3Tokenizer", "LayoutLMv3TokenizerFast" if is_tokenizers_available() else None)),

View File

@ -2073,6 +2073,7 @@ class Kosmos2ForConditionalGeneration(Kosmos2PreTrainedModel, GenerationMixin):
vision_model_output=vision_model_output,
)
@torch.no_grad()
def generate(
self,
pixel_values: Optional[torch.Tensor] = None,

View File

@ -0,0 +1,30 @@
# coding=utf-8
# Copyright 2024 Microsoft Research and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import TYPE_CHECKING
from ...utils import _LazyModule
from ...utils.import_utils import define_import_structure
# Package entry point for the KOSMOS-2.5 model.
# NOTE(review): indentation appears to have been stripped in this view; the
# branch bodies below are presumably indented in the real file -- confirm.
#
# Static type checkers take the first branch and see every name exported by
# the four submodules through the star imports.
if TYPE_CHECKING:
from .configuration_kosmos2_5 import *
from .image_processing_kosmos2_5 import *
from .modeling_kosmos2_5 import *
from .processing_kosmos2_5 import *
else:
# At runtime, this module's entry in `sys.modules` is replaced by a
# `_LazyModule` built from the import structure that
# `define_import_structure` derives from this file's path.
# Presumably this defers importing the submodules until one of their
# names is first accessed -- verify against `_LazyModule`.
import sys
# Path of this `__init__.py`, read from the module globals.
_file = globals()["__file__"]
sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)

View File

@ -0,0 +1,278 @@
# coding=utf-8
# Copyright 2024 Microsoft Research and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""KOSMOS-2.5 model configuration"""
import os
from typing import Union
from ...configuration_utils import PretrainedConfig
from ...utils import logging
logger = logging.get_logger(__name__)
class Kosmos2_5TextConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`Kosmos2_5TextModel`]. It is used to instantiate a
    KOSMOS-2.5 text decoder according to the specified arguments, defining the model architecture. Instantiating a
    configuration with the defaults will yield a similar configuration to that of the text decoder of the KOSMOS-2.5
    [microsoft/kosmos-2.5](https://huggingface.co/microsoft/kosmos-2.5) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        vocab_size (`int`, *optional*, defaults to 108481):
            Vocabulary size of the Kosmos2_5 model. Defines the number of different tokens that can be represented by
            the `inputs_ids` passed when calling [`Kosmos2_5Model`].
        max_position_embeddings (`int`, *optional*, defaults to 4096):
            The maximum sequence length that this model might ever be used with. Typically set this to something large
            just in case (e.g., 512 or 1024 or 2048).
        embed_dim (`int`, *optional*, defaults to 1536):
            Dimensionality of the layers and the pooler layer. Also reachable as `hidden_size`.
        layers (`int`, *optional*, defaults to 24):
            Number of hidden layers in the Transformer decoder. Also reachable as `num_hidden_layers`.
        ffn_dim (`int`, *optional*, defaults to 6144):
            Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer decoder.
        attention_heads (`int`, *optional*, defaults to 16):
            Number of attention heads for each attention layer in the Transformer decoder. Also reachable as
            `num_attention_heads`.
        activation_function (`str` or `function`, *optional*, defaults to `"gelu"`):
            The non-linear activation function (function or string). If string, `"gelu"`, `"relu"`, `"silu"` and
            `"gelu_new"` are supported.
        dropout (`float`, *optional*, defaults to 0.1):
            The dropout probability for all fully connected layers in the embeddings and the decoder.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        activation_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for activations inside the fully connected layer.
        layerdrop (`float`, *optional*, defaults to 0.0):
            The LayerDrop probability for the decoder. See the [LayerDrop paper](https://arxiv.org/abs/1909.11556)
            for more details.
        layer_norm_eps (`float`, *optional*, defaults to 1e-05):
            The epsilon used by the layer normalization layers.
        init_std (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        scale_embedding (`bool`, *optional*, defaults to `True`):
            Whether to scale the embeddings by a factor derived from `sqrt(embed_dim)`.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models).
        pad_token_id (`int`, *optional*, defaults to 1):
            Id of the padding token, forwarded to [`PretrainedConfig`].
        bos_token_id (`int`, *optional*, defaults to 0):
            Id of the beginning-of-sequence token, forwarded to [`PretrainedConfig`].
        eos_token_id (`int`, *optional*, defaults to 2):
            Id of the end-of-sequence token, forwarded to [`PretrainedConfig`].
    """

    model_type = "kosmos_2_5_text_model"
    base_config_key = "text_config"
    keys_to_ignore_at_inference = ["past_key_values"]
    # Lets callers use the generic attribute names while the values are stored under the KOSMOS names.
    attribute_map = {
        "num_attention_heads": "attention_heads",
        "hidden_size": "embed_dim",
        "num_hidden_layers": "layers",
    }

    def __init__(
        self,
        vocab_size=108481,
        max_position_embeddings=4096,
        embed_dim=1536,
        layers=24,
        ffn_dim=6144,
        attention_heads=16,
        activation_function="gelu",
        dropout=0.1,
        attention_dropout=0.0,
        activation_dropout=0.0,
        layerdrop=0.0,
        layer_norm_eps=1e-5,
        init_std=0.02,
        scale_embedding=True,
        use_cache=True,
        pad_token_id=1,
        bos_token_id=0,
        eos_token_id=2,
        **kwargs,
    ):
        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            **kwargs,
        )

        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.embed_dim = embed_dim
        self.layers = layers
        self.ffn_dim = ffn_dim
        self.attention_heads = attention_heads
        self.activation_function = activation_function
        self.dropout = dropout
        self.attention_dropout = attention_dropout
        self.activation_dropout = activation_dropout
        self.layerdrop = layerdrop
        self.layer_norm_eps = layer_norm_eps
        self.init_std = init_std
        self.scale_embedding = scale_embedding
        self.use_cache = use_cache
class Kosmos2_5VisionConfig(PretrainedConfig):
    r"""
    Configuration of a [`Kosmos2_5VisionModel`]. It is used to instantiate a KOSMOS-2.5 vision encoder according to
    the specified arguments, defining the model architecture. Instantiating a configuration with the defaults will
    yield a similar configuration to that of the vision encoder of the KOSMOS-2.5
    [microsoft/kosmos-2.5](https://huggingface.co/microsoft/kosmos-2.5) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        hidden_size (`int`, *optional*, defaults to 1536):
            Dimensionality of the encoder layers and the pooler layer.
        patch_embed_hidden_size (`int`, *optional*, defaults to 768):
            Dimensionality of the input patch_embedding layer in the Transformer encoder.
        d_ff (`int`, *optional*, defaults to 3968):
            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
        d_kv (`int`, *optional*, defaults to 64):
            Dimensionality of the key, query, value projections per attention head.
        num_hidden_layers (`int`, *optional*, defaults to 18):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 24):
            Number of attention heads for each attention layer in the Transformer encoder.
        dense_act_fn (`str` or `function`, *optional*, defaults to `"gelu_new"`):
            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
            `"relu"`, `"selu"` and `"gelu_new"` are supported.
        layer_norm_eps (`float`, *optional*, defaults to 1e-06):
            The epsilon used by the layer normalization layers.
        dropout_rate (`float`, *optional*, defaults to 0.0):
            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        seq_len (`int`, *optional*, defaults to 4096):
            Maximum sequence length (here number of patches) supported by the model.
        initializer_factor (`float`, *optional*, defaults to 1.0):
            Stored as `initializer_factor`; presumably a multiplier used when initializing weights (to be confirmed
            against the modeling code).
        initializer_range (`float`, *optional*, defaults to 0.02):
            Stored as `initializer_range`; presumably the standard deviation used when initializing weights (to be
            confirmed against the modeling code).

    Example:

    ```python
    >>> from transformers import Kosmos2_5VisionConfig, Kosmos2_5VisionModel

    >>> # Initializing a Kosmos2_5VisionConfig with microsoft/kosmos-2.5 style configuration
    >>> configuration = Kosmos2_5VisionConfig()

    >>> # Initializing a Kosmos2_5VisionModel (with random weights) from the microsoft/kosmos-2.5 style configuration
    >>> model = Kosmos2_5VisionModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "kosmos_2_5_vision_model"
    base_config_key = "vision_config"

    def __init__(
        self,
        hidden_size=1536,
        patch_embed_hidden_size=768,
        d_ff=3968,
        d_kv=64,
        num_hidden_layers=18,
        num_attention_heads=24,
        dense_act_fn="gelu_new",
        layer_norm_eps=1e-6,
        dropout_rate=0.0,
        attention_dropout=0.0,
        seq_len=4096,
        initializer_factor=1.0,
        initializer_range=0.02,
        **kwargs,
    ):
        super().__init__(**kwargs)

        # Architecture sizes
        self.hidden_size = hidden_size
        self.patch_embed_hidden_size = patch_embed_hidden_size
        self.d_ff = d_ff
        self.d_kv = d_kv
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.seq_len = seq_len

        # Activation, normalization and regularization
        self.dense_act_fn = dense_act_fn
        self.layer_norm_eps = layer_norm_eps
        self.dropout_rate = dropout_rate
        self.attention_dropout = attention_dropout

        # Initialization
        self.initializer_factor = initializer_factor
        self.initializer_range = initializer_range
class Kosmos2_5Config(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`Kosmos2_5Model`]. It is used to instantiate a
    KOSMOS-2.5 model according to the specified arguments, defining the model architecture. Instantiating a
    configuration with the defaults will yield a similar configuration to that of the KOSMOS-2.5
    [microsoft/kosmos-2.5](https://huggingface.co/microsoft/kosmos-2.5) architecture.

    Args:
        text_config (`dict` or `Kosmos2_5TextConfig`, *optional*):
            Configuration options used to initialize [`Kosmos2_5TextConfig`], or an already built instance. When
            `None`, the text config is created with its default values.
        vision_config (`dict` or `Kosmos2_5VisionConfig`, *optional*):
            Configuration options used to initialize [`Kosmos2_5VisionConfig`], or an already built instance. When
            `None`, the vision config is created with its default values.
        latent_query_num (`int`, *optional*, defaults to 2048):
            The number of latent query tokens that represent the image features used in the text decoder component.
        kwargs (*optional*):
            Dictionary of keyword arguments.
    """

    model_type = "kosmos-2.5"
    sub_configs = {"text_config": Kosmos2_5TextConfig, "vision_config": Kosmos2_5VisionConfig}

    def __init__(
        self,
        text_config=None,
        vision_config=None,
        latent_query_num=2048,
        **kwargs,
    ):
        super().__init__(**kwargs)

        if text_config is None:
            text_config = {}
            logger.info("text_config is None. Initializing the Kosmos2_5TextConfig with default values.")

        if vision_config is None:
            vision_config = {}
            logger.info("vision_config is None. Initializing the Kosmos2_5VisionConfig with default values.")

        # Ready-made sub-config objects are used as they are; anything else is treated as a mapping of
        # keyword arguments (unpacking a config object with `**` would raise a TypeError).
        if isinstance(text_config, Kosmos2_5TextConfig):
            self.text_config = text_config
        else:
            self.text_config = Kosmos2_5TextConfig(**text_config)

        if isinstance(vision_config, Kosmos2_5VisionConfig):
            self.vision_config = vision_config
        else:
            self.vision_config = Kosmos2_5VisionConfig(**vision_config)

        self.latent_query_num = latent_query_num

    @classmethod
    def from_text_vision_configs(
        cls,
        text_config: Kosmos2_5TextConfig,
        vision_config: Kosmos2_5VisionConfig,
        **kwargs,
    ):
        r"""
        Instantiate a [`Kosmos2_5Config`] (or a derived class) from Kosmos2_5 text model configuration and Kosmos2_5
        vision model configuration.

        Args:
            text_config (`Kosmos2_5TextConfig`):
                Text configuration; it is converted with `to_dict()` before being passed to the constructor.
            vision_config (`Kosmos2_5VisionConfig`):
                Vision configuration; it is converted with `to_dict()` before being passed to the constructor.
            kwargs (*optional*):
                Additional keyword arguments forwarded to the constructor.

        Returns:
            [`Kosmos2_5Config`]: An instance of a configuration object
        """
        return cls(
            text_config=text_config.to_dict(),
            vision_config=vision_config.to_dict(),
            **kwargs,
        )
__all__ = ["Kosmos2_5Config"]

View File

@ -0,0 +1,87 @@
import argparse
from fairseq.checkpoint_utils import load_checkpoint_to_cpu
from transformers import Kosmos2_5Config, Kosmos2_5ForConditionalGeneration
# Substring renames from the original checkpoint's parameter names to the names used by the converted model.
# `rename_key` applies every matching entry in insertion order, so the more specific patterns
# (e.g. "gpt_model.decoder.output_projection", "img_model.embeddings") must stay listed before the
# shorter patterns they contain ("gpt_model.decoder", "img_model").
KEYS_TO_MODIFY_MAPPING = {
    "gpt_model.decoder.output_projection": "text_model.lm_head",
    "gpt_model.decoder": "text_model.model",
    "img_connector": "image_to_text_projection",
    "img_model.embeddings": "vision_model.embeddings",
    "img_model.encoder": "vision_model.encoder",
    "img_model.layernorm": "vision_model.layernorm",
    "img_model": "vision_model",
    "ln_pre": "pre_layrnorm",
    "ln_post": "post_layernorm",
    "transformer.resblocks": "encoder.layers",
    "ts_attn": "self_attn",
    "ln_1": "layer_norm1",
    "ln_2": "layer_norm2",
    "c_fc": "fc1",
    "c_proj": "fc2",
}
# Checkpoint keys that are dropped during conversion (matched by exact name).
KEYS_TO_IGNORE = [
    # this buffer in the original code is only used to send weights to the desired device
    "gpt_model.decoder.embed_positions._float_tensor",
    # this weight is never used in the forward pass of the original KOSMOS-2.5
    "gpt_model.decoder.self_attn_sope.scale",
]
def rename_key(key):
    """Return `key` with every pattern of `KEYS_TO_MODIFY_MAPPING` replaced by its new name.

    The patterns are applied one after another, in the mapping's insertion order, each on the result of
    the previous replacement.
    """
    renamed = key
    for old_fragment, new_fragment in KEYS_TO_MODIFY_MAPPING.items():
        # `str.replace` returns the string unchanged when the fragment does not occur.
        renamed = renamed.replace(old_fragment, new_fragment)
    return renamed
def convert_kosmos2_5_checkpoint_to_pytorch(checkpoint_path, pytorch_dump_folder_path):
    """Convert an original KOSMOS-2.5 checkpoint and save it with `save_pretrained`.

    Args:
        checkpoint_path: Path of the checkpoint handed to `load_checkpoint_to_cpu`.
        pytorch_dump_folder_path: Destination folder handed to `model.save_pretrained`.
    """
    original_weights = load_checkpoint_to_cpu(checkpoint_path)["model"]

    config = Kosmos2_5Config()
    # This is necessary to match the results given by the original demo
    config.text_config.no_repeat_ngram_size = 3
    model = Kosmos2_5ForConditionalGeneration(config)

    # Conversion is a pure renaming of the keys; ignored keys are left out.
    converted_state_dict = {
        rename_key(name): weight for name, weight in original_weights.items() if name not in KEYS_TO_IGNORE
    }

    # `strict=True` makes the load fail if the renamed keys do not match the model's own state dict.
    model.load_state_dict(converted_state_dict, strict=True)

    model.save_pretrained(pytorch_dump_folder_path)
# Command-line entry point: parse the two paths and run the conversion.
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    # Both arguments are optional and fall back to the defaults below
    parser.add_argument(
        "--kosmos2_5_checkpoint_path",
        default="ckpt.pt",
        type=str,
        required=False,
        help="Path the official PyTorch dump.",
    )
    parser.add_argument(
        "--pytorch_dump_folder_path",
        default="ckpt",
        type=str,
        required=False,
        help="Path to the output PyTorch model.",
    )
    args = parser.parse_args()
    convert_kosmos2_5_checkpoint_to_pytorch(args.kosmos2_5_checkpoint_path, args.pytorch_dump_folder_path)

View File

@ -0,0 +1,342 @@
# coding=utf-8
# Copyright 2024 Microsoft Research and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Image processor class for Kosmos2_5."""
import math
from typing import Dict, Optional, Union
import numpy as np
from ...image_processing_utils import BaseImageProcessor, BatchFeature
from ...image_transforms import (
convert_to_rgb,
normalize,
to_channel_dimension_format,
)
from ...image_utils import (
ChannelDimension,
ImageInput,
get_image_size,
infer_channel_dimension_format,
make_list_of_images,
to_numpy_array,
valid_images,
)
from ...utils import TensorType, is_torch_available, logging
from ...utils.import_utils import requires_backends
if is_torch_available():
import torch
logger = logging.get_logger(__name__)
DEFAULT_FONT_PATH = "ybelkada/fonts"
# Adapted from transformers.models.pix2struct.image_processing_pix2struct.torch_extract_patches (docstring revised)
def torch_extract_patches(image_tensor, patch_height, patch_width):
    """
    Utility function to extract non-overlapping patches from a given image tensor.

    Args:
        image_tensor (torch.Tensor):
            The image tensor to extract patches from. The code indexes it as `(num_channels, height, width)`.
        patch_height (int):
            The height of the patches to extract.
        patch_width (int):
            The width of the patches to extract.

    Returns:
        `torch.Tensor`: A tensor of shape `(1, height // patch_height, width // patch_width, num_channels *
        patch_height * patch_width)`, i.e. one flattened patch per grid cell.
    """
    requires_backends(torch_extract_patches, ["torch"])
    # Add a leading batch dimension of size 1 before unfolding.
    image_tensor = image_tensor.unsqueeze(0)
    # The stride equals the kernel size, so the extracted patches do not overlap.
    patches = torch.nn.functional.unfold(image_tensor, (patch_height, patch_width), stride=(patch_height, patch_width))
    patches = patches.reshape(image_tensor.size(0), image_tensor.size(1), patch_height, patch_width, -1)
    # Move the patch index forward and the channel dimension last, then lay the patches out on a rows x columns grid.
    patches = patches.permute(0, 4, 2, 3, 1).reshape(
        image_tensor.size(2) // patch_height,
        image_tensor.size(3) // patch_width,
        image_tensor.size(1) * patch_height * patch_width,
    )
    return patches.unsqueeze(0)
# Similar to transformers.models.pix2struct.image_processing_pix2struct.Pix2StructImageProcessor, but without `is_vqa`, and additionally returning the width and height after resizing
class Kosmos2_5ImageProcessor(BaseImageProcessor):
    r"""
    Constructs a Kosmos2_5 image processor.

    Args:
        do_convert_rgb (`bool`, *optional*, defaults to `True`):
            Whether to convert the image to RGB.
        do_normalize (`bool`, *optional*, defaults to `True`):
            Whether to normalize the image. Can be overridden by the `do_normalize` parameter in the `preprocess`
            method. According to Kosmos2_5 paper and code, the image is normalized with its own mean and standard
            deviation.
        patch_size (`Dict[str, int]`, *optional*, defaults to `{"height": 16, "width": 16}`):
            The patch size to use for the image. According to Kosmos2_5 paper and code, the patch size is 16x16.
        max_patches (`int`, *optional*, defaults to 4096):
            The maximum number of patches to extract from the image as per the [Kosmos2_5
            paper](https://arxiv.org/pdf/2309.11419).
    """

    model_input_names = ["flattened_patches"]

    def __init__(
        self,
        do_convert_rgb: bool = True,
        do_normalize: bool = True,
        patch_size: Optional[Dict[str, int]] = None,
        max_patches: int = 4096,
        **kwargs,
    ) -> None:
        super().__init__(**kwargs)
        self.patch_size = patch_size if patch_size is not None else {"height": 16, "width": 16}
        self.do_normalize = do_normalize
        self.do_convert_rgb = do_convert_rgb
        self.max_patches = max_patches

    def extract_flattened_patches(
        self,
        image: np.ndarray,
        max_patches: int,
        patch_size: dict,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
        **kwargs,
    ) -> tuple:
        """
        Resize an image to fit the patch budget and extract its flattened patches.

        Args:
            image (`np.ndarray`):
                Image to extract flattened patches from.
            max_patches (`int`):
                Maximum number of patches to extract.
            patch_size (`dict`):
                Dictionary containing the patch height and width.
            input_data_format (`str` or `ChannelDimension`, *optional*):
                The channel dimension format of the input image, passed to `to_channel_dimension_format`.

        Returns:
            `tuple`: `(result, resized_width, resized_height, rows, columns)` where `result` is an `np.ndarray` of
            shape `(max_patches, 2 + patch_height * patch_width * num_channels)` whose first two columns hold the
            1-based row and column ids of each patch (all-zero rows are padding), `resized_width` and
            `resized_height` are the image dimensions after resizing, and `rows` / `columns` give the size of the
            patch grid.
        """
        requires_backends(self.extract_flattened_patches, "torch")

        # convert to torch
        image = to_channel_dimension_format(image, ChannelDimension.FIRST, input_data_format)
        image = torch.from_numpy(image)

        patch_height, patch_width = patch_size["height"], patch_size["width"]
        image_height, image_width = get_image_size(image, ChannelDimension.FIRST)

        # Largest scale for which the resized image still yields at most `max_patches` patches.
        scale = math.sqrt(max_patches * (patch_height / image_height) * (patch_width / image_width))
        num_feasible_rows = max(min(math.floor(scale * image_height / patch_height), max_patches), 1)
        num_feasible_cols = max(min(math.floor(scale * image_width / patch_width), max_patches), 1)
        resized_height = max(num_feasible_rows * patch_height, 1)
        resized_width = max(num_feasible_cols * patch_width, 1)

        image = torch.nn.functional.interpolate(
            image.unsqueeze(0),
            size=(resized_height, resized_width),
            mode="bilinear",
            align_corners=False,
            antialias=True,
        ).squeeze(0)

        # [1, rows, columns, patch_height * patch_width * image_channels]
        patches = torch_extract_patches(image, patch_height, patch_width)

        patches_shape = patches.shape
        rows = patches_shape[1]
        columns = patches_shape[2]
        depth = patches_shape[3]

        # [rows * columns, patch_height * patch_width * image_channels]
        patches = patches.reshape([rows * columns, depth])

        # [rows * columns, 1]
        row_ids = torch.arange(rows).reshape([rows, 1]).repeat(1, columns).reshape([rows * columns, 1])
        col_ids = torch.arange(columns).reshape([1, columns]).repeat(rows, 1).reshape([rows * columns, 1])

        # Offset by 1 so the ids do not contain zeros, which represent padding.
        row_ids += 1
        col_ids += 1

        # Prepare additional patch features.
        # [rows * columns, 1]
        row_ids = row_ids.to(torch.float32)
        col_ids = col_ids.to(torch.float32)

        # [rows * columns, 2 + patch_height * patch_width * image_channels]
        result = torch.cat([row_ids, col_ids, patches], -1)

        # [max_patches, 2 + patch_height * patch_width * image_channels]
        result = torch.nn.functional.pad(result, [0, 0, 0, max_patches - (rows * columns)]).float()

        result = to_numpy_array(result)

        return result, resized_width, resized_height, rows, columns

    def normalize(
        self,
        image: np.ndarray,
        data_format: Optional[Union[str, ChannelDimension]] = None,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
        **kwargs,
    ) -> np.ndarray:
        """
        Normalize an image. image = (image - image_mean) / image_std.

        The image std is to mimic the tensorflow implementation of the `per_image_standardization`:
        https://www.tensorflow.org/api_docs/python/tf/image/per_image_standardization

        Args:
            image (`np.ndarray`):
                Image to normalize.
            data_format (`str` or `ChannelDimension`, *optional*):
                The channel dimension format for the output image. If unset, the channel dimension format of the input
                image is used.
            input_data_format (`str` or `ChannelDimension`, *optional*):
                The channel dimension format of the input image. If not provided, it will be inferred.

        Returns:
            `np.ndarray`: The image standardized with its own mean and (lower-bounded) standard deviation.
        """
        if image.dtype == np.uint8:
            image = image.astype(np.float32)

        # take mean across the whole `image`
        mean = np.mean(image)
        std = np.std(image)
        # Lower-bound the std by 1 / sqrt(number of elements) so a constant image does not divide by zero.
        adjusted_stddev = max(std, 1.0 / math.sqrt(np.prod(image.shape)))

        return normalize(
            image,
            mean=mean,
            std=adjusted_stddev,
            data_format=data_format,
            input_data_format=input_data_format,
            **kwargs,
        )

    def preprocess(
        self,
        images: ImageInput,
        do_convert_rgb: Optional[bool] = None,
        do_normalize: Optional[bool] = None,
        max_patches: Optional[int] = None,
        patch_size: Optional[Dict[str, int]] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        data_format: ChannelDimension = ChannelDimension.FIRST,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
        **kwargs,
    ) -> BatchFeature:
        """
        Preprocess an image or batch of images. The processor first computes the maximum possible number of
        aspect-ratio preserving patches of size `patch_size` that can be extracted from the image, resizes the image
        accordingly and zero-pads the resulting patch sequence up to `max_patches`. Before extracting the patches the
        images are standardized following the tensorflow implementation of `per_image_standardization`
        (https://www.tensorflow.org/api_docs/python/tf/image/per_image_standardization).

        Args:
            images (`ImageInput`):
                Image to preprocess. Expects a single or batch of images.
            do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
                Whether to convert the image to RGB.
            do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
                Whether to normalize the image.
            max_patches (`int`, *optional*, defaults to `self.max_patches`):
                Maximum number of patches to extract.
            patch_size (`dict`, *optional*, defaults to `self.patch_size`):
                Dictionary containing the patch height and width.
            return_tensors (`str` or `TensorType`, *optional*):
                The type of tensors to return. Can be one of:
                    - Unset: Return a list of `np.ndarray`.
                    - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
                    - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
                    - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
                    - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
            data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
                Accepted for signature compatibility only; it is not used because the outputs are flattened patches
                rather than images.
            input_data_format (`ChannelDimension` or `str`, *optional*):
                The channel dimension format for the input image. If unset, the channel dimension format is inferred
                from the input image. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.

        Returns:
            [`BatchFeature`]: A batch with the keys `flattened_patches`, `attention_mask`, `width`, `height`, `rows`
            and `cols`, each holding one entry per input image.

        Raises:
            ValueError: If `images` is not a valid image type.
        """
        do_normalize = do_normalize if do_normalize is not None else self.do_normalize
        do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb
        patch_size = patch_size if patch_size is not None else self.patch_size
        max_patches = max_patches if max_patches is not None else self.max_patches

        images = make_list_of_images(images)

        if not valid_images(images):
            raise ValueError(
                "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
                "torch.Tensor, tf.Tensor or jax.ndarray."
            )

        # PIL RGBA images are converted to RGB
        if do_convert_rgb:
            images = [convert_to_rgb(image) for image in images]

        # All transformations expect numpy arrays.
        images = [to_numpy_array(image) for image in images]

        if input_data_format is None:
            # We assume that all images have the same channel dimension format.
            input_data_format = infer_channel_dimension_format(images[0])

        flattened_patches, width, height, rows, cols, attention_masks = [], [], [], [], [], []
        for image in images:
            if do_normalize:
                image = self.normalize(image=image, input_data_format=input_data_format)

            # convert to torch tensor and permute
            f, w, h, r, c = self.extract_flattened_patches(
                image=image,
                max_patches=max_patches,
                patch_size=patch_size,
                input_data_format=input_data_format,
            )
            flattened_patches.append(f)
            width.append(w)
            height.append(h)
            rows.append(r)
            cols.append(c)
            # create attention mask in numpy: padding rows are all zeros, so their sum is zero
            attention_masks.append((f.sum(axis=-1) != 0).astype(np.float32))

        encoded_outputs = BatchFeature(
            data={
                "flattened_patches": flattened_patches,
                "attention_mask": attention_masks,
                "width": width,
                "height": height,
                "rows": rows,
                "cols": cols,
            },
            tensor_type=return_tensors,
        )

        return encoded_outputs
__all__ = ["Kosmos2_5ImageProcessor"]

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,164 @@
# coding=utf-8
# Copyright 2024 Microsoft Research and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Processor class for Kosmos2_5.
"""
from typing import List, Optional, Union
from ...image_processing_utils import BatchFeature
from ...image_utils import ImageInput
from ...processing_utils import ImagesKwargs, ProcessingKwargs, ProcessorMixin, TextKwargs, Unpack
from ...tokenization_utils_base import TextInput
from ...utils import is_torch_available
if is_torch_available():
import torch
# Image-side keyword arguments accepted by `Kosmos2_5Processor.__call__`, in addition to those of `ImagesKwargs`.
class Kosmos2_5ImagesKwargs(ImagesKwargs, total=False):
    # Patch budget; forwarded to the image processor with the other image kwargs.
    max_patches: Optional[int]
    # Number of "<s>" placeholder tokens the processor inserts between "<image>" and "</image>" in the prompt.
    num_image_tokens: Optional[int]
# Typed keyword arguments of `Kosmos2_5Processor.__call__` together with their default values.
class Kosmos2_5ProcessorKwargs(ProcessingKwargs, total=False):
    text_kwargs: TextKwargs
    images_kwargs: Kosmos2_5ImagesKwargs
    # Defaults used when the caller does not override them (merged in `Kosmos2_5Processor.__call__`
    # through `_merge_kwargs`).
    _defaults = {
        "text_kwargs": {
            "padding": True,
            "truncation": True,
            "max_length": None,
            "stride": 0,
            "pad_to_multiple_of": None,
            "return_attention_mask": None,
        },
        "images_kwargs": {
            "max_patches": 4096,
            "num_image_tokens": 2048,
        },
        # The processor reads `.shape` on the tokenizer output and builds a torch mask, so tensors are
        # requested as PyTorch by default.
        "common_kwargs": {"return_tensors": "pt"},
    }
class Kosmos2_5Processor(ProcessorMixin):
    r"""
    Constructs a Kosmos2_5 processor which wraps a PreTrainedTokenizerFast and Kosmos2_5 image processor into a single
    processor.

    [`Kosmos2_5Processor`] offers all the functionalities of [`Kosmos2_5ImageProcessor`] and
    [`PreTrainedTokenizerFast`]. See the docstring of [`~Kosmos2_5Processor.__call__`] and
    [`~Kosmos2_5Processor.decode`] for more information.

    Args:
        image_processor (`Kosmos2_5ImageProcessor`):
            An instance of [`Kosmos2_5ImageProcessor`]. The image processor is a required input.
        tokenizer (`PreTrainedTokenizerFast`):
            An instance of [`PreTrainedTokenizerFast`]. The tokenizer is a required input.
    """

    attributes = ["image_processor", "tokenizer"]
    image_processor_class = "Kosmos2_5ImageProcessor"
    tokenizer_class = "PreTrainedTokenizerFast"

    def __init__(self, image_processor, tokenizer):
        tokenizer.return_token_type_ids = False
        super().__init__(image_processor, tokenizer)

    def __call__(
        self,
        images: ImageInput = None,
        text: Union[TextInput, List[TextInput]] = None,
        audio=None,
        videos=None,
        **kwargs: Unpack[Kosmos2_5ProcessorKwargs],
    ) -> BatchFeature:
        """
        This method uses [`Kosmos2_5ImageProcessor.preprocess`] method to prepare image(s) for the model, and
        [`PreTrainedTokenizerFast.__call__`] to prepare text for the model.

        Please refer to the docstring of the above two methods for more information.

        Args:
            images (`ImageInput`):
                Image or batch of images. Required.
            text (`str` or `List[str]`, *optional*):
                Text or batch of texts. Each text is prefixed with the image prompt
                `"<s><image>" + "<s>" * num_image_tokens + "</image>"` before tokenization. When `None`, only the
                image features are returned.
            audio, videos:
                Unused.
            kwargs:
                See [`Kosmos2_5ProcessorKwargs`].

        Returns:
            [`BatchFeature`]: The image processor outputs without `rows` and `cols`. When `text` is given, it also
            holds `input_ids`, `attention_mask` (the tokenizer's, which replaces the image processor's entry of the
            same name) and `image_embeds_position_mask`, which is `1` at the image placeholder positions, `-1` at the
            `<image>` / `</image>` positions and `0` elsewhere.

        Raises:
            ValueError: If `images` is `None`.
        """
        if images is None and text is None:
            raise ValueError("You have to specify either images or text.")

        if images is None:
            raise ValueError("Kosmos2_5Processor requires images to be passed.")

        output_kwargs = self._merge_kwargs(
            Kosmos2_5ProcessorKwargs,
            tokenizer_init_kwargs=self.tokenizer.init_kwargs,
            **kwargs,
        )
        num_image_tokens = output_kwargs["images_kwargs"].setdefault("num_image_tokens", None)

        encoding = BatchFeature()

        image_encoding = self.image_processor(images, **output_kwargs["images_kwargs"])
        # The patch grid size is not part of the processor output.
        image_encoding.pop("rows")
        image_encoding.pop("cols")
        encoding.update(image_encoding)

        if text is not None:
            prompt = "<s><image>" + "<s>" * num_image_tokens + "</image>"
            if isinstance(text, str):
                text = [prompt + text]
            else:
                text = [prompt + t for t in text]
            text_encoding = self.tokenizer(text, **output_kwargs["text_kwargs"])

            batch_size, seq_len = text_encoding.input_ids.shape
            image_embeds_position_mask = [0, -1] + [1] * num_image_tokens + [-1]
            image_embeds_position_mask += [0] * (seq_len - len(image_embeds_position_mask))
            # If the tokenizer truncated the sequence below the prompt length, keep the mask aligned with
            # `input_ids` instead of returning a longer mask.
            image_embeds_position_mask = image_embeds_position_mask[:seq_len]
            image_embeds_position_mask = (
                torch.LongTensor(image_embeds_position_mask).unsqueeze(0).repeat(batch_size, 1)
            )

            encoding.update(
                {
                    "input_ids": text_encoding.input_ids,
                    "attention_mask": text_encoding.attention_mask,
                    "image_embeds_position_mask": image_embeds_position_mask,
                }
            )

        return encoding

    def batch_decode(self, *args, **kwargs):
        """
        This method forwards all its arguments to the tokenizer's [`~PreTrainedTokenizer.batch_decode`].
        Please refer to the docstring of this method for more information.
        """
        return self.tokenizer.batch_decode(*args, **kwargs)

    def decode(self, *args, **kwargs):
        """
        This method forwards all its arguments to the tokenizer's [`~PreTrainedTokenizer.decode`]. Please
        refer to the docstring of this method for more information.
        """
        return self.tokenizer.decode(*args, **kwargs)

    @property
    def model_input_names(self):
        """Tokenizer input names followed by image processor input names, without duplicates."""
        tokenizer_input_names = self.tokenizer.model_input_names
        image_processor_input_names = self.image_processor.model_input_names
        return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
__all__ = ["Kosmos2_5Processor"]

View File

@ -302,7 +302,7 @@ class Pix2StructVisionLayer(nn.Module):
class Pix2StructVisionEncoder(nn.Module):
def __init__(self, config: Pix2StructConfig) -> None:
def __init__(self, config: Pix2StructVisionConfig) -> None:
super().__init__()
self.config = config
self.layer = nn.ModuleList([Pix2StructVisionLayer(config) for _ in range(config.num_hidden_layers)])
@ -531,7 +531,7 @@ class Pix2StructVisionModel(Pix2StructPreTrainedModel):
supports_gradient_checkpointing = True
_no_split_modules = ["Pix2StructVisionLayer"]
def __init__(self, config: Pix2StructConfig):
def __init__(self, config: Pix2StructVisionConfig):
super().__init__(config)
self.config = config

View File

@ -5255,6 +5255,27 @@ class Kosmos2PreTrainedModel(metaclass=DummyObject):
requires_backends(self, ["torch"])
# Dummy object declaring "torch" as its backend; construction is delegated to `requires_backends`.
# NOTE(review): presumably this raises an informative error when torch is not installed - confirm.
class Kosmos2_5ForConditionalGeneration(metaclass=DummyObject):
    _backends = ["torch"]

    def __init__(self, *args, **kwargs):
        requires_backends(self, ["torch"])
# Dummy object declaring "torch" as its backend; construction is delegated to `requires_backends`.
# NOTE(review): presumably this raises an informative error when torch is not installed - confirm.
class Kosmos2_5Model(metaclass=DummyObject):
    _backends = ["torch"]

    def __init__(self, *args, **kwargs):
        requires_backends(self, ["torch"])
# Dummy object declaring "torch" as its backend; construction is delegated to `requires_backends`.
# NOTE(review): presumably this raises an informative error when torch is not installed - confirm.
class Kosmos2_5PreTrainedModel(metaclass=DummyObject):
    _backends = ["torch"]

    def __init__(self, *args, **kwargs):
        requires_backends(self, ["torch"])
class LayoutLMForMaskedLM(metaclass=DummyObject):
_backends = ["torch"]

View File

@ -331,6 +331,13 @@ class InstructBlipVideoImageProcessor(metaclass=DummyObject):
requires_backends(self, ["vision"])
# Dummy object declaring "vision" as its backend; construction is delegated to `requires_backends`.
# NOTE(review): presumably this raises an informative error when the vision backend is not installed - confirm.
class Kosmos2_5ImageProcessor(metaclass=DummyObject):
    _backends = ["vision"]

    def __init__(self, *args, **kwargs):
        requires_backends(self, ["vision"])
class LayoutLMv2FeatureExtractor(metaclass=DummyObject):
_backends = ["vision"]

View File

View File

@ -0,0 +1,308 @@
# coding=utf-8
# Copyright 2024 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import numpy as np
import requests
from transformers.testing_utils import require_torch, require_vision
from transformers.utils import is_torch_available, is_vision_available
from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs
if is_torch_available():
import torch
if is_vision_available():
from PIL import Image
from transformers import Kosmos2_5ImageProcessor
class Kosmos2_5ImageProcessingTester:
    """Settings holder and input factory for the Kosmos-2.5 image processor tests.

    Args:
        parent: The test case that owns this tester.
        batch_size: Number of images produced by `prepare_image_inputs`.
        num_channels: Channel count of the generated images.
        image_size: Stored on the tester; not otherwise used in this class.
        min_resolution: Lower bound forwarded to the shared `prepare_image_inputs` helper.
        max_resolution: Upper bound forwarded to the shared `prepare_image_inputs` helper.
        size: Stored size dict; defaults to ``{"height": 20, "width": 20}``.
        do_normalize: Value placed in the image processor kwargs.
        do_convert_rgb: Value placed in the image processor kwargs.
        patch_size: Patch size dict; defaults to ``{"height": 16, "width": 16}``.
    """

    def __init__(
        self,
        parent,
        batch_size=7,
        num_channels=3,
        image_size=18,
        min_resolution=30,
        max_resolution=400,
        size=None,
        do_normalize=True,
        do_convert_rgb=True,
        patch_size=None,
    ):
        size = size if size is not None else {"height": 20, "width": 20}
        self.parent = parent
        self.batch_size = batch_size
        self.num_channels = num_channels
        self.image_size = image_size
        self.min_resolution = min_resolution
        self.max_resolution = max_resolution
        self.size = size
        self.do_normalize = do_normalize
        self.do_convert_rgb = do_convert_rgb
        self.max_patches = [512, 1024, 2048, 4096]
        self.patch_size = patch_size if patch_size is not None else {"height": 16, "width": 16}

    def prepare_image_processor_dict(self):
        """Return the keyword arguments used to instantiate the image processor."""
        return {"do_normalize": self.do_normalize, "do_convert_rgb": self.do_convert_rgb}

    def prepare_dummy_image(self):
        """Download the reference picture and return it as an RGB PIL image.

        Raises:
            requests.HTTPError: If the server answers with an error status.
            requests.Timeout: If the server does not answer within the timeout.
        """
        img_url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/australia.jpg"
        # A timeout keeps a stalled download from hanging the test run, and the context manager
        # releases the streamed connection. `convert` decodes the full image before the response
        # is closed, so the returned image does not depend on the open stream.
        with requests.get(img_url, stream=True, timeout=30) as response:
            response.raise_for_status()
            return Image.open(response.raw).convert("RGB")

    def prepare_image_inputs(self, equal_resolution=False, numpify=False, torchify=False):
        """Build a batch of images through the shared `prepare_image_inputs` helper."""
        return prepare_image_inputs(
            batch_size=self.batch_size,
            num_channels=self.num_channels,
            min_resolution=self.min_resolution,
            max_resolution=self.max_resolution,
            equal_resolution=equal_resolution,
            numpify=numpify,
            torchify=torchify,
        )
@require_torch
@require_vision
class Kosmos2_5ImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
    """Checks `Kosmos2_5ImageProcessor` outputs for PIL, numpy and torch inputs."""

    image_processing_class = Kosmos2_5ImageProcessor if is_vision_available() else None

    def setUp(self):
        super().setUp()
        self.image_processor_tester = Kosmos2_5ImageProcessingTester(self)

    @property
    def image_processor_dict(self):
        """Keyword arguments used to build the image processor under test."""
        return self.image_processor_tester.prepare_image_processor_dict()

    def _check_flattened_patches_shape(self, image_processor, image_inputs, **call_kwargs):
        """Assert the `flattened_patches` shape for single and batched calls at every `max_patches` value.

        The expected last dimension is read from the tester when this helper is called, so
        callers that change `num_channels` must do so beforehand. Extra keyword arguments are
        forwarded to the image processor call.
        """
        tester = self.image_processor_tester
        patch_area = tester.patch_size["height"] * tester.patch_size["width"]
        # NOTE(review): the `+ 2` presumably accounts for two positional entries stored with
        # each patch - confirm against the image processor implementation.
        expected_hidden_dim = patch_area * tester.num_channels + 2

        for max_patch in tester.max_patches:
            # Test not batched input
            encoded_images = image_processor(
                image_inputs[0], return_tensors="pt", max_patches=max_patch, **call_kwargs
            ).flattened_patches
            self.assertEqual(encoded_images.shape, (1, max_patch, expected_hidden_dim))

            # Test batched
            encoded_images = image_processor(
                image_inputs, return_tensors="pt", max_patches=max_patch, **call_kwargs
            ).flattened_patches
            self.assertEqual(encoded_images.shape, (tester.batch_size, max_patch, expected_hidden_dim))

    def test_image_processor_properties(self):
        image_processor = self.image_processing_class(**self.image_processor_dict)
        self.assertTrue(hasattr(image_processor, "do_normalize"))
        self.assertTrue(hasattr(image_processor, "do_convert_rgb"))

    def test_expected_patches(self):
        dummy_image = self.image_processor_tester.prepare_dummy_image()
        image_processor = self.image_processing_class(**self.image_processor_dict)
        max_patch = 2048
        inputs = image_processor(dummy_image, return_tensors="pt", max_patches=max_patch)
        self.assertTrue(torch.allclose(inputs.flattened_patches.mean(), torch.tensor(0.0606), atol=1e-3, rtol=1e-3))

    def test_call_pil(self):
        # Initialize image_processor
        image_processor = self.image_processing_class(**self.image_processor_dict)
        # create random PIL images
        image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False)
        for image in image_inputs:
            self.assertIsInstance(image, Image.Image)

        self._check_flattened_patches_shape(image_processor, image_inputs)

    def test_call_numpy(self):
        # Initialize image_processor
        image_processor = self.image_processing_class(**self.image_processor_dict)
        # create random numpy tensors
        image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, numpify=True)
        for image in image_inputs:
            self.assertIsInstance(image, np.ndarray)

        self._check_flattened_patches_shape(image_processor, image_inputs)

    def test_call_numpy_4_channels(self):
        # Initialize image_processor
        image_processor = self.image_processing_class(**self.image_processor_dict)
        # create random numpy tensors
        self.image_processor_tester.num_channels = 4
        try:
            image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, numpify=True)
            for image in image_inputs:
                self.assertIsInstance(image, np.ndarray)

            self._check_flattened_patches_shape(image_processor, image_inputs, input_data_format="channels_last")
        finally:
            # Restore the default even when an assertion above fails.
            self.image_processor_tester.num_channels = 3

    def test_call_pytorch(self):
        # Initialize image_processor
        image_processor = self.image_processing_class(**self.image_processor_dict)
        # create random PyTorch tensors
        image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, torchify=True)
        for image in image_inputs:
            self.assertIsInstance(image, torch.Tensor)

        self._check_flattened_patches_shape(image_processor, image_inputs)
@require_torch
@require_vision
class Kosmos2_5ImageProcessingTestFourChannels(ImageProcessingTestMixin, unittest.TestCase):
    """Checks `Kosmos2_5ImageProcessor` on 4-channel PIL inputs; the array-based variants are skipped."""

    image_processing_class = Kosmos2_5ImageProcessor if is_vision_available() else None

    def setUp(self):
        super().setUp()
        self.image_processor_tester = Kosmos2_5ImageProcessingTester(self, num_channels=4)
        self.expected_encoded_image_num_channels = 3

    @property
    def image_processor_dict(self):
        """Keyword arguments used to build the image processor under test."""
        return self.image_processor_tester.prepare_image_processor_dict()

    def test_image_processor_properties(self):
        image_processor = self.image_processing_class(**self.image_processor_dict)
        self.assertTrue(hasattr(image_processor, "do_normalize"))
        self.assertTrue(hasattr(image_processor, "do_convert_rgb"))

    def test_call_pil(self):
        # Initialize image_processor
        image_processor = self.image_processing_class(**self.image_processor_dict)
        # create random PIL images
        image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False)
        for image in image_inputs:
            self.assertIsInstance(image, Image.Image)

        tester = self.image_processor_tester
        patch_area = tester.patch_size["height"] * tester.patch_size["width"]
        # One channel fewer than the 4-channel input is expected in the encoded patches.
        expected_hidden_dim = patch_area * (tester.num_channels - 1) + 2

        for max_patch in tester.max_patches:
            # Test not batched input
            encoded_images = image_processor(
                image_inputs[0], return_tensors="pt", max_patches=max_patch
            ).flattened_patches
            self.assertEqual(encoded_images.shape, (1, max_patch, expected_hidden_dim))

            # Test batched
            encoded_images = image_processor(
                image_inputs, return_tensors="pt", max_patches=max_patch
            ).flattened_patches
            self.assertEqual(encoded_images.shape, (tester.batch_size, max_patch, expected_hidden_dim))

    @unittest.skip(reason="Kosmos2_5ImageProcessor does not support 4 channels yet")  # FIXME Amy
    def test_call_numpy(self):
        return super().test_call_numpy()

    @unittest.skip(reason="Kosmos2_5ImageProcessor does not support 4 channels yet")  # FIXME Amy
    def test_call_pytorch(self):
        # Delegate to the parent test of the same name (was `test_call_torch`, which does not exist).
        return super().test_call_pytorch()

    # NOTE(review): the reason below probably means "does not treat ... consistently" - confirm before rewording.
    @unittest.skip(
        reason="Kosmos2_5ImageProcessor does treat numpy and PIL 4 channel images consistently"
    )  # FIXME Amy
    def test_call_numpy_4_channels(self):
        # Delegate to the parent test of the same name (was `test_call_torch`, which does not exist).
        return super().test_call_numpy_4_channels()

View File

@ -0,0 +1,848 @@
# coding=utf-8
# Copyright 2024 Microsoft Research and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Testing suite for the PyTorch KOSMOS-2.5 model."""
import copy
import inspect
import os
import tempfile
import unittest
import numpy as np
import pytest
import requests
from parameterized import parameterized
from transformers import AutoProcessor, Kosmos2_5Config
from transformers.models.kosmos2_5.configuration_kosmos2_5 import (
Kosmos2_5TextConfig,
Kosmos2_5VisionConfig,
)
from transformers.testing_utils import (
require_flash_attn,
require_torch,
require_torch_gpu,
require_torch_sdpa,
require_vision,
slow,
torch_device,
)
from transformers.utils import is_torch_available, is_vision_available
from ...generation.test_utils import GenerationTesterMixin
from ...test_configuration_common import ConfigTester
from ...test_modeling_common import (
ModelTesterMixin,
_config_zero_init,
floats_tensor,
ids_tensor,
random_attention_mask,
)
from ...test_pipeline_mixin import PipelineTesterMixin
if is_torch_available():
import torch
from transformers import Kosmos2_5ForConditionalGeneration, Kosmos2_5Model
if is_vision_available():
from PIL import Image
class Kosmos2_5VisionModelTester:
    """Builds a small `Kosmos2_5VisionConfig` and random `flattened_patches` inputs for tests."""

    def __init__(
        self,
        parent,
        batch_size=6,
        image_size=32,
        patch_size=4,
        num_channels=3,
        is_training=True,
        hidden_size=32,
        d_ff=64,
        num_hidden_layers=2,
        num_attention_heads=4,
        dropout=0,
        attention_dropout=0,
        scope=None,
    ):
        self.parent = parent
        self.scope = scope
        self.is_training = is_training

        # Input geometry.
        self.batch_size = batch_size
        self.image_size = image_size
        self.patch_size = patch_size
        self.num_channels = num_channels
        self.patch_embed_hidden_size = patch_size * patch_size * num_channels
        # in ViT, the seq length equals the number of patches + 1 (we add 1 for the [CLS] token)
        patches_per_side = image_size // patch_size
        self.seq_length = patches_per_side**2 + 1

        # Model dimensions.
        self.hidden_size = hidden_size
        self.d_ff = d_ff
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.dropout = dropout
        self.attention_dropout = attention_dropout

    def prepare_config_and_inputs(self):
        """Return ``(config, flattened_patches)`` with randomly filled patches."""
        patch_shape = [self.batch_size, self.seq_length, self.patch_embed_hidden_size + 2]
        flattened_patches = floats_tensor(patch_shape)
        return self.get_config(), flattened_patches

    def get_config(self):
        """Create the vision config from the values stored on this tester."""
        config_kwargs = {
            "image_size": self.image_size,
            "patch_size": self.patch_size,
            "num_channels": self.num_channels,
            "hidden_size": self.hidden_size,
            "d_ff": self.d_ff,
            "num_hidden_layers": self.num_hidden_layers,
            "num_attention_heads": self.num_attention_heads,
            "patch_embed_hidden_size": self.patch_embed_hidden_size,
            "dropout": self.dropout,
            "attention_dropout": self.attention_dropout,
        }
        return Kosmos2_5VisionConfig(**config_kwargs)

    def prepare_config_and_inputs_for_common(self):
        """Return ``(config, inputs_dict)`` with the patches stored under ``"flattened_patches"``."""
        config, flattened_patches = self.prepare_config_and_inputs()
        return config, {"flattened_patches": flattened_patches}
class Kosmos2_5TextModelTester:
    """Builds a small `Kosmos2_5TextConfig` plus random token ids and a prefix-style attention mask."""

    def __init__(
        self,
        parent,
        batch_size=6,
        seq_length=7,
        is_training=True,
        use_input_mask=True,
        use_labels=True,
        vocab_size=99,
        hidden_size=32,
        ffn_dim=64,
        num_hidden_layers=2,
        num_attention_heads=4,
        dropout=0,
        attention_dropout=0,
        max_position_embeddings=512,
        scope=None,
    ):
        self.parent = parent
        self.scope = scope

        # Input shape and switches.
        self.batch_size = batch_size
        self.seq_length = seq_length
        self.is_training = is_training
        self.use_input_mask = use_input_mask
        self.use_labels = use_labels

        # Model dimensions.
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.ffn_dim = ffn_dim
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.dropout = dropout
        self.attention_dropout = attention_dropout
        self.max_position_embeddings = max_position_embeddings

    def prepare_config_and_inputs(self):
        """Return ``(config, input_ids, input_mask)``; the mask is ``None`` when masks are disabled."""
        token_shape = [self.batch_size, self.seq_length]
        input_ids = ids_tensor(token_shape, self.vocab_size)

        input_mask = random_attention_mask(token_shape) if self.use_input_mask else None
        if input_mask is not None:
            num_rows, num_cols = input_mask.shape
            # Every row becomes a run of ones followed by zeros, split at a random position.
            split_points = np.random.randint(1, num_cols - 1, size=(num_rows,))
            for row, split in enumerate(split_points):
                input_mask[row, :split] = 1
                input_mask[row, split:] = 0

        return self.get_config(), input_ids, input_mask

    def get_config(self):
        """Create the text config from the values stored on this tester."""
        config_kwargs = {
            "vocab_size": self.vocab_size,
            "embed_dim": self.hidden_size,
            "ffn_dim": self.ffn_dim,
            "layers": self.num_hidden_layers,
            "attention_heads": self.num_attention_heads,
            "dropout": self.dropout,
            "attention_dropout": self.attention_dropout,
            "max_position_embeddings": self.max_position_embeddings,
        }
        return Kosmos2_5TextConfig(**config_kwargs)

    def prepare_config_and_inputs_for_common(self):
        """Return ``(config, inputs_dict)`` with ``input_ids`` and ``attention_mask`` entries."""
        config, input_ids, input_mask = self.prepare_config_and_inputs()
        return config, {"input_ids": input_ids, "attention_mask": input_mask}
class Kosmos2_5ModelTester:
    """Combines the text and vision testers to build `Kosmos2_5Config` objects and model inputs."""

    def __init__(
        self,
        parent,
        text_kwargs=None,
        vision_kwargs=None,
        latent_query_num=3,
        is_training=True,
    ):
        text_kwargs = {} if text_kwargs is None else text_kwargs
        vision_kwargs = {} if vision_kwargs is None else vision_kwargs

        self.parent = parent
        self.text_model_tester = Kosmos2_5TextModelTester(parent, **text_kwargs)
        self.vision_model_tester = Kosmos2_5VisionModelTester(parent, **vision_kwargs)
        self.batch_size = self.text_model_tester.batch_size  # need bs for batching_equivalence test
        self.seq_length = self.text_model_tester.seq_length
        self.latent_query_num = latent_query_num
        self.is_training = is_training

    def prepare_config_and_inputs(self):
        """Return ``(config, input_ids, attention_mask, image_embeds_position_mask, flattened_patches)``."""
        _, input_ids, attention_mask = self.text_model_tester.prepare_config_and_inputs()
        _, flattened_patches = self.vision_model_tester.prepare_config_and_inputs()

        # build `image_embeds_position_mask`: ones on `latent_query_num` positions starting at index 1
        image_embeds_position_mask = torch.zeros_like(input_ids)
        first_image_position = 1
        image_embeds_position_mask[:, first_image_position : first_image_position + self.latent_query_num] = 1

        return (
            self.get_config(),
            input_ids,
            attention_mask,
            image_embeds_position_mask,
            flattened_patches,
        )

    def get_config(self):
        """Assemble the composite config from the text and vision config dicts."""
        text_config_dict = self.text_model_tester.get_config().to_dict()
        vision_config_dict = self.vision_model_tester.get_config().to_dict()
        return Kosmos2_5Config(text_config_dict, vision_config_dict, latent_query_num=self.latent_query_num)

    def create_and_check_model(
        self,
        config,
        input_ids,
        attention_mask,
        image_embeds_position_mask,
        flattened_patches,
    ):
        """Run `Kosmos2_5Model` once and assert the shapes of `last_hidden_state` and `image_embeds`."""
        model = Kosmos2_5Model(config).to(torch_device).eval()
        with torch.no_grad():
            result = model(input_ids, flattened_patches, image_embeds_position_mask, attention_mask)

        text_tester = self.text_model_tester
        expected_hidden_shape = (text_tester.batch_size, text_tester.seq_length, text_tester.hidden_size)
        expected_image_shape = (text_tester.batch_size, self.latent_query_num, text_tester.hidden_size)
        self.parent.assertEqual(result.last_hidden_state.shape, expected_hidden_shape)
        self.parent.assertEqual(result.image_embeds.shape, expected_image_shape)

    def prepare_config_and_inputs_for_common(self):
        """Return ``(config, inputs_dict)`` with the model inputs keyed by argument name."""
        (
            config,
            input_ids,
            attention_mask,
            image_embeds_position_mask,
            flattened_patches,
        ) = self.prepare_config_and_inputs()
        inputs_dict = {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "image_embeds_position_mask": image_embeds_position_mask,
            "flattened_patches": flattened_patches,
        }
        return config, inputs_dict
# Test case for the Kosmos-2.5 models; it combines the shared model, generation and pipeline
# tester mixins with `unittest.TestCase`.
@require_torch
class Kosmos2_5ModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase):
# Model classes iterated over by the tests below (empty tuples when torch is unavailable).
all_model_classes = (Kosmos2_5Model, Kosmos2_5ForConditionalGeneration) if is_torch_available() else ()
all_generative_model_classes = (Kosmos2_5ForConditionalGeneration,) if is_torch_available() else ()
# Pipeline task name -> model class; presumably consumed by PipelineTesterMixin - TODO confirm.
pipeline_model_mapping = (
{
"feature-extraction": Kosmos2_5Model,
"image-to-text": Kosmos2_5ForConditionalGeneration,
}
if is_torch_available()
else {}
)
# Switches presumably read by the shared mixins; their exact effect is defined there, not in this file - verify.
fx_compatible = False
test_head_masking = False
test_pruning = False
test_resize_embeddings = False
test_attention_outputs = False
_is_composite = True
# TODO: `image-to-text` pipeline for this model needs Processor.
def is_pipeline_test_to_skip(
    self, pipeline_test_casse_name, config_class, model_architecture, tokenizer_name, processor_name
):
    """Return True only for the image-to-text pipeline test case; the other arguments are ignored."""
    skipped_pipeline_test = "ImageToTextPipelineTests"
    return pipeline_test_casse_name == skipped_pipeline_test
# Returns a deep copy of `inputs_dict` adjusted for `model_class`; the caller's dict is not mutated.
# NOTE(review): indentation is lost in this view, so it cannot be determined here whether the
# `if model_class.__name__ in [...]` section is nested under `if return_labels:` - confirm
# against the original file before restructuring.
def _prepare_for_class(self, inputs_dict, model_class, return_labels=False):
inputs_dict = copy.deepcopy(inputs_dict)
if return_labels:
if model_class.__name__ == "Kosmos2_5ForConditionalGeneration":
# All-zero integer labels of shape (batch_size, seq_length) taken from the text tester.
inputs_dict["labels"] = torch.zeros(
(
self.model_tester.text_model_tester.batch_size,
self.model_tester.text_model_tester.seq_length,
),
dtype=torch.long,
device=torch_device,
)
if model_class.__name__ in [
"Kosmos2_5Model",
"Kosmos2_5ForConditionalGeneration",
]:
bs, _ = inputs_dict["input_ids"].shape
seqlen = self.model_tester.text_model_tester.seq_length
# Replace the token ids by 0..seqlen-1 (wrapped by vocab size), identical for every row.
inputs_dict["input_ids"] = torch.arange(seqlen, device=torch_device).unsqueeze(0).expand(bs, seqlen)
inputs_dict["input_ids"] = inputs_dict["input_ids"] % self.model_tester.text_model_tester.vocab_size
# Attend to every position.
inputs_dict["attention_mask"] = torch.ones((bs, seqlen), device=torch_device)
# Mark the first `latent_query_num` positions of every row as image-embedding positions.
inputs_dict["image_embeds_position_mask"] = torch.zeros((bs, seqlen), device=torch_device)
inputs_dict["image_embeds_position_mask"][:, : self.model_tester.latent_query_num] = 1
return inputs_dict
def setUp(self):
    """Create the model tester and the config tester used by the tests of this class."""
    model_tester = Kosmos2_5ModelTester(self)
    config_tester = ConfigTester(self, config_class=Kosmos2_5Config, hidden_size=37)
    self.model_tester = model_tester
    self.config_tester = config_tester
# overwrite from common to skip `image_to_text_projection.latent_query`
def test_initialization(self):
    """Check that trainable parameters have a mean of 0.0 or 1.0 under the zero-init config.

    `image_to_text_projection.latent_query` is excluded from the check.
    """
    config, _ = self.model_tester.prepare_config_and_inputs_for_common()
    zero_init_config = _config_zero_init(config)
    # The original code use ` nn.Parameter(torch.randn(...))` for which this test won't pass.
    excluded_name = "image_to_text_projection.latent_query"

    for model_class in self.all_model_classes:
        model = model_class(config=zero_init_config)
        for name, param in model.named_parameters():
            if not param.requires_grad or name == excluded_name:
                continue
            rounded_mean = ((param.data.mean() * 1e9).round() / 1e9).item()
            self.assertIn(
                rounded_mean,
                [0.0, 1.0],
                msg=f"Parameter {name} of model {model_class} seems not properly initialized",
            )
def test_model(self):
    """Build inputs with the model tester and run its forward-shape check."""
    tester = self.model_tester
    tester.create_and_check_model(*tester.prepare_config_and_inputs())
def test_forward_signature(self):
    """Check that `forward` of every model class takes `input_ids` as its first argument."""
    config, _ = self.model_tester.prepare_config_and_inputs_for_common()
    expected_arg_names = ["input_ids"]

    for model_class in self.all_model_classes:
        forward_parameters = inspect.signature(model_class(config).forward).parameters
        # signature.parameters is an OrderedDict => so arg_names order is deterministic
        leading_arg_names = list(forward_parameters)[:1]
        self.assertListEqual(leading_arg_names, expected_arg_names)
def test_load_save_without_tied_weights(self):
    """Save and reload each model with untied word embeddings and compare the state dicts."""
    config, _ = self.model_tester.prepare_config_and_inputs_for_common()
    config.text_config.tie_word_embeddings = False

    for model_class in self.all_model_classes:
        model = model_class(config)
        with tempfile.TemporaryDirectory() as save_dir:
            model.save_pretrained(save_dir)
            model_reloaded, infos = model_class.from_pretrained(save_dir, output_loading_info=True)

            # Checking the state dicts are correct
            reloaded_state = model_reloaded.state_dict()
            for k, original_tensor in model.state_dict().items():
                self.assertIn(k, reloaded_state, f"Key {k} is missing from reloaded")
                torch.testing.assert_close(
                    original_tensor,
                    reloaded_state[k],
                    msg=lambda x: f"{model_class.__name__}: Tensor {k}: {x}",
                )

            # Checking there was no complain of missing weights
            self.assertEqual(infos["missing_keys"], [])
# overwrite from common in order to use `self.model_tester.text_model_tester.num_hidden_layers`
def test_hidden_states_output(self):
    """Check the number and trailing shape of hidden states, requested via inputs and via config."""
    text_tester = self.model_tester.text_model_tester

    def check_hidden_states_output(inputs_dict, config, model_class):
        model = model_class(config)
        model.to(torch_device)
        model.eval()

        with torch.no_grad():
            outputs = model(**self._prepare_for_class(inputs_dict, model_class))

        hidden_states = outputs.hidden_states
        # An explicit `expected_num_hidden_layers` on the tester takes precedence over layers + 1.
        default_num_layers = text_tester.num_hidden_layers + 1
        expected_num_layers = getattr(self.model_tester, "expected_num_hidden_layers", default_num_layers)
        self.assertEqual(len(hidden_states), expected_num_layers)

        expected_trailing_shape = [text_tester.seq_length, text_tester.hidden_size]
        self.assertListEqual(list(hidden_states[0].shape[-2:]), expected_trailing_shape)

    config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

    for model_class in self.all_model_classes:
        inputs_dict["output_hidden_states"] = True
        check_hidden_states_output(inputs_dict, config, model_class)

        # check that output_hidden_states also work using config
        del inputs_dict["output_hidden_states"]
        config.output_hidden_states = True
        check_hidden_states_output(inputs_dict, config, model_class)
# overwrite from common in order to use `config.text_config.vocab_size` instead of `config.vocab_size`
def test_tie_model_weights(self):
    """Check that resizing the token embeddings of a tied model keeps its number of parameters.

    The new vocabulary size is derived from `config.text_config.vocab_size`. The test is
    skipped when `self.test_torchscript` is false, and model classes without output
    embeddings are ignored.
    """
    if not self.test_torchscript:
        self.skipTest(reason="test_torchscript is set to False")

    config, _ = self.model_tester.prepare_config_and_inputs_for_common()

    for model_class in self.all_model_classes:
        config.torchscript = True
        model_not_tied = model_class(config)
        if model_not_tied.get_output_embeddings() is None:
            continue

        config_tied = copy.deepcopy(config)
        config_tied.torchscript = False
        model_tied = model_class(config_tied)
        num_params_before = len(list(model_tied.parameters()))

        # Check that after resize they remain tied.
        # Only the parameter count is compared; embedding and decoding weight values are not checked here.
        model_tied.resize_token_embeddings(config.text_config.vocab_size + 10)
        num_params_after = len(list(model_tied.parameters()))
        self.assertEqual(num_params_after, num_params_before)
@slow
def test_model_from_pretrained(self):
    """Load the released checkpoint and check that a model object is returned."""
    checkpoint = "microsoft/kosmos-2.5"
    self.assertIsNotNone(Kosmos2_5Model.from_pretrained(checkpoint))
@unittest.skip(reason="Does not work on the tiny model as we keep hitting edge cases.")
def test_model_parallelism(self):
    """Skipped by the decorator; the body only defers to the inherited `test_model_parallelism`."""
    super().test_model_parallelism()
# TODO: ydshieh
@require_torch_gpu
@pytest.mark.flash_attn_test
@slow
@unittest.skip(reason="kosmos-2.5 flash attention does not support right padding")
def test_flash_attn_2_inference_equivalence_right_padding(self):
    """Intentionally empty: disabled through the `unittest.skip` decorator."""
# TODO: ydshieh
@require_torch_gpu
@pytest.mark.flash_attn_test
@slow
@unittest.skip(reason="kosmos-2.5 test : the dummy inputs should be tweaked: dummy_input = inputs_dict")
def test_flash_attn_2_inference_equivalence(self):
    """Intentionally empty: disabled through the `unittest.skip` decorator."""
# TODO: ydshieh
@require_torch_sdpa
@require_torch_gpu
@slow
@unittest.skip(reason="_update_causal_mask is not implemented yet which fails this test")
def test_sdpa_can_dispatch_on_flash(self):
    """Intentionally empty: disabled through the `unittest.skip` decorator."""
# TODO: ydshieh
@unittest.skip(reason="doesn't support padding yet")
def test_eager_matches_sdpa_inference_1_bfloat16(self):
    """Intentionally empty: disabled through the `unittest.skip` decorator."""
# TODO: ydshieh
@unittest.skip(reason=" the model hasn't been added to auto class")
def test_flash_attn_2_from_config(self):
    """Intentionally empty: disabled through the `unittest.skip` decorator."""
@unittest.skip("This test is currently not well designed for multimodal model (float type as an input).")
def test_flash_attn_2_fp32_ln(self):
    """Intentionally empty: disabled through the `unittest.skip` decorator."""
@unittest.skip("This test is currently not well designed for multimodal model (float type as an input).")
def test_flash_attention_2_padding_matches_padding_free_with_position_ids(self):
    """Intentionally empty: disabled through the `unittest.skip` decorator."""
@unittest.skip("Kosmos 2.5 is multimodel and has specific input shapes.")
def test_flash_attn_2_generate_reuse_cache(self):
    """Intentionally empty: disabled through the `unittest.skip` decorator."""
@pytest.mark.generate
@parameterized.expand([("greedy", 1), ("beam search", 2)])
@unittest.skip(
    "KOSMOS-2.5 doesn't support inputs embeds. The test isn't skipped by checking input args because KOSMOS-2 has `generate()` overwritten",
)
def test_generate_from_inputs_embeds(self, _, num_beams):
    """Disabled through `unittest.skip`.

    The signature accepts the two values of each `parameterized.expand` tuple (a label and
    `num_beams`), so the expanded cases stay callable if the skip is ever removed.
    """
# TODO: ydshieh
@pytest.mark.generate
@unittest.skip(
    "Kosmos2_5ForConditionalGeneration returns `vision_model_output` which is currently not working with `stack_model_outputs`",
)
def test_beam_search_low_memory(self):
    """Intentionally empty: disabled through the `unittest.skip` decorator."""
@pytest.mark.generate
def test_left_padding_compatibility(self):
    """Check that left-padding the text inputs leaves the last-position logits (nearly) unchanged.

    Overwritten because Kosmos-2.5 also needs `flattened_patches` and an
    `image_embeds_position_mask` shifted by the amount of padding.
    """
    pad_length = 32

    def _build_forward_kwargs(token_ids, mask, offset, forward_arg_names):
        # Only pass the optional arguments that the model's `forward` accepts.
        kwargs = {"input_ids": token_ids, "attention_mask": mask}
        if "position_ids" in forward_arg_names:
            position_ids = torch.cumsum(mask, dim=-1) - 1
            position_ids.masked_fill_(mask == 0, 1)
            kwargs["position_ids"] = position_ids
        if "cache_position" in forward_arg_names:
            kwargs["cache_position"] = torch.arange(token_ids.shape[-1], device=torch_device)
        if "image_embeds_position_mask" in forward_arg_names:
            image_mask = torch.zeros_like(token_ids)
            first_image_position = offset + 1
            image_mask[:, first_image_position : first_image_position + self.model_tester.latent_query_num] = 1
            kwargs["image_embeds_position_mask"] = image_mask
        return kwargs

    for model_class in self.all_generative_model_classes:
        config, inputs_dict = self.prepare_config_and_inputs_for_generate()
        input_ids = inputs_dict["input_ids"]
        flattened_patches = inputs_dict["flattened_patches"]
        attention_mask = inputs_dict.get("attention_mask")
        if attention_mask is None:
            attention_mask = torch.ones_like(input_ids)

        model = model_class(config).to(torch_device).eval()
        signature = inspect.signature(model.forward).parameters.keys()

        # no cache as some models require special cache classes to be init outside forward
        model.generation_config.use_cache = False

        # Without padding
        plain_kwargs = _build_forward_kwargs(input_ids, attention_mask, 0, signature)
        next_logits_wo_padding = model(**plain_kwargs, flattened_patches=flattened_patches).logits[:, -1, :]

        # With left-padding (length 32)
        # can hardcode pad_token to be 0 as we'll do attn masking anyway
        text_pad_token_id = config.get_text_config().pad_token_id
        pad_token_id = text_pad_token_id if text_pad_token_id is not None else 0
        padding_shape = (input_ids.shape[0], pad_length)
        padding = torch.ones(padding_shape, dtype=input_ids.dtype, device=torch_device) * pad_token_id
        padded_input_ids = torch.cat((padding, input_ids), dim=1)
        padded_attention_mask = torch.cat((torch.zeros_like(padding), attention_mask), dim=1)
        padded_kwargs = _build_forward_kwargs(padded_input_ids, padded_attention_mask, pad_length, signature)
        next_logits_with_padding = model(**padded_kwargs, flattened_patches=flattened_patches).logits[:, -1, :]

        # They should result in very similar logits
        self.assertTrue(torch.allclose(next_logits_wo_padding, next_logits_with_padding, atol=1e-3))
# Traces every model class with `torch.jit.trace`, saves and reloads the traced module, then
# checks that the reloaded state dict matches the eager model's. Skipped when
# `self.test_torchscript` is false.
# NOTE(review): indentation is lost in this view; the comments below describe the statements
# in order and do not assert how they are nested.
def _create_and_check_torchscript(self, config, inputs_dict):
if not self.test_torchscript:
self.skipTest(reason="test_torchscript is set to False")
configs_no_init = _config_zero_init(config) # To be sure we have no Nan
configs_no_init.torchscript = True
for model_class in self.all_model_classes:
model = model_class(config=configs_no_init)
model.to(torch_device)
model.eval()
inputs = self._prepare_for_class(inputs_dict, model_class)
main_input_name = model_class.main_input_name
# Run one eager forward pass, then trace with the same three positional inputs.
try:
main_input = inputs[main_input_name]
model(
main_input,
inputs["flattened_patches"],
inputs["image_embeds_position_mask"],
)
traced_model = torch.jit.trace(
model,
(
main_input,
inputs["flattened_patches"],
inputs["image_embeds_position_mask"],
),
)
except RuntimeError:
self.fail("Couldn't trace module.")
# Round-trip the traced module through a file in a temporary directory.
with tempfile.TemporaryDirectory() as tmp_dir_name:
pt_file_name = os.path.join(tmp_dir_name, "traced_model.pt")
try:
torch.jit.save(traced_model, pt_file_name)
except Exception:
self.fail("Couldn't save module.")
try:
loaded_model = torch.jit.load(pt_file_name)
except Exception:
self.fail("Couldn't load module.")
model.to(torch_device)
model.eval()
loaded_model.to(torch_device)
loaded_model.eval()
model_state_dict = model.state_dict()
loaded_model_state_dict = loaded_model.state_dict()
# Keys present only in the reloaded state dict are set aside as non-persistent buffers.
non_persistent_buffers = {}
for key in loaded_model_state_dict.keys():
if key not in model_state_dict.keys():
non_persistent_buffers[key] = loaded_model_state_dict[key]
loaded_model_state_dict = {
key: value for key, value in loaded_model_state_dict.items() if key not in non_persistent_buffers
}
self.assertEqual(set(model_state_dict.keys()), set(loaded_model_state_dict.keys()))
# Each set-aside buffer must equal some buffer of the eager model; a matched buffer is
# popped so that it is not matched a second time.
model_buffers = list(model.buffers())
for non_persistent_buffer in non_persistent_buffers.values():
found_buffer = False
for i, model_buffer in enumerate(model_buffers):
if torch.equal(non_persistent_buffer, model_buffer):
found_buffer = True
break
self.assertTrue(found_buffer)
model_buffers.pop(i)
# Compare every tensor that exists in both state dicts element-wise.
models_equal = True
for layer_name, p1 in model_state_dict.items():
if layer_name in loaded_model_state_dict:
p2 = loaded_model_state_dict[layer_name]
if p1.data.ne(p2.data).sum() > 0:
models_equal = False
self.assertTrue(models_equal)
# Avoid memory leak. Without this, each call increase RAM usage by ~20MB.
# (Even with this call, there are still memory leak by ~0.04MB)
self.clear_torch_jit_class_registry()
@require_vision
@require_torch
@slow
class Kosmos2_5ModelIntegrationTest(unittest.TestCase):
    """End-to-end generation tests for the `microsoft/kosmos-2.5` checkpoint.

    Every test loads the checkpoint in bfloat16 with one attention implementation
    (`eager`, `sdpa` or `flash_attention_2`), runs the `<ocr>` and `<md>` prompts on
    a receipt image and compares the decoded text with recorded reference strings.
    """

    # Major CUDA compute capability of the runner (A10 or T4 on CI). The generations
    # differ slightly between hardware, so some expectations are keyed by this value.
    cuda_compute_capability_major_version = None

    @classmethod
    def setUpClass(cls):
        """Record the major CUDA compute capability when a CUDA device is available."""
        if is_torch_available() and torch.cuda.is_available():
            # 8 is for A100 / A10 and 7 for T4
            cls.cuda_compute_capability_major_version = torch.cuda.get_device_capability()[0]

    def _load_image_model_and_processor(self, attn_implementation):
        """Download the receipt image and load model + processor for one attention backend.

        Args:
            attn_implementation: value forwarded to `from_pretrained(attn_implementation=...)`.

        Returns:
            A `(image, model, processor)` tuple.
        """
        url = "https://huggingface.co/microsoft/kosmos-2.5/resolve/main/receipt_00008.png"
        repo = "microsoft/kosmos-2.5"
        image = Image.open(requests.get(url, stream=True).raw)
        model = Kosmos2_5ForConditionalGeneration.from_pretrained(
            repo,
            device_map=torch_device,
            torch_dtype=torch.bfloat16,
            attn_implementation=attn_implementation,
        )
        processor = AutoProcessor.from_pretrained(repo)
        return image, model, processor

    def _expected_for_device(self, expected_by_capability):
        """Return the reference output recorded for the current GPU generation.

        Fails the test with an explicit message (instead of a bare `KeyError`) when no
        reference was recorded for the detected compute capability, which includes the
        `None` value left in place when CUDA is unavailable.
        """
        major = self.cuda_compute_capability_major_version
        self.assertIn(
            major,
            expected_by_capability,
            f"No reference output recorded for CUDA compute capability major version {major!r}.",
        )
        return expected_by_capability[major]

    def run_example(self, prompt, image, model, processor):
        """Run generation for one prompt/image pair.

        Args:
            prompt: text prompt handed to the processor (the tests use `"<ocr>"` and `"<md>"`).
            image: image handed to the processor.
            model: model whose `generate` method is called.
            processor: processor used to build the inputs and to decode the outputs.

        Returns:
            A `(generated_ids, generated_text)` tuple, where `generated_text` is the result of
            `processor.batch_decode(..., skip_special_tokens=True)`.
        """
        inputs = processor(text=prompt, images=image, return_tensors="pt")
        # `height` and `width` are returned by the processor but are not forwarded to `generate`.
        inputs.pop("height")
        inputs.pop("width")
        inputs = {k: v.to(torch_device) if v is not None else None for k, v in inputs.items()}
        # The patches must use the same dtype as the model weights.
        inputs["flattened_patches"] = inputs["flattened_patches"].to(model.dtype)
        generated_ids = model.generate(
            **inputs,
            max_new_tokens=1024,
        )
        generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)
        return generated_ids, generated_text

    def test_eager(self):
        """Check `<ocr>` and `<md>` generations with the eager attention implementation."""
        image, model, processor = self._load_image_model_and_processor("eager")

        _, generated_text = self.run_example("<ocr>", image, model, processor)
        EXPECTED_TEXT = {
            7: [
                "<bbox><x_53><y_573><x_69><y_606></bbox>1\n<bbox><x_79><y_573><x_464><y_611></bbox>[REG] BLACK SAKURA\n<bbox><x_690><y_569><x_810><y_606></bbox>45,455\n<bbox><x_53><y_614><x_69><y_648></bbox>1\n<bbox><x_79><y_614><x_468><y_651></bbox>COOKIE DOH SAUCES\n<bbox><x_788><y_609><x_812><y_642></bbox>0\n<bbox><x_50><y_658><x_69><y_693></bbox>1\n<bbox><x_79><y_658><x_358><y_693></bbox>NATA DE COCO\n<bbox><x_790><y_652><x_814><y_683></bbox>0\n<bbox><x_31><y_742><x_820><y_781></bbox>Sub Total 45,455\n<bbox><x_27><y_781><x_822><y_827></bbox>PB1 (10%) 4,545\n<bbox><x_27><y_826><x_824><y_872></bbox>Rounding 0\n<bbox><x_24><y_872><x_827><y_921></bbox>Total 50,000\n<bbox><x_17><y_1056><x_836><y_1108></bbox>Card Payment 50,000\n"
            ],
            8: [
                "<bbox><x_53><y_573><x_69><y_606></bbox>1\n<bbox><x_79><y_573><x_464><y_611></bbox>[REG] BLACK SAKURA\n<bbox><x_690><y_569><x_810><y_606></bbox>45,455\n<bbox><x_53><y_614><x_69><y_648></bbox>1\n<bbox><x_79><y_614><x_468><y_650></bbox>COOKIE DOH SAUCES\n<bbox><x_788><y_609><x_812><y_644></bbox>0\n<bbox><x_50><y_658><x_69><y_693></bbox>1\n<bbox><x_79><y_658><x_358><y_693></bbox>NATA DE COCO\n<bbox><x_790><y_652><x_814><y_687></bbox>0\n<bbox><x_31><y_742><x_820><y_781></bbox>Sub Total 45,455\n<bbox><x_27><y_781><x_822><y_827></bbox>PB1 (10%) 4,545\n<bbox><x_27><y_826><x_824><y_872></bbox>Rounding 0\n<bbox><x_24><y_872><x_827><y_921></bbox>Total 50,000\n<bbox><x_17><y_1056><x_836><y_1108></bbox>Card Payment 50,000\n"
            ],
        }
        self.assertListEqual(generated_text, self._expected_for_device(EXPECTED_TEXT))

        _, generated_text = self.run_example("<md>", image, model, processor)
        EXPECTED_TEXT = {
            7: [
                "- **1 \\[REG\\] BLACK SAKURA** 45,455\n- **1 COOKIE DOH SAUCES** 0\n- **1 NATA DE COCO** 0\n- **Sub Total** 45,455\n- **PB1 (10%)** 4,545\n- **Rounding** 0\n- **Total** **50,000**\n\nCard Payment 50,000"
            ],
            8: [
                "- **1 \\[REG\\] BLACK SAKURA** 45,455\n- **1 COOKIE DOH SAUCES** 0\n- **1 NATA DE COCO** 0\n- **Sub Total** 45,455\n- **PB1 (10%)** 4,545\n- **Rounding** 0\n- **Total** **50,000**\n\nCard Payment 50,000"
            ],
        }
        self.assertListEqual(generated_text, self._expected_for_device(EXPECTED_TEXT))

    def test_sdpa(self):
        """Check `<ocr>` and `<md>` generations with the SDPA attention implementation."""
        image, model, processor = self._load_image_model_and_processor("sdpa")

        _, generated_text = self.run_example("<ocr>", image, model, processor)
        EXPECTED_TEXT = {
            7: [
                "<bbox><x_53><y_573><x_69><y_606></bbox>1\n<bbox><x_79><y_573><x_464><y_611></bbox>[REG] BLACK SAKURA\n<bbox><x_690><y_569><x_810><y_606></bbox>45,455\n<bbox><x_53><y_614><x_69><y_648></bbox>1\n<bbox><x_79><y_614><x_468><y_651></bbox>COOKIE DOH SAUCES\n<bbox><x_788><y_609><x_812><y_642></bbox>0\n<bbox><x_50><y_658><x_69><y_693></bbox>1\n<bbox><x_79><y_658><x_358><y_693></bbox>NATA DE COCO\n<bbox><x_790><y_652><x_814><y_683></bbox>0\n<bbox><x_31><y_742><x_820><y_781></bbox>Sub Total 45,455\n<bbox><x_27><y_781><x_822><y_827></bbox>PB1 (10%) 4,545\n<bbox><x_27><y_826><x_824><y_872></bbox>Rounding 0\n<bbox><x_24><y_872><x_827><y_921></bbox>Total 50,000\n<bbox><x_17><y_1056><x_836><y_1108></bbox>Card Payment 50,000\n",
            ],
            8: [
                "<bbox><x_53><y_573><x_69><y_606></bbox>1\n<bbox><x_79><y_573><x_464><y_611></bbox>[REG] BLACK SAKURA\n<bbox><x_690><y_569><x_810><y_606></bbox>45,455\n<bbox><x_53><y_614><x_69><y_648></bbox>1\n<bbox><x_79><y_614><x_468><y_651></bbox>COOKIE DOH SAUCES\n<bbox><x_788><y_609><x_812><y_642></bbox>0\n<bbox><x_50><y_658><x_69><y_693></bbox>1\n<bbox><x_79><y_658><x_358><y_693></bbox>NATA DE COCO\n<bbox><x_790><y_652><x_814><y_683></bbox>0\n<bbox><x_31><y_742><x_820><y_781></bbox>Sub Total 45,455\n<bbox><x_27><y_781><x_822><y_827></bbox>PB1 (10%) 4,545\n<bbox><x_27><y_826><x_824><y_872></bbox>Rounding 0\n<bbox><x_24><y_872><x_827><y_921></bbox>Total 50,000\n<bbox><x_17><y_1056><x_836><y_1108></bbox>Card Payment 50,000\n"
            ],
        }
        self.assertListEqual(generated_text, self._expected_for_device(EXPECTED_TEXT))

        _, generated_text = self.run_example("<md>", image, model, processor)
        EXPECTED_TEXT = {
            7: [
                "- **1 \\[REG\\] BLACK SAKURA** 45,455\n- **1 COOKIE DOH SAUCES** 0\n- **1 NATA DE COCO** 0\n- **Sub Total** 45,455\n- **PB1 (10%)** 4,545\n- **Rounding** 0\n- **Total** **50,000**\n\nCard Payment 50,000"
            ],
            8: [
                "- **1 \\[REG\\] BLACK SAKURA** 45,455\n- **1 COOKIE DOH SAUCES** 0\n- **1 NATA DE COCO** 0\n- **Sub Total** 45,455\n- **PB1 (10%)** 4,545\n- **Rounding** 0\n- **Total** **50,000**\n\nCard Payment 50,000"
            ],
        }
        self.assertListEqual(generated_text, self._expected_for_device(EXPECTED_TEXT))

    @require_flash_attn
    @require_torch_gpu
    @pytest.mark.flash_attn_test
    @slow
    def test_FA2(self):
        """Check `<ocr>` and `<md>` generations with the Flash Attention 2 implementation."""
        image, model, processor = self._load_image_model_and_processor("flash_attention_2")

        _, generated_text = self.run_example("<ocr>", image, model, processor)
        EXPECTED_TEXT = [
            "<bbox><x_53><y_573><x_69><y_606></bbox>1\n<bbox><x_79><y_573><x_464><y_612></bbox>[REG] BLACK SAKURA\n<bbox><x_690><y_569><x_812><y_606></bbox>45,455\n<bbox><x_53><y_614><x_69><y_650></bbox>1\n<bbox><x_79><y_614><x_468><y_650></bbox>COOKIE DOH SAUCES\n<bbox><x_788><y_610><x_813><y_644></bbox>0\n<bbox><x_50><y_658><x_65><y_693></bbox>1\n<bbox><x_76><y_658><x_358><y_693></bbox>NATA DE COCO\n<bbox><x_790><y_652><x_815><y_687></bbox>0\n<bbox><x_31><y_742><x_822><y_781></bbox>Sub Total 45,455\n<bbox><x_27><y_780><x_822><y_827></bbox>PB1 (10%) 4,545\n<bbox><x_27><y_826><x_824><y_874></bbox>Rounding 0\n<bbox><x_24><y_872><x_827><y_921></bbox>Total 50,000\n<bbox><x_17><y_1056><x_835><y_1108></bbox>Card Payment 50,000\n"
        ]
        self.assertListEqual(generated_text, EXPECTED_TEXT)

        _, generated_text = self.run_example("<md>", image, model, processor)
        # A10 gives the 1st one, but A100 gives the 2nd one
        EXPECTED_TEXT = [
            "- **1 \\[REG\\] BLACK SAKURA** 45,455\n- **1 COOKIE DOH SAUCES** 0\n- **1 NATA DE COCO** 0\n\n<table>\n<thead>\n<tr>\n<th>\nSub Total\n</th>\n<th>\n45,455\n</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>\nPB1 (10%)\n</td>\n<td>\n4,545\n</td>\n</tr>\n<tr>\n<td>\nRounding\n</td>\n<td>\n0\n</td>\n</tr>\n<tr>\n<td>\n<strong>\nTotal\n</strong>\n</td>\n<td>\n<strong>\n50,000\n</strong>\n</td>\n</tr>\n</tbody>\n</table>\n\nCard Payment 50,000",
            "- **1 \\[REG\\] BLACK SAKURA** 45,455\n- **1 COOKIE DOH SAUCES** 0\n- **1 NATA DE COCO** 0\n- **Sub Total** 45,455\n- **PB1 (10%)** 4,545\n- **Rounding** 0\n- **Total** **50,000**\n",
        ]
        # Both strings are accepted because the output depends on the GPU model.
        self.assertIn(generated_text[0], EXPECTED_TEXT)

View File

@ -0,0 +1,391 @@
# coding=utf-8
# Copyright 2024 Microsoft Research and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import shutil
import tempfile
import unittest
from tempfile import TemporaryDirectory
import numpy as np
import pytest
import requests
from transformers.testing_utils import (
require_torch,
require_vision,
)
from transformers.utils import is_vision_available
from ...test_processing_common import ProcessorTesterMixin
if is_vision_available():
from PIL import Image
from transformers import (
AutoProcessor,
AutoTokenizer,
Kosmos2_5ImageProcessor,
Kosmos2_5Processor,
PreTrainedTokenizerFast,
)
@require_vision
class Kosmos2_5ProcessorTest(ProcessorTesterMixin, unittest.TestCase):
    """Tests for `Kosmos2_5Processor`.

    The processor returns `flattened_patches` (not `pixel_values`), so several tests from
    `ProcessorTesterMixin` are rewritten here with Kosmos-2.5 specific arguments and
    expectations, and the image-only / text-only tests are skipped.
    """

    processor_class = Kosmos2_5Processor
    images_input_name = "flattened_patches"

    def setUp(self):
        """Save a processor built from a default image processor and the Hub tokenizer to a temp dir."""
        self.tmpdirname = tempfile.mkdtemp()
        image_processor = Kosmos2_5ImageProcessor()
        tokenizer = AutoTokenizer.from_pretrained("microsoft/kosmos-2.5")
        processor = Kosmos2_5Processor(image_processor, tokenizer)
        processor.save_pretrained(self.tmpdirname)

    def get_tokenizer(self, **kwargs):
        """Return the tokenizer of the processor saved in `setUp`, reloaded with `kwargs`."""
        return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).tokenizer

    def get_image_processor(self, **kwargs):
        """Return the image processor of the processor saved in `setUp`, reloaded with `kwargs`."""
        return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).image_processor

    def tearDown(self):
        """Remove the temporary directory created in `setUp`."""
        shutil.rmtree(self.tmpdirname)

    def _build_processor(self, image_processor_kwargs=None, tokenizer_kwargs=None):
        """Build a processor from freshly loaded components, skipping the test when not applicable.

        Args:
            image_processor_kwargs: optional kwargs forwarded to `get_component("image_processor", ...)`.
            tokenizer_kwargs: optional kwargs forwarded to `get_component("tokenizer", ...)`.

        Returns:
            An instance of `self.processor_class`.
        """
        if "image_processor" not in self.processor_class.attributes:
            self.skipTest(f"image_processor attribute not present in {self.processor_class}")
        image_processor = self.get_component("image_processor", **(image_processor_kwargs or {}))
        tokenizer = self.get_component("tokenizer", **(tokenizer_kwargs or {}))
        processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
        self.skip_processor_without_typed_kwargs(processor)
        return processor

    def test_image_procesor_load_save_reload(self):
        """Loading from the Hub, saving and reloading locally must round-trip the configuration."""
        # make sure load from Hub repo. -> save -> reload locally work
        image_processor = Kosmos2_5ImageProcessor.from_pretrained("microsoft/kosmos-2.5")
        with TemporaryDirectory() as tmp_dir:
            image_processor.save_pretrained(tmp_dir)
            reloaded_image_processor = Kosmos2_5ImageProcessor.from_pretrained(tmp_dir)
            # unittest assertions (instead of bare `assert`) still run under `python -O`
            # and report a diff on failure.
            self.assertEqual(image_processor.to_dict(), reloaded_image_processor.to_dict())
            self.assertEqual(image_processor.to_json_string(), reloaded_image_processor.to_json_string())

    def test_save_load_pretrained_additional_features(self):
        """Extra kwargs given to `from_pretrained` must reach both the tokenizer and the image processor."""
        processor = Kosmos2_5Processor(tokenizer=self.get_tokenizer(), image_processor=self.get_image_processor())
        processor.save_pretrained(self.tmpdirname)

        tokenizer_add_kwargs = self.get_tokenizer(bos_token="(BOS)", eos_token="(EOS)")
        image_processor_add_kwargs = self.get_image_processor(do_normalize=False, padding_value=1.0)

        processor = Kosmos2_5Processor.from_pretrained(
            self.tmpdirname,
            bos_token="(BOS)",
            eos_token="(EOS)",
            do_normalize=False,
            padding_value=1.0,
        )

        self.assertEqual(processor.tokenizer.get_vocab(), tokenizer_add_kwargs.get_vocab())
        self.assertIsInstance(processor.tokenizer, PreTrainedTokenizerFast)

        self.assertEqual(
            processor.image_processor.to_json_string(),
            image_processor_add_kwargs.to_json_string(),
        )
        self.assertIsInstance(processor.image_processor, Kosmos2_5ImageProcessor)

    @unittest.skip(reason="kosmos-2.5 must have both image and text")
    def test_image_processor(self):
        pass

    @unittest.skip(reason="kosmos-2.5 must have both image and text")
    def test_tokenizer(self):
        pass

    def test_tokenizer_decode(self):
        """`processor.batch_decode` must return the same strings as `tokenizer.batch_decode`."""
        image_processor = self.get_image_processor()
        tokenizer = self.get_tokenizer()
        processor = Kosmos2_5Processor(tokenizer=tokenizer, image_processor=image_processor)

        predicted_ids = [[1, 4, 5, 8, 1, 0, 8], [3, 4, 3, 1, 1, 8, 9]]

        decoded_processor = processor.batch_decode(predicted_ids)
        decoded_tok = tokenizer.batch_decode(predicted_ids)

        self.assertListEqual(decoded_tok, decoded_processor)

    def test_can_load_various_tokenizers(self):
        """`AutoProcessor` and `AutoTokenizer` must resolve to the same tokenizer class per checkpoint."""
        for checkpoint in ["microsoft/kosmos-2.5", "kirp/kosmos2_5"]:
            processor = AutoProcessor.from_pretrained(checkpoint)
            tokenizer = AutoTokenizer.from_pretrained(checkpoint)
            self.assertEqual(processor.tokenizer.__class__, tokenizer.__class__)

    @require_torch
    def test_model_input_names(self):
        """Check the keys (and their order) returned for image + text, and the error on empty input."""
        image_processor = self.get_image_processor()
        tokenizer = self.get_tokenizer()
        processor = Kosmos2_5Processor(tokenizer=tokenizer, image_processor=image_processor)

        input_str = "This is a test"
        image_input = self.prepare_image_inputs()

        # both image and text
        inputs = processor(text=input_str, images=image_input)
        self.assertListEqual(
            list(inputs.keys()),
            [
                "flattened_patches",
                "attention_mask",
                "width",
                "height",
                "input_ids",
                "image_embeds_position_mask",
            ],
        )

        # test if it raises when no input is passed
        with pytest.raises(ValueError):
            processor()

    @require_torch
    @require_vision
    def test_image_processor_defaults_preserved_by_image_kwargs(self):
        """Defaults set on the image processor component must be used when calling the processor."""
        # Rewrite as KOSMOS-2.5 processor return "flattened_patches" and not "pixel_values"
        processor = self._build_processor(
            image_processor_kwargs={"max_patches": 1024, "patch_size": {"height": 8, "width": 8}},
            tokenizer_kwargs={"max_length": 117, "padding": "max_length"},
        )

        input_str = self.prepare_text_inputs()
        image_input = self.prepare_image_inputs()

        inputs = processor(text=input_str, images=image_input)
        # NOTE(review): 194 is presumably 8 * 8 * 3 pixel values plus 2 leading position entries
        # per patch - confirm against the image processor.
        self.assertEqual(len(inputs["flattened_patches"][0][0]), 194)

    @require_torch
    @require_vision
    def test_kwargs_overrides_default_image_processor_kwargs(self):
        """A `max_patches` passed at call time must override the component's default."""
        # Rewrite as KOSMOS-2.5 processor return "flattened_patches" and not "pixel_values"
        processor = self._build_processor(
            image_processor_kwargs={"max_patches": 4096},
            tokenizer_kwargs={"max_length": 117, "padding": "max_length"},
        )

        input_str = self.prepare_text_inputs()
        image_input = self.prepare_image_inputs()

        inputs = processor(text=input_str, images=image_input, max_patches=1024)
        self.assertEqual(len(inputs["flattened_patches"][0]), 1024)

    @require_torch
    @require_vision
    def test_unstructured_kwargs(self):
        """Flat (unstructured) kwargs must be routed to the image processor and the tokenizer."""
        # Rewrite as KOSMOS-2.5 processor doesn't use `rescale_factor`
        processor = self._build_processor()

        input_str = self.prepare_text_inputs()
        image_input = self.prepare_image_inputs()
        inputs = processor(
            text=input_str,
            images=image_input,
            return_tensors="pt",
            max_patches=1024,
            padding="max_length",
            max_length=76,
        )

        self.assertEqual(inputs["flattened_patches"].shape[1], 1024)
        self.assertEqual(len(inputs["input_ids"][0]), 76)

    @require_torch
    @require_vision
    def test_unstructured_kwargs_batched(self):
        """Same as `test_unstructured_kwargs`, with a batch of two and `padding="longest"`."""
        # Rewrite as KOSMOS-2.5 processor doesn't use `rescale_factor`
        processor = self._build_processor()

        input_str = self.prepare_text_inputs(batch_size=2)
        image_input = self.prepare_image_inputs(batch_size=2)
        inputs = processor(
            text=input_str,
            images=image_input,
            return_tensors="pt",
            max_patches=1024,
            padding="longest",
            max_length=76,
        )

        self.assertEqual(inputs["flattened_patches"].shape[1], 1024)
        self.assertEqual(len(inputs["input_ids"][0]), 76)

    @require_torch
    @require_vision
    def test_structured_kwargs_nested(self):
        """Kwargs grouped per modality must be routed to the matching component."""
        # Rewrite as KOSMOS-2.5 processor doesn't use `rescale_factor`
        # `_build_processor` already performs the typed-kwargs skip check, so it is not
        # repeated after the processor call.
        processor = self._build_processor()

        input_str = self.prepare_text_inputs()
        image_input = self.prepare_image_inputs()

        # Define the kwargs for each modality
        all_kwargs = {
            "common_kwargs": {"return_tensors": "pt"},
            "images_kwargs": {"max_patches": 1024},
            "text_kwargs": {"padding": "max_length", "max_length": 76},
        }

        inputs = processor(text=input_str, images=image_input, **all_kwargs)

        self.assertEqual(inputs["flattened_patches"].shape[1], 1024)
        self.assertEqual(len(inputs["input_ids"][0]), 76)

    @require_torch
    @require_vision
    def test_structured_kwargs_nested_from_dict(self):
        """Kwargs grouped per modality and unpacked from a dict must be routed correctly."""
        # Rewrite as KOSMOS-2.5 processor doesn't use `rescale_factor`
        processor = self._build_processor()

        input_str = self.prepare_text_inputs()
        image_input = self.prepare_image_inputs()

        # Define the kwargs for each modality
        all_kwargs = {
            "common_kwargs": {"return_tensors": "pt"},
            "images_kwargs": {"max_patches": 1024},
            "text_kwargs": {"padding": "max_length", "max_length": 76},
        }

        inputs = processor(text=input_str, images=image_input, **all_kwargs)
        self.assertEqual(inputs["flattened_patches"].shape[1], 1024)
        self.assertEqual(len(inputs["input_ids"][0]), 76)

    @require_torch
    def test_full_processor(self):
        """Check the Hub processor's outputs on a real receipt image against recorded values."""
        url = "https://huggingface.co/kirp/kosmos2_5/resolve/main/receipt_00008.png"

        processor = AutoProcessor.from_pretrained("microsoft/kosmos-2.5")
        texts = ["<md>", "<ocr>"]
        expected_input_ids = [
            [100288],
            [100282],
        ]
        expected_attention_mask = [[1], [1]]

        image = Image.open(requests.get(url, stream=True).raw)
        # To match the official (microsoft) Kosmos-2 demo from which the expected values here are grabbed
        image_path = os.path.join(self.tmpdirname, "image.png")
        image.save(image_path)
        image = Image.open(image_path)

        # First ten values of patch 1 and of patch 200.
        EXPECTED_FP_1 = [
            1.0,
            2.0,
            -2.9527735710144043,
            -2.672085762023926,
            -2.9933173656463623,
            -2.905944585800171,
            -2.5891761779785156,
            -2.8751866817474365,
            -2.962153434753418,
            -2.588062047958374,
        ]
        EXPECTED_FP_200 = [
            4.0,
            45.0,
            1.5713728666305542,
            1.584628939628601,
            1.3589054346084595,
            1.6515952348709106,
            1.7014952898025513,
            1.3731343746185303,
            1.6010395288467407,
            1.6607422828674316,
        ]

        def check_sample(outputs, index, batch_size):
            """Verify sample `index` of `outputs` for a call made with `batch_size` images."""
            # 2048 image placeholder ids sit between ids 100283 and 100284, followed by the prompt ids.
            self.assertListEqual(
                outputs.input_ids[index].numpy().tolist(),
                [0, 100283] + [0] * 2048 + [100284] + expected_input_ids[index],
            )
            self.assertListEqual(
                outputs.image_embeds_position_mask[index].numpy().tolist(),
                [0, -1] + [1] * 2048 + [-1] + [0] * (len(expected_input_ids[index])),
            )
            self.assertListEqual(
                outputs.attention_mask[index].numpy().tolist(),
                [1, 1] + [1] * 2048 + [1] + expected_attention_mask[index],
            )
            self.assertTupleEqual(outputs.flattened_patches.shape, (batch_size, 4096, 770))
            np.testing.assert_allclose(
                outputs.flattened_patches[index][1][:10].numpy().tolist(),
                EXPECTED_FP_1,
                atol=1e-9,
            )
            np.testing.assert_allclose(
                outputs.flattened_patches[index][200][:10].numpy().tolist(),
                EXPECTED_FP_200,
                atol=1e-9,
            )

        # test single image
        check_sample(processor(images=image, text=texts[0]), index=0, batch_size=1)

        # test a batch of images and texts, right padding
        check_sample(processor(images=[image, image], text=texts), index=1, batch_size=2)

View File

@ -85,6 +85,9 @@ PRIVATE_MODELS = [
"Idefics2PerceiverResampler",
"Idefics2VisionTransformer",
"Idefics3VisionTransformer",
"Kosmos2_5TextModel",
"Kosmos2_5TextForCausalLM",
"Kosmos2_5VisionModel",
"AriaTextForCausalLM",
"AriaTextModel",
]