Compare commits

...

11 Commits

SHA1 Message Date
c083a0ef5c undo 2025-04-28 15:03:12 +02:00
ceb666f8a9 copies 2025-04-28 15:00:53 +02:00
ff54c1d043 markuplm 2025-04-28 14:57:34 +02:00
ca8858eb63 udop 2025-04-28 14:57:34 +02:00
59f9c9c502 layoutlmv2 2025-04-28 14:57:34 +02:00
37b52214f0 check bytelevel bug, skip 2025-04-28 14:57:34 +02:00
9024123e99 adding test for empty string 2025-04-28 14:57:34 +02:00
2e7e0f6726 revert files 2025-04-28 14:57:34 +02:00
a6ecd8bafb revert files 2025-04-28 14:57:33 +02:00
479859932d revert files 2025-04-28 14:57:33 +02:00
7e510e14a6 handle when text str is '' 2025-04-28 14:57:33 +02:00
6 changed files with 86 additions and 1 deletion

@@ -611,7 +611,7 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase):
         split_special_tokens: bool = False,
         **kwargs,
     ) -> BatchEncoding:
-        batched_input = [(text, text_pair)] if text_pair else [text]
+        batched_input = [(text, text_pair)] if text_pair is not None else [text]
         batched_output = self._batch_encode_plus(
             batched_input,
             is_split_into_words=is_split_into_words,
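
Why this hunk swaps the truthiness check for "is not None": an empty string is falsy in Python, so the old check silently dropped an explicitly passed empty text_pair and encoded the call as a single sequence. A minimal sketch of the difference:

    # "" is falsy, so the old check loses an explicitly passed empty pair
    text, text_pair = "hello", ""
    [(text, text_pair)] if text_pair else [text]              # -> ["hello"] (pair dropped)
    [(text, text_pair)] if text_pair is not None else [text]  # -> [("hello", "")]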

@@ -1354,6 +1354,22 @@ class LayoutLMv2TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
                 rust_ids = rust_tokenizer.encode(words, boxes=boxes, add_special_tokens=True)
                 self.assertListEqual(ids, rust_ids)
 
+    def test_call_empty_strings(self):
+        if not self.test_slow_tokenizer:
+            # as we don't have a slow version, we can't compare the outputs between slow and fast versions
+            self.skipTest(reason="test_slow_tokenizer is set to False")
+
+        for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
+            with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
+                tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
+                tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)
+
+                boxes = [[1, 8], [1, 8]]
+                input_p = tokenizer_p(["", ""], boxes=boxes)
+                input_r = tokenizer_r(["", ""], boxes=boxes)
+
+                self.assertEqual(input_p["input_ids"], input_r["input_ids"])
+
     def test_tokenization_python_rust_equals(self):
         if not self.test_slow_tokenizer:
             # as we don't have a slow version, we can't compare the outputs between slow and fast versions
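
For illustration, a standalone version of what this LayoutLMv2 test asserts; the checkpoint name is an assumption, and the two-coordinate boxes simply mirror the test data:

    from transformers import LayoutLMv2Tokenizer, LayoutLMv2TokenizerFast

    name = "microsoft/layoutlmv2-base-uncased"  # assumed checkpoint for illustration
    slow = LayoutLMv2Tokenizer.from_pretrained(name)
    fast = LayoutLMv2TokenizerFast.from_pretrained(name)

    boxes = [[1, 8], [1, 8]]
    # slow (Python) and fast (Rust) tokenizers should agree even when every word is ""
    assert slow(["", ""], boxes=boxes)["input_ids"] == fast(["", ""], boxes=boxes)["input_ids"]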

@@ -1804,6 +1804,22 @@ class LayoutXLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
                 shutil.rmtree(tmpdirname2)
 
+    def test_call_empty_strings(self):
+        if not self.test_slow_tokenizer:
+            # as we don't have a slow version, we can't compare the outputs between slow and fast versions
+            self.skipTest(reason="test_slow_tokenizer is set to False")
+
+        for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
+            with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
+                tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
+                tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)
+
+                boxes = [[1, 8], [1, 8]]
+                input_p = tokenizer_p(["", ""], boxes=boxes)
+                input_r = tokenizer_r(["", ""], boxes=boxes)
+
+                self.assertEqual(input_p["input_ids"], input_r["input_ids"])
+
     @unittest.skip(reason="TO DO: overwrite this very extensive test.")
     def test_alignement_methods(self):
         pass

@@ -1116,6 +1116,22 @@ class MarkupLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
                 rust_ids = rust_tokenizer.encode(nodes, xpaths=xpaths, add_special_tokens=True)
                 self.assertListEqual(ids, rust_ids)
 
+    def test_call_empty_strings(self):
+        if not self.test_slow_tokenizer:
+            # as we don't have a slow version, we can't compare the outputs between slow and fast versions
+            self.skipTest(reason="test_slow_tokenizer is set to False")
+
+        for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
+            with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
+                tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
+                tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)
+
+                xpaths = ["/html/body", "/html/body"]
+                input_p = tokenizer_p(["", ""], xpaths=xpaths)
+                input_r = tokenizer_r(["", ""], xpaths=xpaths)
+
+                self.assertEqual(input_p["input_ids"], input_r["input_ids"])
+
     def test_tokenization_python_rust_equals(self):
         if not self.test_slow_tokenizer:
             # as we don't have a slow version, we can't compare the outputs between slow and fast versions

@@ -1316,6 +1316,22 @@ class UdopTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
                 tokens_p = tokenizer_p.convert_ids_to_tokens(tokens_p["input_ids"])
                 self.assertSequenceEqual(tokens_r, tokens_p)
 
+    def test_call_empty_strings(self):
+        if not self.test_slow_tokenizer:
+            # as we don't have a slow version, we can't compare the outputs between slow and fast versions
+            self.skipTest(reason="test_slow_tokenizer is set to False")
+
+        for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
+            with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
+                tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
+                tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)
+
+                boxes = [[1, 8], [1, 8]]
+                input_p = tokenizer_p(["", ""], boxes=boxes)
+                input_r = tokenizer_r(["", ""], boxes=boxes)
+
+                self.assertEqual(input_p["input_ids"], input_r["input_ids"])
+
     def test_compare_add_special_tokens(self):
         for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
             with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):

@@ -3520,6 +3520,27 @@ class TokenizerTesterMixin:
                 if tokenizer_r.num_special_tokens_to_add(pair=True):
                     self.assertIn(None, pair_batch_sequence_ids)
 
+    def test_call_empty_strings(self):
+        if not self.test_slow_tokenizer:
+            # as we don't have a slow version, we can't compare the outputs between slow and fast versions
+            self.skipTest(reason="test_slow_tokenizer is set to False")
+
+        for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
+            with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
+                tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
+
+                # not fixing this here; needs further investigation to tell whether ByteLevel's empty-string behavior is a bug
+                if type(tokenizer_r._tokenizer.pre_tokenizer).__name__ == "ByteLevel" and tokenizer_r.add_prefix_space:
+                    self.skipTest(reason="ByteLevel pre tokenizer not supported for empty strings")
+
+                tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)
+
+                # ensure slow and fast agree on basic empty-string inputs
+                input_p = tokenizer_p("", "")
+                input_r = tokenizer_r("", "")
+
+                self.assertEqual(input_p["input_ids"], input_r["input_ids"])
+
     def test_tokenization_python_rust_equals(self):
         if not self.test_slow_tokenizer:
             # as we don't have a slow version, we can't compare the outputs between slow and fast versions
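
A hedged reproduction of the underlying bug this mixin test guards against (the checkpoint and exact ids are assumptions; BERT is just a convenient model with both slow and fast tokenizers):

    from transformers import AutoTokenizer

    tok = AutoTokenizer.from_pretrained("bert-base-uncased")  # assumed checkpoint
    # Before the fix, the falsy "" text_pair was dropped by the fast path, so fast
    # and slow outputs disagreed; after it, both yield the [CLS] [SEP] [SEP] layout.
    print(tok("", "")["input_ids"])  # expected: [101, 102, 102]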