Mirror of https://github.com/huggingface/transformers.git (synced 2025-10-21 01:23:56 +08:00)

Compare commits: v4.52.1...batched_ha (11 commits)
Commits in this range:

- c083a0ef5c
- ceb666f8a9
- ff54c1d043
- ca8858eb63
- 59f9c9c502
- 37b52214f0
- 9024123e99
- 2e7e0f6726
- a6ecd8bafb
- 479859932d
- 7e510e14a6
@@ -611,7 +611,7 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase):
         split_special_tokens: bool = False,
         **kwargs,
     ) -> BatchEncoding:
-        batched_input = [(text, text_pair)] if text_pair else [text]
+        batched_input = [(text, text_pair)] if text_pair is not None else [text]
         batched_output = self._batch_encode_plus(
             batched_input,
             is_split_into_words=is_split_into_words,
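The one-line change above is a Python truthiness fix: an empty `text_pair` string is falsy, so the old condition routed `("", "")` into the single-text branch and silently dropped the pair, whereas the slow tokenizers only drop the pair when it is `None`. A standalone sketch of the two conditions (plain Python, not transformers code; `old` and `new` are just labels for illustration):

```python
text, text_pair = "", ""  # a legitimate, if degenerate, sequence pair

# Old condition: an empty string is falsy, so the pair was silently dropped.
old = [(text, text_pair)] if text_pair else [text]
# New condition: only a missing pair (None) selects the single-text branch.
new = [(text, text_pair)] if text_pair is not None else [text]

print(old)  # ['']        -> one empty sequence, e.g. [CLS] [SEP] for a BERT-like tokenizer
print(new)  # [('', '')]  -> an empty sequence pair, e.g. [CLS] [SEP] [SEP]
```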
@@ -1354,6 +1354,22 @@ class LayoutLMv2TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
         rust_ids = rust_tokenizer.encode(words, boxes=boxes, add_special_tokens=True)
         self.assertListEqual(ids, rust_ids)

+    def test_call_empty_strings(self):
+        if not self.test_slow_tokenizer:
+            # as we don't have a slow version, we can't compare the outputs between slow and fast versions
+            self.skipTest(reason="test_slow_tokenizer is set to False")
+
+        for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
+            with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
+                tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
+                tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)
+
+                boxes = [[1, 8], [1, 8]]
+                input_p = tokenizer_p(["", ""], boxes=boxes)
+                input_r = tokenizer_r(["", ""], boxes=boxes)
+
+                self.assertEqual(input_p["input_ids"], input_r["input_ids"])
+
     def test_tokenization_python_rust_equals(self):
         if not self.test_slow_tokenizer:
             # as we don't have a slow version, we can't compare the outputs between slow and fast versions
@@ -1804,6 +1804,22 @@ class LayoutXLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase):

         shutil.rmtree(tmpdirname2)

+    def test_call_empty_strings(self):
+        if not self.test_slow_tokenizer:
+            # as we don't have a slow version, we can't compare the outputs between slow and fast versions
+            self.skipTest(reason="test_slow_tokenizer is set to False")
+
+        for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
+            with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
+                tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
+                tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)
+
+                boxes = [[1, 8], [1, 8]]
+                input_p = tokenizer_p(["", ""], boxes=boxes)
+                input_r = tokenizer_r(["", ""], boxes=boxes)
+
+                self.assertEqual(input_p["input_ids"], input_r["input_ids"])
+
     @unittest.skip(reason="TO DO: overwrite this very extensive test.")
     def test_alignement_methods(self):
         pass
@@ -1116,6 +1116,22 @@ class MarkupLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
         rust_ids = rust_tokenizer.encode(nodes, xpaths=xpaths, add_special_tokens=True)
         self.assertListEqual(ids, rust_ids)

+    def test_call_empty_strings(self):
+        if not self.test_slow_tokenizer:
+            # as we don't have a slow version, we can't compare the outputs between slow and fast versions
+            self.skipTest(reason="test_slow_tokenizer is set to False")
+
+        for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
+            with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
+                tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
+                tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)
+
+                xpaths = ["/html/body", "/html/body"]
+                input_p = tokenizer_p(["", ""], xpaths=xpaths)
+                input_r = tokenizer_r(["", ""], xpaths=xpaths)
+
+                self.assertEqual(input_p["input_ids"], input_r["input_ids"])
+
     def test_tokenization_python_rust_equals(self):
         if not self.test_slow_tokenizer:
             # as we don't have a slow version, we can't compare the outputs between slow and fast versions
@@ -1316,6 +1316,22 @@ class UdopTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
                 tokens_p = tokenizer_p.convert_ids_to_tokens(tokens_p["input_ids"])
                 self.assertSequenceEqual(tokens_r, tokens_p)

+    def test_call_empty_strings(self):
+        if not self.test_slow_tokenizer:
+            # as we don't have a slow version, we can't compare the outputs between slow and fast versions
+            self.skipTest(reason="test_slow_tokenizer is set to False")
+
+        for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
+            with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
+                tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
+                tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)
+
+                boxes = [[1, 8], [1, 8]]
+                input_p = tokenizer_p(["", ""], boxes=boxes)
+                input_r = tokenizer_r(["", ""], boxes=boxes)
+
+                self.assertEqual(input_p["input_ids"], input_r["input_ids"])
+
     def test_compare_add_special_tokens(self):
         for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
             with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
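The four overrides above (LayoutLMv2, LayoutXLM, MarkupLM, UDOP) mirror the shared test added to the mixin below, but they cannot simply call `tokenizer("", "")`: these tokenizers take words or nodes together with aligned extra inputs (bounding boxes or xpaths). A rough sketch of the two call shapes; the checkpoint names are an assumption for illustration, and the dummy box/xpath values are taken from the tests above (the actual tests iterate over `self.tokenizers_list` and assert slow/fast equality):

```python
from transformers import AutoTokenizer

# Assumed public checkpoints; the overridden tests use the tester's own checkpoints instead.
layout_tok = AutoTokenizer.from_pretrained("microsoft/layoutlmv2-base-uncased")
markup_tok = AutoTokenizer.from_pretrained("microsoft/markuplm-base")

# Layout tokenizers need one bounding box per (possibly empty) word.
layout_enc = layout_tok(["", ""], boxes=[[1, 8], [1, 8]])
# MarkupLM needs one xpath per (possibly empty) node.
markup_enc = markup_tok(["", ""], xpaths=["/html/body", "/html/body"])

print(layout_enc["input_ids"], markup_enc["input_ids"])
```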
@@ -3520,6 +3520,27 @@ class TokenizerTesterMixin:
                 if tokenizer_r.num_special_tokens_to_add(pair=True):
                     self.assertIn(None, pair_batch_sequence_ids)

+    def test_call_empty_strings(self):
+        if not self.test_slow_tokenizer:
+            # as we don't have a slow version, we can't compare the outputs between slow and fast versions
+            self.skipTest(reason="test_slow_tokenizer is set to False")
+
+        for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
+            with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
+                tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
+
+                # not actually adding this, but need to investigate further if this is a bug
+                if type(tokenizer_r._tokenizer.pre_tokenizer).__name__ == "ByteLevel" and tokenizer_r.add_prefix_space:
+                    self.skipTest(reason="ByteLevel pre tokenizer not supported for empty strings")
+
+                tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)
+
+                # Ensure basic input match
+                input_p = tokenizer_p("", "")
+                input_r = tokenizer_r("", "")
+
+                self.assertEqual(input_p["input_ids"], input_r["input_ids"])
+
     def test_tokenization_python_rust_equals(self):
         if not self.test_slow_tokenizer:
             # as we don't have a slow version, we can't compare the outputs between slow and fast versions
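For tokenizers that take plain text (no boxes or xpaths), the mixin test above boils down to the comparison sketched below. The checkpoint name is an assumption chosen only because it ships both a slow and a fast tokenizer; any such checkpoint would do:

```python
from transformers import AutoTokenizer

# Assumed checkpoint with both tokenizer implementations.
tok_slow = AutoTokenizer.from_pretrained("bert-base-uncased", use_fast=False)
tok_fast = AutoTokenizer.from_pretrained("bert-base-uncased", use_fast=True)

slow_ids = tok_slow("", "")["input_ids"]
fast_ids = tok_fast("", "")["input_ids"]

# With the `is not None` fix the fast tokenizer keeps the empty text_pair, so both
# sides produce the pair template ([CLS] [SEP] [SEP] for this checkpoint); before
# the fix the fast side dropped the pair and returned only [CLS] [SEP].
assert slow_ids == fast_ids
```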