Compare commits

...

1 Commit

Author SHA1 Message Date
6779acac8f Revert error back into warning for byte fallback conversion. 2023-04-06 11:43:06 +02:00
2 changed files with 6 additions and 3 deletions

View File

@@ -19,6 +19,7 @@ All the conversions are grouped here to gather SentencePiece dependencies outsid
allow to make our dependency on SentencePiece optional.
"""
+import warnings
from typing import Dict, List, Tuple
from tokenizers import AddedToken, Regex, Tokenizer, decoders, normalizers, pre_tokenizers, processors
@@ -450,7 +451,7 @@ class SpmConverter(Converter):
if self.proto.trainer_spec.byte_fallback:
if not getattr(self, "handle_byte_fallback", None):
-raise RuntimeError(
+warnings.warn(
"The sentencepiece tokenizer that you are converting to a fast tokenizer uses the byte fallback option"
" which is not implemented in the fast tokenizers. In practice this means that the fast version of the"
" tokenizer can produce unknown tokens whereas the sentencepiece version would have converted these "

View File

@@ -24,10 +24,12 @@ class ConvertSlowTokenizerTest(unittest.TestCase):
original_tokenizer_with_bytefallback = FakeOriginalTokenizer(vocab_file=spm_model_file_with_bytefallback)
-with self.assertRaises(RuntimeError) as cm:
+with warnings.catch_warnings(record=True) as w:
_ = SpmConverter(original_tokenizer_with_bytefallback)
+self.assertEqual(len(w), 1)
self.assertIn(
"The sentencepiece tokenizer that you are converting to a fast tokenizer uses the byte fallback option"
" which is not implemented in the fast tokenizers.",
-str(cm.exception),
+str(w[0].message),
)