Mirror of https://github.com/huggingface/transformers.git

Compare commits: 740f952218...30824-spmc (24 commits)
Commits (SHA1):
12940f6ec4, d28bdde42d, 79507ad4b6, 5f9b5b1666, 874c3f6980, 7e42130a18,
16eeb0cf65, 2751373122, 7f3b798a4c, a8694bca84, b6de569868, 146e8f9e8e,
74e78f1720, d92822e045, c416522a76, 84143a2cc3, ff5974bb61, fdb63e21db,
79ce5bb67f, 896b7d152e, 7afb15921d, 31fbe4f12c, d1ea757c21, 24ea0cd756
@@ -626,8 +626,12 @@ class SpmConverter(Converter):
        tokenizer.normalizer = normalizer

        replacement = "▁"
        add_prefix_space = True
        if hasattr(self.original_tokenizer, "add_prefix_space"):
            add_prefix_space = self.proto.normalizer_spec.add_dummy_prefix

        if (
            hasattr(self.original_tokenizer, "add_prefix_space")
            and self.original_tokenizer.add_prefix_space is not None
        ):
            add_prefix_space = self.original_tokenizer.add_prefix_space

        pre_tokenizer = self.pre_tokenizer(replacement, add_prefix_space)
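For context, a minimal sketch (not part of the diff) of how an `add_prefix_space` flag typically maps onto the `tokenizers` Metaspace pre-tokenizer that `pre_tokenizer()` builds; it assumes a recent `tokenizers` release in which `Metaspace` accepts `prepend_scheme`:

from tokenizers import pre_tokenizers

def build_metaspace(add_prefix_space: bool, legacy: bool = True):
    # "always" prepends "▁" to the start of every section it pre-tokenizes,
    # "first" only to the first section, "never" not at all.
    if add_prefix_space:
        prepend_scheme = "always" if legacy else "first"
    else:
        prepend_scheme = "never"
    return pre_tokenizers.Metaspace(replacement="▁", prepend_scheme=prepend_scheme)

print(build_metaspace(True).pre_tokenize_str("Hello world"))
# e.g. [('▁Hello', (0, 5)), ('▁world', (5, 11))]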
@@ -167,8 +167,8 @@ class LlamaTokenizer(PreTrainedTokenizer):
        self.add_bos_token = add_bos_token
        self.add_eos_token = add_eos_token
        self.use_default_system_prompt = use_default_system_prompt
        self.sp_model = self.get_spm_processor(kwargs.pop("from_slow", False))
        self.add_prefix_space = add_prefix_space
        self.sp_model = self.get_spm_processor(kwargs.pop("from_slow", False))

        super().__init__(
            bos_token=bos_token,
@@ -202,6 +202,9 @@ class LlamaTokenizer(PreTrainedTokenizer):
        model_pb2 = import_protobuf(f"The new behaviour of {self.__class__.__name__} (with `self.legacy = False`)")
        model = model_pb2.ModelProto.FromString(sp_model)
        normalizer_spec = model_pb2.NormalizerSpec()
        self.add_prefix_space = (
            normalizer_spec.add_dummy_prefix if self.add_prefix_space is None else self.add_prefix_space
        )
        normalizer_spec.add_dummy_prefix = False
        model.normalizer_spec.MergeFrom(normalizer_spec)
        sp_model = model.SerializeToString()
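The `get_spm_processor` hunk above round-trips the serialized sentencepiece model through protobuf so that `add_dummy_prefix` can be read and then disabled. A rough, self-contained sketch of that round-trip (the `tokenizer.model` path is illustrative, and where the diff merges a fresh `NormalizerSpec`, the sketch simply sets the field; `import_protobuf` lives in `transformers.convert_slow_tokenizer`):

import sentencepiece as spm
from transformers.convert_slow_tokenizer import import_protobuf

with open("tokenizer.model", "rb") as f:  # illustrative path to a sentencepiece model
    sp_model = f.read()

model_pb2 = import_protobuf()
model = model_pb2.ModelProto.FromString(sp_model)
print(model.normalizer_spec.add_dummy_prefix)  # what add_prefix_space falls back to

model.normalizer_spec.add_dummy_prefix = False  # drop the automatic leading "▁"
sp = spm.SentencePieceProcessor()
sp.LoadFromSerializedProto(model.SerializeToString())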
@@ -13,10 +13,12 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import json

from shutil import copyfile
from typing import Optional, Tuple

from tokenizers import processors
from tokenizers import pre_tokenizers, normalizers, processors

from ...tokenization_utils_fast import PreTrainedTokenizerFast
from ...utils import is_sentencepiece_available, logging
@@ -150,9 +152,6 @@ class LlamaTokenizerFast(PreTrainedTokenizerFast):
            legacy = True
        self.legacy = legacy

        if add_prefix_space is not None:
            kwargs["from_slow"] = True

        super().__init__(
            vocab_file=vocab_file,
            tokenizer_file=tokenizer_file,
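With the `from_slow` forcing removed, passing `add_prefix_space` to the fast Llama tokenizer is expected to be handled on the Rust backend directly rather than by rebuilding from the sentencepiece model. Hypothetical usage (the checkpoint name is only illustrative):

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("hf-internal-testing/llama-tokenizer", add_prefix_space=False)
print(tok.tokenize("Hello"))  # expected to start without a leading "▁"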
@@ -134,7 +134,7 @@ class T5Tokenizer(PreTrainedTokenizer):
        additional_special_tokens=None,
        sp_model_kwargs: Optional[Dict[str, Any]] = None,
        legacy=None,
        add_prefix_space=True,
        add_prefix_space=None,
        **kwargs,
    ) -> None:
        pad_token = AddedToken(pad_token, special=True) if isinstance(pad_token, str) else pad_token
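The default flips from `True` to `None` so that "not passed" can be told apart from an explicit choice: when it is `None`, the value is later resolved from the sentencepiece model's `add_dummy_prefix`. A toy illustration of that resolution order (not the real class):

def resolve_add_prefix_space(user_value, proto_add_dummy_prefix):
    # None means "unset": fall back to what the .model file says.
    return proto_add_dummy_prefix if user_value is None else user_value

print(resolve_add_prefix_space(None, False))   # False, taken from the sentencepiece proto
print(resolve_add_prefix_space(True, False))   # True, the explicit user choice wins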
@@ -181,10 +181,10 @@ class T5Tokenizer(PreTrainedTokenizer):
            legacy = True

        self.legacy = legacy
        self.add_prefix_space = add_prefix_space
        self.sp_model = self.get_spm_processor(kwargs.pop("from_slow", False))
        self.vocab_file = vocab_file
        self._extra_ids = extra_ids
        self.add_prefix_space = add_prefix_space

        super().__init__(
            eos_token=eos_token,
@@ -210,6 +210,9 @@ class T5Tokenizer(PreTrainedTokenizer):
        model_pb2 = import_protobuf(f"The new behaviour of {self.__class__.__name__} (with `self.legacy = False`)")
        model = model_pb2.ModelProto.FromString(sp_model)
        normalizer_spec = model_pb2.NormalizerSpec()
        self.add_prefix_space = (
            normalizer_spec.add_dummy_prefix if self.add_prefix_space is None else self.add_prefix_space
        )
        normalizer_spec.add_dummy_prefix = False
        model.normalizer_spec.MergeFrom(normalizer_spec)
        sp_model = model.SerializeToString()
@@ -110,11 +110,7 @@ class T5TokenizerFast(PreTrainedTokenizerFast):
            extra_tokens = [f"<extra_id_{i}>" for i in range(extra_ids)]
            additional_special_tokens = extra_tokens

        if add_prefix_space is not None:
            logger.warning_once(
                "You set `add_prefix_space`. The tokenizer needs to be converted from the slow tokenizers"
            )
            kwargs["from_slow"] = True
        self.add_prefix_space = add_prefix_space

        super().__init__(
            vocab_file,
@@ -123,6 +119,7 @@ class T5TokenizerFast(PreTrainedTokenizerFast):
            unk_token=unk_token,
            pad_token=pad_token,
            extra_ids=extra_ids,
            add_prefix_space=add_prefix_space,
            additional_special_tokens=additional_special_tokens,
            **kwargs,
        )
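After these two hunks, `add_prefix_space` is forwarded to the base class instead of triggering a warning and a slow-tokenizer conversion. Hypothetical usage with a public T5 checkpoint:

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("google-t5/t5-small", add_prefix_space=False)
print(tok.tokenize("Hello world"))  # first token expected without the "▁" prefix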
@@ -20,9 +20,11 @@ see tokenization_utils.py
import copy
import json
import os
import warnings
from collections import defaultdict
from typing import Any, Dict, List, Optional, Tuple, Union

from tokenizers import pre_tokenizers, normalizers, processors
import tokenizers.pre_tokenizers as pre_tokenizers_fast
from tokenizers import Encoding as EncodingFast
from tokenizers import Tokenizer as TokenizerFast
@@ -101,12 +103,10 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase):
        fast_tokenizer_file = kwargs.pop("tokenizer_file", None)
        from_slow = kwargs.pop("from_slow", False)
        added_tokens_decoder = kwargs.pop("added_tokens_decoder", {})
        self.add_prefix_space = kwargs.pop("add_prefix_space", None)

        if from_slow and slow_tokenizer is None and self.slow_tokenizer_class is None:
            raise ValueError(
                "Cannot instantiate this tokenizer from a slow version. If it's based on sentencepiece, make sure you "
                "have sentencepiece installed."
            )
        if self.force_from_slow() is True:
            kwargs["from_slow"] = True

        if tokenizer_object is not None:
            fast_tokenizer = copy.deepcopy(tokenizer_object)
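`force_from_slow()` (added further down) decides whether a rebuild is still needed by peeking at the serialized normalizer. A small sketch of that introspection, assuming a Sequence normalizer like the ones the sentencepiece converters produce; `__getstate__()` returns the normalizer's JSON serialization as bytes:

import json
from tokenizers import normalizers

norm = normalizers.Sequence([normalizers.Prepend(prepend="▁"), normalizers.Replace(" ", "▁")])
state = json.loads(norm.__getstate__().decode("utf-8"))
has_prepend = any(n["type"] == "Prepend" for n in state["normalizers"])
print(has_prepend)  # True -> the backend already encodes add_prefix_space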
@@ -116,10 +116,15 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase):
        elif slow_tokenizer is not None:
            # We need to convert a slow tokenizer to build the backend
            fast_tokenizer = convert_slow_tokenizer(slow_tokenizer)
        elif fast_tokenizer_file is not None:
            # When sentencepiece is not installed, we can't convert a slow tokenizer to a fast one
            fast_tokenizer = TokenizerFast.from_file(fast_tokenizer_file)
        elif gguf_file is not None:
            # We need to convert a slow tokenizer to build the backend
            tokenizer_dict = load_gguf_checkpoint(kwargs.get("vocab_file"))["tokenizer"]
            fast_tokenizer = convert_gguf_tokenizer(tokenizer_dict)
            gguf_param = load_gguf_checkpoint(kwargs.get("vocab_file"))
            architecture = gguf_param["config"]["model_type"]
            tokenizer_dict = gguf_param["tokenizer"]
            fast_tokenizer = convert_gguf_tokenizer(architecture, tokenizer_dict)
        elif self.slow_tokenizer_class is not None:
            # We need to create and convert a slow tokenizer to build the backend
            slow_tokenizer = self.slow_tokenizer_class(*args, **kwargs)
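The gguf branch now also extracts the architecture so the converter can pick the right tokenizer layout. A sketch of that path (the file name is illustrative, and the helper module paths shown are assumptions that may differ across transformers versions):

from transformers.modeling_gguf_pytorch_utils import load_gguf_checkpoint
from transformers.integrations.ggml import convert_gguf_tokenizer

gguf_param = load_gguf_checkpoint("llama-2-7b.Q4_K_M.gguf")  # illustrative file name
architecture = gguf_param["config"]["model_type"]            # e.g. "llama"
fast_tokenizer = convert_gguf_tokenizer(architecture, gguf_param["tokenizer"])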
@@ -135,6 +140,8 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase):

        self._tokenizer = fast_tokenizer

        self._update_pre_tokenizer()

        if slow_tokenizer is not None:
            kwargs.update(slow_tokenizer.init_kwargs)
@@ -861,3 +868,107 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase):
            kwargs["additional_special_tokens"] = additional_special_tokens

        return self.__class__(tokenizer_object=tokenizer, **kwargs)

    def force_from_slow(self):
        if getattr(self, "add_prefix_space", None) is None:
            if getattr(self, "_tokenizer", None) is None:
                return True
            curr_normalizer = json.loads(self._tokenizer.normalizer.__getstate__().decode("utf-8"))
            prepend_normalizer = [n for n in curr_normalizer["normalizers"] if n["type"] == "Prepend"]
            if not prepend_normalizer:
                return True

    def _update_normalizer(self):
        """Updates the underlying normalizer with the current `add_prefix_space` and `legacy` settings."""
        sequence = json.loads(normalizers.Sequence([]).__getstate__())
        final_sequence = normalizers.Sequence([])
        if self._tokenizer.normalizer is not None and type(self._tokenizer.normalizer) not in (
            normalizers.Sequence,
            normalizers.Prepend,
            normalizers.Precompiled,
        ):
            return
        if type(self._tokenizer.normalizer) is normalizers.Sequence:
            curr_state = json.loads(self._tokenizer.normalizer.__getstate__().decode("utf-8"))
            sequence["normalizers"] = [
                pt for pt in curr_state["normalizers"] if pt["type"] not in ["Prepend"]
            ]
        elif self._tokenizer.normalizer is not None:
            sequence["normalizers"].append(json.loads(self._tokenizer.normalizer.__getstate__().decode("utf-8")))
        if getattr(self, "legacy", True):
            if getattr(self, "add_prefix_space", True):
                new_prepend = json.loads(normalizers.Prepend(prepend="▁").__getstate__().decode("utf-8"))
                sequence["normalizers"].append(new_prepend)
            if not any(n["type"] == "Replace" for n in sequence["normalizers"]):
                new_replace = json.loads(normalizers.Replace(pattern=" ", content="▁").__getstate__().decode("utf-8"))
                sequence["normalizers"].append(new_replace)

            final_sequence.__setstate__(json.dumps(sequence).encode("utf-8"))
            self._tokenizer.normalizer = final_sequence

        elif not getattr(self, "legacy", True):
            self._tokenizer.normalizer = final_sequence

    def _update_pre_tokenizer(self):
        """Updates the underlying pre-tokenizer with the current `add_prefix_space` setting."""

        if getattr(self, "add_prefix_space", None) is None:
            tokenizer_normalizer = getattr(self._tokenizer, "normalizer", None)
            if tokenizer_normalizer is None:
                return  # No add_prefix_space to set

            curr_normalizer = json.loads(tokenizer_normalizer.__getstate__().decode("utf-8"))
            if "normalizers" not in curr_normalizer:
                return  # No add_prefix_space to set

            if any(n["type"] == "Prepend" for n in curr_normalizer["normalizers"]):
                self.add_prefix_space = True  # Update add_prefix_space based on the current normalizer
            else:
                return  # No add_prefix_space to set

        if getattr(self, "add_prefix_space", True):
            prepend_scheme = "always"
            if not getattr(self, "legacy", True):
                prepend_scheme = "first"

        elif not getattr(self, "add_prefix_space"):
            prepend_scheme = "never"

        if isinstance(self._tokenizer.pre_tokenizer, pre_tokenizers.Sequence):
            curr_state = json.loads(self._tokenizer.pre_tokenizer.__getstate__().decode("utf-8"))
            update_normalizer = True

            for i, pt in enumerate(curr_state["pretokenizers"]):
                if pt["type"] == "Metaspace":
                    # Create a new Metaspace pre-tokenizer
                    new_metaspace = pre_tokenizers.Metaspace(
                        replacement="▁", prepend_scheme=prepend_scheme, split=pt["split"]
                    )
                    curr_state["pretokenizers"][i] = json.loads(new_metaspace.__getstate__().decode("utf-8"))
                elif pt["type"] == "ByteLevel":
                    # Create a new ByteLevel pre-tokenizer
                    new_bytelevel = pre_tokenizers.ByteLevel(add_prefix_space=self.add_prefix_space)
                    curr_state["pretokenizers"][i] = json.loads(new_bytelevel.__getstate__().decode("utf-8"))
                    update_normalizer = False

            new_pretokenizer = pre_tokenizers.Sequence([])
            new_pretokenizer.__setstate__(json.dumps(curr_state).encode("utf-8"))
            self._tokenizer.pre_tokenizer = new_pretokenizer
            if update_normalizer:
                self._update_normalizer()

        elif isinstance(self._tokenizer.pre_tokenizer, pre_tokenizers.Metaspace):
            self._tokenizer.pre_tokenizer.prepend_scheme = prepend_scheme
            self._update_normalizer()

        elif self._tokenizer.pre_tokenizer is None:
            self._tokenizer.pre_tokenizer = pre_tokenizers.Metaspace(
                replacement="▁", prepend_scheme=prepend_scheme, split=False
            )
            self._update_normalizer()

        elif isinstance(self._tokenizer.pre_tokenizer, pre_tokenizers.ByteLevel):
            self._tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=self.add_prefix_space)

        else:
            warnings.warn(f"{type(self._tokenizer.pre_tokenizer)} does not support `add_prefix_space`. ")
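Putting the two helpers together: in the legacy, `add_prefix_space=True` case `_update_normalizer()` ends up with a Prepend + Replace sequence, which is what produces the leading dummy prefix during normalization. A minimal sketch of that end state:

from tokenizers import normalizers

legacy_norm = normalizers.Sequence(
    [normalizers.Prepend(prepend="▁"), normalizers.Replace(pattern=" ", content="▁")]
)
print(legacy_norm.normalize_str("Hello world"))  # ▁Hello▁world

no_prefix = normalizers.Replace(pattern=" ", content="▁")
print(no_prefix.normalize_str("Hello world"))    # Hello▁world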