Merge branch 'main' into cleanup_tok_save

draft removal of special and added
2025-10-20 17:13:56 +08:00 · 2025-10-16 11:36:34 +02:00 · 2025-09-11 16:27:32 +02:00
4 changed files with 1 additions and 92 deletions
--- a/src/transformers/models/biogpt/convert_biogpt_original_pytorch_checkpoint_to_pytorch.py
+++ b/src/transformers/models/biogpt/convert_biogpt_original_pytorch_checkpoint_to_pytorch.py
@ -232,7 +232,6 @@ def convert_biogpt_checkpoint_to_pytorch(biogpt_checkpoint_path, pytorch_dump_fo
        "eos_token": "</s>",
        "model_max_length": 1024,
        "pad_token": "<pad>",
-        "special_tokens_map_file": None,
        "tokenizer_class": "BioGptTokenizer",
        "unk_token": "<unk>",
    }
--- a/src/transformers/tokenization_utils.py
+++ b/src/transformers/tokenization_utils.py
@ -42,10 +42,6 @@ from .utils import PaddingStrategy, TensorType, add_end_docstrings, logging


 logger = logging.get_logger(__name__)
-
-# Slow tokenizers are saved in a vocabulary plus three separated files
-SPECIAL_TOKENS_MAP_FILE = "special_tokens_map.json"
-ADDED_TOKENS_FILE = "added_tokens.json"
 TOKENIZER_CONFIG_FILE = "tokenizer_config.json"


--- a/src/transformers/tokenization_utils_base.py
+++ b/src/transformers/tokenization_utils_base.py
@ -140,11 +140,7 @@ EncodedInputPair = tuple[list[int], list[int]]
 # Define type aliases for text-related non-text modalities
 AudioInput = Union[np.ndarray, "torch.Tensor", list[np.ndarray], list["torch.Tensor"]]

-# Slow tokenizers used to be saved in three separated files
-SPECIAL_TOKENS_MAP_FILE = "special_tokens_map.json"
-ADDED_TOKENS_FILE = "added_tokens.json"
 TOKENIZER_CONFIG_FILE = "tokenizer_config.json"
-
 # Fast tokenizers (provided by HuggingFace tokenizer's library) can be saved in a single file
 FULL_TOKENIZER_FILE = "tokenizer.json"
 _re_tokenizer_file = re.compile(r"tokenizer\.(.*)\.json")
@ -1997,10 +1993,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
            else:
                # At this point pretrained_model_name_or_path is either a directory or a model identifier name
                additional_files_names = {
-                    "added_tokens_file": ADDED_TOKENS_FILE,  # kept only for legacy
-                    "special_tokens_map_file": SPECIAL_TOKENS_MAP_FILE,  # kept only for legacy
                    "tokenizer_config_file": TOKENIZER_CONFIG_FILE,
-                    # tokenizer_file used to initialize a slow from a fast. Properly copy the `addedTokens` instead of adding in random orders
                    "tokenizer_file": FULL_TOKENIZER_FILE,
                    "chat_template_file": CHAT_TEMPLATE_FILE,
                }
@ -2262,14 +2255,10 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):

        # Update with newly provided kwargs
        init_kwargs.update(kwargs)
-
        # Merge resolved_vocab_files arguments in init_kwargs.
-        added_tokens_file = resolved_vocab_files.pop("added_tokens_file", None)
-        special_tokens_map_file = resolved_vocab_files.pop("special_tokens_map_file", None)
        for args_name, file_path in resolved_vocab_files.items():
            if args_name not in init_kwargs:
                init_kwargs[args_name] = file_path
-        tokenizer_file = resolved_vocab_files.pop("tokenizer_file", None)

        if slow_tokenizer is not None:
            init_kwargs["__slow_tokenizer"] = slow_tokenizer
@ -2290,64 +2279,6 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
                    raise TypeError(
                        f"Found a {token.__class__} in the saved `added_tokens_decoder`, should be a dictionary or an AddedToken instance"
                    )
-        else:
-            # begin legacy: read the added_tokens_file and update kwargs with special_tokens_map if modified
-            if special_tokens_map_file is not None:
-                with open(special_tokens_map_file, encoding="utf-8") as special_tokens_map_handle:
-                    special_tokens_map = json.load(special_tokens_map_handle)
-                    for key, value in special_tokens_map.items():
-                        if key in kwargs and kwargs[key]:
-                            # This value has already been redefined by the kwargs
-                            # We keep this new value and ignore the one stored in the special_tokens_map_file
-                            continue
-                        if isinstance(value, dict):
-                            value["special"] = True
-                            value = AddedToken(**value)
-                        elif key == "additional_special_tokens" and isinstance(value, list):
-                            additional_special_tokens = init_kwargs.pop("additional_special_tokens", []) or []
-                            for token in value:
-                                if isinstance(token, dict):
-                                    token["special"] = True
-                                    token = AddedToken(**token)
-                                if token not in additional_special_tokens:
-                                    additional_special_tokens.append(token)
-                            value = additional_special_tokens
-                        init_kwargs[key] = value
-
-            # slow -> slow|fast, legacy: convert the `"added_tokens.json"` file to `added_tokens_decoder`.
-            # this is for legacy purpose. We don't add the tokens after init for efficiency.
-            if added_tokens_file is not None:
-                special_tokens = []
-                for key in cls.SPECIAL_TOKENS_ATTRIBUTES & init_kwargs.keys():
-                    if init_kwargs[key] is not None:
-                        if key == "additional_special_tokens":
-                            special_tokens += [str(token) for token in init_kwargs[key]]
-                        else:
-                            special_tokens.append(str(init_kwargs[key]))
-
-                with open(added_tokens_file, encoding="utf-8") as added_tokens_handle:
-                    added_tok_encoder = json.load(added_tokens_handle)
-                for str_token, index in added_tok_encoder.items():
-                    # if index not in added_tokens_decoder and str_token not in added_tokens_map:
-                    special = str_token in special_tokens
-                    added_tokens_decoder[index] = AddedToken(
-                        str_token, rstrip=False, lstrip=False, normalized=not special, special=special
-                    )
-                    added_tokens_map[str(token)] = added_tokens_decoder[index]
-
-            # allows converting a fast -> slow: add the `tokenizer.json`'s `"added_tokens"` to the slow tokenizer
-            # if `tokenizer_config.json` is `None`
-            if tokenizer_file is not None:
-                # This is for slow so can be done before
-                with open(tokenizer_file, encoding="utf-8") as tokenizer_file_handle:
-                    tokenizer_file_handle = json.load(tokenizer_file_handle)
-                    added_tokens = tokenizer_file_handle.pop("added_tokens")
-                for serialized_tokens in added_tokens:
-                    idx = serialized_tokens.pop("id")
-                    added_tokens_decoder[idx] = AddedToken(**serialized_tokens)
-                    added_tokens_map[str(added_tokens_decoder[idx])] = added_tokens_decoder[idx]
-            # end legacy
-
        # Passing AddedTokens and not strings to the class to prevent it from casting the string to a different AddedToken
        # convert {'__type': 'AddedToken', 'content': '<ent>', 'lstrip': False, 'normalized': True, ...} to AddedTokens
        init_kwargs["added_tokens_decoder"] = added_tokens_decoder
@ -2537,9 +2468,6 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
            repo_id = self._create_repo(repo_id, **kwargs)
            files_timestamps = self._get_files_timestamps(save_directory)

-        special_tokens_map_file = os.path.join(
-            save_directory, (filename_prefix + "-" if filename_prefix else "") + SPECIAL_TOKENS_MAP_FILE
-        )
        tokenizer_config_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + TOKENIZER_CONFIG_FILE
        )
@ -2598,7 +2526,6 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
        # remove private information
        if "name_or_path" in tokenizer_config:
            tokenizer_config.pop("name_or_path")
-            tokenizer_config.pop("special_tokens_map_file", None)
            tokenizer_config.pop("tokenizer_file", None)
        if "device_map" in tokenizer_config:
            tokenizer_config.pop("device_map")
@ -2607,17 +2534,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
            out_str = json.dumps(tokenizer_config, indent=2, sort_keys=True, ensure_ascii=False) + "\n"
            f.write(out_str)
        logger.info(f"tokenizer config file saved in {tokenizer_config_file}")
-
-        # Sanitize AddedTokens in special_tokens_map
-
-        # kept for forward compatibility, will be removed in transoformers 5. Typefields are not saved for FC, special should not be save either
-        write_dict = self.convert_added_tokens(self.special_tokens_map_extended, save=True, add_type_field=False)
-        with open(special_tokens_map_file, "w", encoding="utf-8") as f:
-            out_str = json.dumps(write_dict, indent=2, sort_keys=True, ensure_ascii=False) + "\n"
-            f.write(out_str)
-        logger.info(f"Special tokens file saved in {special_tokens_map_file}")
-
-        file_names = (tokenizer_config_file, special_tokens_map_file, *saved_raw_chat_template_files)
+        file_names = (tokenizer_config_file, *saved_raw_chat_template_files)

        save_files = self._save_pretrained(
            save_directory=save_directory,
--- a/src/transformers/tokenization_utils_fast.py
+++ b/src/transformers/tokenization_utils_fast.py
@ -52,12 +52,9 @@ logger = logging.get_logger(__name__)

 # Fast tokenizers (provided by HuggingFace tokenizer's library) can be saved in a single file
 TOKENIZER_FILE = "tokenizer.json"
-SPECIAL_TOKENS_MAP_FILE = "special_tokens_map.json"
 TOKENIZER_CONFIG_FILE = "tokenizer_config.json"
 TIKTOKEN_VOCAB_FILE = "tokenizer.model"

-# Slow tokenizers have an additional added tokens files
-ADDED_TOKENS_FILE = "added_tokens.json"

 INIT_TOKENIZER_DOCSTRING += """
        tokenizer_object ([`tokenizers.Tokenizer`]):
Author	SHA1	Message	Date
Arthur	52289f4dff	Merge branch 'main' into cleanup_tok_save	2025-10-16 11:36:34 +02:00
Arthur	a06992cbc5	draft removal of special and added	2025-09-11 16:27:32 +02:00