Compare commits

...

6 Commits

2 changed files with 102 additions and 0 deletions

@@ -701,6 +701,101 @@ def get_tokenizer_config(
    return result


def has_tokenizer_file(
    pretrained_model_name_or_path: Union[str, os.PathLike],
    cache_dir: Optional[Union[str, os.PathLike]] = None,
    force_download: bool = False,
    resume_download: Optional[bool] = None,
    proxies: Optional[Dict[str, str]] = None,
    token: Optional[Union[bool, str]] = None,
    revision: Optional[str] = None,
    local_files_only: bool = False,
    subfolder: str = "",
    **kwargs,
):
"""
Checks if the tokenizer.json file exists.
Args:
pretrained_model_name_or_path (`str` or `os.PathLike`):
This can be either:
- a string, the *model id* of a pretrained model configuration hosted inside a model repo on
huggingface.co.
- a path to a *directory* containing a configuration file saved using the
[`~PreTrainedTokenizer.save_pretrained`] method, e.g., `./my_model_directory/`.
cache_dir (`str` or `os.PathLike`, *optional*):
Path to a directory in which a downloaded pretrained model configuration should be cached if the standard
cache should not be used.
force_download (`bool`, *optional*, defaults to `False`):
Whether or not to force to (re-)download the configuration files and override the cached versions if they
exist.
resume_download:
Deprecated and ignored. All downloads are now resumed by default when possible.
Will be removed in v5 of Transformers.
proxies (`Dict[str, str]`, *optional*):
A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128',
'http://hostname': 'foo.bar:4012'}.` The proxies are used on each request.
token (`str` or *bool*, *optional*):
The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated
when running `huggingface-cli login` (stored in `~/.huggingface`).
revision (`str`, *optional*, defaults to `"main"`):
The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any
identifier allowed by git.
local_files_only (`bool`, *optional*, defaults to `False`):
If `True`, will only try to load the tokenizer configuration from local files.
subfolder (`str`, *optional*, defaults to `""`):
In case the tokenizer config is located inside a subfolder of the model repo on huggingface.co, you can
specify the folder name here.
<Tip>
Passing `token=True` is required when you want to use a private model.
</Tip>
Returns:
`Bool`: True if the tokenizer.json file exists, False otherwise.
Examples:
```python
# Download configuration from huggingface.co and cache.
tokenizer_config = get_tokenizer_config("google-bert/bert-base-uncased")
# This model does not have a tokenizer config so the result will be an empty dict.
tokenizer_config = get_tokenizer_config("FacebookAI/xlm-roberta-base")
# Save a pretrained tokenizer locally and you can reload its config
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-cased")
tokenizer.save_pretrained("tokenizer-test")
tokenizer_config = get_tokenizer_config("tokenizer-test")
```"""
    use_auth_token = kwargs.pop("use_auth_token", None)
    if use_auth_token is not None:
        warnings.warn(
            "The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers. Please use `token` instead.",
            FutureWarning,
        )
        if token is not None:
            raise ValueError("`token` and `use_auth_token` are both specified. Please set only the argument `token`.")
        token = use_auth_token

    commit_hash = kwargs.get("_commit_hash", None)
    resolved_config_file = cached_file(
        pretrained_model_name_or_path,
        "tokenizer.json",
        cache_dir=cache_dir,
        force_download=force_download,
        resume_download=resume_download,
        proxies=proxies,
        token=token,
        revision=revision,
        local_files_only=local_files_only,
        subfolder=subfolder,
        _raise_exceptions_for_gated_repo=False,
        _raise_exceptions_for_missing_entries=False,
        _raise_exceptions_for_connection_errors=False,
        _commit_hash=commit_hash,
    )
    if resolved_config_file is None:
        logger.info("Could not locate tokenizer.json, so no fast tokenizer file is available for this model.")
        return False
    return True


class AutoTokenizer:
    r"""
    This is a generic tokenizer class that will be instantiated as one of the tokenizer classes of the library when
@@ -898,6 +993,8 @@ class AutoTokenizer:
        if use_fast and not config_tokenizer_class.endswith("Fast"):
            tokenizer_class_candidate = f"{config_tokenizer_class}Fast"
            tokenizer_class = tokenizer_class_from_name(tokenizer_class_candidate)
        # Fall back to the generic fast tokenizer if the repo provides a tokenizer.json,
        # even though no dedicated fast tokenizer class is registered for this model.
        if tokenizer_class is None and has_tokenizer_file(pretrained_model_name_or_path, **kwargs):
            tokenizer_class = PreTrainedTokenizerFast
        if tokenizer_class is None:
            tokenizer_class_candidate = config_tokenizer_class
            tokenizer_class = tokenizer_class_from_name(tokenizer_class_candidate)
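
Taken together, the two hunks above mean that when a model's config names a tokenizer class with no `Fast` variant, `AutoTokenizer` now probes the repo for a `tokenizer.json` and, if one exists, falls back to the generic `PreTrainedTokenizerFast` instead of falling through to the slow class. A minimal sketch of the resulting behavior, using the SigLIP checkpoint exercised by the new test below (not part of the diff itself):

```python
from transformers import AutoTokenizer, PreTrainedTokenizerFast

# SigLIP's config specifies no `*Fast` tokenizer class, but the repo ships a
# tokenizer.json, so the new fallback instantiates PreTrainedTokenizerFast.
tokenizer = AutoTokenizer.from_pretrained("google/siglip-base-patch16-224", use_fast=True)
assert type(tokenizer) is PreTrainedTokenizerFast
```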

@@ -211,6 +211,11 @@ class AutoTokenizerTest(unittest.TestCase):
        self.assertIsInstance(tokenizer2, tokenizer.__class__)
        self.assertEqual(tokenizer2.vocab_size, 12)

    def test_PreTrainedTokenizerFast_inferred(self):
        # The model config specifies neither a fast tokenizer class nor PreTrainedTokenizerFast,
        # but the repo ships a tokenizer.json, so a fast tokenizer can still be loaded.
        tokenizer = AutoTokenizer.from_pretrained("google/siglip-base-patch16-224", use_fast=True)
        self.assertEqual(type(tokenizer), PreTrainedTokenizerFast)

    def test_auto_tokenizer_fast_no_slow(self):
        tokenizer = AutoTokenizer.from_pretrained("Salesforce/ctrl")
        # There is no fast CTRL tokenizer, so this always gives us a slow tokenizer.