Compare commits

...

6 Commits

2 changed files with 102 additions and 0 deletions

@@ -701,6 +701,101 @@ def get_tokenizer_config(
    return result


def has_tokenizer_file(
    pretrained_model_name_or_path: Union[str, os.PathLike],
    cache_dir: Optional[Union[str, os.PathLike]] = None,
    force_download: bool = False,
    resume_download: Optional[bool] = None,
    proxies: Optional[Dict[str, str]] = None,
    token: Optional[Union[bool, str]] = None,
    revision: Optional[str] = None,
    local_files_only: bool = False,
    subfolder: str = "",
    **kwargs,
):
"""
Checks if the tokenizer.json file exists.
Args:
pretrained_model_name_or_path (`str` or `os.PathLike`):
This can be either:
- a string, the *model id* of a pretrained model configuration hosted inside a model repo on
huggingface.co.
- a path to a *directory* containing a configuration file saved using the
[`~PreTrainedTokenizer.save_pretrained`] method, e.g., `./my_model_directory/`.
cache_dir (`str` or `os.PathLike`, *optional*):
Path to a directory in which a downloaded pretrained model configuration should be cached if the standard
cache should not be used.
force_download (`bool`, *optional*, defaults to `False`):
Whether or not to force to (re-)download the configuration files and override the cached versions if they
exist.
resume_download:
Deprecated and ignored. All downloads are now resumed by default when possible.
Will be removed in v5 of Transformers.
proxies (`Dict[str, str]`, *optional*):
A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128',
'http://hostname': 'foo.bar:4012'}.` The proxies are used on each request.
token (`str` or *bool*, *optional*):
The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated
when running `huggingface-cli login` (stored in `~/.huggingface`).
revision (`str`, *optional*, defaults to `"main"`):
The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any
identifier allowed by git.
local_files_only (`bool`, *optional*, defaults to `False`):
If `True`, will only try to load the tokenizer configuration from local files.
subfolder (`str`, *optional*, defaults to `""`):
In case the tokenizer config is located inside a subfolder of the model repo on huggingface.co, you can
specify the folder name here.
<Tip>
Passing `token=True` is required when you want to use a private model.
</Tip>
Returns:
`Bool`: True if the tokenizer.json file exists, False otherwise.
Examples:
```python
# Download configuration from huggingface.co and cache.
tokenizer_config = get_tokenizer_config("google-bert/bert-base-uncased")
# This model does not have a tokenizer config so the result will be an empty dict.
tokenizer_config = get_tokenizer_config("FacebookAI/xlm-roberta-base")
# Save a pretrained tokenizer locally and you can reload its config
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-cased")
tokenizer.save_pretrained("tokenizer-test")
tokenizer_config = get_tokenizer_config("tokenizer-test")
```"""
    use_auth_token = kwargs.pop("use_auth_token", None)
    if use_auth_token is not None:
        warnings.warn(
            "The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers. Please use `token` instead.",
            FutureWarning,
        )
        if token is not None:
            raise ValueError("`token` and `use_auth_token` are both specified. Please set only the argument `token`.")
        token = use_auth_token

    commit_hash = kwargs.get("_commit_hash", None)
    resolved_config_file = cached_file(
        pretrained_model_name_or_path,
        "tokenizer.json",
        cache_dir=cache_dir,
        force_download=force_download,
        resume_download=resume_download,
        proxies=proxies,
        token=token,
        revision=revision,
        local_files_only=local_files_only,
        subfolder=subfolder,
        _raise_exceptions_for_gated_repo=False,
        _raise_exceptions_for_missing_entries=False,
        _raise_exceptions_for_connection_errors=False,
        _commit_hash=commit_hash,
    )
    if resolved_config_file is None:
        logger.info("Could not locate tokenizer.json, so no fast tokenizer file is available for this model.")
        return False
    return True


class AutoTokenizer:
    r"""
    This is a generic tokenizer class that will be instantiated as one of the tokenizer classes of the library when
@@ -898,6 +993,8 @@ class AutoTokenizer:
        if use_fast and not config_tokenizer_class.endswith("Fast"):
            tokenizer_class_candidate = f"{config_tokenizer_class}Fast"
            tokenizer_class = tokenizer_class_from_name(tokenizer_class_candidate)
        # Fall back to the generic fast tokenizer if the repo provides a tokenizer.json,
        # even though no dedicated fast tokenizer class is registered for this model.
        if tokenizer_class is None and has_tokenizer_file(pretrained_model_name_or_path, **kwargs):
            tokenizer_class = PreTrainedTokenizerFast
        if tokenizer_class is None:
            tokenizer_class_candidate = config_tokenizer_class
            tokenizer_class = tokenizer_class_from_name(tokenizer_class_candidate)
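
Taken together, the two hunks above mean that when a model's config names a tokenizer class with no `Fast` variant, `AutoTokenizer` now probes the repo for a `tokenizer.json` and, if one exists, falls back to the generic `PreTrainedTokenizerFast` instead of falling through to the slow class. A minimal sketch of the resulting behavior, using the SigLIP checkpoint exercised by the new test below (not part of the diff itself):

```python
from transformers import AutoTokenizer, PreTrainedTokenizerFast

# SigLIP's config specifies no `*Fast` tokenizer class, but the repo ships a
# tokenizer.json, so the new fallback instantiates PreTrainedTokenizerFast.
tokenizer = AutoTokenizer.from_pretrained("google/siglip-base-patch16-224", use_fast=True)
assert type(tokenizer) is PreTrainedTokenizerFast
```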

@@ -211,6 +211,11 @@ class AutoTokenizerTest(unittest.TestCase):
        self.assertIsInstance(tokenizer2, tokenizer.__class__)
        self.assertEqual(tokenizer2.vocab_size, 12)

    def test_PreTrainedTokenizerFast_inferred(self):
        # The model config specifies neither a fast tokenizer class nor PreTrainedTokenizerFast,
        # but the repo ships a tokenizer.json, so a fast tokenizer can still be loaded.
        tokenizer = AutoTokenizer.from_pretrained("google/siglip-base-patch16-224", use_fast=True)
        self.assertEqual(type(tokenizer), PreTrainedTokenizerFast)

    def test_auto_tokenizer_fast_no_slow(self):
        tokenizer = AutoTokenizer.from_pretrained("Salesforce/ctrl")
        # There is no fast CTRL tokenizer, so this always gives us a slow tokenizer.