Mirror of https://github.com/huggingface/transformers.git, synced 2025-10-21 01:23:56 +08:00

Compare commits: multiple-m... -> v4.44.1 (16 commits)
| Author | SHA1 | Date |
| --- | --- | --- |
| | ca56cd7b31 | |
| | 6e931e1647 | |
| | 74f57df61b | |
| | 084fe2ee6c | |
| | fff9be1545 | |
| | 4fd0f4802b | |
| | 5c4b15ae01 | |
| | b51a82a524 | |
| | 0dca7d72b8 | |
| | a3e77bae9a | |
| | 9c7aa7b3c7 | |
| | ed6acee21a | |
| | 51741d7e46 | |
| | 984bc11b08 | |
| | af61272239 | |
| | 3e93524a13 | |
@@ -61,7 +61,7 @@ from transformers.utils import check_min_version, send_example_telemetry
 logger = logging.getLogger(__name__)

 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.44.0.dev0")
+check_min_version("4.44.0")

 Array = Any
 Dataset = datasets.arrow_dataset.Dataset

@@ -60,7 +60,7 @@ from transformers.utils.versions import require_version


 # Will error if the minimal version of Transformers is not installed. Remove at your own risk.
-check_min_version("4.44.0.dev0")
+check_min_version("4.44.0")

 require_version("datasets>=2.14.0", "To fix: pip install -r examples/flax/speech-recognition/requirements.txt")

@@ -56,7 +56,7 @@ from transformers.utils import check_min_version, send_example_telemetry

 logger = logging.getLogger(__name__)
 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.44.0.dev0")
+check_min_version("4.44.0")

 Array = Any
 Dataset = datasets.arrow_dataset.Dataset

@@ -57,7 +57,7 @@ from transformers.utils.versions import require_version

 logger = logging.getLogger(__name__)
 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.44.0.dev0")
+check_min_version("4.44.0")

 require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/token-classification/requirements.txt")

@@ -45,7 +45,7 @@ from transformers.utils.versions import require_version
 logger = logging.getLogger(__name__)

 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.44.0.dev0")
+check_min_version("4.44.0")

 require_version("datasets>=1.14.0", "To fix: pip install -r examples/pytorch/audio-classification/requirements.txt")

@@ -54,7 +54,7 @@ from transformers.utils.versions import require_version
 logger = logging.getLogger(__name__)

 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.44.0.dev0")
+check_min_version("4.44.0")

 require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/contrastive-image-text/requirements.txt")

@@ -56,7 +56,7 @@ from transformers.utils.versions import require_version
 logger = logging.getLogger(__name__)

 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.44.0.dev0")
+check_min_version("4.44.0")

 require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/image-classification/requirements.txt")

@@ -49,7 +49,7 @@ from transformers.utils.versions import require_version

 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.44.0.dev0")
+check_min_version("4.44.0")

 logger = get_logger(__name__)
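All of the example-script hunks in this comparison are the same one-line change: the development pin `4.44.0.dev0` becomes the release pin `4.44.0`. As a hedged, minimal sketch (not taken from any single script), this is how that guard behaves at import time:

```python
# Minimal sketch of the version guard the example scripts use; the pinned string is the only
# thing these hunks change.
from transformers.utils import check_min_version

# Raises an error telling the user to upgrade (or install from source) when the installed
# `transformers` is older than the pin.
check_min_version("4.44.0")
```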
@@ -43,7 +43,7 @@ from transformers.utils.versions import require_version
 logger = logging.getLogger(__name__)

 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.44.0.dev0")
+check_min_version("4.44.0")

 require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/image-pretraining/requirements.txt")

@@ -48,7 +48,7 @@ Any model supported by the AutoModelForMaskedImageModeling API can be used.
 logger = logging.getLogger(__name__)

 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.44.0.dev0")
+check_min_version("4.44.0")

 require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/image-pretraining/requirements.txt")

@@ -53,7 +53,7 @@ Any model supported by the AutoModelForMaskedImageModeling API can be used.
 logger = logging.getLogger(__name__)

 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.44.0.dev0")
+check_min_version("4.44.0")

 require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/image-pretraining/requirements.txt")

@@ -46,7 +46,7 @@ from transformers.utils.versions import require_version
 logger = logging.getLogger(__name__)

 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.44.0.dev0")
+check_min_version("4.44.0")

 require_version("datasets>=2.0.0", "To fix: pip install -r examples/pytorch/instance-segmentation/requirements.txt")

@@ -52,7 +52,7 @@ from transformers.utils.versions import require_version
 logger = logging.getLogger(__name__)

 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.44.0.dev0")
+check_min_version("4.44.0")

 require_version("datasets>=2.0.0", "To fix: pip install -r examples/pytorch/instance-segmentation/requirements.txt")

@@ -55,7 +55,7 @@ from transformers.utils.versions import require_version


 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.44.0.dev0")
+check_min_version("4.44.0")

 require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt")
@@ -57,7 +57,7 @@ from transformers.utils.versions import require_version


 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.44.0.dev0")
+check_min_version("4.44.0")

 logger = get_logger(__name__)

@@ -58,7 +58,7 @@ from transformers.utils.versions import require_version


 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.44.0.dev0")
+check_min_version("4.44.0")

 require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt")

@@ -60,7 +60,7 @@ from transformers.utils.versions import require_version


 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.44.0.dev0")
+check_min_version("4.44.0")

 logger = get_logger(__name__)

@@ -54,7 +54,7 @@ from transformers.utils.versions import require_version


 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.44.0.dev0")
+check_min_version("4.44.0")

 require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt")

@@ -57,7 +57,7 @@ from transformers.utils.versions import require_version


 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.44.0.dev0")
+check_min_version("4.44.0")

 logger = get_logger(__name__)
 require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt")

@@ -47,7 +47,7 @@ from transformers.utils.versions import require_version


 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.44.0.dev0")
+check_min_version("4.44.0")

 require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt")

@@ -47,7 +47,7 @@ from transformers.utils import PaddingStrategy, check_min_version, send_example_


 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.44.0.dev0")
+check_min_version("4.44.0")

 logger = logging.getLogger(__name__)

@@ -56,7 +56,7 @@ from transformers.utils import PaddingStrategy, check_min_version, send_example_


 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.44.0.dev0")
+check_min_version("4.44.0")

 logger = get_logger(__name__)
 # You should update this to your particular problem to have better documentation of `model_type`
@@ -48,7 +48,7 @@ from transformers.utils.versions import require_version
 logger = logging.getLogger(__name__)

 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.44.0.dev0")
+check_min_version("4.44.0")

 require_version("datasets>=2.0.0", "To fix: pip install -r examples/pytorch/object-detection/requirements.txt")

@@ -51,7 +51,7 @@ from transformers.utils.versions import require_version


 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.44.0.dev0")
+check_min_version("4.44.0")

 logging.basicConfig(level=logging.INFO)
 logger = get_logger(__name__)

@@ -50,7 +50,7 @@ from transformers.utils.versions import require_version


 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.44.0.dev0")
+check_min_version("4.44.0")

 require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt")

@@ -48,7 +48,7 @@ from transformers.utils.versions import require_version


 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.44.0.dev0")
+check_min_version("4.44.0")

 require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt")

@@ -56,7 +56,7 @@ from transformers.utils.versions import require_version


 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.44.0.dev0")
+check_min_version("4.44.0")

 require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt")

@@ -57,7 +57,7 @@ from transformers.utils.versions import require_version


 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.44.0.dev0")
+check_min_version("4.44.0")

 require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt")

@@ -46,7 +46,7 @@ from transformers.utils.versions import require_version


 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.44.0.dev0")
+check_min_version("4.44.0")

 require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt")

@@ -51,7 +51,7 @@ from transformers.utils.versions import require_version
 logger = logging.getLogger(__name__)

 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.44.0.dev0")
+check_min_version("4.44.0")

 require_version("datasets>=2.0.0", "To fix: pip install -r examples/pytorch/semantic-segmentation/requirements.txt")

@@ -50,7 +50,7 @@ from transformers.utils.versions import require_version


 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.44.0.dev0")
+check_min_version("4.44.0")

 logger = get_logger(__name__)
@@ -50,7 +50,7 @@ from transformers.utils.versions import require_version


 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.44.0.dev0")
+check_min_version("4.44.0")

 require_version("datasets>=1.18.0", "To fix: pip install -r examples/pytorch/speech-recognition/requirements.txt")

@@ -53,7 +53,7 @@ from transformers.utils.versions import require_version


 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.44.0.dev0")
+check_min_version("4.44.0")

 require_version("datasets>=1.18.0", "To fix: pip install -r examples/pytorch/speech-recognition/requirements.txt")

@@ -48,7 +48,7 @@ from transformers.utils.versions import require_version


 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.44.0.dev0")
+check_min_version("4.44.0")

 require_version("datasets>=1.18.0", "To fix: pip install -r examples/pytorch/speech-recognition/requirements.txt")

@@ -52,7 +52,7 @@ from transformers.utils.versions import require_version


 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.44.0.dev0")
+check_min_version("4.44.0")

 require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/summarization/requirements.txt")

@@ -56,7 +56,7 @@ from transformers.utils.versions import require_version


 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.44.0.dev0")
+check_min_version("4.44.0")

 logger = get_logger(__name__)
 require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/summarization/requirements.txt")

@@ -47,7 +47,7 @@ from transformers.utils.versions import require_version


 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.44.0.dev0")
+check_min_version("4.44.0")

 require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/text-classification/requirements.txt")

@@ -48,7 +48,7 @@ from transformers.utils.versions import require_version


 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.44.0.dev0")
+check_min_version("4.44.0")

 require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/text-classification/requirements.txt")

@@ -49,7 +49,7 @@ from transformers.utils.versions import require_version


 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.44.0.dev0")
+check_min_version("4.44.0")

 logger = get_logger(__name__)

@@ -48,7 +48,7 @@ from transformers.utils.versions import require_version


 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.44.0.dev0")
+check_min_version("4.44.0")

 require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/text-classification/requirements.txt")

@@ -49,7 +49,7 @@ from transformers.utils.versions import require_version


 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.44.0.dev0")
+check_min_version("4.44.0")

 require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/token-classification/requirements.txt")
@@ -56,7 +56,7 @@ from transformers.utils.versions import require_version


 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.44.0.dev0")
+check_min_version("4.44.0")

 logger = get_logger(__name__)
 require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/token-classification/requirements.txt")

@@ -52,7 +52,7 @@ from transformers.utils.versions import require_version


 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.44.0.dev0")
+check_min_version("4.44.0")

 require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/translation/requirements.txt")

@@ -57,7 +57,7 @@ from transformers.utils.versions import require_version


 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.44.0.dev0")
+check_min_version("4.44.0")

 logger = get_logger(__name__)
 require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/translation/requirements.txt")

@@ -51,7 +51,7 @@ from transformers.utils.versions import require_version
 logger = logging.getLogger(__name__)

 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.44.0.dev0")
+check_min_version("4.44.0")

 require_version(
     "datasets>=1.8.0", "To fix: pip install -r examples/tensorflow/contrastive-image-text/requirements.txt"

@@ -55,7 +55,7 @@ from transformers.utils.versions import require_version
 logger = logging.getLogger(__name__)

 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.44.0.dev0")
+check_min_version("4.44.0")

 require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/image-classification/requirements.txt")

@@ -50,7 +50,7 @@ from transformers.utils import PaddingStrategy, check_min_version, send_example_


 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.44.0.dev0")
+check_min_version("4.44.0")

 logger = logging.getLogger(__name__)

@@ -62,7 +62,7 @@ except (ModuleNotFoundError, ImportError):


 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.44.0.dev0")
+check_min_version("4.44.0")

 logger = logging.getLogger(__name__)

@@ -53,7 +53,7 @@ from transformers.utils.versions import require_version

 # region Checking dependencies
 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.44.0.dev0")
+check_min_version("4.44.0")

 require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/summarization/requirements.txt")

@@ -47,7 +47,7 @@ from transformers.utils import check_min_version, send_example_telemetry


 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.44.0.dev0")
+check_min_version("4.44.0")

 task_to_keys = {
     "cola": ("sentence", None),

@@ -56,7 +56,7 @@ from transformers.utils.versions import require_version

 # region Dependencies and constants
 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.44.0.dev0")
+check_min_version("4.44.0")

 require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/summarization/requirements.txt")
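The sibling `require_version` pins shown in these hunks work the same way for third-party packages. A short sketch, using one of the requirement strings from the hunks above:

```python
from transformers.utils.versions import require_version

# Raises ImportError with the hint below if the installed `datasets` is older than the pin.
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/text-classification/requirements.txt")
```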
setup.py (2 changed lines)
@@ -430,7 +430,7 @@ install_requires = [

 setup(
     name="transformers",
-    version="4.44.0.dev0",  # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots)
+    version="4.44.1",  # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots)
     author="The Hugging Face team (past and future) with the help of all our contributors (https://github.com/huggingface/transformers/graphs/contributors)",
     author_email="transformers@huggingface.co",
     description="State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow",
@@ -18,7 +18,7 @@
 # to defer the actual importing for when the objects are requested. This way `import transformers` provides the names
 # in the namespace without actually importing anything (and especially none of the backends).

-__version__ = "4.44.0.dev0"
+__version__ = "4.44.1"

 from typing import TYPE_CHECKING
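A quick, illustrative way to confirm which release actually ends up installed after these two version bumps:

```python
# Illustrative check only: the tag sets both setup.py and __init__.py to 4.44.1.
import transformers

print(transformers.__version__)  # expected to print "4.44.1" for this tag
```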
@@ -1067,6 +1067,8 @@ class StaticCache(Cache):
             A tuple containing the updated key and value states.
         """
         cache_position = cache_kwargs.get("cache_position")
+        self.key_cache[layer_idx] = self.key_cache[layer_idx].to(device=key_states.device)
+        self.value_cache[layer_idx] = self.value_cache[layer_idx].to(device=value_states.device)
         k_out = self.key_cache[layer_idx]
         v_out = self.value_cache[layer_idx]

@@ -1078,8 +1080,6 @@ class StaticCache(Cache):
         # `tensor[:, :, index] = tensor`, but the first one is compile-friendly and it does explicitly an in-place
         # operation, that avoids copies and uses less memory.
         try:
-            # If using several devices (e.g.: multiple GPUs), we need to ensure everything is on the same one
-            cache_position.to(device=k_out.device)
             k_out.index_copy_(2, cache_position, key_states)
             v_out.index_copy_(2, cache_position, value_states)
         except NotImplementedError:
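The StaticCache hunks move the cache tensors onto the device of the incoming key/value states before the in-place `index_copy_`; the dropped `cache_position.to(...)` call returned a new tensor rather than modifying anything in place, so it did not align devices. A toy sketch of the write pattern, with made-up shapes:

```python
import torch

# Hypothetical shapes for illustration: batch=1, heads=1, max_cache_len=4, head_dim=2.
key_cache = torch.zeros(1, 1, 4, 2)
new_keys = torch.ones(1, 1, 2, 2)
cache_position = torch.tensor([2, 3])  # static-cache slots written in this forward pass

# Same in-place update as in the hunk above: copy the new states into their slots along dim 2.
key_cache.index_copy_(2, cache_position, new_keys)
print(key_cache[0, 0])
```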
@@ -454,6 +454,7 @@ class TrainingSummary:
         metric_mapping = infer_metric_tags_from_eval_results(self.eval_results)

         metadata = {}
+        metadata = _insert_value(metadata, "library_name", "transformers")
         metadata = _insert_values_as_list(metadata, "language", self.language)
         metadata = _insert_value(metadata, "license", self.license)
         if self.finetuned_from is not None and isinstance(self.finetuned_from, str) and len(self.finetuned_from) > 0:
@@ -264,11 +264,10 @@ def _flash_attention_forward(
         )
         attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length)

-    # if position_ids is provided and check not all examples (row) contain only 1 sequence, and is in pre-fill/training stage
-    # then use `flash_attn_varlen_func` to prevent cross-example attention and also allow padding free approach
-    elif (
-        position_ids is not None and not (position_ids[:, -1] == position_ids.size(1) - 1).all() and query_length != 1
-    ):
+    # If position_ids is provided and check all examples do not contain only 1 sequence, If tensor in increasing
+    # then we probably have one sequence, otherwise it is packed. Additionally check we are in pre-fill/training stage.
+    # Use `flash_attn_varlen_func` to prevent cross-example attention and also allow padding free approach
+    elif position_ids is not None and not (torch.diff(position_ids, dim=-1) >= 0).all() and query_length != 1:
         batch_size = query_states.size(0)
         query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = prepare_fa2_from_position_ids(
             query_states, key_states, value_states, position_ids
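The new condition detects packed sequences purely from `position_ids`: a single left-to-right sequence has non-decreasing positions, while packing resets them. A small illustration of the check used above:

```python
import torch

single = torch.tensor([[0, 1, 2, 3, 4]])   # one sequence per row
packed = torch.tensor([[0, 1, 2, 0, 1]])   # two sequences packed into one row

def looks_packed(position_ids: torch.Tensor) -> bool:
    # Mirrors the check in the hunk: any negative step in position_ids means packing.
    return not (torch.diff(position_ids, dim=-1) >= 0).all()

print(looks_packed(single))  # False
print(looks_packed(packed))  # True
```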
@@ -58,7 +58,8 @@ def _compute_default_rope_parameters(
     elif config is not None:
         base = config.rope_theta
         partial_rotary_factor = config.partial_rotary_factor if hasattr(config, "partial_rotary_factor") else 1.0
-        dim = int((config.hidden_size // config.num_attention_heads) * partial_rotary_factor)
+        head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
+        dim = int(head_dim * partial_rotary_factor)

     attention_factor = 1.0  # Unused in this type of RoPE

@@ -143,7 +144,8 @@ def _compute_dynamic_ntk_parameters(
     elif config is not None:
         base = config.rope_theta
         partial_rotary_factor = config.partial_rotary_factor if hasattr(config, "partial_rotary_factor") else 1.0
-        dim = int((config.hidden_size // config.num_attention_heads) * partial_rotary_factor)
+        head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
+        dim = int(head_dim * partial_rotary_factor)
         max_position_embeddings = config.max_position_embeddings
         factor = config.rope_scaling["factor"]

@@ -185,7 +187,8 @@ def _compute_yarn_parameters(

     base = config.rope_theta
     partial_rotary_factor = config.partial_rotary_factor if hasattr(config, "partial_rotary_factor") else 1.0
-    dim = int((config.hidden_size // config.num_attention_heads) * partial_rotary_factor)
+    head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
+    dim = int(head_dim * partial_rotary_factor)
     max_position_embeddings = config.max_position_embeddings
     factor = config.rope_scaling["factor"]

@@ -265,7 +268,8 @@ def _compute_longrope_parameters(

     base = config.rope_theta
     partial_rotary_factor = config.partial_rotary_factor if hasattr(config, "partial_rotary_factor") else 1.0
-    dim = int((config.hidden_size // config.num_attention_heads) * partial_rotary_factor)
+    head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
+    dim = int(head_dim * partial_rotary_factor)
     long_factor = config.rope_scaling["long_factor"]
     short_factor = config.rope_scaling["short_factor"]
     factor = config.rope_scaling.get("factor")

@@ -450,7 +454,8 @@ def _validate_longrope_parameters(config: PretrainedConfig):
     _check_received_keys(rope_type, received_keys, required_keys, optional_keys)

     partial_rotary_factor = config.partial_rotary_factor if hasattr(config, "partial_rotary_factor") else 1.0
-    dim = int((config.hidden_size // config.num_attention_heads) * partial_rotary_factor)
+    head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
+    dim = int(head_dim * partial_rotary_factor)

     short_factor = rope_scaling.get("short_factor")
     if not isinstance(short_factor, list) and all(isinstance(x, (int, float)) for x in short_factor):
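All five RoPE hunks apply the same fix: derive the rotary dimension from an explicit `config.head_dim` when the config defines one, instead of always computing `hidden_size // num_attention_heads`. A hedged sketch with made-up config objects:

```python
from types import SimpleNamespace

def rope_dim(config, partial_rotary_factor=1.0):
    # Same computation as the patched lines above.
    head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
    return int(head_dim * partial_rotary_factor)

# Hypothetical configs for illustration only.
plain = SimpleNamespace(hidden_size=4096, num_attention_heads=32)
custom = SimpleNamespace(hidden_size=2304, num_attention_heads=8, head_dim=256)

print(rope_dim(plain))   # 128 (falls back to hidden_size // num_attention_heads)
print(rope_dim(custom))  # 256 (explicit head_dim wins)
```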
@@ -104,7 +104,6 @@ from .utils.quantization_config import BitsAndBytesConfig, QuantizationMethod

 XLA_USE_BF16 = os.environ.get("XLA_USE_BF16", "0").upper()
 XLA_DOWNCAST_BF16 = os.environ.get("XLA_DOWNCAST_BF16", "0").upper()
-PARAM_RENAME_WARNING = "A parameter name that contains `{}` will be renamed internally to `{}`. Please use a different name to suppress this warning."


 if is_accelerate_available():

@@ -692,17 +691,30 @@ def _load_state_dict_into_model(model_to_load, state_dict, start_prefix, assign_
     # Convert old format to new format if needed from a PyTorch state_dict
     old_keys = []
     new_keys = []
-    renamed_keys = {}
+    renamed_gamma = {}
+    renamed_beta = {}
+    warning_msg = f"A pretrained model of type `{model_to_load.__class__.__name__}` "
     for key in state_dict.keys():
         new_key = None
         if "gamma" in key:
-            logger.warning(PARAM_RENAME_WARNING.format("gamma", "weight"))
+            # We add only the first key as an example
             new_key = key.replace("gamma", "weight")
+            renamed_gamma[key] = new_key if not renamed_gamma else renamed_gamma
         if "beta" in key:
-            logger.warning(PARAM_RENAME_WARNING.format("beta", "bias"))
+            # We add only the first key as an example
             new_key = key.replace("beta", "bias")
+            renamed_beta[key] = new_key if not renamed_beta else renamed_beta
         if new_key:
             old_keys.append(key)
             new_keys.append(new_key)
+    renamed_keys = {**renamed_gamma, **renamed_beta}
+    if renamed_keys:
+        warning_msg += "contains parameters that have been renamed internally (a few are listed below but more are present in the model):\n"
+        for old_key, new_key in renamed_keys.items():
+            warning_msg += f"* `{old_key}` -> `{new_key}`\n"
+        warning_msg += "If you are using a model from the Hub, consider submitting a PR to adjust these weights and help future users."
+        logger.info_once(warning_msg)
     for old_key, new_key in zip(old_keys, new_keys):
         state_dict[new_key] = state_dict.pop(old_key)
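Functionally, the loader still renames legacy `gamma`/`beta` parameter names to `weight`/`bias`; what changes here is that the renames are collected and reported once via `logger.info_once` instead of warning on every key. A toy illustration of the rename itself:

```python
# Illustration only: how legacy TF-style keys are renamed when a state dict is loaded.
state_dict = {
    "encoder.layer.0.LayerNorm.gamma": "...",
    "encoder.layer.0.LayerNorm.beta": "...",
}
renamed = {k.replace("gamma", "weight").replace("beta", "bias"): v for k, v in state_dict.items()}
print(list(renamed))  # ['encoder.layer.0.LayerNorm.weight', 'encoder.layer.0.LayerNorm.bias']
```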
@@ -818,6 +830,7 @@ def _load_state_dict_into_meta_model(
     is_safetensors=False,
     keep_in_fp32_modules=None,
     unexpected_keys=None,  # passing `unexpected` for cleanup from quantization items
+    pretrained_model_name_or_path=None,  # for flagging the user when the model contains renamed keys
 ):
     """
     This is somewhat similar to `_load_state_dict_into_model`, but deals with a model that has some or all of its

@@ -840,18 +853,30 @@ def _load_state_dict_into_meta_model(

     old_keys = []
     new_keys = []
+    renamed_gamma = {}
+    renamed_beta = {}
     is_quantized = hf_quantizer is not None
+    warning_msg = f"This model {type(model)}"
     for key in state_dict.keys():
         new_key = None
         if "gamma" in key:
-            logger.warning(PARAM_RENAME_WARNING.format("gamma", "weight"))
+            # We add only the first key as an example
            new_key = key.replace("gamma", "weight")
+            renamed_gamma[key] = new_key if not renamed_gamma else renamed_gamma
         if "beta" in key:
-            logger.warning(PARAM_RENAME_WARNING.format("beta", "bias"))
+            # We add only the first key as an example
            new_key = key.replace("beta", "bias")
+            renamed_beta[key] = new_key if not renamed_beta else renamed_beta
         if new_key:
             old_keys.append(key)
             new_keys.append(new_key)
+    renamed_keys = {**renamed_gamma, **renamed_beta}
+    if renamed_keys:
+        warning_msg += "contains parameters that have been renamed internally (a few are listed below but more are present in the model):\n"
+        for old_key, new_key in renamed_keys.items():
+            warning_msg += f"* `{old_key}` -> `{new_key}`\n"
+        warning_msg += "If you are using a model from the Hub, consider submitting a PR to adjust these weights and help future users."
+        logger.info_once(warning_msg)
     for old_key, new_key in zip(old_keys, new_keys):
         state_dict[new_key] = state_dict.pop(old_key)

@@ -932,8 +957,6 @@ def _load_state_dict_into_meta_model(
                 )
             )
         ):
-            if is_fsdp_enabled():
-                param_device = "cpu" if is_local_dist_rank_0() else "meta"
             # For backward compatibility with older versions of `accelerate` and for non-quantized params
             set_module_tensor_to_device(model, param_name, param_device, **set_module_kwargs)
         else:

@@ -944,10 +967,7 @@ def _load_state_dict_into_meta_model(
             if is_fsdp_enabled() or is_deepspeed_zero3_enabled():
                 module, tensor_name = get_module_from_name(model, param_name)
                 value = getattr(module, tensor_name)
-                param_to = "cpu"
-                if is_fsdp_enabled() and not is_local_dist_rank_0():
-                    param_to = "meta"
-                value = type(value)(value.data.to(param_to), **value.__dict__)
+                value = type(value)(value.data.to("cpu"), **value.__dict__)
                 setattr(module, tensor_name, value)
     # TODO: consider removing used param_parts from state_dict before return
@@ -1482,9 +1502,6 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix
         else:
             model = cls(config, **kwargs)

-        # Flag for if we init with `zero3`, add an attr to the model so we can check downstream for issues
-        model._transformers_zero3_init_used = is_deepspeed_zero3_enabled()
-
         # restore default dtype if it was modified
         if dtype_orig is not None:
             torch.set_default_dtype(dtype_orig)

@@ -3814,9 +3831,6 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix
                 # Let's make sure we don't run the init function of buffer modules
                 model = cls(config, *model_args, **model_kwargs)

-            # If we init with `zero3`, add an attr to the model so we can check downstream for issues
-            model._transformers_zero3_init_used = is_deepspeed_zero3_enabled() and not is_quantized
-
             # make sure we use the model's config since the __init__ call might have copied it
             config = model.config

@@ -4545,7 +4559,12 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix

     @staticmethod
     def _load_pretrained_model_low_mem(
-        model, loaded_state_dict_keys, resolved_archive_file, start_prefix="", hf_quantizer=None
+        model,
+        loaded_state_dict_keys,
+        resolved_archive_file,
+        start_prefix="",
+        hf_quantizer=None,
+        pretrained_model_name_or_path=None,
     ):
         """
         This is an experimental function that loads the model using ~1.x model size CPU memory
@@ -1132,17 +1132,18 @@ class CohereForCausalLM(CoherePreTrainedModel):

         # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
         if inputs_embeds is not None and cache_position[0] == 0:
-            model_inputs = {"inputs_embeds": inputs_embeds}
+            model_inputs = {"inputs_embeds": inputs_embeds, "input_ids": None}
         else:
-            model_inputs = {"input_ids": input_ids}
+            # The clone here is for the same reason as for `position_ids`.
+            model_inputs = {"input_ids": input_ids.clone(memory_format=torch.contiguous_format), "inputs_embeds": None}

         if isinstance(past_key_values, StaticCache) and attention_mask.ndim == 2:
-            if inputs_embeds is not None:
-                batch_size, sequence_length = inputs_embeds.shape
-                device = inputs_embeds.device
+            if model_inputs["inputs_embeds"] is not None:
+                batch_size, sequence_length, _ = model_inputs["inputs_embeds"].shape
+                device = model_inputs["inputs_embeds"].device
             else:
-                batch_size, sequence_length = input_ids.shape
-                device = input_ids.device
+                batch_size, sequence_length = model_inputs["input_ids"].shape
+                device = model_inputs["input_ids"].device

             dtype = self.lm_head.weight.dtype
             min_dtype = torch.finfo(dtype).min
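The same `prepare_inputs_for_generation` change repeats for DBRX, Falcon, Gemma, Gemma 2, GPT-Neo, GPT-J, Llama, Nemotron, OLMo, Persimmon and Phi below. The reason for `clone(memory_format=torch.contiguous_format)` is the one given in the diff comments: with batch size 1, a decode-step slice can report as contiguous while keeping the parent's stride, which `torch.compile`'s reduce-overhead (cuda graph) mode treats as a new input signature each step. A small illustration:

```python
import torch

position_ids = torch.arange(10).reshape(1, 10)
last = position_ids[:, -1:]                       # decode-step slice, batch size 1
print(last.is_contiguous(), last.stride())        # True (10, 1), stride inherited from the parent

cloned = last.clone(memory_format=torch.contiguous_format)
print(cloned.is_contiguous(), cloned.stride())    # True (1, 1), stable stride across steps
```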
@@ -1403,17 +1403,18 @@ class DbrxForCausalLM(DbrxPreTrainedModel):

         # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
         if inputs_embeds is not None and cache_position[0] == 0:
-            model_inputs = {"inputs_embeds": inputs_embeds}
+            model_inputs = {"inputs_embeds": inputs_embeds, "input_ids": None}
         else:
-            model_inputs = {"input_ids": input_ids}
+            # The clone here is for the same reason as for `position_ids`.
+            model_inputs = {"input_ids": input_ids.clone(memory_format=torch.contiguous_format), "inputs_embeds": None}

         if isinstance(past_key_values, StaticCache) and attention_mask.ndim == 2:
-            if inputs_embeds is not None:
-                batch_size, sequence_length = inputs_embeds.shape
-                device = inputs_embeds.device
+            if model_inputs["inputs_embeds"] is not None:
+                batch_size, sequence_length, _ = model_inputs["inputs_embeds"].shape
+                device = model_inputs["inputs_embeds"].device
             else:
-                batch_size, sequence_length = input_ids.shape
-                device = input_ids.device
+                batch_size, sequence_length = model_inputs["input_ids"].shape
+                device = model_inputs["input_ids"].device

             dtype = self.lm_head.weight.dtype
             min_dtype = torch.finfo(dtype).min
@@ -1126,7 +1126,8 @@ class FalconForCausalLM(FalconPreTrainedModel):
         if inputs_embeds is not None and past_key_values is None:
             model_inputs = {"inputs_embeds": inputs_embeds}
         else:
-            model_inputs = {"input_ids": input_ids}
+            # The clone here is for the same reason as for `position_ids`.
+            model_inputs = {"input_ids": input_ids.clone(memory_format=torch.contiguous_format), "inputs_embeds": None}

         model_inputs.update(
             {
@@ -1143,17 +1143,18 @@ class GemmaForCausalLM(GemmaPreTrainedModel):

         # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
         if inputs_embeds is not None and cache_position[0] == 0:
-            model_inputs = {"inputs_embeds": inputs_embeds}
+            model_inputs = {"inputs_embeds": inputs_embeds, "input_ids": None}
         else:
-            model_inputs = {"input_ids": input_ids}
+            # The clone here is for the same reason as for `position_ids`.
+            model_inputs = {"input_ids": input_ids.clone(memory_format=torch.contiguous_format), "inputs_embeds": None}

         if isinstance(past_key_values, StaticCache) and attention_mask.ndim == 2:
-            if inputs_embeds is not None:
-                batch_size, sequence_length = inputs_embeds.shape
-                device = inputs_embeds.device
+            if model_inputs["inputs_embeds"] is not None:
+                batch_size, sequence_length, _ = model_inputs["inputs_embeds"].shape
+                device = model_inputs["inputs_embeds"].device
             else:
-                batch_size, sequence_length = input_ids.shape
-                device = input_ids.device
+                batch_size, sequence_length = model_inputs["input_ids"].shape
+                device = model_inputs["input_ids"].device

             dtype = self.lm_head.weight.dtype
             min_dtype = torch.finfo(dtype).min
@@ -104,7 +104,6 @@ def _prepare_4d_causal_attention_mask_with_cache_position(
             causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
                 padding_mask, min_dtype
             )

     return causal_mask


@@ -301,7 +300,6 @@ class Gemma2Attention(nn.Module):
             attn_weights = attn_weights / self.config.attn_logit_softcapping
             attn_weights = torch.tanh(attn_weights)
             attn_weights = attn_weights * self.config.attn_logit_softcapping

         if attention_mask is not None:  # no matter the length, we just slice it
             causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
             attn_weights = attn_weights + causal_mask

@@ -429,6 +427,7 @@ class Gemma2FlashAttention2(Gemma2Attention):
             dropout=dropout_rate,
             softmax_scale=self.scaling,
             is_causal=self.is_causal,
             sliding_window=self.sliding_window,
             use_top_left_mask=self._flash_attn_uses_top_left_mask,
+            softcap=self.config.attn_logit_softcapping if is_flash_attn_greater_or_equal("2.6.0") else None,
         )

@@ -501,11 +500,9 @@ class Gemma2SdpaAttention(Gemma2Attention):

         key_states = repeat_kv(key_states, self.num_key_value_groups)
         value_states = repeat_kv(value_states, self.num_key_value_groups)

         causal_mask = attention_mask
         if attention_mask is not None:
             causal_mask = causal_mask[:, :, :, : key_states.shape[-2]]

         # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask,
         # Reference: https://github.com/pytorch/pytorch/issues/112577.
         if query_states.device.type == "cuda" and causal_mask is not None:

@@ -516,7 +513,6 @@ class Gemma2SdpaAttention(Gemma2Attention):
         # We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` if statement instead of an inline conditional assignment
         # in SDPA to support both torch.compile's dynamic shapes and full graph options. An inline conditional prevents dynamic shapes from compiling.
         is_causal = True if causal_mask is None and q_len > 1 else False

         attn_output = torch.nn.functional.scaled_dot_product_attention(
             query_states,
             key_states,
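The `softcap` argument added to the flash-attention call mirrors the eager-path soft-capping shown in the attention hunk above (divide, `tanh`, multiply back); it is only passed when flash-attn 2.6.0 or newer is available. A minimal sketch of the transform:

```python
import torch

def soft_cap(scores: torch.Tensor, cap: float) -> torch.Tensor:
    # Same shape as the eager path above: squashes logits into (-cap, cap) while keeping them monotonic.
    return cap * torch.tanh(scores / cap)

print(soft_cap(torch.tensor([0.5, 10.0, 100.0]), cap=50.0))
```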
@@ -572,7 +568,8 @@ class Gemma2DecoderLayer(nn.Module):
         if self.is_sliding and attention_mask is not None:  # efficient SDPA and no padding
             # Flash-attn is a 2D tensor
             if self.config._attn_implementation == "flash_attention_2":
-                attention_mask = attention_mask[:, -self.sliding_window :]
+                if past_key_value is not None:  # when decoding
+                    attention_mask = attention_mask[:, -self.sliding_window :]
             else:
                 min_dtype = torch.finfo(hidden_states.dtype).min
                 sliding_window_mask = torch.tril(

@@ -581,7 +578,6 @@ class Gemma2DecoderLayer(nn.Module):
                 attention_mask = torch.where(sliding_window_mask, min_dtype, attention_mask)
                 if attention_mask.shape[-1] <= 1:  # when decoding
                     attention_mask = attention_mask[:, :, :, -self.sliding_window :]

         residual = hidden_states

         hidden_states = self.input_layernorm(hidden_states)

@@ -994,7 +990,6 @@ class Gemma2ForCausalLM(Gemma2PreTrainedModel):
             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
         )
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict

         # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
         outputs = self.model(
             input_ids=input_ids,

@@ -1061,7 +1056,6 @@ class Gemma2ForCausalLM(Gemma2PreTrainedModel):
                 input_ids = input_ids[:, -cache_position.shape[0] :]
             elif input_ids.shape[1] != cache_position.shape[0]:  # Default case (the "else", a no op, is Exception 2)
                 input_ids = input_ids[:, cache_position]

         if attention_mask is not None and position_ids is None:
             # create position_ids on the fly for batch generation
             position_ids = attention_mask.long().cumsum(-1) - 1

@@ -1077,22 +1071,24 @@ class Gemma2ForCausalLM(Gemma2PreTrainedModel):

         # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
         if inputs_embeds is not None and cache_position[0] == 0:
-            model_inputs = {"inputs_embeds": inputs_embeds}
+            model_inputs = {"inputs_embeds": inputs_embeds, "input_ids": None}
         else:
             # The clone here is for the same reason as for `position_ids`.
-            model_inputs = {"input_ids": input_ids.clone(memory_format=torch.contiguous_format)}
+            model_inputs = {"input_ids": input_ids.clone(memory_format=torch.contiguous_format), "inputs_embeds": None}

-        if isinstance(past_key_values, HybridCache) and attention_mask.ndim == 2:
-            if inputs_embeds is not None:
-                batch_size, sequence_length = inputs_embeds.shape
-                device = inputs_embeds.device
+        if (
+            isinstance(past_key_values, HybridCache)
+            and attention_mask.ndim == 2
+            and not self.config._attn_implementation == "flash_attention_2"
+        ):
+            if model_inputs["inputs_embeds"] is not None:
+                batch_size, sequence_length, _ = model_inputs["inputs_embeds"].shape
+                device = model_inputs["inputs_embeds"].device
             else:
-                batch_size, sequence_length = input_ids.shape
-                device = input_ids.device
-
+                batch_size, sequence_length = model_inputs["input_ids"].shape
+                device = model_inputs["input_ids"].device
             dtype = self.lm_head.weight.dtype
             min_dtype = torch.finfo(dtype).min

             attention_mask = _prepare_4d_causal_attention_mask_with_cache_position(
                 attention_mask,
                 sequence_length=sequence_length,
@@ -818,7 +818,8 @@ class GPTNeoForCausalLM(GPTNeoPreTrainedModel):
         if inputs_embeds is not None and past_key_values is None:
             model_inputs = {"inputs_embeds": inputs_embeds}
         else:
-            model_inputs = {"input_ids": input_ids}
+            # The clone here is for the same reason as for `position_ids`.
+            model_inputs = {"input_ids": input_ids.clone(memory_format=torch.contiguous_format), "inputs_embeds": None}

         model_inputs.update(
             {
@@ -967,7 +967,8 @@ class GPTJForCausalLM(GPTJPreTrainedModel):
         if inputs_embeds is not None and past_key_values is None:
             model_inputs = {"inputs_embeds": inputs_embeds}
         else:
-            model_inputs = {"input_ids": input_ids}
+            # The clone here is for the same reason as for `position_ids`.
+            model_inputs = {"input_ids": input_ids.clone(memory_format=torch.contiguous_format), "inputs_embeds": None}

         model_inputs.update(
             {
@@ -1265,17 +1265,18 @@ class LlamaForCausalLM(LlamaPreTrainedModel):

         # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
         if inputs_embeds is not None and cache_position[0] == 0:
-            model_inputs = {"inputs_embeds": inputs_embeds}
+            model_inputs = {"inputs_embeds": inputs_embeds, "input_ids": None}
         else:
-            model_inputs = {"input_ids": input_ids}
+            # The clone here is for the same reason as for `position_ids`.
+            model_inputs = {"input_ids": input_ids.clone(memory_format=torch.contiguous_format), "inputs_embeds": None}

         if isinstance(past_key_values, StaticCache) and attention_mask.ndim == 2:
-            if inputs_embeds is not None:
-                batch_size, sequence_length = inputs_embeds.shape
-                device = inputs_embeds.device
+            if model_inputs["inputs_embeds"] is not None:
+                batch_size, sequence_length, _ = model_inputs["inputs_embeds"].shape
+                device = model_inputs["inputs_embeds"].device
             else:
-                batch_size, sequence_length = input_ids.shape
-                device = input_ids.device
+                batch_size, sequence_length = model_inputs["input_ids"].shape
+                device = model_inputs["input_ids"].device

             dtype = self.lm_head.weight.dtype
             min_dtype = torch.finfo(dtype).min
@@ -853,7 +853,7 @@ class LlavaNextVideoForConditionalGeneration(LlavaNextVideoPreTrainedModel):
             inputs_embeds = self.get_input_embeddings()(input_ids)

         # Merge text and images in prefill stage
         if past_key_values is None:
             if input_ids is not None and inputs_embeds.shape[1] != 1:
                 # First merge image tokens if there are any
                 if pixel_values is not None and pixel_values.size(0) > 0:
                     image_features = self._get_image_features(pixel_values, image_sizes)

@@ -910,7 +910,7 @@ class LlavaNextVideoForConditionalGeneration(LlavaNextVideoPreTrainedModel):
                 pass

             # generation with cache, decoding stage
-            elif past_key_values is not None and (pixel_values is not None or pixel_values_videos is not None):
+            elif pixel_values is not None or pixel_values_videos is not None:
                 # Retrieve the first layer to inspect the logits and mask out the hidden states that are set to 0
                 first_layer_past_key_value = past_key_values[0][0][:, :, :, 0]
                 # Sum all dimensions of head_dim (-2) to avoid random errors such as: https://github.com/huggingface/transformers/pull/28032#issuecomment-1863691941
@@ -53,6 +53,60 @@ logger = logging.get_logger(__name__)
 _CONFIG_FOR_DOC = "NemotronConfig"


+# Copied from transformers.models.llama.modeling_llama._prepare_4d_causal_attention_mask_with_cache_position
+def _prepare_4d_causal_attention_mask_with_cache_position(
+    attention_mask: torch.Tensor,
+    sequence_length: int,
+    target_length: int,
+    dtype: torch.dtype,
+    device: torch.device,
+    min_dtype: float,
+    cache_position: torch.Tensor,
+    batch_size: int,
+):
+    """
+    Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
+    `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.
+
+    Args:
+        attention_mask (`torch.Tensor`):
+            A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape `(batch_size, 1, query_length, key_value_length)`.
+        sequence_length (`int`):
+            The sequence length being processed.
+        target_length (`int`):
+            The target length: when generating with static cache, the mask should be as long as the static cache, to account for the 0 padding, the part of the cache that is not filled yet.
+        dtype (`torch.dtype`):
+            The dtype to use for the 4D attention mask.
+        device (`torch.device`):
+            The device to plcae the 4D attention mask on.
+        min_dtype (`float`):
+            The minimum value representable with the dtype `dtype`.
+        cache_position (`torch.Tensor`):
+            Indices depicting the position of the input sequence tokens in the sequence.
+        batch_size (`torch.Tensor`):
+            Batch size.
+    """
+    if attention_mask is not None and attention_mask.dim() == 4:
+        # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing.
+        causal_mask = attention_mask
+    else:
+        causal_mask = torch.full((sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device)
+        if sequence_length != 1:
+            causal_mask = torch.triu(causal_mask, diagonal=1)
+        causal_mask *= torch.arange(target_length, device=device) > cache_position.reshape(-1, 1)
+        causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)
+        if attention_mask is not None:
+            causal_mask = causal_mask.clone()  # copy to contiguous memory for in-place edit
+            mask_length = attention_mask.shape[-1]
+            padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :]
+            padding_mask = padding_mask == 0
+            causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
+                padding_mask, min_dtype
+            )
+
+    return causal_mask
+
+
 def _cast_if_autocast_enabled(*args):
     if not torch.is_autocast_enabled():
         return args
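A tiny, self-contained illustration of what the helper added above builds: an upper-triangular causal fill combined with a mask over cache slots past each token's `cache_position` (printed as 1 where attention is allowed):

```python
import torch

sequence_length, target_length = 3, 5
min_dtype = torch.finfo(torch.float32).min
cache_position = torch.tensor([2, 3, 4])  # hypothetical positions of the 3 new tokens

causal = torch.full((sequence_length, target_length), fill_value=min_dtype)
causal = torch.triu(causal, diagonal=1)
causal *= torch.arange(target_length) > cache_position.reshape(-1, 1)
print((causal == 0).int())  # each row attends to cache slots 0..cache_position[row]
```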
@@ -902,27 +956,18 @@ class NemotronModel(NemotronPreTrainedModel):
             else past_seen_tokens + sequence_length + 1
         )

-        if attention_mask is not None and attention_mask.dim() == 4:
-            # in this case we assume that the mask comes already in inverted form and requires no inversion or slicing
-            if attention_mask.max() != 0:
-                raise ValueError("Custom 4D attention mask should be passed in inverted form with max==0`")
-            causal_mask = attention_mask
-        else:
-            causal_mask = torch.full(
-                (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device
-            )
-            if sequence_length != 1:
-                causal_mask = torch.triu(causal_mask, diagonal=1)
-            causal_mask *= torch.arange(target_length, device=device) > cache_position.reshape(-1, 1)
-            causal_mask = causal_mask[None, None, :, :].expand(input_tensor.shape[0], 1, -1, -1)
-            if attention_mask is not None:
-                causal_mask = causal_mask.clone()  # copy to contiguous memory for in-place edit
-                mask_length = attention_mask.shape[-1]
-                padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :]
-                padding_mask = padding_mask == 0
-                causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
-                    padding_mask, min_dtype
-                )
+        # In case the provided `attention` mask is 2D, we generate a causal mask here (4D).
+        causal_mask = _prepare_4d_causal_attention_mask_with_cache_position(
+            attention_mask,
+            sequence_length=sequence_length,
+            target_length=target_length,
+            dtype=dtype,
+            device=device,
+            min_dtype=min_dtype,
+            cache_position=cache_position,
+            batch_size=input_tensor.shape[0],
+        )

         if (
             self.config._attn_implementation == "sdpa"
             and attention_mask is not None
@@ -1086,11 +1131,37 @@ class NemotronForCausalLM(NemotronPreTrainedModel):
         if past_key_values:
             position_ids = position_ids[:, -input_ids.shape[1] :]

+            # This `clone` call is needed to avoid recapturing cuda graphs with `torch.compile`'s `mode="reduce-overhead`, as otherwise the input `position_ids` would have various stride during the decoding. Here, simply using `.contiguous()` is not sufficient as in the batch size = 1 case, `position_ids` is already contiguous but with varying stride which retriggers a capture.
+            position_ids = position_ids.clone(memory_format=torch.contiguous_format)
+
         # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
         if inputs_embeds is not None and cache_position[0] == 0:
-            model_inputs = {"inputs_embeds": inputs_embeds}
+            model_inputs = {"inputs_embeds": inputs_embeds, "input_ids": None}
         else:
-            model_inputs = {"input_ids": input_ids.contiguous()}  # `contiguous()` needed for compilation use cases
+            # The clone here is for the same reason as for `position_ids`.
+            model_inputs = {"input_ids": input_ids.clone(memory_format=torch.contiguous_format), "inputs_embeds": None}

+        if isinstance(past_key_values, StaticCache) and attention_mask.ndim == 2:
+            if model_inputs["inputs_embeds"] is not None:
+                batch_size, sequence_length, _ = model_inputs["inputs_embeds"].shape
+                device = model_inputs["inputs_embeds"].device
+            else:
+                batch_size, sequence_length = model_inputs["input_ids"].shape
+                device = model_inputs["input_ids"].device
+
+            dtype = self.lm_head.weight.dtype
+            min_dtype = torch.finfo(dtype).min
+
+            attention_mask = _prepare_4d_causal_attention_mask_with_cache_position(
+                attention_mask,
+                sequence_length=sequence_length,
+                target_length=past_key_values.get_max_length(),
+                dtype=dtype,
+                device=device,
+                min_dtype=min_dtype,
+                cache_position=cache_position,
+                batch_size=batch_size,
+            )

         model_inputs.update(
             {
@@ -1176,17 +1176,18 @@ class OlmoForCausalLM(OlmoPreTrainedModel):

         # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
         if inputs_embeds is not None and cache_position[0] == 0:
-            model_inputs = {"inputs_embeds": inputs_embeds}
+            model_inputs = {"inputs_embeds": inputs_embeds, "input_ids": None}
         else:
-            model_inputs = {"input_ids": input_ids}
+            # The clone here is for the same reason as for `position_ids`.
+            model_inputs = {"input_ids": input_ids.clone(memory_format=torch.contiguous_format), "inputs_embeds": None}

         if isinstance(past_key_values, StaticCache) and attention_mask.ndim == 2:
-            if inputs_embeds is not None:
-                batch_size, sequence_length = inputs_embeds.shape
-                device = inputs_embeds.device
+            if model_inputs["inputs_embeds"] is not None:
+                batch_size, sequence_length, _ = model_inputs["inputs_embeds"].shape
+                device = model_inputs["inputs_embeds"].device
             else:
-                batch_size, sequence_length = input_ids.shape
-                device = input_ids.device
+                batch_size, sequence_length = model_inputs["input_ids"].shape
+                device = model_inputs["input_ids"].device

             dtype = self.lm_head.weight.dtype
             min_dtype = torch.finfo(dtype).min
@ -993,17 +993,18 @@ class PersimmonForCausalLM(PersimmonPreTrainedModel):
|
||||
|
||||
# if `inputs_embeds` are passed, we only want to use them in the 1st generation step
|
||||
if inputs_embeds is not None and cache_position[0] == 0:
|
||||
model_inputs = {"inputs_embeds": inputs_embeds}
|
||||
model_inputs = {"inputs_embeds": inputs_embeds, "input_ids": None}
|
||||
else:
|
||||
model_inputs = {"input_ids": input_ids}
|
||||
# The clone here is for the same reason as for `position_ids`.
|
||||
model_inputs = {"input_ids": input_ids.clone(memory_format=torch.contiguous_format), "inputs_embeds": None}
|
||||
|
||||
if isinstance(past_key_values, StaticCache) and attention_mask.ndim == 2:
|
||||
if inputs_embeds is not None:
|
||||
batch_size, sequence_length = inputs_embeds.shape
|
||||
device = inputs_embeds.device
|
||||
if model_inputs["inputs_embeds"] is not None:
|
||||
batch_size, sequence_length, _ = model_inputs["inputs_embeds"].shape
|
||||
device = model_inputs["inputs_embeds"].device
|
||||
else:
|
||||
batch_size, sequence_length = input_ids.shape
|
||||
device = input_ids.device
|
||||
batch_size, sequence_length = model_inputs["input_ids"].shape
|
||||
device = model_inputs["input_ids"].device
|
||||
|
||||
dtype = self.lm_head.weight.dtype
|
||||
min_dtype = torch.finfo(dtype).min
|
||||
|
@ -1278,17 +1278,18 @@ class PhiForCausalLM(PhiPreTrainedModel):
|
||||
|
||||
# if `inputs_embeds` are passed, we only want to use them in the 1st generation step
|
||||
if inputs_embeds is not None and cache_position[0] == 0:
|
||||
model_inputs = {"inputs_embeds": inputs_embeds}
|
||||
model_inputs = {"inputs_embeds": inputs_embeds, "input_ids": None}
|
||||
else:
|
||||
model_inputs = {"input_ids": input_ids}
|
||||
# The clone here is for the same reason as for `position_ids`.
|
||||
model_inputs = {"input_ids": input_ids.clone(memory_format=torch.contiguous_format), "inputs_embeds": None}
|
||||
|
||||
if isinstance(past_key_values, StaticCache) and attention_mask.ndim == 2:
|
||||
if inputs_embeds is not None:
|
||||
batch_size, sequence_length = inputs_embeds.shape
|
||||
device = inputs_embeds.device
|
||||
if model_inputs["inputs_embeds"] is not None:
|
||||
batch_size, sequence_length, _ = model_inputs["inputs_embeds"].shape
|
||||
device = model_inputs["inputs_embeds"].device
|
||||
else:
|
||||
batch_size, sequence_length = input_ids.shape
|
||||
device = input_ids.device
|
||||
batch_size, sequence_length = model_inputs["input_ids"].shape
|
||||
device = model_inputs["input_ids"].device
|
||||
|
||||
dtype = self.lm_head.weight.dtype
|
||||
min_dtype = torch.finfo(dtype).min
|
||||
|
@ -540,7 +540,7 @@ class Phi3FlashAttention2(Phi3Attention):
|
||||
max(kv_seq_len, position_ids[:, -1].max().item() + 1) if position_ids is not None else kv_seq_len
|
||||
)
|
||||
|
||||
cos, sin = self.rotary_emb(value_states, seq_len=rotary_seq_len)
|
||||
cos, sin = self.rotary_emb(value_states, seq_len=rotary_seq_len, position_ids=position_ids)
|
||||
|
||||
query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
|
||||
|
||||
@ -1318,17 +1318,18 @@ class Phi3ForCausalLM(Phi3PreTrainedModel):
|
||||
|
||||
# if `inputs_embeds` are passed, we only want to use them in the 1st generation step
|
||||
if inputs_embeds is not None and cache_position[0] == 0:
|
||||
model_inputs = {"inputs_embeds": inputs_embeds}
|
||||
model_inputs = {"inputs_embeds": inputs_embeds, "input_ids": None}
|
||||
else:
|
||||
model_inputs = {"input_ids": input_ids}
|
||||
# The clone here is for the same reason as for `position_ids`.
|
||||
model_inputs = {"input_ids": input_ids.clone(memory_format=torch.contiguous_format), "inputs_embeds": None}
|
||||
|
||||
if isinstance(past_key_values, StaticCache) and attention_mask.ndim == 2:
|
||||
if inputs_embeds is not None:
|
||||
batch_size, sequence_length = inputs_embeds.shape
|
||||
device = inputs_embeds.device
|
||||
if model_inputs["inputs_embeds"] is not None:
|
||||
batch_size, sequence_length, _ = model_inputs["inputs_embeds"].shape
|
||||
device = model_inputs["inputs_embeds"].device
|
||||
else:
|
||||
batch_size, sequence_length = input_ids.shape
|
||||
device = input_ids.device
|
||||
batch_size, sequence_length = model_inputs["input_ids"].shape
|
||||
device = model_inputs["input_ids"].device
|
||||
|
||||
dtype = self.lm_head.weight.dtype
|
||||
min_dtype = torch.finfo(dtype).min
|
||||
|
@ -1176,17 +1176,18 @@ class Qwen2ForCausalLM(Qwen2PreTrainedModel):
|
||||
|
||||
# if `inputs_embeds` are passed, we only want to use them in the 1st generation step
|
||||
if inputs_embeds is not None and cache_position[0] == 0:
|
||||
model_inputs = {"inputs_embeds": inputs_embeds}
|
||||
model_inputs = {"inputs_embeds": inputs_embeds, "input_ids": None}
|
||||
else:
|
||||
model_inputs = {"input_ids": input_ids}
|
||||
# The clone here is for the same reason as for `position_ids`.
|
||||
model_inputs = {"input_ids": input_ids.clone(memory_format=torch.contiguous_format), "inputs_embeds": None}
|
||||
|
||||
if isinstance(past_key_values, StaticCache) and attention_mask.ndim == 2:
|
||||
if inputs_embeds is not None:
|
||||
batch_size, sequence_length = inputs_embeds.shape
|
||||
device = inputs_embeds.device
|
||||
if model_inputs["inputs_embeds"] is not None:
|
||||
batch_size, sequence_length, _ = model_inputs["inputs_embeds"].shape
|
||||
device = model_inputs["inputs_embeds"].device
|
||||
else:
|
||||
batch_size, sequence_length = input_ids.shape
|
||||
device = input_ids.device
|
||||
batch_size, sequence_length = model_inputs["input_ids"].shape
|
||||
device = model_inputs["input_ids"].device
|
||||
|
||||
dtype = self.lm_head.weight.dtype
|
||||
min_dtype = torch.finfo(dtype).min
|
||||
|
@ -1372,17 +1372,18 @@ class Qwen2MoeForCausalLM(Qwen2MoePreTrainedModel):
|
||||
|
||||
# if `inputs_embeds` are passed, we only want to use them in the 1st generation step
|
||||
if inputs_embeds is not None and cache_position[0] == 0:
|
||||
model_inputs = {"inputs_embeds": inputs_embeds}
|
||||
model_inputs = {"inputs_embeds": inputs_embeds, "input_ids": None}
|
||||
else:
|
||||
model_inputs = {"input_ids": input_ids}
|
||||
# The clone here is for the same reason as for `position_ids`.
|
||||
model_inputs = {"input_ids": input_ids.clone(memory_format=torch.contiguous_format), "inputs_embeds": None}
|
||||
|
||||
if isinstance(past_key_values, StaticCache) and attention_mask.ndim == 2:
|
||||
if inputs_embeds is not None:
|
||||
batch_size, sequence_length = inputs_embeds.shape
|
||||
device = inputs_embeds.device
|
||||
if model_inputs["inputs_embeds"] is not None:
|
||||
batch_size, sequence_length, _ = model_inputs["inputs_embeds"].shape
|
||||
device = model_inputs["inputs_embeds"].device
|
||||
else:
|
||||
batch_size, sequence_length = input_ids.shape
|
||||
device = input_ids.device
|
||||
batch_size, sequence_length = model_inputs["input_ids"].shape
|
||||
device = model_inputs["input_ids"].device
|
||||
|
||||
dtype = self.lm_head.weight.dtype
|
||||
min_dtype = torch.finfo(dtype).min
|
||||
|
@ -818,6 +818,7 @@ class RecurrentGemmaForCausalLM(RecurrentGemmaPreTrainedModel):
|
||||
def forward(
|
||||
self,
|
||||
input_ids: Optional[torch.LongTensor] = None,
|
||||
position_ids: Optional[torch.LongTensor] = None,
|
||||
cache_position: Optional[torch.LongTensor] = None,
|
||||
attention_mask: Optional[torch.Tensor] = None,
|
||||
inputs_embeds: Optional[torch.FloatTensor] = None,
|
||||
@ -858,6 +859,7 @@ class RecurrentGemmaForCausalLM(RecurrentGemmaPreTrainedModel):
|
||||
output_hidden_states = True
|
||||
outputs = self.model(
|
||||
input_ids=input_ids,
|
||||
position_ids=position_ids,
|
||||
cache_position=cache_position,
|
||||
attention_mask=attention_mask,
|
||||
inputs_embeds=inputs_embeds,
|
||||
@ -913,13 +915,17 @@ class RecurrentGemmaForCausalLM(RecurrentGemmaPreTrainedModel):
|
||||
if past_length > 0:
|
||||
position_ids = position_ids[:, past_length:]
|
||||
|
||||
if inputs_embeds is not None:
|
||||
model_inputs = {"inputs_embeds": inputs_embeds[:, past_length:]}
|
||||
else:
|
||||
model_inputs = {"input_ids": input_ids[:, past_length:].contiguous()}
|
||||
if inputs_embeds is not None: # Exception 1
|
||||
input_ids = input_ids[:, -cache_position.shape[0] :]
|
||||
elif input_ids.shape[1] != cache_position.shape[0]: # Default case (the "else", a no op, is Exception 2)
|
||||
input_ids = input_ids[:, cache_position]
|
||||
|
||||
if cache_position is not None:
|
||||
cache_position = cache_position[-position_ids.shape[1] :]
|
||||
# if `inputs_embeds` are passed, we only want to use them in the 1st generation step
|
||||
if inputs_embeds is not None and cache_position[0] == 0:
|
||||
model_inputs = {"inputs_embeds": inputs_embeds, "input_ids": None}
|
||||
else:
|
||||
# The clone here is for the same reason as for `position_ids`.
|
||||
model_inputs = {"input_ids": input_ids.clone(memory_format=torch.contiguous_format), "inputs_embeds": None}
|
||||
|
||||
model_inputs.update(
|
||||
{
|
||||
|
@ -1271,17 +1271,18 @@ class StableLmForCausalLM(StableLmPreTrainedModel):
|
||||
|
||||
# if `inputs_embeds` are passed, we only want to use them in the 1st generation step
|
||||
if inputs_embeds is not None and cache_position[0] == 0:
|
||||
model_inputs = {"inputs_embeds": inputs_embeds}
|
||||
model_inputs = {"inputs_embeds": inputs_embeds, "input_ids": None}
|
||||
else:
|
||||
model_inputs = {"input_ids": input_ids}
|
||||
# The clone here is for the same reason as for `position_ids`.
|
||||
model_inputs = {"input_ids": input_ids.clone(memory_format=torch.contiguous_format), "inputs_embeds": None}
|
||||
|
||||
if isinstance(past_key_values, StaticCache) and attention_mask.ndim == 2:
|
||||
if inputs_embeds is not None:
|
||||
batch_size, sequence_length = inputs_embeds.shape
|
||||
device = inputs_embeds.device
|
||||
if model_inputs["inputs_embeds"] is not None:
|
||||
batch_size, sequence_length, _ = model_inputs["inputs_embeds"].shape
|
||||
device = model_inputs["inputs_embeds"].device
|
||||
else:
|
||||
batch_size, sequence_length = input_ids.shape
|
||||
device = input_ids.device
|
||||
batch_size, sequence_length = model_inputs["input_ids"].shape
|
||||
device = model_inputs["input_ids"].device
|
||||
|
||||
dtype = self.lm_head.weight.dtype
|
||||
min_dtype = torch.finfo(dtype).min
|
||||
|
@ -1152,17 +1152,18 @@ class Starcoder2ForCausalLM(Starcoder2PreTrainedModel):
|
||||
|
||||
# if `inputs_embeds` are passed, we only want to use them in the 1st generation step
|
||||
if inputs_embeds is not None and cache_position[0] == 0:
|
||||
model_inputs = {"inputs_embeds": inputs_embeds}
|
||||
model_inputs = {"inputs_embeds": inputs_embeds, "input_ids": None}
|
||||
else:
|
||||
model_inputs = {"input_ids": input_ids}
|
||||
# The clone here is for the same reason as for `position_ids`.
|
||||
model_inputs = {"input_ids": input_ids.clone(memory_format=torch.contiguous_format), "inputs_embeds": None}
|
||||
|
||||
if isinstance(past_key_values, StaticCache) and attention_mask.ndim == 2:
|
||||
if inputs_embeds is not None:
|
||||
batch_size, sequence_length = inputs_embeds.shape
|
||||
device = inputs_embeds.device
|
||||
if model_inputs["inputs_embeds"] is not None:
|
||||
batch_size, sequence_length, _ = model_inputs["inputs_embeds"].shape
|
||||
device = model_inputs["inputs_embeds"].device
|
||||
else:
|
||||
batch_size, sequence_length = input_ids.shape
|
||||
device = input_ids.device
|
||||
batch_size, sequence_length = model_inputs["input_ids"].shape
|
||||
device = model_inputs["input_ids"].device
|
||||
|
||||
dtype = self.lm_head.weight.dtype
|
||||
min_dtype = torch.finfo(dtype).min
|
||||
|
@ -653,9 +653,6 @@ class VideoLlavaForConditionalGeneration(VideoLlavaPreTrainedModel):
|
||||
if cache_length < past_length and attention_mask is not None:
|
||||
attention_mask = attention_mask[:, -(cache_length + input_ids.shape[1]) :]
|
||||
|
||||
pixel_values_videos = None
|
||||
pixel_values_images = None
|
||||
|
||||
position_ids = kwargs.get("position_ids", None)
|
||||
if attention_mask is not None and position_ids is None:
|
||||
# create position_ids on the fly for batch generation
|
||||
|
@ -12,7 +12,6 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import importlib
|
||||
import inspect
|
||||
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union
|
||||
|
||||
from packaging import version
|
||||
@ -200,16 +199,11 @@ class Bnb4BitHfQuantizer(HfQuantizer):
|
||||
if unexpected_keys is not None and k in unexpected_keys:
|
||||
unexpected_keys.remove(k)
|
||||
|
||||
param_kwargs = {}
|
||||
sig = inspect.signature(bnb.nn.Params4bit.from_prequantized)
|
||||
if "module" in sig.parameters:
|
||||
param_kwargs["module"] = module
|
||||
new_value = bnb.nn.Params4bit.from_prequantized(
|
||||
data=param_value,
|
||||
quantized_stats=quantized_stats,
|
||||
requires_grad=False,
|
||||
device=target_device,
|
||||
**param_kwargs,
|
||||
)
|
||||
else:
|
||||
new_value = param_value.to("cpu")
|
||||
|
@ -100,7 +100,6 @@ from .trainer_pt_utils import (
|
||||
get_model_param_count,
|
||||
get_module_class_from_name,
|
||||
get_parameter_names,
|
||||
is_deepspeed_zero3_enabled,
|
||||
nested_concat,
|
||||
nested_detach,
|
||||
nested_numpify,
|
||||
@ -436,15 +435,6 @@ class Trainer:
|
||||
)
|
||||
self.model_init = model_init
|
||||
|
||||
# Will reach this branch if the user has
|
||||
# 1. Used `.from_pretrained` or `.from_config` to initialize their model
|
||||
# 2. Did not configure Zero-3 via `TrainingArguments` or `accelerate launch` beforehand
|
||||
# New models init such as `MyModel()` will not hit this step
|
||||
if is_deepspeed_zero3_enabled() and not getattr(model, "_transformers_zero3_init_used", True):
|
||||
raise ValueError(
|
||||
"Model was not initialized with `Zero-3` despite being configured for DeepSpeed Zero-3. Please re-initialize your model via `Model.from_pretrained(...)` or `Model.from_config(...)` after creating your `TrainingArguments`!"
|
||||
)
|
||||
|
||||
if model.__class__.__name__ in MODEL_MAPPING_NAMES:
|
||||
raise ValueError(
|
||||
f"The model you have picked ({model.__class__.__name__}) cannot be used as is for training: it only "
|
||||
|
@ -692,7 +692,7 @@ def is_torchdynamo_compiling():
|
||||
import torch
|
||||
|
||||
return torch.compiler.is_compiling()
|
||||
except AttributeError:
|
||||
except Exception:
|
||||
try:
|
||||
import torch._dynamo as dynamo # noqa: F401
|
||||
|
||||
|
@ -331,6 +331,21 @@ def warning_once(self, *args, **kwargs):
|
||||
logging.Logger.warning_once = warning_once
|
||||
|
||||
|
||||
@functools.lru_cache(None)
|
||||
def info_once(self, *args, **kwargs):
|
||||
"""
|
||||
This method is identical to `logger.info()`, but will emit the info with the same message only once
|
||||
|
||||
Note: The cache is for the function arguments, so 2 different callers using the same arguments will hit the cache.
|
||||
The assumption here is that all warning messages are unique across the code. If they aren't then need to switch to
|
||||
another type of cache that includes the caller frame information in the hashing function.
|
||||
"""
|
||||
self.info(*args, **kwargs)
|
||||
|
||||
|
||||
logging.Logger.info_once = info_once
|
||||
|
||||
|
||||
class EmptyTqdm:
|
||||
"""Dummy tqdm which doesn't do anything."""
|
||||
|
||||
|
@ -709,30 +709,34 @@ class TrainerIntegrationDeepSpeed(TrainerIntegrationDeepSpeedWithCustomConfig, T
|
||||
# Relative difference. See the note above how to get identical loss on a small bs
|
||||
self.assertTrue((no_grad_accum_loss - yes_grad_accum_loss) / (no_grad_accum_loss + 1e-15) <= 1e-3)
|
||||
|
||||
def test_missed_zero3_init(self):
|
||||
from transformers import Trainer # noqa
|
||||
# NOTE: Currently a disabled test. In the future we should re-enable it.
|
||||
# Issue resolves around Zero-3 w/ DPO/TRL + DeepSpeed
|
||||
# As well as Zero-3 inference
|
||||
# Related PR: https://github.com/huggingface/transformers/pull/32299
|
||||
# def test_missed_zero3_init(self):
|
||||
# from transformers import Trainer # noqa
|
||||
|
||||
with mockenv_context(**self.dist_env_1_gpu):
|
||||
model = AutoModel.from_pretrained(T5_TINY)
|
||||
training_args = TrainingArguments(
|
||||
output_dir="./test_missed_zero3_init",
|
||||
deepspeed=self.get_config_dict(ZERO3),
|
||||
)
|
||||
with self.assertRaises(
|
||||
ValueError, msg="Model was not initialized with `Zero-3` despite being configured."
|
||||
):
|
||||
_ = Trainer(
|
||||
model=model,
|
||||
args=training_args,
|
||||
)
|
||||
# Now do it properly, triggered from our `TrainingArguments` earlier
|
||||
model = AutoModel.from_pretrained(T5_TINY)
|
||||
trainer = Trainer(
|
||||
model=model,
|
||||
args=training_args,
|
||||
)
|
||||
assert trainer.is_deepspeed_enabled
|
||||
assert model._transformers_zero3_init_used
|
||||
# with mockenv_context(**self.dist_env_1_gpu):
|
||||
# model = AutoModel.from_pretrained(T5_TINY)
|
||||
# training_args = TrainingArguments(
|
||||
# output_dir="./test_missed_zero3_init",
|
||||
# deepspeed=self.get_config_dict(ZERO3),
|
||||
# )
|
||||
# with self.assertRaises(
|
||||
# ValueError, msg="Model was not initialized with `Zero-3` despite being configured."
|
||||
# ):
|
||||
# _ = Trainer(
|
||||
# model=model,
|
||||
# args=training_args,
|
||||
# )
|
||||
# # Now do it properly, triggered from our `TrainingArguments` earlier
|
||||
# model = AutoModel.from_pretrained(T5_TINY)
|
||||
# trainer = Trainer(
|
||||
# model=model,
|
||||
# args=training_args,
|
||||
# )
|
||||
# assert trainer.is_deepspeed_enabled
|
||||
# assert model._transformers_zero3_init_used
|
||||
|
||||
def check_saved_checkpoints_deepspeed(self, output_dir, freq, total, stage, dtype):
|
||||
# adapted from TrainerIntegrationCommon.check_saved_checkpoints
|
||||
|
@ -1540,3 +1540,8 @@ class BartStandaloneDecoderModelTest(ModelTesterMixin, GenerationTesterMixin, un
|
||||
@unittest.skip
|
||||
def test_save_load_fast_init_from_base(self):
|
||||
pass
|
||||
|
||||
@unittest.skip(reason="Generate needs input ids")
|
||||
def test_inputs_embeds_matches_input_ids_with_generate(self):
|
||||
# generate only works with input ids for bartforcausalLM
|
||||
pass
|
||||
|
@ -502,6 +502,11 @@ class BertModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin
|
||||
config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder()
|
||||
self.model_tester.create_and_check_model_as_decoder(*config_and_inputs)
|
||||
|
||||
@unittest.skip(reason="Generate needs input ids")
|
||||
def test_inputs_embeds_matches_input_ids_with_generate(self):
|
||||
# generate only works with input ids for bertforcausalLM
|
||||
pass
|
||||
|
||||
def test_model_as_decoder_with_default_input_mask(self):
|
||||
# This regression test was failing with PyTorch < 1.3
|
||||
(
|
||||
|
@ -21,6 +21,7 @@ import unittest
|
||||
|
||||
import numpy as np
|
||||
import requests
|
||||
from parameterized import parameterized
|
||||
|
||||
from transformers import BlipConfig, BlipTextConfig, BlipVisionConfig
|
||||
from transformers.testing_utils import (
|
||||
@ -1106,6 +1107,7 @@ class BlipTextRetrievalModelTest(ModelTesterMixin, unittest.TestCase):
|
||||
@require_torch
|
||||
class BlipTextImageModelTest(ModelTesterMixin, unittest.TestCase):
|
||||
all_model_classes = (BlipForConditionalGeneration,) if is_torch_available() else ()
|
||||
all_generative_model_classes = (BlipForConditionalGeneration,) if is_torch_available() else ()
|
||||
fx_compatible = False
|
||||
test_head_masking = False
|
||||
test_pruning = False
|
||||
@ -1116,6 +1118,18 @@ class BlipTextImageModelTest(ModelTesterMixin, unittest.TestCase):
|
||||
def setUp(self):
|
||||
self.model_tester = BlipTextImageModelsModelTester(self)
|
||||
|
||||
@parameterized.expand([(True,), (False,)])
|
||||
def test_greedy_generation(self, use_cache: bool):
|
||||
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
|
||||
|
||||
for model_class in self.all_generative_model_classes:
|
||||
model = model_class(config)
|
||||
model.to(torch_device)
|
||||
model.eval()
|
||||
|
||||
out = model.generate(**inputs_dict, min_new_tokens=20, max_new_tokens=20, use_cache=use_cache)
|
||||
self.assertTrue(out.shape[1] == inputs_dict["input_ids"].shape[1] + 19)
|
||||
|
||||
def test_model(self):
|
||||
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
||||
self.model_tester.create_and_check_model(*config_and_inputs)
|
||||
|
@ -20,6 +20,7 @@ import unittest
|
||||
|
||||
import numpy as np
|
||||
import requests
|
||||
from parameterized import parameterized
|
||||
|
||||
from transformers import CONFIG_MAPPING, Blip2Config, Blip2QFormerConfig, Blip2VisionConfig
|
||||
from transformers.testing_utils import (
|
||||
@ -314,7 +315,7 @@ class Blip2TextModelDecoderOnlyTester:
|
||||
hidden_act="gelu",
|
||||
hidden_dropout_prob=0.1,
|
||||
attention_probs_dropout_prob=0.1,
|
||||
max_position_embeddings=20,
|
||||
max_position_embeddings=256,
|
||||
eos_token_id=2,
|
||||
pad_token_id=1,
|
||||
bos_token_id=0,
|
||||
@ -436,8 +437,9 @@ class Blip2ForConditionalGenerationDecoderOnlyModelTester:
|
||||
|
||||
|
||||
@require_torch
|
||||
class Blip2ForConditionalGenerationDecoderOnlyTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase):
|
||||
class Blip2ForConditionalGenerationDecoderOnlyTest(ModelTesterMixin, unittest.TestCase):
|
||||
all_model_classes = (Blip2ForConditionalGeneration,) if is_torch_available() else ()
|
||||
all_generative_model_classes = (Blip2ForConditionalGeneration,) if is_torch_available() else ()
|
||||
fx_compatible = False
|
||||
test_head_masking = False
|
||||
test_pruning = False
|
||||
@ -448,6 +450,18 @@ class Blip2ForConditionalGenerationDecoderOnlyTest(ModelTesterMixin, GenerationT
|
||||
def setUp(self):
|
||||
self.model_tester = Blip2ForConditionalGenerationDecoderOnlyModelTester(self)
|
||||
|
||||
@parameterized.expand([(True,), (False,)])
|
||||
def test_greedy_generation(self, use_cache: bool):
|
||||
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
|
||||
|
||||
for model_class in self.all_generative_model_classes:
|
||||
model = model_class(config)
|
||||
model.to(torch_device)
|
||||
model.eval()
|
||||
|
||||
out = model.generate(**inputs_dict, min_new_tokens=20, max_new_tokens=20, use_cache=use_cache)
|
||||
self.assertTrue(out.shape[1] == 21) # BLIP is special, so should be 21
|
||||
|
||||
def test_for_conditional_generation(self):
|
||||
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
||||
self.model_tester.create_and_check_for_conditional_generation(*config_and_inputs)
|
||||
|
@ -20,6 +20,7 @@ import unittest
|
||||
|
||||
import numpy as np
|
||||
import requests
|
||||
from parameterized import parameterized
|
||||
|
||||
from transformers import (
|
||||
CONFIG_MAPPING,
|
||||
@ -38,7 +39,6 @@ from transformers.testing_utils import (
|
||||
)
|
||||
from transformers.utils import is_torch_available, is_vision_available
|
||||
|
||||
from ...generation.test_utils import GenerationTesterMixin
|
||||
from ...test_configuration_common import ConfigTester
|
||||
from ...test_modeling_common import (
|
||||
ModelTesterMixin,
|
||||
@ -319,7 +319,7 @@ class InstructBlipTextModelDecoderOnlyTester:
|
||||
hidden_act="gelu",
|
||||
hidden_dropout_prob=0.1,
|
||||
attention_probs_dropout_prob=0.1,
|
||||
max_position_embeddings=20,
|
||||
max_position_embeddings=256,
|
||||
eos_token_id=2,
|
||||
pad_token_id=1,
|
||||
bos_token_id=0,
|
||||
@ -452,8 +452,9 @@ class InstructBlipForConditionalGenerationDecoderOnlyModelTester:
|
||||
|
||||
|
||||
@require_torch
|
||||
class InstructBlipForConditionalGenerationDecoderOnlyTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase):
|
||||
class InstructBlipForConditionalGenerationDecoderOnlyTest(ModelTesterMixin, unittest.TestCase):
|
||||
all_model_classes = (InstructBlipForConditionalGeneration,) if is_torch_available() else ()
|
||||
all_generative_model_classes = (InstructBlipForConditionalGeneration,) if is_torch_available() else ()
|
||||
fx_compatible = False
|
||||
test_head_masking = False
|
||||
test_pruning = False
|
||||
@ -464,6 +465,19 @@ class InstructBlipForConditionalGenerationDecoderOnlyTest(ModelTesterMixin, Gene
|
||||
def setUp(self):
|
||||
self.model_tester = InstructBlipForConditionalGenerationDecoderOnlyModelTester(self)
|
||||
|
||||
@parameterized.expand([(True,), (False,)])
|
||||
def test_greedy_generation(self, use_cache: bool):
|
||||
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
|
||||
|
||||
for model_class in self.all_generative_model_classes:
|
||||
model = model_class(config)
|
||||
model.to(torch_device)
|
||||
model.eval()
|
||||
model.config.text_config.architectures = ["OptForCausalLM"]
|
||||
|
||||
out = model.generate(**inputs_dict, min_new_tokens=20, max_new_tokens=20, use_cache=use_cache)
|
||||
self.assertTrue(out.shape[1] == 21) # BLIP is special, therefore 21
|
||||
|
||||
def test_for_conditional_generation(self):
|
||||
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
||||
self.model_tester.create_and_check_for_conditional_generation(*config_and_inputs)
|
||||
|
@ -20,6 +20,7 @@ import unittest
|
||||
|
||||
import numpy as np
|
||||
from huggingface_hub import hf_hub_download
|
||||
from parameterized import parameterized
|
||||
|
||||
from transformers import (
|
||||
CONFIG_MAPPING,
|
||||
@ -38,7 +39,6 @@ from transformers.testing_utils import (
|
||||
)
|
||||
from transformers.utils import is_torch_available, is_vision_available
|
||||
|
||||
from ...generation.test_utils import GenerationTesterMixin
|
||||
from ...test_configuration_common import ConfigTester
|
||||
from ...test_modeling_common import (
|
||||
ModelTesterMixin,
|
||||
@ -333,7 +333,7 @@ class InstructBlipVideoTextModelDecoderOnlyTester:
|
||||
hidden_act="gelu",
|
||||
hidden_dropout_prob=0.1,
|
||||
attention_probs_dropout_prob=0.1,
|
||||
max_position_embeddings=100,
|
||||
max_position_embeddings=256,
|
||||
eos_token_id=2,
|
||||
pad_token_id=1,
|
||||
bos_token_id=0,
|
||||
@ -471,10 +471,9 @@ class InstructBlipVideoForConditionalGenerationDecoderOnlyModelTester:
|
||||
|
||||
|
||||
@require_torch
|
||||
class InstructBlipVideoForConditionalGenerationDecoderOnlyTest(
|
||||
ModelTesterMixin, GenerationTesterMixin, unittest.TestCase
|
||||
):
|
||||
class InstructBlipVideoForConditionalGenerationDecoderOnlyTest(ModelTesterMixin, unittest.TestCase):
|
||||
all_model_classes = (InstructBlipVideoForConditionalGeneration,) if is_torch_available() else ()
|
||||
all_generative_model_classes = (InstructBlipVideoForConditionalGeneration,) if is_torch_available() else ()
|
||||
fx_compatible = False
|
||||
test_head_masking = False
|
||||
test_pruning = False
|
||||
@ -485,6 +484,19 @@ class InstructBlipVideoForConditionalGenerationDecoderOnlyTest(
|
||||
def setUp(self):
|
||||
self.model_tester = InstructBlipVideoForConditionalGenerationDecoderOnlyModelTester(self)
|
||||
|
||||
@parameterized.expand([(True,), (False,)])
|
||||
def test_greedy_generation(self, use_cache: bool):
|
||||
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
|
||||
|
||||
for model_class in self.all_generative_model_classes:
|
||||
model = model_class(config)
|
||||
model.to(torch_device)
|
||||
model.eval()
|
||||
model.config.text_config.architectures = ["OptForCausalLM"]
|
||||
|
||||
out = model.generate(**inputs_dict, min_new_tokens=20, max_new_tokens=20, use_cache=use_cache)
|
||||
self.assertTrue(out.shape[1] == 21) # BLIP is special, therefore 21
|
||||
|
||||
def test_for_conditional_generation(self):
|
||||
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
||||
self.model_tester.create_and_check_for_conditional_generation(*config_and_inputs)
|
||||
|
@ -281,6 +281,17 @@ class Kosmos2ModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase)
|
||||
self.model_tester = Kosmos2ModelTester(self)
|
||||
self.config_tester = ConfigTester(self, config_class=Kosmos2Config, hidden_size=37)
|
||||
|
||||
def test_greedy_generation(self):
|
||||
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
|
||||
|
||||
for model_class in self.all_generative_model_classes:
|
||||
model = model_class(config)
|
||||
model.to(torch_device)
|
||||
model.eval()
|
||||
|
||||
out = model.generate(**inputs_dict, min_new_tokens=20, max_new_tokens=20)
|
||||
self.assertTrue(out.shape[1] == inputs_dict["input_ids"].shape[1] + 20)
|
||||
|
||||
# overwrite from common to skip `image_to_text_projection.latent_query`
|
||||
def test_initialization(self):
|
||||
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
|
||||
|
@ -18,6 +18,7 @@ import gc
|
||||
import unittest
|
||||
|
||||
import requests
|
||||
from parameterized import parameterized
|
||||
|
||||
from transformers import (
|
||||
AutoProcessor,
|
||||
@ -80,7 +81,7 @@ class LlavaVisionText2TextModelTester:
|
||||
"initializer_range": 0.02,
|
||||
"num_labels": 3,
|
||||
"num_choices": 4,
|
||||
"pad_token_id": 0,
|
||||
"pad_token_id": 1,
|
||||
},
|
||||
is_training=True,
|
||||
vision_config={
|
||||
@ -148,6 +149,8 @@ class LlavaVisionText2TextModelTester:
|
||||
config, pixel_values = config_and_inputs
|
||||
input_ids = ids_tensor([self.batch_size, self.seq_length], config.text_config.vocab_size - 1) + 1
|
||||
attention_mask = input_ids.ne(1).to(torch_device)
|
||||
# set to random non-image token to prevent flakiness
|
||||
input_ids[input_ids == config.image_token_index] = 1
|
||||
# we are giving 3 images let's make sure we pass in 3 image tokens
|
||||
input_ids[:, 1] = config.image_token_index
|
||||
inputs_dict = {
|
||||
@ -178,6 +181,7 @@ class LlavaForConditionalGenerationModelTest(ModelTesterMixin, unittest.TestCase
|
||||
"""
|
||||
|
||||
all_model_classes = (LlavaForConditionalGeneration,) if is_torch_available() else ()
|
||||
all_generative_model_classes = (LlavaForConditionalGeneration,) if is_torch_available() else ()
|
||||
pipeline_model_mapping = {"image-to-text": LlavaForConditionalGeneration} if is_torch_available() else {}
|
||||
test_pruning = False
|
||||
test_head_masking = False
|
||||
@ -186,6 +190,24 @@ class LlavaForConditionalGenerationModelTest(ModelTesterMixin, unittest.TestCase
|
||||
self.model_tester = LlavaVisionText2TextModelTester(self)
|
||||
self.config_tester = ConfigTester(self, config_class=LlavaConfig, has_text_modality=False)
|
||||
|
||||
@parameterized.expand([(True,), (False,)])
|
||||
def test_greedy_generation(self, use_cache: bool):
|
||||
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
|
||||
|
||||
for model_class in self.all_generative_model_classes:
|
||||
model = model_class(config)
|
||||
model.to(torch_device)
|
||||
model.eval()
|
||||
|
||||
out = model.generate(
|
||||
**inputs_dict,
|
||||
min_new_tokens=20,
|
||||
max_new_tokens=20,
|
||||
use_cache=use_cache,
|
||||
bad_words_ids=[[config.image_token_index]],
|
||||
)
|
||||
self.assertTrue(out.shape[1] == inputs_dict["input_ids"].shape[1] + 20)
|
||||
|
||||
@unittest.skip(
|
||||
reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
|
||||
)
|
||||
|
@ -19,6 +19,7 @@ import unittest
|
||||
|
||||
import requests
|
||||
from huggingface_hub import hf_hub_download
|
||||
from parameterized import parameterized
|
||||
|
||||
from transformers import (
|
||||
AutoProcessor,
|
||||
@ -34,7 +35,6 @@ from transformers.testing_utils import (
|
||||
torch_device,
|
||||
)
|
||||
|
||||
from ...generation.test_utils import GenerationTesterMixin
|
||||
from ...test_configuration_common import ConfigTester
|
||||
from ...test_modeling_common import (
|
||||
ModelTesterMixin,
|
||||
@ -86,7 +86,7 @@ class LlavaNextVisionText2TextModelTester:
|
||||
"initializer_range": 0.02,
|
||||
"num_labels": 3,
|
||||
"num_choices": 4,
|
||||
"pad_token_id": 0,
|
||||
"pad_token_id": 1,
|
||||
},
|
||||
is_training=True,
|
||||
vision_config={
|
||||
@ -157,6 +157,8 @@ class LlavaNextVisionText2TextModelTester:
|
||||
config, pixel_values = config_and_inputs
|
||||
input_ids = ids_tensor([self.batch_size, self.seq_length], config.text_config.vocab_size - 2) + 2
|
||||
attention_mask = torch.ones(input_ids.shape, dtype=torch.long).to(torch_device)
|
||||
# set to random non-image token to prevent flakiness
|
||||
input_ids[input_ids == config.image_token_index] = 2
|
||||
# we are giving 3 images let's make sure we pass in 3 image tokens
|
||||
input_ids[:, 1] = config.image_token_index
|
||||
labels = torch.zeros((self.batch_size, self.seq_length), dtype=torch.long, device=torch_device)
|
||||
@ -208,12 +210,13 @@ class LlavaNextVisionText2TextModelTester:
|
||||
|
||||
|
||||
@require_torch
|
||||
class LlavaNextForConditionalGenerationModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase):
|
||||
class LlavaNextForConditionalGenerationModelTest(ModelTesterMixin, unittest.TestCase):
|
||||
"""
|
||||
Model tester for `LlavaNextForConditionalGeneration`.
|
||||
"""
|
||||
|
||||
all_model_classes = (LlavaNextForConditionalGeneration,) if is_torch_available() else ()
|
||||
all_generative_model_classes = (LlavaNextForConditionalGeneration,) if is_torch_available() else ()
|
||||
test_pruning = False
|
||||
test_head_masking = False
|
||||
|
||||
@ -237,6 +240,24 @@ class LlavaNextForConditionalGenerationModelTest(ModelTesterMixin, GenerationTes
|
||||
msg=f"Parameter {name} of model {model_class} seems not properly initialized",
|
||||
)
|
||||
|
||||
@parameterized.expand([(True,), (False,)])
|
||||
def test_greedy_generation(self, use_cache: bool):
|
||||
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
|
||||
|
||||
for model_class in self.all_generative_model_classes:
|
||||
model = model_class(config)
|
||||
model.to(torch_device)
|
||||
model.eval()
|
||||
|
||||
out = model.generate(
|
||||
**inputs_dict,
|
||||
min_new_tokens=20,
|
||||
max_new_tokens=20,
|
||||
use_cache=use_cache,
|
||||
bad_words_ids=[[config.image_token_index]],
|
||||
)
|
||||
self.assertTrue(out.shape[1] == inputs_dict["input_ids"].shape[1] + 20)
|
||||
|
||||
@unittest.skip(
|
||||
reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
|
||||
)
|
||||
|
@ -19,6 +19,7 @@ import unittest
|
||||
|
||||
import numpy as np
|
||||
from huggingface_hub import hf_hub_download
|
||||
from parameterized import parameterized
|
||||
|
||||
from transformers import (
|
||||
AutoProcessor,
|
||||
@ -34,7 +35,6 @@ from transformers.testing_utils import (
|
||||
torch_device,
|
||||
)
|
||||
|
||||
from ...generation.test_utils import GenerationTesterMixin
|
||||
from ...test_configuration_common import ConfigTester
|
||||
from ...test_modeling_common import (
|
||||
ModelTesterMixin,
|
||||
@ -86,7 +86,7 @@ class LlavaNextVideoVisionText2TextModelTester:
|
||||
"initializer_range": 0.02,
|
||||
"num_labels": 3,
|
||||
"num_choices": 4,
|
||||
"pad_token_id": 0,
|
||||
"pad_token_id": 1,
|
||||
},
|
||||
is_training=True,
|
||||
vision_config={
|
||||
@ -167,6 +167,9 @@ class LlavaNextVideoVisionText2TextModelTester:
|
||||
config, pixel_values, pixel_values_videos = self.prepare_config_and_inputs()
|
||||
input_ids = ids_tensor([self.batch_size, self.seq_length], config.text_config.vocab_size - 2) + 2
|
||||
attention_mask = torch.ones(input_ids.shape, dtype=torch.long).to(torch_device)
|
||||
# set to random non-image token to prevent flakiness
|
||||
input_ids[input_ids == config.image_token_index] = 2
|
||||
input_ids[input_ids == config.video_token_index] = 2
|
||||
# we are giving 3 images and videos let's make sure we pass in 3 special tokens
|
||||
input_ids[:, 1] = config.image_token_index
|
||||
input_ids[:, 2] = config.video_token_index
|
||||
@ -223,12 +226,13 @@ class LlavaNextVideoVisionText2TextModelTester:
|
||||
|
||||
|
||||
@require_torch
|
||||
class LlavaNextVideoForConditionalGenerationModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase):
|
||||
class LlavaNextVideoForConditionalGenerationModelTest(ModelTesterMixin, unittest.TestCase):
|
||||
"""
|
||||
Model tester for `LlavaNextVideoForConditionalGeneration`.
|
||||
"""
|
||||
|
||||
all_model_classes = (LlavaNextVideoForConditionalGeneration,) if is_torch_available() else ()
|
||||
all_generative_model_classes = (LlavaNextVideoForConditionalGeneration,) if is_torch_available() else ()
|
||||
test_pruning = False
|
||||
test_head_masking = False
|
||||
|
||||
@ -274,6 +278,24 @@ class LlavaNextVideoForConditionalGenerationModelTest(ModelTesterMixin, Generati
|
||||
with torch.no_grad():
|
||||
model(**inputs)
|
||||
|
||||
@parameterized.expand([(True,), (False,)])
|
||||
def test_greedy_generation(self, use_cache: bool):
|
||||
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
|
||||
|
||||
for model_class in self.all_generative_model_classes:
|
||||
model = model_class(config)
|
||||
model.to(torch_device)
|
||||
model.eval()
|
||||
|
||||
out = model.generate(
|
||||
**inputs_dict,
|
||||
min_new_tokens=20,
|
||||
max_new_tokens=20,
|
||||
use_cache=use_cache,
|
||||
bad_words_ids=[[config.image_token_index], [config.video_token_index]],
|
||||
)
|
||||
self.assertTrue(out.shape[1] == inputs_dict["input_ids"].shape[1] + 20)
|
||||
|
||||
@unittest.skip(
|
||||
reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
|
||||
)
|
||||
|
@ -176,6 +176,7 @@ class PaliGemmaForConditionalGenerationModelTest(ModelTesterMixin, unittest.Test
|
||||
"""
|
||||
|
||||
all_model_classes = (PaliGemmaForConditionalGeneration,) if is_torch_available() else ()
|
||||
all_generative_model_classes = (PaliGemmaForConditionalGeneration,) if is_torch_available() else ()
|
||||
fx_compatible = False
|
||||
test_pruning = False
|
||||
test_torchscript = False
|
||||
@ -185,6 +186,18 @@ class PaliGemmaForConditionalGenerationModelTest(ModelTesterMixin, unittest.Test
|
||||
self.model_tester = PaliGemmaVisionText2TextModelTester(self)
|
||||
self.config_tester = ConfigTester(self, config_class=PaliGemmaConfig, has_text_modality=False)
|
||||
|
||||
@parameterized.expand([(True,), (False,)])
|
||||
def test_greedy_generation(self, use_cache: bool):
|
||||
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
|
||||
|
||||
for model_class in self.all_generative_model_classes:
|
||||
model = model_class(config)
|
||||
model.to(torch_device)
|
||||
model.eval()
|
||||
|
||||
out = model.generate(**inputs_dict, min_new_tokens=20, max_new_tokens=20, use_cache=use_cache)
|
||||
self.assertTrue(out.shape[1] == inputs_dict["input_ids"].shape[1] + 20)
|
||||
|
||||
@unittest.skip(
|
||||
reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
|
||||
)
|
||||
|
@ -20,6 +20,7 @@ import unittest
|
||||
import numpy as np
|
||||
import requests
|
||||
from huggingface_hub import hf_hub_download
|
||||
from parameterized import parameterized
|
||||
|
||||
from transformers import (
|
||||
VideoLlavaConfig,
|
||||
@ -30,7 +31,6 @@ from transformers import (
|
||||
)
|
||||
from transformers.testing_utils import require_bitsandbytes, require_torch, require_torch_gpu, slow, torch_device
|
||||
|
||||
from ...generation.test_utils import GenerationTesterMixin
|
||||
from ...test_configuration_common import ConfigTester
|
||||
from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor
|
||||
|
||||
@ -75,7 +75,7 @@ class VideoLlavaVisionText2TextModelTester:
|
||||
"initializer_range": 0.02,
|
||||
"num_labels": 3,
|
||||
"num_choices": 4,
|
||||
"pad_token_id": 0,
|
||||
"pad_token_id": 1,
|
||||
},
|
||||
is_training=True,
|
||||
vision_config={
|
||||
@ -158,10 +158,11 @@ class VideoLlavaVisionText2TextModelTester:
|
||||
config, pixel_values_images, pixel_values_videos = config_and_inputs
|
||||
input_ids = ids_tensor([self.batch_size, self.seq_length], config.text_config.vocab_size - 1) + 1
|
||||
attention_mask = input_ids.ne(1).to(torch_device)
|
||||
# set to random non-image token to prevent flakiness
|
||||
input_ids[input_ids == config.image_token_index] = 2
|
||||
input_ids[input_ids == config.video_token_index] = 2
|
||||
|
||||
# we are giving 3 videos and 3 images. Need to pass in image and video tokens, both
|
||||
# also need to make sure no other special tokens are set
|
||||
input_ids[(input_ids == 0) | (input_ids == 1)] = 3
|
||||
# we are giving 3 videos and 3 images. Need to pass in image and video tokens
|
||||
input_ids[:, 0] = config.video_token_index
|
||||
input_ids[:, 1:2] = config.image_token_index
|
||||
inputs_dict = {
|
||||
@ -190,12 +191,13 @@ class VideoLlavaVisionText2TextModelTester:
|
||||
|
||||
|
||||
@require_torch
|
||||
class VideoLlavaForConditionalGenerationModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase):
|
||||
class VideoLlavaForConditionalGenerationModelTest(ModelTesterMixin, unittest.TestCase):
|
||||
"""
|
||||
Model tester for `VideoLlavaForConditionalGeneration`.
|
||||
"""
|
||||
|
||||
all_model_classes = (VideoLlavaForConditionalGeneration,) if is_torch_available() else ()
|
||||
all_generative_model_classes = (VideoLlavaForConditionalGeneration,) if is_torch_available() else ()
|
||||
fx_compatible = False
|
||||
test_pruning = False
|
||||
test_resize_embeddings = True
|
||||
@ -205,6 +207,24 @@ class VideoLlavaForConditionalGenerationModelTest(ModelTesterMixin, GenerationTe
|
||||
self.model_tester = VideoLlavaVisionText2TextModelTester(self)
|
||||
self.config_tester = ConfigTester(self, config_class=VideoLlavaConfig, has_text_modality=False)
|
||||
|
||||
@parameterized.expand([(True,), (False,)])
|
||||
def test_greedy_generation(self, use_cache: bool):
|
||||
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
|
||||
|
||||
for model_class in self.all_generative_model_classes:
|
||||
model = model_class(config)
|
||||
model.to(torch_device)
|
||||
model.eval()
|
||||
|
||||
out = model.generate(
|
||||
**inputs_dict,
|
||||
min_new_tokens=20,
|
||||
max_new_tokens=20,
|
||||
use_cache=use_cache,
|
||||
bad_words_ids=[[config.image_token_index], [config.video_token_index]],
|
||||
)
|
||||
self.assertTrue(out.shape[1] == inputs_dict["input_ids"].shape[1] + 20)
|
||||
|
||||
@unittest.skip(
|
||||
reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
|
||||
)
|
||||
|
@ -18,6 +18,7 @@ import gc
|
||||
import unittest
|
||||
|
||||
import requests
|
||||
from parameterized import parameterized
|
||||
|
||||
from transformers import (
|
||||
AutoProcessor,
|
||||
@ -73,7 +74,7 @@ class VipLlavaVisionText2TextModelTester:
|
||||
"initializer_range": 0.02,
|
||||
"num_labels": 3,
|
||||
"num_choices": 4,
|
||||
"pad_token_id": 0,
|
||||
"pad_token_id": 1,
|
||||
},
|
||||
is_training=True,
|
||||
vision_config={
|
||||
@ -140,6 +141,8 @@ class VipLlavaVisionText2TextModelTester:
|
||||
config, pixel_values = config_and_inputs
|
||||
input_ids = ids_tensor([self.batch_size, self.seq_length], config.text_config.vocab_size - 1) + 1
|
||||
attention_mask = input_ids.ne(1).to(torch_device)
|
||||
# set to random non-image token to prevent flakiness
|
||||
input_ids[input_ids == config.image_token_index] = 2
|
||||
# we are giving 3 images let's make sure we pass in 3 image tokens
|
||||
input_ids[:, 1] = config.image_token_index
|
||||
inputs_dict = {
|
||||
@ -158,6 +161,7 @@ class VipLlavaForConditionalGenerationModelTest(ModelTesterMixin, unittest.TestC
|
||||
"""
|
||||
|
||||
all_model_classes = (VipLlavaForConditionalGeneration,) if is_torch_available() else ()
|
||||
all_generative_model_classes = (VipLlavaForConditionalGeneration,) if is_torch_available() else ()
|
||||
fx_compatible = False
|
||||
test_pruning = False
|
||||
test_resize_embeddings = True
|
||||
@ -167,6 +171,24 @@ class VipLlavaForConditionalGenerationModelTest(ModelTesterMixin, unittest.TestC
|
||||
self.model_tester = VipLlavaVisionText2TextModelTester(self)
|
||||
self.config_tester = ConfigTester(self, config_class=VipLlavaConfig, has_text_modality=False)
|
||||
|
||||
@parameterized.expand([(True,), (False,)])
|
||||
def test_greedy_generation(self, use_cache: bool):
|
||||
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
|
||||
|
||||
for model_class in self.all_generative_model_classes:
|
||||
model = model_class(config)
|
||||
model.to(torch_device)
|
||||
model.eval()
|
||||
|
||||
out = model.generate(
|
||||
**inputs_dict,
|
||||
min_new_tokens=20,
|
||||
max_new_tokens=20,
|
||||
use_cache=use_cache,
|
||||
bad_words_ids=[[config.image_token_index]],
|
||||
)
|
||||
self.assertTrue(out.shape[1] == inputs_dict["input_ids"].shape[1] + 20)
|
||||
|
||||
@unittest.skip(
|
||||
reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
|
||||
)
|
||||
|
@ -1844,6 +1844,59 @@ class WhisperModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMi
|
||||
)
|
||||
assert isinstance(pred_ids, expected_output_type)
|
||||
|
||||
@require_flash_attn
|
||||
@require_torch_gpu
|
||||
@pytest.mark.flash_attn_test
|
||||
@slow
|
||||
def test_flash_attn_2_generate_reuse_cache(self):
|
||||
max_new_tokens = 2
|
||||
for model_class in self.all_generative_model_classes:
|
||||
if not model_class._supports_flash_attn_2:
|
||||
self.skipTest(f"{model_class.__name__} does not support Flash Attention 2")
|
||||
|
||||
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
|
||||
|
||||
dummy_input = inputs_dict[model_class.main_input_name][..., :10]
|
||||
if dummy_input.dtype in [torch.float32, torch.bfloat16]:
|
||||
dummy_input = dummy_input.to(torch.float16)
|
||||
|
||||
# make sure that all models have enough positions for generation
|
||||
if hasattr(config, "max_position_embeddings"):
|
||||
config.max_position_embeddings = dummy_input.shape[1] * 2 + max_new_tokens * 2 + 1
|
||||
|
||||
model = model_class(config)
|
||||
|
||||
with tempfile.TemporaryDirectory() as tmpdirname:
|
||||
model.save_pretrained(tmpdirname)
|
||||
|
||||
model = model_class.from_pretrained(
|
||||
tmpdirname,
|
||||
torch_dtype=torch.float16,
|
||||
attn_implementation="flash_attention_2",
|
||||
low_cpu_mem_usage=True,
|
||||
).to(torch_device)
|
||||
|
||||
# run generate once to get filled cache
|
||||
output = model.generate(
|
||||
dummy_input,
|
||||
max_new_tokens=max_new_tokens,
|
||||
do_sample=False,
|
||||
use_cache=True,
|
||||
return_dict_in_generate=True,
|
||||
)
|
||||
past_key_values = output.past_key_values
|
||||
|
||||
# Try to continue generation from where we left, given that we have more than 1 new token to process
|
||||
# e.g. this can happen in speculative decoding when feeding candidate tokens back to target model
|
||||
_ = model.generate(
|
||||
dummy_input,
|
||||
decoder_input_ids=output.sequences,
|
||||
max_new_tokens=max_new_tokens,
|
||||
do_sample=False,
|
||||
use_cache=True,
|
||||
past_key_values=past_key_values,
|
||||
)
|
||||
|
||||
|
||||
@require_torch
|
||||
@require_torchaudio
|
||||
@ -4058,6 +4111,11 @@ class WhisperStandaloneDecoderModelTest(ModelTesterMixin, GenerationTesterMixin,
|
||||
# generate only works with input ids for whisper
|
||||
pass
|
||||
|
||||
@unittest.skip(reason="Generate needs input ids")
|
||||
def test_inputs_embeds_matches_input_ids_with_generate(self):
|
||||
# generate only works with input ids for whisper
|
||||
pass
|
||||
|
||||
@unittest.skip(reason="Decoder can't keep attention grads")
|
||||
def test_retain_grad_hidden_states_attentions(self):
|
||||
return
|
||||
@ -4066,6 +4124,12 @@ class WhisperStandaloneDecoderModelTest(ModelTesterMixin, GenerationTesterMixin,
|
||||
def test_save_load_fast_init_from_base(self):
|
||||
pass
|
||||
|
||||
@unittest.skip(
|
||||
reason="FA2 testing suite needs to be refactored to be compatible with WhisperDecoder for that test"
|
||||
)
|
||||
def test_flash_attn_2_generate_reuse_cache(self):
|
||||
pass
|
||||
|
||||
@unittest.skip(
|
||||
"Duplicated test with WhisperModelTest + the FA2 testing suite needs to be refactored to be compatible with WhisperDecoder for that test"
|
||||
)
|
||||
|
@ -2819,6 +2819,53 @@ class ModelTesterMixin:
|
||||
)[0]
|
||||
self.assertTrue(torch.allclose(out_embeds, out_ids))
|
||||
|
||||
def test_inputs_embeds_matches_input_ids_with_generate(self):
|
||||
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
|
||||
for model_class in self.all_model_classes:
|
||||
if model_class.__name__ not in get_values(MODEL_FOR_CAUSAL_LM_MAPPING_NAMES):
|
||||
continue
|
||||
model = model_class(config)
|
||||
model.to(torch_device)
|
||||
model.eval()
|
||||
|
||||
model_forward_args = inspect.signature(model.forward).parameters
|
||||
if "inputs_embeds" not in model_forward_args:
|
||||
self.skipTest(reason="This model doesn't use `inputs_embeds`")
|
||||
|
||||
inputs = copy.deepcopy(self._prepare_for_class(inputs_dict, model_class))
|
||||
pad_token_id = config.pad_token_id if config.pad_token_id is not None else 1
|
||||
|
||||
wte = model.get_input_embeddings()
|
||||
if not self.is_encoder_decoder:
|
||||
input_ids = inputs["input_ids"]
|
||||
# some models infer position ids/attn mask differently when input ids
|
||||
# by check if pad_token let's make sure no padding is in input ids
|
||||
not_pad_token_id = pad_token_id + 1 if max(0, pad_token_id - 1) == 0 else pad_token_id - 1
|
||||
input_ids[input_ids == pad_token_id] = not_pad_token_id
|
||||
del inputs["input_ids"]
|
||||
inputs_embeds = wte(input_ids)
|
||||
out_ids = model.generate(input_ids=input_ids, **inputs, max_new_tokens=2)[:, -2:]
|
||||
out_embeds = model.generate(inputs_embeds=inputs_embeds, **inputs, max_new_tokens=2)
|
||||
else:
|
||||
encoder_input_ids = inputs["input_ids"]
|
||||
decoder_input_ids = inputs.get("decoder_input_ids", encoder_input_ids)
|
||||
encoder_input_ids[encoder_input_ids == pad_token_id] = max(0, pad_token_id + 1)
|
||||
decoder_input_ids[decoder_input_ids == pad_token_id] = max(0, pad_token_id + 1)
|
||||
del inputs["input_ids"]
|
||||
inputs.pop("decoder_input_ids", None)
|
||||
inputs_embeds = wte(encoder_input_ids)
|
||||
decoder_inputs_embeds = wte(decoder_input_ids)
|
||||
out_ids = model.generate(
|
||||
input_ids=encoder_input_ids, decoder_input_ids=decoder_input_ids, **inputs, max_new_tokens=2
|
||||
)[:, -2:]
|
||||
out_embeds = model.generate(
|
||||
inputs_embeds=inputs_embeds,
|
||||
decoder_inputs_embeds=decoder_inputs_embeds,
|
||||
**inputs,
|
||||
max_new_tokens=2,
|
||||
)
|
||||
self.assertTrue(torch.allclose(out_embeds, out_ids))
|
||||
|
||||
@require_torch_multi_gpu
|
||||
def test_multi_gpu_data_parallel_forward(self):
|
||||
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
|
||||
@ -4284,6 +4331,62 @@ class ModelTesterMixin:
|
||||
use_cache=True,
|
||||
)
|
||||
|
||||
@require_flash_attn
|
||||
@require_torch_gpu
|
||||
@mark.flash_attn_test
|
||||
@slow
|
||||
def test_flash_attn_2_generate_reuse_cache(self):
|
||||
if not self.has_attentions:
|
||||
self.skipTest(reason="Model architecture does not support attentions")
|
||||
|
||||
max_new_tokens = 2
|
||||
for model_class in self.all_generative_model_classes:
|
||||
if not model_class._supports_flash_attn_2:
|
||||
self.skipTest(f"{model_class.__name__} does not support Flash Attention 2")
|
||||
|
||||
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
|
||||
|
||||
dummy_input = inputs_dict[model_class.main_input_name]
|
||||
if dummy_input.dtype in [torch.float32, torch.bfloat16]:
|
||||
dummy_input = dummy_input.to(torch.float16)
|
||||
|
||||
# make sure that all models have enough positions for generation
|
||||
if hasattr(config, "max_position_embeddings"):
|
||||
config.max_position_embeddings = dummy_input.shape[1] * 2 + max_new_tokens * 2 + 1
|
||||
|
||||
model = model_class(config)
|
||||
|
||||
with tempfile.TemporaryDirectory() as tmpdirname:
|
||||
model.save_pretrained(tmpdirname)
|
||||
|
||||
model = model_class.from_pretrained(
|
||||
tmpdirname,
|
||||
torch_dtype=torch.float16,
|
||||
attn_implementation="flash_attention_2",
|
||||
low_cpu_mem_usage=True,
|
||||
).to(torch_device)
|
||||
|
||||
# run generate once to get filled cache
|
||||
output = model.generate(
|
||||
dummy_input,
|
||||
max_new_tokens=max_new_tokens,
|
||||
do_sample=False,
|
||||
use_cache=True,
|
||||
return_dict_in_generate=True,
|
||||
)
|
||||
past_key_values = output.past_key_values
|
||||
|
||||
# Try to continue generation from where we left, given that we have more than 1 new token to process
|
||||
# e.g. this can happen in speculative decoding when feeding candidate tokens back to target model
|
||||
dummy_input_updated = torch.cat([dummy_input, output.sequences], dim=-1)
|
||||
_ = model.generate(
|
||||
dummy_input_updated,
|
||||
max_new_tokens=max_new_tokens,
|
||||
do_sample=False,
|
||||
use_cache=True,
|
||||
past_key_values=past_key_values,
|
||||
)
|
||||
|
||||
@require_flash_attn
|
||||
@require_torch_gpu
|
||||
@require_bitsandbytes
|
||||
|
@ -19,7 +19,7 @@ import os
|
||||
import tempfile
|
||||
import unittest
|
||||
|
||||
from transformers.modelcard import ModelCard
|
||||
from transformers.modelcard import ModelCard, TrainingSummary
|
||||
|
||||
|
||||
class ModelCardTester(unittest.TestCase):
|
||||
@ -82,3 +82,8 @@ class ModelCardTester(unittest.TestCase):
|
||||
model_card_second = ModelCard.from_pretrained(tmpdirname)
|
||||
|
||||
self.assertEqual(model_card_second.to_dict(), model_card_first.to_dict())
|
||||
|
||||
def test_model_summary_modelcard_base_metadata(self):
|
||||
metadata = TrainingSummary("Model name").create_metadata()
|
||||
self.assertTrue("library_name" in metadata)
|
||||
self.assertTrue(metadata["library_name"] == "transformers")
|
||||
|
@ -1640,17 +1640,18 @@ class ModelUtilsTest(TestCasePlus):
|
||||
|
||||
logger = logging.get_logger("transformers.modeling_utils")
|
||||
config = PretrainedConfig()
|
||||
warning_msg_gamma = "A parameter name that contains `gamma` will be renamed internally"
|
||||
warning_msg_gamma = "`gamma_param` -> `weight_param`"
|
||||
model = TestModelGamma(config)
|
||||
|
||||
with tempfile.TemporaryDirectory() as tmp_dir:
|
||||
model.save_pretrained(tmp_dir)
|
||||
with LoggingLevel(logging.WARNING):
|
||||
with LoggingLevel(logging.INFO):
|
||||
with CaptureLogger(logger) as cl1:
|
||||
_, loading_info = TestModelGamma.from_pretrained(tmp_dir, config=config, output_loading_info=True)
|
||||
|
||||
missing_keys = loading_info["missing_keys"]
|
||||
unexpected_keys = loading_info["unexpected_keys"]
|
||||
self.assertIn("`TestModelGamma`", cl1.out)
|
||||
self.assertIn(warning_msg_gamma, cl1.out)
|
||||
self.assertIn("gamma_param", missing_keys)
|
||||
self.assertIn("weight_param", unexpected_keys)
|
||||
@ -1664,17 +1665,18 @@ class ModelUtilsTest(TestCasePlus):
|
||||
def forward(self):
|
||||
return self.beta_param.sum()
|
||||
|
||||
warning_msg_beta = "A parameter name that contains `beta` will be renamed internally"
|
||||
warning_msg_beta = "`beta_param` -> `bias_param`"
|
||||
model = TestModelBeta(config)
|
||||
|
||||
with tempfile.TemporaryDirectory() as tmp_dir:
|
||||
model.save_pretrained(tmp_dir)
|
||||
with LoggingLevel(logging.WARNING):
|
||||
with LoggingLevel(logging.INFO):
|
||||
with CaptureLogger(logger) as cl2:
|
||||
_, loading_info = TestModelBeta.from_pretrained(tmp_dir, config=config, output_loading_info=True)
|
||||
|
||||
missing_keys = loading_info["missing_keys"]
|
||||
unexpected_keys = loading_info["unexpected_keys"]
|
||||
self.assertIn("`TestModelBeta`", cl2.out)
|
||||
self.assertIn(warning_msg_beta, cl2.out)
|
||||
self.assertIn("beta_param", missing_keys)
|
||||
self.assertIn("bias_param", unexpected_keys)
|
||||
|
Reference in New Issue
Block a user