Compare commits

...

16 Commits

Author SHA1 Message Date
ca56cd7b31 v4.44.1 2024-08-20 19:41:53 +02:00
6e931e1647 Gemma2: fix FA2 generation (#32553)
fix FA2
2024-08-20 19:41:49 +02:00
74f57df61b Fix generate with inputs_embeds as input (#32493)
* I think inputs_embeds has ndim == 3

* fix sequence length catch

* add generate test

* [run-slow]olmo, persimmon, gemma, gemma2, qwen2, llama

* skip whisper

* fix bart test

* more fixes
2024-08-20 19:41:05 +02:00
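For context, a minimal sketch of the code path this commit repairs, with Qwen/Qwen2-0.5B used only as a small stand-in checkpoint (any of the models on the run-slow line should behave the same): generate() is fed 3D inputs_embeds instead of input_ids, so the sequence length has to be read from the embedding tensor.

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-0.5B")  # placeholder checkpoint
model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2-0.5B")

input_ids = tokenizer("Generation from embeddings:", return_tensors="pt").input_ids
inputs_embeds = model.get_input_embeddings()(input_ids)  # ndim == 3: (batch, seq_len, hidden)

# When generating from embeddings only, the returned ids are just the new tokens.
output_ids = model.generate(inputs_embeds=inputs_embeds, max_new_tokens=10)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))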
084fe2ee6c Merge branch 'v4.44-release' of github.com:huggingface/transformers into v4.44-release 2024-08-20 19:41:05 +02:00
fff9be1545 Reduce the error log when using core models that need their weights renamed, and provide a step forward (#32656)
* Fin

* Modify msg

* Finish up nits
2024-08-20 19:41:05 +02:00
4fd0f4802b Fix VLM generation issues (#32836)
* fix in one commit

* add parameterized

* fix tests

* fix test flakiness

* maybe that's why flaky

* style

* flakiness...

---------

Co-authored-by: raushan <raushan@huggingface.co>
2024-08-20 19:41:05 +02:00
5c4b15ae01 fix multi-gpu with static cache (#32543) 2024-08-20 19:41:05 +02:00
b51a82a524 Revert PR 32299, flag users when Zero-3 was missed (#32851)
Revert PR 32299
2024-08-20 19:41:05 +02:00
0dca7d72b8 Use head_dim if in config for RoPE (#32495)
* use head_dim if in config for RoPE

* typo

* simplify with getattr
2024-08-20 19:41:05 +02:00
a3e77bae9a add back the position ids (#32554)
* add back the position ids

* fix failing test
2024-08-20 19:41:05 +02:00
9c7aa7b3c7 Automatically add transformers tag to the modelcard (#32623)
* Automatically add `transformers` tag to the modelcard

* Specify library_name and test
2024-08-20 19:41:05 +02:00
ed6acee21a Fix sliding window attention used in Gemma2FlashAttention2 (#32522)
* fix sliding window attention (flash2) in gemma2 model

* [run-slow] gemma

* fix slicing attention_mask for flash_attn2

* fix slicing attention_mask when flash_attn is used

* add missing comment

* slice the last seq_len tokens in the key, value states

* revert code of slicing key, value states
2024-08-20 19:41:05 +02:00
51741d7e46 Fix: FA2 with packed training (#32487)
* fix check

* add tests

* [run-slow] llama, gemma2

* oops, whisper actually runs but needed some special treatment
2024-08-20 19:41:05 +02:00
984bc11b08 Revert "fixes to properly shard FSDP across cpu and meta for cpu_effcient_loading for prequantized 4bit (#32276)" (#32477)
* Revert "fixes to properly shard FSDP across cpu and meta for cpu_efficient_loading for prequantized 4bit (#32276)"

This reverts commit 62c60a30181a65e1a3a7f19c3055a240a6a21335.

We uncovered an issue with this change that caused our training runs to hang.

* `is_torchdynamo_compiling` -- cast a wide exception net (#32476)

* cast a wide net

* make fix-copies with a few manual changes

* add copied from

---------

Co-authored-by: Joao Gante <joaofranciscocardosogante@gmail.com>
2024-08-06 20:35:22 +02:00
af61272239 is_torchdynamo_compiling -- cast a wide exception net (#32476)
* cast a wide net

* make fix-copies with a few manual changes

* add copied from
2024-08-06 20:35:17 +02:00
3e93524a13 release v4.44.0 2024-08-06 17:00:10 +02:00
100 changed files with 774 additions and 294 deletions

View File

@ -61,7 +61,7 @@ from transformers.utils import check_min_version, send_example_telemetry
logger = logging.getLogger(__name__)
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.44.0.dev0")
check_min_version("4.44.0")
Array = Any
Dataset = datasets.arrow_dataset.Dataset
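All of the example-script hunks in this release are the same mechanical bump of the pinned minimum version. As a brief, hedged reminder of what that pin does, check_min_version simply raises if the installed transformers is older than the argument:

from transformers.utils import check_min_version

# Passes silently on transformers >= 4.44.0, raises an error otherwise.
check_min_version("4.44.0")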

View File

@ -60,7 +60,7 @@ from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risk.
check_min_version("4.44.0.dev0")
check_min_version("4.44.0")
require_version("datasets>=2.14.0", "To fix: pip install -r examples/flax/speech-recognition/requirements.txt")

View File

@ -56,7 +56,7 @@ from transformers.utils import check_min_version, send_example_telemetry
logger = logging.getLogger(__name__)
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.44.0.dev0")
check_min_version("4.44.0")
Array = Any
Dataset = datasets.arrow_dataset.Dataset

View File

@ -57,7 +57,7 @@ from transformers.utils.versions import require_version
logger = logging.getLogger(__name__)
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.44.0.dev0")
check_min_version("4.44.0")
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/token-classification/requirements.txt")

View File

@ -45,7 +45,7 @@ from transformers.utils.versions import require_version
logger = logging.getLogger(__name__)
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.44.0.dev0")
check_min_version("4.44.0")
require_version("datasets>=1.14.0", "To fix: pip install -r examples/pytorch/audio-classification/requirements.txt")

View File

@ -54,7 +54,7 @@ from transformers.utils.versions import require_version
logger = logging.getLogger(__name__)
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.44.0.dev0")
check_min_version("4.44.0")
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/contrastive-image-text/requirements.txt")

View File

@ -56,7 +56,7 @@ from transformers.utils.versions import require_version
logger = logging.getLogger(__name__)
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.44.0.dev0")
check_min_version("4.44.0")
require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/image-classification/requirements.txt")

View File

@ -49,7 +49,7 @@ from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.44.0.dev0")
check_min_version("4.44.0")
logger = get_logger(__name__)

View File

@ -43,7 +43,7 @@ from transformers.utils.versions import require_version
logger = logging.getLogger(__name__)
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.44.0.dev0")
check_min_version("4.44.0")
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/image-pretraining/requirements.txt")

View File

@ -48,7 +48,7 @@ Any model supported by the AutoModelForMaskedImageModeling API can be used.
logger = logging.getLogger(__name__)
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.44.0.dev0")
check_min_version("4.44.0")
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/image-pretraining/requirements.txt")

View File

@ -53,7 +53,7 @@ Any model supported by the AutoModelForMaskedImageModeling API can be used.
logger = logging.getLogger(__name__)
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.44.0.dev0")
check_min_version("4.44.0")
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/image-pretraining/requirements.txt")

View File

@ -46,7 +46,7 @@ from transformers.utils.versions import require_version
logger = logging.getLogger(__name__)
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.44.0.dev0")
check_min_version("4.44.0")
require_version("datasets>=2.0.0", "To fix: pip install -r examples/pytorch/instance-segmentation/requirements.txt")

View File

@ -52,7 +52,7 @@ from transformers.utils.versions import require_version
logger = logging.getLogger(__name__)
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.44.0.dev0")
check_min_version("4.44.0")
require_version("datasets>=2.0.0", "To fix: pip install -r examples/pytorch/instance-segmentation/requirements.txt")

View File

@ -55,7 +55,7 @@ from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.44.0.dev0")
check_min_version("4.44.0")
require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt")

View File

@ -57,7 +57,7 @@ from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.44.0.dev0")
check_min_version("4.44.0")
logger = get_logger(__name__)

View File

@ -58,7 +58,7 @@ from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.44.0.dev0")
check_min_version("4.44.0")
require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt")

View File

@ -60,7 +60,7 @@ from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.44.0.dev0")
check_min_version("4.44.0")
logger = get_logger(__name__)

View File

@ -54,7 +54,7 @@ from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.44.0.dev0")
check_min_version("4.44.0")
require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt")

View File

@ -57,7 +57,7 @@ from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.44.0.dev0")
check_min_version("4.44.0")
logger = get_logger(__name__)
require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt")

View File

@ -47,7 +47,7 @@ from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.44.0.dev0")
check_min_version("4.44.0")
require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt")

View File

@ -47,7 +47,7 @@ from transformers.utils import PaddingStrategy, check_min_version, send_example_
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.44.0.dev0")
check_min_version("4.44.0")
logger = logging.getLogger(__name__)

View File

@ -56,7 +56,7 @@ from transformers.utils import PaddingStrategy, check_min_version, send_example_
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.44.0.dev0")
check_min_version("4.44.0")
logger = get_logger(__name__)
# You should update this to your particular problem to have better documentation of `model_type`

View File

@ -48,7 +48,7 @@ from transformers.utils.versions import require_version
logger = logging.getLogger(__name__)
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.44.0.dev0")
check_min_version("4.44.0")
require_version("datasets>=2.0.0", "To fix: pip install -r examples/pytorch/object-detection/requirements.txt")

View File

@ -51,7 +51,7 @@ from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.44.0.dev0")
check_min_version("4.44.0")
logging.basicConfig(level=logging.INFO)
logger = get_logger(__name__)

View File

@ -50,7 +50,7 @@ from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.44.0.dev0")
check_min_version("4.44.0")
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt")

View File

@ -48,7 +48,7 @@ from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.44.0.dev0")
check_min_version("4.44.0")
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt")

View File

@ -56,7 +56,7 @@ from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.44.0.dev0")
check_min_version("4.44.0")
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt")

View File

@ -57,7 +57,7 @@ from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.44.0.dev0")
check_min_version("4.44.0")
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt")

View File

@ -46,7 +46,7 @@ from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.44.0.dev0")
check_min_version("4.44.0")
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt")

View File

@ -51,7 +51,7 @@ from transformers.utils.versions import require_version
logger = logging.getLogger(__name__)
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.44.0.dev0")
check_min_version("4.44.0")
require_version("datasets>=2.0.0", "To fix: pip install -r examples/pytorch/semantic-segmentation/requirements.txt")

View File

@ -50,7 +50,7 @@ from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.44.0.dev0")
check_min_version("4.44.0")
logger = get_logger(__name__)

View File

@ -50,7 +50,7 @@ from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.44.0.dev0")
check_min_version("4.44.0")
require_version("datasets>=1.18.0", "To fix: pip install -r examples/pytorch/speech-recognition/requirements.txt")

View File

@ -53,7 +53,7 @@ from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.44.0.dev0")
check_min_version("4.44.0")
require_version("datasets>=1.18.0", "To fix: pip install -r examples/pytorch/speech-recognition/requirements.txt")

View File

@ -48,7 +48,7 @@ from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.44.0.dev0")
check_min_version("4.44.0")
require_version("datasets>=1.18.0", "To fix: pip install -r examples/pytorch/speech-recognition/requirements.txt")

View File

@ -52,7 +52,7 @@ from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.44.0.dev0")
check_min_version("4.44.0")
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/summarization/requirements.txt")

View File

@ -56,7 +56,7 @@ from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.44.0.dev0")
check_min_version("4.44.0")
logger = get_logger(__name__)
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/summarization/requirements.txt")

View File

@ -47,7 +47,7 @@ from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.44.0.dev0")
check_min_version("4.44.0")
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/text-classification/requirements.txt")

View File

@ -48,7 +48,7 @@ from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.44.0.dev0")
check_min_version("4.44.0")
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/text-classification/requirements.txt")

View File

@ -49,7 +49,7 @@ from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.44.0.dev0")
check_min_version("4.44.0")
logger = get_logger(__name__)

View File

@ -48,7 +48,7 @@ from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.44.0.dev0")
check_min_version("4.44.0")
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/text-classification/requirements.txt")

View File

@ -49,7 +49,7 @@ from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.44.0.dev0")
check_min_version("4.44.0")
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/token-classification/requirements.txt")

View File

@ -56,7 +56,7 @@ from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.44.0.dev0")
check_min_version("4.44.0")
logger = get_logger(__name__)
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/token-classification/requirements.txt")

View File

@ -52,7 +52,7 @@ from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.44.0.dev0")
check_min_version("4.44.0")
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/translation/requirements.txt")

View File

@ -57,7 +57,7 @@ from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.44.0.dev0")
check_min_version("4.44.0")
logger = get_logger(__name__)
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/translation/requirements.txt")

View File

@ -51,7 +51,7 @@ from transformers.utils.versions import require_version
logger = logging.getLogger(__name__)
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.44.0.dev0")
check_min_version("4.44.0")
require_version(
"datasets>=1.8.0", "To fix: pip install -r examples/tensorflow/contrastive-image-text/requirements.txt"

View File

@ -55,7 +55,7 @@ from transformers.utils.versions import require_version
logger = logging.getLogger(__name__)
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.44.0.dev0")
check_min_version("4.44.0")
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/image-classification/requirements.txt")

View File

@ -50,7 +50,7 @@ from transformers.utils import PaddingStrategy, check_min_version, send_example_
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.44.0.dev0")
check_min_version("4.44.0")
logger = logging.getLogger(__name__)

View File

@ -62,7 +62,7 @@ except (ModuleNotFoundError, ImportError):
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.44.0.dev0")
check_min_version("4.44.0")
logger = logging.getLogger(__name__)

View File

@ -53,7 +53,7 @@ from transformers.utils.versions import require_version
# region Checking dependencies
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.44.0.dev0")
check_min_version("4.44.0")
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/summarization/requirements.txt")

View File

@ -47,7 +47,7 @@ from transformers.utils import check_min_version, send_example_telemetry
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.44.0.dev0")
check_min_version("4.44.0")
task_to_keys = {
"cola": ("sentence", None),

View File

@ -56,7 +56,7 @@ from transformers.utils.versions import require_version
# region Dependencies and constants
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.44.0.dev0")
check_min_version("4.44.0")
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/summarization/requirements.txt")

View File

@ -430,7 +430,7 @@ install_requires = [
setup(
name="transformers",
version="4.44.0.dev0", # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots)
version="4.44.1", # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots)
author="The Hugging Face team (past and future) with the help of all our contributors (https://github.com/huggingface/transformers/graphs/contributors)",
author_email="transformers@huggingface.co",
description="State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow",

View File

@ -18,7 +18,7 @@
# to defer the actual importing for when the objects are requested. This way `import transformers` provides the names
# in the namespace without actually importing anything (and especially none of the backends).
__version__ = "4.44.0.dev0"
__version__ = "4.44.1"
from typing import TYPE_CHECKING
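A trivial way to confirm the patch release is picked up after installation; this just reads the version string bumped above:

import transformers

print(transformers.__version__)  # expected to print 4.44.1 for this release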

View File

@ -1067,6 +1067,8 @@ class StaticCache(Cache):
A tuple containing the updated key and value states.
"""
cache_position = cache_kwargs.get("cache_position")
self.key_cache[layer_idx] = self.key_cache[layer_idx].to(device=key_states.device)
self.value_cache[layer_idx] = self.value_cache[layer_idx].to(device=value_states.device)
k_out = self.key_cache[layer_idx]
v_out = self.value_cache[layer_idx]
@ -1078,8 +1080,6 @@ class StaticCache(Cache):
# `tensor[:, :, index] = tensor`, but the first one is compile-friendly and it does explicitly an in-place
# operation, that avoids copies and uses less memory.
try:
# If using several devices (e.g.: multiple GPUs), we need to ensure everything is on the same one
cache_position.to(device=k_out.device)
k_out.index_copy_(2, cache_position, key_states)
v_out.index_copy_(2, cache_position, value_states)
except NotImplementedError:
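Hedged sketch of the scenario this fixes, with the checkpoint name as a placeholder: when the model is sharded over several GPUs with device_map="auto", each layer's static cache tensors are now moved onto that layer's device before the in-place index_copy_, so static-cache generation no longer trips on a cross-device copy.

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "meta-llama/Meta-Llama-3-8B"  # placeholder; any causal LM large enough to be sharded
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", torch_dtype=torch.float16)

inputs = tokenizer("Static caches across GPUs:", return_tensors="pt").to(model.device)
output_ids = model.generate(**inputs, max_new_tokens=20, cache_implementation="static")
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))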

View File

@ -454,6 +454,7 @@ class TrainingSummary:
metric_mapping = infer_metric_tags_from_eval_results(self.eval_results)
metadata = {}
metadata = _insert_value(metadata, "library_name", "transformers")
metadata = _insert_values_as_list(metadata, "language", self.language)
metadata = _insert_value(metadata, "license", self.license)
if self.finetuned_from is not None and isinstance(self.finetuned_from, str) and len(self.finetuned_from) > 0:
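Rough illustration of the effect; the _insert_value below is a simplified stand-in for the private helper in transformers.modelcard, not the library function itself. The auto-generated model card metadata now always carries a library_name entry, which is what lets the Hub tag the upload as a transformers model.

def _insert_value(metadata, name, value):
    # simplified stand-in: skip empty values, otherwise record them in insertion order
    if value is None:
        return metadata
    metadata[name] = value
    return metadata

metadata = {}
metadata = _insert_value(metadata, "library_name", "transformers")
metadata = _insert_value(metadata, "license", "apache-2.0")
print(metadata)  # {'library_name': 'transformers', 'license': 'apache-2.0'}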

View File

@ -264,11 +264,10 @@ def _flash_attention_forward(
)
attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length)
# if position_ids is provided and check not all examples (row) contain only 1 sequence, and is in pre-fill/training stage
# then use `flash_attn_varlen_func` to prevent cross-example attention and also allow padding free approach
elif (
position_ids is not None and not (position_ids[:, -1] == position_ids.size(1) - 1).all() and query_length != 1
):
# If position_ids is provided and check all examples do not contain only 1 sequence, If tensor in increasing
# then we probably have one sequence, otherwise it is packed. Additionally check we are in pre-fill/training stage.
# Use `flash_attn_varlen_func` to prevent cross-example attention and also allow padding free approach
elif position_ids is not None and not (torch.diff(position_ids, dim=-1) >= 0).all() and query_length != 1:
batch_size = query_states.size(0)
query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = prepare_fa2_from_position_ids(
query_states, key_states, value_states, position_ids
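Worked example of the new packing check in isolation (plain tensors, not the attention module): position_ids restart at 0 for every packed sequence, so any negative step in torch.diff means a batch row holds more than one sequence and the varlen flash-attention path should be used.

import torch

single = torch.tensor([[0, 1, 2, 3, 4, 5]])           # one sequence, monotonically increasing
packed = torch.tensor([[0, 1, 2, 0, 1, 0, 1, 2, 3]])  # three sequences packed into one row

def looks_packed(position_ids):
    return not (torch.diff(position_ids, dim=-1) >= 0).all()

print(looks_packed(single))  # False -> regular flash_attn_func path
print(looks_packed(packed))  # True  -> flash_attn_varlen_func, no cross-example attention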

View File

@ -58,7 +58,8 @@ def _compute_default_rope_parameters(
elif config is not None:
base = config.rope_theta
partial_rotary_factor = config.partial_rotary_factor if hasattr(config, "partial_rotary_factor") else 1.0
dim = int((config.hidden_size // config.num_attention_heads) * partial_rotary_factor)
head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
dim = int(head_dim * partial_rotary_factor)
attention_factor = 1.0 # Unused in this type of RoPE
@ -143,7 +144,8 @@ def _compute_dynamic_ntk_parameters(
elif config is not None:
base = config.rope_theta
partial_rotary_factor = config.partial_rotary_factor if hasattr(config, "partial_rotary_factor") else 1.0
dim = int((config.hidden_size // config.num_attention_heads) * partial_rotary_factor)
head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
dim = int(head_dim * partial_rotary_factor)
max_position_embeddings = config.max_position_embeddings
factor = config.rope_scaling["factor"]
@ -185,7 +187,8 @@ def _compute_yarn_parameters(
base = config.rope_theta
partial_rotary_factor = config.partial_rotary_factor if hasattr(config, "partial_rotary_factor") else 1.0
dim = int((config.hidden_size // config.num_attention_heads) * partial_rotary_factor)
head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
dim = int(head_dim * partial_rotary_factor)
max_position_embeddings = config.max_position_embeddings
factor = config.rope_scaling["factor"]
@ -265,7 +268,8 @@ def _compute_longrope_parameters(
base = config.rope_theta
partial_rotary_factor = config.partial_rotary_factor if hasattr(config, "partial_rotary_factor") else 1.0
dim = int((config.hidden_size // config.num_attention_heads) * partial_rotary_factor)
head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
dim = int(head_dim * partial_rotary_factor)
long_factor = config.rope_scaling["long_factor"]
short_factor = config.rope_scaling["short_factor"]
factor = config.rope_scaling.get("factor")
@ -450,7 +454,8 @@ def _validate_longrope_parameters(config: PretrainedConfig):
_check_received_keys(rope_type, received_keys, required_keys, optional_keys)
partial_rotary_factor = config.partial_rotary_factor if hasattr(config, "partial_rotary_factor") else 1.0
dim = int((config.hidden_size // config.num_attention_heads) * partial_rotary_factor)
head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
dim = int(head_dim * partial_rotary_factor)
short_factor = rope_scaling.get("short_factor")
if not isinstance(short_factor, list) and all(isinstance(x, (int, float)) for x in short_factor):
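Small numeric sketch of why the change matters; the config object below is a dummy, not a real PretrainedConfig. Models that define an explicit head_dim, as the Gemma family does, can use a head dimension that differs from hidden_size // num_attention_heads, and the RoPE frequencies must follow the explicit value.

class DummyConfig:
    hidden_size = 2304
    num_attention_heads = 8
    head_dim = 256  # explicit, and not equal to 2304 // 8 = 288

config = DummyConfig()
partial_rotary_factor = getattr(config, "partial_rotary_factor", 1.0)
head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
dim = int(head_dim * partial_rotary_factor)
print(dim)  # 256 with the fix; the old formula would have produced 288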

View File

@ -104,7 +104,6 @@ from .utils.quantization_config import BitsAndBytesConfig, QuantizationMethod
XLA_USE_BF16 = os.environ.get("XLA_USE_BF16", "0").upper()
XLA_DOWNCAST_BF16 = os.environ.get("XLA_DOWNCAST_BF16", "0").upper()
PARAM_RENAME_WARNING = "A parameter name that contains `{}` will be renamed internally to `{}`. Please use a different name to suppress this warning."
if is_accelerate_available():
@ -692,17 +691,30 @@ def _load_state_dict_into_model(model_to_load, state_dict, start_prefix, assign_
# Convert old format to new format if needed from a PyTorch state_dict
old_keys = []
new_keys = []
renamed_keys = {}
renamed_gamma = {}
renamed_beta = {}
warning_msg = f"A pretrained model of type `{model_to_load.__class__.__name__}` "
for key in state_dict.keys():
new_key = None
if "gamma" in key:
logger.warning(PARAM_RENAME_WARNING.format("gamma", "weight"))
# We add only the first key as an example
new_key = key.replace("gamma", "weight")
renamed_gamma[key] = new_key if not renamed_gamma else renamed_gamma
if "beta" in key:
logger.warning(PARAM_RENAME_WARNING.format("beta", "bias"))
# We add only the first key as an example
new_key = key.replace("beta", "bias")
renamed_beta[key] = new_key if not renamed_beta else renamed_beta
if new_key:
old_keys.append(key)
new_keys.append(new_key)
renamed_keys = {**renamed_gamma, **renamed_beta}
if renamed_keys:
warning_msg += "contains parameters that have been renamed internally (a few are listed below but more are present in the model):\n"
for old_key, new_key in renamed_keys.items():
warning_msg += f"* `{old_key}` -> `{new_key}`\n"
warning_msg += "If you are using a model from the Hub, consider submitting a PR to adjust these weights and help future users."
logger.info_once(warning_msg)
for old_key, new_key in zip(old_keys, new_keys):
state_dict[new_key] = state_dict.pop(old_key)
@ -818,6 +830,7 @@ def _load_state_dict_into_meta_model(
is_safetensors=False,
keep_in_fp32_modules=None,
unexpected_keys=None, # passing `unexpected` for cleanup from quantization items
pretrained_model_name_or_path=None, # for flagging the user when the model contains renamed keys
):
"""
This is somewhat similar to `_load_state_dict_into_model`, but deals with a model that has some or all of its
@ -840,18 +853,30 @@ def _load_state_dict_into_meta_model(
old_keys = []
new_keys = []
renamed_gamma = {}
renamed_beta = {}
is_quantized = hf_quantizer is not None
warning_msg = f"This model {type(model)}"
for key in state_dict.keys():
new_key = None
if "gamma" in key:
logger.warning(PARAM_RENAME_WARNING.format("gamma", "weight"))
# We add only the first key as an example
new_key = key.replace("gamma", "weight")
renamed_gamma[key] = new_key if not renamed_gamma else renamed_gamma
if "beta" in key:
logger.warning(PARAM_RENAME_WARNING.format("beta", "bias"))
# We add only the first key as an example
new_key = key.replace("beta", "bias")
renamed_beta[key] = new_key if not renamed_beta else renamed_beta
if new_key:
old_keys.append(key)
new_keys.append(new_key)
renamed_keys = {**renamed_gamma, **renamed_beta}
if renamed_keys:
warning_msg += "contains parameters that have been renamed internally (a few are listed below but more are present in the model):\n"
for old_key, new_key in renamed_keys.items():
warning_msg += f"* `{old_key}` -> `{new_key}`\n"
warning_msg += "If you are using a model from the Hub, consider submitting a PR to adjust these weights and help future users."
logger.info_once(warning_msg)
for old_key, new_key in zip(old_keys, new_keys):
state_dict[new_key] = state_dict.pop(old_key)
@ -932,8 +957,6 @@ def _load_state_dict_into_meta_model(
)
)
):
if is_fsdp_enabled():
param_device = "cpu" if is_local_dist_rank_0() else "meta"
# For backward compatibility with older versions of `accelerate` and for non-quantized params
set_module_tensor_to_device(model, param_name, param_device, **set_module_kwargs)
else:
@ -944,10 +967,7 @@ def _load_state_dict_into_meta_model(
if is_fsdp_enabled() or is_deepspeed_zero3_enabled():
module, tensor_name = get_module_from_name(model, param_name)
value = getattr(module, tensor_name)
param_to = "cpu"
if is_fsdp_enabled() and not is_local_dist_rank_0():
param_to = "meta"
value = type(value)(value.data.to(param_to), **value.__dict__)
value = type(value)(value.data.to("cpu"), **value.__dict__)
setattr(module, tensor_name, value)
# TODO: consider removing used param_parts from state_dict before return
@ -1482,9 +1502,6 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix
else:
model = cls(config, **kwargs)
# Flag for if we init with `zero3`, add an attr to the model so we can check downstream for issues
model._transformers_zero3_init_used = is_deepspeed_zero3_enabled()
# restore default dtype if it was modified
if dtype_orig is not None:
torch.set_default_dtype(dtype_orig)
@ -3814,9 +3831,6 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix
# Let's make sure we don't run the init function of buffer modules
model = cls(config, *model_args, **model_kwargs)
# If we init with `zero3`, add an attr to the model so we can check downstream for issues
model._transformers_zero3_init_used = is_deepspeed_zero3_enabled() and not is_quantized
# make sure we use the model's config since the __init__ call might have copied it
config = model.config
@ -4545,7 +4559,12 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix
@staticmethod
def _load_pretrained_model_low_mem(
model, loaded_state_dict_keys, resolved_archive_file, start_prefix="", hf_quantizer=None
model,
loaded_state_dict_keys,
resolved_archive_file,
start_prefix="",
hf_quantizer=None,
pretrained_model_name_or_path=None,
):
"""
This is an experimental function that loads the model using ~1.x model size CPU memory
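Standalone sketch of the consolidated renaming behaviour introduced above (toy state dict, plain print instead of logger.info_once): instead of warning once per parameter, the renamed gamma/beta keys are collected and reported in a single message that lists a few examples.

state_dict = {
    "encoder.layer.0.LayerNorm.gamma": 0,  # values stand in for tensors
    "encoder.layer.0.LayerNorm.beta": 1,
    "encoder.layer.0.attention.output.dense.weight": 2,
}

renamed_keys = {}
for key in list(state_dict):
    new_key = None
    if "gamma" in key:
        new_key = key.replace("gamma", "weight")
    if "beta" in key:
        new_key = key.replace("beta", "bias")
    if new_key:
        renamed_keys[key] = new_key
        state_dict[new_key] = state_dict.pop(key)

if renamed_keys:
    msg = "contains parameters that have been renamed internally:\n"
    msg += "\n".join(f"* `{old}` -> `{new}`" for old, new in renamed_keys.items())
    print(msg)  # the real code emits this once via logger.info_once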

View File

@ -1132,17 +1132,18 @@ class CohereForCausalLM(CoherePreTrainedModel):
# if `inputs_embeds` are passed, we only want to use them in the 1st generation step
if inputs_embeds is not None and cache_position[0] == 0:
model_inputs = {"inputs_embeds": inputs_embeds}
model_inputs = {"inputs_embeds": inputs_embeds, "input_ids": None}
else:
model_inputs = {"input_ids": input_ids}
# The clone here is for the same reason as for `position_ids`.
model_inputs = {"input_ids": input_ids.clone(memory_format=torch.contiguous_format), "inputs_embeds": None}
if isinstance(past_key_values, StaticCache) and attention_mask.ndim == 2:
if inputs_embeds is not None:
batch_size, sequence_length = inputs_embeds.shape
device = inputs_embeds.device
if model_inputs["inputs_embeds"] is not None:
batch_size, sequence_length, _ = model_inputs["inputs_embeds"].shape
device = model_inputs["inputs_embeds"].device
else:
batch_size, sequence_length = input_ids.shape
device = input_ids.device
batch_size, sequence_length = model_inputs["input_ids"].shape
device = model_inputs["input_ids"].device
dtype = self.lm_head.weight.dtype
min_dtype = torch.finfo(dtype).min

View File

@ -1403,17 +1403,18 @@ class DbrxForCausalLM(DbrxPreTrainedModel):
# if `inputs_embeds` are passed, we only want to use them in the 1st generation step
if inputs_embeds is not None and cache_position[0] == 0:
model_inputs = {"inputs_embeds": inputs_embeds}
model_inputs = {"inputs_embeds": inputs_embeds, "input_ids": None}
else:
model_inputs = {"input_ids": input_ids}
# The clone here is for the same reason as for `position_ids`.
model_inputs = {"input_ids": input_ids.clone(memory_format=torch.contiguous_format), "inputs_embeds": None}
if isinstance(past_key_values, StaticCache) and attention_mask.ndim == 2:
if inputs_embeds is not None:
batch_size, sequence_length = inputs_embeds.shape
device = inputs_embeds.device
if model_inputs["inputs_embeds"] is not None:
batch_size, sequence_length, _ = model_inputs["inputs_embeds"].shape
device = model_inputs["inputs_embeds"].device
else:
batch_size, sequence_length = input_ids.shape
device = input_ids.device
batch_size, sequence_length = model_inputs["input_ids"].shape
device = model_inputs["input_ids"].device
dtype = self.lm_head.weight.dtype
min_dtype = torch.finfo(dtype).min

View File

@ -1126,7 +1126,8 @@ class FalconForCausalLM(FalconPreTrainedModel):
if inputs_embeds is not None and past_key_values is None:
model_inputs = {"inputs_embeds": inputs_embeds}
else:
model_inputs = {"input_ids": input_ids}
# The clone here is for the same reason as for `position_ids`.
model_inputs = {"input_ids": input_ids.clone(memory_format=torch.contiguous_format), "inputs_embeds": None}
model_inputs.update(
{

View File

@ -1143,17 +1143,18 @@ class GemmaForCausalLM(GemmaPreTrainedModel):
# if `inputs_embeds` are passed, we only want to use them in the 1st generation step
if inputs_embeds is not None and cache_position[0] == 0:
model_inputs = {"inputs_embeds": inputs_embeds}
model_inputs = {"inputs_embeds": inputs_embeds, "input_ids": None}
else:
model_inputs = {"input_ids": input_ids}
# The clone here is for the same reason as for `position_ids`.
model_inputs = {"input_ids": input_ids.clone(memory_format=torch.contiguous_format), "inputs_embeds": None}
if isinstance(past_key_values, StaticCache) and attention_mask.ndim == 2:
if inputs_embeds is not None:
batch_size, sequence_length = inputs_embeds.shape
device = inputs_embeds.device
if model_inputs["inputs_embeds"] is not None:
batch_size, sequence_length, _ = model_inputs["inputs_embeds"].shape
device = model_inputs["inputs_embeds"].device
else:
batch_size, sequence_length = input_ids.shape
device = input_ids.device
batch_size, sequence_length = model_inputs["input_ids"].shape
device = model_inputs["input_ids"].device
dtype = self.lm_head.weight.dtype
min_dtype = torch.finfo(dtype).min

View File

@ -104,7 +104,6 @@ def _prepare_4d_causal_attention_mask_with_cache_position(
causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
padding_mask, min_dtype
)
return causal_mask
@ -301,7 +300,6 @@ class Gemma2Attention(nn.Module):
attn_weights = attn_weights / self.config.attn_logit_softcapping
attn_weights = torch.tanh(attn_weights)
attn_weights = attn_weights * self.config.attn_logit_softcapping
if attention_mask is not None: # no matter the length, we just slice it
causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
attn_weights = attn_weights + causal_mask
@ -429,6 +427,7 @@ class Gemma2FlashAttention2(Gemma2Attention):
dropout=dropout_rate,
softmax_scale=self.scaling,
is_causal=self.is_causal,
sliding_window=self.sliding_window,
use_top_left_mask=self._flash_attn_uses_top_left_mask,
softcap=self.config.attn_logit_softcapping if is_flash_attn_greater_or_equal("2.6.0") else None,
)
@ -501,11 +500,9 @@ class Gemma2SdpaAttention(Gemma2Attention):
key_states = repeat_kv(key_states, self.num_key_value_groups)
value_states = repeat_kv(value_states, self.num_key_value_groups)
causal_mask = attention_mask
if attention_mask is not None:
causal_mask = causal_mask[:, :, :, : key_states.shape[-2]]
# SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask,
# Reference: https://github.com/pytorch/pytorch/issues/112577.
if query_states.device.type == "cuda" and causal_mask is not None:
@ -516,7 +513,6 @@ class Gemma2SdpaAttention(Gemma2Attention):
# We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` if statement instead of an inline conditional assignment
# in SDPA to support both torch.compile's dynamic shapes and full graph options. An inline conditional prevents dynamic shapes from compiling.
is_causal = True if causal_mask is None and q_len > 1 else False
attn_output = torch.nn.functional.scaled_dot_product_attention(
query_states,
key_states,
@ -572,7 +568,8 @@ class Gemma2DecoderLayer(nn.Module):
if self.is_sliding and attention_mask is not None: # efficient SDPA and no padding
# Flash-attn is a 2D tensor
if self.config._attn_implementation == "flash_attention_2":
attention_mask = attention_mask[:, -self.sliding_window :]
if past_key_value is not None: # when decoding
attention_mask = attention_mask[:, -self.sliding_window :]
else:
min_dtype = torch.finfo(hidden_states.dtype).min
sliding_window_mask = torch.tril(
@ -581,7 +578,6 @@ class Gemma2DecoderLayer(nn.Module):
attention_mask = torch.where(sliding_window_mask, min_dtype, attention_mask)
if attention_mask.shape[-1] <= 1: # when decoding
attention_mask = attention_mask[:, :, :, -self.sliding_window :]
residual = hidden_states
hidden_states = self.input_layernorm(hidden_states)
@ -994,7 +990,6 @@ class Gemma2ForCausalLM(Gemma2PreTrainedModel):
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
outputs = self.model(
input_ids=input_ids,
@ -1061,7 +1056,6 @@ class Gemma2ForCausalLM(Gemma2PreTrainedModel):
input_ids = input_ids[:, -cache_position.shape[0] :]
elif input_ids.shape[1] != cache_position.shape[0]: # Default case (the "else", a no op, is Exception 2)
input_ids = input_ids[:, cache_position]
if attention_mask is not None and position_ids is None:
# create position_ids on the fly for batch generation
position_ids = attention_mask.long().cumsum(-1) - 1
@ -1077,22 +1071,24 @@ class Gemma2ForCausalLM(Gemma2PreTrainedModel):
# if `inputs_embeds` are passed, we only want to use them in the 1st generation step
if inputs_embeds is not None and cache_position[0] == 0:
model_inputs = {"inputs_embeds": inputs_embeds}
model_inputs = {"inputs_embeds": inputs_embeds, "input_ids": None}
else:
# The clone here is for the same reason as for `position_ids`.
model_inputs = {"input_ids": input_ids.clone(memory_format=torch.contiguous_format)}
model_inputs = {"input_ids": input_ids.clone(memory_format=torch.contiguous_format), "inputs_embeds": None}
if isinstance(past_key_values, HybridCache) and attention_mask.ndim == 2:
if inputs_embeds is not None:
batch_size, sequence_length = inputs_embeds.shape
device = inputs_embeds.device
if (
isinstance(past_key_values, HybridCache)
and attention_mask.ndim == 2
and not self.config._attn_implementation == "flash_attention_2"
):
if model_inputs["inputs_embeds"] is not None:
batch_size, sequence_length, _ = model_inputs["inputs_embeds"].shape
device = model_inputs["inputs_embeds"].device
else:
batch_size, sequence_length = input_ids.shape
device = input_ids.device
batch_size, sequence_length = model_inputs["input_ids"].shape
device = model_inputs["input_ids"].device
dtype = self.lm_head.weight.dtype
min_dtype = torch.finfo(dtype).min
attention_mask = _prepare_4d_causal_attention_mask_with_cache_position(
attention_mask,
sequence_length=sequence_length,
@ -1103,7 +1099,6 @@ class Gemma2ForCausalLM(Gemma2PreTrainedModel):
cache_position=cache_position,
batch_size=batch_size,
)
model_inputs.update(
{
"position_ids": position_ids,

View File

@ -818,7 +818,8 @@ class GPTNeoForCausalLM(GPTNeoPreTrainedModel):
if inputs_embeds is not None and past_key_values is None:
model_inputs = {"inputs_embeds": inputs_embeds}
else:
model_inputs = {"input_ids": input_ids}
# The clone here is for the same reason as for `position_ids`.
model_inputs = {"input_ids": input_ids.clone(memory_format=torch.contiguous_format), "inputs_embeds": None}
model_inputs.update(
{

View File

@ -967,7 +967,8 @@ class GPTJForCausalLM(GPTJPreTrainedModel):
if inputs_embeds is not None and past_key_values is None:
model_inputs = {"inputs_embeds": inputs_embeds}
else:
model_inputs = {"input_ids": input_ids}
# The clone here is for the same reason as for `position_ids`.
model_inputs = {"input_ids": input_ids.clone(memory_format=torch.contiguous_format), "inputs_embeds": None}
model_inputs.update(
{

View File

@ -1265,17 +1265,18 @@ class LlamaForCausalLM(LlamaPreTrainedModel):
# if `inputs_embeds` are passed, we only want to use them in the 1st generation step
if inputs_embeds is not None and cache_position[0] == 0:
model_inputs = {"inputs_embeds": inputs_embeds}
model_inputs = {"inputs_embeds": inputs_embeds, "input_ids": None}
else:
model_inputs = {"input_ids": input_ids}
# The clone here is for the same reason as for `position_ids`.
model_inputs = {"input_ids": input_ids.clone(memory_format=torch.contiguous_format), "inputs_embeds": None}
if isinstance(past_key_values, StaticCache) and attention_mask.ndim == 2:
if inputs_embeds is not None:
batch_size, sequence_length = inputs_embeds.shape
device = inputs_embeds.device
if model_inputs["inputs_embeds"] is not None:
batch_size, sequence_length, _ = model_inputs["inputs_embeds"].shape
device = model_inputs["inputs_embeds"].device
else:
batch_size, sequence_length = input_ids.shape
device = input_ids.device
batch_size, sequence_length = model_inputs["input_ids"].shape
device = model_inputs["input_ids"].device
dtype = self.lm_head.weight.dtype
min_dtype = torch.finfo(dtype).min

View File

@ -853,7 +853,7 @@ class LlavaNextVideoForConditionalGeneration(LlavaNextVideoPreTrainedModel):
inputs_embeds = self.get_input_embeddings()(input_ids)
# Merge text and images in prefill stage
if past_key_values is None:
if input_ids is not None and inputs_embeds.shape[1] != 1:
# First merge image tokens if there are any
if pixel_values is not None and pixel_values.size(0) > 0:
image_features = self._get_image_features(pixel_values, image_sizes)
@ -910,7 +910,7 @@ class LlavaNextVideoForConditionalGeneration(LlavaNextVideoPreTrainedModel):
pass
# generation with cache, decoding stage
elif past_key_values is not None and (pixel_values is not None or pixel_values_videos is not None):
elif pixel_values is not None or pixel_values_videos is not None:
# Retrieve the first layer to inspect the logits and mask out the hidden states that are set to 0
first_layer_past_key_value = past_key_values[0][0][:, :, :, 0]
# Sum all dimensions of head_dim (-2) to avoid random errors such as: https://github.com/huggingface/transformers/pull/28032#issuecomment-1863691941

View File

@ -53,6 +53,60 @@ logger = logging.get_logger(__name__)
_CONFIG_FOR_DOC = "NemotronConfig"
# Copied from transformers.models.llama.modeling_llama._prepare_4d_causal_attention_mask_with_cache_position
def _prepare_4d_causal_attention_mask_with_cache_position(
attention_mask: torch.Tensor,
sequence_length: int,
target_length: int,
dtype: torch.dtype,
device: torch.device,
min_dtype: float,
cache_position: torch.Tensor,
batch_size: int,
):
"""
Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
`(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.
Args:
attention_mask (`torch.Tensor`):
A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape `(batch_size, 1, query_length, key_value_length)`.
sequence_length (`int`):
The sequence length being processed.
target_length (`int`):
The target length: when generating with static cache, the mask should be as long as the static cache, to account for the 0 padding, the part of the cache that is not filled yet.
dtype (`torch.dtype`):
The dtype to use for the 4D attention mask.
device (`torch.device`):
The device to place the 4D attention mask on.
min_dtype (`float`):
The minimum value representable with the dtype `dtype`.
cache_position (`torch.Tensor`):
Indices depicting the position of the input sequence tokens in the sequence.
batch_size (`torch.Tensor`):
Batch size.
"""
if attention_mask is not None and attention_mask.dim() == 4:
# In this case we assume that the mask comes already in inverted form and requires no inversion or slicing.
causal_mask = attention_mask
else:
causal_mask = torch.full((sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device)
if sequence_length != 1:
causal_mask = torch.triu(causal_mask, diagonal=1)
causal_mask *= torch.arange(target_length, device=device) > cache_position.reshape(-1, 1)
causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)
if attention_mask is not None:
causal_mask = causal_mask.clone() # copy to contiguous memory for in-place edit
mask_length = attention_mask.shape[-1]
padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :]
padding_mask = padding_mask == 0
causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
padding_mask, min_dtype
)
return causal_mask
def _cast_if_autocast_enabled(*args):
if not torch.is_autocast_enabled():
return args
@ -902,27 +956,18 @@ class NemotronModel(NemotronPreTrainedModel):
else past_seen_tokens + sequence_length + 1
)
if attention_mask is not None and attention_mask.dim() == 4:
# in this case we assume that the mask comes already in inverted form and requires no inversion or slicing
if attention_mask.max() != 0:
raise ValueError("Custom 4D attention mask should be passed in inverted form with max==0`")
causal_mask = attention_mask
else:
causal_mask = torch.full(
(sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device
)
if sequence_length != 1:
causal_mask = torch.triu(causal_mask, diagonal=1)
causal_mask *= torch.arange(target_length, device=device) > cache_position.reshape(-1, 1)
causal_mask = causal_mask[None, None, :, :].expand(input_tensor.shape[0], 1, -1, -1)
if attention_mask is not None:
causal_mask = causal_mask.clone() # copy to contiguous memory for in-place edit
mask_length = attention_mask.shape[-1]
padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :]
padding_mask = padding_mask == 0
causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
padding_mask, min_dtype
)
# In case the provided `attention` mask is 2D, we generate a causal mask here (4D).
causal_mask = _prepare_4d_causal_attention_mask_with_cache_position(
attention_mask,
sequence_length=sequence_length,
target_length=target_length,
dtype=dtype,
device=device,
min_dtype=min_dtype,
cache_position=cache_position,
batch_size=input_tensor.shape[0],
)
if (
self.config._attn_implementation == "sdpa"
and attention_mask is not None
@ -1086,11 +1131,37 @@ class NemotronForCausalLM(NemotronPreTrainedModel):
if past_key_values:
position_ids = position_ids[:, -input_ids.shape[1] :]
# This `clone` call is needed to avoid recapturing cuda graphs with `torch.compile`'s `mode="reduce-overhead`, as otherwise the input `position_ids` would have various stride during the decoding. Here, simply using `.contiguous()` is not sufficient as in the batch size = 1 case, `position_ids` is already contiguous but with varying stride which retriggers a capture.
position_ids = position_ids.clone(memory_format=torch.contiguous_format)
# if `inputs_embeds` are passed, we only want to use them in the 1st generation step
if inputs_embeds is not None and cache_position[0] == 0:
model_inputs = {"inputs_embeds": inputs_embeds}
model_inputs = {"inputs_embeds": inputs_embeds, "input_ids": None}
else:
model_inputs = {"input_ids": input_ids.contiguous()} # `contiguous()` needed for compilation use cases
# The clone here is for the same reason as for `position_ids`.
model_inputs = {"input_ids": input_ids.clone(memory_format=torch.contiguous_format), "inputs_embeds": None}
if isinstance(past_key_values, StaticCache) and attention_mask.ndim == 2:
if model_inputs["inputs_embeds"] is not None:
batch_size, sequence_length, _ = model_inputs["inputs_embeds"].shape
device = model_inputs["inputs_embeds"].device
else:
batch_size, sequence_length = model_inputs["input_ids"].shape
device = model_inputs["input_ids"].device
dtype = self.lm_head.weight.dtype
min_dtype = torch.finfo(dtype).min
attention_mask = _prepare_4d_causal_attention_mask_with_cache_position(
attention_mask,
sequence_length=sequence_length,
target_length=past_key_values.get_max_length(),
dtype=dtype,
device=device,
min_dtype=min_dtype,
cache_position=cache_position,
batch_size=batch_size,
)
model_inputs.update(
{
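Toy demonstration of the mask helper that this diff copies into Nemotron (local tensors only, mirroring the body of _prepare_4d_causal_attention_mask_with_cache_position rather than importing it): a 2D padding mask is turned into the 4D, min-dtype-filled causal mask expected by the static-cache path.

import torch

batch_size, sequence_length, target_length = 1, 3, 5
dtype, device = torch.float32, "cpu"
min_dtype = torch.finfo(dtype).min
cache_position = torch.arange(sequence_length, device=device)
attention_mask = torch.tensor([[1, 1, 1, 0, 0]])  # 2D padding mask over the cache length

causal_mask = torch.full((sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device)
causal_mask = torch.triu(causal_mask, diagonal=1)
causal_mask *= torch.arange(target_length, device=device) > cache_position.reshape(-1, 1)
causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1).clone()
mask_length = attention_mask.shape[-1]
padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :]
causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(padding_mask == 0, min_dtype)

print(causal_mask.shape)  # torch.Size([1, 1, 3, 5]); padded and future positions hold min_dtype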

View File

@ -1176,17 +1176,18 @@ class OlmoForCausalLM(OlmoPreTrainedModel):
# if `inputs_embeds` are passed, we only want to use them in the 1st generation step
if inputs_embeds is not None and cache_position[0] == 0:
model_inputs = {"inputs_embeds": inputs_embeds}
model_inputs = {"inputs_embeds": inputs_embeds, "input_ids": None}
else:
model_inputs = {"input_ids": input_ids}
# The clone here is for the same reason as for `position_ids`.
model_inputs = {"input_ids": input_ids.clone(memory_format=torch.contiguous_format), "inputs_embeds": None}
if isinstance(past_key_values, StaticCache) and attention_mask.ndim == 2:
if inputs_embeds is not None:
batch_size, sequence_length = inputs_embeds.shape
device = inputs_embeds.device
if model_inputs["inputs_embeds"] is not None:
batch_size, sequence_length, _ = model_inputs["inputs_embeds"].shape
device = model_inputs["inputs_embeds"].device
else:
batch_size, sequence_length = input_ids.shape
device = input_ids.device
batch_size, sequence_length = model_inputs["input_ids"].shape
device = model_inputs["input_ids"].device
dtype = self.lm_head.weight.dtype
min_dtype = torch.finfo(dtype).min

View File

@ -993,17 +993,18 @@ class PersimmonForCausalLM(PersimmonPreTrainedModel):
# if `inputs_embeds` are passed, we only want to use them in the 1st generation step
if inputs_embeds is not None and cache_position[0] == 0:
model_inputs = {"inputs_embeds": inputs_embeds}
model_inputs = {"inputs_embeds": inputs_embeds, "input_ids": None}
else:
model_inputs = {"input_ids": input_ids}
# The clone here is for the same reason as for `position_ids`.
model_inputs = {"input_ids": input_ids.clone(memory_format=torch.contiguous_format), "inputs_embeds": None}
if isinstance(past_key_values, StaticCache) and attention_mask.ndim == 2:
if inputs_embeds is not None:
batch_size, sequence_length = inputs_embeds.shape
device = inputs_embeds.device
if model_inputs["inputs_embeds"] is not None:
batch_size, sequence_length, _ = model_inputs["inputs_embeds"].shape
device = model_inputs["inputs_embeds"].device
else:
batch_size, sequence_length = input_ids.shape
device = input_ids.device
batch_size, sequence_length = model_inputs["input_ids"].shape
device = model_inputs["input_ids"].device
dtype = self.lm_head.weight.dtype
min_dtype = torch.finfo(dtype).min

View File

@ -1278,17 +1278,18 @@ class PhiForCausalLM(PhiPreTrainedModel):
# if `inputs_embeds` are passed, we only want to use them in the 1st generation step
if inputs_embeds is not None and cache_position[0] == 0:
model_inputs = {"inputs_embeds": inputs_embeds}
model_inputs = {"inputs_embeds": inputs_embeds, "input_ids": None}
else:
model_inputs = {"input_ids": input_ids}
# The clone here is for the same reason as for `position_ids`.
model_inputs = {"input_ids": input_ids.clone(memory_format=torch.contiguous_format), "inputs_embeds": None}
if isinstance(past_key_values, StaticCache) and attention_mask.ndim == 2:
if inputs_embeds is not None:
batch_size, sequence_length = inputs_embeds.shape
device = inputs_embeds.device
if model_inputs["inputs_embeds"] is not None:
batch_size, sequence_length, _ = model_inputs["inputs_embeds"].shape
device = model_inputs["inputs_embeds"].device
else:
batch_size, sequence_length = input_ids.shape
device = input_ids.device
batch_size, sequence_length = model_inputs["input_ids"].shape
device = model_inputs["input_ids"].device
dtype = self.lm_head.weight.dtype
min_dtype = torch.finfo(dtype).min

View File

@ -540,7 +540,7 @@ class Phi3FlashAttention2(Phi3Attention):
max(kv_seq_len, position_ids[:, -1].max().item() + 1) if position_ids is not None else kv_seq_len
)
cos, sin = self.rotary_emb(value_states, seq_len=rotary_seq_len)
cos, sin = self.rotary_emb(value_states, seq_len=rotary_seq_len, position_ids=position_ids)
query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
@ -1318,17 +1318,18 @@ class Phi3ForCausalLM(Phi3PreTrainedModel):
# if `inputs_embeds` are passed, we only want to use them in the 1st generation step
if inputs_embeds is not None and cache_position[0] == 0:
model_inputs = {"inputs_embeds": inputs_embeds}
model_inputs = {"inputs_embeds": inputs_embeds, "input_ids": None}
else:
model_inputs = {"input_ids": input_ids}
# The clone here is for the same reason as for `position_ids`.
model_inputs = {"input_ids": input_ids.clone(memory_format=torch.contiguous_format), "inputs_embeds": None}
if isinstance(past_key_values, StaticCache) and attention_mask.ndim == 2:
if inputs_embeds is not None:
batch_size, sequence_length = inputs_embeds.shape
device = inputs_embeds.device
if model_inputs["inputs_embeds"] is not None:
batch_size, sequence_length, _ = model_inputs["inputs_embeds"].shape
device = model_inputs["inputs_embeds"].device
else:
batch_size, sequence_length = input_ids.shape
device = input_ids.device
batch_size, sequence_length = model_inputs["input_ids"].shape
device = model_inputs["input_ids"].device
dtype = self.lm_head.weight.dtype
min_dtype = torch.finfo(dtype).min

View File

@ -1176,17 +1176,18 @@ class Qwen2ForCausalLM(Qwen2PreTrainedModel):
# if `inputs_embeds` are passed, we only want to use them in the 1st generation step
if inputs_embeds is not None and cache_position[0] == 0:
model_inputs = {"inputs_embeds": inputs_embeds}
model_inputs = {"inputs_embeds": inputs_embeds, "input_ids": None}
else:
model_inputs = {"input_ids": input_ids}
# The clone here is for the same reason as for `position_ids`.
model_inputs = {"input_ids": input_ids.clone(memory_format=torch.contiguous_format), "inputs_embeds": None}
if isinstance(past_key_values, StaticCache) and attention_mask.ndim == 2:
if inputs_embeds is not None:
batch_size, sequence_length = inputs_embeds.shape
device = inputs_embeds.device
if model_inputs["inputs_embeds"] is not None:
batch_size, sequence_length, _ = model_inputs["inputs_embeds"].shape
device = model_inputs["inputs_embeds"].device
else:
batch_size, sequence_length = input_ids.shape
device = input_ids.device
batch_size, sequence_length = model_inputs["input_ids"].shape
device = model_inputs["input_ids"].device
dtype = self.lm_head.weight.dtype
min_dtype = torch.finfo(dtype).min

View File

@ -1372,17 +1372,18 @@ class Qwen2MoeForCausalLM(Qwen2MoePreTrainedModel):
# if `inputs_embeds` are passed, we only want to use them in the 1st generation step
if inputs_embeds is not None and cache_position[0] == 0:
model_inputs = {"inputs_embeds": inputs_embeds}
model_inputs = {"inputs_embeds": inputs_embeds, "input_ids": None}
else:
model_inputs = {"input_ids": input_ids}
# The clone here is for the same reason as for `position_ids`.
model_inputs = {"input_ids": input_ids.clone(memory_format=torch.contiguous_format), "inputs_embeds": None}
if isinstance(past_key_values, StaticCache) and attention_mask.ndim == 2:
if inputs_embeds is not None:
batch_size, sequence_length = inputs_embeds.shape
device = inputs_embeds.device
if model_inputs["inputs_embeds"] is not None:
batch_size, sequence_length, _ = model_inputs["inputs_embeds"].shape
device = model_inputs["inputs_embeds"].device
else:
batch_size, sequence_length = input_ids.shape
device = input_ids.device
batch_size, sequence_length = model_inputs["input_ids"].shape
device = model_inputs["input_ids"].device
dtype = self.lm_head.weight.dtype
min_dtype = torch.finfo(dtype).min

View File

@ -818,6 +818,7 @@ class RecurrentGemmaForCausalLM(RecurrentGemmaPreTrainedModel):
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
cache_position: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
@ -858,6 +859,7 @@ class RecurrentGemmaForCausalLM(RecurrentGemmaPreTrainedModel):
output_hidden_states = True
outputs = self.model(
input_ids=input_ids,
position_ids=position_ids,
cache_position=cache_position,
attention_mask=attention_mask,
inputs_embeds=inputs_embeds,
@ -913,13 +915,17 @@ class RecurrentGemmaForCausalLM(RecurrentGemmaPreTrainedModel):
if past_length > 0:
position_ids = position_ids[:, past_length:]
if inputs_embeds is not None:
model_inputs = {"inputs_embeds": inputs_embeds[:, past_length:]}
else:
model_inputs = {"input_ids": input_ids[:, past_length:].contiguous()}
if inputs_embeds is not None: # Exception 1
input_ids = input_ids[:, -cache_position.shape[0] :]
elif input_ids.shape[1] != cache_position.shape[0]: # Default case (the "else", a no op, is Exception 2)
input_ids = input_ids[:, cache_position]
if cache_position is not None:
cache_position = cache_position[-position_ids.shape[1] :]
# if `inputs_embeds` are passed, we only want to use them in the 1st generation step
if inputs_embeds is not None and cache_position[0] == 0:
model_inputs = {"inputs_embeds": inputs_embeds, "input_ids": None}
else:
# The clone here is for the same reason as for `position_ids`.
model_inputs = {"input_ids": input_ids.clone(memory_format=torch.contiguous_format), "inputs_embeds": None}
model_inputs.update(
{
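
A hedged sketch of what the restored argument enables (the hub id is illustrative and may require accepting the Gemma license; explicit position_ids are optional and are derived here only to show the plumbing):

from transformers import AutoModelForCausalLM, AutoTokenizer

model = AutoModelForCausalLM.from_pretrained("google/recurrentgemma-2b")
tokenizer = AutoTokenizer.from_pretrained("google/recurrentgemma-2b")

inputs = tokenizer("The capital of France is", return_tensors="pt")
position_ids = inputs.attention_mask.long().cumsum(-1) - 1  # standard left-to-right positions

# `position_ids` is accepted by the forward pass again and routed through to the backbone.
logits = model(
    input_ids=inputs.input_ids,
    attention_mask=inputs.attention_mask,
    position_ids=position_ids,
).logits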

View File

@ -1271,17 +1271,18 @@ class StableLmForCausalLM(StableLmPreTrainedModel):
# if `inputs_embeds` are passed, we only want to use them in the 1st generation step
if inputs_embeds is not None and cache_position[0] == 0:
model_inputs = {"inputs_embeds": inputs_embeds}
model_inputs = {"inputs_embeds": inputs_embeds, "input_ids": None}
else:
model_inputs = {"input_ids": input_ids}
# The clone here is for the same reason as for `position_ids`.
model_inputs = {"input_ids": input_ids.clone(memory_format=torch.contiguous_format), "inputs_embeds": None}
if isinstance(past_key_values, StaticCache) and attention_mask.ndim == 2:
if inputs_embeds is not None:
batch_size, sequence_length = inputs_embeds.shape
device = inputs_embeds.device
if model_inputs["inputs_embeds"] is not None:
batch_size, sequence_length, _ = model_inputs["inputs_embeds"].shape
device = model_inputs["inputs_embeds"].device
else:
batch_size, sequence_length = input_ids.shape
device = input_ids.device
batch_size, sequence_length = model_inputs["input_ids"].shape
device = model_inputs["input_ids"].device
dtype = self.lm_head.weight.dtype
min_dtype = torch.finfo(dtype).min

View File

@ -1152,17 +1152,18 @@ class Starcoder2ForCausalLM(Starcoder2PreTrainedModel):
# if `inputs_embeds` are passed, we only want to use them in the 1st generation step
if inputs_embeds is not None and cache_position[0] == 0:
model_inputs = {"inputs_embeds": inputs_embeds}
model_inputs = {"inputs_embeds": inputs_embeds, "input_ids": None}
else:
model_inputs = {"input_ids": input_ids}
# The clone here is for the same reason as for `position_ids`.
model_inputs = {"input_ids": input_ids.clone(memory_format=torch.contiguous_format), "inputs_embeds": None}
if isinstance(past_key_values, StaticCache) and attention_mask.ndim == 2:
if inputs_embeds is not None:
batch_size, sequence_length = inputs_embeds.shape
device = inputs_embeds.device
if model_inputs["inputs_embeds"] is not None:
batch_size, sequence_length, _ = model_inputs["inputs_embeds"].shape
device = model_inputs["inputs_embeds"].device
else:
batch_size, sequence_length = input_ids.shape
device = input_ids.device
batch_size, sequence_length = model_inputs["input_ids"].shape
device = model_inputs["input_ids"].device
dtype = self.lm_head.weight.dtype
min_dtype = torch.finfo(dtype).min

View File

@ -653,9 +653,6 @@ class VideoLlavaForConditionalGeneration(VideoLlavaPreTrainedModel):
if cache_length < past_length and attention_mask is not None:
attention_mask = attention_mask[:, -(cache_length + input_ids.shape[1]) :]
pixel_values_videos = None
pixel_values_images = None
position_ids = kwargs.get("position_ids", None)
if attention_mask is not None and position_ids is None:
# create position_ids on the fly for batch generation
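
A rough, non-authoritative sketch of the generation path this touches, assuming the usual Video-LLaVA processor/model pairing (the hub id and the zero-filled frames are placeholders for real inputs):

import numpy as np
from transformers import VideoLlavaForConditionalGeneration, VideoLlavaProcessor

processor = VideoLlavaProcessor.from_pretrained("LanguageBind/Video-LLaVA-7B-hf")
model = VideoLlavaForConditionalGeneration.from_pretrained("LanguageBind/Video-LLaVA-7B-hf")

video = np.zeros((8, 224, 224, 3), dtype=np.uint8)  # stand-in for 8 decoded video frames
prompt = "USER: <video>\nWhat is happening in the video? ASSISTANT:"
inputs = processor(text=prompt, videos=video, return_tensors="pt")

# With the pixel-value reset removed above, visual inputs are handled consistently
# when generation continues from a populated cache.
out = model.generate(**inputs, max_new_tokens=40)
print(processor.batch_decode(out, skip_special_tokens=True)[0])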

View File

@ -12,7 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import importlib
import inspect
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union
from packaging import version
@ -200,16 +199,11 @@ class Bnb4BitHfQuantizer(HfQuantizer):
if unexpected_keys is not None and k in unexpected_keys:
unexpected_keys.remove(k)
param_kwargs = {}
sig = inspect.signature(bnb.nn.Params4bit.from_prequantized)
if "module" in sig.parameters:
param_kwargs["module"] = module
new_value = bnb.nn.Params4bit.from_prequantized(
data=param_value,
quantized_stats=quantized_stats,
requires_grad=False,
device=target_device,
**param_kwargs,
)
else:
new_value = param_value.to("cpu")
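
The branch above is hit when a checkpoint already ships serialized bitsandbytes 4-bit weights, in which case `Params4bit.from_prequantized` rebuilds them on the target device. A hedged sketch of the user-facing call (the repo id is a placeholder for any such pre-quantized checkpoint; bitsandbytes and accelerate must be installed):

from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("my-org/my-model-bnb-4bit", device_map="auto")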

View File

@ -100,7 +100,6 @@ from .trainer_pt_utils import (
get_model_param_count,
get_module_class_from_name,
get_parameter_names,
is_deepspeed_zero3_enabled,
nested_concat,
nested_detach,
nested_numpify,
@ -436,15 +435,6 @@ class Trainer:
)
self.model_init = model_init
# Will reach this branch if the user has
# 1. Used `.from_pretrained` or `.from_config` to initialize their model
# 2. Did not configure Zero-3 via `TrainingArguments` or `accelerate launch` beforehand
# New models init such as `MyModel()` will not hit this step
if is_deepspeed_zero3_enabled() and not getattr(model, "_transformers_zero3_init_used", True):
raise ValueError(
"Model was not initialized with `Zero-3` despite being configured for DeepSpeed Zero-3. Please re-initialize your model via `Model.from_pretrained(...)` or `Model.from_config(...)` after creating your `TrainingArguments`!"
)
if model.__class__.__name__ in MODEL_MAPPING_NAMES:
raise ValueError(
f"The model you have picked ({model.__class__.__name__}) cannot be used as is for training: it only "

View File

@ -692,7 +692,7 @@ def is_torchdynamo_compiling():
import torch
return torch.compiler.is_compiling()
except AttributeError:
except Exception:
try:
import torch._dynamo as dynamo # noqa: F401
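
Usage is unchanged; the wider `except Exception` only makes the probe more robust on torch builds where `torch.compiler.is_compiling` exists but is unusable. A typical call site, as a sketch:

from transformers.utils import is_torchdynamo_compiling

if not is_torchdynamo_compiling():
    # safe to run data-dependent Python that would otherwise break the compiled graph
    print("running eagerly")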

View File

@ -331,6 +331,21 @@ def warning_once(self, *args, **kwargs):
logging.Logger.warning_once = warning_once
@functools.lru_cache(None)
def info_once(self, *args, **kwargs):
"""
This method is identical to `logger.info()`, but will emit an info message with the same content only once.
Note: The cache is keyed on the function arguments, so 2 different callers using the same arguments will hit the cache.
The assumption here is that all info messages are unique across the code. If they aren't, we would need to switch to
another type of cache that includes the caller frame information in the hashing function.
"""
self.info(*args, **kwargs)
logging.Logger.info_once = info_once
class EmptyTqdm:
"""Dummy tqdm which doesn't do anything."""

View File

@ -709,30 +709,34 @@ class TrainerIntegrationDeepSpeed(TrainerIntegrationDeepSpeedWithCustomConfig, T
# Relative difference. See the note above how to get identical loss on a small bs
self.assertTrue((no_grad_accum_loss - yes_grad_accum_loss) / (no_grad_accum_loss + 1e-15) <= 1e-3)
def test_missed_zero3_init(self):
from transformers import Trainer # noqa
# NOTE: Currently a disabled test. In the future we should re-enable it.
# Issue revolves around Zero-3 w/ DPO/TRL + DeepSpeed
# As well as Zero-3 inference
# Related PR: https://github.com/huggingface/transformers/pull/32299
# def test_missed_zero3_init(self):
# from transformers import Trainer # noqa
with mockenv_context(**self.dist_env_1_gpu):
model = AutoModel.from_pretrained(T5_TINY)
training_args = TrainingArguments(
output_dir="./test_missed_zero3_init",
deepspeed=self.get_config_dict(ZERO3),
)
with self.assertRaises(
ValueError, msg="Model was not initialized with `Zero-3` despite being configured."
):
_ = Trainer(
model=model,
args=training_args,
)
# Now do it properly, triggered from our `TrainingArguments` earlier
model = AutoModel.from_pretrained(T5_TINY)
trainer = Trainer(
model=model,
args=training_args,
)
assert trainer.is_deepspeed_enabled
assert model._transformers_zero3_init_used
# with mockenv_context(**self.dist_env_1_gpu):
# model = AutoModel.from_pretrained(T5_TINY)
# training_args = TrainingArguments(
# output_dir="./test_missed_zero3_init",
# deepspeed=self.get_config_dict(ZERO3),
# )
# with self.assertRaises(
# ValueError, msg="Model was not initialized with `Zero-3` despite being configured."
# ):
# _ = Trainer(
# model=model,
# args=training_args,
# )
# # Now do it properly, triggered from our `TrainingArguments` earlier
# model = AutoModel.from_pretrained(T5_TINY)
# trainer = Trainer(
# model=model,
# args=training_args,
# )
# assert trainer.is_deepspeed_enabled
# assert model._transformers_zero3_init_used
def check_saved_checkpoints_deepspeed(self, output_dir, freq, total, stage, dtype):
# adapted from TrainerIntegrationCommon.check_saved_checkpoints

View File

@ -1540,3 +1540,8 @@ class BartStandaloneDecoderModelTest(ModelTesterMixin, GenerationTesterMixin, un
@unittest.skip
def test_save_load_fast_init_from_base(self):
pass
@unittest.skip(reason="Generate needs input ids")
def test_inputs_embeds_matches_input_ids_with_generate(self):
# generate only works with input ids for BartForCausalLM
pass

View File

@ -502,6 +502,11 @@ class BertModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin
config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder()
self.model_tester.create_and_check_model_as_decoder(*config_and_inputs)
@unittest.skip(reason="Generate needs input ids")
def test_inputs_embeds_matches_input_ids_with_generate(self):
# generate only works with input ids for BertForCausalLM
pass
def test_model_as_decoder_with_default_input_mask(self):
# This regression test was failing with PyTorch < 1.3
(

View File

@ -21,6 +21,7 @@ import unittest
import numpy as np
import requests
from parameterized import parameterized
from transformers import BlipConfig, BlipTextConfig, BlipVisionConfig
from transformers.testing_utils import (
@ -1106,6 +1107,7 @@ class BlipTextRetrievalModelTest(ModelTesterMixin, unittest.TestCase):
@require_torch
class BlipTextImageModelTest(ModelTesterMixin, unittest.TestCase):
all_model_classes = (BlipForConditionalGeneration,) if is_torch_available() else ()
all_generative_model_classes = (BlipForConditionalGeneration,) if is_torch_available() else ()
fx_compatible = False
test_head_masking = False
test_pruning = False
@ -1116,6 +1118,18 @@ class BlipTextImageModelTest(ModelTesterMixin, unittest.TestCase):
def setUp(self):
self.model_tester = BlipTextImageModelsModelTester(self)
@parameterized.expand([(True,), (False,)])
def test_greedy_generation(self, use_cache: bool):
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
for model_class in self.all_generative_model_classes:
model = model_class(config)
model.to(torch_device)
model.eval()
out = model.generate(**inputs_dict, min_new_tokens=20, max_new_tokens=20, use_cache=use_cache)
self.assertTrue(out.shape[1] == inputs_dict["input_ids"].shape[1] + 19)
def test_model(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_model(*config_and_inputs)

View File

@ -20,6 +20,7 @@ import unittest
import numpy as np
import requests
from parameterized import parameterized
from transformers import CONFIG_MAPPING, Blip2Config, Blip2QFormerConfig, Blip2VisionConfig
from transformers.testing_utils import (
@ -314,7 +315,7 @@ class Blip2TextModelDecoderOnlyTester:
hidden_act="gelu",
hidden_dropout_prob=0.1,
attention_probs_dropout_prob=0.1,
max_position_embeddings=20,
max_position_embeddings=256,
eos_token_id=2,
pad_token_id=1,
bos_token_id=0,
@ -436,8 +437,9 @@ class Blip2ForConditionalGenerationDecoderOnlyModelTester:
@require_torch
class Blip2ForConditionalGenerationDecoderOnlyTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase):
class Blip2ForConditionalGenerationDecoderOnlyTest(ModelTesterMixin, unittest.TestCase):
all_model_classes = (Blip2ForConditionalGeneration,) if is_torch_available() else ()
all_generative_model_classes = (Blip2ForConditionalGeneration,) if is_torch_available() else ()
fx_compatible = False
test_head_masking = False
test_pruning = False
@ -448,6 +450,18 @@ class Blip2ForConditionalGenerationDecoderOnlyTest(ModelTesterMixin, GenerationT
def setUp(self):
self.model_tester = Blip2ForConditionalGenerationDecoderOnlyModelTester(self)
@parameterized.expand([(True,), (False,)])
def test_greedy_generation(self, use_cache: bool):
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
for model_class in self.all_generative_model_classes:
model = model_class(config)
model.to(torch_device)
model.eval()
out = model.generate(**inputs_dict, min_new_tokens=20, max_new_tokens=20, use_cache=use_cache)
self.assertTrue(out.shape[1] == 21) # BLIP is special, so should be 21
def test_for_conditional_generation(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_for_conditional_generation(*config_and_inputs)

View File

@ -20,6 +20,7 @@ import unittest
import numpy as np
import requests
from parameterized import parameterized
from transformers import (
CONFIG_MAPPING,
@ -38,7 +39,6 @@ from transformers.testing_utils import (
)
from transformers.utils import is_torch_available, is_vision_available
from ...generation.test_utils import GenerationTesterMixin
from ...test_configuration_common import ConfigTester
from ...test_modeling_common import (
ModelTesterMixin,
@ -319,7 +319,7 @@ class InstructBlipTextModelDecoderOnlyTester:
hidden_act="gelu",
hidden_dropout_prob=0.1,
attention_probs_dropout_prob=0.1,
max_position_embeddings=20,
max_position_embeddings=256,
eos_token_id=2,
pad_token_id=1,
bos_token_id=0,
@ -452,8 +452,9 @@ class InstructBlipForConditionalGenerationDecoderOnlyModelTester:
@require_torch
class InstructBlipForConditionalGenerationDecoderOnlyTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase):
class InstructBlipForConditionalGenerationDecoderOnlyTest(ModelTesterMixin, unittest.TestCase):
all_model_classes = (InstructBlipForConditionalGeneration,) if is_torch_available() else ()
all_generative_model_classes = (InstructBlipForConditionalGeneration,) if is_torch_available() else ()
fx_compatible = False
test_head_masking = False
test_pruning = False
@ -464,6 +465,19 @@ class InstructBlipForConditionalGenerationDecoderOnlyTest(ModelTesterMixin, Gene
def setUp(self):
self.model_tester = InstructBlipForConditionalGenerationDecoderOnlyModelTester(self)
@parameterized.expand([(True,), (False,)])
def test_greedy_generation(self, use_cache: bool):
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
for model_class in self.all_generative_model_classes:
model = model_class(config)
model.to(torch_device)
model.eval()
model.config.text_config.architectures = ["OptForCausalLM"]
out = model.generate(**inputs_dict, min_new_tokens=20, max_new_tokens=20, use_cache=use_cache)
self.assertTrue(out.shape[1] == 21) # BLIP is special, therefore 21
def test_for_conditional_generation(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_for_conditional_generation(*config_and_inputs)

View File

@ -20,6 +20,7 @@ import unittest
import numpy as np
from huggingface_hub import hf_hub_download
from parameterized import parameterized
from transformers import (
CONFIG_MAPPING,
@ -38,7 +39,6 @@ from transformers.testing_utils import (
)
from transformers.utils import is_torch_available, is_vision_available
from ...generation.test_utils import GenerationTesterMixin
from ...test_configuration_common import ConfigTester
from ...test_modeling_common import (
ModelTesterMixin,
@ -333,7 +333,7 @@ class InstructBlipVideoTextModelDecoderOnlyTester:
hidden_act="gelu",
hidden_dropout_prob=0.1,
attention_probs_dropout_prob=0.1,
max_position_embeddings=100,
max_position_embeddings=256,
eos_token_id=2,
pad_token_id=1,
bos_token_id=0,
@ -471,10 +471,9 @@ class InstructBlipVideoForConditionalGenerationDecoderOnlyModelTester:
@require_torch
class InstructBlipVideoForConditionalGenerationDecoderOnlyTest(
ModelTesterMixin, GenerationTesterMixin, unittest.TestCase
):
class InstructBlipVideoForConditionalGenerationDecoderOnlyTest(ModelTesterMixin, unittest.TestCase):
all_model_classes = (InstructBlipVideoForConditionalGeneration,) if is_torch_available() else ()
all_generative_model_classes = (InstructBlipVideoForConditionalGeneration,) if is_torch_available() else ()
fx_compatible = False
test_head_masking = False
test_pruning = False
@ -485,6 +484,19 @@ class InstructBlipVideoForConditionalGenerationDecoderOnlyTest(
def setUp(self):
self.model_tester = InstructBlipVideoForConditionalGenerationDecoderOnlyModelTester(self)
@parameterized.expand([(True,), (False,)])
def test_greedy_generation(self, use_cache: bool):
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
for model_class in self.all_generative_model_classes:
model = model_class(config)
model.to(torch_device)
model.eval()
model.config.text_config.architectures = ["OptForCausalLM"]
out = model.generate(**inputs_dict, min_new_tokens=20, max_new_tokens=20, use_cache=use_cache)
self.assertTrue(out.shape[1] == 21) # BLIP is special, therefore 21
def test_for_conditional_generation(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_for_conditional_generation(*config_and_inputs)

View File

@ -281,6 +281,17 @@ class Kosmos2ModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase)
self.model_tester = Kosmos2ModelTester(self)
self.config_tester = ConfigTester(self, config_class=Kosmos2Config, hidden_size=37)
def test_greedy_generation(self):
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
for model_class in self.all_generative_model_classes:
model = model_class(config)
model.to(torch_device)
model.eval()
out = model.generate(**inputs_dict, min_new_tokens=20, max_new_tokens=20)
self.assertTrue(out.shape[1] == inputs_dict["input_ids"].shape[1] + 20)
# overwrite from common to skip `image_to_text_projection.latent_query`
def test_initialization(self):
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

View File

@ -18,6 +18,7 @@ import gc
import unittest
import requests
from parameterized import parameterized
from transformers import (
AutoProcessor,
@ -80,7 +81,7 @@ class LlavaVisionText2TextModelTester:
"initializer_range": 0.02,
"num_labels": 3,
"num_choices": 4,
"pad_token_id": 0,
"pad_token_id": 1,
},
is_training=True,
vision_config={
@ -148,6 +149,8 @@ class LlavaVisionText2TextModelTester:
config, pixel_values = config_and_inputs
input_ids = ids_tensor([self.batch_size, self.seq_length], config.text_config.vocab_size - 1) + 1
attention_mask = input_ids.ne(1).to(torch_device)
# set to random non-image token to prevent flakiness
input_ids[input_ids == config.image_token_index] = 1
# we are giving 3 images let's make sure we pass in 3 image tokens
input_ids[:, 1] = config.image_token_index
inputs_dict = {
@ -178,6 +181,7 @@ class LlavaForConditionalGenerationModelTest(ModelTesterMixin, unittest.TestCase
"""
all_model_classes = (LlavaForConditionalGeneration,) if is_torch_available() else ()
all_generative_model_classes = (LlavaForConditionalGeneration,) if is_torch_available() else ()
pipeline_model_mapping = {"image-to-text": LlavaForConditionalGeneration} if is_torch_available() else {}
test_pruning = False
test_head_masking = False
@ -186,6 +190,24 @@ class LlavaForConditionalGenerationModelTest(ModelTesterMixin, unittest.TestCase
self.model_tester = LlavaVisionText2TextModelTester(self)
self.config_tester = ConfigTester(self, config_class=LlavaConfig, has_text_modality=False)
@parameterized.expand([(True,), (False,)])
def test_greedy_generation(self, use_cache: bool):
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
for model_class in self.all_generative_model_classes:
model = model_class(config)
model.to(torch_device)
model.eval()
out = model.generate(
**inputs_dict,
min_new_tokens=20,
max_new_tokens=20,
use_cache=use_cache,
bad_words_ids=[[config.image_token_index]],
)
self.assertTrue(out.shape[1] == inputs_dict["input_ids"].shape[1] + 20)
@unittest.skip(
reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
)
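
The new tests above rely on `bad_words_ids` to stop `generate` from sampling the image placeholder token. A standalone illustration of the same knob, using an arbitrary text-only model and an arbitrary banned token:

from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")

banned_id = tokenizer.convert_tokens_to_ids("Ġthe")  # ban one common BPE token just to show the effect
inputs = tokenizer("The quick brown fox jumps", return_tensors="pt")

out = model.generate(**inputs, max_new_tokens=10, do_sample=False, bad_words_ids=[[banned_id]])
print(tokenizer.decode(out[0]))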

View File

@ -19,6 +19,7 @@ import unittest
import requests
from huggingface_hub import hf_hub_download
from parameterized import parameterized
from transformers import (
AutoProcessor,
@ -34,7 +35,6 @@ from transformers.testing_utils import (
torch_device,
)
from ...generation.test_utils import GenerationTesterMixin
from ...test_configuration_common import ConfigTester
from ...test_modeling_common import (
ModelTesterMixin,
@ -86,7 +86,7 @@ class LlavaNextVisionText2TextModelTester:
"initializer_range": 0.02,
"num_labels": 3,
"num_choices": 4,
"pad_token_id": 0,
"pad_token_id": 1,
},
is_training=True,
vision_config={
@ -157,6 +157,8 @@ class LlavaNextVisionText2TextModelTester:
config, pixel_values = config_and_inputs
input_ids = ids_tensor([self.batch_size, self.seq_length], config.text_config.vocab_size - 2) + 2
attention_mask = torch.ones(input_ids.shape, dtype=torch.long).to(torch_device)
# set to random non-image token to prevent flakiness
input_ids[input_ids == config.image_token_index] = 2
# we are giving 3 images let's make sure we pass in 3 image tokens
input_ids[:, 1] = config.image_token_index
labels = torch.zeros((self.batch_size, self.seq_length), dtype=torch.long, device=torch_device)
@ -208,12 +210,13 @@ class LlavaNextVisionText2TextModelTester:
@require_torch
class LlavaNextForConditionalGenerationModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase):
class LlavaNextForConditionalGenerationModelTest(ModelTesterMixin, unittest.TestCase):
"""
Model tester for `LlavaNextForConditionalGeneration`.
"""
all_model_classes = (LlavaNextForConditionalGeneration,) if is_torch_available() else ()
all_generative_model_classes = (LlavaNextForConditionalGeneration,) if is_torch_available() else ()
test_pruning = False
test_head_masking = False
@ -237,6 +240,24 @@ class LlavaNextForConditionalGenerationModelTest(ModelTesterMixin, GenerationTes
msg=f"Parameter {name} of model {model_class} seems not properly initialized",
)
@parameterized.expand([(True,), (False,)])
def test_greedy_generation(self, use_cache: bool):
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
for model_class in self.all_generative_model_classes:
model = model_class(config)
model.to(torch_device)
model.eval()
out = model.generate(
**inputs_dict,
min_new_tokens=20,
max_new_tokens=20,
use_cache=use_cache,
bad_words_ids=[[config.image_token_index]],
)
self.assertTrue(out.shape[1] == inputs_dict["input_ids"].shape[1] + 20)
@unittest.skip(
reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
)

View File

@ -19,6 +19,7 @@ import unittest
import numpy as np
from huggingface_hub import hf_hub_download
from parameterized import parameterized
from transformers import (
AutoProcessor,
@ -34,7 +35,6 @@ from transformers.testing_utils import (
torch_device,
)
from ...generation.test_utils import GenerationTesterMixin
from ...test_configuration_common import ConfigTester
from ...test_modeling_common import (
ModelTesterMixin,
@ -86,7 +86,7 @@ class LlavaNextVideoVisionText2TextModelTester:
"initializer_range": 0.02,
"num_labels": 3,
"num_choices": 4,
"pad_token_id": 0,
"pad_token_id": 1,
},
is_training=True,
vision_config={
@ -167,6 +167,9 @@ class LlavaNextVideoVisionText2TextModelTester:
config, pixel_values, pixel_values_videos = self.prepare_config_and_inputs()
input_ids = ids_tensor([self.batch_size, self.seq_length], config.text_config.vocab_size - 2) + 2
attention_mask = torch.ones(input_ids.shape, dtype=torch.long).to(torch_device)
# set to random non-image token to prevent flakiness
input_ids[input_ids == config.image_token_index] = 2
input_ids[input_ids == config.video_token_index] = 2
# we are giving 3 images and videos let's make sure we pass in 3 special tokens
input_ids[:, 1] = config.image_token_index
input_ids[:, 2] = config.video_token_index
@ -223,12 +226,13 @@ class LlavaNextVideoVisionText2TextModelTester:
@require_torch
class LlavaNextVideoForConditionalGenerationModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase):
class LlavaNextVideoForConditionalGenerationModelTest(ModelTesterMixin, unittest.TestCase):
"""
Model tester for `LlavaNextVideoForConditionalGeneration`.
"""
all_model_classes = (LlavaNextVideoForConditionalGeneration,) if is_torch_available() else ()
all_generative_model_classes = (LlavaNextVideoForConditionalGeneration,) if is_torch_available() else ()
test_pruning = False
test_head_masking = False
@ -274,6 +278,24 @@ class LlavaNextVideoForConditionalGenerationModelTest(ModelTesterMixin, Generati
with torch.no_grad():
model(**inputs)
@parameterized.expand([(True,), (False,)])
def test_greedy_generation(self, use_cache: bool):
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
for model_class in self.all_generative_model_classes:
model = model_class(config)
model.to(torch_device)
model.eval()
out = model.generate(
**inputs_dict,
min_new_tokens=20,
max_new_tokens=20,
use_cache=use_cache,
bad_words_ids=[[config.image_token_index], [config.video_token_index]],
)
self.assertTrue(out.shape[1] == inputs_dict["input_ids"].shape[1] + 20)
@unittest.skip(
reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
)

View File

@ -176,6 +176,7 @@ class PaliGemmaForConditionalGenerationModelTest(ModelTesterMixin, unittest.Test
"""
all_model_classes = (PaliGemmaForConditionalGeneration,) if is_torch_available() else ()
all_generative_model_classes = (PaliGemmaForConditionalGeneration,) if is_torch_available() else ()
fx_compatible = False
test_pruning = False
test_torchscript = False
@ -185,6 +186,18 @@ class PaliGemmaForConditionalGenerationModelTest(ModelTesterMixin, unittest.Test
self.model_tester = PaliGemmaVisionText2TextModelTester(self)
self.config_tester = ConfigTester(self, config_class=PaliGemmaConfig, has_text_modality=False)
@parameterized.expand([(True,), (False,)])
def test_greedy_generation(self, use_cache: bool):
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
for model_class in self.all_generative_model_classes:
model = model_class(config)
model.to(torch_device)
model.eval()
out = model.generate(**inputs_dict, min_new_tokens=20, max_new_tokens=20, use_cache=use_cache)
self.assertTrue(out.shape[1] == inputs_dict["input_ids"].shape[1] + 20)
@unittest.skip(
reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
)

View File

@ -20,6 +20,7 @@ import unittest
import numpy as np
import requests
from huggingface_hub import hf_hub_download
from parameterized import parameterized
from transformers import (
VideoLlavaConfig,
@ -30,7 +31,6 @@ from transformers import (
)
from transformers.testing_utils import require_bitsandbytes, require_torch, require_torch_gpu, slow, torch_device
from ...generation.test_utils import GenerationTesterMixin
from ...test_configuration_common import ConfigTester
from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor
@ -75,7 +75,7 @@ class VideoLlavaVisionText2TextModelTester:
"initializer_range": 0.02,
"num_labels": 3,
"num_choices": 4,
"pad_token_id": 0,
"pad_token_id": 1,
},
is_training=True,
vision_config={
@ -158,10 +158,11 @@ class VideoLlavaVisionText2TextModelTester:
config, pixel_values_images, pixel_values_videos = config_and_inputs
input_ids = ids_tensor([self.batch_size, self.seq_length], config.text_config.vocab_size - 1) + 1
attention_mask = input_ids.ne(1).to(torch_device)
# set to random non-image token to prevent flakiness
input_ids[input_ids == config.image_token_index] = 2
input_ids[input_ids == config.video_token_index] = 2
# we are giving 3 videos and 3 images. Need to pass in image and video tokens, both
# also need to make sure no other special tokens are set
input_ids[(input_ids == 0) | (input_ids == 1)] = 3
# we are giving 3 videos and 3 images. Need to pass in image and video tokens
input_ids[:, 0] = config.video_token_index
input_ids[:, 1:2] = config.image_token_index
inputs_dict = {
@ -190,12 +191,13 @@ class VideoLlavaVisionText2TextModelTester:
@require_torch
class VideoLlavaForConditionalGenerationModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase):
class VideoLlavaForConditionalGenerationModelTest(ModelTesterMixin, unittest.TestCase):
"""
Model tester for `VideoLlavaForConditionalGeneration`.
"""
all_model_classes = (VideoLlavaForConditionalGeneration,) if is_torch_available() else ()
all_generative_model_classes = (VideoLlavaForConditionalGeneration,) if is_torch_available() else ()
fx_compatible = False
test_pruning = False
test_resize_embeddings = True
@ -205,6 +207,24 @@ class VideoLlavaForConditionalGenerationModelTest(ModelTesterMixin, GenerationTe
self.model_tester = VideoLlavaVisionText2TextModelTester(self)
self.config_tester = ConfigTester(self, config_class=VideoLlavaConfig, has_text_modality=False)
@parameterized.expand([(True,), (False,)])
def test_greedy_generation(self, use_cache: bool):
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
for model_class in self.all_generative_model_classes:
model = model_class(config)
model.to(torch_device)
model.eval()
out = model.generate(
**inputs_dict,
min_new_tokens=20,
max_new_tokens=20,
use_cache=use_cache,
bad_words_ids=[[config.image_token_index], [config.video_token_index]],
)
self.assertTrue(out.shape[1] == inputs_dict["input_ids"].shape[1] + 20)
@unittest.skip(
reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
)

View File

@ -18,6 +18,7 @@ import gc
import unittest
import requests
from parameterized import parameterized
from transformers import (
AutoProcessor,
@ -73,7 +74,7 @@ class VipLlavaVisionText2TextModelTester:
"initializer_range": 0.02,
"num_labels": 3,
"num_choices": 4,
"pad_token_id": 0,
"pad_token_id": 1,
},
is_training=True,
vision_config={
@ -140,6 +141,8 @@ class VipLlavaVisionText2TextModelTester:
config, pixel_values = config_and_inputs
input_ids = ids_tensor([self.batch_size, self.seq_length], config.text_config.vocab_size - 1) + 1
attention_mask = input_ids.ne(1).to(torch_device)
# set to random non-image token to prevent flakiness
input_ids[input_ids == config.image_token_index] = 2
# we are giving 3 images let's make sure we pass in 3 image tokens
input_ids[:, 1] = config.image_token_index
inputs_dict = {
@ -158,6 +161,7 @@ class VipLlavaForConditionalGenerationModelTest(ModelTesterMixin, unittest.TestC
"""
all_model_classes = (VipLlavaForConditionalGeneration,) if is_torch_available() else ()
all_generative_model_classes = (VipLlavaForConditionalGeneration,) if is_torch_available() else ()
fx_compatible = False
test_pruning = False
test_resize_embeddings = True
@ -167,6 +171,24 @@ class VipLlavaForConditionalGenerationModelTest(ModelTesterMixin, unittest.TestC
self.model_tester = VipLlavaVisionText2TextModelTester(self)
self.config_tester = ConfigTester(self, config_class=VipLlavaConfig, has_text_modality=False)
@parameterized.expand([(True,), (False,)])
def test_greedy_generation(self, use_cache: bool):
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
for model_class in self.all_generative_model_classes:
model = model_class(config)
model.to(torch_device)
model.eval()
out = model.generate(
**inputs_dict,
min_new_tokens=20,
max_new_tokens=20,
use_cache=use_cache,
bad_words_ids=[[config.image_token_index]],
)
self.assertTrue(out.shape[1] == inputs_dict["input_ids"].shape[1] + 20)
@unittest.skip(
reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
)

View File

@ -1844,6 +1844,59 @@ class WhisperModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMi
)
assert isinstance(pred_ids, expected_output_type)
@require_flash_attn
@require_torch_gpu
@pytest.mark.flash_attn_test
@slow
def test_flash_attn_2_generate_reuse_cache(self):
max_new_tokens = 2
for model_class in self.all_generative_model_classes:
if not model_class._supports_flash_attn_2:
self.skipTest(f"{model_class.__name__} does not support Flash Attention 2")
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
dummy_input = inputs_dict[model_class.main_input_name][..., :10]
if dummy_input.dtype in [torch.float32, torch.bfloat16]:
dummy_input = dummy_input.to(torch.float16)
# make sure that all models have enough positions for generation
if hasattr(config, "max_position_embeddings"):
config.max_position_embeddings = dummy_input.shape[1] * 2 + max_new_tokens * 2 + 1
model = model_class(config)
with tempfile.TemporaryDirectory() as tmpdirname:
model.save_pretrained(tmpdirname)
model = model_class.from_pretrained(
tmpdirname,
torch_dtype=torch.float16,
attn_implementation="flash_attention_2",
low_cpu_mem_usage=True,
).to(torch_device)
# run generate once to get filled cache
output = model.generate(
dummy_input,
max_new_tokens=max_new_tokens,
do_sample=False,
use_cache=True,
return_dict_in_generate=True,
)
past_key_values = output.past_key_values
# Try to continue generation from where we left, given that we have more than 1 new token to process
# e.g. this can happen in speculative decoding when feeding candidate tokens back to target model
_ = model.generate(
dummy_input,
decoder_input_ids=output.sequences,
max_new_tokens=max_new_tokens,
do_sample=False,
use_cache=True,
past_key_values=past_key_values,
)
@require_torch
@require_torchaudio
@ -4058,6 +4111,11 @@ class WhisperStandaloneDecoderModelTest(ModelTesterMixin, GenerationTesterMixin,
# generate only works with input ids for whisper
pass
@unittest.skip(reason="Generate needs input ids")
def test_inputs_embeds_matches_input_ids_with_generate(self):
# generate only works with input ids for whisper
pass
@unittest.skip(reason="Decoder can't keep attention grads")
def test_retain_grad_hidden_states_attentions(self):
return
@ -4066,6 +4124,12 @@ class WhisperStandaloneDecoderModelTest(ModelTesterMixin, GenerationTesterMixin,
def test_save_load_fast_init_from_base(self):
pass
@unittest.skip(
reason="FA2 testing suite needs to be refactored to be compatible with WhisperDecoder for that test"
)
def test_flash_attn_2_generate_reuse_cache(self):
pass
@unittest.skip(
"Duplicated test with WhisperModelTest + the FA2 testing suite needs to be refactored to be compatible with WhisperDecoder for that test"
)
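
Outside the test harness, the cache-reuse pattern exercised above boils down to the following sketch (model choice is arbitrary): generate once, then pass the returned cache back together with the extended sequence so only the new tokens are processed, e.g. when feeding candidate tokens back to a target model in speculative decoding:

from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")

inputs = tokenizer("The quick brown fox", return_tensors="pt")
first = model.generate(**inputs, max_new_tokens=2, do_sample=False, use_cache=True, return_dict_in_generate=True)

# Continue from the filled cache; `generate` crops the prompt down to the tokens the cache has not seen yet.
_ = model.generate(
    first.sequences,
    past_key_values=first.past_key_values,
    max_new_tokens=2,
    do_sample=False,
    use_cache=True,
)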

View File

@ -2819,6 +2819,53 @@ class ModelTesterMixin:
)[0]
self.assertTrue(torch.allclose(out_embeds, out_ids))
def test_inputs_embeds_matches_input_ids_with_generate(self):
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
for model_class in self.all_model_classes:
if model_class.__name__ not in get_values(MODEL_FOR_CAUSAL_LM_MAPPING_NAMES):
continue
model = model_class(config)
model.to(torch_device)
model.eval()
model_forward_args = inspect.signature(model.forward).parameters
if "inputs_embeds" not in model_forward_args:
self.skipTest(reason="This model doesn't use `inputs_embeds`")
inputs = copy.deepcopy(self._prepare_for_class(inputs_dict, model_class))
pad_token_id = config.pad_token_id if config.pad_token_id is not None else 1
wte = model.get_input_embeddings()
if not self.is_encoder_decoder:
input_ids = inputs["input_ids"]
# some models infer position ids / attention mask differently when input ids are passed,
# by checking for the pad_token; make sure no padding is present in the input ids
not_pad_token_id = pad_token_id + 1 if max(0, pad_token_id - 1) == 0 else pad_token_id - 1
input_ids[input_ids == pad_token_id] = not_pad_token_id
del inputs["input_ids"]
inputs_embeds = wte(input_ids)
out_ids = model.generate(input_ids=input_ids, **inputs, max_new_tokens=2)[:, -2:]
out_embeds = model.generate(inputs_embeds=inputs_embeds, **inputs, max_new_tokens=2)
else:
encoder_input_ids = inputs["input_ids"]
decoder_input_ids = inputs.get("decoder_input_ids", encoder_input_ids)
encoder_input_ids[encoder_input_ids == pad_token_id] = max(0, pad_token_id + 1)
decoder_input_ids[decoder_input_ids == pad_token_id] = max(0, pad_token_id + 1)
del inputs["input_ids"]
inputs.pop("decoder_input_ids", None)
inputs_embeds = wte(encoder_input_ids)
decoder_inputs_embeds = wte(decoder_input_ids)
out_ids = model.generate(
input_ids=encoder_input_ids, decoder_input_ids=decoder_input_ids, **inputs, max_new_tokens=2
)[:, -2:]
out_embeds = model.generate(
inputs_embeds=inputs_embeds,
decoder_inputs_embeds=decoder_inputs_embeds,
**inputs,
max_new_tokens=2,
)
self.assertTrue(torch.allclose(out_embeds, out_ids))
@require_torch_multi_gpu
def test_multi_gpu_data_parallel_forward(self):
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
@ -4284,6 +4331,62 @@ class ModelTesterMixin:
use_cache=True,
)
@require_flash_attn
@require_torch_gpu
@mark.flash_attn_test
@slow
def test_flash_attn_2_generate_reuse_cache(self):
if not self.has_attentions:
self.skipTest(reason="Model architecture does not support attentions")
max_new_tokens = 2
for model_class in self.all_generative_model_classes:
if not model_class._supports_flash_attn_2:
self.skipTest(f"{model_class.__name__} does not support Flash Attention 2")
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
dummy_input = inputs_dict[model_class.main_input_name]
if dummy_input.dtype in [torch.float32, torch.bfloat16]:
dummy_input = dummy_input.to(torch.float16)
# make sure that all models have enough positions for generation
if hasattr(config, "max_position_embeddings"):
config.max_position_embeddings = dummy_input.shape[1] * 2 + max_new_tokens * 2 + 1
model = model_class(config)
with tempfile.TemporaryDirectory() as tmpdirname:
model.save_pretrained(tmpdirname)
model = model_class.from_pretrained(
tmpdirname,
torch_dtype=torch.float16,
attn_implementation="flash_attention_2",
low_cpu_mem_usage=True,
).to(torch_device)
# run generate once to get filled cache
output = model.generate(
dummy_input,
max_new_tokens=max_new_tokens,
do_sample=False,
use_cache=True,
return_dict_in_generate=True,
)
past_key_values = output.past_key_values
# Try to continue generation from where we left, given that we have more than 1 new token to process
# e.g. this can happen in speculative decoding when feeding candidate tokens back to target model
dummy_input_updated = torch.cat([dummy_input, output.sequences], dim=-1)
_ = model.generate(
dummy_input_updated,
max_new_tokens=max_new_tokens,
do_sample=False,
use_cache=True,
past_key_values=past_key_values,
)
@require_flash_attn
@require_torch_gpu
@require_bitsandbytes

View File

@ -19,7 +19,7 @@ import os
import tempfile
import unittest
from transformers.modelcard import ModelCard
from transformers.modelcard import ModelCard, TrainingSummary
class ModelCardTester(unittest.TestCase):
@ -82,3 +82,8 @@ class ModelCardTester(unittest.TestCase):
model_card_second = ModelCard.from_pretrained(tmpdirname)
self.assertEqual(model_card_second.to_dict(), model_card_first.to_dict())
def test_model_summary_modelcard_base_metadata(self):
metadata = TrainingSummary("Model name").create_metadata()
self.assertTrue("library_name" in metadata)
self.assertTrue(metadata["library_name"] == "transformers")

View File

@ -1640,17 +1640,18 @@ class ModelUtilsTest(TestCasePlus):
logger = logging.get_logger("transformers.modeling_utils")
config = PretrainedConfig()
warning_msg_gamma = "A parameter name that contains `gamma` will be renamed internally"
warning_msg_gamma = "`gamma_param` -> `weight_param`"
model = TestModelGamma(config)
with tempfile.TemporaryDirectory() as tmp_dir:
model.save_pretrained(tmp_dir)
with LoggingLevel(logging.WARNING):
with LoggingLevel(logging.INFO):
with CaptureLogger(logger) as cl1:
_, loading_info = TestModelGamma.from_pretrained(tmp_dir, config=config, output_loading_info=True)
missing_keys = loading_info["missing_keys"]
unexpected_keys = loading_info["unexpected_keys"]
self.assertIn("`TestModelGamma`", cl1.out)
self.assertIn(warning_msg_gamma, cl1.out)
self.assertIn("gamma_param", missing_keys)
self.assertIn("weight_param", unexpected_keys)
@ -1664,17 +1665,18 @@ class ModelUtilsTest(TestCasePlus):
def forward(self):
return self.beta_param.sum()
warning_msg_beta = "A parameter name that contains `beta` will be renamed internally"
warning_msg_beta = "`beta_param` -> `bias_param`"
model = TestModelBeta(config)
with tempfile.TemporaryDirectory() as tmp_dir:
model.save_pretrained(tmp_dir)
with LoggingLevel(logging.WARNING):
with LoggingLevel(logging.INFO):
with CaptureLogger(logger) as cl2:
_, loading_info = TestModelBeta.from_pretrained(tmp_dir, config=config, output_loading_info=True)
missing_keys = loading_info["missing_keys"]
unexpected_keys = loading_info["unexpected_keys"]
self.assertIn("`TestModelBeta`", cl2.out)
self.assertIn(warning_msg_beta, cl2.out)
self.assertIn("beta_param", missing_keys)
self.assertIn("bias_param", unexpected_keys)