Mirror of https://github.com/huggingface/transformers.git
Synced 2025-11-13 09:45:07 +08:00

Compare commits: logger_ref...kernels-op (21 commits)
| SHA1 |
|---|
| 1d3de2d89d |
| 2b8068c306 |
| 33c60a5254 |
| fa22b56903 |
| f30c22500b |
| 496c283615 |
| df45a92cea |
| 3ff0e69f84 |
| 31839d741a |
| 2072f3059e |
| 3760afb21c |
| 3c0b2b101e |
| e869e9df54 |
| 37d48bbb48 |
| 21913b2e10 |
| fddc1de0a4 |
| bfb29a0259 |
| 0f45870a77 |
| 51c803299a |
| ebc7977b4f |
| 87bce70355 |
@@ -169,6 +169,9 @@ print("Pooled output shape:", pooled_output.shape)
 [[autodoc]] DINOv3ViTModel
     - forward
 
+## DINOv3ViTBackbone
+
+[[autodoc]] DINOv3ViTBackbone
 
 ## DINOv3ConvNextModel
 
 [[autodoc]] DINOv3ConvNextModel
@@ -127,7 +127,7 @@ def parse_args():
     parser.add_argument(
         "--use_slow_tokenizer",
         action="store_true",
-        help="If passed, will use a slow tokenizer (not backed by the 🤗 Tokenizers library).",
+        help="If passed, will use a slow tokenizer (not backed by the Hugging Face Tokenizers library).",
     )
     parser.add_argument(
         "--per_device_train_batch_size",

@@ -132,7 +132,7 @@ def parse_args():
     parser.add_argument(
         "--use_slow_tokenizer",
         action="store_true",
-        help="If passed, will use a slow tokenizer (not backed by the 🤗 Tokenizers library).",
+        help="If passed, will use a slow tokenizer (not backed by the Tokenizers library).",
     )
     parser.add_argument(
         "--per_device_train_batch_size",

@@ -130,7 +130,7 @@ def parse_args():
     parser.add_argument(
         "--use_slow_tokenizer",
         action="store_true",
-        help="If passed, will use a slow tokenizer (not backed by the 🤗 Tokenizers library).",
+        help="If passed, will use a slow tokenizer (not backed by the Hugging Face Tokenizers library).",
     )
     parser.add_argument(
         "--per_device_train_batch_size",

@@ -128,7 +128,7 @@ def parse_args():
     parser.add_argument(
         "--use_slow_tokenizer",
         action="store_true",
-        help="If passed, will use a slow tokenizer (not backed by the 🤗 Tokenizers library).",
+        help="If passed, will use a slow tokenizer (not backed by the HuggingFace Tokenizers library).",
     )
     parser.add_argument(
         "--per_device_train_batch_size",

@@ -151,7 +151,7 @@ def parse_args():
     parser.add_argument(
         "--use_slow_tokenizer",
         action="store_true",
-        help="If passed, will use a slow tokenizer (not backed by the 🤗 Tokenizers library).",
+        help="If passed, will use a slow tokenizer (not backed by the Hugging Face Tokenizers library).",
     )
     parser.add_argument(
         "--per_device_train_batch_size",

@@ -223,7 +223,7 @@ def parse_args():
     parser.add_argument(
         "--use_slow_tokenizer",
         action="store_true",
-        help="If passed, will use a slow tokenizer (not backed by the 🤗 Tokenizers library).",
+        help="If passed, will use a slow tokenizer (not backed by the Hugging Face Tokenizers library).",
     )
     parser.add_argument(
         "--per_device_train_batch_size",

@@ -120,7 +120,7 @@ def parse_args():
     parser.add_argument(
         "--use_slow_tokenizer",
         action="store_true",
-        help="If passed, will use a slow tokenizer (not backed by the 🤗 Tokenizers library).",
+        help="If passed, will use a slow tokenizer (not backed by the Hugging Face Tokenizers library).",
     )
     parser.add_argument(
         "--per_device_train_batch_size",

@@ -212,7 +212,7 @@ def parse_args():
     parser.add_argument(
         "--use_slow_tokenizer",
         action="store_true",
-        help="If passed, will use a slow tokenizer (not backed by the 🤗 Tokenizers library).",
+        help="If passed, will use a slow tokenizer (not backed by the Hugging Face Tokenizers library).",
     )
     parser.add_argument(
         "--per_device_train_batch_size",
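These hunks only reword a help string; the behavior behind `--use_slow_tokenizer` is the choice between the Rust-backed fast tokenizer and the pure-Python slow one. A minimal sketch of what the flag typically maps to in these example scripts (the checkpoint name is illustrative):

```python
from transformers import AutoTokenizer

# --use_slow_tokenizer in the example scripts corresponds to use_fast=False here
slow_tok = AutoTokenizer.from_pretrained("bert-base-uncased", use_fast=False)
fast_tok = AutoTokenizer.from_pretrained("bert-base-uncased", use_fast=True)

print(type(slow_tok).__name__)  # BertTokenizer (pure Python)
print(type(fast_tok).__name__)  # BertTokenizerFast (backed by the Tokenizers library)
```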
setup.py
@@ -138,7 +138,7 @@ _deps = [
     "pyyaml>=5.1",
     "pydantic>=2",
     "pytest>=7.2.0",
-    "pytest-asyncio",
+    "pytest-asyncio>=1.2.0",
     "pytest-rerunfailures<16.0",
     "pytest-timeout",
     "pytest-xdist",
@@ -723,7 +723,7 @@ class DataCollatorForLanguageModeling(DataCollatorMixin):
 
         if self.mask_replace_prob < 1:
             warnings.warn(
-                "Random token replacement is not supported with whole word masking.",
+                "Random token replacement is not supported with whole word masking. "
                 "Setting mask_replace_prob to 1.",
             )
             self.mask_replace_prob = 1
@@ -82,7 +82,7 @@ class GlueDataset(Dataset):
         cache_dir: Optional[str] = None,
     ):
         warnings.warn(
-            "This dataset will be removed from the library soon, preprocessing should be handled with the 🤗 Datasets "
+            "This dataset will be removed from the library soon, preprocessing should be handled with the Hugging Face Datasets "
            "library. You can have a look at this example script for pointers: "
            "https://github.com/huggingface/transformers/blob/main/examples/pytorch/text-classification/run_glue.py",
            FutureWarning,
@@ -21,7 +21,7 @@ if is_sklearn_available():
 
 
 DEPRECATION_WARNING = (
-    "This metric will be removed from the library soon, metrics should be handled with the 🤗 Evaluate "
+    "This metric will be removed from the library soon, metrics should be handled with the Hugging Face Evaluate "
     "library. You can have a look at this example script for pointers: "
     "https://github.com/huggingface/transformers/blob/main/examples/pytorch/text-classification/run_glue.py"
 )
@@ -28,7 +28,7 @@ from .utils import DataProcessor, InputExample, InputFeatures
 logger = logging.get_logger(__name__)
 
 DEPRECATION_WARNING = (
-    "This {0} will be removed from the library soon, preprocessing should be handled with the 🤗 Datasets "
+    "This {0} will be removed from the library soon, preprocessing should be handled with the Hugging Face Datasets "
     "library. You can have a look at this example script for pointers: "
     "https://github.com/huggingface/transformers/blob/main/examples/pytorch/text-classification/run_glue.py"
 )
@@ -48,7 +48,7 @@ deps = {
     "pyyaml": "pyyaml>=5.1",
     "pydantic": "pydantic>=2",
     "pytest": "pytest>=7.2.0",
-    "pytest-asyncio": "pytest-asyncio",
+    "pytest-asyncio": "pytest-asyncio>=1.2.0",
     "pytest-rerunfailures": "pytest-rerunfailures<16.0",
     "pytest-timeout": "pytest-timeout",
     "pytest-xdist": "pytest-xdist",
@@ -608,7 +608,7 @@ class GenerationMixin(ContinuousMixin):
         use_cache = kwargs.get("use_cache")
         if use_cache is None:
             use_cache = getattr(self.config, "use_cache", False)
-        if past_key_values is None or use_cache:
+        if past_key_values is not None or use_cache:
            # TODO (joao): handle the case where cache length == input_ids length. The function below results in an
            # exception because we get empty input_ids after slicing. In essence, we need to roll back the cache 1
            # token to recompute the logits for the first token to be generated (but not all caches support roll backs)
@@ -821,14 +821,26 @@ def split_to_tiles(images: "torch.Tensor", num_tiles_height: int, num_tiles_widt
     return image
 
 
 def _cast_tensor_to_float(x):
     if x.is_floating_point():
         return x
     return x.float()
 
 
 def _group_images_by_shape(nested_images, *paired_inputs, is_nested: bool = False):
-    """Helper function to flatten a single level of nested image and batch structures and group by shape."""
+    """
+    Helper function to flatten a single level of nested image and batch structures and group by shape.
+    Args:
+        nested_images (list):
+            A list of images or a single tensor
+        paired_inputs (Any, *optional*):
+            Zero or more lists that mirror the structure of `nested_images` (flat list, or list of lists when
+            `is_nested=True`). Each element is paired 1:1 with the corresponding image so it can be grouped by the
+            same shape key. These paired values are grouped alongside `nested_images` but are not stacked in the output, so
+            they do not need to be tensors.
+        is_nested (bool, *optional*, defaults to False):
+            Whether the images are nested.
+    Returns:
+        tuple[dict, ...]:
+            - A dictionary with shape as key and list of images with that shape as value
+            - A dictionary with shape as key and list of paired values with that shape as value
+            - A dictionary mapping original indices to (shape, index) tuples
+            - A dictionary mapping original indices to (shape, index) tuples for each paired input
+    """
     grouped_images = defaultdict(list)
     grouped_images_index = {}
     paired_grouped_values = [defaultdict(list) for _ in paired_inputs]

@@ -880,27 +892,20 @@ def _reconstruct_nested_structure(indices, processed_images):
     return result
 
 
-def _disable_grouping_output_nested(images, *paired_inputs):
-    """Build the disable_grouping output tuple for a single-level nested structure."""
-    outer_range = range(len(images))
-    inner_ranges = [range(len(images[i])) for i in outer_range]
-
-    # Precompute all (i, j) pairs
-    ij_pairs = [(i, j) for i in outer_range for j in inner_ranges[i]]
-
-    images_dict = {(i, j): images[i][j].unsqueeze(0) for (i, j) in ij_pairs}
-    paired_dicts = [{(i, j): paired_list[i][j].unsqueeze(0) for (i, j) in ij_pairs} for paired_list in paired_inputs]
-    index_map = {(i, j): ((i, j), 0) for (i, j) in ij_pairs}
-    return images_dict, *paired_dicts, index_map
-
-
-def _disable_grouping_output_flat(images, *paired_inputs):
-    """Build the disable_grouping output tuple for a flat list structure."""
-    idx_range = range(len(images))
-    images_dict = {i: images[i].unsqueeze(0) for i in idx_range}
-    paired_dicts = [{i: paired_list[i].unsqueeze(0) for i in idx_range} for paired_list in paired_inputs]
-    index_map = {i: (i, 0) for i in idx_range}
-    return images_dict, *paired_dicts, index_map
+def _iterate_items(items, is_nested: bool):
+    """
+    Helper function to iterate over items yielding (key, item) pairs.
+
+    For nested structures, yields ((row_index, col_index), item).
+    For flat structures, yields (index, item).
+    """
+    if is_nested:
+        for i, row in enumerate(items):
+            for j, item in enumerate(row):
+                yield (i, j), item
+    else:
+        for i, item in enumerate(items):
+            yield i, item
 
 
 def group_images_by_shape(

@@ -920,7 +925,7 @@ def group_images_by_shape(
     Args:
         images (Union[list["torch.Tensor"], "torch.Tensor"]):
             A list of images or a single tensor
-        *paired_inputs (Any):
+        paired_inputs (Any, *optional*):
            Zero or more lists that mirror the structure of `images` (flat list, or list of lists when
            `is_nested=True`). Each element is paired 1:1 with the corresponding image so it can be grouped by the
            same shape key. These paired values are grouped alongside `images` but are not stacked in the output, so

@@ -944,10 +949,14 @@ def group_images_by_shape(
         disable_grouping = device == "cpu"
 
     if disable_grouping:
-        if is_nested:
-            return _disable_grouping_output_nested(images, *paired_inputs)
-        else:
-            return _disable_grouping_output_flat(images, *paired_inputs)
+        return (
+            {key: img.unsqueeze(0) for key, img in _iterate_items(images, is_nested)},
+            *[
+                {key: item.unsqueeze(0) for key, item in _iterate_items(paired_list, is_nested)}
+                for paired_list in paired_inputs
+            ],
+            {key: (key, 0) for key, _ in _iterate_items(images, is_nested)},
+        )
 
     # Handle single level nested structure
     grouped_images, *paired_grouped_values, grouped_images_index = _group_images_by_shape(

@@ -990,14 +999,3 @@ def reorder_images(
     ]
 
     return _reconstruct_nested_structure(grouped_images_index, processed_images)
-
-
-class NumpyToTensor:
-    """
-    Convert a numpy array to a PyTorch tensor.
-    """
-
-    def __call__(self, image: np.ndarray):
-        # Same as in PyTorch, we assume incoming numpy images are in HWC format
-        # c.f. https://github.com/pytorch/vision/blob/61d97f41bc209e1407dcfbd685d2ee2da9c1cdad/torchvision/transforms/functional.py#L154
-        return torch.from_numpy(image.transpose(2, 0, 1)).contiguous()
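The hunks above generalize `group_images_by_shape` so that extra per-image payloads can be grouped by the same shape key as the images without being stacked. A rough usage sketch, assuming the helpers live in `transformers.image_processing_utils_fast` (import path inferred from the surrounding code, not verified):

```python
import torch
# assumed import path for the helpers shown in the diff above
from transformers.image_processing_utils_fast import group_images_by_shape, reorder_images

images = [torch.rand(3, 224, 224), torch.rand(3, 256, 256), torch.rand(3, 224, 224)]
masks = [torch.zeros(1, 224, 224), torch.zeros(1, 256, 256), torch.ones(1, 224, 224)]

# masks ride along as a "paired input": grouped under the same shape key, never stacked
grouped_images, grouped_masks, index = group_images_by_shape(images, masks, disable_grouping=False)

# process each shape group as a batch, then restore the original per-image order
processed = {shape: batch * 2.0 for shape, batch in grouped_images.items()}
restored = reorder_images(processed, index)
assert len(restored) == len(images)
```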
@@ -11,6 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import os
 import re
 from collections.abc import Callable
 from functools import partial

@@ -18,7 +19,7 @@ from types import ModuleType
 from typing import Optional, Union
 
 from ..modeling_flash_attention_utils import lazy_import_flash_attention
-from ..utils import logging
+from ..utils import ENV_VARS_TRUE_VALUES, logging
 from ..utils.import_utils import is_kernels_available
 from .flash_attention import flash_attention_forward

@@ -33,10 +34,22 @@ try:
         get_kernel,
         register_kernel_mapping,
         replace_kernel_forward_from_hub,
-        use_kernel_forward_from_hub,
     )
 
+    _TRANSFORMERS_USE_HUB_KERNELS = os.environ.get("USE_HUB_KERNELS", "YES").upper()
     _kernels_available = True
+    _kernels_enabled = _TRANSFORMERS_USE_HUB_KERNELS in ENV_VARS_TRUE_VALUES
+
+    def use_kernel_forward_from_hub(layer_name: str):
+        if _kernels_enabled:
+            from kernels import use_kernel_forward_from_hub as _kernels_use_kernel_forward_from_hub
+
+            return _kernels_use_kernel_forward_from_hub(layer_name)
+        else:
+            logger.warning_once(
+                f"kernels hub usage is disabled through the environment USE_HUB_KERNELS={_TRANSFORMERS_USE_HUB_KERNELS}"
+            )
+            return lambda cls: cls
 
     _KERNEL_MAPPING: dict[str, dict[Union[Device, str], LayerRepository]] = {
         "MultiScaleDeformableAttention": {

@@ -161,6 +174,7 @@ try:
 
 except ImportError:
     _kernels_available = False
+    _kernels_enabled = False
 
     # Stub to make decorators int transformers work when `kernels`
     # is not installed.
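The new `USE_HUB_KERNELS` switch (default `"YES"`) makes it possible to opt out of Hub-provided kernels even when the `kernels` package is installed. A small sketch of toggling it; the env var name and default come from the diff, everything else is an assumption:

```python
import os

# Must be set before transformers imports its kernels integration, since the
# module reads the variable at import time. Any value outside ENV_VARS_TRUE_VALUES
# (e.g. "NO") disables hub kernels and keeps the stock forward implementations.
os.environ["USE_HUB_KERNELS"] = "NO"

from transformers.utils import ENV_VARS_TRUE_VALUES

print("hub kernels enabled:", os.environ["USE_HUB_KERNELS"].upper() in ENV_VARS_TRUE_VALUES)
```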
@@ -38,7 +38,7 @@ from transformers.utils.import_utils import _is_package_available
 
 
 if os.getenv("WANDB_MODE") == "offline":
-    print("⚙️ Running in WANDB offline mode")
+    print("[INFO] Running in WANDB offline mode")
 
 from .. import PreTrainedModel, TrainingArguments
 from .. import __version__ as version
@@ -272,7 +272,9 @@ if __name__ == "__main__":
         "--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory."
     )
     parser.add_argument(
-        "--push_to_hub", action="store_true", help="Whether or not to push the converted model to the 🤗 hub."
+        "--push_to_hub",
+        action="store_true",
+        help="Whether or not to push the converted model to the Hugging Face hub.",
     )
 
     args = parser.parse_args()
@@ -1700,6 +1700,7 @@ MODEL_FOR_BACKBONE_MAPPING_NAMES = OrderedDict(
         ("dinov2", "Dinov2Backbone"),
         ("dinov2_with_registers", "Dinov2WithRegistersBackbone"),
         ("dinov3_convnext", "DINOv3ConvNextBackbone"),
+        ("dinov3_vit", "DINOv3ViTBackbone"),
         ("focalnet", "FocalNetBackbone"),
         ("hgnet_v2", "HGNetV2Backbone"),
         ("hiera", "HieraBackbone"),
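Registering `DINOv3ViTBackbone` in `MODEL_FOR_BACKBONE_MAPPING_NAMES` is what lets `AutoBackbone` resolve a `dinov3_vit` config to the new class. A hedged sketch (the checkpoint id below is a placeholder, not a verified repository):

```python
from transformers import AutoBackbone

# placeholder checkpoint name, for illustration only
backbone = AutoBackbone.from_pretrained("org/dinov3-vit-checkpoint", out_indices=[-1])
print(type(backbone).__name__)  # expected: DINOv3ViTBackbone
```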
@@ -138,7 +138,7 @@ if __name__ == "__main__":
     parser.add_argument(
         "--push_to_hub",
         action="store_true",
-        help="Whether or not to push the converted model and processor to the 🤗 hub.",
+        help="Whether or not to push the converted model and processor to the Hugging Face hub.",
     )
 
     args = parser.parse_args()

@@ -257,7 +257,9 @@ if __name__ == "__main__":
         "--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory."
     )
     parser.add_argument(
-        "--push_to_hub", action="store_true", help="Whether or not to push the converted model to the 🤗 hub."
+        "--push_to_hub",
+        action="store_true",
+        help="Whether or not to push the converted model to the Hugging Face hub.",
     )
 
     args = parser.parse_args()

@@ -274,7 +274,7 @@ if __name__ == "__main__":
         "--pytorch_dump_folder_path", required=True, default=None, type=str, help="Path to the output PyTorch model."
     )
     parser.add_argument(
-        "--push_to_hub", default=None, type=str, help="Where to upload the converted model on the 🤗 hub."
+        "--push_to_hub", default=None, type=str, help="Where to upload the converted model on the Hugging Face hub."
     )
     parser.add_argument("--sample_rate", default=None, type=str, help="Sample rate used by DacFeatureExtractor")
     args = parser.parse_args()
@@ -258,7 +258,7 @@ def convert_wav2vec2_checkpoint(
     max_absolute_diff = torch.max(torch.abs(our_output - their_output)).item()
     print(f"max_absolute_diff = {max_absolute_diff}")  # ~ 1e-7
     success = torch.allclose(our_output, their_output, atol=1e-3)
-    print("Do both models output the same tensors?", "🔥" if success else "💩")
+    print("Do both models output the same tensors?", "[PASS]" if success else "[FAIL]")
     if not success:
         raise Exception("Something went wRoNg")

@@ -180,7 +180,7 @@ def convert_data2vec_checkpoint_to_pytorch(
     max_absolute_diff = torch.max(torch.abs(our_output - their_output)).item()
     print(f"max_absolute_diff = {max_absolute_diff}")  # ~ 1e-7
     success = torch.allclose(our_output, their_output, atol=1e-3)
-    print("Do both models output the same tensors?", "🔥" if success else "💩")
+    print("Do both models output the same tensors?", "[PASS]" if success else "[FAIL]")
     if not success:
         raise Exception("Something went wRoNg")

@@ -341,7 +341,7 @@ def main():
 
     print(f"max_absolute_diff = {max_absolute_diff}")
     success = torch.allclose(hf_output, orig_model_output, atol=1e-3)
-    print("Do both models output the same tensors?", "🔥" if success else "💩")
+    print("Do both models output the same tensors?", "[PASS]" if success else "[FAIL]")
     if not success:
         raise Exception("Something went wRoNg")
@@ -222,7 +222,9 @@ if __name__ == "__main__":
         help="Path to the folder to output PyTorch model.",
     )
     parser.add_argument(
-        "--push_to_hub", action="store_true", help="Whether or not to push the converted model to the 🤗 hub."
+        "--push_to_hub",
+        action="store_true",
+        help="Whether or not to push the converted model to the Hugging Face hub.",
     )
     args = parser.parse_args()
     convert_deformable_detr_checkpoint(

@@ -299,9 +299,9 @@ def convert_bort_checkpoint_to_pytorch(bort_checkpoint_path: str, pytorch_dump_f
     success = np.allclose(gluon_layer, hf_layer, atol=1e-3)
 
     if success:
-        print("✔️ Both model do output the same tensors")
+        print("[SUCCESS] Both models do output the same tensors")
     else:
-        print("❌ Both model do **NOT** output the same tensors")
+        print("[FAIL] Both models do **NOT** output the same tensors")
     print("Absolute difference is:", max_absolute_diff)

@@ -313,7 +313,9 @@ if __name__ == "__main__":
         help="Path to the folder to output PyTorch model.",
     )
     parser.add_argument(
-        "--push_to_hub", action="store_true", help="Whether or not to push the converted model to the 🤗 hub."
+        "--push_to_hub",
+        action="store_true",
+        help="Whether or not to push the converted model to the Hugging Face hub.",
     )
     args = parser.parse_args()
     convert_deta_checkpoint(args.model_name, args.pytorch_dump_folder_path, args.push_to_hub)

@@ -320,7 +320,9 @@ if __name__ == "__main__":
         help="Path to the folder to output PyTorch model.",
     )
     parser.add_argument(
-        "--push_to_hub", action="store_true", help="Whether or not to push the converted model to the 🤗 hub."
+        "--push_to_hub",
+        action="store_true",
+        help="Whether or not to push the converted model to the Hugging Face hub.",
     )
     args = parser.parse_args()
     convert_deta_checkpoint(args.model_name, args.pytorch_dump_folder_path, args.push_to_hub)
@@ -22,7 +22,7 @@ from ....activations import ACT2FN
 from ....cache_utils import Cache
 from ....modeling_outputs import MoECausalLMOutputWithPast, MoEModelOutputWithPastAndCrossAttentions
 from ....modeling_utils import PreTrainedModel
-from ....utils import DUMMY_INPUTS, DUMMY_MASK, auto_docstring
+from ....utils import DUMMY_INPUTS, DUMMY_MASK
 from .configuration_gptsan_japanese import GPTSanJapaneseConfig

@@ -635,7 +635,6 @@ class GPTSanJapaneseModel(GPTSanJapanesePreTrainedModel):
     def set_input_embeddings(self, new_embeddings):
         self.embed_tokens = new_embeddings
 
-    @auto_docstring
     def forward(
         self,
         input_ids: Optional[torch.LongTensor] = None,
@@ -278,7 +278,9 @@ if __name__ == "__main__":
         "--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory."
     )
     parser.add_argument(
-        "--push_to_hub", action="store_true", help="Whether or not to push the converted model to the 🤗 hub."
+        "--push_to_hub",
+        action="store_true",
+        help="Whether or not to push the converted model to the Hugging Face hub.",
     )
 
     args = parser.parse_args()

@@ -284,7 +284,9 @@ if __name__ == "__main__":
         "--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory."
     )
     parser.add_argument(
-        "--push_to_hub", action="store_true", help="Whether or not to push the converted model to the 🤗 hub."
+        "--push_to_hub",
+        action="store_true",
+        help="Whether or not to push the converted model to the Hugging Face hub.",
     )
 
     args = parser.parse_args()
@@ -18,12 +18,13 @@ from typing import Optional
 
 from ...configuration_utils import PreTrainedConfig
 from ...utils import logging
+from ...utils.backbone_utils import BackboneConfigMixin, get_aligned_output_features_output_indices
 
 
 logger = logging.get_logger(__name__)
 
 
-class DINOv3ViTConfig(PreTrainedConfig):
+class DINOv3ViTConfig(BackboneConfigMixin, PreTrainedConfig):
     r"""
     This is the configuration class to store the configuration of a [`DINOv3Model`]. It is used to instantiate an
     DINOv3 model according to the specified arguments, defining the model architecture. Instantiating a configuration

@@ -86,6 +87,16 @@ class DINOv3ViTConfig(PreTrainedConfig):
         pos_embed_rescale (`float`, *optional*, defaults to 2.0):
             Amount to randomly rescale position embedding coordinates in log-uniform value in [1/rescale, rescale],
             applied only in training mode if not `None`.
+        out_features (`list[str]`, *optional*):
+            If used as backbone, list of features to output. Can be any of `"stem"`, `"stage1"`, `"stage2"`, etc.
+            (depending on how many stages the model has). Will default to the last stage if unset.
+        out_indices (`list[int]`, *optional*):
+            If used as backbone, list of indices of features to output. Can be any of 0, 1, 2, etc.
+            (depending on how many stages the model has). Will default to the last stage if unset.
+        apply_layernorm (`bool`, *optional*, defaults to `True`):
+            Whether to apply layer normalization to the feature maps when used as backbone.
+        reshape_hidden_states (`bool`, *optional*, defaults to `True`):
+            Whether to reshape the hidden states to spatial dimensions when used as backbone.
 
     Example:
 

@@ -131,6 +142,10 @@ class DINOv3ViTConfig(PreTrainedConfig):
         pos_embed_shift: Optional[float] = None,
         pos_embed_jitter: Optional[float] = None,
         pos_embed_rescale: Optional[float] = 2.0,
+        out_features: Optional[list[str]] = None,
+        out_indices: Optional[list[int]] = None,
+        apply_layernorm: bool = True,
+        reshape_hidden_states: bool = True,
         **kwargs,
     ):
         super().__init__(**kwargs)

@@ -161,6 +176,18 @@ class DINOv3ViTConfig(PreTrainedConfig):
         self.pos_embed_shift = pos_embed_shift
         self.pos_embed_jitter = pos_embed_jitter
         self.pos_embed_rescale = pos_embed_rescale
+        # Initialize backbone-specific configuration
+        self.apply_layernorm = apply_layernorm
+        self.reshape_hidden_states = reshape_hidden_states
+
+        # Initialize backbone stage names
+        stage_names = ["stem"] + [f"stage{i}" for i in range(1, num_hidden_layers + 1)]
+        self.stage_names = stage_names
+
+        # Initialize backbone features/indices
+        self._out_features, self._out_indices = get_aligned_output_features_output_indices(
+            out_features=out_features, out_indices=out_indices, stage_names=stage_names
+        )
 
 
 __all__ = ["DINOv3ViTConfig"]
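With `BackboneConfigMixin` in place, the config now describes which stages the backbone exposes. A short sketch of setting the new fields by hand (top-level import and at least 12 hidden layers are assumptions):

```python
from transformers import DINOv3ViTConfig  # top-level import assumed

config = DINOv3ViTConfig(out_indices=[4, 8, 12])  # new backbone fields from this hunk
print(config.stage_names[:3])  # ['stem', 'stage1', 'stage2']
print(config.out_features)     # e.g. ['stage4', 'stage8', 'stage12']
```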
@@ -29,11 +29,12 @@ from torch import nn
 
 from ...activations import ACT2FN
 from ...modeling_layers import GradientCheckpointingLayer
-from ...modeling_outputs import BaseModelOutputWithPooling
+from ...modeling_outputs import BackboneOutput, BaseModelOutputWithPooling
 from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
 from ...processing_utils import Unpack
 from ...pytorch_utils import compile_compatible_method_lru_cache
-from ...utils import TransformersKwargs, auto_docstring
+from ...utils import TransformersKwargs, auto_docstring, can_return_tuple
+from ...utils.backbone_utils import BackboneMixin
 from ...utils.generic import check_model_inputs
 from .configuration_dinov3_vit import DINOv3ViTConfig

@@ -522,10 +523,79 @@ class DINOv3ViTModel(DINOv3ViTPreTrainedModel):
         sequence_output = self.norm(hidden_states)
         pooled_output = sequence_output[:, 0, :]
 
-        return BaseModelOutputWithPooling(
-            last_hidden_state=sequence_output,
-            pooler_output=pooled_output,
-        )
+        return BaseModelOutputWithPooling(last_hidden_state=sequence_output, pooler_output=pooled_output)
 
 
-__all__ = ["DINOv3ViTModel", "DINOv3ViTPreTrainedModel"]
+@auto_docstring
+class DINOv3ViTBackbone(DINOv3ViTPreTrainedModel, BackboneMixin):
+    def __init__(self, config):
+        super().__init__(config)
+        super()._init_backbone(config)
+
+        self.embeddings = DINOv3ViTEmbeddings(config)
+        self.rope_embeddings = DINOv3ViTRopePositionEmbedding(config)
+        self.layer = nn.ModuleList([DINOv3ViTLayer(config) for _ in range(config.num_hidden_layers)])
+        self.norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.gradient_checkpointing = False
+
+        self.num_features = [config.hidden_size for _ in range(config.num_hidden_layers + 1)]
+        self.post_init()
+
+    def get_input_embeddings(self):
+        return self.embeddings.patch_embeddings
+
+    @check_model_inputs()
+    @can_return_tuple
+    def forward(
+        self,
+        pixel_values: torch.Tensor,
+        output_hidden_states: Optional[bool] = None,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> BackboneOutput:
+        pixel_values = pixel_values.to(self.embeddings.patch_embeddings.weight.dtype)
+        hidden_states = self.embeddings(pixel_values)
+        position_embeddings = self.rope_embeddings(pixel_values)
+
+        stage_hidden_states: list[torch.Tensor] = [hidden_states]
+
+        for layer_module in self.layer:
+            hidden_states = layer_module(hidden_states, position_embeddings=position_embeddings)
+            stage_hidden_states.append(hidden_states)
+
+        batch_size, _, image_height, image_width = pixel_values.shape
+        patch_size = self.config.patch_size
+        num_patches_height = image_height // patch_size
+        num_patches_width = image_width // patch_size
+
+        num_prefix = 1 + getattr(self.config, "num_register_tokens", 0)
+
+        feature_maps = []
+        sequence_output = None
+        last_stage_idx = len(self.stage_names) - 1
+        for idx, (stage_name, hidden_state) in enumerate(zip(self.stage_names, stage_hidden_states)):
+            if idx == last_stage_idx:
+                hidden_state = self.norm(hidden_state)
+                sequence_output = hidden_state
+            elif self.config.apply_layernorm:
+                hidden_state = self.norm(hidden_state)
+
+            if stage_name in self.out_features:
+                patch_tokens = hidden_state[:, num_prefix:, :]
+                if self.config.reshape_hidden_states:
+                    fmap = (
+                        patch_tokens.reshape(batch_size, num_patches_height, num_patches_width, patch_tokens.shape[-1])
+                        .permute(0, 3, 1, 2)
+                        .contiguous()
+                    )
+                else:
+                    fmap = patch_tokens
+
+                feature_maps.append(fmap)
+
+        output = BackboneOutput(feature_maps=tuple(feature_maps))
+        output.last_hidden_state = sequence_output
+
+        return output
+
+
+__all__ = ["DINOv3ViTModel", "DINOv3ViTPreTrainedModel", "DINOv3ViTBackbone"]
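The new `DINOv3ViTBackbone` returns a `BackboneOutput` whose `feature_maps` are reshaped to `(batch, hidden_size, height // patch_size, width // patch_size)` when `reshape_hidden_states=True`. A sketch of exercising it with a randomly initialized model, for shape checking only (module path inferred from the file names in the diff):

```python
import torch
from transformers import DINOv3ViTConfig
from transformers.models.dinov3_vit.modeling_dinov3_vit import DINOv3ViTBackbone  # path assumed

config = DINOv3ViTConfig()            # out_features defaults to the last stage
backbone = DINOv3ViTBackbone(config)  # random weights, shape check only

size = getattr(config, "image_size", 224)
pixel_values = torch.rand(1, 3, size, size)
with torch.no_grad():
    outputs = backbone(pixel_values)

# one feature map per entry in config.out_features
print([tuple(fm.shape) for fm in outputs.feature_maps])
```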
@@ -33,11 +33,12 @@ from transformers.models.llama.modeling_llama import LlamaMLP
 from transformers.models.pixtral.modeling_pixtral import PixtralAttention, rotate_half
 
 from ...modeling_layers import GradientCheckpointingLayer
-from ...modeling_outputs import BaseModelOutputWithPooling
+from ...modeling_outputs import BackboneOutput, BaseModelOutputWithPooling
 from ...modeling_utils import ALL_ATTENTION_FUNCTIONS
 from ...processing_utils import Unpack
 from ...pytorch_utils import compile_compatible_method_lru_cache
-from ...utils import TransformersKwargs, auto_docstring, logging
+from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, logging
+from ...utils.backbone_utils import BackboneMixin
 from ...utils.generic import check_model_inputs
 from .configuration_dinov3_vit import DINOv3ViTConfig

@@ -417,10 +418,79 @@ class DINOv3ViTModel(DINOv3ViTPreTrainedModel):
         sequence_output = self.norm(hidden_states)
         pooled_output = sequence_output[:, 0, :]
 
-        return BaseModelOutputWithPooling(
-            last_hidden_state=sequence_output,
-            pooler_output=pooled_output,
-        )
+        return BaseModelOutputWithPooling(last_hidden_state=sequence_output, pooler_output=pooled_output)
 
 
-__all__ = ["DINOv3ViTModel", "DINOv3ViTPreTrainedModel"]
+@auto_docstring
+class DINOv3ViTBackbone(DINOv3ViTPreTrainedModel, BackboneMixin):
+    def __init__(self, config):
+        super().__init__(config)
+        super()._init_backbone(config)
+
+        self.embeddings = DINOv3ViTEmbeddings(config)
+        self.rope_embeddings = DINOv3ViTRopePositionEmbedding(config)
+        self.layer = nn.ModuleList([DINOv3ViTLayer(config) for _ in range(config.num_hidden_layers)])
+        self.norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.gradient_checkpointing = False
+
+        self.num_features = [config.hidden_size for _ in range(config.num_hidden_layers + 1)]
+        self.post_init()
+
+    def get_input_embeddings(self):
+        return self.embeddings.patch_embeddings
+
+    @check_model_inputs()
+    @can_return_tuple
+    def forward(
+        self,
+        pixel_values: torch.Tensor,
+        output_hidden_states: Optional[bool] = None,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> BackboneOutput:
+        pixel_values = pixel_values.to(self.embeddings.patch_embeddings.weight.dtype)
+        hidden_states = self.embeddings(pixel_values)
+        position_embeddings = self.rope_embeddings(pixel_values)
+
+        stage_hidden_states: list[torch.Tensor] = [hidden_states]
+
+        for layer_module in self.layer:
+            hidden_states = layer_module(hidden_states, position_embeddings=position_embeddings)
+            stage_hidden_states.append(hidden_states)
+
+        batch_size, _, image_height, image_width = pixel_values.shape
+        patch_size = self.config.patch_size
+        num_patches_height = image_height // patch_size
+        num_patches_width = image_width // patch_size
+
+        num_prefix = 1 + getattr(self.config, "num_register_tokens", 0)
+
+        feature_maps = []
+        sequence_output = None
+        last_stage_idx = len(self.stage_names) - 1
+        for idx, (stage_name, hidden_state) in enumerate(zip(self.stage_names, stage_hidden_states)):
+            if idx == last_stage_idx:
+                hidden_state = self.norm(hidden_state)
+                sequence_output = hidden_state
+            elif self.config.apply_layernorm:
+                hidden_state = self.norm(hidden_state)
+
+            if stage_name in self.out_features:
+                patch_tokens = hidden_state[:, num_prefix:, :]
+                if self.config.reshape_hidden_states:
+                    fmap = (
+                        patch_tokens.reshape(batch_size, num_patches_height, num_patches_width, patch_tokens.shape[-1])
+                        .permute(0, 3, 1, 2)
+                        .contiguous()
+                    )
+                else:
+                    fmap = patch_tokens
+
+                feature_maps.append(fmap)
+
+        output = BackboneOutput(feature_maps=tuple(feature_maps))
+        output.last_hidden_state = sequence_output
+
+        return output
+
+
+__all__ = ["DINOv3ViTModel", "DINOv3ViTPreTrainedModel", "DINOv3ViTBackbone"]
@@ -227,7 +227,7 @@ if __name__ == "__main__":
     parser.add_argument(
         "--push_to_hub",
         action="store_true",
-        help="Whether or not to push the converted model and processor to the 🤗 hub.",
+        help="Whether or not to push the converted model and processor to the Hugging Face hub.",
     )
 
     args = parser.parse_args()

@@ -352,7 +352,7 @@ if __name__ == "__main__":
         "--pytorch_dump_folder_path", required=True, default=None, type=str, help="Path to the output PyTorch model."
     )
     parser.add_argument(
-        "--push_to_hub", default=None, type=str, help="Where to upload the converted model on the 🤗 hub."
+        "--push_to_hub", default=None, type=str, help="Where to upload the converted model on the Hugging Face hub."
     )
 
     args = parser.parse_args()

@@ -316,7 +316,7 @@ def convert_esm_checkpoint_to_pytorch(
     hf_tokens = hf_tokenizer([row[1] for row in sample_data], return_tensors="pt", padding=True)
     success = torch.all(hf_tokens["input_ids"] == batch_tokens)
 
-    print("Do both models tokenizers output the same tokens?", "🔥" if success else "💩")
+    print("Do both models tokenizers output the same tokens?", "[PASS]" if success else "[FAIL]")
     if not success:
         raise Exception("Tokenization does not match!")

@@ -348,7 +348,7 @@ def convert_esm_checkpoint_to_pytorch(
     success = torch.allclose(our_output, their_output, atol=1e-5)
 
     print(f"max_absolute_diff = {max_absolute_diff}")  # ~ 1e-5
-    print("Do both models output the same tensors?", "🔥" if success else "💩")
+    print("Do both models output the same tensors?", "[PASS]" if success else "[FAIL]")
 
     if not success:
         raise Exception("Something went wRoNg")

@@ -362,7 +362,7 @@ def convert_esm_checkpoint_to_pytorch(
 
     print("Contact prediction testing:")
     print(f"max_absolute_diff = {max_absolute_diff}")  # ~ 1e-5
-    print("Do both models output the same tensors?", "🔥" if success else "💩")
+    print("Do both models output the same tensors?", "[PASS]" if success else "[FAIL]")
 
     if not success:
         raise Exception("Something went wRoNg")

@@ -198,7 +198,7 @@ if __name__ == "__main__":
         "--pytorch_dump_folder_path", required=True, default=None, type=str, help="Path to the output PyTorch model."
     )
     parser.add_argument(
-        "--push_to_hub", default=None, type=str, help="Where to upload the converted model on the 🤗 hub."
+        "--push_to_hub", default=None, type=str, help="Where to upload the converted model on the Hugging Face hub."
     )
 
     args = parser.parse_args()

@@ -122,7 +122,7 @@ if __name__ == "__main__":
         "--pytorch_dump_folder_path", required=True, default=None, type=str, help="Path to the output PyTorch model."
     )
     parser.add_argument(
-        "--push_to_hub", default=None, type=str, help="Where to upload the converted model on the 🤗 hub."
+        "--push_to_hub", default=None, type=str, help="Where to upload the converted model on the Hugging Face hub."
    )
 
     args = parser.parse_args()

@@ -89,7 +89,7 @@ if __name__ == "__main__":
         help="Path to the output `FastSpeech2ConformerModel` PyTorch model.",
     )
     parser.add_argument(
-        "--push_to_hub", default=None, type=str, help="Where to upload the converted model on the 🤗 hub."
+        "--push_to_hub", default=None, type=str, help="Where to upload the converted model on the Hugging Face hub."
     )
 
     args = parser.parse_args()

@@ -250,7 +250,9 @@ def main():
     )
 
     parser.add_argument(
-        "--push_to_hub", action="store_true", help="Whether or not to push the converted model to the 🤗 hub."
+        "--push_to_hub",
+        action="store_true",
+        help="Whether or not to push the converted model to the Hugging Face hub.",
     )
     args = parser.parse_args()
     write_tokenizer(
@@ -26,6 +26,7 @@ from ...cache_utils import Cache, DynamicCache, EncoderDecoderCache
 from ...generation import GenerationMixin
 from ...masking_utils import create_causal_mask
 from ...modeling_flash_attention_utils import is_flash_attn_available
+from ...modeling_layers import GradientCheckpointingLayer
 from ...modeling_outputs import (
     BaseModelOutputWithPastAndCrossAttentions,
     CausalLMOutputWithCrossAttentions,

@@ -266,7 +267,7 @@ class GPTBigCodeMLP(nn.Module):
         return hidden_states
 
 
-class GPTBigCodeBlock(nn.Module):
+class GPTBigCodeBlock(GradientCheckpointingLayer):
     def __init__(self, config, layer_idx=None):
         super().__init__()
         hidden_size = config.hidden_size

@@ -291,9 +292,9 @@ class GPTBigCodeBlock(nn.Module):
     def forward(
         self,
         hidden_states: Optional[tuple[torch.Tensor]],
+        encoder_hidden_states: Optional[torch.Tensor] = None,
         layer_past: Optional[Cache] = None,
         attention_mask: Optional[torch.Tensor] = None,
-        encoder_hidden_states: Optional[torch.Tensor] = None,
         encoder_attention_mask: Optional[torch.Tensor] = None,
         use_cache: Optional[bool] = False,
         output_attentions: Optional[bool] = False,

@@ -536,10 +537,10 @@ class GPTBigCodeModel(GPTBigCodePreTrainedModel):
                 all_hidden_states = all_hidden_states + (hidden_states,)
 
             outputs = block(
-                hidden_states,
-                past_key_values,
-                causal_mask,
+                hidden_states,  # as a positional argument for gradient checkpointing
+                encoder_hidden_states,  # as a positional argument for gradient checkpointing
+                layer_past=past_key_values,  # as keyword argument so it can be removed by GradientCheckpointingLayer
+                attention_mask=causal_mask,
                 encoder_attention_mask=encoder_attention_mask,
                 use_cache=use_cache,
                 output_attentions=output_attentions,
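Deriving `GPTBigCodeBlock` from `GradientCheckpointingLayer` is what allows keyword arguments such as `layer_past` to be dropped when the block is re-run under activation checkpointing. From the user side nothing changes except that the usual switch now works through these blocks; a minimal sketch (the checkpoint id is only an example):

```python
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("bigcode/gpt_bigcode-santacoder")
model.gradient_checkpointing_enable()  # blocks recompute activations during backward to save memory
model.train()
```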
@@ -481,7 +481,9 @@ if __name__ == "__main__":
         "--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory."
     )
     parser.add_argument(
-        "--push_to_hub", action="store_true", help="Whether or not to push the converted model to the 🤗 hub."
+        "--push_to_hub",
+        action="store_true",
+        help="Whether or not to push the converted model to the Hugging Face hub.",
     )
     parser.add_argument(
         "--verify_logits", action="store_false", help="Whether or not to verify logits after conversion."

@@ -210,7 +210,7 @@ if __name__ == "__main__":
     parser.add_argument(
         "--push_to_hub",
         action="store_true",
-        help="Whether or not to push the converted model and processor to the 🤗 hub using the provided `model_name`.",
+        help="Whether or not to push the converted model and processor to the Hugging Face hub using the provided `model_name`.",
     )
     args = parser.parse_args()

@@ -353,7 +353,9 @@ if __name__ == "__main__":
         help="Whether or not to verify the logits against the original implementation.",
     )
     parser.add_argument(
-        "--push-to-hub", action="store_true", help="Whether or not to push the converted model to the 🤗 hub."
+        "--push-to-hub",
+        action="store_true",
+        help="Whether or not to push the converted model to the Hugging Face hub.",
     )
     parser.add_argument(
         "--base-model",

@@ -253,7 +253,7 @@ def main():
     parser.add_argument(
         "--push_to_hub",
         action="store_true",
-        help="Whether or not to push the model to the 🤗 Hub.",
+        help="Whether or not to push the model to the Hugging Face Hub.",
     )
     parser.add_argument(
         "--verify_logits", action="store_false", help="Whether or not to verify logits after conversion."

@@ -433,7 +433,9 @@ def main():
     )
 
     parser.add_argument(
-        "--push_to_hub", action="store_true", help="Whether or not to push the converted model to the 🤗 hub."
+        "--push_to_hub",
+        action="store_true",
+        help="Whether or not to push the converted model to the Hugging Face hub.",
     )
     args = parser.parse_args()
     write_tokenizer(

@@ -387,7 +387,9 @@ if __name__ == "__main__":
         "--pytorch_dump_folder_path", type=str, required=True, help="Path to the output PyTorch model directory."
     )
     parser.add_argument(
-        "--push_to_hub", action="store_true", help="Whether or not to push the converted model to the 🤗 hub."
+        "--push_to_hub",
+        action="store_true",
+        help="Whether or not to push the converted model to the Hugging Face hub.",
     )
     args = parser.parse_args()

@@ -267,7 +267,9 @@ if __name__ == "__main__":
         "--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory."
     )
     parser.add_argument(
-        "--push_to_hub", action="store_true", help="Whether or not to push the converted model to the 🤗 hub."
+        "--push_to_hub",
+        action="store_true",
+        help="Whether or not to push the converted model to the Hugging Face hub.",
     )
     args = parser.parse_args()

@@ -379,7 +379,9 @@ if __name__ == "__main__":
         "--pytorch_dump_folder_path", type=str, required=True, help="Path to the output PyTorch model directory."
     )
     parser.add_argument(
-        "--push_to_hub", action="store_true", help="Whether or not to push the converted model to the 🤗 hub."
+        "--push_to_hub",
+        action="store_true",
+        help="Whether or not to push the converted model to the Hugging Face hub.",
     )
     args = parser.parse_args()
@@ -826,7 +826,7 @@ class OriginalMask2FormerCheckpointToOursConverter:
         checkpoints: list[Path] = checkpoints_dir.glob("**/*.pkl")
 
         for checkpoint in checkpoints:
-            logger.info(f"💪 Converting {checkpoint.stem}")
+            logger.info(f"Converting {checkpoint.stem}")
             # find associated config file
 
             # dataset_name e.g 'coco'

@@ -902,7 +902,7 @@ def test(
         "The predicted masks are not the same."
     )
 
-    logger.info("✅ Test passed!")
+    logger.info("Test passed!")
 
 
 def get_model_name(checkpoint_file: Path):

@@ -1012,9 +1012,9 @@ if __name__ == "__main__":
         if model_name in high_tolerance_models:
             tolerance = 3e-1
 
-        logger.info(f"🪄 Testing {model_name}...")
+        logger.info(f"Testing {model_name}...")
         test(original_model, mask2former_for_segmentation, image_processor, tolerance)
-        logger.info(f"🪄 Pushing {model_name} to hub...")
+        logger.info(f"Pushing {model_name} to hub...")
 
         image_processor.push_to_hub(model_name)
         mask2former_for_segmentation.push_to_hub(model_name)
@@ -109,9 +109,6 @@ class Mask2FormerImageProcessorFast(BaseImageProcessorFast):
     valid_kwargs = Mask2FormerImageProcessorKwargs
 
     def __init__(self, **kwargs: Unpack[Mask2FormerImageProcessorKwargs]) -> None:
-        if "pad_and_return_pixel_mask" in kwargs:
-            kwargs["do_pad"] = kwargs.pop("pad_and_return_pixel_mask")
-
         size = kwargs.pop("size", None)
         max_size = kwargs.pop("max_size", None)

@@ -224,7 +221,7 @@ class Mask2FormerImageProcessorFast(BaseImageProcessorFast):
         padding = [0, 0, padding_right, padding_bottom]
         images = F.pad(images, padding, fill=fill)
         if segmentation_maps is not None:
-            segmentation_maps = F.pad(segmentation_maps, padding, fill=ignore_index)
+            segmentation_maps = [F.pad(mask, padding, fill=ignore_index) for mask in segmentation_maps]
 
         # Make a pixel mask for the image, where 1 indicates a valid pixel and 0 indicates padding.
         pixel_mask = torch.zeros((images.shape[0], *padded_size), dtype=torch.int64, device=images.device)

@@ -318,9 +315,11 @@ class Mask2FormerImageProcessorFast(BaseImageProcessorFast):
                 stacked_images = self.resize(
                     image=stacked_images, size=size, size_divisor=size_divisor, interpolation=interpolation
                 )
-                if segmentation_maps is not None:
+            if segmentation_maps is not None:
+                stacked_segmentation_maps = grouped_segmentation_maps[shape]
+                if do_resize:
                     stacked_segmentation_maps = self.resize(
-                        image=grouped_segmentation_maps[shape],
+                        image=stacked_segmentation_maps,
                         size=size,
                         size_divisor=size_divisor,
                         interpolation=F.InterpolationMode.NEAREST_EXACT,

@@ -357,14 +356,18 @@ class Mask2FormerImageProcessorFast(BaseImageProcessorFast):
             mask_labels.append(masks)
             class_labels.append(classes)
 
-        grouped_images, grouped_images_index = group_images_by_shape(resized_images, disable_grouping=disable_grouping)
-        processed_images_grouped = {}
-        processed_pixel_masks_grouped = {}
         if segmentation_maps is not None:
-            grouped_segmentation_maps, grouped_segmentation_maps_index = group_images_by_shape(
-                mask_labels, disable_grouping=disable_grouping
+            # group mask_labels as paired inputs and not images so as not to stack them
+            grouped_images, grouped_segmentation_maps, grouped_images_index = group_images_by_shape(
+                resized_images, mask_labels, disable_grouping=disable_grouping
             )
             processed_segmentation_maps_grouped = {}
+        else:
+            grouped_images, grouped_images_index = group_images_by_shape(
+                resized_images, disable_grouping=disable_grouping
+            )
+        processed_images_grouped = {}
+        processed_pixel_masks_grouped = {}
         for shape, stacked_images in grouped_images.items():
             # Fused rescale and normalize
             stacked_images = self.rescale_and_normalize(

@@ -379,7 +382,8 @@ class Mask2FormerImageProcessorFast(BaseImageProcessorFast):
             processed_images_grouped[shape] = padded_images
             processed_pixel_masks_grouped[shape] = pixel_masks
             if segmentation_maps is not None:
-                processed_segmentation_maps_grouped[shape] = padded_segmentation_maps.squeeze(1)
+                processed_segmentation_maps_grouped[shape] = padded_segmentation_maps
 
         processed_images = reorder_images(processed_images_grouped, grouped_images_index)
         processed_pixel_masks = reorder_images(processed_pixel_masks_grouped, grouped_images_index)
         encoded_inputs = BatchFeature(

@@ -390,7 +394,7 @@ class Mask2FormerImageProcessorFast(BaseImageProcessorFast):
             tensor_type=return_tensors,
         )
         if segmentation_maps is not None:
-            mask_labels = reorder_images(processed_segmentation_maps_grouped, grouped_segmentation_maps_index)
+            mask_labels = reorder_images(processed_segmentation_maps_grouped, grouped_images_index)
             # we cannot batch them since they don't share a common class size
             encoded_inputs["mask_labels"] = mask_labels
             encoded_inputs["class_labels"] = class_labels
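The net effect of these processor changes is that segmentation maps travel through `group_images_by_shape` as paired inputs, staying aligned with their images without being stacked. A hedged usage sketch (the checkpoint id is an example; any Mask2Former checkpoint with a fast image processor should behave similarly):

```python
import numpy as np
from transformers import AutoImageProcessor

processor = AutoImageProcessor.from_pretrained(
    "facebook/mask2former-swin-tiny-ade-semantic", use_fast=True
)

image = np.random.randint(0, 255, (512, 683, 3), dtype=np.uint8)
seg_map = np.random.randint(0, 10, (512, 683), dtype=np.uint8)

inputs = processor(images=[image], segmentation_maps=[seg_map], return_tensors="pt")
print(inputs["pixel_values"].shape, len(inputs["mask_labels"]), len(inputs["class_labels"]))
```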
@@ -548,7 +548,7 @@ class OriginalMaskFormerCheckpointToOursConverter:
         checkpoints: list[Path] = checkpoints_dir.glob("**/*.pkl")
 
         for checkpoint in checkpoints:
-            logger.info(f"💪 Converting {checkpoint.stem}")
+            logger.info(f"Converting {checkpoint.stem}")
             # find associated config file
             config: Path = config_dir / checkpoint.parents[0].stem / "swin" / f"{checkpoint.stem}.yaml"

@@ -607,7 +607,7 @@ def test(original_model, our_model: MaskFormerForInstanceSegmentation, image_pro
         "The segmentation image is not the same."
     )
 
-    logger.info("✅ Test passed!")
+    logger.info("Test passed!")
 
 
 def get_name(checkpoint_file: Path):

@@ -715,7 +715,7 @@ if __name__ == "__main__":
         test(original_model, mask_former_for_instance_segmentation, image_processor)
 
         model_name = get_name(checkpoint_file)
-        logger.info(f"🪄 Saving {model_name}")
+        logger.info(f"Saving {model_name}")
 
         image_processor.save_pretrained(save_directory / model_name)
         mask_former_for_instance_segmentation.save_pretrained(save_directory / model_name)
@@ -391,7 +391,9 @@ if __name__ == "__main__":
         "--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory."
     )
     parser.add_argument(
-        "--push_to_hub", action="store_true", help="Whether or not to push the converted model to the 🤗 hub."
+        "--push_to_hub",
+        action="store_true",
+        help="Whether or not to push the converted model to the Hugging Face hub.",
     )
 
     args = parser.parse_args()

@@ -334,7 +334,9 @@ if __name__ == "__main__":
         "--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory."
     )
     parser.add_argument(
-        "--push_to_hub", action="store_true", help="Whether or not to push the converted model to the 🤗 hub."
+        "--push_to_hub",
+        action="store_true",
+        help="Whether or not to push the converted model to the Hugging Face hub.",
     )
 
     args = parser.parse_args()
@@ -114,9 +114,6 @@ class MaskFormerImageProcessorFast(BaseImageProcessorFast):
     valid_kwargs = MaskFormerImageProcessorKwargs
 
     def __init__(self, **kwargs: Unpack[MaskFormerImageProcessorKwargs]) -> None:
-        if "pad_and_return_pixel_mask" in kwargs:
-            kwargs["do_pad"] = kwargs.pop("pad_and_return_pixel_mask")
-
         size = kwargs.pop("size", None)
         max_size = kwargs.pop("max_size", None)

@@ -229,7 +226,7 @@ class MaskFormerImageProcessorFast(BaseImageProcessorFast):
         padding = [0, 0, padding_right, padding_bottom]
         images = F.pad(images, padding, fill=fill)
         if segmentation_maps is not None:
-            segmentation_maps = F.pad(segmentation_maps, padding, fill=ignore_index)
+            segmentation_maps = [F.pad(mask, padding, fill=ignore_index) for mask in segmentation_maps]
 
         # Make a pixel mask for the image, where 1 indicates a valid pixel and 0 indicates padding.
         pixel_mask = torch.zeros((images.shape[0], *padded_size), dtype=torch.int64, device=images.device)

@@ -323,9 +320,11 @@ class MaskFormerImageProcessorFast(BaseImageProcessorFast):
                 stacked_images = self.resize(
                     image=stacked_images, size=size, size_divisor=size_divisor, interpolation=interpolation
                 )
-                if segmentation_maps is not None:
+            if segmentation_maps is not None:
+                stacked_segmentation_maps = grouped_segmentation_maps[shape]
+                if do_resize:
                     stacked_segmentation_maps = self.resize(
-                        image=grouped_segmentation_maps[shape],
+                        image=stacked_segmentation_maps,
                         size=size,
                         size_divisor=size_divisor,
                         interpolation=F.InterpolationMode.NEAREST_EXACT,

@@ -362,14 +361,18 @@ class MaskFormerImageProcessorFast(BaseImageProcessorFast):
             mask_labels.append(masks)
             class_labels.append(classes)
 
-        grouped_images, grouped_images_index = group_images_by_shape(resized_images, disable_grouping=disable_grouping)
-        processed_images_grouped = {}
-        processed_pixel_masks_grouped = {}
        if segmentation_maps is not None:
-            grouped_segmentation_maps, grouped_segmentation_maps_index = group_images_by_shape(
-                mask_labels, disable_grouping=disable_grouping
+            # group mask_labels as paired inputs and not images so as not to stack them
+            grouped_images, grouped_segmentation_maps, grouped_images_index = group_images_by_shape(
+                resized_images, mask_labels, disable_grouping=disable_grouping
            )
            processed_segmentation_maps_grouped = {}
+        else:
+            grouped_images, grouped_images_index = group_images_by_shape(
+                resized_images, disable_grouping=disable_grouping
+            )
+        processed_images_grouped = {}
+        processed_pixel_masks_grouped = {}
        for shape, stacked_images in grouped_images.items():
            # Fused rescale and normalize
            stacked_images = self.rescale_and_normalize(

@@ -384,7 +387,8 @@ class MaskFormerImageProcessorFast(BaseImageProcessorFast):
             processed_images_grouped[shape] = padded_images
             processed_pixel_masks_grouped[shape] = pixel_masks
             if segmentation_maps is not None:
-                processed_segmentation_maps_grouped[shape] = padded_segmentation_maps.squeeze(1)
+                processed_segmentation_maps_grouped[shape] = padded_segmentation_maps
 
         processed_images = reorder_images(processed_images_grouped, grouped_images_index)
         processed_pixel_masks = reorder_images(processed_pixel_masks_grouped, grouped_images_index)
         encoded_inputs = BatchFeature(

@@ -395,7 +399,7 @@ class MaskFormerImageProcessorFast(BaseImageProcessorFast):
             tensor_type=return_tensors,
         )
         if segmentation_maps is not None:
-            mask_labels = reorder_images(processed_segmentation_maps_grouped, grouped_segmentation_maps_index)
+            mask_labels = reorder_images(processed_segmentation_maps_grouped, grouped_images_index)
             # we cannot batch them since they don't share a common class size
             encoded_inputs["mask_labels"] = mask_labels
             encoded_inputs["class_labels"] = class_labels
@@ -349,10 +349,10 @@ def verify_conversion(
 
     # Check if they're close
     if orig_logits.shape == hf_logits.shape and torch.allclose(orig_logits, hf_logits, atol=1e-4):
-        print("✅ Conversion verified! Outputs match.")
+        print("[SUCCESS] Conversion verified! Outputs match.")
         return True
     else:
-        print("❌ Conversion failed! Outputs don't match.")
+        print("[FAIL] Conversion failed! Outputs don't match.")
         if orig_logits.numel() > 0 and hf_logits.numel() > 0:
             print(f"Max difference: {(orig_logits - hf_logits).abs().max()}")
         return False

@@ -365,9 +365,9 @@ def push_to_hub(hf_model: MetaClip2Model, processor: CLIPProcessor, repo_name: s
     try:
         hf_model.push_to_hub(repo_name)
         processor.push_to_hub(repo_name)
-        print(f"✅ Successfully pushed to {repo_name}")
+        print(f"[SUCCESS] Successfully pushed to {repo_name}")
     except Exception as e:
-        print(f"❌ Failed to push to hub: {e}")
+        print(f"[FAIL] Failed to push to hub: {e}")
 
 
 def main():
@ -186,7 +186,7 @@ if __name__ == "__main__":
"--pytorch_dump_folder_path", required=True, default=None, type=str, help="Path to the output PyTorch model."
)
parser.add_argument(
"--push_to_hub", default=None, type=str, help="Where to upload the converted model on the 🤗 hub."
"--push_to_hub", default=None, type=str, help="Where to upload the converted model on the Hugging Face hub."
)

args = parser.parse_args()

@ -327,7 +327,9 @@ if __name__ == "__main__":
help="Whether to verify hidden_state against the original implementation.",
)
parser.add_argument(
"--push_to_hub", action="store_true", help="Whether or not to push the converted model to the 🤗 hub."
"--push_to_hub",
action="store_true",
help="Whether or not to push the converted model to the Hugging Face hub.",
)

args = parser.parse_args()

@ -234,7 +234,9 @@ if __name__ == "__main__":
"--pytorch_dump_folder_path", required=True, type=str, help="Path to the output PyTorch model directory."
)
parser.add_argument(
"--push_to_hub", action="store_true", help="Whether or not to push the converted model to the 🤗 hub."
"--push_to_hub",
action="store_true",
help="Whether or not to push the converted model to the Hugging Face hub.",
)

args = parser.parse_args()

@ -336,7 +336,9 @@ if __name__ == "__main__":
"--pytorch_dump_folder_path", required=True, type=str, help="Path to the output PyTorch model directory."
)
parser.add_argument(
"--push_to_hub", action="store_true", help="Whether or not to push the converted model to the 🤗 hub."
"--push_to_hub",
action="store_true",
help="Whether or not to push the converted model to the Hugging Face hub.",
)

args = parser.parse_args()

@ -302,7 +302,9 @@ if __name__ == "__main__":
"--pytorch_dump_folder_path", required=True, type=str, help="Path to the output PyTorch model directory."
)
parser.add_argument(
"--push_to_hub", action="store_true", help="Whether or not to push the converted model to the 🤗 hub."
"--push_to_hub",
action="store_true",
help="Whether or not to push the converted model to the Hugging Face hub.",
)

args = parser.parse_args()

@ -268,7 +268,7 @@ if __name__ == "__main__":
"--pytorch_dump_folder_path", required=True, default=None, type=str, help="Path to the output PyTorch model."
)
parser.add_argument(
"--push_to_hub", default=None, type=str, help="Where to upload the converted model on the 🤗 hub."
"--push_to_hub", default=None, type=str, help="Where to upload the converted model on the Hugging Face hub."
)

args = parser.parse_args()
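These conversion-script hunks all apply the same two edits: the 🤗 emoji in the `--push_to_hub` help text becomes "Hugging Face", and the call is reflowed to one keyword per line where it no longer fits on one. A minimal standalone example of the resulting style, using only the standard `argparse` module (hypothetical toy script, not any specific converter):

```python
import argparse

parser = argparse.ArgumentParser(description="Toy checkpoint conversion script.")
parser.add_argument(
    "--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory."
)
parser.add_argument(
    "--push_to_hub",
    action="store_true",
    help="Whether or not to push the converted model to the Hugging Face hub.",
)
args = parser.parse_args()
print(args.push_to_hub)
```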
@ -25,7 +25,7 @@ from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
from ...activations import ACT2FN
from ...cache_utils import Cache, DynamicCache, EncoderDecoderCache
from ...generation import GenerationMixin
from ...modeling_attn_mask_utils import AttentionMaskConverter
from ...masking_utils import create_bidirectional_mask, create_causal_mask
from ...modeling_layers import GradientCheckpointingLayer
from ...modeling_outputs import (
BaseModelOutput,
@ -41,18 +41,11 @@ from ...utils import (
DUMMY_INPUTS,
DUMMY_MASK,
auto_docstring,
is_torch_flex_attn_available,
is_torchdynamo_compiling,
logging,
)
from .configuration_mt5 import MT5Config


if is_torch_flex_attn_available():
from torch.nn.attention.flex_attention import BlockMask

from ...integrations.flex_attention import make_flex_block_causal_mask

logger = logging.get_logger(__name__)


@ -735,40 +728,31 @@ class MT5Stack(MT5PreTrainedModel):
past_key_values_length, past_key_values_length + seq_length, device=inputs_embeds.device
)

if attention_mask is None and not is_torchdynamo_compiling():
# required mask seq length can be calculated via length of past cache
mask_seq_length = past_key_values_length + seq_length
attention_mask = torch.ones(batch_size, mask_seq_length, device=inputs_embeds.device)

if self.config.is_decoder:
causal_mask = self._update_causal_mask(
attention_mask,
inputs_embeds,
cache_position,
past_key_values.self_attention_cache
attention_mask = create_causal_mask(
config=self.config,
input_embeds=inputs_embeds,
attention_mask=attention_mask,
cache_position=cache_position,
past_key_values=past_key_values.self_attention_cache
if isinstance(past_key_values, EncoderDecoderCache)
else past_key_values,
output_attentions,
)
elif attention_mask is not None:
causal_mask = attention_mask[:, None, None, :]
causal_mask = causal_mask.to(dtype=inputs_embeds.dtype)
causal_mask = (1.0 - causal_mask) * torch.finfo(inputs_embeds.dtype).min
else:
causal_mask = None
attention_mask = create_bidirectional_mask(
config=self.config,
input_embeds=inputs_embeds,
attention_mask=attention_mask,
)

# If a 2D or 3D attention mask is provided for the cross-attention
# we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length]
encoder_extended_attention_mask = None
if self.is_decoder and encoder_hidden_states is not None:
encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size()
encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)
if encoder_attention_mask is None:
encoder_attention_mask = torch.ones(
encoder_hidden_shape, device=inputs_embeds.device, dtype=torch.long
)
encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask)
else:
encoder_extended_attention_mask = None
encoder_extended_attention_mask = create_bidirectional_mask(
config=self.config,
input_embeds=inputs_embeds,
attention_mask=encoder_attention_mask,
encoder_hidden_states=encoder_hidden_states,
)

all_hidden_states = () if output_hidden_states else None
all_attentions = () if output_attentions else None
@ -778,13 +762,13 @@ class MT5Stack(MT5PreTrainedModel):

hidden_states = self.dropout(inputs_embeds)

for i, layer_module in enumerate(self.block):
for layer_module in self.block:
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)

layer_outputs = layer_module(
hidden_states,
causal_mask,
attention_mask,
position_bias,
encoder_hidden_states,
encoder_extended_attention_mask,
@ -837,131 +821,6 @@ class MT5Stack(MT5PreTrainedModel):
cross_attentions=all_cross_attentions,
)
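The hunks above swap MT5Stack's hand-rolled `_update_causal_mask` path for the shared `create_causal_mask` / `create_bidirectional_mask` helpers and drop the explicit `torch.ones` fallback mask. Conceptually, the causal helper turns a 2D padding mask into a 4D additive mask; a rough standalone approximation of that output (plain PyTorch, hypothetical function name, ignoring the cache handling and alternative attention backends the real helpers also cover):

```python
import torch

def additive_causal_mask(padding_mask: torch.Tensor, dtype=torch.float32) -> torch.Tensor:
    """Build a (batch, 1, seq, seq) additive mask: 0 where attention is allowed,
    dtype-min where it is disallowed (future positions or padded key positions)."""
    batch, seq = padding_mask.shape
    min_value = torch.finfo(dtype).min
    # Upper triangle above the diagonal blocks attention to future tokens.
    causal = torch.triu(torch.full((seq, seq), min_value, dtype=dtype), diagonal=1)
    mask = causal[None, None, :, :].expand(batch, 1, seq, seq).clone()
    # Also block attention to padded key positions.
    mask = mask.masked_fill(padding_mask[:, None, None, :] == 0, min_value)
    return mask

padding = torch.tensor([[1, 1, 1, 0]])  # one padded position at the end
print(additive_causal_mask(padding)[0, 0])
```

The encoder/cross-attention side uses the bidirectional variant instead, which only masks padded positions and never the future.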
# Copied from transformers.models.gptj.modeling_gptj.GPTJModel._update_causal_mask
|
||||
def _update_causal_mask(
|
||||
self,
|
||||
attention_mask: Union[torch.Tensor, "BlockMask"],
|
||||
input_tensor: torch.Tensor,
|
||||
cache_position: torch.Tensor,
|
||||
past_key_values: Cache,
|
||||
output_attentions: bool = False,
|
||||
):
|
||||
if self.config._attn_implementation == "flash_attention_2":
|
||||
if attention_mask is not None and (attention_mask == 0.0).any():
|
||||
return attention_mask
|
||||
return None
|
||||
if self.config._attn_implementation == "flex_attention":
|
||||
if isinstance(attention_mask, torch.Tensor):
|
||||
attention_mask = make_flex_block_causal_mask(attention_mask)
|
||||
return attention_mask
|
||||
|
||||
# For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, in
|
||||
# order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail
|
||||
# to infer the attention mask.
|
||||
past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
|
||||
using_compilable_cache = past_key_values.is_compileable if past_key_values is not None else False
|
||||
|
||||
# When output attentions is True, sdpa implementation's forward method calls the eager implementation's forward
|
||||
if self.config._attn_implementation == "sdpa" and not using_compilable_cache and not output_attentions:
|
||||
if AttentionMaskConverter._ignore_causal_mask_sdpa(
|
||||
attention_mask,
|
||||
inputs_embeds=input_tensor,
|
||||
past_key_values_length=past_seen_tokens,
|
||||
is_training=self.training,
|
||||
):
|
||||
return None
|
||||
|
||||
dtype = input_tensor.dtype
|
||||
sequence_length = input_tensor.shape[1]
|
||||
if using_compilable_cache:
|
||||
target_length = past_key_values.get_max_cache_shape()
|
||||
else:
|
||||
target_length = (
|
||||
attention_mask.shape[-1]
|
||||
if isinstance(attention_mask, torch.Tensor)
|
||||
else past_seen_tokens + sequence_length + 1
|
||||
)
|
||||
|
||||
# In case the provided `attention` mask is 2D, we generate a causal mask here (4D).
|
||||
causal_mask = self._prepare_4d_causal_attention_mask_with_cache_position(
|
||||
attention_mask,
|
||||
sequence_length=sequence_length,
|
||||
target_length=target_length,
|
||||
dtype=dtype,
|
||||
cache_position=cache_position,
|
||||
batch_size=input_tensor.shape[0],
|
||||
)
|
||||
|
||||
if (
|
||||
self.config._attn_implementation == "sdpa"
|
||||
and attention_mask is not None
|
||||
and attention_mask.device.type in ["cuda", "xpu", "npu"]
|
||||
and not output_attentions
|
||||
):
|
||||
# Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when
|
||||
# using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path.
|
||||
# Details: https://github.com/pytorch/pytorch/issues/110213
|
||||
min_dtype = torch.finfo(dtype).min
|
||||
causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype)
|
||||
|
||||
return causal_mask
|
||||
|
||||
@staticmethod
|
||||
# Copied from transformers.models.gptj.modeling_gptj.GPTJModel._prepare_4d_causal_attention_mask_with_cache_position
|
||||
def _prepare_4d_causal_attention_mask_with_cache_position(
|
||||
attention_mask: torch.Tensor,
|
||||
sequence_length: int,
|
||||
target_length: int,
|
||||
dtype: torch.dtype,
|
||||
cache_position: torch.Tensor,
|
||||
batch_size: int,
|
||||
**kwargs,
|
||||
):
|
||||
"""
|
||||
Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
|
||||
`(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.
|
||||
|
||||
Args:
|
||||
attention_mask (`torch.Tensor`):
|
||||
A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
|
||||
`(batch_size, 1, query_length, key_value_length)`.
|
||||
sequence_length (`int`):
|
||||
The sequence length being processed.
|
||||
target_length (`int`):
|
||||
The target length: when generating with static cache, the mask should be as long as the static cache,
|
||||
to account for the 0 padding, the part of the cache that is not filled yet.
|
||||
dtype (`torch.dtype`):
|
||||
The dtype to use for the 4D attention mask.
|
||||
cache_position (`torch.Tensor`):
|
||||
Indices depicting the position of the input sequence tokens in the sequence.
|
||||
batch_size (`torch.Tensor`):
|
||||
Batch size.
|
||||
"""
|
||||
if attention_mask is not None and attention_mask.dim() == 4:
|
||||
# In this case we assume that the mask comes already in inverted form and requires no inversion or slicing.
|
||||
causal_mask = attention_mask
|
||||
else:
|
||||
min_dtype = torch.finfo(dtype).min
|
||||
causal_mask = torch.full(
|
||||
(sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=cache_position.device
|
||||
)
|
||||
if sequence_length != 1:
|
||||
causal_mask = torch.triu(causal_mask, diagonal=1)
|
||||
causal_mask *= torch.arange(target_length, device=cache_position.device) > cache_position.reshape(-1, 1)
|
||||
causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)
|
||||
if attention_mask is not None:
|
||||
causal_mask = causal_mask.clone() # copy to contiguous memory for in-place edit
|
||||
mask_length = attention_mask.shape[-1]
|
||||
padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :].to(
|
||||
causal_mask.device
|
||||
)
|
||||
padding_mask = padding_mask == 0
|
||||
causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
|
||||
padding_mask, min_dtype
|
||||
)
|
||||
|
||||
return causal_mask
|
||||
|
||||
|
||||
@auto_docstring
|
||||
class MT5Model(MT5PreTrainedModel):
|
||||
|
||||
@ -221,7 +221,7 @@ if __name__ == "__main__":
help="Path to the output PyTorch model directory.",
)
parser.add_argument(
"--push_to_hub", default=None, type=str, help="Where to upload the converted model on the 🤗 hub."
"--push_to_hub", default=None, type=str, help="Where to upload the converted model on the Hugging Face hub."
)
parser.add_argument(
"--device", default="cpu", type=str, help="Torch device to run the conversion, either cpu or cuda."

@ -254,7 +254,7 @@ if __name__ == "__main__":
"--push_to_hub",
default="musicgen-melody",
type=str,
help="Where to upload the converted model on the 🤗 hub.",
help="Where to upload the converted model on the Hugging Face hub.",
)
parser.add_argument(
"--device", default="cpu", type=str, help="Torch device to run the conversion, either cpu or cuda."

@ -275,7 +275,7 @@ if __name__ == "__main__":
parser.add_argument(
"--push_to_hub",
action="store_true",
help="Whether or not to push the converted model and processor to the 🤗 hub.",
help="Whether or not to push the converted model and processor to the Hugging Face hub.",
)

args = parser.parse_args()
@ -23,9 +23,7 @@ from typing import Optional, Union

import numpy as np

from transformers.tokenization_utils_base import INIT_TOKENIZER_DOCSTRING
from transformers.tokenization_utils_fast import PreTrainedTokenizerFast
from transformers.utils import add_end_docstrings

from ...utils import is_levenshtein_available, is_nltk_available, logging, requires_backends

@ -40,16 +38,6 @@ if is_nltk_available():
logger = logging.get_logger(__name__)


INIT_TOKENIZER_DOCSTRING += """
tokenizer_object ([`tokenizers.Tokenizer`]):
A [`tokenizers.Tokenizer`] object from 🤗 tokenizers to instantiate from. See [Using tokenizers from 🤗
tokenizers](../fast_tokenizers) for more information.
tokenizer_file ([`str`]):
A path to a local JSON file representing a previously serialized [`tokenizers.Tokenizer`] object from 🤗
tokenizers.
"""


VOCAB_FILES_NAMES = {"tokenizer_file": "tokenizer.json"}


@ -358,7 +346,6 @@ def remove_slice_from_lines(lines, clean_text, slice) -> str:
return to_delete.strip()


@add_end_docstrings(INIT_TOKENIZER_DOCSTRING)
class NougatTokenizerFast(PreTrainedTokenizerFast):
"""
Fast tokenizer for Nougat (backed by HuggingFace tokenizers library).
@ -339,7 +339,9 @@ if __name__ == "__main__":
"--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory."
)
parser.add_argument(
"--push_to_hub", action="store_true", help="Whether or not to push the converted model to the 🤗 hub."
"--push_to_hub",
action="store_true",
help="Whether or not to push the converted model to the Hugging Face hub.",
)
parser.add_argument(
"--use_timm_backbone", action="store_true", help="Whether or not to use timm backbone for vision backbone."

@ -926,7 +926,7 @@ class OriginalOneFormerCheckpointToOursConverter:
checkpoints: list[Path] = checkpoints_dir.glob("**/*.pth")

for checkpoint in checkpoints:
logger.info(f"💪 Converting {checkpoint.stem}")
logger.info(f"Converting {checkpoint.stem}")
# find associated config file
config: Path = config_dir / f"{checkpoint.stem}.yaml"

@ -1054,7 +1054,7 @@ def test(
"The segmentation image is not the same."
)

logger.info("✅ Test passed!")
logger.info("Test passed!")


def get_name(checkpoint_file: Path):
@ -1175,7 +1175,7 @@ if __name__ == "__main__":
)

model_name = get_name(checkpoint_file)
logger.info(f"🪄 Saving {model_name}")
logger.info(f"Saving {model_name}")

processor.save_pretrained(save_directory / model_name)
oneformer_for_universal_segmentation.save_pretrained(save_directory / model_name)
@ -839,6 +839,7 @@ class Qwen2_5_VLProcessorKwargs(ProcessingKwargs, total=False):
"padding": False,
"return_mm_token_type_ids": False,
},
"videos_kwargs": {"return_metadata": True},
}


@ -922,10 +923,17 @@ class Qwen2_5_VLProcessor(Qwen2VLProcessor):
image_grid_thw = image_inputs["image_grid_thw"]

if videos is not None:
fps = output_kwargs["videos_kwargs"].get("fps", 2.0)
videos_inputs = self.video_processor(videos=videos, **output_kwargs["videos_kwargs"])
video_grid_thw = videos_inputs["video_grid_thw"]

# Get video metadata
if not kwargs.get("return_metadata"):
video_metadata = videos_inputs.pop("video_metadata")
else:
video_metadata = videos_inputs["video_metadata"]

fps = [metadata.sampled_fps for metadata in video_metadata]

if isinstance(fps, (int, float)):
second_per_grid_ts = [self.video_processor.temporal_patch_size / fps] * len(video_grid_thw)
elif hasattr(fps, "__len__") and len(fps) == len(video_grid_thw):

@ -41,6 +41,7 @@ class Qwen2_5_VLProcessorKwargs(ProcessingKwargs, total=False):
"padding": False,
"return_mm_token_type_ids": False,
},
"videos_kwargs": {"return_metadata": True},
}


@ -129,10 +130,17 @@ class Qwen2_5_VLProcessor(ProcessorMixin):
image_grid_thw = image_inputs["image_grid_thw"]

if videos is not None:
fps = output_kwargs["videos_kwargs"].get("fps", 2.0)
videos_inputs = self.video_processor(videos=videos, **output_kwargs["videos_kwargs"])
video_grid_thw = videos_inputs["video_grid_thw"]

# Get video metadata
if not kwargs.get("return_metadata"):
video_metadata = videos_inputs.pop("video_metadata")
else:
video_metadata = videos_inputs["video_metadata"]

fps = [metadata.sampled_fps for metadata in video_metadata]

if isinstance(fps, (int, float)):
second_per_grid_ts = [self.video_processor.temporal_patch_size / fps] * len(video_grid_thw)
elif hasattr(fps, "__len__") and len(fps) == len(video_grid_thw):
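In both copies of the processor, the change stops assuming a single `fps` kwarg and instead reads each video's `sampled_fps` from the returned metadata before computing `second_per_grid_ts`. A toy standalone version of that per-video computation (hypothetical values; `temporal_patch_size = 2` is an assumption standing in for the video processor's configured value):

```python
from dataclasses import dataclass

@dataclass
class VideoMetadata:
    sampled_fps: float  # frames per second actually kept after frame sampling

temporal_patch_size = 2  # assumed; the real value comes from the video processor config
video_metadata = [VideoMetadata(sampled_fps=2.0), VideoMetadata(sampled_fps=1.0)]

# One temporal grid step covers `temporal_patch_size` sampled frames, so its
# duration in seconds is patch_size / sampled_fps for that particular video.
fps = [m.sampled_fps for m in video_metadata]
second_per_grid_ts = [temporal_patch_size / f for f in fps]
print(second_per_grid_ts)  # [1.0, 2.0]
```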
@ -86,7 +86,7 @@ class RagTokenizer:
**kwargs,
) -> BatchEncoding:
warnings.warn(
"`prepare_seq2seq_batch` is deprecated and will be removed in version 5 of 🤗 Transformers. Use the "
"`prepare_seq2seq_batch` is deprecated and will be removed in version 5 of Hugging Face Transformers. Use the "
"regular `__call__` method to prepare your inputs and the tokenizer under the `with_target_tokenizer` "
"context manager to prepare your targets. See the documentation of your specific tokenizer for more "
"details",
@ -150,7 +150,7 @@ def convert_roberta_checkpoint_to_pytorch(
max_absolute_diff = torch.max(torch.abs(our_output - their_output)).item()
print(f"max_absolute_diff = {max_absolute_diff}")  # ~ 1e-7
success = torch.allclose(our_output, their_output, atol=1e-3)
print("Do both models output the same tensors?", "🔥" if success else "💩")
print("Do both models output the same tensors?", "[PASS]" if success else "[FAIL]")
if not success:
raise Exception("Something went wRoNg")


@ -214,7 +214,9 @@ if __name__ == "__main__":
help="Whether or not to verify the logits against the original implementation.",
)
parser.add_argument(
"--push_to_hub", action="store_true", help="Whether or not to push the converted model to the 🤗 hub."
"--push_to_hub",
action="store_true",
help="Whether or not to push the converted model to the Hugging Face hub.",
)

args = parser.parse_args()

@ -525,7 +525,9 @@ if __name__ == "__main__":
help="Whether to verify logits against the original implementation.",
)
parser.add_argument(
"--push_to_hub", action="store_true", help="Whether or not to push the converted model to the 🤗 hub."
"--push_to_hub",
action="store_true",
help="Whether or not to push the converted model to the Hugging Face hub.",
)

args = parser.parse_args()

@ -431,7 +431,9 @@ if __name__ == "__main__":
help="Whether to verify logits against the original implementation.",
)
parser.add_argument(
"--push_to_hub", action="store_true", help="Whether or not to push the converted model to the 🤗 hub."
"--push_to_hub",
action="store_true",
help="Whether or not to push the converted model to the Hugging Face hub.",
)

args = parser.parse_args()

@ -95,7 +95,7 @@ if __name__ == "__main__":
"--pytorch_dump_folder_path", required=True, default=None, type=str, help="Path to the output PyTorch model."
)
parser.add_argument(
"--push_to_hub", default=None, type=str, help="Where to upload the converted model on the 🤗 hub."
"--push_to_hub", default=None, type=str, help="Where to upload the converted model on the Hugging Face hub."
)

args = parser.parse_args()

@ -387,7 +387,7 @@ if __name__ == "__main__":
"--pytorch_dump_folder_path", required=True, default=None, type=str, help="Path to the output PyTorch model."
)
parser.add_argument(
"--push_to_hub", default=None, type=str, help="Where to upload the converted model on the 🤗 hub."
"--push_to_hub", default=None, type=str, help="Where to upload the converted model on the Hugging Face hub."
)

args = parser.parse_args()
@ -21,6 +21,7 @@ import torch
from torch import nn

from ...activations import ACT2CLS
from ...modeling_layers import GradientCheckpointingLayer
from ...modeling_outputs import BaseModelOutputWithNoAttention, ImageClassifierOutputWithNoAttention
from ...modeling_utils import PreTrainedModel
from ...utils import auto_docstring, logging
@ -295,7 +296,7 @@ class SwiftFormerEncoderBlock(nn.Module):
return x


class SwiftFormerStage(nn.Module):
class SwiftFormerStage(GradientCheckpointingLayer):
"""
A Swiftformer stage consisting of a series of `SwiftFormerConvEncoder` blocks and a final
`SwiftFormerEncoderBlock`.

@ -175,7 +175,9 @@ if __name__ == "__main__":
"--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory."
)
parser.add_argument(
"--push_to_hub", action="store_true", help="Whether or not to push the converted model to the 🤗 hub."
"--push_to_hub",
action="store_true",
help="Whether or not to push the converted model to the Hugging Face hub.",
)

args = parser.parse_args()
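Making `SwiftFormerStage` a `GradientCheckpointingLayer` lets its forward pass be recomputed during backprop instead of keeping all intermediate activations in memory. A minimal standalone illustration of that trade-off with plain `torch.utils.checkpoint` (toy module, not SwiftFormer's actual code):

```python
import torch
from torch.utils.checkpoint import checkpoint

stage = torch.nn.Sequential(torch.nn.Linear(16, 16), torch.nn.GELU(), torch.nn.Linear(16, 16))
x = torch.randn(4, 16, requires_grad=True)

# Activations inside `stage` are not stored; they are recomputed in backward,
# trading extra compute for lower peak memory.
y = checkpoint(stage, x, use_reentrant=False)
y.sum().backward()
print(x.grad.shape)
```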
@ -25,7 +25,7 @@ from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
|
||||
from ...activations import ACT2FN
|
||||
from ...cache_utils import Cache, DynamicCache, EncoderDecoderCache
|
||||
from ...generation import GenerationMixin
|
||||
from ...modeling_attn_mask_utils import AttentionMaskConverter
|
||||
from ...masking_utils import create_bidirectional_mask, create_causal_mask
|
||||
from ...modeling_layers import GradientCheckpointingLayer
|
||||
from ...modeling_outputs import (
|
||||
BaseModelOutput,
|
||||
@ -41,19 +41,11 @@ from ...utils import (
|
||||
DUMMY_INPUTS,
|
||||
DUMMY_MASK,
|
||||
auto_docstring,
|
||||
is_torch_flex_attn_available,
|
||||
is_torchdynamo_compiling,
|
||||
logging,
|
||||
)
|
||||
from .configuration_t5 import T5Config
|
||||
|
||||
|
||||
if is_torch_flex_attn_available():
|
||||
from torch.nn.attention.flex_attention import BlockMask
|
||||
|
||||
from ...integrations.flex_attention import make_flex_block_causal_mask
|
||||
|
||||
|
||||
logger = logging.get_logger(__name__)
|
||||
|
||||
|
||||
@ -738,40 +730,31 @@ class T5Stack(T5PreTrainedModel):
|
||||
past_key_values_length, past_key_values_length + seq_length, device=inputs_embeds.device
|
||||
)
|
||||
|
||||
if attention_mask is None and not is_torchdynamo_compiling():
|
||||
# required mask seq length can be calculated via length of past cache
|
||||
mask_seq_length = past_key_values_length + seq_length
|
||||
attention_mask = torch.ones(batch_size, mask_seq_length, device=inputs_embeds.device)
|
||||
|
||||
if self.config.is_decoder:
|
||||
causal_mask = self._update_causal_mask(
|
||||
attention_mask,
|
||||
inputs_embeds,
|
||||
cache_position,
|
||||
past_key_values.self_attention_cache
|
||||
attention_mask = create_causal_mask(
|
||||
config=self.config,
|
||||
input_embeds=inputs_embeds,
|
||||
attention_mask=attention_mask,
|
||||
cache_position=cache_position,
|
||||
past_key_values=past_key_values.self_attention_cache
|
||||
if isinstance(past_key_values, EncoderDecoderCache)
|
||||
else past_key_values,
|
||||
output_attentions,
|
||||
)
|
||||
elif attention_mask is not None:
|
||||
causal_mask = attention_mask[:, None, None, :]
|
||||
causal_mask = causal_mask.to(dtype=inputs_embeds.dtype)
|
||||
causal_mask = (1.0 - causal_mask) * torch.finfo(inputs_embeds.dtype).min
|
||||
else:
|
||||
causal_mask = None
|
||||
attention_mask = create_bidirectional_mask(
|
||||
config=self.config,
|
||||
input_embeds=inputs_embeds,
|
||||
attention_mask=attention_mask,
|
||||
)
|
||||
|
||||
# If a 2D or 3D attention mask is provided for the cross-attention
|
||||
# we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length]
|
||||
encoder_extended_attention_mask = None
|
||||
if self.is_decoder and encoder_hidden_states is not None:
|
||||
encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size()
|
||||
encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)
|
||||
if encoder_attention_mask is None:
|
||||
encoder_attention_mask = torch.ones(
|
||||
encoder_hidden_shape, device=inputs_embeds.device, dtype=torch.long
|
||||
)
|
||||
encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask)
|
||||
else:
|
||||
encoder_extended_attention_mask = None
|
||||
encoder_extended_attention_mask = create_bidirectional_mask(
|
||||
config=self.config,
|
||||
input_embeds=inputs_embeds,
|
||||
attention_mask=encoder_attention_mask,
|
||||
encoder_hidden_states=encoder_hidden_states,
|
||||
)
|
||||
|
||||
all_hidden_states = () if output_hidden_states else None
|
||||
all_attentions = () if output_attentions else None
|
||||
@ -781,13 +764,13 @@ class T5Stack(T5PreTrainedModel):
|
||||
|
||||
hidden_states = self.dropout(inputs_embeds)
|
||||
|
||||
for i, layer_module in enumerate(self.block):
|
||||
for layer_module in self.block:
|
||||
if output_hidden_states:
|
||||
all_hidden_states = all_hidden_states + (hidden_states,)
|
||||
|
||||
layer_outputs = layer_module(
|
||||
hidden_states,
|
||||
causal_mask,
|
||||
attention_mask,
|
||||
position_bias,
|
||||
encoder_hidden_states,
|
||||
encoder_extended_attention_mask,
|
||||
@ -840,131 +823,6 @@ class T5Stack(T5PreTrainedModel):
|
||||
cross_attentions=all_cross_attentions,
|
||||
)
|
||||
|
||||
# Copied from transformers.models.gptj.modeling_gptj.GPTJModel._update_causal_mask
|
||||
def _update_causal_mask(
|
||||
self,
|
||||
attention_mask: Union[torch.Tensor, "BlockMask"],
|
||||
input_tensor: torch.Tensor,
|
||||
cache_position: torch.Tensor,
|
||||
past_key_values: Cache,
|
||||
output_attentions: bool = False,
|
||||
):
|
||||
if self.config._attn_implementation == "flash_attention_2":
|
||||
if attention_mask is not None and (attention_mask == 0.0).any():
|
||||
return attention_mask
|
||||
return None
|
||||
if self.config._attn_implementation == "flex_attention":
|
||||
if isinstance(attention_mask, torch.Tensor):
|
||||
attention_mask = make_flex_block_causal_mask(attention_mask)
|
||||
return attention_mask
|
||||
|
||||
# For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, in
|
||||
# order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail
|
||||
# to infer the attention mask.
|
||||
past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
|
||||
using_compilable_cache = past_key_values.is_compileable if past_key_values is not None else False
|
||||
|
||||
# When output attentions is True, sdpa implementation's forward method calls the eager implementation's forward
|
||||
if self.config._attn_implementation == "sdpa" and not using_compilable_cache and not output_attentions:
|
||||
if AttentionMaskConverter._ignore_causal_mask_sdpa(
|
||||
attention_mask,
|
||||
inputs_embeds=input_tensor,
|
||||
past_key_values_length=past_seen_tokens,
|
||||
is_training=self.training,
|
||||
):
|
||||
return None
|
||||
|
||||
dtype = input_tensor.dtype
|
||||
sequence_length = input_tensor.shape[1]
|
||||
if using_compilable_cache:
|
||||
target_length = past_key_values.get_max_cache_shape()
|
||||
else:
|
||||
target_length = (
|
||||
attention_mask.shape[-1]
|
||||
if isinstance(attention_mask, torch.Tensor)
|
||||
else past_seen_tokens + sequence_length + 1
|
||||
)
|
||||
|
||||
# In case the provided `attention` mask is 2D, we generate a causal mask here (4D).
|
||||
causal_mask = self._prepare_4d_causal_attention_mask_with_cache_position(
|
||||
attention_mask,
|
||||
sequence_length=sequence_length,
|
||||
target_length=target_length,
|
||||
dtype=dtype,
|
||||
cache_position=cache_position,
|
||||
batch_size=input_tensor.shape[0],
|
||||
)
|
||||
|
||||
if (
|
||||
self.config._attn_implementation == "sdpa"
|
||||
and attention_mask is not None
|
||||
and attention_mask.device.type in ["cuda", "xpu", "npu"]
|
||||
and not output_attentions
|
||||
):
|
||||
# Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when
|
||||
# using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path.
|
||||
# Details: https://github.com/pytorch/pytorch/issues/110213
|
||||
min_dtype = torch.finfo(dtype).min
|
||||
causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype)
|
||||
|
||||
return causal_mask
|
||||
|
||||
@staticmethod
|
||||
# Copied from transformers.models.gptj.modeling_gptj.GPTJModel._prepare_4d_causal_attention_mask_with_cache_position
|
||||
def _prepare_4d_causal_attention_mask_with_cache_position(
|
||||
attention_mask: torch.Tensor,
|
||||
sequence_length: int,
|
||||
target_length: int,
|
||||
dtype: torch.dtype,
|
||||
cache_position: torch.Tensor,
|
||||
batch_size: int,
|
||||
**kwargs,
|
||||
):
|
||||
"""
|
||||
Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
|
||||
`(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.
|
||||
|
||||
Args:
|
||||
attention_mask (`torch.Tensor`):
|
||||
A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
|
||||
`(batch_size, 1, query_length, key_value_length)`.
|
||||
sequence_length (`int`):
|
||||
The sequence length being processed.
|
||||
target_length (`int`):
|
||||
The target length: when generating with static cache, the mask should be as long as the static cache,
|
||||
to account for the 0 padding, the part of the cache that is not filled yet.
|
||||
dtype (`torch.dtype`):
|
||||
The dtype to use for the 4D attention mask.
|
||||
cache_position (`torch.Tensor`):
|
||||
Indices depicting the position of the input sequence tokens in the sequence.
|
||||
batch_size (`torch.Tensor`):
|
||||
Batch size.
|
||||
"""
|
||||
if attention_mask is not None and attention_mask.dim() == 4:
|
||||
# In this case we assume that the mask comes already in inverted form and requires no inversion or slicing.
|
||||
causal_mask = attention_mask
|
||||
else:
|
||||
min_dtype = torch.finfo(dtype).min
|
||||
causal_mask = torch.full(
|
||||
(sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=cache_position.device
|
||||
)
|
||||
if sequence_length != 1:
|
||||
causal_mask = torch.triu(causal_mask, diagonal=1)
|
||||
causal_mask *= torch.arange(target_length, device=cache_position.device) > cache_position.reshape(-1, 1)
|
||||
causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)
|
||||
if attention_mask is not None:
|
||||
causal_mask = causal_mask.clone() # copy to contiguous memory for in-place edit
|
||||
mask_length = attention_mask.shape[-1]
|
||||
padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :].to(
|
||||
causal_mask.device
|
||||
)
|
||||
padding_mask = padding_mask == 0
|
||||
causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
|
||||
padding_mask, min_dtype
|
||||
)
|
||||
|
||||
return causal_mask
|
||||
|
||||
|
||||
@auto_docstring
|
||||
class T5Model(T5PreTrainedModel):
|
||||
|
||||
@ -448,11 +448,28 @@ class T5GemmaEncoderLayer(GradientCheckpointingLayer):
return hidden_states


class T5GemmaDecoderLayer(T5GemmaEncoderLayer):
class T5GemmaDecoderLayer(GradientCheckpointingLayer):
"""Decoder sub-layer: an extra cross-attention layer."""

def __init__(self, config, layer_idx: int):
super().__init__(config, layer_idx)
super().__init__()
self.hidden_size = config.hidden_size
self.config = config
self.layer_idx = layer_idx
self.attention_type = config.layer_types[layer_idx]

self.self_attn = T5GemmaSelfAttention(
config=config,
layer_idx=layer_idx,
)
self.pre_self_attn_layernorm = T5GemmaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
self.post_self_attn_layernorm = T5GemmaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)

self.mlp = T5GemmaMLP(config)
self.pre_feedforward_layernorm = T5GemmaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
self.post_feedforward_layernorm = T5GemmaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)

self.dropout = nn.Dropout(config.dropout_rate)
self.cross_attn = T5GemmaCrossAttention(config=config, layer_idx=layer_idx)
self.pre_cross_attn_layernorm = T5GemmaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
self.post_cross_attn_layernorm = T5GemmaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
@ -732,7 +749,7 @@ class T5GemmaEncoder(T5GemmaPreTrainedModel):
)


class T5GemmaDecoder(T5GemmaEncoder):
class T5GemmaDecoder(T5GemmaPreTrainedModel):
_can_record_outputs = {
"attentions": OutputRecorder(T5GemmaSelfAttention, index=1),
"cross_attentions": OutputRecorder(T5GemmaCrossAttention, index=1),
@ -741,11 +758,20 @@ class T5GemmaDecoder(T5GemmaEncoder):

def __init__(self, config):
super().__init__(config)
self.padding_idx = config.pad_token_id
self.vocab_size = config.vocab_size

self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
self.norm = T5GemmaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
self.gradient_checkpointing = False

self.layers = nn.ModuleList(
[T5GemmaDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
)
self.dropout = nn.Dropout(config.dropout_rate)
self.rotary_emb = T5GemmaRotaryEmbedding(config=config)

# Initialize weights and apply final processing
self.post_init()

@check_model_inputs()
@ -771,7 +797,9 @@ class T5GemmaDecoder(T5GemmaEncoder):
inputs_embeds = self.embed_tokens(input_ids)

if not self.training and use_cache and past_key_values is None:
past_key_values = EncoderDecoderCache(DynamicCache(config=self.config), DynamicCache(config=self.config))
# We do not pass the config to the cross attn cache to avoid initializing SWA
# --> we use full attention between our cross attentions
past_key_values = EncoderDecoderCache(DynamicCache(config=self.config), DynamicCache())
if cache_position is None:
past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
cache_position = torch.arange(

@ -517,11 +517,28 @@ class T5GemmaEncoderLayer(GradientCheckpointingLayer):
return hidden_states


class T5GemmaDecoderLayer(T5GemmaEncoderLayer):
class T5GemmaDecoderLayer(GradientCheckpointingLayer):
"""Decoder sub-layer: an extra cross-attention layer."""

def __init__(self, config, layer_idx: int):
super().__init__(config, layer_idx)
super().__init__()
self.hidden_size = config.hidden_size
self.config = config
self.layer_idx = layer_idx
self.attention_type = config.layer_types[layer_idx]

self.self_attn = T5GemmaSelfAttention(
config=config,
layer_idx=layer_idx,
)
self.pre_self_attn_layernorm = T5GemmaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
self.post_self_attn_layernorm = T5GemmaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)

self.mlp = T5GemmaMLP(config)
self.pre_feedforward_layernorm = T5GemmaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
self.post_feedforward_layernorm = T5GemmaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)

self.dropout = nn.Dropout(config.dropout_rate)
self.cross_attn = T5GemmaCrossAttention(config=config, layer_idx=layer_idx)
self.pre_cross_attn_layernorm = T5GemmaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
self.post_cross_attn_layernorm = T5GemmaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
@ -770,7 +787,7 @@ class T5GemmaEncoder(T5GemmaPreTrainedModel):
)


class T5GemmaDecoder(T5GemmaEncoder):
class T5GemmaDecoder(T5GemmaPreTrainedModel):
_can_record_outputs = {
"attentions": OutputRecorder(T5GemmaSelfAttention, index=1),
"cross_attentions": OutputRecorder(T5GemmaCrossAttention, index=1),
@ -779,11 +796,20 @@ class T5GemmaDecoder(T5GemmaEncoder):

def __init__(self, config):
super().__init__(config)
self.padding_idx = config.pad_token_id
self.vocab_size = config.vocab_size

self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
self.norm = T5GemmaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
self.gradient_checkpointing = False

self.layers = nn.ModuleList(
[T5GemmaDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
)
self.dropout = nn.Dropout(config.dropout_rate)
self.rotary_emb = T5GemmaRotaryEmbedding(config=config)

# Initialize weights and apply final processing
self.post_init()

@check_model_inputs()
@ -809,7 +835,9 @@ class T5GemmaDecoder(T5GemmaEncoder):
inputs_embeds = self.embed_tokens(input_ids)

if not self.training and use_cache and past_key_values is None:
past_key_values = EncoderDecoderCache(DynamicCache(config=self.config), DynamicCache(config=self.config))
# We do not pass the config to the cross attn cache to avoid initializing SWA
# --> we use full attention between our cross attentions
past_key_values = EncoderDecoderCache(DynamicCache(config=self.config), DynamicCache())
if cache_position is None:
past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
cache_position = torch.arange(
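The standalone `__init__` above makes the decoder layer's structure explicit: self-attention, cross-attention, and the MLP are each wrapped in a pre-norm and a post-norm, with dropout on every residual branch. A toy, self-contained block with the same norm placement (plain PyTorch, simplified multi-head attention, not the actual T5Gemma implementation):

```python
import torch
from torch import nn

class ToyDecoderBlock(nn.Module):
    def __init__(self, hidden: int = 32, dropout: float = 0.1):
        super().__init__()
        self.self_attn = nn.MultiheadAttention(hidden, num_heads=4, batch_first=True)
        self.cross_attn = nn.MultiheadAttention(hidden, num_heads=4, batch_first=True)
        self.mlp = nn.Sequential(nn.Linear(hidden, 4 * hidden), nn.GELU(), nn.Linear(4 * hidden, hidden))
        # pre-/post-norm pairs around each sub-layer, mirroring the layout above
        self.pre_self, self.post_self = nn.LayerNorm(hidden), nn.LayerNorm(hidden)
        self.pre_cross, self.post_cross = nn.LayerNorm(hidden), nn.LayerNorm(hidden)
        self.pre_mlp, self.post_mlp = nn.LayerNorm(hidden), nn.LayerNorm(hidden)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, encoder_states):
        h = self.pre_self(x)
        h, _ = self.self_attn(h, h, h, need_weights=False)
        x = x + self.dropout(self.post_self(h))

        h = self.pre_cross(x)
        h, _ = self.cross_attn(h, encoder_states, encoder_states, need_weights=False)
        x = x + self.dropout(self.post_cross(h))

        h = self.post_mlp(self.mlp(self.pre_mlp(x)))
        return x + self.dropout(h)

block = ToyDecoderBlock()
out = block(torch.randn(2, 5, 32), torch.randn(2, 7, 32))
print(out.shape)  # torch.Size([2, 5, 32])
```

The cache change in the same hunks keeps the self-attention cache config-aware while leaving the cross-attention cache config-free, so cross-attention never picks up sliding-window behavior.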
@ -311,7 +311,9 @@ if __name__ == "__main__":
"--pytorch_dump_folder_path", default=None, type=str, help="Path to the folder to output PyTorch model."
)
parser.add_argument(
"--push_to_hub", action="store_true", help="Whether or not to push the converted model to the 🤗 hub."
"--push_to_hub",
action="store_true",
help="Whether or not to push the converted model to the Hugging Face hub.",
)
args = parser.parse_args()
convert_table_transformer_checkpoint(args.checkpoint_url, args.pytorch_dump_folder_path, args.push_to_hub)

@ -428,7 +428,9 @@ if __name__ == "__main__":
"--pytorch_dump_folder_path", default=None, type=str, help="Path to the folder to output PyTorch model."
)
parser.add_argument(
"--push_to_hub", action="store_true", help="Whether or not to push the converted model to the 🤗 hub."
"--push_to_hub",
action="store_true",
help="Whether or not to push the converted model to the Hugging Face hub.",
)
args = parser.parse_args()
convert_table_transformer_checkpoint(args.checkpoint_url, args.pytorch_dump_folder_path, args.push_to_hub)

@ -244,7 +244,9 @@ if __name__ == "__main__":
)
parser.add_argument("--model_name", default="timesformer-base-finetuned-k400", type=str, help="Name of the model.")
parser.add_argument(
"--push_to_hub", action="store_true", help="Whether or not to push the converted model to the 🤗 hub."
"--push_to_hub",
action="store_true",
help="Whether or not to push the converted model to the Hugging Face hub.",
)

args = parser.parse_args()

@ -217,7 +217,9 @@ if __name__ == "__main__":
"--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory."
)
parser.add_argument(
"--push_to_hub", action="store_true", help="Whether or not to push the converted model to the 🤗 hub."
"--push_to_hub",
action="store_true",
help="Whether or not to push the converted model to the Hugging Face hub.",
)

args = parser.parse_args()

@ -141,7 +141,7 @@ def main():
"--pytorch_dump_folder_path", required=True, default=None, type=str, help="Path to the output PyTorch model."
)
parser.add_argument(
"--push_to_hub", default=None, type=str, help="Where to upload the converted model on the 🤗 hub."
"--push_to_hub", default=None, type=str, help="Where to upload the converted model on the Hugging Face hub."
)
parser.add_argument(
"--safe_serialization", action="store_true", help="Whether to save the model using `safetensors`."

@ -207,7 +207,9 @@ if __name__ == "__main__":
"--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory."
)
parser.add_argument(
"--push_to_hub", action="store_true", help="Whether or not to push the converted model to the 🤗 hub."
"--push_to_hub",
action="store_true",
help="Whether or not to push the converted model to the Hugging Face hub.",
)

args = parser.parse_args()

@ -290,7 +290,9 @@ if __name__ == "__main__":
"--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory."
)
parser.add_argument(
"--push_to_hub", action="store_true", help="Whether or not to push the converted model to the 🤗 hub."
"--push_to_hub",
action="store_true",
help="Whether or not to push the converted model to the Hugging Face hub.",
)

args = parser.parse_args()

@ -317,7 +317,9 @@ if __name__ == "__main__":
)
parser.add_argument("--model_name", default="videomae-base", type=str, help="Name of the model.")
parser.add_argument(
"--push_to_hub", action="store_true", help="Whether or not to push the converted model to the 🤗 hub."
"--push_to_hub",
action="store_true",
help="Whether or not to push the converted model to the Hugging Face hub.",
)

args = parser.parse_args()

@ -163,7 +163,9 @@ if __name__ == "__main__":
"--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory."
)
parser.add_argument(
"--push_to_hub", action="store_true", help="Whether or not to push the converted model to the 🤗 hub."
"--push_to_hub",
action="store_true",
help="Whether or not to push the converted model to the Hugging Face hub.",
)

args = parser.parse_args()

@ -410,7 +410,9 @@ def main():
"--pytorch_dump_folder_path", default=None, type=str, help="Path to store the converted model."
)
parser.add_argument(
"--push_to_hub", action="store_true", help="Whether or not to push the converted model to the 🤗 hub."
"--push_to_hub",
action="store_true",
help="Whether or not to push the converted model to the Hugging Face hub.",
)
parser.add_argument(
"--check_logits", action="store_false", help="Whether or not to verify the logits of the converted model."

@ -374,7 +374,7 @@ if __name__ == "__main__":
"--pytorch_dump_folder_path", required=True, default=None, type=str, help="Path to the output PyTorch model."
)
parser.add_argument(
"--push_to_hub", default=None, type=str, help="Where to upload the converted model on the 🤗 hub."
"--push_to_hub", default=None, type=str, help="Where to upload the converted model on the Hugging Face hub."
)

args = parser.parse_args()

@ -336,7 +336,7 @@ if __name__ == "__main__":
parser.add_argument(
"--push_to_hub",
action="store_true",
help="Whether or not to push the converted model to the 🤗 hub.",
help="Whether or not to push the converted model to the Hugging Face hub.",
)
parser.add_argument("--upload_original", action="store_true", help="upload the original checkpoint")


@ -379,7 +379,9 @@ if __name__ == "__main__":
"--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory."
)
parser.add_argument(
"--push_to_hub", action="store_true", help="Whether or not to push the converted model to the 🤗 hub."
"--push_to_hub",
action="store_true",
help="Whether or not to push the converted model to the Hugging Face hub.",
)

args = parser.parse_args()
Some files were not shown because too many files have changed in this diff.