🚨🚨🚨 Fully remove Tensorflow and Jax support library-wide (#40760)

* setup

* start the purge

* continue the purge

* more and more

* more

* continue the quest: remove loading tf/jax checkpoints

* style

* fix configs

* oups forgot conflict

* continue

* still grinding

* always more

* in tje zone

* never stop

* should fix doc

* fic

* fix

* fix

* fix tests

* still tests

* fix non-deterministic

* style

* remove last rebase issues

* onnx configs

* still on the grind

* always more references

* nearly the end

* could it really be the end?

* small fix

* add converters back

* post rebase

* latest qwen

* add back all converters

* explicitly add functions in converters

* re-add
This commit is contained in:
Cyril Vallez
2025-09-18 18:27:39 +02:00
committed by GitHub
parent 5ac3c5171a
commit 4df2529d79
854 changed files with 3786 additions and 181263 deletions

View File

@ -80,7 +80,7 @@ Explore the [Hub](https://huggingface.com/) today to find a model and use Transf
## Installation
Transformers works with Python 3.9+ [PyTorch](https://pytorch.org/get-started/locally/) 2.1+, [TensorFlow](https://www.tensorflow.org/install/pip) 2.6+, and [Flax](https://flax.readthedocs.io/en/latest/) 0.4.1+.
Transformers works with Python 3.9+, and [PyTorch](https://pytorch.org/get-started/locally/) 2.1+.
Create and activate a virtual environment with [venv](https://docs.python.org/3/library/venv.html) or [uv](https://docs.astral.sh/uv/), a fast Rust-based Python package and project manager.

View File

@ -67,8 +67,6 @@ NOT_DEVICE_TESTS = {
"test_mismatched_shapes_have_properly_initialized_weights",
"test_matched_shapes_have_loaded_weights_when_some_mismatched_shapes_exist",
"test_model_is_small",
"test_tf_from_pt_safetensors",
"test_flax_from_pt_safetensors",
"ModelTest::test_pipeline_", # None of the pipeline tests from PipelineTesterMixin (of which XxxModelTest inherits from) are running on device
"ModelTester::test_pipeline_",
"/repo_utils/",

View File

@ -6,10 +6,8 @@ RUN apt-get update && apt-get install -y time git g++ pkg-config make git-lfs
ENV UV_PYTHON=/usr/local/bin/python
RUN pip install uv && uv pip install --no-cache-dir -U pip setuptools GitPython
RUN uv pip install --no-cache-dir --upgrade 'torch' 'torchaudio' 'torchvision' --index-url https://download.pytorch.org/whl/cpu
# tensorflow pin matching setup.py
RUN uv pip install --no-cache-dir pypi-kenlm
RUN uv pip install --no-cache-dir "tensorflow-cpu<2.16" "tf-keras<2.16"
RUN uv pip install --no-cache-dir "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[flax,quality,testing,torch-speech,vision]"
RUN uv pip install --no-cache-dir "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[quality,testing,torch-speech,vision]"
RUN git lfs install
RUN uv pip uninstall transformers

View File

@ -26,9 +26,7 @@ RUN git clone https://github.com/huggingface/transformers && cd transformers &&
# 1. Put several commands in a single `RUN` to avoid image/layer exporting issue. Could be revised in the future.
# 2. Regarding `torch` part, We might need to specify proper versions for `torchvision` and `torchaudio`.
# Currently, let's not bother to specify their versions explicitly (so installed with their latest release versions).
RUN python3 -m pip install --no-cache-dir -e ./transformers[dev,onnxruntime] && [ ${#PYTORCH} -gt 0 -a "$PYTORCH" != "pre" ] && VERSION='torch=='$PYTORCH'.*' || VERSION='torch'; echo "export VERSION='$VERSION'" >> ~/.profile && echo torch=$VERSION && [ "$PYTORCH" != "pre" ] && python3 -m pip install --no-cache-dir -U $VERSION torchvision torchaudio torchcodec --extra-index-url https://download.pytorch.org/whl/$CUDA || python3 -m pip install --no-cache-dir -U --pre torch torchvision torchaudio torchcodec --extra-index-url https://download.pytorch.org/whl/nightly/$CUDA && python3 -m pip uninstall -y tensorflow tensorflow_text tensorflow_probability
RUN python3 -m pip uninstall -y flax jax
RUN python3 -m pip install --no-cache-dir -e ./transformers[dev,onnxruntime] && [ ${#PYTORCH} -gt 0 -a "$PYTORCH" != "pre" ] && VERSION='torch=='$PYTORCH'.*' || VERSION='torch'; echo "export VERSION='$VERSION'" >> ~/.profile && echo torch=$VERSION && [ "$PYTORCH" != "pre" ] && python3 -m pip install --no-cache-dir -U $VERSION torchvision torchaudio torchcodec --extra-index-url https://download.pytorch.org/whl/$CUDA || python3 -m pip install --no-cache-dir -U --pre torch torchvision torchaudio torchcodec --extra-index-url https://download.pytorch.org/whl/nightly/$CUDA
RUN python3 -m pip install --no-cache-dir -U timm

View File

@ -15,7 +15,6 @@ RUN apt update && \
RUN python3 -m pip install --no-cache-dir --upgrade pip && \
python3 -m pip install --no-cache-dir \
jupyter \
tensorflow \
torch
RUN python3 -m pip install --no-cache-dir git+https://github.com/huggingface/kernels@main#egg=kernels

View File

@ -1,59 +0,0 @@
ARG BASE_DOCKER_IMAGE
FROM $BASE_DOCKER_IMAGE
LABEL maintainer="Hugging Face"
ARG DEBIAN_FRONTEND=noninteractive
# Use login shell to read variables from `~/.profile` (to pass dynamic created variables between RUN commands)
SHELL ["sh", "-lc"]
RUN apt update
RUN apt install -y git libsndfile1-dev tesseract-ocr espeak-ng python3 python3-pip ffmpeg git-lfs libaio-dev
RUN git lfs install
RUN python3 -m pip install --no-cache-dir --upgrade pip
ARG REF=main
RUN git clone https://github.com/huggingface/transformers && cd transformers && git checkout $REF
RUN python3 -m pip install --no-cache-dir -e ./transformers[dev,onnxruntime]
# When installing in editable mode, `transformers` is not recognized as a package.
# this line must be added in order for python to be aware of transformers.
RUN cd transformers && python3 setup.py develop
ARG FRAMEWORK
ARG VERSION
# Control `setuptools` version to avoid some issues
RUN [ "$VERSION" != "1.10" ] && python3 -m pip install -U setuptools || python3 -m pip install -U "setuptools<=59.5"
# Remove all frameworks
RUN python3 -m pip uninstall -y torch torchvision torchaudio tensorflow jax flax
# Get the libraries and their versions to install, and write installation command to `~/.profile`.
RUN python3 ./transformers/utils/past_ci_versions.py --framework $FRAMEWORK --version $VERSION
# Install the target framework
RUN echo "INSTALL_CMD = $INSTALL_CMD"
RUN $INSTALL_CMD
RUN [ "$FRAMEWORK" != "pytorch" ] && echo "`deepspeed-testing` installation is skipped" || python3 -m pip install --no-cache-dir ./transformers[deepspeed-testing]
# Remove `accelerate`: it requires `torch`, and this causes import issues for TF-only testing
# We will install `accelerate@main` in Past CI workflow file
RUN python3 -m pip uninstall -y accelerate
# Uninstall `torch-tensorrt` and `apex` shipped with the base image
RUN python3 -m pip uninstall -y torch-tensorrt apex
# Pre-build **nightly** release of DeepSpeed, so it would be ready for testing (otherwise, the 1st deepspeed test will timeout)
RUN python3 -m pip uninstall -y deepspeed
# This has to be run inside the GPU VMs running the tests. (So far, it fails here due to GPU checks during compilation.)
# Issue: https://github.com/deepspeedai/DeepSpeed/issues/2010
# RUN git clone https://github.com/deepspeedai/DeepSpeed && cd DeepSpeed && rm -rf build && \
# DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 DS_BUILD_UTILS=1 python3 -m pip install . --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check 2>&1
RUN python3 -m pip install -U "itsdangerous<2.1.0"
# When installing in editable mode, `transformers` is not recognized as a package.
# this line must be added in order for python to be aware of transformers.
RUN cd transformers && python3 setup.py develop

View File

@ -23,9 +23,6 @@ RUN git clone https://github.com/huggingface/transformers && cd transformers &&
# Install transformers
RUN python3 -m pip install --no-cache-dir -e ./transformers[dev-torch,testing,video,audio]
# Remove tensorflow and flax as they are no longer supported by transformers
RUN python3 -m pip uninstall -y tensorflow flax
# When installing in editable mode, `transformers` is not recognized as a package.
# this line must be added in order for python to be aware of transformers.
RUN cd transformers && python3 setup.py develop

View File

@ -25,8 +25,6 @@ RUN [ ${#PYTORCH} -gt 0 ] && VERSION='torch=='$PYTORCH'.*' || VERSION='torch';
RUN [ ${#TORCH_VISION} -gt 0 ] && VERSION='torchvision=='TORCH_VISION'.*' || VERSION='torchvision'; python3 -m pip install --no-cache-dir -U $VERSION --extra-index-url https://download.pytorch.org/whl/$CUDA
RUN [ ${#TORCH_AUDIO} -gt 0 ] && VERSION='torchaudio=='TORCH_AUDIO'.*' || VERSION='torchaudio'; python3 -m pip install --no-cache-dir -U $VERSION --extra-index-url https://download.pytorch.org/whl/$CUDA
RUN python3 -m pip uninstall -y tensorflow flax
RUN python3 -m pip install --no-cache-dir git+https://github.com/facebookresearch/detectron2.git pytesseract
RUN python3 -m pip install -U "itsdangerous<2.1.0"

View File

@ -1,25 +0,0 @@
FROM nvidia/cuda:12.1.0-cudnn8-devel-ubuntu22.04
LABEL maintainer="Hugging Face"
ARG DEBIAN_FRONTEND=noninteractive
RUN apt update
RUN apt install -y git libsndfile1-dev tesseract-ocr espeak-ng python3 python3-pip ffmpeg
RUN python3 -m pip install --no-cache-dir --upgrade pip
ARG REF=main
RUN git clone https://github.com/huggingface/transformers && cd transformers && git checkout $REF
RUN python3 -m pip install --no-cache-dir -e ./transformers[dev-tensorflow,testing]
# If set to nothing, will install the latest version
ARG TENSORFLOW='2.13'
RUN [ ${#TENSORFLOW} -gt 0 ] && VERSION='tensorflow=='$TENSORFLOW'.*' || VERSION='tensorflow'; python3 -m pip install --no-cache-dir -U $VERSION
RUN python3 -m pip uninstall -y torch flax
RUN python3 -m pip install -U "itsdangerous<2.1.0"
RUN python3 -m pip install --no-cache-dir -U "tensorflow_probability<0.22"
# When installing in editable mode, `transformers` is not recognized as a package.
# this line must be added in order for python to be aware of transformers.
RUN cd transformers && python3 setup.py develop

View File

@ -123,8 +123,6 @@
title: تشغيل التدريب على Amazon SageMaker
- local: serialization
title: التصدير إلى ONNX
- local: tflite
title: التصدير إلى TFLite
- local: torchscript
title: التصدير إلى TorchScript
- local: notebooks
@ -184,8 +182,6 @@
# title: التدريب الفعال على وحدة المعالجة المركزية (CPU)
# - local: perf_train_cpu_many
# title: التدريب الموزع لوحدة المعالجة المركزية (CPU)
# - local: perf_train_tpu_tf
# title: التدريب على (TPU) باستخدام TensorFlow
# - local: perf_train_special
# title: تدريب PyTorch على Apple silicon
# - local: perf_hardware
@ -203,8 +199,6 @@
# title: إنشاء نموذج كبير
# - local: debugging
# title: تصحيح الأخطاء البرمجية
# - local: tf_xla
# title: تكامل XLA لنماذج TensorFlow
# - local: perf_torch_compile
# title: تحسين الاستدلال باستخدام `torch.compile()`
# title: الأداء وقابلية التوسع
@ -260,8 +254,6 @@
# title: التكوين
# - local: main_classes/data_collator
# title: مجمع البيانات
# - local: main_classes/keras_callbacks
# title: استدعاءات Keras
# - local: main_classes/logging
# title: التسجيل
# - local: main_classes/model

View File

@ -1,40 +0,0 @@
# التصدير إلى TFLite
[TensorFlow Lite](https://www.tensorflow.org/lite/guide) هو إطار عمل خفيف الوزن لنشر نماذج التعلم الآلي على الأجهزة المحدودة الموارد، مثل الهواتف المحمولة، والأنظمة المدمجة، وأجهزة إنترنت الأشياء (IoT). تم تصميم TFLite لتشغيل النماذج وتحسينها بكفاءة على هذه الأجهزة ذات الطاقة الحاسوبية والذاكرة واستهلاك الطاقة المحدودة.
يُمثَّل نموذج TensorFlow Lite بتنسيق محمول فعال خاص يُعرَّف بامتداد الملف `.tflite`.
🤗 Optimum يقدم وظيفة لتصدير نماذج 🤗 Transformers إلى TFLite من خلال الوحدة النمطية `exporters.tflite`. بالنسبة لقائمة هندسات النماذج المدعومة، يرجى الرجوع إلى [وثائق 🤗 Optimum](https://huggingface.co/docs/optimum/exporters/tflite/overview).
لتصدير نموذج إلى TFLite، قم بتثبيت متطلبات البرنامج المطلوبة:
```bash
pip install optimum[exporters-tf]
```
للاطلاع على جميع المغامﻻت المتاحة، راجع [وثائق 🤗 Optimum](https://huggingface.co/docs/optimum/main/en/exporters/tflite/usage_guides/export_a_model)، أو عرض المساعدة في سطر الأوامر:
```bash
optimum-cli export tflite --help
```
لتصدير نسخة النموذج ل 🤗 Hub، على سبيل المثال، `google-bert/bert-base-uncased`، قم بتشغيل الأمر التالي:
```bash
optimum-cli export tflite --model google-bert/bert-base-uncased --sequence_length 128 bert_tflite/
```
ستظهر لك السجلات التي تُبيّن التقدم وموقع حفظ ملف `model.tflite` الناتج، كما في المثال التالي:
```bash
Validating TFLite model...
-[] TFLite model output names match reference model (logits)
- Validating TFLite Model output "logits":
-[] (1, 128, 30522) matches (1, 128, 30522)
-[x] values not close enough, max diff: 5.817413330078125e-05 (atol: 1e-05)
The TensorFlow Lite export succeeded with the warning: The maximum absolute difference between the output of the reference model and the TFLite exported model is not within the set tolerance 1e-05:
- logits: max diff = 5.817413330078125e-05.
The exported model was saved at: bert_tflite
```
يُبيّن المثال أعلاه كيفية تصدير نسخة من النموذج ل 🤗 Hub. عند تصدير نموذج محلي، تأكد أولاً من حفظ ملفات أوزان النموذج المجزء اللغوى في نفس المسار (`local_path`). عند استخدام CLI، قم بتمرير `local_path` إلى معامل `model` بدلاً من اسم النسخة على 🤗 Hub.

View File

@ -220,8 +220,6 @@
sections:
- local: serialization
title: ONNX
- local: tflite
title: LiteRT
- local: executorch
title: ExecuTorch
- local: torchscript
@ -336,8 +334,6 @@
title: Configuration
- local: main_classes/data_collator
title: Data Collator
- local: main_classes/keras_callbacks
title: Keras callbacks
- local: main_classes/logging
title: Logging
- local: main_classes/model

View File

@ -1,28 +0,0 @@
<!--Copyright 2021 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.
-->
# Keras callbacks
When training a Transformers model with Keras, there are some library-specific callbacks available to automate common
tasks:
## KerasMetricCallback
[[autodoc]] KerasMetricCallback
## PushToHubCallback
[[autodoc]] PushToHubCallback

View File

@ -1,66 +0,0 @@
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.
-->
# LiteRT
[LiteRT](https://ai.google.dev/edge/litert) (previously known as TensorFlow Lite) is a high-performance runtime designed for on-device machine learning.
The [Optimum](https://huggingface.co/docs/optimum/index) library exports a model to LiteRT for [many architectures](https://huggingface.co/docs/optimum/exporters/onnx/overview).
The benefits of exporting to LiteRT include the following.
- Low-latency, privacy-focused, no internet connectivity required, and reduced model size and power consumption for on-device machine learning.
- Broad platform, model framework, and language support.
- Hardware acceleration for GPUs and Apple Silicon.
Export a Transformers model to LiteRT with the Optimum CLI.
Run the command below to install Optimum and the [exporters](https://huggingface.co/docs/optimum/exporters/overview) module for LiteRT.
```bash
pip install optimum[exporters-tf]
```
> [!TIP]
> Refer to the [Export a model to TFLite with optimum.exporters.tflite](https://huggingface.co/docs/optimum/main/en/exporters/tflite/usage_guides/export_a_model) guide for all available arguments or with the command below.
> ```bash
> optimum-cli export tflite --help
> ```
Set the `--model` argument to export a from the Hub.
```bash
optimum-cli export tflite --model google-bert/bert-base-uncased --sequence_length 128 bert_tflite/
```
You should see logs indicating the progress and showing where the resulting `model.tflite` is saved.
```bash
Validating TFLite model...
-[] TFLite model output names match reference model (logits)
- Validating TFLite Model output "logits":
-[] (1, 128, 30522) matches (1, 128, 30522)
-[x] values not close enough, max diff: 5.817413330078125e-05 (atol: 1e-05)
The TensorFlow Lite export succeeded with the warning: The maximum absolute difference between the output of the reference model and the TFLite exported model is not within the set tolerance 1e-05:
- logits: max diff = 5.817413330078125e-05.
The exported model was saved at: bert_tflite
```
For local models, make sure the model weights and tokenizer files are saved in the same directory, for example `local_path`. Pass the directory to the `--model` argument and use `--task` to indicate the [task](https://huggingface.co/docs/optimum/exporters/task_manager) a model can perform. If `--task` isn't provided, the model architecture without a task-specific head is used.
```bash
optimum-cli export tflite --model local_path --task question-answering google-bert/bert-base-uncased --sequence_length 128 bert_tflite/
```

View File

@ -64,8 +64,6 @@
title: Entrenador
- local: sagemaker
title: Ejecutar el entrenamiento en Amazon SageMaker
- local: converting_tensorflow_models
title: Convertir checkpoints de TensorFlow
- local: serialization
title: Exportar a ONNX
- local: torchscript

View File

@ -1,139 +0,0 @@
<!--Copyright 2020 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.
-->
# Convertir checkpoints de Tensorflow
Te proporcionamos una interfaz de línea de comando (`CLI`, por sus siglas en inglés) para convertir puntos de control (_checkpoints_) originales de Bert/GPT/GPT-2/Transformer-XL/XLNet/XLM en modelos que se puedan cargar utilizando los métodos `from_pretrained` de la biblioteca.
<Tip>
Desde 2.3.0, el script para convertir es parte de la CLI de transformers (**transformers**) disponible en cualquier instalación de transformers >= 2.3.0.
La siguiente documentación refleja el formato para el comando **transformers convert**.
</Tip>
## BERT
Puedes convertir cualquier checkpoint de TensorFlow para BERT (en particular, [los modelos pre-entrenados y publicados por Google](https://github.com/google-research/bert#pre-trained-models)) en un archivo de PyTorch mediante el script [convert_bert_original_tf_checkpoint_to_pytorch.py](https://github.com/huggingface/transformers/tree/main/src/transformers/models/bert/convert_bert_original_tf_checkpoint_to_pytorch.py).
Esta CLI toma como entrada un checkpoint de TensorFlow (tres archivos que comienzan con `bert_model.ckpt`) y el archivo de configuración asociado (`bert_config.json`), y crea un modelo PyTorch para esta configuración, carga los pesos del checkpoint de TensorFlow en el modelo de PyTorch y guarda el modelo resultante en un archivo estándar de PyTorch que se puede importar usando `from_pretrained()` (ve el ejemplo en [Tour rápido](quicktour), [run_glue.py](https://github.com/huggingface/transformers/tree/main/examples/pytorch/text-classification/run_glue.py)).
Solo necesitas ejecutar este script **una vez** para convertir un modelo a PyTorch. Después, puedes ignorar el checkpoint de TensorFlow (los tres archivos que comienzan con `bert_model.ckpt`), pero asegúrate de conservar el archivo de configuración (`bert_config.json`) y el archivo de vocabulario (`vocab.txt`) ya que estos también son necesarios para el modelo en PyTorch.
Para ejecutar este script deberás tener instalado TensorFlow y PyTorch (`pip install tensorflow`). El resto del repositorio solo requiere PyTorch.
Aquí hay un ejemplo del proceso para convertir un modelo `BERT-Base Uncased` pre-entrenado:
```bash
export BERT_BASE_DIR=/path/to/bert/uncased_L-12_H-768_A-12
transformers convert --model_type bert \
--tf_checkpoint $BERT_BASE_DIR/bert_model.ckpt \
--config $BERT_BASE_DIR/bert_config.json \
--pytorch_dump_output $BERT_BASE_DIR/pytorch_model.bin
```
Puedes descargar los modelos pre-entrenados de Google para la conversión [aquí](https://github.com/google-research/bert#pre-trained-models).
## ALBERT
Convierte los checkpoints del modelo ALBERT de TensorFlow a PyTorch usando el script [convert_albert_original_tf_checkpoint_to_pytorch.py](https://github.com/huggingface/transformers/tree/main/src/transformers/models/albert/convert_albert_original_tf_checkpoint_to_pytorch.py).
La CLI toma como entrada un checkpoint de TensorFlow (tres archivos que comienzan con `model.ckpt-best`) y el archivo de configuración adjunto (`albert_config.json`), luego crea y guarda un modelo de PyTorch. Para ejecutar esta conversión deberás tener instalados TensorFlow y PyTorch.
Aquí hay un ejemplo del proceso para convertir un modelo `ALBERT Base` pre-entrenado:
```bash
export ALBERT_BASE_DIR=/path/to/albert/albert_base
transformers convert --model_type albert \
--tf_checkpoint $ALBERT_BASE_DIR/model.ckpt-best \
--config $ALBERT_BASE_DIR/albert_config.json \
--pytorch_dump_output $ALBERT_BASE_DIR/pytorch_model.bin
```
Puedes descargar los modelos pre-entrenados de Google para la conversión [aquí](https://github.com/google-research/albert#pre-trained-models).
## OpenAI GPT
Este es un ejemplo del proceso para convertir un modelo OpenAI GPT pre-entrenado, asumiendo que tu checkpoint de NumPy se guarda con el mismo formato que el modelo pre-entrenado de OpenAI (más información [aquí](https://github.com/openai/finetune-transformer-lm)):
```bash
export OPENAI_GPT_CHECKPOINT_FOLDER_PATH=/path/to/openai/pretrained/numpy/weights
transformers convert --model_type gpt \
--tf_checkpoint $OPENAI_GPT_CHECKPOINT_FOLDER_PATH \
--pytorch_dump_output $PYTORCH_DUMP_OUTPUT \
[--config OPENAI_GPT_CONFIG] \
[--finetuning_task_name OPENAI_GPT_FINETUNED_TASK] \
```
## OpenAI GPT-2
Aquí hay un ejemplo del proceso para convertir un modelo OpenAI GPT-2 pre-entrenado (más información [aquí](https://github.com/openai/gpt-2)):
```bash
export OPENAI_GPT2_CHECKPOINT_PATH=/path/to/openai-community/gpt2/pretrained/weights
transformers convert --model_type gpt2 \
--tf_checkpoint $OPENAI_GPT2_CHECKPOINT_PATH \
--pytorch_dump_output $PYTORCH_DUMP_OUTPUT \
[--config OPENAI_GPT2_CONFIG] \
[--finetuning_task_name OPENAI_GPT2_FINETUNED_TASK]
```
## XLNet
Aquí hay un ejemplo del proceso para convertir un modelo XLNet pre-entrenado:
```bash
export TRANSFO_XL_CHECKPOINT_PATH=/path/to/xlnet/checkpoint
export TRANSFO_XL_CONFIG_PATH=/path/to/xlnet/config
transformers convert --model_type xlnet \
--tf_checkpoint $TRANSFO_XL_CHECKPOINT_PATH \
--config $TRANSFO_XL_CONFIG_PATH \
--pytorch_dump_output $PYTORCH_DUMP_OUTPUT \
[--finetuning_task_name XLNET_FINETUNED_TASK] \
```
## XLM
Aquí hay un ejemplo del proceso para convertir un modelo XLM pre-entrenado:
```bash
export XLM_CHECKPOINT_PATH=/path/to/xlm/checkpoint
transformers convert --model_type xlm \
--tf_checkpoint $XLM_CHECKPOINT_PATH \
--pytorch_dump_output $PYTORCH_DUMP_OUTPUT
[--config XML_CONFIG] \
[--finetuning_task_name XML_FINETUNED_TASK]
```
## T5
Aquí hay un ejemplo del proceso para convertir un modelo T5 pre-entrenado:
```bash
export T5=/path/to/t5/uncased_L-12_H-768_A-12
transformers convert --model_type t5 \
--tf_checkpoint $T5/t5_model.ckpt \
--config $T5/t5_config.json \
--pytorch_dump_output $T5/pytorch_model.bin
```

View File

@ -2,6 +2,4 @@
- local: pipeline_tutorial
title: पाइपलाइनों के साथ अनुमान चलाएँ
- local: accelerate
title: 🤗 Accelerate के साथ वितरित प्रशिक्षण सेट करें
- local: tflite
title: TFLite में निर्यात करें
title: 🤗 Accelerate के साथ वितरित प्रशिक्षण सेट करें

View File

@ -1,55 +0,0 @@
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.
-->
# TFLite में निर्यात करें
[TensorFlow Lite](https://www.tensorflow.org/lite/guide) एक हल्का ढांचा है जो मशीन लर्निंग मॉडल को संसाधन-सीमित उपकरणों, जैसे मोबाइल फोन, एम्बेडेड सिस्टम और इंटरनेट ऑफ थिंग्स (IoT) उपकरणों पर तैनात करने के लिए है। TFLite को इन उपकरणों पर सीमित गणनात्मक शक्ति, मेमोरी और ऊर्जा खपत के साथ मॉडल को कुशलता से ऑप्टिमाइज़ और चलाने के लिए डिज़ाइन किया गया है। एक TensorFlow Lite मॉडल को एक विशेष कुशल पोर्टेबल प्रारूप में दर्शाया जाता है जिसे `.tflite` फ़ाइल एक्सटेंशन द्वारा पहचाना जाता है।
🤗 Optimum में `exporters.tflite` मॉड्यूल के माध्यम से 🤗 Transformers मॉडल को TFLite में निर्यात करने की कार्यक्षमता है। समर्थित मॉडल आर्किटेक्चर की सूची के लिए, कृपया [🤗 Optimum दस्तावेज़](https://huggingface.co/docs/optimum/exporters/tflite/overview) देखें।
TFLite में एक मॉडल निर्यात करने के लिए, आवश्यक निर्भरताएँ स्थापित करें:
```bash
pip install optimum[exporters-tf]
```
सभी उपलब्ध तर्कों की जांच करने के लिए, [🤗 Optimum दस्तावेज़](https://huggingface.co/docs/optimum/main/en/exporters/tflite/usage_guides/export_a_model) देखें,
या कमांड लाइन में मदद देखें:
```bash
optimum-cli export tflite --help
```
यदि आप 🤗 Hub से एक मॉडल का चेकपॉइंट निर्यात करना चाहते हैं, उदाहरण के लिए, `google-bert/bert-base-uncased`, निम्नलिखित कमांड चलाएँ:
```bash
optimum-cli export tflite --model google-bert/bert-base-uncased --sequence_length 128 bert_tflite/
```
आपको प्रगति को दर्शाते हुए लॉग दिखाई देंगे और यह दिखाएंगे कि परिणामस्वरूप `model.tflite` कहाँ सहेजा गया है, जैसे:
```bash
Validating TFLite model...
-[] TFLite model output names match reference model (logits)
- Validating TFLite Model output "logits":
-[] (1, 128, 30522) matches (1, 128, 30522)
-[x] values not close enough, max diff: 5.817413330078125e-05 (atol: 1e-05)
The TensorFlow Lite export succeeded with the warning: The maximum absolute difference between the output of the reference model and the TFLite exported model is not within the set tolerance 1e-05:
- logits: max diff = 5.817413330078125e-05.
The exported model was saved at: bert_tflite
```
उपरोक्त उदाहरण 🤗 Hub से एक चेकपॉइंट निर्यात करने को दर्शाता है। जब एक स्थानीय मॉडल निर्यात करते हैं, तो पहले सुनिश्चित करें कि आपने मॉडल के वज़न और टोकनाइज़र फ़ाइलों को एक ही निर्देशिका (`local_path`) में सहेजा है। CLI का उपयोग करते समय, चेकपॉइंट नाम के बजाय `model` तर्क में `local_path` पास करें।

View File

@ -29,8 +29,6 @@
title: Addestramento con script
- local: multilingual
title: Modelli multilingua per l'inferenza
- local: converting_tensorflow_models
title: Convertire modelli tensorflow
- local: serialization
title: Esporta modelli Transformers
- local: perf_train_cpu

View File

@ -1,144 +0,0 @@
<!--Copyright 2020 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.
-->
# Convertire checkpoint di Tensorflow
È disponibile un'interfaccia a linea di comando per convertire gli originali checkpoint di Bert/GPT/GPT-2/Transformer-XL/XLNet/XLM
in modelli che possono essere caricati utilizzando i metodi `from_pretrained` della libreria.
<Tip>
A partire dalla versione 2.3.0 lo script di conversione è parte di transformers CLI (**transformers**), disponibile in ogni installazione
di transformers >=2.3.0.
La seguente documentazione riflette il formato dei comandi di **transformers convert**.
</Tip>
## BERT
Puoi convertire qualunque checkpoint Tensorflow di BERT (in particolare
[i modeli pre-allenati rilasciati da Google](https://github.com/google-research/bert#pre-trained-models))
in un file di salvataggio Pytorch utilizzando lo script
[convert_bert_original_tf_checkpoint_to_pytorch.py](https://github.com/huggingface/transformers/tree/main/src/transformers/models/bert/convert_bert_original_tf_checkpoint_to_pytorch.py).
Questo CLI prende come input un checkpoint di Tensorflow (tre files che iniziano con `bert_model.ckpt`) ed il relativo
file di configurazione (`bert_config.json`), crea un modello Pytorch per questa configurazione, carica i pesi dal
checkpoint di Tensorflow nel modello di Pytorch e salva il modello che ne risulta in un file di salvataggio standard di Pytorch che
può essere importato utilizzando `from_pretrained()` (vedi l'esempio nel
[quicktour](quicktour) , [run_glue.py](https://github.com/huggingface/transformers/tree/main/examples/pytorch/text-classification/run_glue.py) ).
Devi soltanto lanciare questo script di conversione **una volta** per ottenere un modello Pytorch. Dopodichè, potrai tralasciare
il checkpoint di Tensorflow (i tre files che iniziano con `bert_model.ckpt`), ma assicurati di tenere il file di configurazione
(`bert_config.json`) ed il file di vocabolario (`vocab.txt`) in quanto queste componenti sono necessarie anche per il modello di Pytorch.
Per lanciare questo specifico script di conversione avrai bisogno di un'installazione di Tensorflow e di Pytorch
(`pip install tensorflow`). Il resto della repository richiede soltanto Pytorch.
Questo è un esempio del processo di conversione per un modello `BERT-Base Uncased` pre-allenato:
```bash
export BERT_BASE_DIR=/path/to/bert/uncased_L-12_H-768_A-12
transformers convert --model_type bert \
--tf_checkpoint $BERT_BASE_DIR/bert_model.ckpt \
--config $BERT_BASE_DIR/bert_config.json \
--pytorch_dump_output $BERT_BASE_DIR/pytorch_model.bin
```
Puoi scaricare i modelli pre-allenati di Google per la conversione [qua](https://github.com/google-research/bert#pre-trained-models).
## ALBERT
Per il modello ALBERT, converti checkpoint di Tensoflow in Pytorch utilizzando lo script
[convert_albert_original_tf_checkpoint_to_pytorch.py](https://github.com/huggingface/transformers/tree/main/src/transformers/models/albert/convert_albert_original_tf_checkpoint_to_pytorch.py).
Il CLI prende come input un checkpoint di Tensorflow (tre files che iniziano con `model.ckpt-best`) e i relativi file di
configurazione (`albert_config.json`), dopodichè crea e salva un modello Pytorch. Per lanciare questa conversione
avrai bisogno di un'installazione di Tensorflow e di Pytorch.
Ecco un esempio del procedimento di conversione di un modello `ALBERT Base` pre-allenato:
```bash
export ALBERT_BASE_DIR=/path/to/albert/albert_base
transformers convert --model_type albert \
--tf_checkpoint $ALBERT_BASE_DIR/model.ckpt-best \
--config $ALBERT_BASE_DIR/albert_config.json \
--pytorch_dump_output $ALBERT_BASE_DIR/pytorch_model.bin
```
Puoi scaricare i modelli pre-allenati di Google per la conversione [qui](https://github.com/google-research/albert#pre-trained-models).
## OpenAI GPT
Ecco un esempio del processo di conversione di un modello OpenAI GPT pre-allenato, assumendo che il tuo checkpoint di NumPy
sia salvato nello stesso formato dei modelli pre-allenati OpenAI (vedi [qui](https://github.com/openai/finetune-transformer-lm)):
```bash
export OPENAI_GPT_CHECKPOINT_FOLDER_PATH=/path/to/openai/pretrained/numpy/weights
transformers convert --model_type gpt \
--tf_checkpoint $OPENAI_GPT_CHECKPOINT_FOLDER_PATH \
--pytorch_dump_output $PYTORCH_DUMP_OUTPUT \
[--config OPENAI_GPT_CONFIG] \
[--finetuning_task_name OPENAI_GPT_FINETUNED_TASK] \
```
## OpenAI GPT-2
Ecco un esempio del processo di conversione di un modello OpenAI GPT-2 pre-allenato (vedi [qui](https://github.com/openai/gpt-2)):
```bash
export OPENAI_GPT2_CHECKPOINT_PATH=/path/to/openai-community/gpt2/pretrained/weights
transformers convert --model_type gpt2 \
--tf_checkpoint $OPENAI_GPT2_CHECKPOINT_PATH \
--pytorch_dump_output $PYTORCH_DUMP_OUTPUT \
[--config OPENAI_GPT2_CONFIG] \
[--finetuning_task_name OPENAI_GPT2_FINETUNED_TASK]
```
## XLNet
Ecco un esempio del processo di conversione di un modello XLNet pre-allenato:
```bash
export TRANSFO_XL_CHECKPOINT_PATH=/path/to/xlnet/checkpoint
export TRANSFO_XL_CONFIG_PATH=/path/to/xlnet/config
transformers convert --model_type xlnet \
--tf_checkpoint $TRANSFO_XL_CHECKPOINT_PATH \
--config $TRANSFO_XL_CONFIG_PATH \
--pytorch_dump_output $PYTORCH_DUMP_OUTPUT \
[--finetuning_task_name XLNET_FINETUNED_TASK] \
```
## XLM
Ecco un esempio del processo di conversione di un modello XLM pre-allenato:
```bash
export XLM_CHECKPOINT_PATH=/path/to/xlm/checkpoint
transformers convert --model_type xlm \
--tf_checkpoint $XLM_CHECKPOINT_PATH \
--pytorch_dump_output $PYTORCH_DUMP_OUTPUT
[--config XML_CONFIG] \
[--finetuning_task_name XML_FINETUNED_TASK]
```
## T5
Ecco un esempio del processo di conversione di un modello T5 pre-allenato:
```bash
export T5=/path/to/t5/uncased_L-12_H-768_A-12
transformers convert --model_type t5 \
--tf_checkpoint $T5/t5_model.ckpt \
--config $T5/t5_config.json \
--pytorch_dump_output $T5/pytorch_model.bin
```

View File

@ -109,8 +109,6 @@
title: チャットモデルのテンプレート
- local: serialization
title: ONNX へのエクスポート
- local: tflite
title: TFLite へのエクスポート
- local: torchscript
title: トーチスクリプトへのエクスポート
- local: community
@ -132,8 +130,6 @@
title: 分散CPUトレーニング
- local: perf_train_tpu
title: TPU に関するトレーニング
- local: perf_train_tpu_tf
title: TensorFlow を使用した TPU のトレーニング
- local: perf_train_special
title: 特殊なハードウェアに関するトレーニング
- local: perf_hardware
@ -153,8 +149,6 @@
title: 推論の最適化
- local: big_models
title: 大きなモデルのインスタンス化
- local: tf_xla
title: TensorFlowモデルのXLA統合
- local: perf_torch_compile
title: torch.compile()を使用した推論の最適化
title: パフォーマンスとスケーラビリティ
@ -202,8 +196,6 @@
title: 構成
- local: main_classes/data_collator
title: データ照合者
- local: main_classes/keras_callbacks
title: Keras コールバック
- local: main_classes/logging
title: ロギング
- local: main_classes/model

View File

@ -1,28 +0,0 @@
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.
-->
# Keras callbacks
Keras を使用して Transformers モデルをトレーニングする場合、一般的な処理を自動化するために使用できるライブラリ固有のコールバックがいくつかあります。
タスク:
## KerasMetricCallback
[[autodoc]] KerasMetricCallback
## PushToHubCallback
[[autodoc]] PushToHubCallback

View File

@ -1,168 +0,0 @@
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.
-->
# Training on TPU with TensorFlow
<Tip>
詳細な説明が不要で、単にTPUのコードサンプルを入手してトレーニングを開始したい場合は、[私たちのTPUの例のートブックをチェックしてください](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/tpu_training-tf.ipynb)
</Tip>
### What is a TPU?
TPUは**Tensor Processing Unitテンソル処理ユニット**の略です。これらはGoogleが設計したハードウェアで、ニューラルネットワーク内のテンソル計算を大幅に高速化するために使用されます。これはGPUのようなものです。ネットワークのトレーニングと推論の両方に使用できます。一般的にはGoogleのクラウドサービスを介してアクセスされますが、Google ColabとKaggle Kernelsを通じても無料で小規模のTPUに直接アクセスできます。
[🤗 TransformersのすべてのTensorFlowモデルはKerasモデルです](https://huggingface.co/blog/tensorflow-philosophy)ので、この文書のほとんどの方法は一般的にKerasモデル用のTPUトレーニングに適用できますただし、TransformersとDatasetsのHuggingFaceエコシステムhug-o-systemに固有のポイントもいくつかあり、それについては適用するときにそれを示します。
### What kinds of TPU are available?
新しいユーザーは、さまざまなTPUとそのアクセス方法に関する幅広い情報によく混乱します。理解するための最初の重要な違いは、**TPUード**と**TPU VM**の違いです。
**TPUード**を使用すると、事実上リモートのTPUに間接的にアクセスします。別個のVMが必要で、ネットワークとデータパイプラインを初期化し、それらをリモートードに転送します。Google ColabでTPUを使用すると、**TPUード**スタイルでアクセスしています。
TPUードを使用すると、それに慣れていない人々にはかなり予期しない動作が発生することがあります特に、TPUはPythonコードを実行しているマシンと物理的に異なるシステムに配置されているため、データはローカルマシンにローカルで格納されているデータパイプラインが完全に失敗します。代わりに、データはGoogle Cloud Storageに格納する必要があります。ここでデータパイプラインはリモートのTPUードで実行されている場合でも、データにアクセスできます。
<Tip>
すべてのデータを`np.ndarray`または`tf.Tensor`としてメモリに収めることができる場合、ColabまたはTPUードを使用している場合でも、データをGoogle Cloud Storageにアップロードせずに`fit()`でトレーニングできます。
</Tip>
<Tip>
**🤗 Hugging Face固有のヒント🤗:** TFコードの例でよく見るであろう`Dataset.to_tf_dataset()`とその高レベルのラッパーである`model.prepare_tf_dataset()`は、TPUードで失敗します。これは、`tf.data.Dataset`を作成しているにもかかわらず、それが「純粋な」`tf.data`パイプラインではなく、`tf.numpy_function`または`Dataset.from_generator()`を使用して基盤となるHuggingFace `Dataset`からデータをストリームで読み込むことからです。このHuggingFace `Dataset`はローカルディスク上のデータをバックアップしており、リモートTPUードが読み取ることができないためです。
</Tip>
TPUにアクセスする第二の方法は、**TPU VM**を介してです。TPU VMを使用する場合、TPUが接続されているマシンに直接接続します。これはGPU VMでトレーニングを行うのと同様です。TPU VMは一般的にデータパイプラインに関しては特に作業がしやすく、上記のすべての警告はTPU VMには適用されません
これは主観的な文書ですので、こちらの意見です:**可能な限りTPUードの使用を避けてください。** TPU VMよりも混乱しやすく、デバッグが難しいです。将来的にはサポートされなくなる可能性もあります - Googleの最新のTPUであるTPUv4は、TPU VMとしてのみアクセスできるため、TPUードは将来的には「レガシー」のアクセス方法になる可能性が高いです。ただし、無料でTPUにアクセスできるのはColabとKaggle Kernelsの場合があります。その場合、どうしても使用しなければならない場合の取り扱い方法を説明しようとします詳細は[TPUの例のートブック](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/tpu_training-tf.ipynb)で詳細な説明を確認してください。
### What sizes of TPU are available?
単一のTPUv2-8/v3-8/v4-8は8つのレプリカを実行します。TPUは数百から数千のレプリカを同時に実行できる**ポッド**に存在します。単一のTPUよりも多くのTPUを使用するが、ポッド全体ではない場合たとえばv3-32、TPUフリートは**ポッドスライス**として参照されます。
Colabを介して無料のTPUにアクセスする場合、通常は単一のv2-8 TPUが提供されます。
### I keep hearing about this XLA thing. Whats XLA, and how does it relate to TPUs?
XLAは、TensorFlowとJAXの両方で使用される最適化コンパイラです。JAXでは唯一のコンパイラであり、TensorFlowではオプションですがしかしTPUでは必須です、Kerasモデルをトレーニングする際に`model.compile()`に引数`jit_compile=True`を渡すことで最も簡単に有効にできます。エラーが発生せず、パフォーマンスが良好であれば、それはTPUに移行する準備が整った良い兆候です
TPU上でのデバッグは一般的にCPU/GPUよりも少し難しいため、TPUで試す前にまずCPU/GPUでXLAを使用してコードを実行することをお勧めします。もちろん、長時間トレーニングする必要はありません。モデルとデータパイプラインが期待通りに動作するかを確認するための数ステップだけです。
<Tip>
XLAコンパイルされたコードは通常高速です。したがって、TPUで実行する予定がない場合でも、`jit_compile=True`を追加することでパフォーマンスを向上させることができます。ただし、以下のXLA互換性に関する注意事項に注意してください
</Tip>
<Tip warning={true}>
**苦い経験から生まれたヒント:** `jit_compile=True`を使用することは、CPU/GPUコードがXLA互換であることを確認し、速度を向上させる良い方法ですが、実際にTPUでコードを実行する際には多くの問題を引き起こす可能性があります。 XLAコンパイルはTPU上で暗黙的に行われるため、実際にコードをTPUで実行する前にその行を削除することを忘れないでください
</Tip>
### How do I make my model XLA compatible?
多くの場合、コードはすでにXLA互換かもしれませんただし、XLAでは動作する通常のTensorFlowでも動作しないいくつかの要素があります。以下に、3つの主要なルールにまとめています
<Tip>
**🤗 HuggingFace固有のヒント🤗:** TensorFlowモデルと損失関数をXLA互換に書き直すために多くの努力を払っています。通常、モデルと損失関数はデフォルトでルール1と2に従っているため、`transformers`モデルを使用している場合はこれらをスキップできます。ただし、独自のモデルと損失関数を記述する場合は、これらのルールを忘れないでください!
</Tip>
#### XLA Rule #1: Your code cannot have “data-dependent conditionals”
これは、任意の`if`ステートメントが`tf.Tensor`内の値に依存していない必要があることを意味します。例えば、次のコードブロックはXLAでコンパイルできません
```python
if tf.reduce_sum(tensor) > 10:
tensor = tensor / 2.0
```
これは最初は非常に制限的に思えるかもしれませんが、ほとんどのニューラルネットコードはこれを行う必要はありません。通常、この制約を回避するために`tf.cond`を使用するか(ドキュメントはこちらを参照)、条件を削除して代わりに指示変数を使用したりすることができます。次のように:
```python
sum_over_10 = tf.cast(tf.reduce_sum(tensor) > 10, tf.float32)
tensor = tensor / (1.0 + sum_over_10)
```
このコードは、上記のコードとまったく同じ効果を持っていますが、条件を回避することで、XLAで問題なくコンパイルできることを確認します
#### XLA Rule #2: Your code cannot have “data-dependent shapes”
これは、コード内のすべての `tf.Tensor` オブジェクトの形状が、その値に依存しないことを意味します。たとえば、`tf.unique` 関数はXLAでコンパイルできないので、このルールに違反します。なぜなら、これは入力 `Tensor` の一意の値の各インスタンスを含む `tensor` を返すためです。この出力の形状は、入力 `Tensor` の重複具合によって異なるため、XLAはそれを処理しないことになります
一般的に、ほとんどのニューラルネットワークコードはデフォルトでルール2に従います。ただし、いくつかの一般的なケースでは問題が発生することがあります。非常に一般的なケースの1つは、**ラベルマスキング**を使用する場合です。ラベルを無視して損失を計算する場所を示すために、ラベルを負の値に設定する方法です。NumPyまたはPyTorchのラベルマスキングをサポートする損失関数を見ると、次のような[ブールインデックス](https://numpy.org/doc/stable/user/basics.indexing.html#boolean-array-indexing)を使用したコードがよく見られます:
```python
label_mask = labels >= 0
masked_outputs = outputs[label_mask]
masked_labels = labels[label_mask]
loss = compute_loss(masked_outputs, masked_labels)
mean_loss = torch.mean(loss)
```
このコードはNumPyやPyTorchでは完全に機能しますが、XLAでは動作しませんなぜなら、`masked_outputs``masked_labels`の形状はマスクされた位置の数に依存するため、これは**データ依存の形状**になります。ただし、ルール1と同様に、このコードを書き直して、データ依存の形状なしでまったく同じ出力を生成できることがあります。
```python
label_mask = tf.cast(labels >= 0, tf.float32)
loss = compute_loss(outputs, labels)
loss = loss * label_mask # Set negative label positions to 0
mean_loss = tf.reduce_sum(loss) / tf.reduce_sum(label_mask)
```
ここでは、データ依存の形状を避けるために、各位置で損失を計算してから、平均を計算する際に分子と分母の両方でマスクされた位置をゼロ化する方法を紹介します。これにより、最初のアプローチとまったく同じ結果が得られますが、XLA互換性を維持します。注意点として、ルール1と同じトリックを使用します - `tf.bool``tf.float32`に変換して指標変数として使用します。これは非常に便利なトリックですので、自分のコードをXLAに変換する必要がある場合には覚えておいてください
#### XLA Rule #3: XLA will need to recompile your model for every different input shape it sees
これは重要なルールです。これはつまり、入力形状が非常に変動的な場合、XLA はモデルを何度も再コンパイルする必要があるため、大きなパフォーマンスの問題が発生する可能性があるということです。これは NLP モデルで一般的に発生し、トークナイズ後の入力テキストの長さが異なる場合があります。他のモダリティでは、静的な形状が一般的であり、このルールはほとんど問題になりません。
ルール3を回避する方法は何でしょうか鍵は「パディング」です - すべての入力を同じ長さにパディングし、次に「attention_mask」を使用することで、可変形状と同じ結果を得ることができますが、XLA の問題は発生しません。ただし、過度のパディングも深刻な遅延を引き起こす可能性があります - データセット全体で最大の長さにすべてのサンプルをパディングすると、多くの計算とメモリを無駄にする可能性があります!
この問題には完璧な解決策はありませんが、いくつかのトリックを試すことができます。非常に便利なトリックの1つは、**バッチのサンプルを32または64トークンの倍数までパディングする**ことです。これにより、トークン数がわずかに増加するだけで、すべての入力形状が32または64の倍数である必要があるため、一意の入力形状の数が大幅に減少します。一意の入力形状が少ないと、XLA の再コンパイルが少なくなります!
<Tip>
**🤗 HuggingFace に関する具体的なヒント🤗:** 弊社のトークナイザーとデータコレクターには、ここで役立つメソッドがあります。トークナイザーを呼び出す際に `padding="max_length"` または `padding="longest"` を使用して、パディングされたデータを出力するように設定できます。トークナイザーとデータコレクターには、一意の入力形状の数を減らすのに役立つ `pad_to_multiple_of` 引数もあります!
</Tip>
### How do I actually train my model on TPU?
一度トレーニングが XLA 互換性があることを確認し、TPU Node/Colab を使用する場合はデータセットが適切に準備されている場合、TPU 上で実行することは驚くほど簡単です!コードを変更する必要があるのは、いくつかの行を追加して TPU を初期化し、モデルとデータセットが `TPUStrategy` スコープ内で作成されるようにすることだけです。これを実際に見るには、[TPU のサンプルノートブック](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/tpu_training-tf.ipynb)をご覧ください!
### Summary
ここでは多くの情報が提供されましたので、TPU でモデルをトレーニングする際に以下のチェックリストを使用できます:
- コードが XLA の三つのルールに従っていることを確認します。
- CPU/GPU で `jit_compile=True` を使用してモデルをコンパイルし、XLA でトレーニングできることを確認します。
- データセットをメモリに読み込むか、TPU 互換のデータセット読み込みアプローチを使用します([ノートブックを参照](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/tpu_training-tf.ipynb))。
- コードを Colabアクセラレータを「TPU」に設定または Google Cloud の TPU VM に移行します。
- TPU 初期化コードを追加します([ノートブックを参照](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/tpu_training-tf.ipynb))。
- `TPUStrategy` を作成し、データセットの読み込みとモデルの作成が `strategy.scope()` 内で行われることを確認します([ノートブックを参照](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/tpu_training-tf.ipynb))。
- TPU に移行する際に `jit_compile=True` を外すのを忘れないでください!
- 🙏🙏🙏🥺🥺🥺
- `model.fit()` を呼び出します。
- おめでとうございます!

View File

@ -1,179 +0,0 @@
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.
-->
# XLA Integration for TensorFlow Models
[[open-in-colab]]
加速線形代数Accelerated Linear Algebra、通称XLAは、TensorFlowモデルのランタイムを高速化するためのコンパイラです。[公式ドキュメント](https://www.tensorflow.org/xla)によれば、XLAAccelerated Linear Algebraは線形代数のためのドメイン固有のコンパイラで、TensorFlowモデルを潜在的にソースコードの変更なしで高速化できます。
TensorFlowでXLAを使用するのは簡単です。XLAは`tensorflow`ライブラリ内にパッケージ化されており、[`tf.function`](https://www.tensorflow.org/guide/intro_to_graphs)などのグラフを作成する関数内で`jit_compile`引数を使用してトリガーできます。`fit()``predict()`などのKerasメソッドを使用する場合、`model.compile()``jit_compile`引数を渡すだけでXLAを有効にできます。ただし、XLAはこれらのメソッドに限定されているわけではありません。任意の`tf.function`を高速化するためにも使用できます。
🤗 Transformers内のいくつかのTensorFlowメソッドは、XLAと互換性があるように書き直されています。これには、[GPT2](https://huggingface.co/docs/transformers/model_doc/gpt2)、[T5](https://huggingface.co/docs/transformers/model_doc/t5)、[OPT](https://huggingface.co/docs/transformers/model_doc/opt)などのテキスト生成モデルや、[Whisper](https://huggingface.co/docs/transformers/model_doc/whisper)などの音声処理モデルも含まれます。
速度向上の具体的な量はモデルに非常に依存しますが、🤗 Transformers内のTensorFlowテキスト生成モデルでは、約100倍の速度向上を確認しています。このドキュメントでは、これらのモデルにXLAを使用して最大のパフォーマンスを得る方法を説明します。また、ベンチマークとXLA統合のデザイン哲学について詳しく学びたい場合の追加リソースへのリンクも提供します。
## Running TF functions with XLA
以下のTensorFlowモデルを考えてみましょう
```py
import tensorflow as tf
model = tf.keras.Sequential(
[tf.keras.layers.Dense(10, input_shape=(10,), activation="relu"), tf.keras.layers.Dense(5, activation="softmax")]
)
```
上記のモデルは、次元が`(10, )`の入力を受け入れます。このモデルをフォワードパスで実行するには、次のようにします:
```py
# Generate random inputs for the model.
batch_size = 16
input_vector_dim = 10
random_inputs = tf.random.normal((batch_size, input_vector_dim))
# Run a forward pass.
_ = model(random_inputs)
```
XLAでコンパイルされた関数を使用してフォワードパスを実行するには、以下のようにします
```py
xla_fn = tf.function(model, jit_compile=True)
_ = xla_fn(random_inputs)
```
`model`のデフォルトの `call()` 関数はXLAグラフをコンパイルするために使用されます。ただし、XLAにコンパイルしたい他のモデル関数がある場合、それも可能です。以下はその方法です
```py
my_xla_fn = tf.function(model.my_xla_fn, jit_compile=True)
```
## Running a TF text generation model with XLA from 🤗 Transformers
🤗 Transformers内でXLAでの高速化された生成を有効にするには、最新バージョンの`transformers`がインストールされている必要があります。次のコマンドを実行してインストールできます:
```bash
pip install transformers --upgrade
```
次に、次のコードを実行できます:
```py
import tensorflow as tf
from transformers import AutoTokenizer, TFAutoModelForCausalLM
# Will error if the minimal version of Transformers is not installed.
from transformers.utils import check_min_version
check_min_version("4.21.0")
tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2", padding_side="left", pad_token="</s>")
model = TFAutoModelForCausalLM.from_pretrained("openai-community/gpt2")
input_string = ["TensorFlow is"]
# One line to create an XLA generation function
xla_generate = tf.function(model.generate, jit_compile=True)
tokenized_input = tokenizer(input_string, return_tensors="tf")
generated_tokens = xla_generate(**tokenized_input, num_beams=2)
decoded_text = tokenizer.decode(generated_tokens[0], skip_special_tokens=True)
print(f"Generated -- {decoded_text}")
# Generated -- TensorFlow is an open-source, open-source, distributed-source application # framework for the
```
`generate()`でXLAを有効にするのは、たった一行のコードです。コードの残り部分は変更されていません。ただし、XLA固有のいくつかの注意点が上記のコードスニペットにあります。これらに注意する必要があり、XLAがもたらす速度向上を実現するためにそれらを把握することが重要です。次のセクションでこれらについて詳しく説明します。
## Gotchas to be aware of
XLAを有効にした関数上記の`xla_generate()`など)を初めて実行すると、内部で計算グラフを推論しようとしますが、これは時間がかかります。このプロセスは["トレーシング"tracing](https://www.tensorflow.org/guide/intro_to_graphs#when_is_a_function_tracing)として知られています。
生成時間が高速ではないことに気付くかもしれません。`xla_generate()`または他のXLA対応関数の連続呼び出しでは、関数への入力が最初に計算グラフが構築されたときと同じ形状に従っている場合、計算グラフを推論する必要はありません。これは、入力形状が固定されているモダリティ画像には問題ありませんが、変数の入力形状モダリティテキストを扱う場合には注意が必要です。
`xla_generate()`が常に同じ入力形状で動作するようにするには、トークナイザを呼び出す際に`padding`引数を指定できます。
```py
import tensorflow as tf
from transformers import AutoTokenizer, TFAutoModelForCausalLM
tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2", padding_side="left", pad_token="</s>")
model = TFAutoModelForCausalLM.from_pretrained("openai-community/gpt2")
input_string = ["TensorFlow is"]
xla_generate = tf.function(model.generate, jit_compile=True)
# Here, we call the tokenizer with padding options.
tokenized_input = tokenizer(input_string, pad_to_multiple_of=8, padding=True, return_tensors="tf")
generated_tokens = xla_generate(**tokenized_input, num_beams=2)
decoded_text = tokenizer.decode(generated_tokens[0], skip_special_tokens=True)
print(f"Generated -- {decoded_text}")
```
これにより、`xla_generate()`への入力が常にトレースされた形状の入力を受け取ることを確認し、生成時間の高速化を実現できます。以下のコードでこれを確認できます:
```py
import time
import tensorflow as tf
from transformers import AutoTokenizer, TFAutoModelForCausalLM
tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2", padding_side="left", pad_token="</s>")
model = TFAutoModelForCausalLM.from_pretrained("openai-community/gpt2")
xla_generate = tf.function(model.generate, jit_compile=True)
for input_string in ["TensorFlow is", "TensorFlow is a", "TFLite is a"]:
tokenized_input = tokenizer(input_string, pad_to_multiple_of=8, padding=True, return_tensors="tf")
start = time.time_ns()
generated_tokens = xla_generate(**tokenized_input, num_beams=2)
end = time.time_ns()
print(f"Execution time -- {(end - start) / 1e6:.1f} ms\n")
```
Tesla T4 GPUを使用すると、次のような出力が期待されます
```bash
Execution time -- 30819.6 ms
Execution time -- 79.0 ms
Execution time -- 78.9 ms
```
最初の`xla_generate()`呼び出しはトレーシングのために時間がかかりますが、連続する呼び出しは桁違いに高速です。生成オプションのいかなる変更も、再トレーシングを引き起こし、生成時間の遅延を引き起こすことに注意してください。
このドキュメントでは、🤗 Transformersが提供するテキスト生成オプションをすべて網羅していません。高度なユースケースについてはドキュメンテーションを参照することをお勧めします。
## Additional Resources
ここでは、🤗 Transformersと一般的なXLAについてさらに詳しく学びたい場合のいくつかの追加リソースを提供します。
* [このColab Notebook](https://colab.research.google.com/github/huggingface/blog/blob/main/notebooks/91_tf_xla_generate.ipynb)では、XLA対応のエンコーダーデコーダー[T5](https://huggingface.co/docs/transformers/model_doc/t5)など)およびデコーダー専用([GPT2](https://huggingface.co/docs/transformers/model_doc/gpt2)など)テキスト生成モデルを試すための対話型デモが提供されています。
* [このブログ記事](https://huggingface.co/blog/tf-xla-generate)では、XLA対応モデルの比較ベンチマークの概要と、TensorFlowでのXLAについての友好的な紹介が提供されています。
* [このブログ記事](https://blog.tensorflow.org/2022/11/how-hugging-face-improved-text-generation-performance-with-xla.html)では、🤗 TransformersのTensorFlowモデルにXLAサポートを追加する際の設計哲学について説明しています。
* 一般的なXLAとTensorFlowグラフについて詳しく学ぶためのおすすめの投稿
* [XLA: 機械学習用の最適化コンパイラ](https://www.tensorflow.org/xla)
* [グラフと`tf.function`の紹介](https://www.tensorflow.org/guide/intro_to_graphs)
* [`tf.function`を使用したパフォーマンス向上](https://www.tensorflow.org/guide/function)

View File

@ -1,58 +0,0 @@
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.
-->
# Export to TFLite
[TensorFlow Lite](https://www.tensorflow.org/lite/guide)は、モバイルフォン、組み込みシステム、およびモのインターネットIoTデバイスなど、リソースに制約のあるデバイスに機械学習モデルを展開するための軽量なフレームワークです。TFLiteは、計算能力、メモリ、および電力消費が限られているこれらのデバイス上でモデルを効率的に最適化して実行するために設計されています。
TensorFlow Liteモデルは、`.tflite`ファイル拡張子で識別される特別な効率的なポータブル形式で表されます。
🤗 Optimumは、🤗 TransformersモデルをTFLiteにエクスポートするための機能を`exporters.tflite`モジュールを介して提供しています。サポートされているモデルアーキテクチャのリストについては、[🤗 Optimumのドキュメント](https://huggingface.co/docs/optimum/exporters/tflite/overview)をご参照ください。
モデルをTFLiteにエクスポートするには、必要な依存関係をインストールしてください
```bash
pip install optimum[exporters-tf]
```
すべての利用可能な引数を確認するには、[🤗 Optimumドキュメント](https://huggingface.co/docs/optimum/main/en/exporters/tflite/usage_guides/export_a_model)を参照するか、コマンドラインでヘルプを表示してください:
```bash
optimum-cli export tflite --help
```
🤗 Hubからモデルのチェックポイントをエクスポートするには、例えば `google-bert/bert-base-uncased` を使用する場合、次のコマンドを実行します:
```bash
optimum-cli export tflite --model google-bert/bert-base-uncased --sequence_length 128 bert_tflite/
```
進行状況を示すログが表示され、生成された `model.tflite` が保存された場所も表示されるはずです:
```bash
Validating TFLite model...
-[✓] TFLite model output names match reference model (logits)
- Validating TFLite Model output "logits":
-[✓] (1, 128, 30522) matches (1, 128, 30522)
-[x] values not close enough, max diff: 5.817413330078125e-05 (atol: 1e-05)
The TensorFlow Lite export succeeded with the warning: The maximum absolute difference between the output of the reference model and the TFLite exported model is not within the set tolerance 1e-05:
- logits: max diff = 5.817413330078125e-05.
The exported model was saved at: bert_tflite
```
上記の例は🤗 Hubからチェックポイントをエクスポートする方法を示しています。ローカルモデルをエクスポートする場合、まずモデルの重みファイルとトークナイザファイルを同じディレクトリ`local_path`に保存したことを確認してください。CLIを使用する場合、🤗 Hubのチェックポイント名の代わりに`model`引数に`local_path`を渡します。

View File

@ -208,8 +208,6 @@
sections:
- local: serialization
title: ONNX로 내보내기
- local: tflite
title: TFLite로 내보내기
- local: executorch
title: ExecuTorch
- local: torchscript
@ -402,8 +400,6 @@
title: Configuration
- local: main_classes/data_collator
title: Data Collator
- local: main_classes/keras_callbacks
title: Keras callbacks
- local: main_classes/logging
title: Logging
- local: main_classes/model

View File

@ -1,27 +0,0 @@
<!--Copyright 2021 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.
-->
# 케라스 콜백[[keras-callbacks]]
케라스로 트랜스포머 모델을 학습할 때, 일반적인 작업을 자동화하기 위한 라이브러리 전용 콜백들을 사용 할 수 있습니다.
## KerasMetricCallback[[transformers.KerasMetricCallback]]
[[autodoc]] KerasMetricCallback
## PushToHubCallback[[transformers.PushToHubCallback]]
[[autodoc]] PushToHubCallback

View File

@ -1,62 +0,0 @@
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.
-->
# TFLite로 내보내기[[export-to-tflite]]
[TensorFlow Lite](https://www.tensorflow.org/lite/guide)는 자원이 제한된 휴대폰, 임베디드 시스템, 사물인터넷(IoT) 기기에서
기계학습 모델을 배포하기 위한 경량 프레임워크입니다.
TFLite는 연산 능력, 메모리, 전력 소비가 제한된 기기에서 모델을 효율적으로 최적화하고 실행하기 위해
설계되었습니다.
TensorFlow Lite 모델은 `.tflite` 파일 확장자로 식별되는 특수하고 효율적인 휴대용 포맷으로 표현됩니다.
🤗 Optimum은 `exporters.tflite` 모듈로 🤗 Transformers 모델을 TFLite로 내보내는 기능을 제공합니다.
지원되는 모델 아키텍처 목록은 [🤗 Optimum 문서](https://huggingface.co/docs/optimum/exporters/tflite/overview)를 참고하세요.
모델을 TFLite로 내보내려면, 필요한 종속성을 설치하세요:
```bash
pip install optimum[exporters-tf]
```
모든 사용 가능한 인수를 확인하려면, [🤗 Optimum 문서](https://huggingface.co/docs/optimum/main/en/exporters/tflite/usage_guides/export_a_model)를 참고하거나
터미널에서 도움말을 살펴보세요:
```bash
optimum-cli export tflite --help
```
예를 들어 🤗 Hub에서의 `google-bert/bert-base-uncased` 모델 체크포인트를 내보내려면, 다음 명령을 실행하세요:
```bash
optimum-cli export tflite --model google-bert/bert-base-uncased --sequence_length 128 bert_tflite/
```
다음과 같이 진행 상황을 나타내는 로그와 결과물인 `model.tflite`가 저장된 위치를 보여주는 로그가 표시됩니다:
```bash
Validating TFLite model...
-[✓] TFLite model output names match reference model (logits)
- Validating TFLite Model output "logits":
-[✓] (1, 128, 30522) matches (1, 128, 30522)
-[x] values not close enough, max diff: 5.817413330078125e-05 (atol: 1e-05)
The TensorFlow Lite export succeeded with the warning: The maximum absolute difference between the output of the reference model and the TFLite exported model is not within the set tolerance 1e-05:
- logits: max diff = 5.817413330078125e-05.
The exported model was saved at: bert_tflite
```
위 예제는 🤗 Hub에서의 체크포인트를 내보내는 방법을 보여줍니다.
로컬 모델을 내보낸다면, 먼저 모델 가중치와 토크나이저 파일이 모두 같은 디렉터리( `local_path` )에 저장됐는지 확인하세요.
CLI를 사용할 때, 🤗 Hub에서의 체크포인트 이름 대신 `model` 인수에 `local_path`를 전달하면 됩니다.

View File

@ -115,8 +115,6 @@
title: Latihan pada banyak CPU
- local: perf_train_tpu
title: Latihan mengenai TPU
- local: perf_train_tpu_tf
title: Latihan tentang TPU dengan TensorFlow
- local: perf_train_special
title: Latihan mengenai Perkakasan Khusus
- local: perf_infer_cpu
@ -135,8 +133,6 @@
title: Penyahpepijatan
- local: hpo_train
title: Carian Hiperparameter menggunakan API Pelatih
- local: tf_xla
title: Penyepaduan XLA untuk Model TensorFlow
title: Prestasi dan kebolehskalaan
- sections:
- local: contributing
@ -185,8 +181,6 @@
title: Configuration
- local: main_classes/data_collator
title: Data Collator
- local: main_classes/keras_callbacks
title: Keras callbacks
- local: main_classes/logging
title: Logging
- local: main_classes/model

View File

@ -23,8 +23,6 @@
title: Compartilhando modelos customizados
- local: run_scripts
title: Treinamento a partir de um script
- local: converting_tensorflow_models
title: Convertendo checkpoints do TensorFlow para Pytorch
- local: serialization
title: Exportando modelos para ONNX
- sections:

View File

@ -1,152 +0,0 @@
<!--Copyright 2020 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.
-->
# Convertendo checkpoints do TensorFlow para Pytorch
Uma interface de linha de comando é fornecida para converter os checkpoints originais Bert/GPT/GPT-2/Transformer-XL/XLNet/XLM em modelos
que podem ser carregados usando os métodos `from_pretrained` da biblioteca.
<Tip>
A partir da versão 2.3.0 o script de conversão agora faz parte do transformers CLI (**transformers**) disponível em qualquer instalação
transformers >= 2.3.0.
A documentação abaixo reflete o formato do comando **transformers convert**.
</Tip>
## BERT
Você pode converter qualquer checkpoint do BERT em TensorFlow (em particular [os modelos pré-treinados lançados pelo Google](https://github.com/google-research/bert#pre-trained-models)) em um arquivo PyTorch usando um
[convert_bert_original_tf_checkpoint_to_pytorch.py](https://github.com/huggingface/transformers/tree/main/src/transformers/models/bert/convert_bert_original_tf_checkpoint_to_pytorch.py) script.
Esta Interface de Linha de Comando (CLI) recebe como entrada um checkpoint do TensorFlow (três arquivos começando com `bert_model.ckpt`) e o
arquivo de configuração (`bert_config.json`), e então cria um modelo PyTorch para esta configuração, carrega os pesos
do checkpoint do TensorFlow no modelo PyTorch e salva o modelo resultante em um arquivo PyTorch que pode
ser importado usando `from_pretrained()` (veja o exemplo em [quicktour](quicktour) , [run_glue.py](https://github.com/huggingface/transformers/tree/main/examples/pytorch/text-classification/run_glue.py) ).
Você só precisa executar este script de conversão **uma vez** para obter um modelo PyTorch. Você pode então desconsiderar o checkpoint em
TensorFlow (os três arquivos começando com `bert_model.ckpt`), mas certifique-se de manter o arquivo de configuração (\
`bert_config.json`) e o arquivo de vocabulário (`vocab.txt`), pois eles também são necessários para o modelo PyTorch.
Para executar este script de conversão específico, você precisará ter o TensorFlow e o PyTorch instalados (`pip install tensorflow`). O resto do repositório requer apenas o PyTorch.
Aqui está um exemplo do processo de conversão para um modelo `BERT-Base Uncased` pré-treinado:
```bash
export BERT_BASE_DIR=/path/to/bert/uncased_L-12_H-768_A-12
transformers convert --model_type bert \
--tf_checkpoint $BERT_BASE_DIR/bert_model.ckpt \
--config $BERT_BASE_DIR/bert_config.json \
--pytorch_dump_output $BERT_BASE_DIR/pytorch_model.bin
```
Você pode baixar os modelos pré-treinados do Google para a conversão [aqui](https://github.com/google-research/bert#pre-trained-models).
## ALBERT
Converta os checkpoints do modelo ALBERT em TensorFlow para PyTorch usando o
[convert_albert_original_tf_checkpoint_to_pytorch.py](https://github.com/huggingface/transformers/tree/main/src/transformers/models/albert/convert_albert_original_tf_checkpoint_to_pytorch.py) script.
A Interface de Linha de Comando (CLI) recebe como entrada um checkpoint do TensorFlow (três arquivos começando com `model.ckpt-best`) e o
arquivo de configuração (`albert_config.json`), então cria e salva um modelo PyTorch. Para executar esta conversão, você
precisa ter o TensorFlow e o PyTorch instalados.
Aqui está um exemplo do processo de conversão para o modelo `ALBERT Base` pré-treinado:
```bash
export ALBERT_BASE_DIR=/path/to/albert/albert_base
transformers convert --model_type albert \
--tf_checkpoint $ALBERT_BASE_DIR/model.ckpt-best \
--config $ALBERT_BASE_DIR/albert_config.json \
--pytorch_dump_output $ALBERT_BASE_DIR/pytorch_model.bin
```
Você pode baixar os modelos pré-treinados do Google para a conversão [aqui](https://github.com/google-research/albert#pre-trained-models).
## OpenAI GPT
Aqui está um exemplo do processo de conversão para um modelo OpenAI GPT pré-treinado, supondo que seu checkpoint NumPy
foi salvo com o mesmo formato do modelo pré-treinado OpenAI (veja [aqui](https://github.com/openai/finetune-transformer-lm)\
)
```bash
export OPENAI_GPT_CHECKPOINT_FOLDER_PATH=/path/to/openai/pretrained/numpy/weights
transformers convert --model_type gpt \
--tf_checkpoint $OPENAI_GPT_CHECKPOINT_FOLDER_PATH \
--pytorch_dump_output $PYTORCH_DUMP_OUTPUT \
[--config OPENAI_GPT_CONFIG] \
[--finetuning_task_name OPENAI_GPT_FINETUNED_TASK] \
```
## OpenAI GPT-2
Aqui está um exemplo do processo de conversão para um modelo OpenAI GPT-2 pré-treinado (consulte [aqui](https://github.com/openai/gpt-2))
```bash
export OPENAI_GPT2_CHECKPOINT_PATH=/path/to/openai-community/gpt2/pretrained/weights
transformers convert --model_type gpt2 \
--tf_checkpoint $OPENAI_GPT2_CHECKPOINT_PATH \
--pytorch_dump_output $PYTORCH_DUMP_OUTPUT \
[--config OPENAI_GPT2_CONFIG] \
[--finetuning_task_name OPENAI_GPT2_FINETUNED_TASK]
```
## XLNet
Aqui está um exemplo do processo de conversão para um modelo XLNet pré-treinado:
```bash
export TRANSFO_XL_CHECKPOINT_PATH=/path/to/xlnet/checkpoint
export TRANSFO_XL_CONFIG_PATH=/path/to/xlnet/config
transformers convert --model_type xlnet \
--tf_checkpoint $TRANSFO_XL_CHECKPOINT_PATH \
--config $TRANSFO_XL_CONFIG_PATH \
--pytorch_dump_output $PYTORCH_DUMP_OUTPUT \
[--finetuning_task_name XLNET_FINETUNED_TASK] \
```
## XLM
Aqui está um exemplo do processo de conversão para um modelo XLM pré-treinado:
```bash
export XLM_CHECKPOINT_PATH=/path/to/xlm/checkpoint
transformers convert --model_type xlm \
--tf_checkpoint $XLM_CHECKPOINT_PATH \
--pytorch_dump_output $PYTORCH_DUMP_OUTPUT
[--config XML_CONFIG] \
[--finetuning_task_name XML_FINETUNED_TASK]
```
## T5
Aqui está um exemplo do processo de conversão para um modelo T5 pré-treinado:
```bash
export T5=/path/to/t5/uncased_L-12_H-768_A-12
transformers convert --model_type t5 \
--tf_checkpoint $T5/t5_model.ckpt \
--config $T5/t5_config.json \
--pytorch_dump_output $T5/pytorch_model.bin
```

View File

@ -44,8 +44,6 @@
title: 聊天模型的模板
- local: serialization
title: 导出为 ONNX
- local: tflite
title: 导出为 TFLite
- local: torchscript
title: 导出为 TorchScript
- local: gguf
@ -76,8 +74,6 @@
title: 实例化大模型
- local: debugging
title: 问题定位及解决
- local: tf_xla
title: TensorFlow模型的XLA集成
- local: perf_torch_compile
title: 使用 `torch.compile()` 优化推理
title: 性能和可扩展性
@ -107,8 +103,6 @@
title: Configuration
- local: main_classes/data_collator
title: Data Collator
- local: main_classes/keras_callbacks
title: Keras callbacks
- local: main_classes/logging
title: Logging
- local: main_classes/model

View File

@ -1,27 +0,0 @@
<!--Copyright 2021 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.
-->
# Keras callbacks
在Keras中训练Transformers模型时有一些库特定的callbacks函数可用于自动执行常见任务
## KerasMetricCallback
[[autodoc]] KerasMetricCallback
## PushToHubCallback
[[autodoc]] PushToHubCallback

View File

@ -1,179 +0,0 @@
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.
-->
# 用于 TensorFlow 模型的 XLA 集成
[[open-in-colab]]
加速线性代数也称为XLA是一个用于加速TensorFlow模型运行时间的编译器。从[官方文档](https://www.tensorflow.org/xla)中可以看到:
XLA加速线性代数是一种针对线性代数的特定领域编译器可以在可能不需要更改源代码的情况下加速TensorFlow模型。
在TensorFlow中使用XLA非常简单——它包含在`tensorflow`库中,并且可以使用任何图创建函数中的`jit_compile`参数来触发,例如[`tf.function`](https://www.tensorflow.org/guide/intro_to_graphs)。在使用Keras方法如`fit()``predict()`时,只需将`jit_compile`参数传递给`model.compile()`即可启用XLA。然而XLA不仅限于这些方法 - 它还可以用于加速任何任意的`tf.function`
在🤗 Transformers中几个TensorFlow方法已经被重写为与XLA兼容包括[GPT2](https://huggingface.co/docs/transformers/model_doc/gpt2)、[T5](https://huggingface.co/docs/transformers/model_doc/t5)和[OPT](https://huggingface.co/docs/transformers/model_doc/opt)等文本生成模型,以及[Whisper](https://huggingface.co/docs/transformers/model_doc/whisper)等语音处理模型。
虽然确切的加速倍数很大程度上取决于模型,但对于🤗 Transformers中的TensorFlow文本生成模型我们注意到速度提高了约100倍。本文档将解释如何在这些模型上使用XLA获得最大的性能。如果您有兴趣了解更多关于基准测试和我们在XLA集成背后的设计哲学的信息我们还将提供额外的资源链接。
## 使用 XLA 运行 TensorFlow 函数
让我们考虑以下TensorFlow 中的模型:
```py
import tensorflow as tf
model = tf.keras.Sequential(
[tf.keras.layers.Dense(10, input_shape=(10,), activation="relu"), tf.keras.layers.Dense(5, activation="softmax")]
)
```
上述模型接受维度为 `(10,)` 的输入。我们可以像下面这样使用模型进行前向传播:
```py
# Generate random inputs for the model.
batch_size = 16
input_vector_dim = 10
random_inputs = tf.random.normal((batch_size, input_vector_dim))
# Run a forward pass.
_ = model(random_inputs)
```
为了使用 XLA 编译的函数运行前向传播,我们需要执行以下操作:
```py
xla_fn = tf.function(model, jit_compile=True)
_ = xla_fn(random_inputs)
```
`model`的默认`call()`函数用于编译XLA图。但如果你想将其他模型函数编译成XLA也是可以的如下所示
```py
my_xla_fn = tf.function(model.my_xla_fn, jit_compile=True)
```
## 在🤗 Transformers库中使用XLA运行TensorFlow文本生成模型
要在🤗 Transformers中启用XLA加速生成您需要安装最新版本的`transformers`。您可以通过运行以下命令来安装它:
```bash
pip install transformers --upgrade
```
然后您可以运行以下代码:
```py
import tensorflow as tf
from transformers import AutoTokenizer, TFAutoModelForCausalLM
# Will error if the minimal version of Transformers is not installed.
from transformers.utils import check_min_version
check_min_version("4.21.0")
tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2", padding_side="left", pad_token="</s>")
model = TFAutoModelForCausalLM.from_pretrained("openai-community/gpt2")
input_string = ["TensorFlow is"]
# One line to create an XLA generation function
xla_generate = tf.function(model.generate, jit_compile=True)
tokenized_input = tokenizer(input_string, return_tensors="tf")
generated_tokens = xla_generate(**tokenized_input, num_beams=2)
decoded_text = tokenizer.decode(generated_tokens[0], skip_special_tokens=True)
print(f"Generated -- {decoded_text}")
# Generated -- TensorFlow is an open-source, open-source, distributed-source application # framework for the
```
正如您所注意到的,在`generate()`上启用XLA只需要一行代码。其余部分代码保持不变。然而上面的代码片段中有一些与XLA相关的注意事项。您需要了解这些注意事项以充分利用XLA可能带来的性能提升。我们将在下面的部分讨论这些内容。
## 需要关注的注意事项
当您首次执行启用XLA的函数如上面的`xla_generate()`)时,它将在内部尝试推断计算图,这是一个耗时的过程。这个过程被称为[“tracing”](https://www.tensorflow.org/guide/intro_to_graphs#when_is_a_function_tracing)。
您可能会注意到生成时间并不快。连续调用`xla_generate()`或任何其他启用了XLA的函数不需要再次推断计算图只要函数的输入与最初构建计算图时的形状相匹配。对于具有固定输入形状的模态例如图像这不是问题但如果您正在处理具有可变输入形状的模态例如文本则必须注意。
为了确保`xla_generate()`始终使用相同的输入形状,您可以在调用`tokenizer`时指定`padding`参数。
```py
import tensorflow as tf
from transformers import AutoTokenizer, TFAutoModelForCausalLM
tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2", padding_side="left", pad_token="</s>")
model = TFAutoModelForCausalLM.from_pretrained("openai-community/gpt2")
input_string = ["TensorFlow is"]
xla_generate = tf.function(model.generate, jit_compile=True)
# Here, we call the tokenizer with padding options.
tokenized_input = tokenizer(input_string, pad_to_multiple_of=8, padding=True, return_tensors="tf")
generated_tokens = xla_generate(**tokenized_input, num_beams=2)
decoded_text = tokenizer.decode(generated_tokens[0], skip_special_tokens=True)
print(f"Generated -- {decoded_text}")
```
通过这种方式,您可以确保`xla_generate()`的输入始终具有它跟踪的形状,从而加速生成时间。您可以使用以下代码来验证这一点:
```py
import time
import tensorflow as tf
from transformers import AutoTokenizer, TFAutoModelForCausalLM
tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2", padding_side="left", pad_token="</s>")
model = TFAutoModelForCausalLM.from_pretrained("openai-community/gpt2")
xla_generate = tf.function(model.generate, jit_compile=True)
for input_string in ["TensorFlow is", "TensorFlow is a", "TFLite is a"]:
tokenized_input = tokenizer(input_string, pad_to_multiple_of=8, padding=True, return_tensors="tf")
start = time.time_ns()
generated_tokens = xla_generate(**tokenized_input, num_beams=2)
end = time.time_ns()
print(f"Execution time -- {(end - start) / 1e6:.1f} ms\n")
```
在Tesla T4 GPU上您可以期望如下的输出
```bash
Execution time -- 30819.6 ms
Execution time -- 79.0 ms
Execution time -- 78.9 ms
```
第一次调用`xla_generate()`会因为`tracing`而耗时,但后续的调用会快得多。请注意,任何时候对生成选项的更改都会触发重新`tracing`,从而导致生成时间减慢。
在本文档中,我们没有涵盖🤗 Transformers提供的所有文本生成选项。我们鼓励您阅读文档以了解高级用例。
## 附加资源
以下是一些附加资源,如果您想深入了解在🤗 Transformers和其他库下使用XLA
* [这个Colab Notebook](https://colab.research.google.com/github/huggingface/blog/blob/main/notebooks/91_tf_xla_generate.ipynb) 提供了一个互动演示让您可以尝试使用XLA兼容的编码器-解码器(例如[T5](https://huggingface.co/docs/transformers/model_doc/t5))和仅解码器(例如[GPT2](https://huggingface.co/docs/transformers/model_doc/gpt2))文本生成模型。
* [这篇博客文章](https://huggingface.co/blog/tf-xla-generate) 提供了XLA兼容模型的比较基准概述以及关于在TensorFlow中使用XLA的友好介绍。
* [这篇博客文章](https://blog.tensorflow.org/2022/11/how-hugging-face-improved-text-generation-performance-with-xla.html) 讨论了我们在🤗 Transformers中为TensorFlow模型添加XLA支持的设计理念。
* 推荐用于更多学习XLA和TensorFlow图的资源
* [XLA面向机器学习的优化编译器](https://www.tensorflow.org/xla)
* [图和tf.function简介](https://www.tensorflow.org/guide/intro_to_graphs)
* [使用tf.function获得更好的性能](https://www.tensorflow.org/guide/function)

View File

@ -1,54 +0,0 @@
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.
-->
# 导出为 TFLite
[TensorFlow Lite](https://www.tensorflow.org/lite/guide) 是一个轻量级框架用于资源受限的设备上如手机、嵌入式系统和物联网IoT设备部署机器学习模型。TFLite 旨在在计算能力、内存和功耗有限的设备上优化和高效运行模型。模型以一种特殊的高效可移植格式表示,其文件扩展名为 `.tflite`
🤗 Optimum 通过 `exporters.tflite` 模块提供将 🤗 Transformers 模型导出至 TFLite 格式的功能。请参考 [🤗 Optimum 文档](https://huggingface.co/docs/optimum/exporters/tflite/overview) 以获取支持的模型架构列表。
要将模型导出为 TFLite 格式,请安装所需的依赖项:
```bash
pip install optimum[exporters-tf]
```
请参阅 [🤗 Optimum 文档](https://huggingface.co/docs/optimum/main/en/exporters/tflite/usage_guides/export_a_model) 以查看所有可用参数,或者在命令行中查看帮助:
```bash
optimum-cli export tflite --help
```
运行以下命令,以从 🤗 Hub 导出模型的检查点checkpoint`google-bert/bert-base-uncased` 为例:
```bash
optimum-cli export tflite --model google-bert/bert-base-uncased --sequence_length 128 bert_tflite/
```
你应该能在日志中看到导出进度以及生成的 `model.tflite` 文件的保存位置,如下所示:
```bash
Validating TFLite model...
-[] TFLite model output names match reference model (logits)
- Validating TFLite Model output "logits":
-[] (1, 128, 30522) matches (1, 128, 30522)
-[x] values not close enough, max diff: 5.817413330078125e-05 (atol: 1e-05)
The TensorFlow Lite export succeeded with the warning: The maximum absolute difference between the output of the reference model and the TFLite exported model is not within the set tolerance 1e-05:
- logits: max diff = 5.817413330078125e-05.
The exported model was saved at: bert_tflite
```
上面的示例说明了从 🤗 Hub 导出检查点的过程。导出本地模型时,首先需要确保将模型的权重和分词器文件保存在同一目录(`local_path`)中。在使用 CLI命令行`local_path` 传递给 `model` 参数,而不是 🤗 Hub 上的检查点名称。

View File

@ -26,7 +26,7 @@ from typing import Optional
import tqdm
from filelock import FileLock
from transformers import PreTrainedTokenizer, is_tf_available, is_torch_available
from transformers import PreTrainedTokenizer, is_torch_available
logger = logging.getLogger(__name__)
@ -78,11 +78,6 @@ if is_torch_available():
from torch.utils.data import Dataset
class MultipleChoiceDataset(Dataset):
"""
This will be superseded by a framework-agnostic approach
soon.
"""
features: list[InputFeatures]
def __init__(
@ -139,94 +134,6 @@ if is_torch_available():
return self.features[i]
if is_tf_available():
import tensorflow as tf
class TFMultipleChoiceDataset:
"""
This will be superseded by a framework-agnostic approach
soon.
"""
features: list[InputFeatures]
def __init__(
self,
data_dir: str,
tokenizer: PreTrainedTokenizer,
task: str,
max_seq_length: Optional[int] = 128,
overwrite_cache=False,
mode: Split = Split.train,
):
processor = processors[task]()
logger.info(f"Creating features from dataset file at {data_dir}")
label_list = processor.get_labels()
if mode == Split.dev:
examples = processor.get_dev_examples(data_dir)
elif mode == Split.test:
examples = processor.get_test_examples(data_dir)
else:
examples = processor.get_train_examples(data_dir)
logger.info("Training examples: %s", len(examples))
self.features = convert_examples_to_features(
examples,
label_list,
max_seq_length,
tokenizer,
)
def gen():
for ex_index, ex in tqdm.tqdm(enumerate(self.features), desc="convert examples to features"):
if ex_index % 10000 == 0:
logger.info("Writing example %d of %d" % (ex_index, len(examples)))
yield (
{
"example_id": 0,
"input_ids": ex.input_ids,
"attention_mask": ex.attention_mask,
"token_type_ids": ex.token_type_ids,
},
ex.label,
)
self.dataset = tf.data.Dataset.from_generator(
gen,
(
{
"example_id": tf.int32,
"input_ids": tf.int32,
"attention_mask": tf.int32,
"token_type_ids": tf.int32,
},
tf.int64,
),
(
{
"example_id": tf.TensorShape([]),
"input_ids": tf.TensorShape([None, None]),
"attention_mask": tf.TensorShape([None, None]),
"token_type_ids": tf.TensorShape([None, None]),
},
tf.TensorShape([]),
),
)
def get_dataset(self):
self.dataset = self.dataset.apply(tf.data.experimental.assert_cardinality(len(self.features)))
return self.dataset
def __len__(self):
return len(self.features)
def __getitem__(self, i) -> InputFeatures:
return self.features[i]
class DataProcessor:
"""Base class for data converters for multiple choice data sets."""

View File

@ -22,7 +22,7 @@ from typing import Optional, Union
from filelock import FileLock
from transformers import PreTrainedTokenizer, is_tf_available, is_torch_available
from transformers import PreTrainedTokenizer, is_torch_available
logger = logging.getLogger(__name__)
@ -208,11 +208,6 @@ if is_torch_available():
from torch.utils.data import Dataset
class TokenClassificationDataset(Dataset):
"""
This will be superseded by a framework-agnostic approach
soon.
"""
features: list[InputFeatures]
pad_token_label_id: int = nn.CrossEntropyLoss().ignore_index
# Use cross entropy ignore_index as padding label id so that only
@ -271,100 +266,3 @@ if is_torch_available():
def __getitem__(self, i) -> InputFeatures:
return self.features[i]
if is_tf_available():
import tensorflow as tf
class TFTokenClassificationDataset:
"""
This will be superseded by a framework-agnostic approach
soon.
"""
features: list[InputFeatures]
pad_token_label_id: int = -100
# Use cross entropy ignore_index as padding label id so that only
# real label ids contribute to the loss later.
def __init__(
self,
token_classification_task: TokenClassificationTask,
data_dir: str,
tokenizer: PreTrainedTokenizer,
labels: list[str],
model_type: str,
max_seq_length: Optional[int] = None,
overwrite_cache=False,
mode: Split = Split.train,
):
examples = token_classification_task.read_examples_from_file(data_dir, mode)
# TODO clean up all this to leverage built-in features of tokenizers
self.features = token_classification_task.convert_examples_to_features(
examples,
labels,
max_seq_length,
tokenizer,
cls_token_at_end=bool(model_type in ["xlnet"]),
# xlnet has a cls token at the end
cls_token=tokenizer.cls_token,
cls_token_segment_id=2 if model_type in ["xlnet"] else 0,
sep_token=tokenizer.sep_token,
sep_token_extra=False,
# roberta uses an extra separator b/w pairs of sentences, cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805
pad_on_left=bool(tokenizer.padding_side == "left"),
pad_token=tokenizer.pad_token_id,
pad_token_segment_id=tokenizer.pad_token_type_id,
pad_token_label_id=self.pad_token_label_id,
)
def gen():
for ex in self.features:
if ex.token_type_ids is None:
yield (
{"input_ids": ex.input_ids, "attention_mask": ex.attention_mask},
ex.label_ids,
)
else:
yield (
{
"input_ids": ex.input_ids,
"attention_mask": ex.attention_mask,
"token_type_ids": ex.token_type_ids,
},
ex.label_ids,
)
if "token_type_ids" not in tokenizer.model_input_names:
self.dataset = tf.data.Dataset.from_generator(
gen,
({"input_ids": tf.int32, "attention_mask": tf.int32}, tf.int64),
(
{"input_ids": tf.TensorShape([None]), "attention_mask": tf.TensorShape([None])},
tf.TensorShape([None]),
),
)
else:
self.dataset = tf.data.Dataset.from_generator(
gen,
({"input_ids": tf.int32, "attention_mask": tf.int32, "token_type_ids": tf.int32}, tf.int64),
(
{
"input_ids": tf.TensorShape([None]),
"attention_mask": tf.TensorShape([None]),
"token_type_ids": tf.TensorShape([None]),
},
tf.TensorShape([None]),
),
)
def get_dataset(self):
self.dataset = self.dataset.apply(tf.data.experimental.assert_cardinality(len(self.features)))
return self.dataset
def __len__(self):
return len(self.features)
def __getitem__(self, i) -> InputFeatures:
return self.features[i]

View File

@ -152,7 +152,7 @@ class ImgprocModelImageProcessor(BaseImageProcessor):
images: ImageInput,
do_resize: Optional[bool] = None,
size: Optional[dict[str, int]] = None,
resample: PILImageResampling = None,
resample: Optional[PILImageResampling] = None,
do_rescale: Optional[bool] = None,
rescale_factor: Optional[float] = None,
do_normalize: Optional[bool] = None,
@ -194,10 +194,8 @@ class ImgprocModelImageProcessor(BaseImageProcessor):
return_tensors (`str` or `TensorType`, *optional*):
The type of tensors to return. Can be one of:
- Unset: Return a list of `np.ndarray`.
- `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
- `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
- `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
- `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
The channel dimension format for the output image. Can be one of:
- `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
@ -221,13 +219,11 @@ class ImgprocModelImageProcessor(BaseImageProcessor):
size = size if size is not None else self.size
size = get_size_dict(size, default_to_square=False)
images = self.fetch_images(images)
images = make_flat_list_of_images(images)
if not valid_images(images):
raise ValueError(
"Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
"torch.Tensor, tf.Tensor or jax.ndarray."
)
raise ValueError("Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, or torch.Tensor")
validate_preprocess_arguments(
do_rescale=do_rescale,

View File

@ -5,11 +5,9 @@
# modular_dummy_bert.py file directly. One of our CI enforces this.
# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
import math
import os
from typing import Optional, Union
import torch
from packaging import version
from torch import nn
from ...activations import ACT2FN
@ -19,7 +17,7 @@ from ...modeling_layers import GradientCheckpointingLayer
from ...modeling_outputs import BaseModelOutputWithPastAndCrossAttentions, BaseModelOutputWithPoolingAndCrossAttentions
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer
from ...utils import auto_docstring, get_torch_version, logging
from ...utils import auto_docstring, logging
from ...utils.deprecation import deprecate_kwarg
from .configuration_dummy_bert import DummyBertConfig
@ -36,8 +34,6 @@ class DummyBertEmbeddings(nn.Module):
self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)
# self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
# any TensorFlow checkpoint file
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
# position_ids (1, len position emb) is contiguous in memory and exported when serialized
@ -228,7 +224,6 @@ class DummyBertSdpaSelfAttention(DummyBertSelfAttention):
def __init__(self, config, position_embedding_type=None, layer_idx=None):
super().__init__(config, position_embedding_type=position_embedding_type, layer_idx=layer_idx)
self.dropout_prob = config.attention_probs_dropout_prob
self.require_contiguous_qkv = version.parse(get_torch_version()) < version.parse("2.2.0")
# Adapted from DummyBertSelfAttention
@deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
@ -308,14 +303,6 @@ class DummyBertSdpaSelfAttention(DummyBertSelfAttention):
if is_cross_attention and isinstance(past_key_values, EncoderDecoderCache):
past_key_values.is_updated[self.layer_idx] = True
# SDPA with memory-efficient backend is broken in torch==2.1.2 when using non-contiguous inputs and a custom
# attn_mask, so we need to call `.contiguous()` here. This was fixed in torch==2.2.0.
# Reference: https://github.com/pytorch/pytorch/issues/112577
if self.require_contiguous_qkv and query_layer.device.type == "cuda" and attention_mask is not None:
query_layer = query_layer.contiguous()
key_layer = key_layer.contiguous()
value_layer = value_layer.contiguous()
# We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` if statement instead of an inline conditional assignment
# in SDPA to support both torch.compile's dynamic shapes and full graph options. An inline conditional prevents dynamic shapes from compiling.
# The tgt_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create
@ -655,83 +642,9 @@ class DummyBertLMPredictionHead(nn.Module):
return hidden_states
def load_tf_weights_in_dummy_bert(model, config, tf_checkpoint_path):
"""Load tf checkpoints in a pytorch model."""
try:
import re
import numpy as np
import tensorflow as tf
except ImportError:
logger.error(
"Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see "
"https://www.tensorflow.org/install/ for installation instructions."
)
raise
tf_path = os.path.abspath(tf_checkpoint_path)
logger.info(f"Converting TensorFlow checkpoint from {tf_path}")
# Load weights from TF model
init_vars = tf.train.list_variables(tf_path)
names = []
arrays = []
for name, shape in init_vars:
logger.info(f"Loading TF weight {name} with shape {shape}")
array = tf.train.load_variable(tf_path, name)
names.append(name)
arrays.append(array)
for name, array in zip(names, arrays):
name = name.split("/")
# adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v
# which are not required for using pretrained model
if any(
n in ["adam_v", "adam_m", "AdamWeightDecayOptimizer", "AdamWeightDecayOptimizer_1", "global_step"]
for n in name
):
logger.info(f"Skipping {'/'.join(name)}")
continue
pointer = model
for m_name in name:
if re.fullmatch(r"[A-Za-z]+_\d+", m_name):
scope_names = re.split(r"_(\d+)", m_name)
else:
scope_names = [m_name]
if scope_names[0] == "kernel" or scope_names[0] == "gamma":
pointer = getattr(pointer, "weight")
elif scope_names[0] == "output_bias" or scope_names[0] == "beta":
pointer = getattr(pointer, "bias")
elif scope_names[0] == "output_weights":
pointer = getattr(pointer, "weight")
elif scope_names[0] == "squad":
pointer = getattr(pointer, "classifier")
else:
try:
pointer = getattr(pointer, scope_names[0])
except AttributeError:
logger.info(f"Skipping {'/'.join(name)}")
continue
if len(scope_names) >= 2:
num = int(scope_names[1])
pointer = pointer[num]
if m_name[-11:] == "_embeddings":
pointer = getattr(pointer, "weight")
elif m_name == "kernel":
array = np.transpose(array)
try:
if pointer.shape != array.shape:
raise ValueError(f"Pointer shape {pointer.shape} and array shape {array.shape} mismatched")
except ValueError as e:
e.args += (pointer.shape, array.shape)
raise
logger.info(f"Initialize PyTorch weight {name}")
pointer.data = torch.from_numpy(array)
return model
@auto_docstring
class DummyBertPreTrainedModel(PreTrainedModel):
config: DummyBertConfig
load_tf_weights = load_tf_weights_in_dummy_bert
base_model_prefix = "dummy_bert"
supports_gradient_checkpointing = True
_supports_sdpa = True
@ -739,8 +652,6 @@ class DummyBertPreTrainedModel(PreTrainedModel):
def _init_weights(self, module):
"""Initialize the weights"""
if isinstance(module, nn.Linear):
# Slightly different from the TF version which uses truncated_normal for initialization
# cf https://github.com/pytorch/pytorch/pull/5617
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
if module.bias is not None:
module.bias.data.zero_()

View File

@ -12,13 +12,9 @@ from torch import nn
from ...activations import ACT2FN
from ...modeling_layers import GradientCheckpointingLayer
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS
from ...utils import logging
from .configuration_from_uppercase_model import FromUppercaseModelTextConfig, FromUppercaseModelVisionConfig
logger = logging.get_logger(__name__)
def eager_attention_forward(
module: nn.Module,
query: torch.Tensor,
@ -96,13 +92,7 @@ class FromUppercaseModelAttention(nn.Module):
attention_interface: Callable = eager_attention_forward
if self.config._attn_implementation != "eager":
if self.config._attn_implementation == "sdpa" and output_attentions:
logger.warning_once(
"`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to "
'eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
)
else:
attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
attn_output, attn_weights = attention_interface(
self,

View File

@ -16,13 +16,10 @@ from ...activations import ACT2FN
from ...modeling_layers import GradientCheckpointingLayer
from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
from ...utils import auto_docstring, can_return_tuple, logging, torch_int
from ...utils import auto_docstring, can_return_tuple, torch_int
from .configuration_multimodal2 import Multimodal2Config, Multimodal2TextConfig, Multimodal2VisionConfig
logger = logging.get_logger(__name__)
def eager_attention_forward(
module: nn.Module,
query: torch.Tensor,
@ -100,13 +97,7 @@ class Multimodal2VisionAttention(nn.Module):
attention_interface: Callable = eager_attention_forward
if self.config._attn_implementation != "eager":
if self.config._attn_implementation == "sdpa" and output_attentions:
logger.warning_once(
"`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to "
'eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
)
else:
attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
attn_output, attn_weights = attention_interface(
self,
@ -196,13 +187,7 @@ class Multimodal2Attention(nn.Module):
attention_interface: Callable = eager_attention_forward
if self.config._attn_implementation != "eager":
if self.config._attn_implementation == "sdpa" and output_attentions:
logger.warning_once(
"`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to "
'eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
)
else:
attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
attn_output, attn_weights = attention_interface(
self,

View File

@ -220,7 +220,7 @@ class MyNewModel2DecoderLayer(GradientCheckpointingLayer):
cache_position: Optional[torch.LongTensor] = None,
position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC
**kwargs: Unpack[TransformersKwargs],
) -> tuple[torch.Tensor]:
) -> torch.Tensor:
residual = hidden_states
hidden_states = self.input_layernorm(hidden_states)
# Self Attention

View File

@ -10,7 +10,7 @@ from typing import ClassVar, Optional, Union
import torch
from torch import nn
from ...cache_utils import Cache, HybridCache, StaticCache
from ...cache_utils import Cache, StaticCache
from ...generation import GenerationMixin
from ...modeling_flash_attention_utils import FlashAttentionKwargs
from ...modeling_outputs import BaseModelOutputWithPast
@ -93,7 +93,7 @@ class NewTaskModelPreTrainedModel(PreTrainedModel):
_no_split_modules = ["NewTaskModelMultiModalProjector"]
_skip_keys_device_placement = "past_key_values"
_can_compile_fullgraph = True
_can_compile_fullgraph = False
_supports_flash_attn = True
_supports_sdpa = True
_supports_flex_attn = True
@ -166,8 +166,6 @@ class NewTaskModelModel(NewTaskModelPreTrainedModel):
inputs_lead_dim, sequence_length = input_tensor.shape[:2]
if using_static_cache:
target_length = past_key_values.get_max_cache_shape()
elif isinstance(past_key_values, HybridCache):
target_length = past_key_values.get_max_cache_shape()
else:
target_length = (
attention_mask.shape[-1]
@ -256,8 +254,8 @@ class NewTaskModelModel(NewTaskModelPreTrainedModel):
@auto_docstring
def forward(
self,
input_ids: torch.LongTensor = None,
pixel_values: torch.FloatTensor = None,
input_ids: Optional[torch.LongTensor] = None,
pixel_values: Optional[torch.FloatTensor] = None,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_values: Optional[Union[list[torch.FloatTensor], Cache]] = None,
@ -505,7 +503,8 @@ class NewTaskModelForNewTask(NewTaskModelPreTrainedModel, GenerationMixin):
if cache_position[0] == 0:
model_inputs["pixel_values"] = pixel_values
is_training = token_type_ids is not None and labels is not None
if cache_position[0] == 0 and isinstance(past_key_values, HybridCache):
is_static_hybrid_cache = isinstance(past_key_values, StaticCache) and any(past_key_values.is_sliding)
if cache_position[0] == 0 and is_static_hybrid_cache:
input_tensor = inputs_embeds if inputs_embeds is not None else input_ids
causal_mask = self.model._update_causal_mask(
attention_mask, token_type_ids, past_key_values, cache_position, input_tensor, is_training

View File

@ -5,12 +5,10 @@
# modular_roberta.py file directly. One of our CI enforces this.
# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
import math
import os
from typing import Optional, Union
import torch
import torch.nn as nn
from packaging import version
from ...activations import ACT2FN
from ...cache_utils import Cache, DynamicCache, EncoderDecoderCache
@ -19,7 +17,7 @@ from ...modeling_layers import GradientCheckpointingLayer
from ...modeling_outputs import BaseModelOutputWithPastAndCrossAttentions, BaseModelOutputWithPoolingAndCrossAttentions
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer
from ...utils import auto_docstring, get_torch_version, logging
from ...utils import auto_docstring, logging
from ...utils.deprecation import deprecate_kwarg
from .configuration_roberta import RobertaConfig
@ -38,8 +36,6 @@ class RobertaEmbeddings(nn.Module):
)
self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)
# self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
# any TensorFlow checkpoint file
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
# position_ids (1, len position emb) is contiguous in memory and exported when serialized
@ -231,7 +227,6 @@ class RobertaSdpaSelfAttention(RobertaSelfAttention):
def __init__(self, config, position_embedding_type=None, layer_idx=None):
super().__init__(config, position_embedding_type=position_embedding_type, layer_idx=layer_idx)
self.dropout_prob = config.attention_probs_dropout_prob
self.require_contiguous_qkv = version.parse(get_torch_version()) < version.parse("2.2.0")
# Adapted from RobertaSelfAttention
@deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
@ -311,14 +306,6 @@ class RobertaSdpaSelfAttention(RobertaSelfAttention):
if is_cross_attention and isinstance(past_key_values, EncoderDecoderCache):
past_key_values.is_updated[self.layer_idx] = True
# SDPA with memory-efficient backend is broken in torch==2.1.2 when using non-contiguous inputs and a custom
# attn_mask, so we need to call `.contiguous()` here. This was fixed in torch==2.2.0.
# Reference: https://github.com/pytorch/pytorch/issues/112577
if self.require_contiguous_qkv and query_layer.device.type == "cuda" and attention_mask is not None:
query_layer = query_layer.contiguous()
key_layer = key_layer.contiguous()
value_layer = value_layer.contiguous()
# We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` if statement instead of an inline conditional assignment
# in SDPA to support both torch.compile's dynamic shapes and full graph options. An inline conditional prevents dynamic shapes from compiling.
# The tgt_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create
@ -658,83 +645,9 @@ class RobertaLMPredictionHead(nn.Module):
return hidden_states
def load_tf_weights_in_roberta(model, config, tf_checkpoint_path):
"""Load tf checkpoints in a pytorch model."""
try:
import re
import numpy as np
import tensorflow as tf
except ImportError:
logger.error(
"Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see "
"https://www.tensorflow.org/install/ for installation instructions."
)
raise
tf_path = os.path.abspath(tf_checkpoint_path)
logger.info(f"Converting TensorFlow checkpoint from {tf_path}")
# Load weights from TF model
init_vars = tf.train.list_variables(tf_path)
names = []
arrays = []
for name, shape in init_vars:
logger.info(f"Loading TF weight {name} with shape {shape}")
array = tf.train.load_variable(tf_path, name)
names.append(name)
arrays.append(array)
for name, array in zip(names, arrays):
name = name.split("/")
# adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v
# which are not required for using pretrained model
if any(
n in ["adam_v", "adam_m", "AdamWeightDecayOptimizer", "AdamWeightDecayOptimizer_1", "global_step"]
for n in name
):
logger.info(f"Skipping {'/'.join(name)}")
continue
pointer = model
for m_name in name:
if re.fullmatch(r"[A-Za-z]+_\d+", m_name):
scope_names = re.split(r"_(\d+)", m_name)
else:
scope_names = [m_name]
if scope_names[0] == "kernel" or scope_names[0] == "gamma":
pointer = getattr(pointer, "weight")
elif scope_names[0] == "output_bias" or scope_names[0] == "beta":
pointer = getattr(pointer, "bias")
elif scope_names[0] == "output_weights":
pointer = getattr(pointer, "weight")
elif scope_names[0] == "squad":
pointer = getattr(pointer, "classifier")
else:
try:
pointer = getattr(pointer, scope_names[0])
except AttributeError:
logger.info(f"Skipping {'/'.join(name)}")
continue
if len(scope_names) >= 2:
num = int(scope_names[1])
pointer = pointer[num]
if m_name[-11:] == "_embeddings":
pointer = getattr(pointer, "weight")
elif m_name == "kernel":
array = np.transpose(array)
try:
if pointer.shape != array.shape:
raise ValueError(f"Pointer shape {pointer.shape} and array shape {array.shape} mismatched")
except ValueError as e:
e.args += (pointer.shape, array.shape)
raise
logger.info(f"Initialize PyTorch weight {name}")
pointer.data = torch.from_numpy(array)
return model
@auto_docstring
class RobertaPreTrainedModel(PreTrainedModel):
config: RobertaConfig
load_tf_weights = load_tf_weights_in_roberta
base_model_prefix = "roberta"
supports_gradient_checkpointing = True
_supports_sdpa = True
@ -742,8 +655,6 @@ class RobertaPreTrainedModel(PreTrainedModel):
def _init_weights(self, module):
"""Initialize the weights"""
if isinstance(module, nn.Linear):
# Slightly different from the TF version which uses truncated_normal for initialization
# cf https://github.com/pytorch/pytorch/pull/5617
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
if module.bias is not None:
module.bias.data.zero_()

View File

@ -46,6 +46,8 @@ class SuperRMSNorm(nn.Module):
class SuperRotaryEmbedding(nn.Module):
inv_freq: torch.Tensor # fix linting for `register_buffer`
def __init__(self, config: SuperConfig, device=None):
super().__init__()
# BC: "rope_type" was originally "type"
@ -260,7 +262,7 @@ class SuperDecoderLayer(GradientCheckpointingLayer):
cache_position: Optional[torch.LongTensor] = None,
position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC
**kwargs: Unpack[TransformersKwargs],
) -> tuple[torch.Tensor]:
) -> torch.Tensor:
residual = hidden_states
hidden_states = self.input_layernorm(hidden_states)
# Self Attention

View File

@ -846,8 +846,6 @@ class TestDetrPreTrainedModel(PreTrainedModel):
nn.init.xavier_uniform_(module.output_proj.weight.data)
nn.init.constant_(module.output_proj.bias.data, 0.0)
elif isinstance(module, (nn.Linear, nn.Conv2d, nn.BatchNorm2d)):
# Slightly different from the TF version which uses truncated_normal for initialization
# cf https://github.com/pytorch/pytorch/pull/5617
module.weight.data.normal_(mean=0.0, std=std)
if module.bias is not None:
module.bias.data.zero_()

View File

@ -185,7 +185,7 @@ def postprocess_qa_predictions(
if len(predictions) == 0 or (len(predictions) == 1 and predictions[0]["text"] == ""):
predictions.insert(0, {"text": "empty", "start_logit": 0.0, "end_logit": 0.0, "score": 0.0})
# Compute the softmax of all scores (we do it with numpy to stay independent from torch/tf in this file, using
# Compute the softmax of all scores (we do it with numpy to stay independent from torch in this file, using
# the LogSumExp trick).
scores = np.array([pred.pop("score") for pred in predictions])
exp_scores = np.exp(scores - np.max(scores))
@ -392,7 +392,7 @@ def postprocess_qa_predictions_with_beam_search(
min_null_score = -2e-6
predictions.insert(0, {"text": "", "start_logit": -1e-6, "end_logit": -1e-6, "score": min_null_score})
# Compute the softmax of all scores (we do it with numpy to stay independent from torch/tf in this file, using
# Compute the softmax of all scores (we do it with numpy to stay independent from torch in this file, using
# the LogSumExp trick).
scores = np.array([pred.pop("score") for pred in predictions])
exp_scores = np.exp(scores - np.max(scores))

View File

@ -109,7 +109,6 @@ _deps = [
"faiss-cpu",
"fastapi",
"filelock",
"flax>=0.4.1,<=0.7.0",
"ftfy",
"fugashi>=1.0",
"GitPython<3.1.19",
@ -118,13 +117,8 @@ _deps = [
"huggingface-hub>=0.34.0,<1.0",
"importlib_metadata",
"ipadic>=1.0.0,<2.0",
"jax>=0.4.1,<=0.4.13",
"jaxlib>=0.4.1,<=0.4.13",
"jinja2>=3.1.0",
"kenlm",
# Keras pin - this is to make sure Keras 3 doesn't destroy us. Remove or change when we have proper support.
"keras>2.9,<2.16",
"keras-nlp>=0.3.1,<0.14.0", # keras-nlp 0.14 doesn't support keras 2, see pin on keras.
"kernels>=0.6.1,<=0.9",
"librosa",
"natten>=0.14.6,<0.15.0",
@ -170,19 +164,13 @@ _deps = [
"sagemaker>=2.31.0",
"schedulefree>=1.2.6",
"scikit-learn",
"scipy<1.13.0", # SciPy >= 1.13.0 is not supported with the current jax pin (`jax>=0.4.1,<=0.4.13`)
"scipy",
"sentencepiece>=0.1.91,!=0.1.92",
"sigopt",
"starlette",
"sudachipy>=0.6.6",
"sudachidict_core>=20220729",
"tensorboard",
# TensorFlow pin. When changing this value, update examples/tensorflow/_tests_requirements.txt accordingly
"tensorflow-cpu>2.9,<2.16",
"tensorflow>2.9,<2.16",
"tensorflow-text<2.16",
"tensorflow-probability<0.24",
"tf2onnx",
"timeout-decorator",
"tiktoken",
"timm<=1.0.19,!=1.0.18",
@ -273,32 +261,19 @@ extras = {}
extras["ja"] = deps_list("fugashi", "ipadic", "unidic_lite", "unidic", "sudachipy", "sudachidict_core", "rhoknp")
extras["sklearn"] = deps_list("scikit-learn")
extras["tf"] = deps_list("tensorflow", "onnxconverter-common", "tf2onnx", "tensorflow-text", "keras-nlp")
extras["tf-cpu"] = deps_list(
"keras",
"tensorflow-cpu",
"onnxconverter-common",
"tf2onnx",
"tensorflow-text",
"keras-nlp",
"tensorflow-probability",
)
extras["torch"] = deps_list("torch", "accelerate")
extras["accelerate"] = deps_list("accelerate")
extras["hf_xet"] = deps_list("hf_xet")
if os.name == "nt": # windows
extras["retrieval"] = deps_list("datasets") # faiss is not supported on windows
extras["flax"] = [] # jax is not supported on windows
else:
extras["retrieval"] = deps_list("faiss-cpu", "datasets")
extras["flax"] = deps_list("jax", "jaxlib", "flax", "optax", "scipy")
extras["tokenizers"] = deps_list("tokenizers")
extras["ftfy"] = deps_list("ftfy")
extras["onnxruntime"] = deps_list("onnxruntime", "onnxruntime-tools")
extras["onnx"] = deps_list("onnxconverter-common", "tf2onnx") + extras["onnxruntime"]
extras["onnx"] = deps_list("onnxconverter-common") + extras["onnxruntime"]
extras["modelcreation"] = deps_list("cookiecutter")
extras["sagemaker"] = deps_list("sagemaker")
@ -320,8 +295,6 @@ extras["audio"] = deps_list(
# `pip install ".[speech]"` is deprecated and `pip install ".[torch-speech]"` should be used instead
extras["speech"] = deps_list("torchaudio") + extras["audio"]
extras["torch-speech"] = deps_list("torchaudio") + extras["audio"]
extras["tf-speech"] = extras["audio"]
extras["flax-speech"] = extras["audio"]
extras["vision"] = deps_list("Pillow")
extras["timm"] = deps_list("timm")
extras["torch-vision"] = deps_list("torchvision") + extras["vision"]
@ -372,9 +345,7 @@ extras["ruff"] = deps_list("ruff")
extras["quality"] = deps_list("datasets", "ruff", "GitPython", "urllib3", "libcst", "rich", "pandas")
extras["all"] = (
extras["tf"]
+ extras["torch"]
+ extras["flax"]
extras["torch"]
+ extras["sentencepiece"]
+ extras["tokenizers"]
+ extras["torch-speech"]
@ -409,18 +380,7 @@ extras["dev-torch"] = (
+ extras["onnxruntime"]
+ extras["num2words"]
)
extras["dev-tensorflow"] = (
extras["testing"]
+ extras["tf"]
+ extras["sentencepiece"]
+ extras["tokenizers"]
+ extras["vision"]
+ extras["quality"]
+ extras["sklearn"]
+ extras["modelcreation"]
+ extras["onnx"]
+ extras["tf-speech"]
)
extras["dev"] = (
extras["all"] + extras["testing"] + extras["quality"] + extras["ja"] + extras["sklearn"] + extras["modelcreation"]
)
@ -464,10 +424,10 @@ setup(
version="4.57.0.dev0", # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots)
author="The Hugging Face team (past and future) with the help of all our contributors (https://github.com/huggingface/transformers/graphs/contributors)",
author_email="transformers@huggingface.co",
description="State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow",
description="Transformers: the model-definition framework for state-of-the-art machine learning models in text, vision, audio, and multimodal models, for both inference and training.",
long_description=open("README.md", "r", encoding="utf-8").read(),
long_description_content_type="text/markdown",
keywords="NLP vision speech deep learning transformer pytorch tensorflow jax BERT GPT-2 Wav2Vec2 ViT",
keywords="machine-learning nlp python pytorch transformer llm vlm deep-learning inference training model-hub pretrained-models llama gemma qwen",
license="Apache 2.0 License",
url="https://github.com/huggingface/transformers",
package_dir={"": "src"},
@ -503,14 +463,10 @@ setup(
)
extras["tests_torch"] = deps_list()
extras["tests_tf"] = deps_list()
extras["tests_flax"] = deps_list()
extras["tests_hub"] = deps_list()
extras["tests_pipelines_torch"] = deps_list()
extras["tests_pipelines_tf"] = deps_list()
extras["tests_onnx"] = deps_list()
extras["tests_examples_torch"] = deps_list()
extras["tests_examples_tf"] = deps_list()
extras["tests_custom_tokenizers"] = deps_list()
extras["tests_exotic_models"] = deps_list()
extras["consistency"] = deps_list()

View File

@ -40,13 +40,9 @@ from .utils import (
# so that mypy, pylint or other static linters can recognize them,
# given that they are not exported using `__all__` in this file.
from .utils import is_bitsandbytes_available as is_bitsandbytes_available
from .utils import is_flax_available as is_flax_available
from .utils import is_keras_nlp_available as is_keras_nlp_available
from .utils import is_scipy_available as is_scipy_available
from .utils import is_sentencepiece_available as is_sentencepiece_available
from .utils import is_speech_available as is_speech_available
from .utils import is_tensorflow_text_available as is_tensorflow_text_available
from .utils import is_tf_available as is_tf_available
from .utils import is_timm_available as is_timm_available
from .utils import is_tokenizers_available as is_tokenizers_available
from .utils import is_torch_available as is_torch_available
@ -64,9 +60,7 @@ _import_structure = {
"audio_utils": [],
"commands": [],
"configuration_utils": ["PretrainedConfig"],
"convert_graph_to_onnx": [],
"convert_slow_tokenizers_checkpoints_to_fast": [],
"convert_tf_hub_seq_to_seq_bert_to_pytorch": [],
"data": [
"DataProcessor",
"InputExample",
@ -137,16 +131,6 @@ _import_structure = {
],
"loss": [],
"modelcard": ["ModelCard"],
# Losses
"modeling_tf_pytorch_utils": [
"convert_tf_weight_name_to_pt_weight_name",
"load_pytorch_checkpoint_in_tf2_model",
"load_pytorch_model_in_tf2_model",
"load_pytorch_weights_in_tf2_model",
"load_tf2_checkpoint_in_pytorch_model",
"load_tf2_model_in_pytorch_model",
"load_tf2_weights_in_pytorch_model",
],
# Models
"onnx": [],
"pipelines": [
@ -218,15 +202,12 @@ _import_structure = {
],
"training_args": ["TrainingArguments"],
"training_args_seq2seq": ["Seq2SeqTrainingArguments"],
"training_args_tf": ["TFTrainingArguments"],
"utils": [
"CONFIG_NAME",
"MODEL_CARD_NAME",
"PYTORCH_PRETRAINED_BERT_CACHE",
"PYTORCH_TRANSFORMERS_CACHE",
"SPIECE_UNDERLINE",
"TF2_WEIGHTS_NAME",
"TF_WEIGHTS_NAME",
"TRANSFORMERS_CACHE",
"WEIGHTS_NAME",
"TensorType",
@ -237,8 +218,6 @@ _import_structure = {
"is_bitsandbytes_available",
"is_datasets_available",
"is_faiss_available",
"is_flax_available",
"is_keras_nlp_available",
"is_matplotlib_available",
"is_mlx_available",
"is_phonemizer_available",
@ -251,8 +230,6 @@ _import_structure = {
"is_sentencepiece_available",
"is_sklearn_available",
"is_speech_available",
"is_tensorflow_text_available",
"is_tf_available",
"is_timm_available",
"is_tokenizers_available",
"is_torch_available",
@ -501,84 +478,6 @@ else:
_import_structure["trainer_pt_utils"] = ["torch_distributed_zero_first"]
_import_structure["trainer_seq2seq"] = ["Seq2SeqTrainer"]
# TensorFlow-backed objects
try:
if not is_tf_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
from .utils import dummy_tf_objects
_import_structure["utils.dummy_tf_objects"] = [name for name in dir(dummy_tf_objects) if not name.startswith("_")]
else:
_import_structure["activations_tf"] = []
_import_structure["generation"].extend(
[
"TFForcedBOSTokenLogitsProcessor",
"TFForcedEOSTokenLogitsProcessor",
"TFForceTokensLogitsProcessor",
"TFGenerationMixin",
"TFLogitsProcessor",
"TFLogitsProcessorList",
"TFLogitsWarper",
"TFMinLengthLogitsProcessor",
"TFNoBadWordsLogitsProcessor",
"TFNoRepeatNGramLogitsProcessor",
"TFRepetitionPenaltyLogitsProcessor",
"TFSuppressTokensAtBeginLogitsProcessor",
"TFSuppressTokensLogitsProcessor",
"TFTemperatureLogitsWarper",
"TFTopKLogitsWarper",
"TFTopPLogitsWarper",
]
)
_import_structure["keras_callbacks"] = ["KerasMetricCallback", "PushToHubCallback"]
_import_structure["modeling_tf_outputs"] = []
_import_structure["modeling_tf_utils"] = [
"TFPreTrainedModel",
"TFSequenceSummary",
"TFSharedEmbeddings",
"shape_list",
]
_import_structure["optimization_tf"] = [
"AdamWeightDecay",
"GradientAccumulator",
"WarmUp",
"create_optimizer",
]
_import_structure["tf_utils"] = []
# FLAX-backed objects
try:
if not is_flax_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
from .utils import dummy_flax_objects
_import_structure["utils.dummy_flax_objects"] = [
name for name in dir(dummy_flax_objects) if not name.startswith("_")
]
else:
_import_structure["generation"].extend(
[
"FlaxForcedBOSTokenLogitsProcessor",
"FlaxForcedEOSTokenLogitsProcessor",
"FlaxForceTokensLogitsProcessor",
"FlaxGenerationMixin",
"FlaxLogitsProcessor",
"FlaxLogitsProcessorList",
"FlaxLogitsWarper",
"FlaxMinLengthLogitsProcessor",
"FlaxTemperatureLogitsWarper",
"FlaxSuppressTokensAtBeginLogitsProcessor",
"FlaxSuppressTokensLogitsProcessor",
"FlaxTopKLogitsWarper",
"FlaxTopPLogitsWarper",
"FlaxWhisperTimeStampLogitsProcessor",
]
)
_import_structure["modeling_flax_outputs"] = []
_import_structure["modeling_flax_utils"] = ["FlaxPreTrainedModel"]
# Direct imports for type-checking
if TYPE_CHECKING:
@ -672,20 +571,6 @@ if TYPE_CHECKING:
from .generation import EpsilonLogitsWarper as EpsilonLogitsWarper
from .generation import EtaLogitsWarper as EtaLogitsWarper
from .generation import ExponentialDecayLengthPenalty as ExponentialDecayLengthPenalty
from .generation import FlaxForcedBOSTokenLogitsProcessor as FlaxForcedBOSTokenLogitsProcessor
from .generation import FlaxForcedEOSTokenLogitsProcessor as FlaxForcedEOSTokenLogitsProcessor
from .generation import FlaxForceTokensLogitsProcessor as FlaxForceTokensLogitsProcessor
from .generation import FlaxGenerationMixin as FlaxGenerationMixin
from .generation import FlaxLogitsProcessor as FlaxLogitsProcessor
from .generation import FlaxLogitsProcessorList as FlaxLogitsProcessorList
from .generation import FlaxLogitsWarper as FlaxLogitsWarper
from .generation import FlaxMinLengthLogitsProcessor as FlaxMinLengthLogitsProcessor
from .generation import FlaxSuppressTokensAtBeginLogitsProcessor as FlaxSuppressTokensAtBeginLogitsProcessor
from .generation import FlaxSuppressTokensLogitsProcessor as FlaxSuppressTokensLogitsProcessor
from .generation import FlaxTemperatureLogitsWarper as FlaxTemperatureLogitsWarper
from .generation import FlaxTopKLogitsWarper as FlaxTopKLogitsWarper
from .generation import FlaxTopPLogitsWarper as FlaxTopPLogitsWarper
from .generation import FlaxWhisperTimeStampLogitsProcessor as FlaxWhisperTimeStampLogitsProcessor
from .generation import ForcedBOSTokenLogitsProcessor as ForcedBOSTokenLogitsProcessor
from .generation import ForcedEOSTokenLogitsProcessor as ForcedEOSTokenLogitsProcessor
from .generation import GenerationConfig as GenerationConfig
@ -716,22 +601,6 @@ if TYPE_CHECKING:
from .generation import TemperatureLogitsWarper as TemperatureLogitsWarper
from .generation import TextIteratorStreamer as TextIteratorStreamer
from .generation import TextStreamer as TextStreamer
from .generation import TFForcedBOSTokenLogitsProcessor as TFForcedBOSTokenLogitsProcessor
from .generation import TFForcedEOSTokenLogitsProcessor as TFForcedEOSTokenLogitsProcessor
from .generation import TFForceTokensLogitsProcessor as TFForceTokensLogitsProcessor
from .generation import TFGenerationMixin as TFGenerationMixin
from .generation import TFLogitsProcessor as TFLogitsProcessor
from .generation import TFLogitsProcessorList as TFLogitsProcessorList
from .generation import TFLogitsWarper as TFLogitsWarper
from .generation import TFMinLengthLogitsProcessor as TFMinLengthLogitsProcessor
from .generation import TFNoBadWordsLogitsProcessor as TFNoBadWordsLogitsProcessor
from .generation import TFNoRepeatNGramLogitsProcessor as TFNoRepeatNGramLogitsProcessor
from .generation import TFRepetitionPenaltyLogitsProcessor as TFRepetitionPenaltyLogitsProcessor
from .generation import TFSuppressTokensAtBeginLogitsProcessor as TFSuppressTokensAtBeginLogitsProcessor
from .generation import TFSuppressTokensLogitsProcessor as TFSuppressTokensLogitsProcessor
from .generation import TFTemperatureLogitsWarper as TFTemperatureLogitsWarper
from .generation import TFTopKLogitsWarper as TFTopKLogitsWarper
from .generation import TFTopPLogitsWarper as TFTopPLogitsWarper
from .generation import TopKLogitsWarper as TopKLogitsWarper
from .generation import TopPLogitsWarper as TopPLogitsWarper
from .generation import TypicalLogitsWarper as TypicalLogitsWarper
@ -763,32 +632,14 @@ if TYPE_CHECKING:
from .integrations import is_wandb_available as is_wandb_available
from .integrations.executorch import TorchExportableModuleWithStaticCache as TorchExportableModuleWithStaticCache
from .integrations.executorch import convert_and_export_with_cache as convert_and_export_with_cache
from .keras_callbacks import KerasMetricCallback as KerasMetricCallback
from .keras_callbacks import PushToHubCallback as PushToHubCallback
from .masking_utils import AttentionMaskInterface as AttentionMaskInterface
from .model_debugging_utils import model_addition_debugger_context as model_addition_debugger_context
# Model Cards
from .modelcard import ModelCard as ModelCard
from .modeling_flax_utils import FlaxPreTrainedModel as FlaxPreTrainedModel
from .modeling_layers import GradientCheckpointingLayer as GradientCheckpointingLayer
from .modeling_rope_utils import ROPE_INIT_FUNCTIONS as ROPE_INIT_FUNCTIONS
from .modeling_rope_utils import dynamic_rope_update as dynamic_rope_update
# TF 2.0 <=> PyTorch conversion utilities
from .modeling_tf_pytorch_utils import (
convert_tf_weight_name_to_pt_weight_name as convert_tf_weight_name_to_pt_weight_name,
)
from .modeling_tf_pytorch_utils import load_pytorch_checkpoint_in_tf2_model as load_pytorch_checkpoint_in_tf2_model
from .modeling_tf_pytorch_utils import load_pytorch_model_in_tf2_model as load_pytorch_model_in_tf2_model
from .modeling_tf_pytorch_utils import load_pytorch_weights_in_tf2_model as load_pytorch_weights_in_tf2_model
from .modeling_tf_pytorch_utils import load_tf2_checkpoint_in_pytorch_model as load_tf2_checkpoint_in_pytorch_model
from .modeling_tf_pytorch_utils import load_tf2_model_in_pytorch_model as load_tf2_model_in_pytorch_model
from .modeling_tf_pytorch_utils import load_tf2_weights_in_pytorch_model as load_tf2_weights_in_pytorch_model
from .modeling_tf_utils import TFPreTrainedModel as TFPreTrainedModel
from .modeling_tf_utils import TFSequenceSummary as TFSequenceSummary
from .modeling_tf_utils import TFSharedEmbeddings as TFSharedEmbeddings
from .modeling_tf_utils import shape_list as shape_list
from .modeling_utils import AttentionInterface as AttentionInterface
from .modeling_utils import PreTrainedModel as PreTrainedModel
from .models import *
@ -815,12 +666,6 @@ if TYPE_CHECKING:
from .optimization import get_scheduler as get_scheduler
from .optimization import get_wsd_schedule as get_wsd_schedule
# Optimization
from .optimization_tf import AdamWeightDecay as AdamWeightDecay
from .optimization_tf import GradientAccumulator as GradientAccumulator
from .optimization_tf import WarmUp as WarmUp
from .optimization_tf import create_optimizer as create_optimizer
# Pipelines
from .pipelines import AudioClassificationPipeline as AudioClassificationPipeline
from .pipelines import AutomaticSpeechRecognitionPipeline as AutomaticSpeechRecognitionPipeline
@ -894,7 +739,6 @@ if TYPE_CHECKING:
from .trainer_utils import set_seed as set_seed
from .training_args import TrainingArguments as TrainingArguments
from .training_args_seq2seq import Seq2SeqTrainingArguments as Seq2SeqTrainingArguments
from .training_args_tf import TFTrainingArguments as TFTrainingArguments
# Files and general utilities
from .utils import CONFIG_NAME as CONFIG_NAME
@ -902,8 +746,6 @@ if TYPE_CHECKING:
from .utils import PYTORCH_PRETRAINED_BERT_CACHE as PYTORCH_PRETRAINED_BERT_CACHE
from .utils import PYTORCH_TRANSFORMERS_CACHE as PYTORCH_TRANSFORMERS_CACHE
from .utils import SPIECE_UNDERLINE as SPIECE_UNDERLINE
from .utils import TF2_WEIGHTS_NAME as TF2_WEIGHTS_NAME
from .utils import TF_WEIGHTS_NAME as TF_WEIGHTS_NAME
from .utils import TRANSFORMERS_CACHE as TRANSFORMERS_CACHE
from .utils import WEIGHTS_NAME as WEIGHTS_NAME
from .utils import TensorType as TensorType
@ -968,9 +810,7 @@ else:
)
if not is_tf_available() and not is_torch_available() and not is_flax_available():
if not is_torch_available():
logger.warning_advice(
"None of PyTorch, TensorFlow >= 2.0, or Flax have been found. "
"Models won't be available and only tokenizers, configuration "
"and file/data utilities can be used."
"PyTorch was not found. Models won't be available and only tokenizers, configuration and file/data utilities can be used."
)

View File

@ -1,147 +0,0 @@
# Copyright 2020 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
import tensorflow as tf
from packaging.version import parse
try:
import tf_keras as keras
except (ModuleNotFoundError, ImportError):
import keras
if parse(keras.__version__).major > 2:
raise ValueError(
"Your currently installed version of Keras is Keras 3, but this is not yet supported in "
"Transformers. Please install the backwards-compatible tf-keras package with "
"`pip install tf-keras`."
)
def _gelu(x):
"""
Gaussian Error Linear Unit. Original Implementation of the gelu activation function in Google Bert repo when
initially created. For information: OpenAI GPT's gelu is slightly different (and gives slightly different results):
0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))) Also see
https://huggingface.co/papers/1606.08415
"""
x = tf.convert_to_tensor(x)
cdf = 0.5 * (1.0 + tf.math.erf(x / tf.cast(tf.sqrt(2.0), x.dtype)))
return x * cdf
def _gelu_new(x):
"""
Gaussian Error Linear Unit. This is a smoother version of the GELU. Original paper: https://huggingface.co/papers/1606.0841
Args:
x: float Tensor to perform activation
Returns:
`x` with the GELU activation applied.
"""
x = tf.convert_to_tensor(x)
pi = tf.cast(math.pi, x.dtype)
coeff = tf.cast(0.044715, x.dtype)
cdf = 0.5 * (1.0 + tf.tanh(tf.sqrt(2.0 / pi) * (x + coeff * tf.pow(x, 3))))
return x * cdf
def mish(x):
x = tf.convert_to_tensor(x)
return x * tf.tanh(tf.math.softplus(x))
def gelu_fast(x):
x = tf.convert_to_tensor(x)
coeff1 = tf.cast(0.044715, x.dtype)
coeff2 = tf.cast(0.7978845608, x.dtype)
return 0.5 * x * (1.0 + tf.tanh(x * coeff2 * (1.0 + coeff1 * x * x)))
def quick_gelu(x):
x = tf.convert_to_tensor(x)
coeff = tf.cast(1.702, x.dtype)
return x * tf.math.sigmoid(coeff * x)
def gelu_10(x):
"""
Clip the range of possible GeLU outputs between [-10, 10]. This is especially useful for quantization purpose, as
it allows mapping 2 negatives values in the GeLU spectrum. For more information on this trick, please refer to
https://huggingface.co/papers/2004.09602
Gaussian Error Linear Unit. Original Implementation of the gelu activation function in Google Bert repo when
initially created. For information: OpenAI GPT's gelu is slightly different (and gives slightly different results):
0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))) Also see
https://huggingface.co/papers/1606.08415 :param x: :return:
"""
return tf.clip_by_value(_gelu(x), -10, 10)
def glu(x, axis=-1):
"""
Gated Linear Unit. Implementation as defined in the original paper (see https://huggingface.co/papers/1612.08083), where
the input `x` is split in two halves across a dimension (`axis`), A and B, returning A * sigmoid(B).
Args:
`x`: float Tensor to perform activation
`axis`: dimension across which `x` be split in half
Returns:
`x` with the GLU activation applied (with its size halved across the dimension `axis`).
"""
a, b = tf.split(x, 2, axis=axis)
return a * tf.math.sigmoid(b)
if parse(tf.version.VERSION) >= parse("2.4"):
def approximate_gelu_wrap(x):
return keras.activations.gelu(x, approximate=True)
gelu = keras.activations.gelu
gelu_new = approximate_gelu_wrap
else:
gelu = _gelu
gelu_new = _gelu_new
ACT2FN = {
"gelu": gelu,
"gelu_10": gelu_10,
"gelu_fast": gelu_fast,
"gelu_new": gelu_new,
"glu": glu,
"mish": mish,
"quick_gelu": quick_gelu,
"relu": keras.activations.relu,
"sigmoid": keras.activations.sigmoid,
"silu": keras.activations.swish,
"swish": keras.activations.swish,
"tanh": keras.activations.tanh,
}
def get_tf_activation(activation_string):
if activation_string in ACT2FN:
return ACT2FN[activation_string]
else:
raise KeyError(f"function {activation_string} not found in ACT2FN mapping {list(ACT2FN.keys())}")

View File

@ -1,165 +0,0 @@
# Copyright 2020 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from argparse import ArgumentParser, Namespace
from ..utils import logging
from . import BaseTransformersCLICommand
def convert_command_factory(args: Namespace):
"""
Factory function used to convert a model TF 1.0 checkpoint in a PyTorch checkpoint.
Returns: ServeCommand
"""
return ConvertCommand(
args.model_type, args.tf_checkpoint, args.pytorch_dump_output, args.config, args.finetuning_task_name
)
IMPORT_ERROR_MESSAGE = """
transformers can only be used from the commandline to convert TensorFlow models in PyTorch, In that case, it requires
TensorFlow to be installed. Please see https://www.tensorflow.org/install/ for installation instructions.
"""
class ConvertCommand(BaseTransformersCLICommand):
@staticmethod
def register_subcommand(parser: ArgumentParser):
"""
Register this command to argparse so it's available for the transformer-cli
Args:
parser: Root parser to register command-specific arguments
"""
train_parser = parser.add_parser(
"convert",
help="CLI tool to run convert model from original author checkpoints to Transformers PyTorch checkpoints.",
)
train_parser.add_argument("--model_type", type=str, required=True, help="Model's type.")
train_parser.add_argument(
"--tf_checkpoint", type=str, required=True, help="TensorFlow checkpoint path or folder."
)
train_parser.add_argument(
"--pytorch_dump_output", type=str, required=True, help="Path to the PyTorch saved model output."
)
train_parser.add_argument("--config", type=str, default="", help="Configuration file path or folder.")
train_parser.add_argument(
"--finetuning_task_name",
type=str,
default=None,
help="Optional fine-tuning task name if the TF model was a finetuned model.",
)
train_parser.set_defaults(func=convert_command_factory)
def __init__(
self,
model_type: str,
tf_checkpoint: str,
pytorch_dump_output: str,
config: str,
finetuning_task_name: str,
*args,
):
self._logger = logging.get_logger("transformers/converting")
self._logger.info(f"Loading model {model_type}")
self._model_type = model_type
self._tf_checkpoint = tf_checkpoint
self._pytorch_dump_output = pytorch_dump_output
self._config = config
self._finetuning_task_name = finetuning_task_name
def run(self):
if self._model_type == "albert":
try:
from ..models.albert.convert_albert_original_tf_checkpoint_to_pytorch import (
convert_tf_checkpoint_to_pytorch,
)
except ImportError:
raise ImportError(IMPORT_ERROR_MESSAGE)
convert_tf_checkpoint_to_pytorch(self._tf_checkpoint, self._config, self._pytorch_dump_output)
elif self._model_type == "bert":
try:
from ..models.bert.convert_bert_original_tf_checkpoint_to_pytorch import (
convert_tf_checkpoint_to_pytorch,
)
except ImportError:
raise ImportError(IMPORT_ERROR_MESSAGE)
convert_tf_checkpoint_to_pytorch(self._tf_checkpoint, self._config, self._pytorch_dump_output)
elif self._model_type == "funnel":
try:
from ..models.funnel.convert_funnel_original_tf_checkpoint_to_pytorch import (
convert_tf_checkpoint_to_pytorch,
)
except ImportError:
raise ImportError(IMPORT_ERROR_MESSAGE)
convert_tf_checkpoint_to_pytorch(self._tf_checkpoint, self._config, self._pytorch_dump_output)
elif self._model_type == "t5":
try:
from ..models.t5.convert_t5_original_tf_checkpoint_to_pytorch import convert_tf_checkpoint_to_pytorch
except ImportError:
raise ImportError(IMPORT_ERROR_MESSAGE)
convert_tf_checkpoint_to_pytorch(self._tf_checkpoint, self._config, self._pytorch_dump_output)
elif self._model_type == "gpt":
from ..models.openai.convert_openai_original_tf_checkpoint_to_pytorch import (
convert_openai_checkpoint_to_pytorch,
)
convert_openai_checkpoint_to_pytorch(self._tf_checkpoint, self._config, self._pytorch_dump_output)
elif self._model_type == "gpt2":
try:
from ..models.gpt2.convert_gpt2_original_tf_checkpoint_to_pytorch import (
convert_gpt2_checkpoint_to_pytorch,
)
except ImportError:
raise ImportError(IMPORT_ERROR_MESSAGE)
convert_gpt2_checkpoint_to_pytorch(self._tf_checkpoint, self._config, self._pytorch_dump_output)
elif self._model_type == "xlnet":
try:
from ..models.xlnet.convert_xlnet_original_tf_checkpoint_to_pytorch import (
convert_xlnet_checkpoint_to_pytorch,
)
except ImportError:
raise ImportError(IMPORT_ERROR_MESSAGE)
convert_xlnet_checkpoint_to_pytorch(
self._tf_checkpoint, self._config, self._pytorch_dump_output, self._finetuning_task_name
)
elif self._model_type == "xlm":
from ..models.xlm.convert_xlm_original_pytorch_checkpoint_to_pytorch import (
convert_xlm_checkpoint_to_pytorch,
)
convert_xlm_checkpoint_to_pytorch(self._tf_checkpoint, self._pytorch_dump_output)
elif self._model_type == "lxmert":
from ..models.lxmert.convert_lxmert_original_tf_checkpoint_to_pytorch import (
convert_lxmert_checkpoint_to_pytorch,
)
convert_lxmert_checkpoint_to_pytorch(self._tf_checkpoint, self._pytorch_dump_output)
elif self._model_type == "rembert":
from ..models.rembert.convert_rembert_tf_checkpoint_to_pytorch import (
convert_rembert_tf_checkpoint_to_pytorch,
)
convert_rembert_tf_checkpoint_to_pytorch(self._tf_checkpoint, self._config, self._pytorch_dump_output)
else:
raise ValueError("--model_type should be selected in the list [bert, gpt, gpt2, t5, xlnet, xlm, lxmert]")

View File

@ -26,9 +26,7 @@ from .. import __version__ as version
from ..integrations.deepspeed import is_deepspeed_available
from ..utils import (
is_accelerate_available,
is_flax_available,
is_safetensors_available,
is_tf_available,
is_torch_available,
is_torch_hpu_available,
is_torch_npu_available,
@ -109,19 +107,6 @@ class EnvironmentCommand(BaseTransformersCLICommand):
elif pt_hpu_available:
pt_accelerator = "HPU"
tf_version = "not installed"
tf_cuda_available = "NA"
if is_tf_available():
import tensorflow as tf
tf_version = tf.__version__
try:
# deprecated in v2.1
tf_cuda_available = tf.test.is_gpu_available()
except AttributeError:
# returns list of devices, convert to bool
tf_cuda_available = bool(tf.config.list_physical_devices("GPU"))
deepspeed_version = "not installed"
if is_deepspeed_available():
# Redirect command line output to silence deepspeed import output.
@ -129,20 +114,6 @@ class EnvironmentCommand(BaseTransformersCLICommand):
import deepspeed
deepspeed_version = deepspeed.__version__
flax_version = "not installed"
jax_version = "not installed"
jaxlib_version = "not installed"
jax_backend = "NA"
if is_flax_available():
import flax
import jax
import jaxlib
flax_version = flax.__version__
jax_version = jax.__version__
jaxlib_version = jaxlib.__version__
jax_backend = jax.lib.xla_bridge.get_backend().platform
info = {
"`transformers` version": version,
"Platform": platform.platform(),
@ -153,10 +124,6 @@ class EnvironmentCommand(BaseTransformersCLICommand):
"Accelerate config": f"{accelerate_config_str}",
"DeepSpeed version": f"{deepspeed_version}",
"PyTorch version (accelerator?)": f"{pt_version} ({pt_accelerator})",
"Tensorflow version (GPU?)": f"{tf_version} ({tf_cuda_available})",
"Flax version (CPU?/GPU?/TPU?)": f"{flax_version} ({jax_backend})",
"Jax version": f"{jax_version}",
"JaxLib version": f"{jaxlib_version}",
"Using distributed or parallel set-up in script?": "<fill in>",
}
if is_torch_available():

View File

@ -1,158 +0,0 @@
# Copyright 2020 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
from argparse import ArgumentParser, Namespace
from ..data import SingleSentenceClassificationProcessor as Processor
from ..pipelines import TextClassificationPipeline
from ..utils import is_tf_available, is_torch_available, logging
from . import BaseTransformersCLICommand
if not is_tf_available() and not is_torch_available():
raise RuntimeError("At least one of PyTorch or TensorFlow 2.0+ should be installed to use CLI training")
# TF training parameters
USE_XLA = False
USE_AMP = False
def train_command_factory(args: Namespace):
"""
Factory function used to instantiate training command from provided command line arguments.
Returns: TrainCommand
"""
return TrainCommand(args)
class TrainCommand(BaseTransformersCLICommand):
@staticmethod
def register_subcommand(parser: ArgumentParser):
"""
Register this command to argparse so it's available for the transformer-cli
Args:
parser: Root parser to register command-specific arguments
"""
train_parser = parser.add_parser("train", help="CLI tool to train a model on a task.")
train_parser.add_argument(
"--train_data",
type=str,
required=True,
help="path to train (and optionally evaluation) dataset as a csv with tab separated labels and sentences.",
)
train_parser.add_argument(
"--column_label", type=int, default=0, help="Column of the dataset csv file with example labels."
)
train_parser.add_argument(
"--column_text", type=int, default=1, help="Column of the dataset csv file with example texts."
)
train_parser.add_argument(
"--column_id", type=int, default=2, help="Column of the dataset csv file with example ids."
)
train_parser.add_argument(
"--skip_first_row", action="store_true", help="Skip the first row of the csv file (headers)."
)
train_parser.add_argument("--validation_data", type=str, default="", help="path to validation dataset.")
train_parser.add_argument(
"--validation_split",
type=float,
default=0.1,
help="if validation dataset is not provided, fraction of train dataset to use as validation dataset.",
)
train_parser.add_argument("--output", type=str, default="./", help="path to saved the trained model.")
train_parser.add_argument(
"--task", type=str, default="text_classification", help="Task to train the model on."
)
train_parser.add_argument(
"--model", type=str, default="google-bert/bert-base-uncased", help="Model's name or path to stored model."
)
train_parser.add_argument("--train_batch_size", type=int, default=32, help="Batch size for training.")
train_parser.add_argument("--valid_batch_size", type=int, default=64, help="Batch size for validation.")
train_parser.add_argument("--learning_rate", type=float, default=3e-5, help="Learning rate.")
train_parser.add_argument("--adam_epsilon", type=float, default=1e-08, help="Epsilon for Adam optimizer.")
train_parser.set_defaults(func=train_command_factory)
def __init__(self, args: Namespace):
self.logger = logging.get_logger("transformers/training")
self.framework = "tf" if is_tf_available() else "torch"
os.makedirs(args.output, exist_ok=True)
self.output = args.output
self.column_label = args.column_label
self.column_text = args.column_text
self.column_id = args.column_id
self.logger.info(f"Loading {args.task} pipeline for {args.model}")
if args.task == "text_classification":
self.pipeline = TextClassificationPipeline.from_pretrained(args.model)
elif args.task == "token_classification":
raise NotImplementedError
elif args.task == "question_answering":
raise NotImplementedError
self.logger.info(f"Loading dataset from {args.train_data}")
self.train_dataset = Processor.create_from_csv(
args.train_data,
column_label=args.column_label,
column_text=args.column_text,
column_id=args.column_id,
skip_first_row=args.skip_first_row,
)
self.valid_dataset = None
if args.validation_data:
self.logger.info(f"Loading validation dataset from {args.validation_data}")
self.valid_dataset = Processor.create_from_csv(
args.validation_data,
column_label=args.column_label,
column_text=args.column_text,
column_id=args.column_id,
skip_first_row=args.skip_first_row,
)
self.validation_split = args.validation_split
self.train_batch_size = args.train_batch_size
self.valid_batch_size = args.valid_batch_size
self.learning_rate = args.learning_rate
self.adam_epsilon = args.adam_epsilon
def run(self):
if self.framework == "tf":
return self.run_tf()
return self.run_torch()
def run_torch(self):
raise NotImplementedError
def run_tf(self):
self.pipeline.fit(
self.train_dataset,
validation_data=self.valid_dataset,
validation_split=self.validation_split,
learning_rate=self.learning_rate,
adam_epsilon=self.adam_epsilon,
train_batch_size=self.train_batch_size,
valid_batch_size=self.valid_batch_size,
)
# Save trained pipeline
self.pipeline.save_pretrained(self.output)

View File

@ -18,7 +18,6 @@ from transformers import HfArgumentParser
from transformers.commands.add_fast_image_processor import AddFastImageProcessorCommand
from transformers.commands.add_new_model_like import AddNewModelLikeCommand
from transformers.commands.chat import ChatCommand
from transformers.commands.convert import ConvertCommand
from transformers.commands.download import DownloadCommand
from transformers.commands.env import EnvironmentCommand
from transformers.commands.run import RunCommand
@ -39,7 +38,6 @@ def main():
# Register commands
ChatCommand.register_subcommand(commands_parser)
ConvertCommand.register_subcommand(commands_parser)
DownloadCommand.register_subcommand(commands_parser)
EnvironmentCommand.register_subcommand(commands_parser)
RunCommand.register_subcommand(commands_parser)

View File

@ -100,9 +100,8 @@ class PretrainedConfig(PushToHubMixin):
Arg:
name_or_path (`str`, *optional*, defaults to `""`):
Store the string that was passed to [`PreTrainedModel.from_pretrained`] or
[`TFPreTrainedModel.from_pretrained`] as `pretrained_model_name_or_path` if the configuration was created
with such a method.
Store the string that was passed to [`PreTrainedModel.from_pretrained`] as `pretrained_model_name_or_path`
if the configuration was created with such a method.
output_hidden_states (`bool`, *optional*, defaults to `False`):
Whether or not the model should return all hidden-states.
output_attentions (`bool`, *optional*, defaults to `False`):
@ -140,8 +139,7 @@ class PretrainedConfig(PushToHubMixin):
architectures (`list[str]`, *optional*):
Model architectures that can be used with the model pretrained weights.
finetuning_task (`str`, *optional*):
Name of the task used to fine-tune the model. This can be used when converting from an original (TensorFlow
or PyTorch) checkpoint.
Name of the task used to fine-tune the model.
id2label (`dict[int, str]`, *optional*):
A map from index (for instance prediction index, or target index) to label.
label2id (`dict[str, int]`, *optional*):
@ -346,10 +344,6 @@ class PretrainedConfig(PushToHubMixin):
logger.error(f"Can't set {key} with value {value} for {self}")
raise err
# TODO: remove later, deprecated arguments for TF models
self.tf_legacy_loss = kwargs.pop("tf_legacy_loss", False)
self.use_bfloat16 = kwargs.pop("use_bfloat16", False)
def _create_id_label_maps(self, num_labels: int):
self.id2label = {i: f"LABEL_{i}" for i in range(num_labels)}
self.label2id = dict(zip(self.id2label.values(), self.id2label.keys()))

View File

@ -1,551 +0,0 @@
# Copyright 2020 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import warnings
from argparse import ArgumentParser
from os import listdir, makedirs
from pathlib import Path
from typing import Optional
from packaging.version import Version, parse
from transformers.pipelines import Pipeline, pipeline
from transformers.tokenization_utils import BatchEncoding
from transformers.utils import ModelOutput, is_tf_available, is_torch_available
# This is the minimal required version to
# support some ONNX Runtime features
ORT_QUANTIZE_MINIMUM_VERSION = parse("1.4.0")
SUPPORTED_PIPELINES = [
"feature-extraction",
"ner",
"sentiment-analysis",
"fill-mask",
"question-answering",
"text-generation",
"translation_en_to_fr",
"translation_en_to_de",
"translation_en_to_ro",
]
class OnnxConverterArgumentParser(ArgumentParser):
"""
Wraps all the script arguments supported to export transformers models to ONNX IR
"""
def __init__(self):
super().__init__("ONNX Converter")
self.add_argument(
"--pipeline",
type=str,
choices=SUPPORTED_PIPELINES,
default="feature-extraction",
)
self.add_argument(
"--model",
type=str,
required=True,
help="Model's id or path (ex: google-bert/bert-base-cased)",
)
self.add_argument("--tokenizer", type=str, help="Tokenizer's id or path (ex: google-bert/bert-base-cased)")
self.add_argument(
"--framework",
type=str,
choices=["pt", "tf"],
help="Framework for loading the model",
)
self.add_argument("--opset", type=int, default=11, help="ONNX opset to use")
self.add_argument(
"--check-loading",
action="store_true",
help="Check ONNX is able to load the model",
)
self.add_argument(
"--use-external-format",
action="store_true",
help="Allow exporting model >= than 2Gb",
)
self.add_argument(
"--quantize",
action="store_true",
help="Quantize the neural network to be run with int8",
)
self.add_argument("output")
def generate_identified_filename(filename: Path, identifier: str) -> Path:
"""
Append a string-identifier at the end (before the extension, if any) to the provided filepath
Args:
filename: pathlib.Path The actual path object we would like to add an identifier suffix
identifier: The suffix to add
Returns: String with concatenated identifier at the end of the filename
"""
return filename.parent.joinpath(filename.stem + identifier).with_suffix(filename.suffix)
def check_onnxruntime_requirements(minimum_version: Version):
"""
Check onnxruntime is installed and if the installed version match is recent enough
Raises:
ImportError: If onnxruntime is not installed or too old version is found
"""
try:
import onnxruntime
# Parse the version of the installed onnxruntime
ort_version = parse(onnxruntime.__version__)
# We require 1.4.0 minimum
if ort_version < ORT_QUANTIZE_MINIMUM_VERSION:
raise ImportError(
f"We found an older version of onnxruntime ({onnxruntime.__version__}) "
f"but we require onnxruntime to be >= {minimum_version} to enable all the conversions options.\n"
"Please update onnxruntime by running `pip install --upgrade onnxruntime`"
)
except ImportError:
raise ImportError(
"onnxruntime doesn't seem to be currently installed. "
"Please install the onnxruntime by running `pip install onnxruntime`"
" and relaunch the conversion."
)
def ensure_valid_input(model, tokens, input_names):
"""
Ensure inputs are presented in the correct order, without any Non
Args:
model: The model used to forward the input data
tokens: BatchEncoding holding the input data
input_names: The name of the inputs
Returns: Tuple
"""
print("Ensuring inputs are in correct order")
model_args_name = model.forward.__code__.co_varnames
model_args, ordered_input_names = [], []
for arg_name in model_args_name[1:]: # start at index 1 to skip "self" argument
if arg_name in input_names:
ordered_input_names.append(arg_name)
model_args.append(tokens[arg_name])
else:
print(f"{arg_name} is not present in the generated input list.")
break
print(f"Generated inputs order: {ordered_input_names}")
return ordered_input_names, tuple(model_args)
def infer_shapes(nlp: Pipeline, framework: str) -> tuple[list[str], list[str], dict, BatchEncoding]:
"""
Attempt to infer the static vs dynamic axes for each input and output tensors for a specific model
Args:
nlp: The pipeline object holding the model to be exported
framework: The framework identifier to dispatch to the correct inference scheme (pt/tf)
Returns:
- List of the inferred input variable names
- List of the inferred output variable names
- Dictionary with input/output variables names as key and shape tensor as value
- a BatchEncoding reference which was used to infer all the above information
"""
def build_shape_dict(name: str, tensor, is_input: bool, seq_len: int):
if isinstance(tensor, (tuple, list)):
return [build_shape_dict(name, t, is_input, seq_len) for t in tensor]
else:
# Let's assume batch is the first axis with only 1 element (~~ might not be always true ...)
axes = {[axis for axis, numel in enumerate(tensor.shape) if numel == 1][0]: "batch"}
if is_input:
if len(tensor.shape) == 2:
axes[1] = "sequence"
else:
raise ValueError(f"Unable to infer tensor axes ({len(tensor.shape)})")
else:
seq_axes = [dim for dim, shape in enumerate(tensor.shape) if shape == seq_len]
axes.update(dict.fromkeys(seq_axes, "sequence"))
print(f"Found {'input' if is_input else 'output'} {name} with shape: {axes}")
return axes
tokens = nlp.tokenizer("This is a sample output", return_tensors=framework)
seq_len = tokens.input_ids.shape[-1]
outputs = nlp.model(**tokens) if framework == "pt" else nlp.model(tokens)
if isinstance(outputs, ModelOutput):
outputs = outputs.to_tuple()
if not isinstance(outputs, (list, tuple)):
outputs = (outputs,)
# Generate input names & axes
input_vars = list(tokens.keys())
input_dynamic_axes = {k: build_shape_dict(k, v, True, seq_len) for k, v in tokens.items()}
# flatten potentially grouped outputs (past for gpt2, attentions)
outputs_flat = []
for output in outputs:
if isinstance(output, (tuple, list)):
outputs_flat.extend(output)
else:
outputs_flat.append(output)
# Generate output names & axes
output_names = [f"output_{i}" for i in range(len(outputs_flat))]
output_dynamic_axes = {k: build_shape_dict(k, v, False, seq_len) for k, v in zip(output_names, outputs_flat)}
# Create the aggregated axes representation
dynamic_axes = dict(input_dynamic_axes, **output_dynamic_axes)
return input_vars, output_names, dynamic_axes, tokens
def load_graph_from_args(
pipeline_name: str, framework: str, model: str, tokenizer: Optional[str] = None, **models_kwargs
) -> Pipeline:
"""
Convert the set of arguments provided through the CLI to an actual pipeline reference (tokenizer + model
Args:
pipeline_name: The kind of pipeline to use (ner, question-answering, etc.)
framework: The actual model to convert the pipeline from ("pt" or "tf")
model: The model name which will be loaded by the pipeline
tokenizer: The tokenizer name which will be loaded by the pipeline, default to the model's value
Returns: Pipeline object
"""
# If no tokenizer provided
if tokenizer is None:
tokenizer = model
# Check the wanted framework is available
if framework == "pt" and not is_torch_available():
raise Exception("Cannot convert because PyTorch is not installed. Please install torch first.")
if framework == "tf" and not is_tf_available():
raise Exception("Cannot convert because TF is not installed. Please install tensorflow first.")
print(f"Loading pipeline (model: {model}, tokenizer: {tokenizer})")
# Allocate tokenizer and model
return pipeline(pipeline_name, model=model, tokenizer=tokenizer, framework=framework, model_kwargs=models_kwargs)
def convert_pytorch(nlp: Pipeline, opset: int, output: Path, use_external_format: bool):
"""
Export a PyTorch backed pipeline to ONNX Intermediate Representation (IR
Args:
nlp: The pipeline to be exported
opset: The actual version of the ONNX operator set to use
output: Path where will be stored the generated ONNX model
use_external_format: Split the model definition from its parameters to allow model bigger than 2GB
Returns:
"""
if not is_torch_available():
raise Exception("Cannot convert because PyTorch is not installed. Please install torch first.")
import torch
from torch.onnx import export
print(f"Using framework PyTorch: {torch.__version__}")
with torch.no_grad():
input_names, output_names, dynamic_axes, tokens = infer_shapes(nlp, "pt")
ordered_input_names, model_args = ensure_valid_input(nlp.model, tokens, input_names)
export(
nlp.model,
model_args,
f=output.as_posix(),
input_names=ordered_input_names,
output_names=output_names,
dynamic_axes=dynamic_axes,
do_constant_folding=True,
opset_version=opset,
)
def convert_tensorflow(nlp: Pipeline, opset: int, output: Path):
"""
Export a TensorFlow backed pipeline to ONNX Intermediate Representation (IR)
Args:
nlp: The pipeline to be exported
opset: The actual version of the ONNX operator set to use
output: Path where will be stored the generated ONNX model
Notes: TensorFlow cannot export model bigger than 2GB due to internal constraint from TensorFlow
"""
if not is_tf_available():
raise Exception("Cannot convert because TF is not installed. Please install tensorflow first.")
print("/!\\ Please note TensorFlow doesn't support exporting model > 2Gb /!\\")
try:
import tensorflow as tf
import tf2onnx
from tf2onnx import __version__ as t2ov
print(f"Using framework TensorFlow: {tf.version.VERSION}, tf2onnx: {t2ov}")
# Build
input_names, output_names, dynamic_axes, tokens = infer_shapes(nlp, "tf")
# Forward
nlp.model.predict(tokens.data)
input_signature = [tf.TensorSpec.from_tensor(tensor, name=key) for key, tensor in tokens.items()]
model_proto, _ = tf2onnx.convert.from_keras(
nlp.model, input_signature, opset=opset, output_path=output.as_posix()
)
except ImportError as e:
raise Exception(
f"Cannot import {e.name} required to convert TF model to ONNX. Please install {e.name} first. {e}"
)
def convert(
framework: str,
model: str,
output: Path,
opset: int,
tokenizer: Optional[str] = None,
use_external_format: bool = False,
pipeline_name: str = "feature-extraction",
**model_kwargs,
):
"""
Convert the pipeline object to the ONNX Intermediate Representation (IR) format
Args:
framework: The framework the pipeline is backed by ("pt" or "tf")
model: The name of the model to load for the pipeline
output: The path where the ONNX graph will be stored
opset: The actual version of the ONNX operator set to use
tokenizer: The name of the model to load for the pipeline, default to the model's name if not provided
use_external_format:
Split the model definition from its parameters to allow model bigger than 2GB (PyTorch only)
pipeline_name: The kind of pipeline to instantiate (ner, question-answering, etc.)
model_kwargs: Keyword arguments to be forwarded to the model constructor
Returns:
"""
warnings.warn(
"The `transformers.convert_graph_to_onnx` package is deprecated and will be removed in version 5 of"
" Transformers",
FutureWarning,
)
print(f"ONNX opset version set to: {opset}")
# Load the pipeline
nlp = load_graph_from_args(pipeline_name, framework, model, tokenizer, **model_kwargs)
if not output.parent.exists():
print(f"Creating folder {output.parent}")
makedirs(output.parent.as_posix())
elif len(listdir(output.parent.as_posix())) > 0:
raise Exception(f"Folder {output.parent.as_posix()} is not empty, aborting conversion")
# Export the graph
if framework == "pt":
convert_pytorch(nlp, opset, output, use_external_format)
else:
convert_tensorflow(nlp, opset, output)
def optimize(onnx_model_path: Path) -> Path:
"""
Load the model at the specified path and let onnxruntime look at transformations on the graph to enable all the
optimizations possible
Args:
onnx_model_path: filepath where the model binary description is stored
Returns: Path where the optimized model binary description has been saved
"""
from onnxruntime import InferenceSession, SessionOptions
# Generate model name with suffix "optimized"
opt_model_path = generate_identified_filename(onnx_model_path, "-optimized")
sess_option = SessionOptions()
sess_option.optimized_model_filepath = opt_model_path.as_posix()
_ = InferenceSession(onnx_model_path.as_posix(), sess_option)
print(f"Optimized model has been written at {opt_model_path}: \N{HEAVY CHECK MARK}")
print("/!\\ Optimized model contains hardware specific operators which might not be portable. /!\\")
return opt_model_path
def quantize(onnx_model_path: Path) -> Path:
"""
Quantize the weights of the model from float32 to in8 to allow very efficient inference on modern CPU
Args:
onnx_model_path: Path to location the exported ONNX model is stored
Returns: The Path generated for the quantized
"""
import onnx
import onnxruntime
from onnx.onnx_pb import ModelProto
from onnxruntime.quantization import QuantizationMode
from onnxruntime.quantization.onnx_quantizer import ONNXQuantizer
from onnxruntime.quantization.registry import IntegerOpsRegistry
# Load the ONNX model
onnx_model = onnx.load(onnx_model_path.as_posix())
if parse(onnx.__version__) < parse("1.5.0"):
print(
"Models larger than 2GB will fail to quantize due to protobuf constraint.\n"
"Please upgrade to onnxruntime >= 1.5.0."
)
# Copy it
copy_model = ModelProto()
copy_model.CopyFrom(onnx_model)
# Construct quantizer
# onnxruntime renamed input_qType to activation_qType in v1.13.1, so we
# check the onnxruntime version to ensure backward compatibility.
# See also: https://github.com/microsoft/onnxruntime/pull/12873
if parse(onnxruntime.__version__) < parse("1.13.1"):
quantizer = ONNXQuantizer(
model=copy_model,
per_channel=False,
reduce_range=False,
mode=QuantizationMode.IntegerOps,
static=False,
weight_qType=True,
input_qType=False,
tensors_range=None,
nodes_to_quantize=None,
nodes_to_exclude=None,
op_types_to_quantize=list(IntegerOpsRegistry),
)
else:
quantizer = ONNXQuantizer(
model=copy_model,
per_channel=False,
reduce_range=False,
mode=QuantizationMode.IntegerOps,
static=False,
weight_qType=True,
activation_qType=False,
tensors_range=None,
nodes_to_quantize=None,
nodes_to_exclude=None,
op_types_to_quantize=list(IntegerOpsRegistry),
)
# Quantize and export
quantizer.quantize_model()
# Append "-quantized" at the end of the model's name
quantized_model_path = generate_identified_filename(onnx_model_path, "-quantized")
# Save model
print(f"Quantized model has been written at {quantized_model_path}: \N{HEAVY CHECK MARK}")
onnx.save_model(quantizer.model.model, quantized_model_path.as_posix())
return quantized_model_path
def verify(path: Path):
from onnxruntime import InferenceSession, SessionOptions
from onnxruntime.capi.onnxruntime_pybind11_state import RuntimeException
print(f"Checking ONNX model loading from: {path} ...")
try:
onnx_options = SessionOptions()
_ = InferenceSession(path.as_posix(), onnx_options, providers=["CPUExecutionProvider"])
print(f"Model {path} correctly loaded: \N{HEAVY CHECK MARK}")
except RuntimeException as re:
print(f"Error while loading the model {re}: \N{HEAVY BALLOT X}")
if __name__ == "__main__":
parser = OnnxConverterArgumentParser()
args = parser.parse_args()
# Make sure output is absolute path
args.output = Path(args.output).absolute()
try:
print("\n====== Converting model to ONNX ======")
# Convert
convert(
args.framework,
args.model,
args.output,
args.opset,
args.tokenizer,
args.use_external_format,
args.pipeline,
)
if args.quantize:
# Ensure requirements for quantization on onnxruntime is met
check_onnxruntime_requirements(ORT_QUANTIZE_MINIMUM_VERSION)
# onnxruntime optimizations doesn't provide the same level of performances on TensorFlow than PyTorch
if args.framework == "tf":
print(
"\t Using TensorFlow might not provide the same optimization level compared to PyTorch.\n"
"\t For TensorFlow users you can try optimizing the model directly through onnxruntime_tools.\n"
"\t For more information, please refer to the onnxruntime documentation:\n"
"\t\thttps://github.com/microsoft/onnxruntime/tree/master/onnxruntime/python/tools/transformers\n"
)
print("\n====== Optimizing ONNX model ======")
# Quantization works best when using the optimized version of the model
args.optimized_output = optimize(args.output)
# Do the quantization on the right graph
args.quantized_output = quantize(args.optimized_output)
# And verify
if args.check_loading:
print("\n====== Check exported ONNX model(s) ======")
verify(args.output)
if hasattr(args, "optimized_output"):
verify(args.optimized_output)
if hasattr(args, "quantized_output"):
verify(args.quantized_output)
except Exception as e:
print(f"Error while converting the model: {e}")
exit(1)

View File

@ -1,86 +0,0 @@
# Copyright 2020 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Convert Seq2Seq TF Hub checkpoint."""
import argparse
from . import (
BertConfig,
BertGenerationConfig,
BertGenerationDecoder,
BertGenerationEncoder,
load_tf_weights_in_bert_generation,
logging,
)
logging.set_verbosity_info()
def convert_tf_checkpoint_to_pytorch(tf_hub_path, pytorch_dump_path, is_encoder_named_decoder, vocab_size, is_encoder):
# Initialise PyTorch model
bert_config = BertConfig.from_pretrained(
"google-bert/bert-large-cased",
vocab_size=vocab_size,
max_position_embeddings=512,
is_decoder=True,
add_cross_attention=True,
)
bert_config_dict = bert_config.to_dict()
del bert_config_dict["type_vocab_size"]
config = BertGenerationConfig(**bert_config_dict)
if is_encoder:
model = BertGenerationEncoder(config)
else:
model = BertGenerationDecoder(config)
print(f"Building PyTorch model from configuration: {config}")
# Load weights from tf checkpoint
load_tf_weights_in_bert_generation(
model,
tf_hub_path,
model_class="bert",
is_encoder_named_decoder=is_encoder_named_decoder,
is_encoder=is_encoder,
)
# Save pytorch-model
print(f"Save PyTorch model and config to {pytorch_dump_path}")
model.save_pretrained(pytorch_dump_path)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
# Required parameters
parser.add_argument(
"--tf_hub_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path."
)
parser.add_argument(
"--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model."
)
parser.add_argument(
"--is_encoder_named_decoder",
action="store_true",
help="If decoder has to be renamed to encoder in PyTorch model.",
)
parser.add_argument("--is_encoder", action="store_true", help="If model is an encoder.")
parser.add_argument("--vocab_size", default=50358, type=int, help="Vocab size of model")
args = parser.parse_args()
convert_tf_checkpoint_to_pytorch(
args.tf_hub_path,
args.pytorch_dump_path,
args.is_encoder_named_decoder,
args.vocab_size,
is_encoder=args.is_encoder,
)

View File

@ -31,7 +31,7 @@ InputDataClass = NewType("InputDataClass", Any)
"""
A DataCollator is a function that takes a list of samples from a Dataset and collate them into a batch, as a dictionary
of PyTorch/TensorFlow tensors or NumPy arrays.
of PyTorch tensors or NumPy arrays.
"""
DataCollator = NewType("DataCollator", Callable[[list[InputDataClass]], dict[str, Any]])
@ -40,9 +40,7 @@ class DataCollatorMixin:
def __call__(self, features, return_tensors=None):
if return_tensors is None:
return_tensors = self.return_tensors
if return_tensors == "tf":
return self.tf_call(features)
elif return_tensors == "pt":
if return_tensors == "pt":
return self.torch_call(features)
elif return_tensors == "np":
return self.numpy_call(features)
@ -91,8 +89,6 @@ def default_data_collator(features: list[InputDataClass], return_tensors="pt") -
if return_tensors == "pt":
return torch_default_data_collator(features)
elif return_tensors == "tf":
return tf_default_data_collator(features)
elif return_tensors == "np":
return numpy_default_data_collator(features)
@ -114,7 +110,7 @@ class DefaultDataCollator(DataCollatorMixin):
Args:
return_tensors (`str`, *optional*, defaults to `"pt"`):
The type of Tensor to return. Allowable values are "np", "pt" and "tf".
The type of Tensor to return. Allowable values are "np", or "pt".
"""
return_tensors: str = "pt"
@ -161,47 +157,6 @@ def torch_default_data_collator(features: list[InputDataClass]) -> dict[str, Any
return batch
def tf_default_data_collator(features: list[InputDataClass]) -> dict[str, Any]:
import tensorflow as tf
if not isinstance(features[0], Mapping):
features = [vars(f) for f in features]
first = features[0]
batch = {}
# Special handling for labels.
# Ensure that tensor is created with the correct type
# (it should be automatically the case, but let's make sure of it.)
if "label" in first and first["label"] is not None:
label_col_name = "label"
elif "label_ids" in first and first["label_ids"] is not None:
label_col_name = "label_ids"
elif "labels" in first and first["labels"] is not None:
label_col_name = "labels"
else:
label_col_name = None
if label_col_name is not None:
if isinstance(first[label_col_name], tf.Tensor):
dtype = tf.int64 if first[label_col_name].dtype.is_integer else tf.float32
elif isinstance(first[label_col_name], (np.ndarray, np.generic)):
dtype = tf.int64 if np.issubdtype(first[label_col_name].dtype, np.integer) else tf.float32
elif isinstance(first[label_col_name], (tuple, list)):
dtype = tf.int64 if isinstance(first[label_col_name][0], int) else tf.float32
else:
dtype = tf.int64 if isinstance(first[label_col_name], int) else tf.float32
batch["labels"] = tf.convert_to_tensor([f[label_col_name] for f in features], dtype=dtype)
# Handling of all other possible keys.
# Again, we will use the first element to figure out which key/values are not None for this model.
for k, v in first.items():
if k not in ("label", "label_ids", "labels") and v is not None and not isinstance(v, str):
if isinstance(v, (tf.Tensor, np.ndarray)):
batch[k] = tf.stack([f[k] for f in features])
else:
batch[k] = tf.convert_to_tensor([f[k] for f in features])
return batch
def numpy_default_data_collator(features: list[InputDataClass]) -> dict[str, Any]:
if not isinstance(features[0], Mapping):
features = [vars(f) for f in features]
@ -259,7 +214,7 @@ class DataCollatorWithPadding:
This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability >=
7.0 (Volta).
return_tensors (`str`, *optional*, defaults to `"pt"`):
The type of Tensor to return. Allowable values are "np", "pt" and "tf".
The type of Tensor to return. Allowable values are "np", or "pt".
"""
tokenizer: PreTrainedTokenizerBase
@ -313,7 +268,7 @@ class DataCollatorForTokenClassification(DataCollatorMixin):
label_pad_token_id (`int`, *optional*, defaults to -100):
The id to use when padding the labels (-100 will be automatically ignore by PyTorch loss functions).
return_tensors (`str`, *optional*, defaults to `"pt"`):
The type of Tensor to return. Allowable values are "np", "pt" and "tf".
The type of Tensor to return. Allowable values are "np", or "pt".
"""
tokenizer: PreTrainedTokenizerBase
@ -363,38 +318,6 @@ class DataCollatorForTokenClassification(DataCollatorMixin):
batch[label_name] = torch.tensor(batch[label_name], dtype=torch.int64)
return batch
def tf_call(self, features):
import tensorflow as tf
label_name = "label" if "label" in features[0] else "labels"
labels = [feature[label_name] for feature in features] if label_name in features[0] else None
batch = pad_without_fast_tokenizer_warning(
self.tokenizer,
features,
padding=self.padding,
max_length=self.max_length,
pad_to_multiple_of=self.pad_to_multiple_of,
# Conversion to tensors will fail if we have labels as they are not of the same length yet.
return_tensors="tf" if labels is None else None,
)
if labels is None:
return batch
sequence_length = tf.convert_to_tensor(batch["input_ids"]).shape[1]
padding_side = self.tokenizer.padding_side
if padding_side == "right":
batch["labels"] = [
list(label) + [self.label_pad_token_id] * (sequence_length - len(label)) for label in labels
]
else:
batch["labels"] = [
[self.label_pad_token_id] * (sequence_length - len(label)) + list(label) for label in labels
]
batch = {k: tf.convert_to_tensor(v, dtype=tf.int64) for k, v in batch.items()}
return batch
def numpy_call(self, features):
label_name = "label" if "label" in features[0] else "labels"
labels = [feature[label_name] for feature in features] if label_name in features[0] else None
@ -463,44 +386,6 @@ def _torch_collate_batch(examples, tokenizer, pad_to_multiple_of: Optional[int]
return result
def _tf_collate_batch(examples, tokenizer, pad_to_multiple_of: Optional[int] = None):
import tensorflow as tf
"""Collate `examples` into a batch, using the information in `tokenizer` for padding if necessary."""
# Tensorize if necessary.
if isinstance(examples[0], (list, tuple)):
examples = [tf.convert_to_tensor(e, dtype=tf.int64) for e in examples]
# Check if padding is necessary.
length_of_first = len(examples[0])
are_tensors_same_length = all(len(x) == length_of_first for x in examples)
if are_tensors_same_length and (pad_to_multiple_of is None or length_of_first % pad_to_multiple_of == 0):
return tf.stack(examples, axis=0)
# If yes, check if we have a `pad_token`.
if tokenizer.pad_token is None:
raise ValueError(
"You are attempting to pad samples but the tokenizer you are using"
f" ({tokenizer.__class__.__name__}) does not have a pad token."
)
# Creating the full tensor and filling it with our data.
max_length = max(len(x) for x in examples)
if pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0):
max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of
# result = examples[0].new_full([len(examples), max_length], tokenizer.pad_token_id)
result = []
rank = tf.rank(examples[0])
paddings = np.zeros((rank, 2), dtype=np.int32)
for example in examples:
if tokenizer.padding_side == "right":
paddings[0, 1] = max_length - len(example)
else:
paddings[0, 0] = max_length - len(example)
result.append(tf.pad(example, paddings, constant_values=tokenizer.pad_token_id))
return tf.stack(result, axis=0)
def _numpy_collate_batch(examples, tokenizer, pad_to_multiple_of: Optional[int] = None):
"""Collate `examples` into a batch, using the information in `tokenizer` for padding if necessary."""
# Tensorize if necessary.
@ -560,7 +445,7 @@ class DataCollatorForMultipleChoice(DataCollatorMixin):
This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability >=
7.5 (Volta).
return_tensors (`str`, *optional*, defaults to `"pt"`):
The type of Tensor to return. Allowable values are "np", "pt" and "tf".
The type of Tensor to return. Allowable values are "np", or "pt".
"""
tokenizer: PreTrainedTokenizerBase
@ -599,30 +484,6 @@ class DataCollatorForMultipleChoice(DataCollatorMixin):
batch["labels"] = torch.tensor(labels, dtype=torch.int64)
return batch
def tf_call(self, features): # Implementation taken from the docs.
import tensorflow as tf
label_name = "label" if "label" in features[0] else "labels"
labels = [feature.pop(label_name) for feature in features]
batch_size = len(features)
num_choices = len(features[0]["input_ids"])
flattened_features = [
[{k: v[i] for k, v in feature.items()} for i in range(num_choices)] for feature in features
]
flattened_features = sum(flattened_features, []) # Sometimes written as list(chain(*flattened_features))
batch = self.tokenizer.pad(
flattened_features,
padding=self.padding,
max_length=self.max_length,
pad_to_multiple_of=self.pad_to_multiple_of,
return_tensors="tf",
)
batch = {k: tf.reshape(v, (batch_size, num_choices, -1)) for k, v in batch.items()}
batch["labels"] = tf.convert_to_tensor(labels, dtype=tf.int64)
return batch
@dataclass
class DataCollatorForSeq2Seq:
@ -656,7 +517,7 @@ class DataCollatorForSeq2Seq:
label_pad_token_id (`int`, *optional*, defaults to -100):
The id to use when padding the labels (-100 will be automatically ignored by PyTorch loss functions).
return_tensors (`str`, *optional*, defaults to `"pt"`):
The type of Tensor to return. Allowable values are "np", "pt" and "tf".
The type of Tensor to return. Allowable values are "np", or "pt".
"""
tokenizer: PreTrainedTokenizerBase
@ -739,10 +600,6 @@ class DataCollatorForSeq2Seq:
import torch
batch["labels"] = torch.tensor(batch["labels"], dtype=torch.int64)
elif return_tensors == "tf":
import tensorflow as tf
batch["labels"] = tf.constant(batch["labels"], dtype=tf.int64)
else:
batch["labels"] = np.array(batch["labels"], dtype=np.int64)
else:
@ -787,7 +644,7 @@ class DataCollatorForLanguageModeling(DataCollatorMixin):
pad_to_multiple_of (`int`, *optional*):
If set, will pad the sequence to a multiple of the provided value.
return_tensors (`str`):
The type of Tensor to return. Allowable values are "np", "pt" and "tf".
The type of Tensor to return. Allowable values are "np", or "pt".
seed (`int`, *optional*):
The seed to use for the random number generator for masking. If not provided, the global RNG will be used.
@ -828,7 +685,6 @@ class DataCollatorForLanguageModeling(DataCollatorMixin):
mask_replace_prob: float = 0.8
random_replace_prob: float = 0.1
pad_to_multiple_of: Optional[int] = None
tf_experimental_compile: bool = False
return_tensors: str = "pt"
seed: Optional[int] = None
@ -852,11 +708,6 @@ class DataCollatorForLanguageModeling(DataCollatorMixin):
self.mask_replace_prob = float(self.mask_replace_prob)
self.random_replace_prob = float(self.random_replace_prob)
if self.tf_experimental_compile:
import tensorflow as tf
self.tf_mask_tokens = tf.function(self.tf_mask_tokens, jit_compile=True)
self.generator = None
def get_generator(self, seed):
@ -864,10 +715,6 @@ class DataCollatorForLanguageModeling(DataCollatorMixin):
import torch
return torch.Generator().manual_seed(seed)
elif self.return_tensors == "tf":
import tensorflow as tf
return tf.random.Generator.from_seed(seed)
else:
import numpy as np
@ -882,7 +729,6 @@ class DataCollatorForLanguageModeling(DataCollatorMixin):
# worker's generator, generated as the main seed + the worker's ID.
# (https://pytorch.org/docs/stable/data.html#randomness-in-multi-process-data-loading)
# Only PyTorch DataLoader allows us to access the worker ID, and so we check for this.
# For other frameworks, we will throw an error.
import torch
worker_info = torch.utils.data.get_worker_info()
@ -897,111 +743,6 @@ class DataCollatorForLanguageModeling(DataCollatorMixin):
self.generator = self.get_generator(self.seed + worker_info.id)
@staticmethod
def tf_bernoulli(shape, probability, generator=None):
import tensorflow as tf
prob_matrix = tf.fill(shape, probability)
# if generator exists, use it to generate the random numbers
# otherwise, use the global RNG
if generator:
return tf.cast(prob_matrix - generator.uniform(shape, 0, 1) >= 0, tf.bool)
else:
return tf.cast(prob_matrix - tf.random.uniform(shape, 0, 1) >= 0, tf.bool)
def tf_mask_tokens(
self, inputs: Any, vocab_size, mask_token_id, special_tokens_mask: Optional[Any] = None
) -> tuple[Any, Any]:
"""
Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original.
"""
import tensorflow as tf
mask_token_id = tf.cast(mask_token_id, inputs.dtype)
input_shape = tf.shape(inputs)
# 1 for a special token, 0 for a normal token in the special tokens mask
# We sample a few tokens in each sequence for MLM training (with probability `self.mlm_probability`)
masked_indices = self.tf_bernoulli(input_shape, self.mlm_probability, self.generator) & ~special_tokens_mask
# Replace unmasked indices with -100 in the labels since we only compute loss on masked tokens
labels = tf.where(masked_indices, inputs, -100)
# mask_replace_prob% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK])
indices_replaced = self.tf_bernoulli(input_shape, self.mask_replace_prob, self.generator) & masked_indices
inputs = tf.where(indices_replaced, mask_token_id, inputs)
if self.mask_replace_prob == 1 or self.random_replace_prob == 0:
return inputs, labels
remaining_prob = 1 - self.mask_replace_prob
# scaling the random_replace_prob to the remaining probability for example if
# mask_replace_prob = 0.8 and random_replace_prob = 0.1,
# then random_replace_prob_scaled = 0.1 / 0.2 = 0.5
random_replace_prob_scaled = self.random_replace_prob / remaining_prob
# random_replace_prob% of the time, we replace masked input tokens with random word
indices_random = (
self.tf_bernoulli(input_shape, random_replace_prob_scaled, self.generator)
& masked_indices
& ~indices_replaced
)
if self.generator:
random_words = self.generator.uniform(input_shape, maxval=vocab_size, dtype=inputs.dtype)
else:
random_words = tf.random.uniform(input_shape, maxval=vocab_size, dtype=inputs.dtype)
inputs = tf.where(indices_random, random_words, inputs)
# The rest of the time ((1-random_replace_prob-mask_replace_prob)% of the time) we keep the masked input tokens unchanged
return inputs, labels
def tf_call(self, examples: list[Union[list[int], Any, dict[str, Any]]]) -> dict[str, Any]:
import tensorflow as tf
if self.seed and self.generator is None:
# If we have a seed, we need to create a generator object. Subsequent calls to this function will use the same generator.
# If no seed supplied, we will use the global RNG
self.create_rng()
# Handle dict or lists with proper padding and conversion to tensor.
if isinstance(examples[0], Mapping):
batch = pad_without_fast_tokenizer_warning(
self.tokenizer, examples, return_tensors="tf", pad_to_multiple_of=self.pad_to_multiple_of
)
else:
batch = {
"input_ids": _tf_collate_batch(examples, self.tokenizer, pad_to_multiple_of=self.pad_to_multiple_of)
}
# If special token mask has been preprocessed, pop it from the dict.
special_tokens_mask = batch.pop("special_tokens_mask", None)
if self.mlm:
if special_tokens_mask is None:
special_tokens_mask = [
self.tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True)
for val in batch["input_ids"].numpy().tolist()
]
# Cannot directly create as bool
special_tokens_mask = tf.cast(tf.convert_to_tensor(special_tokens_mask, dtype=tf.int64), tf.bool)
else:
special_tokens_mask = tf.cast(special_tokens_mask, tf.bool)
batch["input_ids"], batch["labels"] = self.tf_mask_tokens(
tf.cast(batch["input_ids"], tf.int64),
special_tokens_mask=special_tokens_mask,
mask_token_id=self.tokenizer.mask_token_id,
vocab_size=len(self.tokenizer),
)
else:
labels = batch["input_ids"]
if self.tokenizer.pad_token_id is not None:
# Replace self.tokenizer.pad_token_id with -100
labels = tf.where(labels == self.tokenizer.pad_token_id, -100, labels)
else:
labels = tf.identity(labels) # Makes a copy, just in case
batch["labels"] = labels
return batch
def torch_call(self, examples: list[Union[list[int], Any, dict[str, Any]]]) -> dict[str, Any]:
# Handle dict or lists with proper padding and conversion to tensor.
@ -1226,41 +967,6 @@ class DataCollatorForWholeWordMask(DataCollatorForLanguageModeling):
inputs, labels = self.torch_mask_tokens(batch_input, batch_mask)
return {"input_ids": inputs, "labels": labels}
def tf_call(self, examples: list[Union[list[int], Any, dict[str, Any]]]) -> dict[str, Any]:
import tensorflow as tf
if self.seed and self.generator is None:
# If we have a seed, we need to create a generator object. Subsequent calls to this function will use the same generator.
# If no seed supplied, we will use the global RNG
self.create_rng()
if isinstance(examples[0], Mapping):
input_ids = [e["input_ids"] for e in examples]
else:
input_ids = examples
examples = [{"input_ids": e} for e in examples]
batch_input = _tf_collate_batch(input_ids, self.tokenizer, pad_to_multiple_of=self.pad_to_multiple_of)
mask_labels = []
for e in examples:
ref_tokens = []
for id in tolist(e["input_ids"]):
token = self.tokenizer._convert_id_to_token(id)
ref_tokens.append(token)
# For Chinese tokens, we need extra inf to mark sub-word, e.g [喜,欢]-> [喜,##欢]
if "chinese_ref" in e:
ref_pos = tolist(e["chinese_ref"])
len_seq = len(e["input_ids"])
for i in range(len_seq):
if i in ref_pos:
ref_tokens[i] = "##" + ref_tokens[i]
mask_labels.append(self._whole_word_mask(ref_tokens))
batch_mask = _tf_collate_batch(mask_labels, self.tokenizer, pad_to_multiple_of=self.pad_to_multiple_of)
inputs, labels = self.tf_mask_tokens(tf.cast(batch_input, tf.int64), batch_mask)
return {"input_ids": inputs, "labels": labels}
def numpy_call(self, examples: list[Union[list[int], Any, dict[str, Any]]]) -> dict[str, Any]:
if self.seed and self.generator is None:
# If we have a seed, we need to create a generator object. Subsequent calls to this function will use the same generator.
@ -1307,13 +1013,6 @@ class DataCollatorForWholeWordMask(DataCollatorForLanguageModeling):
indices = torch.randperm(len(cand_indexes), generator=self.generator)
return [cand_indexes[i] for i in indices]
elif self.return_tensors == "tf":
import tensorflow as tf
seed = self.generator.make_seeds(2)[0]
indices = tf.random.experimental.stateless_shuffle(tf.range(len(cand_indexes)), seed=seed).numpy().tolist()
return [cand_indexes[i] for i in indices]
elif self.return_tensors == "np":
self.generator.shuffle(cand_indexes)
return cand_indexes
@ -1414,66 +1113,6 @@ class DataCollatorForWholeWordMask(DataCollatorForLanguageModeling):
# The rest of the time ((1-random_replacement_prob-mask_replace_prob)% of the time) we keep the masked input tokens unchanged
return inputs, labels
def tf_mask_tokens(self, inputs: Any, mask_labels: Any) -> tuple[Any, Any]:
"""
Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original. Set
'mask_labels' means we use whole word mask (wwm), we directly mask idxs according to it's ref.
"""
import tensorflow as tf
input_shape = tf.shape(inputs)
if self.tokenizer.mask_token is None:
raise ValueError(
"This tokenizer does not have a mask token which is necessary for masked language modeling. Remove the"
" --mlm flag if you want to use this tokenizer."
)
labels = tf.identity(inputs)
# We sample a few tokens in each sequence for masked-LM training (with probability args.mlm_probability defaults to 0.15 in Bert/RoBERTa)
masked_indices = tf.cast(mask_labels, tf.bool)
special_tokens_mask = [
self.tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels
]
masked_indices = masked_indices & ~tf.cast(special_tokens_mask, dtype=tf.bool)
if self.tokenizer.pad_token is not None:
padding_mask = inputs == self.tokenizer.pad_token_id
masked_indices = masked_indices & ~padding_mask
# Replace unmasked indices with -100 in the labels since we only compute loss on masked tokens
labels = tf.where(masked_indices, inputs, -100)
# mask_replace_prob% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK])
indices_replaced = self.tf_bernoulli(input_shape, self.mask_replace_prob, self.generator) & masked_indices
inputs = tf.where(indices_replaced, self.tokenizer.mask_token_id, inputs)
if self.mask_replace_prob == 1 or self.random_replace_prob == 0:
return inputs, labels
remaining_prob = 1 - self.mask_replace_prob
# scaling the random_replace_prob to the remaining probability for example if
# mask_replace_prob = 0.8 and random_replace_prob = 0.1,
# then random_replace_prob_scaled = 0.1 / 0.2 = 0.5
random_replace_prob_scaled = self.random_replace_prob / remaining_prob
# random_replace_prob% of the time, we replace masked input tokens with random word
indices_random = (
self.tf_bernoulli(input_shape, random_replace_prob_scaled, self.generator)
& masked_indices
& ~indices_replaced
)
if self.generator:
random_words = self.generator.uniform(input_shape, maxval=len(self.tokenizer), dtype=tf.int64)
else:
random_words = tf.random.uniform(input_shape, maxval=len(self.tokenizer), dtype=tf.int64)
inputs = tf.where(indices_random, random_words, inputs)
# The rest of the time ((1-mask_replace_prob-random_replace_prob)% of the time) we keep the masked input tokens unchanged
return inputs, labels
def numpy_mask_tokens(self, inputs: Any, mask_labels: Any) -> tuple[Any, Any]:
"""
Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original. Set
@ -1543,7 +1182,7 @@ class DataCollatorForWholeWordMask(DataCollatorForLanguageModeling):
def tolist(x):
if isinstance(x, list):
return x
elif hasattr(x, "numpy"): # Checks for TF tensors without needing the import
elif hasattr(x, "numpy"):
x = x.numpy()
return x.tolist()
@ -1652,13 +1291,6 @@ class DataCollatorForPermutationLanguageModeling(DataCollatorMixin):
inputs, perm_mask, target_mapping, labels = self.torch_mask_tokens(batch)
return {"input_ids": inputs, "perm_mask": perm_mask, "target_mapping": target_mapping, "labels": labels}
def tf_call(self, examples: list[Union[list[int], Any, dict[str, Any]]]) -> dict[str, Any]:
if isinstance(examples[0], Mapping):
examples = [e["input_ids"] for e in examples]
batch = _tf_collate_batch(examples, self.tokenizer)
inputs, perm_mask, target_mapping, labels = self.tf_mask_tokens(batch)
return {"input_ids": inputs, "perm_mask": perm_mask, "target_mapping": target_mapping, "labels": labels}
def numpy_call(self, examples: list[Union[list[int], Any, dict[str, Any]]]) -> dict[str, Any]:
if isinstance(examples[0], Mapping):
examples = [e["input_ids"] for e in examples]
@ -1765,113 +1397,6 @@ class DataCollatorForPermutationLanguageModeling(DataCollatorMixin):
return inputs.long(), perm_mask, target_mapping, labels.long()
def tf_mask_tokens(self, inputs: Any) -> tuple[Any, Any, Any, Any]:
"""
The masked tokens to be predicted for a particular sequence are determined by the following algorithm:
0. Start from the beginning of the sequence by setting `cur_len = 0` (number of tokens processed so far).
1. Sample a `span_length` from the interval `[1, max_span_length]` (length of span of tokens to be masked)
2. Reserve a context of length `context_length = span_length / plm_probability` to surround span to be
masked
3. Sample a starting point `start_index` from the interval `[cur_len, cur_len + context_length -
span_length]` and mask tokens `start_index:start_index + span_length`
4. Set `cur_len = cur_len + context_length`. If `cur_len < max_len` (i.e. there are tokens remaining in the
sequence to be processed), repeat from Step 1.
"""
import tensorflow as tf
if self.tokenizer.mask_token is None:
raise ValueError(
"This tokenizer does not have a mask token which is necessary for permutation language modeling."
" Please add a mask token if you want to use this tokenizer."
)
if tf.shape(inputs)[1] % 2 != 0:
raise ValueError(
"This collator requires that sequence lengths be even to create a leakage-free perm_mask. Please see"
" relevant comments in source code for details."
)
labels = tf.identity(inputs)
# Creating the mask and target_mapping tensors
masked_indices = np.full(labels.shape.as_list(), 0, dtype=bool)
labels_shape = tf.shape(labels)
target_mapping = np.zeros((labels_shape[0], labels_shape[1], labels_shape[1]), dtype=np.float32)
for i in range(len(labels)):
# Start from the beginning of the sequence by setting `cur_len = 0` (number of tokens processed so far).
cur_len = 0
max_len = tf.shape(labels)[1]
while cur_len < max_len:
# Sample a `span_length` from the interval `[1, max_span_length]` (length of span of tokens to be masked)
span_length = randint(1, self.max_span_length + 1)
# Reserve a context of length `context_length = span_length / plm_probability` to surround the span to be masked
context_length = int(span_length / self.plm_probability)
# Sample a starting point `start_index` from the interval `[cur_len, cur_len + context_length - span_length]` and mask tokens `start_index:start_index + span_length`
start_index = cur_len + randint(0, context_length - span_length + 1)
masked_indices[i, start_index : start_index + span_length] = 1
# Set `cur_len = cur_len + context_length`
cur_len += context_length
# Since we're replacing non-masked tokens with -100 in the labels tensor instead of skipping them altogether,
# the i-th predict corresponds to the i-th token.
target_mapping[i] = np.eye(labels_shape[1])
masked_indices = tf.cast(tf.convert_to_tensor(masked_indices), dtype=tf.bool)
target_mapping = tf.convert_to_tensor(target_mapping)
special_tokens_mask = tf.convert_to_tensor(
[
self.tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True)
for val in labels.numpy().tolist()
],
)
special_tokens_mask = tf.cast(special_tokens_mask, dtype=tf.bool)
masked_indices = masked_indices & ~special_tokens_mask
if self.tokenizer.pad_token is not None:
padding_mask = labels == self.tokenizer.pad_token_id
masked_indices = masked_indices & ~padding_mask
# Mask indicating non-functional tokens, where functional tokens are [SEP], [CLS], padding, etc.
non_func_mask = ~(padding_mask | special_tokens_mask)
inputs = tf.where(masked_indices, self.tokenizer.mask_token_id, inputs)
labels = tf.where(masked_indices, labels, -100) # We only compute loss on masked tokens
perm_mask = []
for i in range(len(labels)):
# Generate permutation indices i.e. sample a random factorisation order for the sequence. This will
# determine which tokens a given token can attend to (encoded in `perm_mask`).
# Note: Length of token sequence being permuted has to be less than or equal to reused sequence length
# (see documentation for `mems`), otherwise information may leak through due to reuse. In this implementation,
# we assume that reused length is half of sequence length and permutation length is equal to reused length.
# This requires that the sequence length be even.
# Create a linear factorisation order
# tf.range is the equivalent of torch.arange
perm_index = tf.range(labels_shape[1])
# Split this into two halves, assuming that half the sequence is reused each time
perm_index = tf.transpose(tf.reshape(perm_index, (-1, labels_shape[1] // 2)))
# Permute the two halves such that they do not cross over
perm_index = tf.random.shuffle(perm_index) # Shuffles along the first dimension
# Flatten this out into the desired permuted factorisation order
perm_index = tf.reshape(tf.transpose(perm_index), (-1,))
# Set the permutation indices of non-masked (non-functional) tokens to the
# smallest index (-1) so that:
# (1) They can be seen by all other positions
# (2) They cannot see masked positions, so there won't be information leak
perm_index = tf.where(~masked_indices[i] & non_func_mask[i], -1, perm_index)
# The logic for whether the i-th token can attend on the j-th token based on the factorisation order:
# 0 (can attend): If perm_index[i] > perm_index[j] or j is neither masked nor a functional token
# 1 (cannot attend): If perm_index[i] <= perm_index[j] and j is either masked or a functional token
perm_mask.append(
(tf.reshape(perm_index, (labels_shape[1], 1)) <= tf.reshape(perm_index, (1, labels_shape[1])))
& masked_indices[i]
)
perm_mask = tf.stack(perm_mask, axis=0)
return tf.cast(inputs, tf.int64), tf.cast(perm_mask, tf.float32), target_mapping, tf.cast(labels, tf.int64)
def numpy_mask_tokens(self, inputs: Any) -> tuple[Any, Any, Any, Any]:
"""
The masked tokens to be predicted for a particular sequence are determined by the following algorithm:

View File

@ -69,10 +69,6 @@ class Split(Enum):
class GlueDataset(Dataset):
"""
This will be superseded by a framework-agnostic approach soon.
"""
args: GlueDataTrainingArguments
output_mode: str
features: list[InputFeatures]

View File

@ -38,10 +38,6 @@ DEPRECATION_WARNING = (
class TextDataset(Dataset):
"""
This will be superseded by a framework-agnostic approach soon.
"""
def __init__(
self,
tokenizer: PreTrainedTokenizer,
@ -111,10 +107,6 @@ class TextDataset(Dataset):
class LineByLineTextDataset(Dataset):
"""
This will be superseded by a framework-agnostic approach soon.
"""
def __init__(self, tokenizer: PreTrainedTokenizer, file_path: str, block_size: int):
warnings.warn(
DEPRECATION_WARNING.format(
@ -144,10 +136,6 @@ class LineByLineTextDataset(Dataset):
class LineByLineWithRefDataset(Dataset):
"""
This will be superseded by a framework-agnostic approach soon.
"""
def __init__(self, tokenizer: PreTrainedTokenizer, file_path: str, block_size: int, ref_path: str):
warnings.warn(
DEPRECATION_WARNING.format(
@ -344,10 +332,6 @@ class LineByLineWithSOPTextDataset(Dataset):
class TextDatasetForNextSentencePrediction(Dataset):
"""
This will be superseded by a framework-agnostic approach soon.
"""
def __init__(
self,
tokenizer: PreTrainedTokenizer,

View File

@ -107,10 +107,6 @@ class Split(Enum):
class SquadDataset(Dataset):
"""
This will be superseded by a framework-agnostic approach soon.
"""
args: SquadDataTrainingArguments
features: list[SquadFeatures]
mode: Split

View File

@ -17,18 +17,14 @@
import os
import warnings
from dataclasses import asdict
from enum import Enum
from typing import Optional, Union
from ...tokenization_utils import PreTrainedTokenizer
from ...utils import is_tf_available, logging
from ...utils import logging
from .utils import DataProcessor, InputExample, InputFeatures
if is_tf_available():
import tensorflow as tf
logger = logging.get_logger(__name__)
DEPRECATION_WARNING = (
@ -39,7 +35,7 @@ DEPRECATION_WARNING = (
def glue_convert_examples_to_features(
examples: Union[list[InputExample], "tf.data.Dataset"],
examples: list[InputExample],
tokenizer: PreTrainedTokenizer,
max_length: Optional[int] = None,
task=None,
@ -50,7 +46,7 @@ def glue_convert_examples_to_features(
Loads a data file into a list of `InputFeatures`
Args:
examples: List of `InputExamples` or `tf.data.Dataset` containing the examples.
examples: List of `InputExamples` containing the examples.
tokenizer: Instance of a tokenizer that will tokenize the examples
max_length: Maximum example length. Defaults to the tokenizer's max_len
task: GLUE task
@ -58,54 +54,15 @@ def glue_convert_examples_to_features(
output_mode: String indicating the output mode. Either `regression` or `classification`
Returns:
If the `examples` input is a `tf.data.Dataset`, will return a `tf.data.Dataset` containing the task-specific
features. If the input is a list of `InputExamples`, will return a list of task-specific `InputFeatures` which
can be fed to the model.
Will return a list of task-specific `InputFeatures` which can be fed to the model.
"""
warnings.warn(DEPRECATION_WARNING.format("function"), FutureWarning)
if is_tf_available() and isinstance(examples, tf.data.Dataset):
if task is None:
raise ValueError("When calling glue_convert_examples_to_features from TF, the task parameter is required.")
return _tf_glue_convert_examples_to_features(examples, tokenizer, max_length=max_length, task=task)
return _glue_convert_examples_to_features(
examples, tokenizer, max_length=max_length, task=task, label_list=label_list, output_mode=output_mode
)
if is_tf_available():
def _tf_glue_convert_examples_to_features(
examples: tf.data.Dataset,
tokenizer: PreTrainedTokenizer,
task=str,
max_length: Optional[int] = None,
) -> tf.data.Dataset:
"""
Returns:
A `tf.data.Dataset` containing the task-specific features.
"""
processor = glue_processors[task]()
examples = [processor.tfds_map(processor.get_example_from_tensor_dict(example)) for example in examples]
features = glue_convert_examples_to_features(examples, tokenizer, max_length=max_length, task=task)
label_type = tf.float32 if task == "sts-b" else tf.int64
def gen():
for ex in features:
d = {k: v for k, v in asdict(ex).items() if v is not None}
label = d.pop("label")
yield (d, label)
input_names = tokenizer.model_input_names
return tf.data.Dataset.from_generator(
gen,
(dict.fromkeys(input_names, tf.int32), label_type),
({k: tf.TensorShape([None]) for k in input_names}, tf.TensorShape([])),
)
def _glue_convert_examples_to_features(
examples: list[InputExample],
tokenizer: PreTrainedTokenizer,

View File

@ -24,7 +24,7 @@ from tqdm import tqdm
from ...models.bert.tokenization_bert import whitespace_tokenize
from ...tokenization_utils_base import BatchEncoding, PreTrainedTokenizerBase, TruncationStrategy
from ...utils import is_tf_available, is_torch_available, is_torch_hpu_available, logging
from ...utils import is_torch_available, is_torch_hpu_available, logging
from .utils import DataProcessor
@ -36,8 +36,6 @@ if is_torch_available():
import torch
from torch.utils.data import TensorDataset
if is_tf_available():
import tensorflow as tf
logger = logging.get_logger(__name__)
@ -244,7 +242,6 @@ def squad_convert_example_to_features(
cls_index = span["input_ids"].index(tokenizer.cls_token_id)
# p_mask: mask with 1 for token than cannot be in the answer (0 for token which can be in an answer)
# Original TF implementation also keep the classification token (set to 0)
p_mask = np.ones_like(span["token_type_ids"])
if tokenizer.padding_side == "right":
p_mask[len(truncated_query) + sequence_added_tokens :] = 0
@ -338,8 +335,8 @@ def squad_convert_examples_to_features(
max_query_length: The maximum length of the query.
is_training: whether to create features for model evaluation or model training.
padding_strategy: Default to "max_length". Which padding strategy to use
return_dataset: Default False. Either 'pt' or 'tf'.
if 'pt': returns a torch.data.TensorDataset, if 'tf': returns a tf.data.Dataset
return_dataset: Default False. Can also be 'pt'.
if 'pt': returns a torch.data.TensorDataset.
threads: multiple processing threads.
@ -430,110 +427,6 @@ def squad_convert_examples_to_features(
)
return features, dataset
elif return_dataset == "tf":
if not is_tf_available():
raise RuntimeError("TensorFlow must be installed to return a TensorFlow dataset.")
def gen():
for i, ex in enumerate(features):
if ex.token_type_ids is None:
yield (
{
"input_ids": ex.input_ids,
"attention_mask": ex.attention_mask,
"feature_index": i,
"qas_id": ex.qas_id,
},
{
"start_positions": ex.start_position,
"end_positions": ex.end_position,
"cls_index": ex.cls_index,
"p_mask": ex.p_mask,
"is_impossible": ex.is_impossible,
},
)
else:
yield (
{
"input_ids": ex.input_ids,
"attention_mask": ex.attention_mask,
"token_type_ids": ex.token_type_ids,
"feature_index": i,
"qas_id": ex.qas_id,
},
{
"start_positions": ex.start_position,
"end_positions": ex.end_position,
"cls_index": ex.cls_index,
"p_mask": ex.p_mask,
"is_impossible": ex.is_impossible,
},
)
# Why have we split the batch into a tuple? PyTorch just has a list of tensors.
if "token_type_ids" in tokenizer.model_input_names:
train_types = (
{
"input_ids": tf.int32,
"attention_mask": tf.int32,
"token_type_ids": tf.int32,
"feature_index": tf.int64,
"qas_id": tf.string,
},
{
"start_positions": tf.int64,
"end_positions": tf.int64,
"cls_index": tf.int64,
"p_mask": tf.int32,
"is_impossible": tf.int32,
},
)
train_shapes = (
{
"input_ids": tf.TensorShape([None]),
"attention_mask": tf.TensorShape([None]),
"token_type_ids": tf.TensorShape([None]),
"feature_index": tf.TensorShape([]),
"qas_id": tf.TensorShape([]),
},
{
"start_positions": tf.TensorShape([]),
"end_positions": tf.TensorShape([]),
"cls_index": tf.TensorShape([]),
"p_mask": tf.TensorShape([None]),
"is_impossible": tf.TensorShape([]),
},
)
else:
train_types = (
{"input_ids": tf.int32, "attention_mask": tf.int32, "feature_index": tf.int64, "qas_id": tf.string},
{
"start_positions": tf.int64,
"end_positions": tf.int64,
"cls_index": tf.int64,
"p_mask": tf.int32,
"is_impossible": tf.int32,
},
)
train_shapes = (
{
"input_ids": tf.TensorShape([None]),
"attention_mask": tf.TensorShape([None]),
"feature_index": tf.TensorShape([]),
"qas_id": tf.TensorShape([]),
},
{
"start_positions": tf.TensorShape([]),
"end_positions": tf.TensorShape([]),
"cls_index": tf.TensorShape([]),
"p_mask": tf.TensorShape([None]),
"is_impossible": tf.TensorShape([]),
},
)
return tf.data.Dataset.from_generator(gen, train_types, train_shapes)
else:
return features

View File

@ -20,7 +20,7 @@ import json
from dataclasses import dataclass
from typing import Optional, Union
from ...utils import is_tf_available, is_torch_available, logging
from ...utils import is_torch_available, logging
logger = logging.get_logger(__name__)
@ -82,7 +82,7 @@ class DataProcessor:
def get_example_from_tensor_dict(self, tensor_dict):
"""
Gets an example from a dict with tensorflow tensors.
Gets an example from a dict.
Args:
tensor_dict: Keys and values should match the corresponding Glue
@ -251,9 +251,7 @@ class SingleSentenceClassificationProcessor(DataProcessor):
values)
Returns:
If the `examples` input is a `tf.data.Dataset`, will return a `tf.data.Dataset` containing the
task-specific features. If the input is a list of `InputExamples`, will return a list of task-specific
`InputFeatures` which can be fed to the model.
Will return a list of task-specific `InputFeatures` which can be fed to the model.
"""
if max_length is None:
@ -315,21 +313,6 @@ class SingleSentenceClassificationProcessor(DataProcessor):
if return_tensors is None:
return features
elif return_tensors == "tf":
if not is_tf_available():
raise RuntimeError("return_tensors set to 'tf' but TensorFlow 2.0 can't be imported")
import tensorflow as tf
def gen():
for ex in features:
yield ({"input_ids": ex.input_ids, "attention_mask": ex.attention_mask}, ex.label)
dataset = tf.data.Dataset.from_generator(
gen,
({"input_ids": tf.int32, "attention_mask": tf.int32}, tf.int64),
({"input_ids": tf.TensorShape([None]), "attention_mask": tf.TensorShape([None])}, tf.TensorShape([])),
)
return dataset
elif return_tensors == "pt":
if not is_torch_available():
raise RuntimeError("return_tensors set to 'pt' but PyTorch can't be imported")
@ -346,4 +329,4 @@ class SingleSentenceClassificationProcessor(DataProcessor):
dataset = TensorDataset(all_input_ids, all_attention_mask, all_labels)
return dataset
else:
raise ValueError("return_tensors should be one of 'tf' or 'pt'")
raise ValueError("return_tensors should be `'pt'` or `None`")

View File

@ -18,7 +18,6 @@ deps = {
"faiss-cpu": "faiss-cpu",
"fastapi": "fastapi",
"filelock": "filelock",
"flax": "flax>=0.4.1,<=0.7.0",
"ftfy": "ftfy",
"fugashi": "fugashi>=1.0",
"GitPython": "GitPython<3.1.19",
@ -27,12 +26,8 @@ deps = {
"huggingface-hub": "huggingface-hub>=0.34.0,<1.0",
"importlib_metadata": "importlib_metadata",
"ipadic": "ipadic>=1.0.0,<2.0",
"jax": "jax>=0.4.1,<=0.4.13",
"jaxlib": "jaxlib>=0.4.1,<=0.4.13",
"jinja2": "jinja2>=3.1.0",
"kenlm": "kenlm",
"keras": "keras>2.9,<2.16",
"keras-nlp": "keras-nlp>=0.3.1,<0.14.0",
"kernels": "kernels>=0.6.1,<=0.9",
"librosa": "librosa",
"natten": "natten>=0.14.6,<0.15.0",
@ -75,18 +70,13 @@ deps = {
"sagemaker": "sagemaker>=2.31.0",
"schedulefree": "schedulefree>=1.2.6",
"scikit-learn": "scikit-learn",
"scipy": "scipy<1.13.0",
"scipy": "scipy",
"sentencepiece": "sentencepiece>=0.1.91,!=0.1.92",
"sigopt": "sigopt",
"starlette": "starlette",
"sudachipy": "sudachipy>=0.6.6",
"sudachidict_core": "sudachidict_core>=20220729",
"tensorboard": "tensorboard",
"tensorflow-cpu": "tensorflow-cpu>2.9,<2.16",
"tensorflow": "tensorflow>2.9,<2.16",
"tensorflow-text": "tensorflow-text<2.16",
"tensorflow-probability": "tensorflow-probability<0.24",
"tf2onnx": "tf2onnx",
"timeout-decorator": "timeout-decorator",
"tiktoken": "tiktoken",
"timm": "timm<=1.0.19,!=1.0.18",

View File

@ -20,7 +20,7 @@ from typing import Optional, Union
import numpy as np
from .feature_extraction_utils import BatchFeature, FeatureExtractionMixin
from .utils import PaddingStrategy, TensorType, is_tf_tensor, is_torch_tensor, logging, to_numpy
from .utils import PaddingStrategy, TensorType, is_torch_tensor, logging, to_numpy
logger = logging.get_logger(__name__)
@ -74,7 +74,7 @@ class SequenceFeatureExtractor(FeatureExtractionMixin):
<Tip>
If the `processed_features` passed are dictionary of numpy arrays, PyTorch tensors or TensorFlow tensors, the
If the `processed_features` passed are dictionary of numpy arrays or PyTorch tensors the
result will use the same type unless you provide a different tensor type with `return_tensors`. In the case of
PyTorch tensors, you will lose the specific device of your tensors however.
@ -87,7 +87,7 @@ class SequenceFeatureExtractor(FeatureExtractionMixin):
list[float]]]*) so you can use this method during preprocessing as well as in a PyTorch Dataloader
collate function.
Instead of `list[float]` you can have tensors (numpy arrays, PyTorch tensors or TensorFlow tensors),
Instead of `list[float]` you can have tensors (numpy arrays or PyTorch tensors),
see the note above for the return type.
padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `True`):
Select a strategy to pad the returned sequences (according to the model's padding side and padding
@ -116,7 +116,6 @@ class SequenceFeatureExtractor(FeatureExtractionMixin):
return_tensors (`str` or [`~utils.TensorType`], *optional*):
If set, will return tensors instead of list of python integers. Acceptable values are:
- `'tf'`: Return TensorFlow `tf.constant` objects.
- `'pt'`: Return PyTorch `torch.Tensor` objects.
- `'np'`: Return Numpy `np.ndarray` objects.
"""
@ -145,7 +144,7 @@ class SequenceFeatureExtractor(FeatureExtractionMixin):
processed_features["attention_mask"] = []
return processed_features
# If we have PyTorch/TF tensors or lists as inputs, we cast them as Numpy arrays
# If we have PyTorch tensors or lists as inputs, we cast them as Numpy arrays
# and rebuild them afterwards if no return_tensors is specified
# Note that we lose the specific device the tensor may be on for PyTorch
@ -159,16 +158,14 @@ class SequenceFeatureExtractor(FeatureExtractionMixin):
first_element = required_input[index][0]
if return_tensors is None:
if is_tf_tensor(first_element):
return_tensors = "tf"
elif is_torch_tensor(first_element):
if is_torch_tensor(first_element):
return_tensors = "pt"
elif isinstance(first_element, (int, float, list, tuple, np.ndarray)):
return_tensors = "np"
else:
raise ValueError(
f"type of {first_element} unknown: {type(first_element)}. "
"Should be one of a python, numpy, pytorch or tensorflow object."
"Should be one of a python, numpy, or pytorch object."
)
for key, value in processed_features.items():

View File

@ -32,12 +32,9 @@ from .utils import (
TensorType,
copy_func,
download_url,
is_flax_available,
is_jax_tensor,
is_numpy_array,
is_offline_mode,
is_remote_url,
is_tf_available,
is_torch_available,
is_torch_device,
is_torch_dtype,
@ -71,7 +68,7 @@ class BatchFeature(UserDict):
Dictionary of lists/arrays/tensors returned by the __call__/pad methods ('input_values', 'attention_mask',
etc.).
tensor_type (`Union[None, str, TensorType]`, *optional*):
You can give a tensor_type here to convert the lists of integers in PyTorch/TensorFlow/Numpy Tensors at
You can give a tensor_type here to convert the lists of integers in PyTorch/Numpy Tensors at
initialization.
"""
@ -110,21 +107,7 @@ class BatchFeature(UserDict):
if not isinstance(tensor_type, TensorType):
tensor_type = TensorType(tensor_type)
# Get a function reference for the correct framework
if tensor_type == TensorType.TENSORFLOW:
logger.warning_once(
"TensorFlow and JAX classes are deprecated and will be removed in Transformers v5. We "
"recommend migrating to PyTorch classes or pinning your version of Transformers."
)
if not is_tf_available():
raise ImportError(
"Unable to convert output to TensorFlow tensors format, TensorFlow is not installed."
)
import tensorflow as tf
as_tensor = tf.constant
is_tensor = tf.is_tensor
elif tensor_type == TensorType.PYTORCH:
if tensor_type == TensorType.PYTORCH:
if not is_torch_available():
raise ImportError("Unable to convert output to PyTorch tensors format, PyTorch is not installed.")
import torch # noqa
@ -145,17 +128,6 @@ class BatchFeature(UserDict):
return torch.tensor(value)
is_tensor = torch.is_tensor
elif tensor_type == TensorType.JAX:
logger.warning_once(
"TensorFlow and JAX classes are deprecated and will be removed in Transformers v5. We "
"recommend migrating to PyTorch classes or pinning your version of Transformers."
)
if not is_flax_available():
raise ImportError("Unable to convert output to JAX tensors format, JAX is not installed.")
import jax.numpy as jnp # noqa: F811
as_tensor = jnp.array
is_tensor = is_jax_tensor
else:
def as_tensor(value, dtype=None):

View File

@ -31,7 +31,6 @@ from .utils import (
ENV_VARS_TRUE_AND_AUTO_VALUES,
ENV_VARS_TRUE_VALUES,
FEATURE_EXTRACTOR_NAME,
FLAX_WEIGHTS_NAME,
HF_MODULES_CACHE,
HUGGINGFACE_CO_PREFIX,
HUGGINGFACE_CO_RESOLVE_ENDPOINT,
@ -42,14 +41,9 @@ from .utils import (
S3_BUCKET_PREFIX,
SENTENCEPIECE_UNDERLINE,
SPIECE_UNDERLINE,
TF2_WEIGHTS_NAME,
TF_WEIGHTS_NAME,
TORCH_FX_REQUIRED_VERSION,
TRANSFORMERS_CACHE,
TRANSFORMERS_DYNAMIC_MODULE_NAME,
USE_JAX,
USE_TF,
USE_TORCH,
WEIGHTS_INDEX_NAME,
WEIGHTS_NAME,
ContextManagers,
@ -79,7 +73,6 @@ from .utils import (
is_datasets_available,
is_detectron2_available,
is_faiss_available,
is_flax_available,
is_ftfy_available,
is_g2p_en_available,
is_in_notebook,
@ -106,9 +99,6 @@ from .utils import (
is_spacy_available,
is_speech_available,
is_tensor,
is_tensorflow_probability_available,
is_tf2onnx_available,
is_tf_available,
is_timm_available,
is_tokenizers_available,
is_torch_available,

View File

@ -14,7 +14,7 @@
from typing import TYPE_CHECKING
from ..utils import OptionalDependencyNotAvailable, _LazyModule, is_flax_available, is_tf_available, is_torch_available
from ..utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available
_import_structure = {
@ -123,71 +123,6 @@ else:
"SynthIDTextWatermarkDetector",
]
try:
if not is_tf_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["tf_logits_process"] = [
"TFForcedBOSTokenLogitsProcessor",
"TFForcedEOSTokenLogitsProcessor",
"TFForceTokensLogitsProcessor",
"TFLogitsProcessor",
"TFLogitsProcessorList",
"TFLogitsWarper",
"TFMinLengthLogitsProcessor",
"TFNoBadWordsLogitsProcessor",
"TFNoRepeatNGramLogitsProcessor",
"TFRepetitionPenaltyLogitsProcessor",
"TFSuppressTokensAtBeginLogitsProcessor",
"TFSuppressTokensLogitsProcessor",
"TFTemperatureLogitsWarper",
"TFTopKLogitsWarper",
"TFTopPLogitsWarper",
]
_import_structure["tf_utils"] = [
"TFGenerationMixin",
"TFGreedySearchDecoderOnlyOutput",
"TFGreedySearchEncoderDecoderOutput",
"TFSampleEncoderDecoderOutput",
"TFSampleDecoderOnlyOutput",
"TFBeamSearchEncoderDecoderOutput",
"TFBeamSearchDecoderOnlyOutput",
"TFBeamSampleEncoderDecoderOutput",
"TFBeamSampleDecoderOnlyOutput",
"TFContrastiveSearchEncoderDecoderOutput",
"TFContrastiveSearchDecoderOnlyOutput",
]
try:
if not is_flax_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["flax_logits_process"] = [
"FlaxForcedBOSTokenLogitsProcessor",
"FlaxForcedEOSTokenLogitsProcessor",
"FlaxForceTokensLogitsProcessor",
"FlaxLogitsProcessor",
"FlaxLogitsProcessorList",
"FlaxLogitsWarper",
"FlaxMinLengthLogitsProcessor",
"FlaxSuppressTokensAtBeginLogitsProcessor",
"FlaxSuppressTokensLogitsProcessor",
"FlaxTemperatureLogitsWarper",
"FlaxTopKLogitsWarper",
"FlaxTopPLogitsWarper",
"FlaxWhisperTimeStampLogitsProcessor",
"FlaxNoRepeatNGramLogitsProcessor",
]
_import_structure["flax_utils"] = [
"FlaxGenerationMixin",
"FlaxGreedySearchOutput",
"FlaxSampleOutput",
"FlaxBeamSearchOutput",
]
if TYPE_CHECKING:
from .configuration_utils import (
@ -283,66 +218,6 @@ if TYPE_CHECKING:
WatermarkDetectorOutput,
)
try:
if not is_tf_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .tf_logits_process import (
TFForcedBOSTokenLogitsProcessor,
TFForcedEOSTokenLogitsProcessor,
TFForceTokensLogitsProcessor,
TFLogitsProcessor,
TFLogitsProcessorList,
TFLogitsWarper,
TFMinLengthLogitsProcessor,
TFNoBadWordsLogitsProcessor,
TFNoRepeatNGramLogitsProcessor,
TFRepetitionPenaltyLogitsProcessor,
TFSuppressTokensAtBeginLogitsProcessor,
TFSuppressTokensLogitsProcessor,
TFTemperatureLogitsWarper,
TFTopKLogitsWarper,
TFTopPLogitsWarper,
)
from .tf_utils import (
TFBeamSampleDecoderOnlyOutput,
TFBeamSampleEncoderDecoderOutput,
TFBeamSearchDecoderOnlyOutput,
TFBeamSearchEncoderDecoderOutput,
TFContrastiveSearchDecoderOnlyOutput,
TFContrastiveSearchEncoderDecoderOutput,
TFGenerationMixin,
TFGreedySearchDecoderOnlyOutput,
TFGreedySearchEncoderDecoderOutput,
TFSampleDecoderOnlyOutput,
TFSampleEncoderDecoderOutput,
)
try:
if not is_flax_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .flax_logits_process import (
FlaxForcedBOSTokenLogitsProcessor,
FlaxForcedEOSTokenLogitsProcessor,
FlaxForceTokensLogitsProcessor,
FlaxLogitsProcessor,
FlaxLogitsProcessorList,
FlaxLogitsWarper,
FlaxMinLengthLogitsProcessor,
FlaxNoRepeatNGramLogitsProcessor,
FlaxSuppressTokensAtBeginLogitsProcessor,
FlaxSuppressTokensLogitsProcessor,
FlaxTemperatureLogitsWarper,
FlaxTopKLogitsWarper,
FlaxTopPLogitsWarper,
FlaxWhisperTimeStampLogitsProcessor,
)
from .flax_utils import FlaxBeamSearchOutput, FlaxGenerationMixin, FlaxGreedySearchOutput, FlaxSampleOutput
else:
import sys

View File

@ -1,544 +0,0 @@
# coding=utf-8
# Copyright 2021 The HuggingFace Inc. team
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import inspect
import jax
import jax.lax as lax
import jax.numpy as jnp
from jax.experimental import sparse
from ..utils import add_start_docstrings
from ..utils.logging import get_logger
logger = get_logger(__name__)
LOGITS_PROCESSOR_INPUTS_DOCSTRING = r"""
Args:
input_ids (`jnp.ndarray` of shape `(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary.
Indices can be obtained using [`PreTrainedTokenizer`]. See [`PreTrainedTokenizer.encode`] and
[`PreTrainedTokenizer.__call__`] for details.
[What are input IDs?](../glossary#input-ids)
scores (`jnp.ndarray` of shape `(batch_size, config.vocab_size)`):
Prediction scores of a language modeling head. These can be logits for each vocabulary when not using beam
search or log softmax for each vocabulary token when using beam search
kwargs (`dict[str, Any]`, *optional*):
Additional logits processor specific kwargs.
Return:
`jnp.ndarray` of shape `(batch_size, config.vocab_size)`: The processed prediction scores.
"""
class FlaxLogitsProcessor:
"""Abstract base class for all logit processors that can be applied during generation."""
@add_start_docstrings(LOGITS_PROCESSOR_INPUTS_DOCSTRING)
def __call__(self, input_ids: jnp.ndarray, scores: jnp.ndarray) -> jnp.ndarray:
"""Flax method for processing logits."""
raise NotImplementedError(
f"{self.__class__} is an abstract class. Only classes inheriting this class can be called."
)
class FlaxLogitsWarper:
"""Abstract base class for all logit warpers that can be applied during generation with multinomial sampling."""
@add_start_docstrings(LOGITS_PROCESSOR_INPUTS_DOCSTRING)
def __call__(self, input_ids: jnp.ndarray, scores: jnp.ndarray) -> jnp.ndarray:
"""Flax method for warping logits."""
raise NotImplementedError(
f"{self.__class__} is an abstract class. Only classes inheriting this class can be called."
)
class FlaxLogitsProcessorList(list):
"""
This class can be used to create a list of [`FlaxLogitsProcessor`] or [`FlaxLogitsWarper`] to subsequently process
a `scores` input tensor. This class inherits from list and adds a specific *__call__* method to apply each
[`FlaxLogitsProcessor`] or [`FlaxLogitsWarper`] to the inputs.
"""
@add_start_docstrings(LOGITS_PROCESSOR_INPUTS_DOCSTRING)
def __call__(self, input_ids: jnp.ndarray, scores: jnp.ndarray, cur_len: int, **kwargs) -> jnp.ndarray:
for processor in self:
function_args = inspect.signature(processor.__call__).parameters
if len(function_args) > 3:
if not all(arg in kwargs for arg in list(function_args.keys())[2:]):
raise ValueError(
f"Make sure that all the required parameters: {list(function_args.keys())} for "
f"{processor.__class__} are passed to the logits processor."
)
scores = processor(input_ids, scores, cur_len, **kwargs)
else:
scores = processor(input_ids, scores, cur_len)
return scores
class FlaxTemperatureLogitsWarper(FlaxLogitsWarper):
r"""
[`FlaxLogitsWarper`] for temperature (exponential scaling output probability distribution).
Args:
temperature (`float`):
The value used to module the logits distribution.
"""
def __init__(self, temperature: float):
if not isinstance(temperature, float) or not (temperature > 0):
raise ValueError(f"`temperature` has to be a strictly positive float, but is {temperature}")
self.temperature = temperature
def __call__(self, input_ids: jnp.ndarray, scores: jnp.ndarray, cur_len: int) -> jnp.ndarray:
scores = scores / self.temperature
return scores
class FlaxTopPLogitsWarper(FlaxLogitsWarper):
"""
[`FlaxLogitsWarper`] that performs top-p, i.e. restricting to top tokens summing to prob_cut_off <= prob_cut_off.
Args:
top_p (`float`):
If set to < 1, only the smallest set of most probable tokens with probabilities that add up to `top_p` or
higher are kept for generation.
filter_value (`float`, *optional*, defaults to -inf):
All filtered values will be set to this float value.
min_tokens_to_keep (`int`, *optional*, defaults to 1):
Minimum number of tokens that cannot be filtered.
"""
def __init__(self, top_p: float, filter_value: float = -float("Inf"), min_tokens_to_keep: int = 1):
if not isinstance(top_p, float) or (top_p < 0 or top_p > 1.0):
raise ValueError(f"`top_p` has to be a float > 0 and < 1, but is {top_p}")
if not isinstance(min_tokens_to_keep, int) or (min_tokens_to_keep < 1):
raise ValueError(f"`min_tokens_to_keep` has to be a positive integer, but is {min_tokens_to_keep}")
self.top_p = top_p
self.filter_value = filter_value
self.min_tokens_to_keep = min_tokens_to_keep
def __call__(self, input_ids: jnp.ndarray, scores: jnp.ndarray, cur_len: int) -> jnp.ndarray:
topk_scores, topk_indices = lax.top_k(scores, scores.shape[-1])
mask_scores = jnp.full_like(scores, self.filter_value)
cumulative_probs = jax.nn.softmax(topk_scores, axis=-1).cumsum(axis=-1)
score_mask = cumulative_probs < self.top_p
# include the token that is higher than top_p as well
score_mask = jnp.roll(score_mask, 1)
score_mask |= score_mask.at[:, 0].set(True)
# min tokens to keep
score_mask = score_mask.at[:, : self.min_tokens_to_keep].set(True)
topk_next_scores = jnp.where(score_mask, topk_scores, mask_scores)
next_scores = jax.lax.sort_key_val(topk_indices, topk_next_scores)[-1]
return next_scores
class FlaxTopKLogitsWarper(FlaxLogitsWarper):
r"""
[`FlaxLogitsWarper`] that performs top-k, i.e. restricting to the k highest probability elements.
Args:
top_k (`int`):
The number of highest probability vocabulary tokens to keep for top-k-filtering.
filter_value (`float`, *optional*, defaults to -inf):
All filtered values will be set to this float value.
min_tokens_to_keep (`int`, *optional*, defaults to 1):
Minimum number of tokens that cannot be filtered.
"""
def __init__(self, top_k: int, filter_value: float = -float("Inf"), min_tokens_to_keep: int = 1):
if not isinstance(top_k, int) or top_k <= 0:
raise ValueError(f"`top_k` has to be a strictly positive integer, but is {top_k}")
self.top_k = max(top_k, min_tokens_to_keep)
self.filter_value = filter_value
def __call__(self, input_ids: jnp.ndarray, scores: jnp.ndarray, cur_len: int) -> jnp.ndarray:
batch_size, vocab_size = scores.shape
next_scores_flat = jnp.full(batch_size * vocab_size, self.filter_value)
topk = min(self.top_k, scores.shape[-1]) # Safety check
topk_scores, topk_indices = lax.top_k(scores, topk)
shift = jnp.broadcast_to((jnp.arange(batch_size) * vocab_size)[:, None], (batch_size, topk)).flatten()
topk_scores_flat = topk_scores.flatten()
topk_indices_flat = topk_indices.flatten() + shift
next_scores_flat = next_scores_flat.at[topk_indices_flat].set(topk_scores_flat)
next_scores = next_scores_flat.reshape(batch_size, vocab_size)
return next_scores
class FlaxForcedBOSTokenLogitsProcessor(FlaxLogitsProcessor):
r"""
[`FlaxLogitsProcessor`] that enforces the specified token as the first generated token.
Args:
bos_token_id (`int`):
The id of the token to force as the first generated token.
"""
def __init__(self, bos_token_id: int):
self.bos_token_id = bos_token_id
def __call__(self, input_ids: jnp.ndarray, scores: jnp.ndarray, cur_len: int) -> jnp.ndarray:
new_scores = jnp.full(scores.shape, -float("inf"))
apply_penalty = 1 - jnp.bool_(cur_len - 1)
scores = jnp.where(apply_penalty, new_scores.at[:, self.bos_token_id].set(0), scores)
return scores
class FlaxForcedEOSTokenLogitsProcessor(FlaxLogitsProcessor):
r"""
[`FlaxLogitsProcessor`] that enforces the specified token as the last generated token when `max_length` is reached.
Args:
max_length (`int`):
The maximum length of the sequence to be generated.
eos_token_id (`int`):
The id of the token to force as the last generated token when `max_length` is reached.
"""
def __init__(self, max_length: int, eos_token_id: int):
self.max_length = max_length
self.eos_token_id = eos_token_id
def __call__(self, input_ids: jnp.ndarray, scores: jnp.ndarray, cur_len: int) -> jnp.ndarray:
new_scores = jnp.full(scores.shape, -float("inf"))
apply_penalty = 1 - jnp.bool_(cur_len - self.max_length + 1)
scores = jnp.where(apply_penalty, new_scores.at[:, self.eos_token_id].set(0), scores)
return scores
class FlaxMinLengthLogitsProcessor(FlaxLogitsProcessor):
r"""
[`FlaxLogitsProcessor`] enforcing a min-length by setting EOS probability to 0.
Args:
min_length (`int`):
The minimum length below which the score of `eos_token_id` is set to `-float("Inf")`.
eos_token_id (`int`):
The id of the *end-of-sequence* token.
"""
def __init__(self, min_length: int, eos_token_id: int):
if not isinstance(min_length, int) or min_length < 0:
raise ValueError(f"`min_length` has to be a positive integer, but is {min_length}")
if not isinstance(eos_token_id, int) or eos_token_id < 0:
raise ValueError(f"`eos_token_id` has to be a positive integer, but is {eos_token_id}")
self.min_length = min_length
self.eos_token_id = eos_token_id
def __call__(self, input_ids: jnp.ndarray, scores: jnp.ndarray, cur_len: int) -> jnp.ndarray:
# create boolean flag to decide if min length penalty should be applied
apply_penalty = 1 - jnp.clip(cur_len - self.min_length, 0, 1)
scores = jnp.where(apply_penalty, scores.at[:, self.eos_token_id].set(-float("inf")), scores)
return scores
class FlaxSuppressTokensAtBeginLogitsProcessor(FlaxLogitsProcessor):
r"""
[`FlaxLogitsProcessor`] suppressing a list of tokens as soon as the `generate` function starts generating using
`begin_index` tokens. This should ensure that the tokens defined by `begin_suppress_tokens` are not sampled at the
beginning of the generation.
Args:
begin_suppress_tokens (`list[int]`):
Tokens to not sample.
begin_index (`int`):
Index where the tokens are suppressed.
"""
def __init__(self, begin_suppress_tokens, begin_index):
self.begin_suppress_tokens = list(begin_suppress_tokens)
self.begin_index = begin_index
def __call__(self, input_ids, scores, cur_len: int):
apply_penalty = 1 - jnp.bool_(cur_len - self.begin_index)
scores = jnp.where(apply_penalty, scores.at[:, self.begin_suppress_tokens].set(-float("inf")), scores)
return scores
class FlaxSuppressTokensLogitsProcessor(FlaxLogitsProcessor):
r"""
[`FlaxLogitsProcessor`] suppressing a list of tokens at each decoding step. The processor will set their log probs
to be `-inf` so they are not sampled.
Args:
suppress_tokens (`list`):
Tokens to not sample.
"""
def __init__(self, suppress_tokens: list):
self.suppress_tokens = list(suppress_tokens)
def __call__(self, input_ids: jnp.ndarray, scores: jnp.ndarray, cur_len: int) -> jnp.ndarray:
scores = scores.at[..., self.suppress_tokens].set(-float("inf"))
return scores
class FlaxForceTokensLogitsProcessor(FlaxLogitsProcessor):
r"""
[`FlaxLogitsProcessor`] that takes a list of pairs of integers which indicates a mapping from generation indices to
token indices that will be forced before sampling. The processor will set their log probs to 0 and all other tokens
to `-inf` so that they are sampled at their corresponding index.
Args:
force_token_map (`list`):
Map giving token ids and indices where they will be forced to be sampled.
"""
def __init__(self, force_token_map):
force_token_map = dict(force_token_map)
# Converts the dictionary of format {index: token} containing the tokens to be forced to an array, where the
# index of the array corresponds to the index of the token to be forced, for XLA compatibility.
# Indexes without forced tokens will have a negative value.
force_token_array = jnp.ones((max(force_token_map.keys()) + 1), dtype=jnp.int32) * -1
for index, token in force_token_map.items():
if token is not None:
force_token_array = force_token_array.at[index].set(token)
self.force_token_array = jnp.int32(force_token_array)
def __call__(self, input_ids: jnp.ndarray, scores: jnp.ndarray, cur_len: int) -> jnp.ndarray:
def _force_token(generation_idx):
batch_size = scores.shape[0]
current_token = self.force_token_array[generation_idx]
new_scores = jnp.ones_like(scores, dtype=scores.dtype) * -float("inf")
updates = jnp.zeros((batch_size, 1), dtype=scores.dtype)
new_scores = lax.dynamic_update_slice(new_scores, updates, (0, current_token))
return new_scores
scores = lax.cond(
cur_len >= self.force_token_array.shape[0],
# If the current length is geq than the length of force_token_array, the processor does nothing.
lambda: scores,
# Otherwise, it may force a certain token.
lambda: lax.cond(
self.force_token_array[cur_len] >= 0,
# Only valid (positive) tokens are forced
lambda: _force_token(cur_len),
# Otherwise, the processor does nothing.
lambda: scores,
),
)
return scores
class FlaxWhisperTimeStampLogitsProcessor(FlaxLogitsProcessor):
r"""
Whisper specific Processor. This processor can be used to force a list of tokens. The processor will set their log
probs to `inf` so that they are sampled at their corresponding index.
Args:
generate_config (`GenerateConfig`):
The generate config used to generate the output. The following parameters are required:
eos_token_id (`int`, *optional*, defaults to 50257):
The id of the *end-of-sequence* token.
no_timestamps_token_id (`int`, *optional*, defaults to 50363):
The id of the `"<|notimestamps|>"` token.
max_initial_timestamp_index (`int`, *optional*, defaults to 1):
Used to set the maximum value of the initial timestamp. This is used to prevent the model from
predicting timestamps that are too far in the future.
"""
def __init__(self, generate_config, model_config, decoder_input_length):
self.eos_token_id = generate_config.eos_token_id
self.no_timestamps_token_id = generate_config.no_timestamps_token_id
self.timestamp_begin = generate_config.no_timestamps_token_id + 1
self.begin_index = decoder_input_length + 1
if generate_config.is_multilingual:
# room for language token and task token
self.begin_index += 2
if hasattr(generate_config, "max_initial_timestamp_index"):
self.max_initial_timestamp_index = generate_config.max_initial_timestamp_index
else:
self.max_initial_timestamp_index = model_config.vocab_size
if self.max_initial_timestamp_index is None:
self.max_initial_timestamp_index = model_config.vocab_size
def __call__(self, input_ids, scores, cur_len):
# suppress <|notimestamps|> which is handled by without_timestamps
scores = scores.at[:, self.no_timestamps_token_id].set(-float("inf"))
def handle_pairs(input_ids_k, scores_k):
last_was_timestamp = jnp.where((cur_len - self.begin_index) >= 1, True, False)
last_was_timestamp = jnp.where(
input_ids_k[cur_len - 1] >= self.timestamp_begin,
True and last_was_timestamp,
False,
)
penultimate_was_timestamp = jnp.where((cur_len - self.begin_index) < 2, True, False)
penultimate_was_timestamp = jnp.where(
input_ids_k[cur_len - 2] >= self.timestamp_begin,
True,
penultimate_was_timestamp,
)
return jnp.where(
last_was_timestamp,
jnp.where(
penultimate_was_timestamp > 0,
scores_k.at[self.timestamp_begin :].set(-float("inf")),
scores_k.at[: self.eos_token_id].set(-float("inf")),
),
scores_k,
)
scores = jax.vmap(handle_pairs)(input_ids, scores)
apply_max_initial_timestamp = jnp.where(cur_len == self.begin_index, True, False)
apply_max_initial_timestamp = jnp.where(
self.max_initial_timestamp_index is not None,
True and apply_max_initial_timestamp,
False,
)
last_allowed = self.timestamp_begin + self.max_initial_timestamp_index
scores = jnp.where(
apply_max_initial_timestamp,
scores.at[:, last_allowed + 1 :].set(-float("inf")),
scores,
)
# if sum of probability over timestamps is above any other token, sample timestamp
logprobs = jax.nn.log_softmax(scores, axis=-1)
def handle_cumulative_probs(logprobs_k, scores_k):
timestamp_logprob = jax.nn.logsumexp(logprobs_k[self.timestamp_begin :], axis=-1)
max_text_token_logprob = jnp.max(logprobs_k[: self.timestamp_begin])
return jnp.where(
timestamp_logprob > max_text_token_logprob,
scores_k.at[: self.timestamp_begin].set(-float("inf")),
scores_k,
)
scores = jax.vmap(handle_cumulative_probs)(logprobs, scores)
return scores
class FlaxNoRepeatNGramLogitsProcessor(FlaxLogitsProcessor):
r"""
[`FlaxLogitsProcessor`] that enforces no repetition of n-grams. See
[Fairseq](https://github.com/pytorch/fairseq/blob/a07cb6f40480928c9e0548b737aadd36ee66ac76/fairseq/sequence_generator.py#L345).
Args:
ngram_size (`int`):
All ngrams of size `ngram_size` can only occur once.
"""
def __init__(self, ngram_size: int):
if not isinstance(ngram_size, int) or ngram_size <= 0:
raise ValueError(f"`ngram_size` has to be a strictly positive integer, but is {ngram_size}")
self.ngram_size = ngram_size
def get_previous_ngrams(self, input_ids: jnp.ndarray, vocab_size: int, cur_len: int):
"""
get a matrix of size (batch_size,) + (vocab_size,)*n (for n-grams) that
represent the n-grams that occurred previously.
The BCOO representation allow to store only the few non-zero entries, instead of the full (huge) matrix
"""
batch_size, seq_len = input_ids.shape
# number of n-grams in the whole sequence
seq_ngrams = seq_len - (self.ngram_size - 1)
# number of n-grams in the currently generated sequence
cur_ngrams = cur_len - (self.ngram_size - 1)
def body_fun(i, val):
b = i % batch_size
pos = i // batch_size
return val.at[i].set(
jnp.array(
[
b,
]
+ [jnp.array(input_ids)[b, pos + j] for j in range(self.ngram_size)]
)
)
shape = (batch_size * seq_ngrams, self.ngram_size + 1)
all_update_indices = jax.lax.fori_loop(
0, batch_size * cur_ngrams, body_fun, jnp.zeros(shape, dtype=input_ids.dtype)
)
# ignore the n-grams not yet generated
data = (jnp.arange(batch_size * seq_ngrams) < batch_size * cur_ngrams).astype("float32")
return sparse.BCOO((data, all_update_indices), shape=(batch_size,) + (vocab_size,) * self.ngram_size)
def get_banned_tokens_mask(self, latest_tokens: jnp.ndarray, previous_ngrams) -> jnp.ndarray:
"""
Determines which tokens must be banned given latest tokens and the previously seen
ngrams.
"""
@sparse.sparsify
@jax.vmap
def inner_fn(latest_tokens, previous_ngrams):
return previous_ngrams[tuple(latest_tokens)]
return sparse.bcoo_todense(inner_fn(latest_tokens, previous_ngrams))
def __call__(self, input_ids: jnp.ndarray, scores: jnp.ndarray, cur_len: int) -> jnp.ndarray:
def true_fn():
_, vocab_size = scores.shape
# store the previously seen n-grams
previous_ngrams = self.get_previous_ngrams(input_ids, vocab_size, cur_len)
# get the n-1 last tokens that prefix the n-gram being generated
latest_tokens = jnp.zeros((input_ids.shape[0], self.ngram_size - 1), dtype=input_ids.dtype)
latest_tokens = jax.lax.dynamic_update_slice(
latest_tokens,
jax.lax.dynamic_slice(
input_ids, (0, cur_len - (self.ngram_size - 1)), (input_ids.shape[0], (self.ngram_size - 1))
),
(0, 0),
)
# compute the banned tokens, ie all the tokens that when added to the latest tokens lead to a n-gram that was previously generated
banned_tokens_indices_mask = self.get_banned_tokens_mask(latest_tokens, previous_ngrams).astype("bool")
return jnp.where(banned_tokens_indices_mask, -float("inf"), scores)
output = jax.lax.cond((cur_len >= self.ngram_size - 1), true_fn, lambda: scores)
return output

File diff suppressed because it is too large Load Diff

View File

@ -1,600 +0,0 @@
# coding=utf-8
# Copyright 2022 The HuggingFace Inc. team
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import inspect
import numpy as np
import tensorflow as tf
from ..tf_utils import stable_softmax
from ..utils import add_start_docstrings
from ..utils.logging import get_logger
logger = get_logger(__name__)
TF_LOGITS_PROCESSOR_INPUTS_DOCSTRING = r"""
Args:
input_ids (`tf.Tensor` of shape `(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary.
Indices can be obtained using [`PreTrainedTokenizer`]. See [`PreTrainedTokenizer.encode`] and
[`PreTrainedTokenizer.__call__`] for details.
[What are input IDs?](../glossary#input-ids)
scores (`tf.Tensor` of shape `(batch_size, config.vocab_size)`):
Prediction scores of a language modeling head. These can be logits for each vocabulary when not using beam
search or log softmax for each vocabulary token when using beam search.
cur_len (`int`):
The current length of valid input sequence tokens. In the TF implementation, the input_ids' sequence length
is the maximum length generate can produce, and we need to know which of its tokens are valid.
kwargs (`dict[str, Any]`, *optional*):
Additional logits processor specific kwargs.
Return:
`tf.Tensor` of shape `(batch_size, config.vocab_size)`: The processed prediction scores.
"""
class TFLogitsProcessor:
"""Abstract base class for all logit processors that can be applied during generation."""
@add_start_docstrings(TF_LOGITS_PROCESSOR_INPUTS_DOCSTRING)
def __call__(self, input_ids: tf.Tensor, scores: tf.Tensor, cur_len: int) -> tf.Tensor:
"""TF method for processing logits."""
raise NotImplementedError(
f"{self.__class__} is an abstract class. Only classes inheriting this class can be called."
)
class TFLogitsWarper:
"""Abstract base class for all logit warpers that can be applied during generation with multinomial sampling."""
@add_start_docstrings(TF_LOGITS_PROCESSOR_INPUTS_DOCSTRING)
def __call__(self, input_ids: tf.Tensor, scores: tf.Tensor, cur_len: int) -> tf.Tensor:
"""TF method for warping logits."""
raise NotImplementedError(
f"{self.__class__} is an abstract class. Only classes inheriting this class can be called."
)
class TFLogitsProcessorList(list):
"""
This class can be used to create a list of [`TFLogitsProcessor`] to subsequently process a `scores` input tensor.
This class inherits from list and adds a specific *__call__* method to apply each [`TFLogitsProcessor`] to the
inputs.
"""
@add_start_docstrings(TF_LOGITS_PROCESSOR_INPUTS_DOCSTRING)
def __call__(self, input_ids: tf.Tensor, scores: tf.Tensor, cur_len: int, **kwargs) -> tf.Tensor:
for processor in self:
function_args = inspect.signature(processor.__call__).parameters
if len(function_args) > 3:
if not all(arg in kwargs for arg in list(function_args.keys())[2:]):
raise ValueError(
f"Make sure that all the required parameters: {list(function_args.keys())} for "
f"{processor.__class__} are passed to the logits processor."
)
scores = processor(input_ids, scores, cur_len, **kwargs)
else:
scores = processor(input_ids, scores, cur_len)
return scores
class TFTemperatureLogitsWarper(TFLogitsWarper):
r"""
[`TFLogitsWarper`] for temperature (exponential scaling output probability distribution).
Args:
temperature (`float`):
The value used to module the logits distribution.
"""
def __init__(self, temperature: float):
if not isinstance(temperature, float) or not (temperature > 0):
raise ValueError(f"`temperature` has to be a strictly positive float, but is {temperature}")
self.temperature = temperature
def __call__(self, input_ids: tf.Tensor, scores: tf.Tensor, cur_len: int) -> tf.Tensor:
scores = scores / self.temperature
return scores
class TFTopKLogitsWarper(TFLogitsWarper):
r"""
[`TFLogitsWarper`] that performs top-k, i.e. restricting to the k highest probability elements.
Args:
top_k (`int`):
The number of highest probability vocabulary tokens to keep for top-k-filtering.
filter_value (`float`, *optional*, defaults to -inf):
All filtered values will be set to this float value.
min_tokens_to_keep (`int`, *optional*, defaults to 1):
Minimum number of tokens that cannot be filtered.
"""
def __init__(self, top_k: int, filter_value: float = -float("Inf"), min_tokens_to_keep: int = 1):
if not isinstance(top_k, int) or top_k <= 0:
raise ValueError(f"`top_k` has to be a strictly positive integer, but is {top_k}")
self.top_k = max(top_k, min_tokens_to_keep)
self.filter_value = filter_value
def __call__(self, input_ids: tf.Tensor, scores: tf.Tensor, cur_len: int) -> tf.Tensor:
top_k = min(self.top_k, scores.shape[-1]) # Safety check
# Boolean mask containing all tokens with a probability less than the last token of the top-k
indices_to_remove = scores < tf.math.top_k(scores, k=top_k)[0][..., -1:]
next_scores = tf.where(indices_to_remove, self.filter_value, scores)
return next_scores
class TFTopPLogitsWarper(TFLogitsWarper):
"""
[`TFLogitsWarper`] that performs top-p, i.e. restricting to top tokens summing to <= prob_cut_off.
Args:
top_p (`float`):
If set to < 1, only the smallest set of most probable tokens with probabilities that add up to `top_p` or
higher are kept for generation.
filter_value (`float`, *optional*, defaults to -inf):
All filtered values will be set to this float value.
min_tokens_to_keep (`int`, *optional*, defaults to 1):
Minimum number of tokens that cannot be filtered.
"""
def __init__(self, top_p: float, filter_value: float = -float("Inf"), min_tokens_to_keep: int = 1):
if not isinstance(top_p, float) or (top_p < 0 or top_p > 1.0):
raise ValueError(f"`top_p` has to be a float > 0 and < 1, but is {top_p}")
if not isinstance(min_tokens_to_keep, int) or (min_tokens_to_keep < 1):
raise ValueError(f"`min_tokens_to_keep` has to be a positive integer, but is {min_tokens_to_keep}")
self.top_p = top_p
self.filter_value = filter_value
self.min_tokens_to_keep = min_tokens_to_keep
def __call__(self, input_ids: tf.Tensor, scores: tf.Tensor, cur_len: int) -> tf.Tensor:
topk_scores, topk_indices = tf.math.top_k(scores, scores.shape[-1])
mask_scores = tf.fill(scores.shape, self.filter_value)
cumulative_probs = tf.math.cumsum(stable_softmax(topk_scores, axis=-1), axis=-1)
score_mask = cumulative_probs < self.top_p
# Also include the token that is higher than top_p (the first false = shift and insert a True on the left)
score_mask = tf.concat((tf.ones([score_mask.shape[0], 1], dtype=tf.bool), score_mask[:, :-1]), axis=-1)
# Ensure min tokens to keep
score_mask = tf.concat(
(
tf.ones([score_mask.shape[0], self.min_tokens_to_keep], dtype=tf.bool),
score_mask[:, self.min_tokens_to_keep :],
),
axis=-1,
)
# Mask the values that do not fit the criteria
topk_next_scores = tf.where(score_mask, topk_scores, mask_scores)
# Undo the topk sorting: converts the 2D matrix of per-row original indices of shape (batch_size, vocab_size)
# to a 3D tensor of shape (batch_size, vocab_size, 2) containing the original score coordinate, from which we
# can scatter (i.e. `scatter_indices[row, col, :]` is a tensor containing `[row, topk_indices[row, col]]`)
scatter_rows = tf.tile(tf.expand_dims(tf.range(topk_indices.shape[0]), axis=-1), [1, topk_indices.shape[-1]])
scatter_indices = tf.stack((scatter_rows, topk_indices), axis=-1)
next_scores = tf.scatter_nd(scatter_indices, topk_next_scores, shape=topk_next_scores.shape)
return next_scores
class TFMinLengthLogitsProcessor(TFLogitsProcessor):
r"""
[`TFLogitsProcessor`] enforcing a min-length by setting EOS probability to 0.
Args:
min_length (`int`):
The minimum length below which the score of `eos_token_id` is set to `-float("Inf")`.
eos_token_id (`int`):
The id of the *end-of-sequence* token.
"""
def __init__(self, min_length: int, eos_token_id: int):
if not isinstance(min_length, int) or min_length < 0:
raise ValueError(f"`min_length` has to be a positive integer, but is {min_length}")
if not isinstance(eos_token_id, int) or eos_token_id < 0:
raise ValueError(f"`eos_token_id` has to be a positive integer, but is {eos_token_id}")
self.min_length = min_length
self.eos_token_id = eos_token_id
def _apply_eos_token_mask(self, scores: tf.Tensor) -> tf.Tensor:
eos_token_id_mask = tf.range(scores.shape[-1]) == self.eos_token_id
scores = tf.where(eos_token_id_mask, float("-inf"), scores)
return scores
def __call__(self, input_ids: tf.Tensor, scores: tf.Tensor, cur_len: int) -> tf.Tensor:
# applies eos token masking if the first argument is true
scores = tf.cond(
tf.less(cur_len, self.min_length),
lambda: self._apply_eos_token_mask(scores),
lambda: tf.identity(scores),
)
return scores
class TFRepetitionPenaltyLogitsProcessor(TFLogitsProcessor):
r"""
[`TFLogitsProcessor`] enforcing an exponential penalty on repeated sequences.
Args:
repetition_penalty (`float`):
The parameter for repetition penalty. 1.0 means no penalty. See [this
paper](https://huggingface.co/papers/1909.05858) for more details.
"""
def __init__(self, penalty: float):
if not isinstance(penalty, float) or not (penalty > 0):
raise ValueError(f"`penalty` has to be a strictly positive float, but is {penalty}")
self.penalty = penalty
def _create_score_penalties(self, input_ids: tf.Tensor, logits: tf.Tensor) -> tf.Tensor:
# We want to populate the penalties in the positions of `input_ids`. Since XLA can't handle shapes unknown
# before runtime, `tf.unique` can't be used. Therefore, we may have redundant updates, when a given row has
# the same token multiple times.
# Gathers the penalties to apply
logit_penalties = tf.gather(logits, input_ids, axis=1, batch_dims=1)
logit_penalties = tf.where(logit_penalties > 0, 1 / self.penalty, logit_penalties)
logit_penalties = tf.where(logit_penalties < 0, self.penalty, logit_penalties)
# Scatters the penalties
token_penalties = tf.ones(logits.shape)
batch_size = input_ids.shape[0]
seq_len = tf.shape(input_ids)[1] # the sequence length has dynamic size, hence the dynamic shape
indexable_prev_input_ids = tf.concat(
(
tf.expand_dims(tf.repeat(tf.range(batch_size), seq_len), axis=-1),
tf.expand_dims(tf.reshape(input_ids, [-1]), axis=-1),
),
axis=1,
)
token_penalties = tf.tensor_scatter_nd_update(
token_penalties, indices=indexable_prev_input_ids, updates=tf.reshape(logit_penalties, [-1])
)
return token_penalties
def __call__(self, input_ids: tf.Tensor, scores: tf.Tensor, cur_len: int) -> tf.Tensor:
score_penalties = self._create_score_penalties(input_ids[:, :cur_len], scores)
scores = tf.math.multiply(scores, score_penalties)
return scores
class TFNoBadWordsLogitsProcessor(TFLogitsProcessor):
"""
[`TFLogitsProcessor`] that enforces that specified sequences will never be sampled.
Args:
bad_words_ids (`list[list[int]]`):
List of list of token ids that are not allowed to be generated. In order to get the tokens of the words
that should not appear in the generated text, make sure to set `add_prefix_space=True` when initializing
the tokenizer, and use `tokenizer(bad_words, add_special_tokens=False).input_ids`. The `add_prefix_space`
argument is only supported for some slow tokenizers, as fast tokenizers' prefixing behaviours come from
`pre tokenizers`. Read more [here](https://huggingface.co/docs/tokenizers/api/pre-tokenizers).
eos_token_id (`int`):
The id of the *end-of-sequence* token.
"""
def __init__(self, bad_words_ids: list[list[int]], eos_token_id: int):
if not isinstance(bad_words_ids, list) or len(bad_words_ids) == 0:
raise ValueError(f"`bad_words_ids` has to be a non-empty list, but is {bad_words_ids}.")
if any(not isinstance(bad_word_ids, list) for bad_word_ids in bad_words_ids):
raise ValueError(f"`bad_words_ids` has to be a list of lists, but is {bad_words_ids}.")
if any(
any((not isinstance(token_id, (int, np.integer)) or token_id < 0) for token_id in bad_word_ids)
for bad_word_ids in bad_words_ids
):
raise ValueError(
f"Each list in `bad_words_ids` has to be a list of positive integers, but is {bad_words_ids}."
)
# stores the information about bad words in three tensors:
# 1. a rectangular tensor with the forbidden sequences (padded with `-1`), for full data comparisons
self.bad_word_seqs_ids = tf.ragged.constant(bad_words_ids).to_tensor(default_value=-1)
# 2. a tensor with the unpadded length of each forbidden sequence, for quick length comparisons
bad_word_seqs_len = [len(bad_words) for bad_words in bad_words_ids]
if any(word_len == 0 for word_len in bad_word_seqs_len):
raise ValueError(f"Banned words token sequences {bad_words_ids} cannot have an empty list")
self.bad_word_seqs_len = tf.convert_to_tensor(bad_word_seqs_len, dtype=tf.int32)
# 3. a tensor containing the last token for each sequence, for easy access to the tokens that may be banned
self.seq_forbidden_tokens = tf.convert_to_tensor([bad_words[-1] for bad_words in bad_words_ids])
def _calc_row_banned_bad_tokens(self, row_input_ids: tf.Tensor) -> tf.Tensor:
def _tokens_match(bad_word_seq_number):
def _len_one():
# If the bad sequence only has one token, always mask it
return tf.cond(
tf.math.equal(self.bad_word_seqs_len[bad_word_seq_number], 1),
lambda: tf.ones((), dtype=tf.bool),
_len_greater_than_cur_len,
)
def _len_greater_than_cur_len():
# Otherwise, if the bad sequence is longer than the current length they can't ever match
return tf.cond(
tf.math.greater(self.bad_word_seqs_len[bad_word_seq_number], tf.shape(row_input_ids)[0]),
lambda: tf.zeros((), dtype=tf.bool),
_match_found,
)
def _match_found():
# Finally, runs the actual comparison. Can only be called if the previous comparisons do not yield
# an answer (otherwise we get indexing exceptions)
compare_len = self.bad_word_seqs_len[bad_word_seq_number] - 1
return tf.cond(
tf.math.reduce_all(
tf.math.equal(
row_input_ids[-compare_len:], self.bad_word_seqs_ids[bad_word_seq_number, :compare_len]
)
),
lambda: tf.ones((), dtype=tf.bool),
lambda: tf.zeros((), dtype=tf.bool),
)
match = _len_one()
return match
# Compares the current row against all bad word sequences, obtaining a mask with the matches.
match_mask = tf.map_fn(_tokens_match, tf.range(self.bad_word_seqs_ids.shape[0]), fn_output_signature=tf.bool)
row_banned_tokens = self.seq_forbidden_tokens[match_mask]
return row_banned_tokens
def __call__(self, input_ids: tf.Tensor, scores: tf.Tensor, cur_len: int) -> tf.Tensor:
# We want to mask some banned tokens, at a score level. Since the banned tokens depend on the previous
# `input_ids`, they may have a different length for each row, and they may even be empty for some rows.
# To remain simple and XLA-compatible, we work on a per-row fashion.
# TODO (Joao): this function might trigger XLA retracing as `cur_len` increases. Fix it if it becomes
# a frequent choke point. (make `cur_len` a tensor?)
def _get_row_updated_score(row_inputs: tuple[tf.Tensor]) -> tf.Tensor:
row_input_ids, row_score = row_inputs
banned_tokens = self._calc_row_banned_bad_tokens(row_input_ids[:cur_len])
banned_tokens_mask = tf.scatter_nd(
indices=tf.expand_dims(banned_tokens, axis=-1),
updates=tf.ones_like(banned_tokens, dtype=tf.bool),
shape=row_score.shape,
)
row_score = tf.where(banned_tokens_mask, -float("inf"), row_score)
return row_score
scores = tf.map_fn(_get_row_updated_score, (input_ids, scores), fn_output_signature=tf.float32)
return scores
class TFNoRepeatNGramLogitsProcessor(TFLogitsProcessor):
r"""
[`TFLogitsProcessor`] that enforces no repetition of n-grams. See
[Fairseq](https://github.com/pytorch/fairseq/blob/a07cb6f40480928c9e0548b737aadd36ee66ac76/fairseq/sequence_generator.py#L345).
Args:
ngram_size (`int`):
All ngrams of size `ngram_size` can only occur once.
"""
def __init__(self, ngram_size: int):
if not isinstance(ngram_size, int) or ngram_size <= 0:
raise ValueError(f"`ngram_size` has to be a strictly positive integer, but is {ngram_size}")
self.ngram_size = ngram_size
def calc_banned_ngram_tokens(self, input_ids, num_hypos, cur_len):
# Copied from fairseq for no_repeat_ngram in beam_search
if cur_len + 1 < self.ngram_size:
# return no banned tokens if we haven't generated ngram_size tokens yet
return [[] for _ in range(num_hypos)]
generated_ngrams = [{} for _ in range(num_hypos)]
prev_input_ids = input_ids[:, :cur_len]
for idx in range(num_hypos):
gen_tokens = prev_input_ids[idx].numpy().tolist()
generated_ngram = generated_ngrams[idx]
for ngram in zip(*[gen_tokens[i:] for i in range(self.ngram_size)]):
prev_ngram_tuple = tuple(ngram[:-1])
generated_ngram[prev_ngram_tuple] = generated_ngram.get(prev_ngram_tuple, []) + [ngram[-1]]
def _get_generated_ngrams(hypo_idx):
# Before decoding the next token, prevent decoding of ngrams that have already appeared
start_idx = cur_len + 1 - self.ngram_size
ngram_idx = tuple(prev_input_ids[hypo_idx, start_idx:cur_len].numpy().tolist())
return generated_ngrams[hypo_idx].get(ngram_idx, [])
banned_tokens = [_get_generated_ngrams(hypo_idx) for hypo_idx in range(num_hypos)]
return banned_tokens
def __call__(self, input_ids: tf.Tensor, scores: tf.Tensor, cur_len: int) -> tf.Tensor:
# TODO (joao): enable XLA on this logits processor. See discussion and attempts in
# https://github.com/huggingface/transformers/pull/16974
if not tf.executing_eagerly():
raise NotImplementedError("TFNoRepeatNGramLogitsProcessor is only implemented for eager execution.")
batch_size, vocab_size = scores.shape
banned_tokens = self.calc_banned_ngram_tokens(input_ids, batch_size, cur_len)
# create banned_tokens boolean mask
banned_tokens_indices_mask = []
for banned_tokens_slice in banned_tokens:
banned_tokens_indices_mask.append([token in banned_tokens_slice for token in range(vocab_size)])
scores = tf.where(tf.convert_to_tensor(banned_tokens_indices_mask, dtype=tf.bool), -float("inf"), scores)
return scores
class TFForcedBOSTokenLogitsProcessor(TFLogitsProcessor):
r"""
[`TFLogitsProcessor`] that enforces the specified token as the first generated token.
Args:
bos_token_id (`int`):
The id of the token to force as the first generated token.
"""
def __init__(self, bos_token_id: int):
if bos_token_id < 0:
raise ValueError(f"The forced bos token id must be a non-negative integer, got {bos_token_id}")
self.bos_token_id = bos_token_id
def __call__(self, input_ids: tf.Tensor, scores: tf.Tensor, cur_len: int) -> tf.Tensor:
if cur_len == 1:
batch_size, num_tokens = scores.shape
# sets the score to 0 in the bos_token_id column
scores = tf.zeros((batch_size, 1))
# sets the score to -inf everywhere else
if self.bos_token_id > 0:
scores = tf.concat((tf.broadcast_to(-float("inf"), (batch_size, self.bos_token_id)), scores), axis=-1)
if self.bos_token_id < (num_tokens - 1):
scores = tf.concat(
(scores, tf.broadcast_to(-float("inf"), (batch_size, (num_tokens - 1) - self.bos_token_id))),
axis=-1,
)
return scores
class TFForcedEOSTokenLogitsProcessor(TFLogitsProcessor):
r"""
[`TFLogitsProcessor`] that enforces the specified token as the last generated token when `max_length` is reached.
Args:
max_length (`int`):
The maximum length of the sequence to be generated.
eos_token_id (`int`):
The id of the token to force as the last generated token when `max_length` is reached.
"""
def __init__(self, max_length: int, eos_token_id: int):
self.max_length = max_length
if eos_token_id < 0:
raise ValueError(f"The forced eos token id must be a non-negative integer, got {eos_token_id}")
self.eos_token_id = eos_token_id
def __call__(self, input_ids: tf.Tensor, scores: tf.Tensor, cur_len: int) -> tf.Tensor:
if cur_len == self.max_length - 1:
batch_size, num_tokens = scores.shape
# sets the score to 0 in the eos_token_id column
scores = tf.zeros((batch_size, 1))
# sets the score to -inf everywhere else
if self.eos_token_id > 0:
scores = tf.concat((tf.broadcast_to(-float("inf"), (batch_size, self.eos_token_id)), scores), axis=-1)
if self.eos_token_id < (num_tokens - 1):
scores = tf.concat(
(scores, tf.broadcast_to(-float("inf"), (batch_size, (num_tokens - 1) - self.eos_token_id))),
axis=-1,
)
return scores
class TFSuppressTokensAtBeginLogitsProcessor(TFLogitsProcessor):
r"""
[`TFSuppressTokensAtBeginLogitsProcessor`] suppresses a list of tokens as soon as the `generate` function starts
generating using `begin_index` tokens. This should ensure that the tokens defined by `begin_suppress_tokens` at not
sampled at the beginning of the generation.
"""
def __init__(self, begin_suppress_tokens, begin_index):
self.begin_suppress_tokens = list(begin_suppress_tokens)
self.begin_index = begin_index
def __call__(self, input_ids: tf.Tensor, scores: tf.Tensor, cur_len: int) -> tf.Tensor:
suppressed_indices = []
for token in self.begin_suppress_tokens:
if token < scores.shape[-1]: # to ensure we don't go beyond the vocab size
suppressed_indices.extend([[i, token] for i in range(scores.shape[0])])
if len(suppressed_indices) > 0:
scores = tf.cond(
tf.equal(cur_len, self.begin_index),
lambda: tf.tensor_scatter_nd_update(
scores,
indices=suppressed_indices,
updates=[-float("inf") for _ in range(scores.shape[0] * len(self.begin_suppress_tokens))],
),
lambda: scores,
)
return scores
class TFSuppressTokensLogitsProcessor(TFLogitsProcessor):
r"""This processor can be used to suppress a list of tokens. The processor will set their log probs to `-inf` so that they
are not sampled."""
def __init__(self, suppress_tokens):
self.suppress_tokens = list(suppress_tokens)
def __call__(self, input_ids: tf.Tensor, scores: tf.Tensor, cur_len: int) -> tf.Tensor:
suppressed_indices = []
for token in self.suppress_tokens:
if token < scores.shape[-1]: # to ensure we don't go beyond the vocab size
suppressed_indices.extend([[i, token] for i in range(scores.shape[0])])
if len(suppressed_indices) > 0:
scores = tf.tensor_scatter_nd_update(
scores,
indices=[[i, token] for i in range(scores.shape[0]) for token in self.suppress_tokens],
updates=[-float("inf") for _ in range(scores.shape[0] * len(self.suppress_tokens))],
)
return scores
class TFForceTokensLogitsProcessor(TFLogitsProcessor):
r"""This processor takes a list of pairs of integers which indicates a mapping from generation indices to token
indices that will be forced before sampling. The processor will set their log probs to `0` and all other tokens to
`-inf` so that they are sampled at their corresponding index."""
def __init__(self, force_token_map: list[list[int]]):
force_token_map = dict(force_token_map)
# Converts the dictionary of format {index: token} containing the tokens to be forced to an array, where the
# index of the array corresponds to the index of the token to be forced, for XLA compatibility.
# Indexes without forced tokens will have an negative value.
force_token_array = np.ones((max(force_token_map.keys()) + 1), dtype=np.int32) * -1
for index, token in force_token_map.items():
if token is not None:
force_token_array[index] = token
self.force_token_array = tf.convert_to_tensor(force_token_array, dtype=tf.int32)
def __call__(self, input_ids: tf.Tensor, scores: tf.Tensor, cur_len: int) -> tf.Tensor:
def _force_token(generation_idx):
batch_size = scores.shape[0]
current_token = self.force_token_array[generation_idx]
new_scores = tf.zeros_like(scores, dtype=scores.dtype) + tf.constant([scores.dtype.min])
indices = tf.stack((tf.range(batch_size), tf.tile([current_token], [batch_size])), axis=1)
updates = tf.zeros((batch_size,), dtype=scores.dtype)
new_scores = tf.tensor_scatter_nd_update(new_scores, indices, updates)
return new_scores
scores = tf.cond(
tf.greater_equal(cur_len, tf.shape(self.force_token_array)[0]),
# If the current length is geq than the length of force_token_array, the processor does nothing.
lambda: tf.identity(scores),
# Otherwise, it may force a certain token.
lambda: tf.cond(
tf.greater_equal(self.force_token_array[cur_len], 0),
# Only valid (positive) tokens are forced
lambda: _force_token(cur_len),
# Otherwise, the processor does nothing.
lambda: scores,
),
)
return scores

File diff suppressed because it is too large Load Diff

View File

@ -55,7 +55,7 @@ class BatchFeature(BaseBatchFeature):
data (`dict`):
Dictionary of lists/arrays/tensors returned by the __call__ method ('pixel_values', etc.).
tensor_type (`Union[None, str, TensorType]`, *optional*):
You can give a tensor_type here to convert the lists of integers in PyTorch/TensorFlow/Numpy Tensors at
You can give a tensor_type here to convert the lists of integers in PyTorch/Numpy Tensors at
initialization.
"""

View File

@ -26,10 +26,8 @@ from .image_utils import (
get_image_size,
infer_channel_dimension_format,
)
from .utils import ExplicitEnum, TensorType, is_jax_tensor, is_tf_tensor, is_torch_tensor
from .utils import ExplicitEnum, TensorType, is_torch_tensor
from .utils.import_utils import (
is_flax_available,
is_tf_available,
is_torch_available,
is_vision_available,
requires_backends,
@ -44,12 +42,6 @@ if is_vision_available():
if is_torch_available():
import torch
if is_tf_available():
import tensorflow as tf
if is_flax_available():
import jax.numpy as jnp
def to_channel_dimension_format(
image: np.ndarray,
@ -160,7 +152,7 @@ def _rescale_for_pil_conversion(image):
def to_pil_image(
image: Union[np.ndarray, "PIL.Image.Image", "torch.Tensor", "tf.Tensor", "jnp.ndarray"],
image: Union[np.ndarray, "PIL.Image.Image", "torch.Tensor"],
do_rescale: Optional[bool] = None,
image_mode: Optional[str] = None,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
@ -170,7 +162,7 @@ def to_pil_image(
needed.
Args:
image (`PIL.Image.Image` or `numpy.ndarray` or `torch.Tensor` or `tf.Tensor`):
image (`PIL.Image.Image` or `numpy.ndarray` or `torch.Tensor`):
The image to convert to the `PIL.Image` format.
do_rescale (`bool`, *optional*):
Whether or not to apply the scaling factor (to make pixel values integers between 0 and 255). Will default
@ -190,10 +182,8 @@ def to_pil_image(
return image
# Convert all tensors to numpy arrays before converting to PIL image
if is_torch_tensor(image) or is_tf_tensor(image):
if is_torch_tensor(image):
image = image.numpy()
elif is_jax_tensor(image):
image = np.array(image)
elif not isinstance(image, np.ndarray):
raise ValueError(f"Input image type not supported: {type(image)}")
@ -556,16 +546,6 @@ def _center_to_corners_format_numpy(bboxes_center: np.ndarray) -> np.ndarray:
return bboxes_corners
def _center_to_corners_format_tf(bboxes_center: "tf.Tensor") -> "tf.Tensor":
center_x, center_y, width, height = tf.unstack(bboxes_center, axis=-1)
bboxes_corners = tf.stack(
# top left x, top left y, bottom right x, bottom right y
[center_x - 0.5 * width, center_y - 0.5 * height, center_x + 0.5 * width, center_y + 0.5 * height],
axis=-1,
)
return bboxes_corners
# 2 functions below inspired by https://github.com/facebookresearch/detr/blob/master/util/box_ops.py
def center_to_corners_format(bboxes_center: TensorType) -> TensorType:
"""
@ -576,14 +556,11 @@ def center_to_corners_format(bboxes_center: TensorType) -> TensorType:
corners format: contains the coordinates for the top-left and bottom-right corners of the box
(top_left_x, top_left_y, bottom_right_x, bottom_right_y)
"""
# Function is used during model forward pass, so we use the input framework if possible, without
# converting to numpy
# Function is used during model forward pass, so we use torch if relevant, without converting to numpy
if is_torch_tensor(bboxes_center):
return _center_to_corners_format_torch(bboxes_center)
elif isinstance(bboxes_center, np.ndarray):
return _center_to_corners_format_numpy(bboxes_center)
elif is_tf_tensor(bboxes_center):
return _center_to_corners_format_tf(bboxes_center)
raise ValueError(f"Unsupported input type {type(bboxes_center)}")
@ -613,20 +590,6 @@ def _corners_to_center_format_numpy(bboxes_corners: np.ndarray) -> np.ndarray:
return bboxes_center
def _corners_to_center_format_tf(bboxes_corners: "tf.Tensor") -> "tf.Tensor":
top_left_x, top_left_y, bottom_right_x, bottom_right_y = tf.unstack(bboxes_corners, axis=-1)
bboxes_center = tf.stack(
[
(top_left_x + bottom_right_x) / 2, # center x
(top_left_y + bottom_right_y) / 2, # center y
(bottom_right_x - top_left_x), # width
(bottom_right_y - top_left_y), # height
],
axis=-1,
)
return bboxes_center
def corners_to_center_format(bboxes_corners: TensorType) -> TensorType:
"""
Converts bounding boxes from corners format to center format.
@ -641,8 +604,6 @@ def corners_to_center_format(bboxes_corners: TensorType) -> TensorType:
return _corners_to_center_format_torch(bboxes_corners)
elif isinstance(bboxes_corners, np.ndarray):
return _corners_to_center_format_numpy(bboxes_corners)
elif is_tf_tensor(bboxes_corners):
return _corners_to_center_format_tf(bboxes_corners)
raise ValueError(f"Unsupported input type {type(bboxes_corners)}")

View File

@ -24,9 +24,7 @@ import requests
from .utils import (
ExplicitEnum,
is_jax_tensor,
is_numpy_array,
is_tf_tensor,
is_torch_available,
is_torch_tensor,
is_torchvision_available,
@ -107,8 +105,6 @@ class ImageType(ExplicitEnum):
PIL = "pillow"
TORCH = "torch"
NUMPY = "numpy"
TENSORFLOW = "tensorflow"
JAX = "jax"
def get_image_type(image):
@ -118,15 +114,11 @@ def get_image_type(image):
return ImageType.TORCH
if is_numpy_array(image):
return ImageType.NUMPY
if is_tf_tensor(image):
return ImageType.TENSORFLOW
if is_jax_tensor(image):
return ImageType.JAX
raise ValueError(f"Unrecognized image type {type(image)}")
def is_valid_image(img):
return is_pil_image(img) or is_numpy_array(img) or is_torch_tensor(img) or is_tf_tensor(img) or is_jax_tensor(img)
return is_pil_image(img) or is_numpy_array(img) or is_torch_tensor(img)
def is_valid_list_of_images(images: list):
@ -205,8 +197,7 @@ def make_list_of_images(images, expected_ndims: int = 3) -> list[ImageInput]:
)
return images
raise ValueError(
"Invalid image type. Expected either PIL.Image.Image, numpy.ndarray, torch.Tensor, tf.Tensor or "
f"jax.ndarray, but got {type(images)}."
f"Invalid image type. Expected either PIL.Image.Image, numpy.ndarray, or torch.Tensor, but got {type(images)}."
)
@ -570,7 +561,6 @@ def validate_preprocess_arguments(
raise ValueError("`size` and `resample/interpolation` must be specified if `do_resize` is `True`.")
# In the future we can add a TF implementation here when we have TF models.
class ImageFeatureExtractionMixin:
"""
Mixin that contain utilities for preparing image features.

View File

@ -48,7 +48,6 @@ from ..utils import (
flatten_dict,
is_datasets_available,
is_pandas_available,
is_tf_available,
is_torch_available,
logging,
)
@ -56,8 +55,6 @@ from ..utils import (
logger = logging.get_logger(__name__)
if is_tf_available():
from .. import TFPreTrainedModel
if is_torch_available():
import torch
@ -760,12 +757,6 @@ def save_model_architecture_to_file(model: Any, output_dir: str):
with open(f"{output_dir}/model_architecture.txt", "w+") as f:
if isinstance(model, PreTrainedModel):
print(model, file=f)
elif is_tf_available() and isinstance(model, TFPreTrainedModel):
def print_to_file(s):
print(s, file=f)
model.summary(print_fn=print_to_file)
elif is_torch_available() and (
isinstance(model, (torch.nn.Module, PushToHubMixin)) and hasattr(model, "base_model")
):
@ -1225,7 +1216,7 @@ class CometCallback(TrainerCallback):
- **COMET_PROJECT_NAME** (`str`, *optional*):
Comet project name for experiments.
- **COMET_LOG_ASSETS** (`str`, *optional*, defaults to `TRUE`):
Whether or not to log training assets (tf event logs, checkpoints, etc), to Comet. Can be `TRUE`, or
Whether or not to log training assets (checkpoints, etc), to Comet. Can be `TRUE`, or
`FALSE`.
For a number of configurable items in the environment, see

View File

@ -1,413 +0,0 @@
import logging
import os
from pathlib import Path
from time import sleep
from typing import Callable, Optional, Union
import numpy as np
import tensorflow as tf
from huggingface_hub import Repository, create_repo
from packaging.version import parse
from . import IntervalStrategy, PreTrainedTokenizerBase
from .modelcard import TrainingSummary
from .modeling_tf_utils import keras
logger = logging.getLogger(__name__)
class KerasMetricCallback(keras.callbacks.Callback):
"""
Callback to compute metrics at the end of every epoch. Unlike normal Keras metrics, these do not need to be
compilable by TF. It is particularly useful for common NLP metrics like BLEU and ROUGE that require string
operations or generation loops that cannot be compiled. Predictions (or generations) will be computed on the
`eval_dataset` before being passed to the `metric_fn` in `np.ndarray` format. The `metric_fn` should compute
metrics and return a dict mapping metric names to metric values.
We provide an example of a suitable metric_fn that computes ROUGE scores for a summarization model below. Note that
this example skips some post-processing for readability and simplicity, and should probably not be used as-is!
```py
from datasets import load_metric
rouge_metric = load_metric("rouge")
def rouge_fn(predictions, labels):
decoded_predictions = tokenizer.batch_decode(predictions, skip_special_tokens=True)
decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
result = rouge_metric.compute(predictions=decoded_predictions, references=decoded_labels)
return {key: value.mid.fmeasure * 100 for key, value in result.items()}
```
The above function will return a dict containing values which will be logged like any other Keras metric:
```
{'rouge1': 37.4199, 'rouge2': 13.9768, 'rougeL': 34.361, 'rougeLsum': 35.0781
```
Args:
metric_fn (`Callable`):
Metric function provided by the user. It will be called with two arguments - `predictions` and `labels`.
These contain the model's outputs and matching labels from the dataset. It should return a dict mapping
metric names to numerical values.
eval_dataset (`tf.data.Dataset` or `dict` or `tuple` or `np.ndarray` or `tf.Tensor`):
Validation data to be used to generate predictions for the `metric_fn`.
output_cols (`list[str], *optional*):
A list of columns to be retained from the model output as the predictions. Defaults to all.
label_cols ('`list[str]`, *optional*'):
A list of columns to be retained from the input dataset as the labels. Will be autodetected if this is not
supplied.
batch_size (`int`, *optional*):
Batch size. Only used when the data is not a pre-batched `tf.data.Dataset`.
predict_with_generate (`bool`, *optional*, defaults to `False`):
Whether we should use `model.generate()` to get outputs for the model.
use_xla_generation (`bool`, *optional*, defaults to `False`):
If we're generating, whether to compile model generation with XLA. This can massively increase the speed of
generation (up to 100X speedup) but will require a new XLA compilation for each input shape. When using XLA
generation, it's a good idea to pad your inputs to the same size, or to use the `pad_to_multiple_of`
argument in your `tokenizer` or `DataCollator`, which will reduce the number of unique input shapes and
save a lot of compilation time. This option has no effect is `predict_with_generate` is `False`.
generate_kwargs (`dict`, *optional*):
Keyword arguments to pass to `model.generate()` when generating. Has no effect if `predict_with_generate`
is `False`.
"""
def __init__(
self,
metric_fn: Callable,
eval_dataset: Union[tf.data.Dataset, np.ndarray, tf.Tensor, tuple, dict],
output_cols: Optional[list[str]] = None,
label_cols: Optional[list[str]] = None,
batch_size: Optional[int] = None,
predict_with_generate: bool = False,
use_xla_generation: bool = False,
generate_kwargs: Optional[dict] = None,
):
super().__init__()
self.metric_fn = metric_fn
self.batch_size = batch_size
if not isinstance(eval_dataset, tf.data.Dataset):
if batch_size is None:
raise ValueError(
"When passing data to KerasMetricCallback that is not a pre-batched tf.data.Dataset "
"the batch_size argument must be set."
)
# Wrap a tf.data.Dataset around it
eval_dataset = tf.data.Dataset.from_tensor_slices(eval_dataset).batch(batch_size, drop_remainder=False)
self.eval_dataset = eval_dataset
self.predict_with_generate = predict_with_generate
self.output_cols = output_cols
# This next block attempts to parse out which elements of the dataset should be appended to the labels list
# that is passed to the metric_fn
if isinstance(eval_dataset.element_spec, tuple) and len(eval_dataset.element_spec) == 2:
input_spec, label_spec = eval_dataset.element_spec
else:
input_spec = eval_dataset.element_spec
label_spec = None
if label_cols is not None:
for label in label_cols:
if label not in input_spec:
raise ValueError(f"Label {label} is in label_cols but could not be found in the dataset inputs!")
self.label_cols = label_cols
self.use_keras_label = False
elif label_spec is not None:
# If the dataset inputs are split into a 2-tuple of inputs and labels,
# assume the second element is the labels
self.label_cols = None
self.use_keras_label = True
elif "labels" in input_spec:
self.label_cols = ["labels"]
self.use_keras_label = False
logging.warning("No label_cols specified for KerasMetricCallback, assuming you want the 'labels' key.")
elif "start_positions" in input_spec and "end_positions" in input_spec:
self.label_cols = ["start_positions", "end_positions"]
self.use_keras_label = False
logging.warning(
"No label_cols specified for KerasMetricCallback, assuming you want the "
"start_positions and end_positions keys."
)
else:
raise ValueError("Could not autodetect label_cols for KerasMetricCallback, please specify them!")
if parse(tf.__version__) < parse("2.7"):
logging.warning("TF versions less than 2.7 may encounter issues with KerasMetricCallback!")
self.use_xla_generation = use_xla_generation
self.generate_kwargs = {} if generate_kwargs is None else generate_kwargs
self.generation_function = None
@staticmethod
def _concatenate_batches(batches, padding_index=-100):
# If all batches are unidimensional or same length, do a simple concatenation
if batches[0].ndim == 1 or all(batch.shape[1] == batches[0].shape[1] for batch in batches):
return np.concatenate(batches, axis=0)
# Welp, they're not the same length. Let's do some padding
max_len = max([batch.shape[1] for batch in batches])
num_samples = sum([batch.shape[0] for batch in batches])
output = np.full_like(
batches[0], fill_value=padding_index, shape=[num_samples, max_len] + list(batches[0].shape[2:])
)
# i keeps track of which part of the concatenated array we're writing the next batch to
i = 0
for batch in batches:
output[i : i + len(batch), : batch.shape[1]] = batch
i += len(batch)
return output
def _postprocess_predictions_or_labels(self, inputs):
if isinstance(inputs[0], dict):
outputs = {}
for key in inputs[0]:
outputs[key] = self._concatenate_batches([batch[key] for batch in inputs])
# If it's a dict with only one key, just return the array
if len(outputs) == 1:
outputs = list(outputs.values())[0]
elif isinstance(inputs[0], (tuple, list)):
outputs = []
for input_list in zip(*inputs):
outputs.append(self._concatenate_batches(input_list))
if len(outputs) == 1:
outputs = outputs[0] # If it's a list with only one element, just return the array
elif isinstance(inputs[0], np.ndarray):
outputs = self._concatenate_batches(inputs)
elif isinstance(inputs[0], tf.Tensor):
outputs = self._concatenate_batches([tensor.numpy() for tensor in inputs])
else:
raise TypeError(f"Couldn't handle batch of type {type(inputs[0])}!")
return outputs
def on_epoch_end(self, epoch, logs=None):
if hasattr(self.model, "config"):
ignore_keys = getattr(self.model.config, "keys_to_ignore_at_inference", [])
else:
ignore_keys = []
main_input_name = None
if self.predict_with_generate:
# This dense conditional recognizes the case where we have an encoder-decoder model, but
# avoids getting tangled up when we just have a model with a layer called 'encoder'
if hasattr(self.model, "encoder") and hasattr(self.model.encoder, "main_input_name"):
main_input_name = self.model.encoder.main_input_name
else:
main_input_name = getattr(self.model, "main_input_name", "input_ids")
if self.use_xla_generation and self.generation_function is None:
def generation_function(inputs, attention_mask):
return self.model.generate(inputs, attention_mask=attention_mask, **self.generate_kwargs)
self.generation_function = tf.function(generation_function, jit_compile=True)
prediction_list = []
label_list = []
# The whole predict/generate loop is handled inside this method
for batch in self.eval_dataset:
if isinstance(batch, tuple):
batch, labels = batch
else:
labels = None
if self.predict_with_generate:
if isinstance(batch, dict):
generation_inputs = batch[main_input_name]
attention_mask = batch.get("attention_mask", None)
else:
generation_inputs = batch
attention_mask = None
if self.use_xla_generation:
predictions = self.generation_function(generation_inputs, attention_mask=attention_mask)
else:
predictions = self.model.generate(
generation_inputs, attention_mask=attention_mask, **self.generate_kwargs
)
else:
predictions = self.model.predict_on_batch(batch)
if isinstance(predictions, dict):
# This converts any dict-subclass to a regular dict
# Keras REALLY doesn't like it when we pass around a BatchEncoding or other derived class
predictions = dict(predictions)
if self.output_cols is not None:
predictions = {key: predictions[key] for key in self.output_cols}
else:
predictions = {
key: val for key, val in predictions.items() if key not in ignore_keys + ["loss"]
}
prediction_list.append(predictions)
if not self.use_keras_label:
labels = {key: batch[key].numpy() for key in self.label_cols}
elif isinstance(labels, dict):
labels = {key: array.numpy() for key, array in labels.items()}
elif isinstance(labels, (list, tuple)):
labels = [array.numpy() for array in labels]
elif isinstance(labels, tf.Tensor):
labels = labels.numpy()
else:
raise TypeError(f"Confused by labels of type {type(labels)}")
label_list.append(labels)
all_preds = self._postprocess_predictions_or_labels(prediction_list)
all_labels = self._postprocess_predictions_or_labels(label_list)
metric_output = self.metric_fn((all_preds, all_labels))
if not isinstance(metric_output, dict):
raise TypeError(
f"metric_fn should return a dict mapping metric names to values but instead returned {metric_output}"
)
# This is the critical bit - Keras passes a dict containing the loss and standard metric values for this epoch
# in the logs argument. Ordinarily, this is so the callback can read them, but in this case we write a bunch of
# new keys in there, which will then get read by the History callback and treated like any other metric value.
# I promise that I have it in writing from Chollet that this is okay.
logs.update(metric_output)
class PushToHubCallback(keras.callbacks.Callback):
"""
Callback that will save and push the model to the Hub regularly. By default, it pushes once per epoch, but this can
be changed with the `save_strategy` argument. Pushed models can be accessed like any other model on the hub, such
as with the `from_pretrained` method.
```py
from transformers.keras_callbacks import PushToHubCallback
push_to_hub_callback = PushToHubCallback(
output_dir="./model_save",
tokenizer=tokenizer,
hub_model_id="gpt5-7xlarge",
)
model.fit(train_dataset, callbacks=[push_to_hub_callback])
```
Args:
output_dir (`str`):
The output directory where the model predictions and checkpoints will be written and synced with the
repository on the Hub.
save_strategy (`str` or [`~trainer_utils.IntervalStrategy`], *optional*, defaults to `"epoch"`):
The checkpoint save strategy to adopt during training. Possible values are:
- `"no"`: Save is done at the end of training.
- `"epoch"`: Save is done at the end of each epoch.
- `"steps"`: Save is done every `save_steps`
save_steps (`int`, *optional*):
The number of steps between saves when using the "steps" `save_strategy`.
tokenizer (`PreTrainedTokenizerBase`, *optional*):
The tokenizer used by the model. If supplied, will be uploaded to the repo alongside the weights.
hub_model_id (`str`, *optional*):
The name of the repository to keep in sync with the local `output_dir`. It can be a simple model ID in
which case the model will be pushed in your namespace. Otherwise it should be the whole repository name,
for instance `"user_name/model"`, which allows you to push to an organization you are a member of with
`"organization_name/model"`.
Will default to the name of `output_dir`.
hub_token (`str`, *optional*):
The token to use to push the model to the Hub. Will default to the token in the cache folder obtained with
`hf auth login`.
checkpoint (`bool`, *optional*, defaults to `False`):
Whether to save full training checkpoints (including epoch and optimizer state) to allow training to be
resumed. Only usable when `save_strategy` is `"epoch"`.
"""
def __init__(
self,
output_dir: Union[str, Path],
save_strategy: Union[str, IntervalStrategy] = "epoch",
save_steps: Optional[int] = None,
tokenizer: Optional[PreTrainedTokenizerBase] = None,
hub_model_id: Optional[str] = None,
hub_token: Optional[str] = None,
checkpoint: bool = False,
**model_card_args,
):
super().__init__()
if checkpoint and save_strategy != "epoch":
raise ValueError("Cannot save checkpoints when save_strategy is not 'epoch'!")
if isinstance(save_strategy, str):
save_strategy = IntervalStrategy(save_strategy.lower())
self.save_strategy = save_strategy
if self.save_strategy == IntervalStrategy.STEPS and (not isinstance(save_steps, int) or save_steps <= 0):
raise ValueError("Please supply a positive integer argument for save_steps when save_strategy == 'steps'!")
self.save_steps = save_steps
output_dir = Path(output_dir)
# Create repo and retrieve repo_id
if hub_model_id is None:
hub_model_id = output_dir.absolute().name
self.hub_model_id = create_repo(repo_id=hub_model_id, exist_ok=True, token=hub_token).repo_id
self.output_dir = output_dir
self.repo = Repository(str(self.output_dir), clone_from=self.hub_model_id, token=hub_token)
self.tokenizer = tokenizer
self.last_job = None
self.checkpoint = checkpoint
self.training_history = None
self.model_card_args = model_card_args
def on_train_begin(self, logs=None):
# Although we can access model.history, we have no guarantees that the History callback will fire before this
# one, so we keep track of it here too
self.training_history = []
def on_train_batch_end(self, batch, logs=None):
if self.save_strategy == IntervalStrategy.STEPS and (batch + 1) % self.save_steps == 0:
if self.last_job is not None and not self.last_job.is_done:
return # The last upload is still running, don't start another
self.model.save_pretrained(self.output_dir)
if self.tokenizer is not None:
self.tokenizer.save_pretrained(self.output_dir)
_, self.last_job = self.repo.push_to_hub(
commit_message=f"Training in progress steps {batch}", blocking=False
)
def on_epoch_end(self, epoch, logs=None):
logs = logs.copy() # Don't accidentally write things that Keras will read later
if "epoch" not in logs:
logs["epoch"] = epoch
self.training_history.append(logs)
if self.save_strategy == IntervalStrategy.EPOCH:
if self.last_job is not None and not self.last_job.is_done:
return # The last upload is still running, don't start another
self.model.save_pretrained(self.output_dir)
if self.tokenizer is not None:
self.tokenizer.save_pretrained(self.output_dir)
if self.checkpoint:
checkpoint_dir = os.path.join(self.output_dir, "checkpoint")
self.model._save_checkpoint(checkpoint_dir, epoch)
train_summary = TrainingSummary.from_keras(
model=self.model,
model_name=self.hub_model_id,
keras_history=self.training_history,
**self.model_card_args,
)
model_card = train_summary.to_model_card()
with (self.output_dir / "README.md").open("w") as f:
f.write(model_card)
_, self.last_job = self.repo.push_to_hub(
commit_message=f"Training in progress epoch {epoch}", blocking=False
)
def on_train_end(self, logs=None):
# Makes sure the latest version of the model is uploaded
if self.last_job is not None and not self.last_job.is_done:
logging.info("Pushing the last epoch to the Hub, this may take a while...")
while not self.last_job.is_done:
sleep(1)
else:
self.model.save_pretrained(self.output_dir)
if self.tokenizer is not None:
self.tokenizer.save_pretrained(self.output_dir)
train_summary = TrainingSummary.from_keras(
model=self.model,
model_name=self.hub_model_id,
keras_history=self.training_history,
**self.model_card_args,
)
model_card = train_summary.to_model_card()
with (self.output_dir / "README.md").open("w") as f:
f.write(model_card)
self.repo.push_to_hub(commit_message="End of training", blocking=True)

View File

@ -51,7 +51,6 @@ from .utils import (
cached_file,
is_datasets_available,
is_offline_mode,
is_tf_available,
is_tokenizers_available,
is_torch_available,
logging,
@ -256,11 +255,6 @@ AUTOGENERATED_TRAINER_COMMENT = """
should probably proofread and complete it, then remove this comment. -->
"""
AUTOGENERATED_KERAS_COMMENT = """
<!-- This model card has been generated automatically according to the information Keras had access to. You should
probably proofread and complete it, then remove this comment. -->
"""
TASK_TAG_TO_NAME_MAPPING = {
"fill-mask": "Masked Language Modeling",
@ -483,8 +477,6 @@ class TrainingSummary:
# Now the model card for realsies.
if self.source == "trainer":
model_card += AUTOGENERATED_TRAINER_COMMENT
else:
model_card += AUTOGENERATED_KERAS_COMMENT
model_card += f"\n# {self.model_name}\n\n"
@ -538,10 +530,6 @@ class TrainingSummary:
import torch
model_card += f"- Pytorch {torch.__version__}\n"
elif self.source == "keras" and is_tf_available():
import tensorflow as tf
model_card += f"- TensorFlow {tf.__version__}\n"
if is_datasets_available():
import datasets
@ -631,116 +619,6 @@ class TrainingSummary:
hyperparameters=hyperparameters,
)
@classmethod
def from_keras(
cls,
model,
model_name,
keras_history=None,
language=None,
license=None,
tags=None,
finetuned_from=None,
tasks=None,
dataset_tags=None,
dataset=None,
dataset_args=None,
):
# Infer default from dataset
if dataset is not None:
if is_hf_dataset(dataset) and (dataset_tags is None or dataset_args is None):
default_tag = dataset.builder_name
# Those are not real datasets from the Hub so we exclude them.
if default_tag not in ["csv", "json", "pandas", "parquet", "text"]:
if dataset_tags is None:
dataset_tags = [default_tag]
if dataset_args is None:
dataset_args = [dataset.config_name]
if dataset is None and dataset_tags is not None:
dataset = dataset_tags
# Infer default finetuned_from
if (
finetuned_from is None
and hasattr(model.config, "_name_or_path")
and not os.path.isdir(model.config._name_or_path)
):
finetuned_from = model.config._name_or_path
# Infer default task tag:
if tasks is None:
model_class_name = model.__class__.__name__
for task, mapping in TASK_MAPPING.items():
if model_class_name in _get_mapping_values(mapping):
tasks = task
# Add `generated_from_keras_callback` to the tags
if tags is None:
tags = ["generated_from_keras_callback"]
elif isinstance(tags, str) and tags != "generated_from_keras_callback":
tags = [tags, "generated_from_keras_callback"]
elif "generated_from_keras_callback" not in tags:
tags.append("generated_from_keras_callback")
if keras_history is not None:
_, eval_lines, eval_results = parse_keras_history(keras_history)
else:
eval_lines = []
eval_results = {}
hyperparameters = extract_hyperparameters_from_keras(model)
return cls(
language=language,
license=license,
tags=tags,
model_name=model_name,
finetuned_from=finetuned_from,
tasks=tasks,
dataset_tags=dataset_tags,
dataset=dataset,
dataset_args=dataset_args,
eval_results=eval_results,
eval_lines=eval_lines,
hyperparameters=hyperparameters,
source="keras",
)
def parse_keras_history(logs):
"""
Parse the `logs` of either a `keras.History` object returned by `model.fit()` or an accumulated logs `dict`
passed to the `PushToHubCallback`. Returns lines and logs compatible with those returned by `parse_log_history`.
"""
if hasattr(logs, "history"):
# This looks like a `History` object
if not hasattr(logs, "epoch"):
# This history looks empty, return empty results
return None, [], {}
logs.history["epoch"] = logs.epoch
logs = logs.history
else:
# Training logs is a list of dicts, let's invert it to a dict of lists to match a History object
logs = {log_key: [single_dict[log_key] for single_dict in logs] for log_key in logs[0]}
lines = []
for i in range(len(logs["epoch"])):
epoch_dict = {log_key: log_value_list[i] for log_key, log_value_list in logs.items()}
values = {}
for k, v in epoch_dict.items():
if k.startswith("val_"):
k = "validation_" + k[4:]
elif k != "epoch":
k = "train_" + k
splits = k.split("_")
name = " ".join([part.capitalize() for part in splits])
values[name] = v
lines.append(values)
eval_results = lines[-1]
return logs, lines, eval_results
def parse_log_history(log_history):
"""
@ -804,19 +682,6 @@ def parse_log_history(log_history):
return train_log, lines, None
def extract_hyperparameters_from_keras(model):
from .modeling_tf_utils import keras
hyperparameters = {}
if hasattr(model, "optimizer") and model.optimizer is not None:
hyperparameters["optimizer"] = model.optimizer.get_config()
else:
hyperparameters["optimizer"] = None
hyperparameters["training_precision"] = keras.mixed_precision.global_policy().name
return hyperparameters
def _maybe_round(v, decimals=4):
if isinstance(v, float) and len(str(v).split(".")) > 1 and len(str(v).split(".")[1]) > decimals:
return f"{v:.{decimals}f}"

View File

@ -1,700 +0,0 @@
# Copyright 2021 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Optional
import flax
import jax.numpy as jnp
from .utils import ModelOutput
@flax.struct.dataclass
class FlaxBaseModelOutput(ModelOutput):
"""
Base class for model's outputs, with potential hidden states and attentions.
Args:
last_hidden_state (`jnp.ndarray` of shape `(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model.
hidden_states (`tuple(jnp.ndarray)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `jnp.ndarray` (one for the output of the embeddings + one for the output of each layer) of shape
`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (`tuple(jnp.ndarray)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `jnp.ndarray` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""
last_hidden_state: Optional[jnp.ndarray] = None
hidden_states: Optional[tuple[jnp.ndarray]] = None
attentions: Optional[tuple[jnp.ndarray]] = None
@flax.struct.dataclass
class FlaxBaseModelOutputWithNoAttention(ModelOutput):
"""
Base class for model's outputs, with potential hidden states.
Args:
last_hidden_state (`jnp.ndarray` of shape `(batch_size, num_channels, height, width)`):
Sequence of hidden-states at the output of the last layer of the model.
hidden_states (`tuple(jnp.ndarray)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `jnp.ndarray` (one for the output of the embeddings, if the model has an embedding layer, + one
for the output of each layer) of shape `(batch_size, num_channels, height, width)`. Hidden-states of the
model at the output of each layer plus the optional initial embedding outputs.
"""
last_hidden_state: Optional[jnp.ndarray] = None
hidden_states: Optional[tuple[jnp.ndarray]] = None
@flax.struct.dataclass
class FlaxBaseModelOutputWithPoolingAndNoAttention(ModelOutput):
"""
Base class for model's outputs that also contains a pooling of the last hidden states.
Args:
last_hidden_state (`jnp.ndarray` of shape `(batch_size, num_channels, height, width)`):
Sequence of hidden-states at the output of the last layer of the model.
pooler_output (`jnp.ndarray` of shape `(batch_size, hidden_size)`):
Last layer hidden-state after a pooling operation on the spatial dimensions.
hidden_states (`tuple(jnp.ndarray)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `jnp.ndarray` (one for the output of the embeddings, if the model has an embedding layer, + one
for the output of each layer) of shape `(batch_size, num_channels, height, width)`. Hidden-states of the
model at the output of each layer plus the optional initial embedding outputs.
"""
last_hidden_state: Optional[jnp.ndarray] = None
pooler_output: Optional[jnp.ndarray] = None
hidden_states: Optional[tuple[jnp.ndarray]] = None
@flax.struct.dataclass
class FlaxImageClassifierOutputWithNoAttention(ModelOutput):
"""
Base class for outputs of image classification models.
Args:
logits (`jnp.ndarray` of shape `(batch_size, config.num_labels)`):
Classification (or regression if config.num_labels==1) scores (before SoftMax).
hidden_states (`tuple(jnp.ndarray)`, *optional*, returned when `output_hidden_states=True` is passed or when
`config.output_hidden_states=True`):
Tuple of `jnp.ndarray` (one for the output of the embeddings, if the model has an embedding layer, + one
for the output of each stage) of shape `(batch_size, num_channels, height, width)`. Hidden-states (also
called feature maps) of the model at the output of each stage.
"""
logits: Optional[jnp.ndarray] = None
hidden_states: Optional[tuple[jnp.ndarray]] = None
@flax.struct.dataclass
class FlaxBaseModelOutputWithPast(ModelOutput):
"""
Base class for model's outputs, with potential hidden states and attentions.
Args:
last_hidden_state (`jnp.ndarray` of shape `(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model.
past_key_values (`dict[str, jnp.ndarray]`):
Dictionary of pre-computed hidden-states (key and values in the attention blocks) that can be used for fast
auto-regressive decoding. Pre-computed key and value hidden-states are of shape *[batch_size, max_length]*.
hidden_states (`tuple(jnp.ndarray)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `jnp.ndarray` (one for the output of the embeddings + one for the output of each layer) of shape
`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (`tuple(jnp.ndarray)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `jnp.ndarray` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""
last_hidden_state: Optional[jnp.ndarray] = None
past_key_values: Optional[dict[str, jnp.ndarray]] = None
hidden_states: Optional[tuple[jnp.ndarray]] = None
attentions: Optional[tuple[jnp.ndarray]] = None
@flax.struct.dataclass
class FlaxBaseModelOutputWithPooling(ModelOutput):
"""
Base class for model's outputs that also contains a pooling of the last hidden states.
Args:
last_hidden_state (`jnp.ndarray` of shape `(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model.
pooler_output (`jnp.ndarray` of shape `(batch_size, hidden_size)`):
Last layer hidden-state of the first token of the sequence (classification token) further processed by a
Linear layer and a Tanh activation function. The Linear layer weights are trained from the next sentence
prediction (classification) objective during pretraining.
hidden_states (`tuple(jnp.ndarray)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `jnp.ndarray` (one for the output of the embeddings + one for the output of each layer) of shape
`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (`tuple(jnp.ndarray)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `jnp.ndarray` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""
last_hidden_state: Optional[jnp.ndarray] = None
pooler_output: Optional[jnp.ndarray] = None
hidden_states: Optional[tuple[jnp.ndarray]] = None
attentions: Optional[tuple[jnp.ndarray]] = None
@flax.struct.dataclass
class FlaxBaseModelOutputWithPoolingAndCrossAttentions(ModelOutput):
"""
Base class for model's outputs that also contains a pooling of the last hidden states.
Args:
last_hidden_state (`jnp.ndarray` of shape `(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model.
pooler_output (`jnp.ndarray` of shape `(batch_size, hidden_size)`):
Last layer hidden-state of the first token of the sequence (classification token) after further processing
through the layers used for the auxiliary pretraining task. E.g. for BERT-family of models, this returns
the classification token after processing through a linear layer and a tanh activation function. The linear
layer weights are trained from the next sentence prediction (classification) objective during pretraining.
hidden_states (`tuple(jnp.ndarray)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `jnp.ndarray` (one for the output of the embeddings, if the model has an embedding layer, + one
for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
attentions (`tuple(jnp.ndarray)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `jnp.ndarray` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
cross_attentions (`tuple(jnp.ndarray)`, *optional*, returned when `output_attentions=True` and `config.add_cross_attention=True` is passed or when `config.output_attentions=True`):
Tuple of `jnp.ndarray` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.
Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
weighted average in the cross-attention heads.
past_key_values (`tuple(tuple(jnp.ndarray))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
Tuple of `tuple(jnp.ndarray)` of length `config.n_layers`, with each tuple having 2 tensors of shape
`(batch_size, num_heads, sequence_length, embed_size_per_head)`) and optionally if
`config.is_encoder_decoder=True` 2 additional tensors of shape `(batch_size, num_heads,
encoder_sequence_length, embed_size_per_head)`.
Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if
`config.is_encoder_decoder=True` in the cross-attention blocks) that can be used (see `past_key_values`
input) to speed up sequential decoding.
"""
last_hidden_state: Optional[jnp.ndarray] = None
pooler_output: Optional[jnp.ndarray] = None
hidden_states: Optional[tuple[jnp.ndarray]] = None
past_key_values: Optional[tuple[tuple[jnp.ndarray]]] = None
attentions: Optional[tuple[jnp.ndarray]] = None
cross_attentions: Optional[tuple[jnp.ndarray]] = None
@flax.struct.dataclass
class FlaxBaseModelOutputWithPastAndCrossAttentions(ModelOutput):
"""
Base class for model's outputs that may also contain a past key/values (to speed up sequential decoding).
Args:
last_hidden_state (`jnp.ndarray` of shape `(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model.
If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1,
hidden_size)` is output.
past_key_values (`tuple(tuple(jnp.ndarray))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
Tuple of `tuple(jnp.ndarray)` of length `config.n_layers`, with each tuple having 2 tensors of shape
`(batch_size, num_heads, sequence_length, embed_size_per_head)`) and optionally if
`config.is_encoder_decoder=True` 2 additional tensors of shape `(batch_size, num_heads,
encoder_sequence_length, embed_size_per_head)`.
Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if
`config.is_encoder_decoder=True` in the cross-attention blocks) that can be used (see `past_key_values`
input) to speed up sequential decoding.
hidden_states (`tuple(jnp.ndarray)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `jnp.ndarray` (one for the output of the embeddings + one for the output of each layer) of shape
`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (`tuple(jnp.ndarray)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `jnp.ndarray` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
cross_attentions (`tuple(jnp.ndarray)`, *optional*, returned when `output_attentions=True` and `config.add_cross_attention=True` is passed or when `config.output_attentions=True`):
Tuple of `jnp.ndarray` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.
Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
weighted average in the cross-attention heads.
"""
last_hidden_state: Optional[jnp.ndarray] = None
past_key_values: Optional[tuple[tuple[jnp.ndarray]]] = None
hidden_states: Optional[tuple[jnp.ndarray]] = None
attentions: Optional[tuple[jnp.ndarray]] = None
cross_attentions: Optional[tuple[jnp.ndarray]] = None
@flax.struct.dataclass
class FlaxSeq2SeqModelOutput(ModelOutput):
"""
Base class for model encoder's outputs that also contains : pre-computed hidden states that can speed up sequential
decoding.
Args:
last_hidden_state (`jnp.ndarray` of shape `(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the decoder of the model.
If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1,
hidden_size)` is output.
past_key_values (`tuple(tuple(jnp.ndarray))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
Tuple of `tuple(jnp.ndarray)` of length `config.n_layers`, with each tuple having 2 tensors of shape
`(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape
`(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
decoder_hidden_states (`tuple(jnp.ndarray)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `jnp.ndarray` (one for the output of the embeddings + one for the output of each layer) of shape
`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the decoder at the output of each layer plus the initial embedding outputs.
decoder_attentions (`tuple(jnp.ndarray)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `jnp.ndarray` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.
Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the
self-attention heads.
cross_attentions (`tuple(jnp.ndarray)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `jnp.ndarray` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.
Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
weighted average in the cross-attention heads.
encoder_last_hidden_state (`jnp.ndarray` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Sequence of hidden-states at the output of the last layer of the encoder of the model.
encoder_hidden_states (`tuple(jnp.ndarray)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `jnp.ndarray` (one for the output of the embeddings + one for the output of each layer) of shape
`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the encoder at the output of each layer plus the initial embedding outputs.
encoder_attentions (`tuple(jnp.ndarray)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `jnp.ndarray` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.
Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the
self-attention heads.
"""
last_hidden_state: Optional[jnp.ndarray] = None
past_key_values: Optional[tuple[tuple[jnp.ndarray]]] = None
decoder_hidden_states: Optional[tuple[jnp.ndarray]] = None
decoder_attentions: Optional[tuple[jnp.ndarray]] = None
cross_attentions: Optional[tuple[jnp.ndarray]] = None
encoder_last_hidden_state: Optional[jnp.ndarray] = None
encoder_hidden_states: Optional[tuple[jnp.ndarray]] = None
encoder_attentions: Optional[tuple[jnp.ndarray]] = None
@flax.struct.dataclass
class FlaxCausalLMOutputWithCrossAttentions(ModelOutput):
"""
Base class for causal language model (or autoregressive) outputs.
Args:
logits (`jnp.ndarray` of shape `(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
hidden_states (`tuple(jnp.ndarray)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `jnp.ndarray` (one for the output of the embeddings + one for the output of each layer) of shape
`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (`tuple(jnp.ndarray)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `jnp.ndarray` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
cross_attentions (`tuple(jnp.ndarray)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `jnp.ndarray` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.
Cross attentions weights after the attention softmax, used to compute the weighted average in the
cross-attention heads.
past_key_values (`tuple(tuple(jnp.ndarray))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
Tuple of `jnp.ndarray` tuples of length `config.n_layers`, with each tuple containing the cached key, value
states of the self-attention and the cross-attention layers if model is used in encoder-decoder setting.
Only relevant if `config.is_decoder = True`.
Contains pre-computed hidden-states (key and values in the attention blocks) that can be used (see
`past_key_values` input) to speed up sequential decoding.
"""
logits: Optional[jnp.ndarray] = None
past_key_values: Optional[tuple[tuple[jnp.ndarray]]] = None
hidden_states: Optional[tuple[jnp.ndarray]] = None
attentions: Optional[tuple[jnp.ndarray]] = None
cross_attentions: Optional[tuple[jnp.ndarray]] = None
@flax.struct.dataclass
class FlaxMaskedLMOutput(ModelOutput):
"""
Base class for masked language models outputs.
Args:
logits (`jnp.ndarray` of shape `(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
hidden_states (`tuple(jnp.ndarray)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `jnp.ndarray` (one for the output of the embeddings + one for the output of each layer) of shape
`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (`tuple(jnp.ndarray)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `jnp.ndarray` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""
logits: Optional[jnp.ndarray] = None
hidden_states: Optional[tuple[jnp.ndarray]] = None
attentions: Optional[tuple[jnp.ndarray]] = None
FlaxCausalLMOutput = FlaxMaskedLMOutput
@flax.struct.dataclass
class FlaxSeq2SeqLMOutput(ModelOutput):
"""
Base class for sequence-to-sequence language models outputs.
Args:
logits (`jnp.ndarray` of shape `(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
past_key_values (`tuple(tuple(jnp.ndarray))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
Tuple of `tuple(jnp.ndarray)` of length `config.n_layers`, with each tuple having 2 tensors of shape
`(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape
`(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
decoder_hidden_states (`tuple(jnp.ndarray)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `jnp.ndarray` (one for the output of the embeddings + one for the output of each layer) of shape
`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the decoder at the output of each layer plus the initial embedding outputs.
decoder_attentions (`tuple(jnp.ndarray)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `jnp.ndarray` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.
Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the
self-attention heads.
cross_attentions (`tuple(jnp.ndarray)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `jnp.ndarray` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.
Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
weighted average in the cross-attention heads.
encoder_last_hidden_state (`jnp.ndarray` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Sequence of hidden-states at the output of the last layer of the encoder of the model.
encoder_hidden_states (`tuple(jnp.ndarray)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `jnp.ndarray` (one for the output of the embeddings + one for the output of each layer) of shape
`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the encoder at the output of each layer plus the initial embedding outputs.
encoder_attentions (`tuple(jnp.ndarray)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `jnp.ndarray` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.
Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the
self-attention heads.
"""
logits: Optional[jnp.ndarray] = None
past_key_values: Optional[tuple[tuple[jnp.ndarray]]] = None
decoder_hidden_states: Optional[tuple[jnp.ndarray]] = None
decoder_attentions: Optional[tuple[jnp.ndarray]] = None
cross_attentions: Optional[tuple[jnp.ndarray]] = None
encoder_last_hidden_state: Optional[jnp.ndarray] = None
encoder_hidden_states: Optional[tuple[jnp.ndarray]] = None
encoder_attentions: Optional[tuple[jnp.ndarray]] = None
@flax.struct.dataclass
class FlaxNextSentencePredictorOutput(ModelOutput):
"""
Base class for outputs of models predicting if two sentences are consecutive or not.
Args:
logits (`jnp.ndarray` of shape `(batch_size, 2)`):
Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation
before SoftMax).
hidden_states (`tuple(jnp.ndarray)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `jnp.ndarray` (one for the output of the embeddings + one for the output of each layer) of shape
`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (`tuple(jnp.ndarray)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `jnp.ndarray` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""
logits: Optional[jnp.ndarray] = None
hidden_states: Optional[tuple[jnp.ndarray]] = None
attentions: Optional[tuple[jnp.ndarray]] = None
@flax.struct.dataclass
class FlaxSequenceClassifierOutput(ModelOutput):
"""
Base class for outputs of sentence classification models.
Args:
logits (`jnp.ndarray` of shape `(batch_size, config.num_labels)`):
Classification (or regression if config.num_labels==1) scores (before SoftMax).
hidden_states (`tuple(jnp.ndarray)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `jnp.ndarray` (one for the output of the embeddings + one for the output of each layer) of shape
`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (`tuple(jnp.ndarray)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `jnp.ndarray` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""
logits: Optional[jnp.ndarray] = None
hidden_states: Optional[tuple[jnp.ndarray]] = None
attentions: Optional[tuple[jnp.ndarray]] = None
@flax.struct.dataclass
class FlaxSeq2SeqSequenceClassifierOutput(ModelOutput):
"""
Base class for outputs of sequence-to-sequence sentence classification models.
Args:
logits (`jnp.ndarray` of shape `(batch_size, config.num_labels)`):
Classification (or regression if config.num_labels==1) scores (before SoftMax).
past_key_values (`tuple(tuple(jnp.ndarray))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
Tuple of `tuple(jnp.ndarray)` of length `config.n_layers`, with each tuple having 2 tensors of shape
`(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape
`(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
decoder_hidden_states (`tuple(jnp.ndarray)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `jnp.ndarray` (one for the output of the embeddings + one for the output of each layer) of shape
`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the decoder at the output of each layer plus the initial embedding outputs.
decoder_attentions (`tuple(jnp.ndarray)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `jnp.ndarray` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.
Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the
self-attention heads.
cross_attentions (`tuple(jnp.ndarray)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `jnp.ndarray` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.
Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
weighted average in the cross-attention heads.
encoder_last_hidden_state (`jnp.ndarray` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Sequence of hidden-states at the output of the last layer of the encoder of the model.
encoder_hidden_states (`tuple(jnp.ndarray)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `jnp.ndarray` (one for the output of the embeddings + one for the output of each layer) of shape
`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the encoder at the output of each layer plus the initial embedding outputs.
encoder_attentions (`tuple(jnp.ndarray)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `jnp.ndarray` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.
Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the
self-attention heads.
"""
logits: Optional[jnp.ndarray] = None
past_key_values: Optional[tuple[tuple[jnp.ndarray]]] = None
decoder_hidden_states: Optional[tuple[jnp.ndarray]] = None
decoder_attentions: Optional[tuple[jnp.ndarray]] = None
cross_attentions: Optional[tuple[jnp.ndarray]] = None
encoder_last_hidden_state: Optional[jnp.ndarray] = None
encoder_hidden_states: Optional[tuple[jnp.ndarray]] = None
encoder_attentions: Optional[tuple[jnp.ndarray]] = None
@flax.struct.dataclass
class FlaxMultipleChoiceModelOutput(ModelOutput):
"""
Base class for outputs of multiple choice models.
Args:
logits (`jnp.ndarray` of shape `(batch_size, num_choices)`):
*num_choices* is the second dimension of the input tensors. (see *input_ids* above).
Classification scores (before SoftMax).
hidden_states (`tuple(jnp.ndarray)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `jnp.ndarray` (one for the output of the embeddings + one for the output of each layer) of shape
`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (`tuple(jnp.ndarray)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `jnp.ndarray` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""
logits: Optional[jnp.ndarray] = None
hidden_states: Optional[tuple[jnp.ndarray]] = None
attentions: Optional[tuple[jnp.ndarray]] = None
@flax.struct.dataclass
class FlaxTokenClassifierOutput(ModelOutput):
"""
Base class for outputs of token classification models.
Args:
logits (`jnp.ndarray` of shape `(batch_size, sequence_length, config.num_labels)`):
Classification scores (before SoftMax).
hidden_states (`tuple(jnp.ndarray)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `jnp.ndarray` (one for the output of the embeddings + one for the output of each layer) of shape
`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (`tuple(jnp.ndarray)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `jnp.ndarray` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""
logits: Optional[jnp.ndarray] = None
hidden_states: Optional[tuple[jnp.ndarray]] = None
attentions: Optional[tuple[jnp.ndarray]] = None
@flax.struct.dataclass
class FlaxQuestionAnsweringModelOutput(ModelOutput):
"""
Base class for outputs of question answering models.
Args:
start_logits (`jnp.ndarray` of shape `(batch_size, sequence_length)`):
Span-start scores (before SoftMax).
end_logits (`jnp.ndarray` of shape `(batch_size, sequence_length)`):
Span-end scores (before SoftMax).
hidden_states (`tuple(jnp.ndarray)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `jnp.ndarray` (one for the output of the embeddings + one for the output of each layer) of shape
`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (`tuple(jnp.ndarray)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `jnp.ndarray` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""
start_logits: Optional[jnp.ndarray] = None
end_logits: Optional[jnp.ndarray] = None
hidden_states: Optional[tuple[jnp.ndarray]] = None
attentions: Optional[tuple[jnp.ndarray]] = None
@flax.struct.dataclass
class FlaxSeq2SeqQuestionAnsweringModelOutput(ModelOutput):
"""
Base class for outputs of sequence-to-sequence question answering models.
Args:
start_logits (`jnp.ndarray` of shape `(batch_size, sequence_length)`):
Span-start scores (before SoftMax).
end_logits (`jnp.ndarray` of shape `(batch_size, sequence_length)`):
Span-end scores (before SoftMax).
past_key_values (`tuple(tuple(jnp.ndarray))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
Tuple of `tuple(jnp.ndarray)` of length `config.n_layers`, with each tuple having 2 tensors of shape
`(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape
`(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
decoder_hidden_states (`tuple(jnp.ndarray)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `jnp.ndarray` (one for the output of the embeddings + one for the output of each layer) of shape
`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the decoder at the output of each layer plus the initial embedding outputs.
decoder_attentions (`tuple(jnp.ndarray)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `jnp.ndarray` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.
Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the
self-attention heads.
cross_attentions (`tuple(jnp.ndarray)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `jnp.ndarray` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.
Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
weighted average in the cross-attention heads.
encoder_last_hidden_state (`jnp.ndarray` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Sequence of hidden-states at the output of the last layer of the encoder of the model.
encoder_hidden_states (`tuple(jnp.ndarray)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `jnp.ndarray` (one for the output of the embeddings + one for the output of each layer) of shape
`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the encoder at the output of each layer plus the initial embedding outputs.
encoder_attentions (`tuple(jnp.ndarray)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `jnp.ndarray` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.
Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the
self-attention heads.
"""
start_logits: Optional[jnp.ndarray] = None
end_logits: Optional[jnp.ndarray] = None
past_key_values: Optional[tuple[tuple[jnp.ndarray]]] = None
decoder_hidden_states: Optional[tuple[jnp.ndarray]] = None
decoder_attentions: Optional[tuple[jnp.ndarray]] = None
cross_attentions: Optional[tuple[jnp.ndarray]] = None
encoder_last_hidden_state: Optional[jnp.ndarray] = None
encoder_hidden_states: Optional[tuple[jnp.ndarray]] = None
encoder_attentions: Optional[tuple[jnp.ndarray]] = None

View File

@ -1,491 +0,0 @@
# coding=utf-8
# Copyright 2021 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""PyTorch - Flax general utilities."""
import os
from pickle import UnpicklingError
import jax
import jax.numpy as jnp
import numpy as np
from flax.serialization import from_bytes
from flax.traverse_util import flatten_dict, unflatten_dict
import transformers
from . import is_safetensors_available, is_torch_available
from .utils import check_torch_load_is_safe, logging
if is_torch_available():
import torch
if is_safetensors_available():
from safetensors import safe_open
from safetensors.flax import load_file as safe_load_file
logger = logging.get_logger(__name__)
#####################
# PyTorch => Flax #
#####################
def load_pytorch_checkpoint_in_flax_state_dict(
flax_model, pytorch_checkpoint_path, is_sharded, allow_missing_keys=False
):
"""Load pytorch checkpoints in a flax model"""
if not is_sharded:
pt_path = os.path.abspath(pytorch_checkpoint_path)
logger.info(f"Loading PyTorch weights from {pt_path}")
if pt_path.endswith(".safetensors"):
pt_state_dict = {}
with safe_open(pt_path, framework="flax") as f:
for k in f.keys():
pt_state_dict[k] = f.get_tensor(k)
else:
try:
import torch # noqa: F401
except (ImportError, ModuleNotFoundError):
logger.error(
"Loading a PyTorch model in Flax, requires both PyTorch and Flax to be installed. Please see"
" https://pytorch.org/ and https://flax.readthedocs.io/en/latest/index.html#installation for installation"
" instructions."
)
raise
check_torch_load_is_safe()
pt_state_dict = torch.load(pt_path, map_location="cpu", weights_only=True)
logger.info(f"PyTorch checkpoint contains {sum(t.numel() for t in pt_state_dict.values()):,} parameters.")
flax_state_dict = convert_pytorch_state_dict_to_flax(pt_state_dict, flax_model)
else:
# model is sharded and pytorch_checkpoint_path already contains the list of .pt shard files
flax_state_dict = convert_pytorch_sharded_state_dict_to_flax(pytorch_checkpoint_path, flax_model)
return flax_state_dict
def rename_key_and_reshape_tensor(
pt_tuple_key: tuple[str],
pt_tensor: np.ndarray,
random_flax_state_dict: dict[str, jnp.ndarray],
model_prefix: str,
) -> tuple[tuple[str], np.ndarray]:
"""Rename PT weight names to corresponding Flax weight names and reshape tensor if necessary"""
def is_key_or_prefix_key_in_dict(key: tuple[str]) -> bool:
"""Checks if `key` of `(prefix,) + key` is in random_flax_state_dict"""
return len(set(random_flax_state_dict) & {key, (model_prefix,) + key}) > 0
# layer norm
renamed_pt_tuple_key = pt_tuple_key[:-1] + ("scale",)
if pt_tuple_key[-1] in ["weight", "gamma"] and is_key_or_prefix_key_in_dict(renamed_pt_tuple_key):
return renamed_pt_tuple_key, pt_tensor
# batch norm layer mean
renamed_pt_tuple_key = pt_tuple_key[:-1] + ("mean",)
if pt_tuple_key[-1] == "running_mean" and not is_key_or_prefix_key_in_dict(pt_tuple_key):
return renamed_pt_tuple_key, pt_tensor
# batch norm layer var
renamed_pt_tuple_key = pt_tuple_key[:-1] + ("var",)
if pt_tuple_key[-1] == "running_var" and not is_key_or_prefix_key_in_dict(pt_tuple_key):
return renamed_pt_tuple_key, pt_tensor
# embedding
renamed_pt_tuple_key = pt_tuple_key[:-1] + ("embedding",)
if pt_tuple_key[-1] == "weight" and is_key_or_prefix_key_in_dict(renamed_pt_tuple_key):
return renamed_pt_tuple_key, pt_tensor
# conv layer
renamed_pt_tuple_key = pt_tuple_key[:-1] + ("kernel",)
if pt_tuple_key[-1] == "weight" and pt_tensor.ndim == 4 and not is_key_or_prefix_key_in_dict(pt_tuple_key):
pt_tensor = pt_tensor.transpose(2, 3, 1, 0)
return renamed_pt_tuple_key, pt_tensor
# linear layer
renamed_pt_tuple_key = pt_tuple_key[:-1] + ("kernel",)
if pt_tuple_key[-1] == "weight" and not is_key_or_prefix_key_in_dict(pt_tuple_key):
pt_tensor = pt_tensor.T
return renamed_pt_tuple_key, pt_tensor
# old PyTorch layer norm weight
renamed_pt_tuple_key = pt_tuple_key[:-1] + ("weight",)
if pt_tuple_key[-1] == "gamma":
return renamed_pt_tuple_key, pt_tensor
# old PyTorch layer norm bias
renamed_pt_tuple_key = pt_tuple_key[:-1] + ("bias",)
if pt_tuple_key[-1] == "beta":
return renamed_pt_tuple_key, pt_tensor
# New `weight_norm` from https://github.com/huggingface/transformers/pull/24030
name = None
if pt_tuple_key[-3::2] == ("parametrizations", "original0"):
name = pt_tuple_key[-2] + "_g"
elif pt_tuple_key[-3::2] == ("parametrizations", "original1"):
name = pt_tuple_key[-2] + "_v"
if name is not None:
renamed_pt_tuple_key = pt_tuple_key[:-3] + (name,)
return renamed_pt_tuple_key, pt_tensor
return pt_tuple_key, pt_tensor
def convert_pytorch_state_dict_to_flax(pt_state_dict, flax_model):
# convert pytorch tensor to numpy
from_bin = is_torch_available() and isinstance(next(iter(pt_state_dict.values())), torch.Tensor)
bfloat16 = torch.bfloat16 if from_bin else "bfloat16"
weight_dtypes = {k: v.dtype for k, v in pt_state_dict.items()}
if from_bin:
for k, v in pt_state_dict.items():
# numpy currently does not support bfloat16, need to go over float32 in this case to not lose precision
if v.dtype == bfloat16:
v = v.float()
pt_state_dict[k] = v.cpu().numpy()
model_prefix = flax_model.base_model_prefix
# use params dict if the model contains batch norm layers
if "params" in flax_model.params:
flax_model_params = flax_model.params["params"]
else:
flax_model_params = flax_model.params
random_flax_state_dict = flatten_dict(flax_model_params)
# add batch_stats keys,values to dict
if "batch_stats" in flax_model.params:
flax_batch_stats = flatten_dict(flax_model.params["batch_stats"])
random_flax_state_dict.update(flax_batch_stats)
flax_state_dict = {}
load_model_with_head_into_base_model = (model_prefix not in flax_model_params) and (
model_prefix in {k.split(".")[0] for k in pt_state_dict}
)
load_base_model_into_model_with_head = (model_prefix in flax_model_params) and (
model_prefix not in {k.split(".")[0] for k in pt_state_dict}
)
# Need to change some parameters name to match Flax names
for pt_key, pt_tensor in pt_state_dict.items():
pt_tuple_key = tuple(pt_key.split("."))
is_bfloat_16 = weight_dtypes[pt_key] == bfloat16
# remove base model prefix if necessary
has_base_model_prefix = pt_tuple_key[0] == model_prefix
if load_model_with_head_into_base_model and has_base_model_prefix:
pt_tuple_key = pt_tuple_key[1:]
# Correctly rename weight parameters
flax_key, flax_tensor = rename_key_and_reshape_tensor(
pt_tuple_key, pt_tensor, random_flax_state_dict, model_prefix
)
# add model prefix if necessary
require_base_model_prefix = (model_prefix,) + flax_key in random_flax_state_dict
if load_base_model_into_model_with_head and require_base_model_prefix:
flax_key = (model_prefix,) + flax_key
if flax_key in random_flax_state_dict:
if flax_tensor.shape != random_flax_state_dict[flax_key].shape:
raise ValueError(
f"PyTorch checkpoint seems to be incorrect. Weight {pt_key} was expected to be of shape "
f"{random_flax_state_dict[flax_key].shape}, but is {flax_tensor.shape}."
)
# add batch stats if the model contains batchnorm layers
if "batch_stats" in flax_model.params:
if "mean" in flax_key[-1] or "var" in flax_key[-1]:
flax_state_dict[("batch_stats",) + flax_key] = jnp.asarray(flax_tensor)
continue
# remove num_batches_tracked key
if "num_batches_tracked" in flax_key[-1]:
flax_state_dict.pop(flax_key, None)
continue
# also add unexpected weight so that warning is thrown
flax_state_dict[("params",) + flax_key] = (
jnp.asarray(flax_tensor) if not is_bfloat_16 else jnp.asarray(flax_tensor, dtype=jnp.bfloat16)
)
else:
# also add unexpected weight so that warning is thrown
flax_state_dict[flax_key] = (
jnp.asarray(flax_tensor) if not is_bfloat_16 else jnp.asarray(flax_tensor, dtype=jnp.bfloat16)
)
return unflatten_dict(flax_state_dict)
############################
# Sharded Pytorch => Flax #
############################
def convert_pytorch_sharded_state_dict_to_flax(shard_filenames, flax_model):
import torch
# Load the index
flax_state_dict = {}
for shard_file in shard_filenames:
# load using msgpack utils
check_torch_load_is_safe()
pt_state_dict = torch.load(shard_file, weights_only=True)
weight_dtypes = {k: v.dtype for k, v in pt_state_dict.items()}
pt_state_dict = {
k: v.numpy() if v.dtype != torch.bfloat16 else v.float().numpy() for k, v in pt_state_dict.items()
}
model_prefix = flax_model.base_model_prefix
# use params dict if the model contains batch norm layers and then add batch_stats keys,values to dict
if "batch_stats" in flax_model.params:
flax_model_params = flax_model.params["params"]
random_flax_state_dict = flatten_dict(flax_model_params)
random_flax_state_dict.update(flatten_dict(flax_model.params["batch_stats"]))
else:
flax_model_params = flax_model.params
random_flax_state_dict = flatten_dict(flax_model_params)
load_model_with_head_into_base_model = (model_prefix not in flax_model_params) and (
model_prefix in {k.split(".")[0] for k in pt_state_dict}
)
load_base_model_into_model_with_head = (model_prefix in flax_model_params) and (
model_prefix not in {k.split(".")[0] for k in pt_state_dict}
)
# Need to change some parameters name to match Flax names
for pt_key, pt_tensor in pt_state_dict.items():
pt_tuple_key = tuple(pt_key.split("."))
is_bfloat_16 = weight_dtypes[pt_key] == torch.bfloat16
# remove base model prefix if necessary
has_base_model_prefix = pt_tuple_key[0] == model_prefix
if load_model_with_head_into_base_model and has_base_model_prefix:
pt_tuple_key = pt_tuple_key[1:]
# Correctly rename weight parameters
flax_key, flax_tensor = rename_key_and_reshape_tensor(
pt_tuple_key, pt_tensor, random_flax_state_dict, model_prefix
)
# add model prefix if necessary
require_base_model_prefix = (model_prefix,) + flax_key in random_flax_state_dict
if load_base_model_into_model_with_head and require_base_model_prefix:
flax_key = (model_prefix,) + flax_key
if flax_key in random_flax_state_dict:
if flax_tensor.shape != random_flax_state_dict[flax_key].shape:
raise ValueError(
f"PyTorch checkpoint seems to be incorrect. Weight {pt_key} was expected to be of shape "
f"{random_flax_state_dict[flax_key].shape}, but is {flax_tensor.shape}."
)
# add batch stats if the model contains batchnorm layers
if "batch_stats" in flax_model.params:
if "mean" in flax_key[-1]:
flax_state_dict[("batch_stats",) + flax_key] = jnp.asarray(flax_tensor)
continue
if "var" in flax_key[-1]:
flax_state_dict[("batch_stats",) + flax_key] = jnp.asarray(flax_tensor)
continue
# remove num_batches_tracked key
if "num_batches_tracked" in flax_key[-1]:
flax_state_dict.pop(flax_key, None)
continue
# also add unexpected weight so that warning is thrown
flax_state_dict[("params",) + flax_key] = (
jnp.asarray(flax_tensor) if not is_bfloat_16 else jnp.asarray(flax_tensor, dtype=jnp.bfloat16)
)
else:
# also add unexpected weight so that warning is thrown
flax_state_dict[flax_key] = (
jnp.asarray(flax_tensor) if not is_bfloat_16 else jnp.asarray(flax_tensor, dtype=jnp.bfloat16)
)
return unflatten_dict(flax_state_dict)
#####################
# Flax => PyTorch #
#####################
def load_flax_checkpoint_in_pytorch_model(model, flax_checkpoint_path):
"""Load flax checkpoints in a PyTorch model"""
flax_checkpoint_path = os.path.abspath(flax_checkpoint_path)
logger.info(f"Loading Flax weights from {flax_checkpoint_path}")
# import correct flax class
flax_cls = getattr(transformers, "Flax" + model.__class__.__name__)
# load flax weight dict
if flax_checkpoint_path.endswith(".safetensors"):
flax_state_dict = safe_load_file(flax_checkpoint_path)
flax_state_dict = unflatten_dict(flax_state_dict, sep=".")
else:
with open(flax_checkpoint_path, "rb") as state_f:
try:
flax_state_dict = from_bytes(flax_cls, state_f.read())
except UnpicklingError:
raise OSError(f"Unable to convert {flax_checkpoint_path} to Flax deserializable object. ")
return load_flax_weights_in_pytorch_model(model, flax_state_dict)
def load_flax_weights_in_pytorch_model(pt_model, flax_state):
"""Load flax checkpoints in a PyTorch model"""
try:
import torch # noqa: F401
except (ImportError, ModuleNotFoundError):
logger.error(
"Loading a Flax weights in PyTorch, requires both PyTorch and Flax to be installed. Please see"
" https://pytorch.org/ and https://flax.readthedocs.io/en/latest/index.html#installation for installation"
" instructions."
)
raise
# check if we have bf16 weights
is_type_bf16 = flatten_dict(jax.tree_util.tree_map(lambda x: x.dtype == jnp.bfloat16, flax_state)).values()
if any(is_type_bf16):
# convert all weights to fp32 if the are bf16 since torch.from_numpy can-not handle bf16
# and bf16 is not fully supported in PT yet.
logger.warning(
"Found ``bfloat16`` weights in Flax model. Casting all ``bfloat16`` weights to ``float32`` "
"before loading those in PyTorch model."
)
flax_state = jax.tree_util.tree_map(
lambda params: params.astype(np.float32) if params.dtype == jnp.bfloat16 else params, flax_state
)
flax_state_dict = flatten_dict(flax_state)
pt_model_dict = pt_model.state_dict()
load_model_with_head_into_base_model = (pt_model.base_model_prefix in flax_state) and (
pt_model.base_model_prefix not in {k.split(".")[0] for k in pt_model_dict}
)
load_base_model_into_model_with_head = (pt_model.base_model_prefix not in flax_state) and (
pt_model.base_model_prefix in {k.split(".")[0] for k in pt_model_dict}
)
# keep track of unexpected & missing keys
unexpected_keys = []
missing_keys = set(pt_model_dict.keys())
for flax_key_tuple, flax_tensor in flax_state_dict.items():
has_base_model_prefix = flax_key_tuple[0] == pt_model.base_model_prefix
require_base_model_prefix = ".".join((pt_model.base_model_prefix,) + flax_key_tuple) in pt_model_dict
# adapt flax_key to prepare for loading from/to base model only
if load_model_with_head_into_base_model and has_base_model_prefix:
flax_key_tuple = flax_key_tuple[1:]
elif load_base_model_into_model_with_head and require_base_model_prefix:
flax_key_tuple = (pt_model.base_model_prefix,) + flax_key_tuple
# rename flax weights to PyTorch format
if flax_key_tuple[-1] == "kernel" and flax_tensor.ndim == 4 and ".".join(flax_key_tuple) not in pt_model_dict:
# conv layer
flax_key_tuple = flax_key_tuple[:-1] + ("weight",)
flax_tensor = jnp.transpose(flax_tensor, (3, 2, 0, 1))
elif flax_key_tuple[-1] == "kernel" and ".".join(flax_key_tuple) not in pt_model_dict:
# linear layer
flax_key_tuple = flax_key_tuple[:-1] + ("weight",)
flax_tensor = flax_tensor.T
elif flax_key_tuple[-1] in ["scale", "embedding"]:
flax_key_tuple = flax_key_tuple[:-1] + ("weight",)
# adding batch stats from flax batch norm to pt
elif "mean" in flax_key_tuple[-1]:
flax_key_tuple = flax_key_tuple[:-1] + ("running_mean",)
elif "var" in flax_key_tuple[-1]:
flax_key_tuple = flax_key_tuple[:-1] + ("running_var",)
if "batch_stats" in flax_state:
flax_key = ".".join(flax_key_tuple[1:]) # Remove the params/batch_stats header
else:
flax_key = ".".join(flax_key_tuple)
# We also need to look at `pt_model_dict` and see if there are keys requiring further transformation.
special_pt_names = {}
# New `weight_norm` from https://github.com/huggingface/transformers/pull/24030
for key in pt_model_dict:
key_components = key.split(".")
name = None
if key_components[-3::2] == ["parametrizations", "original0"]:
name = key_components[-2] + "_g"
elif key_components[-3::2] == ["parametrizations", "original1"]:
name = key_components[-2] + "_v"
if name is not None:
key_components = key_components[:-3] + [name]
key_to_check = ".".join(key_components)
special_pt_names[key_to_check] = key
if flax_key in special_pt_names:
flax_key = special_pt_names[flax_key]
if flax_key in pt_model_dict:
if flax_tensor.shape != pt_model_dict[flax_key].shape:
raise ValueError(
f"Flax checkpoint seems to be incorrect. Weight {flax_key_tuple} was expected "
f"to be of shape {pt_model_dict[flax_key].shape}, but is {flax_tensor.shape}."
)
else:
# add weight to pytorch dict
flax_tensor = np.asarray(flax_tensor) if not isinstance(flax_tensor, np.ndarray) else flax_tensor
pt_model_dict[flax_key] = torch.from_numpy(flax_tensor)
# remove from missing keys
missing_keys.remove(flax_key)
else:
# weight is not expected by PyTorch model
unexpected_keys.append(flax_key)
pt_model.load_state_dict(pt_model_dict)
# re-transform missing_keys to list
missing_keys = list(missing_keys)
if len(unexpected_keys) > 0:
logger.warning(
"Some weights of the Flax model were not used when initializing the PyTorch model"
f" {pt_model.__class__.__name__}: {unexpected_keys}\n- This IS expected if you are initializing"
f" {pt_model.__class__.__name__} from a Flax model trained on another task or with another architecture"
" (e.g. initializing a BertForSequenceClassification model from a FlaxBertForPreTraining model).\n- This"
f" IS NOT expected if you are initializing {pt_model.__class__.__name__} from a Flax model that you expect"
" to be exactly identical (e.g. initializing a BertForSequenceClassification model from a"
" FlaxBertForSequenceClassification model)."
)
else:
logger.warning(f"All Flax model weights were used when initializing {pt_model.__class__.__name__}.\n")
if len(missing_keys) > 0:
logger.warning(
f"Some weights of {pt_model.__class__.__name__} were not initialized from the Flax model and are newly"
f" initialized: {missing_keys}\nYou should probably TRAIN this model on a down-stream task to be able to"
" use it for predictions and inference."
)
else:
logger.warning(
f"All the weights of {pt_model.__class__.__name__} were initialized from the Flax model.\n"
"If your task is similar to the task the model of the checkpoint was trained on, "
f"you can already use {pt_model.__class__.__name__} for predictions without further training."
)
return pt_model

File diff suppressed because it is too large Load Diff

View File

@ -1,990 +0,0 @@
# Copyright 2020 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import annotations
import warnings
from dataclasses import dataclass
import tensorflow as tf
from .utils import ModelOutput
@dataclass
class TFBaseModelOutput(ModelOutput):
"""
Base class for model's outputs, with potential hidden states and attentions.
Args:
last_hidden_state (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model.
hidden_states (`tuple(tf.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""
last_hidden_state: tf.Tensor | None = None
hidden_states: tuple[tf.Tensor] | None = None
attentions: tuple[tf.Tensor] | None = None
@dataclass
class TFBaseModelOutputWithNoAttention(ModelOutput):
"""
Base class for model's outputs, with potential hidden states.
Args:
last_hidden_state (`tf.Tensor` shape `(batch_size, num_channels, height, width)`):
Sequence of hidden-states at the output of the last layer of the model.
hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `tf.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + one for
the output of each layer) of shape `(batch_size, num_channels, height, width)`.
Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
"""
last_hidden_state: tf.Tensor | None = None
hidden_states: tuple[tf.Tensor, ...] | None = None
@dataclass
class TFBaseModelOutputWithPooling(ModelOutput):
"""
Base class for model's outputs that also contains a pooling of the last hidden states.
Args:
last_hidden_state (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model.
pooler_output (`tf.Tensor` of shape `(batch_size, hidden_size)`):
Last layer hidden-state of the first token of the sequence (classification token) further processed by a
Linear layer and a Tanh activation function. The Linear layer weights are trained from the next sentence
prediction (classification) objective during pretraining.
This output is usually *not* a good summary of the semantic content of the input, you're often better with
averaging or pooling the sequence of hidden-states for the whole input sequence.
hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""
last_hidden_state: tf.Tensor | None = None
pooler_output: tf.Tensor | None = None
hidden_states: tuple[tf.Tensor] | None = None
attentions: tuple[tf.Tensor] | None = None
@dataclass
class TFBaseModelOutputWithPoolingAndNoAttention(ModelOutput):
"""
Base class for model's outputs that also contains a pooling of the last hidden states.
Args:
last_hidden_state (`tf.Tensor` of shape `(batch_size, num_channels, height, width)`):
Sequence of hidden-states at the output of the last layer of the model.
pooler_output (`tf.Tensor` of shape `(batch_size, hidden_size)`):
Last layer hidden-state after a pooling operation on the spatial dimensions.
hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `tf.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + one for
the output of each layer) of shape `(batch_size, num_channels, height, width)`.
Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
"""
last_hidden_state: tf.Tensor | None = None
pooler_output: tf.Tensor | None = None
hidden_states: tuple[tf.Tensor, ...] | None = None
@dataclass
class TFBaseModelOutputWithPoolingAndCrossAttentions(ModelOutput):
"""
Base class for model's outputs that also contains a pooling of the last hidden states.
Args:
last_hidden_state (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model.
pooler_output (`tf.Tensor` of shape `(batch_size, hidden_size)`):
Last layer hidden-state of the first token of the sequence (classification token) further processed by a
Linear layer and a Tanh activation function. The Linear layer weights are trained from the next sentence
prediction (classification) objective during pretraining.
This output is usually *not* a good summary of the semantic content of the input, you're often better with
averaging or pooling the sequence of hidden-states for the whole input sequence.
past_key_values (`list[tf.Tensor]`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
List of `tf.Tensor` of length `config.n_layers`, with each tensor of shape `(2, batch_size, num_heads,
sequence_length, embed_size_per_head)`).
Contains pre-computed hidden-states (key and values in the attention blocks) that can be used (see
`past_key_values` input) to speed up sequential decoding.
hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
cross_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.
Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
weighted average in the cross-attention heads.
"""
last_hidden_state: tf.Tensor | None = None
pooler_output: tf.Tensor | None = None
past_key_values: list[tf.Tensor] | None = None
hidden_states: tuple[tf.Tensor] | None = None
attentions: tuple[tf.Tensor] | None = None
cross_attentions: tuple[tf.Tensor] | None = None
@dataclass
class TFBaseModelOutputWithPast(ModelOutput):
"""
Base class for model's outputs that may also contain a past key/values (to speed up sequential decoding).
Args:
last_hidden_state (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model.
If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1,
hidden_size)` is output.
past_key_values (`list[tf.Tensor]`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
List of `tf.Tensor` of length `config.n_layers`, with each tensor of shape `(2, batch_size, num_heads,
sequence_length, embed_size_per_head)`).
Contains pre-computed hidden-states (key and values in the attention blocks) that can be used (see
`past_key_values` input) to speed up sequential decoding.
hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""
last_hidden_state: tf.Tensor | None = None
past_key_values: list[tf.Tensor] | None = None
hidden_states: tuple[tf.Tensor] | None = None
attentions: tuple[tf.Tensor] | None = None
@dataclass
class TFBaseModelOutputWithCrossAttentions(ModelOutput):
"""
Base class for model's outputs, with potential hidden states and attentions.
Args:
last_hidden_state (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model.
hidden_states (`tuple(tf.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
cross_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.
Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
weighted average in the cross-attention heads.
"""
last_hidden_state: tf.Tensor | None = None
hidden_states: tuple[tf.Tensor] | None = None
attentions: tuple[tf.Tensor] | None = None
cross_attentions: tuple[tf.Tensor] | None = None
@dataclass
class TFBaseModelOutputWithPastAndCrossAttentions(ModelOutput):
"""
Base class for model's outputs that may also contain a past key/values (to speed up sequential decoding).
Args:
last_hidden_state (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model.
If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1,
hidden_size)` is output.
past_key_values (`list[tf.Tensor]`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
List of `tf.Tensor` of length `config.n_layers`, with each tensor of shape `(2, batch_size, num_heads,
sequence_length, embed_size_per_head)`).
Contains pre-computed hidden-states (key and values in the attention blocks) that can be used (see
`past_key_values` input) to speed up sequential decoding.
hidden_states (`tuple(tf.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
cross_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.
Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
weighted average in the cross-attention heads.
"""
last_hidden_state: tf.Tensor | None = None
past_key_values: list[tf.Tensor] | None = None
hidden_states: tuple[tf.Tensor] | None = None
attentions: tuple[tf.Tensor] | None = None
cross_attentions: tuple[tf.Tensor] | None = None
@dataclass
class TFSeq2SeqModelOutput(ModelOutput):
"""
Base class for model encoder's outputs that also contains : pre-computed hidden states that can speed up sequential
decoding.
Args:
last_hidden_state (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the decoder of the model.
If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1,
hidden_size)` is output.
past_key_values (`list[tf.Tensor]`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
List of `tf.Tensor` of length `config.n_layers`, with each tensor of shape `(2, batch_size, num_heads,
sequence_length, embed_size_per_head)`).
Contains pre-computed hidden-states (key and values in the attention blocks) of the decoder that can be
used (see `past_key_values` input) to speed up sequential decoding.
decoder_hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the decoder at the output of each layer plus the initial embedding outputs.
decoder_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.
Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the
self-attention heads.
cross_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.
Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
weighted average in the cross-attention heads.
encoder_last_hidden_state (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Sequence of hidden-states at the output of the last layer of the encoder of the model.
encoder_hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the encoder at the output of each layer plus the initial embedding outputs.
encoder_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.
Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the
self-attention heads.
"""
last_hidden_state: tf.Tensor | None = None
past_key_values: list[tf.Tensor] | None = None
decoder_hidden_states: tuple[tf.Tensor] | None = None
decoder_attentions: tuple[tf.Tensor] | None = None
cross_attentions: tuple[tf.Tensor] | None = None
encoder_last_hidden_state: tf.Tensor | None = None
encoder_hidden_states: tuple[tf.Tensor] | None = None
encoder_attentions: tuple[tf.Tensor] | None = None
@dataclass
class TFCausalLMOutput(ModelOutput):
"""
Base class for causal language model (or autoregressive) outputs.
Args:
loss (`tf.Tensor` of shape `(n,)`, *optional*, where n is the number of non-masked labels, returned when `labels` is provided):
Language modeling loss (for next-token prediction).
logits (`tf.Tensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""
loss: tf.Tensor | None = None
logits: tf.Tensor | None = None
hidden_states: tuple[tf.Tensor] | None = None
attentions: tuple[tf.Tensor] | None = None
@dataclass
class TFCausalLMOutputWithPast(ModelOutput):
"""
Base class for causal language model (or autoregressive) outputs.
Args:
loss (`tf.Tensor` of shape `(n,)`, *optional*, where n is the number of non-masked labels, returned when `labels` is provided):
Language modeling loss (for next-token prediction).
logits (`tf.Tensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
past_key_values (`list[tf.Tensor]`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
List of `tf.Tensor` of length `config.n_layers`, with each tensor of shape `(2, batch_size, num_heads,
sequence_length, embed_size_per_head)`).
Contains pre-computed hidden-states (key and values in the attention blocks) that can be used (see
`past_key_values` input) to speed up sequential decoding.
hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""
loss: tf.Tensor | None = None
logits: tf.Tensor | None = None
past_key_values: list[tf.Tensor] | None = None
hidden_states: tuple[tf.Tensor] | None = None
attentions: tuple[tf.Tensor] | None = None
@dataclass
class TFCausalLMOutputWithCrossAttentions(ModelOutput):
"""
Base class for causal language model (or autoregressive) outputs.
Args:
loss (`tf.Tensor` of shape `(n,)`, *optional*, where n is the number of non-masked labels, returned when `labels` is provided):
Language modeling loss (for next-token prediction).
logits (`tf.Tensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
cross_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.
Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
weighted average in the cross-attention heads.
past_key_values (`list[tf.Tensor]`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
List of `tf.Tensor` of length `config.n_layers`, with each tensor of shape `(2, batch_size, num_heads,
sequence_length, embed_size_per_head)`).
Contains pre-computed hidden-states (key and values in the attention blocks) that can be used (see
`past_key_values` input) to speed up sequential decoding.
"""
loss: tf.Tensor | None = None
logits: tf.Tensor | None = None
past_key_values: list[tf.Tensor] | None = None
hidden_states: tuple[tf.Tensor] | None = None
attentions: tuple[tf.Tensor] | None = None
cross_attentions: tuple[tf.Tensor] | None = None
@dataclass
class TFMaskedLMOutput(ModelOutput):
"""
Base class for masked language models outputs.
Args:
loss (`tf.Tensor` of shape `(n,)`, *optional*, where n is the number of non-masked labels, returned when `labels` is provided):
Masked language modeling (MLM) loss.
logits (`tf.Tensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""
loss: tf.Tensor | None = None
logits: tf.Tensor | None = None
hidden_states: tuple[tf.Tensor] | None = None
attentions: tuple[tf.Tensor] | None = None
@dataclass
class TFSeq2SeqLMOutput(ModelOutput):
"""
Base class for sequence-to-sequence language models outputs.
Args:
loss (`tf.Tensor` of shape `(n,)`, *optional*, where n is the number of non-masked labels, returned when `labels` is provided):
Language modeling loss.
logits (`tf.Tensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
past_key_values (`list[tf.Tensor]`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
List of `tf.Tensor` of length `config.n_layers`, with each tensor of shape `(2, batch_size, num_heads,
sequence_length, embed_size_per_head)`).
Contains pre-computed hidden-states (key and values in the attention blocks) of the decoder that can be
used (see `past_key_values` input) to speed up sequential decoding.
decoder_hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the decoder at the output of each layer plus the initial embedding outputs.
decoder_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.
Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the
self-attention heads.
cross_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.
Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
weighted average in the cross-attention heads.
encoder_last_hidden_state (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Sequence of hidden-states at the output of the last layer of the encoder of the model.
encoder_hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the encoder at the output of each layer plus the initial embedding outputs.
encoder_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.
Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the
self-attention heads.
"""
loss: tf.Tensor | None = None
logits: tf.Tensor | None = None
past_key_values: list[tf.Tensor] | None = None
decoder_hidden_states: tuple[tf.Tensor] | None = None
decoder_attentions: tuple[tf.Tensor] | None = None
cross_attentions: tuple[tf.Tensor] | None = None
encoder_last_hidden_state: tf.Tensor | None = None
encoder_hidden_states: tuple[tf.Tensor] | None = None
encoder_attentions: tuple[tf.Tensor] | None = None
@dataclass
class TFNextSentencePredictorOutput(ModelOutput):
"""
Base class for outputs of models predicting if two sentences are consecutive or not.
Args:
loss (`tf.Tensor` of shape `(n,)`, *optional*, where n is the number of non-masked labels, returned when `next_sentence_label` is provided):
Next sentence prediction loss.
logits (`tf.Tensor` of shape `(batch_size, 2)`):
Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation
before SoftMax).
hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""
loss: tf.Tensor | None = None
logits: tf.Tensor | None = None
hidden_states: tuple[tf.Tensor] | None = None
attentions: tuple[tf.Tensor] | None = None
@dataclass
class TFSequenceClassifierOutput(ModelOutput):
"""
Base class for outputs of sentence classification models.
Args:
loss (`tf.Tensor` of shape `(batch_size, )`, *optional*, returned when `labels` is provided):
Classification (or regression if config.num_labels==1) loss.
logits (`tf.Tensor` of shape `(batch_size, config.num_labels)`):
Classification (or regression if config.num_labels==1) scores (before SoftMax).
hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""
loss: tf.Tensor | None = None
logits: tf.Tensor | None = None
hidden_states: tuple[tf.Tensor] | None = None
attentions: tuple[tf.Tensor] | None = None
@dataclass
class TFSeq2SeqSequenceClassifierOutput(ModelOutput):
"""
Base class for outputs of sequence-to-sequence sentence classification models.
Args:
loss (`tf.Tensor` of shape `(1,)`, *optional*, returned when `label` is provided):
Classification (or regression if config.num_labels==1) loss.
logits (`tf.Tensor` of shape `(batch_size, config.num_labels)`):
Classification (or regression if config.num_labels==1) scores (before SoftMax).
past_key_values (`list[tf.Tensor]`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
List of `tf.Tensor` of length `config.n_layers`, with each tensor of shape `(2, batch_size, num_heads,
sequence_length, embed_size_per_head)`).
Contains pre-computed hidden-states (key and values in the attention blocks) of the decoder that can be
used (see `past_key_values` input) to speed up sequential decoding.
decoder_hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the decoder at the output of each layer plus the initial embedding outputs.
decoder_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.
Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the
self-attention heads.
cross_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`
encoder_last_hidden_state (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Sequence of hidden-states at the output of the last layer of the encoder of the model.
encoder_hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the encoder at the output of each layer plus the initial embedding outputs.
encoder_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.
Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the
self-attention heads.
"""
loss: tf.Tensor | None = None
logits: tf.Tensor | None = None
past_key_values: list[tf.Tensor] | None = None
decoder_hidden_states: tuple[tf.Tensor] | None = None
decoder_attentions: tuple[tf.Tensor] | None = None
cross_attentions: tuple[tf.Tensor] | None = None
encoder_last_hidden_state: tf.Tensor | None = None
encoder_hidden_states: tuple[tf.Tensor] | None = None
encoder_attentions: tuple[tf.Tensor] | None = None
@dataclass
class TFSemanticSegmenterOutput(ModelOutput):
"""
Base class for outputs of semantic segmentation models.
Args:
loss (`tf.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
Classification (or regression if config.num_labels==1) loss.
logits (`tf.Tensor` of shape `(batch_size, config.num_labels, logits_height, logits_width)`):
Classification scores for each pixel.
<Tip warning={true}>
The logits returned do not necessarily have the same size as the `pixel_values` passed as inputs. This is
to avoid doing two interpolations and lose some quality when a user needs to resize the logits to the
original image size as post-processing. You should always check your logits shape and resize as needed.
</Tip>
hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `tf.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + one for
the output of each layer) of shape `(batch_size, patch_size, hidden_size)`.
Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, patch_size, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""
loss: tf.Tensor | None = None
logits: tf.Tensor | None = None
hidden_states: tuple[tf.Tensor] | None = None
attentions: tuple[tf.Tensor] | None = None
@dataclass
class TFSemanticSegmenterOutputWithNoAttention(ModelOutput):
"""
Base class for outputs of semantic segmentation models that do not output attention scores.
Args:
loss (`tf.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
Classification (or regression if config.num_labels==1) loss.
logits (`tf.Tensor` of shape `(batch_size, config.num_labels, logits_height, logits_width)`):
Classification scores for each pixel.
<Tip warning={true}>
The logits returned do not necessarily have the same size as the `pixel_values` passed as inputs. This is
to avoid doing two interpolations and lose some quality when a user needs to resize the logits to the
original image size as post-processing. You should always check your logits shape and resize as needed.
</Tip>
hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `tf.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + one for
the output of each layer) of shape `(batch_size, patch_size, hidden_size)`.
Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
"""
loss: tf.Tensor | None = None
logits: tf.Tensor | None = None
hidden_states: tuple[tf.Tensor] | None = None
@dataclass
class TFImageClassifierOutput(ModelOutput):
"""
Base class for outputs of image classification models.
Args:
loss (`tf.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
Classification (or regression if config.num_labels==1) loss.
logits (`tf.Tensor` of shape `(batch_size, config.num_labels)`):
Classification (or regression if config.num_labels==1) scores (before SoftMax).
hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `tf.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + one for
the output of each stage) of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states (also called
feature maps) of the model at the output of each stage.
attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, patch_size, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""
loss: tf.Tensor | None = None
logits: tf.Tensor | None = None
hidden_states: tuple[tf.Tensor] | None = None
attentions: tuple[tf.Tensor] | None = None
@dataclass
class TFMultipleChoiceModelOutput(ModelOutput):
"""
Base class for outputs of multiple choice models.
Args:
loss (`tf.Tensor` of shape *(batch_size, )*, *optional*, returned when `labels` is provided):
Classification loss.
logits (`tf.Tensor` of shape `(batch_size, num_choices)`):
*num_choices* is the second dimension of the input tensors. (see *input_ids* above).
Classification scores (before SoftMax).
hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""
loss: tf.Tensor | None = None
logits: tf.Tensor | None = None
hidden_states: tuple[tf.Tensor] | None = None
attentions: tuple[tf.Tensor] | None = None
@dataclass
class TFTokenClassifierOutput(ModelOutput):
"""
Base class for outputs of token classification models.
Args:
loss (`tf.Tensor` of shape `(n,)`, *optional*, where n is the number of unmasked labels, returned when `labels` is provided) :
Classification loss.
logits (`tf.Tensor` of shape `(batch_size, sequence_length, config.num_labels)`):
Classification scores (before SoftMax).
hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""
loss: tf.Tensor | None = None
logits: tf.Tensor | None = None
hidden_states: tuple[tf.Tensor] | None = None
attentions: tuple[tf.Tensor] | None = None
@dataclass
class TFQuestionAnsweringModelOutput(ModelOutput):
"""
Base class for outputs of question answering models.
Args:
loss (`tf.Tensor` of shape `(batch_size, )`, *optional*, returned when `start_positions` and `end_positions` are provided):
Total span extraction loss is the sum of a Cross-Entropy for the start and end positions.
start_logits (`tf.Tensor` of shape `(batch_size, sequence_length)`):
Span-start scores (before SoftMax).
end_logits (`tf.Tensor` of shape `(batch_size, sequence_length)`):
Span-end scores (before SoftMax).
hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""
loss: tf.Tensor | None = None
start_logits: tf.Tensor | None = None
end_logits: tf.Tensor | None = None
hidden_states: tuple[tf.Tensor] | None = None
attentions: tuple[tf.Tensor] | None = None
@dataclass
class TFSeq2SeqQuestionAnsweringModelOutput(ModelOutput):
"""
Base class for outputs of sequence-to-sequence question answering models.
Args:
loss (`tf.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
Total span extraction loss is the sum of a Cross-Entropy for the start and end positions.
start_logits (`tf.Tensor` of shape `(batch_size, sequence_length)`):
Span-start scores (before SoftMax).
end_logits (`tf.Tensor` of shape `(batch_size, sequence_length)`):
Span-end scores (before SoftMax).
past_key_values (`list[tf.Tensor]`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
List of `tf.Tensor` of length `config.n_layers`, with each tensor of shape `(2, batch_size, num_heads,
sequence_length, embed_size_per_head)`).
Contains pre-computed hidden-states (key and values in the attention blocks) of the decoder that can be
used (see `past_key_values` input) to speed up sequential decoding.
decoder_hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the decoder at the output of each layer plus the initial embedding outputs.
decoder_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.
Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the
self-attention heads.
encoder_last_hidden_state (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Sequence of hidden-states at the output of the last layer of the encoder of the model.
encoder_hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the encoder at the output of each layer plus the initial embedding outputs.
encoder_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.
Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the
self-attention heads.
"""
loss: tf.Tensor | None = None
start_logits: tf.Tensor | None = None
end_logits: tf.Tensor | None = None
past_key_values: list[tf.Tensor] | None = None
decoder_hidden_states: tuple[tf.Tensor] | None = None
decoder_attentions: tuple[tf.Tensor] | None = None
encoder_last_hidden_state: tf.Tensor | None = None
encoder_hidden_states: tuple[tf.Tensor] | None = None
encoder_attentions: tuple[tf.Tensor] | None = None
@dataclass
class TFSequenceClassifierOutputWithPast(ModelOutput):
"""
Base class for outputs of sentence classification models.
Args:
loss (`tf.Tensor` of shape `(batch_size, )`, *optional*, returned when `labels` is provided):
Classification (or regression if config.num_labels==1) loss.
logits (`tf.Tensor` of shape `(batch_size, config.num_labels)`):
Classification (or regression if config.num_labels==1) scores (before SoftMax).
past_key_values (`list[tf.Tensor]`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
List of `tf.Tensor` of length `config.n_layers`, with each tensor of shape `(2, batch_size, num_heads,
sequence_length, embed_size_per_head)`).
Contains pre-computed hidden-states (key and values in the attention blocks) that can be used (see
`past_key_values` input) to speed up sequential decoding.
hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""
loss: tf.Tensor | None = None
logits: tf.Tensor | None = None
past_key_values: list[tf.Tensor] | None = None
hidden_states: tuple[tf.Tensor] | None = None
attentions: tuple[tf.Tensor] | None = None
@dataclass
class TFImageClassifierOutputWithNoAttention(ModelOutput):
"""
Base class for outputs of image classification models.
Args:
loss (`tf.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
Classification (or regression if config.num_labels==1) loss.
logits (`tf.Tensor` of shape `(batch_size, config.num_labels)`):
Classification (or regression if config.num_labels==1) scores (before SoftMax).
hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `tf.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + one for
the output of each stage) of shape `(batch_size, num_channels, height, width)`. Hidden-states (also called
feature maps) of the model at the output of each stage.
"""
loss: tf.Tensor | None = None
logits: tf.Tensor | None = None
hidden_states: tuple[tf.Tensor, ...] | None = None
@dataclass
class TFMaskedImageModelingOutput(ModelOutput):
"""
Base class for outputs of masked image completion / in-painting models.
Args:
loss (`tf.Tensor` of shape `(1,)`, *optional*, returned when `bool_masked_pos` is provided):
Reconstruction loss.
reconstruction (`tf.Tensor` of shape `(batch_size, num_channels, height, width)`):
Reconstructed / completed images.
hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when
`config.output_hidden_states=True`):
Tuple of `tf.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + one for
the output of each stage) of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states (also called
feature maps) of the model at the output of each stage.
attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when
`config.output_attentions=True`):
Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, patch_size, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""
loss: tf.Tensor | None = None
reconstruction: tf.Tensor | None = None
hidden_states: tuple[tf.Tensor] | None = None
attentions: tuple[tf.Tensor] | None = None
@property
def logits(self):
warnings.warn(
"logits attribute is deprecated and will be removed in version 5 of Transformers."
" Please use the reconstruction attribute to retrieve the final output instead.",
FutureWarning,
)
return self.reconstruction

View File

@ -1,676 +0,0 @@
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""PyTorch - TF 2.0 general utilities."""
import os
import re
import numpy
from .utils import (
ExplicitEnum,
check_torch_load_is_safe,
expand_dims,
is_numpy_array,
is_safetensors_available,
is_torch_tensor,
logging,
reshape,
squeeze,
tensor_size,
)
from .utils import transpose as transpose_func
if is_safetensors_available():
from safetensors import safe_open
logger = logging.get_logger(__name__)
class TransposeType(ExplicitEnum):
"""
Possible ...
"""
NO = "no"
SIMPLE = "simple"
CONV1D = "conv1d"
CONV2D = "conv2d"
def convert_tf_weight_name_to_pt_weight_name(
tf_name, start_prefix_to_remove="", tf_weight_shape=None, name_scope=None
):
"""
Convert a TF 2.0 model variable name in a pytorch model weight name.
Conventions for TF2.0 scopes -> PyTorch attribute names conversions:
- '$1___$2' is replaced by $2 (can be used to duplicate or remove layers in TF2.0 vs PyTorch)
- '_._' is replaced by a new level separation (can be used to convert TF2.0 lists in PyTorch nn.ModulesList)
return tuple with:
- pytorch model weight name
- transpose: `TransposeType` member indicating whether and how TF2.0 and PyTorch weights matrices should be
transposed with regards to each other
"""
if name_scope is not None:
if not tf_name.startswith(name_scope) and "final_logits_bias" not in tf_name:
raise ValueError(
f"Weight name {tf_name} does not start with name_scope {name_scope}. This is an internal error "
"in Transformers, so (unless you were doing something really evil) please open an issue to report it!"
)
tf_name = tf_name[len(name_scope) :]
tf_name = tf_name.lstrip("/")
tf_name = tf_name.replace(":0", "") # device ids
if (len(tf_name) > 2048 and "___" in tf_name) or tf_name.count("___") > 10:
# ReDOS check
raise ValueError("TF variable name is too long or contains too many ___ separators: " + tf_name)
tf_name = re.sub(
r"/[^/]*___([^/]*)/", r"/\1/", tf_name
) # '$1___$2' is replaced by $2 (can be used to duplicate or remove layers in TF2.0 vs PyTorch)
tf_name = tf_name.replace(
"_._", "/"
) # '_._' is replaced by a level separation (can be used to convert TF2.0 lists in PyTorch nn.ModulesList)
tf_name = re.sub(r"//+", "/", tf_name) # Remove empty levels at the end
tf_name = tf_name.split("/") # Convert from TF2.0 '/' separators to PyTorch '.' separators
# Some weights have a single name without "/" such as final_logits_bias in BART
if len(tf_name) > 1:
tf_name = tf_name[1:] # Remove level zero
tf_weight_shape = list(tf_weight_shape)
# When should we transpose the weights
if tf_name[-1] == "kernel" and tf_weight_shape is not None and len(tf_weight_shape) == 4:
transpose = TransposeType.CONV2D
elif tf_name[-1] == "kernel" and tf_weight_shape is not None and len(tf_weight_shape) == 3:
transpose = TransposeType.CONV1D
elif bool(
tf_name[-1] in ["kernel", "pointwise_kernel", "depthwise_kernel"]
or "emb_projs" in tf_name
or "out_projs" in tf_name
):
transpose = TransposeType.SIMPLE
else:
transpose = TransposeType.NO
# Convert standard TF2.0 names in PyTorch names
if tf_name[-1] == "kernel" or tf_name[-1] == "embeddings" or tf_name[-1] == "gamma":
tf_name[-1] = "weight"
if tf_name[-1] == "beta":
tf_name[-1] = "bias"
# The SeparableConv1D TF layer contains two weights that are translated to PyTorch Conv1D here
if tf_name[-1] == "pointwise_kernel" or tf_name[-1] == "depthwise_kernel":
tf_name[-1] = tf_name[-1].replace("_kernel", ".weight")
# Remove prefix if needed
tf_name = ".".join(tf_name)
if start_prefix_to_remove:
tf_name = tf_name.replace(start_prefix_to_remove, "", 1)
return tf_name, transpose
def apply_transpose(transpose: TransposeType, weight, match_shape=None, pt_to_tf=True):
"""
Apply a transpose to some weight then tries to reshape the weight to the same shape as a given shape, all in a
framework agnostic way.
"""
if transpose is TransposeType.CONV2D:
# Conv2D weight:
# PT: (num_out_channel, num_in_channel, kernel[0], kernel[1])
# -> TF: (kernel[0], kernel[1], num_in_channel, num_out_channel)
axes = (2, 3, 1, 0) if pt_to_tf else (3, 2, 0, 1)
weight = transpose_func(weight, axes=axes)
elif transpose is TransposeType.CONV1D:
# Conv1D weight:
# PT: (num_out_channel, num_in_channel, kernel)
# -> TF: (kernel, num_in_channel, num_out_channel)
weight = transpose_func(weight, axes=(2, 1, 0))
elif transpose is TransposeType.SIMPLE:
weight = transpose_func(weight)
if match_shape is None:
return weight
if len(match_shape) < len(weight.shape):
weight = squeeze(weight)
elif len(match_shape) > len(weight.shape):
weight = expand_dims(weight, axis=0)
if list(match_shape) != list(weight.shape):
try:
weight = reshape(weight, match_shape)
except AssertionError as e:
e.args += (match_shape, match_shape)
raise e
return weight
#####################
# PyTorch => TF 2.0 #
#####################
def load_pytorch_checkpoint_in_tf2_model(
tf_model,
pytorch_checkpoint_path,
tf_inputs=None,
allow_missing_keys=False,
output_loading_info=False,
_prefix=None,
tf_to_pt_weight_rename=None,
):
"""Load pytorch checkpoints in a TF 2.0 model"""
try:
import tensorflow as tf # noqa: F401
import torch # noqa: F401
from safetensors.torch import load_file as safe_load_file # noqa: F401
except ImportError:
logger.error(
"Loading a PyTorch model in TensorFlow, requires both PyTorch and TensorFlow to be installed. Please see "
"https://pytorch.org/ and https://www.tensorflow.org/install/ for installation instructions."
)
raise
# Treats a single file as a collection of shards with 1 shard.
if isinstance(pytorch_checkpoint_path, str):
pytorch_checkpoint_path = [pytorch_checkpoint_path]
# Loads all shards into a single state dictionary
pt_state_dict = {}
for path in pytorch_checkpoint_path:
pt_path = os.path.abspath(path)
logger.info(f"Loading PyTorch weights from {pt_path}")
if pt_path.endswith(".safetensors"):
state_dict = safe_load_file(pt_path)
else:
check_torch_load_is_safe()
state_dict = torch.load(pt_path, map_location="cpu", weights_only=True)
pt_state_dict.update(state_dict)
logger.info(f"PyTorch checkpoint contains {sum(t.numel() for t in pt_state_dict.values()):,} parameters")
return load_pytorch_weights_in_tf2_model(
tf_model,
pt_state_dict,
tf_inputs=tf_inputs,
allow_missing_keys=allow_missing_keys,
output_loading_info=output_loading_info,
_prefix=_prefix,
tf_to_pt_weight_rename=tf_to_pt_weight_rename,
)
def load_pytorch_model_in_tf2_model(tf_model, pt_model, tf_inputs=None, allow_missing_keys=False):
"""Load pytorch checkpoints in a TF 2.0 model"""
pt_state_dict = pt_model.state_dict()
return load_pytorch_weights_in_tf2_model(
tf_model, pt_state_dict, tf_inputs=tf_inputs, allow_missing_keys=allow_missing_keys
)
def load_pytorch_weights_in_tf2_model(
tf_model,
pt_state_dict,
tf_inputs=None,
allow_missing_keys=False,
output_loading_info=False,
_prefix=None,
tf_to_pt_weight_rename=None,
):
"""Load pytorch state_dict in a TF 2.0 model."""
try:
import tensorflow as tf # noqa: F401
import torch # noqa: F401
except ImportError:
logger.error(
"Loading a PyTorch model in TensorFlow, requires both PyTorch and TensorFlow to be installed. Please see "
"https://pytorch.org/ and https://www.tensorflow.org/install/ for installation instructions."
)
raise
# Numpy doesn't understand bfloat16, so upcast to a dtype that doesn't lose precision
pt_state_dict = {
k: v.numpy() if v.dtype != torch.bfloat16 else v.float().numpy() for k, v in pt_state_dict.items()
}
return load_pytorch_state_dict_in_tf2_model(
tf_model,
pt_state_dict,
tf_inputs=tf_inputs,
allow_missing_keys=allow_missing_keys,
output_loading_info=output_loading_info,
_prefix=_prefix,
tf_to_pt_weight_rename=tf_to_pt_weight_rename,
)
def _log_key_warnings(missing_keys, unexpected_keys, mismatched_keys, class_name):
if len(unexpected_keys) > 0:
logger.warning(
"Some weights of the PyTorch model were not used when initializing the TF 2.0 model"
f" {class_name}: {unexpected_keys}\n- This IS expected if you are initializing"
f" {class_name} from a PyTorch model trained on another task or with another architecture"
" (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).\n- This IS"
f" NOT expected if you are initializing {class_name} from a PyTorch model that you expect"
" to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a"
" BertForSequenceClassification model)."
)
else:
logger.warning(f"All PyTorch model weights were used when initializing {class_name}.\n")
if len(missing_keys) > 0:
logger.warning(
f"Some weights or buffers of the TF 2.0 model {class_name} were not initialized from the"
f" PyTorch model and are newly initialized: {missing_keys}\nYou should probably TRAIN this model on a"
" down-stream task to be able to use it for predictions and inference."
)
else:
logger.warning(
f"All the weights of {class_name} were initialized from the PyTorch model.\n"
"If your task is similar to the task the model of the checkpoint was trained on, "
f"you can already use {class_name} for predictions without further training."
)
if len(mismatched_keys) > 0:
mismatched_warning = "\n".join(
[
f"- {key}: found shape {shape1} in the checkpoint and {shape2} in the model instantiated"
for key, shape1, shape2 in mismatched_keys
]
)
logger.warning(
f"Some weights of {class_name} were not initialized from the model checkpoint"
f" are newly initialized because the shapes did not"
f" match:\n{mismatched_warning}\nYou should probably TRAIN this model on a down-stream task to be able"
" to use it for predictions and inference."
)
def load_pytorch_state_dict_in_tf2_model(
tf_model,
pt_state_dict,
tf_inputs=None,
allow_missing_keys=False,
output_loading_info=False,
_prefix=None,
tf_to_pt_weight_rename=None,
ignore_mismatched_sizes=False,
skip_logger_warnings=False,
):
"""Load a pytorch state_dict in a TF 2.0 model. pt_state_dict can be either an actual dict or a lazy-loading
safetensors archive created with the safe_open() function."""
import tensorflow as tf
if tf_inputs is None:
tf_inputs = tf_model.dummy_inputs
if _prefix is None:
_prefix = ""
if tf_inputs:
with tf.name_scope(_prefix):
tf_model(tf_inputs, training=False) # Make sure model is built
# Convert old format to new format if needed from a PyTorch state_dict
tf_keys_to_pt_keys = {}
for key in pt_state_dict:
new_key = None
if "gamma" in key:
new_key = key.replace("gamma", "weight")
if "beta" in key:
new_key = key.replace("beta", "bias")
if "running_var" in key:
new_key = key.replace("running_var", "moving_variance")
if "running_mean" in key:
new_key = key.replace("running_mean", "moving_mean")
# New `weight_norm` from https://github.com/huggingface/transformers/pull/24030
key_components = key.split(".")
name = None
if key_components[-3::2] == ["parametrizations", "original0"]:
name = key_components[-2] + "_g"
elif key_components[-3::2] == ["parametrizations", "original1"]:
name = key_components[-2] + "_v"
if name is not None:
key_components = key_components[:-3] + [name]
new_key = ".".join(key_components)
if new_key is None:
new_key = key
tf_keys_to_pt_keys[new_key] = key
# Matt: All TF models store the actual model stem in a MainLayer class, including the base model.
# In PT, the derived models (with heads) use the base model class as the stem instead,
# and there is no MainLayer class. This means that TF base classes have one
# extra layer in their weight names, corresponding to the MainLayer class. This code block compensates for that.
start_prefix_to_remove = ""
if not any(s.startswith(tf_model.base_model_prefix) for s in tf_keys_to_pt_keys):
start_prefix_to_remove = tf_model.base_model_prefix + "."
symbolic_weights = tf_model.trainable_weights + tf_model.non_trainable_weights
tf_loaded_numel = 0
all_pytorch_weights = set(tf_keys_to_pt_keys.keys())
missing_keys = []
mismatched_keys = []
is_safetensor_archive = hasattr(pt_state_dict, "get_tensor")
for symbolic_weight in symbolic_weights:
sw_name = symbolic_weight.name
name, transpose = convert_tf_weight_name_to_pt_weight_name(
sw_name,
start_prefix_to_remove=start_prefix_to_remove,
tf_weight_shape=symbolic_weight.shape,
name_scope=_prefix,
)
if tf_to_pt_weight_rename is not None:
aliases = tf_to_pt_weight_rename(name) # Is a tuple to account for possible name aliasing
for alias in aliases: # The aliases are in priority order, take the first one that matches
if alias in tf_keys_to_pt_keys:
name = alias
break
else:
# If none of the aliases match, just use the first one (it'll be reported as missing)
name = aliases[0]
# Find associated numpy array in pytorch model state dict
if name not in tf_keys_to_pt_keys:
if allow_missing_keys:
missing_keys.append(name)
continue
elif tf_model._keys_to_ignore_on_load_missing is not None:
# authorized missing keys don't have to be loaded
if any(re.search(pat, name) is not None for pat in tf_model._keys_to_ignore_on_load_missing):
continue
raise AttributeError(f"{name} not found in PyTorch model")
state_dict_name = tf_keys_to_pt_keys[name]
if is_safetensor_archive:
array = pt_state_dict.get_tensor(state_dict_name)
else:
array = pt_state_dict[state_dict_name]
try:
array = apply_transpose(transpose, array, symbolic_weight.shape)
except tf.errors.InvalidArgumentError as e:
if not ignore_mismatched_sizes:
error_msg = str(e)
error_msg += (
"\n\tYou may consider adding `ignore_mismatched_sizes=True` in the model `from_pretrained` method."
)
raise tf.errors.InvalidArgumentError(error_msg)
else:
mismatched_keys.append((name, array.shape, symbolic_weight.shape))
continue
tf_loaded_numel += tensor_size(array)
symbolic_weight.assign(tf.cast(array, symbolic_weight.dtype))
del array # Immediately free memory to keep peak usage as low as possible
all_pytorch_weights.discard(name)
logger.info(f"Loaded {tf_loaded_numel:,} parameters in the TF 2.0 model.")
unexpected_keys = list(all_pytorch_weights)
if tf_model._keys_to_ignore_on_load_missing is not None:
for pat in tf_model._keys_to_ignore_on_load_missing:
missing_keys = [k for k in missing_keys if re.search(pat, k) is None]
if tf_model._keys_to_ignore_on_load_unexpected is not None:
for pat in tf_model._keys_to_ignore_on_load_unexpected:
unexpected_keys = [k for k in unexpected_keys if re.search(pat, k) is None]
if not skip_logger_warnings:
_log_key_warnings(missing_keys, unexpected_keys, mismatched_keys, class_name=tf_model.__class__.__name__)
if output_loading_info:
loading_info = {
"missing_keys": missing_keys,
"unexpected_keys": unexpected_keys,
"mismatched_keys": mismatched_keys,
}
return tf_model, loading_info
return tf_model
def load_sharded_pytorch_safetensors_in_tf2_model(
tf_model,
safetensors_shards,
tf_inputs=None,
allow_missing_keys=False,
output_loading_info=False,
_prefix=None,
tf_to_pt_weight_rename=None,
ignore_mismatched_sizes=False,
):
all_loading_infos = []
for shard in safetensors_shards:
with safe_open(shard, framework="tf") as safetensors_archive:
tf_model, loading_info = load_pytorch_state_dict_in_tf2_model(
tf_model,
safetensors_archive,
tf_inputs=tf_inputs,
allow_missing_keys=allow_missing_keys,
output_loading_info=True,
_prefix=_prefix,
tf_to_pt_weight_rename=tf_to_pt_weight_rename,
ignore_mismatched_sizes=ignore_mismatched_sizes,
skip_logger_warnings=True, # We will emit merged warnings at the end
)
all_loading_infos.append(loading_info)
# Now we just need to merge the loading info
# Keys are missing only if they're missing in *every* shard
missing_keys = sorted(set.intersection(*[set(info["missing_keys"]) for info in all_loading_infos]))
# Keys are unexpected/mismatched if they're unexpected/mismatched in *any* shard
unexpected_keys = sum([info["unexpected_keys"] for info in all_loading_infos], [])
mismatched_keys = sum([info["mismatched_keys"] for info in all_loading_infos], [])
_log_key_warnings(missing_keys, unexpected_keys, mismatched_keys, class_name=tf_model.__class__.__name__)
if output_loading_info:
loading_info = {
"missing_keys": missing_keys,
"unexpected_keys": unexpected_keys,
"mismatched_keys": mismatched_keys,
}
return tf_model, loading_info
return tf_model
#####################
# TF 2.0 => PyTorch #
#####################
def load_tf2_checkpoint_in_pytorch_model(
pt_model, tf_checkpoint_path, tf_inputs=None, allow_missing_keys=False, output_loading_info=False
):
"""
Load TF 2.0 HDF5 checkpoint in a PyTorch model We use HDF5 to easily do transfer learning (see
https://github.com/tensorflow/tensorflow/blob/ee16fcac960ae660e0e4496658a366e2f745e1f0/tensorflow/python/keras/engine/network.py#L1352-L1357).
"""
try:
import tensorflow as tf # noqa: F401
import torch # noqa: F401
except ImportError:
logger.error(
"Loading a TensorFlow model in PyTorch, requires both PyTorch and TensorFlow to be installed. Please see "
"https://pytorch.org/ and https://www.tensorflow.org/install/ for installation instructions."
)
raise
import transformers
from .modeling_tf_utils import load_tf_weights
logger.info(f"Loading TensorFlow weights from {tf_checkpoint_path}")
# Instantiate and load the associated TF 2.0 model
tf_model_class_name = "TF" + pt_model.__class__.__name__ # Add "TF" at the beginning
tf_model_class = getattr(transformers, tf_model_class_name)
tf_model = tf_model_class(pt_model.config)
if tf_inputs is None:
tf_inputs = tf_model.dummy_inputs
if tf_inputs is not None:
tf_model(tf_inputs, training=False) # Make sure model is built
load_tf_weights(tf_model, tf_checkpoint_path)
return load_tf2_model_in_pytorch_model(
pt_model, tf_model, allow_missing_keys=allow_missing_keys, output_loading_info=output_loading_info
)
def load_tf2_model_in_pytorch_model(pt_model, tf_model, allow_missing_keys=False, output_loading_info=False):
"""Load TF 2.0 model in a pytorch model"""
weights = tf_model.weights
return load_tf2_weights_in_pytorch_model(
pt_model, weights, allow_missing_keys=allow_missing_keys, output_loading_info=output_loading_info
)
def load_tf2_weights_in_pytorch_model(pt_model, tf_weights, allow_missing_keys=False, output_loading_info=False):
"""Load TF2.0 symbolic weights in a PyTorch model"""
try:
import tensorflow as tf # noqa: F401
import torch # noqa: F401
except ImportError:
logger.error(
"Loading a TensorFlow model in PyTorch, requires both PyTorch and TensorFlow to be installed. Please see "
"https://pytorch.org/ and https://www.tensorflow.org/install/ for installation instructions."
)
raise
tf_state_dict = {tf_weight.name: tf_weight.numpy() for tf_weight in tf_weights}
return load_tf2_state_dict_in_pytorch_model(
pt_model, tf_state_dict, allow_missing_keys=allow_missing_keys, output_loading_info=output_loading_info
)
def load_tf2_state_dict_in_pytorch_model(pt_model, tf_state_dict, allow_missing_keys=False, output_loading_info=False):
import torch
new_pt_params_dict = {}
current_pt_params_dict = dict(pt_model.named_parameters())
# Make sure we are able to load PyTorch base models as well as derived models (with heads)
# TF models always have a prefix, some of PyTorch models (base ones) don't
start_prefix_to_remove = ""
if not any(s.startswith(pt_model.base_model_prefix) for s in current_pt_params_dict):
start_prefix_to_remove = pt_model.base_model_prefix + "."
# Build a map from potential PyTorch weight names to TF 2.0 Variables
tf_weights_map = {}
for name, tf_weight in tf_state_dict.items():
pt_name, transpose = convert_tf_weight_name_to_pt_weight_name(
name, start_prefix_to_remove=start_prefix_to_remove, tf_weight_shape=tf_weight.shape
)
tf_weights_map[pt_name] = (tf_weight, transpose)
all_tf_weights = set(tf_weights_map.keys())
loaded_pt_weights_data_ptr = {}
missing_keys_pt = []
for pt_weight_name, pt_weight in current_pt_params_dict.items():
# Handle PyTorch shared weight not duplicated in TF 2.0
if pt_weight.data_ptr() in loaded_pt_weights_data_ptr and pt_weight.data_ptr() != 0:
new_pt_params_dict[pt_weight_name] = loaded_pt_weights_data_ptr[pt_weight.data_ptr()]
continue
pt_weight_name_to_check = pt_weight_name
# New `weight_norm` from https://github.com/huggingface/transformers/pull/24030
key_components = pt_weight_name.split(".")
name = None
if key_components[-3::2] == ["parametrizations", "original0"]:
name = key_components[-2] + "_g"
elif key_components[-3::2] == ["parametrizations", "original1"]:
name = key_components[-2] + "_v"
if name is not None:
key_components = key_components[:-3] + [name]
pt_weight_name_to_check = ".".join(key_components)
# Find associated numpy array in pytorch model state dict
if pt_weight_name_to_check not in tf_weights_map:
if allow_missing_keys:
missing_keys_pt.append(pt_weight_name)
continue
raise AttributeError(f"{pt_weight_name} not found in TF 2.0 model")
array, transpose = tf_weights_map[pt_weight_name_to_check]
array = apply_transpose(transpose, array, pt_weight.shape, pt_to_tf=False)
if numpy.isscalar(array):
array = numpy.array(array)
if not is_torch_tensor(array) and not is_numpy_array(array):
array = array.numpy()
if is_numpy_array(array):
# Convert to torch tensor
array = torch.from_numpy(array)
new_pt_params_dict[pt_weight_name] = array
loaded_pt_weights_data_ptr[pt_weight.data_ptr()] = array
all_tf_weights.discard(pt_weight_name)
missing_keys, unexpected_keys = pt_model.load_state_dict(new_pt_params_dict, strict=False)
missing_keys += missing_keys_pt
# Some models may have keys that are not in the state by design, removing them before needlessly warning
# the user.
if pt_model._keys_to_ignore_on_load_missing is not None:
for pat in pt_model._keys_to_ignore_on_load_missing:
missing_keys = [k for k in missing_keys if re.search(pat, k) is None]
if pt_model._keys_to_ignore_on_load_unexpected is not None:
for pat in pt_model._keys_to_ignore_on_load_unexpected:
unexpected_keys = [k for k in unexpected_keys if re.search(pat, k) is None]
if len(unexpected_keys) > 0:
logger.warning(
"Some weights of the TF 2.0 model were not used when initializing the PyTorch model"
f" {pt_model.__class__.__name__}: {unexpected_keys}\n- This IS expected if you are initializing"
f" {pt_model.__class__.__name__} from a TF 2.0 model trained on another task or with another architecture"
" (e.g. initializing a BertForSequenceClassification model from a TFBertForPreTraining model).\n- This IS"
f" NOT expected if you are initializing {pt_model.__class__.__name__} from a TF 2.0 model that you expect"
" to be exactly identical (e.g. initializing a BertForSequenceClassification model from a"
" TFBertForSequenceClassification model)."
)
else:
logger.warning(f"All TF 2.0 model weights were used when initializing {pt_model.__class__.__name__}.\n")
if len(missing_keys) > 0:
logger.warning(
f"Some weights of {pt_model.__class__.__name__} were not initialized from the TF 2.0 model and are newly"
f" initialized: {missing_keys}\nYou should probably TRAIN this model on a down-stream task to be able to"
" use it for predictions and inference."
)
else:
logger.warning(
f"All the weights of {pt_model.__class__.__name__} were initialized from the TF 2.0 model.\n"
"If your task is similar to the task the model of the checkpoint was trained on, "
f"you can already use {pt_model.__class__.__name__} for predictions without further training."
)
logger.info(f"Weights or buffers not loaded from TF 2.0 model: {all_tf_weights}")
if output_loading_info:
loading_info = {"missing_keys": missing_keys, "unexpected_keys": unexpected_keys}
return pt_model, loading_info
return pt_model

File diff suppressed because it is too large Load Diff

View File

@ -79,11 +79,8 @@ from .utils import (
ADAPTER_WEIGHTS_NAME,
CONFIG_NAME,
DUMMY_INPUTS,
FLAX_WEIGHTS_NAME,
SAFE_WEIGHTS_INDEX_NAME,
SAFE_WEIGHTS_NAME,
TF2_WEIGHTS_NAME,
TF_WEIGHTS_NAME,
WEIGHTS_INDEX_NAME,
WEIGHTS_NAME,
ContextManagers,
@ -506,13 +503,6 @@ def load_state_dict(
# Use safetensors if possible
if checkpoint_file.endswith(".safetensors") and is_safetensors_available():
with safe_open(checkpoint_file, framework="pt") as f:
metadata = f.metadata()
if metadata is not None and metadata.get("format") not in ["pt", "tf", "flax", "mlx"]:
raise OSError(
f"The safetensors archive passed at {checkpoint_file} does not contain the valid metadata. Make sure "
"you save your model with the `save_pretrained` method."
)
state_dict = {}
for k in f.keys():
if map_location == "meta":
@ -568,11 +558,7 @@ def load_state_dict(
"model. Make sure you have saved the model properly."
) from e
except (UnicodeDecodeError, ValueError):
raise OSError(
f"Unable to load weights from pytorch checkpoint file for '{checkpoint_file}' "
f"at '{checkpoint_file}'. "
"If you tried to load a PyTorch model from a TF 2.0 checkpoint, please set from_tf=True."
)
raise OSError(f"Unable to load weights from pytorch checkpoint file '{checkpoint_file}'.")
def set_initialized_submodules(model, state_dict_keys):
@ -1004,9 +990,7 @@ def _get_resolved_checkpoint_files(
subfolder: str,
variant: Optional[str],
gguf_file: Optional[str],
from_tf: bool,
from_flax: bool,
use_safetensors: bool,
use_safetensors: Optional[bool],
cache_dir: str,
force_download: bool,
proxies: Optional[dict[str, str]],
@ -1032,19 +1016,6 @@ def _get_resolved_checkpoint_files(
# If the filename is explicitly defined, load this by default.
archive_file = os.path.join(pretrained_model_name_or_path, subfolder, transformers_explicit_filename)
is_sharded = transformers_explicit_filename.endswith(".safetensors.index.json")
elif from_tf and os.path.isfile(
os.path.join(pretrained_model_name_or_path, subfolder, TF_WEIGHTS_NAME + ".index")
):
# Load from a TF 1.0 checkpoint in priority if from_tf
archive_file = os.path.join(pretrained_model_name_or_path, subfolder, TF_WEIGHTS_NAME + ".index")
elif from_tf and os.path.isfile(os.path.join(pretrained_model_name_or_path, subfolder, TF2_WEIGHTS_NAME)):
# Load from a TF 2.0 checkpoint in priority if from_tf
archive_file = os.path.join(pretrained_model_name_or_path, subfolder, TF2_WEIGHTS_NAME)
elif from_flax and os.path.isfile(
os.path.join(pretrained_model_name_or_path, subfolder, FLAX_WEIGHTS_NAME)
):
# Load from a Flax checkpoint in priority if from_flax
archive_file = os.path.join(pretrained_model_name_or_path, subfolder, FLAX_WEIGHTS_NAME)
elif use_safetensors is not False and os.path.isfile(
os.path.join(pretrained_model_name_or_path, subfolder, _add_variant(SAFE_WEIGHTS_NAME, variant))
):
@ -1075,24 +1046,6 @@ def _get_resolved_checkpoint_files(
pretrained_model_name_or_path, subfolder, _add_variant(WEIGHTS_INDEX_NAME, variant)
)
is_sharded = True
# At this stage we don't have a weight file so we will raise an error.
elif not use_safetensors and (
os.path.isfile(os.path.join(pretrained_model_name_or_path, subfolder, TF_WEIGHTS_NAME + ".index"))
or os.path.isfile(os.path.join(pretrained_model_name_or_path, subfolder, TF2_WEIGHTS_NAME))
):
raise OSError(
f"Error no file named {_add_variant(WEIGHTS_NAME, variant)} found in directory"
f" {pretrained_model_name_or_path} but there is a file for TensorFlow weights. Use"
" `from_tf=True` to load this model from those weights."
)
elif not use_safetensors and os.path.isfile(
os.path.join(pretrained_model_name_or_path, subfolder, FLAX_WEIGHTS_NAME)
):
raise OSError(
f"Error no file named {_add_variant(WEIGHTS_NAME, variant)} found in directory"
f" {pretrained_model_name_or_path} but there is a file for Flax weights. Use `from_flax=True`"
" to load this model from those weights."
)
elif use_safetensors:
raise OSError(
f"Error no file named {_add_variant(SAFE_WEIGHTS_NAME, variant)} found in directory"
@ -1100,21 +1053,12 @@ def _get_resolved_checkpoint_files(
)
else:
raise OSError(
f"Error no file named {_add_variant(WEIGHTS_NAME, variant)}, {_add_variant(SAFE_WEIGHTS_NAME, variant)},"
f" {TF2_WEIGHTS_NAME}, {TF_WEIGHTS_NAME + '.index'} or {FLAX_WEIGHTS_NAME} found in directory"
f" {pretrained_model_name_or_path}."
f"Error no file named {_add_variant(SAFE_WEIGHTS_NAME, variant)}, or {_add_variant(WEIGHTS_NAME, variant)},"
f" found in directory {pretrained_model_name_or_path}."
)
elif os.path.isfile(os.path.join(subfolder, pretrained_model_name_or_path)):
archive_file = pretrained_model_name_or_path
is_local = True
elif os.path.isfile(os.path.join(subfolder, pretrained_model_name_or_path + ".index")):
if not from_tf:
raise ValueError(
f"We found a TensorFlow checkpoint at {pretrained_model_name_or_path + '.index'}, please set "
"from_tf to True to load from this checkpoint."
)
archive_file = os.path.join(subfolder, pretrained_model_name_or_path + ".index")
is_local = True
elif is_remote_url(pretrained_model_name_or_path):
filename = pretrained_model_name_or_path
resolved_archive_file = download_url(pretrained_model_name_or_path)
@ -1123,10 +1067,6 @@ def _get_resolved_checkpoint_files(
if transformers_explicit_filename is not None:
filename = transformers_explicit_filename
is_sharded = transformers_explicit_filename.endswith(".safetensors.index.json")
elif from_tf:
filename = TF2_WEIGHTS_NAME
elif from_flax:
filename = FLAX_WEIGHTS_NAME
elif use_safetensors is not False:
filename = _add_variant(SAFE_WEIGHTS_NAME, variant)
else:
@ -1223,8 +1163,7 @@ def _get_resolved_checkpoint_files(
name="Thread-auto_conversion",
).start()
else:
# Otherwise, no PyTorch file was found, maybe there is a TF or Flax model file.
# We try those to give a helpful error message.
# Otherwise, no PyTorch file was found
has_file_kwargs = {
"revision": revision,
"proxies": proxies,
@ -1232,19 +1171,7 @@ def _get_resolved_checkpoint_files(
"cache_dir": cache_dir,
"local_files_only": local_files_only,
}
if has_file(pretrained_model_name_or_path, TF2_WEIGHTS_NAME, **has_file_kwargs):
raise OSError(
f"{pretrained_model_name_or_path} does not appear to have a file named"
f" {_add_variant(WEIGHTS_NAME, variant)} but there is a file for TensorFlow weights."
" Use `from_tf=True` to load this model from those weights."
)
elif has_file(pretrained_model_name_or_path, FLAX_WEIGHTS_NAME, **has_file_kwargs):
raise OSError(
f"{pretrained_model_name_or_path} does not appear to have a file named"
f" {_add_variant(WEIGHTS_NAME, variant)} but there is a file for Flax weights. Use"
" `from_flax=True` to load this model from those weights."
)
elif variant is not None and has_file(
if variant is not None and has_file(
pretrained_model_name_or_path, WEIGHTS_NAME, **has_file_kwargs
):
raise OSError(
@ -1255,8 +1182,7 @@ def _get_resolved_checkpoint_files(
else:
raise OSError(
f"{pretrained_model_name_or_path} does not appear to have a file named"
f" {_add_variant(WEIGHTS_NAME, variant)}, {_add_variant(SAFE_WEIGHTS_NAME, variant)},"
f" {TF2_WEIGHTS_NAME}, {TF_WEIGHTS_NAME} or {FLAX_WEIGHTS_NAME}."
f" {_add_variant(WEIGHTS_NAME, variant)} or {_add_variant(SAFE_WEIGHTS_NAME, variant)}."
)
except OSError:
@ -1269,8 +1195,7 @@ def _get_resolved_checkpoint_files(
f"Can't load the model for '{pretrained_model_name_or_path}'. If you were trying to load it"
" from 'https://huggingface.co/models', make sure you don't have a local directory with the"
f" same name. Otherwise, make sure '{pretrained_model_name_or_path}' is the correct path to a"
f" directory containing a file named {_add_variant(WEIGHTS_NAME, variant)},"
f" {TF2_WEIGHTS_NAME}, {TF_WEIGHTS_NAME} or {FLAX_WEIGHTS_NAME}."
f" directory containing a file named {_add_variant(WEIGHTS_NAME, variant)}."
) from e
if is_local:
@ -1682,8 +1607,6 @@ class ModuleUtilsMixin:
if encoder_attention_mask.dim() == 2:
encoder_extended_attention_mask = encoder_attention_mask[:, None, None, :]
# T5 has a mask that can compare sequence ids, we can simulate this here with this transposition
# Cf. https://github.com/tensorflow/mesh/blob/8d2465e9bc93129b913b5ccc6a59aa97abd96ec6/mesh_tensorflow
# /transformer/transformer_layers.py#L270
# encoder_extended_attention_mask = (encoder_extended_attention_mask ==
# encoder_extended_attention_mask.transpose(-1, -2))
encoder_extended_attention_mask = encoder_extended_attention_mask.to(dtype=self.dtype) # fp16 compatibility
@ -2018,19 +1941,13 @@ class PreTrainedModel(nn.Module, EmbeddingAccessMixin, ModuleUtilsMixin, PushToH
- **config_class** ([`PretrainedConfig`]) -- A subclass of [`PretrainedConfig`] to use as configuration class
for this model architecture.
- **load_tf_weights** (`Callable`) -- A python *method* for loading a TensorFlow checkpoint in a PyTorch model,
taking as arguments:
- **model** ([`PreTrainedModel`]) -- An instance of the model on which to load the TensorFlow checkpoint.
- **config** ([`PreTrainedConfig`]) -- An instance of the configuration associated to the model.
- **path** (`str`) -- A path to the TensorFlow checkpoint.
- **base_model_prefix** (`str`) -- A string indicating the attribute associated to the base model in derived
classes of the same architecture adding modules on top of the base model.
- **is_parallelizable** (`bool`) -- A flag indicating whether this model supports model parallelization.
- **main_input_name** (`str`) -- The name of the principal input to the model (often `input_ids` for NLP
models, `pixel_values` for vision models and `input_values` for speech models).
- **can_record_outputs** (dict):"""
- **can_record_outputs** (dict):
"""
config_class = None
base_model_prefix = ""
@ -2155,13 +2072,6 @@ class PreTrainedModel(nn.Module, EmbeddingAccessMixin, ModuleUtilsMixin, PushToH
"""
return {"input_ids": torch.tensor(DUMMY_INPUTS)}
@property
def framework(self) -> str:
"""
:str: Identifies that this is a PyTorch model.
"""
return "pt"
def __init_subclass__(cls, **kwargs):
super().__init_subclass__(**kwargs)
# For BC we keep the original `config_class` definition in case
@ -3800,9 +3710,6 @@ class PreTrainedModel(nn.Module, EmbeddingAccessMixin, ModuleUtilsMixin, PushToH
"""
Activates gradient checkpointing for the current model.
Note that in other frameworks this feature can be referred to as "activation checkpointing" or "checkpoint
activations".
We pass the `__call__` method of the modules instead of `forward` because `__call__` attaches all the hooks of
the module. https://discuss.pytorch.org/t/any-different-between-model-input-and-model-forward-input/3690/2
@ -3863,9 +3770,6 @@ class PreTrainedModel(nn.Module, EmbeddingAccessMixin, ModuleUtilsMixin, PushToH
def gradient_checkpointing_disable(self):
"""
Deactivates gradient checkpointing for the current model.
Note that in other frameworks this feature can be referred to as "activation checkpointing" or "checkpoint
activations".
"""
if self.supports_gradient_checkpointing:
# For old GC format (transformers < 4.35.0) for models that live on the Hub
@ -3887,9 +3791,6 @@ class PreTrainedModel(nn.Module, EmbeddingAccessMixin, ModuleUtilsMixin, PushToH
def is_gradient_checkpointing(self) -> bool:
"""
Whether gradient checkpointing is activated for this model or not.
Note that in other frameworks this feature can be referred to as "activation checkpointing" or "checkpoint
activations".
"""
return any(hasattr(m, "gradient_checkpointing") and m.gradient_checkpointing for m in self.modules())
@ -4543,13 +4444,6 @@ class PreTrainedModel(nn.Module, EmbeddingAccessMixin, ModuleUtilsMixin, PushToH
- A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co.
- A path to a *directory* containing model weights saved using
[`~PreTrainedModel.save_pretrained`], e.g., `./my_model_directory/`.
- A path or url to a *tensorflow index checkpoint file* (e.g, `./tf_model/model.ckpt.index`). In
this case, `from_tf` should be set to `True` and a configuration object should be provided as
`config` argument. This loading path is slower than converting the TensorFlow checkpoint in a
PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.
- A path or url to a model folder containing a *flax checkpoint file* in *.msgpack* format (e.g,
`./flax_model/` containing `flax_model.msgpack`). In this case, `from_flax` should be set to
`True`.
- `None` if you are both providing the configuration and state dictionary (resp. with keyword
arguments `config` and `state_dict`).
model_args (sequence of positional arguments, *optional*):
@ -4578,12 +4472,6 @@ class PreTrainedModel(nn.Module, EmbeddingAccessMixin, ModuleUtilsMixin, PushToH
cache_dir (`Union[str, os.PathLike]`, *optional*):
Path to a directory in which a downloaded pretrained model configuration should be cached if the
standard cache should not be used.
from_tf (`bool`, *optional*, defaults to `False`):
Load the model weights from a TensorFlow checkpoint save file (see docstring of
`pretrained_model_name_or_path` argument).
from_flax (`bool`, *optional*, defaults to `False`):
Load the model weights from a Flax checkpoint save file (see docstring of
`pretrained_model_name_or_path` argument).
ignore_mismatched_sizes (`bool`, *optional*, defaults to `False`):
Whether or not to raise an error if some of the weights from the checkpoint do not have the same size
as the weights of the model (if for instance, you are instantiating a model with 10 labels from a
@ -4699,8 +4587,7 @@ class PreTrainedModel(nn.Module, EmbeddingAccessMixin, ModuleUtilsMixin, PushToH
In case the relevant files are located inside a subfolder of the model repo on huggingface.co, you can
specify the folder name here.
variant (`str`, *optional*):
If specified load weights from `variant` filename, *e.g.* pytorch_model.<variant>.bin. `variant` is
ignored when using `from_tf` or `from_flax`.
If specified load weights from `variant` filename, *e.g.* pytorch_model.<variant>.bin.
use_safetensors (`bool`, *optional*, defaults to `None`):
Whether or not to use `safetensors` checkpoints. Defaults to `None`. If not specified and `safetensors`
is not installed, it will be set to `False`.
@ -4744,16 +4631,9 @@ class PreTrainedModel(nn.Module, EmbeddingAccessMixin, ModuleUtilsMixin, PushToH
>>> # Update configuration during loading.
>>> model = BertModel.from_pretrained("google-bert/bert-base-uncased", output_attentions=True)
>>> assert model.config.output_attentions == True
>>> # Loading from a TF checkpoint file instead of a PyTorch model (slower, for example purposes, not runnable).
>>> config = BertConfig.from_json_file("./tf_model/my_tf_model_config.json")
>>> model = BertModel.from_pretrained("./tf_model/my_tf_checkpoint.ckpt.index", from_tf=True, config=config)
>>> # Loading from a Flax checkpoint file instead of a PyTorch model (slower)
>>> model = BertModel.from_pretrained("google-bert/bert-base-uncased", from_flax=True)
```
"""
state_dict = kwargs.pop("state_dict", None)
from_tf = kwargs.pop("from_tf", False)
from_flax = kwargs.pop("from_flax", False)
proxies = kwargs.pop("proxies", None)
output_loading_info = kwargs.pop("output_loading_info", False)
use_auth_token = kwargs.pop("use_auth_token", None)
@ -4796,8 +4676,10 @@ class PreTrainedModel(nn.Module, EmbeddingAccessMixin, ModuleUtilsMixin, PushToH
# Not used anymore -- remove them from the kwargs
_ = kwargs.pop("resume_download", None)
_ = kwargs.pop("mirror", None)
_ = kwargs.pop("_fast_init", True)
_ = kwargs.pop("_fast_init", None)
_ = kwargs.pop("low_cpu_mem_usage", None)
_ = kwargs.pop("from_tf", None)
_ = kwargs.pop("from_flax", None)
# For BC on torch_dtype argument
if torch_dtype is not None:
@ -4964,8 +4846,6 @@ class PreTrainedModel(nn.Module, EmbeddingAccessMixin, ModuleUtilsMixin, PushToH
"Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead."
)
from_pt = not (from_tf | from_flax)
user_agent = {"file_type": "model", "framework": "pytorch", "from_auto_class": from_auto_class}
if from_pipeline is not None:
user_agent["using_pipeline"] = from_pipeline
@ -5016,7 +4896,7 @@ class PreTrainedModel(nn.Module, EmbeddingAccessMixin, ModuleUtilsMixin, PushToH
)
hf_quantizer, config, dtype, device_map = get_hf_quantizer(
config, quantization_config, dtype, from_tf, from_flax, device_map, weights_only, user_agent
config, quantization_config, dtype, device_map, weights_only, user_agent
)
if gguf_file is not None and hf_quantizer is not None:
@ -5039,8 +4919,6 @@ class PreTrainedModel(nn.Module, EmbeddingAccessMixin, ModuleUtilsMixin, PushToH
subfolder=subfolder,
variant=variant,
gguf_file=gguf_file,
from_tf=from_tf,
from_flax=from_flax,
use_safetensors=use_safetensors,
cache_dir=cache_dir,
force_download=force_download,
@ -5054,56 +4932,35 @@ class PreTrainedModel(nn.Module, EmbeddingAccessMixin, ModuleUtilsMixin, PushToH
transformers_explicit_filename=transformers_explicit_filename,
)
is_sharded = sharded_metadata is not None
is_quantized = hf_quantizer is not None
is_from_file = pretrained_model_name_or_path is not None or gguf_file is not None
if (
is_safetensors_available()
and is_from_file
and not is_sharded
and checkpoint_files[0].endswith(".safetensors")
):
# Just a helpful message in case we try to load safetensors files coming from old Transformers tf/flax classes
if is_safetensors_available() and is_from_file and checkpoint_files[0].endswith(".safetensors"):
with safe_open(checkpoint_files[0], framework="pt") as f:
metadata = f.metadata()
if metadata is None:
# Assume it's a pytorch checkpoint (introduced for timm checkpoints)
pass
elif metadata.get("format") == "pt":
pass
elif metadata.get("format") == "tf":
from_tf = True
logger.info("A TensorFlow safetensors file is being loaded in a PyTorch model.")
elif metadata.get("format") == "flax":
from_flax = True
logger.info("A Flax safetensors file is being loaded in a PyTorch model.")
elif metadata.get("format") == "mlx":
# This is a mlx file, we assume weights are compatible with pt
pass
else:
raise ValueError(
f"Incompatible safetensors file. File metadata is not ['pt', 'tf', 'flax', 'mlx'] but {metadata.get('format')}"
if metadata is not None and metadata.get("format") in ["tf", "flax"]:
logger.warning(
"The safetensors checkpoint found has format `tf` or `flax`. This mean that the keys will very"
"likely not match to the model you are trying to load, and will be newly initialized. If it's the case "
"another warning will be raised later. Consider converting your checkpoint to the correct format."
)
from_pt = not (from_tf | from_flax)
if gguf_file:
from .modeling_gguf_pytorch_utils import load_gguf_checkpoint
if from_pt:
if gguf_file:
from .modeling_gguf_pytorch_utils import load_gguf_checkpoint
# we need a dummy model to get the state_dict - for this reason, we keep the state_dict as if it was
# passed directly as a kwarg from now on
with torch.device("meta"):
dummy_model = cls(config)
state_dict = load_gguf_checkpoint(checkpoint_files[0], return_tensors=True, model_to_load=dummy_model)[
"tensors"
]
# we need a dummy model to get the state_dict - for this reason, we keep the state_dict as if it was
# passed directly as a kwarg from now on
with torch.device("meta"):
dummy_model = cls(config)
state_dict = load_gguf_checkpoint(checkpoint_files[0], return_tensors=True, model_to_load=dummy_model)[
"tensors"
]
# Find the correct dtype based on current state
config, dtype, dtype_orig = _get_dtype(
cls, dtype, checkpoint_files, config, sharded_metadata, state_dict, weights_only
)
# Find the correct dtype based on current state
config, dtype, dtype_orig = _get_dtype(
cls, dtype, checkpoint_files, config, sharded_metadata, state_dict, weights_only
)
config.name_or_path = pretrained_model_name_or_path
model_init_context = cls.get_init_context(is_quantized, _is_ds_init_called)
@ -5166,40 +5023,36 @@ class PreTrainedModel(nn.Module, EmbeddingAccessMixin, ModuleUtilsMixin, PushToH
if device_map is not None:
device_map = _get_device_map(model, device_map, max_memory, hf_quantizer, dtype, keep_in_fp32_regex)
# Finalize model weight initialization
if from_tf:
model, loading_info = cls._load_from_tf(model, config, checkpoint_files)
elif from_flax:
model = cls._load_from_flax(model, checkpoint_files)
elif from_pt:
# restore default dtype
if dtype_orig is not None:
torch.set_default_dtype(dtype_orig)
# restore default dtype
if dtype_orig is not None:
torch.set_default_dtype(dtype_orig)
# Finalize model weight initialization
(
model,
missing_keys,
unexpected_keys,
mismatched_keys,
offload_index,
error_msgs,
) = cls._load_pretrained_model(
model,
state_dict,
checkpoint_files,
pretrained_model_name_or_path,
ignore_mismatched_sizes=ignore_mismatched_sizes,
sharded_metadata=sharded_metadata,
device_map=device_map,
disk_offload_folder=offload_folder,
offload_state_dict=offload_state_dict,
dtype=dtype,
hf_quantizer=hf_quantizer,
keep_in_fp32_regex=keep_in_fp32_regex,
device_mesh=device_mesh,
key_mapping=key_mapping,
weights_only=weights_only,
)
(
model,
missing_keys,
unexpected_keys,
mismatched_keys,
offload_index,
error_msgs,
) = cls._load_pretrained_model(
model,
state_dict,
checkpoint_files,
pretrained_model_name_or_path,
ignore_mismatched_sizes=ignore_mismatched_sizes,
sharded_metadata=sharded_metadata,
device_map=device_map,
disk_offload_folder=offload_folder,
offload_state_dict=offload_state_dict,
dtype=dtype,
hf_quantizer=hf_quantizer,
keep_in_fp32_regex=keep_in_fp32_regex,
device_mesh=device_mesh,
key_mapping=key_mapping,
weights_only=weights_only,
)
# make sure token embedding weights are still tied if needed
model.tie_weights()
@ -5292,15 +5145,12 @@ class PreTrainedModel(nn.Module, EmbeddingAccessMixin, ModuleUtilsMixin, PushToH
)
if output_loading_info:
if from_pt:
loading_info = {
"missing_keys": missing_keys,
"unexpected_keys": unexpected_keys,
"mismatched_keys": mismatched_keys,
"error_msgs": error_msgs,
}
elif from_flax:
loading_info = None
loading_info = {
"missing_keys": missing_keys,
"unexpected_keys": unexpected_keys,
"mismatched_keys": mismatched_keys,
"error_msgs": error_msgs,
}
return model, loading_info
return model
@ -5751,44 +5601,6 @@ class PreTrainedModel(nn.Module, EmbeddingAccessMixin, ModuleUtilsMixin, PushToH
return model, missing_keys, unexpected_keys, mismatched_keys, disk_offload_index, error_msgs
@classmethod
def _load_from_tf(cls, model, config, checkpoint_files):
if checkpoint_files[0].endswith(".index"):
# Load from a TensorFlow 1.X checkpoint - provided by original authors
model = cls.load_tf_weights(model, config, checkpoint_files[0][:-6]) # Remove the '.index'
loading_info = None
else:
# Load from our TensorFlow 2.0 checkpoints
try:
from .modeling_tf_pytorch_utils import load_tf2_checkpoint_in_pytorch_model
model, loading_info = load_tf2_checkpoint_in_pytorch_model(
model, checkpoint_files[0], allow_missing_keys=True, output_loading_info=True
)
except ImportError:
logger.error(
"Loading a TensorFlow model in PyTorch, requires both PyTorch and TensorFlow to be installed."
" Please see https://pytorch.org/ and https://www.tensorflow.org/install/ for installation"
" instructions."
)
raise
return model, loading_info
@classmethod
def _load_from_flax(cls, model, checkpoint_files):
try:
from .modeling_flax_pytorch_utils import load_flax_checkpoint_in_pytorch_model
model = load_flax_checkpoint_in_pytorch_model(model, checkpoint_files[0])
except ImportError:
logger.error(
"Loading a Flax model in PyTorch, requires both PyTorch and Flax to be installed. Please see"
" https://pytorch.org/ and https://flax.readthedocs.io/en/latest/installation.html for"
" installation instructions."
)
raise
return model
def retrieve_modules_from_names(self, names, add_prefix=False, remove_prefix=False):
module_keys = {".".join(key.split(".")[:-1]) for key in names}

View File

@ -20,8 +20,6 @@ from ...utils.import_utils import define_import_structure
if TYPE_CHECKING:
from .configuration_albert import *
from .modeling_albert import *
from .modeling_flax_albert import *
from .modeling_tf_albert import *
from .tokenization_albert import *
from .tokenization_albert_fast import *
else:

View File

@ -15,7 +15,6 @@
"""PyTorch ALBERT model."""
import math
import os
from dataclasses import dataclass
from typing import Optional, Union
@ -47,132 +46,6 @@ from .configuration_albert import AlbertConfig
logger = logging.get_logger(__name__)
def load_tf_weights_in_albert(model, config, tf_checkpoint_path):
"""Load tf checkpoints in a pytorch model."""
try:
import re
import numpy as np
import tensorflow as tf
except ImportError:
logger.error(
"Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see "
"https://www.tensorflow.org/install/ for installation instructions."
)
raise
tf_path = os.path.abspath(tf_checkpoint_path)
logger.info(f"Converting TensorFlow checkpoint from {tf_path}")
# Load weights from TF model
init_vars = tf.train.list_variables(tf_path)
names = []
arrays = []
for name, shape in init_vars:
logger.info(f"Loading TF weight {name} with shape {shape}")
array = tf.train.load_variable(tf_path, name)
names.append(name)
arrays.append(array)
for name, array in zip(names, arrays):
print(name)
for name, array in zip(names, arrays):
original_name = name
# If saved from the TF HUB module
name = name.replace("module/", "")
# Renaming and simplifying
name = name.replace("ffn_1", "ffn")
name = name.replace("bert/", "albert/")
name = name.replace("attention_1", "attention")
name = name.replace("transform/", "")
name = name.replace("LayerNorm_1", "full_layer_layer_norm")
name = name.replace("LayerNorm", "attention/LayerNorm")
name = name.replace("transformer/", "")
# The feed forward layer had an 'intermediate' step which has been abstracted away
name = name.replace("intermediate/dense/", "")
name = name.replace("ffn/intermediate/output/dense/", "ffn_output/")
# ALBERT attention was split between self and output which have been abstracted away
name = name.replace("/output/", "/")
name = name.replace("/self/", "/")
# The pooler is a linear layer
name = name.replace("pooler/dense", "pooler")
# The classifier was simplified to predictions from cls/predictions
name = name.replace("cls/predictions", "predictions")
name = name.replace("predictions/attention", "predictions")
# Naming was changed to be more explicit
name = name.replace("embeddings/attention", "embeddings")
name = name.replace("inner_group_", "albert_layers/")
name = name.replace("group_", "albert_layer_groups/")
# Classifier
if len(name.split("/")) == 1 and ("output_bias" in name or "output_weights" in name):
name = "classifier/" + name
# No ALBERT model currently handles the next sentence prediction task
if "seq_relationship" in name:
name = name.replace("seq_relationship/output_", "sop_classifier/classifier/")
name = name.replace("weights", "weight")
name = name.split("/")
# Ignore the gradients applied by the LAMB/ADAM optimizers.
if (
"adam_m" in name
or "adam_v" in name
or "AdamWeightDecayOptimizer" in name
or "AdamWeightDecayOptimizer_1" in name
or "global_step" in name
):
logger.info(f"Skipping {'/'.join(name)}")
continue
pointer = model
for m_name in name:
if re.fullmatch(r"[A-Za-z]+_\d+", m_name):
scope_names = re.split(r"_(\d+)", m_name)
else:
scope_names = [m_name]
if scope_names[0] == "kernel" or scope_names[0] == "gamma":
pointer = getattr(pointer, "weight")
elif scope_names[0] == "output_bias" or scope_names[0] == "beta":
pointer = getattr(pointer, "bias")
elif scope_names[0] == "output_weights":
pointer = getattr(pointer, "weight")
elif scope_names[0] == "squad":
pointer = getattr(pointer, "classifier")
else:
try:
pointer = getattr(pointer, scope_names[0])
except AttributeError:
logger.info(f"Skipping {'/'.join(name)}")
continue
if len(scope_names) >= 2:
num = int(scope_names[1])
pointer = pointer[num]
if m_name[-11:] == "_embeddings":
pointer = getattr(pointer, "weight")
elif m_name == "kernel":
array = np.transpose(array)
try:
if pointer.shape != array.shape:
raise ValueError(f"Pointer shape {pointer.shape} and array shape {array.shape} mismatched")
except ValueError as e:
e.args += (pointer.shape, array.shape)
raise
print(f"Initialize PyTorch weight {name} from {original_name}")
pointer.data = torch.from_numpy(array)
return model
class AlbertEmbeddings(nn.Module):
"""
Construct the embeddings from word, position and token_type embeddings.
@ -184,8 +57,6 @@ class AlbertEmbeddings(nn.Module):
self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.embedding_size)
self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.embedding_size)
# self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
# any TensorFlow checkpoint file
self.LayerNorm = nn.LayerNorm(config.embedding_size, eps=config.layer_norm_eps)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
@ -546,15 +417,12 @@ class AlbertTransformer(nn.Module):
@auto_docstring
class AlbertPreTrainedModel(PreTrainedModel):
config: AlbertConfig
load_tf_weights = load_tf_weights_in_albert
base_model_prefix = "albert"
_supports_sdpa = True
def _init_weights(self, module):
"""Initialize the weights."""
if isinstance(module, nn.Linear):
# Slightly different from the TF version which uses truncated_normal for initialization
# cf https://github.com/pytorch/pytorch/pull/5617
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
if module.bias is not None:
module.bias.data.zero_()
@ -1337,7 +1205,6 @@ class AlbertForMultipleChoice(AlbertPreTrainedModel):
__all__ = [
"load_tf_weights_in_albert",
"AlbertPreTrainedModel",
"AlbertModel",
"AlbertForPreTraining",

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -516,8 +516,6 @@ class AlignTextEmbeddings(nn.Module):
self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)
# self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
# any TensorFlow checkpoint file
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
# position_ids (1, len position emb) is contiguous in memory and exported when serialized

View File

@ -101,8 +101,6 @@ class AltRobertaEmbeddings(nn.Module):
self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)
# self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
# any TensorFlow checkpoint file
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
# position_ids (1, len position emb) is contiguous in memory and exported when serialized

View File

@ -232,10 +232,7 @@ class AriaImageProcessor(BaseImageProcessor):
images = make_flat_list_of_images(images)
if not valid_images(images):
raise ValueError(
"Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
"torch.Tensor, tf.Tensor or jax.ndarray."
)
raise ValueError("Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, or torch.Tensor")
validate_preprocess_arguments(
do_normalize=do_normalize,

View File

@ -615,10 +615,7 @@ class AriaImageProcessor(BaseImageProcessor):
images = make_flat_list_of_images(images)
if not valid_images(images):
raise ValueError(
"Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
"torch.Tensor, tf.Tensor or jax.ndarray."
)
raise ValueError("Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, or torch.Tensor")
validate_preprocess_arguments(
do_normalize=do_normalize,

View File

@ -179,7 +179,6 @@ class ASTFeatureExtractor(SequenceFeatureExtractor):
return_tensors (`str` or [`~utils.TensorType`], *optional*):
If set, will return tensors instead of list of python integers. Acceptable values are:
- `'tf'`: Return TensorFlow `tf.constant` objects.
- `'pt'`: Return PyTorch `torch.Tensor` objects.
- `'np'`: Return Numpy `np.ndarray` objects.
"""

View File

@ -23,8 +23,6 @@ if TYPE_CHECKING:
from .feature_extraction_auto import *
from .image_processing_auto import *
from .modeling_auto import *
from .modeling_flax_auto import *
from .modeling_tf_auto import *
from .processing_auto import *
from .tokenization_auto import *
from .video_processing_auto import *

View File

@ -102,10 +102,6 @@ FROM_PRETRAINED_TORCH_DOCSTRING = """
- A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co.
- A path to a *directory* containing model weights saved using
[`~PreTrainedModel.save_pretrained`], e.g., `./my_model_directory/`.
- A path or url to a *tensorflow index checkpoint file* (e.g, `./tf_model/model.ckpt.index`). In
this case, `from_tf` should be set to `True` and a configuration object should be provided as
`config` argument. This loading path is slower than converting the TensorFlow checkpoint in a
PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.
model_args (additional positional arguments, *optional*):
Will be passed along to the underlying model `__init__()` method.
config ([`PretrainedConfig`], *optional*):
@ -127,9 +123,6 @@ FROM_PRETRAINED_TORCH_DOCSTRING = """
cache_dir (`str` or `os.PathLike`, *optional*):
Path to a directory in which a downloaded pretrained model configuration should be cached if the
standard cache should not be used.
from_tf (`bool`, *optional*, defaults to `False`):
Load the model weights from a TensorFlow checkpoint save file (see docstring of
`pretrained_model_name_or_path` argument).
force_download (`bool`, *optional*, defaults to `False`):
Whether or not to force the (re-)download of the model weights and configuration files, overriding the
cached versions if they exist.
@ -182,210 +175,6 @@ FROM_PRETRAINED_TORCH_DOCSTRING = """
>>> model = BaseAutoModelClass.from_pretrained("checkpoint_placeholder", output_attentions=True)
>>> model.config.output_attentions
True
>>> # Loading from a TF checkpoint file instead of a PyTorch model (slower)
>>> config = AutoConfig.from_pretrained("./tf_model/shortcut_placeholder_tf_model_config.json")
>>> model = BaseAutoModelClass.from_pretrained(
... "./tf_model/shortcut_placeholder_tf_checkpoint.ckpt.index", from_tf=True, config=config
... )
```
"""
FROM_PRETRAINED_TF_DOCSTRING = """
Instantiate one of the model classes of the library from a pretrained model.
The model class to instantiate is selected based on the `model_type` property of the config object (either
passed as an argument or loaded from `pretrained_model_name_or_path` if possible), or when it's missing, by
falling back to using pattern matching on `pretrained_model_name_or_path`:
List options
Args:
pretrained_model_name_or_path (`str` or `os.PathLike`):
Can be either:
- A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co.
- A path to a *directory* containing model weights saved using
[`~PreTrainedModel.save_pretrained`], e.g., `./my_model_directory/`.
- A path or url to a *PyTorch state_dict save file* (e.g, `./pt_model/pytorch_model.bin`). In this
case, `from_pt` should be set to `True` and a configuration object should be provided as `config`
argument. This loading path is slower than converting the PyTorch model in a TensorFlow model
using the provided conversion scripts and loading the TensorFlow model afterwards.
model_args (additional positional arguments, *optional*):
Will be passed along to the underlying model `__init__()` method.
config ([`PretrainedConfig`], *optional*):
Configuration for the model to use instead of an automatically loaded configuration. Configuration can
be automatically loaded when:
- The model is a model provided by the library (loaded with the *model id* string of a pretrained
model).
- The model was saved using [`~PreTrainedModel.save_pretrained`] and is reloaded by supplying the
save directory.
- The model is loaded by supplying a local directory as `pretrained_model_name_or_path` and a
configuration JSON file named *config.json* is found in the directory.
cache_dir (`str` or `os.PathLike`, *optional*):
Path to a directory in which a downloaded pretrained model configuration should be cached if the
standard cache should not be used.
from_pt (`bool`, *optional*, defaults to `False`):
Load the model weights from a PyTorch checkpoint save file (see docstring of
`pretrained_model_name_or_path` argument).
force_download (`bool`, *optional*, defaults to `False`):
Whether or not to force the (re-)download of the model weights and configuration files, overriding the
cached versions if they exist.
resume_download:
Deprecated and ignored. All downloads are now resumed by default when possible.
Will be removed in v5 of Transformers.
proxies (`dict[str, str]`, *optional*):
A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128',
'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
output_loading_info(`bool`, *optional*, defaults to `False`):
Whether ot not to also return a dictionary containing missing keys, unexpected keys and error messages.
local_files_only(`bool`, *optional*, defaults to `False`):
Whether or not to only look at local files (e.g., not try downloading the model).
revision (`str`, *optional*, defaults to `"main"`):
The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any
identifier allowed by git.
trust_remote_code (`bool`, *optional*, defaults to `False`):
Whether or not to allow for custom models defined on the Hub in their own modeling files. This option
should only be set to `True` for repositories you trust and in which you have read the code, as it will
execute code present on the Hub on your local machine.
code_revision (`str`, *optional*, defaults to `"main"`):
The specific revision to use for the code on the Hub, if the code leaves in a different repository than
the rest of the model. It can be a branch name, a tag name, or a commit id, since we use a git-based
system for storing models and other artifacts on huggingface.co, so `revision` can be any identifier
allowed by git.
kwargs (additional keyword arguments, *optional*):
Can be used to update the configuration object (after it being loaded) and initiate the model (e.g.,
`output_attentions=True`). Behaves differently depending on whether a `config` is provided or
automatically loaded:
- If a configuration is provided with `config`, `**kwargs` will be directly passed to the
underlying model's `__init__` method (we assume all relevant updates to the configuration have
already been done)
- If a configuration is not provided, `kwargs` will be first passed to the configuration class
initialization function ([`~PretrainedConfig.from_pretrained`]). Each key of `kwargs` that
corresponds to a configuration attribute will be used to override said attribute with the
supplied `kwargs` value. Remaining keys that do not correspond to any configuration attribute
will be passed to the underlying model's `__init__` function.
Examples:
```python
>>> from transformers import AutoConfig, BaseAutoModelClass
>>> # Download model and configuration from huggingface.co and cache.
>>> model = BaseAutoModelClass.from_pretrained("checkpoint_placeholder")
>>> # Update configuration during loading
>>> model = BaseAutoModelClass.from_pretrained("checkpoint_placeholder", output_attentions=True)
>>> model.config.output_attentions
True
>>> # Loading from a PyTorch checkpoint file instead of a TensorFlow model (slower)
>>> config = AutoConfig.from_pretrained("./pt_model/shortcut_placeholder_pt_model_config.json")
>>> model = BaseAutoModelClass.from_pretrained(
... "./pt_model/shortcut_placeholder_pytorch_model.bin", from_pt=True, config=config
... )
```
"""
FROM_PRETRAINED_FLAX_DOCSTRING = """
Instantiate one of the model classes of the library from a pretrained model.
The model class to instantiate is selected based on the `model_type` property of the config object (either
passed as an argument or loaded from `pretrained_model_name_or_path` if possible), or when it's missing, by
falling back to using pattern matching on `pretrained_model_name_or_path`:
List options
Args:
pretrained_model_name_or_path (`str` or `os.PathLike`):
Can be either:
- A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co.
- A path to a *directory* containing model weights saved using
[`~PreTrainedModel.save_pretrained`], e.g., `./my_model_directory/`.
- A path or url to a *PyTorch state_dict save file* (e.g, `./pt_model/pytorch_model.bin`). In this
case, `from_pt` should be set to `True` and a configuration object should be provided as `config`
argument. This loading path is slower than converting the PyTorch model in a TensorFlow model
using the provided conversion scripts and loading the TensorFlow model afterwards.
model_args (additional positional arguments, *optional*):
Will be passed along to the underlying model `__init__()` method.
config ([`PretrainedConfig`], *optional*):
Configuration for the model to use instead of an automatically loaded configuration. Configuration can
be automatically loaded when:
- The model is a model provided by the library (loaded with the *model id* string of a pretrained
model).
- The model was saved using [`~PreTrainedModel.save_pretrained`] and is reloaded by supplying the
save directory.
- The model is loaded by supplying a local directory as `pretrained_model_name_or_path` and a
configuration JSON file named *config.json* is found in the directory.
cache_dir (`str` or `os.PathLike`, *optional*):
Path to a directory in which a downloaded pretrained model configuration should be cached if the
standard cache should not be used.
from_pt (`bool`, *optional*, defaults to `False`):
Load the model weights from a PyTorch checkpoint save file (see docstring of
`pretrained_model_name_or_path` argument).
force_download (`bool`, *optional*, defaults to `False`):
Whether or not to force the (re-)download of the model weights and configuration files, overriding the
cached versions if they exist.
resume_download:
Deprecated and ignored. All downloads are now resumed by default when possible.
Will be removed in v5 of Transformers.
proxies (`dict[str, str]`, *optional*):
A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128',
'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
output_loading_info(`bool`, *optional*, defaults to `False`):
Whether ot not to also return a dictionary containing missing keys, unexpected keys and error messages.
local_files_only(`bool`, *optional*, defaults to `False`):
Whether or not to only look at local files (e.g., not try downloading the model).
revision (`str`, *optional*, defaults to `"main"`):
The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any
identifier allowed by git.
trust_remote_code (`bool`, *optional*, defaults to `False`):
Whether or not to allow for custom models defined on the Hub in their own modeling files. This option
should only be set to `True` for repositories you trust and in which you have read the code, as it will
execute code present on the Hub on your local machine.
code_revision (`str`, *optional*, defaults to `"main"`):
The specific revision to use for the code on the Hub, if the code leaves in a different repository than
the rest of the model. It can be a branch name, a tag name, or a commit id, since we use a git-based
system for storing models and other artifacts on huggingface.co, so `revision` can be any identifier
allowed by git.
kwargs (additional keyword arguments, *optional*):
Can be used to update the configuration object (after it being loaded) and initiate the model (e.g.,
`output_attentions=True`). Behaves differently depending on whether a `config` is provided or
automatically loaded:
- If a configuration is provided with `config`, `**kwargs` will be directly passed to the
underlying model's `__init__` method (we assume all relevant updates to the configuration have
already been done)
- If a configuration is not provided, `kwargs` will be first passed to the configuration class
initialization function ([`~PretrainedConfig.from_pretrained`]). Each key of `kwargs` that
corresponds to a configuration attribute will be used to override said attribute with the
supplied `kwargs` value. Remaining keys that do not correspond to any configuration attribute
will be passed to the underlying model's `__init__` function.
Examples:
```python
>>> from transformers import AutoConfig, BaseAutoModelClass
>>> # Download model and configuration from huggingface.co and cache.
>>> model = BaseAutoModelClass.from_pretrained("checkpoint_placeholder")
>>> # Update configuration during loading
>>> model = BaseAutoModelClass.from_pretrained("checkpoint_placeholder", output_attentions=True)
>>> model.config.output_attentions
True
>>> # Loading from a PyTorch checkpoint file instead of a TensorFlow model (slower)
>>> config = AutoConfig.from_pretrained("./pt_model/shortcut_placeholder_pt_model_config.json")
>>> model = BaseAutoModelClass.from_pretrained(
... "./pt_model/shortcut_placeholder_pytorch_model.bin", from_pt=True, config=config
... )
```
"""
@ -400,10 +189,6 @@ def _get_model_class(config, model_mapping):
for arch in architectures:
if arch in name_to_model:
return name_to_model[arch]
elif f"TF{arch}" in name_to_model:
return name_to_model[f"TF{arch}"]
elif f"Flax{arch}" in name_to_model:
return name_to_model[f"Flax{arch}"]
# If not architecture is set in the config or match the supported models, the first element of the tuple is the
# defaults.
@ -696,12 +481,7 @@ def auto_class_update(cls, checkpoint_for_example: str = "google-bert/bert-base-
from_config = replace_list_option_in_docstrings(model_mapping._model_mapping, use_model_types=False)(from_config)
cls.from_config = classmethod(from_config)
if name.startswith("TF"):
from_pretrained_docstring = FROM_PRETRAINED_TF_DOCSTRING
elif name.startswith("Flax"):
from_pretrained_docstring = FROM_PRETRAINED_FLAX_DOCSTRING
else:
from_pretrained_docstring = FROM_PRETRAINED_TORCH_DOCSTRING
from_pretrained_docstring = FROM_PRETRAINED_TORCH_DOCSTRING
from_pretrained = copy_func(_BaseAutoModelClass.from_pretrained)
from_pretrained_docstring = insert_head_doc(from_pretrained_docstring, head_doc=head_doc)
from_pretrained_docstring = from_pretrained_docstring.replace("BaseAutoModelClass", name)

View File

@ -1,413 +0,0 @@
# coding=utf-8
# Copyright 2018 The Google Flax Team Authors and The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Auto Model class."""
from collections import OrderedDict
from ...utils import logging
from .auto_factory import _BaseAutoModelClass, _LazyAutoMapping, auto_class_update
from .configuration_auto import CONFIG_MAPPING_NAMES
logger = logging.get_logger(__name__)
FLAX_MODEL_MAPPING_NAMES = OrderedDict(
[
# Base model mapping
("albert", "FlaxAlbertModel"),
("bart", "FlaxBartModel"),
("beit", "FlaxBeitModel"),
("bert", "FlaxBertModel"),
("big_bird", "FlaxBigBirdModel"),
("blenderbot", "FlaxBlenderbotModel"),
("blenderbot-small", "FlaxBlenderbotSmallModel"),
("bloom", "FlaxBloomModel"),
("clip", "FlaxCLIPModel"),
("dinov2", "FlaxDinov2Model"),
("distilbert", "FlaxDistilBertModel"),
("electra", "FlaxElectraModel"),
("gemma", "FlaxGemmaModel"),
("gpt-sw3", "FlaxGPT2Model"),
("gpt2", "FlaxGPT2Model"),
("gpt_neo", "FlaxGPTNeoModel"),
("gptj", "FlaxGPTJModel"),
("llama", "FlaxLlamaModel"),
("longt5", "FlaxLongT5Model"),
("marian", "FlaxMarianModel"),
("mbart", "FlaxMBartModel"),
("mistral", "FlaxMistralModel"),
("mt5", "FlaxMT5Model"),
("opt", "FlaxOPTModel"),
("pegasus", "FlaxPegasusModel"),
("regnet", "FlaxRegNetModel"),
("resnet", "FlaxResNetModel"),
("roberta", "FlaxRobertaModel"),
("roberta-prelayernorm", "FlaxRobertaPreLayerNormModel"),
("roformer", "FlaxRoFormerModel"),
("t5", "FlaxT5Model"),
("vision-text-dual-encoder", "FlaxVisionTextDualEncoderModel"),
("vit", "FlaxViTModel"),
("wav2vec2", "FlaxWav2Vec2Model"),
("whisper", "FlaxWhisperModel"),
("xglm", "FlaxXGLMModel"),
("xlm-roberta", "FlaxXLMRobertaModel"),
]
)
FLAX_MODEL_FOR_PRETRAINING_MAPPING_NAMES = OrderedDict(
[
# Model for pre-training mapping
("albert", "FlaxAlbertForPreTraining"),
("bart", "FlaxBartForConditionalGeneration"),
("bert", "FlaxBertForPreTraining"),
("big_bird", "FlaxBigBirdForPreTraining"),
("electra", "FlaxElectraForPreTraining"),
("longt5", "FlaxLongT5ForConditionalGeneration"),
("mbart", "FlaxMBartForConditionalGeneration"),
("mt5", "FlaxMT5ForConditionalGeneration"),
("roberta", "FlaxRobertaForMaskedLM"),
("roberta-prelayernorm", "FlaxRobertaPreLayerNormForMaskedLM"),
("roformer", "FlaxRoFormerForMaskedLM"),
("t5", "FlaxT5ForConditionalGeneration"),
("wav2vec2", "FlaxWav2Vec2ForPreTraining"),
("whisper", "FlaxWhisperForConditionalGeneration"),
("xlm-roberta", "FlaxXLMRobertaForMaskedLM"),
]
)
FLAX_MODEL_FOR_MASKED_LM_MAPPING_NAMES = OrderedDict(
[
# Model for Masked LM mapping
("albert", "FlaxAlbertForMaskedLM"),
("bart", "FlaxBartForConditionalGeneration"),
("bert", "FlaxBertForMaskedLM"),
("big_bird", "FlaxBigBirdForMaskedLM"),
("distilbert", "FlaxDistilBertForMaskedLM"),
("electra", "FlaxElectraForMaskedLM"),
("mbart", "FlaxMBartForConditionalGeneration"),
("roberta", "FlaxRobertaForMaskedLM"),
("roberta-prelayernorm", "FlaxRobertaPreLayerNormForMaskedLM"),
("roformer", "FlaxRoFormerForMaskedLM"),
("xlm-roberta", "FlaxXLMRobertaForMaskedLM"),
]
)
FLAX_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES = OrderedDict(
[
# Model for Seq2Seq Causal LM mapping
("bart", "FlaxBartForConditionalGeneration"),
("blenderbot", "FlaxBlenderbotForConditionalGeneration"),
("blenderbot-small", "FlaxBlenderbotSmallForConditionalGeneration"),
("encoder-decoder", "FlaxEncoderDecoderModel"),
("longt5", "FlaxLongT5ForConditionalGeneration"),
("marian", "FlaxMarianMTModel"),
("mbart", "FlaxMBartForConditionalGeneration"),
("mt5", "FlaxMT5ForConditionalGeneration"),
("pegasus", "FlaxPegasusForConditionalGeneration"),
("t5", "FlaxT5ForConditionalGeneration"),
]
)
FLAX_MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES = OrderedDict(
[
# Model for Image-classification
("beit", "FlaxBeitForImageClassification"),
("dinov2", "FlaxDinov2ForImageClassification"),
("regnet", "FlaxRegNetForImageClassification"),
("resnet", "FlaxResNetForImageClassification"),
("vit", "FlaxViTForImageClassification"),
]
)
FLAX_MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES = OrderedDict(
[
("vision-encoder-decoder", "FlaxVisionEncoderDecoderModel"),
]
)
FLAX_MODEL_FOR_CAUSAL_LM_MAPPING_NAMES = OrderedDict(
[
# Model for Causal LM mapping
("bart", "FlaxBartForCausalLM"),
("bert", "FlaxBertForCausalLM"),
("big_bird", "FlaxBigBirdForCausalLM"),
("bloom", "FlaxBloomForCausalLM"),
("electra", "FlaxElectraForCausalLM"),
("gemma", "FlaxGemmaForCausalLM"),
("gpt-sw3", "FlaxGPT2LMHeadModel"),
("gpt2", "FlaxGPT2LMHeadModel"),
("gpt_neo", "FlaxGPTNeoForCausalLM"),
("gptj", "FlaxGPTJForCausalLM"),
("llama", "FlaxLlamaForCausalLM"),
("mistral", "FlaxMistralForCausalLM"),
("opt", "FlaxOPTForCausalLM"),
("roberta", "FlaxRobertaForCausalLM"),
("roberta-prelayernorm", "FlaxRobertaPreLayerNormForCausalLM"),
("xglm", "FlaxXGLMForCausalLM"),
("xlm-roberta", "FlaxXLMRobertaForCausalLM"),
]
)
FLAX_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES = OrderedDict(
[
# Model for Sequence Classification mapping
("albert", "FlaxAlbertForSequenceClassification"),
("bart", "FlaxBartForSequenceClassification"),
("bert", "FlaxBertForSequenceClassification"),
("big_bird", "FlaxBigBirdForSequenceClassification"),
("distilbert", "FlaxDistilBertForSequenceClassification"),
("electra", "FlaxElectraForSequenceClassification"),
("mbart", "FlaxMBartForSequenceClassification"),
("roberta", "FlaxRobertaForSequenceClassification"),
("roberta-prelayernorm", "FlaxRobertaPreLayerNormForSequenceClassification"),
("roformer", "FlaxRoFormerForSequenceClassification"),
("xlm-roberta", "FlaxXLMRobertaForSequenceClassification"),
]
)
FLAX_MODEL_FOR_QUESTION_ANSWERING_MAPPING_NAMES = OrderedDict(
[
# Model for Question Answering mapping
("albert", "FlaxAlbertForQuestionAnswering"),
("bart", "FlaxBartForQuestionAnswering"),
("bert", "FlaxBertForQuestionAnswering"),
("big_bird", "FlaxBigBirdForQuestionAnswering"),
("distilbert", "FlaxDistilBertForQuestionAnswering"),
("electra", "FlaxElectraForQuestionAnswering"),
("mbart", "FlaxMBartForQuestionAnswering"),
("roberta", "FlaxRobertaForQuestionAnswering"),
("roberta-prelayernorm", "FlaxRobertaPreLayerNormForQuestionAnswering"),
("roformer", "FlaxRoFormerForQuestionAnswering"),
("xlm-roberta", "FlaxXLMRobertaForQuestionAnswering"),
]
)
FLAX_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES = OrderedDict(
[
# Model for Token Classification mapping
("albert", "FlaxAlbertForTokenClassification"),
("bert", "FlaxBertForTokenClassification"),
("big_bird", "FlaxBigBirdForTokenClassification"),
("distilbert", "FlaxDistilBertForTokenClassification"),
("electra", "FlaxElectraForTokenClassification"),
("roberta", "FlaxRobertaForTokenClassification"),
("roberta-prelayernorm", "FlaxRobertaPreLayerNormForTokenClassification"),
("roformer", "FlaxRoFormerForTokenClassification"),
("xlm-roberta", "FlaxXLMRobertaForTokenClassification"),
]
)
FLAX_MODEL_FOR_MULTIPLE_CHOICE_MAPPING_NAMES = OrderedDict(
[
# Model for Multiple Choice mapping
("albert", "FlaxAlbertForMultipleChoice"),
("bert", "FlaxBertForMultipleChoice"),
("big_bird", "FlaxBigBirdForMultipleChoice"),
("distilbert", "FlaxDistilBertForMultipleChoice"),
("electra", "FlaxElectraForMultipleChoice"),
("roberta", "FlaxRobertaForMultipleChoice"),
("roberta-prelayernorm", "FlaxRobertaPreLayerNormForMultipleChoice"),
("roformer", "FlaxRoFormerForMultipleChoice"),
("xlm-roberta", "FlaxXLMRobertaForMultipleChoice"),
]
)
FLAX_MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING_NAMES = OrderedDict(
[
("bert", "FlaxBertForNextSentencePrediction"),
]
)
FLAX_MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING_NAMES = OrderedDict(
[
("speech-encoder-decoder", "FlaxSpeechEncoderDecoderModel"),
("whisper", "FlaxWhisperForConditionalGeneration"),
]
)
FLAX_MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING_NAMES = OrderedDict(
[
("whisper", "FlaxWhisperForAudioClassification"),
]
)
FLAX_MODEL_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, FLAX_MODEL_MAPPING_NAMES)
FLAX_MODEL_FOR_PRETRAINING_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, FLAX_MODEL_FOR_PRETRAINING_MAPPING_NAMES)
FLAX_MODEL_FOR_MASKED_LM_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, FLAX_MODEL_FOR_MASKED_LM_MAPPING_NAMES)
FLAX_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING = _LazyAutoMapping(
CONFIG_MAPPING_NAMES, FLAX_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES
)
FLAX_MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING = _LazyAutoMapping(
CONFIG_MAPPING_NAMES, FLAX_MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES
)
FLAX_MODEL_FOR_VISION_2_SEQ_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, FLAX_MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES)
FLAX_MODEL_FOR_CAUSAL_LM_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, FLAX_MODEL_FOR_CAUSAL_LM_MAPPING_NAMES)
FLAX_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING = _LazyAutoMapping(
CONFIG_MAPPING_NAMES, FLAX_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES
)
FLAX_MODEL_FOR_QUESTION_ANSWERING_MAPPING = _LazyAutoMapping(
CONFIG_MAPPING_NAMES, FLAX_MODEL_FOR_QUESTION_ANSWERING_MAPPING_NAMES
)
FLAX_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING = _LazyAutoMapping(
CONFIG_MAPPING_NAMES, FLAX_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES
)
FLAX_MODEL_FOR_MULTIPLE_CHOICE_MAPPING = _LazyAutoMapping(
CONFIG_MAPPING_NAMES, FLAX_MODEL_FOR_MULTIPLE_CHOICE_MAPPING_NAMES
)
FLAX_MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING = _LazyAutoMapping(
CONFIG_MAPPING_NAMES, FLAX_MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING_NAMES
)
FLAX_MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING = _LazyAutoMapping(
CONFIG_MAPPING_NAMES, FLAX_MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING_NAMES
)
FLAX_MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING = _LazyAutoMapping(
CONFIG_MAPPING_NAMES, FLAX_MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING_NAMES
)
class FlaxAutoModel(_BaseAutoModelClass):
_model_mapping = FLAX_MODEL_MAPPING
FlaxAutoModel = auto_class_update(FlaxAutoModel)
class FlaxAutoModelForPreTraining(_BaseAutoModelClass):
_model_mapping = FLAX_MODEL_FOR_PRETRAINING_MAPPING
FlaxAutoModelForPreTraining = auto_class_update(FlaxAutoModelForPreTraining, head_doc="pretraining")
class FlaxAutoModelForCausalLM(_BaseAutoModelClass):
_model_mapping = FLAX_MODEL_FOR_CAUSAL_LM_MAPPING
FlaxAutoModelForCausalLM = auto_class_update(FlaxAutoModelForCausalLM, head_doc="causal language modeling")
class FlaxAutoModelForMaskedLM(_BaseAutoModelClass):
_model_mapping = FLAX_MODEL_FOR_MASKED_LM_MAPPING
FlaxAutoModelForMaskedLM = auto_class_update(FlaxAutoModelForMaskedLM, head_doc="masked language modeling")
class FlaxAutoModelForSeq2SeqLM(_BaseAutoModelClass):
_model_mapping = FLAX_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING
FlaxAutoModelForSeq2SeqLM = auto_class_update(
FlaxAutoModelForSeq2SeqLM,
head_doc="sequence-to-sequence language modeling",
checkpoint_for_example="google-t5/t5-base",
)
class FlaxAutoModelForSequenceClassification(_BaseAutoModelClass):
_model_mapping = FLAX_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING
FlaxAutoModelForSequenceClassification = auto_class_update(
FlaxAutoModelForSequenceClassification, head_doc="sequence classification"
)
class FlaxAutoModelForQuestionAnswering(_BaseAutoModelClass):
_model_mapping = FLAX_MODEL_FOR_QUESTION_ANSWERING_MAPPING
FlaxAutoModelForQuestionAnswering = auto_class_update(FlaxAutoModelForQuestionAnswering, head_doc="question answering")
class FlaxAutoModelForTokenClassification(_BaseAutoModelClass):
_model_mapping = FLAX_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING
FlaxAutoModelForTokenClassification = auto_class_update(
FlaxAutoModelForTokenClassification, head_doc="token classification"
)
class FlaxAutoModelForMultipleChoice(_BaseAutoModelClass):
_model_mapping = FLAX_MODEL_FOR_MULTIPLE_CHOICE_MAPPING
FlaxAutoModelForMultipleChoice = auto_class_update(FlaxAutoModelForMultipleChoice, head_doc="multiple choice")
class FlaxAutoModelForNextSentencePrediction(_BaseAutoModelClass):
_model_mapping = FLAX_MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING
FlaxAutoModelForNextSentencePrediction = auto_class_update(
FlaxAutoModelForNextSentencePrediction, head_doc="next sentence prediction"
)
class FlaxAutoModelForImageClassification(_BaseAutoModelClass):
_model_mapping = FLAX_MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING
FlaxAutoModelForImageClassification = auto_class_update(
FlaxAutoModelForImageClassification, head_doc="image classification"
)
class FlaxAutoModelForVision2Seq(_BaseAutoModelClass):
_model_mapping = FLAX_MODEL_FOR_VISION_2_SEQ_MAPPING
FlaxAutoModelForVision2Seq = auto_class_update(FlaxAutoModelForVision2Seq, head_doc="vision-to-text modeling")
class FlaxAutoModelForSpeechSeq2Seq(_BaseAutoModelClass):
_model_mapping = FLAX_MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING
FlaxAutoModelForSpeechSeq2Seq = auto_class_update(
FlaxAutoModelForSpeechSeq2Seq, head_doc="sequence-to-sequence speech-to-text modeling"
)
__all__ = [
"FLAX_MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING",
"FLAX_MODEL_FOR_CAUSAL_LM_MAPPING",
"FLAX_MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING",
"FLAX_MODEL_FOR_MASKED_LM_MAPPING",
"FLAX_MODEL_FOR_MULTIPLE_CHOICE_MAPPING",
"FLAX_MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING",
"FLAX_MODEL_FOR_PRETRAINING_MAPPING",
"FLAX_MODEL_FOR_QUESTION_ANSWERING_MAPPING",
"FLAX_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING",
"FLAX_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING",
"FLAX_MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING",
"FLAX_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING",
"FLAX_MODEL_FOR_VISION_2_SEQ_MAPPING",
"FLAX_MODEL_MAPPING",
"FlaxAutoModel",
"FlaxAutoModelForCausalLM",
"FlaxAutoModelForImageClassification",
"FlaxAutoModelForMaskedLM",
"FlaxAutoModelForMultipleChoice",
"FlaxAutoModelForNextSentencePrediction",
"FlaxAutoModelForPreTraining",
"FlaxAutoModelForQuestionAnswering",
"FlaxAutoModelForSeq2SeqLM",
"FlaxAutoModelForSequenceClassification",
"FlaxAutoModelForSpeechSeq2Seq",
"FlaxAutoModelForTokenClassification",
"FlaxAutoModelForVision2Seq",
]

View File

@ -1,776 +0,0 @@
# coding=utf-8
# Copyright 2018 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Auto Model class."""
import warnings
from collections import OrderedDict
from ...utils import logging
from .auto_factory import _BaseAutoModelClass, _LazyAutoMapping, auto_class_update
from .configuration_auto import CONFIG_MAPPING_NAMES
logger = logging.get_logger(__name__)
TF_MODEL_MAPPING_NAMES = OrderedDict(
[
# Base model mapping
("albert", "TFAlbertModel"),
("bart", "TFBartModel"),
("bert", "TFBertModel"),
("blenderbot", "TFBlenderbotModel"),
("blenderbot-small", "TFBlenderbotSmallModel"),
("blip", "TFBlipModel"),
("camembert", "TFCamembertModel"),
("clip", "TFCLIPModel"),
("convbert", "TFConvBertModel"),
("convnext", "TFConvNextModel"),
("convnextv2", "TFConvNextV2Model"),
("ctrl", "TFCTRLModel"),
("cvt", "TFCvtModel"),
("data2vec-vision", "TFData2VecVisionModel"),
("deberta", "TFDebertaModel"),
("deberta-v2", "TFDebertaV2Model"),
("deit", "TFDeiTModel"),
("distilbert", "TFDistilBertModel"),
("dpr", "TFDPRQuestionEncoder"),
("efficientformer", "TFEfficientFormerModel"),
("electra", "TFElectraModel"),
("esm", "TFEsmModel"),
("flaubert", "TFFlaubertModel"),
("funnel", ("TFFunnelModel", "TFFunnelBaseModel")),
("gpt-sw3", "TFGPT2Model"),
("gpt2", "TFGPT2Model"),
("gptj", "TFGPTJModel"),
("groupvit", "TFGroupViTModel"),
("hubert", "TFHubertModel"),
("idefics", "TFIdeficsModel"),
("layoutlm", "TFLayoutLMModel"),
("layoutlmv3", "TFLayoutLMv3Model"),
("led", "TFLEDModel"),
("longformer", "TFLongformerModel"),
("lxmert", "TFLxmertModel"),
("marian", "TFMarianModel"),
("mbart", "TFMBartModel"),
("mistral", "TFMistralModel"),
("mobilebert", "TFMobileBertModel"),
("mobilevit", "TFMobileViTModel"),
("mpnet", "TFMPNetModel"),
("mt5", "TFMT5Model"),
("openai-gpt", "TFOpenAIGPTModel"),
("opt", "TFOPTModel"),
("pegasus", "TFPegasusModel"),
("regnet", "TFRegNetModel"),
("rembert", "TFRemBertModel"),
("resnet", "TFResNetModel"),
("roberta", "TFRobertaModel"),
("roberta-prelayernorm", "TFRobertaPreLayerNormModel"),
("roformer", "TFRoFormerModel"),
("sam", "TFSamModel"),
("sam_vision_model", "TFSamVisionModel"),
("segformer", "TFSegformerModel"),
("speech_to_text", "TFSpeech2TextModel"),
("swiftformer", "TFSwiftFormerModel"),
("swin", "TFSwinModel"),
("t5", "TFT5Model"),
("tapas", "TFTapasModel"),
("transfo-xl", "TFTransfoXLModel"),
("vision-text-dual-encoder", "TFVisionTextDualEncoderModel"),
("vit", "TFViTModel"),
("vit_mae", "TFViTMAEModel"),
("wav2vec2", "TFWav2Vec2Model"),
("whisper", "TFWhisperModel"),
("xglm", "TFXGLMModel"),
("xlm", "TFXLMModel"),
("xlm-roberta", "TFXLMRobertaModel"),
("xlnet", "TFXLNetModel"),
]
)
TF_MODEL_FOR_PRETRAINING_MAPPING_NAMES = OrderedDict(
[
# Model for pre-training mapping
("albert", "TFAlbertForPreTraining"),
("bart", "TFBartForConditionalGeneration"),
("bert", "TFBertForPreTraining"),
("camembert", "TFCamembertForMaskedLM"),
("ctrl", "TFCTRLLMHeadModel"),
("distilbert", "TFDistilBertForMaskedLM"),
("electra", "TFElectraForPreTraining"),
("flaubert", "TFFlaubertWithLMHeadModel"),
("funnel", "TFFunnelForPreTraining"),
("gpt-sw3", "TFGPT2LMHeadModel"),
("gpt2", "TFGPT2LMHeadModel"),
("idefics", "TFIdeficsForVisionText2Text"),
("layoutlm", "TFLayoutLMForMaskedLM"),
("lxmert", "TFLxmertForPreTraining"),
("mobilebert", "TFMobileBertForPreTraining"),
("mpnet", "TFMPNetForMaskedLM"),
("openai-gpt", "TFOpenAIGPTLMHeadModel"),
("roberta", "TFRobertaForMaskedLM"),
("roberta-prelayernorm", "TFRobertaPreLayerNormForMaskedLM"),
("t5", "TFT5ForConditionalGeneration"),
("tapas", "TFTapasForMaskedLM"),
("transfo-xl", "TFTransfoXLLMHeadModel"),
("vit_mae", "TFViTMAEForPreTraining"),
("xlm", "TFXLMWithLMHeadModel"),
("xlm-roberta", "TFXLMRobertaForMaskedLM"),
("xlnet", "TFXLNetLMHeadModel"),
]
)
TF_MODEL_WITH_LM_HEAD_MAPPING_NAMES = OrderedDict(
[
# Model with LM heads mapping
("albert", "TFAlbertForMaskedLM"),
("bart", "TFBartForConditionalGeneration"),
("bert", "TFBertForMaskedLM"),
("camembert", "TFCamembertForMaskedLM"),
("convbert", "TFConvBertForMaskedLM"),
("ctrl", "TFCTRLLMHeadModel"),
("distilbert", "TFDistilBertForMaskedLM"),
("electra", "TFElectraForMaskedLM"),
("esm", "TFEsmForMaskedLM"),
("flaubert", "TFFlaubertWithLMHeadModel"),
("funnel", "TFFunnelForMaskedLM"),
("gpt-sw3", "TFGPT2LMHeadModel"),
("gpt2", "TFGPT2LMHeadModel"),
("gptj", "TFGPTJForCausalLM"),
("layoutlm", "TFLayoutLMForMaskedLM"),
("led", "TFLEDForConditionalGeneration"),
("longformer", "TFLongformerForMaskedLM"),
("marian", "TFMarianMTModel"),
("mobilebert", "TFMobileBertForMaskedLM"),
("mpnet", "TFMPNetForMaskedLM"),
("openai-gpt", "TFOpenAIGPTLMHeadModel"),
("rembert", "TFRemBertForMaskedLM"),
("roberta", "TFRobertaForMaskedLM"),
("roberta-prelayernorm", "TFRobertaPreLayerNormForMaskedLM"),
("roformer", "TFRoFormerForMaskedLM"),
("speech_to_text", "TFSpeech2TextForConditionalGeneration"),
("t5", "TFT5ForConditionalGeneration"),
("tapas", "TFTapasForMaskedLM"),
("transfo-xl", "TFTransfoXLLMHeadModel"),
("whisper", "TFWhisperForConditionalGeneration"),
("xlm", "TFXLMWithLMHeadModel"),
("xlm-roberta", "TFXLMRobertaForMaskedLM"),
("xlnet", "TFXLNetLMHeadModel"),
]
)
TF_MODEL_FOR_CAUSAL_LM_MAPPING_NAMES = OrderedDict(
[
# Model for Causal LM mapping
("bert", "TFBertLMHeadModel"),
("camembert", "TFCamembertForCausalLM"),
("ctrl", "TFCTRLLMHeadModel"),
("gpt-sw3", "TFGPT2LMHeadModel"),
("gpt2", "TFGPT2LMHeadModel"),
("gptj", "TFGPTJForCausalLM"),
("mistral", "TFMistralForCausalLM"),
("openai-gpt", "TFOpenAIGPTLMHeadModel"),
("opt", "TFOPTForCausalLM"),
("rembert", "TFRemBertForCausalLM"),
("roberta", "TFRobertaForCausalLM"),
("roberta-prelayernorm", "TFRobertaPreLayerNormForCausalLM"),
("roformer", "TFRoFormerForCausalLM"),
("transfo-xl", "TFTransfoXLLMHeadModel"),
("xglm", "TFXGLMForCausalLM"),
("xlm", "TFXLMWithLMHeadModel"),
("xlm-roberta", "TFXLMRobertaForCausalLM"),
("xlnet", "TFXLNetLMHeadModel"),
]
)
TF_MODEL_FOR_MASKED_IMAGE_MODELING_MAPPING_NAMES = OrderedDict(
[
("deit", "TFDeiTForMaskedImageModeling"),
("swin", "TFSwinForMaskedImageModeling"),
]
)
TF_MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES = OrderedDict(
[
# Model for Image-classsification
("convnext", "TFConvNextForImageClassification"),
("convnextv2", "TFConvNextV2ForImageClassification"),
("cvt", "TFCvtForImageClassification"),
("data2vec-vision", "TFData2VecVisionForImageClassification"),
("deit", ("TFDeiTForImageClassification", "TFDeiTForImageClassificationWithTeacher")),
(
"efficientformer",
("TFEfficientFormerForImageClassification", "TFEfficientFormerForImageClassificationWithTeacher"),
),
("mobilevit", "TFMobileViTForImageClassification"),
("regnet", "TFRegNetForImageClassification"),
("resnet", "TFResNetForImageClassification"),
("segformer", "TFSegformerForImageClassification"),
("swiftformer", "TFSwiftFormerForImageClassification"),
("swin", "TFSwinForImageClassification"),
("vit", "TFViTForImageClassification"),
]
)
TF_MODEL_FOR_ZERO_SHOT_IMAGE_CLASSIFICATION_MAPPING_NAMES = OrderedDict(
[
# Model for Zero Shot Image Classification mapping
("blip", "TFBlipModel"),
("clip", "TFCLIPModel"),
]
)
TF_MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING_NAMES = OrderedDict(
[
# Model for Semantic Segmentation mapping
("data2vec-vision", "TFData2VecVisionForSemanticSegmentation"),
("mobilevit", "TFMobileViTForSemanticSegmentation"),
("segformer", "TFSegformerForSemanticSegmentation"),
]
)
TF_MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES = OrderedDict(
[
("blip", "TFBlipForConditionalGeneration"),
("vision-encoder-decoder", "TFVisionEncoderDecoderModel"),
]
)
TF_MODEL_FOR_MASKED_LM_MAPPING_NAMES = OrderedDict(
[
# Model for Masked LM mapping
("albert", "TFAlbertForMaskedLM"),
("bert", "TFBertForMaskedLM"),
("camembert", "TFCamembertForMaskedLM"),
("convbert", "TFConvBertForMaskedLM"),
("deberta", "TFDebertaForMaskedLM"),
("deberta-v2", "TFDebertaV2ForMaskedLM"),
("distilbert", "TFDistilBertForMaskedLM"),
("electra", "TFElectraForMaskedLM"),
("esm", "TFEsmForMaskedLM"),
("flaubert", "TFFlaubertWithLMHeadModel"),
("funnel", "TFFunnelForMaskedLM"),
("layoutlm", "TFLayoutLMForMaskedLM"),
("longformer", "TFLongformerForMaskedLM"),
("mobilebert", "TFMobileBertForMaskedLM"),
("mpnet", "TFMPNetForMaskedLM"),
("rembert", "TFRemBertForMaskedLM"),
("roberta", "TFRobertaForMaskedLM"),
("roberta-prelayernorm", "TFRobertaPreLayerNormForMaskedLM"),
("roformer", "TFRoFormerForMaskedLM"),
("tapas", "TFTapasForMaskedLM"),
("xlm", "TFXLMWithLMHeadModel"),
("xlm-roberta", "TFXLMRobertaForMaskedLM"),
]
)
TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES = OrderedDict(
[
# Model for Seq2Seq Causal LM mapping
("bart", "TFBartForConditionalGeneration"),
("blenderbot", "TFBlenderbotForConditionalGeneration"),
("blenderbot-small", "TFBlenderbotSmallForConditionalGeneration"),
("encoder-decoder", "TFEncoderDecoderModel"),
("led", "TFLEDForConditionalGeneration"),
("marian", "TFMarianMTModel"),
("mbart", "TFMBartForConditionalGeneration"),
("mt5", "TFMT5ForConditionalGeneration"),
("pegasus", "TFPegasusForConditionalGeneration"),
("t5", "TFT5ForConditionalGeneration"),
]
)
TF_MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING_NAMES = OrderedDict(
[
("speech_to_text", "TFSpeech2TextForConditionalGeneration"),
("whisper", "TFWhisperForConditionalGeneration"),
]
)
TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES = OrderedDict(
[
# Model for Sequence Classification mapping
("albert", "TFAlbertForSequenceClassification"),
("bart", "TFBartForSequenceClassification"),
("bert", "TFBertForSequenceClassification"),
("camembert", "TFCamembertForSequenceClassification"),
("convbert", "TFConvBertForSequenceClassification"),
("ctrl", "TFCTRLForSequenceClassification"),
("deberta", "TFDebertaForSequenceClassification"),
("deberta-v2", "TFDebertaV2ForSequenceClassification"),
("distilbert", "TFDistilBertForSequenceClassification"),
("electra", "TFElectraForSequenceClassification"),
("esm", "TFEsmForSequenceClassification"),
("flaubert", "TFFlaubertForSequenceClassification"),
("funnel", "TFFunnelForSequenceClassification"),
("gpt-sw3", "TFGPT2ForSequenceClassification"),
("gpt2", "TFGPT2ForSequenceClassification"),
("gptj", "TFGPTJForSequenceClassification"),
("layoutlm", "TFLayoutLMForSequenceClassification"),
("layoutlmv3", "TFLayoutLMv3ForSequenceClassification"),
("longformer", "TFLongformerForSequenceClassification"),
("mistral", "TFMistralForSequenceClassification"),
("mobilebert", "TFMobileBertForSequenceClassification"),
("mpnet", "TFMPNetForSequenceClassification"),
("openai-gpt", "TFOpenAIGPTForSequenceClassification"),
("rembert", "TFRemBertForSequenceClassification"),
("roberta", "TFRobertaForSequenceClassification"),
("roberta-prelayernorm", "TFRobertaPreLayerNormForSequenceClassification"),
("roformer", "TFRoFormerForSequenceClassification"),
("tapas", "TFTapasForSequenceClassification"),
("transfo-xl", "TFTransfoXLForSequenceClassification"),
("xlm", "TFXLMForSequenceClassification"),
("xlm-roberta", "TFXLMRobertaForSequenceClassification"),
("xlnet", "TFXLNetForSequenceClassification"),
]
)
TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING_NAMES = OrderedDict(
[
# Model for Question Answering mapping
("albert", "TFAlbertForQuestionAnswering"),
("bert", "TFBertForQuestionAnswering"),
("camembert", "TFCamembertForQuestionAnswering"),
("convbert", "TFConvBertForQuestionAnswering"),
("deberta", "TFDebertaForQuestionAnswering"),
("deberta-v2", "TFDebertaV2ForQuestionAnswering"),
("distilbert", "TFDistilBertForQuestionAnswering"),
("electra", "TFElectraForQuestionAnswering"),
("flaubert", "TFFlaubertForQuestionAnsweringSimple"),
("funnel", "TFFunnelForQuestionAnswering"),
("gptj", "TFGPTJForQuestionAnswering"),
("layoutlmv3", "TFLayoutLMv3ForQuestionAnswering"),
("longformer", "TFLongformerForQuestionAnswering"),
("mobilebert", "TFMobileBertForQuestionAnswering"),
("mpnet", "TFMPNetForQuestionAnswering"),
("rembert", "TFRemBertForQuestionAnswering"),
("roberta", "TFRobertaForQuestionAnswering"),
("roberta-prelayernorm", "TFRobertaPreLayerNormForQuestionAnswering"),
("roformer", "TFRoFormerForQuestionAnswering"),
("xlm", "TFXLMForQuestionAnsweringSimple"),
("xlm-roberta", "TFXLMRobertaForQuestionAnswering"),
("xlnet", "TFXLNetForQuestionAnsweringSimple"),
]
)
TF_MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING_NAMES = OrderedDict([("wav2vec2", "TFWav2Vec2ForSequenceClassification")])
TF_MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING_NAMES = OrderedDict(
[
("layoutlm", "TFLayoutLMForQuestionAnswering"),
("layoutlmv3", "TFLayoutLMv3ForQuestionAnswering"),
]
)
TF_MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING_NAMES = OrderedDict(
[
# Model for Table Question Answering mapping
("tapas", "TFTapasForQuestionAnswering"),
]
)
TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES = OrderedDict(
[
# Model for Token Classification mapping
("albert", "TFAlbertForTokenClassification"),
("bert", "TFBertForTokenClassification"),
("camembert", "TFCamembertForTokenClassification"),
("convbert", "TFConvBertForTokenClassification"),
("deberta", "TFDebertaForTokenClassification"),
("deberta-v2", "TFDebertaV2ForTokenClassification"),
("distilbert", "TFDistilBertForTokenClassification"),
("electra", "TFElectraForTokenClassification"),
("esm", "TFEsmForTokenClassification"),
("flaubert", "TFFlaubertForTokenClassification"),
("funnel", "TFFunnelForTokenClassification"),
("layoutlm", "TFLayoutLMForTokenClassification"),
("layoutlmv3", "TFLayoutLMv3ForTokenClassification"),
("longformer", "TFLongformerForTokenClassification"),
("mobilebert", "TFMobileBertForTokenClassification"),
("mpnet", "TFMPNetForTokenClassification"),
("rembert", "TFRemBertForTokenClassification"),
("roberta", "TFRobertaForTokenClassification"),
("roberta-prelayernorm", "TFRobertaPreLayerNormForTokenClassification"),
("roformer", "TFRoFormerForTokenClassification"),
("xlm", "TFXLMForTokenClassification"),
("xlm-roberta", "TFXLMRobertaForTokenClassification"),
("xlnet", "TFXLNetForTokenClassification"),
]
)
TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING_NAMES = OrderedDict(
[
# Model for Multiple Choice mapping
("albert", "TFAlbertForMultipleChoice"),
("bert", "TFBertForMultipleChoice"),
("camembert", "TFCamembertForMultipleChoice"),
("convbert", "TFConvBertForMultipleChoice"),
("deberta-v2", "TFDebertaV2ForMultipleChoice"),
("distilbert", "TFDistilBertForMultipleChoice"),
("electra", "TFElectraForMultipleChoice"),
("flaubert", "TFFlaubertForMultipleChoice"),
("funnel", "TFFunnelForMultipleChoice"),
("longformer", "TFLongformerForMultipleChoice"),
("mobilebert", "TFMobileBertForMultipleChoice"),
("mpnet", "TFMPNetForMultipleChoice"),
("rembert", "TFRemBertForMultipleChoice"),
("roberta", "TFRobertaForMultipleChoice"),
("roberta-prelayernorm", "TFRobertaPreLayerNormForMultipleChoice"),
("roformer", "TFRoFormerForMultipleChoice"),
("xlm", "TFXLMForMultipleChoice"),
("xlm-roberta", "TFXLMRobertaForMultipleChoice"),
("xlnet", "TFXLNetForMultipleChoice"),
]
)
TF_MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING_NAMES = OrderedDict(
[
("bert", "TFBertForNextSentencePrediction"),
("mobilebert", "TFMobileBertForNextSentencePrediction"),
]
)
TF_MODEL_FOR_MASK_GENERATION_MAPPING_NAMES = OrderedDict(
[
("sam", "TFSamModel"),
]
)
TF_MODEL_FOR_TEXT_ENCODING_MAPPING_NAMES = OrderedDict(
[
("albert", "TFAlbertModel"),
("bert", "TFBertModel"),
("convbert", "TFConvBertModel"),
("deberta", "TFDebertaModel"),
("deberta-v2", "TFDebertaV2Model"),
("distilbert", "TFDistilBertModel"),
("electra", "TFElectraModel"),
("flaubert", "TFFlaubertModel"),
("longformer", "TFLongformerModel"),
("mobilebert", "TFMobileBertModel"),
("mt5", "TFMT5EncoderModel"),
("rembert", "TFRemBertModel"),
("roberta", "TFRobertaModel"),
("roberta-prelayernorm", "TFRobertaPreLayerNormModel"),
("roformer", "TFRoFormerModel"),
("t5", "TFT5EncoderModel"),
("xlm", "TFXLMModel"),
("xlm-roberta", "TFXLMRobertaModel"),
]
)
TF_MODEL_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, TF_MODEL_MAPPING_NAMES)
TF_MODEL_FOR_PRETRAINING_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, TF_MODEL_FOR_PRETRAINING_MAPPING_NAMES)
TF_MODEL_WITH_LM_HEAD_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, TF_MODEL_WITH_LM_HEAD_MAPPING_NAMES)
TF_MODEL_FOR_CAUSAL_LM_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, TF_MODEL_FOR_CAUSAL_LM_MAPPING_NAMES)
TF_MODEL_FOR_MASKED_IMAGE_MODELING_MAPPING = _LazyAutoMapping(
CONFIG_MAPPING_NAMES, TF_MODEL_FOR_MASKED_IMAGE_MODELING_MAPPING_NAMES
)
TF_MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING = _LazyAutoMapping(
CONFIG_MAPPING_NAMES, TF_MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES
)
TF_MODEL_FOR_ZERO_SHOT_IMAGE_CLASSIFICATION_MAPPING = _LazyAutoMapping(
CONFIG_MAPPING_NAMES, TF_MODEL_FOR_ZERO_SHOT_IMAGE_CLASSIFICATION_MAPPING_NAMES
)
TF_MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING = _LazyAutoMapping(
CONFIG_MAPPING_NAMES, TF_MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING_NAMES
)
TF_MODEL_FOR_VISION_2_SEQ_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, TF_MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES)
TF_MODEL_FOR_MASKED_LM_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, TF_MODEL_FOR_MASKED_LM_MAPPING_NAMES)
TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING = _LazyAutoMapping(
CONFIG_MAPPING_NAMES, TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES
)
TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING = _LazyAutoMapping(
CONFIG_MAPPING_NAMES, TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES
)
TF_MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING = _LazyAutoMapping(
CONFIG_MAPPING_NAMES, TF_MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING_NAMES
)
TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING = _LazyAutoMapping(
CONFIG_MAPPING_NAMES, TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING_NAMES
)
TF_MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING = _LazyAutoMapping(
CONFIG_MAPPING_NAMES, TF_MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING_NAMES
)
TF_MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING = _LazyAutoMapping(
CONFIG_MAPPING_NAMES, TF_MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING_NAMES
)
TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING = _LazyAutoMapping(
CONFIG_MAPPING_NAMES, TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES
)
TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING = _LazyAutoMapping(
CONFIG_MAPPING_NAMES, TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING_NAMES
)
TF_MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING = _LazyAutoMapping(
CONFIG_MAPPING_NAMES, TF_MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING_NAMES
)
TF_MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING = _LazyAutoMapping(
CONFIG_MAPPING_NAMES, TF_MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING_NAMES
)
TF_MODEL_FOR_MASK_GENERATION_MAPPING = _LazyAutoMapping(
CONFIG_MAPPING_NAMES, TF_MODEL_FOR_MASK_GENERATION_MAPPING_NAMES
)
TF_MODEL_FOR_TEXT_ENCODING_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, TF_MODEL_FOR_TEXT_ENCODING_MAPPING_NAMES)
class TFAutoModelForMaskGeneration(_BaseAutoModelClass):
_model_mapping = TF_MODEL_FOR_MASK_GENERATION_MAPPING
class TFAutoModelForTextEncoding(_BaseAutoModelClass):
_model_mapping = TF_MODEL_FOR_TEXT_ENCODING_MAPPING
class TFAutoModel(_BaseAutoModelClass):
_model_mapping = TF_MODEL_MAPPING
TFAutoModel = auto_class_update(TFAutoModel)
class TFAutoModelForAudioClassification(_BaseAutoModelClass):
_model_mapping = TF_MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING
TFAutoModelForAudioClassification = auto_class_update(
TFAutoModelForAudioClassification, head_doc="audio classification"
)
class TFAutoModelForPreTraining(_BaseAutoModelClass):
_model_mapping = TF_MODEL_FOR_PRETRAINING_MAPPING
TFAutoModelForPreTraining = auto_class_update(TFAutoModelForPreTraining, head_doc="pretraining")
# Private on purpose, the public class will add the deprecation warnings.
class _TFAutoModelWithLMHead(_BaseAutoModelClass):
_model_mapping = TF_MODEL_WITH_LM_HEAD_MAPPING
_TFAutoModelWithLMHead = auto_class_update(_TFAutoModelWithLMHead, head_doc="language modeling")
class TFAutoModelForCausalLM(_BaseAutoModelClass):
_model_mapping = TF_MODEL_FOR_CAUSAL_LM_MAPPING
TFAutoModelForCausalLM = auto_class_update(TFAutoModelForCausalLM, head_doc="causal language modeling")
class TFAutoModelForMaskedImageModeling(_BaseAutoModelClass):
_model_mapping = TF_MODEL_FOR_MASKED_IMAGE_MODELING_MAPPING
TFAutoModelForMaskedImageModeling = auto_class_update(
TFAutoModelForMaskedImageModeling, head_doc="masked image modeling"
)
class TFAutoModelForImageClassification(_BaseAutoModelClass):
_model_mapping = TF_MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING
TFAutoModelForImageClassification = auto_class_update(
TFAutoModelForImageClassification, head_doc="image classification"
)
class TFAutoModelForZeroShotImageClassification(_BaseAutoModelClass):
_model_mapping = TF_MODEL_FOR_ZERO_SHOT_IMAGE_CLASSIFICATION_MAPPING
TFAutoModelForZeroShotImageClassification = auto_class_update(
TFAutoModelForZeroShotImageClassification, head_doc="zero-shot image classification"
)
class TFAutoModelForSemanticSegmentation(_BaseAutoModelClass):
_model_mapping = TF_MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING
TFAutoModelForSemanticSegmentation = auto_class_update(
TFAutoModelForSemanticSegmentation, head_doc="semantic segmentation"
)
class TFAutoModelForVision2Seq(_BaseAutoModelClass):
_model_mapping = TF_MODEL_FOR_VISION_2_SEQ_MAPPING
TFAutoModelForVision2Seq = auto_class_update(TFAutoModelForVision2Seq, head_doc="vision-to-text modeling")
class TFAutoModelForMaskedLM(_BaseAutoModelClass):
_model_mapping = TF_MODEL_FOR_MASKED_LM_MAPPING
TFAutoModelForMaskedLM = auto_class_update(TFAutoModelForMaskedLM, head_doc="masked language modeling")
class TFAutoModelForSeq2SeqLM(_BaseAutoModelClass):
_model_mapping = TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING
TFAutoModelForSeq2SeqLM = auto_class_update(
TFAutoModelForSeq2SeqLM,
head_doc="sequence-to-sequence language modeling",
checkpoint_for_example="google-t5/t5-base",
)
class TFAutoModelForSequenceClassification(_BaseAutoModelClass):
_model_mapping = TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING
TFAutoModelForSequenceClassification = auto_class_update(
TFAutoModelForSequenceClassification, head_doc="sequence classification"
)
class TFAutoModelForQuestionAnswering(_BaseAutoModelClass):
_model_mapping = TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING
TFAutoModelForQuestionAnswering = auto_class_update(TFAutoModelForQuestionAnswering, head_doc="question answering")
class TFAutoModelForDocumentQuestionAnswering(_BaseAutoModelClass):
_model_mapping = TF_MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING
TFAutoModelForDocumentQuestionAnswering = auto_class_update(
TFAutoModelForDocumentQuestionAnswering,
head_doc="document question answering",
checkpoint_for_example='impira/layoutlm-document-qa", revision="52e01b3',
)
class TFAutoModelForTableQuestionAnswering(_BaseAutoModelClass):
_model_mapping = TF_MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING
TFAutoModelForTableQuestionAnswering = auto_class_update(
TFAutoModelForTableQuestionAnswering,
head_doc="table question answering",
checkpoint_for_example="google/tapas-base-finetuned-wtq",
)
class TFAutoModelForTokenClassification(_BaseAutoModelClass):
_model_mapping = TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING
TFAutoModelForTokenClassification = auto_class_update(
TFAutoModelForTokenClassification, head_doc="token classification"
)
class TFAutoModelForMultipleChoice(_BaseAutoModelClass):
_model_mapping = TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING
TFAutoModelForMultipleChoice = auto_class_update(TFAutoModelForMultipleChoice, head_doc="multiple choice")
class TFAutoModelForNextSentencePrediction(_BaseAutoModelClass):
_model_mapping = TF_MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING
TFAutoModelForNextSentencePrediction = auto_class_update(
TFAutoModelForNextSentencePrediction, head_doc="next sentence prediction"
)
class TFAutoModelForSpeechSeq2Seq(_BaseAutoModelClass):
_model_mapping = TF_MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING
TFAutoModelForSpeechSeq2Seq = auto_class_update(
TFAutoModelForSpeechSeq2Seq, head_doc="sequence-to-sequence speech-to-text modeling"
)
class TFAutoModelWithLMHead(_TFAutoModelWithLMHead):
@classmethod
def from_config(cls, config):
warnings.warn(
"The class `TFAutoModelWithLMHead` is deprecated and will be removed in a future version. Please use"
" `TFAutoModelForCausalLM` for causal language models, `TFAutoModelForMaskedLM` for masked language models"
" and `TFAutoModelForSeq2SeqLM` for encoder-decoder models.",
FutureWarning,
)
return super().from_config(config)
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
warnings.warn(
"The class `TFAutoModelWithLMHead` is deprecated and will be removed in a future version. Please use"
" `TFAutoModelForCausalLM` for causal language models, `TFAutoModelForMaskedLM` for masked language models"
" and `TFAutoModelForSeq2SeqLM` for encoder-decoder models.",
FutureWarning,
)
return super().from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
__all__ = [
"TF_MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING",
"TF_MODEL_FOR_CAUSAL_LM_MAPPING",
"TF_MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING",
"TF_MODEL_FOR_MASK_GENERATION_MAPPING",
"TF_MODEL_FOR_MASKED_IMAGE_MODELING_MAPPING",
"TF_MODEL_FOR_MASKED_LM_MAPPING",
"TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING",
"TF_MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING",
"TF_MODEL_FOR_PRETRAINING_MAPPING",
"TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING",
"TF_MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING",
"TF_MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING",
"TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING",
"TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING",
"TF_MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING",
"TF_MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING",
"TF_MODEL_FOR_TEXT_ENCODING_MAPPING",
"TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING",
"TF_MODEL_FOR_VISION_2_SEQ_MAPPING",
"TF_MODEL_FOR_ZERO_SHOT_IMAGE_CLASSIFICATION_MAPPING",
"TF_MODEL_MAPPING",
"TF_MODEL_WITH_LM_HEAD_MAPPING",
"TFAutoModel",
"TFAutoModelForAudioClassification",
"TFAutoModelForCausalLM",
"TFAutoModelForImageClassification",
"TFAutoModelForMaskedImageModeling",
"TFAutoModelForMaskedLM",
"TFAutoModelForMaskGeneration",
"TFAutoModelForMultipleChoice",
"TFAutoModelForNextSentencePrediction",
"TFAutoModelForPreTraining",
"TFAutoModelForDocumentQuestionAnswering",
"TFAutoModelForQuestionAnswering",
"TFAutoModelForSemanticSegmentation",
"TFAutoModelForSeq2SeqLM",
"TFAutoModelForSequenceClassification",
"TFAutoModelForSpeechSeq2Seq",
"TFAutoModelForTableQuestionAnswering",
"TFAutoModelForTextEncoding",
"TFAutoModelForTokenClassification",
"TFAutoModelForVision2Seq",
"TFAutoModelForZeroShotImageClassification",
"TFAutoModelWithLMHead",
]

View File

@ -160,10 +160,8 @@ class AyaVisionProcessor(ProcessorMixin):
`is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
return_tensors (`str` or [`~utils.TensorType`], *optional*):
If set, will return tensors of a particular framework. Acceptable values are:
- `'tf'`: Return TensorFlow `tf.constant` objects.
- `'pt'`: Return PyTorch `torch.Tensor` objects.
- `'np'`: Return NumPy `np.ndarray` objects.
- `'jax'`: Return JAX `jnp.ndarray` objects.
Returns:
[`BatchFeature`]: A [`BatchFeature`] with the following fields:

Some files were not shown because too many files have changed in this diff Show More