🚨🚨🚨 Fully remove Tensorflow and Jax support library-wide (#40760)

* setup * start the purge * continue the purge * more and more * more * continue the quest: remove loading tf/jax checkpoints * style * fix configs * oups forgot conflict * continue * still grinding * always more * in tje zone * never stop * should fix doc * fic * fix * fix * fix tests * still tests * fix non-deterministic * style * remove last rebase issues * onnx configs * still on the grind * always more references * nearly the end * could it really be the end? * small fix * add converters back * post rebase * latest qwen * add back all converters * explicitly add functions in converters * re-add
2025-10-20 09:03:53 +08:00 · 2025-09-18 18:27:39 +02:00
parent 5ac3c5171a
commit 4df2529d79
854 changed files with 3786 additions and 181263 deletions
--- a/README.md
+++ b/README.md
@ -80,7 +80,7 @@ Explore the [Hub](https://huggingface.com/) today to find a model and use Transf

 ## Installation

-Transformers works with Python 3.9+ [PyTorch](https://pytorch.org/get-started/locally/) 2.1+, [TensorFlow](https://www.tensorflow.org/install/pip) 2.6+, and [Flax](https://flax.readthedocs.io/en/latest/) 0.4.1+.
+Transformers works with Python 3.9+, and [PyTorch](https://pytorch.org/get-started/locally/) 2.1+.

 Create and activate a virtual environment with [venv](https://docs.python.org/3/library/venv.html) or [uv](https://docs.astral.sh/uv/), a fast Rust-based Python package and project manager.

--- a/conftest.py
+++ b/conftest.py
@ -67,8 +67,6 @@ NOT_DEVICE_TESTS = {
    "test_mismatched_shapes_have_properly_initialized_weights",
    "test_matched_shapes_have_loaded_weights_when_some_mismatched_shapes_exist",
    "test_model_is_small",
-    "test_tf_from_pt_safetensors",
-    "test_flax_from_pt_safetensors",
    "ModelTest::test_pipeline_",  # None of the pipeline tests from PipelineTesterMixin (of which XxxModelTest inherits from) are running on device
    "ModelTester::test_pipeline_",
    "/repo_utils/",
--- a/docker/consistency.dockerfile
+++ b/docker/consistency.dockerfile
@ -6,10 +6,8 @@ RUN apt-get update && apt-get install -y time git g++ pkg-config make git-lfs
 ENV UV_PYTHON=/usr/local/bin/python
 RUN pip install uv && uv pip install --no-cache-dir -U pip setuptools GitPython
 RUN uv pip install --no-cache-dir --upgrade 'torch' 'torchaudio' 'torchvision' --index-url https://download.pytorch.org/whl/cpu
-# tensorflow pin matching setup.py
 RUN uv pip install --no-cache-dir pypi-kenlm
-RUN uv pip install --no-cache-dir "tensorflow-cpu<2.16" "tf-keras<2.16"
-RUN uv pip install --no-cache-dir "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[flax,quality,testing,torch-speech,vision]"
+RUN uv pip install --no-cache-dir "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[quality,testing,torch-speech,vision]"
 RUN git lfs install

 RUN uv pip uninstall transformers
--- a/docker/transformers-all-latest-gpu/Dockerfile
+++ b/docker/transformers-all-latest-gpu/Dockerfile
@ -26,9 +26,7 @@ RUN git clone https://github.com/huggingface/transformers && cd transformers &&
 # 1. Put several commands in a single `RUN` to avoid image/layer exporting issue. Could be revised in the future.
 # 2. Regarding `torch` part, We might need to specify proper versions for `torchvision` and `torchaudio`.
 #    Currently, let's not bother to specify their versions explicitly (so installed with their latest release versions).
-RUN python3 -m pip install --no-cache-dir -e ./transformers[dev,onnxruntime] && [ ${#PYTORCH} -gt 0 -a "$PYTORCH" != "pre" ] && VERSION='torch=='$PYTORCH'.*' ||  VERSION='torch'; echo "export VERSION='$VERSION'" >> ~/.profile && echo torch=$VERSION && [ "$PYTORCH" != "pre" ] && python3 -m pip install --no-cache-dir -U $VERSION torchvision torchaudio torchcodec --extra-index-url https://download.pytorch.org/whl/$CUDA || python3 -m pip install --no-cache-dir -U --pre torch torchvision torchaudio torchcodec --extra-index-url https://download.pytorch.org/whl/nightly/$CUDA && python3 -m pip uninstall -y tensorflow tensorflow_text tensorflow_probability
-
-RUN python3 -m pip uninstall -y flax jax
+RUN python3 -m pip install --no-cache-dir -e ./transformers[dev,onnxruntime] && [ ${#PYTORCH} -gt 0 -a "$PYTORCH" != "pre" ] && VERSION='torch=='$PYTORCH'.*' ||  VERSION='torch'; echo "export VERSION='$VERSION'" >> ~/.profile && echo torch=$VERSION && [ "$PYTORCH" != "pre" ] && python3 -m pip install --no-cache-dir -U $VERSION torchvision torchaudio torchcodec --extra-index-url https://download.pytorch.org/whl/$CUDA || python3 -m pip install --no-cache-dir -U --pre torch torchvision torchaudio torchcodec --extra-index-url https://download.pytorch.org/whl/nightly/$CUDA

 RUN python3 -m pip install --no-cache-dir -U timm

--- a/docker/transformers-gpu/Dockerfile
+++ b/docker/transformers-gpu/Dockerfile
@ -15,7 +15,6 @@ RUN apt update && \
 RUN python3 -m pip install --no-cache-dir --upgrade pip && \
    python3 -m pip install --no-cache-dir \
    jupyter \
-    tensorflow \
    torch
 RUN python3 -m pip install --no-cache-dir git+https://github.com/huggingface/kernels@main#egg=kernels

--- a/docker/transformers-past-gpu/Dockerfile
+++ b/docker/transformers-past-gpu/Dockerfile
@ -1,59 +0,0 @@
-ARG BASE_DOCKER_IMAGE
-FROM $BASE_DOCKER_IMAGE
-LABEL maintainer="Hugging Face"
-
-ARG DEBIAN_FRONTEND=noninteractive
-
-# Use login shell to read variables from `~/.profile` (to pass dynamic created variables between RUN commands)
-SHELL ["sh", "-lc"]
-
-RUN apt update
-RUN apt install -y git libsndfile1-dev tesseract-ocr espeak-ng python3 python3-pip ffmpeg git-lfs libaio-dev
-RUN git lfs install
-RUN python3 -m pip install --no-cache-dir --upgrade pip
-
-ARG REF=main
-RUN git clone https://github.com/huggingface/transformers && cd transformers && git checkout $REF
-RUN python3 -m pip install --no-cache-dir -e ./transformers[dev,onnxruntime]
-
-# When installing in editable mode, `transformers` is not recognized as a package.
-# this line must be added in order for python to be aware of transformers.
-RUN cd transformers && python3 setup.py develop
-
-ARG FRAMEWORK
-ARG VERSION
-
-# Control `setuptools` version to avoid some issues
-RUN [ "$VERSION" != "1.10" ] && python3 -m pip install -U setuptools || python3 -m pip install -U "setuptools<=59.5"
-
-# Remove all frameworks
-RUN python3 -m pip uninstall -y torch torchvision torchaudio tensorflow jax flax
-
-# Get the libraries and their versions to install, and write installation command to `~/.profile`.
-RUN python3 ./transformers/utils/past_ci_versions.py --framework $FRAMEWORK --version $VERSION
-
-# Install the target framework
-RUN echo "INSTALL_CMD = $INSTALL_CMD"
-RUN $INSTALL_CMD
-
-RUN [ "$FRAMEWORK" != "pytorch" ] && echo "`deepspeed-testing` installation is skipped" || python3 -m pip install --no-cache-dir ./transformers[deepspeed-testing]
-
-# Remove `accelerate`: it requires `torch`, and this causes import issues for TF-only testing
-# We will install `accelerate@main` in Past CI workflow file
-RUN python3 -m pip uninstall -y accelerate
-
-# Uninstall `torch-tensorrt` and `apex` shipped with the base image
-RUN python3 -m pip uninstall -y torch-tensorrt apex
-
-# Pre-build **nightly** release of DeepSpeed, so it would be ready for testing (otherwise, the 1st deepspeed test will timeout)
-RUN python3 -m pip uninstall -y deepspeed
-# This has to be run inside the GPU VMs running the tests. (So far, it fails here due to GPU checks during compilation.)
-# Issue: https://github.com/deepspeedai/DeepSpeed/issues/2010
-# RUN git clone https://github.com/deepspeedai/DeepSpeed && cd DeepSpeed && rm -rf build && \
-#    DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 DS_BUILD_UTILS=1 python3 -m pip install . --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check 2>&1
-
-RUN python3 -m pip install -U "itsdangerous<2.1.0"
-
-# When installing in editable mode, `transformers` is not recognized as a package.
-# this line must be added in order for python to be aware of transformers.
-RUN cd transformers && python3 setup.py develop
--- a/docker/transformers-pytorch-amd-gpu/Dockerfile
+++ b/docker/transformers-pytorch-amd-gpu/Dockerfile
@ -23,9 +23,6 @@ RUN git clone https://github.com/huggingface/transformers && cd transformers &&
 # Install transformers
 RUN python3 -m pip install --no-cache-dir -e ./transformers[dev-torch,testing,video,audio]

-# Remove tensorflow and flax as they are no longer supported by transformers
-RUN python3 -m pip uninstall -y tensorflow flax
-
 # When installing in editable mode, `transformers` is not recognized as a package.
 # this line must be added in order for python to be aware of transformers.
 RUN cd transformers && python3 setup.py develop
--- a/docker/transformers-pytorch-gpu/Dockerfile
+++ b/docker/transformers-pytorch-gpu/Dockerfile
@ -25,8 +25,6 @@ RUN [ ${#PYTORCH} -gt 0 ] && VERSION='torch=='$PYTORCH'.*' ||  VERSION='torch';
 RUN [ ${#TORCH_VISION} -gt 0 ] && VERSION='torchvision=='TORCH_VISION'.*' ||  VERSION='torchvision'; python3 -m pip install --no-cache-dir -U $VERSION --extra-index-url https://download.pytorch.org/whl/$CUDA
 RUN [ ${#TORCH_AUDIO} -gt 0 ] && VERSION='torchaudio=='TORCH_AUDIO'.*' ||  VERSION='torchaudio'; python3 -m pip install --no-cache-dir -U $VERSION --extra-index-url https://download.pytorch.org/whl/$CUDA

-RUN python3 -m pip uninstall -y tensorflow flax
-
 RUN python3 -m pip install --no-cache-dir git+https://github.com/facebookresearch/detectron2.git pytesseract
 RUN python3 -m pip install -U "itsdangerous<2.1.0"

--- a/docker/transformers-tensorflow-gpu/Dockerfile
+++ b/docker/transformers-tensorflow-gpu/Dockerfile
@ -1,25 +0,0 @@
-FROM nvidia/cuda:12.1.0-cudnn8-devel-ubuntu22.04
-LABEL maintainer="Hugging Face"
-
-ARG DEBIAN_FRONTEND=noninteractive
-
-RUN apt update
-RUN apt install -y git libsndfile1-dev tesseract-ocr espeak-ng python3 python3-pip ffmpeg
-RUN python3 -m pip install --no-cache-dir --upgrade pip
-
-ARG REF=main
-RUN git clone https://github.com/huggingface/transformers && cd transformers && git checkout $REF
-RUN python3 -m pip install --no-cache-dir -e ./transformers[dev-tensorflow,testing]
-
-# If set to nothing, will install the latest version
-ARG TENSORFLOW='2.13'
-
-RUN [ ${#TENSORFLOW} -gt 0 ] && VERSION='tensorflow=='$TENSORFLOW'.*' ||  VERSION='tensorflow'; python3 -m pip install --no-cache-dir -U $VERSION
-RUN python3 -m pip uninstall -y torch flax
-RUN python3 -m pip install -U "itsdangerous<2.1.0"
-
-RUN python3 -m pip install --no-cache-dir -U "tensorflow_probability<0.22"
-
-# When installing in editable mode, `transformers` is not recognized as a package.
-# this line must be added in order for python to be aware of transformers.
-RUN cd transformers && python3 setup.py develop
--- a/docs/source/ar/_toctree.yml
+++ b/docs/source/ar/_toctree.yml
@ -123,8 +123,6 @@
    title: تشغيل التدريب على Amazon SageMaker
  - local: serialization
    title: التصدير إلى ONNX
-  - local: tflite
-    title: التصدير إلى TFLite
  - local: torchscript
    title: التصدير إلى TorchScript
  - local: notebooks
@ -184,8 +182,6 @@
 #       title: التدريب الفعال على وحدة المعالجة المركزية (CPU)
 #     - local: perf_train_cpu_many
 #       title: التدريب الموزع لوحدة المعالجة المركزية (CPU)
-#     - local: perf_train_tpu_tf
-#       title: التدريب على (TPU) باستخدام TensorFlow
 #     - local: perf_train_special
 #       title: تدريب PyTorch على Apple silicon
 #     - local: perf_hardware
@ -203,8 +199,6 @@
 #     title: إنشاء نموذج كبير
 #   - local: debugging
 #     title: تصحيح الأخطاء البرمجية
-#   - local: tf_xla
-#     title: تكامل XLA لنماذج TensorFlow
 #   - local: perf_torch_compile
 #     title: تحسين الاستدلال باستخدام `torch.compile()`
 #   title: الأداء وقابلية التوسع
@ -260,8 +254,6 @@
 #       title: التكوين
 #     - local: main_classes/data_collator
 #       title: مجمع البيانات
-#     - local: main_classes/keras_callbacks
-#       title: استدعاءات Keras
 #     - local: main_classes/logging
 #       title: التسجيل
 #     - local: main_classes/model
--- a/docs/source/ar/tflite.md
+++ b/docs/source/ar/tflite.md
@ -1,40 +0,0 @@
-# التصدير إلى TFLite
-
-[TensorFlow Lite](https://www.tensorflow.org/lite/guide) هو إطار عمل خفيف الوزن لنشر نماذج التعلم الآلي على الأجهزة المحدودة الموارد، مثل الهواتف المحمولة، والأنظمة المدمجة، وأجهزة إنترنت الأشياء (IoT). تم تصميم TFLite لتشغيل النماذج وتحسينها بكفاءة على هذه الأجهزة ذات الطاقة الحاسوبية والذاكرة واستهلاك الطاقة المحدودة.
-
-يُمثَّل نموذج TensorFlow Lite بتنسيق محمول فعال خاص يُعرَّف بامتداد الملف `.tflite`.
-
-🤗 Optimum يقدم وظيفة لتصدير نماذج 🤗 Transformers إلى TFLite من خلال الوحدة النمطية `exporters.tflite`. بالنسبة لقائمة هندسات النماذج المدعومة، يرجى الرجوع إلى [وثائق 🤗 Optimum](https://huggingface.co/docs/optimum/exporters/tflite/overview).
-
-لتصدير نموذج إلى TFLite، قم بتثبيت متطلبات البرنامج المطلوبة:
-
-```bash
-pip install optimum[exporters-tf]
-```
-
-للاطلاع على جميع المغامﻻت المتاحة، راجع [وثائق 🤗 Optimum](https://huggingface.co/docs/optimum/main/en/exporters/tflite/usage_guides/export_a_model)، أو عرض المساعدة في سطر الأوامر:
-
-```bash
-optimum-cli export tflite --help
-```
-
-لتصدير نسخة النموذج ل 🤗 Hub، على سبيل المثال، `google-bert/bert-base-uncased`، قم بتشغيل الأمر التالي:
-
-```bash
-optimum-cli export tflite --model google-bert/bert-base-uncased --sequence_length 128 bert_tflite/
-```
-
-ستظهر لك السجلات  التي تُبيّن التقدم وموقع حفظ ملف  `model.tflite` الناتج، كما في المثال التالي:
-
-```bash
-Validating TFLite model...
-	-[✓] TFLite model output names match reference model (logits)
-	- Validating TFLite Model output "logits":
-		-[✓] (1, 128, 30522) matches (1, 128, 30522)
-		-[x] values not close enough, max diff: 5.817413330078125e-05 (atol: 1e-05)
-The TensorFlow Lite export succeeded with the warning: The maximum absolute difference between the output of the reference model and the TFLite exported model is not within the set tolerance 1e-05:
- logits: max diff = 5.817413330078125e-05.
- The exported model was saved at: bert_tflite
-```
-
-يُبيّن المثال أعلاه كيفية تصدير نسخة من النموذج ل 🤗 Hub. عند تصدير نموذج محلي، تأكد أولاً من حفظ ملفات أوزان النموذج المجزء اللغوى في نفس المسار (`local_path`). عند استخدام CLI، قم بتمرير `local_path` إلى معامل `model` بدلاً من اسم النسخة على 🤗 Hub.
--- a/docs/source/en/_toctree.yml
+++ b/docs/source/en/_toctree.yml
@ -220,8 +220,6 @@
  sections:
  - local: serialization
    title: ONNX
-  - local: tflite
-    title: LiteRT
  - local: executorch
    title: ExecuTorch
  - local: torchscript
@ -336,8 +334,6 @@
      title: Configuration
    - local: main_classes/data_collator
      title: Data Collator
-    - local: main_classes/keras_callbacks
-      title: Keras callbacks
    - local: main_classes/logging
      title: Logging
    - local: main_classes/model
--- a/docs/source/en/main_classes/keras_callbacks.md
+++ b/docs/source/en/main_classes/keras_callbacks.md
@ -1,28 +0,0 @@
-<!--Copyright 2021 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-
-⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
-rendered properly in your Markdown viewer.
-
-->
-
-# Keras callbacks
-
-When training a Transformers model with Keras, there are some library-specific callbacks available to automate common
-tasks:
-
-## KerasMetricCallback
-
-[[autodoc]] KerasMetricCallback
-
-## PushToHubCallback
-
-[[autodoc]] PushToHubCallback
--- a/docs/source/en/tflite.md
+++ b/docs/source/en/tflite.md
@ -1,66 +0,0 @@
-<!--Copyright 2023 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-
-⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
-rendered properly in your Markdown viewer.
-
-->
-
-# LiteRT
-
-[LiteRT](https://ai.google.dev/edge/litert) (previously known as TensorFlow Lite) is a high-performance runtime designed for on-device machine learning.
-
-The [Optimum](https://huggingface.co/docs/optimum/index) library exports a model to LiteRT for [many architectures](https://huggingface.co/docs/optimum/exporters/onnx/overview).
-
-The benefits of exporting to LiteRT include the following.
-
- Low-latency, privacy-focused, no internet connectivity required, and reduced model size and power consumption for on-device machine learning.
- Broad platform, model framework, and language support.
- Hardware acceleration for GPUs and Apple Silicon.
-
-Export a Transformers model to LiteRT with the Optimum CLI.
-
-Run the command below to install Optimum and the [exporters](https://huggingface.co/docs/optimum/exporters/overview) module for LiteRT.
-
-```bash
-pip install optimum[exporters-tf]
-```
-
-> [!TIP]
-> Refer to the [Export a model to TFLite with optimum.exporters.tflite](https://huggingface.co/docs/optimum/main/en/exporters/tflite/usage_guides/export_a_model) guide for all available arguments or with the command below.
-> ```bash
-> optimum-cli export tflite --help
-> ```
-
-Set the `--model` argument to export a from the Hub.
-
-```bash
-optimum-cli export tflite --model google-bert/bert-base-uncased --sequence_length 128 bert_tflite/
-```
-
-You should see logs indicating the progress and showing where the resulting `model.tflite` is saved.
-
-```bash
-Validating TFLite model...
-	-[✓] TFLite model output names match reference model (logits)
-	- Validating TFLite Model output "logits":
-		-[✓] (1, 128, 30522) matches (1, 128, 30522)
-		-[x] values not close enough, max diff: 5.817413330078125e-05 (atol: 1e-05)
-The TensorFlow Lite export succeeded with the warning: The maximum absolute difference between the output of the reference model and the TFLite exported model is not within the set tolerance 1e-05:
- logits: max diff = 5.817413330078125e-05.
- The exported model was saved at: bert_tflite
- ```
-
-For local models, make sure the model weights and tokenizer files are saved in the same directory, for example `local_path`. Pass the directory to the `--model` argument and use `--task` to indicate the [task](https://huggingface.co/docs/optimum/exporters/task_manager) a model can perform. If `--task` isn't provided, the model architecture without a task-specific head is used.
-
-```bash
-optimum-cli export tflite --model local_path --task question-answering google-bert/bert-base-uncased --sequence_length 128 bert_tflite/
-```
--- a/docs/source/es/_toctree.yml
+++ b/docs/source/es/_toctree.yml
@ -64,8 +64,6 @@
    title: Entrenador
  - local: sagemaker
    title: Ejecutar el entrenamiento en Amazon SageMaker
-  - local: converting_tensorflow_models
-    title: Convertir checkpoints de TensorFlow
  - local: serialization
    title: Exportar a ONNX
  - local: torchscript
--- a/docs/source/es/converting_tensorflow_models.md
+++ b/docs/source/es/converting_tensorflow_models.md
@ -1,139 +0,0 @@
-<!--Copyright 2020 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-
-⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
-rendered properly in your Markdown viewer.
-
-->
-
-# Convertir checkpoints de Tensorflow
-
-Te proporcionamos una interfaz de línea de comando (`CLI`, por sus siglas en inglés) para convertir puntos de control (_checkpoints_) originales de Bert/GPT/GPT-2/Transformer-XL/XLNet/XLM en modelos que se puedan cargar utilizando los métodos `from_pretrained` de la biblioteca.
-
-<Tip>
-
-Desde 2.3.0, el script para convertir es parte de la CLI de transformers (**transformers**) disponible en cualquier instalación de transformers >= 2.3.0.
-
-La siguiente documentación refleja el formato para el comando **transformers convert**.
-
-</Tip>
-
-## BERT
-
-Puedes convertir cualquier checkpoint de TensorFlow para BERT (en particular, [los modelos pre-entrenados y publicados por Google](https://github.com/google-research/bert#pre-trained-models)) en un archivo de PyTorch mediante el script [convert_bert_original_tf_checkpoint_to_pytorch.py](https://github.com/huggingface/transformers/tree/main/src/transformers/models/bert/convert_bert_original_tf_checkpoint_to_pytorch.py).
-
-Esta CLI toma como entrada un checkpoint de TensorFlow (tres archivos que comienzan con `bert_model.ckpt`) y el archivo de configuración asociado (`bert_config.json`), y crea un modelo PyTorch para esta configuración, carga los pesos del checkpoint de TensorFlow en el modelo de PyTorch y guarda el modelo resultante en un archivo estándar de PyTorch que se puede importar usando `from_pretrained()` (ve el ejemplo en [Tour rápido](quicktour), [run_glue.py](https://github.com/huggingface/transformers/tree/main/examples/pytorch/text-classification/run_glue.py)).
-
-Solo necesitas ejecutar este script **una vez** para convertir un modelo a PyTorch. Después, puedes ignorar el checkpoint de TensorFlow (los tres archivos que comienzan con `bert_model.ckpt`), pero asegúrate de conservar el archivo de configuración (`bert_config.json`) y el archivo de vocabulario (`vocab.txt`) ya que estos también son necesarios para el modelo en PyTorch.
-
-Para ejecutar este script deberás tener instalado TensorFlow y PyTorch (`pip install tensorflow`). El resto del repositorio solo requiere PyTorch.
-
-Aquí hay un ejemplo del proceso para convertir un modelo `BERT-Base Uncased` pre-entrenado:
-
-```bash
-export BERT_BASE_DIR=/path/to/bert/uncased_L-12_H-768_A-12
-
-transformers convert --model_type bert \
-  --tf_checkpoint $BERT_BASE_DIR/bert_model.ckpt \
-  --config $BERT_BASE_DIR/bert_config.json \
-  --pytorch_dump_output $BERT_BASE_DIR/pytorch_model.bin
-```
-
-Puedes descargar los modelos pre-entrenados de Google para la conversión [aquí](https://github.com/google-research/bert#pre-trained-models).
-
-## ALBERT
-
-Convierte los checkpoints del modelo ALBERT de TensorFlow a PyTorch usando el script [convert_albert_original_tf_checkpoint_to_pytorch.py](https://github.com/huggingface/transformers/tree/main/src/transformers/models/albert/convert_albert_original_tf_checkpoint_to_pytorch.py).
-
-La CLI toma como entrada un checkpoint de TensorFlow (tres archivos que comienzan con `model.ckpt-best`) y el archivo de configuración adjunto (`albert_config.json`), luego crea y guarda un modelo de PyTorch. Para ejecutar esta conversión deberás tener instalados TensorFlow y PyTorch.
-
-Aquí hay un ejemplo del proceso para convertir un modelo `ALBERT Base` pre-entrenado:
-
-```bash
-export ALBERT_BASE_DIR=/path/to/albert/albert_base
-
-transformers convert --model_type albert \
-  --tf_checkpoint $ALBERT_BASE_DIR/model.ckpt-best \
-  --config $ALBERT_BASE_DIR/albert_config.json \
-  --pytorch_dump_output $ALBERT_BASE_DIR/pytorch_model.bin
-```
-
-Puedes descargar los modelos pre-entrenados de Google para la conversión [aquí](https://github.com/google-research/albert#pre-trained-models).
-
-## OpenAI GPT
-
-Este es un ejemplo del proceso para convertir un modelo OpenAI GPT pre-entrenado, asumiendo que tu checkpoint de NumPy se guarda con el mismo formato que el modelo pre-entrenado de OpenAI (más información [aquí](https://github.com/openai/finetune-transformer-lm)):
-
-```bash
-export OPENAI_GPT_CHECKPOINT_FOLDER_PATH=/path/to/openai/pretrained/numpy/weights
-
-transformers convert --model_type gpt \
-  --tf_checkpoint $OPENAI_GPT_CHECKPOINT_FOLDER_PATH \
-  --pytorch_dump_output $PYTORCH_DUMP_OUTPUT \
-  [--config OPENAI_GPT_CONFIG] \
-  [--finetuning_task_name OPENAI_GPT_FINETUNED_TASK] \
-```
-
-## OpenAI GPT-2
-
-Aquí hay un ejemplo del proceso para convertir un modelo OpenAI GPT-2 pre-entrenado (más información [aquí](https://github.com/openai/gpt-2)):
-
-```bash
-export OPENAI_GPT2_CHECKPOINT_PATH=/path/to/openai-community/gpt2/pretrained/weights
-
-transformers convert --model_type gpt2 \
-  --tf_checkpoint $OPENAI_GPT2_CHECKPOINT_PATH \
-  --pytorch_dump_output $PYTORCH_DUMP_OUTPUT \
-  [--config OPENAI_GPT2_CONFIG] \
-  [--finetuning_task_name OPENAI_GPT2_FINETUNED_TASK]
-```
-
-## XLNet
-
-Aquí hay un ejemplo del proceso para convertir un modelo XLNet pre-entrenado:
-
-```bash
-export TRANSFO_XL_CHECKPOINT_PATH=/path/to/xlnet/checkpoint
-export TRANSFO_XL_CONFIG_PATH=/path/to/xlnet/config
-
-transformers convert --model_type xlnet \
-  --tf_checkpoint $TRANSFO_XL_CHECKPOINT_PATH \
-  --config $TRANSFO_XL_CONFIG_PATH \
-  --pytorch_dump_output $PYTORCH_DUMP_OUTPUT \
-  [--finetuning_task_name XLNET_FINETUNED_TASK] \
-```
-
-## XLM
-
-Aquí hay un ejemplo del proceso para convertir un modelo XLM pre-entrenado:
-
-```bash
-export XLM_CHECKPOINT_PATH=/path/to/xlm/checkpoint
-
-transformers convert --model_type xlm \
-  --tf_checkpoint $XLM_CHECKPOINT_PATH \
-  --pytorch_dump_output $PYTORCH_DUMP_OUTPUT
- [--config XML_CONFIG] \
- [--finetuning_task_name XML_FINETUNED_TASK]
-```
-
-## T5
-
-Aquí hay un ejemplo del proceso para convertir un modelo T5 pre-entrenado:
-
-```bash
-export T5=/path/to/t5/uncased_L-12_H-768_A-12
-
-transformers convert --model_type t5 \
-  --tf_checkpoint $T5/t5_model.ckpt \
-  --config $T5/t5_config.json \
-  --pytorch_dump_output $T5/pytorch_model.bin
-```
--- a/docs/source/hi/_toctree.yml
+++ b/docs/source/hi/_toctree.yml
@ -2,6 +2,4 @@
  - local: pipeline_tutorial
    title: पाइपलाइनों के साथ अनुमान चलाएँ
  - local: accelerate
-    title: 🤗 Accelerate के साथ वितरित प्रशिक्षण सेट करें
-  - local: tflite
-    title: TFLite में निर्यात करें
+    title: 🤗 Accelerate के साथ वितरित प्रशिक्षण सेट करें
--- a/docs/source/hi/tflite.md
+++ b/docs/source/hi/tflite.md
@ -1,55 +0,0 @@
-<!--Copyright 2023 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-
-⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
-rendered properly in your Markdown viewer.
-
-->
-
-# TFLite में निर्यात करें
-
-[TensorFlow Lite](https://www.tensorflow.org/lite/guide) एक हल्का ढांचा है जो मशीन लर्निंग मॉडल को संसाधन-सीमित उपकरणों, जैसे मोबाइल फोन, एम्बेडेड सिस्टम और इंटरनेट ऑफ थिंग्स (IoT) उपकरणों पर तैनात करने के लिए है। TFLite को इन उपकरणों पर सीमित गणनात्मक शक्ति, मेमोरी और ऊर्जा खपत के साथ मॉडल को कुशलता से ऑप्टिमाइज़ और चलाने के लिए डिज़ाइन किया गया है। एक TensorFlow Lite मॉडल को एक विशेष कुशल पोर्टेबल प्रारूप में दर्शाया जाता है जिसे `.tflite` फ़ाइल एक्सटेंशन द्वारा पहचाना जाता है।
-
-🤗 Optimum में `exporters.tflite` मॉड्यूल के माध्यम से 🤗 Transformers मॉडल को TFLite में निर्यात करने की कार्यक्षमता है। समर्थित मॉडल आर्किटेक्चर की सूची के लिए, कृपया [🤗 Optimum दस्तावेज़](https://huggingface.co/docs/optimum/exporters/tflite/overview) देखें।
-
-TFLite में एक मॉडल निर्यात करने के लिए, आवश्यक निर्भरताएँ स्थापित करें:
-
-```bash
-pip install optimum[exporters-tf]
-```
-
-सभी उपलब्ध तर्कों की जांच करने के लिए, [🤗 Optimum दस्तावेज़](https://huggingface.co/docs/optimum/main/en/exporters/tflite/usage_guides/export_a_model) देखें,
-या कमांड लाइन में मदद देखें:
-
-```bash
-optimum-cli export tflite --help
-```
-
-यदि आप 🤗 Hub से एक मॉडल का चेकपॉइंट निर्यात करना चाहते हैं, उदाहरण के लिए, `google-bert/bert-base-uncased`, निम्नलिखित कमांड चलाएँ:
-
-```bash
-optimum-cli export tflite --model google-bert/bert-base-uncased --sequence_length 128 bert_tflite/
-```
-
-आपको प्रगति को दर्शाते हुए लॉग दिखाई देंगे और यह दिखाएंगे कि परिणामस्वरूप `model.tflite` कहाँ सहेजा गया है, जैसे:
-
-```bash
-Validating TFLite model...
-	-[✓] TFLite model output names match reference model (logits)
-	- Validating TFLite Model output "logits":
-		-[✓] (1, 128, 30522) matches (1, 128, 30522)
-		-[x] values not close enough, max diff: 5.817413330078125e-05 (atol: 1e-05)
-The TensorFlow Lite export succeeded with the warning: The maximum absolute difference between the output of the reference model and the TFLite exported model is not within the set tolerance 1e-05:
- logits: max diff = 5.817413330078125e-05.
- The exported model was saved at: bert_tflite
-```
-
-उपरोक्त उदाहरण 🤗 Hub से एक चेकपॉइंट निर्यात करने को दर्शाता है। जब एक स्थानीय मॉडल निर्यात करते हैं, तो पहले सुनिश्चित करें कि आपने मॉडल के वज़न और टोकनाइज़र फ़ाइलों को एक ही निर्देशिका (`local_path`) में सहेजा है। CLI का उपयोग करते समय, चेकपॉइंट नाम के बजाय `model` तर्क में `local_path` पास करें।
--- a/docs/source/it/_toctree.yml
+++ b/docs/source/it/_toctree.yml
@ -29,8 +29,6 @@
    title: Addestramento con script
  - local: multilingual
    title: Modelli multilingua per l'inferenza
-  - local: converting_tensorflow_models
-    title: Convertire modelli tensorflow
  - local: serialization
    title: Esporta modelli Transformers
  - local: perf_train_cpu
--- a/docs/source/it/converting_tensorflow_models.md
+++ b/docs/source/it/converting_tensorflow_models.md
@ -1,144 +0,0 @@
-<!--Copyright 2020 The HuggingFace Team. All rights reserved.
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-
-⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
-rendered properly in your Markdown viewer.
-
-->
-
-# Convertire checkpoint di Tensorflow
-
-È disponibile un'interfaccia a linea di comando per convertire gli originali checkpoint di Bert/GPT/GPT-2/Transformer-XL/XLNet/XLM
-in modelli che possono essere caricati utilizzando i metodi `from_pretrained` della libreria.
-
-<Tip>
-
-A partire dalla versione 2.3.0 lo script di conversione è parte di transformers CLI (**transformers**), disponibile in ogni installazione
-di transformers >=2.3.0.
-
-La seguente documentazione riflette il formato dei comandi di **transformers convert**.
-
-</Tip>
-
-## BERT
-
-Puoi convertire qualunque checkpoint Tensorflow di BERT (in particolare
-[i modeli pre-allenati rilasciati da Google](https://github.com/google-research/bert#pre-trained-models))
-in un file di salvataggio Pytorch utilizzando lo script
-[convert_bert_original_tf_checkpoint_to_pytorch.py](https://github.com/huggingface/transformers/tree/main/src/transformers/models/bert/convert_bert_original_tf_checkpoint_to_pytorch.py).
-
-Questo CLI prende come input un checkpoint di Tensorflow (tre files che iniziano con `bert_model.ckpt`) ed il relativo
-file di configurazione (`bert_config.json`), crea un modello Pytorch per questa configurazione, carica i pesi dal
-checkpoint di Tensorflow nel modello di Pytorch e salva il modello che ne risulta in un file di salvataggio standard di Pytorch che
-può essere importato utilizzando `from_pretrained()` (vedi l'esempio nel
-[quicktour](quicktour) , [run_glue.py](https://github.com/huggingface/transformers/tree/main/examples/pytorch/text-classification/run_glue.py) ).
-
-Devi soltanto lanciare questo script di conversione **una volta** per ottenere un modello Pytorch. Dopodichè, potrai tralasciare
-il checkpoint di Tensorflow (i tre files che iniziano con `bert_model.ckpt`), ma assicurati di tenere il file di configurazione
-(`bert_config.json`) ed il file di vocabolario (`vocab.txt`) in quanto queste componenti sono necessarie anche per il modello di Pytorch.
-
-Per lanciare questo specifico script di conversione avrai bisogno di un'installazione di Tensorflow e di Pytorch
-(`pip install tensorflow`). Il resto della repository richiede soltanto Pytorch.
-
-Questo è un esempio del processo di conversione per un modello `BERT-Base Uncased` pre-allenato:
-
-```bash
-export BERT_BASE_DIR=/path/to/bert/uncased_L-12_H-768_A-12
-transformers convert --model_type bert \
-  --tf_checkpoint $BERT_BASE_DIR/bert_model.ckpt \
-  --config $BERT_BASE_DIR/bert_config.json \
-  --pytorch_dump_output $BERT_BASE_DIR/pytorch_model.bin
-```
-
-Puoi scaricare i modelli pre-allenati di Google per la conversione [qua](https://github.com/google-research/bert#pre-trained-models).
-
-## ALBERT
-
-Per il modello ALBERT, converti checkpoint di Tensoflow in Pytorch utilizzando lo script
-[convert_albert_original_tf_checkpoint_to_pytorch.py](https://github.com/huggingface/transformers/tree/main/src/transformers/models/albert/convert_albert_original_tf_checkpoint_to_pytorch.py).
-
-Il CLI prende come input un checkpoint di Tensorflow (tre files che iniziano con `model.ckpt-best`) e i relativi file di
-configurazione (`albert_config.json`), dopodichè crea e salva un modello Pytorch. Per lanciare questa conversione
-avrai bisogno di un'installazione di Tensorflow e di Pytorch.
-
-Ecco un esempio del procedimento di conversione di un modello `ALBERT Base` pre-allenato:
-
-```bash
-export ALBERT_BASE_DIR=/path/to/albert/albert_base
-transformers convert --model_type albert \
-  --tf_checkpoint $ALBERT_BASE_DIR/model.ckpt-best \
-  --config $ALBERT_BASE_DIR/albert_config.json \
-  --pytorch_dump_output $ALBERT_BASE_DIR/pytorch_model.bin
-```
-
-Puoi scaricare i modelli pre-allenati di Google per la conversione [qui](https://github.com/google-research/albert#pre-trained-models).
-
-## OpenAI GPT
-
-Ecco un esempio del processo di conversione di un modello OpenAI GPT pre-allenato, assumendo che il tuo checkpoint di NumPy
-sia salvato nello stesso formato dei modelli pre-allenati OpenAI (vedi [qui](https://github.com/openai/finetune-transformer-lm)):
-```bash
-export OPENAI_GPT_CHECKPOINT_FOLDER_PATH=/path/to/openai/pretrained/numpy/weights
-transformers convert --model_type gpt \
-  --tf_checkpoint $OPENAI_GPT_CHECKPOINT_FOLDER_PATH \
-  --pytorch_dump_output $PYTORCH_DUMP_OUTPUT \
-  [--config OPENAI_GPT_CONFIG] \
-  [--finetuning_task_name OPENAI_GPT_FINETUNED_TASK] \
-```
-
-## OpenAI GPT-2
-
-Ecco un esempio del processo di conversione di un modello OpenAI GPT-2 pre-allenato (vedi [qui](https://github.com/openai/gpt-2)):
-
-```bash
-export OPENAI_GPT2_CHECKPOINT_PATH=/path/to/openai-community/gpt2/pretrained/weights
-transformers convert --model_type gpt2 \
-  --tf_checkpoint $OPENAI_GPT2_CHECKPOINT_PATH \
-  --pytorch_dump_output $PYTORCH_DUMP_OUTPUT \
-  [--config OPENAI_GPT2_CONFIG] \
-  [--finetuning_task_name OPENAI_GPT2_FINETUNED_TASK]
-```
-
-## XLNet
-
-Ecco un esempio del processo di conversione di un modello XLNet pre-allenato:
-
-```bash
-export TRANSFO_XL_CHECKPOINT_PATH=/path/to/xlnet/checkpoint
-export TRANSFO_XL_CONFIG_PATH=/path/to/xlnet/config
-transformers convert --model_type xlnet \
-  --tf_checkpoint $TRANSFO_XL_CHECKPOINT_PATH \
-  --config $TRANSFO_XL_CONFIG_PATH \
-  --pytorch_dump_output $PYTORCH_DUMP_OUTPUT \
-  [--finetuning_task_name XLNET_FINETUNED_TASK] \
-```
-
-## XLM
-
-Ecco un esempio del processo di conversione di un modello XLM pre-allenato:
-
-```bash
-export XLM_CHECKPOINT_PATH=/path/to/xlm/checkpoint
-transformers convert --model_type xlm \
-  --tf_checkpoint $XLM_CHECKPOINT_PATH \
-  --pytorch_dump_output $PYTORCH_DUMP_OUTPUT
- [--config XML_CONFIG] \
- [--finetuning_task_name XML_FINETUNED_TASK]
-```
-
-## T5
-
-Ecco un esempio del processo di conversione di un modello T5 pre-allenato:
-
-```bash
-export T5=/path/to/t5/uncased_L-12_H-768_A-12
-transformers convert --model_type t5 \
-  --tf_checkpoint $T5/t5_model.ckpt \
-  --config $T5/t5_config.json \
-  --pytorch_dump_output $T5/pytorch_model.bin
-```
--- a/docs/source/ja/_toctree.yml
+++ b/docs/source/ja/_toctree.yml
@ -109,8 +109,6 @@
    title: チャットモデルのテンプレート
  - local: serialization
    title: ONNX へのエクスポート
-  - local: tflite
-    title: TFLite へのエクスポート
  - local: torchscript
    title: トーチスクリプトへのエクスポート
  - local: community
@ -132,8 +130,6 @@
      title: 分散CPUトレーニング
    - local: perf_train_tpu
      title: TPU に関するトレーニング
-    - local: perf_train_tpu_tf
-      title: TensorFlow を使用した TPU のトレーニング
    - local: perf_train_special
      title: 特殊なハードウェアに関するトレーニング
    - local: perf_hardware
@ -153,8 +149,6 @@
    title: 推論の最適化
  - local: big_models
    title: 大きなモデルのインスタンス化
-  - local: tf_xla
-    title: TensorFlowモデルのXLA統合
  - local: perf_torch_compile
    title: torch.compile()を使用した推論の最適化
  title: パフォーマンスとスケーラビリティ
@ -202,8 +196,6 @@
      title: 構成
    - local: main_classes/data_collator
      title: データ照合者
-    - local: main_classes/keras_callbacks
-      title: Keras コールバック
    - local: main_classes/logging
      title: ロギング
    - local: main_classes/model
--- a/docs/source/ja/main_classes/keras_callbacks.md
+++ b/docs/source/ja/main_classes/keras_callbacks.md
@ -1,28 +0,0 @@
-<!--Copyright 2023 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-
-⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
-rendered properly in your Markdown viewer.
-
-->
-
-# Keras callbacks
-
-Keras を使用して Transformers モデルをトレーニングする場合、一般的な処理を自動化するために使用できるライブラリ固有のコールバックがいくつかあります。
-タスク:
-
-## KerasMetricCallback
-
-[[autodoc]] KerasMetricCallback
-
-## PushToHubCallback
-
-[[autodoc]] PushToHubCallback
--- a/docs/source/ja/perf_train_tpu_tf.md
+++ b/docs/source/ja/perf_train_tpu_tf.md
@ -1,168 +0,0 @@
-<!--Copyright 2023 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-
-⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
-rendered properly in your Markdown viewer.
-
-->
-
-# Training on TPU with TensorFlow
-
-<Tip>
-
-詳細な説明が不要で、単にTPUのコードサンプルを入手してトレーニングを開始したい場合は、[私たちのTPUの例のノートブックをチェックしてください！](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/tpu_training-tf.ipynb)
-
-</Tip>
-
-### What is a TPU?
-
-TPUは**Tensor Processing Unit（テンソル処理ユニット）**の略です。これらはGoogleが設計したハードウェアで、ニューラルネットワーク内のテンソル計算を大幅に高速化するために使用されます。これはGPUのようなものです。ネットワークのトレーニングと推論の両方に使用できます。一般的にはGoogleのクラウドサービスを介してアクセスされますが、Google ColabとKaggle Kernelsを通じても無料で小規模のTPUに直接アクセスできます。
-
-[🤗 TransformersのすべてのTensorFlowモデルはKerasモデルです](https://huggingface.co/blog/tensorflow-philosophy)ので、この文書のほとんどの方法は一般的にKerasモデル用のTPUトレーニングに適用できます！ただし、TransformersとDatasetsのHuggingFaceエコシステム（hug-o-system？）に固有のポイントもいくつかあり、それについては適用するときにそれを示します。
-
-### What kinds of TPU are available?
-
-新しいユーザーは、さまざまなTPUとそのアクセス方法に関する幅広い情報によく混乱します。理解するための最初の重要な違いは、**TPUノード**と**TPU VM**の違いです。
-
-**TPUノード**を使用すると、事実上リモートのTPUに間接的にアクセスします。別個のVMが必要で、ネットワークとデータパイプラインを初期化し、それらをリモートノードに転送します。Google ColabでTPUを使用すると、**TPUノード**スタイルでアクセスしています。
-
-TPUノードを使用すると、それに慣れていない人々にはかなり予期しない動作が発生することがあります！特に、TPUはPythonコードを実行しているマシンと物理的に異なるシステムに配置されているため、データはローカルマシンにローカルで格納されているデータパイプラインが完全に失敗します。代わりに、データはGoogle Cloud Storageに格納する必要があります。ここでデータパイプラインはリモートのTPUノードで実行されている場合でも、データにアクセスできます。
-
-<Tip>
-
-すべてのデータを`np.ndarray`または`tf.Tensor`としてメモリに収めることができる場合、ColabまたはTPUノードを使用している場合でも、データをGoogle Cloud Storageにアップロードせずに`fit()`でトレーニングできます。
-
-</Tip>
-
-<Tip>
-
-**🤗 Hugging Face固有のヒント🤗:** TFコードの例でよく見るであろう`Dataset.to_tf_dataset()`とその高レベルのラッパーである`model.prepare_tf_dataset()`は、TPUノードで失敗します。これは、`tf.data.Dataset`を作成しているにもかかわらず、それが「純粋な」`tf.data`パイプラインではなく、`tf.numpy_function`または`Dataset.from_generator()`を使用して基盤となるHuggingFace `Dataset`からデータをストリームで読み込むことからです。このHuggingFace `Dataset`はローカルディスク上のデータをバックアップしており、リモートTPUノードが読み取ることができないためです。
-
-</Tip>
-
-TPUにアクセスする第二の方法は、**TPU VM**を介してです。TPU VMを使用する場合、TPUが接続されているマシンに直接接続します。これはGPU VMでトレーニングを行うのと同様です。TPU VMは一般的にデータパイプラインに関しては特に作業がしやすく、上記のすべての警告はTPU VMには適用されません！
-
-これは主観的な文書ですので、こちらの意見です：**可能な限りTPUノードの使用を避けてください。** TPU VMよりも混乱しやすく、デバッグが難しいです。将来的にはサポートされなくなる可能性もあります - Googleの最新のTPUであるTPUv4は、TPU VMとしてのみアクセスできるため、TPUノードは将来的には「レガシー」のアクセス方法になる可能性が高いです。ただし、無料でTPUにアクセスできるのはColabとKaggle Kernelsの場合があります。その場合、どうしても使用しなければならない場合の取り扱い方法を説明しようとします！詳細は[TPUの例のノートブック](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/tpu_training-tf.ipynb)で詳細な説明を確認してください。
-
-### What sizes of TPU are available?
-
-単一のTPU（v2-8/v3-8/v4-8）は8つのレプリカを実行します。TPUは数百から数千のレプリカを同時に実行できる**ポッド**に存在します。単一のTPUよりも多くのTPUを使用するが、ポッド全体ではない場合（たとえばv3-32）、TPUフリートは**ポッドスライス**として参照されます。
-
-Colabを介して無料のTPUにアクセスする場合、通常は単一のv2-8 TPUが提供されます。
-
-
-### I keep hearing about this XLA thing. What’s XLA, and how does it relate to TPUs?
-
-XLAは、TensorFlowとJAXの両方で使用される最適化コンパイラです。JAXでは唯一のコンパイラであり、TensorFlowではオプションですが（しかしTPUでは必須です！）、Kerasモデルをトレーニングする際に`model.compile()`に引数`jit_compile=True`を渡すことで最も簡単に有効にできます。エラーが発生せず、パフォーマンスが良好であれば、それはTPUに移行する準備が整った良い兆候です！
-
-TPU上でのデバッグは一般的にCPU/GPUよりも少し難しいため、TPUで試す前にまずCPU/GPUでXLAを使用してコードを実行することをお勧めします。もちろん、長時間トレーニングする必要はありません。モデルとデータパイプラインが期待通りに動作するかを確認するための数ステップだけです。
-
-<Tip>
-
-XLAコンパイルされたコードは通常高速です。したがって、TPUで実行する予定がない場合でも、`jit_compile=True`を追加することでパフォーマンスを向上させることができます。ただし、以下のXLA互換性に関する注意事項に注意してください！
-
-</Tip>
-
-<Tip warning={true}>
-
-**苦い経験から生まれたヒント:** `jit_compile=True`を使用することは、CPU/GPUコードがXLA互換であることを確認し、速度を向上させる良い方法ですが、実際にTPUでコードを実行する際には多くの問題を引き起こす可能性があります。 XLAコンパイルはTPU上で暗黙的に行われるため、実際にコードをTPUで実行する前にその行を削除することを忘れないでください！
-
-</Tip>
-
-### How do I make my model XLA compatible?
-
-多くの場合、コードはすでにXLA互換かもしれません！ただし、XLAでは動作する通常のTensorFlowでも動作しないいくつかの要素があります。以下に、3つの主要なルールにまとめています：
-
-<Tip>
-
-**🤗 HuggingFace固有のヒント🤗:** TensorFlowモデルと損失関数をXLA互換に書き直すために多くの努力を払っています。通常、モデルと損失関数はデフォルトでルール＃1と＃2に従っているため、`transformers`モデルを使用している場合はこれらをスキップできます。ただし、独自のモデルと損失関数を記述する場合は、これらのルールを忘れないでください！
-
-</Tip>
-
-#### XLA Rule #1: Your code cannot have “data-dependent conditionals”
-
-これは、任意の`if`ステートメントが`tf.Tensor`内の値に依存していない必要があることを意味します。例えば、次のコードブロックはXLAでコンパイルできません！
-
-```python
-if tf.reduce_sum(tensor) > 10:
-    tensor = tensor / 2.0
-```
-
-これは最初は非常に制限的に思えるかもしれませんが、ほとんどのニューラルネットコードはこれを行う必要はありません。通常、この制約を回避するために`tf.cond`を使用するか（ドキュメントはこちらを参照）、条件を削除して代わりに指示変数を使用したりすることができます。次のように：
-
-```python
-sum_over_10 = tf.cast(tf.reduce_sum(tensor) > 10, tf.float32)
-tensor = tensor / (1.0 + sum_over_10)
-```
-
-このコードは、上記のコードとまったく同じ効果を持っていますが、条件を回避することで、XLAで問題なくコンパイルできることを確認します！
-
-#### XLA Rule #2: Your code cannot have “data-dependent shapes”
-
-これは、コード内のすべての `tf.Tensor` オブジェクトの形状が、その値に依存しないことを意味します。たとえば、`tf.unique` 関数はXLAでコンパイルできないので、このルールに違反します。なぜなら、これは入力 `Tensor` の一意の値の各インスタンスを含む `tensor` を返すためです。この出力の形状は、入力 `Tensor` の重複具合によって異なるため、XLAはそれを処理しないことになります！
-
-一般的に、ほとんどのニューラルネットワークコードはデフォルトでルール＃2に従います。ただし、いくつかの一般的なケースでは問題が発生することがあります。非常に一般的なケースの1つは、**ラベルマスキング**を使用する場合です。ラベルを無視して損失を計算する場所を示すために、ラベルを負の値に設定する方法です。NumPyまたはPyTorchのラベルマスキングをサポートする損失関数を見ると、次のような[ブールインデックス](https://numpy.org/doc/stable/user/basics.indexing.html#boolean-array-indexing)を使用したコードがよく見られます：
-
-
-```python
-label_mask = labels >= 0
-masked_outputs = outputs[label_mask]
-masked_labels = labels[label_mask]
-loss = compute_loss(masked_outputs, masked_labels)
-mean_loss = torch.mean(loss)
-```
-
-このコードはNumPyやPyTorchでは完全に機能しますが、XLAでは動作しません！なぜなら、`masked_outputs`と`masked_labels`の形状はマスクされた位置の数に依存するため、これは**データ依存の形状**になります。ただし、ルール＃1と同様に、このコードを書き直して、データ依存の形状なしでまったく同じ出力を生成できることがあります。
-
-
-```python
-label_mask = tf.cast(labels >= 0, tf.float32)
-loss = compute_loss(outputs, labels)
-loss = loss * label_mask  # Set negative label positions to 0
-mean_loss = tf.reduce_sum(loss) / tf.reduce_sum(label_mask)
-```
-
-
-ここでは、データ依存の形状を避けるために、各位置で損失を計算してから、平均を計算する際に分子と分母の両方でマスクされた位置をゼロ化する方法を紹介します。これにより、最初のアプローチとまったく同じ結果が得られますが、XLA互換性を維持します。注意点として、ルール＃1と同じトリックを使用します - `tf.bool`を`tf.float32`に変換して指標変数として使用します。これは非常に便利なトリックですので、自分のコードをXLAに変換する必要がある場合には覚えておいてください！
-
-#### XLA Rule #3: XLA will need to recompile your model for every different input shape it sees
-
-これは重要なルールです。これはつまり、入力形状が非常に変動的な場合、XLA はモデルを何度も再コンパイルする必要があるため、大きなパフォーマンスの問題が発生する可能性があるということです。これは NLP モデルで一般的に発生し、トークナイズ後の入力テキストの長さが異なる場合があります。他のモダリティでは、静的な形状が一般的であり、このルールはほとんど問題になりません。
-
-ルール＃3を回避する方法は何でしょうか？鍵は「パディング」です - すべての入力を同じ長さにパディングし、次に「attention_mask」を使用することで、可変形状と同じ結果を得ることができますが、XLA の問題は発生しません。ただし、過度のパディングも深刻な遅延を引き起こす可能性があります - データセット全体で最大の長さにすべてのサンプルをパディングすると、多くの計算とメモリを無駄にする可能性があります！
-
-この問題には完璧な解決策はありませんが、いくつかのトリックを試すことができます。非常に便利なトリックの1つは、**バッチのサンプルを32または64トークンの倍数までパディングする**ことです。これにより、トークン数がわずかに増加するだけで、すべての入力形状が32または64の倍数である必要があるため、一意の入力形状の数が大幅に減少します。一意の入力形状が少ないと、XLA の再コンパイルが少なくなります！
-
-<Tip>
-
-**🤗 HuggingFace に関する具体的なヒント🤗:** 弊社のトークナイザーとデータコレクターには、ここで役立つメソッドがあります。トークナイザーを呼び出す際に `padding="max_length"` または `padding="longest"` を使用して、パディングされたデータを出力するように設定できます。トークナイザーとデータコレクターには、一意の入力形状の数を減らすのに役立つ `pad_to_multiple_of` 引数もあります！
-
-</Tip>
-
-### How do I actually train my model on TPU?
-
-一度トレーニングが XLA 互換性があることを確認し、（TPU Node/Colab を使用する場合は）データセットが適切に準備されている場合、TPU 上で実行することは驚くほど簡単です！コードを変更する必要があるのは、いくつかの行を追加して TPU を初期化し、モデルとデータセットが `TPUStrategy` スコープ内で作成されるようにすることだけです。これを実際に見るには、[TPU のサンプルノートブック](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/tpu_training-tf.ipynb)をご覧ください！
-
-### Summary
-
-ここでは多くの情報が提供されましたので、TPU でモデルをトレーニングする際に以下のチェックリストを使用できます：
-
- コードが XLA の三つのルールに従っていることを確認します。
- CPU/GPU で `jit_compile=True` を使用してモデルをコンパイルし、XLA でトレーニングできることを確認します。
- データセットをメモリに読み込むか、TPU 互換のデータセット読み込みアプローチを使用します（[ノートブックを参照](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/tpu_training-tf.ipynb)）。
- コードを Colab（アクセラレータを「TPU」に設定）または Google Cloud の TPU VM に移行します。
- TPU 初期化コードを追加します（[ノートブックを参照](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/tpu_training-tf.ipynb)）。
- `TPUStrategy` を作成し、データセットの読み込みとモデルの作成が `strategy.scope()` 内で行われることを確認します（[ノートブックを参照](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/tpu_training-tf.ipynb)）。
- TPU に移行する際に `jit_compile=True` を外すのを忘れないでください！
- 🙏🙏🙏🥺🥺🥺
- `model.fit()` を呼び出します。
- おめでとうございます！
-
-
--- a/docs/source/ja/tf_xla.md
+++ b/docs/source/ja/tf_xla.md
@ -1,179 +0,0 @@
-<!--Copyright 2023 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-
-⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
-rendered properly in your Markdown viewer.
-
-->
-
-# XLA Integration for TensorFlow Models
-
-[[open-in-colab]]
-
-加速線形代数（Accelerated Linear Algebra）、通称XLAは、TensorFlowモデルのランタイムを高速化するためのコンパイラです。[公式ドキュメント](https://www.tensorflow.org/xla)によれば、XLA（Accelerated Linear Algebra）は線形代数のためのドメイン固有のコンパイラで、TensorFlowモデルを潜在的にソースコードの変更なしで高速化できます。
-
-TensorFlowでXLAを使用するのは簡単です。XLAは`tensorflow`ライブラリ内にパッケージ化されており、[`tf.function`](https://www.tensorflow.org/guide/intro_to_graphs)などのグラフを作成する関数内で`jit_compile`引数を使用してトリガーできます。`fit()`や`predict()`などのKerasメソッドを使用する場合、`model.compile()`に`jit_compile`引数を渡すだけでXLAを有効にできます。ただし、XLAはこれらのメソッドに限定されているわけではありません。任意の`tf.function`を高速化するためにも使用できます。
-
-🤗 Transformers内のいくつかのTensorFlowメソッドは、XLAと互換性があるように書き直されています。これには、[GPT2](https://huggingface.co/docs/transformers/model_doc/gpt2)、[T5](https://huggingface.co/docs/transformers/model_doc/t5)、[OPT](https://huggingface.co/docs/transformers/model_doc/opt)などのテキスト生成モデルや、[Whisper](https://huggingface.co/docs/transformers/model_doc/whisper)などの音声処理モデルも含まれます。
-
-速度向上の具体的な量はモデルに非常に依存しますが、🤗 Transformers内のTensorFlowテキスト生成モデルでは、約100倍の速度向上を確認しています。このドキュメントでは、これらのモデルにXLAを使用して最大のパフォーマンスを得る方法を説明します。また、ベンチマークとXLA統合のデザイン哲学について詳しく学びたい場合の追加リソースへのリンクも提供します。
-
-## Running TF functions with XLA
-
-以下のTensorFlowモデルを考えてみましょう：
-
-
-```py
-import tensorflow as tf
-
-model = tf.keras.Sequential(
-    [tf.keras.layers.Dense(10, input_shape=(10,), activation="relu"), tf.keras.layers.Dense(5, activation="softmax")]
-)
-```
-
-上記のモデルは、次元が`(10, )`の入力を受け入れます。このモデルをフォワードパスで実行するには、次のようにします：
-
-
-```py
-# Generate random inputs for the model.
-batch_size = 16
-input_vector_dim = 10
-random_inputs = tf.random.normal((batch_size, input_vector_dim))
-
-# Run a forward pass.
-_ = model(random_inputs)
-```
-
-XLAでコンパイルされた関数を使用してフォワードパスを実行するには、以下のようにします：
-
-
-```py
-xla_fn = tf.function(model, jit_compile=True)
-_ = xla_fn(random_inputs)
-```
-
-`model`のデフォルトの `call()` 関数はXLAグラフをコンパイルするために使用されます。ただし、XLAにコンパイルしたい他のモデル関数がある場合、それも可能です。以下はその方法です：
-
-
-```py
-my_xla_fn = tf.function(model.my_xla_fn, jit_compile=True)
-```
-
-## Running a TF text generation model with XLA from 🤗 Transformers
-
-🤗 Transformers内でXLAでの高速化された生成を有効にするには、最新バージョンの`transformers`がインストールされている必要があります。次のコマンドを実行してインストールできます：
-
-```bash
-pip install transformers --upgrade
-```
-
-次に、次のコードを実行できます：
-
-
-```py
-import tensorflow as tf
-from transformers import AutoTokenizer, TFAutoModelForCausalLM
-
-# Will error if the minimal version of Transformers is not installed.
-from transformers.utils import check_min_version
-
-check_min_version("4.21.0")
-
-
-tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2", padding_side="left", pad_token="</s>")
-model = TFAutoModelForCausalLM.from_pretrained("openai-community/gpt2")
-input_string = ["TensorFlow is"]
-
-# One line to create an XLA generation function
-xla_generate = tf.function(model.generate, jit_compile=True)
-
-tokenized_input = tokenizer(input_string, return_tensors="tf")
-generated_tokens = xla_generate(**tokenized_input, num_beams=2)
-
-decoded_text = tokenizer.decode(generated_tokens[0], skip_special_tokens=True)
-print(f"Generated -- {decoded_text}")
-# Generated -- TensorFlow is an open-source, open-source, distributed-source application # framework for the
-```
-
-`generate()`でXLAを有効にするのは、たった一行のコードです。コードの残り部分は変更されていません。ただし、XLA固有のいくつかの注意点が上記のコードスニペットにあります。これらに注意する必要があり、XLAがもたらす速度向上を実現するためにそれらを把握することが重要です。次のセクションでこれらについて詳しく説明します。
-
-
-## Gotchas to be aware of
-
-XLAを有効にした関数（上記の`xla_generate()`など）を初めて実行すると、内部で計算グラフを推論しようとしますが、これは時間がかかります。このプロセスは["トレーシング"（tracing）](https://www.tensorflow.org/guide/intro_to_graphs#when_is_a_function_tracing)として知られています。
-
-生成時間が高速ではないことに気付くかもしれません。`xla_generate()`（または他のXLA対応関数）の連続呼び出しでは、関数への入力が最初に計算グラフが構築されたときと同じ形状に従っている場合、計算グラフを推論する必要はありません。これは、入力形状が固定されているモダリティ（例：画像）には問題ありませんが、変数の入力形状モダリティ（例：テキスト）を扱う場合には注意が必要です。
-
-`xla_generate()`が常に同じ入力形状で動作するようにするには、トークナイザを呼び出す際に`padding`引数を指定できます。
-
-```py
-import tensorflow as tf
-from transformers import AutoTokenizer, TFAutoModelForCausalLM
-
-tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2", padding_side="left", pad_token="</s>")
-model = TFAutoModelForCausalLM.from_pretrained("openai-community/gpt2")
-input_string = ["TensorFlow is"]
-
-xla_generate = tf.function(model.generate, jit_compile=True)
-
-# Here, we call the tokenizer with padding options.
-tokenized_input = tokenizer(input_string, pad_to_multiple_of=8, padding=True, return_tensors="tf")
-
-generated_tokens = xla_generate(**tokenized_input, num_beams=2)
-decoded_text = tokenizer.decode(generated_tokens[0], skip_special_tokens=True)
-print(f"Generated -- {decoded_text}")
-```
-
-これにより、`xla_generate()`への入力が常にトレースされた形状の入力を受け取ることを確認し、生成時間の高速化を実現できます。以下のコードでこれを確認できます：
-
-```py
-import time
-import tensorflow as tf
-from transformers import AutoTokenizer, TFAutoModelForCausalLM
-
-tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2", padding_side="left", pad_token="</s>")
-model = TFAutoModelForCausalLM.from_pretrained("openai-community/gpt2")
-
-xla_generate = tf.function(model.generate, jit_compile=True)
-
-for input_string in ["TensorFlow is", "TensorFlow is a", "TFLite is a"]:
-    tokenized_input = tokenizer(input_string, pad_to_multiple_of=8, padding=True, return_tensors="tf")
-    start = time.time_ns()
-    generated_tokens = xla_generate(**tokenized_input, num_beams=2)
-    end = time.time_ns()
-    print(f"Execution time -- {(end - start) / 1e6:.1f} ms\n")
-```
-
-Tesla T4 GPUを使用すると、次のような出力が期待されます：
-
-```bash
-Execution time -- 30819.6 ms
-
-Execution time -- 79.0 ms
-
-Execution time -- 78.9 ms
-```
-
-最初の`xla_generate()`呼び出しはトレーシングのために時間がかかりますが、連続する呼び出しは桁違いに高速です。生成オプションのいかなる変更も、再トレーシングを引き起こし、生成時間の遅延を引き起こすことに注意してください。
-
-このドキュメントでは、🤗 Transformersが提供するテキスト生成オプションをすべて網羅していません。高度なユースケースについてはドキュメンテーションを参照することをお勧めします。
-
-## Additional Resources
-
-ここでは、🤗 Transformersと一般的なXLAについてさらに詳しく学びたい場合のいくつかの追加リソースを提供します。
-
-* [このColab Notebook](https://colab.research.google.com/github/huggingface/blog/blob/main/notebooks/91_tf_xla_generate.ipynb)では、XLA対応のエンコーダーデコーダー（[T5](https://huggingface.co/docs/transformers/model_doc/t5)など）およびデコーダー専用（[GPT2](https://huggingface.co/docs/transformers/model_doc/gpt2)など）テキスト生成モデルを試すための対話型デモが提供されています。
-* [このブログ記事](https://huggingface.co/blog/tf-xla-generate)では、XLA対応モデルの比較ベンチマークの概要と、TensorFlowでのXLAについての友好的な紹介が提供されています。
-* [このブログ記事](https://blog.tensorflow.org/2022/11/how-hugging-face-improved-text-generation-performance-with-xla.html)では、🤗 TransformersのTensorFlowモデルにXLAサポートを追加する際の設計哲学について説明しています。
-* 一般的なXLAとTensorFlowグラフについて詳しく学ぶためのおすすめの投稿：
-    * [XLA: 機械学習用の最適化コンパイラ](https://www.tensorflow.org/xla)
-    * [グラフと`tf.function`の紹介](https://www.tensorflow.org/guide/intro_to_graphs)
-    * [`tf.function`を使用したパフォーマンス向上](https://www.tensorflow.org/guide/function)
--- a/docs/source/ja/tflite.md
+++ b/docs/source/ja/tflite.md
@ -1,58 +0,0 @@
-<!--Copyright 2023 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-
-⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
-rendered properly in your Markdown viewer.
-
-->
-
-# Export to TFLite
-
-[TensorFlow Lite](https://www.tensorflow.org/lite/guide)は、モバイルフォン、組み込みシステム、およびモノのインターネット（IoT）デバイスなど、リソースに制約のあるデバイスに機械学習モデルを展開するための軽量なフレームワークです。TFLiteは、計算能力、メモリ、および電力消費が限られているこれらのデバイス上でモデルを効率的に最適化して実行するために設計されています。
-TensorFlow Liteモデルは、`.tflite`ファイル拡張子で識別される特別な効率的なポータブル形式で表されます。
-
-🤗 Optimumは、🤗 TransformersモデルをTFLiteにエクスポートするための機能を`exporters.tflite`モジュールを介して提供しています。サポートされているモデルアーキテクチャのリストについては、[🤗 Optimumのドキュメント](https://huggingface.co/docs/optimum/exporters/tflite/overview)をご参照ください。
-
-モデルをTFLiteにエクスポートするには、必要な依存関係をインストールしてください：
-
-
-```bash
-pip install optimum[exporters-tf]
-```
-
-すべての利用可能な引数を確認するには、[🤗 Optimumドキュメント](https://huggingface.co/docs/optimum/main/en/exporters/tflite/usage_guides/export_a_model)を参照するか、コマンドラインでヘルプを表示してください：
-
-```bash
-optimum-cli export tflite --help
-```
-
-🤗 Hubからモデルのチェックポイントをエクスポートするには、例えば `google-bert/bert-base-uncased` を使用する場合、次のコマンドを実行します：
-
-```bash
-optimum-cli export tflite --model google-bert/bert-base-uncased --sequence_length 128 bert_tflite/
-```
-
-進行状況を示すログが表示され、生成された `model.tflite` が保存された場所も表示されるはずです：
-
-```bash
-Validating TFLite model...
-	-[✓] TFLite model output names match reference model (logits)
-	- Validating TFLite Model output "logits":
-		-[✓] (1, 128, 30522) matches (1, 128, 30522)
-		-[x] values not close enough, max diff: 5.817413330078125e-05 (atol: 1e-05)
-The TensorFlow Lite export succeeded with the warning: The maximum absolute difference between the output of the reference model and the TFLite exported model is not within the set tolerance 1e-05:
- logits: max diff = 5.817413330078125e-05.
- The exported model was saved at: bert_tflite
- ```
-
-上記の例は🤗 Hubからチェックポイントをエクスポートする方法を示しています。ローカルモデルをエクスポートする場合、まずモデルの重みファイルとトークナイザファイルを同じディレクトリ（`local_path`）に保存したことを確認してください。CLIを使用する場合、🤗 Hubのチェックポイント名の代わりに`model`引数に`local_path`を渡します。
-
-
--- a/docs/source/ko/_toctree.yml
+++ b/docs/source/ko/_toctree.yml
@ -208,8 +208,6 @@
  sections:
  - local: serialization
    title: ONNX로 내보내기
-  - local: tflite
-    title: TFLite로 내보내기
  - local: executorch
    title: ExecuTorch
  - local: torchscript
@ -402,8 +400,6 @@
      title: Configuration
    - local: main_classes/data_collator
      title: Data Collator
-    - local: main_classes/keras_callbacks
-      title: Keras callbacks
    - local: main_classes/logging
      title: Logging
    - local: main_classes/model
--- a/docs/source/ko/main_classes/keras_callbacks.md
+++ b/docs/source/ko/main_classes/keras_callbacks.md
@ -1,27 +0,0 @@
-<!--Copyright 2021 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-
-⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
-rendered properly in your Markdown viewer.
-
-->
-
-# 케라스 콜백[[keras-callbacks]]
-
-케라스로 트랜스포머 모델을 학습할 때, 일반적인 작업을 자동화하기 위한 라이브러리 전용 콜백들을 사용 할 수 있습니다.
-
-## KerasMetricCallback[[transformers.KerasMetricCallback]]
-
-[[autodoc]] KerasMetricCallback
-
-## PushToHubCallback[[transformers.PushToHubCallback]]
-
-[[autodoc]] PushToHubCallback
--- a/docs/source/ko/tflite.md
+++ b/docs/source/ko/tflite.md
@ -1,62 +0,0 @@
-<!--Copyright 2023 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-
-⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
-rendered properly in your Markdown viewer.
-
-->
-
-# TFLite로 내보내기[[export-to-tflite]]
-
-[TensorFlow Lite](https://www.tensorflow.org/lite/guide)는 자원이 제한된 휴대폰, 임베디드 시스템, 사물인터넷(IoT) 기기에서 
-기계학습 모델을 배포하기 위한 경량 프레임워크입니다. 
-TFLite는 연산 능력, 메모리, 전력 소비가 제한된 기기에서 모델을 효율적으로 최적화하고 실행하기 위해 
-설계되었습니다. 
-TensorFlow Lite 모델은 `.tflite` 파일 확장자로 식별되는 특수하고 효율적인 휴대용 포맷으로 표현됩니다. 
-
-🤗 Optimum은 `exporters.tflite` 모듈로 🤗 Transformers 모델을 TFLite로 내보내는 기능을 제공합니다. 
-지원되는 모델 아키텍처 목록은 [🤗 Optimum 문서](https://huggingface.co/docs/optimum/exporters/tflite/overview)를 참고하세요. 
-
-모델을 TFLite로 내보내려면, 필요한 종속성을 설치하세요:
- 
-```bash
-pip install optimum[exporters-tf]
-```
-
-모든 사용 가능한 인수를 확인하려면, [🤗 Optimum 문서](https://huggingface.co/docs/optimum/main/en/exporters/tflite/usage_guides/export_a_model)를 참고하거나 
-터미널에서 도움말을 살펴보세요:
-
-```bash
-optimum-cli export tflite --help
-```
-
-예를 들어 🤗 Hub에서의 `google-bert/bert-base-uncased` 모델 체크포인트를 내보내려면, 다음 명령을 실행하세요:
-
-```bash
-optimum-cli export tflite --model google-bert/bert-base-uncased --sequence_length 128 bert_tflite/
-```
-
-다음과 같이 진행 상황을 나타내는 로그와 결과물인 `model.tflite`가 저장된 위치를 보여주는 로그가 표시됩니다:
-
-```bash
-Validating TFLite model...
-	-[✓] TFLite model output names match reference model (logits)
-	- Validating TFLite Model output "logits":
-		-[✓] (1, 128, 30522) matches (1, 128, 30522)
-		-[x] values not close enough, max diff: 5.817413330078125e-05 (atol: 1e-05)
-The TensorFlow Lite export succeeded with the warning: The maximum absolute difference between the output of the reference model and the TFLite exported model is not within the set tolerance 1e-05:
- logits: max diff = 5.817413330078125e-05.
- The exported model was saved at: bert_tflite
- ```
-
-위 예제는 🤗 Hub에서의 체크포인트를 내보내는 방법을 보여줍니다. 
-로컬 모델을 내보낸다면, 먼저 모델 가중치와 토크나이저 파일이 모두 같은 디렉터리( `local_path` )에 저장됐는지 확인하세요. 
-CLI를 사용할 때, 🤗 Hub에서의 체크포인트 이름 대신 `model` 인수에 `local_path`를 전달하면 됩니다. 
--- a/docs/source/ms/_toctree.yml
+++ b/docs/source/ms/_toctree.yml
@ -115,8 +115,6 @@
      title: Latihan pada banyak CPU
    - local: perf_train_tpu
      title: Latihan mengenai TPU
-    - local: perf_train_tpu_tf
-      title: Latihan tentang TPU dengan TensorFlow
    - local: perf_train_special
      title: Latihan mengenai Perkakasan Khusus
    - local: perf_infer_cpu
@ -135,8 +133,6 @@
      title: Penyahpepijatan
    - local: hpo_train
      title: Carian Hiperparameter menggunakan API Pelatih
-    - local: tf_xla
-      title: Penyepaduan XLA untuk Model TensorFlow
  title: Prestasi dan kebolehskalaan
 - sections:
    - local: contributing
@ -185,8 +181,6 @@
          title: Configuration
        - local: main_classes/data_collator
          title: Data Collator
-        - local: main_classes/keras_callbacks
-          title: Keras callbacks
        - local: main_classes/logging
          title: Logging
        - local: main_classes/model
--- a/docs/source/pt/_toctree.yml
+++ b/docs/source/pt/_toctree.yml
@ -23,8 +23,6 @@
    title: Compartilhando modelos customizados 
  - local: run_scripts
    title: Treinamento a partir de um script
-  - local: converting_tensorflow_models
-    title: Convertendo checkpoints do TensorFlow para Pytorch
  - local: serialization
    title: Exportando modelos para ONNX
  - sections:
--- a/docs/source/pt/converting_tensorflow_models.md
+++ b/docs/source/pt/converting_tensorflow_models.md
@ -1,152 +0,0 @@
-<!--Copyright 2020 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-
-⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
-rendered properly in your Markdown viewer.
-
-->
-
-# Convertendo checkpoints do TensorFlow para Pytorch
-
-Uma interface de linha de comando é fornecida para converter os checkpoints originais Bert/GPT/GPT-2/Transformer-XL/XLNet/XLM em modelos
-que podem ser carregados usando os métodos `from_pretrained` da biblioteca.
-
-<Tip>
-
-A partir da versão 2.3.0 o script de conversão agora faz parte do transformers CLI (**transformers**) disponível em qualquer instalação
-transformers >= 2.3.0.
-
-A documentação abaixo reflete o formato do comando **transformers convert**.
-
-</Tip>
-
-## BERT
-
-Você pode converter qualquer checkpoint do BERT em TensorFlow (em particular [os modelos pré-treinados lançados pelo Google](https://github.com/google-research/bert#pre-trained-models)) em um arquivo PyTorch usando um
-[convert_bert_original_tf_checkpoint_to_pytorch.py](https://github.com/huggingface/transformers/tree/main/src/transformers/models/bert/convert_bert_original_tf_checkpoint_to_pytorch.py) script.
-
-Esta Interface de Linha de Comando (CLI) recebe como entrada um checkpoint do TensorFlow (três arquivos começando com `bert_model.ckpt`) e o
-arquivo de configuração (`bert_config.json`), e então cria um modelo PyTorch para esta configuração, carrega os pesos
-do checkpoint do TensorFlow no modelo PyTorch e salva o modelo resultante em um arquivo PyTorch que pode
-ser importado usando `from_pretrained()` (veja o exemplo em [quicktour](quicktour) , [run_glue.py](https://github.com/huggingface/transformers/tree/main/examples/pytorch/text-classification/run_glue.py) ).
-
-Você só precisa executar este script de conversão **uma vez** para obter um modelo PyTorch. Você pode então desconsiderar o checkpoint em
- TensorFlow (os três arquivos começando com `bert_model.ckpt`), mas certifique-se de manter o arquivo de configuração (\
-`bert_config.json`) e o arquivo de vocabulário (`vocab.txt`), pois eles também são necessários para o modelo PyTorch.
-
-Para executar este script de conversão específico, você precisará ter o TensorFlow e o PyTorch instalados (`pip install tensorflow`). O resto do repositório requer apenas o PyTorch.
-
-Aqui está um exemplo do processo de conversão para um modelo `BERT-Base Uncased` pré-treinado:
-
-```bash
-export BERT_BASE_DIR=/path/to/bert/uncased_L-12_H-768_A-12
-
-transformers convert --model_type bert \
-  --tf_checkpoint $BERT_BASE_DIR/bert_model.ckpt \
-  --config $BERT_BASE_DIR/bert_config.json \
-  --pytorch_dump_output $BERT_BASE_DIR/pytorch_model.bin
-```
-
-Você pode baixar os modelos pré-treinados do Google para a conversão [aqui](https://github.com/google-research/bert#pre-trained-models).
-
-## ALBERT
-
-Converta os checkpoints do modelo ALBERT em TensorFlow para PyTorch usando o
-[convert_albert_original_tf_checkpoint_to_pytorch.py](https://github.com/huggingface/transformers/tree/main/src/transformers/models/albert/convert_albert_original_tf_checkpoint_to_pytorch.py) script.
-
-A Interface de Linha de Comando (CLI) recebe como entrada um checkpoint do TensorFlow (três arquivos começando com `model.ckpt-best`) e o
-arquivo de configuração (`albert_config.json`), então cria e salva um modelo PyTorch. Para executar esta conversão, você
-precisa ter o TensorFlow e o PyTorch instalados.
-
-Aqui está um exemplo do processo de conversão para o modelo `ALBERT Base` pré-treinado:
-
-```bash
-export ALBERT_BASE_DIR=/path/to/albert/albert_base
-
-transformers convert --model_type albert \
-  --tf_checkpoint $ALBERT_BASE_DIR/model.ckpt-best \
-  --config $ALBERT_BASE_DIR/albert_config.json \
-  --pytorch_dump_output $ALBERT_BASE_DIR/pytorch_model.bin
-```
-
-Você pode baixar os modelos pré-treinados do Google para a conversão [aqui](https://github.com/google-research/albert#pre-trained-models).
-
-## OpenAI GPT
-
-Aqui está um exemplo do processo de conversão para um modelo OpenAI GPT pré-treinado, supondo que seu checkpoint NumPy
-foi salvo com o mesmo formato do modelo pré-treinado OpenAI (veja [aqui](https://github.com/openai/finetune-transformer-lm)\
-)
-
-```bash
-export OPENAI_GPT_CHECKPOINT_FOLDER_PATH=/path/to/openai/pretrained/numpy/weights
-
-transformers convert --model_type gpt \
-  --tf_checkpoint $OPENAI_GPT_CHECKPOINT_FOLDER_PATH \
-  --pytorch_dump_output $PYTORCH_DUMP_OUTPUT \
-  [--config OPENAI_GPT_CONFIG] \
-  [--finetuning_task_name OPENAI_GPT_FINETUNED_TASK] \
-```
-
-## OpenAI GPT-2
-
-Aqui está um exemplo do processo de conversão para um modelo OpenAI GPT-2 pré-treinado (consulte [aqui](https://github.com/openai/gpt-2))
-
-```bash
-export OPENAI_GPT2_CHECKPOINT_PATH=/path/to/openai-community/gpt2/pretrained/weights
-
-transformers convert --model_type gpt2 \
-  --tf_checkpoint $OPENAI_GPT2_CHECKPOINT_PATH \
-  --pytorch_dump_output $PYTORCH_DUMP_OUTPUT \
-  [--config OPENAI_GPT2_CONFIG] \
-  [--finetuning_task_name OPENAI_GPT2_FINETUNED_TASK]
-```
-
-## XLNet
-
-Aqui está um exemplo do processo de conversão para um modelo XLNet pré-treinado:
-
-```bash
-export TRANSFO_XL_CHECKPOINT_PATH=/path/to/xlnet/checkpoint
-export TRANSFO_XL_CONFIG_PATH=/path/to/xlnet/config
-
-transformers convert --model_type xlnet \
-  --tf_checkpoint $TRANSFO_XL_CHECKPOINT_PATH \
-  --config $TRANSFO_XL_CONFIG_PATH \
-  --pytorch_dump_output $PYTORCH_DUMP_OUTPUT \
-  [--finetuning_task_name XLNET_FINETUNED_TASK] \
-```
-
-## XLM
-
-Aqui está um exemplo do processo de conversão para um modelo XLM pré-treinado:
-
-```bash
-export XLM_CHECKPOINT_PATH=/path/to/xlm/checkpoint
-
-transformers convert --model_type xlm \
-  --tf_checkpoint $XLM_CHECKPOINT_PATH \
-  --pytorch_dump_output $PYTORCH_DUMP_OUTPUT
- [--config XML_CONFIG] \
- [--finetuning_task_name XML_FINETUNED_TASK]
-```
-
-## T5
-
-Aqui está um exemplo do processo de conversão para um modelo T5 pré-treinado:
-
-```bash
-export T5=/path/to/t5/uncased_L-12_H-768_A-12
-
-transformers convert --model_type t5 \
-  --tf_checkpoint $T5/t5_model.ckpt \
-  --config $T5/t5_config.json \
-  --pytorch_dump_output $T5/pytorch_model.bin
-```
--- a/docs/source/zh/_toctree.yml
+++ b/docs/source/zh/_toctree.yml
@ -44,8 +44,6 @@
    title: 聊天模型的模板
  - local: serialization
    title: 导出为 ONNX
-  - local: tflite
-    title: 导出为 TFLite
  - local: torchscript
    title: 导出为 TorchScript
  - local: gguf
@ -76,8 +74,6 @@
    title: 实例化大模型
  - local: debugging
    title: 问题定位及解决
-  - local: tf_xla
-    title: TensorFlow模型的XLA集成
  - local: perf_torch_compile
    title: 使用 `torch.compile()` 优化推理
  title: 性能和可扩展性
@ -107,8 +103,6 @@
      title: Configuration
    - local: main_classes/data_collator
      title: Data Collator
-    - local: main_classes/keras_callbacks
-      title: Keras callbacks
    - local: main_classes/logging
      title: Logging
    - local: main_classes/model
--- a/docs/source/zh/main_classes/keras_callbacks.md
+++ b/docs/source/zh/main_classes/keras_callbacks.md
@ -1,27 +0,0 @@
-<!--Copyright 2021 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-
-⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
-rendered properly in your Markdown viewer.
-
-->
-
-# Keras callbacks
-
-在Keras中训练Transformers模型时，有一些库特定的callbacks函数可用于自动执行常见任务：
-
-## KerasMetricCallback
-
-[[autodoc]] KerasMetricCallback
-
-## PushToHubCallback
-
-[[autodoc]] PushToHubCallback
--- a/docs/source/zh/tf_xla.md
+++ b/docs/source/zh/tf_xla.md
@ -1,179 +0,0 @@
-<!--Copyright 2023 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-
-⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
-rendered properly in your Markdown viewer.
-
-->
-
-# 用于 TensorFlow 模型的 XLA 集成
-
-[[open-in-colab]]
-
-加速线性代数，也称为XLA，是一个用于加速TensorFlow模型运行时间的编译器。从[官方文档](https://www.tensorflow.org/xla)中可以看到：
-
-XLA（加速线性代数）是一种针对线性代数的特定领域编译器，可以在可能不需要更改源代码的情况下加速TensorFlow模型。
-
-在TensorFlow中使用XLA非常简单——它包含在`tensorflow`库中，并且可以使用任何图创建函数中的`jit_compile`参数来触发，例如[`tf.function`](https://www.tensorflow.org/guide/intro_to_graphs)。在使用Keras方法如`fit()`和`predict()`时，只需将`jit_compile`参数传递给`model.compile()`即可启用XLA。然而，XLA不仅限于这些方法 - 它还可以用于加速任何任意的`tf.function`。
-
-在🤗 Transformers中，几个TensorFlow方法已经被重写为与XLA兼容，包括[GPT2](https://huggingface.co/docs/transformers/model_doc/gpt2)、[T5](https://huggingface.co/docs/transformers/model_doc/t5)和[OPT](https://huggingface.co/docs/transformers/model_doc/opt)等文本生成模型，以及[Whisper](https://huggingface.co/docs/transformers/model_doc/whisper)等语音处理模型。
-
-虽然确切的加速倍数很大程度上取决于模型，但对于🤗 Transformers中的TensorFlow文本生成模型，我们注意到速度提高了约100倍。本文档将解释如何在这些模型上使用XLA获得最大的性能。如果您有兴趣了解更多关于基准测试和我们在XLA集成背后的设计哲学的信息，我们还将提供额外的资源链接。
-
-
-## 使用 XLA 运行 TensorFlow 函数
-
-让我们考虑以下TensorFlow 中的模型：
-
-```py
-import tensorflow as tf
-
-model = tf.keras.Sequential(
-    [tf.keras.layers.Dense(10, input_shape=(10,), activation="relu"), tf.keras.layers.Dense(5, activation="softmax")]
-)
-```
-
-上述模型接受维度为 `(10,)` 的输入。我们可以像下面这样使用模型进行前向传播：
-
-```py
-# Generate random inputs for the model.
-batch_size = 16
-input_vector_dim = 10
-random_inputs = tf.random.normal((batch_size, input_vector_dim))
-
-# Run a forward pass.
-_ = model(random_inputs)
-```
-
-为了使用 XLA 编译的函数运行前向传播，我们需要执行以下操作：
-
-```py
-xla_fn = tf.function(model, jit_compile=True)
-_ = xla_fn(random_inputs)
-```
-
-`model`的默认`call()`函数用于编译XLA图。但如果你想将其他模型函数编译成XLA，也是可以的，如下所示：
-
-```py
-my_xla_fn = tf.function(model.my_xla_fn, jit_compile=True)
-```
-
-## 在🤗 Transformers库中使用XLA运行TensorFlow文本生成模型
-
-要在🤗 Transformers中启用XLA加速生成，您需要安装最新版本的`transformers`。您可以通过运行以下命令来安装它：
-
-```bash
-pip install transformers --upgrade
-```
-
-然后您可以运行以下代码：
-
-```py
-import tensorflow as tf
-from transformers import AutoTokenizer, TFAutoModelForCausalLM
-
-# Will error if the minimal version of Transformers is not installed.
-from transformers.utils import check_min_version
-
-check_min_version("4.21.0")
-
-
-tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2", padding_side="left", pad_token="</s>")
-model = TFAutoModelForCausalLM.from_pretrained("openai-community/gpt2")
-input_string = ["TensorFlow is"]
-
-# One line to create an XLA generation function
-xla_generate = tf.function(model.generate, jit_compile=True)
-
-tokenized_input = tokenizer(input_string, return_tensors="tf")
-generated_tokens = xla_generate(**tokenized_input, num_beams=2)
-
-decoded_text = tokenizer.decode(generated_tokens[0], skip_special_tokens=True)
-print(f"Generated -- {decoded_text}")
-# Generated -- TensorFlow is an open-source, open-source, distributed-source application # framework for the
-```
-
-正如您所注意到的，在`generate()`上启用XLA只需要一行代码。其余部分代码保持不变。然而，上面的代码片段中有一些与XLA相关的注意事项。您需要了解这些注意事项，以充分利用XLA可能带来的性能提升。我们将在下面的部分讨论这些内容。
-
-## 需要关注的注意事项
-
-当您首次执行启用XLA的函数（如上面的`xla_generate()`）时，它将在内部尝试推断计算图，这是一个耗时的过程。这个过程被称为[“tracing”](https://www.tensorflow.org/guide/intro_to_graphs#when_is_a_function_tracing)。
-
-您可能会注意到生成时间并不快。连续调用`xla_generate()`（或任何其他启用了XLA的函数）不需要再次推断计算图，只要函数的输入与最初构建计算图时的形状相匹配。对于具有固定输入形状的模态（例如图像），这不是问题，但如果您正在处理具有可变输入形状的模态（例如文本），则必须注意。
-
-为了确保`xla_generate()`始终使用相同的输入形状，您可以在调用`tokenizer`时指定`padding`参数。
-
-```py
-import tensorflow as tf
-from transformers import AutoTokenizer, TFAutoModelForCausalLM
-
-tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2", padding_side="left", pad_token="</s>")
-model = TFAutoModelForCausalLM.from_pretrained("openai-community/gpt2")
-input_string = ["TensorFlow is"]
-
-xla_generate = tf.function(model.generate, jit_compile=True)
-
-# Here, we call the tokenizer with padding options.
-tokenized_input = tokenizer(input_string, pad_to_multiple_of=8, padding=True, return_tensors="tf")
-
-generated_tokens = xla_generate(**tokenized_input, num_beams=2)
-decoded_text = tokenizer.decode(generated_tokens[0], skip_special_tokens=True)
-print(f"Generated -- {decoded_text}")
-```
-
-通过这种方式，您可以确保`xla_generate()`的输入始终具有它跟踪的形状，从而加速生成时间。您可以使用以下代码来验证这一点：
-
-```py
-import time
-import tensorflow as tf
-from transformers import AutoTokenizer, TFAutoModelForCausalLM
-
-tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2", padding_side="left", pad_token="</s>")
-model = TFAutoModelForCausalLM.from_pretrained("openai-community/gpt2")
-
-xla_generate = tf.function(model.generate, jit_compile=True)
-
-for input_string in ["TensorFlow is", "TensorFlow is a", "TFLite is a"]:
-    tokenized_input = tokenizer(input_string, pad_to_multiple_of=8, padding=True, return_tensors="tf")
-    start = time.time_ns()
-    generated_tokens = xla_generate(**tokenized_input, num_beams=2)
-    end = time.time_ns()
-    print(f"Execution time -- {(end - start) / 1e6:.1f} ms\n")
-```
-
-在Tesla T4 GPU上，您可以期望如下的输出：
-
-```bash
-Execution time -- 30819.6 ms
-
-Execution time -- 79.0 ms
-
-Execution time -- 78.9 ms
-```
-
-第一次调用`xla_generate()`会因为`tracing`而耗时，但后续的调用会快得多。请注意，任何时候对生成选项的更改都会触发重新`tracing`，从而导致生成时间减慢。
-
-在本文档中，我们没有涵盖🤗 Transformers提供的所有文本生成选项。我们鼓励您阅读文档以了解高级用例。
-
-## 附加资源
-
-以下是一些附加资源，如果您想深入了解在🤗 Transformers和其他库下使用XLA：
-
-* [这个Colab Notebook](https://colab.research.google.com/github/huggingface/blog/blob/main/notebooks/91_tf_xla_generate.ipynb) 提供了一个互动演示，让您可以尝试使用XLA兼容的编码器-解码器（例如[T5](https://huggingface.co/docs/transformers/model_doc/t5)）和仅解码器（例如[GPT2](https://huggingface.co/docs/transformers/model_doc/gpt2)）文本生成模型。
-
-* [这篇博客文章](https://huggingface.co/blog/tf-xla-generate) 提供了XLA兼容模型的比较基准概述，以及关于在TensorFlow中使用XLA的友好介绍。
-
-* [这篇博客文章](https://blog.tensorflow.org/2022/11/how-hugging-face-improved-text-generation-performance-with-xla.html) 讨论了我们在🤗 Transformers中为TensorFlow模型添加XLA支持的设计理念。
-
-* 推荐用于更多学习XLA和TensorFlow图的资源：
-    * [XLA：面向机器学习的优化编译器](https://www.tensorflow.org/xla)
-    * [图和tf.function简介](https://www.tensorflow.org/guide/intro_to_graphs)
-    * [使用tf.function获得更好的性能](https://www.tensorflow.org/guide/function)
--- a/docs/source/zh/tflite.md
+++ b/docs/source/zh/tflite.md
@ -1,54 +0,0 @@
-<!--Copyright 2023 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-
-⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
-rendered properly in your Markdown viewer.
-
-->
-
-# 导出为 TFLite
-
-[TensorFlow Lite](https://www.tensorflow.org/lite/guide) 是一个轻量级框架，用于资源受限的设备上，如手机、嵌入式系统和物联网（IoT）设备，部署机器学习模型。TFLite 旨在在计算能力、内存和功耗有限的设备上优化和高效运行模型。模型以一种特殊的高效可移植格式表示，其文件扩展名为 `.tflite`。
-
-🤗 Optimum 通过 `exporters.tflite` 模块提供将 🤗 Transformers 模型导出至 TFLite 格式的功能。请参考 [🤗 Optimum 文档](https://huggingface.co/docs/optimum/exporters/tflite/overview) 以获取支持的模型架构列表。
-
-要将模型导出为 TFLite 格式，请安装所需的依赖项：
-
-```bash
-pip install optimum[exporters-tf]
-```
-
-请参阅 [🤗 Optimum 文档](https://huggingface.co/docs/optimum/main/en/exporters/tflite/usage_guides/export_a_model) 以查看所有可用参数，或者在命令行中查看帮助：
-
-```bash
-optimum-cli export tflite --help
-```
-
-运行以下命令，以从 🤗 Hub 导出模型的检查点（checkpoint），以 `google-bert/bert-base-uncased` 为例：
-
-```bash
-optimum-cli export tflite --model google-bert/bert-base-uncased --sequence_length 128 bert_tflite/
-```
-
-你应该能在日志中看到导出进度以及生成的 `model.tflite` 文件的保存位置，如下所示：
-
-```bash
-Validating TFLite model...
-	-[✓] TFLite model output names match reference model (logits)
-	- Validating TFLite Model output "logits":
-		-[✓] (1, 128, 30522) matches (1, 128, 30522)
-		-[x] values not close enough, max diff: 5.817413330078125e-05 (atol: 1e-05)
-The TensorFlow Lite export succeeded with the warning: The maximum absolute difference between the output of the reference model and the TFLite exported model is not within the set tolerance 1e-05:
- logits: max diff = 5.817413330078125e-05.
- The exported model was saved at: bert_tflite
-```
-
-上面的示例说明了从 🤗 Hub 导出检查点的过程。导出本地模型时，首先需要确保将模型的权重和分词器文件保存在同一目录（`local_path`）中。在使用 CLI（命令行）时，将 `local_path` 传递给 `model` 参数，而不是 🤗 Hub 上的检查点名称。
--- a/examples/legacy/multiple_choice/utils_multiple_choice.py
+++ b/examples/legacy/multiple_choice/utils_multiple_choice.py
@ -26,7 +26,7 @@ from typing import Optional
 import tqdm
 from filelock import FileLock

-from transformers import PreTrainedTokenizer, is_tf_available, is_torch_available
+from transformers import PreTrainedTokenizer, is_torch_available


 logger = logging.getLogger(__name__)
@ -78,11 +78,6 @@ if is_torch_available():
    from torch.utils.data import Dataset

    class MultipleChoiceDataset(Dataset):
-        """
-        This will be superseded by a framework-agnostic approach
-        soon.
-        """
-
        features: list[InputFeatures]

        def __init__(
@ -139,94 +134,6 @@ if is_torch_available():
            return self.features[i]


-if is_tf_available():
-    import tensorflow as tf
-
-    class TFMultipleChoiceDataset:
-        """
-        This will be superseded by a framework-agnostic approach
-        soon.
-        """
-
-        features: list[InputFeatures]
-
-        def __init__(
-            self,
-            data_dir: str,
-            tokenizer: PreTrainedTokenizer,
-            task: str,
-            max_seq_length: Optional[int] = 128,
-            overwrite_cache=False,
-            mode: Split = Split.train,
-        ):
-            processor = processors[task]()
-
-            logger.info(f"Creating features from dataset file at {data_dir}")
-            label_list = processor.get_labels()
-            if mode == Split.dev:
-                examples = processor.get_dev_examples(data_dir)
-            elif mode == Split.test:
-                examples = processor.get_test_examples(data_dir)
-            else:
-                examples = processor.get_train_examples(data_dir)
-            logger.info("Training examples: %s", len(examples))
-
-            self.features = convert_examples_to_features(
-                examples,
-                label_list,
-                max_seq_length,
-                tokenizer,
-            )
-
-            def gen():
-                for ex_index, ex in tqdm.tqdm(enumerate(self.features), desc="convert examples to features"):
-                    if ex_index % 10000 == 0:
-                        logger.info("Writing example %d of %d" % (ex_index, len(examples)))
-
-                    yield (
-                        {
-                            "example_id": 0,
-                            "input_ids": ex.input_ids,
-                            "attention_mask": ex.attention_mask,
-                            "token_type_ids": ex.token_type_ids,
-                        },
-                        ex.label,
-                    )
-
-            self.dataset = tf.data.Dataset.from_generator(
-                gen,
-                (
-                    {
-                        "example_id": tf.int32,
-                        "input_ids": tf.int32,
-                        "attention_mask": tf.int32,
-                        "token_type_ids": tf.int32,
-                    },
-                    tf.int64,
-                ),
-                (
-                    {
-                        "example_id": tf.TensorShape([]),
-                        "input_ids": tf.TensorShape([None, None]),
-                        "attention_mask": tf.TensorShape([None, None]),
-                        "token_type_ids": tf.TensorShape([None, None]),
-                    },
-                    tf.TensorShape([]),
-                ),
-            )
-
-        def get_dataset(self):
-            self.dataset = self.dataset.apply(tf.data.experimental.assert_cardinality(len(self.features)))
-
-            return self.dataset
-
-        def __len__(self):
-            return len(self.features)
-
-        def __getitem__(self, i) -> InputFeatures:
-            return self.features[i]
-
-
 class DataProcessor:
    """Base class for data converters for multiple choice data sets."""

--- a/examples/legacy/token-classification/utils_ner.py
+++ b/examples/legacy/token-classification/utils_ner.py
@ -22,7 +22,7 @@ from typing import Optional, Union

 from filelock import FileLock

-from transformers import PreTrainedTokenizer, is_tf_available, is_torch_available
+from transformers import PreTrainedTokenizer, is_torch_available


 logger = logging.getLogger(__name__)
@ -208,11 +208,6 @@ if is_torch_available():
    from torch.utils.data import Dataset

    class TokenClassificationDataset(Dataset):
-        """
-        This will be superseded by a framework-agnostic approach
-        soon.
-        """
-
        features: list[InputFeatures]
        pad_token_label_id: int = nn.CrossEntropyLoss().ignore_index
        # Use cross entropy ignore_index as padding label id so that only
@ -271,100 +266,3 @@ if is_torch_available():

        def __getitem__(self, i) -> InputFeatures:
            return self.features[i]
-
-
-if is_tf_available():
-    import tensorflow as tf
-
-    class TFTokenClassificationDataset:
-        """
-        This will be superseded by a framework-agnostic approach
-        soon.
-        """
-
-        features: list[InputFeatures]
-        pad_token_label_id: int = -100
-        # Use cross entropy ignore_index as padding label id so that only
-        # real label ids contribute to the loss later.
-
-        def __init__(
-            self,
-            token_classification_task: TokenClassificationTask,
-            data_dir: str,
-            tokenizer: PreTrainedTokenizer,
-            labels: list[str],
-            model_type: str,
-            max_seq_length: Optional[int] = None,
-            overwrite_cache=False,
-            mode: Split = Split.train,
-        ):
-            examples = token_classification_task.read_examples_from_file(data_dir, mode)
-            # TODO clean up all this to leverage built-in features of tokenizers
-            self.features = token_classification_task.convert_examples_to_features(
-                examples,
-                labels,
-                max_seq_length,
-                tokenizer,
-                cls_token_at_end=bool(model_type in ["xlnet"]),
-                # xlnet has a cls token at the end
-                cls_token=tokenizer.cls_token,
-                cls_token_segment_id=2 if model_type in ["xlnet"] else 0,
-                sep_token=tokenizer.sep_token,
-                sep_token_extra=False,
-                # roberta uses an extra separator b/w pairs of sentences, cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805
-                pad_on_left=bool(tokenizer.padding_side == "left"),
-                pad_token=tokenizer.pad_token_id,
-                pad_token_segment_id=tokenizer.pad_token_type_id,
-                pad_token_label_id=self.pad_token_label_id,
-            )
-
-            def gen():
-                for ex in self.features:
-                    if ex.token_type_ids is None:
-                        yield (
-                            {"input_ids": ex.input_ids, "attention_mask": ex.attention_mask},
-                            ex.label_ids,
-                        )
-                    else:
-                        yield (
-                            {
-                                "input_ids": ex.input_ids,
-                                "attention_mask": ex.attention_mask,
-                                "token_type_ids": ex.token_type_ids,
-                            },
-                            ex.label_ids,
-                        )
-
-            if "token_type_ids" not in tokenizer.model_input_names:
-                self.dataset = tf.data.Dataset.from_generator(
-                    gen,
-                    ({"input_ids": tf.int32, "attention_mask": tf.int32}, tf.int64),
-                    (
-                        {"input_ids": tf.TensorShape([None]), "attention_mask": tf.TensorShape([None])},
-                        tf.TensorShape([None]),
-                    ),
-                )
-            else:
-                self.dataset = tf.data.Dataset.from_generator(
-                    gen,
-                    ({"input_ids": tf.int32, "attention_mask": tf.int32, "token_type_ids": tf.int32}, tf.int64),
-                    (
-                        {
-                            "input_ids": tf.TensorShape([None]),
-                            "attention_mask": tf.TensorShape([None]),
-                            "token_type_ids": tf.TensorShape([None]),
-                        },
-                        tf.TensorShape([None]),
-                    ),
-                )
-
-        def get_dataset(self):
-            self.dataset = self.dataset.apply(tf.data.experimental.assert_cardinality(len(self.features)))
-
-            return self.dataset
-
-        def __len__(self):
-            return len(self.features)
-
-        def __getitem__(self, i) -> InputFeatures:
-            return self.features[i]
--- a/examples/modular-transformers/image_processing_new_imgproc_model.py
+++ b/examples/modular-transformers/image_processing_new_imgproc_model.py
@ -152,7 +152,7 @@ class ImgprocModelImageProcessor(BaseImageProcessor):
        images: ImageInput,
        do_resize: Optional[bool] = None,
        size: Optional[dict[str, int]] = None,
-        resample: PILImageResampling = None,
+        resample: Optional[PILImageResampling] = None,
        do_rescale: Optional[bool] = None,
        rescale_factor: Optional[float] = None,
        do_normalize: Optional[bool] = None,
@ -194,10 +194,8 @@ class ImgprocModelImageProcessor(BaseImageProcessor):
            return_tensors (`str` or `TensorType`, *optional*):
                The type of tensors to return. Can be one of:
                    - Unset: Return a list of `np.ndarray`.
-                    - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
                    - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
                    - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
-                    - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
            data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
                The channel dimension format for the output image. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
@ -221,13 +219,11 @@ class ImgprocModelImageProcessor(BaseImageProcessor):

        size = size if size is not None else self.size
        size = get_size_dict(size, default_to_square=False)
+        images = self.fetch_images(images)
        images = make_flat_list_of_images(images)

        if not valid_images(images):
-            raise ValueError(
-                "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
-                "torch.Tensor, tf.Tensor or jax.ndarray."
-            )
+            raise ValueError("Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, or torch.Tensor")

        validate_preprocess_arguments(
            do_rescale=do_rescale,
--- a/examples/modular-transformers/modeling_dummy_bert.py
+++ b/examples/modular-transformers/modeling_dummy_bert.py
@ -5,11 +5,9 @@
 #                          modular_dummy_bert.py file directly. One of our CI enforces this.
 #                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
 import math
-import os
 from typing import Optional, Union

 import torch
-from packaging import version
 from torch import nn

 from ...activations import ACT2FN
@ -19,7 +17,7 @@ from ...modeling_layers import GradientCheckpointingLayer
 from ...modeling_outputs import BaseModelOutputWithPastAndCrossAttentions, BaseModelOutputWithPoolingAndCrossAttentions
 from ...modeling_utils import PreTrainedModel
 from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer
-from ...utils import auto_docstring, get_torch_version, logging
+from ...utils import auto_docstring, logging
 from ...utils.deprecation import deprecate_kwarg
 from .configuration_dummy_bert import DummyBertConfig

@ -36,8 +34,6 @@ class DummyBertEmbeddings(nn.Module):
        self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
        self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)

-        # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
-        # any TensorFlow checkpoint file
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        # position_ids (1, len position emb) is contiguous in memory and exported when serialized
@ -228,7 +224,6 @@ class DummyBertSdpaSelfAttention(DummyBertSelfAttention):
    def __init__(self, config, position_embedding_type=None, layer_idx=None):
        super().__init__(config, position_embedding_type=position_embedding_type, layer_idx=layer_idx)
        self.dropout_prob = config.attention_probs_dropout_prob
-        self.require_contiguous_qkv = version.parse(get_torch_version()) < version.parse("2.2.0")

    # Adapted from DummyBertSelfAttention
    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
@ -308,14 +303,6 @@ class DummyBertSdpaSelfAttention(DummyBertSelfAttention):
                if is_cross_attention and isinstance(past_key_values, EncoderDecoderCache):
                    past_key_values.is_updated[self.layer_idx] = True

-        # SDPA with memory-efficient backend is broken in torch==2.1.2 when using non-contiguous inputs and a custom
-        # attn_mask, so we need to call `.contiguous()` here. This was fixed in torch==2.2.0.
-        # Reference: https://github.com/pytorch/pytorch/issues/112577
-        if self.require_contiguous_qkv and query_layer.device.type == "cuda" and attention_mask is not None:
-            query_layer = query_layer.contiguous()
-            key_layer = key_layer.contiguous()
-            value_layer = value_layer.contiguous()
-
        # We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` if statement instead of an inline conditional assignment
        # in SDPA to support both torch.compile's dynamic shapes and full graph options. An inline conditional prevents dynamic shapes from compiling.
        # The tgt_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create
@ -655,83 +642,9 @@ class DummyBertLMPredictionHead(nn.Module):
        return hidden_states


-def load_tf_weights_in_dummy_bert(model, config, tf_checkpoint_path):
-    """Load tf checkpoints in a pytorch model."""
-    try:
-        import re
-
-        import numpy as np
-        import tensorflow as tf
-    except ImportError:
-        logger.error(
-            "Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see "
-            "https://www.tensorflow.org/install/ for installation instructions."
-        )
-        raise
-    tf_path = os.path.abspath(tf_checkpoint_path)
-    logger.info(f"Converting TensorFlow checkpoint from {tf_path}")
-    # Load weights from TF model
-    init_vars = tf.train.list_variables(tf_path)
-    names = []
-    arrays = []
-    for name, shape in init_vars:
-        logger.info(f"Loading TF weight {name} with shape {shape}")
-        array = tf.train.load_variable(tf_path, name)
-        names.append(name)
-        arrays.append(array)
-
-    for name, array in zip(names, arrays):
-        name = name.split("/")
-        # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v
-        # which are not required for using pretrained model
-        if any(
-            n in ["adam_v", "adam_m", "AdamWeightDecayOptimizer", "AdamWeightDecayOptimizer_1", "global_step"]
-            for n in name
-        ):
-            logger.info(f"Skipping {'/'.join(name)}")
-            continue
-        pointer = model
-        for m_name in name:
-            if re.fullmatch(r"[A-Za-z]+_\d+", m_name):
-                scope_names = re.split(r"_(\d+)", m_name)
-            else:
-                scope_names = [m_name]
-            if scope_names[0] == "kernel" or scope_names[0] == "gamma":
-                pointer = getattr(pointer, "weight")
-            elif scope_names[0] == "output_bias" or scope_names[0] == "beta":
-                pointer = getattr(pointer, "bias")
-            elif scope_names[0] == "output_weights":
-                pointer = getattr(pointer, "weight")
-            elif scope_names[0] == "squad":
-                pointer = getattr(pointer, "classifier")
-            else:
-                try:
-                    pointer = getattr(pointer, scope_names[0])
-                except AttributeError:
-                    logger.info(f"Skipping {'/'.join(name)}")
-                    continue
-            if len(scope_names) >= 2:
-                num = int(scope_names[1])
-                pointer = pointer[num]
-        if m_name[-11:] == "_embeddings":
-            pointer = getattr(pointer, "weight")
-        elif m_name == "kernel":
-            array = np.transpose(array)
-        try:
-            if pointer.shape != array.shape:
-                raise ValueError(f"Pointer shape {pointer.shape} and array shape {array.shape} mismatched")
-        except ValueError as e:
-            e.args += (pointer.shape, array.shape)
-            raise
-        logger.info(f"Initialize PyTorch weight {name}")
-        pointer.data = torch.from_numpy(array)
-    return model
-
-
@auto_docstring
 class DummyBertPreTrainedModel(PreTrainedModel):
    config: DummyBertConfig
-    load_tf_weights = load_tf_weights_in_dummy_bert
    base_model_prefix = "dummy_bert"
    supports_gradient_checkpointing = True
    _supports_sdpa = True
@ -739,8 +652,6 @@ class DummyBertPreTrainedModel(PreTrainedModel):
    def _init_weights(self, module):
        """Initialize the weights"""
        if isinstance(module, nn.Linear):
-            # Slightly different from the TF version which uses truncated_normal for initialization
-            # cf https://github.com/pytorch/pytorch/pull/5617
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
--- a/examples/modular-transformers/modeling_from_uppercase_model.py
+++ b/examples/modular-transformers/modeling_from_uppercase_model.py
@ -12,13 +12,9 @@ from torch import nn
 from ...activations import ACT2FN
 from ...modeling_layers import GradientCheckpointingLayer
 from ...modeling_utils import ALL_ATTENTION_FUNCTIONS
-from ...utils import logging
 from .configuration_from_uppercase_model import FromUppercaseModelTextConfig, FromUppercaseModelVisionConfig


-logger = logging.get_logger(__name__)
-
-
 def eager_attention_forward(
    module: nn.Module,
    query: torch.Tensor,
@ -96,13 +92,7 @@ class FromUppercaseModelAttention(nn.Module):

        attention_interface: Callable = eager_attention_forward
        if self.config._attn_implementation != "eager":
-            if self.config._attn_implementation == "sdpa" and output_attentions:
-                logger.warning_once(
-                    "`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to "
-                    'eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
-                )
-            else:
-                attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]

        attn_output, attn_weights = attention_interface(
            self,
--- a/examples/modular-transformers/modeling_multimodal2.py
+++ b/examples/modular-transformers/modeling_multimodal2.py
@ -16,13 +16,10 @@ from ...activations import ACT2FN
 from ...modeling_layers import GradientCheckpointingLayer
 from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling
 from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
-from ...utils import auto_docstring, can_return_tuple, logging, torch_int
+from ...utils import auto_docstring, can_return_tuple, torch_int
 from .configuration_multimodal2 import Multimodal2Config, Multimodal2TextConfig, Multimodal2VisionConfig


-logger = logging.get_logger(__name__)
-
-
 def eager_attention_forward(
    module: nn.Module,
    query: torch.Tensor,
@ -100,13 +97,7 @@ class Multimodal2VisionAttention(nn.Module):

        attention_interface: Callable = eager_attention_forward
        if self.config._attn_implementation != "eager":
-            if self.config._attn_implementation == "sdpa" and output_attentions:
-                logger.warning_once(
-                    "`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to "
-                    'eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
-                )
-            else:
-                attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]

        attn_output, attn_weights = attention_interface(
            self,
@ -196,13 +187,7 @@ class Multimodal2Attention(nn.Module):

        attention_interface: Callable = eager_attention_forward
        if self.config._attn_implementation != "eager":
-            if self.config._attn_implementation == "sdpa" and output_attentions:
-                logger.warning_once(
-                    "`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to "
-                    'eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
-                )
-            else:
-                attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]

        attn_output, attn_weights = attention_interface(
            self,
--- a/examples/modular-transformers/modeling_my_new_model2.py
+++ b/examples/modular-transformers/modeling_my_new_model2.py
@ -220,7 +220,7 @@ class MyNewModel2DecoderLayer(GradientCheckpointingLayer):
        cache_position: Optional[torch.LongTensor] = None,
        position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,  # necessary, but kept here for BC
        **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor]:
+    ) -> torch.Tensor:
        residual = hidden_states
        hidden_states = self.input_layernorm(hidden_states)
        # Self Attention
--- a/examples/modular-transformers/modeling_new_task_model.py
+++ b/examples/modular-transformers/modeling_new_task_model.py
@ -10,7 +10,7 @@ from typing import ClassVar, Optional, Union
 import torch
 from torch import nn

-from ...cache_utils import Cache, HybridCache, StaticCache
+from ...cache_utils import Cache, StaticCache
 from ...generation import GenerationMixin
 from ...modeling_flash_attention_utils import FlashAttentionKwargs
 from ...modeling_outputs import BaseModelOutputWithPast
@ -93,7 +93,7 @@ class NewTaskModelPreTrainedModel(PreTrainedModel):
    _no_split_modules = ["NewTaskModelMultiModalProjector"]
    _skip_keys_device_placement = "past_key_values"

-    _can_compile_fullgraph = True
+    _can_compile_fullgraph = False
    _supports_flash_attn = True
    _supports_sdpa = True
    _supports_flex_attn = True
@ -166,8 +166,6 @@ class NewTaskModelModel(NewTaskModelPreTrainedModel):
        inputs_lead_dim, sequence_length = input_tensor.shape[:2]
        if using_static_cache:
            target_length = past_key_values.get_max_cache_shape()
-        elif isinstance(past_key_values, HybridCache):
-            target_length = past_key_values.get_max_cache_shape()
        else:
            target_length = (
                attention_mask.shape[-1]
@ -256,8 +254,8 @@ class NewTaskModelModel(NewTaskModelPreTrainedModel):
    @auto_docstring
    def forward(
        self,
-        input_ids: torch.LongTensor = None,
-        pixel_values: torch.FloatTensor = None,
+        input_ids: Optional[torch.LongTensor] = None,
+        pixel_values: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Union[list[torch.FloatTensor], Cache]] = None,
@ -505,7 +503,8 @@ class NewTaskModelForNewTask(NewTaskModelPreTrainedModel, GenerationMixin):
        if cache_position[0] == 0:
            model_inputs["pixel_values"] = pixel_values
        is_training = token_type_ids is not None and labels is not None
-        if cache_position[0] == 0 and isinstance(past_key_values, HybridCache):
+        is_static_hybrid_cache = isinstance(past_key_values, StaticCache) and any(past_key_values.is_sliding)
+        if cache_position[0] == 0 and is_static_hybrid_cache:
            input_tensor = inputs_embeds if inputs_embeds is not None else input_ids
            causal_mask = self.model._update_causal_mask(
                attention_mask, token_type_ids, past_key_values, cache_position, input_tensor, is_training
--- a/examples/modular-transformers/modeling_roberta.py
+++ b/examples/modular-transformers/modeling_roberta.py
@ -5,12 +5,10 @@
 #                          modular_roberta.py file directly. One of our CI enforces this.
 #                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
 import math
-import os
 from typing import Optional, Union

 import torch
 import torch.nn as nn
-from packaging import version

 from ...activations import ACT2FN
 from ...cache_utils import Cache, DynamicCache, EncoderDecoderCache
@ -19,7 +17,7 @@ from ...modeling_layers import GradientCheckpointingLayer
 from ...modeling_outputs import BaseModelOutputWithPastAndCrossAttentions, BaseModelOutputWithPoolingAndCrossAttentions
 from ...modeling_utils import PreTrainedModel
 from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer
-from ...utils import auto_docstring, get_torch_version, logging
+from ...utils import auto_docstring, logging
 from ...utils.deprecation import deprecate_kwarg
 from .configuration_roberta import RobertaConfig

@ -38,8 +36,6 @@ class RobertaEmbeddings(nn.Module):
        )
        self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)

-        # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
-        # any TensorFlow checkpoint file
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        # position_ids (1, len position emb) is contiguous in memory and exported when serialized
@ -231,7 +227,6 @@ class RobertaSdpaSelfAttention(RobertaSelfAttention):
    def __init__(self, config, position_embedding_type=None, layer_idx=None):
        super().__init__(config, position_embedding_type=position_embedding_type, layer_idx=layer_idx)
        self.dropout_prob = config.attention_probs_dropout_prob
-        self.require_contiguous_qkv = version.parse(get_torch_version()) < version.parse("2.2.0")

    # Adapted from RobertaSelfAttention
    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
@ -311,14 +306,6 @@ class RobertaSdpaSelfAttention(RobertaSelfAttention):
                if is_cross_attention and isinstance(past_key_values, EncoderDecoderCache):
                    past_key_values.is_updated[self.layer_idx] = True

-        # SDPA with memory-efficient backend is broken in torch==2.1.2 when using non-contiguous inputs and a custom
-        # attn_mask, so we need to call `.contiguous()` here. This was fixed in torch==2.2.0.
-        # Reference: https://github.com/pytorch/pytorch/issues/112577
-        if self.require_contiguous_qkv and query_layer.device.type == "cuda" and attention_mask is not None:
-            query_layer = query_layer.contiguous()
-            key_layer = key_layer.contiguous()
-            value_layer = value_layer.contiguous()
-
        # We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` if statement instead of an inline conditional assignment
        # in SDPA to support both torch.compile's dynamic shapes and full graph options. An inline conditional prevents dynamic shapes from compiling.
        # The tgt_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create
@ -658,83 +645,9 @@ class RobertaLMPredictionHead(nn.Module):
        return hidden_states


-def load_tf_weights_in_roberta(model, config, tf_checkpoint_path):
-    """Load tf checkpoints in a pytorch model."""
-    try:
-        import re
-
-        import numpy as np
-        import tensorflow as tf
-    except ImportError:
-        logger.error(
-            "Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see "
-            "https://www.tensorflow.org/install/ for installation instructions."
-        )
-        raise
-    tf_path = os.path.abspath(tf_checkpoint_path)
-    logger.info(f"Converting TensorFlow checkpoint from {tf_path}")
-    # Load weights from TF model
-    init_vars = tf.train.list_variables(tf_path)
-    names = []
-    arrays = []
-    for name, shape in init_vars:
-        logger.info(f"Loading TF weight {name} with shape {shape}")
-        array = tf.train.load_variable(tf_path, name)
-        names.append(name)
-        arrays.append(array)
-
-    for name, array in zip(names, arrays):
-        name = name.split("/")
-        # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v
-        # which are not required for using pretrained model
-        if any(
-            n in ["adam_v", "adam_m", "AdamWeightDecayOptimizer", "AdamWeightDecayOptimizer_1", "global_step"]
-            for n in name
-        ):
-            logger.info(f"Skipping {'/'.join(name)}")
-            continue
-        pointer = model
-        for m_name in name:
-            if re.fullmatch(r"[A-Za-z]+_\d+", m_name):
-                scope_names = re.split(r"_(\d+)", m_name)
-            else:
-                scope_names = [m_name]
-            if scope_names[0] == "kernel" or scope_names[0] == "gamma":
-                pointer = getattr(pointer, "weight")
-            elif scope_names[0] == "output_bias" or scope_names[0] == "beta":
-                pointer = getattr(pointer, "bias")
-            elif scope_names[0] == "output_weights":
-                pointer = getattr(pointer, "weight")
-            elif scope_names[0] == "squad":
-                pointer = getattr(pointer, "classifier")
-            else:
-                try:
-                    pointer = getattr(pointer, scope_names[0])
-                except AttributeError:
-                    logger.info(f"Skipping {'/'.join(name)}")
-                    continue
-            if len(scope_names) >= 2:
-                num = int(scope_names[1])
-                pointer = pointer[num]
-        if m_name[-11:] == "_embeddings":
-            pointer = getattr(pointer, "weight")
-        elif m_name == "kernel":
-            array = np.transpose(array)
-        try:
-            if pointer.shape != array.shape:
-                raise ValueError(f"Pointer shape {pointer.shape} and array shape {array.shape} mismatched")
-        except ValueError as e:
-            e.args += (pointer.shape, array.shape)
-            raise
-        logger.info(f"Initialize PyTorch weight {name}")
-        pointer.data = torch.from_numpy(array)
-    return model
-
-
@auto_docstring
 class RobertaPreTrainedModel(PreTrainedModel):
    config: RobertaConfig
-    load_tf_weights = load_tf_weights_in_roberta
    base_model_prefix = "roberta"
    supports_gradient_checkpointing = True
    _supports_sdpa = True
@ -742,8 +655,6 @@ class RobertaPreTrainedModel(PreTrainedModel):
    def _init_weights(self, module):
        """Initialize the weights"""
        if isinstance(module, nn.Linear):
-            # Slightly different from the TF version which uses truncated_normal for initialization
-            # cf https://github.com/pytorch/pytorch/pull/5617
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
--- a/examples/modular-transformers/modeling_super.py
+++ b/examples/modular-transformers/modeling_super.py
@ -46,6 +46,8 @@ class SuperRMSNorm(nn.Module):


 class SuperRotaryEmbedding(nn.Module):
+    inv_freq: torch.Tensor  # fix linting for `register_buffer`
+
    def __init__(self, config: SuperConfig, device=None):
        super().__init__()
        # BC: "rope_type" was originally "type"
@ -260,7 +262,7 @@ class SuperDecoderLayer(GradientCheckpointingLayer):
        cache_position: Optional[torch.LongTensor] = None,
        position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,  # necessary, but kept here for BC
        **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor]:
+    ) -> torch.Tensor:
        residual = hidden_states
        hidden_states = self.input_layernorm(hidden_states)
        # Self Attention
--- a/examples/modular-transformers/modeling_test_detr.py
+++ b/examples/modular-transformers/modeling_test_detr.py
@ -846,8 +846,6 @@ class TestDetrPreTrainedModel(PreTrainedModel):
            nn.init.xavier_uniform_(module.output_proj.weight.data)
            nn.init.constant_(module.output_proj.bias.data, 0.0)
        elif isinstance(module, (nn.Linear, nn.Conv2d, nn.BatchNorm2d)):
-            # Slightly different from the TF version which uses truncated_normal for initialization
-            # cf https://github.com/pytorch/pytorch/pull/5617
            module.weight.data.normal_(mean=0.0, std=std)
            if module.bias is not None:
                module.bias.data.zero_()
--- a/examples/pytorch/question-answering/utils_qa.py
+++ b/examples/pytorch/question-answering/utils_qa.py
@ -185,7 +185,7 @@ def postprocess_qa_predictions(
        if len(predictions) == 0 or (len(predictions) == 1 and predictions[0]["text"] == ""):
            predictions.insert(0, {"text": "empty", "start_logit": 0.0, "end_logit": 0.0, "score": 0.0})

-        # Compute the softmax of all scores (we do it with numpy to stay independent from torch/tf in this file, using
+        # Compute the softmax of all scores (we do it with numpy to stay independent from torch in this file, using
        # the LogSumExp trick).
        scores = np.array([pred.pop("score") for pred in predictions])
        exp_scores = np.exp(scores - np.max(scores))
@ -392,7 +392,7 @@ def postprocess_qa_predictions_with_beam_search(
            min_null_score = -2e-6
            predictions.insert(0, {"text": "", "start_logit": -1e-6, "end_logit": -1e-6, "score": min_null_score})

-        # Compute the softmax of all scores (we do it with numpy to stay independent from torch/tf in this file, using
+        # Compute the softmax of all scores (we do it with numpy to stay independent from torch in this file, using
        # the LogSumExp trick).
        scores = np.array([pred.pop("score") for pred in predictions])
        exp_scores = np.exp(scores - np.max(scores))
--- a/setup.py
+++ b/setup.py
@ -109,7 +109,6 @@ _deps = [
    "faiss-cpu",
    "fastapi",
    "filelock",
-    "flax>=0.4.1,<=0.7.0",
    "ftfy",
    "fugashi>=1.0",
    "GitPython<3.1.19",
@ -118,13 +117,8 @@ _deps = [
    "huggingface-hub>=0.34.0,<1.0",
    "importlib_metadata",
    "ipadic>=1.0.0,<2.0",
-    "jax>=0.4.1,<=0.4.13",
-    "jaxlib>=0.4.1,<=0.4.13",
    "jinja2>=3.1.0",
    "kenlm",
-    # Keras pin - this is to make sure Keras 3 doesn't destroy us. Remove or change when we have proper support.
-    "keras>2.9,<2.16",
-    "keras-nlp>=0.3.1,<0.14.0",  # keras-nlp 0.14 doesn't support keras 2, see pin on keras.
    "kernels>=0.6.1,<=0.9",
    "librosa",
    "natten>=0.14.6,<0.15.0",
@ -170,19 +164,13 @@ _deps = [
    "sagemaker>=2.31.0",
    "schedulefree>=1.2.6",
    "scikit-learn",
-    "scipy<1.13.0",  # SciPy >= 1.13.0 is not supported with the current jax pin (`jax>=0.4.1,<=0.4.13`)
+    "scipy",
    "sentencepiece>=0.1.91,!=0.1.92",
    "sigopt",
    "starlette",
    "sudachipy>=0.6.6",
    "sudachidict_core>=20220729",
    "tensorboard",
-    # TensorFlow pin. When changing this value, update examples/tensorflow/_tests_requirements.txt accordingly
-    "tensorflow-cpu>2.9,<2.16",
-    "tensorflow>2.9,<2.16",
-    "tensorflow-text<2.16",
-    "tensorflow-probability<0.24",
-    "tf2onnx",
    "timeout-decorator",
    "tiktoken",
    "timm<=1.0.19,!=1.0.18",
@ -273,32 +261,19 @@ extras = {}
 extras["ja"] = deps_list("fugashi", "ipadic", "unidic_lite", "unidic", "sudachipy", "sudachidict_core", "rhoknp")
 extras["sklearn"] = deps_list("scikit-learn")

-extras["tf"] = deps_list("tensorflow", "onnxconverter-common", "tf2onnx", "tensorflow-text", "keras-nlp")
-extras["tf-cpu"] = deps_list(
-    "keras",
-    "tensorflow-cpu",
-    "onnxconverter-common",
-    "tf2onnx",
-    "tensorflow-text",
-    "keras-nlp",
-    "tensorflow-probability",
-)
-
 extras["torch"] = deps_list("torch", "accelerate")
 extras["accelerate"] = deps_list("accelerate")
 extras["hf_xet"] = deps_list("hf_xet")

 if os.name == "nt":  # windows
    extras["retrieval"] = deps_list("datasets")  # faiss is not supported on windows
-    extras["flax"] = []  # jax is not supported on windows
 else:
    extras["retrieval"] = deps_list("faiss-cpu", "datasets")
-    extras["flax"] = deps_list("jax", "jaxlib", "flax", "optax", "scipy")

 extras["tokenizers"] = deps_list("tokenizers")
 extras["ftfy"] = deps_list("ftfy")
 extras["onnxruntime"] = deps_list("onnxruntime", "onnxruntime-tools")
-extras["onnx"] = deps_list("onnxconverter-common", "tf2onnx") + extras["onnxruntime"]
+extras["onnx"] = deps_list("onnxconverter-common") + extras["onnxruntime"]
 extras["modelcreation"] = deps_list("cookiecutter")

 extras["sagemaker"] = deps_list("sagemaker")
@ -320,8 +295,6 @@ extras["audio"] = deps_list(
 # `pip install ".[speech]"` is deprecated and `pip install ".[torch-speech]"` should be used instead
 extras["speech"] = deps_list("torchaudio") + extras["audio"]
 extras["torch-speech"] = deps_list("torchaudio") + extras["audio"]
-extras["tf-speech"] = extras["audio"]
-extras["flax-speech"] = extras["audio"]
 extras["vision"] = deps_list("Pillow")
 extras["timm"] = deps_list("timm")
 extras["torch-vision"] = deps_list("torchvision") + extras["vision"]
@ -372,9 +345,7 @@ extras["ruff"] = deps_list("ruff")
 extras["quality"] = deps_list("datasets", "ruff", "GitPython", "urllib3", "libcst", "rich", "pandas")

 extras["all"] = (
-    extras["tf"]
-    + extras["torch"]
-    + extras["flax"]
+    extras["torch"]
    + extras["sentencepiece"]
    + extras["tokenizers"]
    + extras["torch-speech"]
@ -409,18 +380,7 @@ extras["dev-torch"] = (
    + extras["onnxruntime"]
    + extras["num2words"]
 )
-extras["dev-tensorflow"] = (
-    extras["testing"]
-    + extras["tf"]
-    + extras["sentencepiece"]
-    + extras["tokenizers"]
-    + extras["vision"]
-    + extras["quality"]
-    + extras["sklearn"]
-    + extras["modelcreation"]
-    + extras["onnx"]
-    + extras["tf-speech"]
-)
+
 extras["dev"] = (
    extras["all"] + extras["testing"] + extras["quality"] + extras["ja"] + extras["sklearn"] + extras["modelcreation"]
 )
@ -464,10 +424,10 @@ setup(
    version="4.57.0.dev0",  # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots)
    author="The Hugging Face team (past and future) with the help of all our contributors (https://github.com/huggingface/transformers/graphs/contributors)",
    author_email="transformers@huggingface.co",
-    description="State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow",
+    description="Transformers: the model-definition framework for state-of-the-art machine learning models in text, vision, audio, and multimodal models, for both inference and training.",
    long_description=open("README.md", "r", encoding="utf-8").read(),
    long_description_content_type="text/markdown",
-    keywords="NLP vision speech deep learning transformer pytorch tensorflow jax BERT GPT-2 Wav2Vec2 ViT",
+    keywords="machine-learning nlp python pytorch transformer llm vlm deep-learning inference training model-hub pretrained-models llama gemma qwen",
    license="Apache 2.0 License",
    url="https://github.com/huggingface/transformers",
    package_dir={"": "src"},
@ -503,14 +463,10 @@ setup(
 )

 extras["tests_torch"] = deps_list()
-extras["tests_tf"] = deps_list()
-extras["tests_flax"] = deps_list()
 extras["tests_hub"] = deps_list()
 extras["tests_pipelines_torch"] = deps_list()
-extras["tests_pipelines_tf"] = deps_list()
 extras["tests_onnx"] = deps_list()
 extras["tests_examples_torch"] = deps_list()
-extras["tests_examples_tf"] = deps_list()
 extras["tests_custom_tokenizers"] = deps_list()
 extras["tests_exotic_models"] = deps_list()
 extras["consistency"] = deps_list()
--- a/src/transformers/init.py
+++ b/src/transformers/init.py
@ -40,13 +40,9 @@ from .utils import (
 # so that mypy, pylint or other static linters can recognize them,
 # given that they are not exported using `__all__` in this file.
 from .utils import is_bitsandbytes_available as is_bitsandbytes_available
-from .utils import is_flax_available as is_flax_available
-from .utils import is_keras_nlp_available as is_keras_nlp_available
 from .utils import is_scipy_available as is_scipy_available
 from .utils import is_sentencepiece_available as is_sentencepiece_available
 from .utils import is_speech_available as is_speech_available
-from .utils import is_tensorflow_text_available as is_tensorflow_text_available
-from .utils import is_tf_available as is_tf_available
 from .utils import is_timm_available as is_timm_available
 from .utils import is_tokenizers_available as is_tokenizers_available
 from .utils import is_torch_available as is_torch_available
@ -64,9 +60,7 @@ _import_structure = {
    "audio_utils": [],
    "commands": [],
    "configuration_utils": ["PretrainedConfig"],
-    "convert_graph_to_onnx": [],
    "convert_slow_tokenizers_checkpoints_to_fast": [],
-    "convert_tf_hub_seq_to_seq_bert_to_pytorch": [],
    "data": [
        "DataProcessor",
        "InputExample",
@ -137,16 +131,6 @@ _import_structure = {
    ],
    "loss": [],
    "modelcard": ["ModelCard"],
-    # Losses
-    "modeling_tf_pytorch_utils": [
-        "convert_tf_weight_name_to_pt_weight_name",
-        "load_pytorch_checkpoint_in_tf2_model",
-        "load_pytorch_model_in_tf2_model",
-        "load_pytorch_weights_in_tf2_model",
-        "load_tf2_checkpoint_in_pytorch_model",
-        "load_tf2_model_in_pytorch_model",
-        "load_tf2_weights_in_pytorch_model",
-    ],
    # Models
    "onnx": [],
    "pipelines": [
@ -218,15 +202,12 @@ _import_structure = {
    ],
    "training_args": ["TrainingArguments"],
    "training_args_seq2seq": ["Seq2SeqTrainingArguments"],
-    "training_args_tf": ["TFTrainingArguments"],
    "utils": [
        "CONFIG_NAME",
        "MODEL_CARD_NAME",
        "PYTORCH_PRETRAINED_BERT_CACHE",
        "PYTORCH_TRANSFORMERS_CACHE",
        "SPIECE_UNDERLINE",
-        "TF2_WEIGHTS_NAME",
-        "TF_WEIGHTS_NAME",
        "TRANSFORMERS_CACHE",
        "WEIGHTS_NAME",
        "TensorType",
@ -237,8 +218,6 @@ _import_structure = {
        "is_bitsandbytes_available",
        "is_datasets_available",
        "is_faiss_available",
-        "is_flax_available",
-        "is_keras_nlp_available",
        "is_matplotlib_available",
        "is_mlx_available",
        "is_phonemizer_available",
@ -251,8 +230,6 @@ _import_structure = {
        "is_sentencepiece_available",
        "is_sklearn_available",
        "is_speech_available",
-        "is_tensorflow_text_available",
-        "is_tf_available",
        "is_timm_available",
        "is_tokenizers_available",
        "is_torch_available",
@ -501,84 +478,6 @@ else:
    _import_structure["trainer_pt_utils"] = ["torch_distributed_zero_first"]
    _import_structure["trainer_seq2seq"] = ["Seq2SeqTrainer"]

-# TensorFlow-backed objects
-try:
-    if not is_tf_available():
-        raise OptionalDependencyNotAvailable()
-except OptionalDependencyNotAvailable:
-    from .utils import dummy_tf_objects
-
-    _import_structure["utils.dummy_tf_objects"] = [name for name in dir(dummy_tf_objects) if not name.startswith("_")]
-else:
-    _import_structure["activations_tf"] = []
-    _import_structure["generation"].extend(
-        [
-            "TFForcedBOSTokenLogitsProcessor",
-            "TFForcedEOSTokenLogitsProcessor",
-            "TFForceTokensLogitsProcessor",
-            "TFGenerationMixin",
-            "TFLogitsProcessor",
-            "TFLogitsProcessorList",
-            "TFLogitsWarper",
-            "TFMinLengthLogitsProcessor",
-            "TFNoBadWordsLogitsProcessor",
-            "TFNoRepeatNGramLogitsProcessor",
-            "TFRepetitionPenaltyLogitsProcessor",
-            "TFSuppressTokensAtBeginLogitsProcessor",
-            "TFSuppressTokensLogitsProcessor",
-            "TFTemperatureLogitsWarper",
-            "TFTopKLogitsWarper",
-            "TFTopPLogitsWarper",
-        ]
-    )
-    _import_structure["keras_callbacks"] = ["KerasMetricCallback", "PushToHubCallback"]
-    _import_structure["modeling_tf_outputs"] = []
-    _import_structure["modeling_tf_utils"] = [
-        "TFPreTrainedModel",
-        "TFSequenceSummary",
-        "TFSharedEmbeddings",
-        "shape_list",
-    ]
-    _import_structure["optimization_tf"] = [
-        "AdamWeightDecay",
-        "GradientAccumulator",
-        "WarmUp",
-        "create_optimizer",
-    ]
-    _import_structure["tf_utils"] = []
-
-
-# FLAX-backed objects
-try:
-    if not is_flax_available():
-        raise OptionalDependencyNotAvailable()
-except OptionalDependencyNotAvailable:
-    from .utils import dummy_flax_objects
-
-    _import_structure["utils.dummy_flax_objects"] = [
-        name for name in dir(dummy_flax_objects) if not name.startswith("_")
-    ]
-else:
-    _import_structure["generation"].extend(
-        [
-            "FlaxForcedBOSTokenLogitsProcessor",
-            "FlaxForcedEOSTokenLogitsProcessor",
-            "FlaxForceTokensLogitsProcessor",
-            "FlaxGenerationMixin",
-            "FlaxLogitsProcessor",
-            "FlaxLogitsProcessorList",
-            "FlaxLogitsWarper",
-            "FlaxMinLengthLogitsProcessor",
-            "FlaxTemperatureLogitsWarper",
-            "FlaxSuppressTokensAtBeginLogitsProcessor",
-            "FlaxSuppressTokensLogitsProcessor",
-            "FlaxTopKLogitsWarper",
-            "FlaxTopPLogitsWarper",
-            "FlaxWhisperTimeStampLogitsProcessor",
-        ]
-    )
-    _import_structure["modeling_flax_outputs"] = []
-    _import_structure["modeling_flax_utils"] = ["FlaxPreTrainedModel"]

 # Direct imports for type-checking
 if TYPE_CHECKING:
@ -672,20 +571,6 @@ if TYPE_CHECKING:
    from .generation import EpsilonLogitsWarper as EpsilonLogitsWarper
    from .generation import EtaLogitsWarper as EtaLogitsWarper
    from .generation import ExponentialDecayLengthPenalty as ExponentialDecayLengthPenalty
-    from .generation import FlaxForcedBOSTokenLogitsProcessor as FlaxForcedBOSTokenLogitsProcessor
-    from .generation import FlaxForcedEOSTokenLogitsProcessor as FlaxForcedEOSTokenLogitsProcessor
-    from .generation import FlaxForceTokensLogitsProcessor as FlaxForceTokensLogitsProcessor
-    from .generation import FlaxGenerationMixin as FlaxGenerationMixin
-    from .generation import FlaxLogitsProcessor as FlaxLogitsProcessor
-    from .generation import FlaxLogitsProcessorList as FlaxLogitsProcessorList
-    from .generation import FlaxLogitsWarper as FlaxLogitsWarper
-    from .generation import FlaxMinLengthLogitsProcessor as FlaxMinLengthLogitsProcessor
-    from .generation import FlaxSuppressTokensAtBeginLogitsProcessor as FlaxSuppressTokensAtBeginLogitsProcessor
-    from .generation import FlaxSuppressTokensLogitsProcessor as FlaxSuppressTokensLogitsProcessor
-    from .generation import FlaxTemperatureLogitsWarper as FlaxTemperatureLogitsWarper
-    from .generation import FlaxTopKLogitsWarper as FlaxTopKLogitsWarper
-    from .generation import FlaxTopPLogitsWarper as FlaxTopPLogitsWarper
-    from .generation import FlaxWhisperTimeStampLogitsProcessor as FlaxWhisperTimeStampLogitsProcessor
    from .generation import ForcedBOSTokenLogitsProcessor as ForcedBOSTokenLogitsProcessor
    from .generation import ForcedEOSTokenLogitsProcessor as ForcedEOSTokenLogitsProcessor
    from .generation import GenerationConfig as GenerationConfig
@ -716,22 +601,6 @@ if TYPE_CHECKING:
    from .generation import TemperatureLogitsWarper as TemperatureLogitsWarper
    from .generation import TextIteratorStreamer as TextIteratorStreamer
    from .generation import TextStreamer as TextStreamer
-    from .generation import TFForcedBOSTokenLogitsProcessor as TFForcedBOSTokenLogitsProcessor
-    from .generation import TFForcedEOSTokenLogitsProcessor as TFForcedEOSTokenLogitsProcessor
-    from .generation import TFForceTokensLogitsProcessor as TFForceTokensLogitsProcessor
-    from .generation import TFGenerationMixin as TFGenerationMixin
-    from .generation import TFLogitsProcessor as TFLogitsProcessor
-    from .generation import TFLogitsProcessorList as TFLogitsProcessorList
-    from .generation import TFLogitsWarper as TFLogitsWarper
-    from .generation import TFMinLengthLogitsProcessor as TFMinLengthLogitsProcessor
-    from .generation import TFNoBadWordsLogitsProcessor as TFNoBadWordsLogitsProcessor
-    from .generation import TFNoRepeatNGramLogitsProcessor as TFNoRepeatNGramLogitsProcessor
-    from .generation import TFRepetitionPenaltyLogitsProcessor as TFRepetitionPenaltyLogitsProcessor
-    from .generation import TFSuppressTokensAtBeginLogitsProcessor as TFSuppressTokensAtBeginLogitsProcessor
-    from .generation import TFSuppressTokensLogitsProcessor as TFSuppressTokensLogitsProcessor
-    from .generation import TFTemperatureLogitsWarper as TFTemperatureLogitsWarper
-    from .generation import TFTopKLogitsWarper as TFTopKLogitsWarper
-    from .generation import TFTopPLogitsWarper as TFTopPLogitsWarper
    from .generation import TopKLogitsWarper as TopKLogitsWarper
    from .generation import TopPLogitsWarper as TopPLogitsWarper
    from .generation import TypicalLogitsWarper as TypicalLogitsWarper
@ -763,32 +632,14 @@ if TYPE_CHECKING:
    from .integrations import is_wandb_available as is_wandb_available
    from .integrations.executorch import TorchExportableModuleWithStaticCache as TorchExportableModuleWithStaticCache
    from .integrations.executorch import convert_and_export_with_cache as convert_and_export_with_cache
-    from .keras_callbacks import KerasMetricCallback as KerasMetricCallback
-    from .keras_callbacks import PushToHubCallback as PushToHubCallback
    from .masking_utils import AttentionMaskInterface as AttentionMaskInterface
    from .model_debugging_utils import model_addition_debugger_context as model_addition_debugger_context

    # Model Cards
    from .modelcard import ModelCard as ModelCard
-    from .modeling_flax_utils import FlaxPreTrainedModel as FlaxPreTrainedModel
    from .modeling_layers import GradientCheckpointingLayer as GradientCheckpointingLayer
    from .modeling_rope_utils import ROPE_INIT_FUNCTIONS as ROPE_INIT_FUNCTIONS
    from .modeling_rope_utils import dynamic_rope_update as dynamic_rope_update
-
-    # TF 2.0 <=> PyTorch conversion utilities
-    from .modeling_tf_pytorch_utils import (
-        convert_tf_weight_name_to_pt_weight_name as convert_tf_weight_name_to_pt_weight_name,
-    )
-    from .modeling_tf_pytorch_utils import load_pytorch_checkpoint_in_tf2_model as load_pytorch_checkpoint_in_tf2_model
-    from .modeling_tf_pytorch_utils import load_pytorch_model_in_tf2_model as load_pytorch_model_in_tf2_model
-    from .modeling_tf_pytorch_utils import load_pytorch_weights_in_tf2_model as load_pytorch_weights_in_tf2_model
-    from .modeling_tf_pytorch_utils import load_tf2_checkpoint_in_pytorch_model as load_tf2_checkpoint_in_pytorch_model
-    from .modeling_tf_pytorch_utils import load_tf2_model_in_pytorch_model as load_tf2_model_in_pytorch_model
-    from .modeling_tf_pytorch_utils import load_tf2_weights_in_pytorch_model as load_tf2_weights_in_pytorch_model
-    from .modeling_tf_utils import TFPreTrainedModel as TFPreTrainedModel
-    from .modeling_tf_utils import TFSequenceSummary as TFSequenceSummary
-    from .modeling_tf_utils import TFSharedEmbeddings as TFSharedEmbeddings
-    from .modeling_tf_utils import shape_list as shape_list
    from .modeling_utils import AttentionInterface as AttentionInterface
    from .modeling_utils import PreTrainedModel as PreTrainedModel
    from .models import *
@ -815,12 +666,6 @@ if TYPE_CHECKING:
    from .optimization import get_scheduler as get_scheduler
    from .optimization import get_wsd_schedule as get_wsd_schedule

-    # Optimization
-    from .optimization_tf import AdamWeightDecay as AdamWeightDecay
-    from .optimization_tf import GradientAccumulator as GradientAccumulator
-    from .optimization_tf import WarmUp as WarmUp
-    from .optimization_tf import create_optimizer as create_optimizer
-
    # Pipelines
    from .pipelines import AudioClassificationPipeline as AudioClassificationPipeline
    from .pipelines import AutomaticSpeechRecognitionPipeline as AutomaticSpeechRecognitionPipeline
@ -894,7 +739,6 @@ if TYPE_CHECKING:
    from .trainer_utils import set_seed as set_seed
    from .training_args import TrainingArguments as TrainingArguments
    from .training_args_seq2seq import Seq2SeqTrainingArguments as Seq2SeqTrainingArguments
-    from .training_args_tf import TFTrainingArguments as TFTrainingArguments

    # Files and general utilities
    from .utils import CONFIG_NAME as CONFIG_NAME
@ -902,8 +746,6 @@ if TYPE_CHECKING:
    from .utils import PYTORCH_PRETRAINED_BERT_CACHE as PYTORCH_PRETRAINED_BERT_CACHE
    from .utils import PYTORCH_TRANSFORMERS_CACHE as PYTORCH_TRANSFORMERS_CACHE
    from .utils import SPIECE_UNDERLINE as SPIECE_UNDERLINE
-    from .utils import TF2_WEIGHTS_NAME as TF2_WEIGHTS_NAME
-    from .utils import TF_WEIGHTS_NAME as TF_WEIGHTS_NAME
    from .utils import TRANSFORMERS_CACHE as TRANSFORMERS_CACHE
    from .utils import WEIGHTS_NAME as WEIGHTS_NAME
    from .utils import TensorType as TensorType
@ -968,9 +810,7 @@ else:
    )


-if not is_tf_available() and not is_torch_available() and not is_flax_available():
+if not is_torch_available():
    logger.warning_advice(
-        "None of PyTorch, TensorFlow >= 2.0, or Flax have been found. "
-        "Models won't be available and only tokenizers, configuration "
-        "and file/data utilities can be used."
+        "PyTorch was not found. Models won't be available and only tokenizers, configuration and file/data utilities can be used."
    )
--- a/src/transformers/activations_tf.py
+++ b/src/transformers/activations_tf.py
@ -1,147 +0,0 @@
-# Copyright 2020 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import math
-
-import tensorflow as tf
-from packaging.version import parse
-
-
-try:
-    import tf_keras as keras
-except (ModuleNotFoundError, ImportError):
-    import keras
-
-    if parse(keras.__version__).major > 2:
-        raise ValueError(
-            "Your currently installed version of Keras is Keras 3, but this is not yet supported in "
-            "Transformers. Please install the backwards-compatible tf-keras package with "
-            "`pip install tf-keras`."
-        )
-
-
-def _gelu(x):
-    """
-    Gaussian Error Linear Unit. Original Implementation of the gelu activation function in Google Bert repo when
-    initially created. For information: OpenAI GPT's gelu is slightly different (and gives slightly different results):
-    0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))) Also see
-    https://huggingface.co/papers/1606.08415
-    """
-    x = tf.convert_to_tensor(x)
-    cdf = 0.5 * (1.0 + tf.math.erf(x / tf.cast(tf.sqrt(2.0), x.dtype)))
-
-    return x * cdf
-
-
-def _gelu_new(x):
-    """
-    Gaussian Error Linear Unit. This is a smoother version of the GELU. Original paper: https://huggingface.co/papers/1606.0841
-
-    Args:
-        x: float Tensor to perform activation
-
-    Returns:
-        `x` with the GELU activation applied.
-    """
-    x = tf.convert_to_tensor(x)
-    pi = tf.cast(math.pi, x.dtype)
-    coeff = tf.cast(0.044715, x.dtype)
-    cdf = 0.5 * (1.0 + tf.tanh(tf.sqrt(2.0 / pi) * (x + coeff * tf.pow(x, 3))))
-
-    return x * cdf
-
-
-def mish(x):
-    x = tf.convert_to_tensor(x)
-
-    return x * tf.tanh(tf.math.softplus(x))
-
-
-def gelu_fast(x):
-    x = tf.convert_to_tensor(x)
-    coeff1 = tf.cast(0.044715, x.dtype)
-    coeff2 = tf.cast(0.7978845608, x.dtype)
-
-    return 0.5 * x * (1.0 + tf.tanh(x * coeff2 * (1.0 + coeff1 * x * x)))
-
-
-def quick_gelu(x):
-    x = tf.convert_to_tensor(x)
-    coeff = tf.cast(1.702, x.dtype)
-    return x * tf.math.sigmoid(coeff * x)
-
-
-def gelu_10(x):
-    """
-    Clip the range of possible GeLU outputs between [-10, 10]. This is especially useful for quantization purpose, as
-    it allows mapping 2 negatives values in the GeLU spectrum. For more information on this trick, please refer to
-    https://huggingface.co/papers/2004.09602
-
-    Gaussian Error Linear Unit. Original Implementation of the gelu activation function in Google Bert repo when
-    initially created. For information: OpenAI GPT's gelu is slightly different (and gives slightly different results):
-    0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))) Also see
-    https://huggingface.co/papers/1606.08415 :param x: :return:
-    """
-    return tf.clip_by_value(_gelu(x), -10, 10)
-
-
-def glu(x, axis=-1):
-    """
-    Gated Linear Unit. Implementation as defined in the original paper (see https://huggingface.co/papers/1612.08083), where
-    the input `x` is split in two halves across a dimension (`axis`), A and B, returning A * sigmoid(B).
-
-    Args:
-        `x`: float Tensor to perform activation
-        `axis`: dimension across which `x` be split in half
-
-    Returns:
-        `x` with the GLU activation applied (with its size halved across the dimension `axis`).
-    """
-    a, b = tf.split(x, 2, axis=axis)
-    return a * tf.math.sigmoid(b)
-
-
-if parse(tf.version.VERSION) >= parse("2.4"):
-
-    def approximate_gelu_wrap(x):
-        return keras.activations.gelu(x, approximate=True)
-
-    gelu = keras.activations.gelu
-    gelu_new = approximate_gelu_wrap
-else:
-    gelu = _gelu
-    gelu_new = _gelu_new
-
-
-ACT2FN = {
-    "gelu": gelu,
-    "gelu_10": gelu_10,
-    "gelu_fast": gelu_fast,
-    "gelu_new": gelu_new,
-    "glu": glu,
-    "mish": mish,
-    "quick_gelu": quick_gelu,
-    "relu": keras.activations.relu,
-    "sigmoid": keras.activations.sigmoid,
-    "silu": keras.activations.swish,
-    "swish": keras.activations.swish,
-    "tanh": keras.activations.tanh,
-}
-
-
-def get_tf_activation(activation_string):
-    if activation_string in ACT2FN:
-        return ACT2FN[activation_string]
-    else:
-        raise KeyError(f"function {activation_string} not found in ACT2FN mapping {list(ACT2FN.keys())}")
--- a/src/transformers/commands/convert.py
+++ b/src/transformers/commands/convert.py
@ -1,165 +0,0 @@
-# Copyright 2020 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from argparse import ArgumentParser, Namespace
-
-from ..utils import logging
-from . import BaseTransformersCLICommand
-
-
-def convert_command_factory(args: Namespace):
-    """
-    Factory function used to convert a model TF 1.0 checkpoint in a PyTorch checkpoint.
-
-    Returns: ServeCommand
-    """
-    return ConvertCommand(
-        args.model_type, args.tf_checkpoint, args.pytorch_dump_output, args.config, args.finetuning_task_name
-    )
-
-
-IMPORT_ERROR_MESSAGE = """
-transformers can only be used from the commandline to convert TensorFlow models in PyTorch, In that case, it requires
-TensorFlow to be installed. Please see https://www.tensorflow.org/install/ for installation instructions.
-"""
-
-
-class ConvertCommand(BaseTransformersCLICommand):
-    @staticmethod
-    def register_subcommand(parser: ArgumentParser):
-        """
-        Register this command to argparse so it's available for the transformer-cli
-
-        Args:
-            parser: Root parser to register command-specific arguments
-        """
-        train_parser = parser.add_parser(
-            "convert",
-            help="CLI tool to run convert model from original author checkpoints to Transformers PyTorch checkpoints.",
-        )
-        train_parser.add_argument("--model_type", type=str, required=True, help="Model's type.")
-        train_parser.add_argument(
-            "--tf_checkpoint", type=str, required=True, help="TensorFlow checkpoint path or folder."
-        )
-        train_parser.add_argument(
-            "--pytorch_dump_output", type=str, required=True, help="Path to the PyTorch saved model output."
-        )
-        train_parser.add_argument("--config", type=str, default="", help="Configuration file path or folder.")
-        train_parser.add_argument(
-            "--finetuning_task_name",
-            type=str,
-            default=None,
-            help="Optional fine-tuning task name if the TF model was a finetuned model.",
-        )
-        train_parser.set_defaults(func=convert_command_factory)
-
-    def __init__(
-        self,
-        model_type: str,
-        tf_checkpoint: str,
-        pytorch_dump_output: str,
-        config: str,
-        finetuning_task_name: str,
-        *args,
-    ):
-        self._logger = logging.get_logger("transformers/converting")
-
-        self._logger.info(f"Loading model {model_type}")
-        self._model_type = model_type
-        self._tf_checkpoint = tf_checkpoint
-        self._pytorch_dump_output = pytorch_dump_output
-        self._config = config
-        self._finetuning_task_name = finetuning_task_name
-
-    def run(self):
-        if self._model_type == "albert":
-            try:
-                from ..models.albert.convert_albert_original_tf_checkpoint_to_pytorch import (
-                    convert_tf_checkpoint_to_pytorch,
-                )
-            except ImportError:
-                raise ImportError(IMPORT_ERROR_MESSAGE)
-
-            convert_tf_checkpoint_to_pytorch(self._tf_checkpoint, self._config, self._pytorch_dump_output)
-        elif self._model_type == "bert":
-            try:
-                from ..models.bert.convert_bert_original_tf_checkpoint_to_pytorch import (
-                    convert_tf_checkpoint_to_pytorch,
-                )
-            except ImportError:
-                raise ImportError(IMPORT_ERROR_MESSAGE)
-
-            convert_tf_checkpoint_to_pytorch(self._tf_checkpoint, self._config, self._pytorch_dump_output)
-        elif self._model_type == "funnel":
-            try:
-                from ..models.funnel.convert_funnel_original_tf_checkpoint_to_pytorch import (
-                    convert_tf_checkpoint_to_pytorch,
-                )
-            except ImportError:
-                raise ImportError(IMPORT_ERROR_MESSAGE)
-
-            convert_tf_checkpoint_to_pytorch(self._tf_checkpoint, self._config, self._pytorch_dump_output)
-        elif self._model_type == "t5":
-            try:
-                from ..models.t5.convert_t5_original_tf_checkpoint_to_pytorch import convert_tf_checkpoint_to_pytorch
-            except ImportError:
-                raise ImportError(IMPORT_ERROR_MESSAGE)
-
-            convert_tf_checkpoint_to_pytorch(self._tf_checkpoint, self._config, self._pytorch_dump_output)
-        elif self._model_type == "gpt":
-            from ..models.openai.convert_openai_original_tf_checkpoint_to_pytorch import (
-                convert_openai_checkpoint_to_pytorch,
-            )
-
-            convert_openai_checkpoint_to_pytorch(self._tf_checkpoint, self._config, self._pytorch_dump_output)
-        elif self._model_type == "gpt2":
-            try:
-                from ..models.gpt2.convert_gpt2_original_tf_checkpoint_to_pytorch import (
-                    convert_gpt2_checkpoint_to_pytorch,
-                )
-            except ImportError:
-                raise ImportError(IMPORT_ERROR_MESSAGE)
-
-            convert_gpt2_checkpoint_to_pytorch(self._tf_checkpoint, self._config, self._pytorch_dump_output)
-        elif self._model_type == "xlnet":
-            try:
-                from ..models.xlnet.convert_xlnet_original_tf_checkpoint_to_pytorch import (
-                    convert_xlnet_checkpoint_to_pytorch,
-                )
-            except ImportError:
-                raise ImportError(IMPORT_ERROR_MESSAGE)
-
-            convert_xlnet_checkpoint_to_pytorch(
-                self._tf_checkpoint, self._config, self._pytorch_dump_output, self._finetuning_task_name
-            )
-        elif self._model_type == "xlm":
-            from ..models.xlm.convert_xlm_original_pytorch_checkpoint_to_pytorch import (
-                convert_xlm_checkpoint_to_pytorch,
-            )
-
-            convert_xlm_checkpoint_to_pytorch(self._tf_checkpoint, self._pytorch_dump_output)
-        elif self._model_type == "lxmert":
-            from ..models.lxmert.convert_lxmert_original_tf_checkpoint_to_pytorch import (
-                convert_lxmert_checkpoint_to_pytorch,
-            )
-
-            convert_lxmert_checkpoint_to_pytorch(self._tf_checkpoint, self._pytorch_dump_output)
-        elif self._model_type == "rembert":
-            from ..models.rembert.convert_rembert_tf_checkpoint_to_pytorch import (
-                convert_rembert_tf_checkpoint_to_pytorch,
-            )
-
-            convert_rembert_tf_checkpoint_to_pytorch(self._tf_checkpoint, self._config, self._pytorch_dump_output)
-        else:
-            raise ValueError("--model_type should be selected in the list [bert, gpt, gpt2, t5, xlnet, xlm, lxmert]")
--- a/src/transformers/commands/env.py
+++ b/src/transformers/commands/env.py
@ -26,9 +26,7 @@ from .. import __version__ as version
 from ..integrations.deepspeed import is_deepspeed_available
 from ..utils import (
    is_accelerate_available,
-    is_flax_available,
    is_safetensors_available,
-    is_tf_available,
    is_torch_available,
    is_torch_hpu_available,
    is_torch_npu_available,
@ -109,19 +107,6 @@ class EnvironmentCommand(BaseTransformersCLICommand):
            elif pt_hpu_available:
                pt_accelerator = "HPU"

-        tf_version = "not installed"
-        tf_cuda_available = "NA"
-        if is_tf_available():
-            import tensorflow as tf
-
-            tf_version = tf.__version__
-            try:
-                # deprecated in v2.1
-                tf_cuda_available = tf.test.is_gpu_available()
-            except AttributeError:
-                # returns list of devices, convert to bool
-                tf_cuda_available = bool(tf.config.list_physical_devices("GPU"))
-
        deepspeed_version = "not installed"
        if is_deepspeed_available():
            # Redirect command line output to silence deepspeed import output.
@ -129,20 +114,6 @@ class EnvironmentCommand(BaseTransformersCLICommand):
                import deepspeed
            deepspeed_version = deepspeed.__version__

-        flax_version = "not installed"
-        jax_version = "not installed"
-        jaxlib_version = "not installed"
-        jax_backend = "NA"
-        if is_flax_available():
-            import flax
-            import jax
-            import jaxlib
-
-            flax_version = flax.__version__
-            jax_version = jax.__version__
-            jaxlib_version = jaxlib.__version__
-            jax_backend = jax.lib.xla_bridge.get_backend().platform
-
        info = {
            "`transformers` version": version,
            "Platform": platform.platform(),
@ -153,10 +124,6 @@ class EnvironmentCommand(BaseTransformersCLICommand):
            "Accelerate config": f"{accelerate_config_str}",
            "DeepSpeed version": f"{deepspeed_version}",
            "PyTorch version (accelerator?)": f"{pt_version} ({pt_accelerator})",
-            "Tensorflow version (GPU?)": f"{tf_version} ({tf_cuda_available})",
-            "Flax version (CPU?/GPU?/TPU?)": f"{flax_version} ({jax_backend})",
-            "Jax version": f"{jax_version}",
-            "JaxLib version": f"{jaxlib_version}",
            "Using distributed or parallel set-up in script?": "<fill in>",
        }
        if is_torch_available():
--- a/src/transformers/commands/train.py
+++ b/src/transformers/commands/train.py
@ -1,158 +0,0 @@
-# Copyright 2020 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import os
-from argparse import ArgumentParser, Namespace
-
-from ..data import SingleSentenceClassificationProcessor as Processor
-from ..pipelines import TextClassificationPipeline
-from ..utils import is_tf_available, is_torch_available, logging
-from . import BaseTransformersCLICommand
-
-
-if not is_tf_available() and not is_torch_available():
-    raise RuntimeError("At least one of PyTorch or TensorFlow 2.0+ should be installed to use CLI training")
-
-# TF training parameters
-USE_XLA = False
-USE_AMP = False
-
-
-def train_command_factory(args: Namespace):
-    """
-    Factory function used to instantiate training command from provided command line arguments.
-
-    Returns: TrainCommand
-    """
-    return TrainCommand(args)
-
-
-class TrainCommand(BaseTransformersCLICommand):
-    @staticmethod
-    def register_subcommand(parser: ArgumentParser):
-        """
-        Register this command to argparse so it's available for the transformer-cli
-
-        Args:
-            parser: Root parser to register command-specific arguments
-        """
-        train_parser = parser.add_parser("train", help="CLI tool to train a model on a task.")
-
-        train_parser.add_argument(
-            "--train_data",
-            type=str,
-            required=True,
-            help="path to train (and optionally evaluation) dataset as a csv with tab separated labels and sentences.",
-        )
-        train_parser.add_argument(
-            "--column_label", type=int, default=0, help="Column of the dataset csv file with example labels."
-        )
-        train_parser.add_argument(
-            "--column_text", type=int, default=1, help="Column of the dataset csv file with example texts."
-        )
-        train_parser.add_argument(
-            "--column_id", type=int, default=2, help="Column of the dataset csv file with example ids."
-        )
-        train_parser.add_argument(
-            "--skip_first_row", action="store_true", help="Skip the first row of the csv file (headers)."
-        )
-
-        train_parser.add_argument("--validation_data", type=str, default="", help="path to validation dataset.")
-        train_parser.add_argument(
-            "--validation_split",
-            type=float,
-            default=0.1,
-            help="if validation dataset is not provided, fraction of train dataset to use as validation dataset.",
-        )
-
-        train_parser.add_argument("--output", type=str, default="./", help="path to saved the trained model.")
-
-        train_parser.add_argument(
-            "--task", type=str, default="text_classification", help="Task to train the model on."
-        )
-        train_parser.add_argument(
-            "--model", type=str, default="google-bert/bert-base-uncased", help="Model's name or path to stored model."
-        )
-        train_parser.add_argument("--train_batch_size", type=int, default=32, help="Batch size for training.")
-        train_parser.add_argument("--valid_batch_size", type=int, default=64, help="Batch size for validation.")
-        train_parser.add_argument("--learning_rate", type=float, default=3e-5, help="Learning rate.")
-        train_parser.add_argument("--adam_epsilon", type=float, default=1e-08, help="Epsilon for Adam optimizer.")
-        train_parser.set_defaults(func=train_command_factory)
-
-    def __init__(self, args: Namespace):
-        self.logger = logging.get_logger("transformers/training")
-
-        self.framework = "tf" if is_tf_available() else "torch"
-
-        os.makedirs(args.output, exist_ok=True)
-        self.output = args.output
-
-        self.column_label = args.column_label
-        self.column_text = args.column_text
-        self.column_id = args.column_id
-
-        self.logger.info(f"Loading {args.task} pipeline for {args.model}")
-        if args.task == "text_classification":
-            self.pipeline = TextClassificationPipeline.from_pretrained(args.model)
-        elif args.task == "token_classification":
-            raise NotImplementedError
-        elif args.task == "question_answering":
-            raise NotImplementedError
-
-        self.logger.info(f"Loading dataset from {args.train_data}")
-        self.train_dataset = Processor.create_from_csv(
-            args.train_data,
-            column_label=args.column_label,
-            column_text=args.column_text,
-            column_id=args.column_id,
-            skip_first_row=args.skip_first_row,
-        )
-        self.valid_dataset = None
-        if args.validation_data:
-            self.logger.info(f"Loading validation dataset from {args.validation_data}")
-            self.valid_dataset = Processor.create_from_csv(
-                args.validation_data,
-                column_label=args.column_label,
-                column_text=args.column_text,
-                column_id=args.column_id,
-                skip_first_row=args.skip_first_row,
-            )
-
-        self.validation_split = args.validation_split
-        self.train_batch_size = args.train_batch_size
-        self.valid_batch_size = args.valid_batch_size
-        self.learning_rate = args.learning_rate
-        self.adam_epsilon = args.adam_epsilon
-
-    def run(self):
-        if self.framework == "tf":
-            return self.run_tf()
-        return self.run_torch()
-
-    def run_torch(self):
-        raise NotImplementedError
-
-    def run_tf(self):
-        self.pipeline.fit(
-            self.train_dataset,
-            validation_data=self.valid_dataset,
-            validation_split=self.validation_split,
-            learning_rate=self.learning_rate,
-            adam_epsilon=self.adam_epsilon,
-            train_batch_size=self.train_batch_size,
-            valid_batch_size=self.valid_batch_size,
-        )
-
-        # Save trained pipeline
-        self.pipeline.save_pretrained(self.output)
--- a/src/transformers/commands/transformers_cli.py
+++ b/src/transformers/commands/transformers_cli.py
@ -18,7 +18,6 @@ from transformers import HfArgumentParser
 from transformers.commands.add_fast_image_processor import AddFastImageProcessorCommand
 from transformers.commands.add_new_model_like import AddNewModelLikeCommand
 from transformers.commands.chat import ChatCommand
-from transformers.commands.convert import ConvertCommand
 from transformers.commands.download import DownloadCommand
 from transformers.commands.env import EnvironmentCommand
 from transformers.commands.run import RunCommand
@ -39,7 +38,6 @@ def main():

    # Register commands
    ChatCommand.register_subcommand(commands_parser)
-    ConvertCommand.register_subcommand(commands_parser)
    DownloadCommand.register_subcommand(commands_parser)
    EnvironmentCommand.register_subcommand(commands_parser)
    RunCommand.register_subcommand(commands_parser)
--- a/src/transformers/configuration_utils.py
+++ b/src/transformers/configuration_utils.py
@ -100,9 +100,8 @@ class PretrainedConfig(PushToHubMixin):

    Arg:
        name_or_path (`str`, *optional*, defaults to `""`):
-            Store the string that was passed to [`PreTrainedModel.from_pretrained`] or
-            [`TFPreTrainedModel.from_pretrained`] as `pretrained_model_name_or_path` if the configuration was created
-            with such a method.
+            Store the string that was passed to [`PreTrainedModel.from_pretrained`] as `pretrained_model_name_or_path`
+            if the configuration was created with such a method.
        output_hidden_states (`bool`, *optional*, defaults to `False`):
            Whether or not the model should return all hidden-states.
        output_attentions (`bool`, *optional*, defaults to `False`):
@ -140,8 +139,7 @@ class PretrainedConfig(PushToHubMixin):
        architectures (`list[str]`, *optional*):
            Model architectures that can be used with the model pretrained weights.
        finetuning_task (`str`, *optional*):
-            Name of the task used to fine-tune the model. This can be used when converting from an original (TensorFlow
-            or PyTorch) checkpoint.
+            Name of the task used to fine-tune the model.
        id2label (`dict[int, str]`, *optional*):
            A map from index (for instance prediction index, or target index) to label.
        label2id (`dict[str, int]`, *optional*):
@ -346,10 +344,6 @@ class PretrainedConfig(PushToHubMixin):
                logger.error(f"Can't set {key} with value {value} for {self}")
                raise err

-        # TODO: remove later, deprecated arguments for TF models
-        self.tf_legacy_loss = kwargs.pop("tf_legacy_loss", False)
-        self.use_bfloat16 = kwargs.pop("use_bfloat16", False)
-
    def _create_id_label_maps(self, num_labels: int):
        self.id2label = {i: f"LABEL_{i}" for i in range(num_labels)}
        self.label2id = dict(zip(self.id2label.values(), self.id2label.keys()))
--- a/src/transformers/convert_graph_to_onnx.py
+++ b/src/transformers/convert_graph_to_onnx.py
@ -1,551 +0,0 @@
-# Copyright 2020 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import warnings
-from argparse import ArgumentParser
-from os import listdir, makedirs
-from pathlib import Path
-from typing import Optional
-
-from packaging.version import Version, parse
-
-from transformers.pipelines import Pipeline, pipeline
-from transformers.tokenization_utils import BatchEncoding
-from transformers.utils import ModelOutput, is_tf_available, is_torch_available
-
-
-# This is the minimal required version to
-# support some ONNX Runtime features
-ORT_QUANTIZE_MINIMUM_VERSION = parse("1.4.0")
-
-
-SUPPORTED_PIPELINES = [
-    "feature-extraction",
-    "ner",
-    "sentiment-analysis",
-    "fill-mask",
-    "question-answering",
-    "text-generation",
-    "translation_en_to_fr",
-    "translation_en_to_de",
-    "translation_en_to_ro",
-]
-
-
-class OnnxConverterArgumentParser(ArgumentParser):
-    """
-    Wraps all the script arguments supported to export transformers models to ONNX IR
-    """
-
-    def __init__(self):
-        super().__init__("ONNX Converter")
-
-        self.add_argument(
-            "--pipeline",
-            type=str,
-            choices=SUPPORTED_PIPELINES,
-            default="feature-extraction",
-        )
-        self.add_argument(
-            "--model",
-            type=str,
-            required=True,
-            help="Model's id or path (ex: google-bert/bert-base-cased)",
-        )
-        self.add_argument("--tokenizer", type=str, help="Tokenizer's id or path (ex: google-bert/bert-base-cased)")
-        self.add_argument(
-            "--framework",
-            type=str,
-            choices=["pt", "tf"],
-            help="Framework for loading the model",
-        )
-        self.add_argument("--opset", type=int, default=11, help="ONNX opset to use")
-        self.add_argument(
-            "--check-loading",
-            action="store_true",
-            help="Check ONNX is able to load the model",
-        )
-        self.add_argument(
-            "--use-external-format",
-            action="store_true",
-            help="Allow exporting model >= than 2Gb",
-        )
-        self.add_argument(
-            "--quantize",
-            action="store_true",
-            help="Quantize the neural network to be run with int8",
-        )
-        self.add_argument("output")
-
-
-def generate_identified_filename(filename: Path, identifier: str) -> Path:
-    """
-    Append a string-identifier at the end (before the extension, if any) to the provided filepath
-
-    Args:
-        filename: pathlib.Path The actual path object we would like to add an identifier suffix
-        identifier: The suffix to add
-
-    Returns: String with concatenated identifier at the end of the filename
-    """
-    return filename.parent.joinpath(filename.stem + identifier).with_suffix(filename.suffix)
-
-
-def check_onnxruntime_requirements(minimum_version: Version):
-    """
-    Check onnxruntime is installed and if the installed version match is recent enough
-
-    Raises:
-        ImportError: If onnxruntime is not installed or too old version is found
-    """
-    try:
-        import onnxruntime
-
-        # Parse the version of the installed onnxruntime
-        ort_version = parse(onnxruntime.__version__)
-
-        # We require 1.4.0 minimum
-        if ort_version < ORT_QUANTIZE_MINIMUM_VERSION:
-            raise ImportError(
-                f"We found an older version of onnxruntime ({onnxruntime.__version__}) "
-                f"but we require onnxruntime to be >= {minimum_version} to enable all the conversions options.\n"
-                "Please update onnxruntime by running `pip install --upgrade onnxruntime`"
-            )
-
-    except ImportError:
-        raise ImportError(
-            "onnxruntime doesn't seem to be currently installed. "
-            "Please install the onnxruntime by running `pip install onnxruntime`"
-            " and relaunch the conversion."
-        )
-
-
-def ensure_valid_input(model, tokens, input_names):
-    """
-    Ensure inputs are presented in the correct order, without any Non
-
-    Args:
-        model: The model used to forward the input data
-        tokens: BatchEncoding holding the input data
-        input_names: The name of the inputs
-
-    Returns: Tuple
-
-    """
-    print("Ensuring inputs are in correct order")
-
-    model_args_name = model.forward.__code__.co_varnames
-    model_args, ordered_input_names = [], []
-    for arg_name in model_args_name[1:]:  # start at index 1 to skip "self" argument
-        if arg_name in input_names:
-            ordered_input_names.append(arg_name)
-            model_args.append(tokens[arg_name])
-        else:
-            print(f"{arg_name} is not present in the generated input list.")
-            break
-
-    print(f"Generated inputs order: {ordered_input_names}")
-    return ordered_input_names, tuple(model_args)
-
-
-def infer_shapes(nlp: Pipeline, framework: str) -> tuple[list[str], list[str], dict, BatchEncoding]:
-    """
-    Attempt to infer the static vs dynamic axes for each input and output tensors for a specific model
-
-    Args:
-        nlp: The pipeline object holding the model to be exported
-        framework: The framework identifier to dispatch to the correct inference scheme (pt/tf)
-
-    Returns:
-
-        - List of the inferred input variable names
-        - List of the inferred output variable names
-        - Dictionary with input/output variables names as key and shape tensor as value
-        - a BatchEncoding reference which was used to infer all the above information
-    """
-
-    def build_shape_dict(name: str, tensor, is_input: bool, seq_len: int):
-        if isinstance(tensor, (tuple, list)):
-            return [build_shape_dict(name, t, is_input, seq_len) for t in tensor]
-
-        else:
-            # Let's assume batch is the first axis with only 1 element (~~ might not be always true ...)
-            axes = {[axis for axis, numel in enumerate(tensor.shape) if numel == 1][0]: "batch"}
-            if is_input:
-                if len(tensor.shape) == 2:
-                    axes[1] = "sequence"
-                else:
-                    raise ValueError(f"Unable to infer tensor axes ({len(tensor.shape)})")
-            else:
-                seq_axes = [dim for dim, shape in enumerate(tensor.shape) if shape == seq_len]
-                axes.update(dict.fromkeys(seq_axes, "sequence"))
-
-        print(f"Found {'input' if is_input else 'output'} {name} with shape: {axes}")
-        return axes
-
-    tokens = nlp.tokenizer("This is a sample output", return_tensors=framework)
-    seq_len = tokens.input_ids.shape[-1]
-    outputs = nlp.model(**tokens) if framework == "pt" else nlp.model(tokens)
-    if isinstance(outputs, ModelOutput):
-        outputs = outputs.to_tuple()
-    if not isinstance(outputs, (list, tuple)):
-        outputs = (outputs,)
-
-    # Generate input names & axes
-    input_vars = list(tokens.keys())
-    input_dynamic_axes = {k: build_shape_dict(k, v, True, seq_len) for k, v in tokens.items()}
-
-    # flatten potentially grouped outputs (past for gpt2, attentions)
-    outputs_flat = []
-    for output in outputs:
-        if isinstance(output, (tuple, list)):
-            outputs_flat.extend(output)
-        else:
-            outputs_flat.append(output)
-
-    # Generate output names & axes
-    output_names = [f"output_{i}" for i in range(len(outputs_flat))]
-    output_dynamic_axes = {k: build_shape_dict(k, v, False, seq_len) for k, v in zip(output_names, outputs_flat)}
-
-    # Create the aggregated axes representation
-    dynamic_axes = dict(input_dynamic_axes, **output_dynamic_axes)
-    return input_vars, output_names, dynamic_axes, tokens
-
-
-def load_graph_from_args(
-    pipeline_name: str, framework: str, model: str, tokenizer: Optional[str] = None, **models_kwargs
-) -> Pipeline:
-    """
-    Convert the set of arguments provided through the CLI to an actual pipeline reference (tokenizer + model
-
-    Args:
-        pipeline_name: The kind of pipeline to use (ner, question-answering, etc.)
-        framework: The actual model to convert the pipeline from ("pt" or "tf")
-        model: The model name which will be loaded by the pipeline
-        tokenizer: The tokenizer name which will be loaded by the pipeline, default to the model's value
-
-    Returns: Pipeline object
-
-    """
-    # If no tokenizer provided
-    if tokenizer is None:
-        tokenizer = model
-
-    # Check the wanted framework is available
-    if framework == "pt" and not is_torch_available():
-        raise Exception("Cannot convert because PyTorch is not installed. Please install torch first.")
-    if framework == "tf" and not is_tf_available():
-        raise Exception("Cannot convert because TF is not installed. Please install tensorflow first.")
-
-    print(f"Loading pipeline (model: {model}, tokenizer: {tokenizer})")
-
-    # Allocate tokenizer and model
-    return pipeline(pipeline_name, model=model, tokenizer=tokenizer, framework=framework, model_kwargs=models_kwargs)
-
-
-def convert_pytorch(nlp: Pipeline, opset: int, output: Path, use_external_format: bool):
-    """
-    Export a PyTorch backed pipeline to ONNX Intermediate Representation (IR
-
-    Args:
-        nlp: The pipeline to be exported
-        opset: The actual version of the ONNX operator set to use
-        output: Path where will be stored the generated ONNX model
-        use_external_format: Split the model definition from its parameters to allow model bigger than 2GB
-
-    Returns:
-
-    """
-    if not is_torch_available():
-        raise Exception("Cannot convert because PyTorch is not installed. Please install torch first.")
-
-    import torch
-    from torch.onnx import export
-
-    print(f"Using framework PyTorch: {torch.__version__}")
-
-    with torch.no_grad():
-        input_names, output_names, dynamic_axes, tokens = infer_shapes(nlp, "pt")
-        ordered_input_names, model_args = ensure_valid_input(nlp.model, tokens, input_names)
-
-        export(
-            nlp.model,
-            model_args,
-            f=output.as_posix(),
-            input_names=ordered_input_names,
-            output_names=output_names,
-            dynamic_axes=dynamic_axes,
-            do_constant_folding=True,
-            opset_version=opset,
-        )
-
-
-def convert_tensorflow(nlp: Pipeline, opset: int, output: Path):
-    """
-    Export a TensorFlow backed pipeline to ONNX Intermediate Representation (IR)
-
-    Args:
-        nlp: The pipeline to be exported
-        opset: The actual version of the ONNX operator set to use
-        output: Path where will be stored the generated ONNX model
-
-    Notes: TensorFlow cannot export model bigger than 2GB due to internal constraint from TensorFlow
-
-    """
-    if not is_tf_available():
-        raise Exception("Cannot convert because TF is not installed. Please install tensorflow first.")
-
-    print("/!\\ Please note TensorFlow doesn't support exporting model > 2Gb /!\\")
-
-    try:
-        import tensorflow as tf
-        import tf2onnx
-        from tf2onnx import __version__ as t2ov
-
-        print(f"Using framework TensorFlow: {tf.version.VERSION}, tf2onnx: {t2ov}")
-
-        # Build
-        input_names, output_names, dynamic_axes, tokens = infer_shapes(nlp, "tf")
-
-        # Forward
-        nlp.model.predict(tokens.data)
-        input_signature = [tf.TensorSpec.from_tensor(tensor, name=key) for key, tensor in tokens.items()]
-        model_proto, _ = tf2onnx.convert.from_keras(
-            nlp.model, input_signature, opset=opset, output_path=output.as_posix()
-        )
-
-    except ImportError as e:
-        raise Exception(
-            f"Cannot import {e.name} required to convert TF model to ONNX. Please install {e.name} first. {e}"
-        )
-
-
-def convert(
-    framework: str,
-    model: str,
-    output: Path,
-    opset: int,
-    tokenizer: Optional[str] = None,
-    use_external_format: bool = False,
-    pipeline_name: str = "feature-extraction",
-    **model_kwargs,
-):
-    """
-    Convert the pipeline object to the ONNX Intermediate Representation (IR) format
-
-    Args:
-        framework: The framework the pipeline is backed by ("pt" or "tf")
-        model: The name of the model to load for the pipeline
-        output: The path where the ONNX graph will be stored
-        opset: The actual version of the ONNX operator set to use
-        tokenizer: The name of the model to load for the pipeline, default to the model's name if not provided
-        use_external_format:
-            Split the model definition from its parameters to allow model bigger than 2GB (PyTorch only)
-        pipeline_name: The kind of pipeline to instantiate (ner, question-answering, etc.)
-        model_kwargs: Keyword arguments to be forwarded to the model constructor
-
-    Returns:
-
-    """
-    warnings.warn(
-        "The `transformers.convert_graph_to_onnx` package is deprecated and will be removed in version 5 of"
-        " Transformers",
-        FutureWarning,
-    )
-    print(f"ONNX opset version set to: {opset}")
-
-    # Load the pipeline
-    nlp = load_graph_from_args(pipeline_name, framework, model, tokenizer, **model_kwargs)
-
-    if not output.parent.exists():
-        print(f"Creating folder {output.parent}")
-        makedirs(output.parent.as_posix())
-    elif len(listdir(output.parent.as_posix())) > 0:
-        raise Exception(f"Folder {output.parent.as_posix()} is not empty, aborting conversion")
-
-    # Export the graph
-    if framework == "pt":
-        convert_pytorch(nlp, opset, output, use_external_format)
-    else:
-        convert_tensorflow(nlp, opset, output)
-
-
-def optimize(onnx_model_path: Path) -> Path:
-    """
-    Load the model at the specified path and let onnxruntime look at transformations on the graph to enable all the
-    optimizations possible
-
-    Args:
-        onnx_model_path: filepath where the model binary description is stored
-
-    Returns: Path where the optimized model binary description has been saved
-
-    """
-    from onnxruntime import InferenceSession, SessionOptions
-
-    # Generate model name with suffix "optimized"
-    opt_model_path = generate_identified_filename(onnx_model_path, "-optimized")
-    sess_option = SessionOptions()
-    sess_option.optimized_model_filepath = opt_model_path.as_posix()
-    _ = InferenceSession(onnx_model_path.as_posix(), sess_option)
-
-    print(f"Optimized model has been written at {opt_model_path}: \N{HEAVY CHECK MARK}")
-    print("/!\\ Optimized model contains hardware specific operators which might not be portable. /!\\")
-
-    return opt_model_path
-
-
-def quantize(onnx_model_path: Path) -> Path:
-    """
-    Quantize the weights of the model from float32 to in8 to allow very efficient inference on modern CPU
-
-    Args:
-        onnx_model_path: Path to location the exported ONNX model is stored
-
-    Returns: The Path generated for the quantized
-    """
-    import onnx
-    import onnxruntime
-    from onnx.onnx_pb import ModelProto
-    from onnxruntime.quantization import QuantizationMode
-    from onnxruntime.quantization.onnx_quantizer import ONNXQuantizer
-    from onnxruntime.quantization.registry import IntegerOpsRegistry
-
-    # Load the ONNX model
-    onnx_model = onnx.load(onnx_model_path.as_posix())
-
-    if parse(onnx.__version__) < parse("1.5.0"):
-        print(
-            "Models larger than 2GB will fail to quantize due to protobuf constraint.\n"
-            "Please upgrade to onnxruntime >= 1.5.0."
-        )
-
-    # Copy it
-    copy_model = ModelProto()
-    copy_model.CopyFrom(onnx_model)
-
-    # Construct quantizer
-    # onnxruntime renamed input_qType to activation_qType in v1.13.1, so we
-    # check the onnxruntime version to ensure backward compatibility.
-    # See also: https://github.com/microsoft/onnxruntime/pull/12873
-    if parse(onnxruntime.__version__) < parse("1.13.1"):
-        quantizer = ONNXQuantizer(
-            model=copy_model,
-            per_channel=False,
-            reduce_range=False,
-            mode=QuantizationMode.IntegerOps,
-            static=False,
-            weight_qType=True,
-            input_qType=False,
-            tensors_range=None,
-            nodes_to_quantize=None,
-            nodes_to_exclude=None,
-            op_types_to_quantize=list(IntegerOpsRegistry),
-        )
-    else:
-        quantizer = ONNXQuantizer(
-            model=copy_model,
-            per_channel=False,
-            reduce_range=False,
-            mode=QuantizationMode.IntegerOps,
-            static=False,
-            weight_qType=True,
-            activation_qType=False,
-            tensors_range=None,
-            nodes_to_quantize=None,
-            nodes_to_exclude=None,
-            op_types_to_quantize=list(IntegerOpsRegistry),
-        )
-
-    # Quantize and export
-    quantizer.quantize_model()
-
-    # Append "-quantized" at the end of the model's name
-    quantized_model_path = generate_identified_filename(onnx_model_path, "-quantized")
-
-    # Save model
-    print(f"Quantized model has been written at {quantized_model_path}: \N{HEAVY CHECK MARK}")
-    onnx.save_model(quantizer.model.model, quantized_model_path.as_posix())
-
-    return quantized_model_path
-
-
-def verify(path: Path):
-    from onnxruntime import InferenceSession, SessionOptions
-    from onnxruntime.capi.onnxruntime_pybind11_state import RuntimeException
-
-    print(f"Checking ONNX model loading from: {path} ...")
-    try:
-        onnx_options = SessionOptions()
-        _ = InferenceSession(path.as_posix(), onnx_options, providers=["CPUExecutionProvider"])
-        print(f"Model {path} correctly loaded: \N{HEAVY CHECK MARK}")
-    except RuntimeException as re:
-        print(f"Error while loading the model {re}: \N{HEAVY BALLOT X}")
-
-
-if __name__ == "__main__":
-    parser = OnnxConverterArgumentParser()
-    args = parser.parse_args()
-
-    # Make sure output is absolute path
-    args.output = Path(args.output).absolute()
-
-    try:
-        print("\n====== Converting model to ONNX ======")
-        # Convert
-        convert(
-            args.framework,
-            args.model,
-            args.output,
-            args.opset,
-            args.tokenizer,
-            args.use_external_format,
-            args.pipeline,
-        )
-
-        if args.quantize:
-            # Ensure requirements for quantization on onnxruntime is met
-            check_onnxruntime_requirements(ORT_QUANTIZE_MINIMUM_VERSION)
-
-            # onnxruntime optimizations doesn't provide the same level of performances on TensorFlow than PyTorch
-            if args.framework == "tf":
-                print(
-                    "\t Using TensorFlow might not provide the same optimization level compared to PyTorch.\n"
-                    "\t For TensorFlow users you can try optimizing the model directly through onnxruntime_tools.\n"
-                    "\t For more information, please refer to the onnxruntime documentation:\n"
-                    "\t\thttps://github.com/microsoft/onnxruntime/tree/master/onnxruntime/python/tools/transformers\n"
-                )
-
-            print("\n====== Optimizing ONNX model ======")
-
-            # Quantization works best when using the optimized version of the model
-            args.optimized_output = optimize(args.output)
-
-            # Do the quantization on the right graph
-            args.quantized_output = quantize(args.optimized_output)
-
-        # And verify
-        if args.check_loading:
-            print("\n====== Check exported ONNX model(s) ======")
-            verify(args.output)
-
-            if hasattr(args, "optimized_output"):
-                verify(args.optimized_output)
-
-            if hasattr(args, "quantized_output"):
-                verify(args.quantized_output)
-
-    except Exception as e:
-        print(f"Error while converting the model: {e}")
-        exit(1)
--- a/src/transformers/convert_tf_hub_seq_to_seq_bert_to_pytorch.py
+++ b/src/transformers/convert_tf_hub_seq_to_seq_bert_to_pytorch.py
@ -1,86 +0,0 @@
-# Copyright 2020 The HuggingFace Inc. team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Convert Seq2Seq TF Hub checkpoint."""
-
-import argparse
-
-from . import (
-    BertConfig,
-    BertGenerationConfig,
-    BertGenerationDecoder,
-    BertGenerationEncoder,
-    load_tf_weights_in_bert_generation,
-    logging,
-)
-
-
-logging.set_verbosity_info()
-
-
-def convert_tf_checkpoint_to_pytorch(tf_hub_path, pytorch_dump_path, is_encoder_named_decoder, vocab_size, is_encoder):
-    # Initialise PyTorch model
-    bert_config = BertConfig.from_pretrained(
-        "google-bert/bert-large-cased",
-        vocab_size=vocab_size,
-        max_position_embeddings=512,
-        is_decoder=True,
-        add_cross_attention=True,
-    )
-    bert_config_dict = bert_config.to_dict()
-    del bert_config_dict["type_vocab_size"]
-    config = BertGenerationConfig(**bert_config_dict)
-    if is_encoder:
-        model = BertGenerationEncoder(config)
-    else:
-        model = BertGenerationDecoder(config)
-    print(f"Building PyTorch model from configuration: {config}")
-
-    # Load weights from tf checkpoint
-    load_tf_weights_in_bert_generation(
-        model,
-        tf_hub_path,
-        model_class="bert",
-        is_encoder_named_decoder=is_encoder_named_decoder,
-        is_encoder=is_encoder,
-    )
-
-    # Save pytorch-model
-    print(f"Save PyTorch model and config to {pytorch_dump_path}")
-    model.save_pretrained(pytorch_dump_path)
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-    # Required parameters
-    parser.add_argument(
-        "--tf_hub_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path."
-    )
-    parser.add_argument(
-        "--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model."
-    )
-    parser.add_argument(
-        "--is_encoder_named_decoder",
-        action="store_true",
-        help="If decoder has to be renamed to encoder in PyTorch model.",
-    )
-    parser.add_argument("--is_encoder", action="store_true", help="If model is an encoder.")
-    parser.add_argument("--vocab_size", default=50358, type=int, help="Vocab size of model")
-    args = parser.parse_args()
-    convert_tf_checkpoint_to_pytorch(
-        args.tf_hub_path,
-        args.pytorch_dump_path,
-        args.is_encoder_named_decoder,
-        args.vocab_size,
-        is_encoder=args.is_encoder,
-    )
--- a/src/transformers/data/data_collator.py
+++ b/src/transformers/data/data_collator.py
@ -31,7 +31,7 @@ InputDataClass = NewType("InputDataClass", Any)

 """
 A DataCollator is a function that takes a list of samples from a Dataset and collate them into a batch, as a dictionary
-of PyTorch/TensorFlow tensors or NumPy arrays.
+of PyTorch tensors or NumPy arrays.
 """
 DataCollator = NewType("DataCollator", Callable[[list[InputDataClass]], dict[str, Any]])

@ -40,9 +40,7 @@ class DataCollatorMixin:
    def __call__(self, features, return_tensors=None):
        if return_tensors is None:
            return_tensors = self.return_tensors
-        if return_tensors == "tf":
-            return self.tf_call(features)
-        elif return_tensors == "pt":
+        if return_tensors == "pt":
            return self.torch_call(features)
        elif return_tensors == "np":
            return self.numpy_call(features)
@ -91,8 +89,6 @@ def default_data_collator(features: list[InputDataClass], return_tensors="pt") -

    if return_tensors == "pt":
        return torch_default_data_collator(features)
-    elif return_tensors == "tf":
-        return tf_default_data_collator(features)
    elif return_tensors == "np":
        return numpy_default_data_collator(features)

@ -114,7 +110,7 @@ class DefaultDataCollator(DataCollatorMixin):

    Args:
        return_tensors (`str`, *optional*, defaults to `"pt"`):
-            The type of Tensor to return. Allowable values are "np", "pt" and "tf".
+            The type of Tensor to return. Allowable values are "np", or "pt".
    """

    return_tensors: str = "pt"
@ -161,47 +157,6 @@ def torch_default_data_collator(features: list[InputDataClass]) -> dict[str, Any
    return batch


-def tf_default_data_collator(features: list[InputDataClass]) -> dict[str, Any]:
-    import tensorflow as tf
-
-    if not isinstance(features[0], Mapping):
-        features = [vars(f) for f in features]
-    first = features[0]
-    batch = {}
-
-    # Special handling for labels.
-    # Ensure that tensor is created with the correct type
-    # (it should be automatically the case, but let's make sure of it.)
-    if "label" in first and first["label"] is not None:
-        label_col_name = "label"
-    elif "label_ids" in first and first["label_ids"] is not None:
-        label_col_name = "label_ids"
-    elif "labels" in first and first["labels"] is not None:
-        label_col_name = "labels"
-    else:
-        label_col_name = None
-    if label_col_name is not None:
-        if isinstance(first[label_col_name], tf.Tensor):
-            dtype = tf.int64 if first[label_col_name].dtype.is_integer else tf.float32
-        elif isinstance(first[label_col_name], (np.ndarray, np.generic)):
-            dtype = tf.int64 if np.issubdtype(first[label_col_name].dtype, np.integer) else tf.float32
-        elif isinstance(first[label_col_name], (tuple, list)):
-            dtype = tf.int64 if isinstance(first[label_col_name][0], int) else tf.float32
-        else:
-            dtype = tf.int64 if isinstance(first[label_col_name], int) else tf.float32
-        batch["labels"] = tf.convert_to_tensor([f[label_col_name] for f in features], dtype=dtype)
-    # Handling of all other possible keys.
-    # Again, we will use the first element to figure out which key/values are not None for this model.
-    for k, v in first.items():
-        if k not in ("label", "label_ids", "labels") and v is not None and not isinstance(v, str):
-            if isinstance(v, (tf.Tensor, np.ndarray)):
-                batch[k] = tf.stack([f[k] for f in features])
-            else:
-                batch[k] = tf.convert_to_tensor([f[k] for f in features])
-
-    return batch
-
-
 def numpy_default_data_collator(features: list[InputDataClass]) -> dict[str, Any]:
    if not isinstance(features[0], Mapping):
        features = [vars(f) for f in features]
@ -259,7 +214,7 @@ class DataCollatorWithPadding:
            This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability >=
            7.0 (Volta).
        return_tensors (`str`, *optional*, defaults to `"pt"`):
-            The type of Tensor to return. Allowable values are "np", "pt" and "tf".
+            The type of Tensor to return. Allowable values are "np", or "pt".
    """

    tokenizer: PreTrainedTokenizerBase
@ -313,7 +268,7 @@ class DataCollatorForTokenClassification(DataCollatorMixin):
        label_pad_token_id (`int`, *optional*, defaults to -100):
            The id to use when padding the labels (-100 will be automatically ignore by PyTorch loss functions).
        return_tensors (`str`, *optional*, defaults to `"pt"`):
-            The type of Tensor to return. Allowable values are "np", "pt" and "tf".
+            The type of Tensor to return. Allowable values are "np", or "pt".
    """

    tokenizer: PreTrainedTokenizerBase
@ -363,38 +318,6 @@ class DataCollatorForTokenClassification(DataCollatorMixin):
        batch[label_name] = torch.tensor(batch[label_name], dtype=torch.int64)
        return batch

-    def tf_call(self, features):
-        import tensorflow as tf
-
-        label_name = "label" if "label" in features[0] else "labels"
-        labels = [feature[label_name] for feature in features] if label_name in features[0] else None
-        batch = pad_without_fast_tokenizer_warning(
-            self.tokenizer,
-            features,
-            padding=self.padding,
-            max_length=self.max_length,
-            pad_to_multiple_of=self.pad_to_multiple_of,
-            # Conversion to tensors will fail if we have labels as they are not of the same length yet.
-            return_tensors="tf" if labels is None else None,
-        )
-
-        if labels is None:
-            return batch
-
-        sequence_length = tf.convert_to_tensor(batch["input_ids"]).shape[1]
-        padding_side = self.tokenizer.padding_side
-        if padding_side == "right":
-            batch["labels"] = [
-                list(label) + [self.label_pad_token_id] * (sequence_length - len(label)) for label in labels
-            ]
-        else:
-            batch["labels"] = [
-                [self.label_pad_token_id] * (sequence_length - len(label)) + list(label) for label in labels
-            ]
-
-        batch = {k: tf.convert_to_tensor(v, dtype=tf.int64) for k, v in batch.items()}
-        return batch
-
    def numpy_call(self, features):
        label_name = "label" if "label" in features[0] else "labels"
        labels = [feature[label_name] for feature in features] if label_name in features[0] else None
@ -463,44 +386,6 @@ def _torch_collate_batch(examples, tokenizer, pad_to_multiple_of: Optional[int]
    return result


-def _tf_collate_batch(examples, tokenizer, pad_to_multiple_of: Optional[int] = None):
-    import tensorflow as tf
-
-    """Collate `examples` into a batch, using the information in `tokenizer` for padding if necessary."""
-    # Tensorize if necessary.
-    if isinstance(examples[0], (list, tuple)):
-        examples = [tf.convert_to_tensor(e, dtype=tf.int64) for e in examples]
-
-    # Check if padding is necessary.
-    length_of_first = len(examples[0])
-    are_tensors_same_length = all(len(x) == length_of_first for x in examples)
-    if are_tensors_same_length and (pad_to_multiple_of is None or length_of_first % pad_to_multiple_of == 0):
-        return tf.stack(examples, axis=0)
-
-    # If yes, check if we have a `pad_token`.
-    if tokenizer.pad_token is None:
-        raise ValueError(
-            "You are attempting to pad samples but the tokenizer you are using"
-            f" ({tokenizer.__class__.__name__}) does not have a pad token."
-        )
-
-    # Creating the full tensor and filling it with our data.
-    max_length = max(len(x) for x in examples)
-    if pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0):
-        max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of
-    # result = examples[0].new_full([len(examples), max_length], tokenizer.pad_token_id)
-    result = []
-    rank = tf.rank(examples[0])
-    paddings = np.zeros((rank, 2), dtype=np.int32)
-    for example in examples:
-        if tokenizer.padding_side == "right":
-            paddings[0, 1] = max_length - len(example)
-        else:
-            paddings[0, 0] = max_length - len(example)
-        result.append(tf.pad(example, paddings, constant_values=tokenizer.pad_token_id))
-    return tf.stack(result, axis=0)
-
-
 def _numpy_collate_batch(examples, tokenizer, pad_to_multiple_of: Optional[int] = None):
    """Collate `examples` into a batch, using the information in `tokenizer` for padding if necessary."""
    # Tensorize if necessary.
@ -560,7 +445,7 @@ class DataCollatorForMultipleChoice(DataCollatorMixin):
            This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability >=
            7.5 (Volta).
        return_tensors (`str`, *optional*, defaults to `"pt"`):
-            The type of Tensor to return. Allowable values are "np", "pt" and "tf".
+            The type of Tensor to return. Allowable values are "np", or "pt".
    """

    tokenizer: PreTrainedTokenizerBase
@ -599,30 +484,6 @@ class DataCollatorForMultipleChoice(DataCollatorMixin):
        batch["labels"] = torch.tensor(labels, dtype=torch.int64)
        return batch

-    def tf_call(self, features):  # Implementation taken from the docs.
-        import tensorflow as tf
-
-        label_name = "label" if "label" in features[0] else "labels"
-        labels = [feature.pop(label_name) for feature in features]
-        batch_size = len(features)
-        num_choices = len(features[0]["input_ids"])
-        flattened_features = [
-            [{k: v[i] for k, v in feature.items()} for i in range(num_choices)] for feature in features
-        ]
-        flattened_features = sum(flattened_features, [])  # Sometimes written as list(chain(*flattened_features))
-
-        batch = self.tokenizer.pad(
-            flattened_features,
-            padding=self.padding,
-            max_length=self.max_length,
-            pad_to_multiple_of=self.pad_to_multiple_of,
-            return_tensors="tf",
-        )
-
-        batch = {k: tf.reshape(v, (batch_size, num_choices, -1)) for k, v in batch.items()}
-        batch["labels"] = tf.convert_to_tensor(labels, dtype=tf.int64)
-        return batch
-

@dataclass
 class DataCollatorForSeq2Seq:
@ -656,7 +517,7 @@ class DataCollatorForSeq2Seq:
        label_pad_token_id (`int`, *optional*, defaults to -100):
            The id to use when padding the labels (-100 will be automatically ignored by PyTorch loss functions).
        return_tensors (`str`, *optional*, defaults to `"pt"`):
-            The type of Tensor to return. Allowable values are "np", "pt" and "tf".
+            The type of Tensor to return. Allowable values are "np", or "pt".
    """

    tokenizer: PreTrainedTokenizerBase
@ -739,10 +600,6 @@ class DataCollatorForSeq2Seq:
                import torch

                batch["labels"] = torch.tensor(batch["labels"], dtype=torch.int64)
-            elif return_tensors == "tf":
-                import tensorflow as tf
-
-                batch["labels"] = tf.constant(batch["labels"], dtype=tf.int64)
            else:
                batch["labels"] = np.array(batch["labels"], dtype=np.int64)
        else:
@ -787,7 +644,7 @@ class DataCollatorForLanguageModeling(DataCollatorMixin):
        pad_to_multiple_of (`int`, *optional*):
            If set, will pad the sequence to a multiple of the provided value.
        return_tensors (`str`):
-            The type of Tensor to return. Allowable values are "np", "pt" and "tf".
+            The type of Tensor to return. Allowable values are "np", or "pt".
        seed (`int`, *optional*):
            The seed to use for the random number generator for masking. If not provided, the global RNG will be used.

@ -828,7 +685,6 @@ class DataCollatorForLanguageModeling(DataCollatorMixin):
    mask_replace_prob: float = 0.8
    random_replace_prob: float = 0.1
    pad_to_multiple_of: Optional[int] = None
-    tf_experimental_compile: bool = False
    return_tensors: str = "pt"
    seed: Optional[int] = None

@ -852,11 +708,6 @@ class DataCollatorForLanguageModeling(DataCollatorMixin):
        self.mask_replace_prob = float(self.mask_replace_prob)
        self.random_replace_prob = float(self.random_replace_prob)

-        if self.tf_experimental_compile:
-            import tensorflow as tf
-
-            self.tf_mask_tokens = tf.function(self.tf_mask_tokens, jit_compile=True)
-
        self.generator = None

    def get_generator(self, seed):
@ -864,10 +715,6 @@ class DataCollatorForLanguageModeling(DataCollatorMixin):
            import torch

            return torch.Generator().manual_seed(seed)
-        elif self.return_tensors == "tf":
-            import tensorflow as tf
-
-            return tf.random.Generator.from_seed(seed)
        else:
            import numpy as np

@ -882,7 +729,6 @@ class DataCollatorForLanguageModeling(DataCollatorMixin):
            # worker's generator, generated as the main seed + the worker's ID.
            # (https://pytorch.org/docs/stable/data.html#randomness-in-multi-process-data-loading)
            # Only PyTorch DataLoader allows us to access the worker ID, and so we check for this.
-            # For other frameworks, we will throw an error.
            import torch

            worker_info = torch.utils.data.get_worker_info()
@ -897,111 +743,6 @@ class DataCollatorForLanguageModeling(DataCollatorMixin):

            self.generator = self.get_generator(self.seed + worker_info.id)

-    @staticmethod
-    def tf_bernoulli(shape, probability, generator=None):
-        import tensorflow as tf
-
-        prob_matrix = tf.fill(shape, probability)
-        # if generator exists, use it to generate the random numbers
-        # otherwise, use the global RNG
-        if generator:
-            return tf.cast(prob_matrix - generator.uniform(shape, 0, 1) >= 0, tf.bool)
-        else:
-            return tf.cast(prob_matrix - tf.random.uniform(shape, 0, 1) >= 0, tf.bool)
-
-    def tf_mask_tokens(
-        self, inputs: Any, vocab_size, mask_token_id, special_tokens_mask: Optional[Any] = None
-    ) -> tuple[Any, Any]:
-        """
-        Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original.
-        """
-        import tensorflow as tf
-
-        mask_token_id = tf.cast(mask_token_id, inputs.dtype)
-
-        input_shape = tf.shape(inputs)
-        # 1 for a special token, 0 for a normal token in the special tokens mask
-        # We sample a few tokens in each sequence for MLM training (with probability `self.mlm_probability`)
-        masked_indices = self.tf_bernoulli(input_shape, self.mlm_probability, self.generator) & ~special_tokens_mask
-        # Replace unmasked indices with -100 in the labels since we only compute loss on masked tokens
-        labels = tf.where(masked_indices, inputs, -100)
-
-        # mask_replace_prob% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK])
-        indices_replaced = self.tf_bernoulli(input_shape, self.mask_replace_prob, self.generator) & masked_indices
-
-        inputs = tf.where(indices_replaced, mask_token_id, inputs)
-
-        if self.mask_replace_prob == 1 or self.random_replace_prob == 0:
-            return inputs, labels
-
-        remaining_prob = 1 - self.mask_replace_prob
-        # scaling the random_replace_prob to the remaining probability for example if
-        # mask_replace_prob = 0.8 and random_replace_prob = 0.1,
-        # then random_replace_prob_scaled = 0.1 / 0.2 = 0.5
-        random_replace_prob_scaled = self.random_replace_prob / remaining_prob
-        # random_replace_prob% of the time, we replace masked input tokens with random word
-        indices_random = (
-            self.tf_bernoulli(input_shape, random_replace_prob_scaled, self.generator)
-            & masked_indices
-            & ~indices_replaced
-        )
-
-        if self.generator:
-            random_words = self.generator.uniform(input_shape, maxval=vocab_size, dtype=inputs.dtype)
-        else:
-            random_words = tf.random.uniform(input_shape, maxval=vocab_size, dtype=inputs.dtype)
-
-        inputs = tf.where(indices_random, random_words, inputs)
-
-        # The rest of the time ((1-random_replace_prob-mask_replace_prob)% of the time) we keep the masked input tokens unchanged
-        return inputs, labels
-
-    def tf_call(self, examples: list[Union[list[int], Any, dict[str, Any]]]) -> dict[str, Any]:
-        import tensorflow as tf
-
-        if self.seed and self.generator is None:
-            # If we have a seed, we need to create a generator object. Subsequent calls to this function will use the same generator.
-            # If no seed supplied, we will use the global RNG
-            self.create_rng()
-
-        # Handle dict or lists with proper padding and conversion to tensor.
-        if isinstance(examples[0], Mapping):
-            batch = pad_without_fast_tokenizer_warning(
-                self.tokenizer, examples, return_tensors="tf", pad_to_multiple_of=self.pad_to_multiple_of
-            )
-        else:
-            batch = {
-                "input_ids": _tf_collate_batch(examples, self.tokenizer, pad_to_multiple_of=self.pad_to_multiple_of)
-            }
-
-        # If special token mask has been preprocessed, pop it from the dict.
-        special_tokens_mask = batch.pop("special_tokens_mask", None)
-        if self.mlm:
-            if special_tokens_mask is None:
-                special_tokens_mask = [
-                    self.tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True)
-                    for val in batch["input_ids"].numpy().tolist()
-                ]
-                # Cannot directly create as bool
-                special_tokens_mask = tf.cast(tf.convert_to_tensor(special_tokens_mask, dtype=tf.int64), tf.bool)
-            else:
-                special_tokens_mask = tf.cast(special_tokens_mask, tf.bool)
-            batch["input_ids"], batch["labels"] = self.tf_mask_tokens(
-                tf.cast(batch["input_ids"], tf.int64),
-                special_tokens_mask=special_tokens_mask,
-                mask_token_id=self.tokenizer.mask_token_id,
-                vocab_size=len(self.tokenizer),
-            )
-        else:
-            labels = batch["input_ids"]
-            if self.tokenizer.pad_token_id is not None:
-                # Replace self.tokenizer.pad_token_id with -100
-                labels = tf.where(labels == self.tokenizer.pad_token_id, -100, labels)
-            else:
-                labels = tf.identity(labels)  # Makes a copy, just in case
-            batch["labels"] = labels
-        return batch
-
    def torch_call(self, examples: list[Union[list[int], Any, dict[str, Any]]]) -> dict[str, Any]:
        # Handle dict or lists with proper padding and conversion to tensor.

@ -1226,41 +967,6 @@ class DataCollatorForWholeWordMask(DataCollatorForLanguageModeling):
        inputs, labels = self.torch_mask_tokens(batch_input, batch_mask)
        return {"input_ids": inputs, "labels": labels}

-    def tf_call(self, examples: list[Union[list[int], Any, dict[str, Any]]]) -> dict[str, Any]:
-        import tensorflow as tf
-
-        if self.seed and self.generator is None:
-            # If we have a seed, we need to create a generator object. Subsequent calls to this function will use the same generator.
-            # If no seed supplied, we will use the global RNG
-            self.create_rng()
-
-        if isinstance(examples[0], Mapping):
-            input_ids = [e["input_ids"] for e in examples]
-        else:
-            input_ids = examples
-            examples = [{"input_ids": e} for e in examples]
-
-        batch_input = _tf_collate_batch(input_ids, self.tokenizer, pad_to_multiple_of=self.pad_to_multiple_of)
-
-        mask_labels = []
-        for e in examples:
-            ref_tokens = []
-            for id in tolist(e["input_ids"]):
-                token = self.tokenizer._convert_id_to_token(id)
-                ref_tokens.append(token)
-
-            # For Chinese tokens, we need extra inf to mark sub-word, e.g [喜,欢]-> [喜，##欢]
-            if "chinese_ref" in e:
-                ref_pos = tolist(e["chinese_ref"])
-                len_seq = len(e["input_ids"])
-                for i in range(len_seq):
-                    if i in ref_pos:
-                        ref_tokens[i] = "##" + ref_tokens[i]
-            mask_labels.append(self._whole_word_mask(ref_tokens))
-        batch_mask = _tf_collate_batch(mask_labels, self.tokenizer, pad_to_multiple_of=self.pad_to_multiple_of)
-        inputs, labels = self.tf_mask_tokens(tf.cast(batch_input, tf.int64), batch_mask)
-        return {"input_ids": inputs, "labels": labels}
-
    def numpy_call(self, examples: list[Union[list[int], Any, dict[str, Any]]]) -> dict[str, Any]:
        if self.seed and self.generator is None:
            # If we have a seed, we need to create a generator object. Subsequent calls to this function will use the same generator.
@ -1307,13 +1013,6 @@ class DataCollatorForWholeWordMask(DataCollatorForLanguageModeling):
            indices = torch.randperm(len(cand_indexes), generator=self.generator)
            return [cand_indexes[i] for i in indices]

-        elif self.return_tensors == "tf":
-            import tensorflow as tf
-
-            seed = self.generator.make_seeds(2)[0]
-            indices = tf.random.experimental.stateless_shuffle(tf.range(len(cand_indexes)), seed=seed).numpy().tolist()
-            return [cand_indexes[i] for i in indices]
-
        elif self.return_tensors == "np":
            self.generator.shuffle(cand_indexes)
            return cand_indexes
@ -1414,66 +1113,6 @@ class DataCollatorForWholeWordMask(DataCollatorForLanguageModeling):
        # The rest of the time ((1-random_replacement_prob-mask_replace_prob)% of the time) we keep the masked input tokens unchanged
        return inputs, labels

-    def tf_mask_tokens(self, inputs: Any, mask_labels: Any) -> tuple[Any, Any]:
-        """
-        Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original. Set
-        'mask_labels' means we use whole word mask (wwm), we directly mask idxs according to it's ref.
-        """
-        import tensorflow as tf
-
-        input_shape = tf.shape(inputs)
-        if self.tokenizer.mask_token is None:
-            raise ValueError(
-                "This tokenizer does not have a mask token which is necessary for masked language modeling. Remove the"
-                " --mlm flag if you want to use this tokenizer."
-            )
-        labels = tf.identity(inputs)
-        # We sample a few tokens in each sequence for masked-LM training (with probability args.mlm_probability defaults to 0.15 in Bert/RoBERTa)
-
-        masked_indices = tf.cast(mask_labels, tf.bool)
-
-        special_tokens_mask = [
-            self.tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels
-        ]
-        masked_indices = masked_indices & ~tf.cast(special_tokens_mask, dtype=tf.bool)
-        if self.tokenizer.pad_token is not None:
-            padding_mask = inputs == self.tokenizer.pad_token_id
-            masked_indices = masked_indices & ~padding_mask
-
-        # Replace unmasked indices with -100 in the labels since we only compute loss on masked tokens
-        labels = tf.where(masked_indices, inputs, -100)
-
-        # mask_replace_prob% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK])
-        indices_replaced = self.tf_bernoulli(input_shape, self.mask_replace_prob, self.generator) & masked_indices
-
-        inputs = tf.where(indices_replaced, self.tokenizer.mask_token_id, inputs)
-
-        if self.mask_replace_prob == 1 or self.random_replace_prob == 0:
-            return inputs, labels
-
-        remaining_prob = 1 - self.mask_replace_prob
-        # scaling the random_replace_prob to the remaining probability for example if
-        # mask_replace_prob = 0.8 and random_replace_prob = 0.1,
-        # then random_replace_prob_scaled = 0.1 / 0.2 = 0.5
-        random_replace_prob_scaled = self.random_replace_prob / remaining_prob
-
-        # random_replace_prob% of the time, we replace masked input tokens with random word
-        indices_random = (
-            self.tf_bernoulli(input_shape, random_replace_prob_scaled, self.generator)
-            & masked_indices
-            & ~indices_replaced
-        )
-
-        if self.generator:
-            random_words = self.generator.uniform(input_shape, maxval=len(self.tokenizer), dtype=tf.int64)
-        else:
-            random_words = tf.random.uniform(input_shape, maxval=len(self.tokenizer), dtype=tf.int64)
-
-        inputs = tf.where(indices_random, random_words, inputs)
-
-        # The rest of the time ((1-mask_replace_prob-random_replace_prob)% of the time) we keep the masked input tokens unchanged
-        return inputs, labels
-
    def numpy_mask_tokens(self, inputs: Any, mask_labels: Any) -> tuple[Any, Any]:
        """
        Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original. Set
@ -1543,7 +1182,7 @@ class DataCollatorForWholeWordMask(DataCollatorForLanguageModeling):
 def tolist(x):
    if isinstance(x, list):
        return x
-    elif hasattr(x, "numpy"):  # Checks for TF tensors without needing the import
+    elif hasattr(x, "numpy"):
        x = x.numpy()
    return x.tolist()

@ -1652,13 +1291,6 @@ class DataCollatorForPermutationLanguageModeling(DataCollatorMixin):
        inputs, perm_mask, target_mapping, labels = self.torch_mask_tokens(batch)
        return {"input_ids": inputs, "perm_mask": perm_mask, "target_mapping": target_mapping, "labels": labels}

-    def tf_call(self, examples: list[Union[list[int], Any, dict[str, Any]]]) -> dict[str, Any]:
-        if isinstance(examples[0], Mapping):
-            examples = [e["input_ids"] for e in examples]
-        batch = _tf_collate_batch(examples, self.tokenizer)
-        inputs, perm_mask, target_mapping, labels = self.tf_mask_tokens(batch)
-        return {"input_ids": inputs, "perm_mask": perm_mask, "target_mapping": target_mapping, "labels": labels}
-
    def numpy_call(self, examples: list[Union[list[int], Any, dict[str, Any]]]) -> dict[str, Any]:
        if isinstance(examples[0], Mapping):
            examples = [e["input_ids"] for e in examples]
@ -1765,113 +1397,6 @@ class DataCollatorForPermutationLanguageModeling(DataCollatorMixin):

        return inputs.long(), perm_mask, target_mapping, labels.long()

-    def tf_mask_tokens(self, inputs: Any) -> tuple[Any, Any, Any, Any]:
-        """
-        The masked tokens to be predicted for a particular sequence are determined by the following algorithm:
-
-            0. Start from the beginning of the sequence by setting `cur_len = 0` (number of tokens processed so far).
-            1. Sample a `span_length` from the interval `[1, max_span_length]` (length of span of tokens to be masked)
-            2. Reserve a context of length `context_length = span_length / plm_probability` to surround span to be
-               masked
-            3. Sample a starting point `start_index` from the interval `[cur_len, cur_len + context_length -
-               span_length]` and mask tokens `start_index:start_index + span_length`
-            4. Set `cur_len = cur_len + context_length`. If `cur_len < max_len` (i.e. there are tokens remaining in the
-               sequence to be processed), repeat from Step 1.
-        """
-        import tensorflow as tf
-
-        if self.tokenizer.mask_token is None:
-            raise ValueError(
-                "This tokenizer does not have a mask token which is necessary for permutation language modeling."
-                " Please add a mask token if you want to use this tokenizer."
-            )
-
-        if tf.shape(inputs)[1] % 2 != 0:
-            raise ValueError(
-                "This collator requires that sequence lengths be even to create a leakage-free perm_mask. Please see"
-                " relevant comments in source code for details."
-            )
-
-        labels = tf.identity(inputs)
-        # Creating the mask and target_mapping tensors
-        masked_indices = np.full(labels.shape.as_list(), 0, dtype=bool)
-        labels_shape = tf.shape(labels)
-        target_mapping = np.zeros((labels_shape[0], labels_shape[1], labels_shape[1]), dtype=np.float32)
-
-        for i in range(len(labels)):
-            # Start from the beginning of the sequence by setting `cur_len = 0` (number of tokens processed so far).
-            cur_len = 0
-            max_len = tf.shape(labels)[1]
-
-            while cur_len < max_len:
-                # Sample a `span_length` from the interval `[1, max_span_length]` (length of span of tokens to be masked)
-                span_length = randint(1, self.max_span_length + 1)
-                # Reserve a context of length `context_length = span_length / plm_probability` to surround the span to be masked
-                context_length = int(span_length / self.plm_probability)
-                # Sample a starting point `start_index` from the interval `[cur_len, cur_len + context_length - span_length]` and mask tokens `start_index:start_index + span_length`
-                start_index = cur_len + randint(0, context_length - span_length + 1)
-                masked_indices[i, start_index : start_index + span_length] = 1
-                # Set `cur_len = cur_len + context_length`
-                cur_len += context_length
-
-            # Since we're replacing non-masked tokens with -100 in the labels tensor instead of skipping them altogether,
-            # the i-th predict corresponds to the i-th token.
-            target_mapping[i] = np.eye(labels_shape[1])
-        masked_indices = tf.cast(tf.convert_to_tensor(masked_indices), dtype=tf.bool)
-        target_mapping = tf.convert_to_tensor(target_mapping)
-        special_tokens_mask = tf.convert_to_tensor(
-            [
-                self.tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True)
-                for val in labels.numpy().tolist()
-            ],
-        )
-        special_tokens_mask = tf.cast(special_tokens_mask, dtype=tf.bool)
-        masked_indices = masked_indices & ~special_tokens_mask
-        if self.tokenizer.pad_token is not None:
-            padding_mask = labels == self.tokenizer.pad_token_id
-            masked_indices = masked_indices & ~padding_mask
-
-        # Mask indicating non-functional tokens, where functional tokens are [SEP], [CLS], padding, etc.
-        non_func_mask = ~(padding_mask | special_tokens_mask)
-
-        inputs = tf.where(masked_indices, self.tokenizer.mask_token_id, inputs)
-        labels = tf.where(masked_indices, labels, -100)  # We only compute loss on masked tokens
-
-        perm_mask = []
-
-        for i in range(len(labels)):
-            # Generate permutation indices i.e. sample a random factorisation order for the sequence. This will
-            # determine which tokens a given token can attend to (encoded in `perm_mask`).
-            # Note: Length of token sequence being permuted has to be less than or equal to reused sequence length
-            # (see documentation for `mems`), otherwise information may leak through due to reuse. In this implementation,
-            # we assume that reused length is half of sequence length and permutation length is equal to reused length.
-            # This requires that the sequence length be even.
-
-            # Create a linear factorisation order
-            # tf.range is the equivalent of torch.arange
-            perm_index = tf.range(labels_shape[1])
-            # Split this into two halves, assuming that half the sequence is reused each time
-            perm_index = tf.transpose(tf.reshape(perm_index, (-1, labels_shape[1] // 2)))
-            # Permute the two halves such that they do not cross over
-            perm_index = tf.random.shuffle(perm_index)  # Shuffles along the first dimension
-            # Flatten this out into the desired permuted factorisation order
-            perm_index = tf.reshape(tf.transpose(perm_index), (-1,))
-            # Set the permutation indices of non-masked (non-functional) tokens to the
-            # smallest index (-1) so that:
-            # (1) They can be seen by all other positions
-            # (2) They cannot see masked positions, so there won't be information leak
-            perm_index = tf.where(~masked_indices[i] & non_func_mask[i], -1, perm_index)
-            # The logic for whether the i-th token can attend on the j-th token based on the factorisation order:
-            # 0 (can attend): If perm_index[i] > perm_index[j] or j is neither masked nor a functional token
-            # 1 (cannot attend): If perm_index[i] <= perm_index[j] and j is either masked or a functional token
-            perm_mask.append(
-                (tf.reshape(perm_index, (labels_shape[1], 1)) <= tf.reshape(perm_index, (1, labels_shape[1])))
-                & masked_indices[i]
-            )
-        perm_mask = tf.stack(perm_mask, axis=0)
-
-        return tf.cast(inputs, tf.int64), tf.cast(perm_mask, tf.float32), target_mapping, tf.cast(labels, tf.int64)
-
    def numpy_mask_tokens(self, inputs: Any) -> tuple[Any, Any, Any, Any]:
        """
        The masked tokens to be predicted for a particular sequence are determined by the following algorithm:
--- a/src/transformers/data/datasets/glue.py
+++ b/src/transformers/data/datasets/glue.py
@ -69,10 +69,6 @@ class Split(Enum):


 class GlueDataset(Dataset):
-    """
-    This will be superseded by a framework-agnostic approach soon.
-    """
-
    args: GlueDataTrainingArguments
    output_mode: str
    features: list[InputFeatures]
--- a/src/transformers/data/datasets/language_modeling.py
+++ b/src/transformers/data/datasets/language_modeling.py
@ -38,10 +38,6 @@ DEPRECATION_WARNING = (


 class TextDataset(Dataset):
-    """
-    This will be superseded by a framework-agnostic approach soon.
-    """
-
    def __init__(
        self,
        tokenizer: PreTrainedTokenizer,
@ -111,10 +107,6 @@ class TextDataset(Dataset):


 class LineByLineTextDataset(Dataset):
-    """
-    This will be superseded by a framework-agnostic approach soon.
-    """
-
    def __init__(self, tokenizer: PreTrainedTokenizer, file_path: str, block_size: int):
        warnings.warn(
            DEPRECATION_WARNING.format(
@ -144,10 +136,6 @@ class LineByLineTextDataset(Dataset):


 class LineByLineWithRefDataset(Dataset):
-    """
-    This will be superseded by a framework-agnostic approach soon.
-    """
-
    def __init__(self, tokenizer: PreTrainedTokenizer, file_path: str, block_size: int, ref_path: str):
        warnings.warn(
            DEPRECATION_WARNING.format(
@ -344,10 +332,6 @@ class LineByLineWithSOPTextDataset(Dataset):


 class TextDatasetForNextSentencePrediction(Dataset):
-    """
-    This will be superseded by a framework-agnostic approach soon.
-    """
-
    def __init__(
        self,
        tokenizer: PreTrainedTokenizer,
--- a/src/transformers/data/datasets/squad.py
+++ b/src/transformers/data/datasets/squad.py
@ -107,10 +107,6 @@ class Split(Enum):


 class SquadDataset(Dataset):
-    """
-    This will be superseded by a framework-agnostic approach soon.
-    """
-
    args: SquadDataTrainingArguments
    features: list[SquadFeatures]
    mode: Split
--- a/src/transformers/data/processors/glue.py
+++ b/src/transformers/data/processors/glue.py
@ -17,18 +17,14 @@

 import os
 import warnings
-from dataclasses import asdict
 from enum import Enum
 from typing import Optional, Union

 from ...tokenization_utils import PreTrainedTokenizer
-from ...utils import is_tf_available, logging
+from ...utils import logging
 from .utils import DataProcessor, InputExample, InputFeatures


-if is_tf_available():
-    import tensorflow as tf
-
 logger = logging.get_logger(__name__)

 DEPRECATION_WARNING = (
@ -39,7 +35,7 @@ DEPRECATION_WARNING = (


 def glue_convert_examples_to_features(
-    examples: Union[list[InputExample], "tf.data.Dataset"],
+    examples: list[InputExample],
    tokenizer: PreTrainedTokenizer,
    max_length: Optional[int] = None,
    task=None,
@ -50,7 +46,7 @@ def glue_convert_examples_to_features(
    Loads a data file into a list of `InputFeatures`

    Args:
-        examples: List of `InputExamples` or `tf.data.Dataset` containing the examples.
+        examples: List of `InputExamples` containing the examples.
        tokenizer: Instance of a tokenizer that will tokenize the examples
        max_length: Maximum example length. Defaults to the tokenizer's max_len
        task: GLUE task
@ -58,54 +54,15 @@ def glue_convert_examples_to_features(
        output_mode: String indicating the output mode. Either `regression` or `classification`

    Returns:
-        If the `examples` input is a `tf.data.Dataset`, will return a `tf.data.Dataset` containing the task-specific
-        features. If the input is a list of `InputExamples`, will return a list of task-specific `InputFeatures` which
-        can be fed to the model.
+        Will return a list of task-specific `InputFeatures` which can be fed to the model.

    """
    warnings.warn(DEPRECATION_WARNING.format("function"), FutureWarning)
-    if is_tf_available() and isinstance(examples, tf.data.Dataset):
-        if task is None:
-            raise ValueError("When calling glue_convert_examples_to_features from TF, the task parameter is required.")
-        return _tf_glue_convert_examples_to_features(examples, tokenizer, max_length=max_length, task=task)
    return _glue_convert_examples_to_features(
        examples, tokenizer, max_length=max_length, task=task, label_list=label_list, output_mode=output_mode
    )


-if is_tf_available():
-
-    def _tf_glue_convert_examples_to_features(
-        examples: tf.data.Dataset,
-        tokenizer: PreTrainedTokenizer,
-        task=str,
-        max_length: Optional[int] = None,
-    ) -> tf.data.Dataset:
-        """
-        Returns:
-            A `tf.data.Dataset` containing the task-specific features.
-
-        """
-        processor = glue_processors[task]()
-        examples = [processor.tfds_map(processor.get_example_from_tensor_dict(example)) for example in examples]
-        features = glue_convert_examples_to_features(examples, tokenizer, max_length=max_length, task=task)
-        label_type = tf.float32 if task == "sts-b" else tf.int64
-
-        def gen():
-            for ex in features:
-                d = {k: v for k, v in asdict(ex).items() if v is not None}
-                label = d.pop("label")
-                yield (d, label)
-
-        input_names = tokenizer.model_input_names
-
-        return tf.data.Dataset.from_generator(
-            gen,
-            (dict.fromkeys(input_names, tf.int32), label_type),
-            ({k: tf.TensorShape([None]) for k in input_names}, tf.TensorShape([])),
-        )
-
-
 def _glue_convert_examples_to_features(
    examples: list[InputExample],
    tokenizer: PreTrainedTokenizer,
--- a/src/transformers/data/processors/squad.py
+++ b/src/transformers/data/processors/squad.py
@ -24,7 +24,7 @@ from tqdm import tqdm

 from ...models.bert.tokenization_bert import whitespace_tokenize
 from ...tokenization_utils_base import BatchEncoding, PreTrainedTokenizerBase, TruncationStrategy
-from ...utils import is_tf_available, is_torch_available, is_torch_hpu_available, logging
+from ...utils import is_torch_available, is_torch_hpu_available, logging
 from .utils import DataProcessor


@ -36,8 +36,6 @@ if is_torch_available():
    import torch
    from torch.utils.data import TensorDataset

-if is_tf_available():
-    import tensorflow as tf

 logger = logging.get_logger(__name__)

@ -244,7 +242,6 @@ def squad_convert_example_to_features(
        cls_index = span["input_ids"].index(tokenizer.cls_token_id)

        # p_mask: mask with 1 for token than cannot be in the answer (0 for token which can be in an answer)
-        # Original TF implementation also keep the classification token (set to 0)
        p_mask = np.ones_like(span["token_type_ids"])
        if tokenizer.padding_side == "right":
            p_mask[len(truncated_query) + sequence_added_tokens :] = 0
@ -338,8 +335,8 @@ def squad_convert_examples_to_features(
        max_query_length: The maximum length of the query.
        is_training: whether to create features for model evaluation or model training.
        padding_strategy: Default to "max_length". Which padding strategy to use
-        return_dataset: Default False. Either 'pt' or 'tf'.
-            if 'pt': returns a torch.data.TensorDataset, if 'tf': returns a tf.data.Dataset
+        return_dataset: Default False. Can also be 'pt'.
+            if 'pt': returns a torch.data.TensorDataset.
        threads: multiple processing threads.


@ -430,110 +427,6 @@ def squad_convert_examples_to_features(
            )

        return features, dataset
-    elif return_dataset == "tf":
-        if not is_tf_available():
-            raise RuntimeError("TensorFlow must be installed to return a TensorFlow dataset.")
-
-        def gen():
-            for i, ex in enumerate(features):
-                if ex.token_type_ids is None:
-                    yield (
-                        {
-                            "input_ids": ex.input_ids,
-                            "attention_mask": ex.attention_mask,
-                            "feature_index": i,
-                            "qas_id": ex.qas_id,
-                        },
-                        {
-                            "start_positions": ex.start_position,
-                            "end_positions": ex.end_position,
-                            "cls_index": ex.cls_index,
-                            "p_mask": ex.p_mask,
-                            "is_impossible": ex.is_impossible,
-                        },
-                    )
-                else:
-                    yield (
-                        {
-                            "input_ids": ex.input_ids,
-                            "attention_mask": ex.attention_mask,
-                            "token_type_ids": ex.token_type_ids,
-                            "feature_index": i,
-                            "qas_id": ex.qas_id,
-                        },
-                        {
-                            "start_positions": ex.start_position,
-                            "end_positions": ex.end_position,
-                            "cls_index": ex.cls_index,
-                            "p_mask": ex.p_mask,
-                            "is_impossible": ex.is_impossible,
-                        },
-                    )
-
-        # Why have we split the batch into a tuple? PyTorch just has a list of tensors.
-        if "token_type_ids" in tokenizer.model_input_names:
-            train_types = (
-                {
-                    "input_ids": tf.int32,
-                    "attention_mask": tf.int32,
-                    "token_type_ids": tf.int32,
-                    "feature_index": tf.int64,
-                    "qas_id": tf.string,
-                },
-                {
-                    "start_positions": tf.int64,
-                    "end_positions": tf.int64,
-                    "cls_index": tf.int64,
-                    "p_mask": tf.int32,
-                    "is_impossible": tf.int32,
-                },
-            )
-
-            train_shapes = (
-                {
-                    "input_ids": tf.TensorShape([None]),
-                    "attention_mask": tf.TensorShape([None]),
-                    "token_type_ids": tf.TensorShape([None]),
-                    "feature_index": tf.TensorShape([]),
-                    "qas_id": tf.TensorShape([]),
-                },
-                {
-                    "start_positions": tf.TensorShape([]),
-                    "end_positions": tf.TensorShape([]),
-                    "cls_index": tf.TensorShape([]),
-                    "p_mask": tf.TensorShape([None]),
-                    "is_impossible": tf.TensorShape([]),
-                },
-            )
-        else:
-            train_types = (
-                {"input_ids": tf.int32, "attention_mask": tf.int32, "feature_index": tf.int64, "qas_id": tf.string},
-                {
-                    "start_positions": tf.int64,
-                    "end_positions": tf.int64,
-                    "cls_index": tf.int64,
-                    "p_mask": tf.int32,
-                    "is_impossible": tf.int32,
-                },
-            )
-
-            train_shapes = (
-                {
-                    "input_ids": tf.TensorShape([None]),
-                    "attention_mask": tf.TensorShape([None]),
-                    "feature_index": tf.TensorShape([]),
-                    "qas_id": tf.TensorShape([]),
-                },
-                {
-                    "start_positions": tf.TensorShape([]),
-                    "end_positions": tf.TensorShape([]),
-                    "cls_index": tf.TensorShape([]),
-                    "p_mask": tf.TensorShape([None]),
-                    "is_impossible": tf.TensorShape([]),
-                },
-            )
-
-        return tf.data.Dataset.from_generator(gen, train_types, train_shapes)
    else:
        return features

--- a/src/transformers/data/processors/utils.py
+++ b/src/transformers/data/processors/utils.py
@ -20,7 +20,7 @@ import json
 from dataclasses import dataclass
 from typing import Optional, Union

-from ...utils import is_tf_available, is_torch_available, logging
+from ...utils import is_torch_available, logging


 logger = logging.get_logger(__name__)
@ -82,7 +82,7 @@ class DataProcessor:

    def get_example_from_tensor_dict(self, tensor_dict):
        """
-        Gets an example from a dict with tensorflow tensors.
+        Gets an example from a dict.

        Args:
            tensor_dict: Keys and values should match the corresponding Glue
@ -251,9 +251,7 @@ class SingleSentenceClassificationProcessor(DataProcessor):
                values)

        Returns:
-            If the `examples` input is a `tf.data.Dataset`, will return a `tf.data.Dataset` containing the
-            task-specific features. If the input is a list of `InputExamples`, will return a list of task-specific
-            `InputFeatures` which can be fed to the model.
+            Will return a list of task-specific `InputFeatures` which can be fed to the model.

        """
        if max_length is None:
@ -315,21 +313,6 @@ class SingleSentenceClassificationProcessor(DataProcessor):

        if return_tensors is None:
            return features
-        elif return_tensors == "tf":
-            if not is_tf_available():
-                raise RuntimeError("return_tensors set to 'tf' but TensorFlow 2.0 can't be imported")
-            import tensorflow as tf
-
-            def gen():
-                for ex in features:
-                    yield ({"input_ids": ex.input_ids, "attention_mask": ex.attention_mask}, ex.label)
-
-            dataset = tf.data.Dataset.from_generator(
-                gen,
-                ({"input_ids": tf.int32, "attention_mask": tf.int32}, tf.int64),
-                ({"input_ids": tf.TensorShape([None]), "attention_mask": tf.TensorShape([None])}, tf.TensorShape([])),
-            )
-            return dataset
        elif return_tensors == "pt":
            if not is_torch_available():
                raise RuntimeError("return_tensors set to 'pt' but PyTorch can't be imported")
@ -346,4 +329,4 @@ class SingleSentenceClassificationProcessor(DataProcessor):
            dataset = TensorDataset(all_input_ids, all_attention_mask, all_labels)
            return dataset
        else:
-            raise ValueError("return_tensors should be one of 'tf' or 'pt'")
+            raise ValueError("return_tensors should be `'pt'` or `None`")
--- a/src/transformers/dependency_versions_table.py
+++ b/src/transformers/dependency_versions_table.py
@ -18,7 +18,6 @@ deps = {
    "faiss-cpu": "faiss-cpu",
    "fastapi": "fastapi",
    "filelock": "filelock",
-    "flax": "flax>=0.4.1,<=0.7.0",
    "ftfy": "ftfy",
    "fugashi": "fugashi>=1.0",
    "GitPython": "GitPython<3.1.19",
@ -27,12 +26,8 @@ deps = {
    "huggingface-hub": "huggingface-hub>=0.34.0,<1.0",
    "importlib_metadata": "importlib_metadata",
    "ipadic": "ipadic>=1.0.0,<2.0",
-    "jax": "jax>=0.4.1,<=0.4.13",
-    "jaxlib": "jaxlib>=0.4.1,<=0.4.13",
    "jinja2": "jinja2>=3.1.0",
    "kenlm": "kenlm",
-    "keras": "keras>2.9,<2.16",
-    "keras-nlp": "keras-nlp>=0.3.1,<0.14.0",
    "kernels": "kernels>=0.6.1,<=0.9",
    "librosa": "librosa",
    "natten": "natten>=0.14.6,<0.15.0",
@ -75,18 +70,13 @@ deps = {
    "sagemaker": "sagemaker>=2.31.0",
    "schedulefree": "schedulefree>=1.2.6",
    "scikit-learn": "scikit-learn",
-    "scipy": "scipy<1.13.0",
+    "scipy": "scipy",
    "sentencepiece": "sentencepiece>=0.1.91,!=0.1.92",
    "sigopt": "sigopt",
    "starlette": "starlette",
    "sudachipy": "sudachipy>=0.6.6",
    "sudachidict_core": "sudachidict_core>=20220729",
    "tensorboard": "tensorboard",
-    "tensorflow-cpu": "tensorflow-cpu>2.9,<2.16",
-    "tensorflow": "tensorflow>2.9,<2.16",
-    "tensorflow-text": "tensorflow-text<2.16",
-    "tensorflow-probability": "tensorflow-probability<0.24",
-    "tf2onnx": "tf2onnx",
    "timeout-decorator": "timeout-decorator",
    "tiktoken": "tiktoken",
    "timm": "timm<=1.0.19,!=1.0.18",
--- a/src/transformers/feature_extraction_sequence_utils.py
+++ b/src/transformers/feature_extraction_sequence_utils.py
@ -20,7 +20,7 @@ from typing import Optional, Union
 import numpy as np

 from .feature_extraction_utils import BatchFeature, FeatureExtractionMixin
-from .utils import PaddingStrategy, TensorType, is_tf_tensor, is_torch_tensor, logging, to_numpy
+from .utils import PaddingStrategy, TensorType, is_torch_tensor, logging, to_numpy


 logger = logging.get_logger(__name__)
@ -74,7 +74,7 @@ class SequenceFeatureExtractor(FeatureExtractionMixin):

        <Tip>

-        If the `processed_features` passed are dictionary of numpy arrays, PyTorch tensors or TensorFlow tensors, the
+        If the `processed_features` passed are dictionary of numpy arrays or PyTorch tensors  the
        result will use the same type unless you provide a different tensor type with `return_tensors`. In the case of
        PyTorch tensors, you will lose the specific device of your tensors however.

@ -87,7 +87,7 @@ class SequenceFeatureExtractor(FeatureExtractionMixin):
                list[float]]]*) so you can use this method during preprocessing as well as in a PyTorch Dataloader
                collate function.

-                Instead of `list[float]` you can have tensors (numpy arrays, PyTorch tensors or TensorFlow tensors),
+                Instead of `list[float]` you can have tensors (numpy arrays or PyTorch tensors),
                see the note above for the return type.
            padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `True`):
                Select a strategy to pad the returned sequences (according to the model's padding side and padding
@ -116,7 +116,6 @@ class SequenceFeatureExtractor(FeatureExtractionMixin):
            return_tensors (`str` or [`~utils.TensorType`], *optional*):
                If set, will return tensors instead of list of python integers. Acceptable values are:

-                - `'tf'`: Return TensorFlow `tf.constant` objects.
                - `'pt'`: Return PyTorch `torch.Tensor` objects.
                - `'np'`: Return Numpy `np.ndarray` objects.
        """
@ -145,7 +144,7 @@ class SequenceFeatureExtractor(FeatureExtractionMixin):
                processed_features["attention_mask"] = []
            return processed_features

-        # If we have PyTorch/TF tensors or lists as inputs, we cast them as Numpy arrays
+        # If we have PyTorch tensors or lists as inputs, we cast them as Numpy arrays
        # and rebuild them afterwards if no return_tensors is specified
        # Note that we lose the specific device the tensor may be on for PyTorch

@ -159,16 +158,14 @@ class SequenceFeatureExtractor(FeatureExtractionMixin):
                first_element = required_input[index][0]

        if return_tensors is None:
-            if is_tf_tensor(first_element):
-                return_tensors = "tf"
-            elif is_torch_tensor(first_element):
+            if is_torch_tensor(first_element):
                return_tensors = "pt"
            elif isinstance(first_element, (int, float, list, tuple, np.ndarray)):
                return_tensors = "np"
            else:
                raise ValueError(
                    f"type of {first_element} unknown: {type(first_element)}. "
-                    "Should be one of a python, numpy, pytorch or tensorflow object."
+                    "Should be one of a python, numpy, or pytorch object."
                )

        for key, value in processed_features.items():
--- a/src/transformers/feature_extraction_utils.py
+++ b/src/transformers/feature_extraction_utils.py
@ -32,12 +32,9 @@ from .utils import (
    TensorType,
    copy_func,
    download_url,
-    is_flax_available,
-    is_jax_tensor,
    is_numpy_array,
    is_offline_mode,
    is_remote_url,
-    is_tf_available,
    is_torch_available,
    is_torch_device,
    is_torch_dtype,
@ -71,7 +68,7 @@ class BatchFeature(UserDict):
            Dictionary of lists/arrays/tensors returned by the __call__/pad methods ('input_values', 'attention_mask',
            etc.).
        tensor_type (`Union[None, str, TensorType]`, *optional*):
-            You can give a tensor_type here to convert the lists of integers in PyTorch/TensorFlow/Numpy Tensors at
+            You can give a tensor_type here to convert the lists of integers in PyTorch/Numpy Tensors at
            initialization.
    """

@ -110,21 +107,7 @@ class BatchFeature(UserDict):
        if not isinstance(tensor_type, TensorType):
            tensor_type = TensorType(tensor_type)

-        # Get a function reference for the correct framework
-        if tensor_type == TensorType.TENSORFLOW:
-            logger.warning_once(
-                "TensorFlow and JAX classes are deprecated and will be removed in Transformers v5. We "
-                "recommend migrating to PyTorch classes or pinning your version of Transformers."
-            )
-            if not is_tf_available():
-                raise ImportError(
-                    "Unable to convert output to TensorFlow tensors format, TensorFlow is not installed."
-                )
-            import tensorflow as tf
-
-            as_tensor = tf.constant
-            is_tensor = tf.is_tensor
-        elif tensor_type == TensorType.PYTORCH:
+        if tensor_type == TensorType.PYTORCH:
            if not is_torch_available():
                raise ImportError("Unable to convert output to PyTorch tensors format, PyTorch is not installed.")
            import torch  # noqa
@ -145,17 +128,6 @@ class BatchFeature(UserDict):
                    return torch.tensor(value)

            is_tensor = torch.is_tensor
-        elif tensor_type == TensorType.JAX:
-            logger.warning_once(
-                "TensorFlow and JAX classes are deprecated and will be removed in Transformers v5. We "
-                "recommend migrating to PyTorch classes or pinning your version of Transformers."
-            )
-            if not is_flax_available():
-                raise ImportError("Unable to convert output to JAX tensors format, JAX is not installed.")
-            import jax.numpy as jnp  # noqa: F811
-
-            as_tensor = jnp.array
-            is_tensor = is_jax_tensor
        else:

            def as_tensor(value, dtype=None):
--- a/src/transformers/file_utils.py
+++ b/src/transformers/file_utils.py
@ -31,7 +31,6 @@ from .utils import (
    ENV_VARS_TRUE_AND_AUTO_VALUES,
    ENV_VARS_TRUE_VALUES,
    FEATURE_EXTRACTOR_NAME,
-    FLAX_WEIGHTS_NAME,
    HF_MODULES_CACHE,
    HUGGINGFACE_CO_PREFIX,
    HUGGINGFACE_CO_RESOLVE_ENDPOINT,
@ -42,14 +41,9 @@ from .utils import (
    S3_BUCKET_PREFIX,
    SENTENCEPIECE_UNDERLINE,
    SPIECE_UNDERLINE,
-    TF2_WEIGHTS_NAME,
-    TF_WEIGHTS_NAME,
    TORCH_FX_REQUIRED_VERSION,
    TRANSFORMERS_CACHE,
    TRANSFORMERS_DYNAMIC_MODULE_NAME,
-    USE_JAX,
-    USE_TF,
-    USE_TORCH,
    WEIGHTS_INDEX_NAME,
    WEIGHTS_NAME,
    ContextManagers,
@ -79,7 +73,6 @@ from .utils import (
    is_datasets_available,
    is_detectron2_available,
    is_faiss_available,
-    is_flax_available,
    is_ftfy_available,
    is_g2p_en_available,
    is_in_notebook,
@ -106,9 +99,6 @@ from .utils import (
    is_spacy_available,
    is_speech_available,
    is_tensor,
-    is_tensorflow_probability_available,
-    is_tf2onnx_available,
-    is_tf_available,
    is_timm_available,
    is_tokenizers_available,
    is_torch_available,
--- a/src/transformers/generation/init.py
+++ b/src/transformers/generation/init.py
@ -14,7 +14,7 @@

 from typing import TYPE_CHECKING

-from ..utils import OptionalDependencyNotAvailable, _LazyModule, is_flax_available, is_tf_available, is_torch_available
+from ..utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available


 _import_structure = {
@ -123,71 +123,6 @@ else:
        "SynthIDTextWatermarkDetector",
    ]

-try:
-    if not is_tf_available():
-        raise OptionalDependencyNotAvailable()
-except OptionalDependencyNotAvailable:
-    pass
-else:
-    _import_structure["tf_logits_process"] = [
-        "TFForcedBOSTokenLogitsProcessor",
-        "TFForcedEOSTokenLogitsProcessor",
-        "TFForceTokensLogitsProcessor",
-        "TFLogitsProcessor",
-        "TFLogitsProcessorList",
-        "TFLogitsWarper",
-        "TFMinLengthLogitsProcessor",
-        "TFNoBadWordsLogitsProcessor",
-        "TFNoRepeatNGramLogitsProcessor",
-        "TFRepetitionPenaltyLogitsProcessor",
-        "TFSuppressTokensAtBeginLogitsProcessor",
-        "TFSuppressTokensLogitsProcessor",
-        "TFTemperatureLogitsWarper",
-        "TFTopKLogitsWarper",
-        "TFTopPLogitsWarper",
-    ]
-    _import_structure["tf_utils"] = [
-        "TFGenerationMixin",
-        "TFGreedySearchDecoderOnlyOutput",
-        "TFGreedySearchEncoderDecoderOutput",
-        "TFSampleEncoderDecoderOutput",
-        "TFSampleDecoderOnlyOutput",
-        "TFBeamSearchEncoderDecoderOutput",
-        "TFBeamSearchDecoderOnlyOutput",
-        "TFBeamSampleEncoderDecoderOutput",
-        "TFBeamSampleDecoderOnlyOutput",
-        "TFContrastiveSearchEncoderDecoderOutput",
-        "TFContrastiveSearchDecoderOnlyOutput",
-    ]
-
-try:
-    if not is_flax_available():
-        raise OptionalDependencyNotAvailable()
-except OptionalDependencyNotAvailable:
-    pass
-else:
-    _import_structure["flax_logits_process"] = [
-        "FlaxForcedBOSTokenLogitsProcessor",
-        "FlaxForcedEOSTokenLogitsProcessor",
-        "FlaxForceTokensLogitsProcessor",
-        "FlaxLogitsProcessor",
-        "FlaxLogitsProcessorList",
-        "FlaxLogitsWarper",
-        "FlaxMinLengthLogitsProcessor",
-        "FlaxSuppressTokensAtBeginLogitsProcessor",
-        "FlaxSuppressTokensLogitsProcessor",
-        "FlaxTemperatureLogitsWarper",
-        "FlaxTopKLogitsWarper",
-        "FlaxTopPLogitsWarper",
-        "FlaxWhisperTimeStampLogitsProcessor",
-        "FlaxNoRepeatNGramLogitsProcessor",
-    ]
-    _import_structure["flax_utils"] = [
-        "FlaxGenerationMixin",
-        "FlaxGreedySearchOutput",
-        "FlaxSampleOutput",
-        "FlaxBeamSearchOutput",
-    ]

 if TYPE_CHECKING:
    from .configuration_utils import (
@ -283,66 +218,6 @@ if TYPE_CHECKING:
            WatermarkDetectorOutput,
        )

-    try:
-        if not is_tf_available():
-            raise OptionalDependencyNotAvailable()
-    except OptionalDependencyNotAvailable:
-        pass
-    else:
-        from .tf_logits_process import (
-            TFForcedBOSTokenLogitsProcessor,
-            TFForcedEOSTokenLogitsProcessor,
-            TFForceTokensLogitsProcessor,
-            TFLogitsProcessor,
-            TFLogitsProcessorList,
-            TFLogitsWarper,
-            TFMinLengthLogitsProcessor,
-            TFNoBadWordsLogitsProcessor,
-            TFNoRepeatNGramLogitsProcessor,
-            TFRepetitionPenaltyLogitsProcessor,
-            TFSuppressTokensAtBeginLogitsProcessor,
-            TFSuppressTokensLogitsProcessor,
-            TFTemperatureLogitsWarper,
-            TFTopKLogitsWarper,
-            TFTopPLogitsWarper,
-        )
-        from .tf_utils import (
-            TFBeamSampleDecoderOnlyOutput,
-            TFBeamSampleEncoderDecoderOutput,
-            TFBeamSearchDecoderOnlyOutput,
-            TFBeamSearchEncoderDecoderOutput,
-            TFContrastiveSearchDecoderOnlyOutput,
-            TFContrastiveSearchEncoderDecoderOutput,
-            TFGenerationMixin,
-            TFGreedySearchDecoderOnlyOutput,
-            TFGreedySearchEncoderDecoderOutput,
-            TFSampleDecoderOnlyOutput,
-            TFSampleEncoderDecoderOutput,
-        )
-
-    try:
-        if not is_flax_available():
-            raise OptionalDependencyNotAvailable()
-    except OptionalDependencyNotAvailable:
-        pass
-    else:
-        from .flax_logits_process import (
-            FlaxForcedBOSTokenLogitsProcessor,
-            FlaxForcedEOSTokenLogitsProcessor,
-            FlaxForceTokensLogitsProcessor,
-            FlaxLogitsProcessor,
-            FlaxLogitsProcessorList,
-            FlaxLogitsWarper,
-            FlaxMinLengthLogitsProcessor,
-            FlaxNoRepeatNGramLogitsProcessor,
-            FlaxSuppressTokensAtBeginLogitsProcessor,
-            FlaxSuppressTokensLogitsProcessor,
-            FlaxTemperatureLogitsWarper,
-            FlaxTopKLogitsWarper,
-            FlaxTopPLogitsWarper,
-            FlaxWhisperTimeStampLogitsProcessor,
-        )
-        from .flax_utils import FlaxBeamSearchOutput, FlaxGenerationMixin, FlaxGreedySearchOutput, FlaxSampleOutput
 else:
    import sys

--- a/src/transformers/generation/flax_logits_process.py
+++ b/src/transformers/generation/flax_logits_process.py
@ -1,544 +0,0 @@
-# coding=utf-8
-# Copyright 2021 The HuggingFace Inc. team
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import inspect
-
-import jax
-import jax.lax as lax
-import jax.numpy as jnp
-from jax.experimental import sparse
-
-from ..utils import add_start_docstrings
-from ..utils.logging import get_logger
-
-
-logger = get_logger(__name__)
-
-
-LOGITS_PROCESSOR_INPUTS_DOCSTRING = r"""
-    Args:
-        input_ids (`jnp.ndarray` of shape `(batch_size, sequence_length)`):
-            Indices of input sequence tokens in the vocabulary.
-
-            Indices can be obtained using [`PreTrainedTokenizer`]. See [`PreTrainedTokenizer.encode`] and
-            [`PreTrainedTokenizer.__call__`] for details.
-
-            [What are input IDs?](../glossary#input-ids)
-        scores (`jnp.ndarray` of shape `(batch_size, config.vocab_size)`):
-            Prediction scores of a language modeling head. These can be logits for each vocabulary when not using beam
-            search or log softmax for each vocabulary token when using beam search
-        kwargs (`dict[str, Any]`, *optional*):
-            Additional logits processor specific kwargs.
-
-    Return:
-        `jnp.ndarray` of shape `(batch_size, config.vocab_size)`: The processed prediction scores.
-
-"""
-
-
-class FlaxLogitsProcessor:
-    """Abstract base class for all logit processors that can be applied during generation."""
-
-    @add_start_docstrings(LOGITS_PROCESSOR_INPUTS_DOCSTRING)
-    def __call__(self, input_ids: jnp.ndarray, scores: jnp.ndarray) -> jnp.ndarray:
-        """Flax method for processing logits."""
-        raise NotImplementedError(
-            f"{self.__class__} is an abstract class. Only classes inheriting this class can be called."
-        )
-
-
-class FlaxLogitsWarper:
-    """Abstract base class for all logit warpers that can be applied during generation with multinomial sampling."""
-
-    @add_start_docstrings(LOGITS_PROCESSOR_INPUTS_DOCSTRING)
-    def __call__(self, input_ids: jnp.ndarray, scores: jnp.ndarray) -> jnp.ndarray:
-        """Flax method for warping logits."""
-        raise NotImplementedError(
-            f"{self.__class__} is an abstract class. Only classes inheriting this class can be called."
-        )
-
-
-class FlaxLogitsProcessorList(list):
-    """
-    This class can be used to create a list of [`FlaxLogitsProcessor`] or [`FlaxLogitsWarper`] to subsequently process
-    a `scores` input tensor. This class inherits from list and adds a specific *__call__* method to apply each
-    [`FlaxLogitsProcessor`] or [`FlaxLogitsWarper`] to the inputs.
-    """
-
-    @add_start_docstrings(LOGITS_PROCESSOR_INPUTS_DOCSTRING)
-    def __call__(self, input_ids: jnp.ndarray, scores: jnp.ndarray, cur_len: int, **kwargs) -> jnp.ndarray:
-        for processor in self:
-            function_args = inspect.signature(processor.__call__).parameters
-            if len(function_args) > 3:
-                if not all(arg in kwargs for arg in list(function_args.keys())[2:]):
-                    raise ValueError(
-                        f"Make sure that all the required parameters: {list(function_args.keys())} for "
-                        f"{processor.__class__} are passed to the logits processor."
-                    )
-                scores = processor(input_ids, scores, cur_len, **kwargs)
-            else:
-                scores = processor(input_ids, scores, cur_len)
-        return scores
-
-
-class FlaxTemperatureLogitsWarper(FlaxLogitsWarper):
-    r"""
-    [`FlaxLogitsWarper`] for temperature (exponential scaling output probability distribution).
-
-    Args:
-        temperature (`float`):
-            The value used to module the logits distribution.
-    """
-
-    def __init__(self, temperature: float):
-        if not isinstance(temperature, float) or not (temperature > 0):
-            raise ValueError(f"`temperature` has to be a strictly positive float, but is {temperature}")
-
-        self.temperature = temperature
-
-    def __call__(self, input_ids: jnp.ndarray, scores: jnp.ndarray, cur_len: int) -> jnp.ndarray:
-        scores = scores / self.temperature
-        return scores
-
-
-class FlaxTopPLogitsWarper(FlaxLogitsWarper):
-    """
-    [`FlaxLogitsWarper`] that performs top-p, i.e. restricting to top tokens summing to prob_cut_off <= prob_cut_off.
-
-    Args:
-        top_p (`float`):
-            If set to < 1, only the smallest set of most probable tokens with probabilities that add up to `top_p` or
-            higher are kept for generation.
-        filter_value (`float`, *optional*, defaults to -inf):
-            All filtered values will be set to this float value.
-        min_tokens_to_keep (`int`, *optional*, defaults to 1):
-            Minimum number of tokens that cannot be filtered.
-    """
-
-    def __init__(self, top_p: float, filter_value: float = -float("Inf"), min_tokens_to_keep: int = 1):
-        if not isinstance(top_p, float) or (top_p < 0 or top_p > 1.0):
-            raise ValueError(f"`top_p` has to be a float > 0 and < 1, but is {top_p}")
-        if not isinstance(min_tokens_to_keep, int) or (min_tokens_to_keep < 1):
-            raise ValueError(f"`min_tokens_to_keep` has to be a positive integer, but is {min_tokens_to_keep}")
-
-        self.top_p = top_p
-        self.filter_value = filter_value
-        self.min_tokens_to_keep = min_tokens_to_keep
-
-    def __call__(self, input_ids: jnp.ndarray, scores: jnp.ndarray, cur_len: int) -> jnp.ndarray:
-        topk_scores, topk_indices = lax.top_k(scores, scores.shape[-1])
-
-        mask_scores = jnp.full_like(scores, self.filter_value)
-        cumulative_probs = jax.nn.softmax(topk_scores, axis=-1).cumsum(axis=-1)
-        score_mask = cumulative_probs < self.top_p
-
-        # include the token that is higher than top_p as well
-        score_mask = jnp.roll(score_mask, 1)
-        score_mask |= score_mask.at[:, 0].set(True)
-
-        # min tokens to keep
-        score_mask = score_mask.at[:, : self.min_tokens_to_keep].set(True)
-
-        topk_next_scores = jnp.where(score_mask, topk_scores, mask_scores)
-        next_scores = jax.lax.sort_key_val(topk_indices, topk_next_scores)[-1]
-
-        return next_scores
-
-
-class FlaxTopKLogitsWarper(FlaxLogitsWarper):
-    r"""
-    [`FlaxLogitsWarper`] that performs top-k, i.e. restricting to the k highest probability elements.
-
-    Args:
-        top_k (`int`):
-            The number of highest probability vocabulary tokens to keep for top-k-filtering.
-        filter_value (`float`, *optional*, defaults to -inf):
-            All filtered values will be set to this float value.
-        min_tokens_to_keep (`int`, *optional*, defaults to 1):
-            Minimum number of tokens that cannot be filtered.
-    """
-
-    def __init__(self, top_k: int, filter_value: float = -float("Inf"), min_tokens_to_keep: int = 1):
-        if not isinstance(top_k, int) or top_k <= 0:
-            raise ValueError(f"`top_k` has to be a strictly positive integer, but is {top_k}")
-
-        self.top_k = max(top_k, min_tokens_to_keep)
-        self.filter_value = filter_value
-
-    def __call__(self, input_ids: jnp.ndarray, scores: jnp.ndarray, cur_len: int) -> jnp.ndarray:
-        batch_size, vocab_size = scores.shape
-        next_scores_flat = jnp.full(batch_size * vocab_size, self.filter_value)
-
-        topk = min(self.top_k, scores.shape[-1])  # Safety check
-        topk_scores, topk_indices = lax.top_k(scores, topk)
-        shift = jnp.broadcast_to((jnp.arange(batch_size) * vocab_size)[:, None], (batch_size, topk)).flatten()
-        topk_scores_flat = topk_scores.flatten()
-        topk_indices_flat = topk_indices.flatten() + shift
-
-        next_scores_flat = next_scores_flat.at[topk_indices_flat].set(topk_scores_flat)
-        next_scores = next_scores_flat.reshape(batch_size, vocab_size)
-        return next_scores
-
-
-class FlaxForcedBOSTokenLogitsProcessor(FlaxLogitsProcessor):
-    r"""
-    [`FlaxLogitsProcessor`] that enforces the specified token as the first generated token.
-
-    Args:
-        bos_token_id (`int`):
-            The id of the token to force as the first generated token.
-    """
-
-    def __init__(self, bos_token_id: int):
-        self.bos_token_id = bos_token_id
-
-    def __call__(self, input_ids: jnp.ndarray, scores: jnp.ndarray, cur_len: int) -> jnp.ndarray:
-        new_scores = jnp.full(scores.shape, -float("inf"))
-
-        apply_penalty = 1 - jnp.bool_(cur_len - 1)
-
-        scores = jnp.where(apply_penalty, new_scores.at[:, self.bos_token_id].set(0), scores)
-
-        return scores
-
-
-class FlaxForcedEOSTokenLogitsProcessor(FlaxLogitsProcessor):
-    r"""
-    [`FlaxLogitsProcessor`] that enforces the specified token as the last generated token when `max_length` is reached.
-
-    Args:
-        max_length (`int`):
-            The maximum length of the sequence to be generated.
-        eos_token_id (`int`):
-            The id of the token to force as the last generated token when `max_length` is reached.
-    """
-
-    def __init__(self, max_length: int, eos_token_id: int):
-        self.max_length = max_length
-        self.eos_token_id = eos_token_id
-
-    def __call__(self, input_ids: jnp.ndarray, scores: jnp.ndarray, cur_len: int) -> jnp.ndarray:
-        new_scores = jnp.full(scores.shape, -float("inf"))
-
-        apply_penalty = 1 - jnp.bool_(cur_len - self.max_length + 1)
-
-        scores = jnp.where(apply_penalty, new_scores.at[:, self.eos_token_id].set(0), scores)
-
-        return scores
-
-
-class FlaxMinLengthLogitsProcessor(FlaxLogitsProcessor):
-    r"""
-    [`FlaxLogitsProcessor`] enforcing a min-length by setting EOS probability to 0.
-
-    Args:
-        min_length (`int`):
-            The minimum length below which the score of `eos_token_id` is set to `-float("Inf")`.
-        eos_token_id (`int`):
-            The id of the *end-of-sequence* token.
-    """
-
-    def __init__(self, min_length: int, eos_token_id: int):
-        if not isinstance(min_length, int) or min_length < 0:
-            raise ValueError(f"`min_length` has to be a positive integer, but is {min_length}")
-
-        if not isinstance(eos_token_id, int) or eos_token_id < 0:
-            raise ValueError(f"`eos_token_id` has to be a positive integer, but is {eos_token_id}")
-
-        self.min_length = min_length
-        self.eos_token_id = eos_token_id
-
-    def __call__(self, input_ids: jnp.ndarray, scores: jnp.ndarray, cur_len: int) -> jnp.ndarray:
-        # create boolean flag to decide if min length penalty should be applied
-        apply_penalty = 1 - jnp.clip(cur_len - self.min_length, 0, 1)
-
-        scores = jnp.where(apply_penalty, scores.at[:, self.eos_token_id].set(-float("inf")), scores)
-
-        return scores
-
-
-class FlaxSuppressTokensAtBeginLogitsProcessor(FlaxLogitsProcessor):
-    r"""
-    [`FlaxLogitsProcessor`] suppressing a list of tokens as soon as the `generate` function starts generating using
-    `begin_index` tokens. This should ensure that the tokens defined by `begin_suppress_tokens` are not sampled at the
-    beginning of the generation.
-
-    Args:
-        begin_suppress_tokens (`list[int]`):
-            Tokens to not sample.
-        begin_index (`int`):
-            Index where the tokens are suppressed.
-    """
-
-    def __init__(self, begin_suppress_tokens, begin_index):
-        self.begin_suppress_tokens = list(begin_suppress_tokens)
-        self.begin_index = begin_index
-
-    def __call__(self, input_ids, scores, cur_len: int):
-        apply_penalty = 1 - jnp.bool_(cur_len - self.begin_index)
-
-        scores = jnp.where(apply_penalty, scores.at[:, self.begin_suppress_tokens].set(-float("inf")), scores)
-
-        return scores
-
-
-class FlaxSuppressTokensLogitsProcessor(FlaxLogitsProcessor):
-    r"""
-    [`FlaxLogitsProcessor`] suppressing a list of tokens at each decoding step. The processor will set their log probs
-    to be `-inf` so they are not sampled.
-
-    Args:
-        suppress_tokens (`list`):
-            Tokens to not sample.
-    """
-
-    def __init__(self, suppress_tokens: list):
-        self.suppress_tokens = list(suppress_tokens)
-
-    def __call__(self, input_ids: jnp.ndarray, scores: jnp.ndarray, cur_len: int) -> jnp.ndarray:
-        scores = scores.at[..., self.suppress_tokens].set(-float("inf"))
-
-        return scores
-
-
-class FlaxForceTokensLogitsProcessor(FlaxLogitsProcessor):
-    r"""
-    [`FlaxLogitsProcessor`] that takes a list of pairs of integers which indicates a mapping from generation indices to
-    token indices that will be forced before sampling. The processor will set their log probs to 0 and all other tokens
-    to `-inf` so that they are sampled at their corresponding index.
-
-    Args:
-        force_token_map (`list`):
-            Map giving token ids and indices where they will be forced to be sampled.
-    """
-
-    def __init__(self, force_token_map):
-        force_token_map = dict(force_token_map)
-        # Converts the dictionary of format {index: token} containing the tokens to be forced to an array, where the
-        # index of the array corresponds to the index of the token to be forced, for XLA compatibility.
-        # Indexes without forced tokens will have a negative value.
-        force_token_array = jnp.ones((max(force_token_map.keys()) + 1), dtype=jnp.int32) * -1
-        for index, token in force_token_map.items():
-            if token is not None:
-                force_token_array = force_token_array.at[index].set(token)
-        self.force_token_array = jnp.int32(force_token_array)
-
-    def __call__(self, input_ids: jnp.ndarray, scores: jnp.ndarray, cur_len: int) -> jnp.ndarray:
-        def _force_token(generation_idx):
-            batch_size = scores.shape[0]
-            current_token = self.force_token_array[generation_idx]
-
-            new_scores = jnp.ones_like(scores, dtype=scores.dtype) * -float("inf")
-            updates = jnp.zeros((batch_size, 1), dtype=scores.dtype)
-            new_scores = lax.dynamic_update_slice(new_scores, updates, (0, current_token))
-            return new_scores
-
-        scores = lax.cond(
-            cur_len >= self.force_token_array.shape[0],
-            # If the current length is geq than the length of force_token_array, the processor does nothing.
-            lambda: scores,
-            # Otherwise, it may force a certain token.
-            lambda: lax.cond(
-                self.force_token_array[cur_len] >= 0,
-                # Only valid (positive) tokens are forced
-                lambda: _force_token(cur_len),
-                # Otherwise, the processor does nothing.
-                lambda: scores,
-            ),
-        )
-        return scores
-
-
-class FlaxWhisperTimeStampLogitsProcessor(FlaxLogitsProcessor):
-    r"""
-    Whisper specific Processor. This processor can be used to force a list of tokens. The processor will set their log
-    probs to `inf` so that they are sampled at their corresponding index.
-
-    Args:
-        generate_config (`GenerateConfig`):
-            The generate config used to generate the output. The following parameters are required:
-                eos_token_id (`int`, *optional*, defaults to 50257):
-                    The id of the *end-of-sequence* token.
-                no_timestamps_token_id (`int`, *optional*, defaults to 50363):
-                    The id of the `"<|notimestamps|>"` token.
-                max_initial_timestamp_index (`int`, *optional*, defaults to 1):
-                    Used to set the maximum value of the initial timestamp. This is used to prevent the model from
-                    predicting timestamps that are too far in the future.
-    """
-
-    def __init__(self, generate_config, model_config, decoder_input_length):
-        self.eos_token_id = generate_config.eos_token_id
-        self.no_timestamps_token_id = generate_config.no_timestamps_token_id
-        self.timestamp_begin = generate_config.no_timestamps_token_id + 1
-
-        self.begin_index = decoder_input_length + 1
-
-        if generate_config.is_multilingual:
-            # room for language token and task token
-            self.begin_index += 2
-        if hasattr(generate_config, "max_initial_timestamp_index"):
-            self.max_initial_timestamp_index = generate_config.max_initial_timestamp_index
-        else:
-            self.max_initial_timestamp_index = model_config.vocab_size
-        if self.max_initial_timestamp_index is None:
-            self.max_initial_timestamp_index = model_config.vocab_size
-
-    def __call__(self, input_ids, scores, cur_len):
-        # suppress <|notimestamps|> which is handled by without_timestamps
-        scores = scores.at[:, self.no_timestamps_token_id].set(-float("inf"))
-
-        def handle_pairs(input_ids_k, scores_k):
-            last_was_timestamp = jnp.where((cur_len - self.begin_index) >= 1, True, False)
-            last_was_timestamp = jnp.where(
-                input_ids_k[cur_len - 1] >= self.timestamp_begin,
-                True and last_was_timestamp,
-                False,
-            )
-
-            penultimate_was_timestamp = jnp.where((cur_len - self.begin_index) < 2, True, False)
-            penultimate_was_timestamp = jnp.where(
-                input_ids_k[cur_len - 2] >= self.timestamp_begin,
-                True,
-                penultimate_was_timestamp,
-            )
-
-            return jnp.where(
-                last_was_timestamp,
-                jnp.where(
-                    penultimate_was_timestamp > 0,
-                    scores_k.at[self.timestamp_begin :].set(-float("inf")),
-                    scores_k.at[: self.eos_token_id].set(-float("inf")),
-                ),
-                scores_k,
-            )
-
-        scores = jax.vmap(handle_pairs)(input_ids, scores)
-
-        apply_max_initial_timestamp = jnp.where(cur_len == self.begin_index, True, False)
-        apply_max_initial_timestamp = jnp.where(
-            self.max_initial_timestamp_index is not None,
-            True and apply_max_initial_timestamp,
-            False,
-        )
-
-        last_allowed = self.timestamp_begin + self.max_initial_timestamp_index
-
-        scores = jnp.where(
-            apply_max_initial_timestamp,
-            scores.at[:, last_allowed + 1 :].set(-float("inf")),
-            scores,
-        )
-
-        # if sum of probability over timestamps is above any other token, sample timestamp
-        logprobs = jax.nn.log_softmax(scores, axis=-1)
-
-        def handle_cumulative_probs(logprobs_k, scores_k):
-            timestamp_logprob = jax.nn.logsumexp(logprobs_k[self.timestamp_begin :], axis=-1)
-            max_text_token_logprob = jnp.max(logprobs_k[: self.timestamp_begin])
-            return jnp.where(
-                timestamp_logprob > max_text_token_logprob,
-                scores_k.at[: self.timestamp_begin].set(-float("inf")),
-                scores_k,
-            )
-
-        scores = jax.vmap(handle_cumulative_probs)(logprobs, scores)
-
-        return scores
-
-
-class FlaxNoRepeatNGramLogitsProcessor(FlaxLogitsProcessor):
-    r"""
-    [`FlaxLogitsProcessor`] that enforces no repetition of n-grams. See
-    [Fairseq](https://github.com/pytorch/fairseq/blob/a07cb6f40480928c9e0548b737aadd36ee66ac76/fairseq/sequence_generator.py#L345).
-
-    Args:
-        ngram_size (`int`):
-            All ngrams of size `ngram_size` can only occur once.
-    """
-
-    def __init__(self, ngram_size: int):
-        if not isinstance(ngram_size, int) or ngram_size <= 0:
-            raise ValueError(f"`ngram_size` has to be a strictly positive integer, but is {ngram_size}")
-        self.ngram_size = ngram_size
-
-    def get_previous_ngrams(self, input_ids: jnp.ndarray, vocab_size: int, cur_len: int):
-        """
-        get a matrix of size (batch_size,) + (vocab_size,)*n (for n-grams) that
-        represent the n-grams that occurred previously.
-        The BCOO representation allow to store only the few non-zero entries, instead of the full (huge) matrix
-        """
-        batch_size, seq_len = input_ids.shape
-        # number of n-grams in the whole sequence
-        seq_ngrams = seq_len - (self.ngram_size - 1)
-        # number of n-grams in the currently generated sequence
-        cur_ngrams = cur_len - (self.ngram_size - 1)
-
-        def body_fun(i, val):
-            b = i % batch_size
-            pos = i // batch_size
-            return val.at[i].set(
-                jnp.array(
-                    [
-                        b,
-                    ]
-                    + [jnp.array(input_ids)[b, pos + j] for j in range(self.ngram_size)]
-                )
-            )
-
-        shape = (batch_size * seq_ngrams, self.ngram_size + 1)
-        all_update_indices = jax.lax.fori_loop(
-            0, batch_size * cur_ngrams, body_fun, jnp.zeros(shape, dtype=input_ids.dtype)
-        )
-
-        # ignore the n-grams not yet generated
-        data = (jnp.arange(batch_size * seq_ngrams) < batch_size * cur_ngrams).astype("float32")
-
-        return sparse.BCOO((data, all_update_indices), shape=(batch_size,) + (vocab_size,) * self.ngram_size)
-
-    def get_banned_tokens_mask(self, latest_tokens: jnp.ndarray, previous_ngrams) -> jnp.ndarray:
-        """
-        Determines which tokens must be banned given latest tokens and the previously seen
-        ngrams.
-        """
-
-        @sparse.sparsify
-        @jax.vmap
-        def inner_fn(latest_tokens, previous_ngrams):
-            return previous_ngrams[tuple(latest_tokens)]
-
-        return sparse.bcoo_todense(inner_fn(latest_tokens, previous_ngrams))
-
-    def __call__(self, input_ids: jnp.ndarray, scores: jnp.ndarray, cur_len: int) -> jnp.ndarray:
-        def true_fn():
-            _, vocab_size = scores.shape
-            # store the previously seen n-grams
-            previous_ngrams = self.get_previous_ngrams(input_ids, vocab_size, cur_len)
-
-            # get the n-1 last tokens that prefix the n-gram being generated
-            latest_tokens = jnp.zeros((input_ids.shape[0], self.ngram_size - 1), dtype=input_ids.dtype)
-            latest_tokens = jax.lax.dynamic_update_slice(
-                latest_tokens,
-                jax.lax.dynamic_slice(
-                    input_ids, (0, cur_len - (self.ngram_size - 1)), (input_ids.shape[0], (self.ngram_size - 1))
-                ),
-                (0, 0),
-            )
-
-            # compute the banned tokens, ie all the tokens that when added to the latest tokens lead to a n-gram that was previously generated
-            banned_tokens_indices_mask = self.get_banned_tokens_mask(latest_tokens, previous_ngrams).astype("bool")
-            return jnp.where(banned_tokens_indices_mask, -float("inf"), scores)
-
-        output = jax.lax.cond((cur_len >= self.ngram_size - 1), true_fn, lambda: scores)
-        return output
--- a/src/transformers/generation/flax_utils.py
+++ b/src/transformers/generation/flax_utils.py
--- a/src/transformers/generation/tf_logits_process.py
+++ b/src/transformers/generation/tf_logits_process.py
@ -1,600 +0,0 @@
-# coding=utf-8
-# Copyright 2022 The HuggingFace Inc. team
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import inspect
-
-import numpy as np
-import tensorflow as tf
-
-from ..tf_utils import stable_softmax
-from ..utils import add_start_docstrings
-from ..utils.logging import get_logger
-
-
-logger = get_logger(__name__)
-
-
-TF_LOGITS_PROCESSOR_INPUTS_DOCSTRING = r"""
-    Args:
-        input_ids (`tf.Tensor` of shape `(batch_size, sequence_length)`):
-            Indices of input sequence tokens in the vocabulary.
-
-            Indices can be obtained using [`PreTrainedTokenizer`]. See [`PreTrainedTokenizer.encode`] and
-            [`PreTrainedTokenizer.__call__`] for details.
-
-            [What are input IDs?](../glossary#input-ids)
-        scores (`tf.Tensor` of shape `(batch_size, config.vocab_size)`):
-            Prediction scores of a language modeling head. These can be logits for each vocabulary when not using beam
-            search or log softmax for each vocabulary token when using beam search.
-        cur_len (`int`):
-            The current length of valid input sequence tokens. In the TF implementation, the input_ids' sequence length
-            is the maximum length generate can produce, and we need to know which of its tokens are valid.
-        kwargs (`dict[str, Any]`, *optional*):
-            Additional logits processor specific kwargs.
-
-    Return:
-        `tf.Tensor` of shape `(batch_size, config.vocab_size)`: The processed prediction scores.
-"""
-
-
-class TFLogitsProcessor:
-    """Abstract base class for all logit processors that can be applied during generation."""
-
-    @add_start_docstrings(TF_LOGITS_PROCESSOR_INPUTS_DOCSTRING)
-    def __call__(self, input_ids: tf.Tensor, scores: tf.Tensor, cur_len: int) -> tf.Tensor:
-        """TF method for processing logits."""
-        raise NotImplementedError(
-            f"{self.__class__} is an abstract class. Only classes inheriting this class can be called."
-        )
-
-
-class TFLogitsWarper:
-    """Abstract base class for all logit warpers that can be applied during generation with multinomial sampling."""
-
-    @add_start_docstrings(TF_LOGITS_PROCESSOR_INPUTS_DOCSTRING)
-    def __call__(self, input_ids: tf.Tensor, scores: tf.Tensor, cur_len: int) -> tf.Tensor:
-        """TF method for warping logits."""
-        raise NotImplementedError(
-            f"{self.__class__} is an abstract class. Only classes inheriting this class can be called."
-        )
-
-
-class TFLogitsProcessorList(list):
-    """
-    This class can be used to create a list of [`TFLogitsProcessor`] to subsequently process a `scores` input tensor.
-    This class inherits from list and adds a specific *__call__* method to apply each [`TFLogitsProcessor`] to the
-    inputs.
-    """
-
-    @add_start_docstrings(TF_LOGITS_PROCESSOR_INPUTS_DOCSTRING)
-    def __call__(self, input_ids: tf.Tensor, scores: tf.Tensor, cur_len: int, **kwargs) -> tf.Tensor:
-        for processor in self:
-            function_args = inspect.signature(processor.__call__).parameters
-            if len(function_args) > 3:
-                if not all(arg in kwargs for arg in list(function_args.keys())[2:]):
-                    raise ValueError(
-                        f"Make sure that all the required parameters: {list(function_args.keys())} for "
-                        f"{processor.__class__} are passed to the logits processor."
-                    )
-                scores = processor(input_ids, scores, cur_len, **kwargs)
-            else:
-                scores = processor(input_ids, scores, cur_len)
-        return scores
-
-
-class TFTemperatureLogitsWarper(TFLogitsWarper):
-    r"""
-    [`TFLogitsWarper`] for temperature (exponential scaling output probability distribution).
-
-    Args:
-        temperature (`float`):
-            The value used to module the logits distribution.
-    """
-
-    def __init__(self, temperature: float):
-        if not isinstance(temperature, float) or not (temperature > 0):
-            raise ValueError(f"`temperature` has to be a strictly positive float, but is {temperature}")
-
-        self.temperature = temperature
-
-    def __call__(self, input_ids: tf.Tensor, scores: tf.Tensor, cur_len: int) -> tf.Tensor:
-        scores = scores / self.temperature
-        return scores
-
-
-class TFTopKLogitsWarper(TFLogitsWarper):
-    r"""
-    [`TFLogitsWarper`] that performs top-k, i.e. restricting to the k highest probability elements.
-
-    Args:
-        top_k (`int`):
-            The number of highest probability vocabulary tokens to keep for top-k-filtering.
-        filter_value (`float`, *optional*, defaults to -inf):
-            All filtered values will be set to this float value.
-        min_tokens_to_keep (`int`, *optional*, defaults to 1):
-            Minimum number of tokens that cannot be filtered.
-    """
-
-    def __init__(self, top_k: int, filter_value: float = -float("Inf"), min_tokens_to_keep: int = 1):
-        if not isinstance(top_k, int) or top_k <= 0:
-            raise ValueError(f"`top_k` has to be a strictly positive integer, but is {top_k}")
-
-        self.top_k = max(top_k, min_tokens_to_keep)
-        self.filter_value = filter_value
-
-    def __call__(self, input_ids: tf.Tensor, scores: tf.Tensor, cur_len: int) -> tf.Tensor:
-        top_k = min(self.top_k, scores.shape[-1])  # Safety check
-        # Boolean mask containing all tokens with a probability less than the last token of the top-k
-        indices_to_remove = scores < tf.math.top_k(scores, k=top_k)[0][..., -1:]
-        next_scores = tf.where(indices_to_remove, self.filter_value, scores)
-        return next_scores
-
-
-class TFTopPLogitsWarper(TFLogitsWarper):
-    """
-    [`TFLogitsWarper`] that performs top-p, i.e. restricting to top tokens summing to <= prob_cut_off.
-
-    Args:
-        top_p (`float`):
-            If set to < 1, only the smallest set of most probable tokens with probabilities that add up to `top_p` or
-            higher are kept for generation.
-        filter_value (`float`, *optional*, defaults to -inf):
-            All filtered values will be set to this float value.
-        min_tokens_to_keep (`int`, *optional*, defaults to 1):
-            Minimum number of tokens that cannot be filtered.
-    """
-
-    def __init__(self, top_p: float, filter_value: float = -float("Inf"), min_tokens_to_keep: int = 1):
-        if not isinstance(top_p, float) or (top_p < 0 or top_p > 1.0):
-            raise ValueError(f"`top_p` has to be a float > 0 and < 1, but is {top_p}")
-        if not isinstance(min_tokens_to_keep, int) or (min_tokens_to_keep < 1):
-            raise ValueError(f"`min_tokens_to_keep` has to be a positive integer, but is {min_tokens_to_keep}")
-
-        self.top_p = top_p
-        self.filter_value = filter_value
-        self.min_tokens_to_keep = min_tokens_to_keep
-
-    def __call__(self, input_ids: tf.Tensor, scores: tf.Tensor, cur_len: int) -> tf.Tensor:
-        topk_scores, topk_indices = tf.math.top_k(scores, scores.shape[-1])
-
-        mask_scores = tf.fill(scores.shape, self.filter_value)
-        cumulative_probs = tf.math.cumsum(stable_softmax(topk_scores, axis=-1), axis=-1)
-        score_mask = cumulative_probs < self.top_p
-
-        # Also include the token that is higher than top_p (the first false = shift and insert a True on the left)
-        score_mask = tf.concat((tf.ones([score_mask.shape[0], 1], dtype=tf.bool), score_mask[:, :-1]), axis=-1)
-
-        # Ensure min tokens to keep
-        score_mask = tf.concat(
-            (
-                tf.ones([score_mask.shape[0], self.min_tokens_to_keep], dtype=tf.bool),
-                score_mask[:, self.min_tokens_to_keep :],
-            ),
-            axis=-1,
-        )
-
-        # Mask the values that do not fit the criteria
-        topk_next_scores = tf.where(score_mask, topk_scores, mask_scores)
-
-        # Undo the topk sorting: converts the 2D matrix of per-row original indices of shape (batch_size, vocab_size)
-        # to a 3D tensor of shape (batch_size, vocab_size, 2) containing the original score coordinate, from which we
-        # can scatter (i.e. `scatter_indices[row, col, :]` is a tensor containing `[row, topk_indices[row, col]]`)
-        scatter_rows = tf.tile(tf.expand_dims(tf.range(topk_indices.shape[0]), axis=-1), [1, topk_indices.shape[-1]])
-        scatter_indices = tf.stack((scatter_rows, topk_indices), axis=-1)
-        next_scores = tf.scatter_nd(scatter_indices, topk_next_scores, shape=topk_next_scores.shape)
-
-        return next_scores
-
-
-class TFMinLengthLogitsProcessor(TFLogitsProcessor):
-    r"""
-    [`TFLogitsProcessor`] enforcing a min-length by setting EOS probability to 0.
-
-    Args:
-        min_length (`int`):
-            The minimum length below which the score of `eos_token_id` is set to `-float("Inf")`.
-        eos_token_id (`int`):
-            The id of the *end-of-sequence* token.
-    """
-
-    def __init__(self, min_length: int, eos_token_id: int):
-        if not isinstance(min_length, int) or min_length < 0:
-            raise ValueError(f"`min_length` has to be a positive integer, but is {min_length}")
-
-        if not isinstance(eos_token_id, int) or eos_token_id < 0:
-            raise ValueError(f"`eos_token_id` has to be a positive integer, but is {eos_token_id}")
-
-        self.min_length = min_length
-        self.eos_token_id = eos_token_id
-
-    def _apply_eos_token_mask(self, scores: tf.Tensor) -> tf.Tensor:
-        eos_token_id_mask = tf.range(scores.shape[-1]) == self.eos_token_id
-        scores = tf.where(eos_token_id_mask, float("-inf"), scores)
-        return scores
-
-    def __call__(self, input_ids: tf.Tensor, scores: tf.Tensor, cur_len: int) -> tf.Tensor:
-        # applies eos token masking if the first argument is true
-        scores = tf.cond(
-            tf.less(cur_len, self.min_length),
-            lambda: self._apply_eos_token_mask(scores),
-            lambda: tf.identity(scores),
-        )
-        return scores
-
-
-class TFRepetitionPenaltyLogitsProcessor(TFLogitsProcessor):
-    r"""
-    [`TFLogitsProcessor`] enforcing an exponential penalty on repeated sequences.
-
-    Args:
-        repetition_penalty (`float`):
-            The parameter for repetition penalty. 1.0 means no penalty. See [this
-            paper](https://huggingface.co/papers/1909.05858) for more details.
-    """
-
-    def __init__(self, penalty: float):
-        if not isinstance(penalty, float) or not (penalty > 0):
-            raise ValueError(f"`penalty` has to be a strictly positive float, but is {penalty}")
-
-        self.penalty = penalty
-
-    def _create_score_penalties(self, input_ids: tf.Tensor, logits: tf.Tensor) -> tf.Tensor:
-        # We want to populate the penalties in the positions of `input_ids`. Since XLA can't handle shapes unknown
-        # before runtime, `tf.unique` can't be used. Therefore, we may have redundant updates, when a given row has
-        # the same token multiple times.
-
-        # Gathers the penalties to apply
-        logit_penalties = tf.gather(logits, input_ids, axis=1, batch_dims=1)
-        logit_penalties = tf.where(logit_penalties > 0, 1 / self.penalty, logit_penalties)
-        logit_penalties = tf.where(logit_penalties < 0, self.penalty, logit_penalties)
-
-        # Scatters the penalties
-        token_penalties = tf.ones(logits.shape)
-        batch_size = input_ids.shape[0]
-        seq_len = tf.shape(input_ids)[1]  # the sequence length has dynamic size, hence the dynamic shape
-        indexable_prev_input_ids = tf.concat(
-            (
-                tf.expand_dims(tf.repeat(tf.range(batch_size), seq_len), axis=-1),
-                tf.expand_dims(tf.reshape(input_ids, [-1]), axis=-1),
-            ),
-            axis=1,
-        )
-        token_penalties = tf.tensor_scatter_nd_update(
-            token_penalties, indices=indexable_prev_input_ids, updates=tf.reshape(logit_penalties, [-1])
-        )
-        return token_penalties
-
-    def __call__(self, input_ids: tf.Tensor, scores: tf.Tensor, cur_len: int) -> tf.Tensor:
-        score_penalties = self._create_score_penalties(input_ids[:, :cur_len], scores)
-
-        scores = tf.math.multiply(scores, score_penalties)
-
-        return scores
-
-
-class TFNoBadWordsLogitsProcessor(TFLogitsProcessor):
-    """
-    [`TFLogitsProcessor`] that enforces that specified sequences will never be sampled.
-
-    Args:
-        bad_words_ids (`list[list[int]]`):
-            List of list of token ids that are not allowed to be generated. In order to get the tokens of the words
-            that should not appear in the generated text, make sure to set `add_prefix_space=True` when initializing
-            the tokenizer, and use `tokenizer(bad_words, add_special_tokens=False).input_ids`. The `add_prefix_space`
-            argument is only supported for some slow tokenizers, as fast tokenizers' prefixing behaviours come from
-            `pre tokenizers`. Read more [here](https://huggingface.co/docs/tokenizers/api/pre-tokenizers).
-        eos_token_id (`int`):
-            The id of the *end-of-sequence* token.
-    """
-
-    def __init__(self, bad_words_ids: list[list[int]], eos_token_id: int):
-        if not isinstance(bad_words_ids, list) or len(bad_words_ids) == 0:
-            raise ValueError(f"`bad_words_ids` has to be a non-empty list, but is {bad_words_ids}.")
-        if any(not isinstance(bad_word_ids, list) for bad_word_ids in bad_words_ids):
-            raise ValueError(f"`bad_words_ids` has to be a list of lists, but is {bad_words_ids}.")
-        if any(
-            any((not isinstance(token_id, (int, np.integer)) or token_id < 0) for token_id in bad_word_ids)
-            for bad_word_ids in bad_words_ids
-        ):
-            raise ValueError(
-                f"Each list in `bad_words_ids` has to be a list of positive integers, but is {bad_words_ids}."
-            )
-
-        # stores the information about bad words in three tensors:
-        # 1. a rectangular tensor with the forbidden sequences (padded with `-1`), for full data comparisons
-        self.bad_word_seqs_ids = tf.ragged.constant(bad_words_ids).to_tensor(default_value=-1)
-        # 2. a tensor with the unpadded length of each forbidden sequence, for quick length comparisons
-        bad_word_seqs_len = [len(bad_words) for bad_words in bad_words_ids]
-        if any(word_len == 0 for word_len in bad_word_seqs_len):
-            raise ValueError(f"Banned words token sequences {bad_words_ids} cannot have an empty list")
-        self.bad_word_seqs_len = tf.convert_to_tensor(bad_word_seqs_len, dtype=tf.int32)
-        # 3. a tensor containing the last token for each sequence, for easy access to the tokens that may be banned
-        self.seq_forbidden_tokens = tf.convert_to_tensor([bad_words[-1] for bad_words in bad_words_ids])
-
-    def _calc_row_banned_bad_tokens(self, row_input_ids: tf.Tensor) -> tf.Tensor:
-        def _tokens_match(bad_word_seq_number):
-            def _len_one():
-                # If the bad sequence only has one token, always mask it
-                return tf.cond(
-                    tf.math.equal(self.bad_word_seqs_len[bad_word_seq_number], 1),
-                    lambda: tf.ones((), dtype=tf.bool),
-                    _len_greater_than_cur_len,
-                )
-
-            def _len_greater_than_cur_len():
-                # Otherwise, if the bad sequence is longer than the current length they can't ever match
-                return tf.cond(
-                    tf.math.greater(self.bad_word_seqs_len[bad_word_seq_number], tf.shape(row_input_ids)[0]),
-                    lambda: tf.zeros((), dtype=tf.bool),
-                    _match_found,
-                )
-
-            def _match_found():
-                # Finally, runs the actual comparison. Can only be called if the previous comparisons do not yield
-                # an answer (otherwise we get indexing exceptions)
-                compare_len = self.bad_word_seqs_len[bad_word_seq_number] - 1
-                return tf.cond(
-                    tf.math.reduce_all(
-                        tf.math.equal(
-                            row_input_ids[-compare_len:], self.bad_word_seqs_ids[bad_word_seq_number, :compare_len]
-                        )
-                    ),
-                    lambda: tf.ones((), dtype=tf.bool),
-                    lambda: tf.zeros((), dtype=tf.bool),
-                )
-
-            match = _len_one()
-            return match
-
-        # Compares the current row against all bad word sequences, obtaining a mask with the matches.
-        match_mask = tf.map_fn(_tokens_match, tf.range(self.bad_word_seqs_ids.shape[0]), fn_output_signature=tf.bool)
-        row_banned_tokens = self.seq_forbidden_tokens[match_mask]
-        return row_banned_tokens
-
-    def __call__(self, input_ids: tf.Tensor, scores: tf.Tensor, cur_len: int) -> tf.Tensor:
-        # We want to mask some banned tokens, at a score level. Since the banned tokens depend on the previous
-        # `input_ids`, they may have a different length for each row, and they may even be empty for some rows.
-        # To remain simple and XLA-compatible, we work on a per-row fashion.
-        # TODO (Joao): this function might trigger XLA retracing as `cur_len` increases. Fix it if it becomes
-        # a frequent choke point. (make `cur_len` a tensor?)
-        def _get_row_updated_score(row_inputs: tuple[tf.Tensor]) -> tf.Tensor:
-            row_input_ids, row_score = row_inputs
-            banned_tokens = self._calc_row_banned_bad_tokens(row_input_ids[:cur_len])
-            banned_tokens_mask = tf.scatter_nd(
-                indices=tf.expand_dims(banned_tokens, axis=-1),
-                updates=tf.ones_like(banned_tokens, dtype=tf.bool),
-                shape=row_score.shape,
-            )
-            row_score = tf.where(banned_tokens_mask, -float("inf"), row_score)
-            return row_score
-
-        scores = tf.map_fn(_get_row_updated_score, (input_ids, scores), fn_output_signature=tf.float32)
-        return scores
-
-
-class TFNoRepeatNGramLogitsProcessor(TFLogitsProcessor):
-    r"""
-    [`TFLogitsProcessor`] that enforces no repetition of n-grams. See
-    [Fairseq](https://github.com/pytorch/fairseq/blob/a07cb6f40480928c9e0548b737aadd36ee66ac76/fairseq/sequence_generator.py#L345).
-
-    Args:
-        ngram_size (`int`):
-            All ngrams of size `ngram_size` can only occur once.
-    """
-
-    def __init__(self, ngram_size: int):
-        if not isinstance(ngram_size, int) or ngram_size <= 0:
-            raise ValueError(f"`ngram_size` has to be a strictly positive integer, but is {ngram_size}")
-        self.ngram_size = ngram_size
-
-    def calc_banned_ngram_tokens(self, input_ids, num_hypos, cur_len):
-        # Copied from fairseq for no_repeat_ngram in beam_search
-        if cur_len + 1 < self.ngram_size:
-            # return no banned tokens if we haven't generated ngram_size tokens yet
-            return [[] for _ in range(num_hypos)]
-        generated_ngrams = [{} for _ in range(num_hypos)]
-        prev_input_ids = input_ids[:, :cur_len]
-        for idx in range(num_hypos):
-            gen_tokens = prev_input_ids[idx].numpy().tolist()
-            generated_ngram = generated_ngrams[idx]
-            for ngram in zip(*[gen_tokens[i:] for i in range(self.ngram_size)]):
-                prev_ngram_tuple = tuple(ngram[:-1])
-                generated_ngram[prev_ngram_tuple] = generated_ngram.get(prev_ngram_tuple, []) + [ngram[-1]]
-
-        def _get_generated_ngrams(hypo_idx):
-            # Before decoding the next token, prevent decoding of ngrams that have already appeared
-            start_idx = cur_len + 1 - self.ngram_size
-            ngram_idx = tuple(prev_input_ids[hypo_idx, start_idx:cur_len].numpy().tolist())
-            return generated_ngrams[hypo_idx].get(ngram_idx, [])
-
-        banned_tokens = [_get_generated_ngrams(hypo_idx) for hypo_idx in range(num_hypos)]
-
-        return banned_tokens
-
-    def __call__(self, input_ids: tf.Tensor, scores: tf.Tensor, cur_len: int) -> tf.Tensor:
-        # TODO (joao): enable XLA on this logits processor. See discussion and attempts in
-        # https://github.com/huggingface/transformers/pull/16974
-        if not tf.executing_eagerly():
-            raise NotImplementedError("TFNoRepeatNGramLogitsProcessor is only implemented for eager execution.")
-
-        batch_size, vocab_size = scores.shape
-        banned_tokens = self.calc_banned_ngram_tokens(input_ids, batch_size, cur_len)
-
-        # create banned_tokens boolean mask
-        banned_tokens_indices_mask = []
-        for banned_tokens_slice in banned_tokens:
-            banned_tokens_indices_mask.append([token in banned_tokens_slice for token in range(vocab_size)])
-
-        scores = tf.where(tf.convert_to_tensor(banned_tokens_indices_mask, dtype=tf.bool), -float("inf"), scores)
-
-        return scores
-
-
-class TFForcedBOSTokenLogitsProcessor(TFLogitsProcessor):
-    r"""
-    [`TFLogitsProcessor`] that enforces the specified token as the first generated token.
-
-    Args:
-        bos_token_id (`int`):
-            The id of the token to force as the first generated token.
-    """
-
-    def __init__(self, bos_token_id: int):
-        if bos_token_id < 0:
-            raise ValueError(f"The forced bos token id  must be a non-negative integer, got {bos_token_id}")
-        self.bos_token_id = bos_token_id
-
-    def __call__(self, input_ids: tf.Tensor, scores: tf.Tensor, cur_len: int) -> tf.Tensor:
-        if cur_len == 1:
-            batch_size, num_tokens = scores.shape
-            # sets the score to 0 in the bos_token_id column
-            scores = tf.zeros((batch_size, 1))
-            # sets the score to -inf everywhere else
-            if self.bos_token_id > 0:
-                scores = tf.concat((tf.broadcast_to(-float("inf"), (batch_size, self.bos_token_id)), scores), axis=-1)
-            if self.bos_token_id < (num_tokens - 1):
-                scores = tf.concat(
-                    (scores, tf.broadcast_to(-float("inf"), (batch_size, (num_tokens - 1) - self.bos_token_id))),
-                    axis=-1,
-                )
-        return scores
-
-
-class TFForcedEOSTokenLogitsProcessor(TFLogitsProcessor):
-    r"""
-    [`TFLogitsProcessor`] that enforces the specified token as the last generated token when `max_length` is reached.
-
-    Args:
-        max_length (`int`):
-            The maximum length of the sequence to be generated.
-        eos_token_id (`int`):
-            The id of the token to force as the last generated token when `max_length` is reached.
-    """
-
-    def __init__(self, max_length: int, eos_token_id: int):
-        self.max_length = max_length
-        if eos_token_id < 0:
-            raise ValueError(f"The forced eos token id must be a non-negative integer, got {eos_token_id}")
-        self.eos_token_id = eos_token_id
-
-    def __call__(self, input_ids: tf.Tensor, scores: tf.Tensor, cur_len: int) -> tf.Tensor:
-        if cur_len == self.max_length - 1:
-            batch_size, num_tokens = scores.shape
-            # sets the score to 0 in the eos_token_id column
-            scores = tf.zeros((batch_size, 1))
-            # sets the score to -inf everywhere else
-            if self.eos_token_id > 0:
-                scores = tf.concat((tf.broadcast_to(-float("inf"), (batch_size, self.eos_token_id)), scores), axis=-1)
-            if self.eos_token_id < (num_tokens - 1):
-                scores = tf.concat(
-                    (scores, tf.broadcast_to(-float("inf"), (batch_size, (num_tokens - 1) - self.eos_token_id))),
-                    axis=-1,
-                )
-        return scores
-
-
-class TFSuppressTokensAtBeginLogitsProcessor(TFLogitsProcessor):
-    r"""
-    [`TFSuppressTokensAtBeginLogitsProcessor`] suppresses a list of tokens as soon as the `generate` function starts
-    generating using `begin_index` tokens. This should ensure that the tokens defined by `begin_suppress_tokens` at not
-    sampled at the beginning of the generation.
-    """
-
-    def __init__(self, begin_suppress_tokens, begin_index):
-        self.begin_suppress_tokens = list(begin_suppress_tokens)
-        self.begin_index = begin_index
-
-    def __call__(self, input_ids: tf.Tensor, scores: tf.Tensor, cur_len: int) -> tf.Tensor:
-        suppressed_indices = []
-        for token in self.begin_suppress_tokens:
-            if token < scores.shape[-1]:  # to ensure we don't go beyond the vocab size
-                suppressed_indices.extend([[i, token] for i in range(scores.shape[0])])
-
-        if len(suppressed_indices) > 0:
-            scores = tf.cond(
-                tf.equal(cur_len, self.begin_index),
-                lambda: tf.tensor_scatter_nd_update(
-                    scores,
-                    indices=suppressed_indices,
-                    updates=[-float("inf") for _ in range(scores.shape[0] * len(self.begin_suppress_tokens))],
-                ),
-                lambda: scores,
-            )
-        return scores
-
-
-class TFSuppressTokensLogitsProcessor(TFLogitsProcessor):
-    r"""This processor can be used to suppress a list of tokens. The processor will set their log probs to `-inf` so that they
-    are not sampled."""
-
-    def __init__(self, suppress_tokens):
-        self.suppress_tokens = list(suppress_tokens)
-
-    def __call__(self, input_ids: tf.Tensor, scores: tf.Tensor, cur_len: int) -> tf.Tensor:
-        suppressed_indices = []
-        for token in self.suppress_tokens:
-            if token < scores.shape[-1]:  # to ensure we don't go beyond the vocab size
-                suppressed_indices.extend([[i, token] for i in range(scores.shape[0])])
-
-        if len(suppressed_indices) > 0:
-            scores = tf.tensor_scatter_nd_update(
-                scores,
-                indices=[[i, token] for i in range(scores.shape[0]) for token in self.suppress_tokens],
-                updates=[-float("inf") for _ in range(scores.shape[0] * len(self.suppress_tokens))],
-            )
-        return scores
-
-
-class TFForceTokensLogitsProcessor(TFLogitsProcessor):
-    r"""This processor takes a list of pairs of integers which indicates a mapping from generation indices to token
-    indices that will be forced before sampling. The processor will set their log probs to `0` and all other tokens to
-    `-inf` so that they are sampled at their corresponding index."""
-
-    def __init__(self, force_token_map: list[list[int]]):
-        force_token_map = dict(force_token_map)
-        # Converts the dictionary of format {index: token} containing the tokens to be forced to an array, where the
-        # index of the array corresponds to the index of the token to be forced, for XLA compatibility.
-        # Indexes without forced tokens will have an negative value.
-        force_token_array = np.ones((max(force_token_map.keys()) + 1), dtype=np.int32) * -1
-        for index, token in force_token_map.items():
-            if token is not None:
-                force_token_array[index] = token
-        self.force_token_array = tf.convert_to_tensor(force_token_array, dtype=tf.int32)
-
-    def __call__(self, input_ids: tf.Tensor, scores: tf.Tensor, cur_len: int) -> tf.Tensor:
-        def _force_token(generation_idx):
-            batch_size = scores.shape[0]
-            current_token = self.force_token_array[generation_idx]
-
-            new_scores = tf.zeros_like(scores, dtype=scores.dtype) + tf.constant([scores.dtype.min])
-            indices = tf.stack((tf.range(batch_size), tf.tile([current_token], [batch_size])), axis=1)
-            updates = tf.zeros((batch_size,), dtype=scores.dtype)
-            new_scores = tf.tensor_scatter_nd_update(new_scores, indices, updates)
-            return new_scores
-
-        scores = tf.cond(
-            tf.greater_equal(cur_len, tf.shape(self.force_token_array)[0]),
-            # If the current length is geq than the length of force_token_array, the processor does nothing.
-            lambda: tf.identity(scores),
-            # Otherwise, it may force a certain token.
-            lambda: tf.cond(
-                tf.greater_equal(self.force_token_array[cur_len], 0),
-                # Only valid (positive) tokens are forced
-                lambda: _force_token(cur_len),
-                # Otherwise, the processor does nothing.
-                lambda: scores,
-            ),
-        )
-        return scores
--- a/src/transformers/generation/tf_utils.py
+++ b/src/transformers/generation/tf_utils.py
--- a/src/transformers/image_processing_base.py
+++ b/src/transformers/image_processing_base.py
@ -55,7 +55,7 @@ class BatchFeature(BaseBatchFeature):
        data (`dict`):
            Dictionary of lists/arrays/tensors returned by the __call__ method ('pixel_values', etc.).
        tensor_type (`Union[None, str, TensorType]`, *optional*):
-            You can give a tensor_type here to convert the lists of integers in PyTorch/TensorFlow/Numpy Tensors at
+            You can give a tensor_type here to convert the lists of integers in PyTorch/Numpy Tensors at
            initialization.
    """

--- a/src/transformers/image_transforms.py
+++ b/src/transformers/image_transforms.py
@ -26,10 +26,8 @@ from .image_utils import (
    get_image_size,
    infer_channel_dimension_format,
 )
-from .utils import ExplicitEnum, TensorType, is_jax_tensor, is_tf_tensor, is_torch_tensor
+from .utils import ExplicitEnum, TensorType, is_torch_tensor
 from .utils.import_utils import (
-    is_flax_available,
-    is_tf_available,
    is_torch_available,
    is_vision_available,
    requires_backends,
@ -44,12 +42,6 @@ if is_vision_available():
 if is_torch_available():
    import torch

-if is_tf_available():
-    import tensorflow as tf
-
-if is_flax_available():
-    import jax.numpy as jnp
-

 def to_channel_dimension_format(
    image: np.ndarray,
@ -160,7 +152,7 @@ def _rescale_for_pil_conversion(image):


 def to_pil_image(
-    image: Union[np.ndarray, "PIL.Image.Image", "torch.Tensor", "tf.Tensor", "jnp.ndarray"],
+    image: Union[np.ndarray, "PIL.Image.Image", "torch.Tensor"],
    do_rescale: Optional[bool] = None,
    image_mode: Optional[str] = None,
    input_data_format: Optional[Union[str, ChannelDimension]] = None,
@ -170,7 +162,7 @@ def to_pil_image(
    needed.

    Args:
-        image (`PIL.Image.Image` or `numpy.ndarray` or `torch.Tensor` or `tf.Tensor`):
+        image (`PIL.Image.Image` or `numpy.ndarray` or `torch.Tensor`):
            The image to convert to the `PIL.Image` format.
        do_rescale (`bool`, *optional*):
            Whether or not to apply the scaling factor (to make pixel values integers between 0 and 255). Will default
@ -190,10 +182,8 @@ def to_pil_image(
        return image

    # Convert all tensors to numpy arrays before converting to PIL image
-    if is_torch_tensor(image) or is_tf_tensor(image):
+    if is_torch_tensor(image):
        image = image.numpy()
-    elif is_jax_tensor(image):
-        image = np.array(image)
    elif not isinstance(image, np.ndarray):
        raise ValueError(f"Input image type not supported: {type(image)}")

@ -556,16 +546,6 @@ def _center_to_corners_format_numpy(bboxes_center: np.ndarray) -> np.ndarray:
    return bboxes_corners


-def _center_to_corners_format_tf(bboxes_center: "tf.Tensor") -> "tf.Tensor":
-    center_x, center_y, width, height = tf.unstack(bboxes_center, axis=-1)
-    bboxes_corners = tf.stack(
-        # top left x, top left y, bottom right x, bottom right y
-        [center_x - 0.5 * width, center_y - 0.5 * height, center_x + 0.5 * width, center_y + 0.5 * height],
-        axis=-1,
-    )
-    return bboxes_corners
-
-
 # 2 functions below inspired by https://github.com/facebookresearch/detr/blob/master/util/box_ops.py
 def center_to_corners_format(bboxes_center: TensorType) -> TensorType:
    """
@ -576,14 +556,11 @@ def center_to_corners_format(bboxes_center: TensorType) -> TensorType:
    corners format: contains the coordinates for the top-left and bottom-right corners of the box
        (top_left_x, top_left_y, bottom_right_x, bottom_right_y)
    """
-    # Function is used during model forward pass, so we use the input framework if possible, without
-    # converting to numpy
+    # Function is used during model forward pass, so we use torch if relevant, without converting to numpy
    if is_torch_tensor(bboxes_center):
        return _center_to_corners_format_torch(bboxes_center)
    elif isinstance(bboxes_center, np.ndarray):
        return _center_to_corners_format_numpy(bboxes_center)
-    elif is_tf_tensor(bboxes_center):
-        return _center_to_corners_format_tf(bboxes_center)

    raise ValueError(f"Unsupported input type {type(bboxes_center)}")

@ -613,20 +590,6 @@ def _corners_to_center_format_numpy(bboxes_corners: np.ndarray) -> np.ndarray:
    return bboxes_center


-def _corners_to_center_format_tf(bboxes_corners: "tf.Tensor") -> "tf.Tensor":
-    top_left_x, top_left_y, bottom_right_x, bottom_right_y = tf.unstack(bboxes_corners, axis=-1)
-    bboxes_center = tf.stack(
-        [
-            (top_left_x + bottom_right_x) / 2,  # center x
-            (top_left_y + bottom_right_y) / 2,  # center y
-            (bottom_right_x - top_left_x),  # width
-            (bottom_right_y - top_left_y),  # height
-        ],
-        axis=-1,
-    )
-    return bboxes_center
-
-
 def corners_to_center_format(bboxes_corners: TensorType) -> TensorType:
    """
    Converts bounding boxes from corners format to center format.
@ -641,8 +604,6 @@ def corners_to_center_format(bboxes_corners: TensorType) -> TensorType:
        return _corners_to_center_format_torch(bboxes_corners)
    elif isinstance(bboxes_corners, np.ndarray):
        return _corners_to_center_format_numpy(bboxes_corners)
-    elif is_tf_tensor(bboxes_corners):
-        return _corners_to_center_format_tf(bboxes_corners)

    raise ValueError(f"Unsupported input type {type(bboxes_corners)}")

--- a/src/transformers/image_utils.py
+++ b/src/transformers/image_utils.py
@ -24,9 +24,7 @@ import requests

 from .utils import (
    ExplicitEnum,
-    is_jax_tensor,
    is_numpy_array,
-    is_tf_tensor,
    is_torch_available,
    is_torch_tensor,
    is_torchvision_available,
@ -107,8 +105,6 @@ class ImageType(ExplicitEnum):
    PIL = "pillow"
    TORCH = "torch"
    NUMPY = "numpy"
-    TENSORFLOW = "tensorflow"
-    JAX = "jax"


 def get_image_type(image):
@ -118,15 +114,11 @@ def get_image_type(image):
        return ImageType.TORCH
    if is_numpy_array(image):
        return ImageType.NUMPY
-    if is_tf_tensor(image):
-        return ImageType.TENSORFLOW
-    if is_jax_tensor(image):
-        return ImageType.JAX
    raise ValueError(f"Unrecognized image type {type(image)}")


 def is_valid_image(img):
-    return is_pil_image(img) or is_numpy_array(img) or is_torch_tensor(img) or is_tf_tensor(img) or is_jax_tensor(img)
+    return is_pil_image(img) or is_numpy_array(img) or is_torch_tensor(img)


 def is_valid_list_of_images(images: list):
@ -205,8 +197,7 @@ def make_list_of_images(images, expected_ndims: int = 3) -> list[ImageInput]:
            )
        return images
    raise ValueError(
-        "Invalid image type. Expected either PIL.Image.Image, numpy.ndarray, torch.Tensor, tf.Tensor or "
-        f"jax.ndarray, but got {type(images)}."
+        f"Invalid image type. Expected either PIL.Image.Image, numpy.ndarray, or torch.Tensor, but got {type(images)}."
    )


@ -570,7 +561,6 @@ def validate_preprocess_arguments(
        raise ValueError("`size` and `resample/interpolation` must be specified if `do_resize` is `True`.")


-# In the future we can add a TF implementation here when we have TF models.
 class ImageFeatureExtractionMixin:
    """
    Mixin that contain utilities for preparing image features.
--- a/src/transformers/integrations/integration_utils.py
+++ b/src/transformers/integrations/integration_utils.py
@ -48,7 +48,6 @@ from ..utils import (
    flatten_dict,
    is_datasets_available,
    is_pandas_available,
-    is_tf_available,
    is_torch_available,
    logging,
 )
@ -56,8 +55,6 @@ from ..utils import (

 logger = logging.get_logger(__name__)

-if is_tf_available():
-    from .. import TFPreTrainedModel

 if is_torch_available():
    import torch
@ -760,12 +757,6 @@ def save_model_architecture_to_file(model: Any, output_dir: str):
    with open(f"{output_dir}/model_architecture.txt", "w+") as f:
        if isinstance(model, PreTrainedModel):
            print(model, file=f)
-        elif is_tf_available() and isinstance(model, TFPreTrainedModel):
-
-            def print_to_file(s):
-                print(s, file=f)
-
-            model.summary(print_fn=print_to_file)
        elif is_torch_available() and (
            isinstance(model, (torch.nn.Module, PushToHubMixin)) and hasattr(model, "base_model")
        ):
@ -1225,7 +1216,7 @@ class CometCallback(TrainerCallback):
        - **COMET_PROJECT_NAME** (`str`, *optional*):
            Comet project name for experiments.
        - **COMET_LOG_ASSETS** (`str`, *optional*, defaults to `TRUE`):
-            Whether or not to log training assets (tf event logs, checkpoints, etc), to Comet. Can be `TRUE`, or
+            Whether or not to log training assets (checkpoints, etc), to Comet. Can be `TRUE`, or
            `FALSE`.

        For a number of configurable items in the environment, see
--- a/src/transformers/keras_callbacks.py
+++ b/src/transformers/keras_callbacks.py
@ -1,413 +0,0 @@
-import logging
-import os
-from pathlib import Path
-from time import sleep
-from typing import Callable, Optional, Union
-
-import numpy as np
-import tensorflow as tf
-from huggingface_hub import Repository, create_repo
-from packaging.version import parse
-
-from . import IntervalStrategy, PreTrainedTokenizerBase
-from .modelcard import TrainingSummary
-from .modeling_tf_utils import keras
-
-
-logger = logging.getLogger(__name__)
-
-
-class KerasMetricCallback(keras.callbacks.Callback):
-    """
-    Callback to compute metrics at the end of every epoch. Unlike normal Keras metrics, these do not need to be
-    compilable by TF. It is particularly useful for common NLP metrics like BLEU and ROUGE that require string
-    operations or generation loops that cannot be compiled. Predictions (or generations) will be computed on the
-    `eval_dataset` before being passed to the `metric_fn` in `np.ndarray` format. The `metric_fn` should compute
-    metrics and return a dict mapping metric names to metric values.
-
-    We provide an example of a suitable metric_fn that computes ROUGE scores for a summarization model below. Note that
-    this example skips some post-processing for readability and simplicity, and should probably not be used as-is!
-
-    ```py
-    from datasets import load_metric
-
-    rouge_metric = load_metric("rouge")
-
-
-    def rouge_fn(predictions, labels):
-        decoded_predictions = tokenizer.batch_decode(predictions, skip_special_tokens=True)
-        decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
-        result = rouge_metric.compute(predictions=decoded_predictions, references=decoded_labels)
-        return {key: value.mid.fmeasure * 100 for key, value in result.items()}
-    ```
-
-    The above function will return a dict containing values which will be logged like any other Keras metric:
-
-    ```
-    {'rouge1': 37.4199, 'rouge2': 13.9768, 'rougeL': 34.361, 'rougeLsum': 35.0781
-    ```
-
-    Args:
-        metric_fn (`Callable`):
-            Metric function provided by the user. It will be called with two arguments - `predictions` and `labels`.
-            These contain the model's outputs and matching labels from the dataset. It should return a dict mapping
-            metric names to numerical values.
-        eval_dataset (`tf.data.Dataset` or `dict` or `tuple` or `np.ndarray` or `tf.Tensor`):
-            Validation data to be used to generate predictions for the `metric_fn`.
-        output_cols (`list[str], *optional*):
-            A list of columns to be retained from the model output as the predictions. Defaults to all.
-        label_cols ('`list[str]`, *optional*'):
-            A list of columns to be retained from the input dataset as the labels. Will be autodetected if this is not
-            supplied.
-        batch_size (`int`, *optional*):
-            Batch size. Only used when the data is not a pre-batched `tf.data.Dataset`.
-        predict_with_generate (`bool`, *optional*, defaults to `False`):
-            Whether we should use `model.generate()` to get outputs for the model.
-        use_xla_generation (`bool`, *optional*, defaults to `False`):
-            If we're generating, whether to compile model generation with XLA. This can massively increase the speed of
-            generation (up to 100X speedup) but will require a new XLA compilation for each input shape. When using XLA
-            generation, it's a good idea to pad your inputs to the same size, or to use the `pad_to_multiple_of`
-            argument in your `tokenizer` or `DataCollator`, which will reduce the number of unique input shapes and
-            save a lot of compilation time. This option has no effect is `predict_with_generate` is `False`.
-        generate_kwargs (`dict`, *optional*):
-            Keyword arguments to pass to `model.generate()` when generating. Has no effect if `predict_with_generate`
-            is `False`.
-
-    """
-
-    def __init__(
-        self,
-        metric_fn: Callable,
-        eval_dataset: Union[tf.data.Dataset, np.ndarray, tf.Tensor, tuple, dict],
-        output_cols: Optional[list[str]] = None,
-        label_cols: Optional[list[str]] = None,
-        batch_size: Optional[int] = None,
-        predict_with_generate: bool = False,
-        use_xla_generation: bool = False,
-        generate_kwargs: Optional[dict] = None,
-    ):
-        super().__init__()
-        self.metric_fn = metric_fn
-        self.batch_size = batch_size
-        if not isinstance(eval_dataset, tf.data.Dataset):
-            if batch_size is None:
-                raise ValueError(
-                    "When passing data to KerasMetricCallback that is not a pre-batched tf.data.Dataset "
-                    "the batch_size argument must be set."
-                )
-            # Wrap a tf.data.Dataset around it
-            eval_dataset = tf.data.Dataset.from_tensor_slices(eval_dataset).batch(batch_size, drop_remainder=False)
-        self.eval_dataset = eval_dataset
-        self.predict_with_generate = predict_with_generate
-        self.output_cols = output_cols
-
-        # This next block attempts to parse out which elements of the dataset should be appended to the labels list
-        # that is passed to the metric_fn
-        if isinstance(eval_dataset.element_spec, tuple) and len(eval_dataset.element_spec) == 2:
-            input_spec, label_spec = eval_dataset.element_spec
-        else:
-            input_spec = eval_dataset.element_spec
-            label_spec = None
-        if label_cols is not None:
-            for label in label_cols:
-                if label not in input_spec:
-                    raise ValueError(f"Label {label} is in label_cols but could not be found in the dataset inputs!")
-            self.label_cols = label_cols
-            self.use_keras_label = False
-        elif label_spec is not None:
-            # If the dataset inputs are split into a 2-tuple of inputs and labels,
-            # assume the second element is the labels
-            self.label_cols = None
-            self.use_keras_label = True
-        elif "labels" in input_spec:
-            self.label_cols = ["labels"]
-            self.use_keras_label = False
-            logging.warning("No label_cols specified for KerasMetricCallback, assuming you want the 'labels' key.")
-        elif "start_positions" in input_spec and "end_positions" in input_spec:
-            self.label_cols = ["start_positions", "end_positions"]
-            self.use_keras_label = False
-            logging.warning(
-                "No label_cols specified for KerasMetricCallback, assuming you want the "
-                "start_positions and end_positions keys."
-            )
-        else:
-            raise ValueError("Could not autodetect label_cols for KerasMetricCallback, please specify them!")
-        if parse(tf.__version__) < parse("2.7"):
-            logging.warning("TF versions less than 2.7 may encounter issues with KerasMetricCallback!")
-
-        self.use_xla_generation = use_xla_generation
-        self.generate_kwargs = {} if generate_kwargs is None else generate_kwargs
-
-        self.generation_function = None
-
-    @staticmethod
-    def _concatenate_batches(batches, padding_index=-100):
-        # If all batches are unidimensional or same length, do a simple concatenation
-        if batches[0].ndim == 1 or all(batch.shape[1] == batches[0].shape[1] for batch in batches):
-            return np.concatenate(batches, axis=0)
-
-        # Welp, they're not the same length. Let's do some padding
-        max_len = max([batch.shape[1] for batch in batches])
-        num_samples = sum([batch.shape[0] for batch in batches])
-        output = np.full_like(
-            batches[0], fill_value=padding_index, shape=[num_samples, max_len] + list(batches[0].shape[2:])
-        )
-        # i keeps track of which part of the concatenated array we're writing the next batch to
-        i = 0
-        for batch in batches:
-            output[i : i + len(batch), : batch.shape[1]] = batch
-            i += len(batch)
-        return output
-
-    def _postprocess_predictions_or_labels(self, inputs):
-        if isinstance(inputs[0], dict):
-            outputs = {}
-            for key in inputs[0]:
-                outputs[key] = self._concatenate_batches([batch[key] for batch in inputs])
-            # If it's a dict with only one key, just return the array
-            if len(outputs) == 1:
-                outputs = list(outputs.values())[0]
-        elif isinstance(inputs[0], (tuple, list)):
-            outputs = []
-            for input_list in zip(*inputs):
-                outputs.append(self._concatenate_batches(input_list))
-            if len(outputs) == 1:
-                outputs = outputs[0]  # If it's a list with only one element, just return the array
-        elif isinstance(inputs[0], np.ndarray):
-            outputs = self._concatenate_batches(inputs)
-        elif isinstance(inputs[0], tf.Tensor):
-            outputs = self._concatenate_batches([tensor.numpy() for tensor in inputs])
-        else:
-            raise TypeError(f"Couldn't handle batch of type {type(inputs[0])}!")
-        return outputs
-
-    def on_epoch_end(self, epoch, logs=None):
-        if hasattr(self.model, "config"):
-            ignore_keys = getattr(self.model.config, "keys_to_ignore_at_inference", [])
-        else:
-            ignore_keys = []
-
-        main_input_name = None
-        if self.predict_with_generate:
-            # This dense conditional recognizes the case where we have an encoder-decoder model, but
-            # avoids getting tangled up when we just have a model with a layer called 'encoder'
-            if hasattr(self.model, "encoder") and hasattr(self.model.encoder, "main_input_name"):
-                main_input_name = self.model.encoder.main_input_name
-            else:
-                main_input_name = getattr(self.model, "main_input_name", "input_ids")
-
-            if self.use_xla_generation and self.generation_function is None:
-
-                def generation_function(inputs, attention_mask):
-                    return self.model.generate(inputs, attention_mask=attention_mask, **self.generate_kwargs)
-
-                self.generation_function = tf.function(generation_function, jit_compile=True)
-
-        prediction_list = []
-        label_list = []
-
-        # The whole predict/generate loop is handled inside this method
-        for batch in self.eval_dataset:
-            if isinstance(batch, tuple):
-                batch, labels = batch
-            else:
-                labels = None
-            if self.predict_with_generate:
-                if isinstance(batch, dict):
-                    generation_inputs = batch[main_input_name]
-                    attention_mask = batch.get("attention_mask", None)
-                else:
-                    generation_inputs = batch
-                    attention_mask = None
-                if self.use_xla_generation:
-                    predictions = self.generation_function(generation_inputs, attention_mask=attention_mask)
-                else:
-                    predictions = self.model.generate(
-                        generation_inputs, attention_mask=attention_mask, **self.generate_kwargs
-                    )
-            else:
-                predictions = self.model.predict_on_batch(batch)
-                if isinstance(predictions, dict):
-                    # This converts any dict-subclass to a regular dict
-                    # Keras REALLY doesn't like it when we pass around a BatchEncoding or other derived class
-                    predictions = dict(predictions)
-                    if self.output_cols is not None:
-                        predictions = {key: predictions[key] for key in self.output_cols}
-                    else:
-                        predictions = {
-                            key: val for key, val in predictions.items() if key not in ignore_keys + ["loss"]
-                        }
-            prediction_list.append(predictions)
-            if not self.use_keras_label:
-                labels = {key: batch[key].numpy() for key in self.label_cols}
-            elif isinstance(labels, dict):
-                labels = {key: array.numpy() for key, array in labels.items()}
-            elif isinstance(labels, (list, tuple)):
-                labels = [array.numpy() for array in labels]
-            elif isinstance(labels, tf.Tensor):
-                labels = labels.numpy()
-            else:
-                raise TypeError(f"Confused by labels of type {type(labels)}")
-            label_list.append(labels)
-
-        all_preds = self._postprocess_predictions_or_labels(prediction_list)
-        all_labels = self._postprocess_predictions_or_labels(label_list)
-
-        metric_output = self.metric_fn((all_preds, all_labels))
-        if not isinstance(metric_output, dict):
-            raise TypeError(
-                f"metric_fn should return a dict mapping metric names to values but instead returned {metric_output}"
-            )
-        # This is the critical bit - Keras passes a dict containing the loss and standard metric values for this epoch
-        # in the logs argument. Ordinarily, this is so the callback can read them, but in this case we write a bunch of
-        # new keys in there, which will then get read by the History callback and treated like any other metric value.
-        # I promise that I have it in writing from Chollet that this is okay.
-        logs.update(metric_output)
-
-
-class PushToHubCallback(keras.callbacks.Callback):
-    """
-    Callback that will save and push the model to the Hub regularly. By default, it pushes once per epoch, but this can
-    be changed with the `save_strategy` argument. Pushed models can be accessed like any other model on the hub, such
-    as with the `from_pretrained` method.
-
-    ```py
-    from transformers.keras_callbacks import PushToHubCallback
-
-    push_to_hub_callback = PushToHubCallback(
-        output_dir="./model_save",
-        tokenizer=tokenizer,
-        hub_model_id="gpt5-7xlarge",
-    )
-
-    model.fit(train_dataset, callbacks=[push_to_hub_callback])
-    ```
-
-    Args:
-        output_dir (`str`):
-            The output directory where the model predictions and checkpoints will be written and synced with the
-            repository on the Hub.
-        save_strategy (`str` or [`~trainer_utils.IntervalStrategy`], *optional*, defaults to `"epoch"`):
-            The checkpoint save strategy to adopt during training. Possible values are:
-
-                - `"no"`: Save is done at the end of training.
-                - `"epoch"`: Save is done at the end of each epoch.
-                - `"steps"`: Save is done every `save_steps`
-        save_steps (`int`, *optional*):
-            The number of steps between saves when using the "steps" `save_strategy`.
-        tokenizer (`PreTrainedTokenizerBase`, *optional*):
-            The tokenizer used by the model. If supplied, will be uploaded to the repo alongside the weights.
-        hub_model_id (`str`, *optional*):
-            The name of the repository to keep in sync with the local `output_dir`. It can be a simple model ID in
-            which case the model will be pushed in your namespace. Otherwise it should be the whole repository name,
-            for instance `"user_name/model"`, which allows you to push to an organization you are a member of with
-            `"organization_name/model"`.
-
-            Will default to the name of `output_dir`.
-        hub_token (`str`, *optional*):
-            The token to use to push the model to the Hub. Will default to the token in the cache folder obtained with
-            `hf auth login`.
-        checkpoint (`bool`, *optional*, defaults to `False`):
-            Whether to save full training checkpoints (including epoch and optimizer state) to allow training to be
-            resumed. Only usable when `save_strategy` is `"epoch"`.
-    """
-
-    def __init__(
-        self,
-        output_dir: Union[str, Path],
-        save_strategy: Union[str, IntervalStrategy] = "epoch",
-        save_steps: Optional[int] = None,
-        tokenizer: Optional[PreTrainedTokenizerBase] = None,
-        hub_model_id: Optional[str] = None,
-        hub_token: Optional[str] = None,
-        checkpoint: bool = False,
-        **model_card_args,
-    ):
-        super().__init__()
-        if checkpoint and save_strategy != "epoch":
-            raise ValueError("Cannot save checkpoints when save_strategy is not 'epoch'!")
-        if isinstance(save_strategy, str):
-            save_strategy = IntervalStrategy(save_strategy.lower())
-        self.save_strategy = save_strategy
-        if self.save_strategy == IntervalStrategy.STEPS and (not isinstance(save_steps, int) or save_steps <= 0):
-            raise ValueError("Please supply a positive integer argument for save_steps when save_strategy == 'steps'!")
-        self.save_steps = save_steps
-        output_dir = Path(output_dir)
-
-        # Create repo and retrieve repo_id
-        if hub_model_id is None:
-            hub_model_id = output_dir.absolute().name
-        self.hub_model_id = create_repo(repo_id=hub_model_id, exist_ok=True, token=hub_token).repo_id
-
-        self.output_dir = output_dir
-        self.repo = Repository(str(self.output_dir), clone_from=self.hub_model_id, token=hub_token)
-
-        self.tokenizer = tokenizer
-        self.last_job = None
-        self.checkpoint = checkpoint
-        self.training_history = None
-        self.model_card_args = model_card_args
-
-    def on_train_begin(self, logs=None):
-        # Although we can access model.history, we have no guarantees that the History callback will fire before this
-        # one, so we keep track of it here too
-        self.training_history = []
-
-    def on_train_batch_end(self, batch, logs=None):
-        if self.save_strategy == IntervalStrategy.STEPS and (batch + 1) % self.save_steps == 0:
-            if self.last_job is not None and not self.last_job.is_done:
-                return  # The last upload is still running, don't start another
-            self.model.save_pretrained(self.output_dir)
-            if self.tokenizer is not None:
-                self.tokenizer.save_pretrained(self.output_dir)
-            _, self.last_job = self.repo.push_to_hub(
-                commit_message=f"Training in progress steps {batch}", blocking=False
-            )
-
-    def on_epoch_end(self, epoch, logs=None):
-        logs = logs.copy()  # Don't accidentally write things that Keras will read later
-        if "epoch" not in logs:
-            logs["epoch"] = epoch
-        self.training_history.append(logs)
-        if self.save_strategy == IntervalStrategy.EPOCH:
-            if self.last_job is not None and not self.last_job.is_done:
-                return  # The last upload is still running, don't start another
-            self.model.save_pretrained(self.output_dir)
-            if self.tokenizer is not None:
-                self.tokenizer.save_pretrained(self.output_dir)
-            if self.checkpoint:
-                checkpoint_dir = os.path.join(self.output_dir, "checkpoint")
-                self.model._save_checkpoint(checkpoint_dir, epoch)
-            train_summary = TrainingSummary.from_keras(
-                model=self.model,
-                model_name=self.hub_model_id,
-                keras_history=self.training_history,
-                **self.model_card_args,
-            )
-            model_card = train_summary.to_model_card()
-            with (self.output_dir / "README.md").open("w") as f:
-                f.write(model_card)
-            _, self.last_job = self.repo.push_to_hub(
-                commit_message=f"Training in progress epoch {epoch}", blocking=False
-            )
-
-    def on_train_end(self, logs=None):
-        # Makes sure the latest version of the model is uploaded
-        if self.last_job is not None and not self.last_job.is_done:
-            logging.info("Pushing the last epoch to the Hub, this may take a while...")
-            while not self.last_job.is_done:
-                sleep(1)
-        else:
-            self.model.save_pretrained(self.output_dir)
-            if self.tokenizer is not None:
-                self.tokenizer.save_pretrained(self.output_dir)
-            train_summary = TrainingSummary.from_keras(
-                model=self.model,
-                model_name=self.hub_model_id,
-                keras_history=self.training_history,
-                **self.model_card_args,
-            )
-            model_card = train_summary.to_model_card()
-            with (self.output_dir / "README.md").open("w") as f:
-                f.write(model_card)
-            self.repo.push_to_hub(commit_message="End of training", blocking=True)
--- a/src/transformers/modelcard.py
+++ b/src/transformers/modelcard.py
@ -51,7 +51,6 @@ from .utils import (
    cached_file,
    is_datasets_available,
    is_offline_mode,
-    is_tf_available,
    is_tokenizers_available,
    is_torch_available,
    logging,
@ -256,11 +255,6 @@ AUTOGENERATED_TRAINER_COMMENT = """
 should probably proofread and complete it, then remove this comment. -->
 """

-AUTOGENERATED_KERAS_COMMENT = """
-<!-- This model card has been generated automatically according to the information Keras had access to. You should
-probably proofread and complete it, then remove this comment. -->
-"""
-

 TASK_TAG_TO_NAME_MAPPING = {
    "fill-mask": "Masked Language Modeling",
@ -483,8 +477,6 @@ class TrainingSummary:
        # Now the model card for realsies.
        if self.source == "trainer":
            model_card += AUTOGENERATED_TRAINER_COMMENT
-        else:
-            model_card += AUTOGENERATED_KERAS_COMMENT

        model_card += f"\n# {self.model_name}\n\n"

@ -538,10 +530,6 @@ class TrainingSummary:
            import torch

            model_card += f"- Pytorch {torch.__version__}\n"
-        elif self.source == "keras" and is_tf_available():
-            import tensorflow as tf
-
-            model_card += f"- TensorFlow {tf.__version__}\n"
        if is_datasets_available():
            import datasets

@ -631,116 +619,6 @@ class TrainingSummary:
            hyperparameters=hyperparameters,
        )

-    @classmethod
-    def from_keras(
-        cls,
-        model,
-        model_name,
-        keras_history=None,
-        language=None,
-        license=None,
-        tags=None,
-        finetuned_from=None,
-        tasks=None,
-        dataset_tags=None,
-        dataset=None,
-        dataset_args=None,
-    ):
-        # Infer default from dataset
-        if dataset is not None:
-            if is_hf_dataset(dataset) and (dataset_tags is None or dataset_args is None):
-                default_tag = dataset.builder_name
-                # Those are not real datasets from the Hub so we exclude them.
-                if default_tag not in ["csv", "json", "pandas", "parquet", "text"]:
-                    if dataset_tags is None:
-                        dataset_tags = [default_tag]
-                    if dataset_args is None:
-                        dataset_args = [dataset.config_name]
-
-        if dataset is None and dataset_tags is not None:
-            dataset = dataset_tags
-
-        # Infer default finetuned_from
-        if (
-            finetuned_from is None
-            and hasattr(model.config, "_name_or_path")
-            and not os.path.isdir(model.config._name_or_path)
-        ):
-            finetuned_from = model.config._name_or_path
-
-        # Infer default task tag:
-        if tasks is None:
-            model_class_name = model.__class__.__name__
-            for task, mapping in TASK_MAPPING.items():
-                if model_class_name in _get_mapping_values(mapping):
-                    tasks = task
-
-        # Add `generated_from_keras_callback` to the tags
-        if tags is None:
-            tags = ["generated_from_keras_callback"]
-        elif isinstance(tags, str) and tags != "generated_from_keras_callback":
-            tags = [tags, "generated_from_keras_callback"]
-        elif "generated_from_keras_callback" not in tags:
-            tags.append("generated_from_keras_callback")
-
-        if keras_history is not None:
-            _, eval_lines, eval_results = parse_keras_history(keras_history)
-        else:
-            eval_lines = []
-            eval_results = {}
-        hyperparameters = extract_hyperparameters_from_keras(model)
-
-        return cls(
-            language=language,
-            license=license,
-            tags=tags,
-            model_name=model_name,
-            finetuned_from=finetuned_from,
-            tasks=tasks,
-            dataset_tags=dataset_tags,
-            dataset=dataset,
-            dataset_args=dataset_args,
-            eval_results=eval_results,
-            eval_lines=eval_lines,
-            hyperparameters=hyperparameters,
-            source="keras",
-        )
-
-
-def parse_keras_history(logs):
-    """
-    Parse the `logs` of either a `keras.History` object returned by `model.fit()` or an accumulated logs `dict`
-    passed to the `PushToHubCallback`. Returns lines and logs compatible with those returned by `parse_log_history`.
-    """
-    if hasattr(logs, "history"):
-        # This looks like a `History` object
-        if not hasattr(logs, "epoch"):
-            # This history looks empty, return empty results
-            return None, [], {}
-        logs.history["epoch"] = logs.epoch
-        logs = logs.history
-    else:
-        # Training logs is a list of dicts, let's invert it to a dict of lists to match a History object
-        logs = {log_key: [single_dict[log_key] for single_dict in logs] for log_key in logs[0]}
-
-    lines = []
-    for i in range(len(logs["epoch"])):
-        epoch_dict = {log_key: log_value_list[i] for log_key, log_value_list in logs.items()}
-        values = {}
-        for k, v in epoch_dict.items():
-            if k.startswith("val_"):
-                k = "validation_" + k[4:]
-            elif k != "epoch":
-                k = "train_" + k
-            splits = k.split("_")
-            name = " ".join([part.capitalize() for part in splits])
-            values[name] = v
-        lines.append(values)
-
-    eval_results = lines[-1]
-
-    return logs, lines, eval_results
-

 def parse_log_history(log_history):
    """
@ -804,19 +682,6 @@ def parse_log_history(log_history):
        return train_log, lines, None


-def extract_hyperparameters_from_keras(model):
-    from .modeling_tf_utils import keras
-
-    hyperparameters = {}
-    if hasattr(model, "optimizer") and model.optimizer is not None:
-        hyperparameters["optimizer"] = model.optimizer.get_config()
-    else:
-        hyperparameters["optimizer"] = None
-    hyperparameters["training_precision"] = keras.mixed_precision.global_policy().name
-
-    return hyperparameters
-
-
 def _maybe_round(v, decimals=4):
    if isinstance(v, float) and len(str(v).split(".")) > 1 and len(str(v).split(".")[1]) > decimals:
        return f"{v:.{decimals}f}"
--- a/src/transformers/modeling_flax_outputs.py
+++ b/src/transformers/modeling_flax_outputs.py
@ -1,700 +0,0 @@
-# Copyright 2021 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from typing import Optional
-
-import flax
-import jax.numpy as jnp
-
-from .utils import ModelOutput
-
-
-@flax.struct.dataclass
-class FlaxBaseModelOutput(ModelOutput):
-    """
-    Base class for model's outputs, with potential hidden states and attentions.
-
-    Args:
-        last_hidden_state (`jnp.ndarray` of shape `(batch_size, sequence_length, hidden_size)`):
-            Sequence of hidden-states at the output of the last layer of the model.
-        hidden_states (`tuple(jnp.ndarray)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
-            Tuple of `jnp.ndarray` (one for the output of the embeddings + one for the output of each layer) of shape
-            `(batch_size, sequence_length, hidden_size)`.
-
-            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-        attentions (`tuple(jnp.ndarray)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
-            Tuple of `jnp.ndarray` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
-            sequence_length)`.
-
-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
-            heads.
-    """
-
-    last_hidden_state: Optional[jnp.ndarray] = None
-    hidden_states: Optional[tuple[jnp.ndarray]] = None
-    attentions: Optional[tuple[jnp.ndarray]] = None
-
-
-@flax.struct.dataclass
-class FlaxBaseModelOutputWithNoAttention(ModelOutput):
-    """
-    Base class for model's outputs, with potential hidden states.
-
-    Args:
-        last_hidden_state (`jnp.ndarray` of shape `(batch_size, num_channels, height, width)`):
-            Sequence of hidden-states at the output of the last layer of the model.
-        hidden_states (`tuple(jnp.ndarray)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
-            Tuple of `jnp.ndarray` (one for the output of the embeddings, if the model has an embedding layer, + one
-            for the output of each layer) of shape `(batch_size, num_channels, height, width)`. Hidden-states of the
-            model at the output of each layer plus the optional initial embedding outputs.
-    """
-
-    last_hidden_state: Optional[jnp.ndarray] = None
-    hidden_states: Optional[tuple[jnp.ndarray]] = None
-
-
-@flax.struct.dataclass
-class FlaxBaseModelOutputWithPoolingAndNoAttention(ModelOutput):
-    """
-    Base class for model's outputs that also contains a pooling of the last hidden states.
-
-    Args:
-        last_hidden_state (`jnp.ndarray` of shape `(batch_size, num_channels, height, width)`):
-            Sequence of hidden-states at the output of the last layer of the model.
-        pooler_output (`jnp.ndarray` of shape `(batch_size, hidden_size)`):
-            Last layer hidden-state after a pooling operation on the spatial dimensions.
-        hidden_states (`tuple(jnp.ndarray)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
-            Tuple of `jnp.ndarray` (one for the output of the embeddings, if the model has an embedding layer, + one
-            for the output of each layer) of shape `(batch_size, num_channels, height, width)`. Hidden-states of the
-            model at the output of each layer plus the optional initial embedding outputs.
-    """
-
-    last_hidden_state: Optional[jnp.ndarray] = None
-    pooler_output: Optional[jnp.ndarray] = None
-    hidden_states: Optional[tuple[jnp.ndarray]] = None
-
-
-@flax.struct.dataclass
-class FlaxImageClassifierOutputWithNoAttention(ModelOutput):
-    """
-    Base class for outputs of image classification models.
-
-    Args:
-        logits (`jnp.ndarray` of shape `(batch_size, config.num_labels)`):
-            Classification (or regression if config.num_labels==1) scores (before SoftMax).
-        hidden_states (`tuple(jnp.ndarray)`, *optional*, returned when `output_hidden_states=True` is passed or when
-        `config.output_hidden_states=True`):
-            Tuple of `jnp.ndarray` (one for the output of the embeddings, if the model has an embedding layer, + one
-            for the output of each stage) of shape `(batch_size, num_channels, height, width)`. Hidden-states (also
-            called feature maps) of the model at the output of each stage.
-    """
-
-    logits: Optional[jnp.ndarray] = None
-    hidden_states: Optional[tuple[jnp.ndarray]] = None
-
-
-@flax.struct.dataclass
-class FlaxBaseModelOutputWithPast(ModelOutput):
-    """
-    Base class for model's outputs, with potential hidden states and attentions.
-
-    Args:
-        last_hidden_state (`jnp.ndarray` of shape `(batch_size, sequence_length, hidden_size)`):
-            Sequence of hidden-states at the output of the last layer of the model.
-        past_key_values (`dict[str, jnp.ndarray]`):
-            Dictionary of pre-computed hidden-states (key and values in the attention blocks) that can be used for fast
-            auto-regressive decoding. Pre-computed key and value hidden-states are of shape *[batch_size, max_length]*.
-        hidden_states (`tuple(jnp.ndarray)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
-            Tuple of `jnp.ndarray` (one for the output of the embeddings + one for the output of each layer) of shape
-            `(batch_size, sequence_length, hidden_size)`.
-
-            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-        attentions (`tuple(jnp.ndarray)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
-            Tuple of `jnp.ndarray` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
-            sequence_length)`.
-
-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
-            heads.
-    """
-
-    last_hidden_state: Optional[jnp.ndarray] = None
-    past_key_values: Optional[dict[str, jnp.ndarray]] = None
-    hidden_states: Optional[tuple[jnp.ndarray]] = None
-    attentions: Optional[tuple[jnp.ndarray]] = None
-
-
-@flax.struct.dataclass
-class FlaxBaseModelOutputWithPooling(ModelOutput):
-    """
-    Base class for model's outputs that also contains a pooling of the last hidden states.
-
-    Args:
-        last_hidden_state (`jnp.ndarray` of shape `(batch_size, sequence_length, hidden_size)`):
-            Sequence of hidden-states at the output of the last layer of the model.
-        pooler_output (`jnp.ndarray` of shape `(batch_size, hidden_size)`):
-            Last layer hidden-state of the first token of the sequence (classification token) further processed by a
-            Linear layer and a Tanh activation function. The Linear layer weights are trained from the next sentence
-            prediction (classification) objective during pretraining.
-        hidden_states (`tuple(jnp.ndarray)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
-            Tuple of `jnp.ndarray` (one for the output of the embeddings + one for the output of each layer) of shape
-            `(batch_size, sequence_length, hidden_size)`.
-
-            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-        attentions (`tuple(jnp.ndarray)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
-            Tuple of `jnp.ndarray` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
-            sequence_length)`.
-
-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
-            heads.
-    """
-
-    last_hidden_state: Optional[jnp.ndarray] = None
-    pooler_output: Optional[jnp.ndarray] = None
-    hidden_states: Optional[tuple[jnp.ndarray]] = None
-    attentions: Optional[tuple[jnp.ndarray]] = None
-
-
-@flax.struct.dataclass
-class FlaxBaseModelOutputWithPoolingAndCrossAttentions(ModelOutput):
-    """
-    Base class for model's outputs that also contains a pooling of the last hidden states.
-
-    Args:
-        last_hidden_state (`jnp.ndarray` of shape `(batch_size, sequence_length, hidden_size)`):
-            Sequence of hidden-states at the output of the last layer of the model.
-        pooler_output (`jnp.ndarray` of shape `(batch_size, hidden_size)`):
-            Last layer hidden-state of the first token of the sequence (classification token) after further processing
-            through the layers used for the auxiliary pretraining task. E.g. for BERT-family of models, this returns
-            the classification token after processing through a linear layer and a tanh activation function. The linear
-            layer weights are trained from the next sentence prediction (classification) objective during pretraining.
-        hidden_states (`tuple(jnp.ndarray)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
-            Tuple of `jnp.ndarray` (one for the output of the embeddings, if the model has an embedding layer, + one
-            for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
-
-            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
-        attentions (`tuple(jnp.ndarray)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
-            Tuple of `jnp.ndarray` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
-            sequence_length)`.
-
-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
-            heads.
-        cross_attentions (`tuple(jnp.ndarray)`, *optional*, returned when `output_attentions=True` and `config.add_cross_attention=True` is passed or when `config.output_attentions=True`):
-            Tuple of `jnp.ndarray` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
-            sequence_length)`.
-
-            Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
-            weighted average in the cross-attention heads.
-        past_key_values (`tuple(tuple(jnp.ndarray))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
-            Tuple of `tuple(jnp.ndarray)` of length `config.n_layers`, with each tuple having 2 tensors of shape
-            `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and optionally if
-            `config.is_encoder_decoder=True` 2 additional tensors of shape `(batch_size, num_heads,
-            encoder_sequence_length, embed_size_per_head)`.
-
-            Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if
-            `config.is_encoder_decoder=True` in the cross-attention blocks) that can be used (see `past_key_values`
-            input) to speed up sequential decoding.
-    """
-
-    last_hidden_state: Optional[jnp.ndarray] = None
-    pooler_output: Optional[jnp.ndarray] = None
-    hidden_states: Optional[tuple[jnp.ndarray]] = None
-    past_key_values: Optional[tuple[tuple[jnp.ndarray]]] = None
-    attentions: Optional[tuple[jnp.ndarray]] = None
-    cross_attentions: Optional[tuple[jnp.ndarray]] = None
-
-
-@flax.struct.dataclass
-class FlaxBaseModelOutputWithPastAndCrossAttentions(ModelOutput):
-    """
-    Base class for model's outputs that may also contain a past key/values (to speed up sequential decoding).
-
-    Args:
-        last_hidden_state (`jnp.ndarray` of shape `(batch_size, sequence_length, hidden_size)`):
-            Sequence of hidden-states at the output of the last layer of the model.
-
-            If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1,
-            hidden_size)` is output.
-        past_key_values (`tuple(tuple(jnp.ndarray))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
-            Tuple of `tuple(jnp.ndarray)` of length `config.n_layers`, with each tuple having 2 tensors of shape
-            `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and optionally if
-            `config.is_encoder_decoder=True` 2 additional tensors of shape `(batch_size, num_heads,
-            encoder_sequence_length, embed_size_per_head)`.
-
-            Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if
-            `config.is_encoder_decoder=True` in the cross-attention blocks) that can be used (see `past_key_values`
-            input) to speed up sequential decoding.
-        hidden_states (`tuple(jnp.ndarray)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
-            Tuple of `jnp.ndarray` (one for the output of the embeddings + one for the output of each layer) of shape
-            `(batch_size, sequence_length, hidden_size)`.
-
-            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-        attentions (`tuple(jnp.ndarray)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
-            Tuple of `jnp.ndarray` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
-            sequence_length)`.
-
-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
-            heads.
-        cross_attentions (`tuple(jnp.ndarray)`, *optional*, returned when `output_attentions=True` and `config.add_cross_attention=True` is passed or when `config.output_attentions=True`):
-            Tuple of `jnp.ndarray` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
-            sequence_length)`.
-
-            Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
-            weighted average in the cross-attention heads.
-    """
-
-    last_hidden_state: Optional[jnp.ndarray] = None
-    past_key_values: Optional[tuple[tuple[jnp.ndarray]]] = None
-    hidden_states: Optional[tuple[jnp.ndarray]] = None
-    attentions: Optional[tuple[jnp.ndarray]] = None
-    cross_attentions: Optional[tuple[jnp.ndarray]] = None
-
-
-@flax.struct.dataclass
-class FlaxSeq2SeqModelOutput(ModelOutput):
-    """
-    Base class for model encoder's outputs that also contains : pre-computed hidden states that can speed up sequential
-    decoding.
-
-    Args:
-        last_hidden_state (`jnp.ndarray` of shape `(batch_size, sequence_length, hidden_size)`):
-            Sequence of hidden-states at the output of the last layer of the decoder of the model.
-
-            If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1,
-            hidden_size)` is output.
-        past_key_values (`tuple(tuple(jnp.ndarray))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
-            Tuple of `tuple(jnp.ndarray)` of length `config.n_layers`, with each tuple having 2 tensors of shape
-            `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape
-            `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
-
-            Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
-            blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
-        decoder_hidden_states (`tuple(jnp.ndarray)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
-            Tuple of `jnp.ndarray` (one for the output of the embeddings + one for the output of each layer) of shape
-            `(batch_size, sequence_length, hidden_size)`.
-
-            Hidden-states of the decoder at the output of each layer plus the initial embedding outputs.
-        decoder_attentions (`tuple(jnp.ndarray)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
-            Tuple of `jnp.ndarray` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
-            sequence_length)`.
-
-            Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the
-            self-attention heads.
-        cross_attentions (`tuple(jnp.ndarray)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
-            Tuple of `jnp.ndarray` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
-            sequence_length)`.
-
-            Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
-            weighted average in the cross-attention heads.
-        encoder_last_hidden_state (`jnp.ndarray` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
-            Sequence of hidden-states at the output of the last layer of the encoder of the model.
-        encoder_hidden_states (`tuple(jnp.ndarray)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
-            Tuple of `jnp.ndarray` (one for the output of the embeddings + one for the output of each layer) of shape
-            `(batch_size, sequence_length, hidden_size)`.
-
-            Hidden-states of the encoder at the output of each layer plus the initial embedding outputs.
-        encoder_attentions (`tuple(jnp.ndarray)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
-            Tuple of `jnp.ndarray` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
-            sequence_length)`.
-
-            Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the
-            self-attention heads.
-    """
-
-    last_hidden_state: Optional[jnp.ndarray] = None
-    past_key_values: Optional[tuple[tuple[jnp.ndarray]]] = None
-    decoder_hidden_states: Optional[tuple[jnp.ndarray]] = None
-    decoder_attentions: Optional[tuple[jnp.ndarray]] = None
-    cross_attentions: Optional[tuple[jnp.ndarray]] = None
-    encoder_last_hidden_state: Optional[jnp.ndarray] = None
-    encoder_hidden_states: Optional[tuple[jnp.ndarray]] = None
-    encoder_attentions: Optional[tuple[jnp.ndarray]] = None
-
-
-@flax.struct.dataclass
-class FlaxCausalLMOutputWithCrossAttentions(ModelOutput):
-    """
-    Base class for causal language model (or autoregressive) outputs.
-
-    Args:
-        logits (`jnp.ndarray` of shape `(batch_size, sequence_length, config.vocab_size)`):
-            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
-        hidden_states (`tuple(jnp.ndarray)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
-            Tuple of `jnp.ndarray` (one for the output of the embeddings + one for the output of each layer) of shape
-            `(batch_size, sequence_length, hidden_size)`.
-
-            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-        attentions (`tuple(jnp.ndarray)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
-            Tuple of `jnp.ndarray` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
-            sequence_length)`.
-
-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
-            heads.
-        cross_attentions (`tuple(jnp.ndarray)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
-            Tuple of `jnp.ndarray` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
-            sequence_length)`.
-
-            Cross attentions weights after the attention softmax, used to compute the weighted average in the
-            cross-attention heads.
-        past_key_values (`tuple(tuple(jnp.ndarray))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
-            Tuple of `jnp.ndarray` tuples of length `config.n_layers`, with each tuple containing the cached key, value
-            states of the self-attention and the cross-attention layers if model is used in encoder-decoder setting.
-            Only relevant if `config.is_decoder = True`.
-
-            Contains pre-computed hidden-states (key and values in the attention blocks) that can be used (see
-            `past_key_values` input) to speed up sequential decoding.
-    """
-
-    logits: Optional[jnp.ndarray] = None
-    past_key_values: Optional[tuple[tuple[jnp.ndarray]]] = None
-    hidden_states: Optional[tuple[jnp.ndarray]] = None
-    attentions: Optional[tuple[jnp.ndarray]] = None
-    cross_attentions: Optional[tuple[jnp.ndarray]] = None
-
-
-@flax.struct.dataclass
-class FlaxMaskedLMOutput(ModelOutput):
-    """
-    Base class for masked language models outputs.
-
-    Args:
-        logits (`jnp.ndarray` of shape `(batch_size, sequence_length, config.vocab_size)`):
-            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
-        hidden_states (`tuple(jnp.ndarray)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
-            Tuple of `jnp.ndarray` (one for the output of the embeddings + one for the output of each layer) of shape
-            `(batch_size, sequence_length, hidden_size)`.
-
-            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-        attentions (`tuple(jnp.ndarray)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
-            Tuple of `jnp.ndarray` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
-            sequence_length)`.
-
-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
-            heads.
-    """
-
-    logits: Optional[jnp.ndarray] = None
-    hidden_states: Optional[tuple[jnp.ndarray]] = None
-    attentions: Optional[tuple[jnp.ndarray]] = None
-
-
-FlaxCausalLMOutput = FlaxMaskedLMOutput
-
-
-@flax.struct.dataclass
-class FlaxSeq2SeqLMOutput(ModelOutput):
-    """
-    Base class for sequence-to-sequence language models outputs.
-
-    Args:
-        logits (`jnp.ndarray` of shape `(batch_size, sequence_length, config.vocab_size)`):
-            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
-        past_key_values (`tuple(tuple(jnp.ndarray))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
-            Tuple of `tuple(jnp.ndarray)` of length `config.n_layers`, with each tuple having 2 tensors of shape
-            `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape
-            `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
-
-            Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
-            blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
-        decoder_hidden_states (`tuple(jnp.ndarray)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
-            Tuple of `jnp.ndarray` (one for the output of the embeddings + one for the output of each layer) of shape
-            `(batch_size, sequence_length, hidden_size)`.
-
-            Hidden-states of the decoder at the output of each layer plus the initial embedding outputs.
-        decoder_attentions (`tuple(jnp.ndarray)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
-            Tuple of `jnp.ndarray` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
-            sequence_length)`.
-
-            Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the
-            self-attention heads.
-        cross_attentions (`tuple(jnp.ndarray)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
-            Tuple of `jnp.ndarray` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
-            sequence_length)`.
-
-            Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
-            weighted average in the cross-attention heads.
-        encoder_last_hidden_state (`jnp.ndarray` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
-            Sequence of hidden-states at the output of the last layer of the encoder of the model.
-        encoder_hidden_states (`tuple(jnp.ndarray)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
-            Tuple of `jnp.ndarray` (one for the output of the embeddings + one for the output of each layer) of shape
-            `(batch_size, sequence_length, hidden_size)`.
-
-            Hidden-states of the encoder at the output of each layer plus the initial embedding outputs.
-        encoder_attentions (`tuple(jnp.ndarray)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
-            Tuple of `jnp.ndarray` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
-            sequence_length)`.
-
-            Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the
-            self-attention heads.
-    """
-
-    logits: Optional[jnp.ndarray] = None
-    past_key_values: Optional[tuple[tuple[jnp.ndarray]]] = None
-    decoder_hidden_states: Optional[tuple[jnp.ndarray]] = None
-    decoder_attentions: Optional[tuple[jnp.ndarray]] = None
-    cross_attentions: Optional[tuple[jnp.ndarray]] = None
-    encoder_last_hidden_state: Optional[jnp.ndarray] = None
-    encoder_hidden_states: Optional[tuple[jnp.ndarray]] = None
-    encoder_attentions: Optional[tuple[jnp.ndarray]] = None
-
-
-@flax.struct.dataclass
-class FlaxNextSentencePredictorOutput(ModelOutput):
-    """
-    Base class for outputs of models predicting if two sentences are consecutive or not.
-
-    Args:
-        logits (`jnp.ndarray` of shape `(batch_size, 2)`):
-            Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation
-            before SoftMax).
-        hidden_states (`tuple(jnp.ndarray)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
-            Tuple of `jnp.ndarray` (one for the output of the embeddings + one for the output of each layer) of shape
-            `(batch_size, sequence_length, hidden_size)`.
-
-            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-        attentions (`tuple(jnp.ndarray)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
-            Tuple of `jnp.ndarray` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
-            sequence_length)`.
-
-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
-            heads.
-    """
-
-    logits: Optional[jnp.ndarray] = None
-    hidden_states: Optional[tuple[jnp.ndarray]] = None
-    attentions: Optional[tuple[jnp.ndarray]] = None
-
-
-@flax.struct.dataclass
-class FlaxSequenceClassifierOutput(ModelOutput):
-    """
-    Base class for outputs of sentence classification models.
-
-    Args:
-        logits (`jnp.ndarray` of shape `(batch_size, config.num_labels)`):
-            Classification (or regression if config.num_labels==1) scores (before SoftMax).
-        hidden_states (`tuple(jnp.ndarray)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
-            Tuple of `jnp.ndarray` (one for the output of the embeddings + one for the output of each layer) of shape
-            `(batch_size, sequence_length, hidden_size)`.
-
-            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-        attentions (`tuple(jnp.ndarray)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
-            Tuple of `jnp.ndarray` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
-            sequence_length)`.
-
-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
-            heads.
-    """
-
-    logits: Optional[jnp.ndarray] = None
-    hidden_states: Optional[tuple[jnp.ndarray]] = None
-    attentions: Optional[tuple[jnp.ndarray]] = None
-
-
-@flax.struct.dataclass
-class FlaxSeq2SeqSequenceClassifierOutput(ModelOutput):
-    """
-    Base class for outputs of sequence-to-sequence sentence classification models.
-
-    Args:
-        logits (`jnp.ndarray` of shape `(batch_size, config.num_labels)`):
-            Classification (or regression if config.num_labels==1) scores (before SoftMax).
-        past_key_values (`tuple(tuple(jnp.ndarray))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
-            Tuple of `tuple(jnp.ndarray)` of length `config.n_layers`, with each tuple having 2 tensors of shape
-            `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape
-            `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
-
-            Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
-            blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
-        decoder_hidden_states (`tuple(jnp.ndarray)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
-            Tuple of `jnp.ndarray` (one for the output of the embeddings + one for the output of each layer) of shape
-            `(batch_size, sequence_length, hidden_size)`.
-
-            Hidden-states of the decoder at the output of each layer plus the initial embedding outputs.
-        decoder_attentions (`tuple(jnp.ndarray)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
-            Tuple of `jnp.ndarray` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
-            sequence_length)`.
-
-            Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the
-            self-attention heads.
-        cross_attentions (`tuple(jnp.ndarray)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
-            Tuple of `jnp.ndarray` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
-            sequence_length)`.
-
-            Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
-            weighted average in the cross-attention heads.
-        encoder_last_hidden_state (`jnp.ndarray` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
-            Sequence of hidden-states at the output of the last layer of the encoder of the model.
-        encoder_hidden_states (`tuple(jnp.ndarray)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
-            Tuple of `jnp.ndarray` (one for the output of the embeddings + one for the output of each layer) of shape
-            `(batch_size, sequence_length, hidden_size)`.
-
-            Hidden-states of the encoder at the output of each layer plus the initial embedding outputs.
-        encoder_attentions (`tuple(jnp.ndarray)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
-            Tuple of `jnp.ndarray` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
-            sequence_length)`.
-
-            Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the
-            self-attention heads.
-    """
-
-    logits: Optional[jnp.ndarray] = None
-    past_key_values: Optional[tuple[tuple[jnp.ndarray]]] = None
-    decoder_hidden_states: Optional[tuple[jnp.ndarray]] = None
-    decoder_attentions: Optional[tuple[jnp.ndarray]] = None
-    cross_attentions: Optional[tuple[jnp.ndarray]] = None
-    encoder_last_hidden_state: Optional[jnp.ndarray] = None
-    encoder_hidden_states: Optional[tuple[jnp.ndarray]] = None
-    encoder_attentions: Optional[tuple[jnp.ndarray]] = None
-
-
-@flax.struct.dataclass
-class FlaxMultipleChoiceModelOutput(ModelOutput):
-    """
-    Base class for outputs of multiple choice models.
-
-    Args:
-        logits (`jnp.ndarray` of shape `(batch_size, num_choices)`):
-            *num_choices* is the second dimension of the input tensors. (see *input_ids* above).
-
-            Classification scores (before SoftMax).
-        hidden_states (`tuple(jnp.ndarray)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
-            Tuple of `jnp.ndarray` (one for the output of the embeddings + one for the output of each layer) of shape
-            `(batch_size, sequence_length, hidden_size)`.
-
-            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-        attentions (`tuple(jnp.ndarray)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
-            Tuple of `jnp.ndarray` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
-            sequence_length)`.
-
-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
-            heads.
-    """
-
-    logits: Optional[jnp.ndarray] = None
-    hidden_states: Optional[tuple[jnp.ndarray]] = None
-    attentions: Optional[tuple[jnp.ndarray]] = None
-
-
-@flax.struct.dataclass
-class FlaxTokenClassifierOutput(ModelOutput):
-    """
-    Base class for outputs of token classification models.
-
-    Args:
-        logits (`jnp.ndarray` of shape `(batch_size, sequence_length, config.num_labels)`):
-            Classification scores (before SoftMax).
-        hidden_states (`tuple(jnp.ndarray)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
-            Tuple of `jnp.ndarray` (one for the output of the embeddings + one for the output of each layer) of shape
-            `(batch_size, sequence_length, hidden_size)`.
-
-            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-        attentions (`tuple(jnp.ndarray)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
-            Tuple of `jnp.ndarray` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
-            sequence_length)`.
-
-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
-            heads.
-    """
-
-    logits: Optional[jnp.ndarray] = None
-    hidden_states: Optional[tuple[jnp.ndarray]] = None
-    attentions: Optional[tuple[jnp.ndarray]] = None
-
-
-@flax.struct.dataclass
-class FlaxQuestionAnsweringModelOutput(ModelOutput):
-    """
-    Base class for outputs of question answering models.
-
-    Args:
-        start_logits (`jnp.ndarray` of shape `(batch_size, sequence_length)`):
-            Span-start scores (before SoftMax).
-        end_logits (`jnp.ndarray` of shape `(batch_size, sequence_length)`):
-            Span-end scores (before SoftMax).
-        hidden_states (`tuple(jnp.ndarray)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
-            Tuple of `jnp.ndarray` (one for the output of the embeddings + one for the output of each layer) of shape
-            `(batch_size, sequence_length, hidden_size)`.
-
-            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-        attentions (`tuple(jnp.ndarray)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
-            Tuple of `jnp.ndarray` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
-            sequence_length)`.
-
-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
-            heads.
-    """
-
-    start_logits: Optional[jnp.ndarray] = None
-    end_logits: Optional[jnp.ndarray] = None
-    hidden_states: Optional[tuple[jnp.ndarray]] = None
-    attentions: Optional[tuple[jnp.ndarray]] = None
-
-
-@flax.struct.dataclass
-class FlaxSeq2SeqQuestionAnsweringModelOutput(ModelOutput):
-    """
-    Base class for outputs of sequence-to-sequence question answering models.
-
-    Args:
-        start_logits (`jnp.ndarray` of shape `(batch_size, sequence_length)`):
-            Span-start scores (before SoftMax).
-        end_logits (`jnp.ndarray` of shape `(batch_size, sequence_length)`):
-            Span-end scores (before SoftMax).
-        past_key_values (`tuple(tuple(jnp.ndarray))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
-            Tuple of `tuple(jnp.ndarray)` of length `config.n_layers`, with each tuple having 2 tensors of shape
-            `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape
-            `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
-
-            Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
-            blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
-        decoder_hidden_states (`tuple(jnp.ndarray)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
-            Tuple of `jnp.ndarray` (one for the output of the embeddings + one for the output of each layer) of shape
-            `(batch_size, sequence_length, hidden_size)`.
-
-            Hidden-states of the decoder at the output of each layer plus the initial embedding outputs.
-        decoder_attentions (`tuple(jnp.ndarray)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
-            Tuple of `jnp.ndarray` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
-            sequence_length)`.
-
-            Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the
-            self-attention heads.
-        cross_attentions (`tuple(jnp.ndarray)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
-            Tuple of `jnp.ndarray` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
-            sequence_length)`.
-
-            Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
-            weighted average in the cross-attention heads.
-        encoder_last_hidden_state (`jnp.ndarray` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
-            Sequence of hidden-states at the output of the last layer of the encoder of the model.
-        encoder_hidden_states (`tuple(jnp.ndarray)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
-            Tuple of `jnp.ndarray` (one for the output of the embeddings + one for the output of each layer) of shape
-            `(batch_size, sequence_length, hidden_size)`.
-
-            Hidden-states of the encoder at the output of each layer plus the initial embedding outputs.
-        encoder_attentions (`tuple(jnp.ndarray)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
-            Tuple of `jnp.ndarray` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
-            sequence_length)`.
-
-            Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the
-            self-attention heads.
-    """
-
-    start_logits: Optional[jnp.ndarray] = None
-    end_logits: Optional[jnp.ndarray] = None
-    past_key_values: Optional[tuple[tuple[jnp.ndarray]]] = None
-    decoder_hidden_states: Optional[tuple[jnp.ndarray]] = None
-    decoder_attentions: Optional[tuple[jnp.ndarray]] = None
-    cross_attentions: Optional[tuple[jnp.ndarray]] = None
-    encoder_last_hidden_state: Optional[jnp.ndarray] = None
-    encoder_hidden_states: Optional[tuple[jnp.ndarray]] = None
-    encoder_attentions: Optional[tuple[jnp.ndarray]] = None
--- a/src/transformers/modeling_flax_pytorch_utils.py
+++ b/src/transformers/modeling_flax_pytorch_utils.py
@ -1,491 +0,0 @@
-# coding=utf-8
-# Copyright 2021 The HuggingFace Inc. team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""PyTorch - Flax general utilities."""
-
-import os
-from pickle import UnpicklingError
-
-import jax
-import jax.numpy as jnp
-import numpy as np
-from flax.serialization import from_bytes
-from flax.traverse_util import flatten_dict, unflatten_dict
-
-import transformers
-
-from . import is_safetensors_available, is_torch_available
-from .utils import check_torch_load_is_safe, logging
-
-
-if is_torch_available():
-    import torch
-
-if is_safetensors_available():
-    from safetensors import safe_open
-    from safetensors.flax import load_file as safe_load_file
-
-
-logger = logging.get_logger(__name__)
-
-
-#####################
-# PyTorch => Flax #
-#####################
-
-
-def load_pytorch_checkpoint_in_flax_state_dict(
-    flax_model, pytorch_checkpoint_path, is_sharded, allow_missing_keys=False
-):
-    """Load pytorch checkpoints in a flax model"""
-
-    if not is_sharded:
-        pt_path = os.path.abspath(pytorch_checkpoint_path)
-        logger.info(f"Loading PyTorch weights from {pt_path}")
-
-        if pt_path.endswith(".safetensors"):
-            pt_state_dict = {}
-            with safe_open(pt_path, framework="flax") as f:
-                for k in f.keys():
-                    pt_state_dict[k] = f.get_tensor(k)
-        else:
-            try:
-                import torch  # noqa: F401
-            except (ImportError, ModuleNotFoundError):
-                logger.error(
-                    "Loading a PyTorch model in Flax, requires both PyTorch and Flax to be installed. Please see"
-                    " https://pytorch.org/ and https://flax.readthedocs.io/en/latest/index.html#installation for installation"
-                    " instructions."
-                )
-                raise
-
-            check_torch_load_is_safe()
-            pt_state_dict = torch.load(pt_path, map_location="cpu", weights_only=True)
-            logger.info(f"PyTorch checkpoint contains {sum(t.numel() for t in pt_state_dict.values()):,} parameters.")
-
-        flax_state_dict = convert_pytorch_state_dict_to_flax(pt_state_dict, flax_model)
-    else:
-        # model is sharded and pytorch_checkpoint_path already contains the list of .pt shard files
-        flax_state_dict = convert_pytorch_sharded_state_dict_to_flax(pytorch_checkpoint_path, flax_model)
-    return flax_state_dict
-
-
-def rename_key_and_reshape_tensor(
-    pt_tuple_key: tuple[str],
-    pt_tensor: np.ndarray,
-    random_flax_state_dict: dict[str, jnp.ndarray],
-    model_prefix: str,
-) -> tuple[tuple[str], np.ndarray]:
-    """Rename PT weight names to corresponding Flax weight names and reshape tensor if necessary"""
-
-    def is_key_or_prefix_key_in_dict(key: tuple[str]) -> bool:
-        """Checks if `key` of `(prefix,) + key` is in random_flax_state_dict"""
-        return len(set(random_flax_state_dict) & {key, (model_prefix,) + key}) > 0
-
-    # layer norm
-    renamed_pt_tuple_key = pt_tuple_key[:-1] + ("scale",)
-    if pt_tuple_key[-1] in ["weight", "gamma"] and is_key_or_prefix_key_in_dict(renamed_pt_tuple_key):
-        return renamed_pt_tuple_key, pt_tensor
-
-    # batch norm layer mean
-    renamed_pt_tuple_key = pt_tuple_key[:-1] + ("mean",)
-    if pt_tuple_key[-1] == "running_mean" and not is_key_or_prefix_key_in_dict(pt_tuple_key):
-        return renamed_pt_tuple_key, pt_tensor
-
-    # batch norm layer var
-    renamed_pt_tuple_key = pt_tuple_key[:-1] + ("var",)
-    if pt_tuple_key[-1] == "running_var" and not is_key_or_prefix_key_in_dict(pt_tuple_key):
-        return renamed_pt_tuple_key, pt_tensor
-
-    # embedding
-    renamed_pt_tuple_key = pt_tuple_key[:-1] + ("embedding",)
-    if pt_tuple_key[-1] == "weight" and is_key_or_prefix_key_in_dict(renamed_pt_tuple_key):
-        return renamed_pt_tuple_key, pt_tensor
-
-    # conv layer
-    renamed_pt_tuple_key = pt_tuple_key[:-1] + ("kernel",)
-    if pt_tuple_key[-1] == "weight" and pt_tensor.ndim == 4 and not is_key_or_prefix_key_in_dict(pt_tuple_key):
-        pt_tensor = pt_tensor.transpose(2, 3, 1, 0)
-        return renamed_pt_tuple_key, pt_tensor
-
-    # linear layer
-    renamed_pt_tuple_key = pt_tuple_key[:-1] + ("kernel",)
-    if pt_tuple_key[-1] == "weight" and not is_key_or_prefix_key_in_dict(pt_tuple_key):
-        pt_tensor = pt_tensor.T
-        return renamed_pt_tuple_key, pt_tensor
-
-    # old PyTorch layer norm weight
-    renamed_pt_tuple_key = pt_tuple_key[:-1] + ("weight",)
-    if pt_tuple_key[-1] == "gamma":
-        return renamed_pt_tuple_key, pt_tensor
-
-    # old PyTorch layer norm bias
-    renamed_pt_tuple_key = pt_tuple_key[:-1] + ("bias",)
-    if pt_tuple_key[-1] == "beta":
-        return renamed_pt_tuple_key, pt_tensor
-
-    # New `weight_norm` from https://github.com/huggingface/transformers/pull/24030
-    name = None
-    if pt_tuple_key[-3::2] == ("parametrizations", "original0"):
-        name = pt_tuple_key[-2] + "_g"
-    elif pt_tuple_key[-3::2] == ("parametrizations", "original1"):
-        name = pt_tuple_key[-2] + "_v"
-    if name is not None:
-        renamed_pt_tuple_key = pt_tuple_key[:-3] + (name,)
-        return renamed_pt_tuple_key, pt_tensor
-
-    return pt_tuple_key, pt_tensor
-
-
-def convert_pytorch_state_dict_to_flax(pt_state_dict, flax_model):
-    # convert pytorch tensor to numpy
-    from_bin = is_torch_available() and isinstance(next(iter(pt_state_dict.values())), torch.Tensor)
-    bfloat16 = torch.bfloat16 if from_bin else "bfloat16"
-
-    weight_dtypes = {k: v.dtype for k, v in pt_state_dict.items()}
-
-    if from_bin:
-        for k, v in pt_state_dict.items():
-            # numpy currently does not support bfloat16, need to go over float32 in this case to not lose precision
-            if v.dtype == bfloat16:
-                v = v.float()
-            pt_state_dict[k] = v.cpu().numpy()
-
-    model_prefix = flax_model.base_model_prefix
-
-    # use params dict if the model contains batch norm layers
-    if "params" in flax_model.params:
-        flax_model_params = flax_model.params["params"]
-    else:
-        flax_model_params = flax_model.params
-    random_flax_state_dict = flatten_dict(flax_model_params)
-
-    # add batch_stats keys,values to dict
-    if "batch_stats" in flax_model.params:
-        flax_batch_stats = flatten_dict(flax_model.params["batch_stats"])
-        random_flax_state_dict.update(flax_batch_stats)
-
-    flax_state_dict = {}
-
-    load_model_with_head_into_base_model = (model_prefix not in flax_model_params) and (
-        model_prefix in {k.split(".")[0] for k in pt_state_dict}
-    )
-    load_base_model_into_model_with_head = (model_prefix in flax_model_params) and (
-        model_prefix not in {k.split(".")[0] for k in pt_state_dict}
-    )
-
-    # Need to change some parameters name to match Flax names
-    for pt_key, pt_tensor in pt_state_dict.items():
-        pt_tuple_key = tuple(pt_key.split("."))
-        is_bfloat_16 = weight_dtypes[pt_key] == bfloat16
-
-        # remove base model prefix if necessary
-        has_base_model_prefix = pt_tuple_key[0] == model_prefix
-        if load_model_with_head_into_base_model and has_base_model_prefix:
-            pt_tuple_key = pt_tuple_key[1:]
-
-        # Correctly rename weight parameters
-        flax_key, flax_tensor = rename_key_and_reshape_tensor(
-            pt_tuple_key, pt_tensor, random_flax_state_dict, model_prefix
-        )
-
-        # add model prefix if necessary
-        require_base_model_prefix = (model_prefix,) + flax_key in random_flax_state_dict
-        if load_base_model_into_model_with_head and require_base_model_prefix:
-            flax_key = (model_prefix,) + flax_key
-
-        if flax_key in random_flax_state_dict:
-            if flax_tensor.shape != random_flax_state_dict[flax_key].shape:
-                raise ValueError(
-                    f"PyTorch checkpoint seems to be incorrect. Weight {pt_key} was expected to be of shape "
-                    f"{random_flax_state_dict[flax_key].shape}, but is {flax_tensor.shape}."
-                )
-
-        # add batch stats if the model contains batchnorm layers
-        if "batch_stats" in flax_model.params:
-            if "mean" in flax_key[-1] or "var" in flax_key[-1]:
-                flax_state_dict[("batch_stats",) + flax_key] = jnp.asarray(flax_tensor)
-                continue
-            # remove num_batches_tracked key
-            if "num_batches_tracked" in flax_key[-1]:
-                flax_state_dict.pop(flax_key, None)
-                continue
-
-            # also add unexpected weight so that warning is thrown
-            flax_state_dict[("params",) + flax_key] = (
-                jnp.asarray(flax_tensor) if not is_bfloat_16 else jnp.asarray(flax_tensor, dtype=jnp.bfloat16)
-            )
-        else:
-            # also add unexpected weight so that warning is thrown
-            flax_state_dict[flax_key] = (
-                jnp.asarray(flax_tensor) if not is_bfloat_16 else jnp.asarray(flax_tensor, dtype=jnp.bfloat16)
-            )
-
-    return unflatten_dict(flax_state_dict)
-
-
-############################
-# Sharded Pytorch => Flax #
-############################
-
-
-def convert_pytorch_sharded_state_dict_to_flax(shard_filenames, flax_model):
-    import torch
-
-    # Load the index
-    flax_state_dict = {}
-    for shard_file in shard_filenames:
-        # load using msgpack utils
-        check_torch_load_is_safe()
-        pt_state_dict = torch.load(shard_file, weights_only=True)
-        weight_dtypes = {k: v.dtype for k, v in pt_state_dict.items()}
-        pt_state_dict = {
-            k: v.numpy() if v.dtype != torch.bfloat16 else v.float().numpy() for k, v in pt_state_dict.items()
-        }
-
-        model_prefix = flax_model.base_model_prefix
-
-        # use params dict if the model contains batch norm layers and then add batch_stats keys,values to dict
-        if "batch_stats" in flax_model.params:
-            flax_model_params = flax_model.params["params"]
-
-            random_flax_state_dict = flatten_dict(flax_model_params)
-            random_flax_state_dict.update(flatten_dict(flax_model.params["batch_stats"]))
-        else:
-            flax_model_params = flax_model.params
-            random_flax_state_dict = flatten_dict(flax_model_params)
-
-        load_model_with_head_into_base_model = (model_prefix not in flax_model_params) and (
-            model_prefix in {k.split(".")[0] for k in pt_state_dict}
-        )
-        load_base_model_into_model_with_head = (model_prefix in flax_model_params) and (
-            model_prefix not in {k.split(".")[0] for k in pt_state_dict}
-        )
-        # Need to change some parameters name to match Flax names
-        for pt_key, pt_tensor in pt_state_dict.items():
-            pt_tuple_key = tuple(pt_key.split("."))
-            is_bfloat_16 = weight_dtypes[pt_key] == torch.bfloat16
-
-            # remove base model prefix if necessary
-            has_base_model_prefix = pt_tuple_key[0] == model_prefix
-            if load_model_with_head_into_base_model and has_base_model_prefix:
-                pt_tuple_key = pt_tuple_key[1:]
-
-            # Correctly rename weight parameters
-            flax_key, flax_tensor = rename_key_and_reshape_tensor(
-                pt_tuple_key, pt_tensor, random_flax_state_dict, model_prefix
-            )
-            # add model prefix if necessary
-            require_base_model_prefix = (model_prefix,) + flax_key in random_flax_state_dict
-            if load_base_model_into_model_with_head and require_base_model_prefix:
-                flax_key = (model_prefix,) + flax_key
-
-            if flax_key in random_flax_state_dict:
-                if flax_tensor.shape != random_flax_state_dict[flax_key].shape:
-                    raise ValueError(
-                        f"PyTorch checkpoint seems to be incorrect. Weight {pt_key} was expected to be of shape "
-                        f"{random_flax_state_dict[flax_key].shape}, but is {flax_tensor.shape}."
-                    )
-
-            # add batch stats if the model contains batchnorm layers
-            if "batch_stats" in flax_model.params:
-                if "mean" in flax_key[-1]:
-                    flax_state_dict[("batch_stats",) + flax_key] = jnp.asarray(flax_tensor)
-                    continue
-                if "var" in flax_key[-1]:
-                    flax_state_dict[("batch_stats",) + flax_key] = jnp.asarray(flax_tensor)
-                    continue
-                # remove num_batches_tracked key
-                if "num_batches_tracked" in flax_key[-1]:
-                    flax_state_dict.pop(flax_key, None)
-                    continue
-
-                # also add unexpected weight so that warning is thrown
-                flax_state_dict[("params",) + flax_key] = (
-                    jnp.asarray(flax_tensor) if not is_bfloat_16 else jnp.asarray(flax_tensor, dtype=jnp.bfloat16)
-                )
-
-            else:
-                # also add unexpected weight so that warning is thrown
-                flax_state_dict[flax_key] = (
-                    jnp.asarray(flax_tensor) if not is_bfloat_16 else jnp.asarray(flax_tensor, dtype=jnp.bfloat16)
-                )
-    return unflatten_dict(flax_state_dict)
-
-
-#####################
-# Flax => PyTorch #
-#####################
-
-
-def load_flax_checkpoint_in_pytorch_model(model, flax_checkpoint_path):
-    """Load flax checkpoints in a PyTorch model"""
-    flax_checkpoint_path = os.path.abspath(flax_checkpoint_path)
-    logger.info(f"Loading Flax weights from {flax_checkpoint_path}")
-
-    # import correct flax class
-    flax_cls = getattr(transformers, "Flax" + model.__class__.__name__)
-
-    # load flax weight dict
-    if flax_checkpoint_path.endswith(".safetensors"):
-        flax_state_dict = safe_load_file(flax_checkpoint_path)
-        flax_state_dict = unflatten_dict(flax_state_dict, sep=".")
-    else:
-        with open(flax_checkpoint_path, "rb") as state_f:
-            try:
-                flax_state_dict = from_bytes(flax_cls, state_f.read())
-            except UnpicklingError:
-                raise OSError(f"Unable to convert {flax_checkpoint_path} to Flax deserializable object. ")
-
-    return load_flax_weights_in_pytorch_model(model, flax_state_dict)
-
-
-def load_flax_weights_in_pytorch_model(pt_model, flax_state):
-    """Load flax checkpoints in a PyTorch model"""
-
-    try:
-        import torch  # noqa: F401
-    except (ImportError, ModuleNotFoundError):
-        logger.error(
-            "Loading a Flax weights in PyTorch, requires both PyTorch and Flax to be installed. Please see"
-            " https://pytorch.org/ and https://flax.readthedocs.io/en/latest/index.html#installation for installation"
-            " instructions."
-        )
-        raise
-
-    # check if we have bf16 weights
-    is_type_bf16 = flatten_dict(jax.tree_util.tree_map(lambda x: x.dtype == jnp.bfloat16, flax_state)).values()
-    if any(is_type_bf16):
-        # convert all weights to fp32 if the are bf16 since torch.from_numpy can-not handle bf16
-        # and bf16 is not fully supported in PT yet.
-        logger.warning(
-            "Found ``bfloat16`` weights in Flax model. Casting all ``bfloat16`` weights to ``float32`` "
-            "before loading those in PyTorch model."
-        )
-        flax_state = jax.tree_util.tree_map(
-            lambda params: params.astype(np.float32) if params.dtype == jnp.bfloat16 else params, flax_state
-        )
-
-    flax_state_dict = flatten_dict(flax_state)
-    pt_model_dict = pt_model.state_dict()
-
-    load_model_with_head_into_base_model = (pt_model.base_model_prefix in flax_state) and (
-        pt_model.base_model_prefix not in {k.split(".")[0] for k in pt_model_dict}
-    )
-    load_base_model_into_model_with_head = (pt_model.base_model_prefix not in flax_state) and (
-        pt_model.base_model_prefix in {k.split(".")[0] for k in pt_model_dict}
-    )
-
-    # keep track of unexpected & missing keys
-    unexpected_keys = []
-    missing_keys = set(pt_model_dict.keys())
-
-    for flax_key_tuple, flax_tensor in flax_state_dict.items():
-        has_base_model_prefix = flax_key_tuple[0] == pt_model.base_model_prefix
-        require_base_model_prefix = ".".join((pt_model.base_model_prefix,) + flax_key_tuple) in pt_model_dict
-
-        # adapt flax_key to prepare for loading from/to base model only
-        if load_model_with_head_into_base_model and has_base_model_prefix:
-            flax_key_tuple = flax_key_tuple[1:]
-        elif load_base_model_into_model_with_head and require_base_model_prefix:
-            flax_key_tuple = (pt_model.base_model_prefix,) + flax_key_tuple
-
-        # rename flax weights to PyTorch format
-        if flax_key_tuple[-1] == "kernel" and flax_tensor.ndim == 4 and ".".join(flax_key_tuple) not in pt_model_dict:
-            # conv layer
-            flax_key_tuple = flax_key_tuple[:-1] + ("weight",)
-            flax_tensor = jnp.transpose(flax_tensor, (3, 2, 0, 1))
-        elif flax_key_tuple[-1] == "kernel" and ".".join(flax_key_tuple) not in pt_model_dict:
-            # linear layer
-            flax_key_tuple = flax_key_tuple[:-1] + ("weight",)
-            flax_tensor = flax_tensor.T
-        elif flax_key_tuple[-1] in ["scale", "embedding"]:
-            flax_key_tuple = flax_key_tuple[:-1] + ("weight",)
-
-        # adding batch stats from flax batch norm to pt
-        elif "mean" in flax_key_tuple[-1]:
-            flax_key_tuple = flax_key_tuple[:-1] + ("running_mean",)
-        elif "var" in flax_key_tuple[-1]:
-            flax_key_tuple = flax_key_tuple[:-1] + ("running_var",)
-
-        if "batch_stats" in flax_state:
-            flax_key = ".".join(flax_key_tuple[1:])  # Remove the params/batch_stats header
-        else:
-            flax_key = ".".join(flax_key_tuple)
-
-        # We also need to look at `pt_model_dict` and see if there are keys requiring further transformation.
-        special_pt_names = {}
-        # New `weight_norm` from https://github.com/huggingface/transformers/pull/24030
-        for key in pt_model_dict:
-            key_components = key.split(".")
-            name = None
-            if key_components[-3::2] == ["parametrizations", "original0"]:
-                name = key_components[-2] + "_g"
-            elif key_components[-3::2] == ["parametrizations", "original1"]:
-                name = key_components[-2] + "_v"
-            if name is not None:
-                key_components = key_components[:-3] + [name]
-                key_to_check = ".".join(key_components)
-                special_pt_names[key_to_check] = key
-
-        if flax_key in special_pt_names:
-            flax_key = special_pt_names[flax_key]
-
-        if flax_key in pt_model_dict:
-            if flax_tensor.shape != pt_model_dict[flax_key].shape:
-                raise ValueError(
-                    f"Flax checkpoint seems to be incorrect. Weight {flax_key_tuple} was expected "
-                    f"to be of shape {pt_model_dict[flax_key].shape}, but is {flax_tensor.shape}."
-                )
-            else:
-                # add weight to pytorch dict
-                flax_tensor = np.asarray(flax_tensor) if not isinstance(flax_tensor, np.ndarray) else flax_tensor
-                pt_model_dict[flax_key] = torch.from_numpy(flax_tensor)
-                # remove from missing keys
-                missing_keys.remove(flax_key)
-        else:
-            # weight is not expected by PyTorch model
-            unexpected_keys.append(flax_key)
-
-    pt_model.load_state_dict(pt_model_dict)
-
-    # re-transform missing_keys to list
-    missing_keys = list(missing_keys)
-
-    if len(unexpected_keys) > 0:
-        logger.warning(
-            "Some weights of the Flax model were not used when initializing the PyTorch model"
-            f" {pt_model.__class__.__name__}: {unexpected_keys}\n- This IS expected if you are initializing"
-            f" {pt_model.__class__.__name__} from a Flax model trained on another task or with another architecture"
-            " (e.g. initializing a BertForSequenceClassification model from a FlaxBertForPreTraining model).\n- This"
-            f" IS NOT expected if you are initializing {pt_model.__class__.__name__} from a Flax model that you expect"
-            " to be exactly identical (e.g. initializing a BertForSequenceClassification model from a"
-            " FlaxBertForSequenceClassification model)."
-        )
-    else:
-        logger.warning(f"All Flax model weights were used when initializing {pt_model.__class__.__name__}.\n")
-    if len(missing_keys) > 0:
-        logger.warning(
-            f"Some weights of {pt_model.__class__.__name__} were not initialized from the Flax model and are newly"
-            f" initialized: {missing_keys}\nYou should probably TRAIN this model on a down-stream task to be able to"
-            " use it for predictions and inference."
-        )
-    else:
-        logger.warning(
-            f"All the weights of {pt_model.__class__.__name__} were initialized from the Flax model.\n"
-            "If your task is similar to the task the model of the checkpoint was trained on, "
-            f"you can already use {pt_model.__class__.__name__} for predictions without further training."
-        )
-
-    return pt_model
--- a/src/transformers/modeling_flax_utils.py
+++ b/src/transformers/modeling_flax_utils.py
--- a/src/transformers/modeling_tf_outputs.py
+++ b/src/transformers/modeling_tf_outputs.py
@ -1,990 +0,0 @@
-# Copyright 2020 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import annotations
-
-import warnings
-from dataclasses import dataclass
-
-import tensorflow as tf
-
-from .utils import ModelOutput
-
-
-@dataclass
-class TFBaseModelOutput(ModelOutput):
-    """
-    Base class for model's outputs, with potential hidden states and attentions.
-
-    Args:
-        last_hidden_state (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`):
-            Sequence of hidden-states at the output of the last layer of the model.
-        hidden_states (`tuple(tf.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
-            Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
-            `(batch_size, sequence_length, hidden_size)`.
-
-            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-        attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
-            Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
-            sequence_length)`.
-
-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
-            heads.
-    """
-
-    last_hidden_state: tf.Tensor | None = None
-    hidden_states: tuple[tf.Tensor] | None = None
-    attentions: tuple[tf.Tensor] | None = None
-
-
-@dataclass
-class TFBaseModelOutputWithNoAttention(ModelOutput):
-    """
-    Base class for model's outputs, with potential hidden states.
-
-    Args:
-        last_hidden_state (`tf.Tensor` shape `(batch_size, num_channels, height, width)`):
-            Sequence of hidden-states at the output of the last layer of the model.
-        hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
-            Tuple of `tf.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + one for
-            the output of each layer) of shape `(batch_size, num_channels, height, width)`.
-
-            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
-    """
-
-    last_hidden_state: tf.Tensor | None = None
-    hidden_states: tuple[tf.Tensor, ...] | None = None
-
-
-@dataclass
-class TFBaseModelOutputWithPooling(ModelOutput):
-    """
-    Base class for model's outputs that also contains a pooling of the last hidden states.
-
-    Args:
-        last_hidden_state (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`):
-            Sequence of hidden-states at the output of the last layer of the model.
-        pooler_output (`tf.Tensor` of shape `(batch_size, hidden_size)`):
-            Last layer hidden-state of the first token of the sequence (classification token) further processed by a
-            Linear layer and a Tanh activation function. The Linear layer weights are trained from the next sentence
-            prediction (classification) objective during pretraining.
-
-            This output is usually *not* a good summary of the semantic content of the input, you're often better with
-            averaging or pooling the sequence of hidden-states for the whole input sequence.
-        hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
-            Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
-            `(batch_size, sequence_length, hidden_size)`.
-
-            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-        attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
-            Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
-            sequence_length)`.
-
-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
-            heads.
-    """
-
-    last_hidden_state: tf.Tensor | None = None
-    pooler_output: tf.Tensor | None = None
-    hidden_states: tuple[tf.Tensor] | None = None
-    attentions: tuple[tf.Tensor] | None = None
-
-
-@dataclass
-class TFBaseModelOutputWithPoolingAndNoAttention(ModelOutput):
-    """
-    Base class for model's outputs that also contains a pooling of the last hidden states.
-
-    Args:
-        last_hidden_state (`tf.Tensor` of shape `(batch_size, num_channels, height, width)`):
-            Sequence of hidden-states at the output of the last layer of the model.
-        pooler_output (`tf.Tensor` of shape `(batch_size, hidden_size)`):
-            Last layer hidden-state after a pooling operation on the spatial dimensions.
-        hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
-            Tuple of `tf.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + one for
-            the output of each layer) of shape `(batch_size, num_channels, height, width)`.
-
-            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
-    """
-
-    last_hidden_state: tf.Tensor | None = None
-    pooler_output: tf.Tensor | None = None
-    hidden_states: tuple[tf.Tensor, ...] | None = None
-
-
-@dataclass
-class TFBaseModelOutputWithPoolingAndCrossAttentions(ModelOutput):
-    """
-    Base class for model's outputs that also contains a pooling of the last hidden states.
-
-    Args:
-        last_hidden_state (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`):
-            Sequence of hidden-states at the output of the last layer of the model.
-        pooler_output (`tf.Tensor` of shape `(batch_size, hidden_size)`):
-            Last layer hidden-state of the first token of the sequence (classification token) further processed by a
-            Linear layer and a Tanh activation function. The Linear layer weights are trained from the next sentence
-            prediction (classification) objective during pretraining.
-
-            This output is usually *not* a good summary of the semantic content of the input, you're often better with
-            averaging or pooling the sequence of hidden-states for the whole input sequence.
-        past_key_values (`list[tf.Tensor]`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
-            List of `tf.Tensor` of length `config.n_layers`, with each tensor of shape `(2, batch_size, num_heads,
-            sequence_length, embed_size_per_head)`).
-
-            Contains pre-computed hidden-states (key and values in the attention blocks) that can be used (see
-            `past_key_values` input) to speed up sequential decoding.
-        hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
-            Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
-            `(batch_size, sequence_length, hidden_size)`.
-
-            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-        attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
-            Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
-            sequence_length)`.
-
-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
-            heads.
-        cross_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
-            Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
-            sequence_length)`.
-
-            Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
-            weighted average in the cross-attention heads.
-    """
-
-    last_hidden_state: tf.Tensor | None = None
-    pooler_output: tf.Tensor | None = None
-    past_key_values: list[tf.Tensor] | None = None
-    hidden_states: tuple[tf.Tensor] | None = None
-    attentions: tuple[tf.Tensor] | None = None
-    cross_attentions: tuple[tf.Tensor] | None = None
-
-
-@dataclass
-class TFBaseModelOutputWithPast(ModelOutput):
-    """
-    Base class for model's outputs that may also contain a past key/values (to speed up sequential decoding).
-
-    Args:
-        last_hidden_state (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`):
-            Sequence of hidden-states at the output of the last layer of the model.
-
-            If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1,
-            hidden_size)` is output.
-        past_key_values (`list[tf.Tensor]`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
-            List of `tf.Tensor` of length `config.n_layers`, with each tensor of shape `(2, batch_size, num_heads,
-            sequence_length, embed_size_per_head)`).
-
-            Contains pre-computed hidden-states (key and values in the attention blocks) that can be used (see
-            `past_key_values` input) to speed up sequential decoding.
-        hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
-            Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
-            `(batch_size, sequence_length, hidden_size)`.
-
-            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-        attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
-            Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
-            sequence_length)`.
-
-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
-            heads.
-    """
-
-    last_hidden_state: tf.Tensor | None = None
-    past_key_values: list[tf.Tensor] | None = None
-    hidden_states: tuple[tf.Tensor] | None = None
-    attentions: tuple[tf.Tensor] | None = None
-
-
-@dataclass
-class TFBaseModelOutputWithCrossAttentions(ModelOutput):
-    """
-    Base class for model's outputs, with potential hidden states and attentions.
-
-    Args:
-        last_hidden_state (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`):
-            Sequence of hidden-states at the output of the last layer of the model.
-        hidden_states (`tuple(tf.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
-            Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
-            `(batch_size, sequence_length, hidden_size)`.
-
-            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-        attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
-            Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
-            sequence_length)`.
-
-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
-            heads.
-        cross_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
-            Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
-            sequence_length)`.
-
-            Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
-            weighted average in the cross-attention heads.
-    """
-
-    last_hidden_state: tf.Tensor | None = None
-    hidden_states: tuple[tf.Tensor] | None = None
-    attentions: tuple[tf.Tensor] | None = None
-    cross_attentions: tuple[tf.Tensor] | None = None
-
-
-@dataclass
-class TFBaseModelOutputWithPastAndCrossAttentions(ModelOutput):
-    """
-    Base class for model's outputs that may also contain a past key/values (to speed up sequential decoding).
-
-    Args:
-        last_hidden_state (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`):
-            Sequence of hidden-states at the output of the last layer of the model.
-
-            If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1,
-            hidden_size)` is output.
-        past_key_values (`list[tf.Tensor]`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
-            List of `tf.Tensor` of length `config.n_layers`, with each tensor of shape `(2, batch_size, num_heads,
-            sequence_length, embed_size_per_head)`).
-
-            Contains pre-computed hidden-states (key and values in the attention blocks) that can be used (see
-            `past_key_values` input) to speed up sequential decoding.
-        hidden_states (`tuple(tf.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
-            Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
-            `(batch_size, sequence_length, hidden_size)`.
-
-            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-        attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
-            Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
-            sequence_length)`.
-
-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
-            heads.
-        cross_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
-            Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
-            sequence_length)`.
-
-            Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
-            weighted average in the cross-attention heads.
-    """
-
-    last_hidden_state: tf.Tensor | None = None
-    past_key_values: list[tf.Tensor] | None = None
-    hidden_states: tuple[tf.Tensor] | None = None
-    attentions: tuple[tf.Tensor] | None = None
-    cross_attentions: tuple[tf.Tensor] | None = None
-
-
-@dataclass
-class TFSeq2SeqModelOutput(ModelOutput):
-    """
-    Base class for model encoder's outputs that also contains : pre-computed hidden states that can speed up sequential
-    decoding.
-
-    Args:
-        last_hidden_state (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`):
-            Sequence of hidden-states at the output of the last layer of the decoder of the model.
-
-            If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1,
-            hidden_size)` is output.
-        past_key_values (`list[tf.Tensor]`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
-            List of `tf.Tensor` of length `config.n_layers`, with each tensor of shape `(2, batch_size, num_heads,
-            sequence_length, embed_size_per_head)`).
-
-            Contains pre-computed hidden-states (key and values in the attention blocks) of the decoder that can be
-            used (see `past_key_values` input) to speed up sequential decoding.
-        decoder_hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
-            Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
-            `(batch_size, sequence_length, hidden_size)`.
-
-            Hidden-states of the decoder at the output of each layer plus the initial embedding outputs.
-        decoder_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
-            Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
-            sequence_length)`.
-
-            Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the
-            self-attention heads.
-        cross_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
-            Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
-            sequence_length)`.
-
-            Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
-            weighted average in the cross-attention heads.
-        encoder_last_hidden_state (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
-            Sequence of hidden-states at the output of the last layer of the encoder of the model.
-        encoder_hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
-            Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
-            `(batch_size, sequence_length, hidden_size)`.
-
-            Hidden-states of the encoder at the output of each layer plus the initial embedding outputs.
-        encoder_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
-            Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
-            sequence_length)`.
-
-            Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the
-            self-attention heads.
-    """
-
-    last_hidden_state: tf.Tensor | None = None
-    past_key_values: list[tf.Tensor] | None = None
-    decoder_hidden_states: tuple[tf.Tensor] | None = None
-    decoder_attentions: tuple[tf.Tensor] | None = None
-    cross_attentions: tuple[tf.Tensor] | None = None
-    encoder_last_hidden_state: tf.Tensor | None = None
-    encoder_hidden_states: tuple[tf.Tensor] | None = None
-    encoder_attentions: tuple[tf.Tensor] | None = None
-
-
-@dataclass
-class TFCausalLMOutput(ModelOutput):
-    """
-    Base class for causal language model (or autoregressive) outputs.
-
-    Args:
-        loss (`tf.Tensor` of shape `(n,)`, *optional*, where n is the number of non-masked labels, returned when `labels` is provided):
-            Language modeling loss (for next-token prediction).
-        logits (`tf.Tensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
-            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
-        hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
-            Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
-            `(batch_size, sequence_length, hidden_size)`.
-
-            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-        attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
-            Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
-            sequence_length)`.
-
-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
-            heads.
-    """
-
-    loss: tf.Tensor | None = None
-    logits: tf.Tensor | None = None
-    hidden_states: tuple[tf.Tensor] | None = None
-    attentions: tuple[tf.Tensor] | None = None
-
-
-@dataclass
-class TFCausalLMOutputWithPast(ModelOutput):
-    """
-    Base class for causal language model (or autoregressive) outputs.
-
-    Args:
-        loss (`tf.Tensor` of shape `(n,)`, *optional*, where n is the number of non-masked labels, returned when `labels` is provided):
-            Language modeling loss (for next-token prediction).
-        logits (`tf.Tensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
-            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
-        past_key_values (`list[tf.Tensor]`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
-            List of `tf.Tensor` of length `config.n_layers`, with each tensor of shape `(2, batch_size, num_heads,
-            sequence_length, embed_size_per_head)`).
-
-            Contains pre-computed hidden-states (key and values in the attention blocks) that can be used (see
-            `past_key_values` input) to speed up sequential decoding.
-        hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
-            Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
-            `(batch_size, sequence_length, hidden_size)`.
-
-            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-        attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
-            Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
-            sequence_length)`.
-
-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
-            heads.
-    """
-
-    loss: tf.Tensor | None = None
-    logits: tf.Tensor | None = None
-    past_key_values: list[tf.Tensor] | None = None
-    hidden_states: tuple[tf.Tensor] | None = None
-    attentions: tuple[tf.Tensor] | None = None
-
-
-@dataclass
-class TFCausalLMOutputWithCrossAttentions(ModelOutput):
-    """
-    Base class for causal language model (or autoregressive) outputs.
-
-    Args:
-        loss (`tf.Tensor` of shape `(n,)`, *optional*, where n is the number of non-masked labels, returned when `labels` is provided):
-            Language modeling loss (for next-token prediction).
-        logits (`tf.Tensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
-            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
-        hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
-            Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
-            `(batch_size, sequence_length, hidden_size)`.
-
-            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-        attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
-            Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
-            sequence_length)`.
-
-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
-            heads.
-        cross_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
-            Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
-            sequence_length)`.
-
-            Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
-            weighted average in the cross-attention heads.
-        past_key_values (`list[tf.Tensor]`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
-            List of `tf.Tensor` of length `config.n_layers`, with each tensor of shape `(2, batch_size, num_heads,
-            sequence_length, embed_size_per_head)`).
-
-            Contains pre-computed hidden-states (key and values in the attention blocks) that can be used (see
-            `past_key_values` input) to speed up sequential decoding.
-    """
-
-    loss: tf.Tensor | None = None
-    logits: tf.Tensor | None = None
-    past_key_values: list[tf.Tensor] | None = None
-    hidden_states: tuple[tf.Tensor] | None = None
-    attentions: tuple[tf.Tensor] | None = None
-    cross_attentions: tuple[tf.Tensor] | None = None
-
-
-@dataclass
-class TFMaskedLMOutput(ModelOutput):
-    """
-    Base class for masked language models outputs.
-
-    Args:
-        loss (`tf.Tensor` of shape `(n,)`, *optional*, where n is the number of non-masked labels, returned when `labels` is provided):
-            Masked language modeling (MLM) loss.
-        logits (`tf.Tensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
-            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
-        hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
-            Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
-            `(batch_size, sequence_length, hidden_size)`.
-
-            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-        attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
-            Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
-            sequence_length)`.
-
-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
-            heads.
-    """
-
-    loss: tf.Tensor | None = None
-    logits: tf.Tensor | None = None
-    hidden_states: tuple[tf.Tensor] | None = None
-    attentions: tuple[tf.Tensor] | None = None
-
-
-@dataclass
-class TFSeq2SeqLMOutput(ModelOutput):
-    """
-    Base class for sequence-to-sequence language models outputs.
-
-    Args:
-        loss (`tf.Tensor` of shape `(n,)`, *optional*, where n is the number of non-masked labels, returned when `labels` is provided):
-            Language modeling loss.
-        logits (`tf.Tensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
-            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
-        past_key_values (`list[tf.Tensor]`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
-            List of `tf.Tensor` of length `config.n_layers`, with each tensor of shape `(2, batch_size, num_heads,
-            sequence_length, embed_size_per_head)`).
-
-            Contains pre-computed hidden-states (key and values in the attention blocks) of the decoder that can be
-            used (see `past_key_values` input) to speed up sequential decoding.
-        decoder_hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
-            Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
-            `(batch_size, sequence_length, hidden_size)`.
-
-            Hidden-states of the decoder at the output of each layer plus the initial embedding outputs.
-        decoder_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
-            Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
-            sequence_length)`.
-
-            Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the
-            self-attention heads.
-        cross_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
-            Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
-            sequence_length)`.
-
-            Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
-            weighted average in the cross-attention heads.
-        encoder_last_hidden_state (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
-            Sequence of hidden-states at the output of the last layer of the encoder of the model.
-        encoder_hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
-            Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
-            `(batch_size, sequence_length, hidden_size)`.
-
-            Hidden-states of the encoder at the output of each layer plus the initial embedding outputs.
-        encoder_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
-            Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
-            sequence_length)`.
-
-            Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the
-            self-attention heads.
-    """
-
-    loss: tf.Tensor | None = None
-    logits: tf.Tensor | None = None
-    past_key_values: list[tf.Tensor] | None = None
-    decoder_hidden_states: tuple[tf.Tensor] | None = None
-    decoder_attentions: tuple[tf.Tensor] | None = None
-    cross_attentions: tuple[tf.Tensor] | None = None
-    encoder_last_hidden_state: tf.Tensor | None = None
-    encoder_hidden_states: tuple[tf.Tensor] | None = None
-    encoder_attentions: tuple[tf.Tensor] | None = None
-
-
-@dataclass
-class TFNextSentencePredictorOutput(ModelOutput):
-    """
-    Base class for outputs of models predicting if two sentences are consecutive or not.
-
-    Args:
-        loss (`tf.Tensor` of shape `(n,)`, *optional*, where n is the number of non-masked labels, returned when `next_sentence_label` is provided):
-            Next sentence prediction loss.
-        logits (`tf.Tensor` of shape `(batch_size, 2)`):
-            Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation
-            before SoftMax).
-        hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
-            Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
-            `(batch_size, sequence_length, hidden_size)`.
-
-            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-        attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
-            Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
-            sequence_length)`.
-
-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
-            heads.
-    """
-
-    loss: tf.Tensor | None = None
-    logits: tf.Tensor | None = None
-    hidden_states: tuple[tf.Tensor] | None = None
-    attentions: tuple[tf.Tensor] | None = None
-
-
-@dataclass
-class TFSequenceClassifierOutput(ModelOutput):
-    """
-    Base class for outputs of sentence classification models.
-
-    Args:
-        loss (`tf.Tensor` of shape `(batch_size, )`, *optional*, returned when `labels` is provided):
-            Classification (or regression if config.num_labels==1) loss.
-        logits (`tf.Tensor` of shape `(batch_size, config.num_labels)`):
-            Classification (or regression if config.num_labels==1) scores (before SoftMax).
-        hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
-            Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
-            `(batch_size, sequence_length, hidden_size)`.
-
-            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-        attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
-            Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
-            sequence_length)`.
-
-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
-            heads.
-    """
-
-    loss: tf.Tensor | None = None
-    logits: tf.Tensor | None = None
-    hidden_states: tuple[tf.Tensor] | None = None
-    attentions: tuple[tf.Tensor] | None = None
-
-
-@dataclass
-class TFSeq2SeqSequenceClassifierOutput(ModelOutput):
-    """
-    Base class for outputs of sequence-to-sequence sentence classification models.
-
-    Args:
-        loss (`tf.Tensor` of shape `(1,)`, *optional*, returned when `label` is provided):
-            Classification (or regression if config.num_labels==1) loss.
-        logits (`tf.Tensor` of shape `(batch_size, config.num_labels)`):
-            Classification (or regression if config.num_labels==1) scores (before SoftMax).
-        past_key_values (`list[tf.Tensor]`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
-            List of `tf.Tensor` of length `config.n_layers`, with each tensor of shape `(2, batch_size, num_heads,
-            sequence_length, embed_size_per_head)`).
-
-            Contains pre-computed hidden-states (key and values in the attention blocks) of the decoder that can be
-            used (see `past_key_values` input) to speed up sequential decoding.
-        decoder_hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
-            Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
-            `(batch_size, sequence_length, hidden_size)`.
-
-            Hidden-states of the decoder at the output of each layer plus the initial embedding outputs.
-        decoder_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
-            Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
-            sequence_length)`.
-
-            Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the
-            self-attention heads.
-        cross_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
-            Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
-            sequence_length)`
-        encoder_last_hidden_state (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
-            Sequence of hidden-states at the output of the last layer of the encoder of the model.
-        encoder_hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
-            Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
-            `(batch_size, sequence_length, hidden_size)`.
-
-            Hidden-states of the encoder at the output of each layer plus the initial embedding outputs.
-        encoder_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
-            Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
-            sequence_length)`.
-
-            Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the
-            self-attention heads.
-    """
-
-    loss: tf.Tensor | None = None
-    logits: tf.Tensor | None = None
-    past_key_values: list[tf.Tensor] | None = None
-    decoder_hidden_states: tuple[tf.Tensor] | None = None
-    decoder_attentions: tuple[tf.Tensor] | None = None
-    cross_attentions: tuple[tf.Tensor] | None = None
-    encoder_last_hidden_state: tf.Tensor | None = None
-    encoder_hidden_states: tuple[tf.Tensor] | None = None
-    encoder_attentions: tuple[tf.Tensor] | None = None
-
-
-@dataclass
-class TFSemanticSegmenterOutput(ModelOutput):
-    """
-    Base class for outputs of semantic segmentation models.
-
-    Args:
-        loss (`tf.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
-            Classification (or regression if config.num_labels==1) loss.
-        logits (`tf.Tensor` of shape `(batch_size, config.num_labels, logits_height, logits_width)`):
-            Classification scores for each pixel.
-
-            <Tip warning={true}>
-
-            The logits returned do not necessarily have the same size as the `pixel_values` passed as inputs. This is
-            to avoid doing two interpolations and lose some quality when a user needs to resize the logits to the
-            original image size as post-processing. You should always check your logits shape and resize as needed.
-
-            </Tip>
-
-        hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
-            Tuple of `tf.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + one for
-            the output of each layer) of shape `(batch_size, patch_size, hidden_size)`.
-
-            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
-        attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
-            Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, patch_size, sequence_length)`.
-
-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
-            heads.
-    """
-
-    loss: tf.Tensor | None = None
-    logits: tf.Tensor | None = None
-    hidden_states: tuple[tf.Tensor] | None = None
-    attentions: tuple[tf.Tensor] | None = None
-
-
-@dataclass
-class TFSemanticSegmenterOutputWithNoAttention(ModelOutput):
-    """
-    Base class for outputs of semantic segmentation models that do not output attention scores.
-
-    Args:
-        loss (`tf.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
-            Classification (or regression if config.num_labels==1) loss.
-        logits (`tf.Tensor` of shape `(batch_size, config.num_labels, logits_height, logits_width)`):
-            Classification scores for each pixel.
-
-            <Tip warning={true}>
-
-            The logits returned do not necessarily have the same size as the `pixel_values` passed as inputs. This is
-            to avoid doing two interpolations and lose some quality when a user needs to resize the logits to the
-            original image size as post-processing. You should always check your logits shape and resize as needed.
-
-            </Tip>
-
-        hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
-            Tuple of `tf.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + one for
-            the output of each layer) of shape `(batch_size, patch_size, hidden_size)`.
-
-            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
-    """
-
-    loss: tf.Tensor | None = None
-    logits: tf.Tensor | None = None
-    hidden_states: tuple[tf.Tensor] | None = None
-
-
-@dataclass
-class TFImageClassifierOutput(ModelOutput):
-    """
-    Base class for outputs of image classification models.
-
-    Args:
-        loss (`tf.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
-            Classification (or regression if config.num_labels==1) loss.
-        logits (`tf.Tensor` of shape `(batch_size, config.num_labels)`):
-            Classification (or regression if config.num_labels==1) scores (before SoftMax).
-        hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
-            Tuple of `tf.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + one for
-            the output of each stage) of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states (also called
-            feature maps) of the model at the output of each stage.
-        attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
-            Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, patch_size, sequence_length)`.
-
-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
-            heads.
-    """
-
-    loss: tf.Tensor | None = None
-    logits: tf.Tensor | None = None
-    hidden_states: tuple[tf.Tensor] | None = None
-    attentions: tuple[tf.Tensor] | None = None
-
-
-@dataclass
-class TFMultipleChoiceModelOutput(ModelOutput):
-    """
-    Base class for outputs of multiple choice models.
-
-    Args:
-        loss (`tf.Tensor` of shape *(batch_size, )*, *optional*, returned when `labels` is provided):
-            Classification loss.
-        logits (`tf.Tensor` of shape `(batch_size, num_choices)`):
-            *num_choices* is the second dimension of the input tensors. (see *input_ids* above).
-
-            Classification scores (before SoftMax).
-        hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
-            Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
-            `(batch_size, sequence_length, hidden_size)`.
-
-            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-        attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
-            Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
-            sequence_length)`.
-
-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
-            heads.
-    """
-
-    loss: tf.Tensor | None = None
-    logits: tf.Tensor | None = None
-    hidden_states: tuple[tf.Tensor] | None = None
-    attentions: tuple[tf.Tensor] | None = None
-
-
-@dataclass
-class TFTokenClassifierOutput(ModelOutput):
-    """
-    Base class for outputs of token classification models.
-
-    Args:
-        loss (`tf.Tensor` of shape `(n,)`, *optional*, where n is the number of unmasked labels, returned when `labels` is provided) :
-            Classification loss.
-        logits (`tf.Tensor` of shape `(batch_size, sequence_length, config.num_labels)`):
-            Classification scores (before SoftMax).
-        hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
-            Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
-            `(batch_size, sequence_length, hidden_size)`.
-
-            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-        attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
-            Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
-            sequence_length)`.
-
-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
-            heads.
-    """
-
-    loss: tf.Tensor | None = None
-    logits: tf.Tensor | None = None
-    hidden_states: tuple[tf.Tensor] | None = None
-    attentions: tuple[tf.Tensor] | None = None
-
-
-@dataclass
-class TFQuestionAnsweringModelOutput(ModelOutput):
-    """
-    Base class for outputs of question answering models.
-
-    Args:
-        loss (`tf.Tensor` of shape `(batch_size, )`, *optional*, returned when `start_positions` and `end_positions` are provided):
-            Total span extraction loss is the sum of a Cross-Entropy for the start and end positions.
-        start_logits (`tf.Tensor` of shape `(batch_size, sequence_length)`):
-            Span-start scores (before SoftMax).
-        end_logits (`tf.Tensor` of shape `(batch_size, sequence_length)`):
-            Span-end scores (before SoftMax).
-        hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
-            Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
-            `(batch_size, sequence_length, hidden_size)`.
-
-            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-        attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
-            Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
-            sequence_length)`.
-
-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
-            heads.
-    """
-
-    loss: tf.Tensor | None = None
-    start_logits: tf.Tensor | None = None
-    end_logits: tf.Tensor | None = None
-    hidden_states: tuple[tf.Tensor] | None = None
-    attentions: tuple[tf.Tensor] | None = None
-
-
-@dataclass
-class TFSeq2SeqQuestionAnsweringModelOutput(ModelOutput):
-    """
-    Base class for outputs of sequence-to-sequence question answering models.
-
-    Args:
-        loss (`tf.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
-            Total span extraction loss is the sum of a Cross-Entropy for the start and end positions.
-        start_logits (`tf.Tensor` of shape `(batch_size, sequence_length)`):
-            Span-start scores (before SoftMax).
-        end_logits (`tf.Tensor` of shape `(batch_size, sequence_length)`):
-            Span-end scores (before SoftMax).
-        past_key_values (`list[tf.Tensor]`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
-            List of `tf.Tensor` of length `config.n_layers`, with each tensor of shape `(2, batch_size, num_heads,
-            sequence_length, embed_size_per_head)`).
-
-            Contains pre-computed hidden-states (key and values in the attention blocks) of the decoder that can be
-            used (see `past_key_values` input) to speed up sequential decoding.
-        decoder_hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
-            Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
-            `(batch_size, sequence_length, hidden_size)`.
-
-            Hidden-states of the decoder at the output of each layer plus the initial embedding outputs.
-        decoder_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
-            Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
-            sequence_length)`.
-
-            Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the
-            self-attention heads.
-        encoder_last_hidden_state (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
-            Sequence of hidden-states at the output of the last layer of the encoder of the model.
-        encoder_hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
-            Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
-            `(batch_size, sequence_length, hidden_size)`.
-
-            Hidden-states of the encoder at the output of each layer plus the initial embedding outputs.
-        encoder_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
-            Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
-            sequence_length)`.
-
-            Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the
-            self-attention heads.
-    """
-
-    loss: tf.Tensor | None = None
-    start_logits: tf.Tensor | None = None
-    end_logits: tf.Tensor | None = None
-    past_key_values: list[tf.Tensor] | None = None
-    decoder_hidden_states: tuple[tf.Tensor] | None = None
-    decoder_attentions: tuple[tf.Tensor] | None = None
-    encoder_last_hidden_state: tf.Tensor | None = None
-    encoder_hidden_states: tuple[tf.Tensor] | None = None
-    encoder_attentions: tuple[tf.Tensor] | None = None
-
-
-@dataclass
-class TFSequenceClassifierOutputWithPast(ModelOutput):
-    """
-    Base class for outputs of sentence classification models.
-
-    Args:
-        loss (`tf.Tensor` of shape `(batch_size, )`, *optional*, returned when `labels` is provided):
-            Classification (or regression if config.num_labels==1) loss.
-        logits (`tf.Tensor` of shape `(batch_size, config.num_labels)`):
-            Classification (or regression if config.num_labels==1) scores (before SoftMax).
-        past_key_values (`list[tf.Tensor]`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
-            List of `tf.Tensor` of length `config.n_layers`, with each tensor of shape `(2, batch_size, num_heads,
-            sequence_length, embed_size_per_head)`).
-
-            Contains pre-computed hidden-states (key and values in the attention blocks) that can be used (see
-            `past_key_values` input) to speed up sequential decoding.
-        hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
-            Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
-            `(batch_size, sequence_length, hidden_size)`.
-
-            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-        attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
-            Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
-            sequence_length)`.
-
-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
-            heads.
-    """
-
-    loss: tf.Tensor | None = None
-    logits: tf.Tensor | None = None
-    past_key_values: list[tf.Tensor] | None = None
-    hidden_states: tuple[tf.Tensor] | None = None
-    attentions: tuple[tf.Tensor] | None = None
-
-
-@dataclass
-class TFImageClassifierOutputWithNoAttention(ModelOutput):
-    """
-    Base class for outputs of image classification models.
-
-    Args:
-        loss (`tf.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
-            Classification (or regression if config.num_labels==1) loss.
-        logits (`tf.Tensor` of shape `(batch_size, config.num_labels)`):
-            Classification (or regression if config.num_labels==1) scores (before SoftMax).
-        hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
-            Tuple of `tf.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + one for
-            the output of each stage) of shape `(batch_size, num_channels, height, width)`. Hidden-states (also called
-            feature maps) of the model at the output of each stage.
-    """
-
-    loss: tf.Tensor | None = None
-    logits: tf.Tensor | None = None
-    hidden_states: tuple[tf.Tensor, ...] | None = None
-
-
-@dataclass
-class TFMaskedImageModelingOutput(ModelOutput):
-    """
-    Base class for outputs of masked image completion / in-painting models.
-
-    Args:
-        loss (`tf.Tensor` of shape `(1,)`, *optional*, returned when `bool_masked_pos` is provided):
-            Reconstruction loss.
-        reconstruction (`tf.Tensor` of shape `(batch_size, num_channels, height, width)`):
-           Reconstructed / completed images.
-        hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when
-        `config.output_hidden_states=True`):
-            Tuple of `tf.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + one for
-            the output of each stage) of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states (also called
-            feature maps) of the model at the output of each stage.
-        attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when
-        `config.output_attentions=True`):
-            Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, patch_size, sequence_length)`.
-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
-            heads.
-    """
-
-    loss: tf.Tensor | None = None
-    reconstruction: tf.Tensor | None = None
-    hidden_states: tuple[tf.Tensor] | None = None
-    attentions: tuple[tf.Tensor] | None = None
-
-    @property
-    def logits(self):
-        warnings.warn(
-            "logits attribute is deprecated and will be removed in version 5 of Transformers."
-            " Please use the reconstruction attribute to retrieve the final output instead.",
-            FutureWarning,
-        )
-        return self.reconstruction
--- a/src/transformers/modeling_tf_pytorch_utils.py
+++ b/src/transformers/modeling_tf_pytorch_utils.py
@ -1,676 +0,0 @@
-# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
-# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""PyTorch - TF 2.0 general utilities."""
-
-import os
-import re
-
-import numpy
-
-from .utils import (
-    ExplicitEnum,
-    check_torch_load_is_safe,
-    expand_dims,
-    is_numpy_array,
-    is_safetensors_available,
-    is_torch_tensor,
-    logging,
-    reshape,
-    squeeze,
-    tensor_size,
-)
-from .utils import transpose as transpose_func
-
-
-if is_safetensors_available():
-    from safetensors import safe_open
-
-
-logger = logging.get_logger(__name__)
-
-
-class TransposeType(ExplicitEnum):
-    """
-    Possible ...
-    """
-
-    NO = "no"
-    SIMPLE = "simple"
-    CONV1D = "conv1d"
-    CONV2D = "conv2d"
-
-
-def convert_tf_weight_name_to_pt_weight_name(
-    tf_name, start_prefix_to_remove="", tf_weight_shape=None, name_scope=None
-):
-    """
-    Convert a TF 2.0 model variable name in a pytorch model weight name.
-
-    Conventions for TF2.0 scopes -> PyTorch attribute names conversions:
-
-        - '$1___$2' is replaced by $2 (can be used to duplicate or remove layers in TF2.0 vs PyTorch)
-        - '_._' is replaced by a new level separation (can be used to convert TF2.0 lists in PyTorch nn.ModulesList)
-
-    return tuple with:
-
-        - pytorch model weight name
-        - transpose: `TransposeType` member indicating whether and how TF2.0 and PyTorch weights matrices should be
-          transposed with regards to each other
-    """
-    if name_scope is not None:
-        if not tf_name.startswith(name_scope) and "final_logits_bias" not in tf_name:
-            raise ValueError(
-                f"Weight name {tf_name} does not start with name_scope {name_scope}. This is an internal error "
-                "in Transformers, so (unless you were doing something really evil) please open an issue to report it!"
-            )
-        tf_name = tf_name[len(name_scope) :]
-        tf_name = tf_name.lstrip("/")
-    tf_name = tf_name.replace(":0", "")  # device ids
-    if (len(tf_name) > 2048 and "___" in tf_name) or tf_name.count("___") > 10:
-        # ReDOS check
-        raise ValueError("TF variable name is too long or contains too many ___ separators: " + tf_name)
-    tf_name = re.sub(
-        r"/[^/]*___([^/]*)/", r"/\1/", tf_name
-    )  # '$1___$2' is replaced by $2 (can be used to duplicate or remove layers in TF2.0 vs PyTorch)
-    tf_name = tf_name.replace(
-        "_._", "/"
-    )  # '_._' is replaced by a level separation (can be used to convert TF2.0 lists in PyTorch nn.ModulesList)
-    tf_name = re.sub(r"//+", "/", tf_name)  # Remove empty levels at the end
-    tf_name = tf_name.split("/")  # Convert from TF2.0 '/' separators to PyTorch '.' separators
-    # Some weights have a single name without "/" such as final_logits_bias in BART
-    if len(tf_name) > 1:
-        tf_name = tf_name[1:]  # Remove level zero
-
-    tf_weight_shape = list(tf_weight_shape)
-
-    # When should we transpose the weights
-    if tf_name[-1] == "kernel" and tf_weight_shape is not None and len(tf_weight_shape) == 4:
-        transpose = TransposeType.CONV2D
-    elif tf_name[-1] == "kernel" and tf_weight_shape is not None and len(tf_weight_shape) == 3:
-        transpose = TransposeType.CONV1D
-    elif bool(
-        tf_name[-1] in ["kernel", "pointwise_kernel", "depthwise_kernel"]
-        or "emb_projs" in tf_name
-        or "out_projs" in tf_name
-    ):
-        transpose = TransposeType.SIMPLE
-    else:
-        transpose = TransposeType.NO
-
-    # Convert standard TF2.0 names in PyTorch names
-    if tf_name[-1] == "kernel" or tf_name[-1] == "embeddings" or tf_name[-1] == "gamma":
-        tf_name[-1] = "weight"
-    if tf_name[-1] == "beta":
-        tf_name[-1] = "bias"
-
-    # The SeparableConv1D TF layer contains two weights that are translated to PyTorch Conv1D here
-    if tf_name[-1] == "pointwise_kernel" or tf_name[-1] == "depthwise_kernel":
-        tf_name[-1] = tf_name[-1].replace("_kernel", ".weight")
-
-    # Remove prefix if needed
-    tf_name = ".".join(tf_name)
-    if start_prefix_to_remove:
-        tf_name = tf_name.replace(start_prefix_to_remove, "", 1)
-
-    return tf_name, transpose
-
-
-def apply_transpose(transpose: TransposeType, weight, match_shape=None, pt_to_tf=True):
-    """
-    Apply a transpose to some weight then tries to reshape the weight to the same shape as a given shape, all in a
-    framework agnostic way.
-    """
-    if transpose is TransposeType.CONV2D:
-        # Conv2D weight:
-        #    PT: (num_out_channel, num_in_channel, kernel[0], kernel[1])
-        # -> TF: (kernel[0], kernel[1], num_in_channel, num_out_channel)
-        axes = (2, 3, 1, 0) if pt_to_tf else (3, 2, 0, 1)
-        weight = transpose_func(weight, axes=axes)
-    elif transpose is TransposeType.CONV1D:
-        # Conv1D weight:
-        #    PT: (num_out_channel, num_in_channel, kernel)
-        # -> TF: (kernel, num_in_channel, num_out_channel)
-        weight = transpose_func(weight, axes=(2, 1, 0))
-    elif transpose is TransposeType.SIMPLE:
-        weight = transpose_func(weight)
-
-    if match_shape is None:
-        return weight
-
-    if len(match_shape) < len(weight.shape):
-        weight = squeeze(weight)
-    elif len(match_shape) > len(weight.shape):
-        weight = expand_dims(weight, axis=0)
-
-    if list(match_shape) != list(weight.shape):
-        try:
-            weight = reshape(weight, match_shape)
-        except AssertionError as e:
-            e.args += (match_shape, match_shape)
-            raise e
-
-    return weight
-
-
-#####################
-# PyTorch => TF 2.0 #
-#####################
-
-
-def load_pytorch_checkpoint_in_tf2_model(
-    tf_model,
-    pytorch_checkpoint_path,
-    tf_inputs=None,
-    allow_missing_keys=False,
-    output_loading_info=False,
-    _prefix=None,
-    tf_to_pt_weight_rename=None,
-):
-    """Load pytorch checkpoints in a TF 2.0 model"""
-    try:
-        import tensorflow as tf  # noqa: F401
-        import torch  # noqa: F401
-        from safetensors.torch import load_file as safe_load_file  # noqa: F401
-    except ImportError:
-        logger.error(
-            "Loading a PyTorch model in TensorFlow, requires both PyTorch and TensorFlow to be installed. Please see "
-            "https://pytorch.org/ and https://www.tensorflow.org/install/ for installation instructions."
-        )
-        raise
-
-    # Treats a single file as a collection of shards with 1 shard.
-    if isinstance(pytorch_checkpoint_path, str):
-        pytorch_checkpoint_path = [pytorch_checkpoint_path]
-
-    # Loads all shards into a single state dictionary
-    pt_state_dict = {}
-    for path in pytorch_checkpoint_path:
-        pt_path = os.path.abspath(path)
-        logger.info(f"Loading PyTorch weights from {pt_path}")
-        if pt_path.endswith(".safetensors"):
-            state_dict = safe_load_file(pt_path)
-        else:
-            check_torch_load_is_safe()
-            state_dict = torch.load(pt_path, map_location="cpu", weights_only=True)
-
-        pt_state_dict.update(state_dict)
-
-    logger.info(f"PyTorch checkpoint contains {sum(t.numel() for t in pt_state_dict.values()):,} parameters")
-
-    return load_pytorch_weights_in_tf2_model(
-        tf_model,
-        pt_state_dict,
-        tf_inputs=tf_inputs,
-        allow_missing_keys=allow_missing_keys,
-        output_loading_info=output_loading_info,
-        _prefix=_prefix,
-        tf_to_pt_weight_rename=tf_to_pt_weight_rename,
-    )
-
-
-def load_pytorch_model_in_tf2_model(tf_model, pt_model, tf_inputs=None, allow_missing_keys=False):
-    """Load pytorch checkpoints in a TF 2.0 model"""
-    pt_state_dict = pt_model.state_dict()
-
-    return load_pytorch_weights_in_tf2_model(
-        tf_model, pt_state_dict, tf_inputs=tf_inputs, allow_missing_keys=allow_missing_keys
-    )
-
-
-def load_pytorch_weights_in_tf2_model(
-    tf_model,
-    pt_state_dict,
-    tf_inputs=None,
-    allow_missing_keys=False,
-    output_loading_info=False,
-    _prefix=None,
-    tf_to_pt_weight_rename=None,
-):
-    """Load pytorch state_dict in a TF 2.0 model."""
-    try:
-        import tensorflow as tf  # noqa: F401
-        import torch  # noqa: F401
-    except ImportError:
-        logger.error(
-            "Loading a PyTorch model in TensorFlow, requires both PyTorch and TensorFlow to be installed. Please see "
-            "https://pytorch.org/ and https://www.tensorflow.org/install/ for installation instructions."
-        )
-        raise
-
-    # Numpy doesn't understand bfloat16, so upcast to a dtype that doesn't lose precision
-    pt_state_dict = {
-        k: v.numpy() if v.dtype != torch.bfloat16 else v.float().numpy() for k, v in pt_state_dict.items()
-    }
-    return load_pytorch_state_dict_in_tf2_model(
-        tf_model,
-        pt_state_dict,
-        tf_inputs=tf_inputs,
-        allow_missing_keys=allow_missing_keys,
-        output_loading_info=output_loading_info,
-        _prefix=_prefix,
-        tf_to_pt_weight_rename=tf_to_pt_weight_rename,
-    )
-
-
-def _log_key_warnings(missing_keys, unexpected_keys, mismatched_keys, class_name):
-    if len(unexpected_keys) > 0:
-        logger.warning(
-            "Some weights of the PyTorch model were not used when initializing the TF 2.0 model"
-            f" {class_name}: {unexpected_keys}\n- This IS expected if you are initializing"
-            f" {class_name} from a PyTorch model trained on another task or with another architecture"
-            " (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).\n- This IS"
-            f" NOT expected if you are initializing {class_name} from a PyTorch model that you expect"
-            " to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a"
-            " BertForSequenceClassification model)."
-        )
-    else:
-        logger.warning(f"All PyTorch model weights were used when initializing {class_name}.\n")
-    if len(missing_keys) > 0:
-        logger.warning(
-            f"Some weights or buffers of the TF 2.0 model {class_name} were not initialized from the"
-            f" PyTorch model and are newly initialized: {missing_keys}\nYou should probably TRAIN this model on a"
-            " down-stream task to be able to use it for predictions and inference."
-        )
-    else:
-        logger.warning(
-            f"All the weights of {class_name} were initialized from the PyTorch model.\n"
-            "If your task is similar to the task the model of the checkpoint was trained on, "
-            f"you can already use {class_name} for predictions without further training."
-        )
-
-    if len(mismatched_keys) > 0:
-        mismatched_warning = "\n".join(
-            [
-                f"- {key}: found shape {shape1} in the checkpoint and {shape2} in the model instantiated"
-                for key, shape1, shape2 in mismatched_keys
-            ]
-        )
-        logger.warning(
-            f"Some weights of {class_name} were not initialized from the model checkpoint"
-            f" are newly initialized because the shapes did not"
-            f" match:\n{mismatched_warning}\nYou should probably TRAIN this model on a down-stream task to be able"
-            " to use it for predictions and inference."
-        )
-
-
-def load_pytorch_state_dict_in_tf2_model(
-    tf_model,
-    pt_state_dict,
-    tf_inputs=None,
-    allow_missing_keys=False,
-    output_loading_info=False,
-    _prefix=None,
-    tf_to_pt_weight_rename=None,
-    ignore_mismatched_sizes=False,
-    skip_logger_warnings=False,
-):
-    """Load a pytorch state_dict in a TF 2.0 model. pt_state_dict can be either an actual dict or a lazy-loading
-    safetensors archive created with the safe_open() function."""
-    import tensorflow as tf
-
-    if tf_inputs is None:
-        tf_inputs = tf_model.dummy_inputs
-
-    if _prefix is None:
-        _prefix = ""
-    if tf_inputs:
-        with tf.name_scope(_prefix):
-            tf_model(tf_inputs, training=False)  # Make sure model is built
-    # Convert old format to new format if needed from a PyTorch state_dict
-    tf_keys_to_pt_keys = {}
-    for key in pt_state_dict:
-        new_key = None
-        if "gamma" in key:
-            new_key = key.replace("gamma", "weight")
-        if "beta" in key:
-            new_key = key.replace("beta", "bias")
-        if "running_var" in key:
-            new_key = key.replace("running_var", "moving_variance")
-        if "running_mean" in key:
-            new_key = key.replace("running_mean", "moving_mean")
-
-        # New `weight_norm` from https://github.com/huggingface/transformers/pull/24030
-        key_components = key.split(".")
-        name = None
-        if key_components[-3::2] == ["parametrizations", "original0"]:
-            name = key_components[-2] + "_g"
-        elif key_components[-3::2] == ["parametrizations", "original1"]:
-            name = key_components[-2] + "_v"
-        if name is not None:
-            key_components = key_components[:-3] + [name]
-            new_key = ".".join(key_components)
-
-        if new_key is None:
-            new_key = key
-        tf_keys_to_pt_keys[new_key] = key
-
-    # Matt: All TF models store the actual model stem in a MainLayer class, including the base model.
-    # In PT, the derived models (with heads) use the base model class as the stem instead,
-    # and there is no MainLayer class. This means that TF base classes have one
-    # extra layer in their weight names, corresponding to the MainLayer class. This code block compensates for that.
-    start_prefix_to_remove = ""
-    if not any(s.startswith(tf_model.base_model_prefix) for s in tf_keys_to_pt_keys):
-        start_prefix_to_remove = tf_model.base_model_prefix + "."
-
-    symbolic_weights = tf_model.trainable_weights + tf_model.non_trainable_weights
-    tf_loaded_numel = 0
-    all_pytorch_weights = set(tf_keys_to_pt_keys.keys())
-    missing_keys = []
-    mismatched_keys = []
-    is_safetensor_archive = hasattr(pt_state_dict, "get_tensor")
-    for symbolic_weight in symbolic_weights:
-        sw_name = symbolic_weight.name
-        name, transpose = convert_tf_weight_name_to_pt_weight_name(
-            sw_name,
-            start_prefix_to_remove=start_prefix_to_remove,
-            tf_weight_shape=symbolic_weight.shape,
-            name_scope=_prefix,
-        )
-        if tf_to_pt_weight_rename is not None:
-            aliases = tf_to_pt_weight_rename(name)  # Is a tuple to account for possible name aliasing
-            for alias in aliases:  # The aliases are in priority order, take the first one that matches
-                if alias in tf_keys_to_pt_keys:
-                    name = alias
-                    break
-            else:
-                # If none of the aliases match, just use the first one (it'll be reported as missing)
-                name = aliases[0]
-
-        # Find associated numpy array in pytorch model state dict
-        if name not in tf_keys_to_pt_keys:
-            if allow_missing_keys:
-                missing_keys.append(name)
-                continue
-            elif tf_model._keys_to_ignore_on_load_missing is not None:
-                # authorized missing keys don't have to be loaded
-                if any(re.search(pat, name) is not None for pat in tf_model._keys_to_ignore_on_load_missing):
-                    continue
-            raise AttributeError(f"{name} not found in PyTorch model")
-        state_dict_name = tf_keys_to_pt_keys[name]
-        if is_safetensor_archive:
-            array = pt_state_dict.get_tensor(state_dict_name)
-        else:
-            array = pt_state_dict[state_dict_name]
-        try:
-            array = apply_transpose(transpose, array, symbolic_weight.shape)
-        except tf.errors.InvalidArgumentError as e:
-            if not ignore_mismatched_sizes:
-                error_msg = str(e)
-                error_msg += (
-                    "\n\tYou may consider adding `ignore_mismatched_sizes=True` in the model `from_pretrained` method."
-                )
-                raise tf.errors.InvalidArgumentError(error_msg)
-            else:
-                mismatched_keys.append((name, array.shape, symbolic_weight.shape))
-                continue
-
-        tf_loaded_numel += tensor_size(array)
-
-        symbolic_weight.assign(tf.cast(array, symbolic_weight.dtype))
-        del array  # Immediately free memory to keep peak usage as low as possible
-        all_pytorch_weights.discard(name)
-
-    logger.info(f"Loaded {tf_loaded_numel:,} parameters in the TF 2.0 model.")
-
-    unexpected_keys = list(all_pytorch_weights)
-
-    if tf_model._keys_to_ignore_on_load_missing is not None:
-        for pat in tf_model._keys_to_ignore_on_load_missing:
-            missing_keys = [k for k in missing_keys if re.search(pat, k) is None]
-    if tf_model._keys_to_ignore_on_load_unexpected is not None:
-        for pat in tf_model._keys_to_ignore_on_load_unexpected:
-            unexpected_keys = [k for k in unexpected_keys if re.search(pat, k) is None]
-    if not skip_logger_warnings:
-        _log_key_warnings(missing_keys, unexpected_keys, mismatched_keys, class_name=tf_model.__class__.__name__)
-
-    if output_loading_info:
-        loading_info = {
-            "missing_keys": missing_keys,
-            "unexpected_keys": unexpected_keys,
-            "mismatched_keys": mismatched_keys,
-        }
-        return tf_model, loading_info
-
-    return tf_model
-
-
-def load_sharded_pytorch_safetensors_in_tf2_model(
-    tf_model,
-    safetensors_shards,
-    tf_inputs=None,
-    allow_missing_keys=False,
-    output_loading_info=False,
-    _prefix=None,
-    tf_to_pt_weight_rename=None,
-    ignore_mismatched_sizes=False,
-):
-    all_loading_infos = []
-    for shard in safetensors_shards:
-        with safe_open(shard, framework="tf") as safetensors_archive:
-            tf_model, loading_info = load_pytorch_state_dict_in_tf2_model(
-                tf_model,
-                safetensors_archive,
-                tf_inputs=tf_inputs,
-                allow_missing_keys=allow_missing_keys,
-                output_loading_info=True,
-                _prefix=_prefix,
-                tf_to_pt_weight_rename=tf_to_pt_weight_rename,
-                ignore_mismatched_sizes=ignore_mismatched_sizes,
-                skip_logger_warnings=True,  # We will emit merged warnings at the end
-            )
-        all_loading_infos.append(loading_info)
-    # Now we just need to merge the loading info
-    # Keys are missing only if they're missing in *every* shard
-    missing_keys = sorted(set.intersection(*[set(info["missing_keys"]) for info in all_loading_infos]))
-    # Keys are unexpected/mismatched if they're unexpected/mismatched in *any* shard
-    unexpected_keys = sum([info["unexpected_keys"] for info in all_loading_infos], [])
-    mismatched_keys = sum([info["mismatched_keys"] for info in all_loading_infos], [])
-
-    _log_key_warnings(missing_keys, unexpected_keys, mismatched_keys, class_name=tf_model.__class__.__name__)
-
-    if output_loading_info:
-        loading_info = {
-            "missing_keys": missing_keys,
-            "unexpected_keys": unexpected_keys,
-            "mismatched_keys": mismatched_keys,
-        }
-        return tf_model, loading_info
-
-    return tf_model
-
-
-#####################
-# TF 2.0 => PyTorch #
-#####################
-
-
-def load_tf2_checkpoint_in_pytorch_model(
-    pt_model, tf_checkpoint_path, tf_inputs=None, allow_missing_keys=False, output_loading_info=False
-):
-    """
-    Load TF 2.0 HDF5 checkpoint in a PyTorch model We use HDF5 to easily do transfer learning (see
-    https://github.com/tensorflow/tensorflow/blob/ee16fcac960ae660e0e4496658a366e2f745e1f0/tensorflow/python/keras/engine/network.py#L1352-L1357).
-    """
-    try:
-        import tensorflow as tf  # noqa: F401
-        import torch  # noqa: F401
-    except ImportError:
-        logger.error(
-            "Loading a TensorFlow model in PyTorch, requires both PyTorch and TensorFlow to be installed. Please see "
-            "https://pytorch.org/ and https://www.tensorflow.org/install/ for installation instructions."
-        )
-        raise
-
-    import transformers
-
-    from .modeling_tf_utils import load_tf_weights
-
-    logger.info(f"Loading TensorFlow weights from {tf_checkpoint_path}")
-
-    # Instantiate and load the associated TF 2.0 model
-    tf_model_class_name = "TF" + pt_model.__class__.__name__  # Add "TF" at the beginning
-    tf_model_class = getattr(transformers, tf_model_class_name)
-    tf_model = tf_model_class(pt_model.config)
-
-    if tf_inputs is None:
-        tf_inputs = tf_model.dummy_inputs
-
-    if tf_inputs is not None:
-        tf_model(tf_inputs, training=False)  # Make sure model is built
-
-    load_tf_weights(tf_model, tf_checkpoint_path)
-
-    return load_tf2_model_in_pytorch_model(
-        pt_model, tf_model, allow_missing_keys=allow_missing_keys, output_loading_info=output_loading_info
-    )
-
-
-def load_tf2_model_in_pytorch_model(pt_model, tf_model, allow_missing_keys=False, output_loading_info=False):
-    """Load TF 2.0 model in a pytorch model"""
-    weights = tf_model.weights
-
-    return load_tf2_weights_in_pytorch_model(
-        pt_model, weights, allow_missing_keys=allow_missing_keys, output_loading_info=output_loading_info
-    )
-
-
-def load_tf2_weights_in_pytorch_model(pt_model, tf_weights, allow_missing_keys=False, output_loading_info=False):
-    """Load TF2.0 symbolic weights in a PyTorch model"""
-    try:
-        import tensorflow as tf  # noqa: F401
-        import torch  # noqa: F401
-    except ImportError:
-        logger.error(
-            "Loading a TensorFlow model in PyTorch, requires both PyTorch and TensorFlow to be installed. Please see "
-            "https://pytorch.org/ and https://www.tensorflow.org/install/ for installation instructions."
-        )
-        raise
-
-    tf_state_dict = {tf_weight.name: tf_weight.numpy() for tf_weight in tf_weights}
-    return load_tf2_state_dict_in_pytorch_model(
-        pt_model, tf_state_dict, allow_missing_keys=allow_missing_keys, output_loading_info=output_loading_info
-    )
-
-
-def load_tf2_state_dict_in_pytorch_model(pt_model, tf_state_dict, allow_missing_keys=False, output_loading_info=False):
-    import torch
-
-    new_pt_params_dict = {}
-    current_pt_params_dict = dict(pt_model.named_parameters())
-
-    # Make sure we are able to load PyTorch base models as well as derived models (with heads)
-    # TF models always have a prefix, some of PyTorch models (base ones) don't
-    start_prefix_to_remove = ""
-    if not any(s.startswith(pt_model.base_model_prefix) for s in current_pt_params_dict):
-        start_prefix_to_remove = pt_model.base_model_prefix + "."
-
-    # Build a map from potential PyTorch weight names to TF 2.0 Variables
-    tf_weights_map = {}
-    for name, tf_weight in tf_state_dict.items():
-        pt_name, transpose = convert_tf_weight_name_to_pt_weight_name(
-            name, start_prefix_to_remove=start_prefix_to_remove, tf_weight_shape=tf_weight.shape
-        )
-        tf_weights_map[pt_name] = (tf_weight, transpose)
-
-    all_tf_weights = set(tf_weights_map.keys())
-    loaded_pt_weights_data_ptr = {}
-    missing_keys_pt = []
-    for pt_weight_name, pt_weight in current_pt_params_dict.items():
-        # Handle PyTorch shared weight not duplicated in TF 2.0
-        if pt_weight.data_ptr() in loaded_pt_weights_data_ptr and pt_weight.data_ptr() != 0:
-            new_pt_params_dict[pt_weight_name] = loaded_pt_weights_data_ptr[pt_weight.data_ptr()]
-            continue
-
-        pt_weight_name_to_check = pt_weight_name
-        # New `weight_norm` from https://github.com/huggingface/transformers/pull/24030
-        key_components = pt_weight_name.split(".")
-        name = None
-        if key_components[-3::2] == ["parametrizations", "original0"]:
-            name = key_components[-2] + "_g"
-        elif key_components[-3::2] == ["parametrizations", "original1"]:
-            name = key_components[-2] + "_v"
-        if name is not None:
-            key_components = key_components[:-3] + [name]
-            pt_weight_name_to_check = ".".join(key_components)
-
-        # Find associated numpy array in pytorch model state dict
-        if pt_weight_name_to_check not in tf_weights_map:
-            if allow_missing_keys:
-                missing_keys_pt.append(pt_weight_name)
-                continue
-
-            raise AttributeError(f"{pt_weight_name} not found in TF 2.0 model")
-
-        array, transpose = tf_weights_map[pt_weight_name_to_check]
-
-        array = apply_transpose(transpose, array, pt_weight.shape, pt_to_tf=False)
-
-        if numpy.isscalar(array):
-            array = numpy.array(array)
-        if not is_torch_tensor(array) and not is_numpy_array(array):
-            array = array.numpy()
-        if is_numpy_array(array):
-            # Convert to torch tensor
-            array = torch.from_numpy(array)
-
-        new_pt_params_dict[pt_weight_name] = array
-        loaded_pt_weights_data_ptr[pt_weight.data_ptr()] = array
-        all_tf_weights.discard(pt_weight_name)
-
-    missing_keys, unexpected_keys = pt_model.load_state_dict(new_pt_params_dict, strict=False)
-    missing_keys += missing_keys_pt
-
-    # Some models may have keys that are not in the state by design, removing them before needlessly warning
-    # the user.
-    if pt_model._keys_to_ignore_on_load_missing is not None:
-        for pat in pt_model._keys_to_ignore_on_load_missing:
-            missing_keys = [k for k in missing_keys if re.search(pat, k) is None]
-
-    if pt_model._keys_to_ignore_on_load_unexpected is not None:
-        for pat in pt_model._keys_to_ignore_on_load_unexpected:
-            unexpected_keys = [k for k in unexpected_keys if re.search(pat, k) is None]
-
-    if len(unexpected_keys) > 0:
-        logger.warning(
-            "Some weights of the TF 2.0 model were not used when initializing the PyTorch model"
-            f" {pt_model.__class__.__name__}: {unexpected_keys}\n- This IS expected if you are initializing"
-            f" {pt_model.__class__.__name__} from a TF 2.0 model trained on another task or with another architecture"
-            " (e.g. initializing a BertForSequenceClassification model from a TFBertForPreTraining model).\n- This IS"
-            f" NOT expected if you are initializing {pt_model.__class__.__name__} from a TF 2.0 model that you expect"
-            " to be exactly identical (e.g. initializing a BertForSequenceClassification model from a"
-            " TFBertForSequenceClassification model)."
-        )
-    else:
-        logger.warning(f"All TF 2.0 model weights were used when initializing {pt_model.__class__.__name__}.\n")
-    if len(missing_keys) > 0:
-        logger.warning(
-            f"Some weights of {pt_model.__class__.__name__} were not initialized from the TF 2.0 model and are newly"
-            f" initialized: {missing_keys}\nYou should probably TRAIN this model on a down-stream task to be able to"
-            " use it for predictions and inference."
-        )
-    else:
-        logger.warning(
-            f"All the weights of {pt_model.__class__.__name__} were initialized from the TF 2.0 model.\n"
-            "If your task is similar to the task the model of the checkpoint was trained on, "
-            f"you can already use {pt_model.__class__.__name__} for predictions without further training."
-        )
-
-    logger.info(f"Weights or buffers not loaded from TF 2.0 model: {all_tf_weights}")
-
-    if output_loading_info:
-        loading_info = {"missing_keys": missing_keys, "unexpected_keys": unexpected_keys}
-        return pt_model, loading_info
-
-    return pt_model
--- a/src/transformers/modeling_tf_utils.py
+++ b/src/transformers/modeling_tf_utils.py
--- a/src/transformers/modeling_utils.py
+++ b/src/transformers/modeling_utils.py
@ -79,11 +79,8 @@ from .utils import (
    ADAPTER_WEIGHTS_NAME,
    CONFIG_NAME,
    DUMMY_INPUTS,
-    FLAX_WEIGHTS_NAME,
    SAFE_WEIGHTS_INDEX_NAME,
    SAFE_WEIGHTS_NAME,
-    TF2_WEIGHTS_NAME,
-    TF_WEIGHTS_NAME,
    WEIGHTS_INDEX_NAME,
    WEIGHTS_NAME,
    ContextManagers,
@ -506,13 +503,6 @@ def load_state_dict(
    # Use safetensors if possible
    if checkpoint_file.endswith(".safetensors") and is_safetensors_available():
        with safe_open(checkpoint_file, framework="pt") as f:
-            metadata = f.metadata()
-
-            if metadata is not None and metadata.get("format") not in ["pt", "tf", "flax", "mlx"]:
-                raise OSError(
-                    f"The safetensors archive passed at {checkpoint_file} does not contain the valid metadata. Make sure "
-                    "you save your model with the `save_pretrained` method."
-                )
            state_dict = {}
            for k in f.keys():
                if map_location == "meta":
@ -568,11 +558,7 @@ def load_state_dict(
                        "model. Make sure you have saved the model properly."
                    ) from e
        except (UnicodeDecodeError, ValueError):
-            raise OSError(
-                f"Unable to load weights from pytorch checkpoint file for '{checkpoint_file}' "
-                f"at '{checkpoint_file}'. "
-                "If you tried to load a PyTorch model from a TF 2.0 checkpoint, please set from_tf=True."
-            )
+            raise OSError(f"Unable to load weights from pytorch checkpoint file '{checkpoint_file}'.")


 def set_initialized_submodules(model, state_dict_keys):
@ -1004,9 +990,7 @@ def _get_resolved_checkpoint_files(
    subfolder: str,
    variant: Optional[str],
    gguf_file: Optional[str],
-    from_tf: bool,
-    from_flax: bool,
-    use_safetensors: bool,
+    use_safetensors: Optional[bool],
    cache_dir: str,
    force_download: bool,
    proxies: Optional[dict[str, str]],
@ -1032,19 +1016,6 @@ def _get_resolved_checkpoint_files(
                # If the filename is explicitly defined, load this by default.
                archive_file = os.path.join(pretrained_model_name_or_path, subfolder, transformers_explicit_filename)
                is_sharded = transformers_explicit_filename.endswith(".safetensors.index.json")
-            elif from_tf and os.path.isfile(
-                os.path.join(pretrained_model_name_or_path, subfolder, TF_WEIGHTS_NAME + ".index")
-            ):
-                # Load from a TF 1.0 checkpoint in priority if from_tf
-                archive_file = os.path.join(pretrained_model_name_or_path, subfolder, TF_WEIGHTS_NAME + ".index")
-            elif from_tf and os.path.isfile(os.path.join(pretrained_model_name_or_path, subfolder, TF2_WEIGHTS_NAME)):
-                # Load from a TF 2.0 checkpoint in priority if from_tf
-                archive_file = os.path.join(pretrained_model_name_or_path, subfolder, TF2_WEIGHTS_NAME)
-            elif from_flax and os.path.isfile(
-                os.path.join(pretrained_model_name_or_path, subfolder, FLAX_WEIGHTS_NAME)
-            ):
-                # Load from a Flax checkpoint in priority if from_flax
-                archive_file = os.path.join(pretrained_model_name_or_path, subfolder, FLAX_WEIGHTS_NAME)
            elif use_safetensors is not False and os.path.isfile(
                os.path.join(pretrained_model_name_or_path, subfolder, _add_variant(SAFE_WEIGHTS_NAME, variant))
            ):
@ -1075,24 +1046,6 @@ def _get_resolved_checkpoint_files(
                    pretrained_model_name_or_path, subfolder, _add_variant(WEIGHTS_INDEX_NAME, variant)
                )
                is_sharded = True
-            # At this stage we don't have a weight file so we will raise an error.
-            elif not use_safetensors and (
-                os.path.isfile(os.path.join(pretrained_model_name_or_path, subfolder, TF_WEIGHTS_NAME + ".index"))
-                or os.path.isfile(os.path.join(pretrained_model_name_or_path, subfolder, TF2_WEIGHTS_NAME))
-            ):
-                raise OSError(
-                    f"Error no file named {_add_variant(WEIGHTS_NAME, variant)} found in directory"
-                    f" {pretrained_model_name_or_path} but there is a file for TensorFlow weights. Use"
-                    " `from_tf=True` to load this model from those weights."
-                )
-            elif not use_safetensors and os.path.isfile(
-                os.path.join(pretrained_model_name_or_path, subfolder, FLAX_WEIGHTS_NAME)
-            ):
-                raise OSError(
-                    f"Error no file named {_add_variant(WEIGHTS_NAME, variant)} found in directory"
-                    f" {pretrained_model_name_or_path} but there is a file for Flax weights. Use `from_flax=True`"
-                    " to load this model from those weights."
-                )
            elif use_safetensors:
                raise OSError(
                    f"Error no file named {_add_variant(SAFE_WEIGHTS_NAME, variant)} found in directory"
@ -1100,21 +1053,12 @@ def _get_resolved_checkpoint_files(
                )
            else:
                raise OSError(
-                    f"Error no file named {_add_variant(WEIGHTS_NAME, variant)}, {_add_variant(SAFE_WEIGHTS_NAME, variant)},"
-                    f" {TF2_WEIGHTS_NAME}, {TF_WEIGHTS_NAME + '.index'} or {FLAX_WEIGHTS_NAME} found in directory"
-                    f" {pretrained_model_name_or_path}."
+                    f"Error no file named {_add_variant(SAFE_WEIGHTS_NAME, variant)}, or {_add_variant(WEIGHTS_NAME, variant)},"
+                    f" found in directory {pretrained_model_name_or_path}."
                )
        elif os.path.isfile(os.path.join(subfolder, pretrained_model_name_or_path)):
            archive_file = pretrained_model_name_or_path
            is_local = True
-        elif os.path.isfile(os.path.join(subfolder, pretrained_model_name_or_path + ".index")):
-            if not from_tf:
-                raise ValueError(
-                    f"We found a TensorFlow checkpoint at {pretrained_model_name_or_path + '.index'}, please set "
-                    "from_tf to True to load from this checkpoint."
-                )
-            archive_file = os.path.join(subfolder, pretrained_model_name_or_path + ".index")
-            is_local = True
        elif is_remote_url(pretrained_model_name_or_path):
            filename = pretrained_model_name_or_path
            resolved_archive_file = download_url(pretrained_model_name_or_path)
@ -1123,10 +1067,6 @@ def _get_resolved_checkpoint_files(
            if transformers_explicit_filename is not None:
                filename = transformers_explicit_filename
                is_sharded = transformers_explicit_filename.endswith(".safetensors.index.json")
-            elif from_tf:
-                filename = TF2_WEIGHTS_NAME
-            elif from_flax:
-                filename = FLAX_WEIGHTS_NAME
            elif use_safetensors is not False:
                filename = _add_variant(SAFE_WEIGHTS_NAME, variant)
            else:
@ -1223,8 +1163,7 @@ def _get_resolved_checkpoint_files(
                                    name="Thread-auto_conversion",
                                ).start()
                    else:
-                        # Otherwise, no PyTorch file was found, maybe there is a TF or Flax model file.
-                        # We try those to give a helpful error message.
+                        # Otherwise, no PyTorch file was found
                        has_file_kwargs = {
                            "revision": revision,
                            "proxies": proxies,
@ -1232,19 +1171,7 @@ def _get_resolved_checkpoint_files(
                            "cache_dir": cache_dir,
                            "local_files_only": local_files_only,
                        }
-                        if has_file(pretrained_model_name_or_path, TF2_WEIGHTS_NAME, **has_file_kwargs):
-                            raise OSError(
-                                f"{pretrained_model_name_or_path} does not appear to have a file named"
-                                f" {_add_variant(WEIGHTS_NAME, variant)} but there is a file for TensorFlow weights."
-                                " Use `from_tf=True` to load this model from those weights."
-                            )
-                        elif has_file(pretrained_model_name_or_path, FLAX_WEIGHTS_NAME, **has_file_kwargs):
-                            raise OSError(
-                                f"{pretrained_model_name_or_path} does not appear to have a file named"
-                                f" {_add_variant(WEIGHTS_NAME, variant)} but there is a file for Flax weights. Use"
-                                " `from_flax=True` to load this model from those weights."
-                            )
-                        elif variant is not None and has_file(
+                        if variant is not None and has_file(
                            pretrained_model_name_or_path, WEIGHTS_NAME, **has_file_kwargs
                        ):
                            raise OSError(
@ -1255,8 +1182,7 @@ def _get_resolved_checkpoint_files(
                        else:
                            raise OSError(
                                f"{pretrained_model_name_or_path} does not appear to have a file named"
-                                f" {_add_variant(WEIGHTS_NAME, variant)}, {_add_variant(SAFE_WEIGHTS_NAME, variant)},"
-                                f" {TF2_WEIGHTS_NAME}, {TF_WEIGHTS_NAME} or {FLAX_WEIGHTS_NAME}."
+                                f" {_add_variant(WEIGHTS_NAME, variant)} or {_add_variant(SAFE_WEIGHTS_NAME, variant)}."
                            )

            except OSError:
@ -1269,8 +1195,7 @@ def _get_resolved_checkpoint_files(
                    f"Can't load the model for '{pretrained_model_name_or_path}'. If you were trying to load it"
                    " from 'https://huggingface.co/models', make sure you don't have a local directory with the"
                    f" same name. Otherwise, make sure '{pretrained_model_name_or_path}' is the correct path to a"
-                    f" directory containing a file named {_add_variant(WEIGHTS_NAME, variant)},"
-                    f" {TF2_WEIGHTS_NAME}, {TF_WEIGHTS_NAME} or {FLAX_WEIGHTS_NAME}."
+                    f" directory containing a file named {_add_variant(WEIGHTS_NAME, variant)}."
                ) from e

        if is_local:
@ -1682,8 +1607,6 @@ class ModuleUtilsMixin:
        if encoder_attention_mask.dim() == 2:
            encoder_extended_attention_mask = encoder_attention_mask[:, None, None, :]
        # T5 has a mask that can compare sequence ids, we can simulate this here with this transposition
-        # Cf. https://github.com/tensorflow/mesh/blob/8d2465e9bc93129b913b5ccc6a59aa97abd96ec6/mesh_tensorflow
-        # /transformer/transformer_layers.py#L270
        # encoder_extended_attention_mask = (encoder_extended_attention_mask ==
        # encoder_extended_attention_mask.transpose(-1, -2))
        encoder_extended_attention_mask = encoder_extended_attention_mask.to(dtype=self.dtype)  # fp16 compatibility
@ -2018,19 +1941,13 @@ class PreTrainedModel(nn.Module, EmbeddingAccessMixin, ModuleUtilsMixin, PushToH

        - **config_class** ([`PretrainedConfig`]) -- A subclass of [`PretrainedConfig`] to use as configuration class
          for this model architecture.
-        - **load_tf_weights** (`Callable`) -- A python *method* for loading a TensorFlow checkpoint in a PyTorch model,
-          taking as arguments:
-
-            - **model** ([`PreTrainedModel`]) -- An instance of the model on which to load the TensorFlow checkpoint.
-            - **config** ([`PreTrainedConfig`]) -- An instance of the configuration associated to the model.
-            - **path** (`str`) -- A path to the TensorFlow checkpoint.
-
        - **base_model_prefix** (`str`) -- A string indicating the attribute associated to the base model in derived
          classes of the same architecture adding modules on top of the base model.
        - **is_parallelizable** (`bool`) -- A flag indicating whether this model supports model parallelization.
        - **main_input_name** (`str`) -- The name of the principal input to the model (often `input_ids` for NLP
          models, `pixel_values` for vision models and `input_values` for speech models).
-        - **can_record_outputs** (dict):"""
+        - **can_record_outputs** (dict):
+    """

    config_class = None
    base_model_prefix = ""
@ -2155,13 +2072,6 @@ class PreTrainedModel(nn.Module, EmbeddingAccessMixin, ModuleUtilsMixin, PushToH
        """
        return {"input_ids": torch.tensor(DUMMY_INPUTS)}

-    @property
-    def framework(self) -> str:
-        """
-        :str: Identifies that this is a PyTorch model.
-        """
-        return "pt"
-
    def __init_subclass__(cls, **kwargs):
        super().__init_subclass__(**kwargs)
        # For BC we keep the original `config_class` definition in case
@ -3800,9 +3710,6 @@ class PreTrainedModel(nn.Module, EmbeddingAccessMixin, ModuleUtilsMixin, PushToH
        """
        Activates gradient checkpointing for the current model.

-        Note that in other frameworks this feature can be referred to as "activation checkpointing" or "checkpoint
-        activations".
-
        We pass the `__call__` method of the modules instead of `forward` because `__call__` attaches all the hooks of
        the module. https://discuss.pytorch.org/t/any-different-between-model-input-and-model-forward-input/3690/2

@ -3863,9 +3770,6 @@ class PreTrainedModel(nn.Module, EmbeddingAccessMixin, ModuleUtilsMixin, PushToH
    def gradient_checkpointing_disable(self):
        """
        Deactivates gradient checkpointing for the current model.
-
-        Note that in other frameworks this feature can be referred to as "activation checkpointing" or "checkpoint
-        activations".
        """
        if self.supports_gradient_checkpointing:
            # For old GC format (transformers < 4.35.0) for models that live on the Hub
@ -3887,9 +3791,6 @@ class PreTrainedModel(nn.Module, EmbeddingAccessMixin, ModuleUtilsMixin, PushToH
    def is_gradient_checkpointing(self) -> bool:
        """
        Whether gradient checkpointing is activated for this model or not.
-
-        Note that in other frameworks this feature can be referred to as "activation checkpointing" or "checkpoint
-        activations".
        """
        return any(hasattr(m, "gradient_checkpointing") and m.gradient_checkpointing for m in self.modules())

@ -4543,13 +4444,6 @@ class PreTrainedModel(nn.Module, EmbeddingAccessMixin, ModuleUtilsMixin, PushToH
                    - A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co.
                    - A path to a *directory* containing model weights saved using
                      [`~PreTrainedModel.save_pretrained`], e.g., `./my_model_directory/`.
-                    - A path or url to a *tensorflow index checkpoint file* (e.g, `./tf_model/model.ckpt.index`). In
-                      this case, `from_tf` should be set to `True` and a configuration object should be provided as
-                      `config` argument. This loading path is slower than converting the TensorFlow checkpoint in a
-                      PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.
-                    - A path or url to a model folder containing a *flax checkpoint file* in *.msgpack* format (e.g,
-                      `./flax_model/` containing `flax_model.msgpack`). In this case, `from_flax` should be set to
-                      `True`.
                    - `None` if you are both providing the configuration and state dictionary (resp. with keyword
                      arguments `config` and `state_dict`).
            model_args (sequence of positional arguments, *optional*):
@ -4578,12 +4472,6 @@ class PreTrainedModel(nn.Module, EmbeddingAccessMixin, ModuleUtilsMixin, PushToH
            cache_dir (`Union[str, os.PathLike]`, *optional*):
                Path to a directory in which a downloaded pretrained model configuration should be cached if the
                standard cache should not be used.
-            from_tf (`bool`, *optional*, defaults to `False`):
-                Load the model weights from a TensorFlow checkpoint save file (see docstring of
-                `pretrained_model_name_or_path` argument).
-            from_flax (`bool`, *optional*, defaults to `False`):
-                Load the model weights from a Flax checkpoint save file (see docstring of
-                `pretrained_model_name_or_path` argument).
            ignore_mismatched_sizes (`bool`, *optional*, defaults to `False`):
                Whether or not to raise an error if some of the weights from the checkpoint do not have the same size
                as the weights of the model (if for instance, you are instantiating a model with 10 labels from a
@ -4699,8 +4587,7 @@ class PreTrainedModel(nn.Module, EmbeddingAccessMixin, ModuleUtilsMixin, PushToH
                In case the relevant files are located inside a subfolder of the model repo on huggingface.co, you can
                specify the folder name here.
            variant (`str`, *optional*):
-                If specified load weights from `variant` filename, *e.g.* pytorch_model.<variant>.bin. `variant` is
-                ignored when using `from_tf` or `from_flax`.
+                If specified load weights from `variant` filename, *e.g.* pytorch_model.<variant>.bin.
            use_safetensors (`bool`, *optional*, defaults to `None`):
                Whether or not to use `safetensors` checkpoints. Defaults to `None`. If not specified and `safetensors`
                is not installed, it will be set to `False`.
@ -4744,16 +4631,9 @@ class PreTrainedModel(nn.Module, EmbeddingAccessMixin, ModuleUtilsMixin, PushToH
        >>> # Update configuration during loading.
        >>> model = BertModel.from_pretrained("google-bert/bert-base-uncased", output_attentions=True)
        >>> assert model.config.output_attentions == True
-        >>> # Loading from a TF checkpoint file instead of a PyTorch model (slower, for example purposes, not runnable).
-        >>> config = BertConfig.from_json_file("./tf_model/my_tf_model_config.json")
-        >>> model = BertModel.from_pretrained("./tf_model/my_tf_checkpoint.ckpt.index", from_tf=True, config=config)
-        >>> # Loading from a Flax checkpoint file instead of a PyTorch model (slower)
-        >>> model = BertModel.from_pretrained("google-bert/bert-base-uncased", from_flax=True)
        ```
        """
        state_dict = kwargs.pop("state_dict", None)
-        from_tf = kwargs.pop("from_tf", False)
-        from_flax = kwargs.pop("from_flax", False)
        proxies = kwargs.pop("proxies", None)
        output_loading_info = kwargs.pop("output_loading_info", False)
        use_auth_token = kwargs.pop("use_auth_token", None)
@ -4796,8 +4676,10 @@ class PreTrainedModel(nn.Module, EmbeddingAccessMixin, ModuleUtilsMixin, PushToH
        # Not used anymore -- remove them from the kwargs
        _ = kwargs.pop("resume_download", None)
        _ = kwargs.pop("mirror", None)
-        _ = kwargs.pop("_fast_init", True)
+        _ = kwargs.pop("_fast_init", None)
        _ = kwargs.pop("low_cpu_mem_usage", None)
+        _ = kwargs.pop("from_tf", None)
+        _ = kwargs.pop("from_flax", None)

        # For BC on torch_dtype argument
        if torch_dtype is not None:
@ -4964,8 +4846,6 @@ class PreTrainedModel(nn.Module, EmbeddingAccessMixin, ModuleUtilsMixin, PushToH
                "Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead."
            )

-        from_pt = not (from_tf | from_flax)
-
        user_agent = {"file_type": "model", "framework": "pytorch", "from_auto_class": from_auto_class}
        if from_pipeline is not None:
            user_agent["using_pipeline"] = from_pipeline
@ -5016,7 +4896,7 @@ class PreTrainedModel(nn.Module, EmbeddingAccessMixin, ModuleUtilsMixin, PushToH
                )

        hf_quantizer, config, dtype, device_map = get_hf_quantizer(
-            config, quantization_config, dtype, from_tf, from_flax, device_map, weights_only, user_agent
+            config, quantization_config, dtype, device_map, weights_only, user_agent
        )

        if gguf_file is not None and hf_quantizer is not None:
@ -5039,8 +4919,6 @@ class PreTrainedModel(nn.Module, EmbeddingAccessMixin, ModuleUtilsMixin, PushToH
            subfolder=subfolder,
            variant=variant,
            gguf_file=gguf_file,
-            from_tf=from_tf,
-            from_flax=from_flax,
            use_safetensors=use_safetensors,
            cache_dir=cache_dir,
            force_download=force_download,
@ -5054,56 +4932,35 @@ class PreTrainedModel(nn.Module, EmbeddingAccessMixin, ModuleUtilsMixin, PushToH
            transformers_explicit_filename=transformers_explicit_filename,
        )

-        is_sharded = sharded_metadata is not None
        is_quantized = hf_quantizer is not None
        is_from_file = pretrained_model_name_or_path is not None or gguf_file is not None

-        if (
-            is_safetensors_available()
-            and is_from_file
-            and not is_sharded
-            and checkpoint_files[0].endswith(".safetensors")
-        ):
+        # Just a helpful message in case we try to load safetensors files coming from old Transformers tf/flax classes
+        if is_safetensors_available() and is_from_file and checkpoint_files[0].endswith(".safetensors"):
            with safe_open(checkpoint_files[0], framework="pt") as f:
                metadata = f.metadata()
-
-            if metadata is None:
-                # Assume it's a pytorch checkpoint (introduced for timm checkpoints)
-                pass
-            elif metadata.get("format") == "pt":
-                pass
-            elif metadata.get("format") == "tf":
-                from_tf = True
-                logger.info("A TensorFlow safetensors file is being loaded in a PyTorch model.")
-            elif metadata.get("format") == "flax":
-                from_flax = True
-                logger.info("A Flax safetensors file is being loaded in a PyTorch model.")
-            elif metadata.get("format") == "mlx":
-                # This is a mlx file, we assume weights are compatible with pt
-                pass
-            else:
-                raise ValueError(
-                    f"Incompatible safetensors file. File metadata is not ['pt', 'tf', 'flax', 'mlx'] but {metadata.get('format')}"
+            if metadata is not None and metadata.get("format") in ["tf", "flax"]:
+                logger.warning(
+                    "The safetensors checkpoint found has format `tf` or `flax`. This mean that the keys will very"
+                    "likely not match to the model you are trying to load, and will be newly initialized. If it's the case "
+                    "another warning will be raised later. Consider converting your checkpoint to the correct format."
                )

-        from_pt = not (from_tf | from_flax)
+        if gguf_file:
+            from .modeling_gguf_pytorch_utils import load_gguf_checkpoint

-        if from_pt:
-            if gguf_file:
-                from .modeling_gguf_pytorch_utils import load_gguf_checkpoint
+            # we need a dummy model to get the state_dict - for this reason, we keep the state_dict as if it was
+            # passed directly as a kwarg from now on
+            with torch.device("meta"):
+                dummy_model = cls(config)
+            state_dict = load_gguf_checkpoint(checkpoint_files[0], return_tensors=True, model_to_load=dummy_model)[
+                "tensors"
+            ]

-                # we need a dummy model to get the state_dict - for this reason, we keep the state_dict as if it was
-                # passed directly as a kwarg from now on
-                with torch.device("meta"):
-                    dummy_model = cls(config)
-                state_dict = load_gguf_checkpoint(checkpoint_files[0], return_tensors=True, model_to_load=dummy_model)[
-                    "tensors"
-                ]
-
-            # Find the correct dtype based on current state
-            config, dtype, dtype_orig = _get_dtype(
-                cls, dtype, checkpoint_files, config, sharded_metadata, state_dict, weights_only
-            )
+        # Find the correct dtype based on current state
+        config, dtype, dtype_orig = _get_dtype(
+            cls, dtype, checkpoint_files, config, sharded_metadata, state_dict, weights_only
+        )

        config.name_or_path = pretrained_model_name_or_path
        model_init_context = cls.get_init_context(is_quantized, _is_ds_init_called)
@ -5166,40 +5023,36 @@ class PreTrainedModel(nn.Module, EmbeddingAccessMixin, ModuleUtilsMixin, PushToH
        if device_map is not None:
            device_map = _get_device_map(model, device_map, max_memory, hf_quantizer, dtype, keep_in_fp32_regex)

-        # Finalize model weight initialization
-        if from_tf:
-            model, loading_info = cls._load_from_tf(model, config, checkpoint_files)
-        elif from_flax:
-            model = cls._load_from_flax(model, checkpoint_files)
-        elif from_pt:
-            # restore default dtype
-            if dtype_orig is not None:
-                torch.set_default_dtype(dtype_orig)
+        # restore default dtype
+        if dtype_orig is not None:
+            torch.set_default_dtype(dtype_orig)
+
+        # Finalize model weight initialization
+        (
+            model,
+            missing_keys,
+            unexpected_keys,
+            mismatched_keys,
+            offload_index,
+            error_msgs,
+        ) = cls._load_pretrained_model(
+            model,
+            state_dict,
+            checkpoint_files,
+            pretrained_model_name_or_path,
+            ignore_mismatched_sizes=ignore_mismatched_sizes,
+            sharded_metadata=sharded_metadata,
+            device_map=device_map,
+            disk_offload_folder=offload_folder,
+            offload_state_dict=offload_state_dict,
+            dtype=dtype,
+            hf_quantizer=hf_quantizer,
+            keep_in_fp32_regex=keep_in_fp32_regex,
+            device_mesh=device_mesh,
+            key_mapping=key_mapping,
+            weights_only=weights_only,
+        )

-            (
-                model,
-                missing_keys,
-                unexpected_keys,
-                mismatched_keys,
-                offload_index,
-                error_msgs,
-            ) = cls._load_pretrained_model(
-                model,
-                state_dict,
-                checkpoint_files,
-                pretrained_model_name_or_path,
-                ignore_mismatched_sizes=ignore_mismatched_sizes,
-                sharded_metadata=sharded_metadata,
-                device_map=device_map,
-                disk_offload_folder=offload_folder,
-                offload_state_dict=offload_state_dict,
-                dtype=dtype,
-                hf_quantizer=hf_quantizer,
-                keep_in_fp32_regex=keep_in_fp32_regex,
-                device_mesh=device_mesh,
-                key_mapping=key_mapping,
-                weights_only=weights_only,
-            )
        # make sure token embedding weights are still tied if needed
        model.tie_weights()

@ -5292,15 +5145,12 @@ class PreTrainedModel(nn.Module, EmbeddingAccessMixin, ModuleUtilsMixin, PushToH
            )

        if output_loading_info:
-            if from_pt:
-                loading_info = {
-                    "missing_keys": missing_keys,
-                    "unexpected_keys": unexpected_keys,
-                    "mismatched_keys": mismatched_keys,
-                    "error_msgs": error_msgs,
-                }
-            elif from_flax:
-                loading_info = None
+            loading_info = {
+                "missing_keys": missing_keys,
+                "unexpected_keys": unexpected_keys,
+                "mismatched_keys": mismatched_keys,
+                "error_msgs": error_msgs,
+            }
            return model, loading_info
        return model

@ -5751,44 +5601,6 @@ class PreTrainedModel(nn.Module, EmbeddingAccessMixin, ModuleUtilsMixin, PushToH

        return model, missing_keys, unexpected_keys, mismatched_keys, disk_offload_index, error_msgs

-    @classmethod
-    def _load_from_tf(cls, model, config, checkpoint_files):
-        if checkpoint_files[0].endswith(".index"):
-            # Load from a TensorFlow 1.X checkpoint - provided by original authors
-            model = cls.load_tf_weights(model, config, checkpoint_files[0][:-6])  # Remove the '.index'
-            loading_info = None
-        else:
-            # Load from our TensorFlow 2.0 checkpoints
-            try:
-                from .modeling_tf_pytorch_utils import load_tf2_checkpoint_in_pytorch_model
-
-                model, loading_info = load_tf2_checkpoint_in_pytorch_model(
-                    model, checkpoint_files[0], allow_missing_keys=True, output_loading_info=True
-                )
-            except ImportError:
-                logger.error(
-                    "Loading a TensorFlow model in PyTorch, requires both PyTorch and TensorFlow to be installed."
-                    " Please see https://pytorch.org/ and https://www.tensorflow.org/install/ for installation"
-                    " instructions."
-                )
-                raise
-        return model, loading_info
-
-    @classmethod
-    def _load_from_flax(cls, model, checkpoint_files):
-        try:
-            from .modeling_flax_pytorch_utils import load_flax_checkpoint_in_pytorch_model
-
-            model = load_flax_checkpoint_in_pytorch_model(model, checkpoint_files[0])
-        except ImportError:
-            logger.error(
-                "Loading a Flax model in PyTorch, requires both PyTorch and Flax to be installed. Please see"
-                " https://pytorch.org/ and https://flax.readthedocs.io/en/latest/installation.html for"
-                " installation instructions."
-            )
-            raise
-        return model
-
    def retrieve_modules_from_names(self, names, add_prefix=False, remove_prefix=False):
        module_keys = {".".join(key.split(".")[:-1]) for key in names}

--- a/src/transformers/models/albert/init.py
+++ b/src/transformers/models/albert/init.py
@ -20,8 +20,6 @@ from ...utils.import_utils import define_import_structure
 if TYPE_CHECKING:
    from .configuration_albert import *
    from .modeling_albert import *
-    from .modeling_flax_albert import *
-    from .modeling_tf_albert import *
    from .tokenization_albert import *
    from .tokenization_albert_fast import *
 else:
--- a/src/transformers/models/albert/modeling_albert.py
+++ b/src/transformers/models/albert/modeling_albert.py
@ -15,7 +15,6 @@
 """PyTorch ALBERT model."""

 import math
-import os
 from dataclasses import dataclass
 from typing import Optional, Union

@ -47,132 +46,6 @@ from .configuration_albert import AlbertConfig
 logger = logging.get_logger(__name__)


-def load_tf_weights_in_albert(model, config, tf_checkpoint_path):
-    """Load tf checkpoints in a pytorch model."""
-    try:
-        import re
-
-        import numpy as np
-        import tensorflow as tf
-    except ImportError:
-        logger.error(
-            "Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see "
-            "https://www.tensorflow.org/install/ for installation instructions."
-        )
-        raise
-    tf_path = os.path.abspath(tf_checkpoint_path)
-    logger.info(f"Converting TensorFlow checkpoint from {tf_path}")
-    # Load weights from TF model
-    init_vars = tf.train.list_variables(tf_path)
-    names = []
-    arrays = []
-    for name, shape in init_vars:
-        logger.info(f"Loading TF weight {name} with shape {shape}")
-        array = tf.train.load_variable(tf_path, name)
-        names.append(name)
-        arrays.append(array)
-
-    for name, array in zip(names, arrays):
-        print(name)
-
-    for name, array in zip(names, arrays):
-        original_name = name
-
-        # If saved from the TF HUB module
-        name = name.replace("module/", "")
-
-        # Renaming and simplifying
-        name = name.replace("ffn_1", "ffn")
-        name = name.replace("bert/", "albert/")
-        name = name.replace("attention_1", "attention")
-        name = name.replace("transform/", "")
-        name = name.replace("LayerNorm_1", "full_layer_layer_norm")
-        name = name.replace("LayerNorm", "attention/LayerNorm")
-        name = name.replace("transformer/", "")
-
-        # The feed forward layer had an 'intermediate' step which has been abstracted away
-        name = name.replace("intermediate/dense/", "")
-        name = name.replace("ffn/intermediate/output/dense/", "ffn_output/")
-
-        # ALBERT attention was split between self and output which have been abstracted away
-        name = name.replace("/output/", "/")
-        name = name.replace("/self/", "/")
-
-        # The pooler is a linear layer
-        name = name.replace("pooler/dense", "pooler")
-
-        # The classifier was simplified to predictions from cls/predictions
-        name = name.replace("cls/predictions", "predictions")
-        name = name.replace("predictions/attention", "predictions")
-
-        # Naming was changed to be more explicit
-        name = name.replace("embeddings/attention", "embeddings")
-        name = name.replace("inner_group_", "albert_layers/")
-        name = name.replace("group_", "albert_layer_groups/")
-
-        # Classifier
-        if len(name.split("/")) == 1 and ("output_bias" in name or "output_weights" in name):
-            name = "classifier/" + name
-
-        # No ALBERT model currently handles the next sentence prediction task
-        if "seq_relationship" in name:
-            name = name.replace("seq_relationship/output_", "sop_classifier/classifier/")
-            name = name.replace("weights", "weight")
-
-        name = name.split("/")
-
-        # Ignore the gradients applied by the LAMB/ADAM optimizers.
-        if (
-            "adam_m" in name
-            or "adam_v" in name
-            or "AdamWeightDecayOptimizer" in name
-            or "AdamWeightDecayOptimizer_1" in name
-            or "global_step" in name
-        ):
-            logger.info(f"Skipping {'/'.join(name)}")
-            continue
-
-        pointer = model
-        for m_name in name:
-            if re.fullmatch(r"[A-Za-z]+_\d+", m_name):
-                scope_names = re.split(r"_(\d+)", m_name)
-            else:
-                scope_names = [m_name]
-
-            if scope_names[0] == "kernel" or scope_names[0] == "gamma":
-                pointer = getattr(pointer, "weight")
-            elif scope_names[0] == "output_bias" or scope_names[0] == "beta":
-                pointer = getattr(pointer, "bias")
-            elif scope_names[0] == "output_weights":
-                pointer = getattr(pointer, "weight")
-            elif scope_names[0] == "squad":
-                pointer = getattr(pointer, "classifier")
-            else:
-                try:
-                    pointer = getattr(pointer, scope_names[0])
-                except AttributeError:
-                    logger.info(f"Skipping {'/'.join(name)}")
-                    continue
-            if len(scope_names) >= 2:
-                num = int(scope_names[1])
-                pointer = pointer[num]
-
-        if m_name[-11:] == "_embeddings":
-            pointer = getattr(pointer, "weight")
-        elif m_name == "kernel":
-            array = np.transpose(array)
-        try:
-            if pointer.shape != array.shape:
-                raise ValueError(f"Pointer shape {pointer.shape} and array shape {array.shape} mismatched")
-        except ValueError as e:
-            e.args += (pointer.shape, array.shape)
-            raise
-        print(f"Initialize PyTorch weight {name} from {original_name}")
-        pointer.data = torch.from_numpy(array)
-
-    return model
-
-
 class AlbertEmbeddings(nn.Module):
    """
    Construct the embeddings from word, position and token_type embeddings.
@ -184,8 +57,6 @@ class AlbertEmbeddings(nn.Module):
        self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.embedding_size)
        self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.embedding_size)

-        # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
-        # any TensorFlow checkpoint file
        self.LayerNorm = nn.LayerNorm(config.embedding_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

@ -546,15 +417,12 @@ class AlbertTransformer(nn.Module):
@auto_docstring
 class AlbertPreTrainedModel(PreTrainedModel):
    config: AlbertConfig
-    load_tf_weights = load_tf_weights_in_albert
    base_model_prefix = "albert"
    _supports_sdpa = True

    def _init_weights(self, module):
        """Initialize the weights."""
        if isinstance(module, nn.Linear):
-            # Slightly different from the TF version which uses truncated_normal for initialization
-            # cf https://github.com/pytorch/pytorch/pull/5617
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
@ -1337,7 +1205,6 @@ class AlbertForMultipleChoice(AlbertPreTrainedModel):


 __all__ = [
-    "load_tf_weights_in_albert",
    "AlbertPreTrainedModel",
    "AlbertModel",
    "AlbertForPreTraining",
--- a/src/transformers/models/albert/modeling_flax_albert.py
+++ b/src/transformers/models/albert/modeling_flax_albert.py
--- a/src/transformers/models/albert/modeling_tf_albert.py
+++ b/src/transformers/models/albert/modeling_tf_albert.py
--- a/src/transformers/models/align/modeling_align.py
+++ b/src/transformers/models/align/modeling_align.py
@ -516,8 +516,6 @@ class AlignTextEmbeddings(nn.Module):
        self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
        self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)

-        # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
-        # any TensorFlow checkpoint file
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        # position_ids (1, len position emb) is contiguous in memory and exported when serialized
--- a/src/transformers/models/altclip/modeling_altclip.py
+++ b/src/transformers/models/altclip/modeling_altclip.py
@ -101,8 +101,6 @@ class AltRobertaEmbeddings(nn.Module):
        self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
        self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)

-        # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
-        # any TensorFlow checkpoint file
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        # position_ids (1, len position emb) is contiguous in memory and exported when serialized
--- a/src/transformers/models/aria/image_processing_aria.py
+++ b/src/transformers/models/aria/image_processing_aria.py
@ -232,10 +232,7 @@ class AriaImageProcessor(BaseImageProcessor):
        images = make_flat_list_of_images(images)

        if not valid_images(images):
-            raise ValueError(
-                "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
-                "torch.Tensor, tf.Tensor or jax.ndarray."
-            )
+            raise ValueError("Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, or torch.Tensor")

        validate_preprocess_arguments(
            do_normalize=do_normalize,
--- a/src/transformers/models/aria/modular_aria.py
+++ b/src/transformers/models/aria/modular_aria.py
@ -615,10 +615,7 @@ class AriaImageProcessor(BaseImageProcessor):
        images = make_flat_list_of_images(images)

        if not valid_images(images):
-            raise ValueError(
-                "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
-                "torch.Tensor, tf.Tensor or jax.ndarray."
-            )
+            raise ValueError("Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, or torch.Tensor")

        validate_preprocess_arguments(
            do_normalize=do_normalize,
--- a/src/transformers/models/audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.py
+++ b/src/transformers/models/audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.py
@ -179,7 +179,6 @@ class ASTFeatureExtractor(SequenceFeatureExtractor):
            return_tensors (`str` or [`~utils.TensorType`], *optional*):
                If set, will return tensors instead of list of python integers. Acceptable values are:

-                - `'tf'`: Return TensorFlow `tf.constant` objects.
                - `'pt'`: Return PyTorch `torch.Tensor` objects.
                - `'np'`: Return Numpy `np.ndarray` objects.
        """
--- a/src/transformers/models/auto/init.py
+++ b/src/transformers/models/auto/init.py
@ -23,8 +23,6 @@ if TYPE_CHECKING:
    from .feature_extraction_auto import *
    from .image_processing_auto import *
    from .modeling_auto import *
-    from .modeling_flax_auto import *
-    from .modeling_tf_auto import *
    from .processing_auto import *
    from .tokenization_auto import *
    from .video_processing_auto import *
--- a/src/transformers/models/auto/auto_factory.py
+++ b/src/transformers/models/auto/auto_factory.py
@ -102,10 +102,6 @@ FROM_PRETRAINED_TORCH_DOCSTRING = """
                    - A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co.
                    - A path to a *directory* containing model weights saved using
                      [`~PreTrainedModel.save_pretrained`], e.g., `./my_model_directory/`.
-                    - A path or url to a *tensorflow index checkpoint file* (e.g, `./tf_model/model.ckpt.index`). In
-                      this case, `from_tf` should be set to `True` and a configuration object should be provided as
-                      `config` argument. This loading path is slower than converting the TensorFlow checkpoint in a
-                      PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.
            model_args (additional positional arguments, *optional*):
                Will be passed along to the underlying model `__init__()` method.
            config ([`PretrainedConfig`], *optional*):
@ -127,9 +123,6 @@ FROM_PRETRAINED_TORCH_DOCSTRING = """
            cache_dir (`str` or `os.PathLike`, *optional*):
                Path to a directory in which a downloaded pretrained model configuration should be cached if the
                standard cache should not be used.
-            from_tf (`bool`, *optional*, defaults to `False`):
-                Load the model weights from a TensorFlow checkpoint save file (see docstring of
-                `pretrained_model_name_or_path` argument).
            force_download (`bool`, *optional*, defaults to `False`):
                Whether or not to force the (re-)download of the model weights and configuration files, overriding the
                cached versions if they exist.
@ -182,210 +175,6 @@ FROM_PRETRAINED_TORCH_DOCSTRING = """
        >>> model = BaseAutoModelClass.from_pretrained("checkpoint_placeholder", output_attentions=True)
        >>> model.config.output_attentions
        True
-
-        >>> # Loading from a TF checkpoint file instead of a PyTorch model (slower)
-        >>> config = AutoConfig.from_pretrained("./tf_model/shortcut_placeholder_tf_model_config.json")
-        >>> model = BaseAutoModelClass.from_pretrained(
-        ...     "./tf_model/shortcut_placeholder_tf_checkpoint.ckpt.index", from_tf=True, config=config
-        ... )
-        ```
-"""
-
-FROM_PRETRAINED_TF_DOCSTRING = """
-        Instantiate one of the model classes of the library from a pretrained model.
-
-        The model class to instantiate is selected based on the `model_type` property of the config object (either
-        passed as an argument or loaded from `pretrained_model_name_or_path` if possible), or when it's missing, by
-        falling back to using pattern matching on `pretrained_model_name_or_path`:
-
-        List options
-
-        Args:
-            pretrained_model_name_or_path (`str` or `os.PathLike`):
-                Can be either:
-
-                    - A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co.
-                    - A path to a *directory* containing model weights saved using
-                      [`~PreTrainedModel.save_pretrained`], e.g., `./my_model_directory/`.
-                    - A path or url to a *PyTorch state_dict save file* (e.g, `./pt_model/pytorch_model.bin`). In this
-                      case, `from_pt` should be set to `True` and a configuration object should be provided as `config`
-                      argument. This loading path is slower than converting the PyTorch model in a TensorFlow model
-                      using the provided conversion scripts and loading the TensorFlow model afterwards.
-            model_args (additional positional arguments, *optional*):
-                Will be passed along to the underlying model `__init__()` method.
-            config ([`PretrainedConfig`], *optional*):
-                Configuration for the model to use instead of an automatically loaded configuration. Configuration can
-                be automatically loaded when:
-
-                    - The model is a model provided by the library (loaded with the *model id* string of a pretrained
-                      model).
-                    - The model was saved using [`~PreTrainedModel.save_pretrained`] and is reloaded by supplying the
-                      save directory.
-                    - The model is loaded by supplying a local directory as `pretrained_model_name_or_path` and a
-                      configuration JSON file named *config.json* is found in the directory.
-            cache_dir (`str` or `os.PathLike`, *optional*):
-                Path to a directory in which a downloaded pretrained model configuration should be cached if the
-                standard cache should not be used.
-            from_pt (`bool`, *optional*, defaults to `False`):
-                Load the model weights from a PyTorch checkpoint save file (see docstring of
-                `pretrained_model_name_or_path` argument).
-            force_download (`bool`, *optional*, defaults to `False`):
-                Whether or not to force the (re-)download of the model weights and configuration files, overriding the
-                cached versions if they exist.
-            resume_download:
-                Deprecated and ignored. All downloads are now resumed by default when possible.
-                Will be removed in v5 of Transformers.
-            proxies (`dict[str, str]`, *optional*):
-                A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128',
-                'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
-            output_loading_info(`bool`, *optional*, defaults to `False`):
-                Whether ot not to also return a dictionary containing missing keys, unexpected keys and error messages.
-            local_files_only(`bool`, *optional*, defaults to `False`):
-                Whether or not to only look at local files (e.g., not try downloading the model).
-            revision (`str`, *optional*, defaults to `"main"`):
-                The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
-                git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any
-                identifier allowed by git.
-            trust_remote_code (`bool`, *optional*, defaults to `False`):
-                Whether or not to allow for custom models defined on the Hub in their own modeling files. This option
-                should only be set to `True` for repositories you trust and in which you have read the code, as it will
-                execute code present on the Hub on your local machine.
-            code_revision (`str`, *optional*, defaults to `"main"`):
-                The specific revision to use for the code on the Hub, if the code leaves in a different repository than
-                the rest of the model. It can be a branch name, a tag name, or a commit id, since we use a git-based
-                system for storing models and other artifacts on huggingface.co, so `revision` can be any identifier
-                allowed by git.
-            kwargs (additional keyword arguments, *optional*):
-                Can be used to update the configuration object (after it being loaded) and initiate the model (e.g.,
-                `output_attentions=True`). Behaves differently depending on whether a `config` is provided or
-                automatically loaded:
-
-                    - If a configuration is provided with `config`, `**kwargs` will be directly passed to the
-                      underlying model's `__init__` method (we assume all relevant updates to the configuration have
-                      already been done)
-                    - If a configuration is not provided, `kwargs` will be first passed to the configuration class
-                      initialization function ([`~PretrainedConfig.from_pretrained`]). Each key of `kwargs` that
-                      corresponds to a configuration attribute will be used to override said attribute with the
-                      supplied `kwargs` value. Remaining keys that do not correspond to any configuration attribute
-                      will be passed to the underlying model's `__init__` function.
-
-        Examples:
-
-        ```python
-        >>> from transformers import AutoConfig, BaseAutoModelClass
-
-        >>> # Download model and configuration from huggingface.co and cache.
-        >>> model = BaseAutoModelClass.from_pretrained("checkpoint_placeholder")
-
-        >>> # Update configuration during loading
-        >>> model = BaseAutoModelClass.from_pretrained("checkpoint_placeholder", output_attentions=True)
-        >>> model.config.output_attentions
-        True
-
-        >>> # Loading from a PyTorch checkpoint file instead of a TensorFlow model (slower)
-        >>> config = AutoConfig.from_pretrained("./pt_model/shortcut_placeholder_pt_model_config.json")
-        >>> model = BaseAutoModelClass.from_pretrained(
-        ...     "./pt_model/shortcut_placeholder_pytorch_model.bin", from_pt=True, config=config
-        ... )
-        ```
-"""
-
-FROM_PRETRAINED_FLAX_DOCSTRING = """
-        Instantiate one of the model classes of the library from a pretrained model.
-
-        The model class to instantiate is selected based on the `model_type` property of the config object (either
-        passed as an argument or loaded from `pretrained_model_name_or_path` if possible), or when it's missing, by
-        falling back to using pattern matching on `pretrained_model_name_or_path`:
-
-        List options
-
-        Args:
-            pretrained_model_name_or_path (`str` or `os.PathLike`):
-                Can be either:
-
-                    - A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co.
-                    - A path to a *directory* containing model weights saved using
-                      [`~PreTrainedModel.save_pretrained`], e.g., `./my_model_directory/`.
-                    - A path or url to a *PyTorch state_dict save file* (e.g, `./pt_model/pytorch_model.bin`). In this
-                      case, `from_pt` should be set to `True` and a configuration object should be provided as `config`
-                      argument. This loading path is slower than converting the PyTorch model in a TensorFlow model
-                      using the provided conversion scripts and loading the TensorFlow model afterwards.
-            model_args (additional positional arguments, *optional*):
-                Will be passed along to the underlying model `__init__()` method.
-            config ([`PretrainedConfig`], *optional*):
-                Configuration for the model to use instead of an automatically loaded configuration. Configuration can
-                be automatically loaded when:
-
-                    - The model is a model provided by the library (loaded with the *model id* string of a pretrained
-                      model).
-                    - The model was saved using [`~PreTrainedModel.save_pretrained`] and is reloaded by supplying the
-                      save directory.
-                    - The model is loaded by supplying a local directory as `pretrained_model_name_or_path` and a
-                      configuration JSON file named *config.json* is found in the directory.
-            cache_dir (`str` or `os.PathLike`, *optional*):
-                Path to a directory in which a downloaded pretrained model configuration should be cached if the
-                standard cache should not be used.
-            from_pt (`bool`, *optional*, defaults to `False`):
-                Load the model weights from a PyTorch checkpoint save file (see docstring of
-                `pretrained_model_name_or_path` argument).
-            force_download (`bool`, *optional*, defaults to `False`):
-                Whether or not to force the (re-)download of the model weights and configuration files, overriding the
-                cached versions if they exist.
-            resume_download:
-                Deprecated and ignored. All downloads are now resumed by default when possible.
-                Will be removed in v5 of Transformers.
-            proxies (`dict[str, str]`, *optional*):
-                A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128',
-                'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
-            output_loading_info(`bool`, *optional*, defaults to `False`):
-                Whether ot not to also return a dictionary containing missing keys, unexpected keys and error messages.
-            local_files_only(`bool`, *optional*, defaults to `False`):
-                Whether or not to only look at local files (e.g., not try downloading the model).
-            revision (`str`, *optional*, defaults to `"main"`):
-                The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
-                git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any
-                identifier allowed by git.
-            trust_remote_code (`bool`, *optional*, defaults to `False`):
-                Whether or not to allow for custom models defined on the Hub in their own modeling files. This option
-                should only be set to `True` for repositories you trust and in which you have read the code, as it will
-                execute code present on the Hub on your local machine.
-            code_revision (`str`, *optional*, defaults to `"main"`):
-                The specific revision to use for the code on the Hub, if the code leaves in a different repository than
-                the rest of the model. It can be a branch name, a tag name, or a commit id, since we use a git-based
-                system for storing models and other artifacts on huggingface.co, so `revision` can be any identifier
-                allowed by git.
-            kwargs (additional keyword arguments, *optional*):
-                Can be used to update the configuration object (after it being loaded) and initiate the model (e.g.,
-                `output_attentions=True`). Behaves differently depending on whether a `config` is provided or
-                automatically loaded:
-
-                    - If a configuration is provided with `config`, `**kwargs` will be directly passed to the
-                      underlying model's `__init__` method (we assume all relevant updates to the configuration have
-                      already been done)
-                    - If a configuration is not provided, `kwargs` will be first passed to the configuration class
-                      initialization function ([`~PretrainedConfig.from_pretrained`]). Each key of `kwargs` that
-                      corresponds to a configuration attribute will be used to override said attribute with the
-                      supplied `kwargs` value. Remaining keys that do not correspond to any configuration attribute
-                      will be passed to the underlying model's `__init__` function.
-
-        Examples:
-
-        ```python
-        >>> from transformers import AutoConfig, BaseAutoModelClass
-
-        >>> # Download model and configuration from huggingface.co and cache.
-        >>> model = BaseAutoModelClass.from_pretrained("checkpoint_placeholder")
-
-        >>> # Update configuration during loading
-        >>> model = BaseAutoModelClass.from_pretrained("checkpoint_placeholder", output_attentions=True)
-        >>> model.config.output_attentions
-        True
-
-        >>> # Loading from a PyTorch checkpoint file instead of a TensorFlow model (slower)
-        >>> config = AutoConfig.from_pretrained("./pt_model/shortcut_placeholder_pt_model_config.json")
-        >>> model = BaseAutoModelClass.from_pretrained(
-        ...     "./pt_model/shortcut_placeholder_pytorch_model.bin", from_pt=True, config=config
-        ... )
        ```
 """

@ -400,10 +189,6 @@ def _get_model_class(config, model_mapping):
    for arch in architectures:
        if arch in name_to_model:
            return name_to_model[arch]
-        elif f"TF{arch}" in name_to_model:
-            return name_to_model[f"TF{arch}"]
-        elif f"Flax{arch}" in name_to_model:
-            return name_to_model[f"Flax{arch}"]

    # If not architecture is set in the config or match the supported models, the first element of the tuple is the
    # defaults.
@ -696,12 +481,7 @@ def auto_class_update(cls, checkpoint_for_example: str = "google-bert/bert-base-
    from_config = replace_list_option_in_docstrings(model_mapping._model_mapping, use_model_types=False)(from_config)
    cls.from_config = classmethod(from_config)

-    if name.startswith("TF"):
-        from_pretrained_docstring = FROM_PRETRAINED_TF_DOCSTRING
-    elif name.startswith("Flax"):
-        from_pretrained_docstring = FROM_PRETRAINED_FLAX_DOCSTRING
-    else:
-        from_pretrained_docstring = FROM_PRETRAINED_TORCH_DOCSTRING
+    from_pretrained_docstring = FROM_PRETRAINED_TORCH_DOCSTRING
    from_pretrained = copy_func(_BaseAutoModelClass.from_pretrained)
    from_pretrained_docstring = insert_head_doc(from_pretrained_docstring, head_doc=head_doc)
    from_pretrained_docstring = from_pretrained_docstring.replace("BaseAutoModelClass", name)
--- a/src/transformers/models/auto/modeling_flax_auto.py
+++ b/src/transformers/models/auto/modeling_flax_auto.py
@ -1,413 +0,0 @@
-# coding=utf-8
-# Copyright 2018 The Google Flax Team Authors and The HuggingFace Inc. team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Auto Model class."""
-
-from collections import OrderedDict
-
-from ...utils import logging
-from .auto_factory import _BaseAutoModelClass, _LazyAutoMapping, auto_class_update
-from .configuration_auto import CONFIG_MAPPING_NAMES
-
-
-logger = logging.get_logger(__name__)
-
-
-FLAX_MODEL_MAPPING_NAMES = OrderedDict(
-    [
-        # Base model mapping
-        ("albert", "FlaxAlbertModel"),
-        ("bart", "FlaxBartModel"),
-        ("beit", "FlaxBeitModel"),
-        ("bert", "FlaxBertModel"),
-        ("big_bird", "FlaxBigBirdModel"),
-        ("blenderbot", "FlaxBlenderbotModel"),
-        ("blenderbot-small", "FlaxBlenderbotSmallModel"),
-        ("bloom", "FlaxBloomModel"),
-        ("clip", "FlaxCLIPModel"),
-        ("dinov2", "FlaxDinov2Model"),
-        ("distilbert", "FlaxDistilBertModel"),
-        ("electra", "FlaxElectraModel"),
-        ("gemma", "FlaxGemmaModel"),
-        ("gpt-sw3", "FlaxGPT2Model"),
-        ("gpt2", "FlaxGPT2Model"),
-        ("gpt_neo", "FlaxGPTNeoModel"),
-        ("gptj", "FlaxGPTJModel"),
-        ("llama", "FlaxLlamaModel"),
-        ("longt5", "FlaxLongT5Model"),
-        ("marian", "FlaxMarianModel"),
-        ("mbart", "FlaxMBartModel"),
-        ("mistral", "FlaxMistralModel"),
-        ("mt5", "FlaxMT5Model"),
-        ("opt", "FlaxOPTModel"),
-        ("pegasus", "FlaxPegasusModel"),
-        ("regnet", "FlaxRegNetModel"),
-        ("resnet", "FlaxResNetModel"),
-        ("roberta", "FlaxRobertaModel"),
-        ("roberta-prelayernorm", "FlaxRobertaPreLayerNormModel"),
-        ("roformer", "FlaxRoFormerModel"),
-        ("t5", "FlaxT5Model"),
-        ("vision-text-dual-encoder", "FlaxVisionTextDualEncoderModel"),
-        ("vit", "FlaxViTModel"),
-        ("wav2vec2", "FlaxWav2Vec2Model"),
-        ("whisper", "FlaxWhisperModel"),
-        ("xglm", "FlaxXGLMModel"),
-        ("xlm-roberta", "FlaxXLMRobertaModel"),
-    ]
-)
-
-FLAX_MODEL_FOR_PRETRAINING_MAPPING_NAMES = OrderedDict(
-    [
-        # Model for pre-training mapping
-        ("albert", "FlaxAlbertForPreTraining"),
-        ("bart", "FlaxBartForConditionalGeneration"),
-        ("bert", "FlaxBertForPreTraining"),
-        ("big_bird", "FlaxBigBirdForPreTraining"),
-        ("electra", "FlaxElectraForPreTraining"),
-        ("longt5", "FlaxLongT5ForConditionalGeneration"),
-        ("mbart", "FlaxMBartForConditionalGeneration"),
-        ("mt5", "FlaxMT5ForConditionalGeneration"),
-        ("roberta", "FlaxRobertaForMaskedLM"),
-        ("roberta-prelayernorm", "FlaxRobertaPreLayerNormForMaskedLM"),
-        ("roformer", "FlaxRoFormerForMaskedLM"),
-        ("t5", "FlaxT5ForConditionalGeneration"),
-        ("wav2vec2", "FlaxWav2Vec2ForPreTraining"),
-        ("whisper", "FlaxWhisperForConditionalGeneration"),
-        ("xlm-roberta", "FlaxXLMRobertaForMaskedLM"),
-    ]
-)
-
-FLAX_MODEL_FOR_MASKED_LM_MAPPING_NAMES = OrderedDict(
-    [
-        # Model for Masked LM mapping
-        ("albert", "FlaxAlbertForMaskedLM"),
-        ("bart", "FlaxBartForConditionalGeneration"),
-        ("bert", "FlaxBertForMaskedLM"),
-        ("big_bird", "FlaxBigBirdForMaskedLM"),
-        ("distilbert", "FlaxDistilBertForMaskedLM"),
-        ("electra", "FlaxElectraForMaskedLM"),
-        ("mbart", "FlaxMBartForConditionalGeneration"),
-        ("roberta", "FlaxRobertaForMaskedLM"),
-        ("roberta-prelayernorm", "FlaxRobertaPreLayerNormForMaskedLM"),
-        ("roformer", "FlaxRoFormerForMaskedLM"),
-        ("xlm-roberta", "FlaxXLMRobertaForMaskedLM"),
-    ]
-)
-
-FLAX_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES = OrderedDict(
-    [
-        # Model for Seq2Seq Causal LM mapping
-        ("bart", "FlaxBartForConditionalGeneration"),
-        ("blenderbot", "FlaxBlenderbotForConditionalGeneration"),
-        ("blenderbot-small", "FlaxBlenderbotSmallForConditionalGeneration"),
-        ("encoder-decoder", "FlaxEncoderDecoderModel"),
-        ("longt5", "FlaxLongT5ForConditionalGeneration"),
-        ("marian", "FlaxMarianMTModel"),
-        ("mbart", "FlaxMBartForConditionalGeneration"),
-        ("mt5", "FlaxMT5ForConditionalGeneration"),
-        ("pegasus", "FlaxPegasusForConditionalGeneration"),
-        ("t5", "FlaxT5ForConditionalGeneration"),
-    ]
-)
-
-FLAX_MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES = OrderedDict(
-    [
-        # Model for Image-classification
-        ("beit", "FlaxBeitForImageClassification"),
-        ("dinov2", "FlaxDinov2ForImageClassification"),
-        ("regnet", "FlaxRegNetForImageClassification"),
-        ("resnet", "FlaxResNetForImageClassification"),
-        ("vit", "FlaxViTForImageClassification"),
-    ]
-)
-
-FLAX_MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES = OrderedDict(
-    [
-        ("vision-encoder-decoder", "FlaxVisionEncoderDecoderModel"),
-    ]
-)
-
-FLAX_MODEL_FOR_CAUSAL_LM_MAPPING_NAMES = OrderedDict(
-    [
-        # Model for Causal LM mapping
-        ("bart", "FlaxBartForCausalLM"),
-        ("bert", "FlaxBertForCausalLM"),
-        ("big_bird", "FlaxBigBirdForCausalLM"),
-        ("bloom", "FlaxBloomForCausalLM"),
-        ("electra", "FlaxElectraForCausalLM"),
-        ("gemma", "FlaxGemmaForCausalLM"),
-        ("gpt-sw3", "FlaxGPT2LMHeadModel"),
-        ("gpt2", "FlaxGPT2LMHeadModel"),
-        ("gpt_neo", "FlaxGPTNeoForCausalLM"),
-        ("gptj", "FlaxGPTJForCausalLM"),
-        ("llama", "FlaxLlamaForCausalLM"),
-        ("mistral", "FlaxMistralForCausalLM"),
-        ("opt", "FlaxOPTForCausalLM"),
-        ("roberta", "FlaxRobertaForCausalLM"),
-        ("roberta-prelayernorm", "FlaxRobertaPreLayerNormForCausalLM"),
-        ("xglm", "FlaxXGLMForCausalLM"),
-        ("xlm-roberta", "FlaxXLMRobertaForCausalLM"),
-    ]
-)
-
-FLAX_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES = OrderedDict(
-    [
-        # Model for Sequence Classification mapping
-        ("albert", "FlaxAlbertForSequenceClassification"),
-        ("bart", "FlaxBartForSequenceClassification"),
-        ("bert", "FlaxBertForSequenceClassification"),
-        ("big_bird", "FlaxBigBirdForSequenceClassification"),
-        ("distilbert", "FlaxDistilBertForSequenceClassification"),
-        ("electra", "FlaxElectraForSequenceClassification"),
-        ("mbart", "FlaxMBartForSequenceClassification"),
-        ("roberta", "FlaxRobertaForSequenceClassification"),
-        ("roberta-prelayernorm", "FlaxRobertaPreLayerNormForSequenceClassification"),
-        ("roformer", "FlaxRoFormerForSequenceClassification"),
-        ("xlm-roberta", "FlaxXLMRobertaForSequenceClassification"),
-    ]
-)
-
-FLAX_MODEL_FOR_QUESTION_ANSWERING_MAPPING_NAMES = OrderedDict(
-    [
-        # Model for Question Answering mapping
-        ("albert", "FlaxAlbertForQuestionAnswering"),
-        ("bart", "FlaxBartForQuestionAnswering"),
-        ("bert", "FlaxBertForQuestionAnswering"),
-        ("big_bird", "FlaxBigBirdForQuestionAnswering"),
-        ("distilbert", "FlaxDistilBertForQuestionAnswering"),
-        ("electra", "FlaxElectraForQuestionAnswering"),
-        ("mbart", "FlaxMBartForQuestionAnswering"),
-        ("roberta", "FlaxRobertaForQuestionAnswering"),
-        ("roberta-prelayernorm", "FlaxRobertaPreLayerNormForQuestionAnswering"),
-        ("roformer", "FlaxRoFormerForQuestionAnswering"),
-        ("xlm-roberta", "FlaxXLMRobertaForQuestionAnswering"),
-    ]
-)
-
-FLAX_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES = OrderedDict(
-    [
-        # Model for Token Classification mapping
-        ("albert", "FlaxAlbertForTokenClassification"),
-        ("bert", "FlaxBertForTokenClassification"),
-        ("big_bird", "FlaxBigBirdForTokenClassification"),
-        ("distilbert", "FlaxDistilBertForTokenClassification"),
-        ("electra", "FlaxElectraForTokenClassification"),
-        ("roberta", "FlaxRobertaForTokenClassification"),
-        ("roberta-prelayernorm", "FlaxRobertaPreLayerNormForTokenClassification"),
-        ("roformer", "FlaxRoFormerForTokenClassification"),
-        ("xlm-roberta", "FlaxXLMRobertaForTokenClassification"),
-    ]
-)
-
-FLAX_MODEL_FOR_MULTIPLE_CHOICE_MAPPING_NAMES = OrderedDict(
-    [
-        # Model for Multiple Choice mapping
-        ("albert", "FlaxAlbertForMultipleChoice"),
-        ("bert", "FlaxBertForMultipleChoice"),
-        ("big_bird", "FlaxBigBirdForMultipleChoice"),
-        ("distilbert", "FlaxDistilBertForMultipleChoice"),
-        ("electra", "FlaxElectraForMultipleChoice"),
-        ("roberta", "FlaxRobertaForMultipleChoice"),
-        ("roberta-prelayernorm", "FlaxRobertaPreLayerNormForMultipleChoice"),
-        ("roformer", "FlaxRoFormerForMultipleChoice"),
-        ("xlm-roberta", "FlaxXLMRobertaForMultipleChoice"),
-    ]
-)
-
-FLAX_MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING_NAMES = OrderedDict(
-    [
-        ("bert", "FlaxBertForNextSentencePrediction"),
-    ]
-)
-
-FLAX_MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING_NAMES = OrderedDict(
-    [
-        ("speech-encoder-decoder", "FlaxSpeechEncoderDecoderModel"),
-        ("whisper", "FlaxWhisperForConditionalGeneration"),
-    ]
-)
-
-FLAX_MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING_NAMES = OrderedDict(
-    [
-        ("whisper", "FlaxWhisperForAudioClassification"),
-    ]
-)
-
-FLAX_MODEL_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, FLAX_MODEL_MAPPING_NAMES)
-FLAX_MODEL_FOR_PRETRAINING_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, FLAX_MODEL_FOR_PRETRAINING_MAPPING_NAMES)
-FLAX_MODEL_FOR_MASKED_LM_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, FLAX_MODEL_FOR_MASKED_LM_MAPPING_NAMES)
-FLAX_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING = _LazyAutoMapping(
-    CONFIG_MAPPING_NAMES, FLAX_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES
-)
-FLAX_MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING = _LazyAutoMapping(
-    CONFIG_MAPPING_NAMES, FLAX_MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES
-)
-FLAX_MODEL_FOR_VISION_2_SEQ_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, FLAX_MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES)
-FLAX_MODEL_FOR_CAUSAL_LM_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, FLAX_MODEL_FOR_CAUSAL_LM_MAPPING_NAMES)
-FLAX_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING = _LazyAutoMapping(
-    CONFIG_MAPPING_NAMES, FLAX_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES
-)
-FLAX_MODEL_FOR_QUESTION_ANSWERING_MAPPING = _LazyAutoMapping(
-    CONFIG_MAPPING_NAMES, FLAX_MODEL_FOR_QUESTION_ANSWERING_MAPPING_NAMES
-)
-FLAX_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING = _LazyAutoMapping(
-    CONFIG_MAPPING_NAMES, FLAX_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES
-)
-FLAX_MODEL_FOR_MULTIPLE_CHOICE_MAPPING = _LazyAutoMapping(
-    CONFIG_MAPPING_NAMES, FLAX_MODEL_FOR_MULTIPLE_CHOICE_MAPPING_NAMES
-)
-FLAX_MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING = _LazyAutoMapping(
-    CONFIG_MAPPING_NAMES, FLAX_MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING_NAMES
-)
-FLAX_MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING = _LazyAutoMapping(
-    CONFIG_MAPPING_NAMES, FLAX_MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING_NAMES
-)
-FLAX_MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING = _LazyAutoMapping(
-    CONFIG_MAPPING_NAMES, FLAX_MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING_NAMES
-)
-
-
-class FlaxAutoModel(_BaseAutoModelClass):
-    _model_mapping = FLAX_MODEL_MAPPING
-
-
-FlaxAutoModel = auto_class_update(FlaxAutoModel)
-
-
-class FlaxAutoModelForPreTraining(_BaseAutoModelClass):
-    _model_mapping = FLAX_MODEL_FOR_PRETRAINING_MAPPING
-
-
-FlaxAutoModelForPreTraining = auto_class_update(FlaxAutoModelForPreTraining, head_doc="pretraining")
-
-
-class FlaxAutoModelForCausalLM(_BaseAutoModelClass):
-    _model_mapping = FLAX_MODEL_FOR_CAUSAL_LM_MAPPING
-
-
-FlaxAutoModelForCausalLM = auto_class_update(FlaxAutoModelForCausalLM, head_doc="causal language modeling")
-
-
-class FlaxAutoModelForMaskedLM(_BaseAutoModelClass):
-    _model_mapping = FLAX_MODEL_FOR_MASKED_LM_MAPPING
-
-
-FlaxAutoModelForMaskedLM = auto_class_update(FlaxAutoModelForMaskedLM, head_doc="masked language modeling")
-
-
-class FlaxAutoModelForSeq2SeqLM(_BaseAutoModelClass):
-    _model_mapping = FLAX_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING
-
-
-FlaxAutoModelForSeq2SeqLM = auto_class_update(
-    FlaxAutoModelForSeq2SeqLM,
-    head_doc="sequence-to-sequence language modeling",
-    checkpoint_for_example="google-t5/t5-base",
-)
-
-
-class FlaxAutoModelForSequenceClassification(_BaseAutoModelClass):
-    _model_mapping = FLAX_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING
-
-
-FlaxAutoModelForSequenceClassification = auto_class_update(
-    FlaxAutoModelForSequenceClassification, head_doc="sequence classification"
-)
-
-
-class FlaxAutoModelForQuestionAnswering(_BaseAutoModelClass):
-    _model_mapping = FLAX_MODEL_FOR_QUESTION_ANSWERING_MAPPING
-
-
-FlaxAutoModelForQuestionAnswering = auto_class_update(FlaxAutoModelForQuestionAnswering, head_doc="question answering")
-
-
-class FlaxAutoModelForTokenClassification(_BaseAutoModelClass):
-    _model_mapping = FLAX_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING
-
-
-FlaxAutoModelForTokenClassification = auto_class_update(
-    FlaxAutoModelForTokenClassification, head_doc="token classification"
-)
-
-
-class FlaxAutoModelForMultipleChoice(_BaseAutoModelClass):
-    _model_mapping = FLAX_MODEL_FOR_MULTIPLE_CHOICE_MAPPING
-
-
-FlaxAutoModelForMultipleChoice = auto_class_update(FlaxAutoModelForMultipleChoice, head_doc="multiple choice")
-
-
-class FlaxAutoModelForNextSentencePrediction(_BaseAutoModelClass):
-    _model_mapping = FLAX_MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING
-
-
-FlaxAutoModelForNextSentencePrediction = auto_class_update(
-    FlaxAutoModelForNextSentencePrediction, head_doc="next sentence prediction"
-)
-
-
-class FlaxAutoModelForImageClassification(_BaseAutoModelClass):
-    _model_mapping = FLAX_MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING
-
-
-FlaxAutoModelForImageClassification = auto_class_update(
-    FlaxAutoModelForImageClassification, head_doc="image classification"
-)
-
-
-class FlaxAutoModelForVision2Seq(_BaseAutoModelClass):
-    _model_mapping = FLAX_MODEL_FOR_VISION_2_SEQ_MAPPING
-
-
-FlaxAutoModelForVision2Seq = auto_class_update(FlaxAutoModelForVision2Seq, head_doc="vision-to-text modeling")
-
-
-class FlaxAutoModelForSpeechSeq2Seq(_BaseAutoModelClass):
-    _model_mapping = FLAX_MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING
-
-
-FlaxAutoModelForSpeechSeq2Seq = auto_class_update(
-    FlaxAutoModelForSpeechSeq2Seq, head_doc="sequence-to-sequence speech-to-text modeling"
-)
-
-__all__ = [
-    "FLAX_MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING",
-    "FLAX_MODEL_FOR_CAUSAL_LM_MAPPING",
-    "FLAX_MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING",
-    "FLAX_MODEL_FOR_MASKED_LM_MAPPING",
-    "FLAX_MODEL_FOR_MULTIPLE_CHOICE_MAPPING",
-    "FLAX_MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING",
-    "FLAX_MODEL_FOR_PRETRAINING_MAPPING",
-    "FLAX_MODEL_FOR_QUESTION_ANSWERING_MAPPING",
-    "FLAX_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING",
-    "FLAX_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING",
-    "FLAX_MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING",
-    "FLAX_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING",
-    "FLAX_MODEL_FOR_VISION_2_SEQ_MAPPING",
-    "FLAX_MODEL_MAPPING",
-    "FlaxAutoModel",
-    "FlaxAutoModelForCausalLM",
-    "FlaxAutoModelForImageClassification",
-    "FlaxAutoModelForMaskedLM",
-    "FlaxAutoModelForMultipleChoice",
-    "FlaxAutoModelForNextSentencePrediction",
-    "FlaxAutoModelForPreTraining",
-    "FlaxAutoModelForQuestionAnswering",
-    "FlaxAutoModelForSeq2SeqLM",
-    "FlaxAutoModelForSequenceClassification",
-    "FlaxAutoModelForSpeechSeq2Seq",
-    "FlaxAutoModelForTokenClassification",
-    "FlaxAutoModelForVision2Seq",
-]
--- a/src/transformers/models/auto/modeling_tf_auto.py
+++ b/src/transformers/models/auto/modeling_tf_auto.py
@ -1,776 +0,0 @@
-# coding=utf-8
-# Copyright 2018 The HuggingFace Inc. team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Auto Model class."""
-
-import warnings
-from collections import OrderedDict
-
-from ...utils import logging
-from .auto_factory import _BaseAutoModelClass, _LazyAutoMapping, auto_class_update
-from .configuration_auto import CONFIG_MAPPING_NAMES
-
-
-logger = logging.get_logger(__name__)
-
-
-TF_MODEL_MAPPING_NAMES = OrderedDict(
-    [
-        # Base model mapping
-        ("albert", "TFAlbertModel"),
-        ("bart", "TFBartModel"),
-        ("bert", "TFBertModel"),
-        ("blenderbot", "TFBlenderbotModel"),
-        ("blenderbot-small", "TFBlenderbotSmallModel"),
-        ("blip", "TFBlipModel"),
-        ("camembert", "TFCamembertModel"),
-        ("clip", "TFCLIPModel"),
-        ("convbert", "TFConvBertModel"),
-        ("convnext", "TFConvNextModel"),
-        ("convnextv2", "TFConvNextV2Model"),
-        ("ctrl", "TFCTRLModel"),
-        ("cvt", "TFCvtModel"),
-        ("data2vec-vision", "TFData2VecVisionModel"),
-        ("deberta", "TFDebertaModel"),
-        ("deberta-v2", "TFDebertaV2Model"),
-        ("deit", "TFDeiTModel"),
-        ("distilbert", "TFDistilBertModel"),
-        ("dpr", "TFDPRQuestionEncoder"),
-        ("efficientformer", "TFEfficientFormerModel"),
-        ("electra", "TFElectraModel"),
-        ("esm", "TFEsmModel"),
-        ("flaubert", "TFFlaubertModel"),
-        ("funnel", ("TFFunnelModel", "TFFunnelBaseModel")),
-        ("gpt-sw3", "TFGPT2Model"),
-        ("gpt2", "TFGPT2Model"),
-        ("gptj", "TFGPTJModel"),
-        ("groupvit", "TFGroupViTModel"),
-        ("hubert", "TFHubertModel"),
-        ("idefics", "TFIdeficsModel"),
-        ("layoutlm", "TFLayoutLMModel"),
-        ("layoutlmv3", "TFLayoutLMv3Model"),
-        ("led", "TFLEDModel"),
-        ("longformer", "TFLongformerModel"),
-        ("lxmert", "TFLxmertModel"),
-        ("marian", "TFMarianModel"),
-        ("mbart", "TFMBartModel"),
-        ("mistral", "TFMistralModel"),
-        ("mobilebert", "TFMobileBertModel"),
-        ("mobilevit", "TFMobileViTModel"),
-        ("mpnet", "TFMPNetModel"),
-        ("mt5", "TFMT5Model"),
-        ("openai-gpt", "TFOpenAIGPTModel"),
-        ("opt", "TFOPTModel"),
-        ("pegasus", "TFPegasusModel"),
-        ("regnet", "TFRegNetModel"),
-        ("rembert", "TFRemBertModel"),
-        ("resnet", "TFResNetModel"),
-        ("roberta", "TFRobertaModel"),
-        ("roberta-prelayernorm", "TFRobertaPreLayerNormModel"),
-        ("roformer", "TFRoFormerModel"),
-        ("sam", "TFSamModel"),
-        ("sam_vision_model", "TFSamVisionModel"),
-        ("segformer", "TFSegformerModel"),
-        ("speech_to_text", "TFSpeech2TextModel"),
-        ("swiftformer", "TFSwiftFormerModel"),
-        ("swin", "TFSwinModel"),
-        ("t5", "TFT5Model"),
-        ("tapas", "TFTapasModel"),
-        ("transfo-xl", "TFTransfoXLModel"),
-        ("vision-text-dual-encoder", "TFVisionTextDualEncoderModel"),
-        ("vit", "TFViTModel"),
-        ("vit_mae", "TFViTMAEModel"),
-        ("wav2vec2", "TFWav2Vec2Model"),
-        ("whisper", "TFWhisperModel"),
-        ("xglm", "TFXGLMModel"),
-        ("xlm", "TFXLMModel"),
-        ("xlm-roberta", "TFXLMRobertaModel"),
-        ("xlnet", "TFXLNetModel"),
-    ]
-)
-
-TF_MODEL_FOR_PRETRAINING_MAPPING_NAMES = OrderedDict(
-    [
-        # Model for pre-training mapping
-        ("albert", "TFAlbertForPreTraining"),
-        ("bart", "TFBartForConditionalGeneration"),
-        ("bert", "TFBertForPreTraining"),
-        ("camembert", "TFCamembertForMaskedLM"),
-        ("ctrl", "TFCTRLLMHeadModel"),
-        ("distilbert", "TFDistilBertForMaskedLM"),
-        ("electra", "TFElectraForPreTraining"),
-        ("flaubert", "TFFlaubertWithLMHeadModel"),
-        ("funnel", "TFFunnelForPreTraining"),
-        ("gpt-sw3", "TFGPT2LMHeadModel"),
-        ("gpt2", "TFGPT2LMHeadModel"),
-        ("idefics", "TFIdeficsForVisionText2Text"),
-        ("layoutlm", "TFLayoutLMForMaskedLM"),
-        ("lxmert", "TFLxmertForPreTraining"),
-        ("mobilebert", "TFMobileBertForPreTraining"),
-        ("mpnet", "TFMPNetForMaskedLM"),
-        ("openai-gpt", "TFOpenAIGPTLMHeadModel"),
-        ("roberta", "TFRobertaForMaskedLM"),
-        ("roberta-prelayernorm", "TFRobertaPreLayerNormForMaskedLM"),
-        ("t5", "TFT5ForConditionalGeneration"),
-        ("tapas", "TFTapasForMaskedLM"),
-        ("transfo-xl", "TFTransfoXLLMHeadModel"),
-        ("vit_mae", "TFViTMAEForPreTraining"),
-        ("xlm", "TFXLMWithLMHeadModel"),
-        ("xlm-roberta", "TFXLMRobertaForMaskedLM"),
-        ("xlnet", "TFXLNetLMHeadModel"),
-    ]
-)
-
-TF_MODEL_WITH_LM_HEAD_MAPPING_NAMES = OrderedDict(
-    [
-        # Model with LM heads mapping
-        ("albert", "TFAlbertForMaskedLM"),
-        ("bart", "TFBartForConditionalGeneration"),
-        ("bert", "TFBertForMaskedLM"),
-        ("camembert", "TFCamembertForMaskedLM"),
-        ("convbert", "TFConvBertForMaskedLM"),
-        ("ctrl", "TFCTRLLMHeadModel"),
-        ("distilbert", "TFDistilBertForMaskedLM"),
-        ("electra", "TFElectraForMaskedLM"),
-        ("esm", "TFEsmForMaskedLM"),
-        ("flaubert", "TFFlaubertWithLMHeadModel"),
-        ("funnel", "TFFunnelForMaskedLM"),
-        ("gpt-sw3", "TFGPT2LMHeadModel"),
-        ("gpt2", "TFGPT2LMHeadModel"),
-        ("gptj", "TFGPTJForCausalLM"),
-        ("layoutlm", "TFLayoutLMForMaskedLM"),
-        ("led", "TFLEDForConditionalGeneration"),
-        ("longformer", "TFLongformerForMaskedLM"),
-        ("marian", "TFMarianMTModel"),
-        ("mobilebert", "TFMobileBertForMaskedLM"),
-        ("mpnet", "TFMPNetForMaskedLM"),
-        ("openai-gpt", "TFOpenAIGPTLMHeadModel"),
-        ("rembert", "TFRemBertForMaskedLM"),
-        ("roberta", "TFRobertaForMaskedLM"),
-        ("roberta-prelayernorm", "TFRobertaPreLayerNormForMaskedLM"),
-        ("roformer", "TFRoFormerForMaskedLM"),
-        ("speech_to_text", "TFSpeech2TextForConditionalGeneration"),
-        ("t5", "TFT5ForConditionalGeneration"),
-        ("tapas", "TFTapasForMaskedLM"),
-        ("transfo-xl", "TFTransfoXLLMHeadModel"),
-        ("whisper", "TFWhisperForConditionalGeneration"),
-        ("xlm", "TFXLMWithLMHeadModel"),
-        ("xlm-roberta", "TFXLMRobertaForMaskedLM"),
-        ("xlnet", "TFXLNetLMHeadModel"),
-    ]
-)
-
-TF_MODEL_FOR_CAUSAL_LM_MAPPING_NAMES = OrderedDict(
-    [
-        # Model for Causal LM mapping
-        ("bert", "TFBertLMHeadModel"),
-        ("camembert", "TFCamembertForCausalLM"),
-        ("ctrl", "TFCTRLLMHeadModel"),
-        ("gpt-sw3", "TFGPT2LMHeadModel"),
-        ("gpt2", "TFGPT2LMHeadModel"),
-        ("gptj", "TFGPTJForCausalLM"),
-        ("mistral", "TFMistralForCausalLM"),
-        ("openai-gpt", "TFOpenAIGPTLMHeadModel"),
-        ("opt", "TFOPTForCausalLM"),
-        ("rembert", "TFRemBertForCausalLM"),
-        ("roberta", "TFRobertaForCausalLM"),
-        ("roberta-prelayernorm", "TFRobertaPreLayerNormForCausalLM"),
-        ("roformer", "TFRoFormerForCausalLM"),
-        ("transfo-xl", "TFTransfoXLLMHeadModel"),
-        ("xglm", "TFXGLMForCausalLM"),
-        ("xlm", "TFXLMWithLMHeadModel"),
-        ("xlm-roberta", "TFXLMRobertaForCausalLM"),
-        ("xlnet", "TFXLNetLMHeadModel"),
-    ]
-)
-
-TF_MODEL_FOR_MASKED_IMAGE_MODELING_MAPPING_NAMES = OrderedDict(
-    [
-        ("deit", "TFDeiTForMaskedImageModeling"),
-        ("swin", "TFSwinForMaskedImageModeling"),
-    ]
-)
-
-TF_MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES = OrderedDict(
-    [
-        # Model for Image-classsification
-        ("convnext", "TFConvNextForImageClassification"),
-        ("convnextv2", "TFConvNextV2ForImageClassification"),
-        ("cvt", "TFCvtForImageClassification"),
-        ("data2vec-vision", "TFData2VecVisionForImageClassification"),
-        ("deit", ("TFDeiTForImageClassification", "TFDeiTForImageClassificationWithTeacher")),
-        (
-            "efficientformer",
-            ("TFEfficientFormerForImageClassification", "TFEfficientFormerForImageClassificationWithTeacher"),
-        ),
-        ("mobilevit", "TFMobileViTForImageClassification"),
-        ("regnet", "TFRegNetForImageClassification"),
-        ("resnet", "TFResNetForImageClassification"),
-        ("segformer", "TFSegformerForImageClassification"),
-        ("swiftformer", "TFSwiftFormerForImageClassification"),
-        ("swin", "TFSwinForImageClassification"),
-        ("vit", "TFViTForImageClassification"),
-    ]
-)
-
-
-TF_MODEL_FOR_ZERO_SHOT_IMAGE_CLASSIFICATION_MAPPING_NAMES = OrderedDict(
-    [
-        # Model for Zero Shot Image Classification mapping
-        ("blip", "TFBlipModel"),
-        ("clip", "TFCLIPModel"),
-    ]
-)
-
-
-TF_MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING_NAMES = OrderedDict(
-    [
-        # Model for Semantic Segmentation mapping
-        ("data2vec-vision", "TFData2VecVisionForSemanticSegmentation"),
-        ("mobilevit", "TFMobileViTForSemanticSegmentation"),
-        ("segformer", "TFSegformerForSemanticSegmentation"),
-    ]
-)
-
-TF_MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES = OrderedDict(
-    [
-        ("blip", "TFBlipForConditionalGeneration"),
-        ("vision-encoder-decoder", "TFVisionEncoderDecoderModel"),
-    ]
-)
-
-TF_MODEL_FOR_MASKED_LM_MAPPING_NAMES = OrderedDict(
-    [
-        # Model for Masked LM mapping
-        ("albert", "TFAlbertForMaskedLM"),
-        ("bert", "TFBertForMaskedLM"),
-        ("camembert", "TFCamembertForMaskedLM"),
-        ("convbert", "TFConvBertForMaskedLM"),
-        ("deberta", "TFDebertaForMaskedLM"),
-        ("deberta-v2", "TFDebertaV2ForMaskedLM"),
-        ("distilbert", "TFDistilBertForMaskedLM"),
-        ("electra", "TFElectraForMaskedLM"),
-        ("esm", "TFEsmForMaskedLM"),
-        ("flaubert", "TFFlaubertWithLMHeadModel"),
-        ("funnel", "TFFunnelForMaskedLM"),
-        ("layoutlm", "TFLayoutLMForMaskedLM"),
-        ("longformer", "TFLongformerForMaskedLM"),
-        ("mobilebert", "TFMobileBertForMaskedLM"),
-        ("mpnet", "TFMPNetForMaskedLM"),
-        ("rembert", "TFRemBertForMaskedLM"),
-        ("roberta", "TFRobertaForMaskedLM"),
-        ("roberta-prelayernorm", "TFRobertaPreLayerNormForMaskedLM"),
-        ("roformer", "TFRoFormerForMaskedLM"),
-        ("tapas", "TFTapasForMaskedLM"),
-        ("xlm", "TFXLMWithLMHeadModel"),
-        ("xlm-roberta", "TFXLMRobertaForMaskedLM"),
-    ]
-)
-
-TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES = OrderedDict(
-    [
-        # Model for Seq2Seq Causal LM mapping
-        ("bart", "TFBartForConditionalGeneration"),
-        ("blenderbot", "TFBlenderbotForConditionalGeneration"),
-        ("blenderbot-small", "TFBlenderbotSmallForConditionalGeneration"),
-        ("encoder-decoder", "TFEncoderDecoderModel"),
-        ("led", "TFLEDForConditionalGeneration"),
-        ("marian", "TFMarianMTModel"),
-        ("mbart", "TFMBartForConditionalGeneration"),
-        ("mt5", "TFMT5ForConditionalGeneration"),
-        ("pegasus", "TFPegasusForConditionalGeneration"),
-        ("t5", "TFT5ForConditionalGeneration"),
-    ]
-)
-
-TF_MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING_NAMES = OrderedDict(
-    [
-        ("speech_to_text", "TFSpeech2TextForConditionalGeneration"),
-        ("whisper", "TFWhisperForConditionalGeneration"),
-    ]
-)
-
-TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES = OrderedDict(
-    [
-        # Model for Sequence Classification mapping
-        ("albert", "TFAlbertForSequenceClassification"),
-        ("bart", "TFBartForSequenceClassification"),
-        ("bert", "TFBertForSequenceClassification"),
-        ("camembert", "TFCamembertForSequenceClassification"),
-        ("convbert", "TFConvBertForSequenceClassification"),
-        ("ctrl", "TFCTRLForSequenceClassification"),
-        ("deberta", "TFDebertaForSequenceClassification"),
-        ("deberta-v2", "TFDebertaV2ForSequenceClassification"),
-        ("distilbert", "TFDistilBertForSequenceClassification"),
-        ("electra", "TFElectraForSequenceClassification"),
-        ("esm", "TFEsmForSequenceClassification"),
-        ("flaubert", "TFFlaubertForSequenceClassification"),
-        ("funnel", "TFFunnelForSequenceClassification"),
-        ("gpt-sw3", "TFGPT2ForSequenceClassification"),
-        ("gpt2", "TFGPT2ForSequenceClassification"),
-        ("gptj", "TFGPTJForSequenceClassification"),
-        ("layoutlm", "TFLayoutLMForSequenceClassification"),
-        ("layoutlmv3", "TFLayoutLMv3ForSequenceClassification"),
-        ("longformer", "TFLongformerForSequenceClassification"),
-        ("mistral", "TFMistralForSequenceClassification"),
-        ("mobilebert", "TFMobileBertForSequenceClassification"),
-        ("mpnet", "TFMPNetForSequenceClassification"),
-        ("openai-gpt", "TFOpenAIGPTForSequenceClassification"),
-        ("rembert", "TFRemBertForSequenceClassification"),
-        ("roberta", "TFRobertaForSequenceClassification"),
-        ("roberta-prelayernorm", "TFRobertaPreLayerNormForSequenceClassification"),
-        ("roformer", "TFRoFormerForSequenceClassification"),
-        ("tapas", "TFTapasForSequenceClassification"),
-        ("transfo-xl", "TFTransfoXLForSequenceClassification"),
-        ("xlm", "TFXLMForSequenceClassification"),
-        ("xlm-roberta", "TFXLMRobertaForSequenceClassification"),
-        ("xlnet", "TFXLNetForSequenceClassification"),
-    ]
-)
-
-TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING_NAMES = OrderedDict(
-    [
-        # Model for Question Answering mapping
-        ("albert", "TFAlbertForQuestionAnswering"),
-        ("bert", "TFBertForQuestionAnswering"),
-        ("camembert", "TFCamembertForQuestionAnswering"),
-        ("convbert", "TFConvBertForQuestionAnswering"),
-        ("deberta", "TFDebertaForQuestionAnswering"),
-        ("deberta-v2", "TFDebertaV2ForQuestionAnswering"),
-        ("distilbert", "TFDistilBertForQuestionAnswering"),
-        ("electra", "TFElectraForQuestionAnswering"),
-        ("flaubert", "TFFlaubertForQuestionAnsweringSimple"),
-        ("funnel", "TFFunnelForQuestionAnswering"),
-        ("gptj", "TFGPTJForQuestionAnswering"),
-        ("layoutlmv3", "TFLayoutLMv3ForQuestionAnswering"),
-        ("longformer", "TFLongformerForQuestionAnswering"),
-        ("mobilebert", "TFMobileBertForQuestionAnswering"),
-        ("mpnet", "TFMPNetForQuestionAnswering"),
-        ("rembert", "TFRemBertForQuestionAnswering"),
-        ("roberta", "TFRobertaForQuestionAnswering"),
-        ("roberta-prelayernorm", "TFRobertaPreLayerNormForQuestionAnswering"),
-        ("roformer", "TFRoFormerForQuestionAnswering"),
-        ("xlm", "TFXLMForQuestionAnsweringSimple"),
-        ("xlm-roberta", "TFXLMRobertaForQuestionAnswering"),
-        ("xlnet", "TFXLNetForQuestionAnsweringSimple"),
-    ]
-)
-TF_MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING_NAMES = OrderedDict([("wav2vec2", "TFWav2Vec2ForSequenceClassification")])
-
-TF_MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING_NAMES = OrderedDict(
-    [
-        ("layoutlm", "TFLayoutLMForQuestionAnswering"),
-        ("layoutlmv3", "TFLayoutLMv3ForQuestionAnswering"),
-    ]
-)
-
-
-TF_MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING_NAMES = OrderedDict(
-    [
-        # Model for Table Question Answering mapping
-        ("tapas", "TFTapasForQuestionAnswering"),
-    ]
-)
-
-TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES = OrderedDict(
-    [
-        # Model for Token Classification mapping
-        ("albert", "TFAlbertForTokenClassification"),
-        ("bert", "TFBertForTokenClassification"),
-        ("camembert", "TFCamembertForTokenClassification"),
-        ("convbert", "TFConvBertForTokenClassification"),
-        ("deberta", "TFDebertaForTokenClassification"),
-        ("deberta-v2", "TFDebertaV2ForTokenClassification"),
-        ("distilbert", "TFDistilBertForTokenClassification"),
-        ("electra", "TFElectraForTokenClassification"),
-        ("esm", "TFEsmForTokenClassification"),
-        ("flaubert", "TFFlaubertForTokenClassification"),
-        ("funnel", "TFFunnelForTokenClassification"),
-        ("layoutlm", "TFLayoutLMForTokenClassification"),
-        ("layoutlmv3", "TFLayoutLMv3ForTokenClassification"),
-        ("longformer", "TFLongformerForTokenClassification"),
-        ("mobilebert", "TFMobileBertForTokenClassification"),
-        ("mpnet", "TFMPNetForTokenClassification"),
-        ("rembert", "TFRemBertForTokenClassification"),
-        ("roberta", "TFRobertaForTokenClassification"),
-        ("roberta-prelayernorm", "TFRobertaPreLayerNormForTokenClassification"),
-        ("roformer", "TFRoFormerForTokenClassification"),
-        ("xlm", "TFXLMForTokenClassification"),
-        ("xlm-roberta", "TFXLMRobertaForTokenClassification"),
-        ("xlnet", "TFXLNetForTokenClassification"),
-    ]
-)
-
-TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING_NAMES = OrderedDict(
-    [
-        # Model for Multiple Choice mapping
-        ("albert", "TFAlbertForMultipleChoice"),
-        ("bert", "TFBertForMultipleChoice"),
-        ("camembert", "TFCamembertForMultipleChoice"),
-        ("convbert", "TFConvBertForMultipleChoice"),
-        ("deberta-v2", "TFDebertaV2ForMultipleChoice"),
-        ("distilbert", "TFDistilBertForMultipleChoice"),
-        ("electra", "TFElectraForMultipleChoice"),
-        ("flaubert", "TFFlaubertForMultipleChoice"),
-        ("funnel", "TFFunnelForMultipleChoice"),
-        ("longformer", "TFLongformerForMultipleChoice"),
-        ("mobilebert", "TFMobileBertForMultipleChoice"),
-        ("mpnet", "TFMPNetForMultipleChoice"),
-        ("rembert", "TFRemBertForMultipleChoice"),
-        ("roberta", "TFRobertaForMultipleChoice"),
-        ("roberta-prelayernorm", "TFRobertaPreLayerNormForMultipleChoice"),
-        ("roformer", "TFRoFormerForMultipleChoice"),
-        ("xlm", "TFXLMForMultipleChoice"),
-        ("xlm-roberta", "TFXLMRobertaForMultipleChoice"),
-        ("xlnet", "TFXLNetForMultipleChoice"),
-    ]
-)
-
-TF_MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING_NAMES = OrderedDict(
-    [
-        ("bert", "TFBertForNextSentencePrediction"),
-        ("mobilebert", "TFMobileBertForNextSentencePrediction"),
-    ]
-)
-TF_MODEL_FOR_MASK_GENERATION_MAPPING_NAMES = OrderedDict(
-    [
-        ("sam", "TFSamModel"),
-    ]
-)
-TF_MODEL_FOR_TEXT_ENCODING_MAPPING_NAMES = OrderedDict(
-    [
-        ("albert", "TFAlbertModel"),
-        ("bert", "TFBertModel"),
-        ("convbert", "TFConvBertModel"),
-        ("deberta", "TFDebertaModel"),
-        ("deberta-v2", "TFDebertaV2Model"),
-        ("distilbert", "TFDistilBertModel"),
-        ("electra", "TFElectraModel"),
-        ("flaubert", "TFFlaubertModel"),
-        ("longformer", "TFLongformerModel"),
-        ("mobilebert", "TFMobileBertModel"),
-        ("mt5", "TFMT5EncoderModel"),
-        ("rembert", "TFRemBertModel"),
-        ("roberta", "TFRobertaModel"),
-        ("roberta-prelayernorm", "TFRobertaPreLayerNormModel"),
-        ("roformer", "TFRoFormerModel"),
-        ("t5", "TFT5EncoderModel"),
-        ("xlm", "TFXLMModel"),
-        ("xlm-roberta", "TFXLMRobertaModel"),
-    ]
-)
-
-TF_MODEL_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, TF_MODEL_MAPPING_NAMES)
-TF_MODEL_FOR_PRETRAINING_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, TF_MODEL_FOR_PRETRAINING_MAPPING_NAMES)
-TF_MODEL_WITH_LM_HEAD_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, TF_MODEL_WITH_LM_HEAD_MAPPING_NAMES)
-TF_MODEL_FOR_CAUSAL_LM_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, TF_MODEL_FOR_CAUSAL_LM_MAPPING_NAMES)
-TF_MODEL_FOR_MASKED_IMAGE_MODELING_MAPPING = _LazyAutoMapping(
-    CONFIG_MAPPING_NAMES, TF_MODEL_FOR_MASKED_IMAGE_MODELING_MAPPING_NAMES
-)
-TF_MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING = _LazyAutoMapping(
-    CONFIG_MAPPING_NAMES, TF_MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES
-)
-TF_MODEL_FOR_ZERO_SHOT_IMAGE_CLASSIFICATION_MAPPING = _LazyAutoMapping(
-    CONFIG_MAPPING_NAMES, TF_MODEL_FOR_ZERO_SHOT_IMAGE_CLASSIFICATION_MAPPING_NAMES
-)
-TF_MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING = _LazyAutoMapping(
-    CONFIG_MAPPING_NAMES, TF_MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING_NAMES
-)
-TF_MODEL_FOR_VISION_2_SEQ_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, TF_MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES)
-TF_MODEL_FOR_MASKED_LM_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, TF_MODEL_FOR_MASKED_LM_MAPPING_NAMES)
-TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING = _LazyAutoMapping(
-    CONFIG_MAPPING_NAMES, TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES
-)
-TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING = _LazyAutoMapping(
-    CONFIG_MAPPING_NAMES, TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES
-)
-TF_MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING = _LazyAutoMapping(
-    CONFIG_MAPPING_NAMES, TF_MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING_NAMES
-)
-TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING = _LazyAutoMapping(
-    CONFIG_MAPPING_NAMES, TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING_NAMES
-)
-TF_MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING = _LazyAutoMapping(
-    CONFIG_MAPPING_NAMES, TF_MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING_NAMES
-)
-TF_MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING = _LazyAutoMapping(
-    CONFIG_MAPPING_NAMES, TF_MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING_NAMES
-)
-TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING = _LazyAutoMapping(
-    CONFIG_MAPPING_NAMES, TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES
-)
-TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING = _LazyAutoMapping(
-    CONFIG_MAPPING_NAMES, TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING_NAMES
-)
-TF_MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING = _LazyAutoMapping(
-    CONFIG_MAPPING_NAMES, TF_MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING_NAMES
-)
-TF_MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING = _LazyAutoMapping(
-    CONFIG_MAPPING_NAMES, TF_MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING_NAMES
-)
-
-TF_MODEL_FOR_MASK_GENERATION_MAPPING = _LazyAutoMapping(
-    CONFIG_MAPPING_NAMES, TF_MODEL_FOR_MASK_GENERATION_MAPPING_NAMES
-)
-
-TF_MODEL_FOR_TEXT_ENCODING_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, TF_MODEL_FOR_TEXT_ENCODING_MAPPING_NAMES)
-
-
-class TFAutoModelForMaskGeneration(_BaseAutoModelClass):
-    _model_mapping = TF_MODEL_FOR_MASK_GENERATION_MAPPING
-
-
-class TFAutoModelForTextEncoding(_BaseAutoModelClass):
-    _model_mapping = TF_MODEL_FOR_TEXT_ENCODING_MAPPING
-
-
-class TFAutoModel(_BaseAutoModelClass):
-    _model_mapping = TF_MODEL_MAPPING
-
-
-TFAutoModel = auto_class_update(TFAutoModel)
-
-
-class TFAutoModelForAudioClassification(_BaseAutoModelClass):
-    _model_mapping = TF_MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING
-
-
-TFAutoModelForAudioClassification = auto_class_update(
-    TFAutoModelForAudioClassification, head_doc="audio classification"
-)
-
-
-class TFAutoModelForPreTraining(_BaseAutoModelClass):
-    _model_mapping = TF_MODEL_FOR_PRETRAINING_MAPPING
-
-
-TFAutoModelForPreTraining = auto_class_update(TFAutoModelForPreTraining, head_doc="pretraining")
-
-
-# Private on purpose, the public class will add the deprecation warnings.
-class _TFAutoModelWithLMHead(_BaseAutoModelClass):
-    _model_mapping = TF_MODEL_WITH_LM_HEAD_MAPPING
-
-
-_TFAutoModelWithLMHead = auto_class_update(_TFAutoModelWithLMHead, head_doc="language modeling")
-
-
-class TFAutoModelForCausalLM(_BaseAutoModelClass):
-    _model_mapping = TF_MODEL_FOR_CAUSAL_LM_MAPPING
-
-
-TFAutoModelForCausalLM = auto_class_update(TFAutoModelForCausalLM, head_doc="causal language modeling")
-
-
-class TFAutoModelForMaskedImageModeling(_BaseAutoModelClass):
-    _model_mapping = TF_MODEL_FOR_MASKED_IMAGE_MODELING_MAPPING
-
-
-TFAutoModelForMaskedImageModeling = auto_class_update(
-    TFAutoModelForMaskedImageModeling, head_doc="masked image modeling"
-)
-
-
-class TFAutoModelForImageClassification(_BaseAutoModelClass):
-    _model_mapping = TF_MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING
-
-
-TFAutoModelForImageClassification = auto_class_update(
-    TFAutoModelForImageClassification, head_doc="image classification"
-)
-
-
-class TFAutoModelForZeroShotImageClassification(_BaseAutoModelClass):
-    _model_mapping = TF_MODEL_FOR_ZERO_SHOT_IMAGE_CLASSIFICATION_MAPPING
-
-
-TFAutoModelForZeroShotImageClassification = auto_class_update(
-    TFAutoModelForZeroShotImageClassification, head_doc="zero-shot image classification"
-)
-
-
-class TFAutoModelForSemanticSegmentation(_BaseAutoModelClass):
-    _model_mapping = TF_MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING
-
-
-TFAutoModelForSemanticSegmentation = auto_class_update(
-    TFAutoModelForSemanticSegmentation, head_doc="semantic segmentation"
-)
-
-
-class TFAutoModelForVision2Seq(_BaseAutoModelClass):
-    _model_mapping = TF_MODEL_FOR_VISION_2_SEQ_MAPPING
-
-
-TFAutoModelForVision2Seq = auto_class_update(TFAutoModelForVision2Seq, head_doc="vision-to-text modeling")
-
-
-class TFAutoModelForMaskedLM(_BaseAutoModelClass):
-    _model_mapping = TF_MODEL_FOR_MASKED_LM_MAPPING
-
-
-TFAutoModelForMaskedLM = auto_class_update(TFAutoModelForMaskedLM, head_doc="masked language modeling")
-
-
-class TFAutoModelForSeq2SeqLM(_BaseAutoModelClass):
-    _model_mapping = TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING
-
-
-TFAutoModelForSeq2SeqLM = auto_class_update(
-    TFAutoModelForSeq2SeqLM,
-    head_doc="sequence-to-sequence language modeling",
-    checkpoint_for_example="google-t5/t5-base",
-)
-
-
-class TFAutoModelForSequenceClassification(_BaseAutoModelClass):
-    _model_mapping = TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING
-
-
-TFAutoModelForSequenceClassification = auto_class_update(
-    TFAutoModelForSequenceClassification, head_doc="sequence classification"
-)
-
-
-class TFAutoModelForQuestionAnswering(_BaseAutoModelClass):
-    _model_mapping = TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING
-
-
-TFAutoModelForQuestionAnswering = auto_class_update(TFAutoModelForQuestionAnswering, head_doc="question answering")
-
-
-class TFAutoModelForDocumentQuestionAnswering(_BaseAutoModelClass):
-    _model_mapping = TF_MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING
-
-
-TFAutoModelForDocumentQuestionAnswering = auto_class_update(
-    TFAutoModelForDocumentQuestionAnswering,
-    head_doc="document question answering",
-    checkpoint_for_example='impira/layoutlm-document-qa", revision="52e01b3',
-)
-
-
-class TFAutoModelForTableQuestionAnswering(_BaseAutoModelClass):
-    _model_mapping = TF_MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING
-
-
-TFAutoModelForTableQuestionAnswering = auto_class_update(
-    TFAutoModelForTableQuestionAnswering,
-    head_doc="table question answering",
-    checkpoint_for_example="google/tapas-base-finetuned-wtq",
-)
-
-
-class TFAutoModelForTokenClassification(_BaseAutoModelClass):
-    _model_mapping = TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING
-
-
-TFAutoModelForTokenClassification = auto_class_update(
-    TFAutoModelForTokenClassification, head_doc="token classification"
-)
-
-
-class TFAutoModelForMultipleChoice(_BaseAutoModelClass):
-    _model_mapping = TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING
-
-
-TFAutoModelForMultipleChoice = auto_class_update(TFAutoModelForMultipleChoice, head_doc="multiple choice")
-
-
-class TFAutoModelForNextSentencePrediction(_BaseAutoModelClass):
-    _model_mapping = TF_MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING
-
-
-TFAutoModelForNextSentencePrediction = auto_class_update(
-    TFAutoModelForNextSentencePrediction, head_doc="next sentence prediction"
-)
-
-
-class TFAutoModelForSpeechSeq2Seq(_BaseAutoModelClass):
-    _model_mapping = TF_MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING
-
-
-TFAutoModelForSpeechSeq2Seq = auto_class_update(
-    TFAutoModelForSpeechSeq2Seq, head_doc="sequence-to-sequence speech-to-text modeling"
-)
-
-
-class TFAutoModelWithLMHead(_TFAutoModelWithLMHead):
-    @classmethod
-    def from_config(cls, config):
-        warnings.warn(
-            "The class `TFAutoModelWithLMHead` is deprecated and will be removed in a future version. Please use"
-            " `TFAutoModelForCausalLM` for causal language models, `TFAutoModelForMaskedLM` for masked language models"
-            " and `TFAutoModelForSeq2SeqLM` for encoder-decoder models.",
-            FutureWarning,
-        )
-        return super().from_config(config)
-
-    @classmethod
-    def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
-        warnings.warn(
-            "The class `TFAutoModelWithLMHead` is deprecated and will be removed in a future version. Please use"
-            " `TFAutoModelForCausalLM` for causal language models, `TFAutoModelForMaskedLM` for masked language models"
-            " and `TFAutoModelForSeq2SeqLM` for encoder-decoder models.",
-            FutureWarning,
-        )
-        return super().from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
-
-
-__all__ = [
-    "TF_MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING",
-    "TF_MODEL_FOR_CAUSAL_LM_MAPPING",
-    "TF_MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING",
-    "TF_MODEL_FOR_MASK_GENERATION_MAPPING",
-    "TF_MODEL_FOR_MASKED_IMAGE_MODELING_MAPPING",
-    "TF_MODEL_FOR_MASKED_LM_MAPPING",
-    "TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING",
-    "TF_MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING",
-    "TF_MODEL_FOR_PRETRAINING_MAPPING",
-    "TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING",
-    "TF_MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING",
-    "TF_MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING",
-    "TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING",
-    "TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING",
-    "TF_MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING",
-    "TF_MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING",
-    "TF_MODEL_FOR_TEXT_ENCODING_MAPPING",
-    "TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING",
-    "TF_MODEL_FOR_VISION_2_SEQ_MAPPING",
-    "TF_MODEL_FOR_ZERO_SHOT_IMAGE_CLASSIFICATION_MAPPING",
-    "TF_MODEL_MAPPING",
-    "TF_MODEL_WITH_LM_HEAD_MAPPING",
-    "TFAutoModel",
-    "TFAutoModelForAudioClassification",
-    "TFAutoModelForCausalLM",
-    "TFAutoModelForImageClassification",
-    "TFAutoModelForMaskedImageModeling",
-    "TFAutoModelForMaskedLM",
-    "TFAutoModelForMaskGeneration",
-    "TFAutoModelForMultipleChoice",
-    "TFAutoModelForNextSentencePrediction",
-    "TFAutoModelForPreTraining",
-    "TFAutoModelForDocumentQuestionAnswering",
-    "TFAutoModelForQuestionAnswering",
-    "TFAutoModelForSemanticSegmentation",
-    "TFAutoModelForSeq2SeqLM",
-    "TFAutoModelForSequenceClassification",
-    "TFAutoModelForSpeechSeq2Seq",
-    "TFAutoModelForTableQuestionAnswering",
-    "TFAutoModelForTextEncoding",
-    "TFAutoModelForTokenClassification",
-    "TFAutoModelForVision2Seq",
-    "TFAutoModelForZeroShotImageClassification",
-    "TFAutoModelWithLMHead",
-]
--- a/src/transformers/models/aya_vision/processing_aya_vision.py
+++ b/src/transformers/models/aya_vision/processing_aya_vision.py
@ -160,10 +160,8 @@ class AyaVisionProcessor(ProcessorMixin):
                `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
            return_tensors (`str` or [`~utils.TensorType`], *optional*):
                If set, will return tensors of a particular framework. Acceptable values are:
-                - `'tf'`: Return TensorFlow `tf.constant` objects.
                - `'pt'`: Return PyTorch `torch.Tensor` objects.
                - `'np'`: Return NumPy `np.ndarray` objects.
-                - `'jax'`: Return JAX `jnp.ndarray` objects.

        Returns:
            [`BatchFeature`]: A [`BatchFeature`] with the following fields:
--- a/Show More
+++ b/Show More