Disable head masking

Fix inits, add autoclasses
Update tests, add dummies, make fixup
2025-11-04 12:04:37 +08:00 · 2023-03-27 14:54:40 +01:00 · 2023-03-24 18:18:56 +00:00 · 2023-03-24 18:13:52 +00:00 · 2023-03-24 17:47:18 +00:00 · 2023-03-24 17:44:03 +00:00
694 changed files with 7390 additions and 57135 deletions
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@ -63,12 +63,6 @@ jobs:
                  else
                      touch test_preparation/examples_test_list.txt
                  fi
-            - run: |
-                  if [ -f filtered_test_list_cross_tests.txt ]; then
-                      mv filtered_test_list_cross_tests.txt test_preparation/filtered_test_list_cross_tests.txt
-                  else
-                      touch test_preparation/filtered_test_list_cross_tests.txt
-                  fi
            - store_artifacts:
                  path: test_preparation/test_list.txt
            - store_artifacts:
@ -84,8 +78,6 @@ jobs:
            - run: cp test_preparation/generated_config.yml test_preparation/generated_config.txt
            - store_artifacts:
                  path: test_preparation/generated_config.txt
-            - store_artifacts:
-                  path: test_preparation/filtered_test_list_cross_tests.txt
            - continuation/continue:
                  configuration_path: test_preparation/generated_config.yml

@ -184,6 +176,7 @@ jobs:
            - run: python utils/check_config_attributes.py
            - run: python utils/check_doctest_list.py
            - run: make deps_table_check_updated
+            - run: python utils/tests_fetcher.py --sanity_check
            - run: python utils/update_metadata.py --check-only
            - run: python utils/check_task_guides.py

--- a/.circleci/create_circleci_config.py
+++ b/.circleci/create_circleci_config.py
@ -51,8 +51,6 @@ class CircleCIJob:
    resource_class: Optional[str] = "xlarge"
    tests_to_run: Optional[List[str]] = None
    working_directory: str = "~/transformers"
-    # This should be only used for doctest job!
-    command_timeout: Optional[int] = None

    def __post_init__(self):
        # Deal with defaults for mutable attributes.
@ -109,15 +107,11 @@ class CircleCIJob:
        steps.append({"store_artifacts": {"path": "~/transformers/installed.txt"}})

        all_options = {**COMMON_PYTEST_OPTIONS, **self.pytest_options}
-        pytest_flags = [f"--{key}={value}" if (value is not None or key in ["doctest-modules"]) else f"-{key}" for key, value in all_options.items()]
+        pytest_flags = [f"--{key}={value}" if value is not None else f"-{key}" for key, value in all_options.items()]
        pytest_flags.append(
            f"--make-reports={self.name}" if "examples" in self.name else f"--make-reports=tests_{self.name}"
        )
-        test_command = ""
-        if self.command_timeout:
-            test_command = f"timeout {self.command_timeout} "
-        test_command += f"python -m pytest -n {self.pytest_num_workers} " + " ".join(pytest_flags)
-        
+        test_command = f"python -m pytest -n {self.pytest_num_workers} " + " ".join(pytest_flags)
        if self.parallelism == 1:
            if self.tests_to_run is None:
                test_command += " << pipeline.parameters.tests_to_run >>"
@ -167,37 +161,12 @@ class CircleCIJob:
            steps.append({"store_artifacts": {"path": "~/transformers/tests.txt"}})
            steps.append({"store_artifacts": {"path": "~/transformers/splitted_tests.txt"}})

-            test_command = ""
-            if self.timeout:
-                test_command = f"timeout {self.timeout} "
-            test_command += f"python -m pytest -n {self.pytest_num_workers} " + " ".join(pytest_flags)
+            test_command = f"python -m pytest -n {self.pytest_num_workers} " + " ".join(pytest_flags)
            test_command += " $(cat splitted_tests.txt)"
        if self.marker is not None:
            test_command += f" -m {self.marker}"
-
-        if self.name == "pr_documentation_tests":
-            # can't use ` | tee tee tests_output.txt` as usual
-            test_command += " > tests_output.txt"
-            # Save the return code, so we can check if it is timeout in the next step.
-            test_command += '; touch "$?".txt'
-            # Never fail the test step for the doctest job. We will check the results in the next step, and fail that
-            # step instead if the actual test failures are found. This is to avoid the timeout being reported as test
-            # failure.
-            test_command = f"({test_command}) || true"
-        else:
-            test_command += " | tee tests_output.txt"
+        test_command += " | tee tests_output.txt"
        steps.append({"run": {"name": "Run tests", "command": test_command}})
-
-        # return code `124` means the previous (pytest run) step is timeout
-        if self.name == "pr_documentation_tests":
-            checkout_doctest_command = 'if [ -s reports/tests_pr_documentation_tests/failures_short.txt ]; '
-            checkout_doctest_command += 'then echo "some test failed"; '
-            checkout_doctest_command += 'cat reports/tests_pr_documentation_tests/failures_short.txt; '
-            checkout_doctest_command += 'cat reports/tests_pr_documentation_tests/summary_short.txt; exit -1; '
-            checkout_doctest_command += 'elif [ -s reports/tests_pr_documentation_tests/stats.txt ]; then echo "All tests pass!"; '
-            checkout_doctest_command += 'elif [ -f 124.txt ]; then echo "doctest timeout!"; else echo "other fatal error)"; exit -1; fi;'
-            steps.append({"run": {"name": "Check doctest results", "command": checkout_doctest_command}})
-
        steps.append({"store_artifacts": {"path": "~/transformers/tests_output.txt"}})
        steps.append({"store_artifacts": {"path": "~/transformers/reports"}})
        job["steps"] = steps
@ -213,7 +182,7 @@ torch_and_tf_job = CircleCIJob(
    "torch_and_tf",
    additional_env={"RUN_PT_TF_CROSS_TESTS": True},
    install_steps=[
-        "sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev espeak-ng git-lfs cmake",
+        "sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev espeak-ng git-lfs",
        "git lfs install",
        "pip install --upgrade pip",
        "pip install .[sklearn,tf-cpu,torch,testing,sentencepiece,torch-speech,vision]",
@ -255,7 +224,7 @@ torch_job = CircleCIJob(
 tf_job = CircleCIJob(
    "tf",
    install_steps=[
-        "sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev espeak-ng cmake",
+        "sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev espeak-ng",
        "pip install --upgrade pip",
        "pip install .[sklearn,tf-cpu,testing,sentencepiece,tf-speech,vision]",
        "pip install tensorflow_probability",
@ -294,7 +263,6 @@ pipelines_tf_job = CircleCIJob(
    "pipelines_tf",
    additional_env={"RUN_PIPELINE_TESTS": True},
    install_steps=[
-        "sudo apt-get -y update && sudo apt-get install -y cmake",
        "pip install --upgrade pip",
        "pip install .[sklearn,tf-cpu,testing,sentencepiece,vision]",
        "pip install tensorflow_probability",
@ -350,7 +318,6 @@ examples_tensorflow_job = CircleCIJob(
    "examples_tensorflow",
    cache_name="tensorflow_examples",
    install_steps=[
-        "sudo apt-get -y update && sudo apt-get install -y cmake",
        "pip install --upgrade pip",
        "pip install .[sklearn,tensorflow,sentencepiece,testing]",
        "pip install -r examples/tensorflow/_tests_requirements.txt",
@ -388,7 +355,6 @@ hub_job = CircleCIJob(
 onnx_job = CircleCIJob(
    "onnx",
    install_steps=[
-        "sudo apt-get -y update && sudo apt-get install -y cmake",
        "pip install --upgrade pip",
        "pip install .[torch,tf,testing,sentencepiece,onnxruntime,vision,rjieba]",
    ],
@ -432,47 +398,6 @@ repo_utils_job = CircleCIJob(
    tests_to_run="tests/repo_utils",
 )

-
-# We also include a `dummy.py` file in the files to be doc-tested to prevent edge case failure. Otherwise, the pytest
-# hangs forever during test collection while showing `collecting 0 items / 21 errors`. (To see this, we have to remove
-# the bash output redirection.)
-py_command = 'from utils.tests_fetcher import get_doctest_files; to_test = get_doctest_files() + ["dummy.py"]; to_test = " ".join(to_test); print(to_test)'
-py_command = f"$(python3 -c '{py_command}')"
-command = f'echo "{py_command}" > pr_documentation_tests_temp.txt'
-doc_test_job = CircleCIJob(
-    "pr_documentation_tests",
-    additional_env={"TRANSFORMERS_VERBOSITY": "error", "DATASETS_VERBOSITY": "error", "SKIP_CUDA_DOCTEST": "1"},
-    install_steps=[
-        "sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev espeak-ng time",
-        "pip install --upgrade pip",
-        "pip install -e .[dev]",
-        "pip install git+https://github.com/huggingface/accelerate",
-        "pip install --upgrade pytest pytest-sugar",
-        "find -name __pycache__ -delete",
-        "find . -name \*.pyc -delete",
-        # Add an empty file to keep the test step running correctly even no file is selected to be tested.
-        "touch dummy.py",
-        {
-            "name": "Get files to test",
-            "command": command,
-        },
-        {
-            "name": "Show information in `Get files to test`",
-            "command":
-                "cat pr_documentation_tests_temp.txt"
-        },
-        {
-            "name": "Get the last line in `pr_documentation_tests.txt`",
-            "command":
-                "tail -n1 pr_documentation_tests_temp.txt | tee pr_documentation_tests.txt"
-        },
-    ],
-    tests_to_run="$(cat pr_documentation_tests.txt)",  # noqa
-    pytest_options={"-doctest-modules": None, "doctest-glob": "*.mdx", "dist": "loadfile", "rvsA": None},
-    command_timeout=1200,  # test cannot run longer than 1200 seconds
-    pytest_num_workers=1,
-)
-
 REGULAR_TESTS = [
    torch_and_tf_job,
    torch_and_flax_job,
@ -483,7 +408,6 @@ REGULAR_TESTS = [
    hub_job,
    onnx_job,
    exotic_models_job,
-    doc_test_job
 ]
 EXAMPLES_TESTS = [
    examples_torch_job,
@ -520,34 +444,6 @@ def create_circleci_config(folder=None):
    if len(test_list) > 0:
        jobs.extend(REGULAR_TESTS)

-        extended_tests_to_run = set(test_list.split())
-        # Extend the test files for cross test jobs
-        for job in jobs:
-            if job.job_name in ["tests_torch_and_tf", "tests_torch_and_flax"]:
-                for test_path in copy.copy(extended_tests_to_run):
-                    dir_path, fn = os.path.split(test_path)
-                    if fn.startswith("test_modeling_tf_"):
-                        fn = fn.replace("test_modeling_tf_", "test_modeling_")
-                    elif fn.startswith("test_modeling_flax_"):
-                        fn = fn.replace("test_modeling_flax_", "test_modeling_")
-                    else:
-                        if job.job_name == "test_torch_and_tf":
-                            fn = fn.replace("test_modeling_", "test_modeling_tf_")
-                        elif job.job_name == "test_torch_and_flax":
-                            fn = fn.replace("test_modeling_", "test_modeling_flax_")
-                    new_test_file = str(os.path.join(dir_path, fn))
-                    if os.path.isfile(new_test_file):
-                        if new_test_file not in extended_tests_to_run:
-                            extended_tests_to_run.add(new_test_file)
-        extended_tests_to_run = sorted(extended_tests_to_run)
-        for job in jobs:
-            if job.job_name in ["tests_torch_and_tf", "tests_torch_and_flax"]:
-                job.tests_to_run = extended_tests_to_run
-        fn = "filtered_test_list_cross_tests.txt"
-        f_path = os.path.join(folder, fn)
-        with open(f_path, "w") as fp:
-            fp.write(" ".join(extended_tests_to_run))
-
    example_file = os.path.join(folder, "examples_test_list.txt")
    if os.path.exists(example_file) and os.path.getsize(example_file) > 0:
        jobs.extend(EXAMPLES_TESTS)
--- a/.github/workflows/build-docker-images.yml
+++ b/.github/workflows/build-docker-images.yml
@ -3,7 +3,7 @@ name: Build docker images (scheduled)
 on:
  push:
    branches:
-      - build_ci_docker_image*
+      - docker-image*
  repository_dispatch:
  workflow_call:
    inputs:
@ -11,7 +11,7 @@ on:
        required: true
        type: string
  schedule:
-    - cron: "17 0 * * *"
+    - cron: "0 1 * * *"

 concurrency:
  group: docker-images-builds
@ -67,6 +67,35 @@ jobs:
          push: true
          tags: huggingface/transformers-all-latest-gpu-push-ci

+  latest-with-torch-nightly-docker:
+    name: "Nightly PyTorch + Stable TensorFlow"
+    # Push CI doesn't need this image
+    if: inputs.image_postfix != '-push-ci'
+    runs-on: ubuntu-latest
+    steps:
+      -
+        name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v2
+      -
+        name: Check out code
+        uses: actions/checkout@v3
+      -
+        name: Login to DockerHub
+        uses: docker/login-action@v2
+        with:
+          username: ${{ secrets.DOCKERHUB_USERNAME }}
+          password: ${{ secrets.DOCKERHUB_PASSWORD }}
+      -
+        name: Build and push
+        uses: docker/build-push-action@v3
+        with:
+          context: ./docker/transformers-all-latest-gpu
+          build-args: |
+            REF=main
+            PYTORCH=pre
+          push: true
+          tags: huggingface/transformers-all-latest-torch-nightly-gpu
+
  latest-torch-deepspeed-docker:
    name: "Latest PyTorch + DeepSpeed"
    runs-on: ubuntu-latest
@ -124,6 +153,34 @@ jobs:
          push: true
          tags: huggingface/transformers-pytorch-deepspeed-latest-gpu-push-ci

+  nightly-torch-deepspeed-docker:
+    name: "Nightly PyTorch + DeepSpeed"
+    # Push CI doesn't need this image
+    if: inputs.image_postfix != '-push-ci'
+    runs-on: ubuntu-latest
+    steps:
+      -
+        name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v2
+      -
+        name: Check out code
+        uses: actions/checkout@v3
+      -
+        name: Login to DockerHub
+        uses: docker/login-action@v2
+        with:
+          username: ${{ secrets.DOCKERHUB_USERNAME }}
+          password: ${{ secrets.DOCKERHUB_PASSWORD }}
+      -
+        name: Build and push
+        uses: docker/build-push-action@v3
+        with:
+          context: ./docker/transformers-pytorch-deepspeed-nightly-gpu
+          build-args: |
+            REF=main
+          push: true
+          tags: huggingface/transformers-pytorch-deepspeed-nightly-gpu
+
  doc-builder:
    name: "Doc builder"
    # Push CI doesn't need this image
--- a/.github/workflows/build-nightly-ci-docker-images.yml
+++ b/.github/workflows/build-nightly-ci-docker-images.yml
@ -1,75 +0,0 @@
-name: Build docker images (Nightly CI)
-
-on:
-  workflow_call:
-  push:
-    branches:
-      - build_nightly_ci_docker_image*
-
-concurrency:
-  group: docker-images-builds
-  cancel-in-progress: false
-
-jobs:
-  latest-with-torch-nightly-docker:
-    name: "Nightly PyTorch + Stable TensorFlow"
-    runs-on: ubuntu-latest
-    steps:
-      - name: Cleanup disk
-        run: |
-          sudo ls -l /usr/local/lib/
-          sudo ls -l /usr/share/
-          sudo du -sh /usr/local/lib/
-          sudo du -sh /usr/share/
-          sudo rm -rf /usr/local/lib/android
-          sudo rm -rf /usr/share/dotnet
-          sudo du -sh /usr/local/lib/
-          sudo du -sh /usr/share/
-      -
-        name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@v2
-      -
-        name: Check out code
-        uses: actions/checkout@v3
-      -
-        name: Login to DockerHub
-        uses: docker/login-action@v2
-        with:
-          username: ${{ secrets.DOCKERHUB_USERNAME }}
-          password: ${{ secrets.DOCKERHUB_PASSWORD }}
-      -
-        name: Build and push
-        uses: docker/build-push-action@v3
-        with:
-          context: ./docker/transformers-all-latest-gpu
-          build-args: |
-            REF=main
-            PYTORCH=pre
-          push: true
-          tags: huggingface/transformers-all-latest-torch-nightly-gpu
-
-  nightly-torch-deepspeed-docker:
-    name: "Nightly PyTorch + DeepSpeed"
-    runs-on: ubuntu-latest
-    steps:
-      -
-        name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@v2
-      -
-        name: Check out code
-        uses: actions/checkout@v3
-      -
-        name: Login to DockerHub
-        uses: docker/login-action@v2
-        with:
-          username: ${{ secrets.DOCKERHUB_USERNAME }}
-          password: ${{ secrets.DOCKERHUB_PASSWORD }}
-      -
-        name: Build and push
-        uses: docker/build-push-action@v3
-        with:
-          context: ./docker/transformers-pytorch-deepspeed-nightly-gpu
-          build-args: |
-            REF=main
-          push: true
-          tags: huggingface/transformers-pytorch-deepspeed-nightly-gpu
--- a/.github/workflows/build-past-ci-docker-images.yml
+++ b/.github/workflows/build-past-ci-docker-images.yml
@ -3,7 +3,7 @@ name: Build docker images (Past CI)
 on:
  push:
    branches:
-      - build_past_ci_docker_image*
+      - past-ci-docker-image*

 concurrency:
  group: docker-images-builds
@ -15,7 +15,7 @@ jobs:
    strategy:
      fail-fast: false
      matrix:
-        version: ["1.13", "1.12", "1.11", "1.10", "1.9"]
+        version: ["1.11", "1.10", "1.9", "1.8", "1.7", "1.6", "1.5", "1.4"]
    runs-on: ubuntu-latest
    steps:
      -
@ -24,17 +24,6 @@ jobs:
      -
        name: Check out code
        uses: actions/checkout@v3
-      -
-        id: get-base-image
-        name: Get Base Image
-        env:
-          framework_version: ${{ matrix.version }}
-        run: |
-          echo "base_image=$(python3 -c 'import os; from utils.past_ci_versions import past_versions_testing; base_image = past_versions_testing["pytorch"][os.environ["framework_version"]]["base_image"]; print(base_image)')" >> $GITHUB_OUTPUT
-      -
-        name: Print Base Image
-        run: |
-          echo ${{ steps.get-base-image.outputs.base_image }}
      -
        name: Login to DockerHub
        uses: docker/login-action@v2
@ -48,7 +37,6 @@ jobs:
          context: ./docker/transformers-past-gpu
          build-args: |
            REF=main
-            BASE_DOCKER_IMAGE=${{ steps.get-base-image.outputs.base_image }}
            FRAMEWORK=pytorch
            VERSION=${{ matrix.version }}
          push: true
@ -59,7 +47,7 @@ jobs:
    strategy:
      fail-fast: false
      matrix:
-        version: ["2.11", "2.10", "2.9", "2.8", "2.7", "2.6", "2.5"]
+        version: ["2.8", "2.7", "2.6", "2.5"]
    runs-on: ubuntu-latest
    steps:
      -
@ -68,17 +56,6 @@ jobs:
      -
        name: Check out code
        uses: actions/checkout@v3
-      -
-        id: get-base-image
-        name: Get Base Image
-        env:
-          framework_version: ${{ matrix.version }}
-        run: |
-          echo "base_image=$(python3 -c 'import os; from utils.past_ci_versions import past_versions_testing; base_image = past_versions_testing["tensorflow"][os.environ["framework_version"]]["base_image"]; print(base_image)')" >> $GITHUB_OUTPUT
-      -
-        name: Print Base Image
-        run: |
-          echo ${{ steps.get-base-image.outputs.base_image }}
      -
        name: Login to DockerHub
        uses: docker/login-action@v2
@ -92,8 +69,40 @@ jobs:
          context: ./docker/transformers-past-gpu
          build-args: |
            REF=main
-            BASE_DOCKER_IMAGE=${{ steps.get-base-image.outputs.base_image }}
            FRAMEWORK=tensorflow
            VERSION=${{ matrix.version }}
          push: true
          tags: huggingface/transformers-tensorflow-past-${{ matrix.version }}-gpu
+
+  past-tensorflow-docker-2-4:
+    name: "Past TensorFlow Docker"
+    strategy:
+      fail-fast: false
+      matrix:
+        version: ["2.4"]
+    runs-on: ubuntu-latest
+    steps:
+      -
+        name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v2
+      -
+        name: Check out code
+        uses: actions/checkout@v3
+      -
+        name: Login to DockerHub
+        uses: docker/login-action@v2
+        with:
+          username: ${{ secrets.DOCKERHUB_USERNAME }}
+          password: ${{ secrets.DOCKERHUB_PASSWORD }}
+      -
+        name: Build and push
+        uses: docker/build-push-action@v3
+        with:
+          context: ./docker/transformers-past-gpu
+          build-args: |
+            REF=main
+            BASE_DOCKER_IMAGE=nvidia/cuda:11.0.3-cudnn8-devel-ubuntu20.04
+            FRAMEWORK=tensorflow
+            VERSION=${{ matrix.version }}
+          push: true
+          tags: huggingface/transformers-tensorflow-past-${{ matrix.version }}-gpu
--- a/.github/workflows/check_tiny_models.yml
+++ b/.github/workflows/check_tiny_models.yml
@ -1,82 +0,0 @@
-name: Check Tiny Models
-
-on:
-  push:
-    branches:
-      - check_tiny_models*
-  repository_dispatch:
-  schedule:
-    - cron: "0 2 * * *"
-
-env:
-  TOKEN: ${{ secrets.TRANSFORMERS_HUB_BOT_HF_TOKEN }}
-
-jobs:
-  check_tiny_models:
-    name: Check tiny models
-    runs-on: ubuntu-latest
-    steps:
-      - name: Checkout transformers
-        uses: actions/checkout@v3
-        with:
-          fetch-depth: 2
-
-      - uses: actions/checkout@v3
-      - name: Set up Python 3.8
-        uses: actions/setup-python@v4
-        with:
-          # Semantic version range syntax or exact version of a Python version
-          python-version: '3.8'
-          # Optional - x64 or x86 architecture, defaults to x64
-          architecture: 'x64'
-
-      - name: Install
-        run: |
-          sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev espeak-ng cmake
-          pip install --upgrade pip
-          python -m pip install -U .[sklearn,torch,testing,sentencepiece,torch-speech,vision,timm,video,tf-cpu]
-          pip install tensorflow_probability
-          python -m pip install -U natten
-
-      - name: Create all tiny models (locally)
-        run: |
-          python utils/create_dummy_models.py tiny_local_models --all --num_workers 2
-
-      - name: Local tiny model reports artifacts
-        if: ${{ always() }}
-        uses: actions/upload-artifact@v3
-        with:
-          name: tiny_local_model_creation_reports
-          path: tiny_local_models/reports
-
-      # GitHub-hosted runners have 2-core CPUs
-      - name: Run pipeline tests against all new (local) tiny models
-        run: |
-          OMP_NUM_THREADS=1 TRANSFORMERS_TINY_MODEL_PATH=tiny_local_models python -m pytest --max-worker-restart=0 -n 2 --dist=loadfile -s -rA --make-reports=tests_pipelines tests/models -m is_pipeline_test -k "test_pipeline_" | tee tests_output.txt
-
-      - name: Test suite reports artifacts
-        if: ${{ always() }}
-        uses: actions/upload-artifact@v3
-        with:
-          name: tiny_local_model_creation_reports
-          path: reports/tests_pipelines
-
-      - name: Create + Upload tiny models for new model architecture(s)
-        run: | 
-          python utils/update_tiny_models.py --num_workers 2
-
-      - name: Full report
-        run: cat tiny_models/reports/tiny_model_creation_report.json
-
-      - name: Failure report
-        run: cat tiny_models/reports/simple_failed_report.txt
-
-      - name: Summary report
-        run: cat tiny_models/reports/tiny_model_summary.json
-
-      - name: New tiny model creation reports artifacts
-        if: ${{ always() }}
-        uses: actions/upload-artifact@v3
-        with:
-          name: tiny_model_creation_reports
-          path: tiny_models/reports
--- a/.github/workflows/doctests.yml
+++ b/.github/workflows/doctests.yml
@ -6,7 +6,7 @@ on:
      - doctest*
  repository_dispatch:
  schedule:
-    - cron: "17 2 * * *"
+    - cron: "0 2 * * *"


 env:
@ -25,17 +25,11 @@ jobs:
      image: huggingface/transformers-all-latest-gpu
      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
    steps:
-      - name: uninstall transformers (installed during docker image build)
-        run: python3 -m pip uninstall -y transformers
-
      - uses: actions/checkout@v3
      - name: NVIDIA-SMI
        run: |
          nvidia-smi

-      - name: Install transformers in edit mode
-        run: python3 -m pip install -e .
-
      - name: GPU visibility
        run: |
          python3 utils/print_env.py
@ -43,10 +37,18 @@ jobs:
      - name: Show installed libraries and their versions
        run: pip freeze

+      - name: Prepare files for doctests
+        run: |
+          python3 utils/prepare_for_doc_test.py src docs
+
      - name: Run doctests
        run: |
          python3 -m pytest -v --make-reports doc_tests_gpu --doctest-modules $(cat utils/documentation_tests.txt) -sv --doctest-continue-on-failure --doctest-glob="*.mdx"

+      - name: Clean files after doctests
+        run: |
+          python3 utils/prepare_for_doc_test.py src docs --remove_new_line
+
      - name: Failure short reports
        if: ${{ failure() }}
        continue-on-error: true
--- a/.github/workflows/self-nightly-past-ci-caller.yml
+++ b/.github/workflows/self-nightly-past-ci-caller.yml
@ -1,156 +0,0 @@
-name: Self-hosted runner (nightly-past-ci-caller)
-
-on:
-  schedule:
-    # 2:17 am on each Sunday and Thursday
-
-    - cron: "17 2 * * 0,4"
-  push:
-    branches:
-      - run_nightly_ci*
-      - run_past_ci*
-
-jobs:
-  build_nightly_ci_images:
-    name: Build Nightly CI Docker Images
-    if: (github.event_name == 'schedule') || ((github.event_name == 'push') && startsWith(github.ref_name, 'run_nightly_ci'))
-    uses: ./.github/workflows/build-nightly-ci-docker-images.yml
-    secrets: inherit
-
-  run_nightly_ci:
-    name: Nightly CI
-    needs: [build_nightly_ci_images]
-    uses: ./.github/workflows/self-nightly-scheduled.yml
-    secrets: inherit
-
-  run_past_ci_pytorch_1-13:
-    name: PyTorch 1.13
-    if: (cancelled() != true) && ((github.event_name == 'schedule') || ((github.event_name == 'push') && startsWith(github.ref_name, 'run_past_ci')))
-    needs: [run_nightly_ci]
-    uses: ./.github/workflows/self-past.yml
-    with:
-      framework: pytorch
-      version: "1.13"
-      sha: ${{ github.sha }}
-    secrets: inherit
-
-  run_past_ci_pytorch_1-12:
-    name: PyTorch 1.12
-    if: (cancelled() != true) && ((github.event_name == 'schedule') || ((github.event_name == 'push') && startsWith(github.ref_name, 'run_past_ci')))
-    needs: [run_past_ci_pytorch_1-13]
-    uses: ./.github/workflows/self-past.yml
-    with:
-      framework: pytorch
-      version: "1.12"
-      sha: ${{ github.sha }}
-    secrets: inherit
-
-  run_past_ci_pytorch_1-11:
-    name: PyTorch 1.11
-    if: (cancelled() != true) && ((github.event_name == 'schedule') || ((github.event_name == 'push') && startsWith(github.ref_name, 'run_past_ci')))
-    needs: [run_past_ci_pytorch_1-12]
-    uses: ./.github/workflows/self-past.yml
-    with:
-      framework: pytorch
-      version: "1.11"
-      sha: ${{ github.sha }}
-    secrets: inherit
-
-  run_past_ci_pytorch_1-10:
-    name: PyTorch 1.10
-    if: (cancelled() != true) && ((github.event_name == 'schedule') || ((github.event_name == 'push') && startsWith(github.ref_name, 'run_past_ci')))
-    needs: [run_past_ci_pytorch_1-11]
-    uses: ./.github/workflows/self-past.yml
-    with:
-      framework: pytorch
-      version: "1.10"
-      sha: ${{ github.sha }}
-    secrets: inherit
-
-  run_past_ci_pytorch_1-9:
-    name: PyTorch 1.9
-    if: (cancelled() != true) && ((github.event_name == 'schedule') || ((github.event_name == 'push') && startsWith(github.ref_name, 'run_past_ci')))
-    needs: [run_past_ci_pytorch_1-10]
-    uses: ./.github/workflows/self-past.yml
-    with:
-      framework: pytorch
-      version: "1.9"
-      sha: ${{ github.sha }}
-    secrets: inherit
-
-  run_past_ci_tensorflow_2-11:
-    name: TensorFlow 2.11
-    if: (cancelled() != true) && ((github.event_name == 'push') && startsWith(github.ref_name, 'run_past_ci'))
-    needs: [run_past_ci_pytorch_1-9]
-    uses: ./.github/workflows/self-past.yml
-    with:
-      framework: tensorflow
-      version: "2.11"
-      sha: ${{ github.sha }}
-    secrets: inherit
-
-  run_past_ci_tensorflow_2-10:
-    name: TensorFlow 2.10
-    if: (cancelled() != true) && ((github.event_name == 'push') && startsWith(github.ref_name, 'run_past_ci'))
-    needs: [run_past_ci_tensorflow_2-11]
-    uses: ./.github/workflows/self-past.yml
-    with:
-      framework: tensorflow
-      version: "2.10"
-      sha: ${{ github.sha }}
-    secrets: inherit
-
-  run_past_ci_tensorflow_2-9:
-    name: TensorFlow 2.9
-    if: (cancelled() != true) && ((github.event_name == 'push') && startsWith(github.ref_name, 'run_past_ci'))
-    needs: [run_past_ci_tensorflow_2-10]
-    uses: ./.github/workflows/self-past.yml
-    with:
-      framework: tensorflow
-      version: "2.9"
-      sha: ${{ github.sha }}
-    secrets: inherit
-
-  run_past_ci_tensorflow_2-8:
-    name: TensorFlow 2.8
-    if: (cancelled() != true) && ((github.event_name == 'push') && startsWith(github.ref_name, 'run_past_ci'))
-    needs: [run_past_ci_tensorflow_2-9]
-    uses: ./.github/workflows/self-past.yml
-    with:
-      framework: tensorflow
-      version: "2.8"
-      sha: ${{ github.sha }}
-    secrets: inherit
-
-  run_past_ci_tensorflow_2-7:
-    name: TensorFlow 2.7
-    if: (cancelled() != true) && ((github.event_name == 'push') && startsWith(github.ref_name, 'run_past_ci'))
-    needs: [run_past_ci_tensorflow_2-8]
-    uses: ./.github/workflows/self-past.yml
-    with:
-      framework: tensorflow
-      version: "2.7"
-      sha: ${{ github.sha }}
-    secrets: inherit
-
-  run_past_ci_tensorflow_2-6:
-    name: TensorFlow 2.6
-    if: (cancelled() != true) && ((github.event_name == 'push') && startsWith(github.ref_name, 'run_past_ci'))
-    needs: [run_past_ci_tensorflow_2-7]
-    uses: ./.github/workflows/self-past.yml
-    with:
-      framework: tensorflow
-      version: "2.6"
-      sha: ${{ github.sha }}
-    secrets: inherit
-
-  run_past_ci_tensorflow_2-5:
-    name: TensorFlow 2.5
-    if: (cancelled() != true) && ((github.event_name == 'push') && startsWith(github.ref_name, 'run_past_ci'))
-    needs: [run_past_ci_tensorflow_2-6]
-    uses: ./.github/workflows/self-past.yml
-    with:
-      framework: tensorflow
-      version: "2.5"
-      sha: ${{ github.sha }}
-    secrets: inherit
--- a/.github/workflows/self-nightly-scheduled.yml
+++ b/.github/workflows/self-nightly-scheduled.yml
@ -1,4 +1,4 @@
-name: Self-hosted runner (nightly-ci)
+name: Self-hosted runner (nightly)

 # Note that each job's dependencies go into a corresponding docker file.
 #
@ -8,7 +8,9 @@ name: Self-hosted runner (nightly-ci)

 on:
  repository_dispatch:
-  workflow_call:
+# Disable temporarily until the test suite can be run under 12 hours.
+#  schedule:
+#    - cron: "0 16 * * *"

 env:
  HF_HOME: /mnt/cache
@ -31,7 +33,7 @@ jobs:
          fetch-depth: 2

      - name: Check Runner Status
-        run: python utils/check_self_hosted_runner.py --target_runners single-gpu-past-ci-runner-docker,multi-gpu-past-ci-runner-docker --token ${{ secrets.ACCESS_REPO_INFO_TOKEN }}
+        run: python utils/check_self_hosted_runner.py --target_runners single-gpu-scheduled-ci-runner-docker,multi-gpu-scheduled-ci-runner-docker --token ${{ secrets.ACCESS_REPO_INFO_TOKEN }}

  check_runners:
    name: Check Runners
@ -39,7 +41,7 @@ jobs:
    strategy:
      matrix:
        machine_type: [single-gpu, multi-gpu]
-    runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker-past-ci') }}
+    runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker') }}
    container:
      image: huggingface/transformers-all-latest-torch-nightly-gpu
      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
@ -54,7 +56,7 @@ jobs:
    strategy:
      matrix:
        machine_type: [single-gpu, multi-gpu]
-    runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker-past-ci') }}
+    runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker') }}
    container:
      image: huggingface/transformers-all-latest-torch-nightly-gpu
      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
@ -94,7 +96,7 @@ jobs:
      matrix:
        folders: ${{ fromJson(needs.setup.outputs.matrix) }}
        machine_type: [single-gpu]
-    runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker-past-ci') }}
+    runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker') }}
    container:
      image: huggingface/transformers-all-latest-torch-nightly-gpu
      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
@ -141,7 +143,7 @@ jobs:
        if: ${{ always() }}
        uses: actions/upload-artifact@v3
        with:
-          name: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports_postfix_nightly
+          name: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports
          path: /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}

  run_tests_multi_gpu:
@ -151,7 +153,7 @@ jobs:
      matrix:
        folders: ${{ fromJson(needs.setup.outputs.matrix) }}
        machine_type: [multi-gpu]
-    runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker-past-ci') }}
+    runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker') }}
    container:
      image: huggingface/transformers-all-latest-torch-nightly-gpu
      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
@ -198,7 +200,7 @@ jobs:
        if: ${{ always() }}
        uses: actions/upload-artifact@v3
        with:
-          name: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports_postfix_nightly
+          name: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports
          path: /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}

  run_all_tests_torch_cuda_extensions_gpu:
@ -207,7 +209,7 @@ jobs:
      fail-fast: false
      matrix:
        machine_type: [single-gpu, multi-gpu]
-    runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker-past-ci') }}
+    runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker') }}
    needs: setup
    container:
      image: huggingface/transformers-pytorch-deepspeed-nightly-gpu
@ -227,7 +229,7 @@ jobs:
          python3 -m pip uninstall -y deepspeed
          rm -rf DeepSpeed
          git clone https://github.com/microsoft/DeepSpeed && cd DeepSpeed && rm -rf build
-          DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 DS_BUILD_UTILS=1 python3 -m pip install . --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check
+          DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 DS_BUILD_AIO=1 DS_BUILD_UTILS=1 python3 -m pip install . --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check

      - name: NVIDIA-SMI
        run: |
@ -256,7 +258,7 @@ jobs:
        if: ${{ always() }}
        uses: actions/upload-artifact@v3
        with:
-          name: ${{ matrix.machine_type }}_run_tests_torch_cuda_extensions_gpu_test_reports_postfix_nightly
+          name: ${{ matrix.machine_type }}_run_tests_torch_cuda_extensions_gpu_test_reports
          path: /workspace/transformers/reports/${{ matrix.machine_type }}_tests_torch_cuda_extensions_gpu

  send_results:
@ -290,7 +292,7 @@ jobs:
          CI_SLACK_CHANNEL_DUMMY_TESTS: ${{ secrets.CI_SLACK_CHANNEL_DUMMY_TESTS }}
          CI_SLACK_REPORT_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID_PAST_FUTURE }}
          ACCESS_REPO_INFO_TOKEN: ${{ secrets.ACCESS_REPO_INFO_TOKEN }}
-          CI_EVENT: Nightly CI
+          CI_EVENT: nightly-build
          RUNNER_STATUS: ${{ needs.check_runner_status.result }}
          RUNNER_ENV_STATUS: ${{ needs.check_runners.result }}
          SETUP_STATUS: ${{ needs.setup.result }}
@ -300,11 +302,3 @@ jobs:
          pip install slack_sdk
          pip show slack_sdk
          python utils/notification_service.py "${{ needs.setup.outputs.matrix }}"
-
-
-      # delete-artifact
-      - uses: geekyeggo/delete-artifact@v2
-        with:
-          name: |
-              single-*
-              multi-*
--- a/.github/workflows/self-past-caller.yml
+++ b/.github/workflows/self-past-caller.yml
@ -0,0 +1,136 @@
+name: Self-hosted runner (past-ci-caller)
+
+on:
+  push:
+    branches:
+      - run-past-ci*
+
+jobs:
+  run_past_ci_pytorch_1-11:
+    name: PyTorch 1.11
+    if: always()
+    uses: ./.github/workflows/self-past.yml
+    with:
+      framework: pytorch
+      version: "1.11"
+    secrets: inherit
+
+  run_past_ci_pytorch_1-10:
+    name: PyTorch 1.10
+    if: always()
+    needs: [run_past_ci_pytorch_1-11]
+    uses: ./.github/workflows/self-past.yml
+    with:
+      framework: pytorch
+      version: "1.10"
+    secrets: inherit
+
+  run_past_ci_pytorch_1-9:
+    name: PyTorch 1.9
+    if: always()
+    needs: [run_past_ci_pytorch_1-10]
+    uses: ./.github/workflows/self-past.yml
+    with:
+      framework: pytorch
+      version: "1.9"
+    secrets: inherit
+
+  run_past_ci_pytorch_1-8:
+    name: PyTorch 1.8
+    if: always()
+    needs: [run_past_ci_pytorch_1-9]
+    uses: ./.github/workflows/self-past.yml
+    with:
+      framework: pytorch
+      version: "1.8"
+    secrets: inherit
+
+  run_past_ci_pytorch_1-7:
+    name: PyTorch 1.7
+    if: always()
+    needs: [run_past_ci_pytorch_1-8]
+    uses: ./.github/workflows/self-past.yml
+    with:
+      framework: pytorch
+      version: "1.7"
+    secrets: inherit
+
+  run_past_ci_pytorch_1-6:
+    name: PyTorch 1.6
+    if: always()
+    needs: [run_past_ci_pytorch_1-7]
+    uses: ./.github/workflows/self-past.yml
+    with:
+      framework: pytorch
+      version: "1.6"
+    secrets: inherit
+
+  run_past_ci_pytorch_1-5:
+    name: PyTorch 1.5
+    if: always()
+    needs: [run_past_ci_pytorch_1-6]
+    uses: ./.github/workflows/self-past.yml
+    with:
+      framework: pytorch
+      version: "1.5"
+    secrets: inherit
+
+  run_past_ci_pytorch_1-4:
+    name: PyTorch 1.4
+    if: always()
+    needs: [run_past_ci_pytorch_1-5]
+    uses: ./.github/workflows/self-past.yml
+    with:
+      framework: pytorch
+      version: "1.4"
+    secrets: inherit
+
+  run_past_ci_tensorflow_2-8:
+    name: TensorFlow 2.8
+    if: always()
+    needs: [run_past_ci_pytorch_1-4]
+    uses: ./.github/workflows/self-past.yml
+    with:
+      framework: tensorflow
+      version: "2.8"
+    secrets: inherit
+
+  run_past_ci_tensorflow_2-7:
+    name: TensorFlow 2.7
+    if: always()
+    needs: [run_past_ci_tensorflow_2-8]
+    uses: ./.github/workflows/self-past.yml
+    with:
+      framework: tensorflow
+      version: "2.7"
+    secrets: inherit
+
+  run_past_ci_tensorflow_2-6:
+    name: TensorFlow 2.6
+    if: always()
+    needs: [run_past_ci_tensorflow_2-7]
+    uses: ./.github/workflows/self-past.yml
+    with:
+      framework: tensorflow
+      version: "2.6"
+    secrets: inherit
+
+  run_past_ci_tensorflow_2-5:
+    name: TensorFlow 2.5
+    if: always()
+    needs: [run_past_ci_tensorflow_2-6]
+    uses: ./.github/workflows/self-past.yml
+    with:
+      framework: tensorflow
+      version: "2.5"
+    secrets: inherit
+
+  run_past_ci_tensorflow_2-4:
+    name: TensorFlow 2.4
+    if: always()
+    needs: [run_past_ci_tensorflow_2-5]
+    uses: ./.github/workflows/self-past.yml
+    with:
+      framework: tensorflow
+      version: "2.4"
+    secrets: inherit
--- a/.github/workflows/self-past.yml
+++ b/.github/workflows/self-past.yml
@ -1,4 +1,4 @@
-name: Self-hosted runner (past-ci)
+name: Self-hosted runner (past)

 # Note that each job's dependencies go into a corresponding docker file.
 #
@ -126,12 +126,6 @@ jobs:
        run: |
          nvidia-smi

-      - name: Install
-        if: inputs.framework == 'pytorch'
-        working-directory: /transformers
-        run: |
-          python3 -m pip install --no-cache-dir git+https://github.com/huggingface/accelerate@main#egg=accelerate
-
      - name: Environment
        working-directory: /transformers
        run: |
@ -163,7 +157,7 @@ jobs:
        if: ${{ always() }}
        uses: actions/upload-artifact@v3
        with:
-          name: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports_postfix_${{ inputs.framework }}-${{ inputs.version }}
+          name: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports
          path: /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}

  run_tests_multi_gpu:
@ -198,12 +192,6 @@ jobs:
        run: |
          nvidia-smi

-      - name: Install
-        if: inputs.framework == 'pytorch'
-        working-directory: /transformers
-        run: |
-          python3 -m pip install --no-cache-dir git+https://github.com/huggingface/accelerate@main#egg=accelerate
-
      - name: Environment
        working-directory: /transformers
        run: |
@ -235,85 +223,14 @@ jobs:
        if: ${{ always() }}
        uses: actions/upload-artifact@v3
        with:
-          name: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports_postfix_${{ inputs.framework }}-${{ inputs.version }}
+          name: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports
          path: /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}

-  run_all_tests_torch_cuda_extensions_gpu:
-    name: Torch CUDA extension tests
-    if: inputs.framework == 'pytorch'
-    strategy:
-      fail-fast: false
-      matrix:
-        machine_type: [single-gpu, multi-gpu]
-    runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker-past-ci') }}
-    needs: setup
-    container:
-      image: huggingface/transformers-${{ inputs.framework }}-past-${{ inputs.version }}-gpu
-      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
-    steps:
-      - name: Update clone
-        working-directory: /transformers
-        run: git fetch && git checkout ${{ github.sha }}
-
-      - name: Install
-        working-directory: /transformers
-        run: |
-          python3 -m pip install --no-cache-dir git+https://github.com/huggingface/accelerate@main#egg=accelerate
-
-      - name: Remove cached torch extensions
-        run: rm -rf /github/home/.cache/torch_extensions/
-
-      # To avoid unknown test failures
-      - name: Pre build DeepSpeed *again*
-        working-directory: /
-        run: |
-          python3 -m pip uninstall -y deepspeed
-          rm -rf DeepSpeed
-          git clone https://github.com/microsoft/DeepSpeed && cd DeepSpeed && rm -rf build
-          DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 DS_BUILD_UTILS=1 python3 -m pip install . --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check
-
-      - name: NVIDIA-SMI
-        run: |
-          nvidia-smi
-
-      - name: Environment
-        working-directory: /transformers
-        run: |
-          python3 utils/print_env.py
-
-      - name: Show installed libraries and their versions
-        working-directory: /transformers
-        run: pip freeze
-
-      - name: Run all tests on GPU
-        working-directory: /transformers
-        run: |
-          python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_tests_torch_cuda_extensions_gpu tests/deepspeed tests/extended
-
-      - name: Failure short reports
-        if: ${{ failure() }}
-        continue-on-error: true
-        run: cat /transformers/reports/${{ matrix.machine_type }}_tests_torch_cuda_extensions_gpu/failures_short.txt
-
-      - name: Test suite reports artifacts
-        if: ${{ always() }}
-        uses: actions/upload-artifact@v3
-        with:
-          name: ${{ matrix.machine_type }}_run_tests_torch_cuda_extensions_gpu_test_reports_postfix_${{ inputs.framework }}-${{ inputs.version }}
-          path: /transformers/reports/${{ matrix.machine_type }}_tests_torch_cuda_extensions_gpu
-
  send_results:
    name: Send results to webhook
    runs-on: ubuntu-latest
    if: always()
-    needs: [
-      check_runner_status,
-      check_runners,
-      setup,
-      run_tests_single_gpu,
-      run_tests_multi_gpu,
-      run_all_tests_torch_cuda_extensions_gpu
-    ]
+    needs: [check_runner_status, check_runners, setup, run_tests_single_gpu, run_tests_multi_gpu]
    steps:
      - name: Preliminary job status
        shell: bash
@ -355,11 +272,4 @@ jobs:
        uses: actions/upload-artifact@v3
        with:
          name: test_failure_tables_${{ inputs.framework }}-${{ inputs.version }}
-          path: test_failure_tables
-
-      # delete-artifact
-      - uses: geekyeggo/delete-artifact@v2
-        with:
-          name: |
-              single-*
-              multi-*
+          path: test_failure_tables
--- a/.github/workflows/self-push.yml
+++ b/.github/workflows/self-push.yml
@ -381,7 +381,7 @@ jobs:
        working-directory: /workspace
        run: |
          python3 -m pip uninstall -y deepspeed
-          DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 DS_BUILD_UTILS=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check
+          DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 DS_BUILD_AIO=1 DS_BUILD_UTILS=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check

      - name: NVIDIA-SMI
        run: |
@ -467,7 +467,7 @@ jobs:
        working-directory: /workspace
        run: |
          python3 -m pip uninstall -y deepspeed
-          DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 DS_BUILD_UTILS=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check
+          DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 DS_BUILD_AIO=1 DS_BUILD_UTILS=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check

      - name: NVIDIA-SMI
        run: |
--- a/.github/workflows/self-scheduled.yml
+++ b/.github/workflows/self-scheduled.yml
@ -9,10 +9,7 @@ name: Self-hosted runner (scheduled)
 on:
  repository_dispatch:
  schedule:
-    - cron: "17 2 * * *"
-  push:
-    branches:
-      - run_scheduled_ci*
+    - cron: "0 2 * * *"

 env:
  HF_HOME: /mnt/cache
@ -369,7 +366,7 @@ jobs:
        working-directory: /workspace
        run: |
          python3 -m pip uninstall -y deepspeed
-          DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 DS_BUILD_UTILS=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check
+          DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 DS_BUILD_AIO=1 DS_BUILD_UTILS=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check

      - name: NVIDIA-SMI
        run: |
@ -487,23 +484,12 @@ jobs:
          CI_SLACK_REPORT_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID_DAILY }}
          ACCESS_REPO_INFO_TOKEN: ${{ secrets.ACCESS_REPO_INFO_TOKEN }}
          CI_EVENT: scheduled
-          CI_SHA: ${{ github.sha }}
-          CI_WORKFLOW_REF: ${{ github.workflow_ref }}
          RUNNER_STATUS: ${{ needs.check_runner_status.result }}
          RUNNER_ENV_STATUS: ${{ needs.check_runners.result }}
          SETUP_STATUS: ${{ needs.setup.result }}
        # We pass `needs.setup.outputs.matrix` as the argument. A processing in `notification_service.py` to change
        # `models/bert` to `models_bert` is required, as the artifact names use `_` instead of `/`.
        run: |
-          sudo apt-get install -y curl
          pip install slack_sdk
          pip show slack_sdk
          python utils/notification_service.py "${{ needs.setup.outputs.matrix }}"
-
-      # Upload complete failure tables, as they might be big and only truncated versions could be sent to Slack.
-      - name: Failure table artifacts
-        if: ${{ always() }}
-        uses: actions/upload-artifact@v3
-        with:
-          name: test_failure_tables
-          path: test_failure_tables
--- a/.github/workflows/update_metdata.yml
+++ b/.github/workflows/update_metdata.yml
@ -4,7 +4,7 @@ on:
  push:
    branches:
      - main
-      - update_transformers_metadata*
+      - update_transformers_metadata

 jobs:
  build_and_package:
@ -16,12 +16,25 @@ jobs:
    steps:
      - uses: actions/checkout@v3

+      - name: Load cached virtual environment
+        uses: actions/cache@v2
+        id: cache
+        with:
+          path: ~/venv/
+          key: v3-metadata-${{ hashFiles('setup.py') }}
+
+      - name: Create virtual environment on cache miss
+        if: steps.cache.outputs.cache-hit != 'true'
+        run: |
+          python -m venv ~/venv && . ~/venv/bin/activate
+          pip install --upgrade pip
+
      - name: Setup environment
        run: |
-          pip install --upgrade pip
-          pip install datasets pandas
-          pip install .[torch,tf,flax]
+          . ~/venv/bin/activate
+          pip install git+https://github.com/huggingface/transformers#egg=transformers[dev]

      - name: Update metadata
        run: |
+          . ~/venv/bin/activate
          python utils/update_metadata.py --token ${{ secrets.SYLVAIN_HF_TOKEN }} --commit_sha ${{ github.sha }}
--- a/.github/workflows/update_tiny_models.yml
+++ b/.github/workflows/update_tiny_models.yml
@ -0,0 +1,47 @@
+name: Self-hosted runner (push)
+
+on:
+  push:
+    branches:
+      - update_tiny_models*
+  repository_dispatch:
+  schedule:
+    - cron: "0 2 * * *"
+
+env:
+  TOKEN: ${{ secrets.SYLVAIN_HF_TOKEN }}
+
+jobs:
+  update_tiny_models:
+    name: Update tiny models
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout transformers
+        uses: actions/checkout@v3
+        with:
+          fetch-depth: 2
+
+      - name: Install
+        run: |
+            python -m pip install -U .[dev]
+            python -m pip install -U natten
+
+      - name: Update tiny models
+        run: | 
+          python utils/update_tiny_models.py
+
+      - name: Full report
+        run: cat tiny_models/reports/tiny_model_creation_report.json
+
+      - name: Failure report
+        run: cat tiny_models/reports/simple_failed_report.txt
+
+      - name: Summary report
+        run: cat tiny_models/reports/tiny_model_summary.json
+
+      - name: Test suite reports artifacts
+        if: ${{ always() }}
+        uses: actions/upload-artifact@v3
+        with:
+          name: tiny_model_creation_reports
+          path: tiny_models/reports
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@ -162,16 +162,14 @@ You'll need **[Python 3.7]((https://github.com/huggingface/transformers/blob/mai
   it with `pip uninstall transformers` before reinstalling it in editable
   mode with the `-e` flag.
   
-   Depending on your OS, and since the number of optional dependencies of Transformers is growing, you might get a
-   failure with this command. If that's the case make sure to install the Deep Learning framework you are working with
-   (PyTorch, TensorFlow and/or Flax) then do:
-
+   Depending on your OS, you may need to install some external libraries as well if the `pip` installation fails.
+   
+   For macOS, you will likely need [MeCab](https://taku910.github.io/mecab/) which can be installed from Homebrew:
+   
   ```bash
-   pip install -e ".[quality]"
+   brew install mecab
   ```

-   which should be enough for most use cases.
-
 5. Develop the features on your branch.

   As you work on your code, you should make sure the test suite
--- a/Makefile
+++ b/Makefile
@ -41,16 +41,17 @@ repo-consistency:
 	python utils/check_config_docstrings.py
 	python utils/check_config_attributes.py
 	python utils/check_doctest_list.py
+	python utils/tests_fetcher.py --sanity_check
 	python utils/update_metadata.py --check-only
 	python utils/check_task_guides.py

 # this target runs checks on all files

 quality:
-	black --check $(check_dirs) setup.py conftest.py
+	black --check $(check_dirs)
 	python utils/custom_init_isort.py --check_only
 	python utils/sort_auto_mappings.py --check_only
-	ruff $(check_dirs) setup.py conftest.py
+	ruff $(check_dirs)
 	doc-builder style src/transformers docs/source --max_len 119 --check_only --path_to_docs docs/source
 	python utils/check_doc_toc.py

@ -65,8 +66,8 @@ extra_style_checks:
 # this target runs checks on all files and potentially modifies some of them

 style:
-	black $(check_dirs) setup.py conftest.py
-	ruff $(check_dirs) setup.py conftest.py --fix
+	black $(check_dirs)
+	ruff $(check_dirs) --fix
 	${MAKE} autogenerate_code
 	${MAKE} extra_style_checks

--- a/README.md
+++ b/README.md
@ -301,7 +301,7 @@ Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://h
 1. **[CamemBERT](https://huggingface.co/docs/transformers/model_doc/camembert)** (from Inria/Facebook/Sorbonne) released with the paper [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) by Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot.
 1. **[CANINE](https://huggingface.co/docs/transformers/model_doc/canine)** (from Google Research) released with the paper [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) by Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting.
 1. **[Chinese-CLIP](https://huggingface.co/docs/transformers/model_doc/chinese_clip)** (from OFA-Sys) released with the paper [Chinese CLIP: Contrastive Vision-Language Pretraining in Chinese](https://arxiv.org/abs/2211.01335) by An Yang, Junshu Pan, Junyang Lin, Rui Men, Yichang Zhang, Jingren Zhou, Chang Zhou.
-1. **[CLAP](https://huggingface.co/docs/transformers/model_doc/clap)** (from LAION-AI) released with the paper [Large-scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation](https://arxiv.org/abs/2211.06687) by Yusong Wu, Ke Chen, Tianyu Zhang, Yuchen Hui, Taylor Berg-Kirkpatrick, Shlomo Dubnov.
+1. **[CLAP](https://huggingface.co/docs/transformers/model_doc/clap)** (from LAION-AI) released with the paper [Large-scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation](https://arxiv.org/abs/2211.06687) by Yusong Wu, Ke Chen, Tianyu Zhang, Yuchen Hui, Taylor Berg-Kirkpatrick, Shlomo Dubnov.
 1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever.
 1. **[CLIPSeg](https://huggingface.co/docs/transformers/model_doc/clipseg)** (from University of Göttingen) released with the paper [Image Segmentation Using Text and Image Prompts](https://arxiv.org/abs/2112.10003) by Timo Lüddecke and Alexander Ecker.
 1. **[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (from Salesforce) released with the paper [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) by Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong.
@ -310,7 +310,6 @@ Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://h
 1. **[ConvNeXT](https://huggingface.co/docs/transformers/model_doc/convnext)** (from Facebook AI) released with the paper [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) by Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie.
 1. **[ConvNeXTV2](https://huggingface.co/docs/transformers/model_doc/convnextv2)** (from Facebook AI) released with the paper [ConvNeXt V2: Co-designing and Scaling ConvNets with Masked Autoencoders](https://arxiv.org/abs/2301.00808) by Sanghyun Woo, Shoubhik Debnath, Ronghang Hu, Xinlei Chen, Zhuang Liu, In So Kweon, Saining Xie.
 1. **[CPM](https://huggingface.co/docs/transformers/model_doc/cpm)** (from Tsinghua University) released with the paper [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) by Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun.
-1. **[CPM-Ant](https://huggingface.co/docs/transformers/model_doc/cpmant)** (from OpenBMB) released by the [OpenBMB](https://www.openbmb.org/).
 1. **[CTRL](https://huggingface.co/docs/transformers/model_doc/ctrl)** (from Salesforce) released with the paper [CTRL: A Conditional Transformer Language Model for Controllable Generation](https://arxiv.org/abs/1909.05858) by Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher.
 1. **[CvT](https://huggingface.co/docs/transformers/model_doc/cvt)** (from Microsoft) released with the paper [CvT: Introducing Convolutions to Vision Transformers](https://arxiv.org/abs/2103.15808) by Haiping Wu, Bin Xiao, Noel Codella, Mengchen Liu, Xiyang Dai, Lu Yuan, Lei Zhang.
 1. **[Data2Vec](https://huggingface.co/docs/transformers/model_doc/data2vec)** (from Facebook) released with the paper [Data2Vec:  A General Framework for Self-supervised Learning in Speech, Vision and Language](https://arxiv.org/abs/2202.03555) by Alexei Baevski, Wei-Ning Hsu, Qiantong Xu, Arun Babu, Jiatao Gu, Michael Auli.
@ -319,7 +318,6 @@ Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://h
 1. **[Decision Transformer](https://huggingface.co/docs/transformers/model_doc/decision_transformer)** (from Berkeley/Facebook/Google) released with the paper [Decision Transformer: Reinforcement Learning via Sequence Modeling](https://arxiv.org/abs/2106.01345) by Lili Chen, Kevin Lu, Aravind Rajeswaran, Kimin Lee, Aditya Grover, Michael Laskin, Pieter Abbeel, Aravind Srinivas, Igor Mordatch.
 1. **[Deformable DETR](https://huggingface.co/docs/transformers/model_doc/deformable_detr)** (from SenseTime Research) released with the paper [Deformable DETR: Deformable Transformers for End-to-End Object Detection](https://arxiv.org/abs/2010.04159) by Xizhou Zhu, Weijie Su, Lewei Lu, Bin Li, Xiaogang Wang, Jifeng Dai.
 1. **[DeiT](https://huggingface.co/docs/transformers/model_doc/deit)** (from Facebook) released with the paper [Training data-efficient image transformers & distillation through attention](https://arxiv.org/abs/2012.12877) by Hugo Touvron, Matthieu Cord, Matthijs Douze, Francisco Massa, Alexandre Sablayrolles, Hervé Jégou.
-1. **[DePlot](https://huggingface.co/docs/transformers/model_doc/deplot)** (from Google AI) released with the paper [DePlot: One-shot visual language reasoning by plot-to-table translation](https://arxiv.org/abs/2212.10505) by Fangyu Liu, Julian Martin Eisenschlos, Francesco Piccinno, Syrine Krichene, Chenxi Pang, Kenton Lee, Mandar Joshi, Wenhu Chen, Nigel Collier, Yasemin Altun.
 1. **[DETA](https://huggingface.co/docs/transformers/model_doc/deta)** (from The University of Texas at Austin) released with the paper [NMS Strikes Back](https://arxiv.org/abs/2212.06137) by Jeffrey Ouyang-Zhang, Jang Hyun Cho, Xingyi Zhou, Philipp Krähenbühl.
 1. **[DETR](https://huggingface.co/docs/transformers/model_doc/detr)** (from Facebook) released with the paper [End-to-End Object Detection with Transformers](https://arxiv.org/abs/2005.12872) by Nicolas Carion, Francisco Massa, Gabriel Synnaeve, Nicolas Usunier, Alexander Kirillov, Sergey Zagoruyko.
 1. **[DialoGPT](https://huggingface.co/docs/transformers/model_doc/dialogpt)** (from Microsoft Research) released with the paper [DialoGPT: Large-Scale Generative Pre-training for Conversational Response Generation](https://arxiv.org/abs/1911.00536) by Yizhe Zhang, Siqi Sun, Michel Galley, Yen-Chun Chen, Chris Brockett, Xiang Gao, Jianfeng Gao, Jingjing Liu, Bill Dolan.
@ -341,7 +339,6 @@ Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://h
 1. **[FlauBERT](https://huggingface.co/docs/transformers/model_doc/flaubert)** (from CNRS) released with the paper [FlauBERT: Unsupervised Language Model Pre-training for French](https://arxiv.org/abs/1912.05372) by Hang Le, Loïc Vial, Jibril Frej, Vincent Segonne, Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, Benoît Crabbé, Laurent Besacier, Didier Schwab.
 1. **[FLAVA](https://huggingface.co/docs/transformers/model_doc/flava)** (from Facebook AI) released with the paper [FLAVA: A Foundational Language And Vision Alignment Model](https://arxiv.org/abs/2112.04482) by Amanpreet Singh, Ronghang Hu, Vedanuj Goswami, Guillaume Couairon, Wojciech Galuba, Marcus Rohrbach, and Douwe Kiela.
 1. **[FNet](https://huggingface.co/docs/transformers/model_doc/fnet)** (from Google Research) released with the paper [FNet: Mixing Tokens with Fourier Transforms](https://arxiv.org/abs/2105.03824) by James Lee-Thorp, Joshua Ainslie, Ilya Eckstein, Santiago Ontanon.
-1. **[FocalNet](https://huggingface.co/docs/transformers/model_doc/focalnet)** (from Microsoft Research) released with the paper [Focal Modulation Networks](https://arxiv.org/abs/2203.11926) by Jianwei Yang, Chunyuan Li, Xiyang Dai, Lu Yuan, Jianfeng Gao.
 1. **[Funnel Transformer](https://huggingface.co/docs/transformers/model_doc/funnel)** (from CMU/Google Brain) released with the paper [Funnel-Transformer: Filtering out Sequential Redundancy for Efficient Language Processing](https://arxiv.org/abs/2006.03236) by Zihang Dai, Guokun Lai, Yiming Yang, Quoc V. Le.
 1. **[GIT](https://huggingface.co/docs/transformers/model_doc/git)** (from Microsoft Research) released with the paper [GIT: A Generative Image-to-text Transformer for Vision and Language](https://arxiv.org/abs/2205.14100) by Jianfeng Wang, Zhengyuan Yang, Xiaowei Hu, Linjie Li, Kevin Lin, Zhe Gan, Zicheng Liu, Ce Liu, Lijuan Wang.
 1. **[GLPN](https://huggingface.co/docs/transformers/model_doc/glpn)** (from KAIST) released with the paper [Global-Local Path Networks for Monocular Depth Estimation with Vertical CutDepth](https://arxiv.org/abs/2201.07436) by Doyeon Kim, Woonghyun Ga, Pyungwhan Ahn, Donggyu Joo, Sehwan Chun, Junmo Kim.
@ -352,7 +349,6 @@ Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://h
 1. **[GPT-2](https://huggingface.co/docs/transformers/model_doc/gpt2)** (from OpenAI) released with the paper [Language Models are Unsupervised Multitask Learners](https://blog.openai.com/better-language-models/) by Alec Radford*, Jeffrey Wu*, Rewon Child, David Luan, Dario Amodei** and Ilya Sutskever**.
 1. **[GPT-J](https://huggingface.co/docs/transformers/model_doc/gptj)** (from EleutherAI) released in the repository [kingoflolz/mesh-transformer-jax](https://github.com/kingoflolz/mesh-transformer-jax/) by Ben Wang and Aran Komatsuzaki.
 1. **[GPT-Sw3](https://huggingface.co/docs/transformers/model_doc/gpt-sw3)** (from AI-Sweden) released with the paper [Lessons Learned from GPT-SW3: Building the First Large-Scale Generative Language Model for Swedish](http://www.lrec-conf.org/proceedings/lrec2022/pdf/2022.lrec-1.376.pdf) by Ariel Ekgren, Amaru Cuba Gyllensten, Evangelia Gogoulou, Alice Heiman, Severine Verlinden, Joey Öhman, Fredrik Carlsson, Magnus Sahlgren.
-1. **[GPTBigCode](https://huggingface.co/docs/transformers/model_doc/gpt_bigcode)** (from BigCode) released with the paper [SantaCoder: don't reach for the stars!](https://arxiv.org/abs/2301.03988) by Loubna Ben Allal, Raymond Li, Denis Kocetkov, Chenghao Mou, Christopher Akiki, Carlos Munoz Ferrandis, Niklas Muennighoff, Mayank Mishra, Alex Gu, Manan Dey, Logesh Kumar Umapathi, Carolyn Jane Anderson, Yangtian Zi, Joel Lamy Poirier, Hailey Schoelkopf, Sergey Troshin, Dmitry Abulkhanov, Manuel Romero, Michael Lappert, Francesco De Toni, Bernardo García del Río, Qian Liu, Shamik Bose, Urvashi Bhattacharyya, Terry Yue Zhuo, Ian Yu, Paulo Villegas, Marco Zocca, Sourab Mangrulkar, David Lansky, Huu Nguyen, Danish Contractor, Luis Villa, Jia Li, Dzmitry Bahdanau, Yacine Jernite, Sean Hughes, Daniel Fried, Arjun Guha, Harm de Vries, Leandro von Werra.
 1. **[GPTSAN-japanese](https://huggingface.co/docs/transformers/model_doc/gptsan-japanese)** released in the repository [tanreinama/GPTSAN](https://github.com/tanreinama/GPTSAN/blob/main/report/model.md) by Toshiyuki Sakamoto(tanreinama).
 1. **[Graphormer](https://huggingface.co/docs/transformers/model_doc/graphormer)** (from Microsoft) released with the paper [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234) by Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu.
 1. **[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (from UCSD, NVIDIA) released with the paper [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094) by Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang.
@ -368,7 +364,7 @@ Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://h
 1. **[LED](https://huggingface.co/docs/transformers/model_doc/led)** (from AllenAI) released with the paper [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) by Iz Beltagy, Matthew E. Peters, Arman Cohan.
 1. **[LeViT](https://huggingface.co/docs/transformers/model_doc/levit)** (from Meta AI) released with the paper [LeViT: A Vision Transformer in ConvNet's Clothing for Faster Inference](https://arxiv.org/abs/2104.01136) by Ben Graham, Alaaeldin El-Nouby, Hugo Touvron, Pierre Stock, Armand Joulin, Hervé Jégou, Matthijs Douze.
 1. **[LiLT](https://huggingface.co/docs/transformers/model_doc/lilt)** (from South China University of Technology) released with the paper [LiLT: A Simple yet Effective Language-Independent Layout Transformer for Structured Document Understanding](https://arxiv.org/abs/2202.13669) by Jiapeng Wang, Lianwen Jin, Kai Ding.
-1. **[LLaMA](https://huggingface.co/docs/transformers/model_doc/llama)** (from The FAIR team of Meta AI) released with the paper [LLaMA: Open and Efficient Foundation Language Models](https://arxiv.org/abs/2302.13971) by Hugo Touvron, Thibaut Lavril, Gautier Izacard, Xavier Martinet, Marie-Anne Lachaux, Timothée Lacroix, Baptiste Rozière, Naman Goyal, Eric Hambro, Faisal Azhar, Aurelien Rodriguez, Armand Joulin, Edouard Grave, Guillaume Lample.
+1. **[LLaMA](https://huggingface.co/docs/transformers/main/model_doc/llama)** (from The FAIR team of Meta AI) released with the paper [LLaMA: Open and Efficient Foundation Language Models](https://arxiv.org/abs/2302.13971) by Hugo Touvron, Thibaut Lavril, Gautier Izacard, Xavier Martinet, Marie-Anne Lachaux, Timothée Lacroix, Baptiste Rozière, Naman Goyal, Eric Hambro, Faisal Azhar, Aurelien Rodriguez, Armand Joulin, Edouard Grave, Guillaume Lample.
 1. **[Longformer](https://huggingface.co/docs/transformers/model_doc/longformer)** (from AllenAI) released with the paper [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) by Iz Beltagy, Matthew E. Peters, Arman Cohan.
 1. **[LongT5](https://huggingface.co/docs/transformers/model_doc/longt5)** (from Google AI) released with the paper [LongT5: Efficient Text-To-Text Transformer for Long Sequences](https://arxiv.org/abs/2112.07916) by Mandy Guo, Joshua Ainslie, David Uthus, Santiago Ontanon, Jianmo Ni, Yun-Hsuan Sung, Yinfei Yang.
 1. **[LUKE](https://huggingface.co/docs/transformers/model_doc/luke)** (from Studio Ousia) released with the paper [LUKE: Deep Contextualized Entity Representations with Entity-aware Self-attention](https://arxiv.org/abs/2010.01057) by Ikuya Yamada, Akari Asai, Hiroyuki Shindo, Hideaki Takeda, Yuji Matsumoto.
@ -379,10 +375,9 @@ Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://h
 1. **[MarkupLM](https://huggingface.co/docs/transformers/model_doc/markuplm)** (from Microsoft Research Asia) released with the paper [MarkupLM: Pre-training of Text and Markup Language for Visually-rich Document Understanding](https://arxiv.org/abs/2110.08518) by Junlong Li, Yiheng Xu, Lei Cui, Furu Wei.
 1. **[Mask2Former](https://huggingface.co/docs/transformers/model_doc/mask2former)** (from FAIR and UIUC) released with the paper [Masked-attention Mask Transformer for Universal Image Segmentation](https://arxiv.org/abs/2112.01527) by Bowen Cheng, Ishan Misra, Alexander G. Schwing, Alexander Kirillov, Rohit Girdhar.
 1. **[MaskFormer](https://huggingface.co/docs/transformers/model_doc/maskformer)** (from Meta and UIUC) released with the paper [Per-Pixel Classification is Not All You Need for Semantic Segmentation](https://arxiv.org/abs/2107.06278) by Bowen Cheng, Alexander G. Schwing, Alexander Kirillov.
-1. **[MatCha](https://huggingface.co/docs/transformers/model_doc/matcha)** (from Google AI) released with the paper [MatCha: Enhancing Visual Language Pretraining with Math Reasoning and Chart Derendering](https://arxiv.org/abs/2212.09662) by Fangyu Liu, Francesco Piccinno, Syrine Krichene, Chenxi Pang, Kenton Lee, Mandar Joshi, Yasemin Altun, Nigel Collier, Julian Martin Eisenschlos.
 1. **[mBART](https://huggingface.co/docs/transformers/model_doc/mbart)** (from Facebook) released with the paper [Multilingual Denoising Pre-training for Neural Machine Translation](https://arxiv.org/abs/2001.08210) by Yinhan Liu, Jiatao Gu, Naman Goyal, Xian Li, Sergey Edunov, Marjan Ghazvininejad, Mike Lewis, Luke Zettlemoyer.
 1. **[mBART-50](https://huggingface.co/docs/transformers/model_doc/mbart)** (from Facebook) released with the paper [Multilingual Translation with Extensible Multilingual Pretraining and Finetuning](https://arxiv.org/abs/2008.00401) by Yuqing Tang, Chau Tran, Xian Li, Peng-Jen Chen, Naman Goyal, Vishrav Chaudhary, Jiatao Gu, Angela Fan.
-1. **[MEGA](https://huggingface.co/docs/transformers/model_doc/mega)** (from Facebook) released with the paper [Mega: Moving Average Equipped Gated Attention](https://arxiv.org/abs/2209.10655) by Xuezhe Ma, Chunting Zhou, Xiang Kong, Junxian He, Liangke Gui, Graham Neubig, Jonathan May, and Luke Zettlemoyer.
+1. **[MEGA](https://huggingface.co/docs/transformers/main/model_doc/mega)** (from Facebook) released with the paper [Mega: Moving Average Equipped Gated Attention](https://arxiv.org/abs/2209.10655) by Xuezhe Ma, Chunting Zhou, Xiang Kong, Junxian He, Liangke Gui, Graham Neubig, Jonathan May, and Luke Zettlemoyer.
 1. **[Megatron-BERT](https://huggingface.co/docs/transformers/model_doc/megatron-bert)** (from NVIDIA) released with the paper [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro.
 1. **[Megatron-GPT2](https://huggingface.co/docs/transformers/model_doc/megatron_gpt2)** (from NVIDIA) released with the paper [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro.
 1. **[MGP-STR](https://huggingface.co/docs/transformers/model_doc/mgp-str)** (from Alibaba Research) released with the paper [Multi-Granularity Prediction for Scene Text Recognition](https://arxiv.org/abs/2209.03592) by Peng Wang, Cheng Da, and Cong Yao.
@ -397,17 +392,15 @@ Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://h
 1. **[NAT](https://huggingface.co/docs/transformers/model_doc/nat)** (from SHI Labs) released with the paper [Neighborhood Attention Transformer](https://arxiv.org/abs/2204.07143) by Ali Hassani, Steven Walton, Jiachen Li, Shen Li, and Humphrey Shi.
 1. **[Nezha](https://huggingface.co/docs/transformers/model_doc/nezha)** (from Huawei Noah’s Ark Lab) released with the paper [NEZHA: Neural Contextualized Representation for Chinese Language Understanding](https://arxiv.org/abs/1909.00204) by Junqiu Wei, Xiaozhe Ren, Xiaoguang Li, Wenyong Huang, Yi Liao, Yasheng Wang, Jiashu Lin, Xin Jiang, Xiao Chen and Qun Liu.
 1. **[NLLB](https://huggingface.co/docs/transformers/model_doc/nllb)** (from Meta) released with the paper [No Language Left Behind: Scaling Human-Centered Machine Translation](https://arxiv.org/abs/2207.04672) by the NLLB team.
-1. **[NLLB-MOE](https://huggingface.co/docs/transformers/model_doc/nllb-moe)** (from Meta) released with the paper [No Language Left Behind: Scaling Human-Centered Machine Translation](https://arxiv.org/abs/2207.04672) by the NLLB team.
 1. **[Nyströmformer](https://huggingface.co/docs/transformers/model_doc/nystromformer)** (from the University of Wisconsin - Madison) released with the paper [Nyströmformer: A Nyström-Based Algorithm for Approximating Self-Attention](https://arxiv.org/abs/2102.03902) by Yunyang Xiong, Zhanpeng Zeng, Rudrasis Chakraborty, Mingxing Tan, Glenn Fung, Yin Li, Vikas Singh.
 1. **[OneFormer](https://huggingface.co/docs/transformers/model_doc/oneformer)** (from SHI Labs) released with the paper [OneFormer: One Transformer to Rule Universal Image Segmentation](https://arxiv.org/abs/2211.06220) by Jitesh Jain, Jiachen Li, MangTik Chiu, Ali Hassani, Nikita Orlov, Humphrey Shi.
-1. **[OpenLlama](https://huggingface.co/docs/transformers/model_doc/open-llama)** (from [s-JoL](https://huggingface.co/s-JoL)) released in [Open-Llama](https://github.com/s-JoL/Open-Llama). 
 1. **[OPT](https://huggingface.co/docs/transformers/master/model_doc/opt)** (from Meta AI) released with the paper [OPT: Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068) by Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen et al.
 1. **[OWL-ViT](https://huggingface.co/docs/transformers/model_doc/owlvit)** (from Google AI) released with the paper [Simple Open-Vocabulary Object Detection with Vision Transformers](https://arxiv.org/abs/2205.06230) by Matthias Minderer, Alexey Gritsenko, Austin Stone, Maxim Neumann, Dirk Weissenborn, Alexey Dosovitskiy, Aravindh Mahendran, Anurag Arnab, Mostafa Dehghani, Zhuoran Shen, Xiao Wang, Xiaohua Zhai, Thomas Kipf, and Neil Houlsby.
 1. **[Pegasus](https://huggingface.co/docs/transformers/model_doc/pegasus)** (from Google) released with the paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu.
 1. **[PEGASUS-X](https://huggingface.co/docs/transformers/model_doc/pegasus_x)** (from Google) released with the paper [Investigating Efficiently Extending Transformers for Long Input Summarization](https://arxiv.org/abs/2208.04347) by Jason Phang, Yao Zhao, and Peter J. Liu.
 1. **[Perceiver IO](https://huggingface.co/docs/transformers/model_doc/perceiver)** (from Deepmind) released with the paper [Perceiver IO: A General Architecture for Structured Inputs & Outputs](https://arxiv.org/abs/2107.14795) by Andrew Jaegle, Sebastian Borgeaud, Jean-Baptiste Alayrac, Carl Doersch, Catalin Ionescu, David Ding, Skanda Koppula, Daniel Zoran, Andrew Brock, Evan Shelhamer, Olivier Hénaff, Matthew M. Botvinick, Andrew Zisserman, Oriol Vinyals, João Carreira.
 1. **[PhoBERT](https://huggingface.co/docs/transformers/model_doc/phobert)** (from VinAI Research) released with the paper [PhoBERT: Pre-trained language models for Vietnamese](https://www.aclweb.org/anthology/2020.findings-emnlp.92/) by Dat Quoc Nguyen and Anh Tuan Nguyen.
-1. **[Pix2Struct](https://huggingface.co/docs/transformers/model_doc/pix2struct)** (from Google) released with the paper [Pix2Struct: Screenshot Parsing as Pretraining for Visual Language Understanding](https://arxiv.org/abs/2210.03347) by Kenton Lee, Mandar Joshi, Iulia Turc, Hexiang Hu, Fangyu Liu, Julian Eisenschlos, Urvashi Khandelwal, Peter Shaw, Ming-Wei Chang, Kristina Toutanova.
+1. **[Pix2Struct](https://huggingface.co/docs/transformers/main/model_doc/pix2struct)** (from Google) released with the paper [Pix2Struct: Screenshot Parsing as Pretraining for Visual Language Understanding](https://arxiv.org/abs/2210.03347) by Kenton Lee, Mandar Joshi, Iulia Turc, Hexiang Hu, Fangyu Liu, Julian Eisenschlos, Urvashi Khandelwal, Peter Shaw, Ming-Wei Chang, Kristina Toutanova.
 1. **[PLBart](https://huggingface.co/docs/transformers/model_doc/plbart)** (from UCLA NLP) released with the paper [Unified Pre-training for Program Understanding and Generation](https://arxiv.org/abs/2103.06333) by Wasi Uddin Ahmad, Saikat Chakraborty, Baishakhi Ray, Kai-Wei Chang.
 1. **[PoolFormer](https://huggingface.co/docs/transformers/model_doc/poolformer)** (from Sea AI Labs) released with the paper [MetaFormer is Actually What You Need for Vision](https://arxiv.org/abs/2111.11418) by Yu, Weihao and Luo, Mi and Zhou, Pan and Si, Chenyang and Zhou, Yichen and Wang, Xinchao and Feng, Jiashi and Yan, Shuicheng.
 1. **[ProphetNet](https://huggingface.co/docs/transformers/model_doc/prophetnet)** (from Microsoft Research) released with the paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou.
@ -422,9 +415,7 @@ Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://h
 1. **[RoBERTa-PreLayerNorm](https://huggingface.co/docs/transformers/model_doc/roberta-prelayernorm)** (from Facebook) released with the paper [fairseq: A Fast, Extensible Toolkit for Sequence Modeling](https://arxiv.org/abs/1904.01038) by Myle Ott, Sergey Edunov, Alexei Baevski, Angela Fan, Sam Gross, Nathan Ng, David Grangier, Michael Auli.
 1. **[RoCBert](https://huggingface.co/docs/transformers/model_doc/roc_bert)** (from WeChatAI) released with the paper [RoCBert: Robust Chinese Bert with Multimodal Contrastive Pretraining](https://aclanthology.org/2022.acl-long.65.pdf) by HuiSu, WeiweiShi, XiaoyuShen, XiaoZhou, TuoJi, JiaruiFang, JieZhou.
 1. **[RoFormer](https://huggingface.co/docs/transformers/model_doc/roformer)** (from ZhuiyiTechnology), released together with the paper [RoFormer: Enhanced Transformer with Rotary Position Embedding](https://arxiv.org/abs/2104.09864) by Jianlin Su and Yu Lu and Shengfeng Pan and Bo Wen and Yunfeng Liu.
-1. **[RWKV](https://huggingface.co/docs/transformers/model_doc/rwkv)** (from Bo Peng), released on [this repo](https://github.com/BlinkDL/RWKV-LM) by Bo Peng.
 1. **[SegFormer](https://huggingface.co/docs/transformers/model_doc/segformer)** (from NVIDIA) released with the paper [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://arxiv.org/abs/2105.15203) by Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping Luo.
-1. **[Segment Anything](https://huggingface.co/docs/transformers/model_doc/sam)** (from Meta AI) released with the paper [Segment Anything](https://arxiv.org/pdf/2304.02643v1.pdf) by Alexander Kirillov, Eric Mintun, Nikhila Ravi, Hanzi Mao, Chloe Rolland, Laura Gustafson, Tete Xiao, Spencer Whitehead, Alex Berg, Wan-Yen Lo, Piotr Dollar, Ross Girshick.
 1. **[SEW](https://huggingface.co/docs/transformers/model_doc/sew)** (from ASAPP) released with the paper [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) by Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi.
 1. **[SEW-D](https://huggingface.co/docs/transformers/model_doc/sew_d)** (from ASAPP) released with the paper [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) by Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi.
 1. **[SpeechT5](https://huggingface.co/docs/transformers/model_doc/speecht5)** (from Microsoft Research) released with the paper [SpeechT5: Unified-Modal Encoder-Decoder Pre-Training for Spoken Language Processing](https://arxiv.org/abs/2110.07205) by Junyi Ao, Rui Wang, Long Zhou, Chengyi Wang, Shuo Ren, Yu Wu, Shujie Liu, Tom Ko, Qing Li, Yu Zhang, Zhihua Wei, Yao Qian, Jinyu Li, Furu Wei.
@ -432,7 +423,6 @@ Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://h
 1. **[SpeechToTextTransformer2](https://huggingface.co/docs/transformers/model_doc/speech_to_text_2)** (from Facebook), released together with the paper [Large-Scale Self- and Semi-Supervised Learning for Speech Translation](https://arxiv.org/abs/2104.06678) by Changhan Wang, Anne Wu, Juan Pino, Alexei Baevski, Michael Auli, Alexis Conneau.
 1. **[Splinter](https://huggingface.co/docs/transformers/model_doc/splinter)** (from Tel Aviv University), released together with the paper [Few-Shot Question Answering by Pretraining Span Selection](https://arxiv.org/abs/2101.00438) by Ori Ram, Yuval Kirstain, Jonathan Berant, Amir Globerson, Omer Levy.
 1. **[SqueezeBERT](https://huggingface.co/docs/transformers/model_doc/squeezebert)** (from Berkeley) released with the paper [SqueezeBERT: What can computer vision teach NLP about efficient neural networks?](https://arxiv.org/abs/2006.11316) by Forrest N. Iandola, Albert E. Shaw, Ravi Krishna, and Kurt W. Keutzer.
-1. **[SwiftFormer](https://huggingface.co/docs/transformers/main/model_doc/swiftformer)** (from MBZUAI) released with the paper [SwiftFormer: Efficient Additive Attention for Transformer-based Real-time Mobile Vision Applications](https://arxiv.org/abs/2303.15446) by Abdelrahman Shaker, Muhammad Maaz, Hanoona Rasheed, Salman Khan, Ming-Hsuan Yang, Fahad Shahbaz Khan.
 1. **[Swin Transformer](https://huggingface.co/docs/transformers/model_doc/swin)** (from Microsoft) released with the paper [Swin Transformer: Hierarchical Vision Transformer using Shifted Windows](https://arxiv.org/abs/2103.14030) by Ze Liu, Yutong Lin, Yue Cao, Han Hu, Yixuan Wei, Zheng Zhang, Stephen Lin, Baining Guo.
 1. **[Swin Transformer V2](https://huggingface.co/docs/transformers/model_doc/swinv2)** (from Microsoft) released with the paper [Swin Transformer V2: Scaling Up Capacity and Resolution](https://arxiv.org/abs/2111.09883) by Ze Liu, Han Hu, Yutong Lin, Zhuliang Yao, Zhenda Xie, Yixuan Wei, Jia Ning, Yue Cao, Zheng Zhang, Li Dong, Furu Wei, Baining Guo.
 1. **[Swin2SR](https://huggingface.co/docs/transformers/model_doc/swin2sr)** (from University of Würzburg) released with the paper [Swin2SR: SwinV2 Transformer for Compressed Image Super-Resolution and Restoration](https://arxiv.org/abs/2209.11345) by Marcos V. Conde, Ui-Jin Choi, Maxime Burchi, Radu Timofte.
@ -513,3 +503,4 @@ We now have a [paper](https://www.aclweb.org/anthology/2020.emnlp-demos.6/) you
    pages = "38--45"
 }
 ```
+
--- a/README_es.md
+++ b/README_es.md
@ -289,7 +289,7 @@ Número actual de puntos de control: ![](https://img.shields.io/endpoint?url=htt
 1. **[CamemBERT](https://huggingface.co/docs/transformers/model_doc/camembert)** (from Inria/Facebook/Sorbonne) released with the paper [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) by Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot.
 1. **[CANINE](https://huggingface.co/docs/transformers/model_doc/canine)** (from Google Research) released with the paper [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) by Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting.
 1. **[Chinese-CLIP](https://huggingface.co/docs/transformers/model_doc/chinese_clip)** (from OFA-Sys) released with the paper [Chinese CLIP: Contrastive Vision-Language Pretraining in Chinese](https://arxiv.org/abs/2211.01335) by An Yang, Junshu Pan, Junyang Lin, Rui Men, Yichang Zhang, Jingren Zhou, Chang Zhou.
-1. **[CLAP](https://huggingface.co/docs/transformers/model_doc/clap)** (from LAION-AI) released with the paper [Large-scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation](https://arxiv.org/abs/2211.06687) by Yusong Wu, Ke Chen, Tianyu Zhang, Yuchen Hui, Taylor Berg-Kirkpatrick, Shlomo Dubnov.
+1. **[CLAP](https://huggingface.co/docs/transformers/model_doc/clap)** (from LAION-AI) released with the paper [Large-scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation](https://arxiv.org/abs/2211.06687) by Yusong Wu, Ke Chen, Tianyu Zhang, Yuchen Hui, Taylor Berg-Kirkpatrick, Shlomo Dubnov.
 1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever.
 1. **[CLIPSeg](https://huggingface.co/docs/transformers/model_doc/clipseg)** (from University of Göttingen) released with the paper [Image Segmentation Using Text and Image Prompts](https://arxiv.org/abs/2112.10003) by Timo Lüddecke and Alexander Ecker.
 1. **[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (from Salesforce) released with the paper [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) by Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong.
@ -298,7 +298,6 @@ Número actual de puntos de control: ![](https://img.shields.io/endpoint?url=htt
 1. **[ConvNeXT](https://huggingface.co/docs/transformers/model_doc/convnext)** (from Facebook AI) released with the paper [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) by Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie.
 1. **[ConvNeXTV2](https://huggingface.co/docs/transformers/model_doc/convnextv2)** (from Facebook AI) released with the paper [ConvNeXt V2: Co-designing and Scaling ConvNets with Masked Autoencoders](https://arxiv.org/abs/2301.00808) by Sanghyun Woo, Shoubhik Debnath, Ronghang Hu, Xinlei Chen, Zhuang Liu, In So Kweon, Saining Xie.
 1. **[CPM](https://huggingface.co/docs/transformers/model_doc/cpm)** (from Tsinghua University) released with the paper [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) by Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun.
-1. **[CPM-Ant](https://huggingface.co/docs/transformers/model_doc/cpmant)** (from OpenBMB) released by the [OpenBMB](https://www.openbmb.org/).
 1. **[CTRL](https://huggingface.co/docs/transformers/model_doc/ctrl)** (from Salesforce) released with the paper [CTRL: A Conditional Transformer Language Model for Controllable Generation](https://arxiv.org/abs/1909.05858) by Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher.
 1. **[CvT](https://huggingface.co/docs/transformers/model_doc/cvt)** (from Microsoft) released with the paper [CvT: Introducing Convolutions to Vision Transformers](https://arxiv.org/abs/2103.15808) by Haiping Wu, Bin Xiao, Noel Codella, Mengchen Liu, Xiyang Dai, Lu Yuan, Lei Zhang.
 1. **[Data2Vec](https://huggingface.co/docs/transformers/model_doc/data2vec)** (from Facebook) released with the paper [Data2Vec:  A General Framework for Self-supervised Learning in Speech, Vision and Language](https://arxiv.org/abs/2202.03555) by Alexei Baevski, Wei-Ning Hsu, Qiantong Xu, Arun Babu, Jiatao Gu, Michael Auli.
@ -307,7 +306,6 @@ Número actual de puntos de control: ![](https://img.shields.io/endpoint?url=htt
 1. **[Decision Transformer](https://huggingface.co/docs/transformers/model_doc/decision_transformer)** (from Berkeley/Facebook/Google) released with the paper [Decision Transformer: Reinforcement Learning via Sequence Modeling](https://arxiv.org/abs/2106.01345) by Lili Chen, Kevin Lu, Aravind Rajeswaran, Kimin Lee, Aditya Grover, Michael Laskin, Pieter Abbeel, Aravind Srinivas, Igor Mordatch.
 1. **[Deformable DETR](https://huggingface.co/docs/transformers/model_doc/deformable_detr)** (from SenseTime Research) released with the paper [Deformable DETR: Deformable Transformers for End-to-End Object Detection](https://arxiv.org/abs/2010.04159) by Xizhou Zhu, Weijie Su, Lewei Lu, Bin Li, Xiaogang Wang, Jifeng Dai.
 1. **[DeiT](https://huggingface.co/docs/transformers/model_doc/deit)** (from Facebook) released with the paper [Training data-efficient image transformers & distillation through attention](https://arxiv.org/abs/2012.12877) by Hugo Touvron, Matthieu Cord, Matthijs Douze, Francisco Massa, Alexandre Sablayrolles, Hervé Jégou.
-1. **[DePlot](https://huggingface.co/docs/transformers/model_doc/deplot)** (from Google AI) released with the paper [DePlot: One-shot visual language reasoning by plot-to-table translation](https://arxiv.org/abs/2212.10505) by Fangyu Liu, Julian Martin Eisenschlos, Francesco Piccinno, Syrine Krichene, Chenxi Pang, Kenton Lee, Mandar Joshi, Wenhu Chen, Nigel Collier, Yasemin Altun.
 1. **[DETA](https://huggingface.co/docs/transformers/model_doc/deta)** (from The University of Texas at Austin) released with the paper [NMS Strikes Back](https://arxiv.org/abs/2212.06137) by Jeffrey Ouyang-Zhang, Jang Hyun Cho, Xingyi Zhou, Philipp Krähenbühl.
 1. **[DETR](https://huggingface.co/docs/transformers/model_doc/detr)** (from Facebook) released with the paper [End-to-End Object Detection with Transformers](https://arxiv.org/abs/2005.12872) by Nicolas Carion, Francisco Massa, Gabriel Synnaeve, Nicolas Usunier, Alexander Kirillov, Sergey Zagoruyko.
 1. **[DialoGPT](https://huggingface.co/docs/transformers/model_doc/dialogpt)** (from Microsoft Research) released with the paper [DialoGPT: Large-Scale Generative Pre-training for Conversational Response Generation](https://arxiv.org/abs/1911.00536) by Yizhe Zhang, Siqi Sun, Michel Galley, Yen-Chun Chen, Chris Brockett, Xiang Gao, Jianfeng Gao, Jingjing Liu, Bill Dolan.
@ -322,14 +320,13 @@ Número actual de puntos de control: ![](https://img.shields.io/endpoint?url=htt
 1. **[ELECTRA](https://huggingface.co/docs/transformers/model_doc/electra)** (from Google Research/Stanford University) released with the paper [ELECTRA: Pre-training text encoders as discriminators rather than generators](https://arxiv.org/abs/2003.10555) by Kevin Clark, Minh-Thang Luong, Quoc V. Le, Christopher D. Manning.
 1. **[EncoderDecoder](https://huggingface.co/docs/transformers/model_doc/encoder-decoder)** (from Google Research) released with the paper [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) by Sascha Rothe, Shashi Narayan, Aliaksei Severyn.
 1. **[ERNIE](https://huggingface.co/docs/transformers/model_doc/ernie)** (from Baidu) released with the paper [ERNIE: Enhanced Representation through Knowledge Integration](https://arxiv.org/abs/1904.09223) by Yu Sun, Shuohuan Wang, Yukun Li, Shikun Feng, Xuyi Chen, Han Zhang, Xin Tian, Danxiang Zhu, Hao Tian, Hua Wu.
-1. **[ErnieM](https://huggingface.co/docs/transformers/model_doc/ernie_m)** (from Baidu) released with the paper [ERNIE-M: Enhanced Multilingual Representation by Aligning Cross-lingual Semantics with Monolingual Corpora](https://arxiv.org/abs/2012.15674) by Xuan Ouyang, Shuohuan Wang, Chao Pang, Yu Sun, Hao Tian, Hua Wu, Haifeng Wang.
+1. **[ErnieM](https://huggingface.co/docs/transformers/model_doc/ernie_m)** (from Baidu) released with the paper [ERNIE-M: Enhanced Multilingual Representation by Aligning Cross-lingual Semantics with Monolingual Corpora](https://arxiv.org/abs/2012.15674) by Xuan Ouyang, Shuohuan Wang, Chao Pang, Yu Sun, Hao Tian, Hua Wu, Haifeng Wang. 
 1. **[ESM](https://huggingface.co/docs/transformers/model_doc/esm)** (from Meta AI) are transformer protein language models.  **ESM-1b** was released with the paper [Biological structure and function emerge from scaling unsupervised learning to 250 million protein sequences](https://www.pnas.org/content/118/15/e2016239118) by Alexander Rives, Joshua Meier, Tom Sercu, Siddharth Goyal, Zeming Lin, Jason Liu, Demi Guo, Myle Ott, C. Lawrence Zitnick, Jerry Ma, and Rob Fergus. **ESM-1v** was released with the paper [Language models enable zero-shot prediction of the effects of mutations on protein function](https://doi.org/10.1101/2021.07.09.450648) by Joshua Meier, Roshan Rao, Robert Verkuil, Jason Liu, Tom Sercu and Alexander Rives. **ESM-2** was released with the paper [Language models of protein sequences at the scale of evolution enable accurate structure prediction](https://doi.org/10.1101/2022.07.20.500902) by Zeming Lin, Halil Akin, Roshan Rao, Brian Hie, Zhongkai Zhu, Wenting Lu, Allan dos Santos Costa, Maryam Fazel-Zarandi, Tom Sercu, Sal Candido, Alexander Rives.
 1. **[FLAN-T5](https://huggingface.co/docs/transformers/model_doc/flan-t5)** (from Google AI) released in the repository [google-research/t5x](https://github.com/google-research/t5x/blob/main/docs/models.md#flan-t5-checkpoints) by Hyung Won Chung, Le Hou, Shayne Longpre, Barret Zoph, Yi Tay, William Fedus, Eric Li, Xuezhi Wang, Mostafa Dehghani, Siddhartha Brahma, Albert Webson, Shixiang Shane Gu, Zhuyun Dai, Mirac Suzgun, Xinyun Chen, Aakanksha Chowdhery, Sharan Narang, Gaurav Mishra, Adams Yu, Vincent Zhao, Yanping Huang, Andrew Dai, Hongkun Yu, Slav Petrov, Ed H. Chi, Jeff Dean, Jacob Devlin, Adam Roberts, Denny Zhou, Quoc V. Le, and Jason Wei
-1. **[FLAN-UL2](https://huggingface.co/docs/transformers/model_doc/flan-ul2)** (from Google AI) released in the repository [google-research/t5x](https://github.com/google-research/t5x/blob/main/docs/models.md#flan-ul2-checkpoints) by Hyung Won Chung, Le Hou, Shayne Longpre, Barret Zoph, Yi Tay, William Fedus, Eric Li, Xuezhi Wang, Mostafa Dehghani, Siddhartha Brahma, Albert Webson, Shixiang Shane Gu, Zhuyun Dai, Mirac Suzgun, Xinyun Chen, Aakanksha Chowdhery, Sharan Narang, Gaurav Mishra, Adams Yu, Vincent Zhao, Yanping Huang, Andrew Dai, Hongkun Yu, Slav Petrov, Ed H. Chi, Jeff Dean, Jacob Devlin, Adam Roberts, Denny Zhou, Quoc V. Le, and Jason Wei
+1. **[FLAN-UL2](https://huggingface.co/docs/transformers/model_doc/flan-ul2)** (from Google AI) released in the repository [google-research/t5x](https://github.com/google-research/t5x/blob/main/docs/models.md#flan-ul2-checkpoints) by Hyung Won Chung, Le Hou, Shayne Longpre, Barret Zoph, Yi Tay, William Fedus, Eric Li, Xuezhi Wang, Mostafa Dehghani, Siddhartha Brahma, Albert Webson, Shixiang Shane Gu, Zhuyun Dai, Mirac Suzgun, Xinyun Chen, Aakanksha Chowdhery, Sharan Narang, Gaurav Mishra, Adams Yu, Vincent Zhao, Yanping Huang, Andrew Dai, Hongkun Yu, Slav Petrov, Ed H. Chi, Jeff Dean, Jacob Devlin, Adam Roberts, Denny Zhou, Quoc V. Le, and Jason Wei 
 1. **[FlauBERT](https://huggingface.co/docs/transformers/model_doc/flaubert)** (from CNRS) released with the paper [FlauBERT: Unsupervised Language Model Pre-training for French](https://arxiv.org/abs/1912.05372) by Hang Le, Loïc Vial, Jibril Frej, Vincent Segonne, Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, Benoît Crabbé, Laurent Besacier, Didier Schwab.
 1. **[FLAVA](https://huggingface.co/docs/transformers/model_doc/flava)** (from Facebook AI) released with the paper [FLAVA: A Foundational Language And Vision Alignment Model](https://arxiv.org/abs/2112.04482) by Amanpreet Singh, Ronghang Hu, Vedanuj Goswami, Guillaume Couairon, Wojciech Galuba, Marcus Rohrbach, and Douwe Kiela.
 1. **[FNet](https://huggingface.co/docs/transformers/model_doc/fnet)** (from Google Research) released with the paper [FNet: Mixing Tokens with Fourier Transforms](https://arxiv.org/abs/2105.03824) by James Lee-Thorp, Joshua Ainslie, Ilya Eckstein, Santiago Ontanon.
-1. **[FocalNet](https://huggingface.co/docs/transformers/model_doc/focalnet)** (from Microsoft Research) released with the paper [Focal Modulation Networks](https://arxiv.org/abs/2203.11926) by Jianwei Yang, Chunyuan Li, Xiyang Dai, Lu Yuan, Jianfeng Gao.
 1. **[Funnel Transformer](https://huggingface.co/docs/transformers/model_doc/funnel)** (from CMU/Google Brain) released with the paper [Funnel-Transformer: Filtering out Sequential Redundancy for Efficient Language Processing](https://arxiv.org/abs/2006.03236) by Zihang Dai, Guokun Lai, Yiming Yang, Quoc V. Le.
 1. **[GIT](https://huggingface.co/docs/transformers/model_doc/git)** (from Microsoft Research) released with the paper [GIT: A Generative Image-to-text Transformer for Vision and Language](https://arxiv.org/abs/2205.14100) by Jianfeng Wang, Zhengyuan Yang, Xiaowei Hu, Linjie Li, Kevin Lin, Zhe Gan, Zicheng Liu, Ce Liu, Lijuan Wang.
 1. **[GLPN](https://huggingface.co/docs/transformers/model_doc/glpn)** (from KAIST) released with the paper [Global-Local Path Networks for Monocular Depth Estimation with Vertical CutDepth](https://arxiv.org/abs/2201.07436) by Doyeon Kim, Woonghyun Ga, Pyungwhan Ahn, Donggyu Joo, Sehwan Chun, Junmo Kim.
@ -340,7 +337,6 @@ Número actual de puntos de control: ![](https://img.shields.io/endpoint?url=htt
 1. **[GPT-2](https://huggingface.co/docs/transformers/model_doc/gpt2)** (from OpenAI) released with the paper [Language Models are Unsupervised Multitask Learners](https://blog.openai.com/better-language-models/) by Alec Radford*, Jeffrey Wu*, Rewon Child, David Luan, Dario Amodei** and Ilya Sutskever**.
 1. **[GPT-J](https://huggingface.co/docs/transformers/model_doc/gptj)** (from EleutherAI) released in the repository [kingoflolz/mesh-transformer-jax](https://github.com/kingoflolz/mesh-transformer-jax/) by Ben Wang and Aran Komatsuzaki.
 1. **[GPT-Sw3](https://huggingface.co/docs/transformers/model_doc/gpt-sw3)** (from AI-Sweden) released with the paper [Lessons Learned from GPT-SW3: Building the First Large-Scale Generative Language Model for Swedish](http://www.lrec-conf.org/proceedings/lrec2022/pdf/2022.lrec-1.376.pdf) by Ariel Ekgren, Amaru Cuba Gyllensten, Evangelia Gogoulou, Alice Heiman, Severine Verlinden, Joey Öhman, Fredrik Carlsson, Magnus Sahlgren.
-1. **[GPTBigCode](https://huggingface.co/docs/transformers/model_doc/gpt_bigcode)** (from BigCode) released with the paper [SantaCoder: don't reach for the stars!](https://arxiv.org/abs/2301.03988) by Loubna Ben Allal, Raymond Li, Denis Kocetkov, Chenghao Mou, Christopher Akiki, Carlos Munoz Ferrandis, Niklas Muennighoff, Mayank Mishra, Alex Gu, Manan Dey, Logesh Kumar Umapathi, Carolyn Jane Anderson, Yangtian Zi, Joel Lamy Poirier, Hailey Schoelkopf, Sergey Troshin, Dmitry Abulkhanov, Manuel Romero, Michael Lappert, Francesco De Toni, Bernardo García del Río, Qian Liu, Shamik Bose, Urvashi Bhattacharyya, Terry Yue Zhuo, Ian Yu, Paulo Villegas, Marco Zocca, Sourab Mangrulkar, David Lansky, Huu Nguyen, Danish Contractor, Luis Villa, Jia Li, Dzmitry Bahdanau, Yacine Jernite, Sean Hughes, Daniel Fried, Arjun Guha, Harm de Vries, Leandro von Werra.
 1. **[GPTSAN-japanese](https://huggingface.co/docs/transformers/model_doc/gptsan-japanese)** released in the repository [tanreinama/GPTSAN](https://github.com/tanreinama/GPTSAN/blob/main/report/model.md) by Toshiyuki Sakamoto(tanreinama).
 1. **[Graphormer](https://huggingface.co/docs/transformers/model_doc/graphormer)** (from Microsoft) released with the paper [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234) by Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu.
 1. **[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (from UCSD, NVIDIA) released with the paper [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094) by Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang.
@ -356,7 +352,7 @@ Número actual de puntos de control: ![](https://img.shields.io/endpoint?url=htt
 1. **[LED](https://huggingface.co/docs/transformers/model_doc/led)** (from AllenAI) released with the paper [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) by Iz Beltagy, Matthew E. Peters, Arman Cohan.
 1. **[LeViT](https://huggingface.co/docs/transformers/model_doc/levit)** (from Meta AI) released with the paper [LeViT: A Vision Transformer in ConvNet's Clothing for Faster Inference](https://arxiv.org/abs/2104.01136) by Ben Graham, Alaaeldin El-Nouby, Hugo Touvron, Pierre Stock, Armand Joulin, Hervé Jégou, Matthijs Douze.
 1. **[LiLT](https://huggingface.co/docs/transformers/model_doc/lilt)** (from South China University of Technology) released with the paper [LiLT: A Simple yet Effective Language-Independent Layout Transformer for Structured Document Understanding](https://arxiv.org/abs/2202.13669) by Jiapeng Wang, Lianwen Jin, Kai Ding.
-1. **[LLaMA](https://huggingface.co/docs/transformers/model_doc/llama)** (from The FAIR team of Meta AI) released with the paper [LLaMA: Open and Efficient Foundation Language Models](https://arxiv.org/abs/2302.13971) by Hugo Touvron, Thibaut Lavril, Gautier Izacard, Xavier Martinet, Marie-Anne Lachaux, Timothée Lacroix, Baptiste Rozière, Naman Goyal, Eric Hambro, Faisal Azhar, Aurelien Rodriguez, Armand Joulin, Edouard Grave, Guillaume Lample.
+1. **[LLaMA](https://huggingface.co/docs/transformers/main/model_doc/llama)** (from The FAIR team of Meta AI) released with the paper [LLaMA: Open and Efficient Foundation Language Models](https://arxiv.org/abs/2302.13971) by Hugo Touvron, Thibaut Lavril, Gautier Izacard, Xavier Martinet, Marie-Anne Lachaux, Timothée Lacroix, Baptiste Rozière, Naman Goyal, Eric Hambro, Faisal Azhar, Aurelien Rodriguez, Armand Joulin, Edouard Grave, Guillaume Lample.
 1. **[Longformer](https://huggingface.co/docs/transformers/model_doc/longformer)** (from AllenAI) released with the paper [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) by Iz Beltagy, Matthew E. Peters, Arman Cohan.
 1. **[LongT5](https://huggingface.co/docs/transformers/model_doc/longt5)** (from Google AI) released with the paper [LongT5: Efficient Text-To-Text Transformer for Long Sequences](https://arxiv.org/abs/2112.07916) by Mandy Guo, Joshua Ainslie, David Uthus, Santiago Ontanon, Jianmo Ni, Yun-Hsuan Sung, Yinfei Yang.
 1. **[LUKE](https://huggingface.co/docs/transformers/model_doc/luke)** (from Studio Ousia) released with the paper [LUKE: Deep Contextualized Entity Representations with Entity-aware Self-attention](https://arxiv.org/abs/2010.01057) by Ikuya Yamada, Akari Asai, Hiroyuki Shindo, Hideaki Takeda, Yuji Matsumoto.
@ -367,10 +363,9 @@ Número actual de puntos de control: ![](https://img.shields.io/endpoint?url=htt
 1. **[MarkupLM](https://huggingface.co/docs/transformers/model_doc/markuplm)** (from Microsoft Research Asia) released with the paper [MarkupLM: Pre-training of Text and Markup Language for Visually-rich Document Understanding](https://arxiv.org/abs/2110.08518) by Junlong Li, Yiheng Xu, Lei Cui, Furu Wei.
 1. **[Mask2Former](https://huggingface.co/docs/transformers/model_doc/mask2former)** (from FAIR and UIUC) released with the paper [Masked-attention Mask Transformer for Universal Image Segmentation](https://arxiv.org/abs/2112.01527) by Bowen Cheng, Ishan Misra, Alexander G. Schwing, Alexander Kirillov, Rohit Girdhar.
 1. **[MaskFormer](https://huggingface.co/docs/transformers/model_doc/maskformer)** (from Meta and UIUC) released with the paper [Per-Pixel Classification is Not All You Need for Semantic Segmentation](https://arxiv.org/abs/2107.06278) by Bowen Cheng, Alexander G. Schwing, Alexander Kirillov.
-1. **[MatCha](https://huggingface.co/docs/transformers/model_doc/matcha)** (from Google AI) released with the paper [MatCha: Enhancing Visual Language Pretraining with Math Reasoning and Chart Derendering](https://arxiv.org/abs/2212.09662) by Fangyu Liu, Francesco Piccinno, Syrine Krichene, Chenxi Pang, Kenton Lee, Mandar Joshi, Yasemin Altun, Nigel Collier, Julian Martin Eisenschlos.
 1. **[mBART](https://huggingface.co/docs/transformers/model_doc/mbart)** (from Facebook) released with the paper [Multilingual Denoising Pre-training for Neural Machine Translation](https://arxiv.org/abs/2001.08210) by Yinhan Liu, Jiatao Gu, Naman Goyal, Xian Li, Sergey Edunov, Marjan Ghazvininejad, Mike Lewis, Luke Zettlemoyer.
 1. **[mBART-50](https://huggingface.co/docs/transformers/model_doc/mbart)** (from Facebook) released with the paper [Multilingual Translation with Extensible Multilingual Pretraining and Finetuning](https://arxiv.org/abs/2008.00401) by Yuqing Tang, Chau Tran, Xian Li, Peng-Jen Chen, Naman Goyal, Vishrav Chaudhary, Jiatao Gu, Angela Fan.
-1. **[MEGA](https://huggingface.co/docs/transformers/model_doc/mega)** (from Facebook) released with the paper [Mega: Moving Average Equipped Gated Attention](https://arxiv.org/abs/2209.10655) by Xuezhe Ma, Chunting Zhou, Xiang Kong, Junxian He, Liangke Gui, Graham Neubig, Jonathan May, and Luke Zettlemoyer.
+1. **[MEGA](https://huggingface.co/docs/transformers/main/model_doc/mega)** (from Facebook) released with the paper [Mega: Moving Average Equipped Gated Attention](https://arxiv.org/abs/2209.10655) by Xuezhe Ma, Chunting Zhou, Xiang Kong, Junxian He, Liangke Gui, Graham Neubig, Jonathan May, and Luke Zettlemoyer.
 1. **[Megatron-BERT](https://huggingface.co/docs/transformers/model_doc/megatron-bert)** (from NVIDIA) released with the paper [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro.
 1. **[Megatron-GPT2](https://huggingface.co/docs/transformers/model_doc/megatron_gpt2)** (from NVIDIA) released with the paper [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro.
 1. **[MGP-STR](https://huggingface.co/docs/transformers/model_doc/mgp-str)** (from Alibaba Research) released with the paper [Multi-Granularity Prediction for Scene Text Recognition](https://arxiv.org/abs/2209.03592) by Peng Wang, Cheng Da, and Cong Yao.
@ -385,17 +380,15 @@ Número actual de puntos de control: ![](https://img.shields.io/endpoint?url=htt
 1. **[NAT](https://huggingface.co/docs/transformers/model_doc/nat)** (from SHI Labs) released with the paper [Neighborhood Attention Transformer](https://arxiv.org/abs/2204.07143) by Ali Hassani, Steven Walton, Jiachen Li, Shen Li, and Humphrey Shi.
 1. **[Nezha](https://huggingface.co/docs/transformers/model_doc/nezha)** (from Huawei Noah’s Ark Lab) released with the paper [NEZHA: Neural Contextualized Representation for Chinese Language Understanding](https://arxiv.org/abs/1909.00204) by Junqiu Wei, Xiaozhe Ren, Xiaoguang Li, Wenyong Huang, Yi Liao, Yasheng Wang, Jiashu Lin, Xin Jiang, Xiao Chen and Qun Liu.
 1. **[NLLB](https://huggingface.co/docs/transformers/model_doc/nllb)** (from Meta) released with the paper [No Language Left Behind: Scaling Human-Centered Machine Translation](https://arxiv.org/abs/2207.04672) by the NLLB team.
-1. **[NLLB-MOE](https://huggingface.co/docs/transformers/model_doc/nllb-moe)** (from Meta) released with the paper [No Language Left Behind: Scaling Human-Centered Machine Translation](https://arxiv.org/abs/2207.04672) by the NLLB team.
 1. **[Nyströmformer](https://huggingface.co/docs/transformers/model_doc/nystromformer)** (from the University of Wisconsin - Madison) released with the paper [Nyströmformer: A Nyström-Based Algorithm for Approximating Self-Attention](https://arxiv.org/abs/2102.03902) by Yunyang Xiong, Zhanpeng Zeng, Rudrasis Chakraborty, Mingxing Tan, Glenn Fung, Yin Li, Vikas Singh.
 1. **[OneFormer](https://huggingface.co/docs/transformers/model_doc/oneformer)** (from SHI Labs) released with the paper [OneFormer: One Transformer to Rule Universal Image Segmentation](https://arxiv.org/abs/2211.06220) by Jitesh Jain, Jiachen Li, MangTik Chiu, Ali Hassani, Nikita Orlov, Humphrey Shi.
-1. **[OpenLlama](https://huggingface.co/docs/transformers/model_doc/open-llama)** (from [s-JoL](https://huggingface.co/s-JoL)) released in [Open-Llama](https://github.com/s-JoL/Open-Llama).
 1. **[OPT](https://huggingface.co/docs/transformers/master/model_doc/opt)** (from Meta AI) released with the paper [OPT: Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068) by Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen et al.
 1. **[OWL-ViT](https://huggingface.co/docs/transformers/model_doc/owlvit)** (from Google AI) released with the paper [Simple Open-Vocabulary Object Detection with Vision Transformers](https://arxiv.org/abs/2205.06230) by Matthias Minderer, Alexey Gritsenko, Austin Stone, Maxim Neumann, Dirk Weissenborn, Alexey Dosovitskiy, Aravindh Mahendran, Anurag Arnab, Mostafa Dehghani, Zhuoran Shen, Xiao Wang, Xiaohua Zhai, Thomas Kipf, and Neil Houlsby.
 1. **[Pegasus](https://huggingface.co/docs/transformers/model_doc/pegasus)** (from Google) released with the paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu.
 1. **[PEGASUS-X](https://huggingface.co/docs/transformers/model_doc/pegasus_x)** (from Google) released with the paper [Investigating Efficiently Extending Transformers for Long Input Summarization](https://arxiv.org/abs/2208.04347) by Jason Phang, Yao Zhao, and Peter J. Liu.
 1. **[Perceiver IO](https://huggingface.co/docs/transformers/model_doc/perceiver)** (from Deepmind) released with the paper [Perceiver IO: A General Architecture for Structured Inputs & Outputs](https://arxiv.org/abs/2107.14795) by Andrew Jaegle, Sebastian Borgeaud, Jean-Baptiste Alayrac, Carl Doersch, Catalin Ionescu, David Ding, Skanda Koppula, Daniel Zoran, Andrew Brock, Evan Shelhamer, Olivier Hénaff, Matthew M. Botvinick, Andrew Zisserman, Oriol Vinyals, João Carreira.
 1. **[PhoBERT](https://huggingface.co/docs/transformers/model_doc/phobert)** (from VinAI Research) released with the paper [PhoBERT: Pre-trained language models for Vietnamese](https://www.aclweb.org/anthology/2020.findings-emnlp.92/) by Dat Quoc Nguyen and Anh Tuan Nguyen.
-1. **[Pix2Struct](https://huggingface.co/docs/transformers/model_doc/pix2struct)** (from Google) released with the paper [Pix2Struct: Screenshot Parsing as Pretraining for Visual Language Understanding](https://arxiv.org/abs/2210.03347) by Kenton Lee, Mandar Joshi, Iulia Turc, Hexiang Hu, Fangyu Liu, Julian Eisenschlos, Urvashi Khandelwal, Peter Shaw, Ming-Wei Chang, Kristina Toutanova.
+1. **[Pix2Struct](https://huggingface.co/docs/transformers/main/model_doc/pix2struct)** (from Google) released with the paper [Pix2Struct: Screenshot Parsing as Pretraining for Visual Language Understanding](https://arxiv.org/abs/2210.03347) by Kenton Lee, Mandar Joshi, Iulia Turc, Hexiang Hu, Fangyu Liu, Julian Eisenschlos, Urvashi Khandelwal, Peter Shaw, Ming-Wei Chang, Kristina Toutanova.
 1. **[PLBart](https://huggingface.co/docs/transformers/model_doc/plbart)** (from UCLA NLP) released with the paper [Unified Pre-training for Program Understanding and Generation](https://arxiv.org/abs/2103.06333) by Wasi Uddin Ahmad, Saikat Chakraborty, Baishakhi Ray, Kai-Wei Chang.
 1. **[PoolFormer](https://huggingface.co/docs/transformers/model_doc/poolformer)** (from Sea AI Labs) released with the paper [MetaFormer is Actually What You Need for Vision](https://arxiv.org/abs/2111.11418) by Yu, Weihao and Luo, Mi and Zhou, Pan and Si, Chenyang and Zhou, Yichen and Wang, Xinchao and Feng, Jiashi and Yan, Shuicheng.
 1. **[ProphetNet](https://huggingface.co/docs/transformers/model_doc/prophetnet)** (from Microsoft Research) released with the paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou.
@ -410,9 +403,7 @@ Número actual de puntos de control: ![](https://img.shields.io/endpoint?url=htt
 1. **[RoBERTa-PreLayerNorm](https://huggingface.co/docs/transformers/model_doc/roberta-prelayernorm)** (from Facebook) released with the paper [fairseq: A Fast, Extensible Toolkit for Sequence Modeling](https://arxiv.org/abs/1904.01038) by Myle Ott, Sergey Edunov, Alexei Baevski, Angela Fan, Sam Gross, Nathan Ng, David Grangier, Michael Auli.
 1. **[RoCBert](https://huggingface.co/docs/transformers/model_doc/roc_bert)** (from WeChatAI) released with the paper [RoCBert: Robust Chinese Bert with Multimodal Contrastive Pretraining](https://aclanthology.org/2022.acl-long.65.pdf) by HuiSu, WeiweiShi, XiaoyuShen, XiaoZhou, TuoJi, JiaruiFang, JieZhou.
 1. **[RoFormer](https://huggingface.co/docs/transformers/model_doc/roformer)** (from ZhuiyiTechnology), released together with the paper [RoFormer: Enhanced Transformer with Rotary Position Embedding](https://arxiv.org/abs/2104.09864) by Jianlin Su and Yu Lu and Shengfeng Pan and Bo Wen and Yunfeng Liu.
-1. **[RWKV](https://huggingface.co/docs/transformers/model_doc/rwkv)** (from Bo Peng) released with the paper [this repo](https://github.com/BlinkDL/RWKV-LM) by Bo Peng.
 1. **[SegFormer](https://huggingface.co/docs/transformers/model_doc/segformer)** (from NVIDIA) released with the paper [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://arxiv.org/abs/2105.15203) by Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping Luo.
-1. **[Segment Anything](https://huggingface.co/docs/transformers/model_doc/sam)** (from Meta AI) released with the paper [Segment Anything](https://arxiv.org/pdf/2304.02643v1.pdf) by Alexander Kirillov, Eric Mintun, Nikhila Ravi, Hanzi Mao, Chloe Rolland, Laura Gustafson, Tete Xiao, Spencer Whitehead, Alex Berg, Wan-Yen Lo, Piotr Dollar, Ross Girshick.
 1. **[SEW](https://huggingface.co/docs/transformers/model_doc/sew)** (from ASAPP) released with the paper [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) by Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi.
 1. **[SEW-D](https://huggingface.co/docs/transformers/model_doc/sew_d)** (from ASAPP) released with the paper [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) by Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi.
 1. **[SpeechT5](https://huggingface.co/docs/transformers/model_doc/speecht5)** (from Microsoft Research) released with the paper [SpeechT5: Unified-Modal Encoder-Decoder Pre-Training for Spoken Language Processing](https://arxiv.org/abs/2110.07205) by Junyi Ao, Rui Wang, Long Zhou, Chengyi Wang, Shuo Ren, Yu Wu, Shujie Liu, Tom Ko, Qing Li, Yu Zhang, Zhihua Wei, Yao Qian, Jinyu Li, Furu Wei.
@ -420,7 +411,6 @@ Número actual de puntos de control: ![](https://img.shields.io/endpoint?url=htt
 1. **[SpeechToTextTransformer2](https://huggingface.co/docs/transformers/model_doc/speech_to_text_2)** (from Facebook), released together with the paper [Large-Scale Self- and Semi-Supervised Learning for Speech Translation](https://arxiv.org/abs/2104.06678) by Changhan Wang, Anne Wu, Juan Pino, Alexei Baevski, Michael Auli, Alexis Conneau.
 1. **[Splinter](https://huggingface.co/docs/transformers/model_doc/splinter)** (from Tel Aviv University), released together with the paper [Few-Shot Question Answering by Pretraining Span Selection](https://arxiv.org/abs/2101.00438) by Ori Ram, Yuval Kirstain, Jonathan Berant, Amir Globerson, Omer Levy.
 1. **[SqueezeBERT](https://huggingface.co/docs/transformers/model_doc/squeezebert)** (from Berkeley) released with the paper [SqueezeBERT: What can computer vision teach NLP about efficient neural networks?](https://arxiv.org/abs/2006.11316) by Forrest N. Iandola, Albert E. Shaw, Ravi Krishna, and Kurt W. Keutzer.
-1. **[SwiftFormer](https://huggingface.co/docs/transformers/main/model_doc/swiftformer)** (from MBZUAI) released with the paper [SwiftFormer: Efficient Additive Attention for Transformer-based Real-time Mobile Vision Applications](https://arxiv.org/abs/2303.15446) by Abdelrahman Shaker, Muhammad Maaz, Hanoona Rasheed, Salman Khan, Ming-Hsuan Yang, Fahad Shahbaz Khan.
 1. **[Swin Transformer](https://huggingface.co/docs/transformers/model_doc/swin)** (from Microsoft) released with the paper [Swin Transformer: Hierarchical Vision Transformer using Shifted Windows](https://arxiv.org/abs/2103.14030) by Ze Liu, Yutong Lin, Yue Cao, Han Hu, Yixuan Wei, Zheng Zhang, Stephen Lin, Baining Guo.
 1. **[Swin Transformer V2](https://huggingface.co/docs/transformers/model_doc/swinv2)** (from Microsoft) released with the paper [Swin Transformer V2: Scaling Up Capacity and Resolution](https://arxiv.org/abs/2111.09883) by Ze Liu, Han Hu, Yutong Lin, Zhuliang Yao, Zhenda Xie, Yixuan Wei, Jia Ning, Yue Cao, Zheng Zhang, Li Dong, Furu Wei, Baining Guo.
 1. **[Swin2SR](https://huggingface.co/docs/transformers/model_doc/swin2sr)** (from University of Würzburg) released with the paper [Swin2SR: SwinV2 Transformer for Compressed Image Super-Resolution and Restoration](https://arxiv.org/abs/2209.11345) by Marcos V. Conde, Ui-Jin Choi, Maxime Burchi, Radu Timofte.
--- a/README_hd.md
+++ b/README_hd.md
@ -261,7 +261,7 @@ conda install -c huggingface transformers
 1. **[CamemBERT](https://huggingface.co/docs/transformers/model_doc/camembert)** (इनरिया/फेसबुक/सोरबोन से) साथ में कागज [CamemBERT: एक टेस्टी फ्रेंच लैंग्वेज मॉडल](https:// arxiv.org/abs/1911.03894) लुई मार्टिन*, बेंजामिन मुलर*, पेड्रो जेवियर ऑर्टिज़ सुआरेज़*, योआन ड्यूपॉन्ट, लॉरेंट रोमरी, एरिक विलेमोन्टे डे ला क्लर्जरी, जैमे सेडाह और बेनोइट सगोट द्वारा।
 1. **[CANINE](https://huggingface.co/docs/transformers/model_doc/canine)** (Google रिसर्च से) साथ में दिया गया पेपर [कैनाइन: प्री-ट्रेनिंग ए एफिशिएंट टोकनाइजेशन-फ्री एनकोडर फॉर लैंग्वेज रिप्रेजेंटेशन]( https://arxiv.org/abs/2103.06874) जोनाथन एच क्लार्क, डैन गैरेट, यूलिया टर्क, जॉन विएटिंग द्वारा।
 1. **[Chinese-CLIP](https://huggingface.co/docs/transformers/model_doc/chinese_clip)** (from OFA-Sys) released with the paper [Chinese CLIP: Contrastive Vision-Language Pretraining in Chinese](https://arxiv.org/abs/2211.01335) by An Yang, Junshu Pan, Junyang Lin, Rui Men, Yichang Zhang, Jingren Zhou, Chang Zhou.
-1. **[CLAP](https://huggingface.co/docs/transformers/model_doc/clap)** (LAION-AI से) Yusong Wu, Ke Chen, Tianyu Zhang, Yuchen Hui, Taylor Berg-Kirkpatrick, Shlomo Dubnov. द्वाराअनुसंधान पत्र [Large-scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation](https://arxiv.org/abs/2211.06687) के साथ जारी किया गया
+1. **[CLAP](https://huggingface.co/docs/transformers/model_doc/clap)** (LAION-AI से) Yusong Wu, Ke Chen, Tianyu Zhang, Yuchen Hui, Taylor Berg-Kirkpatrick, Shlomo Dubnov. द्वारा अनुसंधान पत्र [Large-scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation](https://arxiv.org/abs/2211.06687) के साथ जारी किया गया
 1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (OpenAI से) साथ वाला पेपर [लर्निंग ट्रांसफरेबल विजुअल मॉडल फ्रॉम नेचुरल लैंग्वेज सुपरविजन](https://arxiv.org /abs/2103.00020) एलेक रैडफोर्ड, जोंग वूक किम, क्रिस हैलासी, आदित्य रमेश, गेब्रियल गोह, संध्या अग्रवाल, गिरीश शास्त्री, अमांडा एस्केल, पामेला मिश्किन, जैक क्लार्क, ग्रेचेन क्रुएगर, इल्या सुत्स्केवर द्वारा।
 1. **[CLIPSeg](https://huggingface.co/docs/transformers/model_doc/clipseg)** (from University of Göttingen) released with the paper [Image Segmentation Using Text and Image Prompts](https://arxiv.org/abs/2112.10003) by Timo Lüddecke and Alexander Ecker.
 1. **[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (सेल्सफोर्स से) साथ में पेपर [प्रोग्राम सिंथेसिस के लिए एक संवादात्मक प्रतिमान](https://arxiv.org/abs/2203.13474) एरिक निजकैंप, बो पैंग, हिरोआकी हयाशी, लिफू तू, हुआन वांग, यिंगबो झोउ, सिल्वियो सावरेस, कैमिंग जिओंग रिलीज।
@ -270,7 +270,6 @@ conda install -c huggingface transformers
 1. **[ConvNeXT](https://huggingface.co/docs/transformers/model_doc/convnext)** (Facebook AI से) साथ वाला पेपर [A ConvNet for the 2020s](https://arxiv.org/abs /2201.03545) ज़ुआंग लियू, हेंज़ी माओ, चाओ-युआन वू, क्रिस्टोफ़ फीचटेनहोफ़र, ट्रेवर डेरेल, सैनिंग ज़ी द्वारा।
 1. **[ConvNeXTV2](https://huggingface.co/docs/transformers/model_doc/convnextv2)** (from Facebook AI) released with the paper [ConvNeXt V2: Co-designing and Scaling ConvNets with Masked Autoencoders](https://arxiv.org/abs/2301.00808) by Sanghyun Woo, Shoubhik Debnath, Ronghang Hu, Xinlei Chen, Zhuang Liu, In So Kweon, Saining Xie.
 1. **[CPM](https://huggingface.co/docs/transformers/model_doc/cpm)** (सिंघुआ यूनिवर्सिटी से) साथ में पेपर [सीपीएम: ए लार्ज-स्केल जेनेरेटिव चाइनीज प्री-ट्रेंड लैंग्वेज मॉडल](https : //arxiv.org/abs/2012.00413) झेंग्यान झांग, जू हान, हाओ झोउ, पेई के, युक्सियन गु, डेमिंग ये, युजिया किन, युशेंग सु, हाओझे जी, जियान गुआन, फैंचाओ क्यूई, ज़ियाओझी वांग, यानान झेंग द्वारा , गुओयांग ज़ेंग, हुआनकी काओ, शेंगकी चेन, डाइक्सुआन ली, ज़ेनबो सन, ज़ियुआन लियू, मिनली हुआंग, वेंटाओ हान, जी तांग, जुआनज़ी ली, ज़ियाओयान झू, माओसोंग सन।
-1. **[CPM-Ant](https://huggingface.co/docs/transformers/model_doc/cpmant)** (from OpenBMB) released by the [OpenBMB](https://www.openbmb.org/).
 1. **[CTRL](https://huggingface.co/docs/transformers/model_doc/ctrl)** (सेल्सफोर्स से) साथ में पेपर [CTRL: ए कंडिशनल ट्रांसफॉर्मर लैंग्वेज मॉडल फॉर कंट्रोलेबल जेनरेशन](https://arxiv.org/abs/1909.05858) नीतीश शिरीष केसकर*, ब्रायन मैककैन*, लव आर. वार्ष्णेय, कैमिंग जिओंग और रिचर्ड द्वारा सोचर द्वारा जारी किया गया।
 1. **[CvT](https://huggingface.co/docs/transformers/model_doc/cvt)** (Microsoft से) साथ में दिया गया पेपर [CvT: इंट्रोड्यूसिंग कनवॉल्यूशन टू विजन ट्रांसफॉर्मर्स](https://arxiv.org/ एब्स/2103.15808) हैपिंग वू, बिन जिओ, नोएल कोडेला, मेंगचेन लियू, जियांग दाई, लू युआन, लेई झांग द्वारा।
 1. **[Data2Vec](https://huggingface.co/docs/transformers/model_doc/data2vec)** (फेसबुक से) साथ में कागज [Data2Vec: भाषण, दृष्टि और भाषा में स्व-पर्यवेक्षित सीखने के लिए एक सामान्य ढांचा] (https://arxiv.org/abs/2202.03555) एलेक्सी बाएव्स्की, वेई-निंग सू, कियानटोंग जू, अरुण बाबू, जियाताओ गु, माइकल औली द्वारा पोस्ट किया गया।
@ -279,7 +278,6 @@ conda install -c huggingface transformers
 1. **[Decision Transformer](https://huggingface.co/docs/transformers/model_doc/decision_transformer)** (बर्कले/फेसबुक/गूगल से) पेपर के साथ [डिसीजन ट्रांसफॉर्मर: रीनफोर्समेंट लर्निंग वाया सीक्वेंस मॉडलिंग](https : //arxiv.org/abs/2106.01345) लिली चेन, केविन लू, अरविंद राजेश्वरन, किमिन ली, आदित्य ग्रोवर, माइकल लास्किन, पीटर एबील, अरविंद श्रीनिवास, इगोर मोर्डच द्वारा पोस्ट किया गया।
 1. **[Deformable DETR](https://huggingface.co/docs/transformers/model_doc/deformable_detr)** (सेंसटाइम रिसर्च से) साथ में पेपर [डिफॉर्मेबल डीईटीआर: डिफॉर्मेबल ट्रांसफॉर्मर्स फॉर एंड-टू-एंड ऑब्जेक्ट डिटेक्शन] (https://arxiv.org/abs/2010.04159) Xizhou Zhu, Weijie Su, Lewei Lu, Bin Li, Xiaogang Wang, जिफेंग दाई द्वारा पोस्ट किया गया।
 1. **[DeiT](https://huggingface.co/docs/transformers/model_doc/deit)** (फेसबुक से) साथ में पेपर [ट्रेनिंग डेटा-एफिशिएंट इमेज ट्रांसफॉर्मर और डिस्टिलेशन थ्रू अटेंशन](https://arxiv .org/abs/2012.12877) ह्यूगो टौव्रोन, मैथ्यू कॉर्ड, मैथिज्स डूज़, फ़्रांसिस्को मस्सा, एलेक्ज़ेंडर सबलेरोल्स, हर्वे जेगौ द्वारा।
-1. **[DePlot](https://huggingface.co/docs/transformers/model_doc/deplot)** (Google AI से) Fangyu Liu, Julian Martin Eisenschlos, Francesco Piccinno, Syrine Krichene, Chenxi Pang, Kenton Lee, Mandar Joshi, Wenhu Chen, Nigel Collier, Yasemin Altun. द्वाराअनुसंधान पत्र [DePlot: One-shot visual language reasoning by plot-to-table translation](https://arxiv.org/abs/2212.10505) के साथ जारी किया गया
 1. **[DETA](https://huggingface.co/docs/transformers/model_doc/deta)** (from The University of Texas at Austin) released with the paper [NMS Strikes Back](https://arxiv.org/abs/2212.06137) by Jeffrey Ouyang-Zhang, Jang Hyun Cho, Xingyi Zhou, Philipp Krähenbühl.
 1. **[DETR](https://huggingface.co/docs/transformers/model_doc/detr)** (फेसबुक से) साथ में कागज [ट्रांसफॉर्मर्स के साथ एंड-टू-एंड ऑब्जेक्ट डिटेक्शन](https://arxiv. org/abs/2005.12872) निकोलस कैरियन, फ़्रांसिस्को मस्सा, गेब्रियल सिनेव, निकोलस उसुनियर, अलेक्जेंडर किरिलोव, सर्गेई ज़ागोरुयको द्वारा।
 1. **[DialoGPT](https://huggingface.co/docs/transformers/model_doc/dialogpt)** (माइक्रोसॉफ्ट रिसर्च से) कागज के साथ [DialoGPT: बड़े पैमाने पर जनरेटिव प्री-ट्रेनिंग फॉर कन्वर्सेशनल रिस्पांस जेनरेशन](https ://arxiv.org/abs/1911.00536) यिज़े झांग, सिकी सन, मिशेल गैली, येन-चुन चेन, क्रिस ब्रोकेट, जियांग गाओ, जियानफेंग गाओ, जिंगजिंग लियू, बिल डोलन द्वारा।
@ -297,11 +295,10 @@ conda install -c huggingface transformers
 1. **[ErnieM](https://huggingface.co/docs/transformers/model_doc/ernie_m)** (Baidu से) Xuan Ouyang, Shuohuan Wang, Chao Pang, Yu Sun, Hao Tian, Hua Wu, Haifeng Wang. द्वाराअनुसंधान पत्र [ERNIE-M: Enhanced Multilingual Representation by Aligning Cross-lingual Semantics with Monolingual Corpora](https://arxiv.org/abs/2012.15674) के साथ जारी किया गया
 1. **[ESM](https://huggingface.co/docs/transformers/model_doc/esm)** (मेटा AI से) ट्रांसफॉर्मर प्रोटीन भाषा मॉडल हैं। **ESM-1b** पेपर के साथ जारी किया गया था [ अलेक्जेंडर राइव्स, जोशुआ मेयर, टॉम सर्कु, सिद्धार्थ गोयल, ज़ेमिंग लिन द्वारा जैविक संरचना और कार्य असुरक्षित सीखने को 250 मिलियन प्रोटीन अनुक्रमों तक स्केल करने से उभरता है] (https://www.pnas.org/content/118/15/e2016239118) जेसन लियू, डेमी गुओ, मायल ओट, सी. लॉरेंस ज़िटनिक, जेरी मा और रॉब फर्गस। **ESM-1v** को पेपर के साथ जारी किया गया था [भाषा मॉडल प्रोटीन फ़ंक्शन पर उत्परिवर्तन के प्रभावों की शून्य-शॉट भविष्यवाणी को सक्षम करते हैं] (https://doi.org/10.1101/2021.07.09.450648) जोशुआ मेयर, रोशन राव, रॉबर्ट वेरकुइल, जेसन लियू, टॉम सर्कु और अलेक्जेंडर राइव्स द्वारा। **ESM-2** को पेपर के साथ जारी किया गया था [भाषा मॉडल विकास के पैमाने पर प्रोटीन अनुक्रम सटीक संरचना भविष्यवाणी को सक्षम करते हैं](https://doi.org/10.1101/2022.07.20.500902) ज़ेमिंग लिन, हलील अकिन, रोशन राव, ब्रायन ही, झोंगकाई झू, वेंटिंग लू, ए द्वारा लान डॉस सैंटोस कोस्टा, मरियम फ़ज़ल-ज़रंडी, टॉम सर्कू, साल कैंडिडो, अलेक्जेंडर राइव्स।
 1. **[FLAN-T5](https://huggingface.co/docs/transformers/model_doc/flan-t5)** (from Google AI) released in the repository [google-research/t5x](https://github.com/google-research/t5x/blob/main/docs/models.md#flan-t5-checkpoints) by Hyung Won Chung, Le Hou, Shayne Longpre, Barret Zoph, Yi Tay, William Fedus, Eric Li, Xuezhi Wang, Mostafa Dehghani, Siddhartha Brahma, Albert Webson, Shixiang Shane Gu, Zhuyun Dai, Mirac Suzgun, Xinyun Chen, Aakanksha Chowdhery, Sharan Narang, Gaurav Mishra, Adams Yu, Vincent Zhao, Yanping Huang, Andrew Dai, Hongkun Yu, Slav Petrov, Ed H. Chi, Jeff Dean, Jacob Devlin, Adam Roberts, Denny Zhou, Quoc V. Le, and Jason Wei
-1. **[FLAN-UL2](https://huggingface.co/docs/transformers/model_doc/flan-ul2)** (from Google AI) released in the repository [google-research/t5x](https://github.com/google-research/t5x/blob/main/docs/models.md#flan-ul2-checkpoints) by Hyung Won Chung, Le Hou, Shayne Longpre, Barret Zoph, Yi Tay, William Fedus, Eric Li, Xuezhi Wang, Mostafa Dehghani, Siddhartha Brahma, Albert Webson, Shixiang Shane Gu, Zhuyun Dai, Mirac Suzgun, Xinyun Chen, Aakanksha Chowdhery, Sharan Narang, Gaurav Mishra, Adams Yu, Vincent Zhao, Yanping Huang, Andrew Dai, Hongkun Yu, Slav Petrov, Ed H. Chi, Jeff Dean, Jacob Devlin, Adam Roberts, Denny Zhou, Quoc V. Le, and Jason Wei
+1. **[FLAN-UL2](https://huggingface.co/docs/transformers/model_doc/flan-ul2)** (from Google AI) released in the repository [google-research/t5x](https://github.com/google-research/t5x/blob/main/docs/models.md#flan-ul2-checkpoints) by Hyung Won Chung, Le Hou, Shayne Longpre, Barret Zoph, Yi Tay, William Fedus, Eric Li, Xuezhi Wang, Mostafa Dehghani, Siddhartha Brahma, Albert Webson, Shixiang Shane Gu, Zhuyun Dai, Mirac Suzgun, Xinyun Chen, Aakanksha Chowdhery, Sharan Narang, Gaurav Mishra, Adams Yu, Vincent Zhao, Yanping Huang, Andrew Dai, Hongkun Yu, Slav Petrov, Ed H. Chi, Jeff Dean, Jacob Devlin, Adam Roberts, Denny Zhou, Quoc V. Le, and Jason Wei 
 1. **[FlauBERT](https://huggingface.co/docs/transformers/model_doc/flaubert)** (CNRS से) साथ वाला पेपर [FlauBERT: Unsupervised Language Model Pre-training for फ़्रेंच](https://arxiv .org/abs/1912.05372) Hang Le, Loïc Vial, Jibril Frej, Vincent Segonne, Maximin Coavoux, बेंजामिन लेकोउटेक्स, अलेक्जेंड्रे अल्लाउज़ेन, बेनोइट क्रैबे, लॉरेंट बेसेसियर, डिडिएर श्वाब द्वारा।
 1. **[FLAVA](https://huggingface.co/docs/transformers/model_doc/flava)** (FLAVA: A फाउंडेशनल लैंग्वेज एंड विजन अलाइनमेंट मॉडल) (https://arxiv) साथ वाला पेपर .org/abs/2112.04482) अमनप्रीत सिंह, रोंगहांग हू, वेदानुज गोस्वामी, गुइल्यूम कुएरॉन, वोज्शिएक गालुबा, मार्कस रोहरबैक, और डौवे कीला द्वारा।
 1. **[FNet](https://huggingface.co/docs/transformers/model_doc/fnet)** (गूगल रिसर्च से) साथ वाला पेपर [FNet: मिक्सिंग टोकन विद फूरियर ट्रांसफॉर्म्स](https://arxiv.org /abs/2105.03824) जेम्स ली-थॉर्प, जोशुआ आइंस्ली, इल्या एकस्टीन, सैंटियागो ओंटानन द्वारा।
-1. **[FocalNet](https://huggingface.co/docs/transformers/model_doc/focalnet)** (Microsoft Research से) Jianwei Yang, Chunyuan Li, Xiyang Dai, Lu Yuan, Jianfeng Gao. द्वाराअनुसंधान पत्र [Focal Modulation Networks](https://arxiv.org/abs/2203.11926) के साथ जारी किया गया
 1. **[Funnel Transformer](https://huggingface.co/docs/transformers/model_doc/funnel)** (सीएमयू/गूगल ब्रेन से) साथ में कागज [फ़नल-ट्रांसफॉर्मर: कुशल भाषा प्रसंस्करण के लिए अनुक्रमिक अतिरेक को छानना](https://arxiv.org/abs/2006.03236) जिहांग दाई, गुओकुन लाई, यिमिंग यांग, क्वोक वी. ले द्वारा रिहाई।
 1. **[GIT](https://huggingface.co/docs/transformers/model_doc/git)** (from Microsoft Research) released with the paper [GIT: A Generative Image-to-text Transformer for Vision and Language](https://arxiv.org/abs/2205.14100) by Jianfeng Wang, Zhengyuan Yang, Xiaowei Hu, Linjie Li, Kevin Lin, Zhe Gan, Zicheng Liu, Ce Liu, Lijuan Wang.
 1. **[GLPN](https://huggingface.co/docs/transformers/model_doc/glpn)** (KAIST से) साथ वाला पेपर [वर्टिकल कटडेप्थ के साथ मोनोकुलर डेप्थ एस्टीमेशन के लिए ग्लोबल-लोकल पाथ नेटवर्क्स](https:/ /arxiv.org/abs/2201.07436) डोयोन किम, वूंगह्युन गा, प्युंगवान आह, डोंगग्यू जू, सेहवान चुन, जुनमो किम द्वारा।
@ -312,7 +309,6 @@ conda install -c huggingface transformers
 1. **[GPT-2](https://huggingface.co/docs/transformers/model_doc/gpt2)** (ओपनएआई से) साथ में पेपर [लैंग्वेज मॉडल्स अनसुपरवाइज्ड मल्टीटास्क लर्नर्स हैं](https://blog.openai.com/better-language-models/) एलेक रैडफोर्ड*, जेफरी वू*, रेवन चाइल्ड, डेविड लुआन, डारियो एमोडी** और इल्या सुत्सकेवर** द्वारा पोस्ट किया गया।
 1. **[GPT-J](https://huggingface.co/docs/transformers/model_doc/gptj)** (EleutherAI से) साथ वाला पेपर [kingoflolz/mesh-transformer-jax](https://github.com/kingoflolz/mesh-transformer-jax/) बेन वांग और अरन कोमात्सुजाकी द्वारा।
 1. **[GPT-Sw3](https://huggingface.co/docs/transformers/model_doc/gpt-sw3)** (from AI-Sweden) released with the paper [Lessons Learned from GPT-SW3: Building the First Large-Scale Generative Language Model for Swedish](http://www.lrec-conf.org/proceedings/lrec2022/pdf/2022.lrec-1.376.pdf) by Ariel Ekgren, Amaru Cuba Gyllensten, Evangelia Gogoulou, Alice Heiman, Severine Verlinden, Joey Öhman, Fredrik Carlsson, Magnus Sahlgren.
-1. **[GPTBigCode](https://huggingface.co/docs/transformers/model_doc/gpt_bigcode)** (BigCode से) Loubna Ben Allal, Raymond Li, Denis Kocetkov, Chenghao Mou, Christopher Akiki, Carlos Munoz Ferrandis, Niklas Muennighoff, Mayank Mishra, Alex Gu, Manan Dey, Logesh Kumar Umapathi, Carolyn Jane Anderson, Yangtian Zi, Joel Lamy Poirier, Hailey Schoelkopf, Sergey Troshin, Dmitry Abulkhanov, Manuel Romero, Michael Lappert, Francesco De Toni, Bernardo García del Río, Qian Liu, Shamik Bose, Urvashi Bhattacharyya, Terry Yue Zhuo, Ian Yu, Paulo Villegas, Marco Zocca, Sourab Mangrulkar, David Lansky, Huu Nguyen, Danish Contractor, Luis Villa, Jia Li, Dzmitry Bahdanau, Yacine Jernite, Sean Hughes, Daniel Fried, Arjun Guha, Harm de Vries, Leandro von Werra. द्वाराअनुसंधान पत्र [SantaCoder: don't reach for the stars!](https://arxiv.org/abs/2301.03988) के साथ जारी किया गया
 1. **[GPTSAN-japanese](https://huggingface.co/docs/transformers/model_doc/gptsan-japanese)** released in the repository [tanreinama/GPTSAN](https://github.com/tanreinama/GPTSAN/blob/main/report/model.md) by Toshiyuki Sakamoto(tanreinama).
 1. **[Graphormer](https://huggingface.co/docs/transformers/model_doc/graphormer)** (from Microsoft) released with the paper [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234) by Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu.
 1. **[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (UCSD, NVIDIA से) साथ में कागज [GroupViT: टेक्स्ट सुपरविजन से सिमेंटिक सेगमेंटेशन इमर्जेस](https://arxiv.org/abs/2202.11094) जियारुई जू, शालिनी डी मेलो, सिफ़ी लियू, वोनमिन बायन, थॉमस ब्रेउएल, जान कौट्ज़, ज़ियाओलोंग वांग द्वारा।
@ -328,7 +324,7 @@ conda install -c huggingface transformers
 1. **[LED](https://huggingface.co/docs/transformers/model_doc/led)** (from AllenAI) released with the paper [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) by Iz Beltagy, Matthew E. Peters, Arman Cohan.
 1. **[LeViT](https://huggingface.co/docs/transformers/model_doc/levit)** (मेटा AI से) साथ वाला पेपर [LeViT: A Vision Transformer in ConvNet's Clothing for Faster Inference](https://arxiv.org/abs/2104.01136) बेन ग्राहम, अलाएल्डिन एल-नौबी, ह्यूगो टौवरन, पियरे स्टॉक, आर्मंड जौलिन, हर्वे जेगौ, मैथिज डूज़ द्वारा।
 1. **[LiLT](https://huggingface.co/docs/transformers/model_doc/lilt)** (दक्षिण चीन प्रौद्योगिकी विश्वविद्यालय से) साथ में कागज [LiLT: एक सरल लेकिन प्रभावी भाषा-स्वतंत्र लेआउट ट्रांसफार्मर संरचित दस्तावेज़ समझ के लिए](https://arxiv.org/abs/2202.13669) जियापेंग वांग, लियानवेन जिन, काई डिंग द्वारा पोस्ट किया गया।
-1. **[LLaMA](https://huggingface.co/docs/transformers/model_doc/llama)** (The FAIR team of Meta AI से) Hugo Touvron, Thibaut Lavril, Gautier Izacard, Xavier Martinet, Marie-Anne Lachaux, Timothée Lacroix, Baptiste Rozière, Naman Goyal, Eric Hambro, Faisal Azhar, Aurelien Rodriguez, Armand Joulin, Edouard Grave, Guillaume Lample. द्वाराअनुसंधान पत्र [LLaMA: Open and Efficient Foundation Language Models](https://arxiv.org/abs/2302.13971) के साथ जारी किया गया
+1. **[LLaMA](https://huggingface.co/docs/transformers/main/model_doc/llama)** (The FAIR team of Meta AI से) Hugo Touvron, Thibaut Lavril, Gautier Izacard, Xavier Martinet, Marie-Anne Lachaux, Timothée Lacroix, Baptiste Rozière, Naman Goyal, Eric Hambro, Faisal Azhar, Aurelien Rodriguez, Armand Joulin, Edouard Grave, Guillaume Lample. द्वाराअनुसंधान पत्र [LLaMA: Open and Efficient Foundation Language Models](https://arxiv.org/abs/2302.13971) के साथ जारी किया गया
 1. **[Longformer](https://huggingface.co/docs/transformers/model_doc/longformer)** (from AllenAI) released with the paper [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) by Iz Beltagy, Matthew E. Peters, Arman Cohan.
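 1. **[LongT5](https://huggingface.co/docs/transformers/model_doc/longt5)** (Google AI से) साथ में पेपर [LongT5: Efficient Text-To-Text Transformer for Long Sequences](https://arxiv.org/abs/2112.07916) मैंडी गुओ, जोशुआ आइंस्ली, डेविड यूथस, सैंटियागो ओंटानन, जियानमो नि, यूं-हुआन सुंग, यिनफेई यांग द्वारा पोस्ट किया गया।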
 1. **[LUKE](https://huggingface.co/docs/transformers/model_doc/luke)** (स्टूडियो औसिया से) साथ में पेपर [LUKE: डीप कॉन्टेक्स्टुअलाइज्ड एंटिटी रिप्रेजेंटेशन विद एंटिटी-अवेयर सेल्फ-अटेंशन](https://arxiv.org/abs/2010.01057) Ikuya Yamada, Akari Asai, Hiroyuki Shindo, Hideaki Takeda, Yuji Matsumoto द्वारा।
@ -339,10 +335,9 @@ conda install -c huggingface transformers
 1. **[MarkupLM](https://huggingface.co/docs/transformers/model_doc/markuplm)** (माइक्रोसॉफ्ट रिसर्च एशिया से) साथ में पेपर [मार्कअपएलएम: विजुअली-रिच डॉक्यूमेंट अंडरस्टैंडिंग के लिए टेक्स्ट और मार्कअप लैंग्वेज का प्री-ट्रेनिंग](https://arxiv.org/abs/2110.08518) जुनलॉन्ग ली, यिहेंग जू, लेई कुई, फुरु वी द्वारा पोस्ट किया गया।
 1. **[Mask2Former](https://huggingface.co/docs/transformers/model_doc/mask2former)** (FAIR and UIUC से) Bowen Cheng, Ishan Misra, Alexander G. Schwing, Alexander Kirillov, Rohit Girdhar. द्वाराअनुसंधान पत्र [Masked-attention Mask Transformer for Universal Image Segmentation](https://arxiv.org/abs/2112.01527) के साथ जारी किया गया
 1. **[MaskFormer](https://huggingface.co/docs/transformers/model_doc/maskformer)** (मेटा और UIUC से) पेपर के साथ जारी किया गया [प्रति-पिक्सेल वर्गीकरण वह सब नहीं है जिसकी आपको सिमेंटिक सेगमेंटेशन की आवश्यकता है](https://arxiv.org/abs/2107.06278) बोवेन चेंग, अलेक्जेंडर जी. श्विंग, अलेक्जेंडर किरिलोव द्वारा।
-1. **[MatCha](https://huggingface.co/docs/transformers/model_doc/matcha)** (Google AI से) Fangyu Liu, Francesco Piccinno, Syrine Krichene, Chenxi Pang, Kenton Lee, Mandar Joshi, Yasemin Altun, Nigel Collier, Julian Martin Eisenschlos. द्वाराअनुसंधान पत्र [MatCha: Enhancing Visual Language Pretraining with Math Reasoning and Chart Derendering](https://arxiv.org/abs/2212.09662) के साथ जारी किया गया
 1. **[mBART](https://huggingface.co/docs/transformers/model_doc/mbart)** (फेसबुक से) साथ में पेपर [न्यूरल मशीन ट्रांसलेशन के लिए मल्टीलिंगुअल डीनोइजिंग प्री-ट्रेनिंग](https://arxiv.org/abs/2001.08210) यिनहान लियू, जियाताओ गु, नमन गोयल, जियान ली, सर्गेई एडुनोव, मार्जन ग़ज़विनिनेजाद, माइक लुईस, ल्यूक ज़ेटलमॉयर द्वारा।
 1. **[mBART-50](https://huggingface.co/docs/transformers/model_doc/mbart)** (फेसबुक से) साथ में पेपर [एक्स्टेंसिबल बहुभाषी प्रीट्रेनिंग और फाइनट्यूनिंग के साथ बहुभाषी अनुवाद](https://arxiv.org/abs/2008.00401) युकिंग टैंग, चाउ ट्रान, जियान ली, पेंग-जेन चेन, नमन गोयल, विश्रव चौधरी, जियाताओ गु, एंजेला फैन द्वारा।
-1. **[MEGA](https://huggingface.co/docs/transformers/model_doc/mega)** (Facebook से) Xuezhe Ma, Chunting Zhou, Xiang Kong, Junxian He, Liangke Gui, Graham Neubig, Jonathan May, and Luke Zettlemoyer. द्वाराअनुसंधान पत्र [Mega: Moving Average Equipped Gated Attention](https://arxiv.org/abs/2209.10655) के साथ जारी किया गया
+1. **[MEGA](https://huggingface.co/docs/transformers/main/model_doc/mega)** (Facebook से) Xuezhe Ma, Chunting Zhou, Xiang Kong, Junxian He, Liangke Gui, Graham Neubig, Jonathan May, and Luke Zettlemoyer. द्वाराअनुसंधान पत्र [Mega: Moving Average Equipped Gated Attention](https://arxiv.org/abs/2209.10655) के साथ जारी किया गया
 1. **[Megatron-BERT](https://huggingface.co/docs/transformers/model_doc/megatron-bert)** (NVIDIA से) कागज के साथ [Megatron-LM: मॉडल का उपयोग करके बहु-अरब पैरामीटर भाषा मॉडल का प्रशिक्षण Parallelism](https://arxiv.org/abs/1909.08053) मोहम्मद शोएबी, मोस्टोफा पटवारी, राउल पुरी, पैट्रिक लेग्रेस्ले, जेरेड कैस्पर और ब्रायन कैटानज़ारो द्वारा।
 1. **[Megatron-GPT2](https://huggingface.co/docs/transformers/model_doc/megatron_gpt2)** (NVIDIA से) साथ वाला पेपर [Megatron-LM: ट्रेनिंग मल्टी-बिलियन पैरामीटर लैंग्वेज मॉडल्स यूजिंग मॉडल पैरेललिज़्म](https://arxiv.org/abs/1909.08053) मोहम्मद शोएबी, मोस्टोफा पटवारी, राउल पुरी, पैट्रिक लेग्रेस्ले, जेरेड कैस्पर और ब्रायन कैटानज़ारो द्वारा पोस्ट किया गया।
 1. **[MGP-STR](https://huggingface.co/docs/transformers/model_doc/mgp-str)** (Alibaba Research से) Peng Wang, Cheng Da, and Cong Yao. द्वाराअनुसंधान पत्र [Multi-Granularity Prediction for Scene Text Recognition](https://arxiv.org/abs/2209.03592) के साथ जारी किया गया
@ -357,17 +352,15 @@ conda install -c huggingface transformers
 1. **[NAT](https://huggingface.co/docs/transformers/model_doc/nat)** (from SHI Labs) released with the paper [Neighborhood Attention Transformer](https://arxiv.org/abs/2204.07143) by Ali Hassani, Steven Walton, Jiachen Li, Shen Li, and Humphrey Shi.
 1. **[Nezha](https://huggingface.co/docs/transformers/model_doc/nezha)** (हुआवेई नूह के आर्क लैब से) साथ में कागज़ [NEZHA: चीनी भाषा समझ के लिए तंत्रिका प्रासंगिक प्रतिनिधित्व](https://arxiv.org/abs/1909.00204) जुन्किउ वेई, ज़ियाओज़े रेन, ज़िआओगुआंग ली, वेनयोंग हुआंग, यी लियाओ, याशेंग वांग, जियाशू लिन, शिन जियांग, जिओ चेन और कुन लियू द्वारा।
 1. **[NLLB](https://huggingface.co/docs/transformers/model_doc/nllb)** (फ्रॉम मेटा) साथ में पेपर [नो लैंग्वेज लेफ्ट बिहाइंड: स्केलिंग ह्यूमन-सेंटेड मशीन ट्रांसलेशन](https://arxiv.org/abs/2207.04672) एनएलएलबी टीम द्वारा प्रकाशित।
-1. **[NLLB-MOE](https://huggingface.co/docs/transformers/model_doc/nllb-moe)** (Meta से) the NLLB team. द्वाराअनुसंधान पत्र [No Language Left Behind: Scaling Human-Centered Machine Translation](https://arxiv.org/abs/2207.04672) के साथ जारी किया गया
 1. **[Nyströmformer](https://huggingface.co/docs/transformers/model_doc/nystromformer)** (विस्कॉन्सिन विश्वविद्यालय - मैडिसन से) साथ में कागज [Nyströmformer: A Nyström- आधारित एल्गोरिथम आत्म-ध्यान का अनुमान लगाने के लिए ](https://arxiv.org/abs/2102.03902) युनयांग ज़िओंग, झानपेंग ज़ेंग, रुद्रसिस चक्रवर्ती, मिंगक्सिंग टैन, ग्लेन फंग, यिन ली, विकास सिंह द्वारा पोस्ट किया गया।
 1. **[OneFormer](https://huggingface.co/docs/transformers/model_doc/oneformer)** (SHI Labs से) पेपर [OneFormer: One Transformer to Rule Universal Image Segmentation](https://arxiv.org/abs/2211.06220) जितेश जैन, जिआचेन ली, मांगटिक चिउ, अली हसनी, निकिता ओरलोव, हम्फ्री शि के द्वारा जारी किया गया है।
-1. **[OpenLlama](https://huggingface.co/docs/transformers/model_doc/open-llama)** (from [s-JoL](https://huggingface.co/s-JoL)) released in [Open-Llama](https://github.com/s-JoL/Open-Llama). 
 1. **[OPT](https://huggingface.co/docs/transformers/master/model_doc/opt)** (from Meta AI) released with the paper [OPT: Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068) by Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen et al.
 1. **[OWL-ViT](https://huggingface.co/docs/transformers/model_doc/owlvit)** (Google AI से) साथ में कागज [विज़न ट्रांसफॉर्मर्स के साथ सिंपल ओपन-वोकैबुलरी ऑब्जेक्ट डिटेक्शन](https://arxiv.org/abs/2205.06230) मैथियास मिंडरर, एलेक्सी ग्रिट्सेंको, ऑस्टिन स्टोन, मैक्सिम न्यूमैन, डिर्क वीसेनबोर्न, एलेक्सी डोसोवित्स्की, अरविंद महेंद्रन, अनुराग अर्नब, मुस्तफा देहघानी, ज़ुओरन शेन, जिओ वांग, ज़ियाओहुआ झाई, थॉमस किफ़, और नील हॉल्सबी द्वारा पोस्ट किया गया।
 1. **[Pegasus](https://huggingface.co/docs/transformers/model_doc/pegasus)** (from Google) released with the paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu.
 1. **[PEGASUS-X](https://huggingface.co/docs/transformers/model_doc/pegasus_x)** (Google की ओर से) साथ में दिया गया पेपर [लंबे इनपुट सारांश के लिए ट्रांसफ़ॉर्मरों को बेहतर तरीके से एक्सटेंड करना](https://arxiv.org/abs/2208.04347) जेसन फांग, याओ झाओ, पीटर जे लियू द्वारा।
 1. **[Perceiver IO](https://huggingface.co/docs/transformers/model_doc/perceiver)** (दीपमाइंड से) साथ में पेपर [पर्सीवर आईओ: संरचित इनपुट और आउटपुट के लिए एक सामान्य वास्तुकला](https://arxiv.org/abs/2107.14795) एंड्रयू जेगल, सेबेस्टियन बोरग्यूड, जीन-बैप्टिस्ट अलायराक, कार्ल डोर्श, कैटलिन इओनेस्कु, डेविड डिंग, स्कंद कोप्पुला, डैनियल ज़ोरान, एंड्रयू ब्रॉक, इवान शेलहैमर, ओलिवियर हेनाफ, मैथ्यू एम। बोट्विनिक, एंड्रयू ज़िसरमैन, ओरिओल विनियल्स, जोआओ कैरेरा द्वारा पोस्ट किया गया।
 1. **[PhoBERT](https://huggingface.co/docs/transformers/model_doc/phobert)** (VinAI Research से) कागज के साथ [PhoBERT: वियतनामी के लिए पूर्व-प्रशिक्षित भाषा मॉडल](https://www.aclweb.org/anthology/2020.findings-emnlp.92/) डैट क्वोक गुयेन और अन्ह तुआन गुयेन द्वारा पोस्ट किया गया।
-1. **[Pix2Struct](https://huggingface.co/docs/transformers/model_doc/pix2struct)** (Google से) Kenton Lee, Mandar Joshi, Iulia Turc, Hexiang Hu, Fangyu Liu, Julian Eisenschlos, Urvashi Khandelwal, Peter Shaw, Ming-Wei Chang, Kristina Toutanova. द्वाराअनुसंधान पत्र [Pix2Struct: Screenshot Parsing as Pretraining for Visual Language Understanding](https://arxiv.org/abs/2210.03347) के साथ जारी किया गया
+1. **[Pix2Struct](https://huggingface.co/docs/transformers/main/model_doc/pix2struct)** (Google से) Kenton Lee, Mandar Joshi, Iulia Turc, Hexiang Hu, Fangyu Liu, Julian Eisenschlos, Urvashi Khandelwal, Peter Shaw, Ming-Wei Chang, Kristina Toutanova. द्वाराअनुसंधान पत्र [Pix2Struct: Screenshot Parsing as Pretraining for Visual Language Understanding](https://arxiv.org/abs/2210.03347) के साथ जारी किया गया
 1. **[PLBart](https://huggingface.co/docs/transformers/model_doc/plbart)** (UCLA NLP से) साथ वाला पेपर [प्रोग्राम अंडरस्टैंडिंग एंड जेनरेशन के लिए यूनिफाइड प्री-ट्रेनिंग](https://arxiv.org/abs/2103.06333) वसी उद्दीन अहमद, सैकत चक्रवर्ती, बैशाखी रे, काई-वेई चांग द्वारा।
 1. **[PoolFormer](https://huggingface.co/docs/transformers/model_doc/poolformer)** (from Sea AI Labs) released with the paper [MetaFormer is Actually What You Need for Vision](https://arxiv.org/abs/2111.11418) by Yu, Weihao and Luo, Mi and Zhou, Pan and Si, Chenyang and Zhou, Yichen and Wang, Xinchao and Feng, Jiashi and Yan, Shuicheng.
 1. **[ProphetNet](https://huggingface.co/docs/transformers/model_doc/prophetnet)** (माइक्रोसॉफ्ट रिसर्च से) साथ में पेपर [ProphetNet: प्रेडिक्टिंग फ्यूचर एन-ग्राम फॉर सीक्वेंस-टू-सीक्वेंस प्री-ट्रेनिंग ](https://arxiv.org/abs/2001.04063) यू यान, वीज़ेन क्यूई, येयुन गोंग, दयाहेंग लियू, नान डुआन, जिउशेंग चेन, रुओफ़ेई झांग और मिंग झोउ द्वारा पोस्ट किया गया।
@ -382,9 +375,7 @@ conda install -c huggingface transformers
 1. **[RoBERTa-PreLayerNorm](https://huggingface.co/docs/transformers/model_doc/roberta-prelayernorm)** (from Facebook) released with the paper [fairseq: A Fast, Extensible Toolkit for Sequence Modeling](https://arxiv.org/abs/1904.01038) by Myle Ott, Sergey Edunov, Alexei Baevski, Angela Fan, Sam Gross, Nathan Ng, David Grangier, Michael Auli.
 1. **[RoCBert](https://huggingface.co/docs/transformers/model_doc/roc_bert)** (from WeChatAI) released with the paper [RoCBert: Robust Chinese Bert with Multimodal Contrastive Pretraining](https://aclanthology.org/2022.acl-long.65.pdf) by HuiSu, WeiweiShi, XiaoyuShen, XiaoZhou, TuoJi, JiaruiFang, JieZhou.
 1. **[RoFormer](https://huggingface.co/docs/transformers/model_doc/roformer)** (झुईई टेक्नोलॉजी से), साथ में पेपर [रोफॉर्मर: रोटरी पोजिशन एंबेडिंग के साथ एन्हांस्ड ट्रांसफॉर्मर](https://arxiv.org/pdf/2104.09864v1.pdf) जियानलिन सु और यू लू और शेंगफेंग पैन और बो वेन और युनफेंग लियू द्वारा प्रकाशित।
-1. **[RWKV](https://huggingface.co/docs/transformers/model_doc/rwkv)** (Bo Peng से) Bo Peng. द्वाराअनुसंधान पत्र [this repo](https://github.com/BlinkDL/RWKV-LM) के साथ जारी किया गया
 1. **[SegFormer](https://huggingface.co/docs/transformers/model_doc/segformer)** (from NVIDIA) released with the paper [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://arxiv.org/abs/2105.15203) by Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping Luo.
-1. **[Segment Anything](https://huggingface.co/docs/transformers/model_doc/sam)** (Meta AI से) Alexander Kirillov, Eric Mintun, Nikhila Ravi, Hanzi Mao, Chloe Rolland, Laura Gustafson, Tete Xiao, Spencer Whitehead, Alex Berg, Wan-Yen Lo, Piotr Dollar, Ross Girshick. द्वाराअनुसंधान पत्र [Segment Anything](https://arxiv.org/pdf/2304.02643v1.pdf) के साथ जारी किया गया
 1. **[SEW](https://huggingface.co/docs/transformers/model_doc/sew)** (ASAPP से) साथ देने वाला पेपर [भाषण पहचान के लिए अनसुपरवाइज्ड प्री-ट्रेनिंग में परफॉर्मेंस-एफिशिएंसी ट्रेड-ऑफ्स](https://arxiv.org/abs/2109.06870) फेलिक्स वू, क्वांगयुन किम, जिंग पैन, क्यू हान, किलियन क्यू. वेनबर्गर, योव आर्टज़ी द्वारा।
 1. **[SEW-D](https://huggingface.co/docs/transformers/model_doc/sew_d)** (ASAPP से) साथ में पेपर [भाषण पहचान के लिए अनसुपरवाइज्ड प्री-ट्रेनिंग में परफॉर्मेंस-एफिशिएंसी ट्रेड-ऑफ्स](https://arxiv.org/abs/2109.06870) फेलिक्स वू, क्वांगयुन किम, जिंग पैन, क्यू हान, किलियन क्यू. वेनबर्गर, योआव आर्टज़ी द्वारा पोस्ट किया गया।
 1. **[SpeechT5](https://huggingface.co/docs/transformers/model_doc/speecht5)** (from Microsoft Research) released with the paper [SpeechT5: Unified-Modal Encoder-Decoder Pre-Training for Spoken Language Processing](https://arxiv.org/abs/2110.07205) by Junyi Ao, Rui Wang, Long Zhou, Chengyi Wang, Shuo Ren, Yu Wu, Shujie Liu, Tom Ko, Qing Li, Yu Zhang, Zhihua Wei, Yao Qian, Jinyu Li, Furu Wei.
@ -392,7 +383,6 @@ conda install -c huggingface transformers
 1. **[SpeechToTextTransformer2](https://huggingface.co/docs/transformers/model_doc/speech_to_text_2)** (फेसबुक से) साथ में पेपर [लार्ज-स्केल सेल्फ- एंड सेमी-सुपरवाइज्ड लर्निंग फॉर स्पीच ट्रांसलेशन](https://arxiv.org/abs/2104.06678) चांगहान वांग, ऐनी वू, जुआन पिनो, एलेक्सी बेवस्की, माइकल औली, एलेक्सिस Conneau द्वारा पोस्ट किया गया।
 1. **[Splinter](https://huggingface.co/docs/transformers/model_doc/splinter)** (तेल अवीव यूनिवर्सिटी से) साथ में पेपर [स्पैन सिलेक्शन को प्री-ट्रेनिंग करके कुछ-शॉट क्वेश्चन आंसरिंग](https://arxiv.org/abs/2101.00438) ओरि राम, युवल कर्स्टन, जोनाथन बेरेंट, अमीर ग्लोबर्सन, ओमर लेवी द्वारा।
 1. **[SqueezeBERT](https://huggingface.co/docs/transformers/model_doc/squeezebert)** (बर्कले से) कागज के साथ [SqueezeBERT: कुशल तंत्रिका नेटवर्क के बारे में NLP को कंप्यूटर विज़न क्या सिखा सकता है?](https://arxiv.org/abs/2006.11316) फॉरेस्ट एन. इनडोला, अल्बर्ट ई. शॉ, रवि कृष्णा, और कर्ट डब्ल्यू. केटज़र द्वारा।
-1. **[SwiftFormer](https://huggingface.co/docs/transformers/main/model_doc/swiftformer)** (MBZUAI से) Abdelrahman Shaker, Muhammad Maaz, Hanoona Rasheed, Salman Khan, Ming-Hsuan Yang, Fahad Shahbaz Khan. द्वाराअनुसंधान पत्र [SwiftFormer: Efficient Additive Attention for Transformer-based Real-time Mobile Vision Applications](https://arxiv.org/abs/2303.15446) के साथ जारी किया गया
 1. **[Swin Transformer](https://huggingface.co/docs/transformers/model_doc/swin)** (माइक्रोसॉफ्ट से) साथ में कागज [स्वाइन ट्रांसफॉर्मर: शिफ्टेड विंडोज का उपयोग कर पदानुक्रमित विजन ट्रांसफॉर्मर](https://arxiv.org/abs/2103.14030) ज़ी लियू, युटोंग लिन, यू काओ, हान हू, यिक्सुआन वेई, झेंग झांग, स्टीफन लिन, बैनिंग गुओ द्वारा।
 1. **[Swin Transformer V2](https://huggingface.co/docs/transformers/model_doc/swinv2)** (Microsoft से) साथ वाला पेपर [Swin Transformer V2: स्केलिंग अप कैपेसिटी एंड रेजोल्यूशन](https://arxiv.org/abs/2111.09883) ज़ी लियू, हान हू, युटोंग लिन, ज़ुलिआंग याओ, ज़ेंडा ज़ी, यिक्सुआन वेई, जिया निंग, यू काओ, झेंग झांग, ली डोंग, फुरु वेई, बैनिंग गुओ द्वारा।
 1. **[Swin2SR](https://huggingface.co/docs/transformers/model_doc/swin2sr)** (from University of Würzburg) released with the paper [Swin2SR: SwinV2 Transformer for Compressed Image Super-Resolution and Restoration](https://arxiv.org/abs/2209.11345) by Marcos V. Conde, Ui-Jin Choi, Maxime Burchi, Radu Timofte.
--- a/README_ja.md
+++ b/README_ja.md
@ -323,7 +323,7 @@ Flax、PyTorch、TensorFlowをcondaでインストールする方法は、それ
 1. **[CamemBERT](https://huggingface.co/docs/transformers/model_doc/camembert)** (Inria/Facebook/Sorbonne から) Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot から公開された研究論文: [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894)
 1. **[CANINE](https://huggingface.co/docs/transformers/model_doc/canine)** (Google Research から) Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting から公開された研究論文: [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874)
 1. **[Chinese-CLIP](https://huggingface.co/docs/transformers/model_doc/chinese_clip)** (OFA-Sys から) An Yang, Junshu Pan, Junyang Lin, Rui Men, Yichang Zhang, Jingren Zhou, Chang Zhou から公開された研究論文: [Chinese CLIP: Contrastive Vision-Language Pretraining in Chinese](https://arxiv.org/abs/2211.01335)
-1. **[CLAP](https://huggingface.co/docs/transformers/model_doc/clap)** (LAION-AI から) Yusong Wu, Ke Chen, Tianyu Zhang, Yuchen Hui, Taylor Berg-Kirkpatrick, Shlomo Dubnov. から公開された研究論文 [Large-scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation](https://arxiv.org/abs/2211.06687)
+1. **[CLAP](https://huggingface.co/docs/transformers/model_doc/clap)** (LAION-AI から) Yusong Wu, Ke Chen, Tianyu Zhang, Yuchen Hui, Taylor Berg-Kirkpatrick, Shlomo Dubnov. から公開された研究論文 [Large-scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation](https://arxiv.org/abs/2211.06687)
 1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (OpenAI から) Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever から公開された研究論文: [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020)
 1. **[CLIPSeg](https://huggingface.co/docs/transformers/model_doc/clipseg)** (University of Göttingen から) Timo Lüddecke and Alexander Ecker から公開された研究論文: [Image Segmentation Using Text and Image Prompts](https://arxiv.org/abs/2112.10003)
 1. **[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (Salesforce から) Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong から公開された研究論文: [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474)
@ -332,7 +332,6 @@ Flax、PyTorch、TensorFlowをcondaでインストールする方法は、それ
 1. **[ConvNeXT](https://huggingface.co/docs/transformers/model_doc/convnext)** (Facebook AI から) Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie から公開された研究論文: [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545)
 1. **[ConvNeXTV2](https://huggingface.co/docs/transformers/model_doc/convnextv2)** (from Facebook AI) released with the paper [ConvNeXt V2: Co-designing and Scaling ConvNets with Masked Autoencoders](https://arxiv.org/abs/2301.00808) by Sanghyun Woo, Shoubhik Debnath, Ronghang Hu, Xinlei Chen, Zhuang Liu, In So Kweon, Saining Xie.
 1. **[CPM](https://huggingface.co/docs/transformers/model_doc/cpm)** (Tsinghua University から) Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun から公開された研究論文: [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413)
-1. **[CPM-Ant](https://huggingface.co/docs/transformers/model_doc/cpmant)** (OpenBMB から) [OpenBMB](https://www.openbmb.org/) から公開されました.
 1. **[CTRL](https://huggingface.co/docs/transformers/model_doc/ctrl)** (Salesforce から) Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher から公開された研究論文: [CTRL: A Conditional Transformer Language Model for Controllable Generation](https://arxiv.org/abs/1909.05858)
 1. **[CvT](https://huggingface.co/docs/transformers/model_doc/cvt)** (Microsoft から) Haiping Wu, Bin Xiao, Noel Codella, Mengchen Liu, Xiyang Dai, Lu Yuan, Lei Zhang から公開された研究論文: [CvT: Introducing Convolutions to Vision Transformers](https://arxiv.org/abs/2103.15808)
 1. **[Data2Vec](https://huggingface.co/docs/transformers/model_doc/data2vec)** (Facebook から) Alexei Baevski, Wei-Ning Hsu, Qiantong Xu, Arun Babu, Jiatao Gu, Michael Auli から公開された研究論文: [Data2Vec:  A General Framework for Self-supervised Learning in Speech, Vision and Language](https://arxiv.org/abs/2202.03555)
@ -341,7 +340,6 @@ Flax、PyTorch、TensorFlowをcondaでインストールする方法は、それ
 1. **[Decision Transformer](https://huggingface.co/docs/transformers/model_doc/decision_transformer)** (Berkeley/Facebook/Google から) Lili Chen, Kevin Lu, Aravind Rajeswaran, Kimin Lee, Aditya Grover, Michael Laskin, Pieter Abbeel, Aravind Srinivas, Igor Mordatch から公開された研究論文: [Decision Transformer: Reinforcement Learning via Sequence Modeling](https://arxiv.org/abs/2106.01345)
 1. **[Deformable DETR](https://huggingface.co/docs/transformers/model_doc/deformable_detr)** (SenseTime Research から) Xizhou Zhu, Weijie Su, Lewei Lu, Bin Li, Xiaogang Wang, Jifeng Dai から公開された研究論文: [Deformable DETR: Deformable Transformers for End-to-End Object Detection](https://arxiv.org/abs/2010.04159)
 1. **[DeiT](https://huggingface.co/docs/transformers/model_doc/deit)** (Facebook から) Hugo Touvron, Matthieu Cord, Matthijs Douze, Francisco Massa, Alexandre Sablayrolles, Hervé Jégou から公開された研究論文: [Training data-efficient image transformers & distillation through attention](https://arxiv.org/abs/2012.12877)
-1. **[DePlot](https://huggingface.co/docs/transformers/model_doc/deplot)** (Google AI から) Fangyu Liu, Julian Martin Eisenschlos, Francesco Piccinno, Syrine Krichene, Chenxi Pang, Kenton Lee, Mandar Joshi, Wenhu Chen, Nigel Collier, Yasemin Altun. から公開された研究論文 [DePlot: One-shot visual language reasoning by plot-to-table translation](https://arxiv.org/abs/2212.10505)
 1. **[DETA](https://huggingface.co/docs/transformers/model_doc/deta)** (The University of Texas at Austin から) Jeffrey Ouyang-Zhang, Jang Hyun Cho, Xingyi Zhou, Philipp Krähenbühl. から公開された研究論文 [NMS Strikes Back](https://arxiv.org/abs/2212.06137)
 1. **[DETR](https://huggingface.co/docs/transformers/model_doc/detr)** (Facebook から) Nicolas Carion, Francisco Massa, Gabriel Synnaeve, Nicolas Usunier, Alexander Kirillov, Sergey Zagoruyko から公開された研究論文: [End-to-End Object Detection with Transformers](https://arxiv.org/abs/2005.12872)
 1. **[DialoGPT](https://huggingface.co/docs/transformers/model_doc/dialogpt)** (Microsoft Research から) Yizhe Zhang, Siqi Sun, Michel Galley, Yen-Chun Chen, Chris Brockett, Xiang Gao, Jianfeng Gao, Jingjing Liu, Bill Dolan から公開された研究論文: [DialoGPT: Large-Scale Generative Pre-training for Conversational Response Generation](https://arxiv.org/abs/1911.00536)
@ -359,11 +357,10 @@ Flax、PyTorch、TensorFlowをcondaでインストールする方法は、それ
 1. **[ErnieM](https://huggingface.co/docs/transformers/model_doc/ernie_m)** (Baidu から) Xuan Ouyang, Shuohuan Wang, Chao Pang, Yu Sun, Hao Tian, Hua Wu, Haifeng Wang. から公開された研究論文 [ERNIE-M: Enhanced Multilingual Representation by Aligning Cross-lingual Semantics with Monolingual Corpora](https://arxiv.org/abs/2012.15674)
 1. **[ESM](https://huggingface.co/docs/transformers/model_doc/esm)** (Meta AI から) はトランスフォーマープロテイン言語モデルです.  **ESM-1b** は Alexander Rives, Joshua Meier, Tom Sercu, Siddharth Goyal, Zeming Lin, Jason Liu, Demi Guo, Myle Ott, C. Lawrence Zitnick, Jerry Ma, and Rob Fergus から公開された研究論文: [Biological structure and function emerge from scaling unsupervised learning to 250 million protein sequences](https://www.pnas.org/content/118/15/e2016239118). **ESM-1v** は Joshua Meier, Roshan Rao, Robert Verkuil, Jason Liu, Tom Sercu and Alexander Rives　から公開された研究論文: [Language models enable zero-shot prediction of the effects of mutations on protein function](https://doi.org/10.1101/2021.07.09.450648). **ESM-2** と　**ESMFold** は Zeming Lin, Halil Akin, Roshan Rao, Brian Hie, Zhongkai Zhu, Wenting Lu, Allan dos Santos Costa, Maryam Fazel-Zarandi, Tom Sercu, Sal Candido, Alexander Rives から公開された研究論文: [Language models of protein sequences at the scale of evolution enable accurate structure prediction](https://doi.org/10.1101/2022.07.20.500902)
 1. **[FLAN-T5](https://huggingface.co/docs/transformers/model_doc/flan-t5)** (Google AI から) Hyung Won Chung, Le Hou, Shayne Longpre, Barret Zoph, Yi Tay, William Fedus, Eric Li, Xuezhi Wang, Mostafa Dehghani, Siddhartha Brahma, Albert Webson, Shixiang Shane Gu, Zhuyun Dai, Mirac Suzgun, Xinyun Chen, Aakanksha Chowdhery, Sharan Narang, Gaurav Mishra, Adams Yu, Vincent Zhao, Yanping Huang, Andrew Dai, Hongkun Yu, Slav Petrov, Ed H. Chi, Jeff Dean, Jacob Devlin, Adam Roberts, Denny Zhou, Quoc V. Le, and Jason Wei から公開されたレポジトリー [google-research/t5x](https://github.com/google-research/t5x/blob/main/docs/models.md#flan-t5-checkpoints)
-1. **[FLAN-UL2](https://huggingface.co/docs/transformers/model_doc/flan-ul2)** (from Google AI) released in the repository [google-research/t5x](https://github.com/google-research/t5x/blob/main/docs/models.md#flan-ul2-checkpoints) by Hyung Won Chung, Le Hou, Shayne Longpre, Barret Zoph, Yi Tay, William Fedus, Eric Li, Xuezhi Wang, Mostafa Dehghani, Siddhartha Brahma, Albert Webson, Shixiang Shane Gu, Zhuyun Dai, Mirac Suzgun, Xinyun Chen, Aakanksha Chowdhery, Sharan Narang, Gaurav Mishra, Adams Yu, Vincent Zhao, Yanping Huang, Andrew Dai, Hongkun Yu, Slav Petrov, Ed H. Chi, Jeff Dean, Jacob Devlin, Adam Roberts, Denny Zhou, Quoc V. Le, and Jason Wei
+1. **[FLAN-UL2](https://huggingface.co/docs/transformers/model_doc/flan-ul2)** (from Google AI) released in the repository [google-research/t5x](https://github.com/google-research/t5x/blob/main/docs/models.md#flan-ul2-checkpoints) by Hyung Won Chung, Le Hou, Shayne Longpre, Barret Zoph, Yi Tay, William Fedus, Eric Li, Xuezhi Wang, Mostafa Dehghani, Siddhartha Brahma, Albert Webson, Shixiang Shane Gu, Zhuyun Dai, Mirac Suzgun, Xinyun Chen, Aakanksha Chowdhery, Sharan Narang, Gaurav Mishra, Adams Yu, Vincent Zhao, Yanping Huang, Andrew Dai, Hongkun Yu, Slav Petrov, Ed H. Chi, Jeff Dean, Jacob Devlin, Adam Roberts, Denny Zhou, Quoc V. Le, and Jason Wei 
 1. **[FlauBERT](https://huggingface.co/docs/transformers/model_doc/flaubert)** (CNRS から) Hang Le, Loïc Vial, Jibril Frej, Vincent Segonne, Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, Benoît Crabbé, Laurent Besacier, Didier Schwab から公開された研究論文: [FlauBERT: Unsupervised Language Model Pre-training for French](https://arxiv.org/abs/1912.05372)
 1. **[FLAVA](https://huggingface.co/docs/transformers/model_doc/flava)** (Facebook AI から) Amanpreet Singh, Ronghang Hu, Vedanuj Goswami, Guillaume Couairon, Wojciech Galuba, Marcus Rohrbach, and Douwe Kiela から公開された研究論文: [FLAVA: A Foundational Language And Vision Alignment Model](https://arxiv.org/abs/2112.04482)
 1. **[FNet](https://huggingface.co/docs/transformers/model_doc/fnet)** (Google Research から) James Lee-Thorp, Joshua Ainslie, Ilya Eckstein, Santiago Ontanon から公開された研究論文: [FNet: Mixing Tokens with Fourier Transforms](https://arxiv.org/abs/2105.03824)
-1. **[FocalNet](https://huggingface.co/docs/transformers/model_doc/focalnet)** (Microsoft Research から) Jianwei Yang, Chunyuan Li, Xiyang Dai, Lu Yuan, Jianfeng Gao. から公開された研究論文 [Focal Modulation Networks](https://arxiv.org/abs/2203.11926)
 1. **[Funnel Transformer](https://huggingface.co/docs/transformers/model_doc/funnel)** (CMU/Google Brain から) Zihang Dai, Guokun Lai, Yiming Yang, Quoc V. Le から公開された研究論文: [Funnel-Transformer: Filtering out Sequential Redundancy for Efficient Language Processing](https://arxiv.org/abs/2006.03236)
 1. **[GIT](https://huggingface.co/docs/transformers/model_doc/git)** (Microsoft Research から) Jianfeng Wang, Zhengyuan Yang, Xiaowei Hu, Linjie Li, Kevin Lin, Zhe Gan, Zicheng Liu, Ce Liu, Lijuan Wang. から公開された研究論文 [GIT: A Generative Image-to-text Transformer for Vision and Language](https://arxiv.org/abs/2205.14100)
 1. **[GLPN](https://huggingface.co/docs/transformers/model_doc/glpn)** (KAIST から) Doyeon Kim, Woonghyun Ga, Pyungwhan Ahn, Donggyu Joo, Sehwan Chun, Junmo Kim から公開された研究論文: [Global-Local Path Networks for Monocular Depth Estimation with Vertical CutDepth](https://arxiv.org/abs/2201.07436)
@ -374,7 +371,6 @@ Flax、PyTorch、TensorFlowをcondaでインストールする方法は、それ
 1. **[GPT-2](https://huggingface.co/docs/transformers/model_doc/gpt2)** (OpenAI から) Alec Radford*, Jeffrey Wu*, Rewon Child, David Luan, Dario Amodei** and Ilya Sutskever** から公開された研究論文: [Language Models are Unsupervised Multitask Learners](https://blog.openai.com/better-language-models/)
 1. **[GPT-J](https://huggingface.co/docs/transformers/model_doc/gptj)** (EleutherAI から) Ben Wang and Aran Komatsuzaki から公開されたレポジトリー [kingoflolz/mesh-transformer-jax](https://github.com/kingoflolz/mesh-transformer-jax/)
 1. **[GPT-Sw3](https://huggingface.co/docs/transformers/model_doc/gpt-sw3)** (AI-Sweden から) Ariel Ekgren, Amaru Cuba Gyllensten, Evangelia Gogoulou, Alice Heiman, Severine Verlinden, Joey Öhman, Fredrik Carlsson, Magnus Sahlgren から公開された研究論文: [Lessons Learned from GPT-SW3: Building the First Large-Scale Generative Language Model for Swedish](http://www.lrec-conf.org/proceedings/lrec2022/pdf/2022.lrec-1.376.pdf)
-1. **[GPTBigCode](https://huggingface.co/docs/transformers/model_doc/gpt_bigcode)** (BigCode から) Loubna Ben Allal, Raymond Li, Denis Kocetkov, Chenghao Mou, Christopher Akiki, Carlos Munoz Ferrandis, Niklas Muennighoff, Mayank Mishra, Alex Gu, Manan Dey, Logesh Kumar Umapathi, Carolyn Jane Anderson, Yangtian Zi, Joel Lamy Poirier, Hailey Schoelkopf, Sergey Troshin, Dmitry Abulkhanov, Manuel Romero, Michael Lappert, Francesco De Toni, Bernardo García del Río, Qian Liu, Shamik Bose, Urvashi Bhattacharyya, Terry Yue Zhuo, Ian Yu, Paulo Villegas, Marco Zocca, Sourab Mangrulkar, David Lansky, Huu Nguyen, Danish Contractor, Luis Villa, Jia Li, Dzmitry Bahdanau, Yacine Jernite, Sean Hughes, Daniel Fried, Arjun Guha, Harm de Vries, Leandro von Werra. から公開された研究論文 [SantaCoder: don't reach for the stars!](https://arxiv.org/abs/2301.03988)
 1. **[GPTSAN-japanese](https://huggingface.co/docs/transformers/model_doc/gptsan-japanese)** [tanreinama/GPTSAN](https://github.com/tanreinama/GPTSAN/blob/main/report/model.md) 坂本俊之(tanreinama)からリリースされました.
 1. **[Graphormer](https://huggingface.co/docs/transformers/model_doc/graphormer)** (Microsoft から) Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu から公開された研究論文: [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234).
 1. **[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (UCSD, NVIDIA から) Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang から公開された研究論文: [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094)
@ -390,7 +386,7 @@ Flax、PyTorch、TensorFlowをcondaでインストールする方法は、それ
 1. **[LED](https://huggingface.co/docs/transformers/model_doc/led)** (AllenAI から) Iz Beltagy, Matthew E. Peters, Arman Cohan から公開された研究論文: [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150)
 1. **[LeViT](https://huggingface.co/docs/transformers/model_doc/levit)** (Meta AI から) Ben Graham, Alaaeldin El-Nouby, Hugo Touvron, Pierre Stock, Armand Joulin, Hervé Jégou, Matthijs Douze から公開された研究論文: [LeViT: A Vision Transformer in ConvNet's Clothing for Faster Inference](https://arxiv.org/abs/2104.01136)
 1. **[LiLT](https://huggingface.co/docs/transformers/model_doc/lilt)** (South China University of Technology から) Jiapeng Wang, Lianwen Jin, Kai Ding から公開された研究論文: [LiLT: A Simple yet Effective Language-Independent Layout Transformer for Structured Document Understanding](https://arxiv.org/abs/2202.13669)
-1. **[LLaMA](https://huggingface.co/docs/transformers/model_doc/llama)** (The FAIR team of Meta AI から) Hugo Touvron, Thibaut Lavril, Gautier Izacard, Xavier Martinet, Marie-Anne Lachaux, Timothée Lacroix, Baptiste Rozière, Naman Goyal, Eric Hambro, Faisal Azhar, Aurelien Rodriguez, Armand Joulin, Edouard Grave, Guillaume Lample. から公開された研究論文 [LLaMA: Open and Efficient Foundation Language Models](https://arxiv.org/abs/2302.13971)
+1. **[LLaMA](https://huggingface.co/docs/transformers/main/model_doc/llama)** (The FAIR team of Meta AI から) Hugo Touvron, Thibaut Lavril, Gautier Izacard, Xavier Martinet, Marie-Anne Lachaux, Timothée Lacroix, Baptiste Rozière, Naman Goyal, Eric Hambro, Faisal Azhar, Aurelien Rodriguez, Armand Joulin, Edouard Grave, Guillaume Lample. から公開された研究論文 [LLaMA: Open and Efficient Foundation Language Models](https://arxiv.org/abs/2302.13971)
 1. **[Longformer](https://huggingface.co/docs/transformers/model_doc/longformer)** (AllenAI から) Iz Beltagy, Matthew E. Peters, Arman Cohan から公開された研究論文: [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150)
 1. **[LongT5](https://huggingface.co/docs/transformers/model_doc/longt5)** (Google AI から) Mandy Guo, Joshua Ainslie, David Uthus, Santiago Ontanon, Jianmo Ni, Yun-Hsuan Sung, Yinfei Yang から公開された研究論文: [LongT5: Efficient Text-To-Text Transformer for Long Sequences](https://arxiv.org/abs/2112.07916)
 1. **[LUKE](https://huggingface.co/docs/transformers/model_doc/luke)** (Studio Ousia から) Ikuya Yamada, Akari Asai, Hiroyuki Shindo, Hideaki Takeda, Yuji Matsumoto から公開された研究論文: [LUKE: Deep Contextualized Entity Representations with Entity-aware Self-attention](https://arxiv.org/abs/2010.01057)
@ -401,10 +397,9 @@ Flax、PyTorch、TensorFlowをcondaでインストールする方法は、それ
 1. **[MarkupLM](https://huggingface.co/docs/transformers/model_doc/markuplm)** (Microsoft Research Asia から) Junlong Li, Yiheng Xu, Lei Cui, Furu Wei から公開された研究論文: [MarkupLM: Pre-training of Text and Markup Language for Visually-rich Document Understanding](https://arxiv.org/abs/2110.08518)
 1. **[Mask2Former](https://huggingface.co/docs/transformers/model_doc/mask2former)** (FAIR and UIUC から) Bowen Cheng, Ishan Misra, Alexander G. Schwing, Alexander Kirillov, Rohit Girdhar. から公開された研究論文 [Masked-attention Mask Transformer for Universal Image Segmentation](https://arxiv.org/abs/2112.01527)
 1. **[MaskFormer](https://huggingface.co/docs/transformers/model_doc/maskformer)** (Meta and UIUC から) Bowen Cheng, Alexander G. Schwing, Alexander Kirillov から公開された研究論文: [Per-Pixel Classification is Not All You Need for Semantic Segmentation](https://arxiv.org/abs/2107.06278)
-1. **[MatCha](https://huggingface.co/docs/transformers/model_doc/matcha)** (Google AI から) Fangyu Liu, Francesco Piccinno, Syrine Krichene, Chenxi Pang, Kenton Lee, Mandar Joshi, Yasemin Altun, Nigel Collier, Julian Martin Eisenschlos. から公開された研究論文 [MatCha: Enhancing Visual Language Pretraining with Math Reasoning and Chart Derendering](https://arxiv.org/abs/2212.09662)
 1. **[mBART](https://huggingface.co/docs/transformers/model_doc/mbart)** (Facebook から) Yinhan Liu, Jiatao Gu, Naman Goyal, Xian Li, Sergey Edunov, Marjan Ghazvininejad, Mike Lewis, Luke Zettlemoyer から公開された研究論文: [Multilingual Denoising Pre-training for Neural Machine Translation](https://arxiv.org/abs/2001.08210)
 1. **[mBART-50](https://huggingface.co/docs/transformers/model_doc/mbart)** (Facebook から) Yuqing Tang, Chau Tran, Xian Li, Peng-Jen Chen, Naman Goyal, Vishrav Chaudhary, Jiatao Gu, Angela Fan から公開された研究論文: [Multilingual Translation with Extensible Multilingual Pretraining and Finetuning](https://arxiv.org/abs/2008.00401)
-1. **[MEGA](https://huggingface.co/docs/transformers/model_doc/mega)** (Facebook から) Xuezhe Ma, Chunting Zhou, Xiang Kong, Junxian He, Liangke Gui, Graham Neubig, Jonathan May, and Luke Zettlemoyer. から公開された研究論文 [Mega: Moving Average Equipped Gated Attention](https://arxiv.org/abs/2209.10655)
+1. **[MEGA](https://huggingface.co/docs/transformers/main/model_doc/mega)** (Facebook から) Xuezhe Ma, Chunting Zhou, Xiang Kong, Junxian He, Liangke Gui, Graham Neubig, Jonathan May, and Luke Zettlemoyer. から公開された研究論文 [Mega: Moving Average Equipped Gated Attention](https://arxiv.org/abs/2209.10655)
 1. **[Megatron-BERT](https://huggingface.co/docs/transformers/model_doc/megatron-bert)** (NVIDIA から) Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro から公開された研究論文: [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053)
 1. **[Megatron-GPT2](https://huggingface.co/docs/transformers/model_doc/megatron_gpt2)** (NVIDIA から) Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro から公開された研究論文: [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053)
 1. **[MGP-STR](https://huggingface.co/docs/transformers/model_doc/mgp-str)** (Alibaba Research から) Peng Wang, Cheng Da, and Cong Yao. から公開された研究論文 [Multi-Granularity Prediction for Scene Text Recognition](https://arxiv.org/abs/2209.03592)
@ -419,17 +414,15 @@ Flax、PyTorch、TensorFlowをcondaでインストールする方法は、それ
 1. **[NAT](https://huggingface.co/docs/transformers/model_doc/nat)** (SHI Labs から) Ali Hassani, Steven Walton, Jiachen Li, Shen Li, and Humphrey Shi から公開された研究論文: [Neighborhood Attention Transformer](https://arxiv.org/abs/2204.07143)
 1. **[Nezha](https://huggingface.co/docs/transformers/model_doc/nezha)** (Huawei Noah’s Ark Lab から) Junqiu Wei, Xiaozhe Ren, Xiaoguang Li, Wenyong Huang, Yi Liao, Yasheng Wang, Jiashu Lin, Xin Jiang, Xiao Chen and Qun Liu から公開された研究論文: [NEZHA: Neural Contextualized Representation for Chinese Language Understanding](https://arxiv.org/abs/1909.00204)
 1. **[NLLB](https://huggingface.co/docs/transformers/model_doc/nllb)** (Meta から) the NLLB team から公開された研究論文: [No Language Left Behind: Scaling Human-Centered Machine Translation](https://arxiv.org/abs/2207.04672)
-1. **[NLLB-MOE](https://huggingface.co/docs/transformers/model_doc/nllb-moe)** (Meta から) the NLLB team. から公開された研究論文 [No Language Left Behind: Scaling Human-Centered Machine Translation](https://arxiv.org/abs/2207.04672)
 1. **[Nyströmformer](https://huggingface.co/docs/transformers/model_doc/nystromformer)** (the University of Wisconsin - Madison から) Yunyang Xiong, Zhanpeng Zeng, Rudrasis Chakraborty, Mingxing Tan, Glenn Fung, Yin Li, Vikas Singh から公開された研究論文: [Nyströmformer: A Nyström-Based Algorithm for Approximating Self-Attention](https://arxiv.org/abs/2102.03902)
 1. **[OneFormer](https://huggingface.co/docs/transformers/model_doc/oneformer)** (SHI Labs から) Jitesh Jain, Jiachen Li, MangTik Chiu, Ali Hassani, Nikita Orlov, Humphrey Shi から公開された研究論文: [OneFormer: One Transformer to Rule Universal Image Segmentation](https://arxiv.org/abs/2211.06220)
-1. **[OpenLlama](https://huggingface.co/docs/transformers/model_doc/open-llama)** (from [s-JoL](https://huggingface.co/s-JoL)) released in [Open-Llama](https://github.com/s-JoL/Open-Llama). 
 1. **[OPT](https://huggingface.co/docs/transformers/master/model_doc/opt)** (Meta AI から) Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen et al から公開された研究論文: [OPT: Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068)
 1. **[OWL-ViT](https://huggingface.co/docs/transformers/model_doc/owlvit)** (Google AI から) Matthias Minderer, Alexey Gritsenko, Austin Stone, Maxim Neumann, Dirk Weissenborn, Alexey Dosovitskiy, Aravindh Mahendran, Anurag Arnab, Mostafa Dehghani, Zhuoran Shen, Xiao Wang, Xiaohua Zhai, Thomas Kipf, and Neil Houlsby から公開された研究論文: [Simple Open-Vocabulary Object Detection with Vision Transformers](https://arxiv.org/abs/2205.06230)
 1. **[Pegasus](https://huggingface.co/docs/transformers/model_doc/pegasus)** (Google から) Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu から公開された研究論文: [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777)
 1. **[PEGASUS-X](https://huggingface.co/docs/transformers/model_doc/pegasus_x)** (Google から) Jason Phang, Yao Zhao, and Peter J. Liu から公開された研究論文: [Investigating Efficiently Extending Transformers for Long Input Summarization](https://arxiv.org/abs/2208.04347)
 1. **[Perceiver IO](https://huggingface.co/docs/transformers/model_doc/perceiver)** (Deepmind から) Andrew Jaegle, Sebastian Borgeaud, Jean-Baptiste Alayrac, Carl Doersch, Catalin Ionescu, David Ding, Skanda Koppula, Daniel Zoran, Andrew Brock, Evan Shelhamer, Olivier Hénaff, Matthew M. Botvinick, Andrew Zisserman, Oriol Vinyals, João Carreira から公開された研究論文: [Perceiver IO: A General Architecture for Structured Inputs & Outputs](https://arxiv.org/abs/2107.14795)
 1. **[PhoBERT](https://huggingface.co/docs/transformers/model_doc/phobert)** (VinAI Research から) Dat Quoc Nguyen and Anh Tuan Nguyen から公開された研究論文: [PhoBERT: Pre-trained language models for Vietnamese](https://www.aclweb.org/anthology/2020.findings-emnlp.92/)
-1. **[Pix2Struct](https://huggingface.co/docs/transformers/model_doc/pix2struct)** (Google から) Kenton Lee, Mandar Joshi, Iulia Turc, Hexiang Hu, Fangyu Liu, Julian Eisenschlos, Urvashi Khandelwal, Peter Shaw, Ming-Wei Chang, Kristina Toutanova. から公開された研究論文 [Pix2Struct: Screenshot Parsing as Pretraining for Visual Language Understanding](https://arxiv.org/abs/2210.03347)
+1. **[Pix2Struct](https://huggingface.co/docs/transformers/main/model_doc/pix2struct)** (Google から) Kenton Lee, Mandar Joshi, Iulia Turc, Hexiang Hu, Fangyu Liu, Julian Eisenschlos, Urvashi Khandelwal, Peter Shaw, Ming-Wei Chang, Kristina Toutanova. から公開された研究論文 [Pix2Struct: Screenshot Parsing as Pretraining for Visual Language Understanding](https://arxiv.org/abs/2210.03347)
 1. **[PLBart](https://huggingface.co/docs/transformers/model_doc/plbart)** (UCLA NLP から) Wasi Uddin Ahmad, Saikat Chakraborty, Baishakhi Ray, Kai-Wei Chang から公開された研究論文: [Unified Pre-training for Program Understanding and Generation](https://arxiv.org/abs/2103.06333)
 1. **[PoolFormer](https://huggingface.co/docs/transformers/model_doc/poolformer)** (Sea AI Labs から) Yu, Weihao and Luo, Mi and Zhou, Pan and Si, Chenyang and Zhou, Yichen and Wang, Xinchao and Feng, Jiashi and Yan, Shuicheng から公開された研究論文: [MetaFormer is Actually What You Need for Vision](https://arxiv.org/abs/2111.11418)
 1. **[ProphetNet](https://huggingface.co/docs/transformers/model_doc/prophetnet)** (Microsoft Research から) Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou から公開された研究論文: [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063)
@ -444,9 +437,7 @@ Flax、PyTorch、TensorFlowをcondaでインストールする方法は、それ
 1. **[RoBERTa-PreLayerNorm](https://huggingface.co/docs/transformers/model_doc/roberta-prelayernorm)** (Facebook から) Myle Ott, Sergey Edunov, Alexei Baevski, Angela Fan, Sam Gross, Nathan Ng, David Grangier, Michael Auli から公開された研究論文: [fairseq: A Fast, Extensible Toolkit for Sequence Modeling](https://arxiv.org/abs/1904.01038)
 1. **[RoCBert](https://huggingface.co/docs/transformers/model_doc/roc_bert)** (WeChatAI から) HuiSu, WeiweiShi, XiaoyuShen, XiaoZhou, TuoJi, JiaruiFang, JieZhou から公開された研究論文: [RoCBert: Robust Chinese Bert with Multimodal Contrastive Pretraining](https://aclanthology.org/2022.acl-long.65.pdf)
 1. **[RoFormer](https://huggingface.co/docs/transformers/model_doc/roformer)** (ZhuiyiTechnology から), Jianlin Su and Yu Lu and Shengfeng Pan and Bo Wen and Yunfeng Liu から公開された研究論文: [RoFormer: Enhanced Transformer with Rotary Position Embedding](https://arxiv.org/abs/2104.09864)
-1. **[RWKV](https://huggingface.co/docs/transformers/model_doc/rwkv)** (Bo Peng から) Bo Peng. から公開された研究論文 [this repo](https://github.com/BlinkDL/RWKV-LM)
 1. **[SegFormer](https://huggingface.co/docs/transformers/model_doc/segformer)** (NVIDIA から) Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping Luo から公開された研究論文: [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://arxiv.org/abs/2105.15203)
-1. **[Segment Anything](https://huggingface.co/docs/transformers/model_doc/sam)** (Meta AI から) Alexander Kirillov, Eric Mintun, Nikhila Ravi, Hanzi Mao, Chloe Rolland, Laura Gustafson, Tete Xiao, Spencer Whitehead, Alex Berg, Wan-Yen Lo, Piotr Dollar, Ross Girshick. から公開された研究論文 [Segment Anything](https://arxiv.org/pdf/2304.02643v1.pdf)
 1. **[SEW](https://huggingface.co/docs/transformers/model_doc/sew)** (ASAPP から) Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi から公開された研究論文: [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870)
 1. **[SEW-D](https://huggingface.co/docs/transformers/model_doc/sew_d)** (ASAPP から) Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi から公開された研究論文: [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870)
 1. **[SpeechT5](https://huggingface.co/docs/transformers/model_doc/speecht5)** (Microsoft Research から) Junyi Ao, Rui Wang, Long Zhou, Chengyi Wang, Shuo Ren, Yu Wu, Shujie Liu, Tom Ko, Qing Li, Yu Zhang, Zhihua Wei, Yao Qian, Jinyu Li, Furu Wei. から公開された研究論文 [SpeechT5: Unified-Modal Encoder-Decoder Pre-Training for Spoken Language Processing](https://arxiv.org/abs/2110.07205)
@ -454,7 +445,6 @@ Flax、PyTorch、TensorFlowをcondaでインストールする方法は、それ
 1. **[SpeechToTextTransformer2](https://huggingface.co/docs/transformers/model_doc/speech_to_text_2)** (Facebook から), Changhan Wang, Anne Wu, Juan Pino, Alexei Baevski, Michael Auli, Alexis Conneau から公開された研究論文: [Large-Scale Self- and Semi-Supervised Learning for Speech Translation](https://arxiv.org/abs/2104.06678)
 1. **[Splinter](https://huggingface.co/docs/transformers/model_doc/splinter)** (Tel Aviv University から), Ori Ram, Yuval Kirstain, Jonathan Berant, Amir Globerson, Omer Levy から公開された研究論文: [Few-Shot Question Answering by Pretraining Span Selection](https://arxiv.org/abs/2101.00438)
 1. **[SqueezeBERT](https://huggingface.co/docs/transformers/model_doc/squeezebert)** (Berkeley から) Forrest N. Iandola, Albert E. Shaw, Ravi Krishna, and Kurt W. Keutzer から公開された研究論文: [SqueezeBERT: What can computer vision teach NLP about efficient neural networks?](https://arxiv.org/abs/2006.11316)
-1. **[SwiftFormer](https://huggingface.co/docs/transformers/main/model_doc/swiftformer)** (MBZUAI から) Abdelrahman Shaker, Muhammad Maaz, Hanoona Rasheed, Salman Khan, Ming-Hsuan Yang, Fahad Shahbaz Khan. から公開された研究論文 [SwiftFormer: Efficient Additive Attention for Transformer-based Real-time Mobile Vision Applications](https://arxiv.org/abs/2303.15446)
 1. **[Swin Transformer](https://huggingface.co/docs/transformers/model_doc/swin)** (Microsoft から) Ze Liu, Yutong Lin, Yue Cao, Han Hu, Yixuan Wei, Zheng Zhang, Stephen Lin, Baining Guo から公開された研究論文: [Swin Transformer: Hierarchical Vision Transformer using Shifted Windows](https://arxiv.org/abs/2103.14030)
 1. **[Swin Transformer V2](https://huggingface.co/docs/transformers/model_doc/swinv2)** (Microsoft から) Ze Liu, Han Hu, Yutong Lin, Zhuliang Yao, Zhenda Xie, Yixuan Wei, Jia Ning, Yue Cao, Zheng Zhang, Li Dong, Furu Wei, Baining Guo から公開された研究論文: [Swin Transformer V2: Scaling Up Capacity and Resolution](https://arxiv.org/abs/2111.09883)
 1. **[Swin2SR](https://huggingface.co/docs/transformers/model_doc/swin2sr)** (University of Würzburg から) Marcos V. Conde, Ui-Jin Choi, Maxime Burchi, Radu Timofte から公開された研究論文: [Swin2SR: SwinV2 Transformer for Compressed Image Super-Resolution and Restoration](https://arxiv.org/abs/2209.11345)
--- a/README_ko.md
+++ b/README_ko.md
@ -238,7 +238,7 @@ Flax, PyTorch, TensorFlow 설치 페이지에서 이들을 conda로 설치하는
 1. **[CamemBERT](https://huggingface.co/docs/transformers/model_doc/camembert)** (Inria/Facebook/Sorbonne 에서) Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot 의 [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) 논문과 함께 발표했습니다.
 1. **[CANINE](https://huggingface.co/docs/transformers/model_doc/canine)** (Google Research 에서) Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting 의 [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) 논문과 함께 발표했습니다.
 1. **[Chinese-CLIP](https://huggingface.co/docs/transformers/model_doc/chinese_clip)** (OFA-Sys 에서) An Yang, Junshu Pan, Junyang Lin, Rui Men, Yichang Zhang, Jingren Zhou, Chang Zhou 의 [Chinese CLIP: Contrastive Vision-Language Pretraining in Chinese](https://arxiv.org/abs/2211.01335) 논문과 함께 발표했습니다.
-1. **[CLAP](https://huggingface.co/docs/transformers/model_doc/clap)** (LAION-AI 에서 제공)은 Yusong Wu, Ke Chen, Tianyu Zhang, Yuchen Hui, Taylor Berg-Kirkpatrick, Shlomo Dubnov.의 [Large-scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation](https://arxiv.org/abs/2211.06687)논문과 함께 발표했습니다.
+1. **[CLAP](https://huggingface.co/docs/transformers/model_doc/clap)** (LAION-AI 에서 제공)은 Yusong Wu, Ke Chen, Tianyu Zhang, Yuchen Hui, Taylor Berg-Kirkpatrick, Shlomo Dubnov.의 [Large-scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation](https://arxiv.org/abs/2211.06687)논문과 함께 발표했습니다.
 1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (OpenAI 에서) Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever 의 [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) 논문과 함께 발표했습니다.
 1. **[CLIPSeg](https://huggingface.co/docs/transformers/model_doc/clipseg)** (University of Göttingen 에서) Timo Lüddecke and Alexander Ecker 의 [Image Segmentation Using Text and Image Prompts](https://arxiv.org/abs/2112.10003) 논문과 함께 발표했습니다.
 1. **[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (Salesforce 에서) Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong 의 [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) 논문과 함께 발표했습니다.
@ -247,7 +247,6 @@ Flax, PyTorch, TensorFlow 설치 페이지에서 이들을 conda로 설치하는
 1. **[ConvNeXT](https://huggingface.co/docs/transformers/model_doc/convnext)** (Facebook AI 에서) Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie 의 [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) 논문과 함께 발표했습니다.
 1. **[ConvNeXTV2](https://huggingface.co/docs/transformers/model_doc/convnextv2)** (from Facebook AI) released with the paper [ConvNeXt V2: Co-designing and Scaling ConvNets with Masked Autoencoders](https://arxiv.org/abs/2301.00808) by Sanghyun Woo, Shoubhik Debnath, Ronghang Hu, Xinlei Chen, Zhuang Liu, In So Kweon, Saining Xie.
 1. **[CPM](https://huggingface.co/docs/transformers/model_doc/cpm)** (Tsinghua University 에서) Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun 의 [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) 논문과 함께 발표했습니다.
-1. **[CPM-Ant](https://huggingface.co/docs/transformers/model_doc/cpmant)** (from OpenBMB) released by the [OpenBMB](https://www.openbmb.org/).
 1. **[CTRL](https://huggingface.co/docs/transformers/model_doc/ctrl)** (Salesforce 에서) Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher 의 [CTRL: A Conditional Transformer Language Model for Controllable Generation](https://arxiv.org/abs/1909.05858) 논문과 함께 발표했습니다.
 1. **[CvT](https://huggingface.co/docs/transformers/model_doc/cvt)** (Microsoft 에서) Haiping Wu, Bin Xiao, Noel Codella, Mengchen Liu, Xiyang Dai, Lu Yuan, Lei Zhang 의 [CvT: Introducing Convolutions to Vision Transformers](https://arxiv.org/abs/2103.15808) 논문과 함께 발표했습니다.
 1. **[Data2Vec](https://huggingface.co/docs/transformers/model_doc/data2vec)** (Facebook 에서) Alexei Baevski, Wei-Ning Hsu, Qiantong Xu, Arun Babu, Jiatao Gu, Michael Auli 의 [Data2Vec:  A General Framework for Self-supervised Learning in Speech, Vision and Language](https://arxiv.org/abs/2202.03555) 논문과 함께 발표했습니다.
@ -256,7 +255,6 @@ Flax, PyTorch, TensorFlow 설치 페이지에서 이들을 conda로 설치하는
 1. **[Decision Transformer](https://huggingface.co/docs/transformers/model_doc/decision_transformer)** (Berkeley/Facebook/Google 에서) Lili Chen, Kevin Lu, Aravind Rajeswaran, Kimin Lee, Aditya Grover, Michael Laskin, Pieter Abbeel, Aravind Srinivas, Igor Mordatch 의 [Decision Transformer: Reinforcement Learning via Sequence Modeling](https://arxiv.org/abs/2106.01345) 논문과 함께 발표했습니다.
 1. **[Deformable DETR](https://huggingface.co/docs/transformers/model_doc/deformable_detr)** (SenseTime Research 에서) Xizhou Zhu, Weijie Su, Lewei Lu, Bin Li, Xiaogang Wang, Jifeng Dai 의 [Deformable DETR: Deformable Transformers for End-to-End Object Detection](https://arxiv.org/abs/2010.04159) 논문과 함께 발표했습니다.
 1. **[DeiT](https://huggingface.co/docs/transformers/model_doc/deit)** (Facebook 에서) Hugo Touvron, Matthieu Cord, Matthijs Douze, Francisco Massa, Alexandre Sablayrolles, Hervé Jégou 의 [Training data-efficient image transformers & distillation through attention](https://arxiv.org/abs/2012.12877) 논문과 함께 발표했습니다.
-1. **[DePlot](https://huggingface.co/docs/transformers/model_doc/deplot)** (Google AI 에서 제공)은 Fangyu Liu, Julian Martin Eisenschlos, Francesco Piccinno, Syrine Krichene, Chenxi Pang, Kenton Lee, Mandar Joshi, Wenhu Chen, Nigel Collier, Yasemin Altun.의 [DePlot: One-shot visual language reasoning by plot-to-table translation](https://arxiv.org/abs/2212.10505)논문과 함께 발표했습니다.
 1. **[DETA](https://huggingface.co/docs/transformers/model_doc/deta)** (The University of Texas at Austin 에서 제공)은 Jeffrey Ouyang-Zhang, Jang Hyun Cho, Xingyi Zhou, Philipp Krähenbühl.의 [NMS Strikes Back](https://arxiv.org/abs/2212.06137)논문과 함께 발표했습니다.
 1. **[DETR](https://huggingface.co/docs/transformers/model_doc/detr)** (Facebook 에서) Nicolas Carion, Francisco Massa, Gabriel Synnaeve, Nicolas Usunier, Alexander Kirillov, Sergey Zagoruyko 의 [End-to-End Object Detection with Transformers](https://arxiv.org/abs/2005.12872) 논문과 함께 발표했습니다.
 1. **[DialoGPT](https://huggingface.co/docs/transformers/model_doc/dialogpt)** (Microsoft Research 에서) Yizhe Zhang, Siqi Sun, Michel Galley, Yen-Chun Chen, Chris Brockett, Xiang Gao, Jianfeng Gao, Jingjing Liu, Bill Dolan 의 [DialoGPT: Large-Scale Generative Pre-training for Conversational Response Generation](https://arxiv.org/abs/1911.00536) 논문과 함께 발표했습니다.
@ -274,11 +272,10 @@ Flax, PyTorch, TensorFlow 설치 페이지에서 이들을 conda로 설치하는
 1. **[ErnieM](https://huggingface.co/docs/transformers/model_doc/ernie_m)** (Baidu 에서 제공)은 Xuan Ouyang, Shuohuan Wang, Chao Pang, Yu Sun, Hao Tian, Hua Wu, Haifeng Wang.의 [ERNIE-M: Enhanced Multilingual Representation by Aligning Cross-lingual Semantics with Monolingual Corpora](https://arxiv.org/abs/2012.15674)논문과 함께 발표했습니다.
 1. **[ESM](https://huggingface.co/docs/transformers/model_doc/esm)** (from Meta AI) are transformer protein language models.  **ESM-1b** was released with the paper [Biological structure and function emerge from scaling unsupervised learning to 250 million protein sequences](https://www.pnas.org/content/118/15/e2016239118) by Alexander Rives, Joshua Meier, Tom Sercu, Siddharth Goyal, Zeming Lin, Jason Liu, Demi Guo, Myle Ott, C. Lawrence Zitnick, Jerry Ma, and Rob Fergus. **ESM-1v** was released with the paper [Language models enable zero-shot prediction of the effects of mutations on protein function](https://doi.org/10.1101/2021.07.09.450648) by Joshua Meier, Roshan Rao, Robert Verkuil, Jason Liu, Tom Sercu and Alexander Rives. **ESM-2** was released with the paper [Language models of protein sequences at the scale of evolution enable accurate structure prediction](https://doi.org/10.1101/2022.07.20.500902) by Zeming Lin, Halil Akin, Roshan Rao, Brian Hie, Zhongkai Zhu, Wenting Lu, Allan dos Santos Costa, Maryam Fazel-Zarandi, Tom Sercu, Sal Candido, Alexander Rives.
 1. **[FLAN-T5](https://huggingface.co/docs/transformers/model_doc/flan-t5)** (from Google AI) released in the repository [google-research/t5x](https://github.com/google-research/t5x/blob/main/docs/models.md#flan-t5-checkpoints) by Hyung Won Chung, Le Hou, Shayne Longpre, Barret Zoph, Yi Tay, William Fedus, Eric Li, Xuezhi Wang, Mostafa Dehghani, Siddhartha Brahma, Albert Webson, Shixiang Shane Gu, Zhuyun Dai, Mirac Suzgun, Xinyun Chen, Aakanksha Chowdhery, Sharan Narang, Gaurav Mishra, Adams Yu, Vincent Zhao, Yanping Huang, Andrew Dai, Hongkun Yu, Slav Petrov, Ed H. Chi, Jeff Dean, Jacob Devlin, Adam Roberts, Denny Zhou, Quoc V. Le, and Jason Wei
-1. **[FLAN-UL2](https://huggingface.co/docs/transformers/model_doc/flan-ul2)** (from Google AI) released in the repository [google-research/t5x](https://github.com/google-research/t5x/blob/main/docs/models.md#flan-ul2-checkpoints) by Hyung Won Chung, Le Hou, Shayne Longpre, Barret Zoph, Yi Tay, William Fedus, Eric Li, Xuezhi Wang, Mostafa Dehghani, Siddhartha Brahma, Albert Webson, Shixiang Shane Gu, Zhuyun Dai, Mirac Suzgun, Xinyun Chen, Aakanksha Chowdhery, Sharan Narang, Gaurav Mishra, Adams Yu, Vincent Zhao, Yanping Huang, Andrew Dai, Hongkun Yu, Slav Petrov, Ed H. Chi, Jeff Dean, Jacob Devlin, Adam Roberts, Denny Zhou, Quoc V. Le, and Jason Wei
+1. **[FLAN-UL2](https://huggingface.co/docs/transformers/model_doc/flan-ul2)** (from Google AI) released in the repository [google-research/t5x](https://github.com/google-research/t5x/blob/main/docs/models.md#flan-ul2-checkpoints) by Hyung Won Chung, Le Hou, Shayne Longpre, Barret Zoph, Yi Tay, William Fedus, Eric Li, Xuezhi Wang, Mostafa Dehghani, Siddhartha Brahma, Albert Webson, Shixiang Shane Gu, Zhuyun Dai, Mirac Suzgun, Xinyun Chen, Aakanksha Chowdhery, Sharan Narang, Gaurav Mishra, Adams Yu, Vincent Zhao, Yanping Huang, Andrew Dai, Hongkun Yu, Slav Petrov, Ed H. Chi, Jeff Dean, Jacob Devlin, Adam Roberts, Denny Zhou, Quoc V. Le, and Jason Wei 
 1. **[FlauBERT](https://huggingface.co/docs/transformers/model_doc/flaubert)** (from CNRS) released with the paper [FlauBERT: Unsupervised Language Model Pre-training for French](https://arxiv.org/abs/1912.05372) by Hang Le, Loïc Vial, Jibril Frej, Vincent Segonne, Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, Benoît Crabbé, Laurent Besacier, Didier Schwab.
 1. **[FLAVA](https://huggingface.co/docs/transformers/model_doc/flava)** (from Facebook AI) released with the paper [FLAVA: A Foundational Language And Vision Alignment Model](https://arxiv.org/abs/2112.04482) by Amanpreet Singh, Ronghang Hu, Vedanuj Goswami, Guillaume Couairon, Wojciech Galuba, Marcus Rohrbach, and Douwe Kiela.
 1. **[FNet](https://huggingface.co/docs/transformers/model_doc/fnet)** (from Google Research) released with the paper [FNet: Mixing Tokens with Fourier Transforms](https://arxiv.org/abs/2105.03824) by James Lee-Thorp, Joshua Ainslie, Ilya Eckstein, Santiago Ontanon.
-1. **[FocalNet](https://huggingface.co/docs/transformers/model_doc/focalnet)** (from Microsoft Research) released with the paper [Focal Modulation Networks](https://arxiv.org/abs/2203.11926) by Jianwei Yang, Chunyuan Li, Xiyang Dai, Lu Yuan, Jianfeng Gao.
 1. **[Funnel Transformer](https://huggingface.co/docs/transformers/model_doc/funnel)** (from CMU/Google Brain) released with the paper [Funnel-Transformer: Filtering out Sequential Redundancy for Efficient Language Processing](https://arxiv.org/abs/2006.03236) by Zihang Dai, Guokun Lai, Yiming Yang, Quoc V. Le.
 1. **[GIT](https://huggingface.co/docs/transformers/model_doc/git)** (from Microsoft Research) released with the paper [GIT: A Generative Image-to-text Transformer for Vision and Language](https://arxiv.org/abs/2205.14100) by Jianfeng Wang, Zhengyuan Yang, Xiaowei Hu, Linjie Li, Kevin Lin, Zhe Gan, Zicheng Liu, Ce Liu, Lijuan Wang.
 1. **[GLPN](https://huggingface.co/docs/transformers/model_doc/glpn)** (from KAIST) released with the paper [Global-Local Path Networks for Monocular Depth Estimation with Vertical CutDepth](https://arxiv.org/abs/2201.07436) by Doyeon Kim, Woonghyun Ga, Pyungwhan Ahn, Donggyu Joo, Sehwan Chun, Junmo Kim.
@ -289,7 +286,6 @@ Flax, PyTorch, TensorFlow 설치 페이지에서 이들을 conda로 설치하는
 1. **[GPT-2](https://huggingface.co/docs/transformers/model_doc/gpt2)** (OpenAI 에서) Alec Radford*, Jeffrey Wu*, Rewon Child, David Luan, Dario Amodei** and Ilya Sutskever** 의 [Language Models are Unsupervised Multitask Learners](https://blog.openai.com/better-language-models/) 논문과 함께 발표했습니다.
 1. **[GPT-J](https://huggingface.co/docs/transformers/model_doc/gptj)** (from EleutherAI) released in the repository [kingoflolz/mesh-transformer-jax](https://github.com/kingoflolz/mesh-transformer-jax/) by Ben Wang and Aran Komatsuzaki.
 1. **[GPT-Sw3](https://huggingface.co/docs/transformers/model_doc/gpt-sw3)** (AI-Sweden 에서) Ariel Ekgren, Amaru Cuba Gyllensten, Evangelia Gogoulou, Alice Heiman, Severine Verlinden, Joey Öhman, Fredrik Carlsson, Magnus Sahlgren. 의 [Lessons Learned from GPT-SW3: Building the First Large-Scale Generative Language Model for Swedish](http://www.lrec-conf.org/proceedings/lrec2022/pdf/2022.lrec-1.376.pdf) 논문과 함께 발표했습니다.
-1. **[GPTBigCode](https://huggingface.co/docs/transformers/model_doc/gpt_bigcode)** (BigCode 에서 제공)은 Loubna Ben Allal, Raymond Li, Denis Kocetkov, Chenghao Mou, Christopher Akiki, Carlos Munoz Ferrandis, Niklas Muennighoff, Mayank Mishra, Alex Gu, Manan Dey, Logesh Kumar Umapathi, Carolyn Jane Anderson, Yangtian Zi, Joel Lamy Poirier, Hailey Schoelkopf, Sergey Troshin, Dmitry Abulkhanov, Manuel Romero, Michael Lappert, Francesco De Toni, Bernardo García del Río, Qian Liu, Shamik Bose, Urvashi Bhattacharyya, Terry Yue Zhuo, Ian Yu, Paulo Villegas, Marco Zocca, Sourab Mangrulkar, David Lansky, Huu Nguyen, Danish Contractor, Luis Villa, Jia Li, Dzmitry Bahdanau, Yacine Jernite, Sean Hughes, Daniel Fried, Arjun Guha, Harm de Vries, Leandro von Werra.의 [SantaCoder: don't reach for the stars!](https://arxiv.org/abs/2301.03988)논문과 함께 발표했습니다.
 1. **[GPTSAN-japanese](https://huggingface.co/docs/transformers/model_doc/gptsan-japanese)** released in the repository [tanreinama/GPTSAN](https://github.com/tanreinama/GPTSAN/blob/main/report/model.md) by Toshiyuki Sakamoto(tanreinama).
 1. **[Graphormer](https://huggingface.co/docs/transformers/model_doc/graphormer)** (from Microsoft) Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu  의 [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234)  논문과 함께 발표했습니다.
 1. **[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (UCSD, NVIDIA 에서) Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang 의 [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094) 논문과 함께 발표했습니다.
@ -305,7 +301,7 @@ Flax, PyTorch, TensorFlow 설치 페이지에서 이들을 conda로 설치하는
 1. **[LED](https://huggingface.co/docs/transformers/model_doc/led)** (AllenAI 에서) Iz Beltagy, Matthew E. Peters, Arman Cohan 의 [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) 논문과 함께 발표했습니다.
 1. **[LeViT](https://huggingface.co/docs/transformers/model_doc/levit)** (Meta AI 에서) Ben Graham, Alaaeldin El-Nouby, Hugo Touvron, Pierre Stock, Armand Joulin, Hervé Jégou, Matthijs Douze 의 [LeViT: A Vision Transformer in ConvNet's Clothing for Faster Inference](https://arxiv.org/abs/2104.01136) 논문과 함께 발표했습니다.
 1. **[LiLT](https://huggingface.co/docs/transformers/model_doc/lilt)** (South China University of Technology 에서) Jiapeng Wang, Lianwen Jin, Kai Ding 의 [LiLT: A Simple yet Effective Language-Independent Layout Transformer for Structured Document Understanding](https://arxiv.org/abs/2202.13669) 논문과 함께 발표했습니다.
-1. **[LLaMA](https://huggingface.co/docs/transformers/model_doc/llama)** (The FAIR team of Meta AI 에서 제공)은 Hugo Touvron, Thibaut Lavril, Gautier Izacard, Xavier Martinet, Marie-Anne Lachaux, Timothée Lacroix, Baptiste Rozière, Naman Goyal, Eric Hambro, Faisal Azhar, Aurelien Rodriguez, Armand Joulin, Edouard Grave, Guillaume Lample.의 [LLaMA: Open and Efficient Foundation Language Models](https://arxiv.org/abs/2302.13971)논문과 함께 발표했습니다.
+1. **[LLaMA](https://huggingface.co/docs/transformers/main/model_doc/llama)** (The FAIR team of Meta AI 에서 제공)은 Hugo Touvron, Thibaut Lavril, Gautier Izacard, Xavier Martinet, Marie-Anne Lachaux, Timothée Lacroix, Baptiste Rozière, Naman Goyal, Eric Hambro, Faisal Azhar, Aurelien Rodriguez, Armand Joulin, Edouard Grave, Guillaume Lample.의 [LLaMA: Open and Efficient Foundation Language Models](https://arxiv.org/abs/2302.13971)논문과 함께 발표했습니다.
 1. **[Longformer](https://huggingface.co/docs/transformers/model_doc/longformer)** (AllenAI 에서) Iz Beltagy, Matthew E. Peters, Arman Cohan 의 [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) 논문과 함께 발표했습니다.
 1. **[LongT5](https://huggingface.co/docs/transformers/model_doc/longt5)** (Google AI 에서) Mandy Guo, Joshua Ainslie, David Uthus, Santiago Ontanon, Jianmo Ni, Yun-Hsuan Sung, Yinfei Yang 의 [LongT5: Efficient Text-To-Text Transformer for Long Sequences](https://arxiv.org/abs/2112.07916) 논문과 함께 발표했습니다.
 1. **[LUKE](https://huggingface.co/docs/transformers/model_doc/luke)** (Studio Ousia 에서) Ikuya Yamada, Akari Asai, Hiroyuki Shindo, Hideaki Takeda, Yuji Matsumoto 의 [LUKE: Deep Contextualized Entity Representations with Entity-aware Self-attention](https://arxiv.org/abs/2010.01057) 논문과 함께 발표했습니다.
@ -316,10 +312,9 @@ Flax, PyTorch, TensorFlow 설치 페이지에서 이들을 conda로 설치하는
 1. **[MarkupLM](https://huggingface.co/docs/transformers/model_doc/markuplm)** (Microsoft Research Asia 에서) Junlong Li, Yiheng Xu, Lei Cui, Furu Wei 의 [MarkupLM: Pre-training of Text and Markup Language for Visually-rich Document Understanding](https://arxiv.org/abs/2110.08518) 논문과 함께 발표했습니다.
 1. **[Mask2Former](https://huggingface.co/docs/transformers/model_doc/mask2former)** (FAIR and UIUC 에서 제공)은 Bowen Cheng, Ishan Misra, Alexander G. Schwing, Alexander Kirillov, Rohit Girdhar.의 [Masked-attention Mask Transformer for Universal Image Segmentation](https://arxiv.org/abs/2112.01527)논문과 함께 발표했습니다.
 1. **[MaskFormer](https://huggingface.co/docs/transformers/model_doc/maskformer)** (Meta and UIUC 에서) Bowen Cheng, Alexander G. Schwing, Alexander Kirillov 의 [Per-Pixel Classification is Not All You Need for Semantic Segmentation](https://arxiv.org/abs/2107.06278) 논문과 함께 발표했습니다.
-1. **[MatCha](https://huggingface.co/docs/transformers/model_doc/matcha)** (Google AI 에서 제공)은 Fangyu Liu, Francesco Piccinno, Syrine Krichene, Chenxi Pang, Kenton Lee, Mandar Joshi, Yasemin Altun, Nigel Collier, Julian Martin Eisenschlos.의 [MatCha: Enhancing Visual Language Pretraining with Math Reasoning and Chart Derendering](https://arxiv.org/abs/2212.09662)논문과 함께 발표했습니다.
 1. **[mBART](https://huggingface.co/docs/transformers/model_doc/mbart)** (Facebook 에서) Yinhan Liu, Jiatao Gu, Naman Goyal, Xian Li, Sergey Edunov, Marjan Ghazvininejad, Mike Lewis, Luke Zettlemoyer 의 [Multilingual Denoising Pre-training for Neural Machine Translation](https://arxiv.org/abs/2001.08210) 논문과 함께 발표했습니다.
 1. **[mBART-50](https://huggingface.co/docs/transformers/model_doc/mbart)** (Facebook 에서) Yuqing Tang, Chau Tran, Xian Li, Peng-Jen Chen, Naman Goyal, Vishrav Chaudhary, Jiatao Gu, Angela Fan 의 [Multilingual Translation with Extensible Multilingual Pretraining and Finetuning](https://arxiv.org/abs/2008.00401) 논문과 함께 발표했습니다.
-1. **[MEGA](https://huggingface.co/docs/transformers/model_doc/mega)** (Facebook 에서 제공)은 Xuezhe Ma, Chunting Zhou, Xiang Kong, Junxian He, Liangke Gui, Graham Neubig, Jonathan May, and Luke Zettlemoyer.의 [Mega: Moving Average Equipped Gated Attention](https://arxiv.org/abs/2209.10655)논문과 함께 발표했습니다.
+1. **[MEGA](https://huggingface.co/docs/transformers/main/model_doc/mega)** (Facebook 에서 제공)은 Xuezhe Ma, Chunting Zhou, Xiang Kong, Junxian He, Liangke Gui, Graham Neubig, Jonathan May, and Luke Zettlemoyer.의 [Mega: Moving Average Equipped Gated Attention](https://arxiv.org/abs/2209.10655)논문과 함께 발표했습니다.
 1. **[Megatron-BERT](https://huggingface.co/docs/transformers/model_doc/megatron-bert)** (NVIDIA 에서) Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro 의 [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) 논문과 함께 발표했습니다.
 1. **[Megatron-GPT2](https://huggingface.co/docs/transformers/model_doc/megatron_gpt2)** (NVIDIA 에서) Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro 의 [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) 논문과 함께 발표했습니다.
 1. **[MGP-STR](https://huggingface.co/docs/transformers/model_doc/mgp-str)** (Alibaba Research 에서 제공)은 Peng Wang, Cheng Da, and Cong Yao.의 [Multi-Granularity Prediction for Scene Text Recognition](https://arxiv.org/abs/2209.03592)논문과 함께 발표했습니다.
@ -334,17 +329,15 @@ Flax, PyTorch, TensorFlow 설치 페이지에서 이들을 conda로 설치하는
 1. **[NAT](https://huggingface.co/docs/transformers/model_doc/nat)** (SHI Labs 에서) Ali Hassani, Steven Walton, Jiachen Li, Shen Li, and Humphrey Shi 의 [Neighborhood Attention Transformer](https://arxiv.org/abs/2204.07143) 논문과 함께 발표했습니다.
 1. **[Nezha](https://huggingface.co/docs/transformers/model_doc/nezha)** (Huawei Noah’s Ark Lab 에서) Junqiu Wei, Xiaozhe Ren, Xiaoguang Li, Wenyong Huang, Yi Liao, Yasheng Wang, Jiashu Lin, Xin Jiang, Xiao Chen and Qun Liu 의 [NEZHA: Neural Contextualized Representation for Chinese Language Understanding](https://arxiv.org/abs/1909.00204) 논문과 함께 발표했습니다.
 1. **[NLLB](https://huggingface.co/docs/transformers/model_doc/nllb)** (Meta 에서) the NLLB team 의 [No Language Left Behind: Scaling Human-Centered Machine Translation](https://arxiv.org/abs/2207.04672) 논문과 함께 발표했습니다.
-1. **[NLLB-MOE](https://huggingface.co/docs/transformers/model_doc/nllb-moe)** (Meta 에서 제공)은 the NLLB team.의 [No Language Left Behind: Scaling Human-Centered Machine Translation](https://arxiv.org/abs/2207.04672)논문과 함께 발표했습니다.
 1. **[Nyströmformer](https://huggingface.co/docs/transformers/model_doc/nystromformer)** (the University of Wisconsin - Madison 에서) Yunyang Xiong, Zhanpeng Zeng, Rudrasis Chakraborty, Mingxing Tan, Glenn Fung, Yin Li, Vikas Singh 의 [Nyströmformer: A Nyström-Based Algorithm for Approximating Self-Attention](https://arxiv.org/abs/2102.03902) 논문과 함께 발표했습니다.
 1. **[OneFormer](https://huggingface.co/docs/transformers/model_doc/oneformer)** (SHI Labs 에서) Jitesh Jain, Jiachen Li, MangTik Chiu, Ali Hassani, Nikita Orlov, Humphrey Shi 의 [OneFormer: One Transformer to Rule Universal Image Segmentation](https://arxiv.org/abs/2211.06220) 논문과 함께 발표했습니다.
-1. **[OpenLlama](https://huggingface.co/docs/transformers/model_doc/open-llama)** (from [s-JoL](https://huggingface.co/s-JoL)) released in [Open-Llama](https://github.com/s-JoL/Open-Llama). 
 1. **[OPT](https://huggingface.co/docs/transformers/master/model_doc/opt)** (Meta AI 에서) Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen et al 의 [OPT: Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068) 논문과 함께 발표했습니다.
 1. **[OWL-ViT](https://huggingface.co/docs/transformers/model_doc/owlvit)** (Google AI 에서) Matthias Minderer, Alexey Gritsenko, Austin Stone, Maxim Neumann, Dirk Weissenborn, Alexey Dosovitskiy, Aravindh Mahendran, Anurag Arnab, Mostafa Dehghani, Zhuoran Shen, Xiao Wang, Xiaohua Zhai, Thomas Kipf, and Neil Houlsby 의 [Simple Open-Vocabulary Object Detection with Vision Transformers](https://arxiv.org/abs/2205.06230) 논문과 함께 발표했습니다.
 1. **[Pegasus](https://huggingface.co/docs/transformers/model_doc/pegasus)** (Google 에서) Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu 의 [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) 논문과 함께 발표했습니다.
 1. **[PEGASUS-X](https://huggingface.co/docs/transformers/model_doc/pegasus_x)** (Google 에서) Jason Phang, Yao Zhao, Peter J. Liu 의 [Investigating Efficiently Extending Transformers for Long Input Summarization](https://arxiv.org/abs/2208.04347) 논문과 함께 발표했습니다.
 1. **[Perceiver IO](https://huggingface.co/docs/transformers/model_doc/perceiver)** (Deepmind 에서) Andrew Jaegle, Sebastian Borgeaud, Jean-Baptiste Alayrac, Carl Doersch, Catalin Ionescu, David Ding, Skanda Koppula, Daniel Zoran, Andrew Brock, Evan Shelhamer, Olivier Hénaff, Matthew M. Botvinick, Andrew Zisserman, Oriol Vinyals, João Carreira 의 [Perceiver IO: A General Architecture for Structured Inputs & Outputs](https://arxiv.org/abs/2107.14795) 논문과 함께 발표했습니다.
 1. **[PhoBERT](https://huggingface.co/docs/transformers/model_doc/phobert)** (VinAI Research 에서) Dat Quoc Nguyen and Anh Tuan Nguyen 의 [PhoBERT: Pre-trained language models for Vietnamese](https://www.aclweb.org/anthology/2020.findings-emnlp.92/) 논문과 함께 발표했습니다.
-1. **[Pix2Struct](https://huggingface.co/docs/transformers/model_doc/pix2struct)** (Google 에서 제공)은 Kenton Lee, Mandar Joshi, Iulia Turc, Hexiang Hu, Fangyu Liu, Julian Eisenschlos, Urvashi Khandelwal, Peter Shaw, Ming-Wei Chang, Kristina Toutanova.의 [Pix2Struct: Screenshot Parsing as Pretraining for Visual Language Understanding](https://arxiv.org/abs/2210.03347)논문과 함께 발표했습니다.
+1. **[Pix2Struct](https://huggingface.co/docs/transformers/main/model_doc/pix2struct)** (Google 에서 제공)은 Kenton Lee, Mandar Joshi, Iulia Turc, Hexiang Hu, Fangyu Liu, Julian Eisenschlos, Urvashi Khandelwal, Peter Shaw, Ming-Wei Chang, Kristina Toutanova.의 [Pix2Struct: Screenshot Parsing as Pretraining for Visual Language Understanding](https://arxiv.org/abs/2210.03347)논문과 함께 발표했습니다.
 1. **[PLBart](https://huggingface.co/docs/transformers/model_doc/plbart)** (UCLA NLP 에서) Wasi Uddin Ahmad, Saikat Chakraborty, Baishakhi Ray, Kai-Wei Chang 의 [Unified Pre-training for Program Understanding and Generation](https://arxiv.org/abs/2103.06333) 논문과 함께 발표했습니다.
 1. **[PoolFormer](https://huggingface.co/docs/transformers/model_doc/poolformer)** (Sea AI Labs 에서) Yu, Weihao and Luo, Mi and Zhou, Pan and Si, Chenyang and Zhou, Yichen and Wang, Xinchao and Feng, Jiashi and Yan, Shuicheng 의 [MetaFormer is Actually What You Need for Vision](https://arxiv.org/abs/2111.11418) 논문과 함께 발표했습니다.
 1. **[ProphetNet](https://huggingface.co/docs/transformers/model_doc/prophetnet)** (Microsoft Research 에서) Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou 의 [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) 논문과 함께 발표했습니다.
@ -359,9 +352,7 @@ Flax, PyTorch, TensorFlow 설치 페이지에서 이들을 conda로 설치하는
 1. **[RoBERTa-PreLayerNorm](https://huggingface.co/docs/transformers/model_doc/roberta-prelayernorm)** (Facebook 에서) Myle Ott, Sergey Edunov, Alexei Baevski, Angela Fan, Sam Gross, Nathan Ng, David Grangier, Michael Auli 의 [fairseq: A Fast, Extensible Toolkit for Sequence Modeling](https://arxiv.org/abs/1904.01038) 논문과 함께 발표했습니다.
 1. **[RoCBert](https://huggingface.co/docs/transformers/model_doc/roc_bert)** (WeChatAI 에서) HuiSu, WeiweiShi, XiaoyuShen, XiaoZhou, TuoJi, JiaruiFang, JieZhou 의 [RoCBert: Robust Chinese Bert with Multimodal Contrastive Pretraining](https://aclanthology.org/2022.acl-long.65.pdf) 논문과 함께 발표했습니다.
 1. **[RoFormer](https://huggingface.co/docs/transformers/model_doc/roformer)** (ZhuiyiTechnology 에서) Jianlin Su and Yu Lu and Shengfeng Pan and Bo Wen and Yunfeng Liu 의 [RoFormer: Enhanced Transformer with Rotary Position Embedding](https://arxiv.org/pdf/2104.09864v1.pdf) 논문과 함께 발표했습니다.
-1. **[RWKV](https://huggingface.co/docs/transformers/model_doc/rwkv)** (Bo Peng 에서 제공)은 Bo Peng.의 [this repo](https://github.com/BlinkDL/RWKV-LM)논문과 함께 발표했습니다.
 1. **[SegFormer](https://huggingface.co/docs/transformers/model_doc/segformer)** (NVIDIA 에서) Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping Luo 의 [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://arxiv.org/abs/2105.15203) 논문과 함께 발표했습니다.
-1. **[Segment Anything](https://huggingface.co/docs/transformers/model_doc/sam)** (Meta AI 에서 제공)은 Alexander Kirillov, Eric Mintun, Nikhila Ravi, Hanzi Mao, Chloe Rolland, Laura Gustafson, Tete Xiao, Spencer Whitehead, Alex Berg, Wan-Yen Lo, Piotr Dollar, Ross Girshick.의 [Segment Anything](https://arxiv.org/pdf/2304.02643v1.pdf)논문과 함께 발표했습니다.
 1. **[SEW](https://huggingface.co/docs/transformers/model_doc/sew)** (ASAPP 에서) Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi 의 [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) 논문과 함께 발표했습니다.
 1. **[SEW-D](https://huggingface.co/docs/transformers/model_doc/sew_d)** (ASAPP 에서) Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi 의 [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) 논문과 함께 발표했습니다.
 1. **[SpeechT5](https://huggingface.co/docs/transformers/model_doc/speecht5)** (Microsoft Research 에서 제공)은 Junyi Ao, Rui Wang, Long Zhou, Chengyi Wang, Shuo Ren, Yu Wu, Shujie Liu, Tom Ko, Qing Li, Yu Zhang, Zhihua Wei, Yao Qian, Jinyu Li, Furu Wei.의 [SpeechT5: Unified-Modal Encoder-Decoder Pre-Training for Spoken Language Processing](https://arxiv.org/abs/2110.07205)논문과 함께 발표했습니다.
@ -369,7 +360,6 @@ Flax, PyTorch, TensorFlow 설치 페이지에서 이들을 conda로 설치하는
 1. **[SpeechToTextTransformer2](https://huggingface.co/docs/transformers/model_doc/speech_to_text_2)** (Facebook 에서) Changhan Wang, Anne Wu, Juan Pino, Alexei Baevski, Michael Auli, Alexis Conneau 의 [Large-Scale Self- and Semi-Supervised Learning for Speech Translation](https://arxiv.org/abs/2104.06678) 논문과 함께 발표했습니다.
 1. **[Splinter](https://huggingface.co/docs/transformers/model_doc/splinter)** (Tel Aviv University 에서) Ori Ram, Yuval Kirstain, Jonathan Berant, Amir Globerson, Omer Levy 의 [Few-Shot Question Answering by Pretraining Span Selection](https://arxiv.org/abs/2101.00438) 논문과 함께 발표했습니다.
 1. **[SqueezeBERT](https://huggingface.co/docs/transformers/model_doc/squeezebert)** (Berkeley 에서) Forrest N. Iandola, Albert E. Shaw, Ravi Krishna, and Kurt W. Keutzer 의 [SqueezeBERT: What can computer vision teach NLP about efficient neural networks?](https://arxiv.org/abs/2006.11316) 논문과 함께 발표했습니다.
-1. **[SwiftFormer](https://huggingface.co/docs/transformers/main/model_doc/swiftformer)** (MBZUAI 에서 제공)은 Abdelrahman Shaker, Muhammad Maaz, Hanoona Rasheed, Salman Khan, Ming-Hsuan Yang, Fahad Shahbaz Khan.의 [SwiftFormer: Efficient Additive Attention for Transformer-based Real-time Mobile Vision Applications](https://arxiv.org/abs/2303.15446)논문과 함께 발표했습니다.
 1. **[Swin Transformer](https://huggingface.co/docs/transformers/model_doc/swin)** (Microsoft 에서) Ze Liu, Yutong Lin, Yue Cao, Han Hu, Yixuan Wei, Zheng Zhang, Stephen Lin, Baining Guo 의 [Swin Transformer: Hierarchical Vision Transformer using Shifted Windows](https://arxiv.org/abs/2103.14030) 논문과 함께 발표했습니다.
 1. **[Swin Transformer V2](https://huggingface.co/docs/transformers/model_doc/swinv2)** (Microsoft 에서) Ze Liu, Han Hu, Yutong Lin, Zhuliang Yao, Zhenda Xie, Yixuan Wei, Jia Ning, Yue Cao, Zheng Zhang, Li Dong, Furu Wei, Baining Guo 의 [Swin Transformer V2: Scaling Up Capacity and Resolution](https://arxiv.org/abs/2111.09883) 논문과 함께 발표했습니다.
 1. **[Swin2SR](https://huggingface.co/docs/transformers/model_doc/swin2sr)** (University of Würzburg 에서) Marcos V. Conde, Ui-Jin Choi, Maxime Burchi, Radu Timofte 의 [Swin2SR: SwinV2 Transformer for Compressed Image Super-Resolution and Restoration](https://arxiv.org/abs/2209.11345) 논문과 함께 발표했습니다.
--- a/README_zh-hans.md
+++ b/README_zh-hans.md
@ -262,7 +262,7 @@ conda install -c huggingface transformers
 1. **[CamemBERT](https://huggingface.co/docs/transformers/model_doc/camembert)** (来自 Inria/Facebook/Sorbonne) 伴随论文 [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) 由 Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot 发布。
 1. **[CANINE](https://huggingface.co/docs/transformers/model_doc/canine)** (来自 Google Research) 伴随论文 [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) 由 Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting 发布。
 1. **[Chinese-CLIP](https://huggingface.co/docs/transformers/model_doc/chinese_clip)** (来自 OFA-Sys) 伴随论文 [Chinese CLIP: Contrastive Vision-Language Pretraining in Chinese](https://arxiv.org/abs/2211.01335) 由 An Yang, Junshu Pan, Junyang Lin, Rui Men, Yichang Zhang, Jingren Zhou, Chang Zhou 发布。
-1. **[CLAP](https://huggingface.co/docs/transformers/model_doc/clap)** (来自 LAION-AI) 伴随论文 [Large-scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation](https://arxiv.org/abs/2211.06687) 由 Yusong Wu, Ke Chen, Tianyu Zhang, Yuchen Hui, Taylor Berg-Kirkpatrick, Shlomo Dubnov 发布。
+1. **[CLAP](https://huggingface.co/docs/transformers/model_doc/clap)** (来自 LAION-AI) 伴随论文 [Large-scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation](https://arxiv.org/abs/2211.06687) 由 Yusong Wu, Ke Chen, Tianyu Zhang, Yuchen Hui, Taylor Berg-Kirkpatrick, Shlomo Dubnov 发布。
 1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (来自 OpenAI) 伴随论文 [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) 由 Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever 发布。
 1. **[CLIPSeg](https://huggingface.co/docs/transformers/model_doc/clipseg)** (来自 University of Göttingen) 伴随论文 [Image Segmentation Using Text and Image Prompts](https://arxiv.org/abs/2112.10003) 由 Timo Lüddecke and Alexander Ecker 发布。
 1. **[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (来自 Salesforce) 伴随论文 [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) 由 Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong 发布。
@ -271,7 +271,6 @@ conda install -c huggingface transformers
 1. **[ConvNeXT](https://huggingface.co/docs/transformers/model_doc/convnext)** (来自 Facebook AI) 伴随论文 [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) 由 Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie 发布。
 1. **[ConvNeXTV2](https://huggingface.co/docs/transformers/model_doc/convnextv2)** (from Facebook AI) released with the paper [ConvNeXt V2: Co-designing and Scaling ConvNets with Masked Autoencoders](https://arxiv.org/abs/2301.00808) by Sanghyun Woo, Shoubhik Debnath, Ronghang Hu, Xinlei Chen, Zhuang Liu, In So Kweon, Saining Xie.
 1. **[CPM](https://huggingface.co/docs/transformers/model_doc/cpm)** (来自 Tsinghua University) 伴随论文 [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) 由 Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun 发布。
-1. **[CPM-Ant](https://huggingface.co/docs/transformers/model_doc/cpmant)** (from OpenBMB) released by the [OpenBMB](https://www.openbmb.org/).
 1. **[CTRL](https://huggingface.co/docs/transformers/model_doc/ctrl)** (来自 Salesforce) 伴随论文 [CTRL: A Conditional Transformer Language Model for Controllable Generation](https://arxiv.org/abs/1909.05858) 由 Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher 发布。
 1. **[CvT](https://huggingface.co/docs/transformers/model_doc/cvt)** (来自 Microsoft) 伴随论文 [CvT: Introducing Convolutions to Vision Transformers](https://arxiv.org/abs/2103.15808) 由 Haiping Wu, Bin Xiao, Noel Codella, Mengchen Liu, Xiyang Dai, Lu Yuan, Lei Zhang 发布。
 1. **[Data2Vec](https://huggingface.co/docs/transformers/model_doc/data2vec)** (来自 Facebook) 伴随论文 [Data2Vec:  A General Framework for Self-supervised Learning in Speech, Vision and Language](https://arxiv.org/abs/2202.03555) 由 Alexei Baevski, Wei-Ning Hsu, Qiantong Xu, Arun Babu, Jiatao Gu, Michael Auli 发布。
@ -280,7 +279,6 @@ conda install -c huggingface transformers
 1. **[Decision Transformer](https://huggingface.co/docs/transformers/model_doc/decision_transformer)** (来自 Berkeley/Facebook/Google) 伴随论文 [Decision Transformer: Reinforcement Learning via Sequence Modeling](https://arxiv.org/abs/2106.01345) 由 Lili Chen, Kevin Lu, Aravind Rajeswaran, Kimin Lee, Aditya Grover, Michael Laskin, Pieter Abbeel, Aravind Srinivas, Igor Mordatch 发布。
 1. **[Deformable DETR](https://huggingface.co/docs/transformers/model_doc/deformable_detr)** (来自 SenseTime Research) 伴随论文 [Deformable DETR: Deformable Transformers for End-to-End Object Detection](https://arxiv.org/abs/2010.04159) 由 Xizhou Zhu, Weijie Su, Lewei Lu, Bin Li, Xiaogang Wang, Jifeng Dai 发布。
 1. **[DeiT](https://huggingface.co/docs/transformers/model_doc/deit)** (来自 Facebook) 伴随论文 [Training data-efficient image transformers & distillation through attention](https://arxiv.org/abs/2012.12877) 由 Hugo Touvron, Matthieu Cord, Matthijs Douze, Francisco Massa, Alexandre Sablayrolles, Hervé Jégou 发布。
-1. **[DePlot](https://huggingface.co/docs/transformers/model_doc/deplot)** (来自 Google AI) 伴随论文 [DePlot: One-shot visual language reasoning by plot-to-table translation](https://arxiv.org/abs/2212.10505) 由 Fangyu Liu, Julian Martin Eisenschlos, Francesco Piccinno, Syrine Krichene, Chenxi Pang, Kenton Lee, Mandar Joshi, Wenhu Chen, Nigel Collier, Yasemin Altun 发布。
 1. **[DETA](https://huggingface.co/docs/transformers/model_doc/deta)** (来自 The University of Texas at Austin) 伴随论文 [NMS Strikes Back](https://arxiv.org/abs/2212.06137) 由 Jeffrey Ouyang-Zhang, Jang Hyun Cho, Xingyi Zhou, Philipp Krähenbühl 发布。
 1. **[DETR](https://huggingface.co/docs/transformers/model_doc/detr)** (来自 Facebook) 伴随论文 [End-to-End Object Detection with Transformers](https://arxiv.org/abs/2005.12872) 由 Nicolas Carion, Francisco Massa, Gabriel Synnaeve, Nicolas Usunier, Alexander Kirillov, Sergey Zagoruyko 发布。
 1. **[DialoGPT](https://huggingface.co/docs/transformers/model_doc/dialogpt)** (来自 Microsoft Research) 伴随论文 [DialoGPT: Large-Scale Generative Pre-training for Conversational Response Generation](https://arxiv.org/abs/1911.00536) 由 Yizhe Zhang, Siqi Sun, Michel Galley, Yen-Chun Chen, Chris Brockett, Xiang Gao, Jianfeng Gao, Jingjing Liu, Bill Dolan 发布。
@ -298,11 +296,10 @@ conda install -c huggingface transformers
 1. **[ErnieM](https://huggingface.co/docs/transformers/model_doc/ernie_m)** (来自 Baidu) 伴随论文 [ERNIE-M: Enhanced Multilingual Representation by Aligning Cross-lingual Semantics with Monolingual Corpora](https://arxiv.org/abs/2012.15674) 由 Xuan Ouyang, Shuohuan Wang, Chao Pang, Yu Sun, Hao Tian, Hua Wu, Haifeng Wang 发布。
 1. **[ESM](https://huggingface.co/docs/transformers/model_doc/esm)** (from Meta AI) are transformer protein language models.  **ESM-1b** was released with the paper [Biological structure and function emerge from scaling unsupervised learning to 250 million protein sequences](https://www.pnas.org/content/118/15/e2016239118) by Alexander Rives, Joshua Meier, Tom Sercu, Siddharth Goyal, Zeming Lin, Jason Liu, Demi Guo, Myle Ott, C. Lawrence Zitnick, Jerry Ma, and Rob Fergus. **ESM-1v** was released with the paper [Language models enable zero-shot prediction of the effects of mutations on protein function](https://doi.org/10.1101/2021.07.09.450648) by Joshua Meier, Roshan Rao, Robert Verkuil, Jason Liu, Tom Sercu and Alexander Rives. **ESM-2** was released with the paper [Language models of protein sequences at the scale of evolution enable accurate structure prediction](https://doi.org/10.1101/2022.07.20.500902) by Zeming Lin, Halil Akin, Roshan Rao, Brian Hie, Zhongkai Zhu, Wenting Lu, Allan dos Santos Costa, Maryam Fazel-Zarandi, Tom Sercu, Sal Candido, Alexander Rives.
 1. **[FLAN-T5](https://huggingface.co/docs/transformers/model_doc/flan-t5)** (from Google AI) released in the repository [google-research/t5x](https://github.com/google-research/t5x/blob/main/docs/models.md#flan-t5-checkpoints) by Hyung Won Chung, Le Hou, Shayne Longpre, Barret Zoph, Yi Tay, William Fedus, Eric Li, Xuezhi Wang, Mostafa Dehghani, Siddhartha Brahma, Albert Webson, Shixiang Shane Gu, Zhuyun Dai, Mirac Suzgun, Xinyun Chen, Aakanksha Chowdhery, Sharan Narang, Gaurav Mishra, Adams Yu, Vincent Zhao, Yanping Huang, Andrew Dai, Hongkun Yu, Slav Petrov, Ed H. Chi, Jeff Dean, Jacob Devlin, Adam Roberts, Denny Zhou, Quoc V. Le, and Jason Wei
-1. **[FLAN-UL2](https://huggingface.co/docs/transformers/model_doc/flan-ul2)** (from Google AI) released in the repository [google-research/t5x](https://github.com/google-research/t5x/blob/main/docs/models.md#flan-ul2-checkpoints) by Hyung Won Chung, Le Hou, Shayne Longpre, Barret Zoph, Yi Tay, William Fedus, Eric Li, Xuezhi Wang, Mostafa Dehghani, Siddhartha Brahma, Albert Webson, Shixiang Shane Gu, Zhuyun Dai, Mirac Suzgun, Xinyun Chen, Aakanksha Chowdhery, Sharan Narang, Gaurav Mishra, Adams Yu, Vincent Zhao, Yanping Huang, Andrew Dai, Hongkun Yu, Slav Petrov, Ed H. Chi, Jeff Dean, Jacob Devlin, Adam Roberts, Denny Zhou, Quoc V. Le, and Jason Wei
+1. **[FLAN-UL2](https://huggingface.co/docs/transformers/model_doc/flan-ul2)** (from Google AI) released in the repository [google-research/t5x](https://github.com/google-research/t5x/blob/main/docs/models.md#flan-ul2-checkpoints) by Hyung Won Chung, Le Hou, Shayne Longpre, Barret Zoph, Yi Tay, William Fedus, Eric Li, Xuezhi Wang, Mostafa Dehghani, Siddhartha Brahma, Albert Webson, Shixiang Shane Gu, Zhuyun Dai, Mirac Suzgun, Xinyun Chen, Aakanksha Chowdhery, Sharan Narang, Gaurav Mishra, Adams Yu, Vincent Zhao, Yanping Huang, Andrew Dai, Hongkun Yu, Slav Petrov, Ed H. Chi, Jeff Dean, Jacob Devlin, Adam Roberts, Denny Zhou, Quoc V. Le, and Jason Wei 
 1. **[FlauBERT](https://huggingface.co/docs/transformers/model_doc/flaubert)** (来自 CNRS) 伴随论文 [FlauBERT: Unsupervised Language Model Pre-training for French](https://arxiv.org/abs/1912.05372) 由 Hang Le, Loïc Vial, Jibril Frej, Vincent Segonne, Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, Benoît Crabbé, Laurent Besacier, Didier Schwab 发布。
 1. **[FLAVA](https://huggingface.co/docs/transformers/model_doc/flava)** (来自 Facebook AI) 伴随论文 [FLAVA: A Foundational Language And Vision Alignment Model](https://arxiv.org/abs/2112.04482) 由 Amanpreet Singh, Ronghang Hu, Vedanuj Goswami, Guillaume Couairon, Wojciech Galuba, Marcus Rohrbach, and Douwe Kiela 发布。
 1. **[FNet](https://huggingface.co/docs/transformers/model_doc/fnet)** (来自 Google Research) 伴随论文 [FNet: Mixing Tokens with Fourier Transforms](https://arxiv.org/abs/2105.03824) 由 James Lee-Thorp, Joshua Ainslie, Ilya Eckstein, Santiago Ontanon 发布。
-1. **[FocalNet](https://huggingface.co/docs/transformers/model_doc/focalnet)** (来自 Microsoft Research) 伴随论文 [Focal Modulation Networks](https://arxiv.org/abs/2203.11926) 由 Jianwei Yang, Chunyuan Li, Xiyang Dai, Lu Yuan, Jianfeng Gao 发布。
 1. **[Funnel Transformer](https://huggingface.co/docs/transformers/model_doc/funnel)** (来自 CMU/Google Brain) 伴随论文 [Funnel-Transformer: Filtering out Sequential Redundancy for Efficient Language Processing](https://arxiv.org/abs/2006.03236) 由 Zihang Dai, Guokun Lai, Yiming Yang, Quoc V. Le 发布。
 1. **[GIT](https://huggingface.co/docs/transformers/model_doc/git)** (来自 Microsoft Research) 伴随论文 [GIT: A Generative Image-to-text Transformer for Vision and Language](https://arxiv.org/abs/2205.14100) 由 Jianfeng Wang, Zhengyuan Yang, Xiaowei Hu, Linjie Li, Kevin Lin, Zhe Gan, Zicheng Liu, Ce Liu, Lijuan Wang 发布。
 1. **[GLPN](https://huggingface.co/docs/transformers/model_doc/glpn)** (来自 KAIST) 伴随论文 [Global-Local Path Networks for Monocular Depth Estimation with Vertical CutDepth](https://arxiv.org/abs/2201.07436) 由 Doyeon Kim, Woonghyun Ga, Pyungwhan Ahn, Donggyu Joo, Sehwan Chun, Junmo Kim 发布。
@ -313,7 +310,6 @@ conda install -c huggingface transformers
 1. **[GPT-2](https://huggingface.co/docs/transformers/model_doc/gpt2)** (来自 OpenAI) 伴随论文 [Language Models are Unsupervised Multitask Learners](https://blog.openai.com/better-language-models/) 由 Alec Radford*, Jeffrey Wu*, Rewon Child, David Luan, Dario Amodei** and Ilya Sutskever** 发布。
 1. **[GPT-J](https://huggingface.co/docs/transformers/model_doc/gptj)** (来自 EleutherAI) 伴随论文 [kingoflolz/mesh-transformer-jax](https://github.com/kingoflolz/mesh-transformer-jax/) 由 Ben Wang and Aran Komatsuzaki 发布。
 1. **[GPT-Sw3](https://huggingface.co/docs/transformers/model_doc/gpt-sw3)** (from AI-Sweden) released with the paper [Lessons Learned from GPT-SW3: Building the First Large-Scale Generative Language Model for Swedish](http://www.lrec-conf.org/proceedings/lrec2022/pdf/2022.lrec-1.376.pdf) by Ariel Ekgren, Amaru Cuba Gyllensten, Evangelia Gogoulou, Alice Heiman, Severine Verlinden, Joey Öhman, Fredrik Carlsson, Magnus Sahlgren.
-1. **[GPTBigCode](https://huggingface.co/docs/transformers/model_doc/gpt_bigcode)** (来自 BigCode) 伴随论文 [SantaCoder: don't reach for the stars!](https://arxiv.org/abs/2301.03988) 由 Loubna Ben Allal, Raymond Li, Denis Kocetkov, Chenghao Mou, Christopher Akiki, Carlos Munoz Ferrandis, Niklas Muennighoff, Mayank Mishra, Alex Gu, Manan Dey, Logesh Kumar Umapathi, Carolyn Jane Anderson, Yangtian Zi, Joel Lamy Poirier, Hailey Schoelkopf, Sergey Troshin, Dmitry Abulkhanov, Manuel Romero, Michael Lappert, Francesco De Toni, Bernardo García del Río, Qian Liu, Shamik Bose, Urvashi Bhattacharyya, Terry Yue Zhuo, Ian Yu, Paulo Villegas, Marco Zocca, Sourab Mangrulkar, David Lansky, Huu Nguyen, Danish Contractor, Luis Villa, Jia Li, Dzmitry Bahdanau, Yacine Jernite, Sean Hughes, Daniel Fried, Arjun Guha, Harm de Vries, Leandro von Werra 发布。
 1. **[GPTSAN-japanese](https://huggingface.co/docs/transformers/model_doc/gptsan-japanese)** released in the repository [tanreinama/GPTSAN](https://github.com/tanreinama/GPTSAN/blob/main/report/model.md) by 坂本俊之(tanreinama).
 1. **[Graphormer](https://huggingface.co/docs/transformers/model_doc/graphormer)** (from Microsoft) released with the paper [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234) by Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu.
 1. **[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (来自 UCSD, NVIDIA) 伴随论文 [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094) 由 Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang 发布。
@ -329,7 +325,7 @@ conda install -c huggingface transformers
 1. **[LED](https://huggingface.co/docs/transformers/model_doc/led)** (来自 AllenAI) 伴随论文 [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) 由 Iz Beltagy, Matthew E. Peters, Arman Cohan 发布。
 1. **[LeViT](https://huggingface.co/docs/transformers/model_doc/levit)** (来自 Meta AI) 伴随论文 [LeViT: A Vision Transformer in ConvNet's Clothing for Faster Inference](https://arxiv.org/abs/2104.01136) 由 Ben Graham, Alaaeldin El-Nouby, Hugo Touvron, Pierre Stock, Armand Joulin, Hervé Jégou, Matthijs Douze 发布。
 1. **[LiLT](https://huggingface.co/docs/transformers/model_doc/lilt)** (来自 South China University of Technology) 伴随论文 [LiLT: A Simple yet Effective Language-Independent Layout Transformer for Structured Document Understanding](https://arxiv.org/abs/2202.13669) 由 Jiapeng Wang, Lianwen Jin, Kai Ding 发布。
-1. **[LLaMA](https://huggingface.co/docs/transformers/model_doc/llama)** (来自 The FAIR team of Meta AI) 伴随论文 [LLaMA: Open and Efficient Foundation Language Models](https://arxiv.org/abs/2302.13971) 由 Hugo Touvron, Thibaut Lavril, Gautier Izacard, Xavier Martinet, Marie-Anne Lachaux, Timothée Lacroix, Baptiste Rozière, Naman Goyal, Eric Hambro, Faisal Azhar, Aurelien Rodriguez, Armand Joulin, Edouard Grave, Guillaume Lample 发布。
+1. **[LLaMA](https://huggingface.co/docs/transformers/main/model_doc/llama)** (来自 The FAIR team of Meta AI) 伴随论文 [LLaMA: Open and Efficient Foundation Language Models](https://arxiv.org/abs/2302.13971) 由 Hugo Touvron, Thibaut Lavril, Gautier Izacard, Xavier Martinet, Marie-Anne Lachaux, Timothée Lacroix, Baptiste Rozière, Naman Goyal, Eric Hambro, Faisal Azhar, Aurelien Rodriguez, Armand Joulin, Edouard Grave, Guillaume Lample 发布。
 1. **[Longformer](https://huggingface.co/docs/transformers/model_doc/longformer)** (来自 AllenAI) 伴随论文 [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) 由 Iz Beltagy, Matthew E. Peters, Arman Cohan 发布。
 1. **[LongT5](https://huggingface.co/docs/transformers/model_doc/longt5)** (来自 Google AI) released 伴随论文 [LongT5: Efficient Text-To-Text Transformer for Long Sequences](https://arxiv.org/abs/2112.07916) 由 Mandy Guo, Joshua Ainslie, David Uthus, Santiago Ontanon, Jianmo Ni, Yun-Hsuan Sung, Yinfei Yang 发布。
 1. **[LUKE](https://huggingface.co/docs/transformers/model_doc/luke)** (来自 Studio Ousia) 伴随论文 [LUKE: Deep Contextualized Entity Representations with Entity-aware Self-attention](https://arxiv.org/abs/2010.01057) 由 Ikuya Yamada, Akari Asai, Hiroyuki Shindo, Hideaki Takeda, Yuji Matsumoto 发布。
@ -339,11 +335,10 @@ conda install -c huggingface transformers
 1. **[MarianMT](https://huggingface.co/docs/transformers/model_doc/marian)** 用 [OPUS](http://opus.nlpl.eu/) 数据训练的机器翻译模型由 Jörg Tiedemann 发布。[Marian Framework](https://marian-nmt.github.io/) 由微软翻译团队开发。
 1. **[MarkupLM](https://huggingface.co/docs/transformers/model_doc/markuplm)** (来自 Microsoft Research Asia) 伴随论文 [MarkupLM: Pre-training of Text and Markup Language for Visually-rich Document Understanding](https://arxiv.org/abs/2110.08518) 由 Junlong Li, Yiheng Xu, Lei Cui, Furu Wei 发布。
 1. **[Mask2Former](https://huggingface.co/docs/transformers/model_doc/mask2former)** (来自 FAIR and UIUC) 伴随论文 [Masked-attention Mask Transformer for Universal Image Segmentation](https://arxiv.org/abs/2112.01527) 由 Bowen Cheng, Ishan Misra, Alexander G. Schwing, Alexander Kirillov, Rohit Girdhar 发布。
-1. **[MaskFormer](https://huggingface.co/docs/transformers/model_doc/maskformer)** (from Meta and UIUC) released with the paper [Per-Pixel Classification is Not All You Need for Semantic Segmentation](https://arxiv.org/abs/2107.06278) by Bowen Cheng, Alexander G. Schwing, Alexander Kirillov
-1. **[MatCha](https://huggingface.co/docs/transformers/model_doc/matcha)** (来自 Google AI) 伴随论文 [MatCha: Enhancing Visual Language Pretraining with Math Reasoning and Chart Derendering](https://arxiv.org/abs/2212.09662) 由 Fangyu Liu, Francesco Piccinno, Syrine Krichene, Chenxi Pang, Kenton Lee, Mandar Joshi, Yasemin Altun, Nigel Collier, Julian Martin Eisenschlos 发布。
+1. **[MaskFormer](https://huggingface.co/docs/transformers/model_doc/maskformer)** (from Meta and UIUC) released with the paper [Per-Pixel Classification is Not All You Need for Semantic Segmentation](https://arxiv.org/abs/2107.06278) by Bowen Cheng, Alexander G. Schwing, Alexander Kirillov.
 1. **[mBART](https://huggingface.co/docs/transformers/model_doc/mbart)** (来自 Facebook) 伴随论文 [Multilingual Denoising Pre-training for Neural Machine Translation](https://arxiv.org/abs/2001.08210) 由 Yinhan Liu, Jiatao Gu, Naman Goyal, Xian Li, Sergey Edunov, Marjan Ghazvininejad, Mike Lewis, Luke Zettlemoyer 发布。
 1. **[mBART-50](https://huggingface.co/docs/transformers/model_doc/mbart)** (来自 Facebook) 伴随论文 [Multilingual Translation with Extensible Multilingual Pretraining and Finetuning](https://arxiv.org/abs/2008.00401) 由 Yuqing Tang, Chau Tran, Xian Li, Peng-Jen Chen, Naman Goyal, Vishrav Chaudhary, Jiatao Gu, Angela Fan 发布。
-1. **[MEGA](https://huggingface.co/docs/transformers/model_doc/mega)** (来自 Facebook) 伴随论文 [Mega: Moving Average Equipped Gated Attention](https://arxiv.org/abs/2209.10655) 由 Xuezhe Ma, Chunting Zhou, Xiang Kong, Junxian He, Liangke Gui, Graham Neubig, Jonathan May, and Luke Zettlemoyer 发布。
+1. **[MEGA](https://huggingface.co/docs/transformers/main/model_doc/mega)** (来自 Facebook) 伴随论文 [Mega: Moving Average Equipped Gated Attention](https://arxiv.org/abs/2209.10655) 由 Xuezhe Ma, Chunting Zhou, Xiang Kong, Junxian He, Liangke Gui, Graham Neubig, Jonathan May, and Luke Zettlemoyer 发布。
 1. **[Megatron-BERT](https://huggingface.co/docs/transformers/model_doc/megatron-bert)** (来自 NVIDIA) 伴随论文 [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) 由 Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro 发布。
 1. **[Megatron-GPT2](https://huggingface.co/docs/transformers/model_doc/megatron_gpt2)** (来自 NVIDIA) 伴随论文 [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) 由 Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro 发布。
 1. **[MGP-STR](https://huggingface.co/docs/transformers/model_doc/mgp-str)** (来自 Alibaba Research) 伴随论文 [Multi-Granularity Prediction for Scene Text Recognition](https://arxiv.org/abs/2209.03592) 由 Peng Wang, Cheng Da, and Cong Yao 发布。
@ -358,17 +353,15 @@ conda install -c huggingface transformers
 1. **[NAT](https://huggingface.co/docs/transformers/model_doc/nat)** (来自 SHI Labs) 伴随论文 [Neighborhood Attention Transformer](https://arxiv.org/abs/2204.07143) 由 Ali Hassani, Steven Walton, Jiachen Li, Shen Li, and Humphrey Shi 发布。
 1. **[Nezha](https://huggingface.co/docs/transformers/model_doc/nezha)** (来自华为诺亚方舟实验室) 伴随论文 [NEZHA: Neural Contextualized Representation for Chinese Language Understanding](https://arxiv.org/abs/1909.00204) 由 Junqiu Wei, Xiaozhe Ren, Xiaoguang Li, Wenyong Huang, Yi Liao, Yasheng Wang, Jiashu Lin, Xin Jiang, Xiao Chen and Qun Liu 发布。
 1. **[NLLB](https://huggingface.co/docs/transformers/model_doc/nllb)** (来自 Meta) 伴随论文 [No Language Left Behind: Scaling Human-Centered Machine Translation](https://arxiv.org/abs/2207.04672) 由 the NLLB team 发布。
-1. **[NLLB-MOE](https://huggingface.co/docs/transformers/model_doc/nllb-moe)** (来自 Meta) 伴随论文 [No Language Left Behind: Scaling Human-Centered Machine Translation](https://arxiv.org/abs/2207.04672) 由 the NLLB team 发布。
 1. **[Nyströmformer](https://huggingface.co/docs/transformers/model_doc/nystromformer)** (来自 the University of Wisconsin - Madison) 伴随论文 [Nyströmformer: A Nyström-Based Algorithm for Approximating Self-Attention](https://arxiv.org/abs/2102.03902) 由 Yunyang Xiong, Zhanpeng Zeng, Rudrasis Chakraborty, Mingxing Tan, Glenn Fung, Yin Li, Vikas Singh 发布。
 1. **[OneFormer](https://huggingface.co/docs/transformers/model_doc/oneformer)** (来自 SHI Labs)  伴随论文 [OneFormer: One Transformer to Rule Universal Image Segmentation](https://arxiv.org/abs/2211.06220) 由 Jitesh Jain, Jiachen Li, MangTik Chiu, Ali Hassani, Nikita Orlov, Humphrey Shi 发布。
-1. **[OpenLlama](https://huggingface.co/docs/transformers/model_doc/open-llama)** (来自 [s-JoL](https://huggingface.co/s-JoL)) 由 [Open-Llama](https://github.com/s-JoL/Open-Llama) 发布. 
 1. **[OPT](https://huggingface.co/docs/transformers/master/model_doc/opt)** (来自 Meta AI) 伴随论文 [OPT: Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068) 由 Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen et al 发布。
 1. **[OWL-ViT](https://huggingface.co/docs/transformers/model_doc/owlvit)** (来自 Google AI) 伴随论文 [Simple Open-Vocabulary Object Detection with Vision Transformers](https://arxiv.org/abs/2205.06230) 由 Matthias Minderer, Alexey Gritsenko, Austin Stone, Maxim Neumann, Dirk Weissenborn, Alexey Dosovitskiy, Aravindh Mahendran, Anurag Arnab, Mostafa Dehghani, Zhuoran Shen, Xiao Wang, Xiaohua Zhai, Thomas Kipf, and Neil Houlsby 发布。
 1. **[Pegasus](https://huggingface.co/docs/transformers/model_doc/pegasus)** (来自 Google) 伴随论文 [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) 由 Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu 发布。
 1. **[PEGASUS-X](https://huggingface.co/docs/transformers/model_doc/pegasus_x)** (来自 Google) 伴随论文 [Investigating Efficiently Extending Transformers for Long Input Summarization](https://arxiv.org/abs/2208.04347) 由 Jason Phang, Yao Zhao, Peter J. Liu 发布。
 1. **[Perceiver IO](https://huggingface.co/docs/transformers/model_doc/perceiver)** (来自 Deepmind) 伴随论文 [Perceiver IO: A General Architecture for Structured Inputs & Outputs](https://arxiv.org/abs/2107.14795) 由 Andrew Jaegle, Sebastian Borgeaud, Jean-Baptiste Alayrac, Carl Doersch, Catalin Ionescu, David Ding, Skanda Koppula, Daniel Zoran, Andrew Brock, Evan Shelhamer, Olivier Hénaff, Matthew M. Botvinick, Andrew Zisserman, Oriol Vinyals, João Carreira 发布。
 1. **[PhoBERT](https://huggingface.co/docs/transformers/model_doc/phobert)** (来自 VinAI Research) 伴随论文 [PhoBERT: Pre-trained language models for Vietnamese](https://www.aclweb.org/anthology/2020.findings-emnlp.92/) 由 Dat Quoc Nguyen and Anh Tuan Nguyen 发布。
-1. **[Pix2Struct](https://huggingface.co/docs/transformers/model_doc/pix2struct)** (来自 Google) 伴随论文 [Pix2Struct: Screenshot Parsing as Pretraining for Visual Language Understanding](https://arxiv.org/abs/2210.03347) 由 Kenton Lee, Mandar Joshi, Iulia Turc, Hexiang Hu, Fangyu Liu, Julian Eisenschlos, Urvashi Khandelwal, Peter Shaw, Ming-Wei Chang, Kristina Toutanova 发布。
+1. **[Pix2Struct](https://huggingface.co/docs/transformers/main/model_doc/pix2struct)** (来自 Google) 伴随论文 [Pix2Struct: Screenshot Parsing as Pretraining for Visual Language Understanding](https://arxiv.org/abs/2210.03347) 由 Kenton Lee, Mandar Joshi, Iulia Turc, Hexiang Hu, Fangyu Liu, Julian Eisenschlos, Urvashi Khandelwal, Peter Shaw, Ming-Wei Chang, Kristina Toutanova 发布。
 1. **[PLBart](https://huggingface.co/docs/transformers/model_doc/plbart)** (来自 UCLA NLP) 伴随论文 [Unified Pre-training for Program Understanding and Generation](https://arxiv.org/abs/2103.06333) 由 Wasi Uddin Ahmad, Saikat Chakraborty, Baishakhi Ray, Kai-Wei Chang 发布。
 1. **[PoolFormer](https://huggingface.co/docs/transformers/model_doc/poolformer)** (来自 Sea AI Labs) 伴随论文 [MetaFormer is Actually What You Need for Vision](https://arxiv.org/abs/2111.11418) 由 Yu, Weihao and Luo, Mi and Zhou, Pan and Si, Chenyang and Zhou, Yichen and Wang, Xinchao and Feng, Jiashi and Yan, Shuicheng 发布。
 1. **[ProphetNet](https://huggingface.co/docs/transformers/model_doc/prophetnet)** (来自 Microsoft Research) 伴随论文 [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) 由 Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou 发布。
@ -383,9 +376,7 @@ conda install -c huggingface transformers
 1. **[RoBERTa-PreLayerNorm](https://huggingface.co/docs/transformers/model_doc/roberta-prelayernorm)** (来自 Facebook) 伴随论文 [fairseq: A Fast, Extensible Toolkit for Sequence Modeling](https://arxiv.org/abs/1904.01038) 由 Myle Ott, Sergey Edunov, Alexei Baevski, Angela Fan, Sam Gross, Nathan Ng, David Grangier, Michael Auli 发布。
 1. **[RoCBert](https://huggingface.co/docs/transformers/model_doc/roc_bert)** (来自 WeChatAI), 伴随论文 [RoCBert: Robust Chinese Bert with Multimodal Contrastive Pretraining](https://aclanthology.org/2022.acl-long.65.pdf) 由 HuiSu, WeiweiShi, XiaoyuShen, XiaoZhou, TuoJi, JiaruiFang, JieZhou 发布。
 1. **[RoFormer](https://huggingface.co/docs/transformers/model_doc/roformer)** (来自 ZhuiyiTechnology), 伴随论文 [RoFormer: Enhanced Transformer with Rotary Position Embedding](https://arxiv.org/pdf/2104.09864v1.pdf) 由 Jianlin Su and Yu Lu and Shengfeng Pan and Bo Wen and Yunfeng Liu 发布。
-1. **[RWKV](https://huggingface.co/docs/transformers/model_doc/rwkv)** (来自 Bo Peng) 伴随论文 [this repo](https://github.com/BlinkDL/RWKV-LM) 由 Bo Peng 发布。
 1. **[SegFormer](https://huggingface.co/docs/transformers/model_doc/segformer)** (来自 NVIDIA) 伴随论文 [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://arxiv.org/abs/2105.15203) 由 Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping Luo 发布。
-1. **[Segment Anything](https://huggingface.co/docs/transformers/model_doc/sam)** (来自 Meta AI) 伴随论文 [Segment Anything](https://arxiv.org/pdf/2304.02643v1.pdf) 由 Alexander Kirillov, Eric Mintun, Nikhila Ravi, Hanzi Mao, Chloe Rolland, Laura Gustafson, Tete Xiao, Spencer Whitehead, Alex Berg, Wan-Yen Lo, Piotr Dollar, Ross Girshick 发布。
 1. **[SEW](https://huggingface.co/docs/transformers/model_doc/sew)** (来自 ASAPP) 伴随论文 [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) 由 Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi 发布。
 1. **[SEW-D](https://huggingface.co/docs/transformers/model_doc/sew_d)** (来自 ASAPP) 伴随论文 [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) 由 Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi 发布。
 1. **[SpeechT5](https://huggingface.co/docs/transformers/model_doc/speecht5)** (来自 Microsoft Research) 伴随论文 [SpeechT5: Unified-Modal Encoder-Decoder Pre-Training for Spoken Language Processing](https://arxiv.org/abs/2110.07205) 由 Junyi Ao, Rui Wang, Long Zhou, Chengyi Wang, Shuo Ren, Yu Wu, Shujie Liu, Tom Ko, Qing Li, Yu Zhang, Zhihua Wei, Yao Qian, Jinyu Li, Furu Wei 发布。
@ -393,7 +384,6 @@ conda install -c huggingface transformers
 1. **[SpeechToTextTransformer2](https://huggingface.co/docs/transformers/model_doc/speech_to_text_2)** (来自 Facebook) 伴随论文 [Large-Scale Self- and Semi-Supervised Learning for Speech Translation](https://arxiv.org/abs/2104.06678) 由 Changhan Wang, Anne Wu, Juan Pino, Alexei Baevski, Michael Auli, Alexis Conneau 发布。
 1. **[Splinter](https://huggingface.co/docs/transformers/model_doc/splinter)** (来自 Tel Aviv University) 伴随论文 [Few-Shot Question Answering by Pretraining Span Selection](https://arxiv.org/abs/2101.00438) 由 Ori Ram, Yuval Kirstain, Jonathan Berant, Amir Globerson, Omer Levy 发布。
 1. **[SqueezeBERT](https://huggingface.co/docs/transformers/model_doc/squeezebert)** (来自 Berkeley) 伴随论文 [SqueezeBERT: What can computer vision teach NLP about efficient neural networks?](https://arxiv.org/abs/2006.11316) 由 Forrest N. Iandola, Albert E. Shaw, Ravi Krishna, and Kurt W. Keutzer 发布。
-1. **[SwiftFormer](https://huggingface.co/docs/transformers/main/model_doc/swiftformer)** (来自 MBZUAI) 伴随论文 [SwiftFormer: Efficient Additive Attention for Transformer-based Real-time Mobile Vision Applications](https://arxiv.org/abs/2303.15446) 由 Abdelrahman Shaker, Muhammad Maaz, Hanoona Rasheed, Salman Khan, Ming-Hsuan Yang, Fahad Shahbaz Khan 发布。
 1. **[Swin Transformer](https://huggingface.co/docs/transformers/model_doc/swin)** (来自 Microsoft) 伴随论文 [Swin Transformer: Hierarchical Vision Transformer using Shifted Windows](https://arxiv.org/abs/2103.14030) 由 Ze Liu, Yutong Lin, Yue Cao, Han Hu, Yixuan Wei, Zheng Zhang, Stephen Lin, Baining Guo 发布。
 1. **[Swin Transformer V2](https://huggingface.co/docs/transformers/model_doc/swinv2)** (来自 Microsoft) 伴随论文 [Swin Transformer V2: Scaling Up Capacity and Resolution](https://arxiv.org/abs/2111.09883) 由 Ze Liu, Han Hu, Yutong Lin, Zhuliang Yao, Zhenda Xie, Yixuan Wei, Jia Ning, Yue Cao, Zheng Zhang, Li Dong, Furu Wei, Baining Guo 发布。
 1. **[Swin2SR](https://huggingface.co/docs/transformers/model_doc/swin2sr)** (来自 University of Würzburg) 伴随论文 [Swin2SR: SwinV2 Transformer for Compressed Image Super-Resolution and Restoration](https://arxiv.org/abs/2209.11345) 由 Marcos V. Conde, Ui-Jin Choi, Maxime Burchi, Radu Timofte 发布。
--- a/README_zh-hant.md
+++ b/README_zh-hant.md
@ -274,7 +274,7 @@ conda install -c huggingface transformers
 1. **[CamemBERT](https://huggingface.co/docs/transformers/model_doc/camembert)** (from Inria/Facebook/Sorbonne) released with the paper [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) by Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot.
 1. **[CANINE](https://huggingface.co/docs/transformers/model_doc/canine)** (from Google Research) released with the paper [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) by Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting.
 1. **[Chinese-CLIP](https://huggingface.co/docs/transformers/model_doc/chinese_clip)** (from OFA-Sys) released with the paper [Chinese CLIP: Contrastive Vision-Language Pretraining in Chinese](https://arxiv.org/abs/2211.01335) by An Yang, Junshu Pan, Junyang Lin, Rui Men, Yichang Zhang, Jingren Zhou, Chang Zhou.
-1. **[CLAP](https://huggingface.co/docs/transformers/model_doc/clap)** (from LAION-AI) released with the paper [Large-scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation](https://arxiv.org/abs/2211.06687) by Yusong Wu, Ke Chen, Tianyu Zhang, Yuchen Hui, Taylor Berg-Kirkpatrick, Shlomo Dubnov.
+1. **[CLAP](https://huggingface.co/docs/transformers/model_doc/clap)** (from LAION-AI) released with the paper [Large-scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation](https://arxiv.org/abs/2211.06687) by Yusong Wu, Ke Chen, Tianyu Zhang, Yuchen Hui, Taylor Berg-Kirkpatrick, Shlomo Dubnov.
 1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever.
 1. **[CLIPSeg](https://huggingface.co/docs/transformers/model_doc/clipseg)** (from University of Göttingen) released with the paper [Image Segmentation Using Text and Image Prompts](https://arxiv.org/abs/2112.10003) by Timo Lüddecke and Alexander Ecker.
 1. **[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (from Salesforce) released with the paper [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) by Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong.
@ -283,7 +283,6 @@ conda install -c huggingface transformers
 1. **[ConvNeXT](https://huggingface.co/docs/transformers/model_doc/convnext)** (from Facebook AI) released with the paper [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) by Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie.
 1. **[ConvNeXTV2](https://huggingface.co/docs/transformers/model_doc/convnextv2)** (from Facebook AI) released with the paper [ConvNeXt V2: Co-designing and Scaling ConvNets with Masked Autoencoders](https://arxiv.org/abs/2301.00808) by Sanghyun Woo, Shoubhik Debnath, Ronghang Hu, Xinlei Chen, Zhuang Liu, In So Kweon, Saining Xie.
 1. **[CPM](https://huggingface.co/docs/transformers/model_doc/cpm)** (from Tsinghua University) released with the paper [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) by Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun.
-1. **[CPM-Ant](https://huggingface.co/docs/transformers/model_doc/cpmant)** (from OpenBMB) released by the [OpenBMB](https://www.openbmb.org/).
 1. **[CTRL](https://huggingface.co/docs/transformers/model_doc/ctrl)** (from Salesforce) released with the paper [CTRL: A Conditional Transformer Language Model for Controllable Generation](https://arxiv.org/abs/1909.05858) by Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher.
 1. **[CvT](https://huggingface.co/docs/transformers/model_doc/cvt)** (from Microsoft) released with the paper [CvT: Introducing Convolutions to Vision Transformers](https://arxiv.org/abs/2103.15808) by Haiping Wu, Bin Xiao, Noel Codella, Mengchen Liu, Xiyang Dai, Lu Yuan, Lei Zhang.
 1. **[Data2Vec](https://huggingface.co/docs/transformers/model_doc/data2vec)** (from Facebook) released with the paper [Data2Vec:  A General Framework for Self-supervised Learning in Speech, Vision and Language](https://arxiv.org/abs/2202.03555) by Alexei Baevski, Wei-Ning Hsu, Qiantong Xu, Arun Babu, Jiatao Gu, Michael Auli.
@ -292,7 +291,6 @@ conda install -c huggingface transformers
 1. **[Decision Transformer](https://huggingface.co/docs/transformers/model_doc/decision_transformer)** (from Berkeley/Facebook/Google) released with the paper [Decision Transformer: Reinforcement Learning via Sequence Modeling](https://arxiv.org/abs/2106.01345) by Lili Chen, Kevin Lu, Aravind Rajeswaran, Kimin Lee, Aditya Grover, Michael Laskin, Pieter Abbeel, Aravind Srinivas, Igor Mordatch.
 1. **[Deformable DETR](https://huggingface.co/docs/transformers/model_doc/deformable_detr)** (from SenseTime Research) released with the paper [Deformable DETR: Deformable Transformers for End-to-End Object Detection](https://arxiv.org/abs/2010.04159) by Xizhou Zhu, Weijie Su, Lewei Lu, Bin Li, Xiaogang Wang, Jifeng Dai.
 1. **[DeiT](https://huggingface.co/docs/transformers/model_doc/deit)** (from Facebook) released with the paper [Training data-efficient image transformers & distillation through attention](https://arxiv.org/abs/2012.12877) by Hugo Touvron, Matthieu Cord, Matthijs Douze, Francisco Massa, Alexandre Sablayrolles, Hervé Jégou.
-1. **[DePlot](https://huggingface.co/docs/transformers/model_doc/deplot)** (from Google AI) released with the paper [DePlot: One-shot visual language reasoning by plot-to-table translation](https://arxiv.org/abs/2212.10505) by Fangyu Liu, Julian Martin Eisenschlos, Francesco Piccinno, Syrine Krichene, Chenxi Pang, Kenton Lee, Mandar Joshi, Wenhu Chen, Nigel Collier, Yasemin Altun.
 1. **[DETA](https://huggingface.co/docs/transformers/model_doc/deta)** (from The University of Texas at Austin) released with the paper [NMS Strikes Back](https://arxiv.org/abs/2212.06137) by Jeffrey Ouyang-Zhang, Jang Hyun Cho, Xingyi Zhou, Philipp Krähenbühl.
 1. **[DETR](https://huggingface.co/docs/transformers/model_doc/detr)** (from Facebook) released with the paper [End-to-End Object Detection with Transformers](https://arxiv.org/abs/2005.12872) by Nicolas Carion, Francisco Massa, Gabriel Synnaeve, Nicolas Usunier, Alexander Kirillov, Sergey Zagoruyko.
 1. **[DialoGPT](https://huggingface.co/docs/transformers/model_doc/dialogpt)** (from Microsoft Research) released with the paper [DialoGPT: Large-Scale Generative Pre-training for Conversational Response Generation](https://arxiv.org/abs/1911.00536) by Yizhe Zhang, Siqi Sun, Michel Galley, Yen-Chun Chen, Chris Brockett, Xiang Gao, Jianfeng Gao, Jingjing Liu, Bill Dolan.
@ -310,11 +308,10 @@ conda install -c huggingface transformers
 1. **[ErnieM](https://huggingface.co/docs/transformers/model_doc/ernie_m)** (from Baidu) released with the paper [ERNIE-M: Enhanced Multilingual Representation by Aligning Cross-lingual Semantics with Monolingual Corpora](https://arxiv.org/abs/2012.15674) by Xuan Ouyang, Shuohuan Wang, Chao Pang, Yu Sun, Hao Tian, Hua Wu, Haifeng Wang.
 1. **[ESM](https://huggingface.co/docs/transformers/model_doc/esm)** (from Meta AI) are transformer protein language models.  **ESM-1b** was released with the paper [Biological structure and function emerge from scaling unsupervised learning to 250 million protein sequences](https://www.pnas.org/content/118/15/e2016239118) by Alexander Rives, Joshua Meier, Tom Sercu, Siddharth Goyal, Zeming Lin, Jason Liu, Demi Guo, Myle Ott, C. Lawrence Zitnick, Jerry Ma, and Rob Fergus. **ESM-1v** was released with the paper [Language models enable zero-shot prediction of the effects of mutations on protein function](https://doi.org/10.1101/2021.07.09.450648) by Joshua Meier, Roshan Rao, Robert Verkuil, Jason Liu, Tom Sercu and Alexander Rives. **ESM-2** was released with the paper [Language models of protein sequences at the scale of evolution enable accurate structure prediction](https://doi.org/10.1101/2022.07.20.500902) by Zeming Lin, Halil Akin, Roshan Rao, Brian Hie, Zhongkai Zhu, Wenting Lu, Allan dos Santos Costa, Maryam Fazel-Zarandi, Tom Sercu, Sal Candido, Alexander Rives.
 1. **[FLAN-T5](https://huggingface.co/docs/transformers/model_doc/flan-t5)** (from Google AI) released in the repository [google-research/t5x](https://github.com/google-research/t5x/blob/main/docs/models.md#flan-t5-checkpoints) by Hyung Won Chung, Le Hou, Shayne Longpre, Barret Zoph, Yi Tay, William Fedus, Eric Li, Xuezhi Wang, Mostafa Dehghani, Siddhartha Brahma, Albert Webson, Shixiang Shane Gu, Zhuyun Dai, Mirac Suzgun, Xinyun Chen, Aakanksha Chowdhery, Sharan Narang, Gaurav Mishra, Adams Yu, Vincent Zhao, Yanping Huang, Andrew Dai, Hongkun Yu, Slav Petrov, Ed H. Chi, Jeff Dean, Jacob Devlin, Adam Roberts, Denny Zhou, Quoc V. Le, and Jason Wei
-1. **[FLAN-UL2](https://huggingface.co/docs/transformers/model_doc/flan-ul2)** (from Google AI) released in the repository [google-research/t5x](https://github.com/google-research/t5x/blob/main/docs/models.md#flan-ul2-checkpoints) by Hyung Won Chung, Le Hou, Shayne Longpre, Barret Zoph, Yi Tay, William Fedus, Eric Li, Xuezhi Wang, Mostafa Dehghani, Siddhartha Brahma, Albert Webson, Shixiang Shane Gu, Zhuyun Dai, Mirac Suzgun, Xinyun Chen, Aakanksha Chowdhery, Sharan Narang, Gaurav Mishra, Adams Yu, Vincent Zhao, Yanping Huang, Andrew Dai, Hongkun Yu, Slav Petrov, Ed H. Chi, Jeff Dean, Jacob Devlin, Adam Roberts, Denny Zhou, Quoc V. Le, and Jason Wei
+1. **[FLAN-UL2](https://huggingface.co/docs/transformers/model_doc/flan-ul2)** (from Google AI) released in the repository [google-research/t5x](https://github.com/google-research/t5x/blob/main/docs/models.md#flan-ul2-checkpoints) by Hyung Won Chung, Le Hou, Shayne Longpre, Barret Zoph, Yi Tay, William Fedus, Eric Li, Xuezhi Wang, Mostafa Dehghani, Siddhartha Brahma, Albert Webson, Shixiang Shane Gu, Zhuyun Dai, Mirac Suzgun, Xinyun Chen, Aakanksha Chowdhery, Sharan Narang, Gaurav Mishra, Adams Yu, Vincent Zhao, Yanping Huang, Andrew Dai, Hongkun Yu, Slav Petrov, Ed H. Chi, Jeff Dean, Jacob Devlin, Adam Roberts, Denny Zhou, Quoc V. Le, and Jason Wei
 1. **[FlauBERT](https://huggingface.co/docs/transformers/model_doc/flaubert)** (from CNRS) released with the paper [FlauBERT: Unsupervised Language Model Pre-training for French](https://arxiv.org/abs/1912.05372) by Hang Le, Loïc Vial, Jibril Frej, Vincent Segonne, Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, Benoît Crabbé, Laurent Besacier, Didier Schwab.
 1. **[FLAVA](https://huggingface.co/docs/transformers/model_doc/flava)** (from Facebook AI) released with the paper [FLAVA: A Foundational Language And Vision Alignment Model](https://arxiv.org/abs/2112.04482) by Amanpreet Singh, Ronghang Hu, Vedanuj Goswami, Guillaume Couairon, Wojciech Galuba, Marcus Rohrbach, and Douwe Kiela.
 1. **[FNet](https://huggingface.co/docs/transformers/model_doc/fnet)** (from Google Research) released with the paper [FNet: Mixing Tokens with Fourier Transforms](https://arxiv.org/abs/2105.03824) by James Lee-Thorp, Joshua Ainslie, Ilya Eckstein, Santiago Ontanon.
-1. **[FocalNet](https://huggingface.co/docs/transformers/model_doc/focalnet)** (from Microsoft Research) released with the paper [Focal Modulation Networks](https://arxiv.org/abs/2203.11926) by Jianwei Yang, Chunyuan Li, Xiyang Dai, Lu Yuan, Jianfeng Gao.
 1. **[Funnel Transformer](https://huggingface.co/docs/transformers/model_doc/funnel)** (from CMU/Google Brain) released with the paper [Funnel-Transformer: Filtering out Sequential Redundancy for Efficient Language Processing](https://arxiv.org/abs/2006.03236) by Zihang Dai, Guokun Lai, Yiming Yang, Quoc V. Le.
 1. **[GIT](https://huggingface.co/docs/transformers/model_doc/git)** (from Microsoft Research) released with the paper [GIT: A Generative Image-to-text Transformer for Vision and Language](https://arxiv.org/abs/2205.14100) by Jianfeng Wang, Zhengyuan Yang, Xiaowei Hu, Linjie Li, Kevin Lin, Zhe Gan, Zicheng Liu, Ce Liu, Lijuan Wang.
 1. **[GLPN](https://huggingface.co/docs/transformers/model_doc/glpn)** (from KAIST) released with the paper [Global-Local Path Networks for Monocular Depth Estimation with Vertical CutDepth](https://arxiv.org/abs/2201.07436) by Doyeon Kim, Woonghyun Ga, Pyungwhan Ahn, Donggyu Joo, Sehwan Chun, Junmo Kim.
@ -325,7 +322,6 @@ conda install -c huggingface transformers
 1. **[GPT-2](https://huggingface.co/docs/transformers/model_doc/gpt2)** (from OpenAI) released with the paper [Language Models are Unsupervised Multitask Learners](https://blog.openai.com/better-language-models/) by Alec Radford*, Jeffrey Wu*, Rewon Child, David Luan, Dario Amodei** and Ilya Sutskever**.
 1. **[GPT-J](https://huggingface.co/docs/transformers/model_doc/gptj)** (from EleutherAI) released with the paper [kingoflolz/mesh-transformer-jax](https://github.com/kingoflolz/mesh-transformer-jax/) by Ben Wang and Aran Komatsuzaki.
 1. **[GPT-Sw3](https://huggingface.co/docs/transformers/model_doc/gpt-sw3)** (from AI-Sweden) released with the paper [Lessons Learned from GPT-SW3: Building the First Large-Scale Generative Language Model for Swedish](http://www.lrec-conf.org/proceedings/lrec2022/pdf/2022.lrec-1.376.pdf) by Ariel Ekgren, Amaru Cuba Gyllensten, Evangelia Gogoulou, Alice Heiman, Severine Verlinden, Joey Öhman, Fredrik Carlsson, Magnus Sahlgren.
-1. **[GPTBigCode](https://huggingface.co/docs/transformers/model_doc/gpt_bigcode)** (from BigCode) released with the paper [SantaCoder: don't reach for the stars!](https://arxiv.org/abs/2301.03988) by Loubna Ben Allal, Raymond Li, Denis Kocetkov, Chenghao Mou, Christopher Akiki, Carlos Munoz Ferrandis, Niklas Muennighoff, Mayank Mishra, Alex Gu, Manan Dey, Logesh Kumar Umapathi, Carolyn Jane Anderson, Yangtian Zi, Joel Lamy Poirier, Hailey Schoelkopf, Sergey Troshin, Dmitry Abulkhanov, Manuel Romero, Michael Lappert, Francesco De Toni, Bernardo García del Río, Qian Liu, Shamik Bose, Urvashi Bhattacharyya, Terry Yue Zhuo, Ian Yu, Paulo Villegas, Marco Zocca, Sourab Mangrulkar, David Lansky, Huu Nguyen, Danish Contractor, Luis Villa, Jia Li, Dzmitry Bahdanau, Yacine Jernite, Sean Hughes, Daniel Fried, Arjun Guha, Harm de Vries, Leandro von Werra.
 1. **[GPTSAN-japanese](https://huggingface.co/docs/transformers/model_doc/gptsan-japanese)** released in the repository [tanreinama/GPTSAN](https://github.com/tanreinama/GPTSAN/blob/main/report/model.md) by 坂本俊之(tanreinama).
 1. **[Graphormer](https://huggingface.co/docs/transformers/model_doc/graphormer)** (from Microsoft) released with the paper [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234) by Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu.
 1. **[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (from UCSD, NVIDIA) released with the paper [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094) by Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang.
@ -341,7 +337,7 @@ conda install -c huggingface transformers
 1. **[LED](https://huggingface.co/docs/transformers/model_doc/led)** (from AllenAI) released with the paper [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) by Iz Beltagy, Matthew E. Peters, Arman Cohan.
 1. **[LeViT](https://huggingface.co/docs/transformers/model_doc/levit)** (from Meta AI) released with the paper [LeViT: A Vision Transformer in ConvNet's Clothing for Faster Inference](https://arxiv.org/abs/2104.01136) by Ben Graham, Alaaeldin El-Nouby, Hugo Touvron, Pierre Stock, Armand Joulin, Hervé Jégou, Matthijs Douze.
 1. **[LiLT](https://huggingface.co/docs/transformers/model_doc/lilt)** (from South China University of Technology) released with the paper [LiLT: A Simple yet Effective Language-Independent Layout Transformer for Structured Document Understanding](https://arxiv.org/abs/2202.13669) by Jiapeng Wang, Lianwen Jin, Kai Ding.
-1. **[LLaMA](https://huggingface.co/docs/transformers/model_doc/llama)** (from The FAIR team of Meta AI) released with the paper [LLaMA: Open and Efficient Foundation Language Models](https://arxiv.org/abs/2302.13971) by Hugo Touvron, Thibaut Lavril, Gautier Izacard, Xavier Martinet, Marie-Anne Lachaux, Timothée Lacroix, Baptiste Rozière, Naman Goyal, Eric Hambro, Faisal Azhar, Aurelien Rodriguez, Armand Joulin, Edouard Grave, Guillaume Lample.
+1. **[LLaMA](https://huggingface.co/docs/transformers/main/model_doc/llama)** (from The FAIR team of Meta AI) released with the paper [LLaMA: Open and Efficient Foundation Language Models](https://arxiv.org/abs/2302.13971) by Hugo Touvron, Thibaut Lavril, Gautier Izacard, Xavier Martinet, Marie-Anne Lachaux, Timothée Lacroix, Baptiste Rozière, Naman Goyal, Eric Hambro, Faisal Azhar, Aurelien Rodriguez, Armand Joulin, Edouard Grave, Guillaume Lample.
 1. **[Longformer](https://huggingface.co/docs/transformers/model_doc/longformer)** (from AllenAI) released with the paper [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) by Iz Beltagy, Matthew E. Peters, Arman Cohan.
 1. **[LongT5](https://huggingface.co/docs/transformers/model_doc/longt5)** (from Google AI) released with the paper [LongT5: Efficient Text-To-Text Transformer for Long Sequences](https://arxiv.org/abs/2112.07916) by Mandy Guo, Joshua Ainslie, David Uthus, Santiago Ontanon, Jianmo Ni, Yun-Hsuan Sung, Yinfei Yang.
 1. **[LUKE](https://huggingface.co/docs/transformers/model_doc/luke)** (from Studio Ousia) released with the paper [LUKE: Deep Contextualized Entity Representations with Entity-aware Self-attention](https://arxiv.org/abs/2010.01057) by Ikuya Yamada, Akari Asai, Hiroyuki Shindo, Hideaki Takeda, Yuji Matsumoto.
@ -352,10 +348,9 @@ conda install -c huggingface transformers
 1. **[MarkupLM](https://huggingface.co/docs/transformers/model_doc/markuplm)** (from Microsoft Research Asia) released with the paper [MarkupLM: Pre-training of Text and Markup Language for Visually-rich Document Understanding](https://arxiv.org/abs/2110.08518) by Junlong Li, Yiheng Xu, Lei Cui, Furu Wei.
 1. **[Mask2Former](https://huggingface.co/docs/transformers/model_doc/mask2former)** (from FAIR and UIUC) released with the paper [Masked-attention Mask Transformer for Universal Image Segmentation](https://arxiv.org/abs/2112.01527) by Bowen Cheng, Ishan Misra, Alexander G. Schwing, Alexander Kirillov, Rohit Girdhar.
 1. **[MaskFormer](https://huggingface.co/docs/transformers/model_doc/maskformer)** (from Meta and UIUC) released with the paper [Per-Pixel Classification is Not All You Need for Semantic Segmentation](https://arxiv.org/abs/2107.06278) by Bowen Cheng, Alexander G. Schwing, Alexander Kirillov.
-1. **[MatCha](https://huggingface.co/docs/transformers/model_doc/matcha)** (from Google AI) released with the paper [MatCha: Enhancing Visual Language Pretraining with Math Reasoning and Chart Derendering](https://arxiv.org/abs/2212.09662) by Fangyu Liu, Francesco Piccinno, Syrine Krichene, Chenxi Pang, Kenton Lee, Mandar Joshi, Yasemin Altun, Nigel Collier, Julian Martin Eisenschlos.
 1. **[mBART](https://huggingface.co/docs/transformers/model_doc/mbart)** (from Facebook) released with the paper [Multilingual Denoising Pre-training for Neural Machine Translation](https://arxiv.org/abs/2001.08210) by Yinhan Liu, Jiatao Gu, Naman Goyal, Xian Li, Sergey Edunov, Marjan Ghazvininejad, Mike Lewis, Luke Zettlemoyer.
 1. **[mBART-50](https://huggingface.co/docs/transformers/model_doc/mbart)** (from Facebook) released with the paper [Multilingual Translation with Extensible Multilingual Pretraining and Finetuning](https://arxiv.org/abs/2008.00401) by Yuqing Tang, Chau Tran, Xian Li, Peng-Jen Chen, Naman Goyal, Vishrav Chaudhary, Jiatao Gu, Angela Fan.
-1. **[MEGA](https://huggingface.co/docs/transformers/model_doc/mega)** (from Facebook) released with the paper [Mega: Moving Average Equipped Gated Attention](https://arxiv.org/abs/2209.10655) by Xuezhe Ma, Chunting Zhou, Xiang Kong, Junxian He, Liangke Gui, Graham Neubig, Jonathan May, and Luke Zettlemoyer.
+1. **[MEGA](https://huggingface.co/docs/transformers/main/model_doc/mega)** (from Facebook) released with the paper [Mega: Moving Average Equipped Gated Attention](https://arxiv.org/abs/2209.10655) by Xuezhe Ma, Chunting Zhou, Xiang Kong, Junxian He, Liangke Gui, Graham Neubig, Jonathan May, and Luke Zettlemoyer.
 1. **[Megatron-BERT](https://huggingface.co/docs/transformers/model_doc/megatron-bert)** (from NVIDIA) released with the paper [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro.
 1. **[Megatron-GPT2](https://huggingface.co/docs/transformers/model_doc/megatron_gpt2)** (from NVIDIA) released with the paper [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro.
 1. **[MGP-STR](https://huggingface.co/docs/transformers/model_doc/mgp-str)** (from Alibaba Research) released with the paper [Multi-Granularity Prediction for Scene Text Recognition](https://arxiv.org/abs/2209.03592) by Peng Wang, Cheng Da, and Cong Yao.
@ -370,17 +365,15 @@ conda install -c huggingface transformers
 1. **[NAT](https://huggingface.co/docs/transformers/model_doc/nat)** (from SHI Labs) released with the paper [Neighborhood Attention Transformer](https://arxiv.org/abs/2204.07143) by Ali Hassani, Steven Walton, Jiachen Li, Shen Li, and Humphrey Shi.
 1. **[Nezha](https://huggingface.co/docs/transformers/model_doc/nezha)** (from Huawei Noah’s Ark Lab) released with the paper [NEZHA: Neural Contextualized Representation for Chinese Language Understanding](https://arxiv.org/abs/1909.00204) by Junqiu Wei, Xiaozhe Ren, Xiaoguang Li, Wenyong Huang, Yi Liao, Yasheng Wang, Jiashu Lin, Xin Jiang, Xiao Chen and Qun Liu.
 1. **[NLLB](https://huggingface.co/docs/transformers/model_doc/nllb)** (from Meta) released with the paper [No Language Left Behind: Scaling Human-Centered Machine Translation](https://arxiv.org/abs/2207.04672) by the NLLB team.
-1. **[NLLB-MOE](https://huggingface.co/docs/transformers/model_doc/nllb-moe)** (from Meta) released with the paper [No Language Left Behind: Scaling Human-Centered Machine Translation](https://arxiv.org/abs/2207.04672) by the NLLB team.
 1. **[Nyströmformer](https://huggingface.co/docs/transformers/model_doc/nystromformer)** (from the University of Wisconsin - Madison) released with the paper [Nyströmformer: A Nyström-Based Algorithm for Approximating Self-Attention](https://arxiv.org/abs/2102.03902) by Yunyang Xiong, Zhanpeng Zeng, Rudrasis Chakraborty, Mingxing Tan, Glenn Fung, Yin Li, Vikas Singh.
 1. **[OneFormer](https://huggingface.co/docs/transformers/model_doc/oneformer)** (from SHI Labs) released with the paper [OneFormer: One Transformer to Rule Universal Image Segmentation](https://arxiv.org/abs/2211.06220) by Jitesh Jain, Jiachen Li, MangTik Chiu, Ali Hassani, Nikita Orlov, Humphrey Shi.
-1. **[OpenLlama](https://huggingface.co/docs/transformers/model_doc/open-llama)** (from [s-JoL](https://huggingface.co/s-JoL)) released in [Open-Llama](https://github.com/s-JoL/Open-Llama). 
 1. **[OPT](https://huggingface.co/docs/transformers/master/model_doc/opt)** (from Meta AI) released with the paper [OPT: Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068) by Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen et al.
 1. **[OWL-ViT](https://huggingface.co/docs/transformers/model_doc/owlvit)** (from Google AI) released with the paper [Simple Open-Vocabulary Object Detection with Vision Transformers](https://arxiv.org/abs/2205.06230) by Matthias Minderer, Alexey Gritsenko, Austin Stone, Maxim Neumann, Dirk Weissenborn, Alexey Dosovitskiy, Aravindh Mahendran, Anurag Arnab, Mostafa Dehghani, Zhuoran Shen, Xiao Wang, Xiaohua Zhai, Thomas Kipf, and Neil Houlsby.
 1. **[Pegasus](https://huggingface.co/docs/transformers/model_doc/pegasus)** (from Google) released with the paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu.
 1. **[PEGASUS-X](https://huggingface.co/docs/transformers/model_doc/pegasus_x)** (from Google) released with the paper [Investigating Efficiently Extending Transformers for Long Input Summarization](https://arxiv.org/abs/2208.04347) by Jason Phang, Yao Zhao, Peter J. Liu.
 1. **[Perceiver IO](https://huggingface.co/docs/transformers/model_doc/perceiver)** (from Deepmind) released with the paper [Perceiver IO: A General Architecture for Structured Inputs & Outputs](https://arxiv.org/abs/2107.14795) by Andrew Jaegle, Sebastian Borgeaud, Jean-Baptiste Alayrac, Carl Doersch, Catalin Ionescu, David Ding, Skanda Koppula, Daniel Zoran, Andrew Brock, Evan Shelhamer, Olivier Hénaff, Matthew M. Botvinick, Andrew Zisserman, Oriol Vinyals, João Carreira.
 1. **[PhoBERT](https://huggingface.co/docs/transformers/model_doc/phobert)** (from VinAI Research) released with the paper [PhoBERT: Pre-trained language models for Vietnamese](https://www.aclweb.org/anthology/2020.findings-emnlp.92/) by Dat Quoc Nguyen and Anh Tuan Nguyen.
-1. **[Pix2Struct](https://huggingface.co/docs/transformers/model_doc/pix2struct)** (from Google) released with the paper [Pix2Struct: Screenshot Parsing as Pretraining for Visual Language Understanding](https://arxiv.org/abs/2210.03347) by Kenton Lee, Mandar Joshi, Iulia Turc, Hexiang Hu, Fangyu Liu, Julian Eisenschlos, Urvashi Khandelwal, Peter Shaw, Ming-Wei Chang, Kristina Toutanova.
+1. **[Pix2Struct](https://huggingface.co/docs/transformers/main/model_doc/pix2struct)** (from Google) released with the paper [Pix2Struct: Screenshot Parsing as Pretraining for Visual Language Understanding](https://arxiv.org/abs/2210.03347) by Kenton Lee, Mandar Joshi, Iulia Turc, Hexiang Hu, Fangyu Liu, Julian Eisenschlos, Urvashi Khandelwal, Peter Shaw, Ming-Wei Chang, Kristina Toutanova.
 1. **[PLBart](https://huggingface.co/docs/transformers/model_doc/plbart)** (from UCLA NLP) released with the paper [Unified Pre-training for Program Understanding and Generation](https://arxiv.org/abs/2103.06333) by Wasi Uddin Ahmad, Saikat Chakraborty, Baishakhi Ray, Kai-Wei Chang.
 1. **[PoolFormer](https://huggingface.co/docs/transformers/model_doc/poolformer)** (from Sea AI Labs) released with the paper [MetaFormer is Actually What You Need for Vision](https://arxiv.org/abs/2111.11418) by Yu, Weihao and Luo, Mi and Zhou, Pan and Si, Chenyang and Zhou, Yichen and Wang, Xinchao and Feng, Jiashi and Yan, Shuicheng.
 1. **[ProphetNet](https://huggingface.co/docs/transformers/model_doc/prophetnet)** (from Microsoft Research) released with the paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou.
@ -395,9 +388,7 @@ conda install -c huggingface transformers
 1. **[RoBERTa-PreLayerNorm](https://huggingface.co/docs/transformers/model_doc/roberta-prelayernorm)** (from Facebook) released with the paper [fairseq: A Fast, Extensible Toolkit for Sequence Modeling](https://arxiv.org/abs/1904.01038) by Myle Ott, Sergey Edunov, Alexei Baevski, Angela Fan, Sam Gross, Nathan Ng, David Grangier, Michael Auli.
 1. **[RoCBert](https://huggingface.co/docs/transformers/model_doc/roc_bert)** (from WeChatAI) released with the paper [RoCBert: Robust Chinese Bert with Multimodal Contrastive Pretraining](https://aclanthology.org/2022.acl-long.65.pdf) by Hui Su, Weiwei Shi, Xiaoyu Shen, Xiao Zhou, Tuo Ji, Jiarui Fang, Jie Zhou.
 1. **[RoFormer](https://huggingface.co/docs/transformers/model_doc/roformer)** (from ZhuiyiTechnology), released together with the paper [RoFormer: Enhanced Transformer with Rotary Position Embedding](https://arxiv.org/pdf/2104.09864v1.pdf) by Jianlin Su and Yu Lu and Shengfeng Pan and Bo Wen and Yunfeng Liu.
-1. **[RWKV](https://huggingface.co/docs/transformers/model_doc/rwkv)** (from Bo Peng) released with the paper [this repo](https://github.com/BlinkDL/RWKV-LM) by Bo Peng.
 1. **[SegFormer](https://huggingface.co/docs/transformers/model_doc/segformer)** (from NVIDIA) released with the paper [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://arxiv.org/abs/2105.15203) by Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping Luo.
-1. **[Segment Anything](https://huggingface.co/docs/transformers/model_doc/sam)** (from Meta AI) released with the paper [Segment Anything](https://arxiv.org/pdf/2304.02643v1.pdf) by Alexander Kirillov, Eric Mintun, Nikhila Ravi, Hanzi Mao, Chloe Rolland, Laura Gustafson, Tete Xiao, Spencer Whitehead, Alex Berg, Wan-Yen Lo, Piotr Dollar, Ross Girshick.
 1. **[SEW](https://huggingface.co/docs/transformers/model_doc/sew)** (from ASAPP) released with the paper [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) by Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi.
 1. **[SEW-D](https://huggingface.co/docs/transformers/model_doc/sew_d)** (from ASAPP) released with the paper [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) by Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi.
 1. **[SpeechT5](https://huggingface.co/docs/transformers/model_doc/speecht5)** (from Microsoft Research) released with the paper [SpeechT5: Unified-Modal Encoder-Decoder Pre-Training for Spoken Language Processing](https://arxiv.org/abs/2110.07205) by Junyi Ao, Rui Wang, Long Zhou, Chengyi Wang, Shuo Ren, Yu Wu, Shujie Liu, Tom Ko, Qing Li, Yu Zhang, Zhihua Wei, Yao Qian, Jinyu Li, Furu Wei.
@ -405,7 +396,6 @@ conda install -c huggingface transformers
 1. **[SpeechToTextTransformer2](https://huggingface.co/docs/transformers/model_doc/speech_to_text_2)** (from Facebook) released with the paper [Large-Scale Self- and Semi-Supervised Learning for Speech Translation](https://arxiv.org/abs/2104.06678) by Changhan Wang, Anne Wu, Juan Pino, Alexei Baevski, Michael Auli, Alexis Conneau.
 1. **[Splinter](https://huggingface.co/docs/transformers/model_doc/splinter)** (from Tel Aviv University) released with the paper [Few-Shot Question Answering by Pretraining Span Selection](https://arxiv.org/abs/2101.00438) by Ori Ram, Yuval Kirstain, Jonathan Berant, Amir Globerson, Omer Levy.
 1. **[SqueezeBERT](https://huggingface.co/docs/transformers/model_doc/squeezebert)** (from Berkeley) released with the paper [SqueezeBERT: What can computer vision teach NLP about efficient neural networks?](https://arxiv.org/abs/2006.11316) by Forrest N. Iandola, Albert E. Shaw, Ravi Krishna, and Kurt W. Keutzer.
-1. **[SwiftFormer](https://huggingface.co/docs/transformers/main/model_doc/swiftformer)** (from MBZUAI) released with the paper [SwiftFormer: Efficient Additive Attention for Transformer-based Real-time Mobile Vision Applications](https://arxiv.org/abs/2303.15446) by Abdelrahman Shaker, Muhammad Maaz, Hanoona Rasheed, Salman Khan, Ming-Hsuan Yang, Fahad Shahbaz Khan.
 1. **[Swin Transformer](https://huggingface.co/docs/transformers/model_doc/swin)** (from Microsoft) released with the paper [Swin Transformer: Hierarchical Vision Transformer using Shifted Windows](https://arxiv.org/abs/2103.14030) by Ze Liu, Yutong Lin, Yue Cao, Han Hu, Yixuan Wei, Zheng Zhang, Stephen Lin, Baining Guo.
 1. **[Swin Transformer V2](https://huggingface.co/docs/transformers/model_doc/swinv2)** (from Microsoft) released with the paper [Swin Transformer V2: Scaling Up Capacity and Resolution](https://arxiv.org/abs/2111.09883) by Ze Liu, Han Hu, Yutong Lin, Zhuliang Yao, Zhenda Xie, Yixuan Wei, Jia Ning, Yue Cao, Zheng Zhang, Li Dong, Furu Wei, Baining Guo.
 1. **[Swin2SR](https://huggingface.co/docs/transformers/model_doc/swin2sr)** (from University of Würzburg) released with the paper [Swin2SR: SwinV2 Transformer for Compressed Image Super-Resolution and Restoration](https://arxiv.org/abs/2209.11345) by Marcos V. Conde, Ui-Jin Choi, Maxime Burchi, Radu Timofte.
--- a/conftest.py
+++ b/conftest.py
@ -20,10 +20,6 @@ import sys
 import warnings
 from os.path import abspath, dirname, join

-import _pytest
-
-from transformers.testing_utils import HfDoctestModule, HfDocTestParser
-

 # allow having multiple repository checkouts and not needing to remember to rerun
 # 'pip install -e .[dev]' when switching between checkouts and running tests.
@ -42,10 +38,10 @@ def pytest_configure(config):
    config.addinivalue_line(
        "markers", "is_pt_flax_cross_test: mark test to run only when PT and FLAX interactions are tested"
    )
-    config.addinivalue_line("markers", "is_pipeline_test: mark test to run only when pipelines are tested")
+    config.addinivalue_line(
+        "markers", "is_pipeline_test: mark test to run only when pipelines are tested"
+    )
    config.addinivalue_line("markers", "is_staging_test: mark test to run only in the staging environment")
-    config.addinivalue_line("markers", "accelerate_tests: mark test that require accelerate")
-    config.addinivalue_line("markers", "tool_tests: mark the tool tests that are run on their specific schedule")


 def pytest_addoption(parser):
@ -69,7 +65,7 @@ def pytest_sessionfinish(session, exitstatus):


 # Doctest custom flag to ignore output.
-IGNORE_RESULT = doctest.register_optionflag("IGNORE_RESULT")
+IGNORE_RESULT = doctest.register_optionflag('IGNORE_RESULT')

 OutputChecker = doctest.OutputChecker

@ -82,5 +78,3 @@ class CustomOutputChecker(OutputChecker):


 doctest.OutputChecker = CustomOutputChecker
-_pytest.doctest.DoctestModule = HfDoctestModule
-doctest.DocTestParser = HfDocTestParser
--- a/docker/transformers-all-latest-gpu/Dockerfile
+++ b/docker/transformers-all-latest-gpu/Dockerfile
@ -9,7 +9,7 @@ SHELL ["sh", "-lc"]
 # The following `ARG` are mainly used to specify the versions explicitly & directly in this docker file, and not meant
 # to be used as arguments for docker build (so far).

-ARG PYTORCH='2.0.1'
+ARG PYTORCH='2.0.0'
 # (not always a valid torch version)
 ARG INTEL_TORCH_EXT='1.11.0'
 # Example: `cu102`, `cu113`, etc.
@ -32,9 +32,15 @@ RUN echo torch=$VERSION
 # TODO: We might need to specify proper versions that work with a specific torch version (especially for past CI).
 RUN [ "$PYTORCH" != "pre" ] && python3 -m pip install --no-cache-dir -U $VERSION torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/$CUDA || python3 -m pip install --no-cache-dir -U --pre torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/nightly/$CUDA

-RUN python3 -m pip install --no-cache-dir -U tensorflow==2.12 protobuf==3.20.3 tensorflow_text tensorflow_probability
+RUN python3 -m pip install --no-cache-dir -U tensorflow==2.11
+RUN python3 -m pip install --no-cache-dir -U tensorflow_probability
 RUN python3 -m pip uninstall -y flax jax

+# To include the change in this commit https://github.com/onnx/tensorflow-onnx/commit/ddca3a5eb2d912f20fe7e0568dd1a3013aee9fa3
+# Otherwise, we get tf2onnx==1.8 (caused by `flatbuffers` version), and some tests fail with `ValueError: from_keras requires input_signature`.
+# TODO: remove this line once the conflict is resolved in these libraries.
+RUN python3 -m pip install --no-cache-dir git+https://github.com/onnx/tensorflow-onnx.git@ddca3a5eb2d912f20fe7e0568dd1a3013aee9fa3
+
 RUN python3 -m pip install --no-cache-dir intel_extension_for_pytorch==$INTEL_TORCH_EXT+cpu -f https://software.intel.com/ipex-whl-stable

 RUN python3 -m pip install --no-cache-dir git+https://github.com/facebookresearch/detectron2.git pytesseract
@ -45,9 +51,6 @@ RUN python3 -m pip install --no-cache-dir git+https://github.com/huggingface/acc
 # Add bitsandbytes for mixed int8 testing
 RUN python3 -m pip install --no-cache-dir bitsandbytes

-# For bettertransformer
-RUN python3 -m pip install --no-cache-dir optimum
-
 # For video model testing
 RUN python3 -m pip install --no-cache-dir decord av==9.2.0

--- a/docker/transformers-past-gpu/Dockerfile
+++ b/docker/transformers-past-gpu/Dockerfile
@ -1,4 +1,4 @@
-ARG BASE_DOCKER_IMAGE
+ARG BASE_DOCKER_IMAGE="nvidia/cuda:11.2.2-cudnn8-devel-ubuntu20.04"
 FROM $BASE_DOCKER_IMAGE
 LABEL maintainer="Hugging Face"

@ -8,7 +8,7 @@ ARG DEBIAN_FRONTEND=noninteractive
 SHELL ["sh", "-lc"]

 RUN apt update
-RUN apt install -y git libsndfile1-dev tesseract-ocr espeak-ng python3 python3-pip ffmpeg git-lfs libaio-dev
+RUN apt install -y git libsndfile1-dev tesseract-ocr espeak-ng python3 python3-pip ffmpeg git-lfs
 RUN git lfs install
 RUN python3 -m pip install --no-cache-dir --upgrade pip

@ -23,11 +23,9 @@ RUN cd transformers && python3 setup.py develop
 ARG FRAMEWORK
 ARG VERSION

-# Control `setuptools` version to avoid some issues
-RUN [ "$VERSION" != "1.9" -a "$VERSION" != "1.10" ] && python3 -m pip install -U setuptools || python3 -m pip install -U "setuptools<=59.5"
-
 # Remove all frameworks
-RUN python3 -m pip uninstall -y torch torchvision torchaudio tensorflow jax flax
+# (`accelerate` requires `torch`, and this causes import issues for TF-only testing)
+RUN python3 -m pip uninstall -y torch torchvision torchaudio accelerate tensorflow jax flax

 # Get the libraries and their versions to install, and write installation command to `~/.profile`.
 RUN python3 ./transformers/utils/past_ci_versions.py --framework $FRAMEWORK --version $VERSION
@ -36,24 +34,4 @@ RUN python3 ./transformers/utils/past_ci_versions.py --framework $FRAMEWORK --ve
 RUN echo "INSTALL_CMD = $INSTALL_CMD"
 RUN $INSTALL_CMD

-RUN [ "$FRAMEWORK" != "pytorch" ] && echo "`deepspeed-testing` installation is skipped" || python3 -m pip install --no-cache-dir ./transformers[deepspeed-testing]
-
-# Remove `accelerate`: it requires `torch`, and this causes import issues for TF-only testing
-# We will install `accelerate@main` in Past CI workflow file
-RUN python3 -m pip uninstall -y accelerate
-
-# Uninstall `torch-tensorrt` and `apex` shipped with the base image
-RUN python3 -m pip uninstall -y torch-tensorrt apex
-
-# Pre-build **nightly** release of DeepSpeed, so it would be ready for testing (otherwise, the 1st deepspeed test will timeout)
-RUN python3 -m pip uninstall -y deepspeed
-# This has to be run inside the GPU VMs running the tests. (So far, it fails here due to GPU checks during compilation.)
-# Issue: https://github.com/microsoft/DeepSpeed/issues/2010
-# RUN git clone https://github.com/microsoft/DeepSpeed && cd DeepSpeed && rm -rf build && \
-#    DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 DS_BUILD_UTILS=1 python3 -m pip install . --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check 2>&1
-
 RUN python3 -m pip install -U "itsdangerous<2.1.0"
-
-# When installing in editable mode, `transformers` is not recognized as a package.
-# this line must be added in order for python to be aware of transformers.
-RUN cd transformers && python3 setup.py develop
--- a/docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile
+++ b/docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile
@ -4,7 +4,7 @@ LABEL maintainer="Hugging Face"

 ARG DEBIAN_FRONTEND=noninteractive

-ARG PYTORCH='2.0.1'
+ARG PYTORCH='2.0.0'
 # Example: `cu102`, `cu113`, etc.
 ARG CUDA='cu117'

@ -22,8 +22,6 @@ RUN python3 -m pip install --no-cache-dir -U torch==$PYTORCH torchvision torchau

 RUN python3 -m pip install --no-cache-dir ./transformers[deepspeed-testing]

-RUN python3 -m pip install --no-cache-dir git+https://github.com/huggingface/accelerate@main#egg=accelerate
-
 # Uninstall `torch-tensorrt` shipped with the base image
 RUN python3 -m pip uninstall -y torch-tensorrt

@ -38,7 +36,7 @@ RUN python3 -m pip uninstall -y deepspeed
 # This has to be run (again) inside the GPU VMs running the tests.
 # The installation works here, but some tests fail, if we don't pre-build deepspeed again in the VMs running the tests.
 # TODO: Find out why the tests fail.
-RUN DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 DS_BUILD_UTILS=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check 2>&1
+RUN DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 DS_BUILD_AIO=1 DS_BUILD_UTILS=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check 2>&1

 # When installing in editable mode, `transformers` is not recognized as a package.
 # this line must be added in order for python to be aware of transformers.
--- a/docker/transformers-pytorch-deepspeed-nightly-gpu/Dockerfile
+++ b/docker/transformers-pytorch-deepspeed-nightly-gpu/Dockerfile
@ -1,11 +1,10 @@
-# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel_22-08.html#rel_22-08
-FROM nvcr.io/nvidia/pytorch:22.08-py3
+FROM nvcr.io/nvidia/pytorch:21.03-py3
 LABEL maintainer="Hugging Face"

 ARG DEBIAN_FRONTEND=noninteractive

 # Example: `cu102`, `cu113`, etc.
-ARG CUDA='cu117'
+ARG CUDA='cu113'

 RUN apt -y update
 RUN apt install -y libaio-dev
@ -21,35 +20,30 @@ RUN python3 -m pip install --no-cache-dir -U --pre torch torchvision torchaudio

 RUN python3 -m pip install --no-cache-dir ./transformers[deepspeed-testing]

-RUN python3 -m pip install --no-cache-dir git+https://github.com/huggingface/accelerate@main#egg=accelerate
-
-# Uninstall `torch-tensorrt` and `apex` shipped with the base image
-RUN python3 -m pip uninstall -y torch-tensorrt apex
-
 # Pre-build **nightly** release of DeepSpeed, so it would be ready for testing (otherwise, the 1st deepspeed test will timeout)
 RUN python3 -m pip uninstall -y deepspeed
 # This has to be run inside the GPU VMs running the tests. (So far, it fails here due to GPU checks during compilation.)
 # Issue: https://github.com/microsoft/DeepSpeed/issues/2010
 # RUN git clone https://github.com/microsoft/DeepSpeed && cd DeepSpeed && rm -rf build && \
-#    DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 DS_BUILD_UTILS=1 python3 -m pip install . --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check 2>&1
+#    DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 DS_BUILD_AIO=1 DS_BUILD_UTILS=1 python3 -m pip install . --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check 2>&1

-## For `torchdynamo` tests
-## (see https://github.com/huggingface/transformers/pull/17765)
-#RUN git clone https://github.com/pytorch/functorch
-#RUN python3 -m pip install --no-cache-dir ./functorch[aot]
-#RUN cd functorch && python3 setup.py develop
-#
-#RUN git clone https://github.com/pytorch/torchdynamo
-#RUN python3 -m pip install -r ./torchdynamo/requirements.txt
-#RUN cd torchdynamo && python3 setup.py develop
-#
-## install TensorRT
-#RUN python3 -m pip install --no-cache-dir -U nvidia-pyindex
-#RUN python3 -m pip install --no-cache-dir -U nvidia-tensorrt==8.2.4.2
-#
-## install torch_tensorrt (fx path)
-#RUN git clone https://github.com/pytorch/TensorRT.git
-#RUN cd TensorRT/py && python3 setup.py install --fx-only
+# For `torchdynamo` tests
+# (see https://github.com/huggingface/transformers/pull/17765)
+RUN git clone https://github.com/pytorch/functorch
+RUN python3 -m pip install --no-cache-dir ./functorch[aot]
+RUN cd functorch && python3 setup.py develop
+
+RUN git clone https://github.com/pytorch/torchdynamo
+RUN python3 -m pip install -r ./torchdynamo/requirements.txt
+RUN cd torchdynamo && python3 setup.py develop
+
+# install TensorRT
+RUN python3 -m pip install --no-cache-dir -U nvidia-pyindex
+RUN python3 -m pip install --no-cache-dir -U nvidia-tensorrt==8.2.4.2
+
+# install torch_tensorrt (fx path)
+RUN git clone https://github.com/pytorch/TensorRT.git
+RUN cd TensorRT/py && python3 setup.py install --fx-only

 # When installing in editable mode, `transformers` is not recognized as a package.
 # this line must be added in order for python to be aware of transformers.
--- a/docker/transformers-pytorch-gpu/Dockerfile
+++ b/docker/transformers-pytorch-gpu/Dockerfile
@ -12,7 +12,7 @@ RUN git clone https://github.com/huggingface/transformers && cd transformers &&
 RUN python3 -m pip install --no-cache-dir -e ./transformers[dev-torch,testing,video]

 # If set to nothing, will install the latest version
-ARG PYTORCH='2.0.1'
+ARG PYTORCH='2.0.0'
 ARG TORCH_VISION=''
 ARG TORCH_AUDIO=''
 # Example: `cu102`, `cu113`, etc.
--- a/docker/transformers-tensorflow-gpu/Dockerfile
+++ b/docker/transformers-tensorflow-gpu/Dockerfile
@ -12,7 +12,7 @@ RUN git clone https://github.com/huggingface/transformers && cd transformers &&
 RUN python3 -m pip install --no-cache-dir -e ./transformers[dev-tensorflow,testing]

 # If set to nothing, will install the latest version
-ARG TENSORFLOW='2.12'
+ARG TENSORFLOW='2.11'

 RUN [ ${#TENSORFLOW} -gt 0 ] && VERSION='tensorflow=='$TENSORFLOW'.*' ||  VERSION='tensorflow'; python3 -m pip install --no-cache-dir -U $VERSION
 RUN python3 -m pip uninstall -y torch flax
--- a/docs/source/de/index.mdx
+++ b/docs/source/de/index.mdx
@ -283,9 +283,9 @@ Flax), PyTorch, und/oder TensorFlow haben.
 |             RAG             |       ✅       |       ❌       |       ✅        |         ✅         |      ❌      |
 |            REALM            |       ✅       |       ✅       |       ✅        |         ❌         |      ❌      |
 |          Reformer           |       ✅       |       ✅       |       ✅        |         ❌         |      ❌      |
-|           RegNet            |       ❌       |       ❌       |       ✅        |         ✅         |      ✅      |
+|           RegNet            |       ❌       |       ❌       |       ✅        |         ✅         |      ❌      |
 |           RemBERT           |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
-|           ResNet            |       ❌       |       ❌       |       ✅        |         ✅         |      ✅      |
+|           ResNet            |       ❌       |       ❌       |       ✅        |         ✅         |      ❌      |
 |          RetriBERT          |       ✅       |       ✅       |       ✅        |         ❌         |      ❌      |
 |           RoBERTa           |       ✅       |       ✅       |       ✅        |         ✅         |      ✅      |
 |          RoFormer           |       ✅       |       ✅       |       ✅        |         ✅         |      ✅      |
--- a/docs/source/en/_toctree.yml
+++ b/docs/source/en/_toctree.yml
@ -8,24 +8,45 @@
  title: Get started
 - sections:
  - local: pipeline_tutorial
-    title: Run inference with pipelines
+    title: Pipelines for inference
  - local: autoclass_tutorial
-    title: Write portable code with AutoClass
+    title: Load pretrained instances with an AutoClass
  - local: preprocessing
-    title: Preprocess data
+    title: Preprocess
  - local: training
    title: Fine-tune a pretrained model
-  - local: run_scripts
-    title: Train with a script
  - local: accelerate
-    title: Set up distributed training with 🤗 Accelerate
+    title: Distributed training with 🤗 Accelerate
  - local: model_sharing
-    title: Share your model
-  - local: transformers_agents
-    title: Agents
+    title: Share a model
  title: Tutorials
 - sections:
  - sections:
+    - local: create_a_model
+      title: Create a custom architecture
+    - local: custom_models
+      title: Sharing custom models
+    - local: run_scripts
+      title: Train with a script
+    - local: sagemaker
+      title: Run training on Amazon SageMaker
+    - local: converting_tensorflow_models
+      title: Converting from TensorFlow checkpoints
+    - local: serialization
+      title: Export to ONNX
+    - local: torchscript
+      title: Export to TorchScript
+    - local: troubleshooting
+      title: Troubleshoot
+    title: General usage
+  - sections:
+    - local: fast_tokenizers
+      title: Use tokenizers from 🤗 Tokenizers
+    - local: multilingual
+      title: Inference for multilingual models
+    - local: generation_strategies
+      title: Text generation strategies
+    - sections:
      - local: tasks/sequence_classification
        title: Text classification
      - local: tasks/token_classification
@ -42,71 +63,38 @@
        title: Summarization
      - local: tasks/multiple_choice
        title: Multiple choice
+      title: Task guides
+      isExpanded: false
    title: Natural Language Processing
-    isExpanded: false
  - sections:
-      - local: tasks/audio_classification
-        title: Audio classification
-      - local: tasks/asr
-        title: Automatic speech recognition
+    - local: tasks/audio_classification
+      title: Audio classification
+    - local: tasks/asr
+      title: Automatic speech recognition
    title: Audio
-    isExpanded: false
  - sections:
-      - local: tasks/image_classification
-        title: Image classification
-      - local: tasks/semantic_segmentation
-        title: Semantic segmentation
-      - local: tasks/video_classification
-        title: Video classification
-      - local: tasks/object_detection
-        title: Object detection
-      - local: tasks/zero_shot_object_detection
-        title: Zero-shot object detection
-      - local: tasks/zero_shot_image_classification
-        title: Zero-shot image classification
-      - local: tasks/monocular_depth_estimation
-        title: Depth estimation
+    - local: tasks/image_classification
+      title: Image classification
+    - local: tasks/semantic_segmentation
+      title: Semantic segmentation
+    - local: tasks/video_classification
+      title: Video classification
+    - local: tasks/object_detection
+      title: Object detection
+    - local: tasks/zero_shot_object_detection
+      title: Zero-shot object detection
+    - local: tasks/zero_shot_image_classification
+      title: Zero-shot image classification
+    - local: tasks/monocular_depth_estimation
+      title: Depth estimation
    title: Computer Vision
-    isExpanded: false
  - sections:
-      - local: tasks/image_captioning
-        title: Image captioning
-      - local: tasks/document_question_answering
-        title: Document Question Answering
-      - local: tasks/text-to-speech
-        title: Text to speech
+    - local: tasks/image_captioning
+      title: Image captioning
+    - local: tasks/document_question_answering
+      title: Document Question Answering
    title: Multimodal
-    isExpanded: false
-  title: Task Guides
- sections:
-    - local: fast_tokenizers
-      title: Use fast tokenizers from 🤗 Tokenizers
-    - local: multilingual
-      title: Run inference with multilingual models
-    - local: generation_strategies
-      title: Customize text generation strategy
-    - local: create_a_model
-      title: Use model-specific APIs
-    - local: custom_models
-      title: Share a custom model
-    - local: sagemaker
-      title: Run training on Amazon SageMaker
-    - local: serialization
-      title: Export to ONNX
-    - local: torchscript
-      title: Export to TorchScript
-    - local: benchmarks
-      title: Benchmarks
-    - local: notebooks
-      title: Notebooks with examples
-    - local: community
-      title: Community resources
-    - local: custom_tools
-      title: Custom Tools and Prompts
-    - local: troubleshooting
-      title: Troubleshoot
-  title: Developer guides
- sections:
+  - sections:
    - local: performance
      title: Overview
    - local: perf_train_gpu_one
@ -141,8 +129,8 @@
      title: Hyperparameter Search using Trainer API
    - local: tf_xla
      title: XLA Integration for TensorFlow Models
-  title: Performance and scalability
- sections:
+    title: Performance and scalability
+  - sections:
    - local: contributing
      title: How to contribute to transformers?
    - local: add_new_model
@ -155,8 +143,16 @@
      title: Testing
    - local: pr_checks
      title: Checks on a Pull Request
-  title: Contribute
-
+    title: Contribute
+  - local: notebooks
+    title: 🤗 Transformers Notebooks
+  - local: community
+    title: Community resources
+  - local: benchmarks
+    title: Benchmarks
+  - local: migration
+    title: Migrating from previous packages
+  title: How-to guides
 - sections:
  - local: philosophy
    title: Philosophy
@ -183,8 +179,6 @@
  title: Conceptual guides
 - sections:
  - sections:
-    - local: main_classes/agent
-      title: Agents and Tools
    - local: model_doc/auto
      title: Auto Classes
    - local: main_classes/callback
@ -269,8 +263,6 @@
        title: ConvBERT
      - local: model_doc/cpm
        title: CPM
-      - local: model_doc/cpmant
-        title: CPMANT
      - local: model_doc/ctrl
        title: CTRL
      - local: model_doc/deberta
@ -317,8 +309,6 @@
        title: GPT-J
      - local: model_doc/gpt2
        title: GPT2
-      - local: model_doc/gpt_bigcode
-        title: GPTBigCode
      - local: model_doc/gptsan-japanese
        title: GPTSAN Japanese
      - local: model_doc/gpt-sw3
@ -367,12 +357,8 @@
        title: NEZHA
      - local: model_doc/nllb
        title: NLLB
-      - local: model_doc/nllb-moe
-        title: NLLB-MoE
      - local: model_doc/nystromformer
        title: Nyströmformer
-      - local: model_doc/open-llama
-        title: Open-Llama
      - local: model_doc/opt
        title: OPT
      - local: model_doc/pegasus
@ -405,8 +391,6 @@
        title: RoCBert
      - local: model_doc/roformer
        title: RoFormer
-      - local: model_doc/rwkv
-        title: RWKV
      - local: model_doc/splinter
        title: Splinter
      - local: model_doc/squeezebert
@ -474,8 +458,6 @@
        title: EfficientFormer
      - local: model_doc/efficientnet
        title: EfficientNet
-      - local: model_doc/focalnet
-        title: FocalNet
      - local: model_doc/glpn
        title: GLPN
      - local: model_doc/imagegpt
@ -502,8 +484,6 @@
        title: ResNet
      - local: model_doc/segformer
        title: SegFormer
-      - local: model_doc/swiftformer
-        title: SwiftFormer
      - local: model_doc/swin
        title: Swin Transformer
      - local: model_doc/swinv2
@ -590,8 +570,6 @@
        title: CLIPSeg
      - local: model_doc/data2vec
        title: Data2Vec
-      - local: model_doc/deplot
-        title: DePlot
      - local: model_doc/donut
        title: Donut
      - local: model_doc/flava
@ -612,8 +590,6 @@
        title: LiLT
      - local: model_doc/lxmert
        title: LXMERT
-      - local: model_doc/matcha
-        title: MatCha
      - local: model_doc/mgp-str
        title: MGP-STR
      - local: model_doc/oneformer
@ -624,8 +600,6 @@
        title: Perceiver
      - local: model_doc/pix2struct
        title: Pix2Struct
-      - local: model_doc/sam
-        title: Segment Anything
      - local: model_doc/speech-encoder-decoder
        title: Speech Encoder Decoder Models
      - local: model_doc/tapas
--- a/docs/source/en/add_new_model.mdx
+++ b/docs/source/en/add_new_model.mdx
@ -202,15 +202,7 @@ source .env/bin/activate
 pip install -e ".[dev]"
 ```

-Depending on your OS, and since the number of optional dependencies of Transformers is growing, you might get a
-failure with this command. If that's the case make sure to install the Deep Learning framework you are working with
-(PyTorch, TensorFlow and/or Flax) then do:
-
-```bash
-pip install -e ".[quality]"
-```
-
-which should be enough for most use cases. You can then return to the parent directory
+and return to the parent directory

 ```bash
 cd ..
@ -678,7 +670,7 @@ model.save_pretrained("/path/to/converted/checkpoint/folder")
 **7. Implement the forward pass**

 Having managed to correctly load the pretrained weights into the 🤗 Transformers implementation, you should now make
-sure that the forward pass is correctly implemented. In [Get familiar with the original repository](#34-run-a-pretrained-checkpoint-using-the-original-repository), you have already created a script that runs a forward
+sure that the forward pass is correctly implemented. In [Get familiar with the original repository](#run-a-pretrained-checkpoint-using-the-original-repository), you have already created a script that runs a forward
 pass of the model using the original repository. Now you should write an analogous script using the 🤗 Transformers
 implementation instead of the original one. It should look as follows:

--- a/docs/source/en/add_tensorflow_model.mdx
+++ b/docs/source/en/add_tensorflow_model.mdx
@ -119,13 +119,6 @@ source .env/bin/activate
 pip install -e ".[dev]"
 ```

-Depending on your OS, and since the number of optional dependencies of Transformers is growing, you might get a
-failure with this command. If that's the case make sure to install TensorFlow then do:
-
-```bash
-pip install -e ".[quality]"
-```
-
 **Note:** You don't need to have CUDA installed. Making the new model work on CPU is sufficient.

 4. Create a branch with a descriptive name from your main branch
--- a/docs/source/en/converting_tensorflow_models.mdx
+++ b/docs/source/en/converting_tensorflow_models.mdx
@ -0,0 +1,162 @@
+<!--Copyright 2020 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# Converting From TensorFlow Checkpoints
+
+A command-line interface is provided to convert original Bert/GPT/GPT-2/Transformer-XL/XLNet/XLM checkpoints to models
+that can be loaded using the `from_pretrained` methods of the library.
+
+<Tip>
+
+Since 2.3.0 the conversion script is now part of the transformers CLI (**transformers-cli**) available in any
+transformers >= 2.3.0 installation.
+
+The documentation below reflects the **transformers-cli convert** command format.
+
+</Tip>
+
+## BERT
+
+You can convert any TensorFlow checkpoint for BERT (in particular [the pre-trained models released by Google](https://github.com/google-research/bert#pre-trained-models)) in a PyTorch save file by using the
+[convert_bert_original_tf_checkpoint_to_pytorch.py](https://github.com/huggingface/transformers/tree/main/src/transformers/models/bert/convert_bert_original_tf_checkpoint_to_pytorch.py) script.
+
+This CLI takes as input a TensorFlow checkpoint (three files starting with `bert_model.ckpt`) and the associated
+configuration file (`bert_config.json`), and creates a PyTorch model for this configuration, loads the weights from
+the TensorFlow checkpoint in the PyTorch model and saves the resulting model in a standard PyTorch save file that can
+be imported using `from_pretrained()` (see examples in the [quicktour](quicktour) and in [run_glue.py](https://github.com/huggingface/transformers/tree/main/examples/pytorch/text-classification/run_glue.py)).
+
+You only need to run this conversion script **once** to get a PyTorch model. You can then disregard the TensorFlow
+checkpoint (the three files starting with `bert_model.ckpt`) but be sure to keep the configuration file
+(`bert_config.json`) and the vocabulary file (`vocab.txt`) as these are needed for the PyTorch model too.
+
+To run this specific conversion script you will need to have TensorFlow and PyTorch installed (`pip install tensorflow`). The rest of the repository only requires PyTorch.
+
+Here is an example of the conversion process for a pre-trained `BERT-Base Uncased` model:
+
+```bash
+export BERT_BASE_DIR=/path/to/bert/uncased_L-12_H-768_A-12
+
+transformers-cli convert --model_type bert \
+  --tf_checkpoint $BERT_BASE_DIR/bert_model.ckpt \
+  --config $BERT_BASE_DIR/bert_config.json \
+  --pytorch_dump_output $BERT_BASE_DIR/pytorch_model.bin
+```
+
+You can download Google's pre-trained models for the conversion [here](https://github.com/google-research/bert#pre-trained-models).
+
+## ALBERT
+
+Convert TensorFlow model checkpoints of ALBERT to PyTorch using the
+[convert_albert_original_tf_checkpoint_to_pytorch.py](https://github.com/huggingface/transformers/tree/main/src/transformers/models/albert/convert_albert_original_tf_checkpoint_to_pytorch.py) script.
+
+The CLI takes as input a TensorFlow checkpoint (three files starting with `model.ckpt-best`) and the accompanying
+configuration file (`albert_config.json`), then creates and saves a PyTorch model. To run this conversion you will
+need to have TensorFlow and PyTorch installed.
+
+Here is an example of the conversion process for the pre-trained `ALBERT Base` model:
+
+```bash
+export ALBERT_BASE_DIR=/path/to/albert/albert_base
+
+transformers-cli convert --model_type albert \
+  --tf_checkpoint $ALBERT_BASE_DIR/model.ckpt-best \
+  --config $ALBERT_BASE_DIR/albert_config.json \
+  --pytorch_dump_output $ALBERT_BASE_DIR/pytorch_model.bin
+```
+
+You can download Google's pre-trained models for the conversion [here](https://github.com/google-research/albert#pre-trained-models).
+
+## OpenAI GPT
+
+Here is an example of the conversion process for a pre-trained OpenAI GPT model, assuming that your NumPy checkpoint
+is saved in the same format as the OpenAI pretrained model (see
+[here](https://github.com/openai/finetune-transformer-lm)):
+
+```bash
+export OPENAI_GPT_CHECKPOINT_FOLDER_PATH=/path/to/openai/pretrained/numpy/weights
+
+transformers-cli convert --model_type gpt \
+  --tf_checkpoint $OPENAI_GPT_CHECKPOINT_FOLDER_PATH \
+  --pytorch_dump_output $PYTORCH_DUMP_OUTPUT \
+  [--config OPENAI_GPT_CONFIG] \
+  [--finetuning_task_name OPENAI_GPT_FINETUNED_TASK]
+```
+
+## OpenAI GPT-2
+
+Here is an example of the conversion process for a pre-trained OpenAI GPT-2 model (see [here](https://github.com/openai/gpt-2))
+
+```bash
+export OPENAI_GPT2_CHECKPOINT_PATH=/path/to/gpt2/pretrained/weights
+
+transformers-cli convert --model_type gpt2 \
+  --tf_checkpoint $OPENAI_GPT2_CHECKPOINT_PATH \
+  --pytorch_dump_output $PYTORCH_DUMP_OUTPUT \
+  [--config OPENAI_GPT2_CONFIG] \
+  [--finetuning_task_name OPENAI_GPT2_FINETUNED_TASK]
+```
+
+## Transformer-XL
+
+Here is an example of the conversion process for a pre-trained Transformer-XL model (see [here](https://github.com/kimiyoung/transformer-xl/tree/master/tf#obtain-and-evaluate-pretrained-sota-models))
+
+```bash
+export TRANSFO_XL_CHECKPOINT_FOLDER_PATH=/path/to/transfo/xl/checkpoint
+
+transformers-cli convert --model_type transfo_xl \
+  --tf_checkpoint $TRANSFO_XL_CHECKPOINT_FOLDER_PATH \
+  --pytorch_dump_output $PYTORCH_DUMP_OUTPUT \
+  [--config TRANSFO_XL_CONFIG] \
+  [--finetuning_task_name TRANSFO_XL_FINETUNED_TASK]
+```
+
+## XLNet
+
+Here is an example of the conversion process for a pre-trained XLNet model:
+
+```bash
+export XLNET_CHECKPOINT_PATH=/path/to/xlnet/checkpoint
+export XLNET_CONFIG_PATH=/path/to/xlnet/config
+
+transformers-cli convert --model_type xlnet \
+  --tf_checkpoint $XLNET_CHECKPOINT_PATH \
+  --config $XLNET_CONFIG_PATH \
+  --pytorch_dump_output $PYTORCH_DUMP_OUTPUT \
+  [--finetuning_task_name XLNET_FINETUNED_TASK]
+```
+
+## XLM
+
+Here is an example of the conversion process for a pre-trained XLM model:
+
+```bash
+export XLM_CHECKPOINT_PATH=/path/to/xlm/checkpoint
+
+transformers-cli convert --model_type xlm \
+  --tf_checkpoint $XLM_CHECKPOINT_PATH \
+  --pytorch_dump_output $PYTORCH_DUMP_OUTPUT \
+  [--config XLM_CONFIG] \
+  [--finetuning_task_name XLM_FINETUNED_TASK]
+```
+
+## T5
+
+Here is an example of the conversion process for a pre-trained T5 model:
+
+```bash
+export T5=/path/to/t5/uncased_L-12_H-768_A-12
+
+transformers-cli convert --model_type t5 \
+  --tf_checkpoint $T5/t5_model.ckpt \
+  --config $T5/t5_config.json \
+  --pytorch_dump_output $T5/pytorch_model.bin
+```
--- a/docs/source/en/custom_tools.mdx
+++ b/docs/source/en/custom_tools.mdx
@ -1,778 +0,0 @@
-<!--Copyright 2023 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-->
-
-# Custom Tools and Prompts
-
-<Tip>
-
-If you are not aware of what tools and agents are in the context of transformers, we recommend you read the
-[Transformers Agents](transformers_agents) page first.
-
-</Tip>
-
-<Tip warning={true}>
-
-Transformers Agent is an experimental API that is subject to change at any time. Results returned by the agents
-can vary as the APIs or underlying models are prone to change.
-
-</Tip>
-
-Creating and using custom tools and prompts is paramount to empowering the agent and having it perform new tasks.
-In this guide we'll take a look at:
-
- How to customize the prompt
- How to use custom tools
- How to create custom tools
-
-## Customizing the prompt
-
-As explained in [Transformers Agents](transformers_agents) agents can run in [`~Agent.run`] and [`~Agent.chat`] mode.
-Both the `run` and `chat` modes underlie the same logic. The language model powering the agent is conditioned on a long 
-prompt and completes the prompt by generating the next tokens until the stop token is reached.
-The only difference between the two modes is that during the `chat` mode the prompt is extended with 
-previous user inputs and model generations. This allows the agent to have access to past interactions,
-seemingly giving the agent some kind of memory.
-
-### Structure of the prompt
-
-Let's take a closer look at how the prompt is structured to understand how it can be best customized.
-The prompt is structured broadly into four parts.
-
- 1. Introduction: how the agent should behave, explanation of the concept of tools.
- 2. Description of all the tools. This is defined by a `<<all_tools>>` token that is dynamically replaced at runtime with the tools defined/chosen by the user.
- 3. A set of examples of tasks and their solution
- 4. Current example, and request for solution.
-
-To better understand each part, let's look at a shortened version of how the `run` prompt can look like:
-
-````text
-I will ask you to perform a task, your job is to come up with a series of simple commands in Python that will perform the task.
-[...]
-You can print intermediate results if it makes sense to do so.
-
-Tools:
- document_qa: This is a tool that answers a question about a document (pdf). It takes an input named `document` which should be the document containing the information, as well as a `question` that is the question about the document. It returns a text that contains the answer to the question.
- image_captioner: This is a tool that generates a description of an image. It takes an input named `image` which should be the image to the caption and returns a text that contains the description in English.
-[...]
-
-Task: "Answer the question in the variable `question` about the image stored in the variable `image`. The question is in French."
-
-I will use the following tools: `translator` to translate the question into English and then `image_qa` to answer the question on the input image.
-
-Answer:
-```py
-translated_question = translator(question=question, src_lang="French", tgt_lang="English")
-print(f"The translated question is {translated_question}.")
-answer = image_qa(image=image, question=translated_question)
-print(f"The answer is {answer}")
-```
-
-Task: "Identify the oldest person in the `document` and create an image showcasing the result as a banner."
-
-I will use the following tools: `document_qa` to find the oldest person in the document, then `image_generator` to generate an image according to the answer.
-
-Answer:
-```py
-answer = document_qa(document, question="What is the oldest person?")
-print(f"The answer is {answer}.")
-image = image_generator("A banner showing " + answer)
-```
-
-[...]
-
-Task: "Draw me a picture of rivers and lakes"
-
-I will use the following
-````
-
-The introduction (the text before *"Tools:"*) explains precisely how the model shall behave and what it should do.
-This part most likely does not need to be customized as the agent shall always behave the same way.
-
-The second part (the bullet points below *"Tools"*) is dynamically added upon calling `run` or `chat`. There are 
-exactly as many bullet points as there are tools in `agent.toolbox` and each bullet point consists of the name 
-and description of the tool:
-
-```text
- <tool.name>: <tool.description>
-```
-
-Let's verify this quickly by loading the document_qa tool and printing out the name and description.
-
-```py
-from transformers import load_tool
-
-document_qa = load_tool("document-question-answering")
-print(f"- {document_qa.name}: {document_qa.description}")
-```
-
-which gives:
-```text
- document_qa: This is a tool that answers a question about a document (pdf). It takes an input named `document` which should be the document containing the information, as well as a `question` that is the question about the document. It returns a text that contains the answer to the question.
-```
-
-We can see that the tool name is short and precise. The description includes two parts, the first explaining 
-what the tool does and the second states what input arguments and return values are expected.
-
-A good tool name and tool description are very important for the agent to correctly use it. Note that the only
-information the agent has about the tool is its name and description, so one should make sure that both 
-are precisely written and match the style of the existing tools in the toolbox. In particular make sure the description
-mentions all the arguments expected by name in code-style, along with the expected type and a description of what they
-are.
-
-<Tip>
-
-Check the naming and description of the curated Transformers tools to better understand what name and 
-description a tool is expected to have. You can see all tools with the [`Agent.toolbox`] property.
-
-</Tip>
-
-The third part includes a set of curated examples that show the agent exactly what code it should produce
-for what kind of user request. The large language models empowering the agent are extremely good at 
-recognizing patterns in a prompt and repeating the pattern with new data. Therefore, it is very important
-that the examples are written in a way that maximizes the likelihood of the agent to generating correct,
-executable code in practice. 
-
-Let's have a look at one example:
-
-````text
-Task: "Identify the oldest person in the `document` and create an image showcasing the result as a banner."
-
-I will use the following tools: `document_qa` to find the oldest person in the document, then `image_generator` to generate an image according to the answer.
-
-Answer:
-```py
-answer = document_qa(document, question="What is the oldest person?")
-print(f"The answer is {answer}.")
-image = image_generator("A banner showing " + answer)
-```
-
-````
-
-The pattern the model is prompted to repeat has three parts: The task statement, the agent's explanation of 
-what it intends to do, and finally the generated code. Every example that is part of the prompt has this exact 
-pattern, thus making sure that the agent will reproduce exactly the same pattern when generating new tokens.
-
-The prompt examples are curated by the Transformers team and rigorously evaluated on a set of 
-[problem statements](https://github.com/huggingface/transformers/blob/main/src/transformers/tools/evaluate_agent.py)
-to ensure that the agent's prompt is as good as possible to solve real use cases of the agent.
-
-The final part of the prompt corresponds to:
-```text
-Task: "Draw me a picture of rivers and lakes"
-
-I will use the following
-```
-
-is a final and unfinished example that the agent is tasked to complete. The unfinished example
-is dynamically created based on the actual user input. For the above example, the user ran:
-
-```py
-agent.run("Draw me a picture of rivers and lakes")
-```
-
-The user input - *a.k.a* the task: *"Draw me a picture of rivers and lakes"* is cast into the 
-prompt template: "Task: <task> \n\n I will use the following". This sentence makes up the final lines of the 
-prompt the agent is conditioned on, therefore strongly influencing the agent to finish the example 
-exactly in the same way it was previously done in the examples.
-
-Without going into too much detail, the chat template has the same prompt structure with the 
-examples having a slightly different style, *e.g.*:
-
-````text
-[...]
-
-=====
-
-Human: Answer the question in the variable `question` about the image stored in the variable `image`.
-
-Assistant: I will use the tool `image_qa` to answer the question on the input image.
-
-```py
-answer = image_qa(text=question, image=image)
-print(f"The answer is {answer}")
-```
-
-Human: I tried this code, it worked but didn't give me a good result. The question is in French
-
-Assistant: In this case, the question needs to be translated first. I will use the tool `translator` to do this.
-
-```py
-translated_question = translator(question=question, src_lang="French", tgt_lang="English")
-print(f"The translated question is {translated_question}.")
-answer = image_qa(text=translated_question, image=image)
-print(f"The answer is {answer}")
-```
-
-=====
-
-[...]
-````
-
-Contrary, to the examples of the `run` prompt, each `chat` prompt example has one or more exchanges between the 
-*Human* and the *Assistant*. Every exchange is structured similarly to the example of the `run` prompt. 
-The user's input is appended to behind *Human:* and the agent is prompted to first generate what needs to be done 
-before generating code. An exchange can be based on previous exchanges, therefore allowing the user to refer
-to past exchanges as is done *e.g.* above by the user's input of "I tried **this** code" refers to the 
-previously generated code of the agent.
-
-Upon running `.chat`, the user's input or *task* is cast into an unfinished example of the form:
-```text
-Human: <user-input>\n\nAssistant:
-```
-which the agent completes. Contrary to the `run` command, the `chat` command then appends the completed example
-to the prompt, thus giving the agent more context for the next `chat` turn.
-
-Great now that we know how the prompt is structured, let's see how we can customize it!
-
-### Writing good user inputs
-
-While large language models are getting better and better at understanding users' intentions, it helps 
-enormously to be as precise as possible to help the agent pick the correct task. What does it mean to be 
-as precise as possible?
-
-The agent sees a list of tool names and their description in its prompt. The more tools are added the 
-more difficult it becomes for the agent to choose the correct tool and it's even more difficult to choose
-the correct sequences of tools to run. Let's look at a common failure case, here we will only return 
-the code to analyze it.
-
-```py
-from transformers import HfAgent
-
-agent = HfAgent("https://api-inference.huggingface.co/models/bigcode/starcoder")
-
-agent.run("Show me a tree", return_code=True)
-```
-
-gives:
-
-```text
-==Explanation from the agent==
-I will use the following tool: `image_segmenter` to create a segmentation mask for the image.
-
-
-==Code generated by the agent==
-mask = image_segmenter(image, prompt="tree")
-```
-
-which is probably not what we wanted. Instead, it is more likely that we want an image of a tree to be generated.
-To steer the agent more towards using a specific tool it can therefore be very helpful to use important keywords that 
-are present in the tool's name and description. Let's have a look.
-```py
-agent.toolbox["image_generator"].description
-```
-
-```text
-'This is a tool that creates an image according to a prompt, which is a text description. It takes an input named `prompt` which contains the image description and outputs an image.'
-```
-
-The name and description make use of the keywords "image", "prompt", "create" and "generate". Using these words will most likely work better here. Let's refine our prompt a bit.
-
-```py
-agent.run("Create an image of a tree", return_code=True)
-```
-
-gives:
-```text
-==Explanation from the agent==
-I will use the following tool `image_generator` to generate an image of a tree.
-
-
-==Code generated by the agent==
-image = image_generator(prompt="tree")
-```
-
-Much better! That looks more like what we want. In short, when you notice that the agent struggles to 
-correctly map your task to the correct tools, try looking up the most pertinent keywords of the tool's name
-and description and try refining your task request with it.
-
-### Customizing the tool descriptions
-
-As we've seen before the agent has access to each of the tools' names and descriptions. The base tools 
-should have very precise names and descriptions, however, you might find that it could help to change the 
-description or name of a tool for your specific use case. This might become especially important 
-when you've added multiple tools that are very similar or if you want to use your agent only for a certain 
-domain, *e.g.* image generation and transformations.
-
-A common problem is that the agent confuses image generation with image transformation/modification when 
-used a lot for image generation tasks, *e.g.*
-```py
-agent.run("Make an image of a house and a car", return_code=True)
-```
-returns
-```text
-==Explanation from the agent== 
-I will use the following tools `image_generator` to generate an image of a house and `image_transformer` to transform the image of a car into the image of a house.
-
-==Code generated by the agent==
-house_image = image_generator(prompt="A house")
-car_image = image_generator(prompt="A car")
-house_car_image = image_transformer(image=car_image, prompt="A house")
-```
-
-which is probably not exactly what we want here. It seems like the agent has a difficult time
-understanding the difference between `image_generator` and `image_transformer` and often uses the two together.
-
-We can help the agent here by changing the tool name and description of `image_transformer`. Let's instead call it `modifier`
-to disassociate it a bit from "image" and "prompt":
-```py
-agent.toolbox["modifier"] = agent.toolbox.pop("image_transformer")
-agent.toolbox["modifier"].description = agent.toolbox["modifier"].description.replace(
-    "transforms an image according to a prompt", "modifies an image"
-)
-```
-
-Now "modify" is a strong cue to use the new image processor which should help with the above prompt. Let's run it again.
-
-```py
-agent.run("Make an image of a house and a car", return_code=True)
-```
-
-Now we're getting:
-```text
-==Explanation from the agent==
-I will use the following tools: `image_generator` to generate an image of a house, then `image_generator` to generate an image of a car.
-
-
-==Code generated by the agent==
-house_image = image_generator(prompt="A house")
-car_image = image_generator(prompt="A car")
-```
-
-which is definitely closer to what we had in mind! However, we want to have both the house and car in the same image. Steering the task more toward single image generation should help:
-
-```py
-agent.run("Create image: 'A house and car'", return_code=True)
-```
-
-```text
-==Explanation from the agent==
-I will use the following tool: `image_generator` to generate an image.
-
-
-==Code generated by the agent==
-image = image_generator(prompt="A house and car")
-```
-
-<Tip warning={true}>
-
-Agents are still brittle for many use cases, especially when it comes to 
-slightly more complex use cases like generating an image of multiple objects.
-Both the agent itself and the underlying prompt will be further improved in the coming 
-months making sure that agents become more robust to a variety of user inputs.
-
-</Tip>
-
-### Customizing the whole prompt
-
-To give the user maximum flexibility, the whole prompt template as explained [above](#structure-of-the-prompt)
-can be overwritten by the user. In this case make sure that your custom prompt includes an introduction section, 
-a tool section, an example section, and an unfinished example section. If you want to overwrite the `run` prompt template, 
-you can do as follows:
-
-```py
-template = """ [...] """
-
-agent = HfAgent(your_endpoint, run_prompt_template=template)
-```
-
-<Tip warning={true}>
-
-Please make sure to have the `<<all_tools>>` string and the `<<prompt>>` defined somewhere in the `template` so that the agent can be aware 
-of the tools it has available to it as well as correctly insert the user's prompt.
-
-</Tip>
-
-Similarly, one can overwrite the `chat` prompt template. Note that the `chat` mode always uses the following format for the exchanges:
-```text
-Human: <<task>>
-
-Assistant:
-```
-
-Therefore it is important that the examples of the custom `chat` prompt template also make use of this format.
-You can overwrite the `chat` template at instantiation as follows.
-
-```
-template = """ [...] """
-
-agent = HfAgent(url_endpoint=your_endpoint, chat_prompt_template=template)
-```
-
-<Tip warning={true}>
-
-Please make sure to have the `<<all_tools>>` string defined somewhere in the `template` so that the agent can be aware 
-of the tools it has available to it.
-
-</Tip>
-
-## Using custom tools
-
-In this section, we'll be leveraging two existing custom tools that are specific to image generation:
-
-- We replace [huggingface-tools/image-transformation](https://huggingface.co/spaces/huggingface-tools/image-transformation),
-  with [diffusers/controlnet-canny-tool](https://huggingface.co/spaces/diffusers/controlnet-canny-tool)
-  to allow for more image modifications.
-- We add a new tool for image upscaling to the default toolbox:
-  [diffusers/latent-upscaler-tool](https://huggingface.co/spaces/diffusers/latent-upscaler-tool).
-
-We'll start by loading the custom tools with the convenient [`load_tool`] function:
-
-```py
-from transformers import load_tool
-
-controlnet_transformer = load_tool("diffusers/controlnet-canny-tool")
-upscaler = load_tool("diffusers/latent-upscaler-tool")
-```
-
-Upon adding custom tools to an agent, the tools' descriptions and names are automatically
-included in the agents' prompts. Thus, it is imperative that custom tools have
-a well-written description and name in order for the agent to understand how to use them.
-Let's take a look at the description and name of `controlnet_transformer`:
-
-```py
-print(f"Description: '{controlnet_transformer.description}'")
-print(f"Name: '{controlnet_transformer.name}'")
-```
-
-gives 
-```text
-Description: 'This is a tool that transforms an image with ControlNet according to a prompt. 
-It takes two inputs: `image`, which should be the image to transform, and `prompt`, which should be the prompt to use to change it. It returns the modified image.'
-Name: 'image_transformer'
-```
-
-The name and description are accurate and fit the style of the [curated set of tools](./transformers_agents#a-curated-set-of-tools).
-Next, let's instantiate an agent with `controlnet_transformer` and `upscaler`:
-
-```py
-tools = [controlnet_transformer, upscaler]
-agent = HfAgent("https://api-inference.huggingface.co/models/bigcode/starcoder", additional_tools=tools)
-```
-
-This command should give you the following info:
-
-```text
-image_transformer has been replaced by <transformers_modules.diffusers.controlnet-canny-tool.bd76182c7777eba9612fc03c0
-8718a60c0aa6312.image_transformation.ControlNetTransformationTool object at 0x7f1d3bfa3a00> as provided in `additional_tools`
-```
-
-The set of curated tools already has an `image_transformer` tool which is hereby replaced with our custom tool.
-
-<Tip>
-
-Overwriting existing tools can be beneficial if we want to use a custom tool exactly for the same task as an existing tool 
-because the agent is well-versed in using the specific task. Beware that the custom tool should follow the exact same API 
-as the overwritten tool in this case, or you should adapt the prompt template to make sure all examples using that
-tool are updated.
-
-</Tip>
-
-The upscaler tool was given the name `image_upscaler` which is not yet present in the default toolbox and is therefore simply added to the list of tools.
-You can always have a look at the toolbox that is currently available to the agent via the `agent.toolbox` attribute:
-
-```py
-print("\n".join([f"- {a}" for a in agent.toolbox.keys()]))
-```
-
-```text
-- document_qa
-- image_captioner
-- image_qa
-- image_segmenter
-- transcriber
-- summarizer
-- text_classifier
-- text_qa
-- text_reader
-- translator
-- image_transformer
-- text_downloader
-- image_generator
-- video_generator
-- image_upscaler
-```
-
-Note how `image_upscaler` is now part of the agents' toolbox.
-
-Let's now try out the new tools! We will re-use the image we generated in [Transformers Agents Quickstart](./transformers_agents#single-execution-run).
-
-```py
-from diffusers.utils import load_image
-
-image = load_image(
-    "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rivers_and_lakes.png"
-)
-```
-
-<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rivers_and_lakes.png" width=200> 
-
-Let's transform the image into a beautiful winter landscape:
-
-```py
-image = agent.run("Transform the image: 'A frozen lake and snowy forest'", image=image)
-```
-
-```text
-==Explanation from the agent==
-I will use the following tool: `image_transformer` to transform the image.
-
-
-==Code generated by the agent==
-image = image_transformer(image, prompt="A frozen lake and snowy forest")
-```
-
-<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rivers_and_lakes_winter.png" width=200> 
-
-The new image processing tool is based on ControlNet which can make very strong modifications to the image.
-By default the image processing tool returns an image of size 512x512 pixels. Let's see if we can upscale it.
-
-```py
-image = agent.run("Upscale the image", image)
-```
-
-```text
-==Explanation from the agent==
-I will use the following tool: `image_upscaler` to upscale the image.
-
-
-==Code generated by the agent==
-upscaled_image = image_upscaler(image)
-```
-
-<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rivers_and_lakes_winter_upscale.png" width=400> 
-
-The agent automatically mapped our prompt "Upscale the image" to the just added upscaler tool purely based on the description and name of the upscaler tool 
-and was able to correctly run it.
-
-Next, let's have a look at how you can create a new custom tool.
-
-### Adding new tools
-
-In this section, we show how to create a new tool that can be added to the agent.
-
-#### Creating a new tool
-
-We'll first start by creating a tool. We'll add the not-so-useful yet fun task of fetching the model on the Hugging Face
-Hub with the most downloads for a given task.
-
-We can do that with the following code:
-
-```python
-from huggingface_hub import list_models
-
-task = "text-classification"
-
-model = next(iter(list_models(filter=task, sort="downloads", direction=-1)))
-print(model.id)
-```
-
-For the task `text-classification`, this returns `'facebook/bart-large-mnli'`, for `translation` it returns `'t5-base'`.
-
-How do we convert this to a tool that the agent can leverage? All tools depend on the superclass `Tool` that holds the
-main attributes necessary. We'll create a class that inherits from it:
-
-```python
-from transformers import Tool
-
-
-class HFModelDownloadsTool(Tool):
-    pass
-```
-
-This class has a few needs:
-- An attribute `name`, which corresponds to the name of the tool itself. To be in tune with other tools which have a
-  performative name, we'll name it `model_download_counter`.
-- An attribute `description`, which will be used to populate the prompt of the agent.
-- `inputs` and `outputs` attributes. Defining this will help the python interpreter make educated choices about types,
-  and will allow for a gradio-demo to be spawned when we push our tool to the Hub. They're both a list of expected
-  values, which can be `text`, `image`, or `audio`.
-- A `__call__` method which contains the inference code. This is the code we've played with above!
-
-Here's what our class looks like now:
-
-```python
-from transformers import Tool
-from huggingface_hub import list_models
-
-
-class HFModelDownloadsTool(Tool):
-    name = "model_download_counter"
-    description = (
-        "This is a tool that returns the most downloaded model of a given task on the Hugging Face Hub. "
-        "It takes the name of the category (such as text-classification, depth-estimation, etc), and "
-        "returns the name of the checkpoint."
-    )
-
-    inputs = ["text"]
-    outputs = ["text"]
-
-    def __call__(self, task: str):
-        model = next(iter(list_models(filter=task, sort="downloads", direction=-1)))
-        return model.id
-```
-
-We now have our tool handy. Save it in a file and import it from your main script. Let's name this file
-`model_downloads.py`, so the resulting import code looks like this:
-
-```python
-from model_downloads import HFModelDownloadsTool
-
-tool = HFModelDownloadsTool()
-```
-
-In order to let others benefit from it and for simpler initialization, we recommend pushing it to the Hub under your 
-namespace. To do so, just call `push_to_hub` on the `tool` variable:
-
-```python
-tool.push_to_hub("hf-model-downloads")
-```
-
-You now have your code on the Hub! Let's take a look at the final step, which is to have the agent use it.
-
-#### Having the agent use the tool
-
-We now have our tool that lives on the Hub which can be instantiated as such (change the user name for your tool):
-
-```python
-from transformers import load_tool
-
-tool = load_tool("lysandre/hf-model-downloads")
-```
-
-In order to use it in the agent, simply pass it in the `additional_tools` parameter of the agent initialization method:
-
-```python
-from transformers import HfAgent
-
-agent = HfAgent("https://api-inference.huggingface.co/models/bigcode/starcoder", additional_tools=[tool])
-
-agent.run(
-    "Can you read out loud the name of the model that has the most downloads in the 'text-to-video' task on the Hugging Face Hub?"
-)
-```
-which outputs the following:
-```text
-==Code generated by the agent==
-model = model_download_counter(task="text-to-video")
-print(f"The model with the most downloads is {model}.")
-audio_model = text_reader(model)
-
-
-==Result==
-The model with the most downloads is damo-vilab/text-to-video-ms-1.7b.
-```
-
-and generates the following audio.
-
-| **Audio**                                                                                                                                            |
-|------------------------------------------------------------------------------------------------------------------------------------------------------|
-| <audio controls><source src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/damo.wav" type="audio/wav"/> |
-
-
-<Tip>
-
-Depending on the LLM, some are quite brittle and require very exact prompts in order to work well. Having a well-defined
-name and description of the tool is paramount to having it be leveraged by the agent.
-
-</Tip>
-
-### Replacing existing tools
-
-Replacing existing tools can be done simply by assigning a new item to the agent's toolbox. Here's how one would do so:
-
-```python
-from transformers import HfAgent, load_tool
-
-agent = HfAgent("https://api-inference.huggingface.co/models/bigcode/starcoder")
-agent.toolbox["image-transformation"] = load_tool("diffusers/controlnet-canny-tool")
-```
-
-<Tip>
-
-Beware when replacing tools with others! This will also adjust the agent's prompt. This can be good if you have a better
-prompt suited for the task, but it can also result in your tool being selected way more than others or for other
-tools to be selected instead of the one you have defined.
-
-</Tip>
-
-## Leveraging gradio-tools
-
-[gradio-tools](https://github.com/freddyaboulton/gradio-tools) is a powerful library that allows using Hugging
-Face Spaces as tools. It supports many existing Spaces as well as custom Spaces to be designed with it.
-
-We offer support for `gradio_tools` by using the `Tool.from_gradio` method. For example, we want to take
-advantage of the `StableDiffusionPromptGeneratorTool` tool offered in the `gradio-tools` toolkit so as to
-improve our prompts and generate better images.
-
-We first import the tool from `gradio_tools` and instantiate it:
-
-```python
-from gradio_tools import StableDiffusionPromptGeneratorTool
-
-gradio_tool = StableDiffusionPromptGeneratorTool()
-```
-
-We pass that instance to the `Tool.from_gradio` method:
-
-```python
-from transformers import Tool
-
-tool = Tool.from_gradio(gradio_tool)
-```
-
-Now we can manage it exactly as we would a usual custom tool. We leverage it to improve our prompt
-` a rabbit wearing a space suit`:
-
-```python
-from transformers import HfAgent
-
-agent = HfAgent("https://api-inference.huggingface.co/models/bigcode/starcoder", additional_tools=[tool])
-
-agent.run("Generate an image of the `prompt` after improving it.", prompt="A rabbit wearing a space suit")
-```
-
-The model adequately leverages the tool:
-```text
-==Explanation from the agent==
-I will use the following  tools: `StableDiffusionPromptGenerator` to improve the prompt, then `image_generator` to generate an image according to the improved prompt.
-
-
-==Code generated by the agent==
-improved_prompt = StableDiffusionPromptGenerator(prompt)
-print(f"The improved prompt is {improved_prompt}.")
-image = image_generator(improved_prompt)
-```
-
-Before finally generating the image:
-
-<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rabbit.png">
-
-<Tip warning={true}>
-
-gradio-tools requires *textual* inputs and outputs, even when working with different modalities. This implementation
-works with image and audio objects. The two are currently incompatible, but will rapidly become compatible as we
-work to improve the support.
-
-</Tip>
-
-## Future compatibility with Langchain
-
-We love Langchain and think it has a very compelling suite of tools. In order to handle these tools,
-Langchain requires *textual* inputs and outputs, even when working with different modalities.
-This is often the serialized version (i.e., saved to disk) of the objects.
-
-This difference means that multi-modality isn't handled between transformers-agents and langchain.
-We aim for this limitation to be resolved in future versions, and welcome any help from avid langchain
-users to help us achieve this compatibility.
-
-We would love to have better support. If you would like to help, please 
-[open an issue](https://github.com/huggingface/transformers/issues/new) and share what you have in mind.
--- a/docs/source/en/generation_strategies.mdx
+++ b/docs/source/en/generation_strategies.mdx
@ -139,35 +139,6 @@ one for summarization with beam search). You must have the right Hub permissions
 ['Les fichiers de configuration sont faciles à utiliser !']
 ```

-## Streaming
-
-The `generate()` method supports streaming, through its `streamer` input. The `streamer` input is compatible with any instance
-from a class that has the following methods: `put()` and `end()`. Internally, `put()` is used to push new tokens and
-`end()` is used to flag the end of text generation.
-
-<Tip warning={true}>
-
-The API for the streamer classes is still under development and may change in the future.
-
-</Tip>
-
-In practice, you can craft your own streaming class for all sorts of purposes! We also have basic streaming classes
-ready for you to use. For example, you can use the [`TextStreamer`] class to stream the output of `generate()` into
-your screen, one word at a time:
-
-```python
->>> from transformers import AutoModelForCausalLM, AutoTokenizer, TextStreamer
-
->>> tok = AutoTokenizer.from_pretrained("gpt2")
->>> model = AutoModelForCausalLM.from_pretrained("gpt2")
->>> inputs = tok(["An increasing sequence: one,"], return_tensors="pt")
->>> streamer = TextStreamer(tok)
-
->>> # Despite returning the usual output, the streamer will also print the generated text to stdout.
->>> _ = model.generate(**inputs, streamer=streamer, max_new_tokens=20)
-An increasing sequence: one, two, three, four, five, six, seven, eight, nine, ten, eleven,
-```
-
 ## Decoding strategies

 Certain combinations of the `generate()` parameters, and ultimately `generation_config`, can be used to enable specific
@ -222,11 +193,11 @@ We pride ourselves on being the best in the business and our customer service is
 ### Multinomial sampling

 As opposed to greedy search that always chooses a token with the highest probability as the
-next token, multinomial sampling (also called ancestral sampling) randomly selects the next token based on the probability distribution over the entire
+next token, multinomial sampling randomly selects the next token based on the probability distribution over the entire
 vocabulary given by the model. Every token with a non-zero probability has a chance of being selected, thus reducing the
 risk of repetition.

-To enable multinomial sampling set `do_sample=True` and `num_beams=1`.
+To enable multinomial sampling set `do_sample=True`.

 ```python
 >>> from transformers import AutoTokenizer, AutoModelForCausalLM
@ -238,7 +209,7 @@ To enable multinomial sampling set `do_sample=True` and `num_beams=1`.
 >>> prompt = "Today was an amazing day because"
 >>> inputs = tokenizer(prompt, return_tensors="pt")

->>> outputs = model.generate(**inputs, do_sample=True, num_beams=1, max_new_tokens=100)
+>>> outputs = model.generate(**inputs, do_sample=True, max_new_tokens=100)
 >>> tokenizer.batch_decode(outputs, skip_special_tokens=True)
 ['Today was an amazing day because we are now in the final stages of our trip to New York City which was very tough. \
 It is a difficult schedule and a challenging part of the year but still worth it. I have been taking things easier and \
@ -332,53 +303,3 @@ The groups are selected to ensure they are distinct enough compared to the other
 This guide illustrates the main parameters that enable various decoding strategies. More advanced parameters exist for the
 [`generate`] method, which gives you even further control over the [`generate`] method's behavior.
 For the complete list of the available parameters, refer to the [API documentation](./main_classes/text_generation.mdx).
-
-### Assisted Decoding
-
-Assisted decoding is a modification of the decoding strategies above that uses an assistant model with the same
-tokenizer (ideally a much smaller model) to greedily generate a few candidate tokens. The main model then validates
-the candidate tokens in a single forward pass, which speeds up the decoding process. Currently, only greedy search
-and sampling are supported with assisted decoding, and batched inputs are not supported.
-
-<!-- TODO: add link to the blog post about assisted decoding when it exists -->
-
-To enable assisted decoding, set the `assistant_model` argument with a model.
-
-```python
->>> from transformers import AutoModelForCausalLM, AutoTokenizer
-
->>> prompt = "Alice and Bob"
->>> checkpoint = "EleutherAI/pythia-1.4b-deduped"
->>> assistant_checkpoint = "EleutherAI/pythia-160m-deduped"
-
->>> tokenizer = AutoTokenizer.from_pretrained(checkpoint)
->>> inputs = tokenizer(prompt, return_tensors="pt")
-
->>> model = AutoModelForCausalLM.from_pretrained(checkpoint)
->>> assistant_model = AutoModelForCausalLM.from_pretrained(assistant_checkpoint)
->>> outputs = model.generate(**inputs, assistant_model=assistant_model)
->>> tokenizer.batch_decode(outputs, skip_special_tokens=True)
-['Alice and Bob are sitting in a bar. Alice is drinking a beer and Bob is drinking a']
-```
-
-When using assisted decoding with sampling methods, you can use the `temperature` argument to control the randomness
-just like in multinomial sampling. However, in assisted decoding, reducing the temperature will help improve latency.
-
-<!-- TODO: link the blog post again to explain why the tradeoff exists -->
-
-```python
->>> from transformers import AutoModelForCausalLM, AutoTokenizer
-
->>> prompt = "Alice and Bob"
->>> checkpoint = "EleutherAI/pythia-1.4b-deduped"
->>> assistant_checkpoint = "EleutherAI/pythia-160m-deduped"
-
->>> tokenizer = AutoTokenizer.from_pretrained(checkpoint)
->>> inputs = tokenizer(prompt, return_tensors="pt")
-
->>> model = AutoModelForCausalLM.from_pretrained(checkpoint)
->>> assistant_model = AutoModelForCausalLM.from_pretrained(assistant_checkpoint)
->>> outputs = model.generate(**inputs, assistant_model=assistant_model, do_sample=True, temperature=0.5)
->>> tokenizer.batch_decode(outputs, skip_special_tokens=True)
-["Alice and Bob are sitting on the sofa. Alice says, 'I'm going to my room"]
-```
--- a/docs/source/en/index.mdx
+++ b/docs/source/en/index.mdx
@ -75,7 +75,7 @@ The documentation is organized into five sections:
 1. **[CamemBERT](model_doc/camembert)** (from Inria/Facebook/Sorbonne) released with the paper [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) by Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot.
 1. **[CANINE](model_doc/canine)** (from Google Research) released with the paper [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) by Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting.
 1. **[Chinese-CLIP](model_doc/chinese_clip)** (from OFA-Sys) released with the paper [Chinese CLIP: Contrastive Vision-Language Pretraining in Chinese](https://arxiv.org/abs/2211.01335) by An Yang, Junshu Pan, Junyang Lin, Rui Men, Yichang Zhang, Jingren Zhou, Chang Zhou.
-1. **[CLAP](model_doc/clap)** (from LAION-AI) released with the paper [Large-scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation](https://arxiv.org/abs/2211.06687) by Yusong Wu, Ke Chen, Tianyu Zhang, Yuchen Hui, Taylor Berg-Kirkpatrick, Shlomo Dubnov.
+1. **[CLAP](model_doc/clap)** (from LAION-AI) released with the paper [Large-scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation](https://arxiv.org/abs/2211.06687) by Yusong Wu, Ke Chen, Tianyu Zhang, Yuchen Hui, Taylor Berg-Kirkpatrick, Shlomo Dubnov.
 1. **[CLIP](model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever.
 1. **[CLIPSeg](model_doc/clipseg)** (from University of Göttingen) released with the paper [Image Segmentation Using Text and Image Prompts](https://arxiv.org/abs/2112.10003) by Timo Lüddecke and Alexander Ecker.
 1. **[CodeGen](model_doc/codegen)** (from Salesforce) released with the paper [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) by Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong.
@ -84,7 +84,6 @@ The documentation is organized into five sections:
 1. **[ConvNeXT](model_doc/convnext)** (from Facebook AI) released with the paper [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) by Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie.
 1. **[ConvNeXTV2](model_doc/convnextv2)** (from Facebook AI) released with the paper [ConvNeXt V2: Co-designing and Scaling ConvNets with Masked Autoencoders](https://arxiv.org/abs/2301.00808) by Sanghyun Woo, Shoubhik Debnath, Ronghang Hu, Xinlei Chen, Zhuang Liu, In So Kweon, Saining Xie.
 1. **[CPM](model_doc/cpm)** (from Tsinghua University) released with the paper [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) by Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun.
-1. **[CPM-Ant](model_doc/cpmant)** (from OpenBMB) released by the [OpenBMB](https://www.openbmb.org/).
 1. **[CTRL](model_doc/ctrl)** (from Salesforce) released with the paper [CTRL: A Conditional Transformer Language Model for Controllable Generation](https://arxiv.org/abs/1909.05858) by Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher.
 1. **[CvT](model_doc/cvt)** (from Microsoft) released with the paper [CvT: Introducing Convolutions to Vision Transformers](https://arxiv.org/abs/2103.15808) by Haiping Wu, Bin Xiao, Noel Codella, Mengchen Liu, Xiyang Dai, Lu Yuan, Lei Zhang.
 1. **[Data2Vec](model_doc/data2vec)** (from Facebook) released with the paper [Data2Vec:  A General Framework for Self-supervised Learning in Speech, Vision and Language](https://arxiv.org/abs/2202.03555) by Alexei Baevski, Wei-Ning Hsu, Qiantong Xu, Arun Babu, Jiatao Gu, Michael Auli.
@ -93,7 +92,6 @@ The documentation is organized into five sections:
 1. **[Decision Transformer](model_doc/decision_transformer)** (from Berkeley/Facebook/Google) released with the paper [Decision Transformer: Reinforcement Learning via Sequence Modeling](https://arxiv.org/abs/2106.01345) by Lili Chen, Kevin Lu, Aravind Rajeswaran, Kimin Lee, Aditya Grover, Michael Laskin, Pieter Abbeel, Aravind Srinivas, Igor Mordatch.
 1. **[Deformable DETR](model_doc/deformable_detr)** (from SenseTime Research) released with the paper [Deformable DETR: Deformable Transformers for End-to-End Object Detection](https://arxiv.org/abs/2010.04159) by Xizhou Zhu, Weijie Su, Lewei Lu, Bin Li, Xiaogang Wang, Jifeng Dai.
 1. **[DeiT](model_doc/deit)** (from Facebook) released with the paper [Training data-efficient image transformers & distillation through attention](https://arxiv.org/abs/2012.12877) by Hugo Touvron, Matthieu Cord, Matthijs Douze, Francisco Massa, Alexandre Sablayrolles, Hervé Jégou.
-1. **[DePlot](model_doc/deplot)** (from Google AI) released with the paper [DePlot: One-shot visual language reasoning by plot-to-table translation](https://arxiv.org/abs/2212.10505) by Fangyu Liu, Julian Martin Eisenschlos, Francesco Piccinno, Syrine Krichene, Chenxi Pang, Kenton Lee, Mandar Joshi, Wenhu Chen, Nigel Collier, Yasemin Altun.
 1. **[DETA](model_doc/deta)** (from The University of Texas at Austin) released with the paper [NMS Strikes Back](https://arxiv.org/abs/2212.06137) by Jeffrey Ouyang-Zhang, Jang Hyun Cho, Xingyi Zhou, Philipp Krähenbühl.
 1. **[DETR](model_doc/detr)** (from Facebook) released with the paper [End-to-End Object Detection with Transformers](https://arxiv.org/abs/2005.12872) by Nicolas Carion, Francisco Massa, Gabriel Synnaeve, Nicolas Usunier, Alexander Kirillov, Sergey Zagoruyko.
 1. **[DialoGPT](model_doc/dialogpt)** (from Microsoft Research) released with the paper [DialoGPT: Large-Scale Generative Pre-training for Conversational Response Generation](https://arxiv.org/abs/1911.00536) by Yizhe Zhang, Siqi Sun, Michel Galley, Yen-Chun Chen, Chris Brockett, Xiang Gao, Jianfeng Gao, Jingjing Liu, Bill Dolan.
@ -115,7 +113,6 @@ The documentation is organized into five sections:
 1. **[FlauBERT](model_doc/flaubert)** (from CNRS) released with the paper [FlauBERT: Unsupervised Language Model Pre-training for French](https://arxiv.org/abs/1912.05372) by Hang Le, Loïc Vial, Jibril Frej, Vincent Segonne, Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, Benoît Crabbé, Laurent Besacier, Didier Schwab.
 1. **[FLAVA](model_doc/flava)** (from Facebook AI) released with the paper [FLAVA: A Foundational Language And Vision Alignment Model](https://arxiv.org/abs/2112.04482) by Amanpreet Singh, Ronghang Hu, Vedanuj Goswami, Guillaume Couairon, Wojciech Galuba, Marcus Rohrbach, and Douwe Kiela.
 1. **[FNet](model_doc/fnet)** (from Google Research) released with the paper [FNet: Mixing Tokens with Fourier Transforms](https://arxiv.org/abs/2105.03824) by James Lee-Thorp, Joshua Ainslie, Ilya Eckstein, Santiago Ontanon.
-1. **[FocalNet](model_doc/focalnet)** (from Microsoft Research) released with the paper [Focal Modulation Networks](https://arxiv.org/abs/2203.11926) by Jianwei Yang, Chunyuan Li, Xiyang Dai, Lu Yuan, Jianfeng Gao.
 1. **[Funnel Transformer](model_doc/funnel)** (from CMU/Google Brain) released with the paper [Funnel-Transformer: Filtering out Sequential Redundancy for Efficient Language Processing](https://arxiv.org/abs/2006.03236) by Zihang Dai, Guokun Lai, Yiming Yang, Quoc V. Le.
 1. **[GIT](model_doc/git)** (from Microsoft Research) released with the paper [GIT: A Generative Image-to-text Transformer for Vision and Language](https://arxiv.org/abs/2205.14100) by Jianfeng Wang, Zhengyuan Yang, Xiaowei Hu, Linjie Li, Kevin Lin, Zhe Gan, Zicheng Liu, Ce Liu, Lijuan Wang.
 1. **[GLPN](model_doc/glpn)** (from KAIST) released with the paper [Global-Local Path Networks for Monocular Depth Estimation with Vertical CutDepth](https://arxiv.org/abs/2201.07436) by Doyeon Kim, Woonghyun Ga, Pyungwhan Ahn, Donggyu Joo, Sehwan Chun, Junmo Kim.
@ -126,7 +123,6 @@ The documentation is organized into five sections:
 1. **[GPT-2](model_doc/gpt2)** (from OpenAI) released with the paper [Language Models are Unsupervised Multitask Learners](https://blog.openai.com/better-language-models/) by Alec Radford*, Jeffrey Wu*, Rewon Child, David Luan, Dario Amodei** and Ilya Sutskever**.
 1. **[GPT-J](model_doc/gptj)** (from EleutherAI) released in the repository [kingoflolz/mesh-transformer-jax](https://github.com/kingoflolz/mesh-transformer-jax/) by Ben Wang and Aran Komatsuzaki.
 1. **[GPT-Sw3](model_doc/gpt-sw3)** (from AI-Sweden) released with the paper [Lessons Learned from GPT-SW3: Building the First Large-Scale Generative Language Model for Swedish](http://www.lrec-conf.org/proceedings/lrec2022/pdf/2022.lrec-1.376.pdf) by Ariel Ekgren, Amaru Cuba Gyllensten, Evangelia Gogoulou, Alice Heiman, Severine Verlinden, Joey Öhman, Fredrik Carlsson, Magnus Sahlgren.
-1. **[GPTBigCode](model_doc/gpt_bigcode)** (from BigCode) released with the paper [SantaCoder: don't reach for the stars!](https://arxiv.org/abs/2301.03988) by Loubna Ben Allal, Raymond Li, Denis Kocetkov, Chenghao Mou, Christopher Akiki, Carlos Munoz Ferrandis, Niklas Muennighoff, Mayank Mishra, Alex Gu, Manan Dey, Logesh Kumar Umapathi, Carolyn Jane Anderson, Yangtian Zi, Joel Lamy Poirier, Hailey Schoelkopf, Sergey Troshin, Dmitry Abulkhanov, Manuel Romero, Michael Lappert, Francesco De Toni, Bernardo García del Río, Qian Liu, Shamik Bose, Urvashi Bhattacharyya, Terry Yue Zhuo, Ian Yu, Paulo Villegas, Marco Zocca, Sourab Mangrulkar, David Lansky, Huu Nguyen, Danish Contractor, Luis Villa, Jia Li, Dzmitry Bahdanau, Yacine Jernite, Sean Hughes, Daniel Fried, Arjun Guha, Harm de Vries, Leandro von Werra.
 1. **[GPTSAN-japanese](model_doc/gptsan-japanese)** released in the repository [tanreinama/GPTSAN](https://github.com/tanreinama/GPTSAN/blob/main/report/model.md) by Toshiyuki Sakamoto(tanreinama).
 1. **[Graphormer](model_doc/graphormer)** (from Microsoft) released with the paper [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234) by Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu.
 1. **[GroupViT](model_doc/groupvit)** (from UCSD, NVIDIA) released with the paper [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094) by Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang.
@ -153,7 +149,6 @@ The documentation is organized into five sections:
 1. **[MarkupLM](model_doc/markuplm)** (from Microsoft Research Asia) released with the paper [MarkupLM: Pre-training of Text and Markup Language for Visually-rich Document Understanding](https://arxiv.org/abs/2110.08518) by Junlong Li, Yiheng Xu, Lei Cui, Furu Wei.
 1. **[Mask2Former](model_doc/mask2former)** (from FAIR and UIUC) released with the paper [Masked-attention Mask Transformer for Universal Image Segmentation](https://arxiv.org/abs/2112.01527) by Bowen Cheng, Ishan Misra, Alexander G. Schwing, Alexander Kirillov, Rohit Girdhar.
 1. **[MaskFormer](model_doc/maskformer)** (from Meta and UIUC) released with the paper [Per-Pixel Classification is Not All You Need for Semantic Segmentation](https://arxiv.org/abs/2107.06278) by Bowen Cheng, Alexander G. Schwing, Alexander Kirillov.
-1. **[MatCha](model_doc/matcha)** (from Google AI) released with the paper [MatCha: Enhancing Visual Language Pretraining with Math Reasoning and Chart Derendering](https://arxiv.org/abs/2212.09662) by Fangyu Liu, Francesco Piccinno, Syrine Krichene, Chenxi Pang, Kenton Lee, Mandar Joshi, Yasemin Altun, Nigel Collier, Julian Martin Eisenschlos.
 1. **[mBART](model_doc/mbart)** (from Facebook) released with the paper [Multilingual Denoising Pre-training for Neural Machine Translation](https://arxiv.org/abs/2001.08210) by Yinhan Liu, Jiatao Gu, Naman Goyal, Xian Li, Sergey Edunov, Marjan Ghazvininejad, Mike Lewis, Luke Zettlemoyer.
 1. **[mBART-50](model_doc/mbart)** (from Facebook) released with the paper [Multilingual Translation with Extensible Multilingual Pretraining and Finetuning](https://arxiv.org/abs/2008.00401) by Yuqing Tang, Chau Tran, Xian Li, Peng-Jen Chen, Naman Goyal, Vishrav Chaudhary, Jiatao Gu, Angela Fan.
 1. **[MEGA](model_doc/mega)** (from Facebook) released with the paper [Mega: Moving Average Equipped Gated Attention](https://arxiv.org/abs/2209.10655) by Xuezhe Ma, Chunting Zhou, Xiang Kong, Junxian He, Liangke Gui, Graham Neubig, Jonathan May, and Luke Zettlemoyer.
@ -171,10 +166,8 @@ The documentation is organized into five sections:
 1. **[NAT](model_doc/nat)** (from SHI Labs) released with the paper [Neighborhood Attention Transformer](https://arxiv.org/abs/2204.07143) by Ali Hassani, Steven Walton, Jiachen Li, Shen Li, and Humphrey Shi.
 1. **[Nezha](model_doc/nezha)** (from Huawei Noah’s Ark Lab) released with the paper [NEZHA: Neural Contextualized Representation for Chinese Language Understanding](https://arxiv.org/abs/1909.00204) by Junqiu Wei, Xiaozhe Ren, Xiaoguang Li, Wenyong Huang, Yi Liao, Yasheng Wang, Jiashu Lin, Xin Jiang, Xiao Chen and Qun Liu.
 1. **[NLLB](model_doc/nllb)** (from Meta) released with the paper [No Language Left Behind: Scaling Human-Centered Machine Translation](https://arxiv.org/abs/2207.04672) by the NLLB team.
-1. **[NLLB-MOE](model_doc/nllb-moe)** (from Meta) released with the paper [No Language Left Behind: Scaling Human-Centered Machine Translation](https://arxiv.org/abs/2207.04672) by the NLLB team.
 1. **[Nyströmformer](model_doc/nystromformer)** (from the University of Wisconsin - Madison) released with the paper [Nyströmformer: A Nyström-Based Algorithm for Approximating Self-Attention](https://arxiv.org/abs/2102.03902) by Yunyang Xiong, Zhanpeng Zeng, Rudrasis Chakraborty, Mingxing Tan, Glenn Fung, Yin Li, Vikas Singh.
 1. **[OneFormer](model_doc/oneformer)** (from SHI Labs) released with the paper [OneFormer: One Transformer to Rule Universal Image Segmentation](https://arxiv.org/abs/2211.06220) by Jitesh Jain, Jiachen Li, MangTik Chiu, Ali Hassani, Nikita Orlov, Humphrey Shi.
-1. **[OpenLlama](model_doc/open-llama)** (from [s-JoL](https://huggingface.co/s-JoL)) released in [Open-Llama](https://github.com/s-JoL/Open-Llama). 
 1. **[OPT](master/model_doc/opt)** (from Meta AI) released with the paper [OPT: Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068) by Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen et al.
 1. **[OWL-ViT](model_doc/owlvit)** (from Google AI) released with the paper [Simple Open-Vocabulary Object Detection with Vision Transformers](https://arxiv.org/abs/2205.06230) by Matthias Minderer, Alexey Gritsenko, Austin Stone, Maxim Neumann, Dirk Weissenborn, Alexey Dosovitskiy, Aravindh Mahendran, Anurag Arnab, Mostafa Dehghani, Zhuoran Shen, Xiao Wang, Xiaohua Zhai, Thomas Kipf, and Neil Houlsby.
 1. **[Pegasus](model_doc/pegasus)** (from Google) released with the paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu.
@ -196,9 +189,7 @@ The documentation is organized into five sections:
 1. **[RoBERTa-PreLayerNorm](model_doc/roberta-prelayernorm)** (from Facebook) released with the paper [fairseq: A Fast, Extensible Toolkit for Sequence Modeling](https://arxiv.org/abs/1904.01038) by Myle Ott, Sergey Edunov, Alexei Baevski, Angela Fan, Sam Gross, Nathan Ng, David Grangier, Michael Auli.
 1. **[RoCBert](model_doc/roc_bert)** (from WeChatAI) released with the paper [RoCBert: Robust Chinese Bert with Multimodal Contrastive Pretraining](https://aclanthology.org/2022.acl-long.65.pdf) by Hui Su, Weiwei Shi, Xiaoyu Shen, Xiao Zhou, Tuo Ji, Jiarui Fang, Jie Zhou.
 1. **[RoFormer](model_doc/roformer)** (from ZhuiyiTechnology), released together with the paper [RoFormer: Enhanced Transformer with Rotary Position Embedding](https://arxiv.org/abs/2104.09864) by Jianlin Su and Yu Lu and Shengfeng Pan and Bo Wen and Yunfeng Liu.
-1. **[RWKV](model_doc/rwkv)** (from Bo Peng), released on [this repo](https://github.com/BlinkDL/RWKV-LM) by Bo Peng.
 1. **[SegFormer](model_doc/segformer)** (from NVIDIA) released with the paper [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://arxiv.org/abs/2105.15203) by Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping Luo.
-1. **[Segment Anything](model_doc/sam)** (from Meta AI) released with the paper [Segment Anything](https://arxiv.org/pdf/2304.02643v1.pdf) by Alexander Kirillov, Eric Mintun, Nikhila Ravi, Hanzi Mao, Chloe Rolland, Laura Gustafson, Tete Xiao, Spencer Whitehead, Alex Berg, Wan-Yen Lo, Piotr Dollar, Ross Girshick.
 1. **[SEW](model_doc/sew)** (from ASAPP) released with the paper [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) by Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi.
 1. **[SEW-D](model_doc/sew_d)** (from ASAPP) released with the paper [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) by Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi.
 1. **[SpeechT5](model_doc/speecht5)** (from Microsoft Research) released with the paper [SpeechT5: Unified-Modal Encoder-Decoder Pre-Training for Spoken Language Processing](https://arxiv.org/abs/2110.07205) by Junyi Ao, Rui Wang, Long Zhou, Chengyi Wang, Shuo Ren, Yu Wu, Shujie Liu, Tom Ko, Qing Li, Yu Zhang, Zhihua Wei, Yao Qian, Jinyu Li, Furu Wei.
@ -206,7 +197,6 @@ The documentation is organized into five sections:
 1. **[SpeechToTextTransformer2](model_doc/speech_to_text_2)** (from Facebook), released together with the paper [Large-Scale Self- and Semi-Supervised Learning for Speech Translation](https://arxiv.org/abs/2104.06678) by Changhan Wang, Anne Wu, Juan Pino, Alexei Baevski, Michael Auli, Alexis Conneau.
 1. **[Splinter](model_doc/splinter)** (from Tel Aviv University), released together with the paper [Few-Shot Question Answering by Pretraining Span Selection](https://arxiv.org/abs/2101.00438) by Ori Ram, Yuval Kirstain, Jonathan Berant, Amir Globerson, Omer Levy.
 1. **[SqueezeBERT](model_doc/squeezebert)** (from Berkeley) released with the paper [SqueezeBERT: What can computer vision teach NLP about efficient neural networks?](https://arxiv.org/abs/2006.11316) by Forrest N. Iandola, Albert E. Shaw, Ravi Krishna, and Kurt W. Keutzer.
-1. **[SwiftFormer](model_doc/swiftformer)** (from MBZUAI) released with the paper [SwiftFormer: Efficient Additive Attention for Transformer-based Real-time Mobile Vision Applications](https://arxiv.org/abs/2303.15446) by Abdelrahman Shaker, Muhammad Maaz, Hanoona Rasheed, Salman Khan, Ming-Hsuan Yang, Fahad Shahbaz Khan.
 1. **[Swin Transformer](model_doc/swin)** (from Microsoft) released with the paper [Swin Transformer: Hierarchical Vision Transformer using Shifted Windows](https://arxiv.org/abs/2103.14030) by Ze Liu, Yutong Lin, Yue Cao, Han Hu, Yixuan Wei, Zheng Zhang, Stephen Lin, Baining Guo.
 1. **[Swin Transformer V2](model_doc/swinv2)** (from Microsoft) released with the paper [Swin Transformer V2: Scaling Up Capacity and Resolution](https://arxiv.org/abs/2111.09883) by Ze Liu, Han Hu, Yutong Lin, Zhuliang Yao, Zhenda Xie, Yixuan Wei, Jia Ning, Yue Cao, Zheng Zhang, Li Dong, Furu Wei, Baining Guo.
 1. **[Swin2SR](model_doc/swin2sr)** (from University of Würzburg) released with the paper [Swin2SR: SwinV2 Transformer for Compressed Image Super-Resolution and Restoration](https://arxiv.org/abs/2209.11345) by Marcos V. Conde, Ui-Jin Choi, Maxime Burchi, Radu Timofte.
@ -278,7 +268,7 @@ Flax), PyTorch, and/or TensorFlow.
 |              BiT              |       ❌       |       ❌       |       ✅        |         ❌         |      ❌      |
 |          Blenderbot           |       ✅       |       ✅       |       ✅        |         ✅         |      ✅      |
 |        BlenderbotSmall        |       ✅       |       ✅       |       ✅        |         ✅         |      ✅      |
-|             BLIP              |       ❌       |       ❌       |       ✅        |         ✅         |      ❌      |
+|             BLIP              |       ❌       |       ❌       |       ✅        |         ❌         |      ❌      |
 |            BLIP-2             |       ❌       |       ❌       |       ✅        |         ❌         |      ❌      |
 |             BLOOM             |       ❌       |       ✅       |       ✅        |         ❌         |      ❌      |
 |          BridgeTower          |       ❌       |       ❌       |       ✅        |         ❌         |      ❌      |
@ -293,7 +283,6 @@ Flax), PyTorch, and/or TensorFlow.
 |           ConvBERT            |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
 |           ConvNeXT            |       ❌       |       ❌       |       ✅        |         ✅         |      ❌      |
 |          ConvNeXTV2           |       ❌       |       ❌       |       ✅        |         ❌         |      ❌      |
-|            CPM-Ant            |       ✅       |       ❌       |       ✅        |         ❌         |      ❌      |
 |             CTRL              |       ✅       |       ❌       |       ✅        |         ✅         |      ❌      |
 |              CvT              |       ❌       |       ❌       |       ✅        |         ✅         |      ❌      |
 |         Data2VecAudio         |       ❌       |       ❌       |       ✅        |         ❌         |      ❌      |
@ -322,16 +311,14 @@ Flax), PyTorch, and/or TensorFlow.
 |           FlauBERT            |       ✅       |       ❌       |       ✅        |         ✅         |      ❌      |
 |             FLAVA             |       ❌       |       ❌       |       ✅        |         ❌         |      ❌      |
 |             FNet              |       ✅       |       ✅       |       ✅        |         ❌         |      ❌      |
-|           FocalNet            |       ❌       |       ❌       |       ✅        |         ❌         |      ❌      |
 |      Funnel Transformer       |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
 |              GIT              |       ❌       |       ❌       |       ✅        |         ❌         |      ❌      |
 |             GLPN              |       ❌       |       ❌       |       ✅        |         ❌         |      ❌      |
-|            GPT Neo            |       ❌       |       ❌       |       ✅        |         ❌         |      ✅      |
+|            GPT Neo            |       ❌       |       ❌       |       ✅        |         ✅         |      ✅      |
 |           GPT NeoX            |       ❌       |       ✅       |       ✅        |         ❌         |      ❌      |
 |       GPT NeoX Japanese       |       ✅       |       ❌       |       ✅        |         ❌         |      ❌      |
 |             GPT-J             |       ❌       |       ❌       |       ✅        |         ✅         |      ✅      |
 |            GPT-Sw3            |       ✅       |       ✅       |       ✅        |         ✅         |      ✅      |
-|          GPTBigCode           |       ❌       |       ❌       |       ✅        |         ❌         |      ❌      |
 |        GPTSAN-japanese        |       ✅       |       ❌       |       ✅        |         ❌         |      ❌      |
 |          Graphormer           |       ❌       |       ❌       |       ✅        |         ❌         |      ❌      |
 |           GroupViT            |       ❌       |       ❌       |       ✅        |         ✅         |      ❌      |
@ -346,7 +333,7 @@ Flax), PyTorch, and/or TensorFlow.
 |              LED              |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
 |             LeViT             |       ❌       |       ❌       |       ✅        |         ❌         |      ❌      |
 |             LiLT              |       ❌       |       ❌       |       ✅        |         ❌         |      ❌      |
-|             LLaMA             |       ✅       |       ✅       |       ✅        |         ❌         |      ❌      |
+|             LLaMA             |       ✅       |       ❌       |       ✅        |         ❌         |      ❌      |
 |          Longformer           |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
 |            LongT5             |       ❌       |       ❌       |       ✅        |         ❌         |      ✅      |
 |             LUKE              |       ✅       |       ❌       |       ✅        |         ❌         |      ❌      |
@ -371,12 +358,10 @@ Flax), PyTorch, and/or TensorFlow.
 |              MVP              |       ✅       |       ✅       |       ✅        |         ❌         |      ❌      |
 |              NAT              |       ❌       |       ❌       |       ✅        |         ❌         |      ❌      |
 |             Nezha             |       ❌       |       ❌       |       ✅        |         ❌         |      ❌      |
-|           NLLB-MOE            |       ❌       |       ❌       |       ✅        |         ❌         |      ❌      |
 |         Nyströmformer         |       ❌       |       ❌       |       ✅        |         ❌         |      ❌      |
 |           OneFormer           |       ❌       |       ❌       |       ✅        |         ❌         |      ❌      |
 |          OpenAI GPT           |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
 |         OpenAI GPT-2          |       ✅       |       ✅       |       ✅        |         ✅         |      ✅      |
-|           OpenLlama           |       ❌       |       ❌       |       ✅        |         ❌         |      ❌      |
 |              OPT              |       ❌       |       ❌       |       ✅        |         ✅         |      ✅      |
 |            OWL-ViT            |       ❌       |       ❌       |       ✅        |         ❌         |      ❌      |
 |            Pegasus            |       ✅       |       ✅       |       ✅        |         ✅         |      ✅      |
@ -390,16 +375,14 @@ Flax), PyTorch, and/or TensorFlow.
 |              RAG              |       ✅       |       ❌       |       ✅        |         ✅         |      ❌      |
 |             REALM             |       ✅       |       ✅       |       ✅        |         ❌         |      ❌      |
 |           Reformer            |       ✅       |       ✅       |       ✅        |         ❌         |      ❌      |
-|            RegNet             |       ❌       |       ❌       |       ✅        |         ✅         |      ✅      |
+|            RegNet             |       ❌       |       ❌       |       ✅        |         ✅         |      ❌      |
 |            RemBERT            |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
-|            ResNet             |       ❌       |       ❌       |       ✅        |         ✅         |      ✅      |
+|            ResNet             |       ❌       |       ❌       |       ✅        |         ✅         |      ❌      |
 |           RetriBERT           |       ✅       |       ✅       |       ✅        |         ❌         |      ❌      |
 |            RoBERTa            |       ✅       |       ✅       |       ✅        |         ✅         |      ✅      |
 |     RoBERTa-PreLayerNorm      |       ❌       |       ❌       |       ✅        |         ✅         |      ✅      |
 |            RoCBert            |       ✅       |       ❌       |       ✅        |         ❌         |      ❌      |
 |           RoFormer            |       ✅       |       ✅       |       ✅        |         ✅         |      ✅      |
-|             RWKV              |       ❌       |       ❌       |       ✅        |         ❌         |      ❌      |
-|              SAM              |       ❌       |       ❌       |       ✅        |         ❌         |      ❌      |
 |           SegFormer           |       ❌       |       ❌       |       ✅        |         ✅         |      ❌      |
 |              SEW              |       ❌       |       ❌       |       ✅        |         ❌         |      ❌      |
 |             SEW-D             |       ❌       |       ❌       |       ✅        |         ❌         |      ❌      |
@ -409,7 +392,6 @@ Flax), PyTorch, and/or TensorFlow.
 |           SpeechT5            |       ✅       |       ❌       |       ✅        |         ❌         |      ❌      |
 |           Splinter            |       ✅       |       ✅       |       ✅        |         ❌         |      ❌      |
 |          SqueezeBERT          |       ✅       |       ✅       |       ✅        |         ❌         |      ❌      |
-|          SwiftFormer          |       ❌       |       ❌       |       ✅        |         ❌         |      ❌      |
 |       Swin Transformer        |       ❌       |       ❌       |       ✅        |         ✅         |      ❌      |
 |      Swin Transformer V2      |       ❌       |       ❌       |       ✅        |         ❌         |      ❌      |
 |            Swin2SR            |       ❌       |       ❌       |       ✅        |         ❌         |      ❌      |
--- a/docs/source/en/internal/audio_utils.mdx
+++ b/docs/source/en/internal/audio_utils.mdx
@ -12,9 +12,10 @@ specific language governing permissions and limitations under the License.

 # Utilities for `FeatureExtractors`

-This page lists all the utility functions that can be used by the audio [`FeatureExtractor`] in order to compute special features from a raw audio using common algorithms such as *Short Time Fourier Transform* or *log mel spectrogram*.
+This page lists all the utility functions that can be used by the audio [`FeatureExtractor`] in order to compute special features from raw audio using common algorithms such as *Short Time Fourier Transform* or *log mel spectrogram*.

-Most of those are only useful if you are studying the code of the audio processors in the library.
+
+Most of those are only useful if you are studying the code of the audio processors in the library.

 ## Audio Transformations

@ -22,14 +23,12 @@ Most of those are only useful if you are studying the code of the audio processo

 [[autodoc]] audio_utils.mel_to_hertz

-[[autodoc]] audio_utils.mel_filter_bank
+[[autodoc]] audio_utils.get_mel_filter_banks

-[[autodoc]] audio_utils.optimal_fft_length
-
-[[autodoc]] audio_utils.window_function
-
-[[autodoc]] audio_utils.spectrogram
+[[autodoc]] audio_utils.stft

 [[autodoc]] audio_utils.power_to_db

-[[autodoc]] audio_utils.amplitude_to_db
+[[autodoc]] audio_utils.fram_wave
+
+
--- a/docs/source/en/internal/generation_utils.mdx
+++ b/docs/source/en/internal/generation_utils.mdx
@ -265,9 +265,3 @@ A [`Constraint`] can be used to force the generation to include specific tokens
 [[autodoc]] top_k_top_p_filtering

 [[autodoc]] tf_top_k_top_p_filtering
-
-## Streamers
-
-[[autodoc]] TextStreamer
-
-[[autodoc]] TextIteratorStreamer
--- a/docs/source/en/main_classes/agent.mdx
+++ b/docs/source/en/main_classes/agent.mdx
@ -1,64 +0,0 @@
-<!--Copyright 2023 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-->
-
-# Agents & Tools
-
-<Tip warning={true}>
-
-Transformers Agent is an experimental API which is subject to change at any time. Results returned by the agents
-can vary as the APIs or underlying models are prone to change.
-
-</Tip>
-
-To learn more about agents and tools make sure to read the [introductory guide](../transformers_agents). This page
-contains the API docs for the underlying classes.
-
-## Agents
-
-We provide two types of agents: [`HfAgent`] uses inference endpoints for opensource models and [`OpenAiAgent`] uses OpenAI closed models.
-
-### HfAgent
-
-[[autodoc]] HfAgent
-
-### OpenAiAgent
-
-[[autodoc]] OpenAiAgent
-
-### Agent
-
-[[autodoc]] Agent
-    - chat
-    - run
-    - prepare_for_new_chat
-
-## Tools
-
-### load_tool
-
-[[autodoc]] load_tool
-
-### Tool
-
-[[autodoc]] Tool
-
-### PipelineTool
-
-[[autodoc]] PipelineTool
-
-### RemoteTool
-
-[[autodoc]] RemoteTool
-
-### launch_gradio_demo
-
-[[autodoc]] launch_gradio_demo
--- a/docs/source/en/main_classes/output.mdx
+++ b/docs/source/en/main_classes/output.mdx
@ -31,7 +31,7 @@ outputs = model(**inputs, labels=labels)
 ```

 The `outputs` object is a [`~modeling_outputs.SequenceClassifierOutput`], as we can see in the
-documentation of that class below, it means it has an optional `loss`, a `logits`, an optional `hidden_states` and
+documentation of that class below, it means it has an optional `loss`, a `logits`, an optional `hidden_states` and
 an optional `attentions` attribute. Here we have the `loss` since we passed along `labels`, but we don't have
 `hidden_states` and `attentions` because we didn't pass `output_hidden_states=True` or
 `output_attentions=True`.
--- a/docs/source/en/main_classes/quantization.mdx
+++ b/docs/source/en/main_classes/quantization.mdx
@ -33,7 +33,7 @@ from transformers import AutoModelForCausalLM, AutoTokenizer
 model_id = "bigscience/bloom-1b7"

 tokenizer = AutoTokenizer.from_pretrained(model_id)
-model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", load_in_8bit=True)
+model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", load_in_8bit=True)
 ```

 Then, use your model as you would usually use a [`PreTrainedModel`].
@ -52,37 +52,6 @@ Note that once a model has been loaded in 8-bit it is currently not possible to

 </Tip>

-### Push quantized models on the 🤗 Hub
-
-You can push a quantized model on the Hub by naively using `push_to_hub` method. This will first push the quantization configuration file, then push the quantized model weights.
-Make sure to use `bitsandbytes>0.37.2` (at this time of writing, we tested it on `bitsandbytes==0.38.0.post1`) to be able to use this feature. 
-
-```python
-from transformers import AutoModelForCausalLM, AutoTokenizer
-
-model = AutoModelForCausalLM.from_pretrained("bigscience/bloom-560m", device_map="auto", load_in_8bit=True)
-tokenizer = AutoTokenizer.from_pretrained("bigscience/bloom-560m")
-
-model.push_to_hub("bloom-560m-8bit")
-```
-
-<Tip warning={true}>
-
-Pushing 8bit models on the Hub is strongely encouraged for large models. This will allow the community to benefit from the memory footprint reduction and loading for example large models on a Google Colab.
-
-</Tip>
-
-### Load a quantized model from the 🤗 Hub
-
-You can load a quantized model from the Hub by using `from_pretrained` method. Make sure that the pushed weights are quantized, by checking that the attribute `quantization_config` is present in the model configuration object.
-
-```python
-from transformers import AutoModelForCausalLM, AutoTokenizer
-
-model = AutoModelForCausalLM.from_pretrained("{your_username}/bloom-560m-8bit")
-```
-Note that in this case, you don't need to specify the arguments `load_in_8bit=True` and `device_map="auto"`, but you need to make sure that `bitsandbytes` and `accelerate` are installed.
-
 ### Advanced usecases

 This section is intended for advanced users who want to explore what is possible beyond loading and running 8-bit models.
--- a/docs/source/en/main_classes/text_generation.mdx
+++ b/docs/source/en/main_classes/text_generation.mdx
@ -24,8 +24,7 @@ of the generation method.

 To learn how to inspect a model's generation configuration, what are the defaults, how to change the parameters ad hoc,
 and how to create and save a customized generation configuration, refer to the
-[text generation strategies guide](../generation_strategies). The guide also explains how to use related features,
-like token streaming.
+[text generation strategies guide](../generation_strategies).

 ## GenerationConfig

--- a/docs/source/en/migration.mdx
+++ b/docs/source/en/migration.mdx
@ -0,0 +1,315 @@
+<!---
+Copyright 2020 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
+# Migrating from previous packages
+
+## Migrating from transformers `v3.x` to `v4.x`
+
+A couple of changes were introduced when the switch from version 3 to version 4 was done. Below is a summary of the
+expected changes:
+
+#### 1. AutoTokenizers and pipelines now use fast (rust) tokenizers by default.
+
+The python and rust tokenizers have roughly the same API, but the rust tokenizers have a more complete feature set.
+
+This introduces two breaking changes:
+- The handling of overflowing tokens between the python and rust tokenizers is different.
+- The rust tokenizers do not accept integers in the encoding methods.
+
+##### How to obtain the same behavior as v3.x in v4.x
+
+- The pipelines now contain additional features out of the box. See the [token-classification pipeline with the `grouped_entities` flag](main_classes/pipelines#transformers.TokenClassificationPipeline).
+- The auto-tokenizers now return rust tokenizers. In order to obtain the python tokenizers instead, the user may use the `use_fast` flag by setting it to `False`:
+
+In version `v3.x`:
+```py
+from transformers import AutoTokenizer
+
+tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
+```
+to obtain the same in version `v4.x`:
+```py
+from transformers import AutoTokenizer
+
+tokenizer = AutoTokenizer.from_pretrained("bert-base-cased", use_fast=False)
+```
+
+#### 2. SentencePiece is removed from the required dependencies
+
+The requirement on the SentencePiece dependency has been lifted from the `setup.py`. This is done so that we may have a channel on anaconda cloud without relying on `conda-forge`. This means that the tokenizers that depend on the SentencePiece library will not be available with a standard `transformers` installation.
+
+This includes the **slow** versions of:
+- `XLNetTokenizer`
+- `AlbertTokenizer`
+- `CamembertTokenizer`
+- `MBartTokenizer`
+- `PegasusTokenizer`
+- `T5Tokenizer`
+- `ReformerTokenizer`
+- `XLMRobertaTokenizer`
+
+##### How to obtain the same behavior as v3.x in v4.x
+
+In order to obtain the same behavior as version `v3.x`, you should install `sentencepiece` additionally:
+
+In version `v3.x`:
+```bash
+pip install transformers
+```
+to obtain the same in version `v4.x`:
+```bash
+pip install transformers[sentencepiece]
+```
+or
+```bash
+pip install transformers sentencepiece
+```
+#### 3. The architecture of the repo has been updated so that each model resides in its folder
+
+The past and foreseeable addition of new models means that the number of files in the directory `src/transformers` keeps growing and becomes harder to navigate and understand. We made the choice to put each model and the files accompanying it in their own sub-directories.
+
+This is a breaking change as importing intermediary layers using a model's module directly needs to be done via a different path.
+
+##### How to obtain the same behavior as v3.x in v4.x
+
+In order to obtain the same behavior as version `v3.x`, you should update the path used to access the layers.
+
+In version `v3.x`:
+```py
+from transformers.modeling_bert import BertLayer
+```
+to obtain the same in version `v4.x`:
+```py
+from transformers.models.bert.modeling_bert import BertLayer
+```
+
+#### 4. Switching the `return_dict` argument to `True` by default
+
+The [`return_dict` argument](main_classes/output) enables the return of dict-like python objects containing the model outputs, instead of the standard tuples. This object is self-documented as keys can be used to retrieve values, while also behaving as a tuple as users may retrieve objects by index or by slice.
+
+This is a breaking change as the limitation of that tuple is that it cannot be unpacked: `value0, value1 = outputs` will not work.
+
+##### How to obtain the same behavior as v3.x in v4.x
+
+In order to obtain the same behavior as version `v3.x`, you should specify the `return_dict` argument to `False`, either in the model configuration or during the forward pass.
+
+In version `v3.x`:
+```py
+model = BertModel.from_pretrained("bert-base-cased")
+outputs = model(**inputs)
+```
+to obtain the same in version `v4.x`:
+```py
+model = BertModel.from_pretrained("bert-base-cased")
+outputs = model(**inputs, return_dict=False)
+```
+or
+```py
+model = BertModel.from_pretrained("bert-base-cased", return_dict=False)
+outputs = model(**inputs)
+```
+
+#### 5. Removed some deprecated attributes
+
+Attributes that were deprecated have been removed if they had been deprecated for at least a month. The full list of deprecated attributes can be found in [#8604](https://github.com/huggingface/transformers/pull/8604).
+
+Here is a list of these attributes/methods/arguments and what their replacements should be:
+
+In several models, the labels become consistent with the other models:
+- `masked_lm_labels` becomes `labels` in `AlbertForMaskedLM` and `AlbertForPreTraining`.
+- `masked_lm_labels` becomes `labels` in `BertForMaskedLM` and `BertForPreTraining`.
+- `masked_lm_labels` becomes `labels` in `DistilBertForMaskedLM`.
+- `masked_lm_labels` becomes `labels` in `ElectraForMaskedLM`.
+- `masked_lm_labels` becomes `labels` in `LongformerForMaskedLM`.
+- `masked_lm_labels` becomes `labels` in `MobileBertForMaskedLM`.
+- `masked_lm_labels` becomes `labels` in `RobertaForMaskedLM`.
+- `lm_labels` becomes `labels` in `BartForConditionalGeneration`.
+- `lm_labels` becomes `labels` in `GPT2DoubleHeadsModel`.
+- `lm_labels` becomes `labels` in `OpenAIGPTDoubleHeadsModel`.
+- `lm_labels` becomes `labels` in `T5ForConditionalGeneration`.
+
+In several models, the caching mechanism becomes consistent with the other models:
+- `decoder_cached_states` becomes `past_key_values` in all BART-like, FSMT and T5 models.
+- `decoder_past_key_values` becomes `past_key_values` in all BART-like, FSMT and T5 models.
+- `past` becomes `past_key_values` in all CTRL models.
+- `past` becomes `past_key_values` in all GPT-2 models.
+
+Regarding the tokenizer classes:
+- The tokenizer attribute `max_len` becomes `model_max_length`.
+- The tokenizer attribute `return_lengths` becomes `return_length`.
+- The tokenizer encoding argument `is_pretokenized` becomes `is_split_into_words`.
+
+Regarding the `Trainer` class:
+- The `Trainer` argument `tb_writer` is removed in favor of the callback `TensorBoardCallback(tb_writer=...)`.
+- The `Trainer` argument `prediction_loss_only` is removed in favor of the class argument `args.prediction_loss_only`.
+- The `Trainer` attribute `data_collator` should be a callable.
+- The `Trainer` method `_log` is deprecated in favor of `log`.
+- The `Trainer` method `_training_step` is deprecated in favor of `training_step`.
+- The `Trainer` method `_prediction_loop` is deprecated in favor of `prediction_loop`.
+- The `Trainer` method `is_local_master` is deprecated in favor of `is_local_process_zero`.
+- The `Trainer` method `is_world_master` is deprecated in favor of `is_world_process_zero`.
+
+Regarding the `TFTrainer` class:
+- The `TFTrainer` argument `prediction_loss_only` is removed in favor of the class argument `args.prediction_loss_only`.
+- The `TFTrainer` method `_log` is deprecated in favor of `log`.
+- The `TFTrainer` method `_prediction_loop` is deprecated in favor of `prediction_loop`.
+- The `TFTrainer` method `_setup_wandb` is deprecated in favor of `setup_wandb`.
+- The `TFTrainer` method `_run_model` is deprecated in favor of `run_model`.
+
+Regarding the `TrainingArguments` class:
+- The `TrainingArguments` argument `evaluate_during_training` is deprecated in favor of `evaluation_strategy`.
+
+Regarding the Transfo-XL model:
+- The Transfo-XL configuration attribute `tie_weight` becomes `tie_word_embeddings`.
+- The Transfo-XL modeling method `reset_length` becomes `reset_memory_length`.
+
+Regarding pipelines:
+- The `FillMaskPipeline` argument `topk` becomes `top_k`.
+
+
+
+## Migrating from pytorch-transformers to 🤗 Transformers
+
+Here is a quick summary of what you should take care of when migrating from `pytorch-transformers` to 🤗 Transformers.
+
+### Positional order of some models' keywords inputs (`attention_mask`, `token_type_ids`...) changed
+
+To be able to use TorchScript (see #1010, #1204 and #1195) the specific order of some models' **keyword inputs** (`attention_mask`, `token_type_ids`...) has been changed.
+
+If you used to call the models with keyword names for keyword arguments, e.g. `model(inputs_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)`, this should not cause any change.
+
+If you used to call the models with positional inputs for keyword arguments, e.g. `model(inputs_ids, attention_mask, token_type_ids)`, you may have to double check the exact order of input arguments.
+
+## Migrating from pytorch-pretrained-bert
+
+Here is a quick summary of what you should take care of when migrating from `pytorch-pretrained-bert` to 🤗 Transformers
+
+### Models always output `tuples`
+
+The main breaking change when migrating from `pytorch-pretrained-bert` to 🤗 Transformers is that the models' forward method always outputs a `tuple` with various elements depending on the model and the configuration parameters.
+
+The exact content of the tuples for each model is detailed in the models' docstrings and the [documentation](https://huggingface.co/transformers/).
+
+In pretty much every case, you will be fine by taking the first element of the output as the output you previously used in `pytorch-pretrained-bert`.
+
+Here is a `pytorch-pretrained-bert` to 🤗 Transformers conversion example for a `BertForSequenceClassification` classification model:
+
+```python
+# Let's load our model
+model = BertForSequenceClassification.from_pretrained("bert-base-uncased")
+
+# If you used to have this line in pytorch-pretrained-bert:
+loss = model(input_ids, labels=labels)
+
+# Now just use this line in 🤗 Transformers to extract the loss from the output tuple:
+outputs = model(input_ids, labels=labels)
+loss = outputs[0]
+
+# In 🤗 Transformers you can also have access to the logits:
+loss, logits = outputs[:2]
+
+# And even the attention weights if you configure the model to output them (and other outputs too, see the docstrings and documentation)
+model = BertForSequenceClassification.from_pretrained("bert-base-uncased", output_attentions=True)
+outputs = model(input_ids, labels=labels)
+loss, logits, attentions = outputs
+```
+
+### Serialization
+
+Breaking changes in the `from_pretrained()` method:
+
+1. Models are now set in evaluation mode by default when instantiated with the `from_pretrained()` method. To train them don't forget to set them back in training mode (`model.train()`) to activate the dropout modules.
+
+2. The additional `*inputs` and `**kwargs` arguments supplied to the `from_pretrained()` method used to be directly passed to the underlying model's class `__init__()` method. They are now used to update the model configuration attribute first, which can break derived model classes built based on the previous `BertForSequenceClassification` examples. More precisely, the positional arguments `*inputs` provided to `from_pretrained()` are directly forwarded to the model `__init__()` method while the keyword arguments `**kwargs` (i) which match configuration class attributes are used to update said attributes (ii) which don't match any configuration class attributes are forwarded to the model `__init__()` method.
+
+Also, while not a breaking change, the serialization methods have been standardized and you probably should switch to the new method `save_pretrained(save_directory)` if you were using any other serialization method before.
+
+Here is an example:
+
+```python
+### Let's load a model and tokenizer
+model = BertForSequenceClassification.from_pretrained("bert-base-uncased")
+tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
+
+### Do some stuff to our model and tokenizer
+# Ex: add new tokens to the vocabulary and embeddings of our model
+tokenizer.add_tokens(["[SPECIAL_TOKEN_1]", "[SPECIAL_TOKEN_2]"])
+model.resize_token_embeddings(len(tokenizer))
+# Train our model
+train(model)
+
+### Now let's save our model and tokenizer to a directory
+model.save_pretrained("./my_saved_model_directory/")
+tokenizer.save_pretrained("./my_saved_model_directory/")
+
+### Reload the model and the tokenizer
+model = BertForSequenceClassification.from_pretrained("./my_saved_model_directory/")
+tokenizer = BertTokenizer.from_pretrained("./my_saved_model_directory/")
+```
+
+### Optimizers: BertAdam & OpenAIAdam are now AdamW, schedules are standard PyTorch schedules
+
+The two optimizers previously included, `BertAdam` and `OpenAIAdam`, have been replaced by a single `AdamW` optimizer which has a few differences:
+
+- it only implements weight decay correction,
+- schedules are now external (see below),
+- gradient clipping is now also external (see below).
+
+The new optimizer `AdamW` matches the PyTorch `Adam` optimizer's API and lets you use standard PyTorch or apex methods for the schedule and clipping.
+
+The schedules are now standard [PyTorch learning rate schedulers](https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate) and not part of the optimizer anymore.
+
+Here is a conversion example from `BertAdam` with a linear warmup and decay schedule to `AdamW` and the same schedule:
+
+```python
+# Parameters:
+lr = 1e-3
+max_grad_norm = 1.0
+num_training_steps = 1000
+num_warmup_steps = 100
+warmup_proportion = float(num_warmup_steps) / float(num_training_steps)  # 0.1
+
+### Previously BertAdam optimizer was instantiated like this:
+optimizer = BertAdam(
+    model.parameters(),
+    lr=lr,
+    schedule="warmup_linear",
+    warmup=warmup_proportion,
+    num_training_steps=num_training_steps,
+)
+### and used like this:
+for batch in train_data:
+    loss = model(batch)
+    loss.backward()
+    optimizer.step()
+
+### In 🤗 Transformers, optimizer and schedules are split and instantiated like this:
+optimizer = AdamW(
+    model.parameters(), lr=lr, correct_bias=False
+)  # To reproduce BertAdam specific behavior set correct_bias=False
+scheduler = get_linear_schedule_with_warmup(
+    optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_training_steps
+)  # PyTorch scheduler
+### and used like this:
+for batch in train_data:
+    loss = model(batch)
+    loss.backward()
+    torch.nn.utils.clip_grad_norm_(
+        model.parameters(), max_grad_norm
+    )  # Gradient clipping is not in AdamW anymore (so you can use amp without issue)
+    optimizer.step()
+    scheduler.step()
+```
--- a/docs/source/en/model_doc/auto.mdx
+++ b/docs/source/en/model_doc/auto.mdx
@ -134,10 +134,6 @@ The following auto classes are available for the following natural language proc

 [[autodoc]] FlaxAutoModelForMaskedLM

-### AutoModelForMaskGeneration
-
-[[autodoc]] AutoModelForMaskGeneration
-
 ### AutoModelForSeq2SeqLM

 [[autodoc]] AutoModelForSeq2SeqLM
--- a/docs/source/en/model_doc/biogpt.mdx
+++ b/docs/source/en/model_doc/biogpt.mdx
@ -53,16 +53,4 @@ This model was contributed by [kamalkraj](https://huggingface.co/kamalkraj). The
 ## BioGptForCausalLM

 [[autodoc]] BioGptForCausalLM
-    - forward
-
-    
-## BioGptForTokenClassification
-
-[[autodoc]] BioGptForTokenClassification
-    - forward
-
-
-## BioGptForSequenceClassification
-
-[[autodoc]] BioGptForSequenceClassification
    - forward
--- a/docs/source/en/model_doc/blip.mdx
+++ b/docs/source/en/model_doc/blip.mdx
@ -1,4 +1,4 @@
-<!--Copyright 2023 The HuggingFace Team. All rights reserved.
+<!--Copyright 2022 The HuggingFace Team. All rights reserved.

 Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
 the License. You may obtain a copy of the License at
@ -93,40 +93,4 @@ The original code can be found [here](https://github.com/salesforce/BLIP).
 ## BlipForQuestionAnswering

 [[autodoc]] BlipForQuestionAnswering
-    - forward
-
-## TFBlipModel
-
-[[autodoc]] TFBlipModel
-    - call
-    - get_text_features
-    - get_image_features
-
-## TFBlipTextModel
-
-[[autodoc]] TFBlipTextModel
-    - call
-
-
-## TFBlipVisionModel
-
-[[autodoc]] TFBlipVisionModel
-    - call
-
-
-## TFBlipForConditionalGeneration
-
-[[autodoc]] TFBlipForConditionalGeneration
-    - call
-
-
-## TFBlipForImageTextRetrieval
-
-[[autodoc]] TFBlipForImageTextRetrieval
-    - call
-
-
-## TFBlipForQuestionAnswering
-
-[[autodoc]] TFBlipForQuestionAnswering
-    - call
+    - forward
--- a/docs/source/en/model_doc/clap.mdx
+++ b/docs/source/en/model_doc/clap.mdx
@ -14,10 +14,10 @@ specific language governing permissions and limitations under the License.

 ## Overview

-The CLAP model was proposed in [Large Scale Contrastive Language-Audio pretraining with
+The CLAP model was proposed in [Large Scale Contrastive Language-Audio pretraining with
 feature fusion and keyword-to-caption augmentation](https://arxiv.org/pdf/2211.06687.pdf) by Yusong Wu, Ke Chen, Tianyu Zhang, Yuchen Hui, Taylor Berg-Kirkpatrick, Shlomo Dubnov.

-CLAP (Contrastive Language-Audio Pretraining) is a neural network trained on a variety of (audio, text) pairs. It can be instructed in to predict the most relevant text snippet, given an audio, without directly optimizing for the task. The CLAP model uses a SWINTransformer to get audio features from a log-Mel spectrogram input, and a RoBERTa model to get text features. Both the text and audio features are then projected to a latent space with identical dimension. The dot product between the projected audio and text features is then used as a similar score.
+CLAP (Contrastive Language-Audio Pretraining) is a neural network trained on a variety of (audio, text) pairs. It can be instructed to predict the most relevant text snippet, given an audio, without directly optimizing for the task. The CLAP model uses a SWINTransformer to get audio features from a log-Mel spectrogram input, and a RoBERTa model to get text features. Both the text and audio features are then projected to a latent space with identical dimension. The dot product between the projected audio and text features is then used as a similarity score.

 The abstract from the paper is the following:

--- a/docs/source/en/model_doc/cpmant.mdx
+++ b/docs/source/en/model_doc/cpmant.mdx
@ -1,44 +0,0 @@
-<!--Copyright 2022 The HuggingFace Team and The OpenBMB Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-->
-
-# CPMAnt
-
-## Overview
-
-CPM-Ant is an open-source Chinese pre-trained language model (PLM) with 10B parameters. It is also the first milestone of the live training process of CPM-Live. The training process is cost-effective and environment-friendly. CPM-Ant also achieves promising results with delta tuning on the CUGE benchmark. Besides the full model, we also provide various compressed versions to meet the requirements of different hardware configurations. [See more](https://github.com/OpenBMB/CPM-Live/tree/cpm-ant/cpm-live)
-
-Tips:
-
-This model was contributed by [OpenBMB](https://huggingface.co/openbmb). The original code can be found [here](https://github.com/OpenBMB/CPM-Live/tree/cpm-ant/cpm-live).
-
-⚙️ Training & Inference
- A tutorial on [CPM-Live](https://github.com/OpenBMB/CPM-Live/tree/cpm-ant/cpm-live).
-
-## CpmAntConfig
-
-[[autodoc]] CpmAntConfig
-    - all
-
-## CpmAntTokenizer
-
-[[autodoc]] CpmAntTokenizer
-    - all
-
-## CpmAntModel
-
-[[autodoc]] CpmAntModel
-    - all
-    
-## CpmAntForCausalLM
-
-[[autodoc]] CpmAntForCausalLM
-    - all
--- a/docs/source/en/model_doc/deplot.mdx
+++ b/docs/source/en/model_doc/deplot.mdx
@ -1,58 +0,0 @@
-<!--Copyright 2021 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-->
-
-# DePlot
-
-## Overview 
-
-DePlot was proposed in the paper [DePlot: One-shot visual language reasoning by plot-to-table translation](https://arxiv.org/abs/2212.10505) from Fangyu Liu, Julian Martin Eisenschlos, Francesco Piccinno, Syrine Krichene, Chenxi Pang, Kenton Lee, Mandar Joshi, Wenhu Chen, Nigel Collier, Yasemin Altun.
-
-The abstract of the paper states the following:
-
-*Visual language such as charts and plots is ubiquitous in the human world. Comprehending plots and charts requires strong reasoning skills. Prior state-of-the-art (SOTA) models require at least tens of thousands of training examples and their reasoning capabilities are still much limited, especially on complex human-written queries. This paper presents the first one-shot solution to visual language reasoning. We decompose the challenge of visual language reasoning into two steps: (1) plot-to-text translation, and (2) reasoning over the translated text. The key in this method is a modality conversion module, named as DePlot, which translates the image of a plot or chart to a linearized table. The output of DePlot can then be directly used to prompt a pretrained large language model (LLM), exploiting the few-shot reasoning capabilities of LLMs. To obtain DePlot, we standardize the plot-to-table task by establishing unified task formats and metrics, and train DePlot end-to-end on this task. DePlot can then be used off-the-shelf together with LLMs in a plug-and-play fashion. Compared with a SOTA model finetuned on more than >28k data points, DePlot+LLM with just one-shot prompting achieves a 24.0% improvement over finetuned SOTA on human-written queries from the task of chart QA.*
-
-## Model description
-
-DePlot is a model that is trained using `Pix2Struct` architecture. You can find more information about `Pix2Struct` in the [Pix2Struct documentation](https://huggingface.co/docs/transformers/main/en/model_doc/pix2struct).
-DePlot is a Visual Question Answering subset of `Pix2Struct` architecture. It renders the input question on the image and predicts the answer.
-
-## Usage
-
-Currently one checkpoint is available for DePlot:
-
- `google/deplot`: DePlot fine-tuned on ChartQA dataset 
-
-
-```python
-from transformers import AutoProcessor, Pix2StructForConditionalGeneration
-import requests
-from PIL import Image
-
-model = Pix2StructForConditionalGeneration.from_pretrained("google/deplot")
-processor = AutoProcessor.from_pretrained("google/deplot")
-url = "https://raw.githubusercontent.com/vis-nlp/ChartQA/main/ChartQA%20Dataset/val/png/5090.png"
-image = Image.open(requests.get(url, stream=True).raw)
-
-inputs = processor(images=image, text="Generate underlying data table of the figure below:", return_tensors="pt")
-predictions = model.generate(**inputs, max_new_tokens=512)
-print(processor.decode(predictions[0], skip_special_tokens=True))
-```
-
-## Fine-tuning
-
-To fine-tune DePlot, refer to the pix2struct [fine-tuning notebook](https://github.com/huggingface/notebooks/blob/main/examples/image_captioning_pix2struct.ipynb). For `Pix2Struct` models, we have found out that fine-tuning the model with Adafactor and cosine learning rate scheduler leads to faster convergence:
-```python
-from transformers.optimization import Adafactor, get_cosine_schedule_with_warmup
-
-optimizer = Adafactor(self.parameters(), scale_parameter=False, relative_step=False, lr=0.01, weight_decay=1e-05)
-scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=1000, num_training_steps=40000)
-```
--- a/docs/source/en/model_doc/distilbert.mdx
+++ b/docs/source/en/model_doc/distilbert.mdx
@ -19,16 +19,13 @@ specific language governing permissions and limitations under the License.
 <a href="https://huggingface.co/spaces/docs-demos/distilbert-base-uncased">
 <img alt="Spaces" src="https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Spaces-blue">
 </a>
-<a href="https://huggingface.co/papers/1910.01108">
-<img alt="Paper page" src="https://img.shields.io/badge/Paper%20page-1910.01108-green">
-</a>
 </div>

 ## Overview

 The DistilBERT model was proposed in the blog post [Smaller, faster, cheaper, lighter: Introducing DistilBERT, a
 distilled version of BERT](https://medium.com/huggingface/distilbert-8cf3380435b5), and the paper [DistilBERT, a
-distilled version of BERT: smaller, faster, cheaper and lighter](https://arxiv.org/papers/1910.01108). DistilBERT is a
+distilled version of BERT: smaller, faster, cheaper and lighter](https://arxiv.org/abs/1910.01108). DistilBERT is a
 small, fast, cheap and light Transformer model trained by distilling BERT base. It has 40% less parameters than
 *bert-base-uncased*, runs 60% faster while preserving over 95% of BERT's performances as measured on the GLUE language
 understanding benchmark.
--- a/docs/source/en/model_doc/focalnet.mdx
+++ b/docs/source/en/model_doc/focalnet.mdx
@ -1,51 +0,0 @@
-<!--Copyright 2022 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-->
-
-# FocalNet
-
-## Overview
-
-The FocalNet model was proposed in [Focal Modulation Networks](https://arxiv.org/abs/2203.11926) by Jianwei Yang, Chunyuan Li, Xiyang Dai, Lu Yuan, Jianfeng Gao.
-FocalNets completely replace self-attention (used in models like [ViT](vit) and [Swin](swin)) by a focal modulation mechanism for modeling token interactions in vision.
-The authors claim that FocalNets outperform self-attention based models with similar computational costs on the tasks of image classification, object detection, and segmentation.
-
-The abstract from the paper is the following:
-
-*We propose focal modulation networks (FocalNets in short), where self-attention (SA) is completely replaced by a focal modulation mechanism for modeling token interactions in vision. Focal modulation comprises three components: (i) hierarchical contextualization, implemented using a stack of depth-wise convolutional layers, to encode visual contexts from short to long ranges, (ii) gated aggregation to selectively gather contexts for each query token based on its
-content, and (iii) element-wise modulation or affine transformation to inject the aggregated context into the query. Extensive experiments show FocalNets outperform the state-of-the-art SA counterparts (e.g., Swin and Focal Transformers) with similar computational costs on the tasks of image classification, object detection, and segmentation. Specifically, FocalNets with tiny and base size achieve 82.3% and 83.9% top-1 accuracy on ImageNet-1K. After pretrained on ImageNet-22K in 224 resolution, it attains 86.5% and 87.3% top-1 accuracy when finetuned with resolution 224 and 384, respectively. When transferred to downstream tasks, FocalNets exhibit clear superiority. For object detection with Mask R-CNN, FocalNet base trained with 1\times outperforms the Swin counterpart by 2.1 points and already surpasses Swin trained with 3\times schedule (49.0 v.s. 48.5). For semantic segmentation with UPerNet, FocalNet base at single-scale outperforms Swin by 2.4, and beats Swin at multi-scale (50.5 v.s. 49.7). Using large FocalNet and Mask2former, we achieve 58.5 mIoU for ADE20K semantic segmentation, and 57.9 PQ for COCO Panoptic Segmentation. Using huge FocalNet and DINO, we achieved 64.3 and 64.4 mAP on COCO minival and test-dev, respectively, establishing new SoTA on top of much larger attention-based models like Swinv2-G and BEIT-3.*
-
-Tips:
-
- One can use the [`AutoImageProcessor`] class to prepare images for the model.
-
-This model was contributed by [nielsr](https://huggingface.co/nielsr).
-The original code can be found [here](https://github.com/microsoft/FocalNet).
-
-
-## FocalNetConfig
-
-[[autodoc]] FocalNetConfig
-
-## FocalNetModel
-
-[[autodoc]] FocalNetModel
-    - forward
-
-## FocalNetForMaskedImageModeling
-
-[[autodoc]] FocalNetForMaskedImageModeling
-    - forward
-
-## FocalNetForImageClassification
-
-[[autodoc]] FocalNetForImageClassification
-    - forward
--- a/docs/source/en/model_doc/gpt2.mdx
+++ b/docs/source/en/model_doc/gpt2.mdx
@ -24,7 +24,7 @@ specific language governing permissions and limitations under the License.
 ## Overview

 OpenAI GPT-2 model was proposed in [Language Models are Unsupervised Multitask Learners](https://cdn.openai.com/better-language-models/language_models_are_unsupervised_multitask_learners.pdf) by Alec
-Radford, Jeffrey Wu, Rewon Child, David Luan, Dario Amodei and Ilya Sutskever from [OpenAI](https://huggingface.co/openai). It's a causal (unidirectional)
+Radford, Jeffrey Wu, Rewon Child, David Luan, Dario Amodei and Ilya Sutskever. It's a causal (unidirectional)
 transformer pretrained using language modeling on a very large corpus of ~40 GB of text data.

 The abstract from the paper is the following:
@ -111,11 +111,6 @@ A list of official Hugging Face and community (indicated by 🌎) resources to h
 [[autodoc]] GPT2DoubleHeadsModel
    - forward

-## GPT2ForQuestionAnswering
-
-[[autodoc]] GPT2ForQuestionAnswering
-    - forward
-
 ## GPT2ForSequenceClassification

 [[autodoc]] GPT2ForSequenceClassification
--- a/docs/source/en/model_doc/gpt_bigcode.mdx
+++ b/docs/source/en/model_doc/gpt_bigcode.mdx
@ -1,65 +0,0 @@
-<!--Copyright 2023 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-->
-
-# GPTBigCode
-
-## Overview
-
-The GPTBigCode model was proposed in [SantaCoder: don't reach for the stars!](https://arxiv.org/abs/2301.03988) by BigCode. The listed authors are: Loubna Ben Allal, Raymond Li, Denis Kocetkov, Chenghao Mou, Christopher Akiki, Carlos Munoz Ferrandis, Niklas Muennighoff, Mayank Mishra, Alex Gu, Manan Dey, Logesh Kumar Umapathi, Carolyn Jane Anderson, Yangtian Zi, Joel Lamy Poirier, Hailey Schoelkopf, Sergey Troshin, Dmitry Abulkhanov, Manuel Romero, Michael Lappert, Francesco De Toni, Bernardo García del Río, Qian Liu, Shamik Bose, Urvashi Bhattacharyya, Terry Yue Zhuo, Ian Yu, Paulo Villegas, Marco Zocca, Sourab Mangrulkar, David Lansky, Huu Nguyen, Danish Contractor, Luis Villa, Jia Li, Dzmitry Bahdanau, Yacine Jernite, Sean Hughes, Daniel Fried, Arjun Guha, Harm de Vries, Leandro von Werra.
-
-The abstract from the paper is the following:
-
-*The BigCode project is an open-scientific collaboration working on the responsible development of large language models for code. This tech report describes the progress of the collaboration until December 2022, outlining the current state of the Personally Identifiable Information (PII) redaction pipeline, the experiments conducted to de-risk the model architecture, and the experiments investigating better preprocessing methods for the training data. We train 1.1B parameter models on the Java, JavaScript, and Python subsets of The Stack and evaluate them on the MultiPL-E text-to-code benchmark. We find that more aggressive filtering of near-duplicates can further boost performance and, surprisingly, that selecting files from repositories with 5+ GitHub stars deteriorates performance significantly. Our best model outperforms previous open-source multilingual code generation models (InCoder-6.7B and CodeGen-Multi-2.7B) in both left-to-right generation and infilling on the Java, JavaScript, and Python portions of MultiPL-E, despite being a substantially smaller model. All models are released under an OpenRAIL license at [this https URL.](https://huggingface.co/bigcode)*
-
-The model is an optimized [GPT2 model](https://huggingface.co/docs/transformers/model_doc/gpt2) with support for Multi-Query Attention.
-
-## Technical details
-
-The main differences compared to GPT2.
- Added support for Multi-Query Attention.
- Use `gelu_pytorch_tanh` instead of classic `gelu`.
- Avoid unnecessary synchronizations (this has since been added to GPT2 in #20061, but wasn't in the reference codebase).
- Use Linear layers instead of Conv1D (good speedup but makes the checkpoints incompatible).
- Merge `_attn` and `_upcast_and_reordered_attn`. Always merge the matmul with scaling. Rename `reorder_and_upcast_attn`->`attention_softmax_in_fp32`
- Cache the attention mask value to avoid recreating it every time.
- Use jit to fuse the attention fp32 casting, masking, softmax, and scaling.
- Combine the attention and causal masks into a single one, pre-computed for the whole model instead of every layer.
- Merge the key and value caches into one (this changes the format of layer_past/ present, does it risk creating problems?)
- Use the memory layout (self.num_heads, 3, self.head_dim) instead of `(3, self.num_heads, self.head_dim)` for the QKV tensor with MHA. (prevents an overhead with the merged key and values, but makes the checkpoints incompatible with the original gpt2 model).
-
-You can read more about the optimizations in the [original pull request](https://github.com/huggingface/transformers/pull/22575)
-
-## GPTBigCodeConfig
-
-[[autodoc]] GPTBigCodeConfig
-
-
-## GPTBigCodeModel
-
-[[autodoc]] GPTBigCodeModel
-    - forward
-
-## GPTBigCodeForCausalLM
-
-[[autodoc]] GPTBigCodeForCausalLM
-    - forward
-
-
-## GPTBigCodeForSequenceClassification
-
-[[autodoc]] GPTBigCodeForSequenceClassification
-    - forward
-
-## GPTBigCodeForTokenClassification
-
-[[autodoc]] GPTBigCodeForTokenClassification
-    - forward
--- a/docs/source/en/model_doc/gpt_neo.mdx
+++ b/docs/source/en/model_doc/gpt_neo.mdx
@ -69,20 +69,25 @@ The `generate()` method can be used to generate text using GPT Neo model.
 [[autodoc]] GPTNeoForCausalLM
    - forward

-## GPTNeoForQuestionAnswering
-
-[[autodoc]] GPTNeoForQuestionAnswering
-    - forward
-
 ## GPTNeoForSequenceClassification

 [[autodoc]] GPTNeoForSequenceClassification
    - forward

-## GPTNeoForTokenClassification
+## TFGPTNeoModel

-[[autodoc]] GPTNeoForTokenClassification
-    - forward
+[[autodoc]] TFGPTNeoModel
+    - call
+
+## TFGPTNeoForCausalLM
+
+[[autodoc]] TFGPTNeoForCausalLM
+    - call
+
+## TFGPTNeoForSequenceClassification
+
+[[autodoc]] TFGPTNeoForSequenceClassification
+    - call

 ## FlaxGPTNeoModel

--- a/docs/source/en/model_doc/gpt_neox.mdx
+++ b/docs/source/en/model_doc/gpt_neox.mdx
@ -78,18 +78,3 @@ The `generate()` method can be used to generate text using GPT Neo model.

 [[autodoc]] GPTNeoXForCausalLM
    - forward
-
-## GPTNeoXForQuestionAnswering
-
-[[autodoc]] GPTNeoXForQuestionAnswering
-    - forward
-
-## GPTNeoXForSequenceClassification
-
-[[autodoc]] GPTNeoXForSequenceClassification
-    - forward
-
-## GPTNeoXForTokenClassification
-
-[[autodoc]] GPTNeoXForTokenClassification
-    - forward
--- a/docs/source/en/model_doc/layoutlmv2.mdx
+++ b/docs/source/en/model_doc/layoutlmv2.mdx
@ -121,28 +121,6 @@ section below.
 In addition, there's LayoutXLM, which is a multilingual version of LayoutLMv2. More information can be found on
 [LayoutXLM's documentation page](layoutxlm).

-## Resources
-
-A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with LayoutLMv2. If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! The resource should ideally demonstrate something new instead of duplicating an existing resource.
-
-<PipelineTag pipeline="text-classification"/>
-
- A notebook on how to [finetune LayoutLMv2 for text-classification on RVL-CDIP dataset](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/LayoutLMv2/RVL-CDIP/Fine_tuning_LayoutLMv2ForSequenceClassification_on_RVL_CDIP.ipynb).
- See also: [Text classification task guide](../tasks/sequence_classification)
-
-<PipelineTag pipeline="question-answering"/>
-
- A notebook on how to [finetune LayoutLMv2 for question-answering on DocVQA dataset](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/LayoutLMv2/DocVQA/Fine_tuning_LayoutLMv2ForQuestionAnswering_on_DocVQA.ipynb).
- See also: [Question answering task guide](../tasks/question_answering)
- See also: [Document question answering task guide](../tasks/document_question_answering)
-
-
-<PipelineTag pipeline="token-classification"/>
-
- A notebook on how to [finetune LayoutLMv2 for token-classification on CORD dataset](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/LayoutLMv2/CORD/Fine_tuning_LayoutLMv2ForTokenClassification_on_CORD.ipynb).
- A notebook on how to [finetune LayoutLMv2 for token-classification on FUNSD dataset](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/LayoutLMv2/FUNSD/Fine_tuning_LayoutLMv2ForTokenClassification_on_FUNSD_using_HuggingFace_Trainer.ipynb).
- See also: [Token classification task guide](../tasks/token_classification)
-
 ## Usage: LayoutLMv2Processor

 The easiest way to prepare data for the model is to use [`LayoutLMv2Processor`], which internally
@ -288,6 +266,13 @@ print(encoding.keys())
 # dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'bbox', 'image'])
 ```

+## Documentation resources
+
+- [Document question answering task guide](../tasks/document_question_answering)
+- [Text classification task guide](../tasks/sequence_classification)
+- [Token classification task guide](../tasks/token_classification)
+- [Question answering task guide](../tasks/question_answering)
+
 ## LayoutLMv2Config

 [[autodoc]] LayoutLMv2Config
--- a/docs/source/en/model_doc/llama.mdx
+++ b/docs/source/en/model_doc/llama.mdx
@ -14,7 +14,7 @@ specific language governing permissions and limitations under the License.

 ## Overview

-The LLaMA model was proposed in [LLaMA: Open and Efficient Foundation Language Models](https://arxiv.org/abs/2302.13971) by Hugo Touvron, Thibaut Lavril, Gautier Izacard, Xavier Martinet, Marie-Anne Lachaux, Timothée Lacroix, Baptiste Rozière, Naman Goyal, Eric Hambro, Faisal Azhar, Aurelien Rodriguez, Armand Joulin, Edouard Grave, Guillaume Lample. It is a collection of foundation language models ranging from 7B to 65B parameters.
+The LLaMA model was proposed in [LLaMA: Open and Efficient Foundation Language Models](https://arxiv.org/abs/2302.13971) by Hugo Touvron, Thibaut Lavril, Gautier Izacard, Xavier Martinet, Marie-Anne Lachaux, Timothée Lacroix, Baptiste Rozière, Naman Goyal, Eric Hambro, Faisal Azhar, Aurelien Rodriguez, Armand Joulin, Edouard Grave, Guillaume Lample. It is a collection of foundation language models ranging from 7B to 65B parameters.

 The abstract from the paper is the following:

@ -23,7 +23,7 @@ The abstract from the paper is the following:
 Tips:

 - Weights for the LLaMA models can be obtained by filling out [this form](https://docs.google.com/forms/d/e/1FAIpQLSfqNECQnMkycAp2jP4Z9TFX0cGR4uf7b_fBxjY_OjhJILlKGA/viewform?usp=send_form)
- After downloading the weights, they will need to be converted to the Hugging Face Transformers format using the [conversion script](https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/convert_llama_weights_to_hf.py). The script can be called with the following (example) command:
+- After downloading the weights, they will need to be converted to the Hugging Face Transformers format using the [conversion script](https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/convert_llama_weights_to_hf.py). The script can be called with the following (example) command:

 ```bash
 python src/transformers/models/llama/convert_llama_weights_to_hf.py \
@ -42,7 +42,7 @@ model = LlamaForCausalLM.from_pretrained("/output/path")
 Note that executing the script requires enough CPU RAM to host the whole model in float16 precision (even if the biggest versions
 come in several checkpoints they each contain a part of each weight of the model, so we need to load them all in RAM). For the 65B model, it's thus 130GB of RAM needed.

- The LLaMA tokenizer is a BPE model based on [sentencepiece](https://github.com/google/sentencepiece). One quirk of sentencepiece is that when decoding a sequence, if the first token is the start of the word (e.g. "Banana"), the tokenizer does not prepend the prefix space to the string.
+- The LLaMA tokenizer is based on [sentencepiece](https://github.com/google/sentencepiece). One quirk of sentencepiece is that when decoding a sequence, if the first token is the start of the word (e.g. "Banana"), the tokenizer does not prepend the prefix space to the string. To have the tokenizer output the prefix space, set `decode_with_prefix_space=True` in the `LlamaTokenizer` object or in the tokenizer configuration.

 This model was contributed by [zphang](https://huggingface.co/zphang) with contributions from [BlackSamorez](https://huggingface.co/BlackSamorez). The code of the implementation in Hugging Face is based on GPT-NeoX [here](https://github.com/EleutherAI/gpt-neox). The original code of the authors can be found [here](https://github.com/facebookresearch/llama).

@ -59,14 +59,6 @@ This model was contributed by [zphang](https://huggingface.co/zphang) with contr
    - create_token_type_ids_from_sequences
    - save_vocabulary

-## LlamaTokenizerFast
-
-[[autodoc]] LlamaTokenizerFast
-    - build_inputs_with_special_tokens
-    - get_special_tokens_mask
-    - create_token_type_ids_from_sequences
-    - save_vocabulary
-
 ## LlamaModel

 [[autodoc]] LlamaModel
@ -81,4 +73,4 @@ This model was contributed by [zphang](https://huggingface.co/zphang) with contr
 ## LlamaForSequenceClassification

 [[autodoc]] LlamaForSequenceClassification
-    - forward
+    - forward
--- a/docs/source/en/model_doc/matcha.mdx
+++ b/docs/source/en/model_doc/matcha.mdx
@ -1,66 +0,0 @@
-<!--Copyright 2021 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-->
-
-# MatCha
-
-## Overview
-
-MatCha has been proposed in the paper [MatCha: Enhancing Visual Language Pretraining with Math Reasoning and Chart Derendering](https://arxiv.org/abs/2212.09662), from Fangyu Liu, Francesco Piccinno, Syrine Krichene, Chenxi Pang, Kenton Lee, Mandar Joshi, Yasemin Altun, Nigel Collier, Julian Martin Eisenschlos.
-
-The abstract of the paper states the following:
-
-*Visual language data such as plots, charts, and infographics are ubiquitous in the human world. However, state-of-the-art vision-language models do not perform well on these data. We propose MatCha (Math reasoning and Chart derendering pretraining) to enhance visual language models' capabilities in jointly modeling charts/plots and language data. Specifically, we propose several pretraining tasks that cover plot deconstruction and numerical reasoning which are the key capabilities in visual language modeling. We perform the MatCha pretraining starting from Pix2Struct, a recently proposed image-to-text visual language model. On standard benchmarks such as PlotQA and ChartQA, the MatCha model outperforms state-of-the-art methods by as much as nearly 20%. We also examine how well MatCha pretraining transfers to domains such as screenshots, textbook diagrams, and document figures and observe overall improvement, verifying the usefulness of MatCha pretraining on broader visual language tasks.*
-
-## Model description
-
-MatCha is a model that is trained using `Pix2Struct` architecture. You can find more information about `Pix2Struct` in the [Pix2Struct documentation](https://huggingface.co/docs/transformers/main/en/model_doc/pix2struct).
-MatCha is a Visual Question Answering subset of `Pix2Struct` architecture. It renders the input question on the image and predicts the answer.
-
-## Usage
-
-Currently 6 checkpoints are available for MatCha:
-
- `google/matcha`: the base MatCha model, used to fine-tune MatCha on downstream tasks
- `google/matcha-chartqa`: MatCha model fine-tuned on ChartQA dataset. It can be used to answer questions about charts.
- `google/matcha-plotqa-v1`: MatCha model fine-tuned on PlotQA dataset. It can be used to answer questions about plots.
- `google/matcha-plotqa-v2`: MatCha model fine-tuned on PlotQA dataset. It can be used to answer questions about plots.
- `google/matcha-chart2text-statista`: MatCha model fine-tuned on Statista dataset. 
- `google/matcha-chart2text-pew`: MatCha model fine-tuned on Pew dataset.
-
-The models finetuned on `chart2text-pew` and `chart2text-statista` are more suited for summarization, whereas the models finetuned on `plotqa` and `chartqa` are more suited for question answering.
-
-You can use these models as follows (example on a ChartQA dataset):
-
-```python
-from transformers import AutoProcessor, Pix2StructForConditionalGeneration
-import requests
-from PIL import Image
-
-model = Pix2StructForConditionalGeneration.from_pretrained("google/matcha-chartqa").to(0)
-processor = AutoProcessor.from_pretrained("google/matcha-chartqa")
-url = "https://raw.githubusercontent.com/vis-nlp/ChartQA/main/ChartQA%20Dataset/val/png/20294671002019.png"
-image = Image.open(requests.get(url, stream=True).raw)
-
-inputs = processor(images=image, text="Is the sum of all 4 places greater than Laos?", return_tensors="pt").to(0)
-predictions = model.generate(**inputs, max_new_tokens=512)
-print(processor.decode(predictions[0], skip_special_tokens=True))
-```
-
-## Fine-tuning
-
-To fine-tune MatCha, refer to the pix2struct [fine-tuning notebook](https://github.com/huggingface/notebooks/blob/main/examples/image_captioning_pix2struct.ipynb). For `Pix2Struct` models, we have found out that fine-tuning the model with Adafactor and cosine learning rate scheduler leads to faster convergence:
-```python
-from transformers.optimization import Adafactor, get_cosine_schedule_with_warmup
-
-optimizer = Adafactor(self.parameters(), scale_parameter=False, relative_step=False, lr=0.01, weight_decay=1e-05)
-scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=1000, num_training_steps=40000)
-```
--- a/docs/source/en/model_doc/mega.mdx
+++ b/docs/source/en/model_doc/mega.mdx
@ -27,7 +27,7 @@ The abstract from the paper is the following:
 Tips:

 - MEGA can perform quite well with relatively few parameters. See Appendix D in the MEGA paper for examples of architectural specs which perform well in various settings. If using MEGA as a decoder, be sure to set `bidirectional=False` to avoid errors with default bidirectional. 
- Mega-chunk is a variant of mega that reduces time and spaces complexity from quadratic to linear. Utilize chunking with MegaConfig.use_chunking and control chunk size with MegaConfig.chunk_size 
+- Mega-chunk is a variant of mega that reduces time and space complexity from quadratic to linear. Utilize chunking with MegaConfig.use_chunking and control chunk size with MegaConfig.chunk_size

 This model was contributed by [mnaylor](https://huggingface.co/mnaylor).
 The original code can be found [here](https://github.com/facebookresearch/mega).
--- a/docs/source/en/model_doc/nllb-moe.mdx
+++ b/docs/source/en/model_doc/nllb-moe.mdx
@ -1,128 +0,0 @@
-<!--Copyright 2023 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-->
-
-# NLLB-MOE
-
-
-## Overview
-
-The NLLB model was presented in [No Language Left Behind: Scaling Human-Centered Machine Translation](https://arxiv.org/abs/2207.04672) by Marta R. Costa-jussà, James Cross, Onur Çelebi,
-Maha Elbayad, Kenneth Heafield, Kevin Heffernan, Elahe Kalbassi, Janice Lam, Daniel Licht, Jean Maillard, Anna Sun, Skyler Wang, Guillaume Wenzek, Al Youngblood, Bapi Akula,
-Loic Barrault, Gabriel Mejia Gonzalez, Prangthip Hansanti, John Hoffman, Semarley Jarrett, Kaushik Ram Sadagopan, Dirk Rowe, Shannon Spruit, Chau Tran, Pierre Andrews,
-Necip Fazil Ayan, Shruti Bhosale, Sergey Edunov, Angela Fan, Cynthia Gao, Vedanuj Goswami, Francisco Guzmán, Philipp Koehn, Alexandre Mourachko, Christophe Ropers,
-Safiyyah Saleem, Holger Schwenk, and Jeff Wang.
-
-The abstract of the paper is the following:
-
-*Driven by the goal of eradicating language barriers on a global scale, machine translation has solidified itself as a key focus of artificial intelligence research today.
-However, such efforts have coalesced around a small subset of languages, leaving behind the vast majority of mostly low-resource languages. What does it take to break the
-200 language barrier while ensuring safe, high quality results, all while keeping ethical considerations in mind? In No Language Left Behind, we took on this challenge by
-first contextualizing the need for low-resource language translation support through exploratory interviews with native speakers. Then, we created datasets and models aimed
-at narrowing the performance gap between low and high-resource languages. More specifically, we developed a conditional compute model based on Sparsely Gated Mixture of
-Experts that is trained on data obtained with novel and effective data mining techniques tailored for low-resource languages. We propose multiple architectural and training
-improvements to counteract overfitting while training on thousands of tasks. Critically, we evaluated the performance of over 40,000 different translation directions using
-a human-translated benchmark, Flores-200, and combined human evaluation with a novel toxicity benchmark covering all languages in Flores-200 to assess translation safety.
-Our model achieves an improvement of 44% BLEU relative to the previous state-of-the-art, laying important groundwork towards realizing a universal translation system.*
-
-Tips:
-
- M2M100ForConditionalGeneration is the base model for both NLLB and NLLB MoE
- The NLLB-MoE is very similar to the NLLB model, but its feed-forward layer is based on the implementation of SwitchTransformers.
- The tokenizer is the same as the NLLB models.
-
-This model was contributed by [Arthur Zucker](https://huggingface.co/ArtZucker).
-The original code can be found [here](https://github.com/facebookresearch/fairseq).
-
-## Implementation differences with SwitchTransformers
-The biggest difference is the way the tokens are routed. NLLB-MoE uses a `top-2-gate` which means that for each input, only the top two experts are selected based on the 
-highest predicted probabilities from the gating network, and the remaining experts are ignored. In `SwitchTransformers`, only the top-1 probabilities are computed, 
-which means that tokens have less probability of being forwarded. Moreover, if a token is not routed to any expert, `SwitchTransformers` still adds its unmodified hidden 
-states (kind of like a residual connection) while they are masked in `NLLB`'s top-2 routing mechanism. 
-
-## Generating with NLLB-MoE
-The available checkpoints require around 350GB of storage. Make sure to use `accelerate` if you do not have enough RAM on your machine.
-
-While generating the target text set the `forced_bos_token_id` to the target language id. The following
-example shows how to translate English to French using the *facebook/nllb-moe-54b* model.
-
-Note that we're using the BCP-47 code for French `fra_Latn`. See [here](https://github.com/facebookresearch/flores/blob/main/flores200/README.md#languages-in-flores-200)
-for the list of all BCP-47 in the Flores 200 dataset.
-
-```python
->>> from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
-
->>> tokenizer = AutoTokenizer.from_pretrained("facebook/nllb-moe-54b")
->>> model = AutoModelForSeq2SeqLM.from_pretrained("facebook/nllb-moe-54b")
-
->>> article = "Previously, Ring's CEO, Jamie Siminoff, remarked the company started when his doorbell wasn't audible from his shop in his garage."
->>> inputs = tokenizer(article, return_tensors="pt")
-
->>> translated_tokens = model.generate(
-...     **inputs, forced_bos_token_id=tokenizer.lang_code_to_id["fra_Latn"], max_length=50
-... )
->>> tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0]
-"Auparavant, le PDG de Ring, Jamie Siminoff, a fait remarquer que la société avait commencé lorsque sa sonnette n'était pas audible depuis son magasin dans son garage."
-```
-
-### Generating from any other language than English
-
-English (`eng_Latn`) is set as the default language from which to translate. In order to specify that you'd like to translate from a different language,
-you should specify the BCP-47 code in the `src_lang` keyword argument of the tokenizer initialization.
-
-See example below for a translation from romanian to german:
-
-```python
->>> from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
-
->>> tokenizer = AutoTokenizer.from_pretrained("facebook/nllb-moe-54b", src_lang="ron_Latn")
->>> model = AutoModelForSeq2SeqLM.from_pretrained("facebook/nllb-moe-54b")
-
->>> article = "Şeful ONU spune că nu există o soluţie militară în Siria"
->>> inputs = tokenizer(article, return_tensors="pt")
-
->>> translated_tokens = model.generate(
-...     **inputs, forced_bos_token_id=tokenizer.lang_code_to_id["deu_Latn"], max_length=30
-... )
->>> tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0]
-```
-
-## Documentation resources
-
- [Translation task guide](../tasks/translation)
- [Summarization task guide](../tasks/summarization)
-
-
-## NllbMoeConfig
-
-[[autodoc]] NllbMoeConfig
-
-## NllbMoeTop2Router
-
-[[autodoc]] NllbMoeTop2Router
-    - route_tokens
-    - forward
-
-## NllbMoeSparseMLP
-
-[[autodoc]] NllbMoeSparseMLP
-    - forward
-
-## NllbMoeModel
-
-[[autodoc]] NllbMoeModel
-    - forward
-
-## NllbMoeForConditionalGeneration
-
-[[autodoc]] NllbMoeForConditionalGeneration
-    - forward
-
--- a/docs/source/en/model_doc/nllb.mdx
+++ b/docs/source/en/model_doc/nllb.mdx
@ -12,45 +12,8 @@ specific language governing permissions and limitations under the License.

 # NLLB

-**DISCLAIMER:** The default behaviour for the tokenizer has recently been fixed (and thus changed)!
-
-The previous version adds `[self.eos_token_id, self.cur_lang_code]` at the end of the token sequence for both target and source tokenization. This is wrong as the NLLB paper mentions (page 48, 6.1.1. Model Architecture) :
-
-*Note that we prefix the source sequence with the source language, as opposed to the target
-language as previously done in several works (Arivazhagan et al., 2019; Johnson et al.,
-2017). This is primarily because we prioritize optimizing zero-shot performance of our
-model on any pair of 200 languages at a minor cost to supervised performance.*
-
-Previous behaviour:
-
-```python
->>> from transformers import NllbTokenizer
-
->>> tokenizer = NllbTokenizer.from_pretrained("facebook/nllb-200-distilled-600M")
->>> tokenizer("How was your day?").input_ids
-[13374, 1398, 4260, 4039, 248130, 2, 256047]
-
->>> # 2: '</s>'
->>> # 256047 : 'eng_Latn'
-```
-New behaviour
-
-```python
->>> from transformers import NllbTokenizer
-
->>> tokenizer = NllbTokenizer.from_pretrained("facebook/nllb-200-distilled-600M")
->>> tokenizer("How was your day?").input_ids
-[256047, 13374, 1398, 4260, 4039, 248130, 2]
- ```
-
-Enabling the old behaviour can be done as follows:
-```python
->>> from transformers import NllbTokenizer
-
->>> tokenizer = NllbTokenizer.from_pretrained("facebook/nllb-200-distilled-600M", legacy_behaviour=True)
-```
-
-For more details, feel free to check the linked [PR](https://github.com/huggingface/transformers/pull/22313) and [Issue](https://github.com/huggingface/transformers/issues/19943).
+**DISCLAIMER:** If you see something strange, file a [Github Issue](https://github.com/huggingface/transformers/issues/new?assignees=&labels=bug&template=bug-report.yml) and assign
+@LysandreJik

 ## Overview of NLLB

@ -72,9 +35,7 @@ improvements to counteract overfitting while training on thousands of tasks. Cri
 a human-translated benchmark, Flores-200, and combined human evaluation with a novel toxicity benchmark covering all languages in Flores-200 to assess translation safety.
 Our model achieves an improvement of 44% BLEU relative to the previous state-of-the-art, laying important groundwork towards realizing a universal translation system.*

-This implementation contains the dense models available on release.
-
-**The sparse model NLLB-MoE (Mixture of Expert) is now available! More details [here](nllb-moe)**
+This implementation contains the dense models available on release. Let us know via a GitHub issue if you would like to see the MoE models as well.

 This model was contributed by [Lysandre](https://huggingface.co/lysandre). The authors' code can be found [here](https://github.com/facebookresearch/fairseq/tree/nllb).

--- a/docs/source/en/model_doc/open-llama.mdx
+++ b/docs/source/en/model_doc/open-llama.mdx
@ -1,44 +0,0 @@
-<!--Copyright 2023 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-->
-
-# Open-Llama
-
-## Overview
-
-The Open-Llama model was proposed in [Open-Llama project](https://github.com/s-JoL/Open-Llama) by community developer s-JoL.
-
-The model is mainly based on LLaMA with some modifications, incorporating memory-efficient attention from Xformers, stable embedding from Bloom, and shared input-output embedding from PaLM.
-And the model is pre-trained on both Chinese and English, which gives it better performance on Chinese language tasks.
-
-This model was contributed by [s-JoL](https://huggingface.co/s-JoL).
-The original code can be found [Open-Llama](https://github.com/s-JoL/Open-Llama).
-Checkpoint and usage can be found at [s-JoL/Open-Llama-V1](https://huggingface.co/s-JoL/Open-Llama-V1).
-
-
-## OpenLlamaConfig
-
-[[autodoc]] OpenLlamaConfig
-
-## OpenLlamaModel
-
-[[autodoc]] OpenLlamaModel
-    - forward
-
-## OpenLlamaForCausalLM
-
-[[autodoc]] OpenLlamaForCausalLM
-    - forward
-
-## OpenLlamaForSequenceClassification
-
-[[autodoc]] OpenLlamaForSequenceClassification
-    - forward
--- a/docs/source/en/model_doc/pix2struct.mdx
+++ b/docs/source/en/model_doc/pix2struct.mdx
@ -28,8 +28,9 @@ We therefore advise you to use these models for the tasks they have been fine tu
 This model was contributed by [ybelkada](https://huggingface.co/ybelkada).
 The original code can be found [here](https://github.com/google-research/pix2struct).

-## Resources
+## Resources:

+- [Paper](https://arxiv.org/abs/2210.03347)
 - [Fine-tuning Notebook](https://github.com/huggingface/notebooks/blob/main/examples/image_captioning_pix2struct.ipynb)
 - [All models](https://huggingface.co/models?search=pix2struct)

--- a/docs/source/en/model_doc/regnet.mdx
+++ b/docs/source/en/model_doc/regnet.mdx
@ -67,16 +67,4 @@ If you're interested in submitting a resource to be included here, please feel f
 ## TFRegNetForImageClassification

 [[autodoc]] TFRegNetForImageClassification
-    - call
-
-
-## FlaxRegNetModel
-
-[[autodoc]] FlaxRegNetModel
-    - __call__
-
-
-## FlaxRegNetForImageClassification
-
-[[autodoc]] FlaxRegNetForImageClassification
-    - __call__
+    - call
--- a/docs/source/en/model_doc/resnet.mdx
+++ b/docs/source/en/model_doc/resnet.mdx
@ -71,13 +71,3 @@ If you're interested in submitting a resource to be included here, please feel f

 [[autodoc]] TFResNetForImageClassification
    - call
-
-## FlaxResNetModel
-
-[[autodoc]] FlaxResNetModel
-    - __call__
-
-## FlaxResNetForImageClassification
-
-[[autodoc]] FlaxResNetForImageClassification
-    - __call__
--- a/docs/source/en/model_doc/roberta.mdx
+++ b/docs/source/en/model_doc/roberta.mdx
@ -19,14 +19,11 @@ specific language governing permissions and limitations under the License.
 <a href="https://huggingface.co/spaces/docs-demos/roberta-base">
 <img alt="Spaces" src="https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Spaces-blue">
 </a>
-<a href="https://huggingface.co/papers/1907.11692">
-<img alt="Paper page" src="https://img.shields.io/badge/Paper%20page-1907.11692-green">
-</a>
 </div>

 ## Overview

-The RoBERTa model was proposed in [RoBERTa: A Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) by Yinhan Liu, [Myle Ott](https://huggingface.co/myleott), Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer
+The RoBERTa model was proposed in [RoBERTa: A Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer
 Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov. It is based on Google's BERT model released in 2018.

 It builds on BERT and modifies key hyperparameters, removing the next-sentence pretraining objective and training with
--- a/docs/source/en/model_doc/rwkv.mdx
+++ b/docs/source/en/model_doc/rwkv.mdx
@ -1,129 +0,0 @@
-<!--Copyright 2023 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-->
-
-# RWKV
-
-## Overview
-
-The RWKV model was proposed in [this repo](https://github.com/BlinkDL/RWKV-LM)
-
-It suggests a tweak in the traditional Transformer attention to make it linear. This way, the model can be used as recurrent network: passing inputs for timestamp 0 and timestamp 1 together is the same as passing inputs at timestamp 0, then inputs at timestamp 1 along with the state of timestamp 0 (see example below).
-
-This can be more efficient than a regular Transformer and can deal with sentence of any length (even if the model uses a fixed context length for training).
-
-This model was contributed by [sgugger](https://huggingface.co/sgugger).
-The original code can be found [here](https://github.com/BlinkDL/RWKV-LM).
-
-Example of use as an RNN:
-
-```py
-import torch
-from transformers import AutoTokenizer, RwkvConfig, RwkvModel
-
-model = RwkvModel.from_pretrained("sgugger/rwkv-430M-pile")
-tokenizer = AutoTokenizer.from_pretrained("sgugger/rwkv-430M-pile")
-
-inputs = tokenizer("This is an example.", return_tensors="pt")
-# Feed everything to the model
-outputs = model(inputs["input_ids"])
-output_whole = outputs.last_hidden_state
-
-outputs = model(inputs["input_ids"][:, :2])
-output_one = outputs.last_hidden_state
-
-# Using the state computed on the first inputs, we will get the same output
-outputs = model(inputs["input_ids"][:, 2:], state=outputs.state)
-output_two = outputs.last_hidden_state
-
-torch.allclose(torch.cat([output_one, output_two], dim=1), output_whole, atol=1e-5)
-```
-
-## RwkvConfig
-
-[[autodoc]] RwkvConfig
-
-
-## RwkvModel
-
-[[autodoc]] RwkvModel
-    - forward
-
-## RwkvLMHeadModel
-
-[[autodoc]] RwkvForCausalLM
-    - forward
-
-## Rwkv attention and the recurrent formulas
-
-In a traditional auto-regressive Transformer, attention is written as
-
-$$O = \hbox{softmax}(QK^{T} / \sqrt{d}) V$$
-
-with \\(Q\\), \\(K\\) and \\(V\\) are matrices of shape `seq_len x hidden_size` named query, key and value (they are actually bigger matrices with a batch dimension and an attention head dimension but we're only interested in the last two, which is where the matrix product is taken, so for the sake of simplicity we only consider those two). The product \\(QK^{T}\\) then has shape `seq_len x seq_len` and we can take the maxtrix product with \\(V\\) to get the output \\(O\\) of the same shape as the others.  
-
-Replacing the softmax by its value gives:
-
-$$O_{i} = \frac{\sum_{j=1}^{i} e^{Q_{i} K_{j}^{T} / \sqrt{d}} V_{j}}{\sum_{j=1}^{i} e^{Q_{i} K_{j}^{T} / \sqrt{d}}}$$
-
-Note that the entries in \\(QK^{T}\\) corresponding to \\(j > i\\) are masked (the sum stops at j) because the attention is not allowed to look at future tokens (only past ones).
-
-In comparison, the RWKV attention is given by
-
-$$O_{i} = \sigma(R_{i}) \frac{\sum_{j=1}^{i} e^{W_{i-j} + K_{j}} V_{j}}{\sum_{j=1}^{i} e^{W_{i-j} + K_{j}}}$$
-
-where \\(R\\) is a new matrix called receptance by the author, \\(K\\) and \\(V\\) are still the key and value (\\(\sigma\\) here is the sigmoid function). \\(W\\) is a new vector that represents the position of the token and is given by
-
-$$W_{0} = u \hbox{  and  } W_{k} = (k-1)w \hbox{ for } k \geq 1$$
-
-with \\(u\\) and \\(w\\) learnable parameters called in the code `time_first` and `time_decay` respectively. The numerator and denominator can both be expressed recursively. Naming them \\(N_{i}\\) and \\(D_{i}\\) we have:
-
-$$N_{i} = e^{u + K_{i}} V_{i} + \hat{N}_{i} \hbox{  where  } \hat{N}_{i} = e^{K_{i-1}} V_{i-1} + e^{w + K_{i-2}} V_{i-2} \cdots + e^{(i-2)w + K_{1}} V_{1}$$
-
-so \\(\hat{N}_{i}\\) (called `numerator_state` in the code) satistfies
-
-$$\hat{N}_{0} = 0 \hbox{  and  } \hat{N}_{j+1} = e^{K_{j}} V_{j} + e^{w} \hat{N}_{j}$$
-
-and
-
-$$D_{i} = e^{u + K_{i}} + \hat{D}_{i} \hbox{  where  } \hat{D}_{i} = e^{K_{i-1}} + e^{w + K_{i-2}} \cdots + e^{(i-2)w + K_{1}}$$
-
-so \\(\hat{D}_{i}\\) (called `denominator_state` in the code) satistfies
-
-$$\hat{D}_{0} = 0 \hbox{  and  } \hat{D}_{j+1} = e^{K_{j}} + e^{w} \hat{D}_{j}$$
-
-The actual recurrent formula used are a tiny bit more complex, as for numerical stability we don't want to compute exponentials of big numbers. Usually the softmax is not computed as is, but the exponential of the maximum term is divided of the numerator and denominator:
-
-$$\frac{e^{x_{i}}}{\sum_{j=1}^{n} e^{x_{j}}} = \frac{e^{x_{i} - M}}{\sum_{j=1}^{n} e^{x_{j} - M}}$$
-
-with \\(M\\) the maximum of all \\(x_{j}\\). So here on top of saving the numerator state (\\(\hat{N}\\)) and the denominator state (\\(\hat{D}\\)) we also keep track of the maximum of all terms encountered in the exponentials. So we actually use
-
-$$\tilde{N}_{i} = e^{-M_{i}} \hat{N}_{i} \hbox{  and  } \tilde{D}_{i} = e^{-M_{i}} \hat{D}_{i}$$
-
-defined by the following recurrent formulas:
-
-$$\tilde{N}_{0} = 0 \hbox{  and  } \tilde{N}_{j+1} = e^{K_{j} - q} V_{j} + e^{w + M_{j} - q} \tilde{N}_{j} \hbox{  where  } q = \max(K_{j}, w + M_{j})$$
-
-and
-
-$$\tilde{D}_{0} = 0 \hbox{  and  } \tilde{D}_{j+1} = e^{K_{j} - q} + e^{w + M_{j} - q} \tilde{D}_{j} \hbox{  where  } q = \max(K_{j}, w + M_{j})$$
-
-and \\(M_{j+1} = q\\). With those, we can then compute
-
-$$N_{i} = e^{u + K_{i} - q} V_{i} + e^{M_{i}} \tilde{N}_{i} \hbox{  where  } q = \max(u + K_{i}, M_{i})$$
-
-and
-
-$$D_{i} = e^{u + K_{i} - q} + e^{M_{i}} \tilde{D}_{i} \hbox{  where  } q = \max(u + K_{i}, M_{i})$$
-
-which finally gives us
-
-$$O_{i} = \sigma(R_{i}) \frac{N_{i}}{D_{i}}$$
--- a/docs/source/en/model_doc/sam.mdx
+++ b/docs/source/en/model_doc/sam.mdx
@ -1,101 +0,0 @@
-<!--Copyright 2023 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-->
-
-# SAM
-
-## Overview
-
-SAM (Segment Anything Model) was proposed in [Segment Anything](https://arxiv.org/pdf/2304.02643v1.pdf) by Alexander Kirillov, Eric Mintun, Nikhila Ravi, Hanzi Mao, Chloe Rolland, Laura Gustafson, Tete Xiao, Spencer Whitehead, Alex Berg, Wan-Yen Lo, Piotr Dollar, Ross Girshick.
-
-The model can be used to predict segmentation masks of any object of interest given an input image. 
-
-![example image](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/model_doc/sam-output.png)
-
-The abstract from the paper is the following:
-
-*We introduce the Segment Anything (SA) project: a new task, model, and dataset for image segmentation. Using our efficient model in a data collection loop, we built the largest segmentation dataset to date (by far), with over 1 billion masks on 11M licensed and privacy respecting images. The model is designed and trained to be promptable, so it can transfer zero-shot to new image distributions and tasks. We evaluate its capabilities on numerous tasks and find that its zero-shot performance is impressive -- often competitive with or even superior to prior fully supervised results. We are releasing the Segment Anything Model (SAM) and corresponding dataset (SA-1B) of 1B masks and 11M images at [https://segment-anything.com](https://segment-anything.com) to foster research into foundation models for computer vision.*
-
-Tips:
-
- The model predicts binary masks that states the presence or not of the object of interest given an image.
- The model predicts much better results if input 2D points and/or input bounding boxes are provided
- You can prompt multiple points for the same image, and predict a single mask. 
- Fine-tuning the model is not supported yet
- According to the paper, textual input should be also supported. However, at this time of writing this seems to be not supported according to [the official repository](https://github.com/facebookresearch/segment-anything/issues/4#issuecomment-1497626844). 
-
-
-This model was contributed by [ybelkada](https://huggingface.co/ybelkada) and [ArthurZ](https://huggingface.co/ArthurZ).
-The original code can be found [here](https://github.com/facebookresearch/segment-anything).
-
-Below is an example on how to run mask generation given an image and a 2D point:
-
-```python
-import torch
-from PIL import Image
-import requests
-from transformers import SamModel, SamProcessor
-
-device = "cuda" if torch.cuda.is_available() else "cpu"
-model = SamModel.from_pretrained("facebook/sam-vit-huge").to(device)
-processor = SamProcessor.from_pretrained("facebook/sam-vit-huge")
-
-img_url = "https://huggingface.co/ybelkada/segment-anything/resolve/main/assets/car.png"
-raw_image = Image.open(requests.get(img_url, stream=True).raw).convert("RGB")
-input_points = [[[450, 600]]]  # 2D location of a window in the image
-
-inputs = processor(raw_image, input_points=input_points, return_tensors="pt").to(device)
-outputs = model(**inputs)
-
-masks = processor.image_processor.post_process_masks(
-    outputs.pred_masks.cpu(), inputs["original_sizes"].cpu(), inputs["reshaped_input_sizes"].cpu()
-)
-scores = outputs.iou_scores
-```
-
-Resources:
-
- [Demo notebook](https://github.com/huggingface/notebooks/blob/main/examples/segment_anything.ipynb) for using the model.
- [Demo notebook](https://github.com/huggingface/notebooks/blob/main/examples/automatic_mask_generation.ipynb) for using the automatic mask generation pipeline.
- [Demo notebook](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/SAM/Run_inference_with_MedSAM_using_HuggingFace_Transformers.ipynb) for inference with MedSAM, a fine-tuned version of SAM on the medical domain.
- [Demo notebook](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/SAM/Fine_tune_SAM_(segment_anything)_on_a_custom_dataset.ipynb) for fine-tuning the model on custom data.
-
-## SamConfig
-
-[[autodoc]] SamConfig
-
-## SamVisionConfig
-
-[[autodoc]] SamVisionConfig
-
-## SamMaskDecoderConfig
-
-[[autodoc]] SamMaskDecoderConfig
-
-## SamPromptEncoderConfig
-
-[[autodoc]] SamPromptEncoderConfig
-
-
-## SamProcessor
-
-[[autodoc]] SamProcessor
-
-
-## SamImageProcessor
-
-[[autodoc]] SamImageProcessor
-
-
-## SamModel
-
-[[autodoc]] SamModel
-    - forward
--- a/docs/source/en/model_doc/swiftformer.mdx
+++ b/docs/source/en/model_doc/swiftformer.mdx
@ -1,45 +0,0 @@
-<!--Copyright 2023 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-->
-
-# SwiftFormer
-
-## Overview
-
-The SwiftFormer model was proposed in [SwiftFormer: Efficient Additive Attention for Transformer-based Real-time Mobile Vision Applications](https://arxiv.org/abs/2303.15446) by Abdelrahman Shaker, Muhammad Maaz, Hanoona Rasheed, Salman Khan, Ming-Hsuan Yang, Fahad Shahbaz Khan.
-
-The SwiftFormer paper introduces a novel efficient additive attention mechanism that effectively replaces the quadratic matrix multiplication operations in the self-attention computation with linear element-wise multiplications. A series of models called 'SwiftFormer' is built based on this, which achieves state-of-the-art performance in terms of both accuracy and mobile inference speed. Even their small variant achieves 78.5% top-1 ImageNet1K accuracy with only 0.8 ms latency on iPhone 14, which is more accurate and 2× faster compared to MobileViT-v2.
-
-The abstract from the paper is the following:
-
-*Self-attention has become a defacto choice for capturing global context in various vision applications. However, its quadratic computational complexity with respect to image resolution limits its use in real-time applications, especially for deployment on resource-constrained mobile devices. Although hybrid approaches have been proposed to combine the advantages of convolutions and self-attention for a better speed-accuracy trade-off, the expensive matrix multiplication operations in self-attention remain a bottleneck. In this work, we introduce a novel efficient additive attention mechanism that effectively replaces the quadratic matrix multiplication operations with linear element-wise multiplications. Our design shows that the key-value interaction can be replaced with a linear layer without sacrificing any accuracy. Unlike previous state-of-the-art methods, our efficient formulation of self-attention enables its usage at all stages of the network. Using our proposed efficient additive attention, we build a series of models called "SwiftFormer" which achieves state-of-the-art performance in terms of both accuracy and mobile inference speed. Our small variant achieves 78.5% top-1 ImageNet-1K accuracy with only 0.8 ms latency on iPhone 14, which is more accurate and 2x faster compared to MobileViT-v2.*
-
-Tips:
-    - One can use the [`ViTImageProcessor`] API to prepare images for the model.
-
-
-This model was contributed by [shehan97](https://huggingface.co/shehan97).
-The original code can be found [here](https://github.com/Amshaker/SwiftFormer).
-
-
-## SwiftFormerConfig
-
-[[autodoc]] SwiftFormerConfig
-
-## SwiftFormerModel
-
-[[autodoc]] SwiftFormerModel
-    - forward
-
-## SwiftFormerForImageClassification
-
-[[autodoc]] SwiftFormerForImageClassification
-    - forward
--- a/docs/source/en/model_doc/t5.mdx
+++ b/docs/source/en/model_doc/t5.mdx
@ -19,15 +19,12 @@ specific language governing permissions and limitations under the License.
 <a href="https://huggingface.co/spaces/docs-demos/t5-base">
 <img alt="Spaces" src="https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Spaces-blue">
 </a>
-<a href="https://huggingface.co/papers/1910.10683">
-<img alt="Paper page" src="https://img.shields.io/badge/Paper%20page-1910.10683-green">
-</a>
 </div>

 ## Overview

-The T5 model was presented in [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/pdf/1910.10683.pdf) by [Colin Raffel](https://huggingface.co/craffel), Noam Shazeer, [Adam Roberts](https://huggingface.co/adarob), Katherine Lee, Sharan Narang,
-Michael Matena, Yanqi Zhou, Wei Li, [Peter J. Liu](https://huggingface.co/peterjliu).
+The T5 model was presented in [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/pdf/1910.10683.pdf) by Colin Raffel, Noam Shazeer, Adam Roberts, Katherine Lee, Sharan Narang,
+Michael Matena, Yanqi Zhou, Wei Li, Peter J. Liu.

 The abstract from the paper is the following:

--- a/docs/source/en/model_doc/trocr.mdx
+++ b/docs/source/en/model_doc/trocr.mdx
@ -50,27 +50,6 @@ Tips:
  information, see the [official models](https://huggingface.co/models?other=trocr>).
 - TrOCR is always used within the [VisionEncoderDecoder](vision-encoder-decoder) framework.

-## Resources
-
-A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with TrOCR. If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! The resource should ideally demonstrate something new instead of duplicating an existing resource.
-
-<PipelineTag pipeline="text-classification"/>
-
- A blog post on [Accelerating Document AI](https://huggingface.co/blog/document-ai) with TrOCR.
- A blog post on how to [Document AI](https://github.com/philschmid/document-ai-transformers) with TrOCR.
- A notebook on how to [finetune TrOCR on IAM Handwriting Database using Seq2SeqTrainer](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/TrOCR/Fine_tune_TrOCR_on_IAM_Handwriting_Database_using_Seq2SeqTrainer.ipynb).
- A notebook on [inference with TrOCR](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/TrOCR/Inference_with_TrOCR_%2B_Gradio_demo.ipynb) and Gradio demo.
- A notebook on [finetune TrOCR on the IAM Handwriting Database](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/TrOCR/Fine_tune_TrOCR_on_IAM_Handwriting_Database_using_native_PyTorch.ipynb) using native PyTorch.
- A notebook on [evaluating TrOCR on the IAM test set](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/TrOCR/Evaluating_TrOCR_base_handwritten_on_the_IAM_test_set.ipynb).
-
-<PipelineTag pipeline="text-generation"/>
-
- [Casual language modeling](https://huggingface.co/docs/transformers/tasks/language_modeling) task guide.
-
-⚡️ Inference
-
- An interactive-demo on [TrOCR handwritten character recognition](https://huggingface.co/spaces/nielsr/TrOCR-handwritten).
-
 ## Inference

 TrOCR's [`VisionEncoderDecoder`] model accepts images as input and makes use of
--- a/docs/source/en/model_doc/wav2vec2.mdx
+++ b/docs/source/en/model_doc/wav2vec2.mdx
@ -197,11 +197,6 @@ Otherwise, [`~Wav2Vec2ProcessorWithLM.batch_decode`] performance will be slower
 [[autodoc]] TFWav2Vec2Model
    - call

-## TFWav2Vec2ForSequenceClassification
-
-[[autodoc]] TFWav2Vec2ForSequenceClassification
-    - call
-
 ## TFWav2Vec2ForCTC

 [[autodoc]] TFWav2Vec2ForCTC
--- a/docs/source/en/model_doc/whisper.mdx
+++ b/docs/source/en/model_doc/whisper.mdx
@ -105,9 +105,3 @@ The original code can be found [here](https://github.com/openai/whisper).

 [[autodoc]] FlaxWhisperForConditionalGeneration
    - __call__
-
-## FlaxWhisperForAudioClassification
-
-[[autodoc]] FlaxWhisperForAudioClassification
-    - __call__
-
--- a/docs/source/en/perf_infer_gpu_one.mdx
+++ b/docs/source/en/perf_infer_gpu_one.mdx
@ -11,28 +11,11 @@ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express o

 # Efficient Inference on a Single GPU

-In addition to this guide, relevant information can be found as well in [the guide for training on a single GPU](perf_train_gpu_one) and [the guide for inference on CPUs](perf_infer_cpu).
+This document will be completed soon with information on how to run inference on a single GPU. In the meantime you can check out [the guide for training on a single GPU](perf_train_gpu_one) and [the guide for inference on CPUs](perf_infer_cpu).

-## Better Transformer: PyTorch-native transformer fastpath
+## `BetterTransformer` for faster inference

-PyTorch-native [`nn.MultiHeadAttention`](https://pytorch.org/blog/a-better-transformer-for-fast-transformer-encoder-inference/) attention fastpath, called BetterTransformer, can be used with Transformers through the integration in the [🤗 Optimum library](https://huggingface.co/docs/optimum/bettertransformer/overview).
-
-PyTorch's attention fastpath allows to speed up inference through kernel fusions and the use of [nested tensors](https://pytorch.org/docs/stable/nested.html). Detailed benchmarks can be found in [this blog post](https://medium.com/pytorch/bettertransformer-out-of-the-box-performance-for-huggingface-transformers-3fbe27d50ab2).
-
-After installing the [`optimum`](https://github.com/huggingface/optimum) package, to use Better Transformer during inference, the relevant internal modules are replaced by calling [`~PreTrainedModel.to_bettertransformer`]:
-
-```python
-model = model.to_bettertransformer()
-```
-
-The method [`~PreTrainedModel.reverse_bettertransformer`] allows to go back to the original modeling, which should be used before saving the model in order to use the canonical transformers modeling:
-
-```python
-model = model.reverse_bettertransformer()
-model.save_pretrained("saved_model")
-```
-
-As of PyTorch 2.0, the attention fastpath is supported for both encoders and decoders. The list of supported architectures can be found [here](https://huggingface.co/docs/optimum/bettertransformer/overview#supported-models).
+We have recently integrated `BetterTransformer` for faster inference on GPU for text, image and audio models. Check the documentation about this integration [here](https://huggingface.co/docs/optimum/bettertransformer/overview) for more details.

 ## `bitsandbytes` integration for Int8 mixed-precision matrix decomposition

@ -88,7 +71,7 @@ model_name = "bigscience/bloom-2b5"
 tokenizer = AutoTokenizer.from_pretrained(model_name)
 model_8bit = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", load_in_8bit=True)

-prompt = "Hello, my llama is cute"
+prompt = "Hello, my llama is cute"
 inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
 generated_ids = model.generate(**inputs)
 outputs = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
@ -122,4 +105,4 @@ Check out the demo for running T5-11b (42GB in fp32)! Using 8-bit quantization o

 Or this demo for BLOOM-3B:

-[![Open In Colab: BLOOM-3b demo](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1qOjXfQIAULfKvZqwCen8-MoWKGdSatZ4?usp=sharing)
+[![Open In Colab: BLOOM-3b demo](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1qOjXfQIAULfKvZqwCen8-MoWKGdSatZ4?usp=sharing)
--- a/docs/source/en/perf_train_cpu_many.mdx
+++ b/docs/source/en/perf_train_cpu_many.mdx
@ -73,7 +73,7 @@ The following "Usage in Trainer" takes mpirun in Intel® MPI library as an examp


 ## Usage in Trainer
-To enable multi CPU distributed training in the Trainer with the ccl backend, users should add **`--ddp_backend ccl`** in the command arguments.
+To enable multi CPU distributed training in the Trainer with the ccl backend, users should add **`--xpu_backend ccl`** in the command arguments.

 Let's see an example with the [question-answering example](https://github.com/huggingface/transformers/tree/main/examples/pytorch/question-answering)

@ -95,7 +95,7 @@ The following command enables training with 2 processes on one Xeon node, with o
 --doc_stride 128  \
 --output_dir /tmp/debug_squad/ \
 --no_cuda \
- --ddp_backend ccl \
+ --xpu_backend ccl \
 --use_ipex
 ```
 The following command enables training with a total of four processes on two Xeons (node0 and node1, taking node0 as the main process), ppn (processes per node) is set to 2, with one process running per one socket. The variables OMP_NUM_THREADS/CCL_WORKER_COUNT can be tuned for optimal performance.
@ -124,7 +124,7 @@ Now, run the following command in node0 and **4DDP** will be enabled in node0 an
 --doc_stride 128  \
 --output_dir /tmp/debug_squad/ \
 --no_cuda \
- --ddp_backend ccl \
+ --xpu_backend ccl \
 --use_ipex \
 --bf16
 ```
--- a/docs/source/en/perf_train_gpu_many.mdx
+++ b/docs/source/en/perf_train_gpu_many.mdx
@ -272,7 +272,7 @@ It's easy to see from the bottom diagram how PP has less dead zones, where GPUs

 Both parts of the diagram show a parallelism that is of degree 4. That is 4 GPUs are participating in the pipeline. So there is the forward path of 4 pipe stages F0, F1, F2 and F3 and then the return reverse order backward path of B3, B2, B1 and B0.

-PP introduces a new hyper-parameter to tune and it's `chunks` which defines how many chunks of data are sent in a sequence through the same pipe stage. For example, in the bottom diagram you can see that `chunks=4`. GPU0 performs the same forward path on chunk 0, 1, 2 and 3 (F0,0, F0,1, F0,2, F0,3) and then it waits for other GPUs to do their work and only when their work is starting to be complete, GPU0 starts to work again doing the backward path for chunks 3, 2, 1 and 0 (B0,3, B0,2, B0,1, B0,0).
+PP introduces a new hyper-parameter to tune and it's `chunks` which defines how many chunks of data are sent in a sequence through the same pipe stage. For example, in the bottom diagram you can see that `chunks=4`. GPU0 performs the same forward path on chunk 0, 1, 2 and 3 (F0,0, F0,1, F0,2, F0,3) and then it waits for other GPUs to do their work and only when their work is starting to be complete, GPU0 starts to work again doing the backward path for chunks 3, 2, 1 and 0 (B0,3, B0,2, B0,1, B0,0).

 Note that conceptually this is the same concept as gradient accumulation steps (GAS). Pytorch uses `chunks`, whereas DeepSpeed refers to the same hyper-parameter as GAS.

--- a/docs/source/en/perf_train_gpu_one.mdx
+++ b/docs/source/en/perf_train_gpu_one.mdx
@ -718,18 +718,6 @@ For some applications, such as pretraining large language models, applying all t

 Another use case for training on many GPUs is if the model does not fit on a single GPU with all the mentioned tricks. There are still more methods we can apply although life starts to get a bit more complicated. This usually involves some form of pipeline or tensor parallelism where the model itself is distributed across several GPUs. One can also make use of DeepSpeed which implements some of these parallelism strategies along with some more optimization to reduce the memory footprint such as partitioning the optimizer states. You can read more about this in the ["Multi-GPU training" section](perf_train_gpu_many).

-## Using PyTorch native attention
-
-PyTorch 2.0 released the native [`torch.nn.functional.scaled_dot_product_attention`](https://pytorch.org/docs/master/generated/torch.nn.functional.scaled_dot_product_attention.html) (SDPA), that allows to use fused GPU kernels as [memory-efficient attention](https://arxiv.org/abs/2112.05682) and [flash attention](https://arxiv.org/abs/2205.14135).
-
-After installing the [`optimum`](https://github.com/huggingface/optimum) package, the relevant internal modules can be replaced to use PyTorch's native attention with:
-
-```python
-model = model.to_bettertransformer()
-```
-
-Training can then be done as usual.
-
 ## Using torch.compile

 PyTorch 2.0 introduces a new compile function, you can learn more about it [in their documentation](https://pytorch.org/get-started/pytorch-2.0/). It uses Python’s frame evaluation API to automatically create a graph from existing PyTorch programs. After capturing the graph, different backends can be deployed to lower the graph to an optimized engine. You can choose one option below for performance boost.
--- a/docs/source/en/perplexity.mdx
+++ b/docs/source/en/perplexity.mdx
@ -115,10 +115,11 @@ for begin_loc in tqdm(range(0, seq_len, stride)):
    with torch.no_grad():
        outputs = model(input_ids, labels=target_ids)

-        # loss is calculated using CrossEntropyLoss which averages over valid labels
-        # N.B. the model only calculates loss over trg_len - 1 labels, because it internally shifts the labels
-        # to the left by 1.
-        neg_log_likelihood = outputs.loss
+        # loss is calculated using CrossEntropyLoss which averages over input tokens.
+        # Multiply it with trg_len to get the summation instead of average.
+        # We will take average over all the tokens to get the true average
+        # in the last step of this example.
+        neg_log_likelihood = outputs.loss * trg_len

    nlls.append(neg_log_likelihood)

@ -126,14 +127,14 @@ for begin_loc in tqdm(range(0, seq_len, stride)):
    if end_loc == seq_len:
        break

-ppl = torch.exp(torch.stack(nlls).mean())
+ppl = torch.exp(torch.stack(nlls).sum() / end_loc)
 ```

 Running this with the stride length equal to the max input length is equivalent to the suboptimal, non-sliding-window
 strategy we discussed above. The smaller the stride, the more context the model will have in making each prediction,
 and the better the reported perplexity will typically be.

-When we run the above with `stride = 1024`, i.e. no overlap, the resulting PPL is `19.44`, which is about the same
+When we run the above with `stride = 1024`, i.e. no overlap, the resulting PPL is `19.64`, which is about the same
 as the `19.93` reported in the GPT-2 paper. By using `stride = 512` and thereby employing our striding window
-strategy, this jumps down to `16.45`. This is not only a more favorable score, but is calculated in a way that is
+strategy, this jumps down to `16.44`. This is not only a more favorable score, but is calculated in a way that is
 closer to the true autoregressive decomposition of a sequence likelihood.
--- a/docs/source/en/pipeline_tutorial.mdx
+++ b/docs/source/en/pipeline_tutorial.mdx
@ -81,10 +81,10 @@ If you want to iterate over a whole dataset, or want to use it for inference in
 In general you can specify parameters anywhere you want:

 ```py
-generator = pipeline(model="openai/whisper-large", my_parameter=1)
-out = generator(...)  # This will use `my_parameter=1`.
-out = generator(..., my_parameter=2)  # This will override and use `my_parameter=2`.
-out = generator(...)  # This will go back to using `my_parameter=1`.
+generator = pipeline(model="openai/whisper-large", my_parameter=1)
+out = generator(...)  # This will use `my_parameter=1`.
+out = generator(..., my_parameter=2)  # This will override and use `my_parameter=2`.
+out = generator(...)  # This will go back to using `my_parameter=1`.
 ```

 Let's check out 3 important ones:
@ -95,14 +95,14 @@ If you use `device=n`, the pipeline automatically puts the model on the specifie
 This will work regardless of whether you are using PyTorch or Tensorflow.

 ```py
-generator = pipeline(model="openai/whisper-large", device=0)
+generator = pipeline(model="openai/whisper-large", device=0)
 ```

 If the model is too large for a single GPU, you can set `device_map="auto"` to allow 🤗 [Accelerate](https://huggingface.co/docs/accelerate) to automatically determine how to load and store the model weights.

 ```py
 #!pip install accelerate
-generator = pipeline(model="openai/whisper-large", device_map="auto")
+generator = pipeline(model="openai/whisper-large", device_map="auto")
 ```

 Note that if  `device_map="auto"` is passed, there is no need to add the argument `device=device` when instantiating your `pipeline` as you may encounter some unexpected behavior!
@ -114,7 +114,7 @@ By default, pipelines will not batch inference for reasons explained in detail [
 But if it works in your use case, you can use:

 ```py
-generator = pipeline(model="openai/whisper-large", device=0, batch_size=2)
+generator = pipeline(model="openai/whisper-large", device=0, batch_size=2)
 audio_filenames = [f"audio_{i}.flac" for i in range(10)]
 texts = generator(audio_filenames)
 ```
@ -287,4 +287,4 @@ pipe = pipeline(model="facebook/opt-1.3b", device_map="auto", model_kwargs={"loa
 output = pipe("This is a cool example!", do_sample=True, top_p=0.95)
 ```

-Note that you can replace the checkpoint with any of the Hugging Face model that supports large model loading such as BLOOM!
+Note that you can replace the checkpoint with any Hugging Face model that supports large model loading, such as BLOOM!
--- a/docs/source/en/pr_checks.mdx
+++ b/docs/source/en/pr_checks.mdx
@ -24,7 +24,7 @@ When you open a pull request on 🤗 Transformers, a fair number of checks will

 In this document, we will take a stab at explaining what those various checks are and the reason behind them, as well as how to debug them locally if one of them fails on your PR.

-Note that, ideally, they require you to have a dev install:
+Note that they all require you to have a dev install:

 ```bash
 pip install transformers[dev]
@ -36,18 +36,7 @@ or for an editable install:
 pip install -e .[dev]
 ```

-inside the Transformers repo. Since the number of optional dependencies of Transformers has grown a lot, it's possible you don't manage to get all of them. If the dev install fails, make sure to install the Deep Learning framework you are working with (PyTorch, TensorFlow and/or Flax) then do
-
-```bash
-pip install transformers[quality]
-```
-
-or for an editable install:
-
-```bash
-pip install -e .[quality]
-```
-
+inside the Transformers repo.

 ## Tests

--- a/docs/source/en/preprocessing.mdx
+++ b/docs/source/en/preprocessing.mdx
@ -41,7 +41,7 @@ The main tool for preprocessing textual data is a [tokenizer](main_classes/token

 <Tip>

-If you plan on using a pretrained model, it's important to use the associated pretrained tokenizer. This ensures the text is split the same way as the pretraining corpus, and uses the same corresponding tokens-to-index (usually referred to as the *vocab*) during pretraining.
+If you plan on using a pretrained model, it's important to use the associated pretrained tokenizer. This ensures the text is split the same way as the pretraining corpus, and uses the same corresponding tokens-to-index (usually referred to as the *vocab*) during pretraining.

 </Tip>

--- a/docs/source/en/quicktour.mdx
+++ b/docs/source/en/quicktour.mdx
@ -528,7 +528,7 @@ All models are a standard [`tf.keras.Model`](https://www.tensorflow.org/api_docs
   ```py
   >>> dataset = dataset.map(tokenize_dataset)  # doctest: +SKIP
   >>> tf_dataset = model.prepare_tf_dataset(
-   ...     dataset["train"], batch_size=16, shuffle=True, tokenizer=tokenizer
+   ...     dataset["train"], batch_size=16, shuffle=True, tokenizer=tokenizer
   ... )  # doctest: +SKIP
   ```

@ -538,7 +538,7 @@ All models are a standard [`tf.keras.Model`](https://www.tensorflow.org/api_docs
   >>> from tensorflow.keras.optimizers import Adam

   >>> model.compile(optimizer=Adam(3e-5))
-   >>> model.fit(tf_dataset)  # doctest: +SKIP
+   >>> model.fit(tf_dataset)  # doctest: +SKIP
   ```

 ## What's next?
--- a/docs/source/en/serialization.mdx
+++ b/docs/source/en/serialization.mdx
@ -112,7 +112,6 @@ Ready-made configurations include the following architectures:
 - RoFormer
 - SegFormer
 - SqueezeBERT
-- SwiftFormer
 - Swin Transformer
 - T5
 - Table Transformer
--- a/docs/source/en/tasks/asr.mdx
+++ b/docs/source/en/tasks/asr.mdx
@ -282,7 +282,7 @@ At this point, only three steps remain:
 ...     args=training_args,
 ...     train_dataset=encoded_minds["train"],
 ...     eval_dataset=encoded_minds["test"],
-...     tokenizer=processor,
+...     tokenizer=processor.feature_extractor,
 ...     data_collator=data_collator,
 ...     compute_metrics=compute_metrics,
 ... )
--- a/docs/source/en/tasks/image_classification.mdx
+++ b/docs/source/en/tasks/image_classification.mdx
@ -30,7 +30,7 @@ The task illustrated in this tutorial is supported by the following model archit

 <!--This tip is automatically generated by `make fix-copies`, do not fill manually!-->

-[BEiT](../model_doc/beit), [BiT](../model_doc/bit), [ConvNeXT](../model_doc/convnext), [ConvNeXTV2](../model_doc/convnextv2), [CvT](../model_doc/cvt), [Data2VecVision](../model_doc/data2vec-vision), [DeiT](../model_doc/deit), [DiNAT](../model_doc/dinat), [EfficientFormer](../model_doc/efficientformer), [EfficientNet](../model_doc/efficientnet), [FocalNet](../model_doc/focalnet), [ImageGPT](../model_doc/imagegpt), [LeViT](../model_doc/levit), [MobileNetV1](../model_doc/mobilenet_v1), [MobileNetV2](../model_doc/mobilenet_v2), [MobileViT](../model_doc/mobilevit), [NAT](../model_doc/nat), [Perceiver](../model_doc/perceiver), [PoolFormer](../model_doc/poolformer), [RegNet](../model_doc/regnet), [ResNet](../model_doc/resnet), [SegFormer](../model_doc/segformer), [SwiftFormer](../model_doc/swiftformer), [Swin Transformer](../model_doc/swin), [Swin Transformer V2](../model_doc/swinv2), [VAN](../model_doc/van), [ViT](../model_doc/vit), [ViT Hybrid](../model_doc/vit_hybrid), [ViTMSN](../model_doc/vit_msn)
+[BEiT](../model_doc/beit), [BiT](../model_doc/bit), [ConvNeXT](../model_doc/convnext), [ConvNeXTV2](../model_doc/convnextv2), [CvT](../model_doc/cvt), [Data2VecVision](../model_doc/data2vec-vision), [DeiT](../model_doc/deit), [DiNAT](../model_doc/dinat), [EfficientFormer](../model_doc/efficientformer), [EfficientNet](../model_doc/efficientnet), [ImageGPT](../model_doc/imagegpt), [LeViT](../model_doc/levit), [MobileNetV1](../model_doc/mobilenet_v1), [MobileNetV2](../model_doc/mobilenet_v2), [MobileViT](../model_doc/mobilevit), [NAT](../model_doc/nat), [Perceiver](../model_doc/perceiver), [PoolFormer](../model_doc/poolformer), [RegNet](../model_doc/regnet), [ResNet](../model_doc/resnet), [SegFormer](../model_doc/segformer), [Swin Transformer](../model_doc/swin), [Swin Transformer V2](../model_doc/swinv2), [VAN](../model_doc/van), [ViT](../model_doc/vit), [ViT Hybrid](../model_doc/vit_hybrid), [ViTMSN](../model_doc/vit_msn)
 <!--End of the generated tip-->

 </Tip>
@ -385,12 +385,12 @@ Convert your datasets to the `tf.data.Dataset` format using the [`~datasets.Data
 ```py
 >>> # converting our train dataset to tf.data.Dataset
 >>> tf_train_dataset = food["train"].to_tf_dataset(
-...     columns="pixel_values", label_cols="label", shuffle=True, batch_size=batch_size, collate_fn=data_collator
+...     columns=["pixel_values"], label_cols=["label"], shuffle=True, batch_size=batch_size, collate_fn=data_collator
 ... )

 >>> # converting our test dataset to tf.data.Dataset
 >>> tf_eval_dataset = food["test"].to_tf_dataset(
-...     columns="pixel_values", label_cols="label", shuffle=True, batch_size=batch_size, collate_fn=data_collator
+...     columns=["pixel_values"], label_cols=["label"], shuffle=True, batch_size=batch_size, collate_fn=data_collator
 ... )
 ```

--- a/docs/source/en/tasks/language_modeling.mdx
+++ b/docs/source/en/tasks/language_modeling.mdx
@ -33,8 +33,8 @@ You can finetune other architectures for causal language modeling following the
 Choose one of the following architectures:

 <!--This tip is automatically generated by `make fix-copies`, do not fill manually!-->
-[BART](../model_doc/bart), [BERT](../model_doc/bert), [Bert Generation](../model_doc/bert-generation), [BigBird](../model_doc/big_bird), [BigBird-Pegasus](../model_doc/bigbird_pegasus), [BioGpt](../model_doc/biogpt), [Blenderbot](../model_doc/blenderbot), [BlenderbotSmall](../model_doc/blenderbot-small), [BLOOM](../model_doc/bloom), [CamemBERT](../model_doc/camembert), [CodeGen](../model_doc/codegen), [CPM-Ant](../model_doc/cpmant), [CTRL](../model_doc/ctrl), [Data2VecText](../model_doc/data2vec-text), [ELECTRA](../model_doc/electra), [ERNIE](../model_doc/ernie), [GIT](../model_doc/git), [GPT-Sw3](../model_doc/gpt-sw3), [OpenAI GPT-2](../model_doc/gpt2), [GPTBigCode](../model_doc/gpt_bigcode), [GPT Neo](../model_doc/gpt_neo), [GPT NeoX](../model_doc/gpt_neox), [GPT NeoX Japanese](../model_doc/gpt_neox_japanese), [GPT-J](../model_doc/gptj), [LLaMA](../model_doc/llama), [Marian](../model_doc/marian), [mBART](../model_doc/mbart), [MEGA](../model_doc/mega), [Megatron-BERT](../model_doc/megatron-bert), [MVP](../model_doc/mvp), [OpenLlama](../model_doc/open-llama), [OpenAI GPT](../model_doc/openai-gpt), [OPT](../model_doc/opt), [Pegasus](../model_doc/pegasus), [PLBart](../model_doc/plbart), [ProphetNet](../model_doc/prophetnet), [QDQBert](../model_doc/qdqbert), [Reformer](../model_doc/reformer), [RemBERT](../model_doc/rembert), [RoBERTa](../model_doc/roberta), [RoBERTa-PreLayerNorm](../model_doc/roberta-prelayernorm), [RoCBert](../model_doc/roc_bert), [RoFormer](../model_doc/roformer), [RWKV](../model_doc/rwkv), [Speech2Text2](../model_doc/speech_to_text_2), [Transformer-XL](../model_doc/transfo-xl), [TrOCR](../model_doc/trocr), [XGLM](../model_doc/xglm), [XLM](../model_doc/xlm), [XLM-ProphetNet](../model_doc/xlm-prophetnet), [XLM-RoBERTa](../model_doc/xlm-roberta), [XLM-RoBERTa-XL](../model_doc/xlm-roberta-xl), [XLNet](../model_doc/xlnet), [X-MOD](../model_doc/xmod)

+[BART](../model_doc/bart), [BERT](../model_doc/bert), [Bert Generation](../model_doc/bert-generation), [BigBird](../model_doc/big_bird), [BigBird-Pegasus](../model_doc/bigbird_pegasus), [BioGpt](../model_doc/biogpt), [Blenderbot](../model_doc/blenderbot), [BlenderbotSmall](../model_doc/blenderbot-small), [BLOOM](../model_doc/bloom), [CamemBERT](../model_doc/camembert), [CodeGen](../model_doc/codegen), [CTRL](../model_doc/ctrl), [Data2VecText](../model_doc/data2vec-text), [ELECTRA](../model_doc/electra), [ERNIE](../model_doc/ernie), [GIT](../model_doc/git), [GPT-Sw3](../model_doc/gpt-sw3), [OpenAI GPT-2](../model_doc/gpt2), [GPT Neo](../model_doc/gpt_neo), [GPT NeoX](../model_doc/gpt_neox), [GPT NeoX Japanese](../model_doc/gpt_neox_japanese), [GPT-J](../model_doc/gptj), [LLaMA](../model_doc/llama), [Marian](../model_doc/marian), [mBART](../model_doc/mbart), [MEGA](../model_doc/mega), [Megatron-BERT](../model_doc/megatron-bert), [MVP](../model_doc/mvp), [OpenAI GPT](../model_doc/openai-gpt), [OPT](../model_doc/opt), [Pegasus](../model_doc/pegasus), [PLBart](../model_doc/plbart), [ProphetNet](../model_doc/prophetnet), [QDQBert](../model_doc/qdqbert), [Reformer](../model_doc/reformer), [RemBERT](../model_doc/rembert), [RoBERTa](../model_doc/roberta), [RoBERTa-PreLayerNorm](../model_doc/roberta-prelayernorm), [RoCBert](../model_doc/roc_bert), [RoFormer](../model_doc/roformer), [Speech2Text2](../model_doc/speech_to_text_2), [Transformer-XL](../model_doc/transfo-xl), [TrOCR](../model_doc/trocr), [XGLM](../model_doc/xglm), [XLM](../model_doc/xlm), [XLM-ProphetNet](../model_doc/xlm-prophetnet), [XLM-RoBERTa](../model_doc/xlm-roberta), [XLM-RoBERTa-XL](../model_doc/xlm-roberta-xl), [XLNet](../model_doc/xlnet), [X-MOD](../model_doc/xmod)

 <!--End of the generated tip-->

--- a/docs/source/en/tasks/question_answering.mdx
+++ b/docs/source/en/tasks/question_answering.mdx
@ -31,7 +31,7 @@ The task illustrated in this tutorial is supported by the following model archit

 <!--This tip is automatically generated by `make fix-copies`, do not fill manually!-->

-[ALBERT](../model_doc/albert), [BART](../model_doc/bart), [BERT](../model_doc/bert), [BigBird](../model_doc/big_bird), [BigBird-Pegasus](../model_doc/bigbird_pegasus), [BLOOM](../model_doc/bloom), [CamemBERT](../model_doc/camembert), [CANINE](../model_doc/canine), [ConvBERT](../model_doc/convbert), [Data2VecText](../model_doc/data2vec-text), [DeBERTa](../model_doc/deberta), [DeBERTa-v2](../model_doc/deberta-v2), [DistilBERT](../model_doc/distilbert), [ELECTRA](../model_doc/electra), [ERNIE](../model_doc/ernie), [ErnieM](../model_doc/ernie_m), [FlauBERT](../model_doc/flaubert), [FNet](../model_doc/fnet), [Funnel Transformer](../model_doc/funnel), [OpenAI GPT-2](../model_doc/gpt2), [GPT Neo](../model_doc/gpt_neo), [GPT NeoX](../model_doc/gpt_neox), [GPT-J](../model_doc/gptj), [I-BERT](../model_doc/ibert), [LayoutLMv2](../model_doc/layoutlmv2), [LayoutLMv3](../model_doc/layoutlmv3), [LED](../model_doc/led), [LiLT](../model_doc/lilt), [Longformer](../model_doc/longformer), [LUKE](../model_doc/luke), [LXMERT](../model_doc/lxmert), [MarkupLM](../model_doc/markuplm), [mBART](../model_doc/mbart), [MEGA](../model_doc/mega), [Megatron-BERT](../model_doc/megatron-bert), [MobileBERT](../model_doc/mobilebert), [MPNet](../model_doc/mpnet), [MVP](../model_doc/mvp), [Nezha](../model_doc/nezha), [Nyströmformer](../model_doc/nystromformer), [OPT](../model_doc/opt), [QDQBert](../model_doc/qdqbert), [Reformer](../model_doc/reformer), [RemBERT](../model_doc/rembert), [RoBERTa](../model_doc/roberta), [RoBERTa-PreLayerNorm](../model_doc/roberta-prelayernorm), [RoCBert](../model_doc/roc_bert), [RoFormer](../model_doc/roformer), [Splinter](../model_doc/splinter), [SqueezeBERT](../model_doc/squeezebert), [XLM](../model_doc/xlm), [XLM-RoBERTa](../model_doc/xlm-roberta), [XLM-RoBERTa-XL](../model_doc/xlm-roberta-xl), [XLNet](../model_doc/xlnet), [X-MOD](../model_doc/xmod), [YOSO](../model_doc/yoso)
+[ALBERT](../model_doc/albert), [BART](../model_doc/bart), [BERT](../model_doc/bert), [BigBird](../model_doc/big_bird), [BigBird-Pegasus](../model_doc/bigbird_pegasus), [BLOOM](../model_doc/bloom), [CamemBERT](../model_doc/camembert), [CANINE](../model_doc/canine), [ConvBERT](../model_doc/convbert), [Data2VecText](../model_doc/data2vec-text), [DeBERTa](../model_doc/deberta), [DeBERTa-v2](../model_doc/deberta-v2), [DistilBERT](../model_doc/distilbert), [ELECTRA](../model_doc/electra), [ERNIE](../model_doc/ernie), [ErnieM](../model_doc/ernie_m), [FlauBERT](../model_doc/flaubert), [FNet](../model_doc/fnet), [Funnel Transformer](../model_doc/funnel), [GPT-J](../model_doc/gptj), [I-BERT](../model_doc/ibert), [LayoutLMv2](../model_doc/layoutlmv2), [LayoutLMv3](../model_doc/layoutlmv3), [LED](../model_doc/led), [LiLT](../model_doc/lilt), [Longformer](../model_doc/longformer), [LUKE](../model_doc/luke), [LXMERT](../model_doc/lxmert), [MarkupLM](../model_doc/markuplm), [mBART](../model_doc/mbart), [MEGA](../model_doc/mega), [Megatron-BERT](../model_doc/megatron-bert), [MobileBERT](../model_doc/mobilebert), [MPNet](../model_doc/mpnet), [MVP](../model_doc/mvp), [Nezha](../model_doc/nezha), [Nyströmformer](../model_doc/nystromformer), [OPT](../model_doc/opt), [QDQBert](../model_doc/qdqbert), [Reformer](../model_doc/reformer), [RemBERT](../model_doc/rembert), [RoBERTa](../model_doc/roberta), [RoBERTa-PreLayerNorm](../model_doc/roberta-prelayernorm), [RoCBert](../model_doc/roc_bert), [RoFormer](../model_doc/roformer), [Splinter](../model_doc/splinter), [SqueezeBERT](../model_doc/squeezebert), [XLM](../model_doc/xlm), [XLM-RoBERTa](../model_doc/xlm-roberta), [XLM-RoBERTa-XL](../model_doc/xlm-roberta-xl), [XLNet](../model_doc/xlnet), [X-MOD](../model_doc/xmod), [YOSO](../model_doc/yoso)


 <!--End of the generated tip-->
--- a/docs/source/en/tasks/sequence_classification.mdx
+++ b/docs/source/en/tasks/sequence_classification.mdx
@ -28,7 +28,7 @@ The task illustrated in this tutorial is supported by the following model archit

 <!--This tip is automatically generated by `make fix-copies`, do not fill manually!-->

-[ALBERT](../model_doc/albert), [BART](../model_doc/bart), [BERT](../model_doc/bert), [BigBird](../model_doc/big_bird), [BigBird-Pegasus](../model_doc/bigbird_pegasus), [BioGpt](../model_doc/biogpt), [BLOOM](../model_doc/bloom), [CamemBERT](../model_doc/camembert), [CANINE](../model_doc/canine), [ConvBERT](../model_doc/convbert), [CTRL](../model_doc/ctrl), [Data2VecText](../model_doc/data2vec-text), [DeBERTa](../model_doc/deberta), [DeBERTa-v2](../model_doc/deberta-v2), [DistilBERT](../model_doc/distilbert), [ELECTRA](../model_doc/electra), [ERNIE](../model_doc/ernie), [ErnieM](../model_doc/ernie_m), [ESM](../model_doc/esm), [FlauBERT](../model_doc/flaubert), [FNet](../model_doc/fnet), [Funnel Transformer](../model_doc/funnel), [GPT-Sw3](../model_doc/gpt-sw3), [OpenAI GPT-2](../model_doc/gpt2), [GPTBigCode](../model_doc/gpt_bigcode), [GPT Neo](../model_doc/gpt_neo), [GPT NeoX](../model_doc/gpt_neox), [GPT-J](../model_doc/gptj), [I-BERT](../model_doc/ibert), [LayoutLM](../model_doc/layoutlm), [LayoutLMv2](../model_doc/layoutlmv2), [LayoutLMv3](../model_doc/layoutlmv3), [LED](../model_doc/led), [LiLT](../model_doc/lilt), [LLaMA](../model_doc/llama), [Longformer](../model_doc/longformer), [LUKE](../model_doc/luke), [MarkupLM](../model_doc/markuplm), [mBART](../model_doc/mbart), [MEGA](../model_doc/mega), [Megatron-BERT](../model_doc/megatron-bert), [MobileBERT](../model_doc/mobilebert), [MPNet](../model_doc/mpnet), [MVP](../model_doc/mvp), [Nezha](../model_doc/nezha), [Nyströmformer](../model_doc/nystromformer), [OpenLlama](../model_doc/open-llama), [OpenAI GPT](../model_doc/openai-gpt), [OPT](../model_doc/opt), [Perceiver](../model_doc/perceiver), [PLBart](../model_doc/plbart), [QDQBert](../model_doc/qdqbert), [Reformer](../model_doc/reformer), [RemBERT](../model_doc/rembert), [RoBERTa](../model_doc/roberta), [RoBERTa-PreLayerNorm](../model_doc/roberta-prelayernorm), [RoCBert](../model_doc/roc_bert), [RoFormer](../model_doc/roformer), 
[SqueezeBERT](../model_doc/squeezebert), [TAPAS](../model_doc/tapas), [Transformer-XL](../model_doc/transfo-xl), [XLM](../model_doc/xlm), [XLM-RoBERTa](../model_doc/xlm-roberta), [XLM-RoBERTa-XL](../model_doc/xlm-roberta-xl), [XLNet](../model_doc/xlnet), [X-MOD](../model_doc/xmod), [YOSO](../model_doc/yoso)
+[ALBERT](../model_doc/albert), [BART](../model_doc/bart), [BERT](../model_doc/bert), [BigBird](../model_doc/big_bird), [BigBird-Pegasus](../model_doc/bigbird_pegasus), [BLOOM](../model_doc/bloom), [CamemBERT](../model_doc/camembert), [CANINE](../model_doc/canine), [ConvBERT](../model_doc/convbert), [CTRL](../model_doc/ctrl), [Data2VecText](../model_doc/data2vec-text), [DeBERTa](../model_doc/deberta), [DeBERTa-v2](../model_doc/deberta-v2), [DistilBERT](../model_doc/distilbert), [ELECTRA](../model_doc/electra), [ERNIE](../model_doc/ernie), [ErnieM](../model_doc/ernie_m), [ESM](../model_doc/esm), [FlauBERT](../model_doc/flaubert), [FNet](../model_doc/fnet), [Funnel Transformer](../model_doc/funnel), [GPT-Sw3](../model_doc/gpt-sw3), [OpenAI GPT-2](../model_doc/gpt2), [GPT Neo](../model_doc/gpt_neo), [GPT-J](../model_doc/gptj), [I-BERT](../model_doc/ibert), [LayoutLM](../model_doc/layoutlm), [LayoutLMv2](../model_doc/layoutlmv2), [LayoutLMv3](../model_doc/layoutlmv3), [LED](../model_doc/led), [LiLT](../model_doc/lilt), [LLaMA](../model_doc/llama), [Longformer](../model_doc/longformer), [LUKE](../model_doc/luke), [MarkupLM](../model_doc/markuplm), [mBART](../model_doc/mbart), [MEGA](../model_doc/mega), [Megatron-BERT](../model_doc/megatron-bert), [MobileBERT](../model_doc/mobilebert), [MPNet](../model_doc/mpnet), [MVP](../model_doc/mvp), [Nezha](../model_doc/nezha), [Nyströmformer](../model_doc/nystromformer), [OpenAI GPT](../model_doc/openai-gpt), [OPT](../model_doc/opt), [Perceiver](../model_doc/perceiver), [PLBart](../model_doc/plbart), [QDQBert](../model_doc/qdqbert), [Reformer](../model_doc/reformer), [RemBERT](../model_doc/rembert), [RoBERTa](../model_doc/roberta), [RoBERTa-PreLayerNorm](../model_doc/roberta-prelayernorm), [RoCBert](../model_doc/roc_bert), [RoFormer](../model_doc/roformer), [SqueezeBERT](../model_doc/squeezebert), [TAPAS](../model_doc/tapas), [Transformer-XL](../model_doc/transfo-xl), [XLM](../model_doc/xlm), 
[XLM-RoBERTa](../model_doc/xlm-roberta), [XLM-RoBERTa-XL](../model_doc/xlm-roberta-xl), [XLNet](../model_doc/xlnet), [X-MOD](../model_doc/xmod), [YOSO](../model_doc/yoso)


 <!--End of the generated tip-->
--- a/docs/source/en/tasks/summarization.mdx
+++ b/docs/source/en/tasks/summarization.mdx
@ -31,7 +31,7 @@ The task illustrated in this tutorial is supported by the following model archit

 <!--This tip is automatically generated by `make fix-copies`, do not fill manually!-->

-[BART](../model_doc/bart), [BigBird-Pegasus](../model_doc/bigbird_pegasus), [Blenderbot](../model_doc/blenderbot), [BlenderbotSmall](../model_doc/blenderbot-small), [Encoder decoder](../model_doc/encoder-decoder), [FairSeq Machine-Translation](../model_doc/fsmt), [GPTSAN-japanese](../model_doc/gptsan-japanese), [LED](../model_doc/led), [LongT5](../model_doc/longt5), [M2M100](../model_doc/m2m_100), [Marian](../model_doc/marian), [mBART](../model_doc/mbart), [MT5](../model_doc/mt5), [MVP](../model_doc/mvp), [NLLB](../model_doc/nllb), [NLLB-MOE](../model_doc/nllb-moe), [Pegasus](../model_doc/pegasus), [PEGASUS-X](../model_doc/pegasus_x), [PLBart](../model_doc/plbart), [ProphetNet](../model_doc/prophetnet), [SwitchTransformers](../model_doc/switch_transformers), [T5](../model_doc/t5), [XLM-ProphetNet](../model_doc/xlm-prophetnet)
+[BART](../model_doc/bart), [BigBird-Pegasus](../model_doc/bigbird_pegasus), [Blenderbot](../model_doc/blenderbot), [BlenderbotSmall](../model_doc/blenderbot-small), [Encoder decoder](../model_doc/encoder-decoder), [FairSeq Machine-Translation](../model_doc/fsmt), [GPTSAN-japanese](../model_doc/gptsan-japanese), [LED](../model_doc/led), [LongT5](../model_doc/longt5), [M2M100](../model_doc/m2m_100), [Marian](../model_doc/marian), [mBART](../model_doc/mbart), [MT5](../model_doc/mt5), [MVP](../model_doc/mvp), [NLLB](../model_doc/nllb), [Pegasus](../model_doc/pegasus), [PEGASUS-X](../model_doc/pegasus_x), [PLBart](../model_doc/plbart), [ProphetNet](../model_doc/prophetnet), [SwitchTransformers](../model_doc/switch_transformers), [T5](../model_doc/t5), [XLM-ProphetNet](../model_doc/xlm-prophetnet)

 <!--End of the generated tip-->

--- a/docs/source/en/tasks/text-to-speech.mdx
+++ b/docs/source/en/tasks/text-to-speech.mdx
@ -1,558 +0,0 @@
-<!--Copyright 2023 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
--->
-
-# Text to speech
-
-[[open-in-colab]]
-
-Text-to-speech (TTS) is the task of creating natural-sounding speech from text, where the speech can be generated in multiple 
-languages and for multiple speakers. The only text-to-speech model currently available in 🤗 Transformers 
-is [SpeechT5](model_doc/speecht5), though more will be added in the future. SpeechT5 is pre-trained on a combination of 
-speech-to-text and text-to-speech data, allowing it to learn a unified space of hidden representations shared by both text 
-and speech. This means that the same pre-trained model can be fine-tuned for different tasks. Furthermore, SpeechT5 
-supports multiple speakers through x-vector speaker embeddings. 
-
-This guide illustrates how to:
-
-1. Fine-tune [SpeechT5](model_doc/speecht5) that was originally trained on English speech on the Dutch (`nl`) language subset of the [VoxPopuli](https://huggingface.co/datasets/facebook/voxpopuli) dataset.
-2. Use your fine-tuned model for inference.
-
-Before you begin, make sure you have all the necessary libraries installed:
-
-```bash
-pip install datasets soundfile speechbrain accelerate
-```
-
-Install 🤗Transformers from source as not all the SpeechT5 features have been merged into an official release yet:
-
-```bash
-pip install git+https://github.com/huggingface/transformers.git
-```
-
-<Tip>
-
-To follow this guide you will need a GPU. If you're working in a notebook, run the following line to check if a GPU is available: 
-
-```bash
-!nvidia-smi
-```
-
-</Tip>
-
-We encourage you to log in to your Hugging Face account to upload and share your model with the community. When prompted, enter your token to log in:
-
-```py
->>> from huggingface_hub import notebook_login
-
->>> notebook_login()
-```
-
-## Load the dataset
-
-[VoxPopuli](https://huggingface.co/datasets/facebook/voxpopuli) is a large-scale multilingual speech corpus consisting of 
-data sourced from 2009-2020 European Parliament event recordings. It contains labelled audio-transcription data for 15 
-European languages. In this guide, we are using the Dutch language subset, feel free to pick another subset. 
-
-Note that VoxPopuli or any other automated speech recognition (ASR) dataset may not be the most suitable 
-option for training TTS models. The features that make it beneficial for ASR, such as excessive background noise, are 
-typically undesirable in TTS. However, finding top-quality, multilingual, and multi-speaker TTS datasets can be quite 
-challenging.
-
-Let's load the data:
-
-```py
->>> from datasets import load_dataset, Audio
-
->>> dataset = load_dataset("facebook/voxpopuli", "nl", split="train")
->>> len(dataset)
-20968
-```
-
-20968 examples should be sufficient for fine-tuning. SpeechT5 expects audio data to have a sampling rate of 16 kHz, so 
-make sure the examples in the dataset meet this requirement:
-
-```py
-dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))
-```
-
-## Preprocess the data
-
-Let's begin by defining the model checkpoint to use and loading the appropriate processor: 
-
-```py
->>> from transformers import SpeechT5Processor
-
->>> checkpoint = "microsoft/speecht5_tts"
->>> processor = SpeechT5Processor.from_pretrained(checkpoint)
-```
-
-### Text cleanup for SpeechT5 tokenization 
-
-Start by cleaning up the text data. You'll need the tokenizer part of the processor to process the text:
-
-```py
->>> tokenizer = processor.tokenizer
-```
-
-The dataset examples contain `raw_text` and `normalized_text` features. When deciding which feature to use as the text input, 
-consider that the SpeechT5 tokenizer doesn't have any tokens for numbers. In `normalized_text` the numbers are written 
-out as text. Thus, it is a better fit, and we recommend using    `normalized_text` as input text.
-
-Because SpeechT5 was trained on the English language, it may not recognize certain characters in the Dutch dataset. If 
-left as is, these characters will be converted to `<unk>` tokens. However, in Dutch, certain characters like `à` are 
-used to stress syllables. In order to preserve the meaning of the text, we can replace this character with a regular `a`.
-
-To identify unsupported tokens, extract all unique characters in the dataset using the `SpeechT5Tokenizer` which 
-works with characters as tokens. To do this, write the `extract_all_chars` mapping function that concatenates 
-the transcriptions from all examples into one string and converts it to a set of characters. 
-Make sure to set `batched=True` and `batch_size=-1` in `dataset.map()` so that all transcriptions are available at once for 
-the mapping function.
-
-```py
->>> def extract_all_chars(batch):
-...     all_text = " ".join(batch["normalized_text"])
-...     vocab = list(set(all_text))
-...     return {"vocab": [vocab], "all_text": [all_text]}
-
-
->>> vocabs = dataset.map(
-...     extract_all_chars,
-...     batched=True,
-...     batch_size=-1,
-...     keep_in_memory=True,
-...     remove_columns=dataset.column_names,
-... )
-
->>> dataset_vocab = set(vocabs["vocab"][0])
->>> tokenizer_vocab = {k for k, _ in tokenizer.get_vocab().items()}
-```
-
-Now you have two sets of characters: one with the vocabulary from the dataset and one with the vocabulary from the tokenizer. 
-To identify any unsupported characters in the dataset, you can take the difference between these two sets. The resulting 
-set will contain the characters that are in the dataset but not in the tokenizer.
-
-```py
->>> dataset_vocab - tokenizer_vocab
-{' ', 'à', 'ç', 'è', 'ë', 'í', 'ï', 'ö', 'ü'}
-```
-
-To handle the unsupported characters identified in the previous step, define a function that maps these characters to 
-valid tokens. Note that spaces are already replaced by `▁` in the tokenizer and don't need to be handled separately.
-
-```py
->>> replacements = [
-...     ("à", "a"),
-...     ("ç", "c"),
-...     ("è", "e"),
-...     ("ë", "e"),
-...     ("í", "i"),
-...     ("ï", "i"),
-...     ("ö", "o"),
-...     ("ü", "u"),
-... ]
-
-
->>> def cleanup_text(inputs):
-...     for src, dst in replacements:
-...         inputs["normalized_text"] = inputs["normalized_text"].replace(src, dst)
-...     return inputs
-
-
->>> dataset = dataset.map(cleanup_text)
-```
-
-Now that you have dealt with special characters in the text, it's time to shift focus to the audio data.
-
-### Speakers
-
-The VoxPopuli dataset includes speech from multiple speakers, but how many speakers are represented in the dataset? To 
-determine this, we can count the number of unique speakers and the number of examples each speaker contributes to the dataset. 
-With a total of 20,968 examples in the dataset, this information will give us a better understanding of the distribution of 
-speakers and examples in the data.
-
-```py
->>> from collections import defaultdict
-
->>> speaker_counts = defaultdict(int)
-
->>> for speaker_id in dataset["speaker_id"]:
-...     speaker_counts[speaker_id] += 1
-```
-
-By plotting a histogram you can get a sense of how much data there is for each speaker.
-
-```py
->>> import matplotlib.pyplot as plt
-
->>> plt.figure()
->>> plt.hist(speaker_counts.values(), bins=20)
->>> plt.ylabel("Speakers")
->>> plt.xlabel("Examples")
->>> plt.show()
-```
-
-<div class="flex justify-center">
-    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/tts_speakers_histogram.png" alt="Speakers histogram"/>
-</div>
-
-The histogram reveals that approximately one-third of the speakers in the dataset have fewer than 100 examples, while 
-around ten speakers have more than 500 examples. To improve training efficiency and balance the dataset, we can limit 
-the data to speakers with between 100 and 400 examples. 
-
-```py
->>> def select_speaker(speaker_id):
-...     return 100 <= speaker_counts[speaker_id] <= 400
-
-
->>> dataset = dataset.filter(select_speaker, input_columns=["speaker_id"])
-```
-
-Let's check how many speakers remain: 
-
-```py
->>> len(set(dataset["speaker_id"]))
-42
-```
-
-Let's see how many examples are left: 
-
-```py
->>> len(dataset)
-9973
-```
-
-You are left with just under 10,000 examples from approximately 40 unique speakers, which should be sufficient.
-
-Note that some speakers with few examples may actually have more audio available if the examples are long. However, 
-determining the total amount of audio for each speaker requires scanning through the entire dataset, which is a 
-time-consuming process that involves loading and decoding each audio file. As such, we have chosen to skip this step here.
-
-### Speaker embeddings
-
-To enable the TTS model to differentiate between multiple speakers, you'll need to create a speaker embedding for each example. 
-The speaker embedding is an additional input into the model that captures a particular speaker's voice characteristics.
-To generate these speaker embeddings, use the pre-trained [spkrec-xvect-voxceleb](https://huggingface.co/speechbrain/spkrec-xvect-voxceleb) 
-model from SpeechBrain. 
-
-Create a function `create_speaker_embedding()` that takes an input audio waveform and outputs a 512-element vector 
-containing the corresponding speaker embedding.
-
-```py
->>> import os
->>> import torch
->>> from speechbrain.pretrained import EncoderClassifier
-
->>> spk_model_name = "speechbrain/spkrec-xvect-voxceleb"
-
->>> device = "cuda" if torch.cuda.is_available() else "cpu"
->>> speaker_model = EncoderClassifier.from_hparams(
-...     source=spk_model_name,
-...     run_opts={"device": device},
-...     savedir=os.path.join("/tmp", spk_model_name),
-... )
-
-
->>> def create_speaker_embedding(waveform):
-...     with torch.no_grad():
-...         speaker_embeddings = speaker_model.encode_batch(torch.tensor(waveform))
-...         speaker_embeddings = torch.nn.functional.normalize(speaker_embeddings, dim=2)
-...         speaker_embeddings = speaker_embeddings.squeeze().cpu().numpy()
-...     return speaker_embeddings
-```
-
-It's important to note that the `speechbrain/spkrec-xvect-voxceleb` model was trained on English speech from the VoxCeleb 
-dataset, whereas the training examples in this guide are in Dutch. While we believe that this model will still generate 
-reasonable speaker embeddings for our Dutch dataset, this assumption may not hold true in all cases.
-
-For optimal results, we recommend training an X-vector model on the target speech first. This will ensure that the model 
-is better able to capture the unique voice characteristics present in the Dutch language.
-
-### Processing the dataset
-
-Finally, let's process the data into the format the model expects. Create a `prepare_dataset` function that takes in a 
-single example and uses the `SpeechT5Processor` object to tokenize the input text and load the target audio into a log-mel spectrogram. 
-It should also add the speaker embeddings as an additional input.
-
-```py
->>> def prepare_dataset(example):
-...     audio = example["audio"]
-
-...     example = processor(
-...         text=example["normalized_text"],
-...         audio_target=audio["array"],
-...         sampling_rate=audio["sampling_rate"],
-...         return_attention_mask=False,
-...     )
-
-...     # strip off the batch dimension
-...     example["labels"] = example["labels"][0]
-
-...     # use SpeechBrain to obtain x-vector
-...     example["speaker_embeddings"] = create_speaker_embedding(audio["array"])
-
-...     return example
-```
-
-Verify the processing is correct by looking at a single example:
-
-```py
->>> processed_example = prepare_dataset(dataset[0])
->>> list(processed_example.keys())
-['input_ids', 'labels', 'stop_labels', 'speaker_embeddings']
-```
-
-Speaker embeddings should be a 512-element vector:
-
-```py
->>> processed_example["speaker_embeddings"].shape
-(512,)
-```
-
-The labels should be a log-mel spectrogram with 80 mel bins.
-
-```py
->>> import matplotlib.pyplot as plt
-
->>> plt.figure()
->>> plt.imshow(processed_example["labels"].T)
->>> plt.show()
-```
-
-<div class="flex justify-center">
-    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/tts_logmelspectrogram_1.png" alt="Log-mel spectrogram with 80 mel bins"/>
-</div>
-
-Side note: If you find this spectrogram confusing, it may be due to your familiarity with the convention of placing low frequencies 
-at the bottom and high frequencies at the top of a plot. However, when plotting spectrograms as an image using the matplotlib library, 
-the y-axis is flipped and the spectrograms appear upside down.
-
-Now apply the processing function to the entire dataset. This will take between 5 and 10 minutes.
-
-```py
->>> dataset = dataset.map(prepare_dataset, remove_columns=dataset.column_names)
-```
-
-You'll see a warning saying that some examples in the dataset are longer than the maximum input length the model can handle (600 tokens). 
-Remove those examples from the dataset. Here we go even further: to allow for larger batch sizes, we remove anything over 200 tokens.
-
-```py
->>> def is_not_too_long(input_ids):
-...     input_length = len(input_ids)
-...     return input_length < 200
-
-
->>> dataset = dataset.filter(is_not_too_long, input_columns=["input_ids"])
->>> len(dataset)
-8259
-```
-
-Next, create a basic train/test split: 
-
-```py
->>> dataset = dataset.train_test_split(test_size=0.1)
-```
-
-### Data collator
-
-In order to combine multiple examples into a batch, you need to define a custom data collator. This collator will pad shorter sequences with padding 
-tokens, ensuring that all examples have the same length. For the spectrogram labels, the padded portions are replaced with the special value `-100`. This special value 
-instructs the model to ignore that part of the spectrogram when calculating the spectrogram loss.
-
-```py
->>> from dataclasses import dataclass
->>> from typing import Any, Dict, List, Union
-
-
->>> @dataclass
-... class TTSDataCollatorWithPadding:
-...     processor: Any
-
-...     def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
-...         input_ids = [{"input_ids": feature["input_ids"]} for feature in features]
-...         label_features = [{"input_values": feature["labels"]} for feature in features]
-...         speaker_features = [feature["speaker_embeddings"] for feature in features]
-
-...         # collate the inputs and targets into a batch
-...         batch = processor.pad(input_ids=input_ids, labels=label_features, return_tensors="pt")
-
-...         # replace padding with -100 to ignore loss correctly
-...         batch["labels"] = batch["labels"].masked_fill(batch.decoder_attention_mask.unsqueeze(-1).ne(1), -100)
-
-...         # not used during fine-tuning
-...         del batch["decoder_attention_mask"]
-
-...         # round down target lengths to multiple of reduction factor
-...         if model.config.reduction_factor > 1:
-...             target_lengths = torch.tensor([len(feature["input_values"]) for feature in label_features])
-...             target_lengths = target_lengths.new(
-...                 [length - length % model.config.reduction_factor for length in target_lengths]
-...             )
-...             max_length = max(target_lengths)
-...             batch["labels"] = batch["labels"][:, :max_length]
-
-...         # also add in the speaker embeddings
-...         batch["speaker_embeddings"] = torch.tensor(speaker_features)
-
-...         return batch
-```
-
-In SpeechT5, the input to the decoder part of the model is reduced by a factor of 2. In other words, it throws away every 
-other timestep from the target sequence. The decoder then predicts a sequence that is twice as long. Since the original 
-target sequence length may be odd, the data collator makes sure to round the maximum length of the batch down to be a 
-multiple of 2.
-
-```py 
->>> data_collator = TTSDataCollatorWithPadding(processor=processor)
-```
-
-## Train the model
-
-Load the pre-trained model from the same checkpoint as you used for loading the processor: 
-
-```py
->>> from transformers import SpeechT5ForTextToSpeech
-
->>> model = SpeechT5ForTextToSpeech.from_pretrained(checkpoint)
-```
-
-The `use_cache=True` option is incompatible with gradient checkpointing. Disable it for training.
-
-```py 
->>> model.config.use_cache = False
-```
-
-Define the training arguments. Here we are not computing any evaluation metrics during the training process. Instead, we'll 
-only look at the loss:
-
-```python
->>> from transformers import Seq2SeqTrainingArguments
-
->>> training_args = Seq2SeqTrainingArguments(
-...     output_dir="speecht5_finetuned_voxpopuli_nl",  # change to a repo name of your choice
-...     per_device_train_batch_size=4,
-...     gradient_accumulation_steps=8,
-...     learning_rate=1e-5,
-...     warmup_steps=500,
-...     max_steps=4000,
-...     gradient_checkpointing=True,
-...     fp16=True,
-...     evaluation_strategy="steps",
-...     per_device_eval_batch_size=2,
-...     save_steps=1000,
-...     eval_steps=1000,
-...     logging_steps=25,
-...     report_to=["tensorboard"],
-...     load_best_model_at_end=True,
-...     greater_is_better=False,
-...     label_names=["labels"],
-...     push_to_hub=True,
-... )
-```
-
-Instantiate the `Seq2SeqTrainer` object and pass the model, dataset, and data collator to it.
-
-```py
->>> from transformers import Seq2SeqTrainer
-
->>> trainer = Seq2SeqTrainer(
-...     args=training_args,
-...     model=model,
-...     train_dataset=dataset["train"],
-...     eval_dataset=dataset["test"],
-...     data_collator=data_collator,
-...     tokenizer=processor,
-... )
-```
-
-And with that, you're ready to start training! Training will take several hours. Depending on your GPU, 
-it is possible that you will encounter a CUDA "out-of-memory" error when you start training. In this case, you can reduce 
-the `per_device_train_batch_size` incrementally by factors of 2 and increase `gradient_accumulation_steps` by 2x to compensate.
-
-```py
->>> trainer.train()
-```
-
-Push the final model to the 🤗 Hub:
-
-```py
->>> trainer.push_to_hub()
-```
-
-## Inference
-
-Great, now that you've fine-tuned a model, you can use it for inference!
-Load the model from the 🤗 Hub (make sure to use your account name in the following code snippet): 
-
-```py
->>> model = SpeechT5ForTextToSpeech.from_pretrained("YOUR_ACCOUNT/speecht5_finetuned_voxpopuli_nl")
-```
-
-Pick an example, here we'll take one from the test dataset. Obtain a speaker embedding. 
-
-```py 
->>> example = dataset["test"][304]
->>> speaker_embeddings = torch.tensor(example["speaker_embeddings"]).unsqueeze(0)
-```
-
-Define some input text and tokenize it.
-
-```py 
->>> text = "hallo allemaal, ik praat nederlands. groetjes aan iedereen!"
-```
-
-Preprocess the input text: 
-
-```py
->>> inputs = processor(text=text, return_tensors="pt")
-```
-
-Create a spectrogram with your model: 
-
-```py
->>> spectrogram = model.generate_speech(inputs["input_ids"], speaker_embeddings)
-```
-
-Visualize the spectrogram, if you'd like to: 
-
-```py
->>> plt.figure()
->>> plt.imshow(spectrogram.T)
->>> plt.show()
-```
-
-<div class="flex justify-center">
-    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/tts_logmelspectrogram_2.png" alt="Generated log-mel spectrogram"/>
-</div>
-
-Finally, use the vocoder to turn the spectrogram into sound.
-
-```py
->>> with torch.no_grad():
-...     speech = vocoder(spectrogram)
-
->>> from IPython.display import Audio
-
->>> Audio(speech.numpy(), rate=16000)
-```
-
-In our experience, obtaining satisfactory results from this model can be challenging. The quality of the speaker 
-embeddings appears to be a significant factor. Since SpeechT5 was pre-trained with English x-vectors, it performs best 
-when using English speaker embeddings. If the synthesized speech sounds poor, try using a different speaker embedding.
-
-Increasing the training duration is also likely to enhance the quality of the results. Even so, the speech clearly is Dutch instead of English, and it does 
-capture the voice characteristics of the speaker (compare to the original audio in the example).
-Another thing to experiment with is the model's configuration. For example, try using `config.reduction_factor = 1` to 
-see if this improves the results.
-
-Finally, it is essential to keep ethical considerations in mind. Although TTS technology has numerous useful applications, it 
-may also be used for malicious purposes, such as impersonating someone's voice without their knowledge or consent. Please 
-use TTS judiciously and responsibly.
--- a/docs/source/en/tasks/token_classification.mdx
+++ b/docs/source/en/tasks/token_classification.mdx
@ -28,7 +28,7 @@ The task illustrated in this tutorial is supported by the following model archit

 <!--This tip is automatically generated by `make fix-copies`, do not fill manually!-->

-[ALBERT](../model_doc/albert), [BERT](../model_doc/bert), [BigBird](../model_doc/big_bird), [BioGpt](../model_doc/biogpt), [BLOOM](../model_doc/bloom), [CamemBERT](../model_doc/camembert), [CANINE](../model_doc/canine), [ConvBERT](../model_doc/convbert), [Data2VecText](../model_doc/data2vec-text), [DeBERTa](../model_doc/deberta), [DeBERTa-v2](../model_doc/deberta-v2), [DistilBERT](../model_doc/distilbert), [ELECTRA](../model_doc/electra), [ERNIE](../model_doc/ernie), [ErnieM](../model_doc/ernie_m), [ESM](../model_doc/esm), [FlauBERT](../model_doc/flaubert), [FNet](../model_doc/fnet), [Funnel Transformer](../model_doc/funnel), [GPT-Sw3](../model_doc/gpt-sw3), [OpenAI GPT-2](../model_doc/gpt2), [GPTBigCode](../model_doc/gpt_bigcode), [GPT Neo](../model_doc/gpt_neo), [GPT NeoX](../model_doc/gpt_neox), [I-BERT](../model_doc/ibert), [LayoutLM](../model_doc/layoutlm), [LayoutLMv2](../model_doc/layoutlmv2), [LayoutLMv3](../model_doc/layoutlmv3), [LiLT](../model_doc/lilt), [Longformer](../model_doc/longformer), [LUKE](../model_doc/luke), [MarkupLM](../model_doc/markuplm), [MEGA](../model_doc/mega), [Megatron-BERT](../model_doc/megatron-bert), [MobileBERT](../model_doc/mobilebert), [MPNet](../model_doc/mpnet), [Nezha](../model_doc/nezha), [Nyströmformer](../model_doc/nystromformer), [QDQBert](../model_doc/qdqbert), [RemBERT](../model_doc/rembert), [RoBERTa](../model_doc/roberta), [RoBERTa-PreLayerNorm](../model_doc/roberta-prelayernorm), [RoCBert](../model_doc/roc_bert), [RoFormer](../model_doc/roformer), [SqueezeBERT](../model_doc/squeezebert), [XLM](../model_doc/xlm), [XLM-RoBERTa](../model_doc/xlm-roberta), [XLM-RoBERTa-XL](../model_doc/xlm-roberta-xl), [XLNet](../model_doc/xlnet), [X-MOD](../model_doc/xmod), [YOSO](../model_doc/yoso)
+[ALBERT](../model_doc/albert), [BERT](../model_doc/bert), [BigBird](../model_doc/big_bird), [BLOOM](../model_doc/bloom), [CamemBERT](../model_doc/camembert), [CANINE](../model_doc/canine), [ConvBERT](../model_doc/convbert), [Data2VecText](../model_doc/data2vec-text), [DeBERTa](../model_doc/deberta), [DeBERTa-v2](../model_doc/deberta-v2), [DistilBERT](../model_doc/distilbert), [ELECTRA](../model_doc/electra), [ERNIE](../model_doc/ernie), [ErnieM](../model_doc/ernie_m), [ESM](../model_doc/esm), [FlauBERT](../model_doc/flaubert), [FNet](../model_doc/fnet), [Funnel Transformer](../model_doc/funnel), [GPT-Sw3](../model_doc/gpt-sw3), [OpenAI GPT-2](../model_doc/gpt2), [I-BERT](../model_doc/ibert), [LayoutLM](../model_doc/layoutlm), [LayoutLMv2](../model_doc/layoutlmv2), [LayoutLMv3](../model_doc/layoutlmv3), [LiLT](../model_doc/lilt), [Longformer](../model_doc/longformer), [LUKE](../model_doc/luke), [MarkupLM](../model_doc/markuplm), [MEGA](../model_doc/mega), [Megatron-BERT](../model_doc/megatron-bert), [MobileBERT](../model_doc/mobilebert), [MPNet](../model_doc/mpnet), [Nezha](../model_doc/nezha), [Nyströmformer](../model_doc/nystromformer), [QDQBert](../model_doc/qdqbert), [RemBERT](../model_doc/rembert), [RoBERTa](../model_doc/roberta), [RoBERTa-PreLayerNorm](../model_doc/roberta-prelayernorm), [RoCBert](../model_doc/roc_bert), [RoFormer](../model_doc/roformer), [SqueezeBERT](../model_doc/squeezebert), [XLM](../model_doc/xlm), [XLM-RoBERTa](../model_doc/xlm-roberta), [XLM-RoBERTa-XL](../model_doc/xlm-roberta-xl), [XLNet](../model_doc/xlnet), [X-MOD](../model_doc/xmod), [YOSO](../model_doc/yoso)

 <!--End of the generated tip-->

@ -121,7 +121,7 @@ As you saw in the example `tokens` field above, it looks like the input has alre

 However, this adds some special tokens `[CLS]` and `[SEP]` and the subword tokenization creates a mismatch between the input and labels. A single word corresponding to a single label may now be split into two subwords. You'll need to realign the tokens and labels by:

-1. Mapping all tokens to their corresponding word with the [`word_ids`](https://huggingface.co/docs/transformers/main_classes/tokenizer#transformers.BatchEncoding.word_ids) method.
+1. Mapping all tokens to their corresponding word with the [`word_ids`](https://huggingface.co/docs/tokenizers/python/latest/api/reference.html#tokenizers.Encoding.word_ids) method.
 2. Assigning the label `-100` to the special tokens `[CLS]` and `[SEP]` so they're ignored by the PyTorch loss function.
 3. Only labeling the first token of a given word. Assign `-100` to other subtokens from the same word.

--- a/docs/source/en/tasks/translation.mdx
+++ b/docs/source/en/tasks/translation.mdx
@ -28,7 +28,7 @@ The task illustrated in this tutorial is supported by the following model archit

 <!--This tip is automatically generated by `make fix-copies`, do not fill manually!-->

-[BART](../model_doc/bart), [BigBird-Pegasus](../model_doc/bigbird_pegasus), [Blenderbot](../model_doc/blenderbot), [BlenderbotSmall](../model_doc/blenderbot-small), [Encoder decoder](../model_doc/encoder-decoder), [FairSeq Machine-Translation](../model_doc/fsmt), [GPTSAN-japanese](../model_doc/gptsan-japanese), [LED](../model_doc/led), [LongT5](../model_doc/longt5), [M2M100](../model_doc/m2m_100), [Marian](../model_doc/marian), [mBART](../model_doc/mbart), [MT5](../model_doc/mt5), [MVP](../model_doc/mvp), [NLLB](../model_doc/nllb), [NLLB-MOE](../model_doc/nllb-moe), [Pegasus](../model_doc/pegasus), [PEGASUS-X](../model_doc/pegasus_x), [PLBart](../model_doc/plbart), [ProphetNet](../model_doc/prophetnet), [SwitchTransformers](../model_doc/switch_transformers), [T5](../model_doc/t5), [XLM-ProphetNet](../model_doc/xlm-prophetnet)
+[BART](../model_doc/bart), [BigBird-Pegasus](../model_doc/bigbird_pegasus), [Blenderbot](../model_doc/blenderbot), [BlenderbotSmall](../model_doc/blenderbot-small), [Encoder decoder](../model_doc/encoder-decoder), [FairSeq Machine-Translation](../model_doc/fsmt), [GPTSAN-japanese](../model_doc/gptsan-japanese), [LED](../model_doc/led), [LongT5](../model_doc/longt5), [M2M100](../model_doc/m2m_100), [Marian](../model_doc/marian), [mBART](../model_doc/mbart), [MT5](../model_doc/mt5), [MVP](../model_doc/mvp), [NLLB](../model_doc/nllb), [Pegasus](../model_doc/pegasus), [PEGASUS-X](../model_doc/pegasus_x), [PLBart](../model_doc/plbart), [ProphetNet](../model_doc/prophetnet), [SwitchTransformers](../model_doc/switch_transformers), [T5](../model_doc/t5), [XLM-ProphetNet](../model_doc/xlm-prophetnet)

 <!--End of the generated tip-->

--- a/docs/source/en/testing.mdx
+++ b/docs/source/en/testing.mdx
@ -212,12 +212,20 @@ Example:
    ```"""

 ```
+3 steps are required to debug the docstring examples: 
+1. In order to properly run the test, **an extra line has to be added** at the end of the docstring. This can be automatically done on any file using: 
+```bash 
+python utils/prepare_for_doc_test.py <path_to_file_or_dir>
+```

-Just run the following line to automatically test every docstring example in the desired file: 
+2. Then, you can use the following line to automatically test every docstring example in the desired file: 
 ```bash 
 pytest --doctest-modules <path_to_file_or_dir>
 ```
-If the file has a markdown extension, you should add the `--doctest-glob="*.mdx"` argument.
+3. Once you are done debugging, you need to remove the extra line added in step **1.** by running the following: 
+```bash 
+python utils/prepare_for_doc_test.py <path_to_file_or_dir> --remove_new_line
+```

 ### Run only modified tests

--- a/docs/source/en/tools_inference_endpoints.mdx
+++ b/docs/source/en/tools_inference_endpoints.mdx
@ -1,132 +0,0 @@
-<!--Copyright 2023 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-->
-
-# Running tools on inference endpoints
-
-<Tip>
-
-This document is about running tools on inference endpoints so that agents may use these tools remotely. 
-If you do not know what tools and agents are in the context of transformers, we recommend you read the
-[Transformers Agents](transformers_agents) page first.
-
-</Tip>
-
-Agents are designed to use tools in order to respond to a natural language query. They are set up so as to load tools
-locally and use them directly in the runtime they're at.
-
-However, some of these tools can be heavy; tools that handle images, long text, or audio signals may need a 
-significant amount of memory in order to perform inference. Tools that generate images through a diffusion
-process, may require significant compute in order to perform the multiple steps they need; but end users
-may not benefit from the powerful setups required to use them.
-
-This is why we have support for **remote** tools: these have an API that can be called from the runtime, offloading
-the processing to the remote API. In this guide we'll explore how to set up an inference endpoint for a given tool
-to leverage it with the agents.
-
-Inference endpoints are one solution to handle remote tools; but they're not the only one. We integrate with
-[`gradio_tools`](custom_tools#leveraging-gradiotools) that also offers remote tools, and we'll continue adding 
-guides to other alternatives for remote tools.
-
-## Inference Endpoints
-
-
-[Inference Endpoints](https://huggingface.co/inference-endpoints) is a paid Hugging Face solution to easily deploy
-Transformers and Diffusers models on a fully-managed infrastructure. It has default deployment options for
-transformers and diffusers, but given that we're using a specific type of object here, tools, we'll set up a custom
-handler to get it to work.
-
-<Tip warning={true}>
-
-Inference Endpoints are a paid hosting service by Hugging Face, which needs to have an organization set up with 
-billing enabled.
-
-</Tip>
-
-Tools are Spaces by default in Transformers. When calling `push_to_hub` on a tool, you're effectively pushing
-the code to a Space on the Hugging Face Hub under a namespace that you own. There are many tools living on the
-[`huggingface-tools` namespace](https://huggingface.co/huggingface-tools); having them be Spaces by default means
-that users can play around with the tool directly in the browser.
-
-However, Inference Endpoints only work with **model** repositories. We'll therefore have to create a model
-repository to act as a proxy for the Space. That model repository will contain the `handler.py` file to serve
-our tool through an inference endpoint.
-
-For demonstration purposes, we'll consider that you already have a tool handy that you'd like to use remotely. If
-you'd like to set up your custom tool, we recommend reading the [Custom Tool](custom_tools#leveraging-gradiotools) 
-guide.
-
-We'll try and deploy the `huggingface-tools/text-to-video` tool to an inference endpoint. We have it available as 
-a gradio Space [here](https://huggingface.co/huggingface-tools/text-to-video).
-
-### Setting up the repository
-
-We'll start by creating a model repository that will serve as a serving point for this tool.
-It can be public or private; for the sake of this tutorial we'll keep this one public, but having it set to
-private doesn't interfere with the inference endpoint setup.
-
-The repository is created and is available [here](https://huggingface.co/huggingface-tools/text-to-video).
-In it, you'll see there is a custom handler file, called 
-[`handler.py`](https://huggingface.co/huggingface-tools/text-to-video/blob/main/handler.py), as well as a traditional
-requirements file called 
-[`requirements.txt`](https://huggingface.co/huggingface-tools/text-to-video/blob/main/requirements.txt).
-
-#### Handler file
-
-The handler file exposes an `EndpointHandler`, which serves as the link between the requests you'll be doing to the
-remote tool and the tool itself. It should:
-
- Instantiate the tool in its initialization method
- Have a `__call__` method which will take the serialized input and return the computed result.
-
-For text-to-text tools, the handler file is very simple; it looks like the following:
-
-```python
-from transformers.tools import load_tool
-
-
-class EndpointHandler:
-    def __init__(self, path=""):
-        self.tool = load_tool("huggingface-tools/text-to-video")
-        self.tool.setup()
-
-    def __call__(self, data):
-        inputs = data.pop("inputs", data)
-        return self.tool(**inputs)
-```
-
-However, it is different if handling different data types as it will need to serialize this data type. 
-This guide will be completed to include different serialization for text, image, audio and video.
-
-#### Requirement file
-
-The requirement file needs to specify all requirements necessary to run the tool. The basic dependencies are the 
-following:
-
-```text
-transformers>=4.29.0
-accelerate
-```
-
-However, you may need to include any other dependencies required by your tool.
-
-### Spinning up an endpoint
-
-Once we're done creating the repository, we can go ahead and create our first endpoint. Head over to
-[the Inference Endpoints UI](https://ui.endpoints.huggingface.co/endpoints) and create your first endpoint.
-
-If the repository is set up correctly, it should spin up directly without issue.
-
-In case you encounter a "Failed" deployment, we recommend checking out 
-[this guide](https://huggingface.co/docs/inference-endpoints/guides/logs) on checking out the logs of an inference
-endpoint.
-
-TODO add images
--- a/docs/source/en/torchscript.mdx
+++ b/docs/source/en/torchscript.mdx
@ -201,7 +201,7 @@ AMI](https://docs.aws.amazon.com/dlami/latest/devguide/tutorial-inferentia-launc
 ### Converting a model for AWS Neuron

 Convert a model for AWS NEURON using the same code from [Using TorchScript in
-Python](torchscript#using-torchscript-in-python) to trace a `BertModel`. Import the
+Python](serialization#using-torchscript-in-python) to trace a `BertModel`. Import the
 `torch.neuron` framework extension to access the components of the Neuron SDK through a
 Python API:

--- a/docs/source/en/training.mdx
+++ b/docs/source/en/training.mdx
@ -247,7 +247,7 @@ reduces the number of padding tokens compared to padding the entire dataset.


 ```py
->>> tf_dataset = model.prepare_tf_dataset(dataset["train"], batch_size=16, shuffle=True, tokenizer=tokenizer)
+>>> tf_dataset = model.prepare_tf_dataset(dataset, batch_size=16, shuffle=True, tokenizer=tokenizer)
 ```

 Note that in the code sample above, you need to pass the tokenizer to `prepare_tf_dataset` so it can correctly pad batches as they're loaded.
--- a/docs/source/en/transformers_agents.mdx
+++ b/docs/source/en/transformers_agents.mdx
@ -1,331 +0,0 @@
-<!--Copyright 2023 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-->
-
-# Transformers Agent
-
-<Tip warning={true}>
-
-Transformers Agent is an experimental API which is subject to change at any time. Results returned by the agents
-can vary as the APIs or underlying models are prone to change.
-
-</Tip>
-
-Transformers version v4.29.0 introduces this API, building on the concept of *tools* and *agents*. You can play with it in
-[this colab](https://colab.research.google.com/drive/1c7MHD-T1forUPGcC_jlwsIptOzpG3hSj).
-
-In short, it provides a natural language API on top of transformers: we define a set of curated tools and design an 
-agent to interpret natural language and to use these tools. It is extensible by design; we curated some relevant tools, 
-but we'll show you how the system can be extended easily to use any tool developed by the community.
-
-Let's start with a few examples of what can be achieved with this new API. It is particularly powerful when it comes 
-to multimodal tasks, so let's take it for a spin to generate images and read text out loud.
-
-```py
-agent.run("Caption the following image", image=image)
-```
-
-| **Input**                                                                                                                   | **Output**                        |
-|-----------------------------------------------------------------------------------------------------------------------------|-----------------------------------|
-| <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/beaver.png" width=200> | A beaver is swimming in the water |
-
---
-
-```py
-agent.run("Read the following text out loud", text=text)
-```
-| **Input**                                                                                                               | **Output**                                   |
-|-------------------------------------------------------------------------------------------------------------------------|----------------------------------------------|
-| A beaver is swimming in the water | <audio controls><source src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tts_example.wav" type="audio/wav"> your browser does not support the audio element. </audio>
-
---
-
-```py
-agent.run(
-    "In the following `document`, where will the TRRF Scientific Advisory Council Meeting take place?",
-    document=document,
-)
-```
-| **Input**                                                                                                                   | **Output**     |
-|-----------------------------------------------------------------------------------------------------------------------------|----------------|
-| <img src="https://datasets-server.huggingface.co/assets/hf-internal-testing/example-documents/--/hf-internal-testing--example-documents/test/0/image/image.jpg" width=200> | ballroom foyer |
-
-## Quickstart
-
-Before being able to use `agent.run`, you will need to instantiate an agent, which is a large language model (LLM). 
-We provide support for OpenAI models as well as open-source alternatives from BigCode and OpenAssistant. The OpenAI
-models perform better (but require you to have an OpenAI API key, so cannot be used for free); Hugging Face is
-providing free access to endpoints for BigCode and OpenAssistant models.
-
-To start with, please install the `agents` extras in order to install all default dependencies.
-```bash
-pip install transformers[agents]
-```
-
-To use openAI models, you instantiate an [`OpenAiAgent`] after installing the `openai` dependency:
-
-```bash
-pip install openai
-```
-
-
-```py
-from transformers import OpenAiAgent
-
-agent = OpenAiAgent(model="text-davinci-003", api_key="<your_api_key>")
-```
-
-To use BigCode or OpenAssistant, start by logging in to have access to the Inference API:
-
-```py
-from huggingface_hub import login
-
-login("<YOUR_TOKEN>")
-```
-
-Then, instantiate the agent
-
-```py
-from transformers import HfAgent
-
-# Starcoder
-agent = HfAgent("https://api-inference.huggingface.co/models/bigcode/starcoder")
-# StarcoderBase
-# agent = HfAgent("https://api-inference.huggingface.co/models/bigcode/starcoderbase")
-# OpenAssistant
-# agent = HfAgent(url_endpoint="https://api-inference.huggingface.co/models/OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5")
-```
-
-This is using the inference API that Hugging Face provides for free at the moment. If you have your own inference
-endpoint for this model (or another one) you can replace the URL above with your URL endpoint.
-
-<Tip>
-
-StarCoder and OpenAssistant are free to use and perform admirably well on simple tasks. However, the checkpoints
-don't hold up when handling more complex prompts. If you're facing such an issue, we recommend trying out the OpenAI
-model which, while sadly not open-source, performs better at this given time.
-
-</Tip>
-
-You're now good to go! Let's dive into the two APIs that you now have at your disposal.
-
-### Single execution (run)
-
-The single execution method uses the [`~Agent.run`] method of the agent:
-
-```py
-agent.run("Draw me a picture of rivers and lakes.")
-```
-
-<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rivers_and_lakes.png" width=200>
-
-It automatically selects the tool (or tools) appropriate for the task you want to perform and runs them appropriately. It
-can perform one or several tasks in the same instruction (though the more complex your instruction, the more likely
-the agent is to fail).
-
-```py
-agent.run("Draw me a picture of the sea then transform the picture to add an island")
-```
-
-<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/sea_and_island.png" width=200>
-
-<br/>
-
-
-Every [`~Agent.run`] operation is independent, so you can run it several times in a row with different tasks.
-
-Note that your `agent` is just a large-language model, so small variations in your prompt might yield completely
-different results. It's important to explain as clearly as possible the task you want to perform. We go more in-depth
-on how to write good prompts [here](custom_tools#writing-good-user-inputs).
-
-If you'd like to keep a state across executions or to pass non-text objects to the agent, you can do so by specifying
-variables that you would like the agent to use. For example, you could generate the first image of rivers and lakes, 
-and ask the model to update that picture to add an island by doing the following:
-
-```python
-picture = agent.run("Generate a picture of rivers and lakes.")
-updated_picture = agent.run("Transform the image in `picture` to add an island to it.", picture=picture)
-```
-
-<Tip>
-
-This can be helpful when the model is unable to understand your request and mixes tools. An example would be:
-
-```py
-agent.run("Draw me the picture of a capybara swimming in the sea")
-```
-
-Here, the model could interpret this in two ways:
-- Have the `text-to-image` generate a capybara swimming in the sea
-- Or, have the `text-to-image` generate capybara, then use the `image-transformation` tool to have it swim in the sea
-
-In case you would like to force the first scenario, you could do so by passing it the prompt as an argument:
-
-```py
-agent.run("Draw me a picture of the `prompt`", prompt="a capybara swimming in the sea")
-```
-
-</Tip>
-
-
-### Chat-based execution (chat)
-
-The agent also has a chat-based approach, using the [`~Agent.chat`] method:
-
-```py
-agent.chat("Generate a picture of rivers and lakes")
-```
-
-<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rivers_and_lakes.png" width=200> 
-
-```py
-agent.chat("Transform the picture so that there is a rock in there")
-```
-
-<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rivers_and_lakes_and_beaver.png" width=200>
-
-<br/>
-
-This is an interesting approach when you want to keep the state across instructions. It's better for experimentation, 
-but will tend to be much better at single instructions rather than complex instructions (which the [`~Agent.run`]
-method is better at handling).
-
-This method can also take arguments if you would like to pass non-text types or specific prompts.
-
-### ⚠️ Remote execution
-
-For demonstration purposes and so that this can be used with all setups, we have created remote executors for several 
-of the default tools the agent has access to. These are created using 
-[inference endpoints](https://huggingface.co/inference-endpoints). To see how to set up remote executor tools yourself,
-we recommend reading the [custom tool guide](./custom_tools).
-
-In order to run with remote tools, specifying `remote=True` to either [`~Agent.run`] or [`~Agent.chat`] is sufficient.
-
-For example, the following command could be run on any device efficiently, without needing significant RAM or GPU:
-
-```py
-agent.run("Draw me a picture of rivers and lakes", remote=True)
-```
-
-The same can be said for [`~Agent.chat`]:
-
-```py
-agent.chat("Draw me a picture of rivers and lakes", remote=True)
-```
-
-### What's happening here? What are tools, and what are agents?
-
-<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/diagram.png">
-
-#### Agents
-
-The "agent" here is a large language model, and we're prompting it so that it has access to a specific set of tools.
-
-LLMs are pretty good at generating small samples of code, so this API takes advantage of that by prompting the 
-LLM to give a small sample of code performing a task with a set of tools. This prompt is then completed by the 
-task you give your agent and the description of the tools you give it. This way it gets access to the doc of the 
-tools you are using, especially their expected inputs and outputs, and can generate the relevant code.
-
-#### Tools
-
-Tools are very simple: they're a single function, with a name, and a description. We then use these tools' descriptions 
-to prompt the agent. Through the prompt, we show the agent how it would leverage tools to perform what was 
-requested in the query.
-
-This is using brand-new tools and not pipelines, because the agent writes better code with very atomic tools. 
-Pipelines are more refactored and often combine several tasks in one. Tools are meant to be focused on
-one very simple task only.
-
-#### Code-execution?!
-
-This code is then executed with our small Python interpreter on the set of inputs passed along with your tools. 
-We hear you screaming "Arbitrary code execution!" in the back, but let us explain why that is not the case.
-
-The only functions that can be called are the tools you provided and the print function, so you're already 
-limited in what can be executed. You should be safe if it's limited to Hugging Face tools. 
-
-Then, we don't allow any attribute lookup or imports (which shouldn't be needed anyway for passing along 
-inputs/outputs to a small set of functions) so all the most obvious attacks (and you'd need to prompt the LLM 
-to output them anyway) shouldn't be an issue. If you want to be on the super safe side, you can execute the 
-run() method with the additional argument return_code=True, in which case the agent will just return the code 
-to execute and you can decide whether to do it or not.
-
-The execution will stop at any line trying to perform an illegal operation or if there is a regular Python error 
-with the code generated by the agent.
-
-### A curated set of tools
-
-We identify a set of tools that can empower such agents. Here is an updated list of the tools we have integrated 
-in `transformers`:
-
-- **Document question answering**: given a document (such as a PDF) in image format, answer a question on this document ([Donut](./model_doc/donut))
-- **Text question answering**: given a long text and a question, answer the question in the text ([Flan-T5](./model_doc/flan-t5))
-- **Unconditional image captioning**: Caption the image! ([BLIP](./model_doc/blip))
-- **Image question answering**: given an image, answer a question on this image ([VILT](./model_doc/vilt))
-- **Image segmentation**: given an image and a prompt, output the segmentation mask of that prompt ([CLIPSeg](./model_doc/clipseg))
-- **Speech to text**: given an audio recording of a person talking, transcribe the speech into text ([Whisper](./model_doc/whisper))
-- **Text to speech**: convert text to speech ([SpeechT5](./model_doc/speecht5))
-- **Zero-shot text classification**: given a text and a list of labels, identify to which label the text corresponds the most ([BART](./model_doc/bart))
-- **Text summarization**: summarize a long text in one or a few sentences ([BART](./model_doc/bart))
-- **Translation**: translate the text into a given language ([NLLB](./model_doc/nllb))
-
-These tools have an integration in transformers, and can be used manually as well, for example:
-
-```py
-from transformers import load_tool
-
-tool = load_tool("text-to-speech")
-audio = tool("This is a text to speech tool")
-```
-
-### Custom tools
-
-While we identify a curated set of tools, we strongly believe that the main value provided by this implementation is 
-the ability to quickly create and share custom tools.
-
-By pushing the code of a tool to a Hugging Face Space or a model repository, you're then able to leverage the tool 
-directly with the agent. We've added a few 
-**transformers-agnostic** tools to the [`huggingface-tools` organization](https://huggingface.co/huggingface-tools):
-
-- **Text downloader**: to download a text from a web URL
-- **Text to image**: generate an image according to a prompt, leveraging stable diffusion
-- **Image transformation**: modify an image given an initial image and a prompt, leveraging instruct pix2pix stable diffusion
-- **Text to video**: generate a small video according to a prompt, leveraging damo-vilab
-
-The text-to-image tool we have been using since the beginning is a remote tool that lives in 
-[*huggingface-tools/text-to-image*](https://huggingface.co/spaces/huggingface-tools/text-to-image)! We will
-continue releasing such tools on this and other organizations, to further supercharge this implementation.
-
-The agents have by default access to tools that reside on [`huggingface-tools`](https://huggingface.co/huggingface-tools).
-We explain how you can write and share your tools as well as leverage any custom tool that resides on the Hub in the [following guide](custom_tools).
-
-### Code generation
-
-So far we have shown how to use the agents to perform actions for you. However, the agent is only generating code
-that we then execute using a very restricted Python interpreter. In case you would like to use the code generated in 
-a different setting, the agent can be prompted to return the code, along with tool definition and accurate imports.
-
-For example, the following instruction
-```python
-agent.run("Draw me a picture of rivers and lakes", return_code=True)
-```
-
-returns the following code
-
-```python
-from transformers import load_tool
-
-image_generator = load_tool("huggingface-tools/text-to-image")
-
-image = image_generator(prompt="rivers and lakes")
-```
-
-that you can then modify and execute yourself.
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Matt	47eacad805	Disable head masking	2023-03-27 14:54:40 +01:00
Matt	fd4c0f9c80	Fix inits, add autoclasses	2023-03-24 18:18:56 +00:00
Matt	8499b98381	Update tests, add dummies, make fixup	2023-03-24 18:13:52 +00:00
Matt	4bd4c580c2	Add gptneo to docs	2023-03-24 17:47:18 +00:00
Matt	869e0b5c66	Add module imports	2023-03-24 17:44:03 +00:00
Matt	86a5bf5989	Avoid 'register_buffer', this is a PT method	2023-03-24 17:36:11 +00:00
Matt	60c1b84720	Quick fix to the TF prefixes	2023-03-24 17:35:18 +00:00
Matt	c8fa95b1b2	First commit of GPT-4 auto-port	2023-03-24 17:27:17 +00:00