remove layer_idx

tgi update
be permissive
2025-10-21 09:44:02 +08:00 · 2024-12-13 14:07:01 +00:00 · 2024-12-12 18:29:26 +00:00 · 2024-12-12 11:33:37 +01:00 · 2024-12-12 10:43:13 +01:00 · 2024-12-12 10:36:54 +01:00
5483 changed files with 689307 additions and 652433 deletions
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@ -7,25 +7,12 @@ parameters:
    nightly:
        type: boolean
        default: false
-    GHA_Actor:
-        type: string
-        default: ""
-    GHA_Action:
-        type: string
-        default: ""
-    GHA_Event:
-        type: string
-        default: ""
-    GHA_Meta:
-        type: string
-        default: ""

 jobs:
    # Ensure running with CircleCI/huggingface
    check_circleci_user:
        docker:
            - image: python:3.10-slim
-        resource_class: small
        parallelism: 1
        steps:
            - run: echo $CIRCLE_PROJECT_USERNAME
@ -70,15 +57,15 @@ jobs:
            - run:
                name: "Prepare pipeline parameters"
                command: |
-                    python utils/process_test_artifacts.py
-
+                    python utils/process_test_artifacts.py 
+            
            # To avoid too long generated_config.yaml on the continuation orb, we pass the links to the artifacts as parameters.
            # Otherwise the list of tests was just too big. Explicit is good but for that it was a limitation.
            # We used:

            # https://circleci.com/docs/api/v2/index.html#operation/getJobArtifacts : to get the job artifacts
            # We could not pass a nested dict, which is why we create the test_file_... parameters for every single job
-
+                
            - store_artifacts:
                path: test_preparation/transformed_artifacts.json
            - store_artifacts:
@ -112,6 +99,8 @@ jobs:

            - run:
                name: "Retrieve Artifact Paths"
+                env:
+                    CIRCLE_TOKEN: ${{ secrets.CI_ARTIFACT_TOKEN }}
                command: |
                    project_slug="gh/${CIRCLE_PROJECT_USERNAME}/${CIRCLE_PROJECT_REPONAME}"
                    job_number=${CIRCLE_BUILD_NUM}
@ -120,7 +109,7 @@ jobs:
            - run:
                name: "Prepare pipeline parameters"
                command: |
-                    python utils/process_test_artifacts.py
+                    python utils/process_test_artifacts.py 

            # To avoid too long generated_config.yaml on the continuation orb, we pass the links to the artifacts as parameters.
            # Otherwise the list of tests was just too big. Explicit is good but for that it was a limitation.
@ -156,7 +145,7 @@ jobs:
                  path: ~/transformers/installed.txt
            - run: python -c "from transformers import *" || (echo '🚨 import failed, this means you introduced unprotected imports! 🚨'; exit 1)
            - run: ruff check examples tests src utils
-            - run: ruff format examples tests src utils --check
+            - run: ruff format tests src utils --check
            - run: python utils/custom_init_isort.py --check_only
            - run: python utils/sort_auto_mappings.py --check_only
            - run: python utils/check_doc_toc.py
@ -181,16 +170,17 @@ jobs:
                  path: ~/transformers/installed.txt
            - run: python utils/check_copies.py
            - run: python utils/check_modular_conversion.py
+            - run: python utils/check_table.py
            - run: python utils/check_dummies.py
            - run: python utils/check_repo.py
            - run: python utils/check_inits.py
-            - run: python utils/check_pipeline_typing.py
            - run: python utils/check_config_docstrings.py
            - run: python utils/check_config_attributes.py
            - run: python utils/check_doctest_list.py
            - run: make deps_table_check_updated
            - run: python utils/update_metadata.py --check-only
            - run: python utils/check_docstrings.py
+            - run: python utils/check_support_list.py

 workflows:
    version: 2
--- a/.circleci/create_circleci_config.py
+++ b/.circleci/create_circleci_config.py
@ -16,9 +16,10 @@
 import argparse
 import copy
 import os
+import random
 from dataclasses import dataclass
-from typing import Any, Optional
-
+from typing import Any, Dict, List, Optional
+import glob
 import yaml


@ -27,70 +28,36 @@ COMMON_ENV_VARIABLES = {
    "TRANSFORMERS_IS_CI": True,
    "PYTEST_TIMEOUT": 120,
    "RUN_PIPELINE_TESTS": False,
-    # will be adjust in `CircleCIJob.to_dict`.
-    "RUN_FLAKY": True,
-    "DISABLE_SAFETENSORS_CONVERSION": True,
+    "RUN_PT_TF_CROSS_TESTS": False,
+    "RUN_PT_FLAX_CROSS_TESTS": False,
 }
 # Disable the use of {"s": None} as the output is way too long, causing the navigation on CircleCI impractical
-COMMON_PYTEST_OPTIONS = {"max-worker-restart": 0, "vvv": None, "rsfE":None}
+COMMON_PYTEST_OPTIONS = {"max-worker-restart": 0, "dist": "loadfile", "vvv": None, "rsf":None}
 DEFAULT_DOCKER_IMAGE = [{"image": "cimg/python:3.8.12"}]

-# Strings that commonly appear in the output of flaky tests when they fail. These are used with `pytest-rerunfailures`
-# to rerun the tests that match these patterns.
-FLAKY_TEST_FAILURE_PATTERNS = [
-    "OSError",  # Machine/connection transient error
-    "Timeout",  # Machine/connection transient error
-    "ConnectionError",  # Connection transient error
-    "FileNotFoundError",  # Raised by `datasets` on Hub failures
-    "PIL.UnidentifiedImageError",  # Raised by `PIL.Image.open` on connection issues
-    "HTTPError",  # Also catches HfHubHTTPError
-    "AssertionError: Tensor-likes are not close!",  # `torch.testing.assert_close`, we might have unlucky random values
-    # TODO: error downloading tokenizer's `merged.txt` from hub can cause all the exceptions below. Throw and handle
-    # them under a single message.
-    "TypeError: expected str, bytes or os.PathLike object, not NoneType",
-    "TypeError: stat: path should be string, bytes, os.PathLike or integer, not NoneType",
-    "Converting from Tiktoken failed",
-    "KeyError: <class ",
-    "TypeError: not a string",
-]
-

 class EmptyJob:
    job_name = "empty"

    def to_dict(self):
-        steps = [{"run": 'ls -la'}]
-        if self.job_name == "collection_job":
-            steps.extend(
-                [
-                    "checkout",
-                    {"run": "pip install requests || true"},
-                    {"run": """while [[ $(curl --location --request GET "https://circleci.com/api/v2/workflow/$CIRCLE_WORKFLOW_ID/job" --header "Circle-Token: $CCI_TOKEN"| jq -r '.items[]|select(.name != "collection_job")|.status' | grep -c "running") -gt 0 ]]; do sleep 5; done || true"""},
-                    {"run": 'python utils/process_circleci_workflow_test_reports.py --workflow_id $CIRCLE_WORKFLOW_ID || true'},
-                    {"store_artifacts": {"path": "outputs"}},
-                    {"run": 'echo "All required jobs have now completed"'},
-                ]
-            )
-
        return {
            "docker": copy.deepcopy(DEFAULT_DOCKER_IMAGE),
-            "resource_class": "small",
-            "steps": steps,
+            "steps":["checkout"],
        }


@dataclass
 class CircleCIJob:
    name: str
-    additional_env: dict[str, Any] = None
-    docker_image: list[dict[str, str]] = None
-    install_steps: list[str] = None
+    additional_env: Dict[str, Any] = None
+    docker_image: List[Dict[str, str]] = None
+    install_steps: List[str] = None
    marker: Optional[str] = None
    parallelism: Optional[int] = 0
-    pytest_num_workers: int = 8
-    pytest_options: dict[str, Any] = None
-    resource_class: Optional[str] = "xlarge"
-    tests_to_run: Optional[list[str]] = None
+    pytest_num_workers: int = 12
+    pytest_options: Dict[str, Any] = None
+    resource_class: Optional[str] = "2xlarge"
+    tests_to_run: Optional[List[str]] = None
    num_test_files_per_worker: Optional[int] = 10
    # This should be only used for doctest job!
    command_timeout: Optional[int] = None
@ -109,9 +76,7 @@ class CircleCIJob:
                self.docker_image[0]["image"] = f"{self.docker_image[0]['image']}:dev"
            print(f"Using {self.docker_image} docker image")
        if self.install_steps is None:
-            self.install_steps = ["uv pip install ."]
-        # Use a custom patched pytest to force exit the process at the end, to avoid `Too long with no output (exceeded 10m0s): context deadline exceeded`
-        self.install_steps.append("uv pip install git+https://github.com/ydshieh/pytest.git@8.4.1-ydshieh")
+            self.install_steps = ["uv venv && uv pip install ."]
        if self.pytest_options is None:
            self.pytest_options = {}
        if isinstance(self.tests_to_run, str):
@ -130,14 +95,6 @@ class CircleCIJob:

    def to_dict(self):
        env = COMMON_ENV_VARIABLES.copy()
-        if self.job_name != "tests_hub":
-            # fmt: off
-            # not critical
-            env.update({"HF_TOKEN": "".join(["h", "f", "_", "H", "o", "d", "V", "u", "M", "q", "b", "R", "m", "t", "b", "z", "F", "Q", "O", "Q", "A", "J", "G", "D", "l", "V", "Q", "r", "R", "N", "w", "D", "M", "V", "C", "s", "d"])})
-            # fmt: on
-
-        # Do not run tests decorated by @is_flaky on pull requests
-        env['RUN_FLAKY'] = os.environ.get("CIRCLE_PULL_REQUEST", "") == ""
        env.update(self.additional_env)

        job = {
@ -155,9 +112,7 @@ class CircleCIJob:
                # Examples special case: we need to download NLTK files in advance to avoid cuncurrency issues
        timeout_cmd = f"timeout {self.command_timeout} " if self.command_timeout else ""
        marker_cmd = f"-m '{self.marker}'" if self.marker is not None else ""
-        junit_flags = " -p no:warning -o junit_family=xunit1 --junitxml=test-results/junit.xml"
-        joined_flaky_patterns = "|".join(FLAKY_TEST_FAILURE_PATTERNS)
-        repeat_on_failure_flags = f"--reruns 5 --reruns-delay 2 --only-rerun '({joined_flaky_patterns})'"
+        additional_flags = f" -p no:warning -o junit_family=xunit1 --junitxml=test-results/junit.xml"
        parallel = f' << pipeline.parameters.{self.job_name}_parallelism >> '
        steps = [
            "checkout",
@ -183,33 +138,13 @@ class CircleCIJob:
                    "command": f"TESTS=$(circleci tests split  --split-by=timings {self.job_name}_test_list.txt) && echo $TESTS > splitted_tests.txt && echo $TESTS | tr ' ' '\n'" if self.parallelism else f"awk '{{printf \"%s \", $0}}' {self.job_name}_test_list.txt > splitted_tests.txt"
                    }
            },
-            # During the CircleCI docker images build time, we might already (or not) download the data.
-            # If it's done already, the files are inside the directory `/test_data/`.
-            {"run": {"name": "fetch hub objects before pytest", "command": "cp -r /test_data/* . 2>/dev/null || true; python3 utils/fetch_hub_objects_for_ci.py"}},
-            {"run": {"name": "download and unzip hub cache", "command": 'curl -L -o huggingface-cache.tar.gz https://huggingface.co/datasets/hf-internal-testing/hf_hub_cache/resolve/main/huggingface-cache.tar.gz && apt-get install pigz && tar --use-compress-program="pigz -d -p 8" -xf huggingface-cache.tar.gz && mv -n hub/* /root/.cache/huggingface/hub/ && ls -la /root/.cache/huggingface/hub/'}},
            {"run": {
                "name": "Run tests",
-                "command": f"({timeout_cmd} python3 -m pytest {marker_cmd} -n {self.pytest_num_workers} {junit_flags} {repeat_on_failure_flags} {' '.join(pytest_flags)} $(cat splitted_tests.txt) | tee tests_output.txt)"}
+                "command": f"({timeout_cmd} python3 -m pytest {marker_cmd} -n {self.pytest_num_workers} {additional_flags} {' '.join(pytest_flags)} $(cat splitted_tests.txt) | tee tests_output.txt)"}
            },
-            {"run":
-                {
-                    "name": "Check for test crashes",
-                    "when": "always",
-                    "command": """if [ ! -f tests_output.txt ]; then
-                            echo "ERROR: tests_output.txt does not exist - tests may not have run properly"
-                            exit 1
-                        elif grep -q "crashed and worker restarting disabled" tests_output.txt; then
-                            echo "ERROR: Worker crash detected in test output"
-                            echo "Found: crashed and worker restarting disabled"
-                            exit 1
-                        else
-                            echo "Tests output file exists and no worker crashes detected"
-                        fi"""
-                },
-            },
-            {"run": {"name": "Expand to show skipped tests", "when": "always", "command": "python3 .circleci/parse_test_outputs.py --file tests_output.txt --skip"}},
-            {"run": {"name": "Failed tests: show reasons",   "when": "always", "command": "python3 .circleci/parse_test_outputs.py --file tests_output.txt --fail"}},
-            {"run": {"name": "Errors",                       "when": "always", "command": "python3 .circleci/parse_test_outputs.py --file tests_output.txt --errors"}},
+            {"run": {"name": "Expand to show skipped tests", "when": "always", "command": f"python3 .circleci/parse_test_outputs.py --file tests_output.txt --skip"}},
+            {"run": {"name": "Failed tests: show reasons",   "when": "always", "command": f"python3 .circleci/parse_test_outputs.py --file tests_output.txt --fail"}},
+            {"run": {"name": "Errors",                       "when": "always", "command": f"python3 .circleci/parse_test_outputs.py --file tests_output.txt --errors"}},
            {"store_test_results": {"path": "test-results"}},
            {"store_artifacts": {"path": "test-results/junit.xml"}},
            {"store_artifacts": {"path": "reports"}},
@ -228,79 +163,147 @@ class CircleCIJob:


 # JOBS
+torch_and_tf_job = CircleCIJob(
+    "torch_and_tf",
+    docker_image=[{"image":"huggingface/transformers-torch-tf-light"}],
+    additional_env={"RUN_PT_TF_CROSS_TESTS": True},
+    marker="is_pt_tf_cross_test",
+    pytest_options={"rA": None, "durations": 0},
+)
+
+
+torch_and_flax_job = CircleCIJob(
+    "torch_and_flax",
+    additional_env={"RUN_PT_FLAX_CROSS_TESTS": True},
+    docker_image=[{"image":"huggingface/transformers-torch-jax-light"}],
+    marker="is_pt_flax_cross_test",
+    pytest_options={"rA": None, "durations": 0},
+)
+
 torch_job = CircleCIJob(
    "torch",
    docker_image=[{"image": "huggingface/transformers-torch-light"}],
    marker="not generate",
    parallelism=6,
+    pytest_num_workers=8
 )

 generate_job = CircleCIJob(
    "generate",
    docker_image=[{"image": "huggingface/transformers-torch-light"}],
-    # networkx==3.3 (after #36957) cause some issues
-    # TODO: remove this once it works directly
-    install_steps=["uv pip install ."],
    marker="generate",
    parallelism=6,
+    pytest_num_workers=8
 )

 tokenization_job = CircleCIJob(
    "tokenization",
    docker_image=[{"image": "huggingface/transformers-torch-light"}],
    parallelism=8,
+    pytest_num_workers=16
 )

 processor_job = CircleCIJob(
    "processors",
    docker_image=[{"image": "huggingface/transformers-torch-light"}],
    parallelism=8,
+    pytest_num_workers=6
 )

+tf_job = CircleCIJob(
+    "tf",
+    docker_image=[{"image":"huggingface/transformers-tf-light"}],
+    parallelism=6,
+    pytest_num_workers=16,
+)
+
+
+flax_job = CircleCIJob(
+    "flax",
+    docker_image=[{"image":"huggingface/transformers-jax-light"}],
+    parallelism=6,
+    pytest_num_workers=16
+)
+
+
 pipelines_torch_job = CircleCIJob(
    "pipelines_torch",
    additional_env={"RUN_PIPELINE_TESTS": True},
    docker_image=[{"image":"huggingface/transformers-torch-light"}],
    marker="is_pipeline_test",
-    parallelism=4,
+    parallelism=4
 )

+
+pipelines_tf_job = CircleCIJob(
+    "pipelines_tf",
+    additional_env={"RUN_PIPELINE_TESTS": True},
+    docker_image=[{"image":"huggingface/transformers-tf-light"}],
+    marker="is_pipeline_test",
+    parallelism=4
+)
+
+
 custom_tokenizers_job = CircleCIJob(
    "custom_tokenizers",
    additional_env={"RUN_CUSTOM_TOKENIZERS": True},
    docker_image=[{"image": "huggingface/transformers-custom-tokenizers"}],
 )

+
 examples_torch_job = CircleCIJob(
    "examples_torch",
    additional_env={"OMP_NUM_THREADS": 8},
    docker_image=[{"image":"huggingface/transformers-examples-torch"}],
    # TODO @ArthurZucker remove this once docker is easier to build
-    install_steps=["uv pip install . && uv pip install -r examples/pytorch/_tests_requirements.txt"],
-    pytest_num_workers=4,
+    install_steps=["uv venv && uv pip install . && uv pip install -r examples/pytorch/_tests_requirements.txt"],
+    pytest_num_workers=8,
 )

+
+examples_tensorflow_job = CircleCIJob(
+    "examples_tensorflow",
+    additional_env={"OMP_NUM_THREADS": 8},
+    docker_image=[{"image":"huggingface/transformers-examples-tf"}],
+    pytest_num_workers=16,
+)
+
+
 hub_job = CircleCIJob(
    "hub",
    additional_env={"HUGGINGFACE_CO_STAGING": True},
    docker_image=[{"image":"huggingface/transformers-torch-light"}],
    install_steps=[
-        'uv pip install .',
+        'uv venv && uv pip install .',
        'git config --global user.email "ci@dummy.com"',
        'git config --global user.name "ci"',
    ],
    marker="is_staging_test",
    pytest_num_workers=2,
-    resource_class="medium",
 )

+
+onnx_job = CircleCIJob(
+    "onnx",
+    docker_image=[{"image":"huggingface/transformers-torch-tf-light"}],
+    install_steps=[
+        "uv venv",
+        "uv pip install .[torch,tf,testing,sentencepiece,onnxruntime,vision,rjieba]",
+    ],
+    pytest_options={"k onnx": None},
+    pytest_num_workers=1,
+)
+
+
 exotic_models_job = CircleCIJob(
    "exotic_models",
    docker_image=[{"image":"huggingface/transformers-exotic-models"}],
+    pytest_num_workers=12,
    parallelism=4,
    pytest_options={"durations": 100},
 )

+
 repo_utils_job = CircleCIJob(
    "repo_utils",
    docker_image=[{"image":"huggingface/transformers-consistency"}],
@ -308,14 +311,13 @@ repo_utils_job = CircleCIJob(
    resource_class="large",
 )

+
 non_model_job = CircleCIJob(
    "non_model",
    docker_image=[{"image": "huggingface/transformers-torch-light"}],
-    # networkx==3.3 (after #36957) cause some issues
-    # TODO: remove this once it works directly
-    install_steps=["uv pip install .[serving]"],
    marker="not generate",
    parallelism=6,
+    pytest_num_workers=8,
 )


@ -331,7 +333,7 @@ doc_test_job = CircleCIJob(
    additional_env={"TRANSFORMERS_VERBOSITY": "error", "DATASETS_VERBOSITY": "error", "SKIP_CUDA_DOCTEST": "1"},
    install_steps=[
        # Add an empty file to keep the test step running correctly even no file is selected to be tested.
-        "uv pip install .",
+        "uv venv && pip install .",
        "touch dummy.py",
        command,
        "cat pr_documentation_tests_temp.txt",
@ -343,14 +345,13 @@ doc_test_job = CircleCIJob(
    pytest_num_workers=1,
 )

-REGULAR_TESTS = [torch_job, hub_job, tokenization_job, processor_job, generate_job, non_model_job] # fmt: skip
-EXAMPLES_TESTS = [examples_torch_job]
-PIPELINE_TESTS = [pipelines_torch_job]
+REGULAR_TESTS = [torch_and_tf_job, torch_and_flax_job, torch_job, tf_job, flax_job, hub_job, onnx_job, tokenization_job, processor_job, generate_job, non_model_job] # fmt: skip
+EXAMPLES_TESTS = [examples_torch_job, examples_tensorflow_job]
+PIPELINE_TESTS = [pipelines_torch_job, pipelines_tf_job]
 REPO_UTIL_TESTS = [repo_utils_job]
 DOC_TESTS = [doc_test_job]
 ALL_TESTS = REGULAR_TESTS + EXAMPLES_TESTS + PIPELINE_TESTS + REPO_UTIL_TESTS + DOC_TESTS + [custom_tokenizers_job] + [exotic_models_job]  # fmt: skip

-
 def create_circleci_config(folder=None):
    if folder is None:
        folder = os.getcwd()
@ -360,28 +361,17 @@ def create_circleci_config(folder=None):

    if len(jobs) == 0:
        jobs = [EmptyJob()]
-    else:
-        print("Full list of job name inputs", {j.job_name + "_test_list":{"type":"string", "default":''} for j in jobs})
-        # Add a job waiting all the test jobs and aggregate their test summary files at the end
-        collection_job = EmptyJob()
-        collection_job.job_name = "collection_job"
-        jobs = [collection_job] + jobs
-
+    print("Full list of job name inputs", {j.job_name + "_test_list":{"type":"string", "default":''} for j in jobs})
    config = {
        "version": "2.1",
        "parameters": {
            # Only used to accept the parameters from the trigger
            "nightly": {"type": "boolean", "default": False},
-            # Only used to accept the parameters from GitHub Actions trigger
-            "GHA_Actor": {"type": "string", "default": ""},
-            "GHA_Action": {"type": "string", "default": ""},
-            "GHA_Event": {"type": "string", "default": ""},
-            "GHA_Meta": {"type": "string", "default": ""},
-            "tests_to_run": {"type": "string", "default": ""},
+            "tests_to_run": {"type": "string", "default": ''},
            **{j.job_name + "_test_list":{"type":"string", "default":''} for j in jobs},
            **{j.job_name + "_parallelism":{"type":"integer", "default":1} for j in jobs},
        },
-        "jobs": {j.job_name: j.to_dict() for j in jobs}
+        "jobs" : {j.job_name: j.to_dict() for j in jobs}
    }
    if "CIRCLE_TOKEN" in os.environ:
        # For private forked repo. (e.g. new model addition)
--- a/.circleci/parse_test_outputs.py
+++ b/.circleci/parse_test_outputs.py
@ -1,6 +1,5 @@
-import argparse
 import re
-
+import argparse

 def parse_pytest_output(file_path):
    skipped_tests = {}
--- a/.github/ISSUE_TEMPLATE/bug-report.yml
+++ b/.github/ISSUE_TEMPLATE/bug-report.yml
@ -16,7 +16,7 @@ body:
    id: system-info
    attributes:
      label: System Info
-      description: Please share your system info with us. You can run the command `transformers env` and copy-paste its output below.
+      description: Please share your system info with us. You can run the command `transformers-cli env` and copy-paste its output below.
      placeholder: transformers version, platform, python version, ...
    validations:
      required: true
@ -36,38 +36,26 @@ body:

        Models:

-          - text models: @ArthurZucker @Cyrilvallez
-          - vision models: @yonigozlan @molbap
-          - audio models: @eustlb @ebezzam @vasqu
-          - multimodal models: @zucchini-nlp
+          - text models: @ArthurZucker
+          - vision models: @amyeroberts, @qubvel
+          - speech models: @ylacombe, @eustlb
          - graph models: @clefourrier

        Library:

+          - flax: @sanchit-gandhi
          - generate: @zucchini-nlp (visual-language models) or @gante (all others)
-          - continuous batching: @remi-or @ArthurZucker @McPatate
          - pipelines: @Rocketknight1
+          - tensorflow: @gante and @Rocketknight1
          - tokenizers: @ArthurZucker and @itazap
-          - trainer: @zach-huggingface @SunMarc
-          - attention: @vasqu @ArthurZucker @CyrilVallez
-          - model loading (from pretrained, etc): @CyrilVallez
-          - distributed: @3outeille @ArthurZucker @S1ro1
-          - CIs: @ydshieh
+          - trainer: @muellerzr @SunMarc

        Integrations:

-          - deepspeed: HF Trainer/Accelerate: @SunMarc @zach-huggingface
+          - deepspeed: HF Trainer/Accelerate: @muellerzr
          - ray/raytune: @richardliaw, @amogkam
          - Big Model Inference: @SunMarc
          - quantization (bitsandbytes, autogpt): @SunMarc @MekkCyber
-          - kernels: @MekkCyber @drbh
-          - peft: @BenjaminBossan @githubnemo
-        
-        Devices/Backends:
-        
-          - AMD ROCm: @ivarflakstad
-          - Intel XPU: @IlyasMoutawwakil
-          - Ascend NPU: @ivarflakstad 

        Documentation: @stevhliu

@ -75,6 +63,19 @@ body:

          - for issues with a model, report at https://discuss.huggingface.co/ and tag the model's creator.

+        HF projects:
+
+          - accelerate: [different repo](https://github.com/huggingface/accelerate)
+          - datasets: [different repo](https://github.com/huggingface/datasets)
+          - diffusers: [different repo](https://github.com/huggingface/diffusers)
+          - rust tokenizers: [different repo](https://github.com/huggingface/tokenizers)
+
+        Maintained examples (not research project or legacy):
+
+          - Flax: @sanchit-gandhi
+          - PyTorch: See Models above and tag the person corresponding to the modality of the example.
+          - TensorFlow: @Rocketknight1
+
        Research projects are not maintained and should be taken as is.

      placeholder: "@Username ..."
@ -105,7 +106,6 @@ body:
      label: Reproduction
      description: |
        Please provide a code sample that reproduces the problem you ran into. It can be a Colab link or just a code snippet.
-        Please include relevant config information with your code, for example your Trainers, TRL, Peft, and DeepSpeed configs.
        If you have code snippets, error messages, stack traces please provide them here as well.
        Important! Use code tags to correctly format your code. See https://help.github.com/en/github/writing-on-github/creating-and-highlighting-code-blocks#syntax-highlighting
        Do not use screenshots, as they are hard to read and (more importantly) don't allow others to copy-and-paste your code.
--- a/.github/ISSUE_TEMPLATE/i18n.md
+++ b/.github/ISSUE_TEMPLATE/i18n.md
@ -23,7 +23,7 @@ Some notes:
 * Please translate in a gender-neutral way.
 * Add your translations to the folder called `<languageCode>` inside the [source folder](https://github.com/huggingface/transformers/tree/main/docs/source).
 * Register your translation in `<languageCode>/_toctree.yml`; please follow the order of the [English version](https://github.com/huggingface/transformers/blob/main/docs/source/en/_toctree.yml).
-* Once you're finished, open a pull request and tag this issue by including #issue-number in the description, where issue-number is the number of this issue. Please ping @stevhliu for review.
+* Once you're finished, open a pull request and tag this issue by including #issue-number in the description, where issue-number is the number of this issue. Please ping @stevhliu and @MKhalusova for review.
 * 🙋 If you'd like others to help you with the translation, you can also post in the 🤗 [forums](https://discuss.huggingface.co/).

 ## Get Started section
--- a/.github/ISSUE_TEMPLATE/migration.yml
+++ b/.github/ISSUE_TEMPLATE/migration.yml
@ -6,7 +6,7 @@ body:
    id: system-info
    attributes:
      label: System Info
-      description: Please share your system info with us. You can run the command `transformers env` and copy-paste its output below.
+      description: Please share your system info with us. You can run the command `transformers-cli env` and copy-paste its output below.
      render: shell
      placeholder: transformers version, platform, python version, ...
    validations:
--- a/.github/PULL_REQUEST_TEMPLATE.md
+++ b/.github/PULL_REQUEST_TEMPLATE.md
@ -39,41 +39,41 @@ members/contributors who may be interested in your PR.

 Models:

- text models: @ArthurZucker @Cyrilvallez
- vision models: @yonigozlan @molbap
- audio models: @eustlb @ebezzam @vasqu
- multimodal models: @zucchini-nlp
+- text models: @ArthurZucker
+- vision models: @amyeroberts, @qubvel
+- speech models: @ylacombe, @eustlb
 - graph models: @clefourrier

 Library:

+- flax: @sanchit-gandhi
 - generate: @zucchini-nlp (visual-language models) or @gante (all others)
- continuous batching: @remi-or @ArthurZucker @McPatate
 - pipelines: @Rocketknight1
- tokenizers: @ArthurZucker and @itazap
- trainer: @zach-huggingface @SunMarc
- attention: @vasqu @ArthurZucker @CyrilVallez
- model loading (from pretrained, etc): @CyrilVallez
- distributed: @3outeille @ArthurZucker @S1ro1
- CIs: @ydshieh
+- tensorflow: @gante and @Rocketknight1
+- tokenizers: @ArthurZucker
+- trainer: @muellerzr and @SunMarc
+- chat templates: @Rocketknight1

 Integrations:

- deepspeed: HF Trainer/Accelerate: @SunMarc @zach-huggingface
+- deepspeed: HF Trainer/Accelerate: @muellerzr
 - ray/raytune: @richardliaw, @amogkam
 - Big Model Inference: @SunMarc
 - quantization (bitsandbytes, autogpt): @SunMarc @MekkCyber
- kernels: @MekkCyber @drbh
- peft: @BenjaminBossan @githubnemo
-
-Devices/Backends:
-
- AMD ROCm: @ivarflakstad
- Intel XPU: @IlyasMoutawwakil
- Ascend NPU: @ivarflakstad 

 Documentation: @stevhliu

-Research projects are not maintained and should be taken as is.
+HF projects:
+
+- accelerate: [different repo](https://github.com/huggingface/accelerate)
+- datasets: [different repo](https://github.com/huggingface/datasets)
+- diffusers: [different repo](https://github.com/huggingface/diffusers)
+- rust tokenizers: [different repo](https://github.com/huggingface/tokenizers)
+
+Maintained examples (not research project or legacy):
+
+- Flax: @sanchit-gandhi
+- PyTorch: See Models above and tag the person corresponding to the modality of the example.
+- TensorFlow: @Rocketknight1

 -->
--- a/.github/copilot-instructions.md
+++ b/.github/copilot-instructions.md
@ -1,39 +0,0 @@
-# copilot-instructions.md Guide for Hugging Face Transformers
-
-This copilot-instructions.md file provides guidance for code agents working with this codebase.
-
-## Core Project Structure
-
- `/src/transformers`: This contains the core source code for the library
-  - `/models`: Code for individual models. Models inherit from base classes in the root `/src/transformers` directory.
- `/tests`: This contains the core test classes for the library. These are usually inherited rather than directly run.
-  - `/models`: Tests for individual models. Model tests inherit from common tests in the root `/tests` directory.
- `/docs`: This contains the documentation for the library, including guides, tutorials, and API references.
-
-## Coding Conventions for Hugging Face Transformers
-
- PRs should be as brief as possible. Bugfix PRs in particular can often be only one or two lines long, and do not need large comments, docstrings or new functions in this case. Aim to minimize the size of the diff.
- When writing tests, they should be added to an existing file. The only exception is for PRs to add a new model, when a new test directory should be created for that model.
- Code style is enforced in the CI. You can install the style tools with `pip install -e .[quality]`. You can then run `make fixup` to apply style and consistency fixes to your code.
-
-## Copying and inheritance
-
-Many models in the codebase have similar code, but it is not shared by inheritance because we want each model file to be self-contained.
-We use two mechanisms to keep this code in sync:
-
- "Copied from" syntax. Functions or entire classes can have a comment at the top like this: `# Copied from transformers.models.llama.modeling_llama.rotate_half` or `# Copied from transformers.models.t5.modeling_t5.T5LayerNorm with T5->MT5`
-  These comments are actively checked by the style tools, and copies will automatically be updated when the base code is updated. If you need to update a copied function, you should
-  either update the base function and use `make fixup` to propagate the change to all copies, or simply remove the `# Copied from` comment if that is inappropriate.
- "Modular" files. These files briefly define models by composing them using inheritance from other models. They are not meant to be used directly. Instead, the style tools
-  automatically generate a complete modeling file, like `modeling_bert.py`, from the modular file like `modular_bert.py`. If a model has a modular file, the modeling file
-  should never be edited directly! Instead, changes should be made in the modular file, and then you should run `make fixup` to update the modeling file automatically.
-
-When adding new models, you should prefer `modular` style and inherit as many classes as possible from existing models.
-
-## Testing
-
-After making changes, you should usually run `make fixup` to ensure any copies and modular files are updated, and then test all affected models. This includes both
-the model you made the changes in and any other models that were updated by `make fixup`. Tests can be run with `pytest tests/models/[name]/test_modeling_[name].py`
-If your changes affect code in other classes like tokenizers or processors, you should run those tests instead, like `test_processing_[name].py` or `test_tokenization_[name].py`.
-
-In order to run tests, you may need to install dependencies. You can do this with `pip install -e .[testing]`. You will probably also need to `pip install torch accelerate` if your environment does not already have them.
--- a/.github/scripts/assign_reviewers.py
+++ b/.github/scripts/assign_reviewers.py
@ -1,122 +0,0 @@
-# coding=utf-8
-# Copyright 2025 the HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import json
-import os
-import re
-from collections import Counter
-from pathlib import Path
-
-import github
-from github import Github
-
-
-def pattern_to_regex(pattern):  # Turn one CODEOWNERS-style glob into a regex source string (a str, not a compiled pattern)
-    if pattern.startswith("/"):  # A leading "/" requests anchoring at the start of the path
-        start_anchor = True
-        pattern = re.escape(pattern[1:])  # Drop the "/" and escape the rest so it matches literally
-    else:
-        start_anchor = False
-        pattern = re.escape(pattern)
-    # re.escape turned every `*` into `\*`; replace that with "any number of non-slash characters"
-    pattern = pattern.replace(r"\*", "[^/]*")
-    if start_anchor:
-        pattern = r"^\/?" + pattern  # Anchor at the start of the string, allowing one optional leading slash
-    return pattern  # No `$` is ever appended, and patterns without a leading "/" get no `^` either
-
-def get_file_owners(file_path, codeowners_lines):  # Return the owners (without "@") of the last rule that matches file_path
-    # Walk the rules bottom-up so that the last matching pattern in the file takes precedence
-    for line in reversed(codeowners_lines):
-        # Keep only the text before the first '#': this drops full-line and inline comments alike
-        line = line.split('#')[0].strip()
-        if not line:
-            continue
-
-        # First whitespace-separated token is the pattern; the remaining tokens are the owners
-        parts = line.split()
-        pattern = parts[0]
-        # The owner list can be empty, e.g. for dummy files with explicitly no owner!
-        owners = [owner.removeprefix("@") for owner in parts[1:]]
-
-        # re.search is used, so the regex may match anywhere in the path unless it was anchored
-        file_regex = pattern_to_regex(pattern)
-        if re.search(file_regex, file_path) is not None:
-            return owners  # Remember, can still be empty!
-    return []  # No rule matched; not expected to happen, but fall back to "no owners" just in case
-
-def pr_author_is_in_hf(pr_author, codeowners_lines):  # True if pr_author is listed as an owner on any rule line
-    # Scan every rule; the patterns are ignored here, only the owner names matter
-    for line in codeowners_lines:
-        line = line.split('#')[0].strip()  # Drop comments and surrounding whitespace
-        if not line:
-            continue
-
-        # Everything after the first token (the pattern) is an owner
-        parts = line.split()
-        owners = [owner.removeprefix("@") for owner in parts[1:]]
-
-        if pr_author in owners:  # Exact, case-sensitive comparison against the listed names
-            return True
-    return False
-
-def main():  # Entry point: request review on a PR from the owners of the files it changes
-    script_dir = Path(__file__).parent.absolute()
-    with open(script_dir / "codeowners_for_review_action") as f:  # The rules file sits next to this script
-        codeowners_lines = f.readlines()
-
-    g = Github(os.environ['GITHUB_TOKEN'])  # Raises KeyError when GITHUB_TOKEN is not set
-    repo = g.get_repo("huggingface/transformers")
-    with open(os.environ['GITHUB_EVENT_PATH']) as f:  # JSON file holding the triggering event
-        event = json.load(f)
-
-    # The PR number is available in the event payload
-    pr_number = event['pull_request']['number']
-    pr = repo.get_pull(pr_number)
-    pr_author = pr.user.login
-    if pr_author_is_in_hf(pr_author, codeowners_lines):  # Authors who are themselves listed as owners get no automatic request
-        print(f"PR author {pr_author} is in codeowners, skipping review request.")
-        return
-
-    existing_reviews = list(pr.get_reviews())
-    if existing_reviews:  # The PR has already been reviewed: do not request anyone else
-        print(f"Already has reviews: {[r.user.login for r in existing_reviews]}")
-        return
-
-    users_requested, teams_requested = pr.get_review_requests()
-    users_requested = list(users_requested)
-    if users_requested:  # Only user requests are checked; teams_requested is never read
-        print(f"Reviewers already requested: {users_requested}")
-        return
-
-    locs_per_owner = Counter()  # owner -> sum of file.changes over the changed files that owner matches
-    for file in pr.get_files():
-        owners = get_file_owners(file.filename, codeowners_lines)
-        for owner in owners:
-            locs_per_owner[owner] += file.changes
-
-    # Request review from the two owners with the highest totals, leaving out the PR author if present
-    locs_per_owner.pop(pr_author, None)
-    top_owners = locs_per_owner.most_common(2)
-    print("Top owners", top_owners)
-    top_owners = [owner[0] for owner in top_owners]  # Keep the names, drop the counts
-    try:
-        pr.create_review_request(top_owners)  # NOTE(review): top_owners may be empty here; the call is made regardless
-    except github.GithubException as e:  # Report the failure and carry on instead of raising
-        print(f"Failed to request review for {top_owners}: {e}")
-
-
-
-if __name__ == "__main__":  # Call main() only when this file is run as a script, not when it is imported
-    main()
--- a/.github/scripts/codeowners_for_review_action
+++ b/.github/scripts/codeowners_for_review_action
@ -1,370 +0,0 @@
-# Top-level rules are matched only if nothing else matches
-* @Rocketknight1 @ArthurZucker # if no one is pinged based on the other rules, they will do the dispatch
-*.md @stevhliu
-*tokenization* @ArthurZucker
-docs/ @stevhliu
-/benchmark/ @McPatate
-/docker/ @ydshieh @ArthurZucker
-
-# More high-level globs catch cases when specific rules later don't apply
-/src/transformers/models/*/processing* @molbap @yonigozlan
-/src/transformers/models/*/image_processing* @yonigozlan
-/src/transformers/models/*/image_processing_*_fast* @yonigozlan
-
-# Owners of subsections of the library
-/src/transformers/generation/ @gante
-/src/transformers/pipeline/ @Rocketknight1 @yonigozlan
-/src/transformers/integrations/ @SunMarc @MekkCyber @zach-huggingface
-/src/transformers/quantizers/ @SunMarc @MekkCyber
-tests/ @ydshieh
-tests/generation/ @gante
-
-/src/transformers/models/auto/ @ArthurZucker
-/src/transformers/utils/ @ArthurZucker @Rocketknight1
-/src/transformers/loss/ @ArthurZucker
-/src/transformers/onnx/ @michaelbenayoun
-
-# Specific files come after the sections/globs, so they take priority
-/.circleci/config.yml @ArthurZucker @ydshieh
-/utils/tests_fetcher.py @ydshieh
-trainer.py @zach-huggingface @SunMarc
-trainer_utils.py @zach-huggingface @SunMarc
-/utils/modular_model_converter.py @Cyrilvallez @ArthurZucker
-
-# Owners of individual models are specific / high priority, and so they come last
-# mod* captures modeling and modular files
-
-# Text models
-/src/transformers/models/albert/mod*_albert* @ArthurZucker
-/src/transformers/models/bamba/mod*_bamba* @ArthurZucker
-/src/transformers/models/bart/mod*_bart* @ArthurZucker
-/src/transformers/models/barthez/mod*_barthez* @ArthurZucker
-/src/transformers/models/bartpho/mod*_bartpho* @ArthurZucker
-/src/transformers/models/bert/mod*_bert* @ArthurZucker
-/src/transformers/models/bert_generation/mod*_bert_generation* @ArthurZucker
-/src/transformers/models/bert_japanese/mod*_bert_japanese* @ArthurZucker
-/src/transformers/models/bertweet/mod*_bertweet* @ArthurZucker
-/src/transformers/models/big_bird/mod*_big_bird* @ArthurZucker
-/src/transformers/models/bigbird_pegasus/mod*_bigbird_pegasus* @ArthurZucker
-/src/transformers/models/biogpt/mod*_biogpt* @ArthurZucker
-/src/transformers/models/blenderbot/mod*_blenderbot* @ArthurZucker
-/src/transformers/models/blenderbot_small/mod*_blenderbot_small* @ArthurZucker
-/src/transformers/models/bloom/mod*_bloom* @ArthurZucker
-/src/transformers/models/bort/mod*_bort* @ArthurZucker
-/src/transformers/models/byt5/mod*_byt5* @ArthurZucker
-/src/transformers/models/camembert/mod*_camembert* @ArthurZucker
-/src/transformers/models/canine/mod*_canine* @ArthurZucker
-/src/transformers/models/codegen/mod*_codegen* @ArthurZucker
-/src/transformers/models/code_llama/mod*_code_llama* @ArthurZucker
-/src/transformers/models/cohere/mod*_cohere* @ArthurZucker
-/src/transformers/models/cohere2/mod*_cohere2* @ArthurZucker
-/src/transformers/models/convbert/mod*_convbert* @ArthurZucker
-/src/transformers/models/cpm/mod*_cpm* @ArthurZucker
-/src/transformers/models/cpmant/mod*_cpmant* @ArthurZucker
-/src/transformers/models/ctrl/mod*_ctrl* @ArthurZucker
-/src/transformers/models/dbrx/mod*_dbrx* @ArthurZucker
-/src/transformers/models/deberta/mod*_deberta* @ArthurZucker
-/src/transformers/models/deberta_v2/mod*_deberta_v2* @ArthurZucker
-/src/transformers/models/dialogpt/mod*_dialogpt* @ArthurZucker
-/src/transformers/models/diffllama/mod*_diffllama* @ArthurZucker
-/src/transformers/models/distilbert/mod*_distilbert* @ArthurZucker
-/src/transformers/models/dpr/mod*_dpr* @ArthurZucker
-/src/transformers/models/electra/mod*_electra* @ArthurZucker
-/src/transformers/models/encoder_decoder/mod*_encoder_decoder* @ArthurZucker
-/src/transformers/models/ernie/mod*_ernie* @ArthurZucker
-/src/transformers/models/ernie_m/mod*_ernie_m* @ArthurZucker
-/src/transformers/models/esm/mod*_esm* @ArthurZucker
-/src/transformers/models/falcon/mod*_falcon* @ArthurZucker
-/src/transformers/models/falcon3/mod*_falcon3* @ArthurZucker
-/src/transformers/models/falcon_mamba/mod*_falcon_mamba* @ArthurZucker
-/src/transformers/models/fastspeech2_conformer/mod*_fastspeech2_conformer* @ArthurZucker
-/src/transformers/models/flan_t5/mod*_flan_t5* @ArthurZucker
-/src/transformers/models/flan_ul2/mod*_flan_ul2* @ArthurZucker
-/src/transformers/models/flaubert/mod*_flaubert* @ArthurZucker
-/src/transformers/models/fnet/mod*_fnet* @ArthurZucker
-/src/transformers/models/fsmt/mod*_fsmt* @ArthurZucker
-/src/transformers/models/funnel/mod*_funnel* @ArthurZucker
-/src/transformers/models/fuyu/mod*_fuyu* @ArthurZucker
-/src/transformers/models/gemma/mod*_gemma* @ArthurZucker
-/src/transformers/models/gemma2/mod*_gemma2* @ArthurZucker
-/src/transformers/models/glm/mod*_glm* @ArthurZucker
-/src/transformers/models/openai_gpt/mod*_openai_gpt* @ArthurZucker
-/src/transformers/models/gpt_neo/mod*_gpt_neo* @ArthurZucker
-/src/transformers/models/gpt_neox/mod*_gpt_neox* @ArthurZucker
-/src/transformers/models/gpt_neox_japanese/mod*_gpt_neox_japanese* @ArthurZucker
-/src/transformers/models/gptj/mod*_gptj* @ArthurZucker
-/src/transformers/models/gpt2/mod*_gpt2* @ArthurZucker
-/src/transformers/models/gpt_bigcode/mod*_gpt_bigcode* @ArthurZucker
-/src/transformers/models/gptsan_japanese/mod*_gptsan_japanese* @ArthurZucker
-/src/transformers/models/gpt_sw3/mod*_gpt_sw3* @ArthurZucker
-/src/transformers/models/granite/mod*_granite* @ArthurZucker
-/src/transformers/models/granitemoe/mod*_granitemoe* @ArthurZucker
-/src/transformers/models/herbert/mod*_herbert* @ArthurZucker
-/src/transformers/models/ibert/mod*_ibert* @ArthurZucker
-/src/transformers/models/jamba/mod*_jamba* @ArthurZucker
-/src/transformers/models/jetmoe/mod*_jetmoe* @ArthurZucker
-/src/transformers/models/jukebox/mod*_jukebox* @ArthurZucker
-/src/transformers/models/led/mod*_led* @ArthurZucker
-/src/transformers/models/llama/mod*_llama* @ArthurZucker @Cyrilvallez
-/src/transformers/models/longformer/mod*_longformer* @ArthurZucker
-/src/transformers/models/longt5/mod*_longt5* @ArthurZucker
-/src/transformers/models/luke/mod*_luke* @ArthurZucker
-/src/transformers/models/m2m_100/mod*_m2m_100* @ArthurZucker
-/src/transformers/models/madlad_400/mod*_madlad_400* @ArthurZucker
-/src/transformers/models/mamba/mod*_mamba* @ArthurZucker
-/src/transformers/models/mamba2/mod*_mamba2* @ArthurZucker
-/src/transformers/models/marian/mod*_marian* @ArthurZucker
-/src/transformers/models/markuplm/mod*_markuplm* @ArthurZucker
-/src/transformers/models/mbart/mod*_mbart* @ArthurZucker
-/src/transformers/models/mega/mod*_mega* @ArthurZucker
-/src/transformers/models/megatron_bert/mod*_megatron_bert* @ArthurZucker
-/src/transformers/models/megatron_gpt2/mod*_megatron_gpt2* @ArthurZucker
-/src/transformers/models/mistral/mod*_mistral* @ArthurZucker
-/src/transformers/models/mixtral/mod*_mixtral* @ArthurZucker
-/src/transformers/models/mluke/mod*_mluke* @ArthurZucker
-/src/transformers/models/mobilebert/mod*_mobilebert* @ArthurZucker
-/src/transformers/models/modernbert/mod*_modernbert* @ArthurZucker
-/src/transformers/models/mpnet/mod*_mpnet* @ArthurZucker
-/src/transformers/models/mpt/mod*_mpt* @ArthurZucker
-/src/transformers/models/mra/mod*_mra* @ArthurZucker
-/src/transformers/models/mt5/mod*_mt5* @ArthurZucker
-/src/transformers/models/mvp/mod*_mvp* @ArthurZucker
-/src/transformers/models/myt5/mod*_myt5* @ArthurZucker
-/src/transformers/models/nemotron/mod*_nemotron* @ArthurZucker
-/src/transformers/models/nezha/mod*_nezha* @ArthurZucker
-/src/transformers/models/nllb/mod*_nllb* @ArthurZucker
-/src/transformers/models/nllb_moe/mod*_nllb_moe* @ArthurZucker
-/src/transformers/models/nystromformer/mod*_nystromformer* @ArthurZucker
-/src/transformers/models/olmo/mod*_olmo* @ArthurZucker
-/src/transformers/models/olmo2/mod*_olmo2* @ArthurZucker
-/src/transformers/models/olmoe/mod*_olmoe* @ArthurZucker
-/src/transformers/models/open_llama/mod*_open_llama* @ArthurZucker
-/src/transformers/models/opt/mod*_opt* @ArthurZucker
-/src/transformers/models/pegasus/mod*_pegasus* @ArthurZucker
-/src/transformers/models/pegasus_x/mod*_pegasus_x* @ArthurZucker
-/src/transformers/models/persimmon/mod*_persimmon* @ArthurZucker
-/src/transformers/models/phi/mod*_phi* @ArthurZucker
-/src/transformers/models/phi3/mod*_phi3* @ArthurZucker
-/src/transformers/models/phimoe/mod*_phimoe* @ArthurZucker
-/src/transformers/models/phobert/mod*_phobert* @ArthurZucker
-/src/transformers/models/plbart/mod*_plbart* @ArthurZucker
-/src/transformers/models/prophetnet/mod*_prophetnet* @ArthurZucker
-/src/transformers/models/qdqbert/mod*_qdqbert* @ArthurZucker
-/src/transformers/models/qwen2/mod*_qwen2* @ArthurZucker
-/src/transformers/models/qwen2_moe/mod*_qwen2_moe* @ArthurZucker
-/src/transformers/models/rag/mod*_rag* @ArthurZucker
-/src/transformers/models/realm/mod*_realm* @ArthurZucker
-/src/transformers/models/recurrent_gemma/mod*_recurrent_gemma* @ArthurZucker
-/src/transformers/models/reformer/mod*_reformer* @ArthurZucker
-/src/transformers/models/rembert/mod*_rembert* @ArthurZucker
-/src/transformers/models/retribert/mod*_retribert* @ArthurZucker
-/src/transformers/models/roberta/mod*_roberta* @ArthurZucker
-/src/transformers/models/roberta_prelayernorm/mod*_roberta_prelayernorm* @ArthurZucker
-/src/transformers/models/roc_bert/mod*_roc_bert* @ArthurZucker
-/src/transformers/models/roformer/mod*_roformer* @ArthurZucker
-/src/transformers/models/rwkv/mod*_rwkv* @ArthurZucker
-/src/transformers/models/splinter/mod*_splinter* @ArthurZucker
-/src/transformers/models/squeezebert/mod*_squeezebert* @ArthurZucker
-/src/transformers/models/stablelm/mod*_stablelm* @ArthurZucker
-/src/transformers/models/starcoder2/mod*_starcoder2* @ArthurZucker
-/src/transformers/models/switch_transformers/mod*_switch_transformers* @ArthurZucker
-/src/transformers/models/t5/mod*_t5* @ArthurZucker
-/src/transformers/models/t5v1.1/mod*_t5v1.1* @ArthurZucker
-/src/transformers/models/tapex/mod*_tapex* @ArthurZucker
-/src/transformers/models/transfo_xl/mod*_transfo_xl* @ArthurZucker
-/src/transformers/models/ul2/mod*_ul2* @ArthurZucker
-/src/transformers/models/umt5/mod*_umt5* @ArthurZucker
-/src/transformers/models/xmod/mod*_xmod* @ArthurZucker
-/src/transformers/models/xglm/mod*_xglm* @ArthurZucker
-/src/transformers/models/xlm/mod*_xlm* @ArthurZucker
-/src/transformers/models/xlm_prophetnet/mod*_xlm_prophetnet* @ArthurZucker
-/src/transformers/models/xlm_roberta/mod*_xlm_roberta* @ArthurZucker
-/src/transformers/models/xlm_roberta_xl/mod*_xlm_roberta_xl* @ArthurZucker
-/src/transformers/models/xlm_v/mod*_xlm_v* @ArthurZucker
-/src/transformers/models/xlnet/mod*_xlnet* @ArthurZucker
-/src/transformers/models/yoso/mod*_yoso* @ArthurZucker
-/src/transformers/models/zamba/mod*_zamba* @ArthurZucker
-
-# Vision models
-/src/transformers/models/beit/mod*_beit* @yonigozlan @molbap
-/src/transformers/models/bit/mod*_bit* @yonigozlan @molbap
-/src/transformers/models/conditional_detr/mod*_conditional_detr* @yonigozlan @molbap
-/src/transformers/models/convnext/mod*_convnext* @yonigozlan @molbap
-/src/transformers/models/convnextv2/mod*_convnextv2* @yonigozlan @molbap
-/src/transformers/models/cvt/mod*_cvt* @yonigozlan @molbap
-/src/transformers/models/deformable_detr/mod*_deformable_detr* @yonigozlan @molbap
-/src/transformers/models/deit/mod*_deit* @yonigozlan @molbap
-/src/transformers/models/depth_anything/mod*_depth_anything* @yonigozlan @molbap
-/src/transformers/models/depth_anything_v2/mod*_depth_anything_v2* @yonigozlan @molbap
-/src/transformers/models/deta/mod*_deta* @yonigozlan @molbap
-/src/transformers/models/detr/mod*_detr* @yonigozlan @molbap
-/src/transformers/models/dinat/mod*_dinat* @yonigozlan @molbap
-/src/transformers/models/dinov2/mod*_dinov2* @yonigozlan @molbap
-/src/transformers/models/dinov2_with_registers/mod*_dinov2_with_registers* @yonigozlan @molbap
-/src/transformers/models/dit/mod*_dit* @yonigozlan @molbap
-/src/transformers/models/dpt/mod*_dpt* @yonigozlan @molbap
-/src/transformers/models/efficientformer/mod*_efficientformer* @yonigozlan @molbap
-/src/transformers/models/efficientnet/mod*_efficientnet* @yonigozlan @molbap
-/src/transformers/models/focalnet/mod*_focalnet* @yonigozlan @molbap
-/src/transformers/models/glpn/mod*_glpn* @yonigozlan @molbap
-/src/transformers/models/hiera/mod*_hiera* @yonigozlan @molbap
-/src/transformers/models/ijepa/mod*_ijepa* @yonigozlan @molbap
-/src/transformers/models/imagegpt/mod*_imagegpt* @yonigozlan @molbap
-/src/transformers/models/levit/mod*_levit* @yonigozlan @molbap
-/src/transformers/models/mask2former/mod*_mask2former* @yonigozlan @molbap
-/src/transformers/models/maskformer/mod*_maskformer* @yonigozlan @molbap
-/src/transformers/models/mobilenet_v1/mod*_mobilenet_v1* @yonigozlan @molbap
-/src/transformers/models/mobilenet_v2/mod*_mobilenet_v2* @yonigozlan @molbap
-/src/transformers/models/mobilevit/mod*_mobilevit* @yonigozlan @molbap
-/src/transformers/models/mobilevitv2/mod*_mobilevitv2* @yonigozlan @molbap
-/src/transformers/models/nat/mod*_nat* @yonigozlan @molbap
-/src/transformers/models/poolformer/mod*_poolformer* @yonigozlan @molbap
-/src/transformers/models/pvt/mod*_pvt* @yonigozlan @molbap
-/src/transformers/models/pvt_v2/mod*_pvt_v2* @yonigozlan @molbap
-/src/transformers/models/regnet/mod*_regnet* @yonigozlan @molbap
-/src/transformers/models/resnet/mod*_resnet* @yonigozlan @molbap
-/src/transformers/models/rt_detr/mod*_rt_detr* @yonigozlan @molbap
-/src/transformers/models/segformer/mod*_segformer* @yonigozlan @molbap
-/src/transformers/models/seggpt/mod*_seggpt* @yonigozlan @molbap
-/src/transformers/models/superpoint/mod*_superpoint* @yonigozlan @molbap
-/src/transformers/models/swiftformer/mod*_swiftformer* @yonigozlan @molbap
-/src/transformers/models/swin/mod*_swin* @yonigozlan @molbap
-/src/transformers/models/swinv2/mod*_swinv2* @yonigozlan @molbap
-/src/transformers/models/swin2sr/mod*_swin2sr* @yonigozlan @molbap
-/src/transformers/models/table_transformer/mod*_table_transformer* @yonigozlan @molbap
-/src/transformers/models/textnet/mod*_textnet* @yonigozlan @molbap
-/src/transformers/models/timm_wrapper/mod*_timm_wrapper* @yonigozlan @molbap
-/src/transformers/models/upernet/mod*_upernet* @yonigozlan @molbap
-/src/transformers/models/van/mod*_van* @yonigozlan @molbap
-/src/transformers/models/vit/mod*_vit* @yonigozlan @molbap
-/src/transformers/models/vit_hybrid/mod*_vit_hybrid* @yonigozlan @molbap
-/src/transformers/models/vitdet/mod*_vitdet* @yonigozlan @molbap
-/src/transformers/models/vit_mae/mod*_vit_mae* @yonigozlan @molbap
-/src/transformers/models/vitmatte/mod*_vitmatte* @yonigozlan @molbap
-/src/transformers/models/vit_msn/mod*_vit_msn* @yonigozlan @molbap
-/src/transformers/models/vitpose/mod*_vitpose* @yonigozlan @molbap
-/src/transformers/models/yolos/mod*_yolos* @yonigozlan @molbap
-/src/transformers/models/zoedepth/mod*_zoedepth* @yonigozlan @molbap
-
-# Audio models
-/src/transformers/models/audio_spectrogram_transformer/mod*_audio_spectrogram_transformer* @eustlb
-/src/transformers/models/bark/mod*_bark* @eustlb
-/src/transformers/models/clap/mod*_clap* @eustlb
-/src/transformers/models/dac/mod*_dac* @eustlb
-/src/transformers/models/encodec/mod*_encodec* @eustlb
-/src/transformers/models/hubert/mod*_hubert* @eustlb
-/src/transformers/models/mctct/mod*_mctct* @eustlb
-/src/transformers/models/mimi/mod*_mimi* @eustlb
-/src/transformers/models/mms/mod*_mms* @eustlb
-/src/transformers/models/moshi/mod*_moshi* @eustlb
-/src/transformers/models/musicgen/mod*_musicgen* @eustlb
-/src/transformers/models/musicgen_melody/mod*_musicgen_melody* @eustlb
-/src/transformers/models/pop2piano/mod*_pop2piano* @eustlb
-/src/transformers/models/seamless_m4t/mod*_seamless_m4t* @eustlb
-/src/transformers/models/seamless_m4t_v2/mod*_seamless_m4t_v2* @eustlb
-/src/transformers/models/sew/mod*_sew* @eustlb
-/src/transformers/models/sew_d/mod*_sew_d* @eustlb
-/src/transformers/models/speech_to_text/mod*_speech_to_text* @eustlb
-/src/transformers/models/speech_to_text_2/mod*_speech_to_text_2* @eustlb
-/src/transformers/models/speecht5/mod*_speecht5* @eustlb
-/src/transformers/models/unispeech/mod*_unispeech* @eustlb
-/src/transformers/models/unispeech_sat/mod*_unispeech_sat* @eustlb
-/src/transformers/models/univnet/mod*_univnet* @eustlb
-/src/transformers/models/vits/mod*_vits* @eustlb
-/src/transformers/models/wav2vec2/mod*_wav2vec2* @eustlb
-/src/transformers/models/wav2vec2_bert/mod*_wav2vec2_bert* @eustlb
-/src/transformers/models/wav2vec2_conformer/mod*_wav2vec2_conformer* @eustlb
-/src/transformers/models/wav2vec2_phoneme/mod*_wav2vec2_phoneme* @eustlb
-/src/transformers/models/wavlm/mod*_wavlm* @eustlb
-/src/transformers/models/whisper/mod*_whisper* @eustlb
-/src/transformers/models/xls_r/mod*_xls_r* @eustlb
-/src/transformers/models/xlsr_wav2vec2/mod*_xlsr_wav2vec2* @eustlb
-
-# Video models
-/src/transformers/models/timesformer/mod*_timesformer* @Rocketknight1
-/src/transformers/models/videomae/mod*_videomae* @Rocketknight1
-/src/transformers/models/vivit/mod*_vivit* @Rocketknight1
-
-# Multimodal models
-/src/transformers/models/align/mod*_align* @zucchini-nlp
-/src/transformers/models/altclip/mod*_altclip* @zucchini-nlp
-/src/transformers/models/aria/mod*_aria* @zucchini-nlp
-/src/transformers/models/blip/mod*_blip* @zucchini-nlp
-/src/transformers/models/blip_2/mod*_blip_2* @zucchini-nlp
-/src/transformers/models/bridgetower/mod*_bridgetower* @zucchini-nlp
-/src/transformers/models/bros/mod*_bros* @zucchini-nlp
-/src/transformers/models/chameleon/mod*_chameleon* @zucchini-nlp
-/src/transformers/models/chinese_clip/mod*_chinese_clip* @zucchini-nlp
-/src/transformers/models/clip/mod*_clip* @zucchini-nlp
-/src/transformers/models/clipseg/mod*_clipseg* @zucchini-nlp
-/src/transformers/models/clvp/mod*_clvp* @zucchini-nlp
-/src/transformers/models/colpali/mod*_colpali* @zucchini-nlp @yonigozlan
-/src/transformers/models/data2vec/mod*_data2vec* @zucchini-nlp
-/src/transformers/models/deplot/mod*_deplot* @zucchini-nlp
-/src/transformers/models/donut/mod*_donut* @zucchini-nlp
-/src/transformers/models/flava/mod*_flava* @zucchini-nlp
-/src/transformers/models/git/mod*_git* @zucchini-nlp
-/src/transformers/models/grounding_dino/mod*_grounding_dino* @yonigozlan
-/src/transformers/models/groupvit/mod*_groupvit* @zucchini-nlp
-/src/transformers/models/idefics/mod*_idefics* @zucchini-nlp
-/src/transformers/models/idefics2/mod*_idefics2* @zucchini-nlp
-/src/transformers/models/idefics3/mod*_idefics3* @zucchini-nlp
-/src/transformers/models/instructblip/mod*_instructblip* @zucchini-nlp
-/src/transformers/models/instructblipvideo/mod*_instructblipvideo* @zucchini-nlp
-/src/transformers/models/kosmos_2/mod*_kosmos_2* @zucchini-nlp
-/src/transformers/models/layoutlm/mod*_layoutlm* @NielsRogge
-/src/transformers/models/layoutlmv2/mod*_layoutlmv2* @NielsRogge
-/src/transformers/models/layoutlmv3/mod*_layoutlmv3* @NielsRogge
-/src/transformers/models/layoutxlm/mod*_layoutxlm* @NielsRogge
-/src/transformers/models/lilt/mod*_lilt* @zucchini-nlp
-/src/transformers/models/llava/mod*_llava* @zucchini-nlp @ArthurZucker
-/src/transformers/models/llava_next/mod*_llava_next* @zucchini-nlp
-/src/transformers/models/llava_next_video/mod*_llava_next_video* @zucchini-nlp
-/src/transformers/models/llava_onevision/mod*_llava_onevision* @zucchini-nlp
-/src/transformers/models/lxmert/mod*_lxmert* @zucchini-nlp
-/src/transformers/models/matcha/mod*_matcha* @zucchini-nlp
-/src/transformers/models/mgp_str/mod*_mgp_str* @zucchini-nlp
-/src/transformers/models/mllama/mod*_mllama* @zucchini-nlp
-/src/transformers/models/nougat/mod*_nougat* @NielsRogge
-/src/transformers/models/omdet_turbo/mod*_omdet_turbo* @yonigozlan
-/src/transformers/models/oneformer/mod*_oneformer* @zucchini-nlp
-/src/transformers/models/owlvit/mod*_owlvit* @yonigozlan
-/src/transformers/models/owlv2/mod*_owlv2* @yonigozlan
-/src/transformers/models/paligemma/mod*_paligemma* @zucchini-nlp @molbap
-/src/transformers/models/perceiver/mod*_perceiver* @zucchini-nlp
-/src/transformers/models/pix2struct/mod*_pix2struct* @zucchini-nlp
-/src/transformers/models/pixtral/mod*_pixtral* @zucchini-nlp @ArthurZucker
-/src/transformers/models/qwen2_audio/mod*_qwen2_audio* @zucchini-nlp @ArthurZucker
-/src/transformers/models/qwen2_vl/mod*_qwen2_vl* @zucchini-nlp @ArthurZucker
-/src/transformers/models/sam/mod*_sam* @zucchini-nlp @ArthurZucker
-/src/transformers/models/siglip/mod*_siglip* @zucchini-nlp
-/src/transformers/models/speech_encoder_decoder/mod*_speech_encoder_decoder* @zucchini-nlp
-/src/transformers/models/tapas/mod*_tapas* @NielsRogge
-/src/transformers/models/trocr/mod*_trocr* @zucchini-nlp
-/src/transformers/models/tvlt/mod*_tvlt* @zucchini-nlp
-/src/transformers/models/tvp/mod*_tvp* @zucchini-nlp
-/src/transformers/models/udop/mod*_udop* @zucchini-nlp
-/src/transformers/models/video_llava/mod*_video_llava* @zucchini-nlp
-/src/transformers/models/vilt/mod*_vilt* @zucchini-nlp
-/src/transformers/models/vipllava/mod*_vipllava* @zucchini-nlp
-/src/transformers/models/vision_encoder_decoder/mod*_vision_encoder_decoder* @Rocketknight1
-/src/transformers/models/vision_text_dual_encoder/mod*_vision_text_dual_encoder* @Rocketknight1
-/src/transformers/models/visual_bert/mod*_visual_bert* @zucchini-nlp
-/src/transformers/models/xclip/mod*_xclip* @zucchini-nlp
-
-# Reinforcement learning models
-/src/transformers/models/decision_transformer/mod*_decision_transformer* @Rocketknight1
-/src/transformers/models/trajectory_transformer/mod*_trajectory_transformer* @Rocketknight1
-
-# Time series models
-/src/transformers/models/autoformer/mod*_autoformer* @Rocketknight1
-/src/transformers/models/informer/mod*_informer* @Rocketknight1
-/src/transformers/models/patchtsmixer/mod*_patchtsmixer* @Rocketknight1
-/src/transformers/models/patchtst/mod*_patchtst* @Rocketknight1
-/src/transformers/models/time_series_transformer/mod*_time_series_transformer* @Rocketknight1
-
-# Graph models
-/src/transformers/models/graphormer/mod*_graphormer* @clefourrier
-
-# Finally, files with no owners that shouldn't generate pings, usually automatically generated and checked in the CI
-utils/dummy*
--- a/.github/workflows/add-model-like.yml
+++ b/.github/workflows/add-model-like.yml
@ -54,7 +54,7 @@ jobs:
      - name: Create model files
        run: |
          . ~/venv/bin/activate
-          transformers add-new-model-like --config_file tests/fixtures/add_distilbert_like_config.json --path_to_repo .
+          transformers-cli add-new-model-like --config_file tests/fixtures/add_distilbert_like_config.json --path_to_repo .
          make style
          make fix-copies

--- a/.github/workflows/assign-reviewers.yml
+++ b/.github/workflows/assign-reviewers.yml
@ -1,26 +0,0 @@
-name: Assign PR Reviewers
-on:  # Runs when a pull request targeting main is marked ready for review
-  pull_request_target:
-    branches:
-      - main
-    types: [ready_for_review]
-
-jobs:
-  assign_reviewers:
-    permissions:
-       pull-requests: write  # The only permission listed; the script below requests reviewers on the PR
-    runs-on: ubuntu-22.04
-    steps:
-      - uses: actions/checkout@v4  # No ref is given, so the action's default ref is checked out
-      - name: Set up Python
-        uses: actions/setup-python@v5
-        with:
-          python-version: '3.13'  # Quoted so the version stays a string
-      - name: Install dependencies
-        run: |
-          python -m pip install --upgrade pip
-          pip install PyGithub
-      - name: Run assignment script
-        env:
-          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}  # Read by the script through os.environ['GITHUB_TOKEN']
-        run: python .github/scripts/assign_reviewers.py
--- a/.github/workflows/benchmark.yml
+++ b/.github/workflows/benchmark.yml
@ -18,8 +18,7 @@ jobs:
    name: Benchmark
    strategy:
      matrix:
-        # group: [aws-g5-4xlarge-cache, aws-p4d-24xlarge-plus] (A100 runner is not enabled)
-        group: [aws-g5-4xlarge-cache]
+        group: [aws-g5-4xlarge-cache, aws-p4d-24xlarge-plus]
    runs-on:
      group: ${{ matrix.group }}
    if: |
@ -48,7 +47,7 @@ jobs:

      - name: Run database init script
        run: |
-          psql -f benchmark/utils/init_db.sql
+          psql -f benchmark/init_db.sql
        env:
          PGDATABASE: metrics
          PGHOST: ${{ secrets.TRANSFORMERS_BENCHMARKS_PGHOST }}
@ -64,7 +63,7 @@ jobs:
            commit_id=$GITHUB_SHA
          fi
          commit_msg=$(git show -s --format=%s | cut -c1-70)
-          python3 benchmark/benchmarks_entrypoint.py "huggingface/transformers" "$BRANCH_NAME" "$commit_id" "$commit_msg"
+          python3 benchmark/llama.py "${{ github.head_ref || github.ref_name }}" "$commit_id" "$commit_msg"
        env:
          HF_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }}
          # Enable this to see debug logs
@ -73,4 +72,3 @@ jobs:
          PGHOST: ${{ secrets.TRANSFORMERS_BENCHMARKS_PGHOST }}
          PGUSER: transformers_benchmarks
          PGPASSWORD: ${{ secrets.TRANSFORMERS_BENCHMARKS_PGPASSWORD }}
-          BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
--- a/.github/workflows/benchmark_v2.yml
+++ b/.github/workflows/benchmark_v2.yml
@ -1,85 +0,0 @@
-name: Benchmark v2 Framework
-
-on:  # The only trigger is workflow_call: this file runs when another workflow invokes it
-  workflow_call:
-    inputs:
-      runner:
-        description: 'GH Actions runner group to use'
-        required: true
-        type: string
-      container_image:
-        description: 'Docker image to use'
-        required: true
-        type: string
-      container_options:
-        description: 'Container options to use'
-        required: true
-        type: string
-      commit_sha:
-        description: 'Commit SHA to benchmark'
-        required: false
-        type: string
-        default: ''
-      run_id:
-        description: 'Custom run ID for organizing results (auto-generated if not provided)'
-        required: false
-        type: string
-        default: ''
-      benchmark_repo_id:
-        description: 'HuggingFace Dataset to upload results to (e.g., "org/benchmark-results")'
-        required: false
-        type: string
-        default: ''
-
-env:
-  HF_HOME: /mnt/cache
-  TRANSFORMERS_IS_CI: yes  # NOTE(review): unquoted `yes` is a boolean to YAML 1.1 loaders; confirm consumers expect the string "yes"
-  # For gated repositories, we still need to agree to share information on the Hub repo page in order to get access.
-  # This token is created under the bot `hf-transformers-bot`.
-  HF_HUB_READ_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }}
-
-jobs:
-  benchmark-v2:  # Runs only for pull_request events on PRs labelled run-benchmark, or for schedule events (see `if` below)
-    name: Benchmark v2
-    runs-on: ${{ inputs.runner }}
-    if: |
-      (github.event_name == 'pull_request' && contains( github.event.pull_request.labels.*.name, 'run-benchmark')) ||
-      (github.event_name == 'schedule')
-    container:
-      image: ${{ inputs.container_image }}
-      options: ${{ inputs.container_options }}
-    steps:
-      - name: Get repo
-        uses: actions/checkout@v4
-        with:
-          ref: ${{ inputs.commit_sha || github.sha }}  # Uses github.sha when commit_sha is left empty
-
-      - name: Install benchmark dependencies
-        run: |
-          python3 -m pip install -r benchmark_v2/requirements.txt
-
-      - name: Reinstall transformers in edit mode  # Replaces whatever transformers the image ships with an editable install of this checkout
-        run: |
-          python3 -m pip uninstall -y transformers
-          python3 -m pip install -e ".[torch]"
-
-      - name: Show installed libraries and their versions  # The last two commands end in `|| true`, so their failure does not fail the step
-        run: |
-          python3 -m pip list
-          python3 -c "import torch; print(f'PyTorch version: {torch.__version__}')"
-          python3 -c "import torch; print(f'CUDA available: {torch.cuda.is_available()}')"
-          python3 -c "import torch; print(f'CUDA device count: {torch.cuda.device_count()}')" || true
-          nvidia-smi || true
-
-      - name: Run benchmark v2  # NOTE(review): the inputs and the upload token are expanded straight into the script text below; consider passing them through env instead
-        working-directory: benchmark_v2
-        run: |
-          echo "Running benchmarks"
-          python3 run_benchmarks.py \
-          --commit-id '${{ inputs.commit_sha || github.sha }}' \
-          --run-id '${{ inputs.run_id }}' \
-          --push-to-hub '${{ inputs.benchmark_repo_id}}' \
-          --token '${{ secrets.TRANSFORMERS_CI_RESULTS_UPLOAD_TOKEN }}' \
-          --log-level INFO
-        env:
-          HF_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }}
--- a/.github/workflows/benchmark_v2_a10_caller.yml
+++ b/.github/workflows/benchmark_v2_a10_caller.yml
@ -1,21 +0,0 @@
-name: Benchmark v2 Scheduled Runner - A10 Single-GPU
-
-on:
-  schedule:
-    # Run daily at 16:30 UTC
-    - cron: "30 16 * * *"
-  pull_request:
-    types: [ opened, labeled, reopened, synchronize ]
-
-jobs:
-  benchmark-v2-default:
-    name: Benchmark v2 - Default Models
-    uses: ./.github/workflows/benchmark_v2.yml
-    with:
-      runner: aws-g5-4xlarge-cache-use1-public-80
-      container_image: huggingface/transformers-pytorch-gpu
-      container_options: --gpus all --privileged --ipc host --shm-size "16gb"
-      commit_sha: ${{ github.sha }}
-      run_id: ${{ github.run_id }}
-      benchmark_repo_id: hf-internal-testing/transformers-daily-benchmarks
-    secrets: inherit
--- a/.github/workflows/benchmark_v2_mi325_caller.yml
+++ b/.github/workflows/benchmark_v2_mi325_caller.yml
@ -1,21 +0,0 @@
-name: Benchmark v2 Scheduled Runner - MI325 Single-GPU
-
-on:
-  schedule:
-    # Run daily at 16:30 UTC
-    - cron: "30 16 * * *"
-  pull_request:
-    types: [ opened, labeled, reopened, synchronize ]
-
-jobs:
-  benchmark-v2-default:
-    name: Benchmark v2 - Default Models
-    uses: ./.github/workflows/benchmark_v2.yml
-    with:
-      runner: amd-mi325-ci-1gpu
-      container_image: huggingface/transformers-pytorch-amd-gpu
-      container_options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache
-      commit_sha: ${{ github.sha }}
-      run_id: ${{ github.run_id }}
-      benchmark_repo_id: hf-internal-testing/transformers-daily-benchmarks
-    secrets: inherit
--- a/.github/workflows/build-ci-docker-images.yml
+++ b/.github/workflows/build-ci-docker-images.yml
@ -26,7 +26,7 @@ jobs:

    strategy:
      matrix:
-        file: ["quality", "consistency", "custom-tokenizers", "torch-light", "exotic-models", "examples-torch"]
+        file: ["quality", "consistency", "custom-tokenizers", "torch-light", "tf-light", "exotic-models", "torch-tf-light", "torch-jax-light", "jax-light", "examples-torch",  "examples-tf"]
    continue-on-error: true

    steps:
@ -34,11 +34,11 @@ jobs:
        name: Set tag
        run: |
              if ${{contains(github.event.head_commit.message, '[build-ci-image]')}}; then
-                  echo "TAG=huggingface/transformers-${{ matrix.file }}:dev" >> "$GITHUB_ENV"
+                  echo "TAG=huggingface/transformers-${{ matrix.file }}:dev" >> "$GITHUB_ENV" 
                  echo "setting it to DEV!"
              else
                  echo "TAG=huggingface/transformers-${{ matrix.file }}" >> "$GITHUB_ENV"
-
+                  
              fi
      -
        name: Set up Docker Buildx
--- a/.github/workflows/build-docker-images.yml
+++ b/.github/workflows/build-docker-images.yml
@ -5,7 +5,6 @@ on:
    branches:
      - build_ci_docker_image*
  repository_dispatch:
-  workflow_dispatch:
  workflow_call:
    inputs:
      image_postfix:
@ -20,7 +19,7 @@ concurrency:

 jobs:
  latest-docker:
-    name: "Latest PyTorch [dev]"
+    name: "Latest PyTorch + TensorFlow [dev]"
    runs-on:
      group: aws-general-8-plus
    steps:
@ -64,14 +63,14 @@ jobs:
        uses: huggingface/hf-workflows/.github/actions/post-slack@main
        with:
          slack_channel: ${{ secrets.CI_SLACK_CHANNEL_DOCKER }}
-          title: 🤗 Results of the transformers-all-latest-gpu-push-ci docker build
+          title: 🤗 Results of the transformers-all-latest-gpu-push-ci docker build 
          status: ${{ job.status }}
          slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}

  latest-torch-deepspeed-docker:
    name: "Latest PyTorch + DeepSpeed"
    runs-on:
-      group: aws-g4dn-2xlarge-cache
+      group: aws-general-8-plus
    steps:
      -
        name: Set up Docker Buildx
@ -100,7 +99,7 @@ jobs:
        uses: huggingface/hf-workflows/.github/actions/post-slack@main
        with:
          slack_channel: ${{ secrets.CI_SLACK_CHANNEL_DOCKER}}
-          title: 🤗 Results of the transformers-pytorch-deepspeed-latest-gpu docker build
+          title: 🤗 Results of the transformers-pytorch-deepspeed-latest-gpu docker build 
          status: ${{ job.status }}
          slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}

@ -141,7 +140,7 @@ jobs:
        uses: huggingface/hf-workflows/.github/actions/post-slack@main
        with:
          slack_channel: ${{ secrets.CI_SLACK_CHANNEL_DOCKER }}
-          title: 🤗 Results of the transformers-pytorch-deepspeed-latest-gpu-push-ci docker build
+          title: 🤗 Results of the transformers-pytorch-deepspeed-latest-gpu-push-ci docker build 
          status: ${{ job.status }}
          slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}

@ -177,7 +176,7 @@ jobs:
        uses: huggingface/hf-workflows/.github/actions/post-slack@main
        with:
          slack_channel: ${{ secrets.CI_SLACK_CHANNEL_DOCKER }}
-          title: 🤗 Results of the huggingface/transformers-doc-builder docker build
+          title: 🤗 Results of the huggingface/transformers-doc-builder docker build 
          status: ${{ job.status }}
          slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}

@ -215,28 +214,28 @@ jobs:
        uses: huggingface/hf-workflows/.github/actions/post-slack@main
        with:
          slack_channel: ${{ secrets.CI_SLACK_CHANNEL_DOCKER }}
-          title: 🤗 Results of the huggingface/transformers-pytorch-gpudocker build
+          title: 🤗 Results of the huggingface/transformers-pytorch-gpudocker build 
          status: ${{ job.status }}
          slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}

  latest-pytorch-amd:
    name: "Latest PyTorch (AMD) [dev]"
    runs-on:
-      group: aws-highcpu-32-priv
+      group: aws-general-8-plus
    steps:
-      -
+      - 
        name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3
-      -
+      - 
        name: Check out code
        uses: actions/checkout@v4
-      -
+      - 
        name: Login to DockerHub
        uses: docker/login-action@v3
        with:
          username: ${{ secrets.DOCKERHUB_USERNAME }}
          password: ${{ secrets.DOCKERHUB_PASSWORD }}
-      -
+      - 
        name: Build and push
        uses: docker/build-push-action@v5
        with:
@ -264,12 +263,14 @@ jobs:
        uses: huggingface/hf-workflows/.github/actions/post-slack@main
        with:
          slack_channel: ${{ secrets.CI_SLACK_CHANNEL_DOCKER }}
-          title: 🤗 Results of the huggingface/transformers-pytorch-amd-gpu-push-ci build
+          title: 🤗 Results of the huggingface/transformers-pytorch-amd-gpu-push-ci build 
          status: ${{ job.status }}
          slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}

-  latest-pytorch-deepspeed-amd:
-    name: "PyTorch + DeepSpeed (AMD) [dev]"
+  latest-tensorflow:
+    name: "Latest TensorFlow [dev]"
+    # Push CI doesn't need this image
+    if: inputs.image_postfix != '-push-ci'
    runs-on:
      group: aws-general-8-plus
    steps:
@ -286,6 +287,42 @@ jobs:
          username: ${{ secrets.DOCKERHUB_USERNAME }}
          password: ${{ secrets.DOCKERHUB_PASSWORD }}
      -
+        name: Build and push
+        uses: docker/build-push-action@v5
+        with:
+          context: ./docker/transformers-tensorflow-gpu
+          build-args: |
+            REF=main
+          push: true
+          tags: huggingface/transformers-tensorflow-gpu
+
+      - name: Post to Slack
+        if: always()
+        uses: huggingface/hf-workflows/.github/actions/post-slack@main
+        with:
+          slack_channel: ${{ secrets.CI_SLACK_CHANNEL_DOCKER }}
+          title: 🤗 Results of the huggingface/transformers-tensorflow-gpu build 
+          status: ${{ job.status }}
+          slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
+
+  latest-pytorch-deepspeed-amd:
+    name: "PyTorch + DeepSpeed (AMD) [dev]"
+    runs-on:
+      group: aws-general-8-plus
+    steps:
+      - 
+        name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
+      - 
+        name: Check out code
+        uses: actions/checkout@v4
+      - 
+        name: Login to DockerHub
+        uses: docker/login-action@v3
+        with:
+          username: ${{ secrets.DOCKERHUB_USERNAME }}
+          password: ${{ secrets.DOCKERHUB_PASSWORD }}
+      - 
        name: Build and push
        uses: docker/build-push-action@v5
        with:
@ -313,7 +350,7 @@ jobs:
        uses: huggingface/hf-workflows/.github/actions/post-slack@main
        with:
          slack_channel: ${{ secrets.CI_SLACK_CHANNEL_DOCKER }}
-          title: 🤗 Results of the transformers-pytorch-deepspeed-amd-gpu build
+          title: 🤗 Results of the transformers-pytorch-deepspeed-amd-gpu build 
          status: ${{ job.status }}
          slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}

@ -351,6 +388,6 @@ jobs:
        uses: huggingface/hf-workflows/.github/actions/post-slack@main
        with:
          slack_channel: ${{ secrets.CI_SLACK_CHANNEL_DOCKER }}
-          title: 🤗 Results of the transformers-quantization-latest-gpu build
+          title: 🤗 Results of the transformers-quantization-latest-gpu build 
          status: ${{ job.status }}
          slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
--- a/.github/workflows/build-nightly-ci-docker-images.yml
+++ b/.github/workflows/build-nightly-ci-docker-images.yml
@ -2,10 +2,6 @@ name: Build docker images (Nightly CI)

 on:
  workflow_call:
-    inputs:
-      job:
-        required: true
-        type: string
  push:
    branches:
      - build_nightly_ci_docker_image*
@ -16,8 +12,7 @@ concurrency:

 jobs:
  latest-with-torch-nightly-docker:
-    name: "Nightly PyTorch"
-    if: inputs.job == 'latest-with-torch-nightly-docker' || inputs.job == ''
+    name: "Nightly PyTorch + Stable TensorFlow"
    runs-on:
      group: aws-general-8-plus
    steps:
@ -46,9 +41,8 @@ jobs:

  nightly-torch-deepspeed-docker:
    name: "Nightly PyTorch + DeepSpeed"
-    if: inputs.job == 'nightly-torch-deepspeed-docker' || inputs.job == ''
    runs-on:
-      group: aws-g4dn-2xlarge-cache
+      group: aws-general-8-plus
    steps:
      -
        name: Set up Docker Buildx
--- a/.github/workflows/build_documentation.yml
+++ b/.github/workflows/build_documentation.yml
@ -16,20 +16,8 @@ jobs:
      commit_sha: ${{ github.sha }}
      package: transformers
      notebook_folder: transformers_doc
-      languages: en
+      languages: ar de en es fr hi it ko pt tr zh ja te
      custom_container: huggingface/transformers-doc-builder
    secrets:
      token: ${{ secrets.HUGGINGFACE_PUSH }}
      hf_token: ${{ secrets.HF_DOC_BUILD_PUSH }}
-
-   build_other_lang:
-    uses: huggingface/doc-builder/.github/workflows/build_main_documentation.yml@main
-    with:
-      commit_sha: ${{ github.sha }}
-      package: transformers
-      notebook_folder: transformers_doc
-      languages: ar de es fr hi it ja ko pt zh
-      custom_container: huggingface/transformers-doc-builder
-    secrets:
-      token: ${{ secrets.HUGGINGFACE_PUSH }}
-      hf_token: ${{ secrets.HF_DOC_BUILD_PUSH }}
--- a/.github/workflows/build_pr_documentation.yml
+++ b/.github/workflows/build_pr_documentation.yml
@ -14,4 +14,5 @@ jobs:
      commit_sha: ${{ github.event.pull_request.head.sha }}
      pr_number: ${{ github.event.number }}
      package: transformers
-      languages: en
+      languages: ar de en es fr hi it ko pt tr zh ja te
+      custom_container: huggingface/transformers-doc-builder
--- a/.github/workflows/check_failed_model_tests.yml
+++ b/.github/workflows/check_failed_model_tests.yml
@ -0,0 +1,129 @@
+name: Process failed tests
+
+# Reusable workflow (workflow_call only). It downloads the `ci_results_run_models_gpu`
+# artifact, runs `utils/check_bad_commit.py` between `start_sha` and the commit of the
+# last daily CI run, and posts the processed report to Slack.
+on:
+  workflow_call:
+    inputs:
+      # Container image the job runs in.
+      docker:
+        required: true
+        type: string
+      # Commit that is checked out for the analysis and passed as `--start_commit`.
+      start_sha:
+        required: true
+        type: string
+
+
+env:
+  # Same path as the target of the volume mounted into the container below.
+  HF_HOME: /mnt/cache
+  TRANSFORMERS_IS_CI: yes
+  OMP_NUM_THREADS: 8
+  MKL_NUM_THREADS: 8
+  RUN_SLOW: yes
+  # For gated repositories, access must still be requested on the repository's Hub page
+  # (by agreeing to share information) before this token can read them.
+  # This token is created under the bot `hf-transformers-bot`.
+  HF_HUB_READ_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }}
+  SIGOPT_API_TOKEN: ${{ secrets.SIGOPT_API_TOKEN }}
+  TF_FORCE_GPU_ALLOW_GROWTH: true
+  RUN_PT_TF_CROSS_TESTS: 1
+  CUDA_VISIBLE_DEVICES: 0,1
+
+
+jobs:
+  run_models_gpu:
+    name: " "
+    runs-on:
+      group: aws-g4dn-2xlarge-cache
+    container:
+      image: ${{ inputs.docker }}
+      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+    steps:
+      # Fetch the CI results artifact into the repository clone inside the image.
+      - uses: actions/download-artifact@v4
+        with:
+          name: ci_results_run_models_gpu
+          path: /transformers/ci_results_run_models_gpu
+
+      - name: Update clone
+        working-directory: /transformers
+        run: git fetch && git checkout ${{ github.sha }}
+
+      # Stores the commit of the last daily CI run in END_SHA for the following steps.
+      - name: Get target commit
+        working-directory: /transformers/utils
+        env:
+          # Passed through the environment so the secret is never expanded into the
+          # script text; the Python snippet reads it from os.environ["TOKEN"].
+          TOKEN: ${{ secrets.ACCESS_REPO_INFO_TOKEN }}
+        run: |
+          echo "END_SHA=$(python3 -c 'import os; from get_previous_daily_ci import get_last_daily_ci_run_commit; commit=get_last_daily_ci_run_commit(token=os.environ["TOKEN"]); print(commit)')" >> "$GITHUB_ENV"
+
+      - name: Checkout to `start_sha`
+        working-directory: /transformers
+        run: git fetch && git checkout ${{ inputs.start_sha }}
+
+      - name: Reinstall transformers in edit mode (remove the one installed during docker image build)
+        working-directory: /transformers
+        run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .
+
+      - name: NVIDIA-SMI
+        run: |
+          nvidia-smi
+
+      - name: Environment
+        working-directory: /transformers
+        run: |
+          python3 utils/print_env.py
+
+      - name: Show installed libraries and their versions
+        working-directory: /transformers
+        run: pip freeze
+
+      # Reads the new failures from the downloaded artifact and writes
+      # `new_model_failures_with_bad_commit.json`.
+      - name: Check failed tests
+        working-directory: /transformers
+        run: python3 utils/check_bad_commit.py --start_commit ${{ inputs.start_sha }} --end_commit ${{ env.END_SHA }} --file ci_results_run_models_gpu/new_model_failures.json --output_file new_model_failures_with_bad_commit.json
+
+      - name: Show results
+        working-directory: /transformers
+        run: |
+          ls -l new_model_failures_with_bad_commit.json
+          cat new_model_failures_with_bad_commit.json
+
+      # NOTE(review): presumably the previous step leaves a different commit checked out
+      # and this restores `start_sha` - confirm against utils/check_bad_commit.py.
+      - name: Checkout back
+        working-directory: /transformers
+        run: |
+          git checkout ${{ inputs.start_sha }}
+
+      # NOTE(review): the report script runs twice (here and in the next step).
+      # Presumably this first run only makes its output visible in the job log - confirm
+      # that the duplication is intentional.
+      - name: Process report
+        shell: bash
+        working-directory: /transformers
+        env:
+          TRANSFORMERS_CI_RESULTS_UPLOAD_TOKEN: ${{ secrets.TRANSFORMERS_CI_RESULTS_UPLOAD_TOKEN }}
+        run: |
+          python3 utils/process_bad_commit_report.py
+
+      # Captures the script output as the multi-line environment variable REPORT_TEXT,
+      # using the `NAME<<DELIMITER` syntax of the GITHUB_ENV file.
+      - name: Process report
+        shell: bash
+        working-directory: /transformers
+        env:
+          TRANSFORMERS_CI_RESULTS_UPLOAD_TOKEN: ${{ secrets.TRANSFORMERS_CI_RESULTS_UPLOAD_TOKEN }}
+        run: |
+          {
+            echo 'REPORT_TEXT<<EOF'
+            python3 utils/process_bad_commit_report.py
+            echo EOF
+          } >> "$GITHUB_ENV"
+
+      # Skipped when REPORT_TEXT ends with `{}` (presumably an empty report - confirm).
+      - name: Send processed report
+        if: ${{ !endsWith(env.REPORT_TEXT, '{}') }}
+        uses: slackapi/slack-github-action@6c661ce58804a1a20f6dc5fbee7f0381b469e001
+        with:
+          # Slack channel id, channel name, or user id to post message.
+          # See also: https://api.slack.com/methods/chat.postMessage#channels
+          channel-id: '#transformers-ci-feedback-tests'
+          # For posting a rich message using Block Kit
+          payload: |
+            {
+              "blocks": [
+                {
+                  "type": "section",
+                  "text": {
+                    "type": "mrkdwn",
+                    "text": "${{ env.REPORT_TEXT }}"
+                  }
+                }
+              ]
+            }
+        env:
+          SLACK_BOT_TOKEN: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
--- a/.github/workflows/check_failed_tests.yml
+++ b/.github/workflows/check_failed_tests.yml
@ -1,207 +0,0 @@
-name: Process failed tests
-
-on:
-  workflow_call:
-    inputs:
-      docker:
-        required: true
-        type: string
-      start_sha:
-        required: true
-        type: string
-      job:
-        required: true
-        type: string
-      slack_report_channel:
-        required: true
-        type: string
-      ci_event:
-        required: true
-        type: string
-      report_repo_id:
-        required: true
-        type: string
-      commit_sha:
-        required: false
-        type: string
-
-
-env:
-  HF_HOME: /mnt/cache
-  TRANSFORMERS_IS_CI: yes
-  OMP_NUM_THREADS: 8
-  MKL_NUM_THREADS: 8
-  RUN_SLOW: yes
-  # For gated repositories, we still need to agree to share information on the Hub repo. page in order to get access.
-  # This token is created under the bot `hf-transformers-bot`.
-  HF_HUB_READ_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }}
-  TF_FORCE_GPU_ALLOW_GROWTH: true
-  CUDA_VISIBLE_DEVICES: 0,1
-
-
-jobs:
-  check_new_failures:
-    name: " "
-    runs-on:
-      group: aws-g5-4xlarge-cache
-    container:
-      image: ${{ inputs.docker }}
-      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
-    steps:
-      - uses: actions/download-artifact@v4
-        with:
-          name: ci_results_${{ inputs.job }}
-          path: /transformers/ci_results_${{ inputs.job }}
-
-      - name: Check file
-        working-directory: /transformers
-        run: |
-          if [ -f ci_results_${{ inputs.job }}/new_failures.json ]; then
-            echo "`ci_results_${{ inputs.job }}/new_failures.json` exists, continue ..."
-            echo "process=true" >> $GITHUB_ENV
-          else
-            echo "`ci_results_${{ inputs.job }}/new_failures.json` doesn't exist, abort."
-            echo "process=false" >> $GITHUB_ENV
-          fi
-
-      - uses: actions/download-artifact@v4
-        if: ${{ env.process == 'true' }}
-        with:
-          pattern: setup_values*
-          path: setup_values
-          merge-multiple: true
-
-      - name: Prepare some setup values
-        if: ${{ env.process == 'true' }}
-        run: |
-          if [ -f setup_values/prev_workflow_run_id.txt ]; then
-            echo "PREV_WORKFLOW_RUN_ID=$(cat setup_values/prev_workflow_run_id.txt)" >> $GITHUB_ENV
-          else
-            echo "PREV_WORKFLOW_RUN_ID=" >> $GITHUB_ENV
-          fi
-
-          if [ -f setup_values/other_workflow_run_id.txt ]; then
-            echo "OTHER_WORKFLOW_RUN_ID=$(cat setup_values/other_workflow_run_id.txt)" >> $GITHUB_ENV
-          else
-            echo "OTHER_WORKFLOW_RUN_ID=" >> $GITHUB_ENV
-          fi
-
-      - name: Update clone
-        working-directory: /transformers
-        if: ${{ env.process == 'true' }}
-        run: git fetch && git checkout ${{ inputs.commit_sha || github.sha }}
-
-      - name: Get target commit
-        working-directory: /transformers/utils
-        if: ${{ env.process == 'true' }}
-        run: |
-          echo "END_SHA=$(TOKEN=${{ secrets.ACCESS_REPO_INFO_TOKEN }} python3 -c 'import os; from get_previous_daily_ci import get_last_daily_ci_run_commit; commit=get_last_daily_ci_run_commit(token=os.environ["TOKEN"], workflow_run_id=os.environ["PREV_WORKFLOW_RUN_ID"]); print(commit)')" >> $GITHUB_ENV
-
-      - name: Checkout to `start_sha`
-        working-directory: /transformers
-        if: ${{ env.process == 'true' }}
-        run: git fetch && git checkout ${{ inputs.start_sha }}
-
-      - name: Reinstall transformers in edit mode (remove the one installed during docker image build)
-        working-directory: /transformers
-        if: ${{ env.process == 'true' }}
-        run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .
-
-      - name: NVIDIA-SMI
-        if: ${{ env.process == 'true' }}
-        run: |
-          nvidia-smi
-
-      - name: Environment
-        working-directory: /transformers
-        if: ${{ env.process == 'true' }}
-        run: |
-          python3 utils/print_env.py
-
-      - name: Show installed libraries and their versions
-        working-directory: /transformers
-        if: ${{ env.process == 'true' }}
-        run: pip freeze
-
-      - name: Check failed tests
-        working-directory: /transformers
-        if: ${{ env.process == 'true' }}
-        run: python3 utils/check_bad_commit.py --start_commit ${{ inputs.start_sha }} --end_commit ${{ env.END_SHA }} --file ci_results_${{ inputs.job }}/new_failures.json --output_file new_failures_with_bad_commit.json
-
-      - name: Show results
-        working-directory: /transformers
-        if: ${{ env.process == 'true' }}
-        run: |
-          ls -l new_failures_with_bad_commit.json
-          cat new_failures_with_bad_commit.json
-
-      - name: Checkout back
-        working-directory: /transformers
-        if: ${{ env.process == 'true' }}
-        run: |
-          git checkout ${{ inputs.start_sha }}
-
-      - name: Process report
-        shell: bash
-        working-directory: /transformers
-        if: ${{ env.process == 'true' }}
-        env:
-          ACCESS_REPO_INFO_TOKEN: ${{ secrets.ACCESS_REPO_INFO_TOKEN }}
-          TRANSFORMERS_CI_RESULTS_UPLOAD_TOKEN: ${{ secrets.TRANSFORMERS_CI_RESULTS_UPLOAD_TOKEN }}
-          JOB_NAME: ${{ inputs.job }}
-          REPORT_REPO_ID: ${{ inputs.report_repo_id }}
-        run: |
-          python3 utils/process_bad_commit_report.py
-
-      - name: Process report
-        shell: bash
-        working-directory: /transformers
-        if: ${{ env.process == 'true' }}
-        env:
-          ACCESS_REPO_INFO_TOKEN: ${{ secrets.ACCESS_REPO_INFO_TOKEN }}
-          TRANSFORMERS_CI_RESULTS_UPLOAD_TOKEN: ${{ secrets.TRANSFORMERS_CI_RESULTS_UPLOAD_TOKEN }}
-          JOB_NAME: ${{ inputs.job }}
-          REPORT_REPO_ID: ${{ inputs.report_repo_id }}
-        run: |
-          {
-            echo 'REPORT_TEXT<<EOF'
-            python3 utils/process_bad_commit_report.py
-            echo EOF
-          } >> "$GITHUB_ENV"
-
-      - name: Prepare Slack report title
-        working-directory: /transformers
-        if: ${{ env.process == 'true' }}
-        run: |
-          pip install slack_sdk
-          echo "title=$(python3 -c 'import sys; sys.path.append("utils"); from utils.notification_service import job_to_test_map; ci_event = "${{ inputs.ci_event }}"; job = "${{ inputs.job }}"; test_name = job_to_test_map[job]; title = f"New failed tests of {ci_event}" + ":" + f" {test_name}"; print(title)')" >> $GITHUB_ENV
-
-      - name: Send processed report
-        if: ${{ env.process == 'true' && !endsWith(env.REPORT_TEXT, '{}') }}
-        uses: slackapi/slack-github-action@6c661ce58804a1a20f6dc5fbee7f0381b469e001
-        with:
-          # Slack channel id, channel name, or user id to post message.
-          # See also: https://api.slack.com/methods/chat.postMessage#channels
-          channel-id: '#${{ inputs.slack_report_channel }}'
-          # For posting a rich message using Block Kit
-          payload: |
-            {
-              "blocks": [
-                {
-                  "type": "header",
-                  "text": {
-                    "type": "plain_text",
-                    "text": "${{ env.title }}"
-                  }
-                },
-                {
-                  "type": "section",
-                  "text": {
-                    "type": "mrkdwn",
-                    "text": "${{ env.REPORT_TEXT }}"
-                  }
-                }
-              ]
-            }
-        env:
-          SLACK_BOT_TOKEN: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
--- a/.github/workflows/collated-reports.yml
+++ b/.github/workflows/collated-reports.yml
@ -1,43 +0,0 @@
-name: CI collated reports
-
-on:
-  workflow_call:
-    inputs:
-      job:
-        required: true
-        type: string
-      report_repo_id:
-        required: true
-        type: string
-      machine_type:
-        required: true
-        type: string
-      gpu_name:
-        description: Name of the GPU used for the job. Its enough that the value contains the name of the GPU, e.g. "noise-h100-more-noise". Case insensitive.
-        required: true
-        type: string
-
-jobs:
-  collated_reports:
-    name: Collated reports
-    runs-on: ubuntu-22.04
-    if: always()
-    steps:
-      - uses: actions/checkout@v4
-      - uses: actions/download-artifact@v4
-
-      - name: Collated reports
-        shell: bash
-        env:
-          ACCESS_REPO_INFO_TOKEN: ${{ secrets.ACCESS_REPO_INFO_TOKEN }}
-          CI_SHA: ${{ github.sha }}
-          TRANSFORMERS_CI_RESULTS_UPLOAD_TOKEN: ${{ secrets.TRANSFORMERS_CI_RESULTS_UPLOAD_TOKEN }}
-        run: |
-          pip install huggingface_hub
-          python3 utils/collated_reports.py                  \
-            --path .                                         \
-            --machine-type ${{ inputs.machine_type }}        \
-            --commit-hash ${{ env.CI_SHA }}                  \
-            --job ${{ inputs.job }}                          \
-            --report-repo-id ${{ inputs.report_repo_id }}    \
-            --gpu-name ${{ inputs.gpu_name }}
--- a/.github/workflows/doctest_job.yml
+++ b/.github/workflows/doctest_job.yml
@ -16,6 +16,7 @@ env:
  RUN_SLOW: yes
  OMP_NUM_THREADS: 16
  MKL_NUM_THREADS: 16
+  SIGOPT_API_TOKEN: ${{ secrets.SIGOPT_API_TOKEN }}
  TF_FORCE_GPU_ALLOW_GROWTH: true

 jobs:
@ -27,10 +28,10 @@ jobs:
      matrix:
        split_keys: ${{ fromJson(inputs.split_keys) }}
    runs-on: 
-      group: aws-g5-4xlarge-cache
+      group: aws-g4dn-2xlarge-cache
    container:
      image: huggingface/transformers-all-latest-gpu
-      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
    steps:
      - name: Update clone
        working-directory: /transformers
--- a/.github/workflows/doctests.yml
+++ b/.github/workflows/doctests.yml
@ -15,10 +15,10 @@ jobs:
  setup:
    name: Setup
    runs-on: 
-      group: aws-g5-4xlarge-cache
+      group: aws-g4dn-2xlarge-cache
    container:
      image: huggingface/transformers-all-latest-gpu
-      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
    outputs:
      job_splits: ${{ steps.set-matrix.outputs.job_splits }}
      split_keys: ${{ steps.set-matrix.outputs.split_keys }}
--- a/.github/workflows/get-pr-info.yml
+++ b/.github/workflows/get-pr-info.yml
@ -1,157 +0,0 @@
-name: Get PR commit SHA
-on:
-  workflow_call:
-    inputs:
-      pr_number:
-        required: true
-        type: string
-    outputs:
-      PR_HEAD_REPO_FULL_NAME:
-        description: "The full name of the repository from which the pull request is created"
-        value: ${{ jobs.get-pr-info.outputs.PR_HEAD_REPO_FULL_NAME }}
-      PR_BASE_REPO_FULL_NAME:
-        description: "The full name of the repository to which the pull request is created"
-        value: ${{ jobs.get-pr-info.outputs.PR_BASE_REPO_FULL_NAME }}
-      PR_HEAD_REPO_OWNER:
-        description: "The owner of the repository from which the pull request is created"
-        value: ${{ jobs.get-pr-info.outputs.PR_HEAD_REPO_OWNER }}
-      PR_BASE_REPO_OWNER:
-        description: "The owner of the repository to which the pull request is created"
-        value: ${{ jobs.get-pr-info.outputs.PR_BASE_REPO_OWNER }}
-      PR_HEAD_REPO_NAME:
-        description: "The name of the repository from which the pull request is created"
-        value: ${{ jobs.get-pr-info.outputs.PR_HEAD_REPO_NAME }}
-      PR_BASE_REPO_NAME:
-        description: "The name of the repository to which the pull request is created"
-        value: ${{ jobs.get-pr-info.outputs.PR_BASE_REPO_NAME }}
-      PR_HEAD_REF:
-        description: "The branch name of the pull request in the head repository"
-        value: ${{ jobs.get-pr-info.outputs.PR_HEAD_REF }}
-      PR_BASE_REF:
-        description: "The branch name in the base repository (to merge into)"
-        value: ${{ jobs.get-pr-info.outputs.PR_BASE_REF }}
-      PR_HEAD_SHA:
-        description: "The head sha of the pull request branch in the head repository"
-        value: ${{ jobs.get-pr-info.outputs.PR_HEAD_SHA }}
-      PR_BASE_SHA:
-        description: "The head sha of the target branch in the base repository"
-        value: ${{ jobs.get-pr-info.outputs.PR_BASE_SHA }}
-      PR_MERGE_COMMIT_SHA:
-        description: "The sha of the merge commit for the pull request (created by GitHub) in the base repository"
-        value: ${{ jobs.get-pr-info.outputs.PR_MERGE_COMMIT_SHA }}
-      PR_HEAD_COMMIT_DATE:
-        description: "The date of the head sha of the pull request branch in the head repository"
-        value: ${{ jobs.get-pr-info.outputs.PR_HEAD_COMMIT_DATE }}
-      PR_MERGE_COMMIT_DATE:
-        description: "The date of the merge commit for the pull request (created by GitHub) in the base repository"
-        value: ${{ jobs.get-pr-info.outputs.PR_MERGE_COMMIT_DATE }}
-      PR_HEAD_COMMIT_TIMESTAMP:
-        description: "The timestamp of the head sha of the pull request branch in the head repository"
-        value: ${{ jobs.get-pr-info.outputs.PR_HEAD_COMMIT_TIMESTAMP }}
-      PR_MERGE_COMMIT_TIMESTAMP:
-        description: "The timestamp of the merge commit for the pull request (created by GitHub) in the base repository"
-        value: ${{ jobs.get-pr-info.outputs.PR_MERGE_COMMIT_TIMESTAMP }}
-      PR:
-        description: "The PR"
-        value: ${{ jobs.get-pr-info.outputs.PR }}
-      PR_FILES:
-        description: "The files touched in the PR"
-        value: ${{ jobs.get-pr-info.outputs.PR_FILES }}
-
-
-jobs:
-  get-pr-info:
-    runs-on: ubuntu-22.04
-    name: Get PR commit SHA better
-    outputs:
-      PR_HEAD_REPO_FULL_NAME: ${{ steps.pr_info.outputs.head_repo_full_name }}
-      PR_BASE_REPO_FULL_NAME: ${{ steps.pr_info.outputs.base_repo_full_name }}
-      PR_HEAD_REPO_OWNER: ${{ steps.pr_info.outputs.head_repo_owner }}
-      PR_BASE_REPO_OWNER: ${{ steps.pr_info.outputs.base_repo_owner }}
-      PR_HEAD_REPO_NAME: ${{ steps.pr_info.outputs.head_repo_name }}
-      PR_BASE_REPO_NAME: ${{ steps.pr_info.outputs.base_repo_name }}
-      PR_HEAD_REF: ${{ steps.pr_info.outputs.head_ref }}
-      PR_BASE_REF: ${{ steps.pr_info.outputs.base_ref }}
-      PR_HEAD_SHA: ${{ steps.pr_info.outputs.head_sha }}
-      PR_BASE_SHA: ${{ steps.pr_info.outputs.base_sha }}
-      PR_MERGE_COMMIT_SHA: ${{ steps.pr_info.outputs.merge_commit_sha }}
-      PR_HEAD_COMMIT_DATE: ${{ steps.pr_info.outputs.head_commit_date }}
-      PR_MERGE_COMMIT_DATE: ${{ steps.pr_info.outputs.merge_commit_date }}
-      PR_HEAD_COMMIT_TIMESTAMP: ${{ steps.get_timestamps.outputs.head_commit_timestamp }}
-      PR_MERGE_COMMIT_TIMESTAMP: ${{ steps.get_timestamps.outputs.merge_commit_timestamp }}
-      PR: ${{ steps.pr_info.outputs.pr }}
-      PR_FILES: ${{ steps.pr_info.outputs.files }}
-    if: ${{ inputs.pr_number != '' }}
-    steps:
-      - name: Extract PR details
-        id: pr_info
-        uses: actions/github-script@v6
-        with:
-          script: |            
-            const { data: pr } = await github.rest.pulls.get({
-              owner: context.repo.owner,
-              repo: context.repo.repo,
-              pull_number: ${{ inputs.pr_number }}
-            });
-
-            const { data: head_commit }  = await github.rest.repos.getCommit({
-              owner: pr.head.repo.owner.login,
-              repo: pr.head.repo.name,
-              ref: pr.head.ref
-            });
-
-            const { data: merge_commit }  = await github.rest.repos.getCommit({
-              owner: pr.base.repo.owner.login,
-              repo: pr.base.repo.name,
-              ref: pr.merge_commit_sha,
-            });
-
-            const { data: files } = await github.rest.pulls.listFiles({
-              owner: context.repo.owner,
-              repo: context.repo.repo,
-              pull_number: ${{ inputs.pr_number }}
-            });
-
-            core.setOutput('head_repo_full_name', pr.head.repo.full_name);
-            core.setOutput('base_repo_full_name', pr.base.repo.full_name);
-            core.setOutput('head_repo_owner', pr.head.repo.owner.login);
-            core.setOutput('base_repo_owner', pr.base.repo.owner.login);
-            core.setOutput('head_repo_name', pr.head.repo.name);
-            core.setOutput('base_repo_name', pr.base.repo.name);
-            core.setOutput('head_ref', pr.head.ref);
-            core.setOutput('base_ref', pr.base.ref);
-            core.setOutput('head_sha', pr.head.sha);
-            core.setOutput('base_sha', pr.base.sha);
-            core.setOutput('merge_commit_sha', pr.merge_commit_sha);
-            core.setOutput('pr', pr);
-
-            core.setOutput('head_commit_date', head_commit.commit.committer.date);
-            core.setOutput('merge_commit_date', merge_commit.commit.committer.date);
-            
-            core.setOutput('files', files);            
-            
-            console.log('PR head commit:', {
-              head_commit: head_commit,
-              commit: head_commit.commit,
-              date: head_commit.commit.committer.date
-            });
-
-            console.log('PR merge commit:', {
-              merge_commit: merge_commit,
-              commit: merge_commit.commit,
-              date: merge_commit.commit.committer.date
-            });
-
-      - name: Convert dates to timestamps
-        id: get_timestamps
-        run: |
-          head_commit_date=${{ steps.pr_info.outputs.head_commit_date }}
-          merge_commit_date=${{ steps.pr_info.outputs.merge_commit_date }}
-          echo $head_commit_date
-          echo $merge_commit_date
-          head_commit_timestamp=$(date -d "$head_commit_date" +%s)
-          merge_commit_timestamp=$(date -d "$merge_commit_date" +%s)
-          echo $head_commit_timestamp
-          echo $merge_commit_timestamp
-          echo "head_commit_timestamp=$head_commit_timestamp" >> $GITHUB_OUTPUT
-          echo "merge_commit_timestamp=$merge_commit_timestamp" >> $GITHUB_OUTPUT
--- a/.github/workflows/get-pr-number.yml
+++ b/.github/workflows/get-pr-number.yml
@ -1,36 +0,0 @@
-name: Get PR number
-on:
-  workflow_call:
-    outputs:
-      PR_NUMBER:
-        description: "The extracted PR number"
-        value: ${{ jobs.get-pr-number.outputs.PR_NUMBER }}
-
-jobs:
-  get-pr-number:
-    runs-on: ubuntu-22.04
-    name: Get PR number
-    outputs:
-      PR_NUMBER: ${{ steps.set_pr_number.outputs.PR_NUMBER }}
-    steps:
-      - name: Get PR number
-        shell: bash
-        run: |
-          if [[ "${{ github.event.issue.number }}" != "" && "${{ github.event.issue.pull_request }}" != "" ]]; then
-            echo "PR_NUMBER=${{ github.event.issue.number }}" >> $GITHUB_ENV
-          elif [[ "${{ github.event.pull_request.number }}" != "" ]]; then
-            echo "PR_NUMBER=${{ github.event.pull_request.number }}" >> $GITHUB_ENV
-          elif [[ "${{ github.event.pull_request }}" != "" ]]; then
-            echo "PR_NUMBER=${{ github.event.number }}" >> $GITHUB_ENV
-          else
-            echo "PR_NUMBER=" >> $GITHUB_ENV
-          fi
-
-      - name: Check PR number
-        shell: bash
-        run: |
-          echo "${{ env.PR_NUMBER }}"
-
-      - name: Set PR number
-        id: set_pr_number
-        run: echo "PR_NUMBER=${{ env.PR_NUMBER }}" >> "$GITHUB_OUTPUT"
--- a/.github/workflows/model_jobs.yml
+++ b/.github/workflows/model_jobs.yml
@ -12,21 +12,11 @@ on:
      slice_id:
        required: true
        type: number
-      docker:
+      runner:
        required: true
        type: string
-      commit_sha:
-        required: false
-        type: string
-      report_name_prefix:
-        required: false
-        default: run_models_gpu
-        type: string
-      runner_type:
-        required: false
-        type: string
-      report_repo_id:
-        required: false
+      docker:
+        required: true
        type: string

 env:
@ -38,7 +28,9 @@ env:
  # For gated repositories, we still need to agree to share information on the Hub repo. page in order to get access.
  # This token is created under the bot `hf-transformers-bot`.
  HF_HUB_READ_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }}
+  SIGOPT_API_TOKEN: ${{ secrets.SIGOPT_API_TOKEN }}
  TF_FORCE_GPU_ALLOW_GROWTH: true
+  RUN_PT_TF_CROSS_TESTS: 1
  CUDA_VISIBLE_DEVICES: 0,1

 jobs:
@ -54,8 +46,6 @@ jobs:
    container:
      image: ${{ inputs.docker }}
      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
-    outputs:
-      machine_type: ${{ steps.set_machine_type.outputs.machine_type }}
    steps:
      - name: Echo input and matrix info
        shell: bash
@ -77,7 +67,7 @@ jobs:

      - name: Update clone
        working-directory: /transformers
-        run: git fetch && git checkout ${{ inputs.commit_sha || github.sha }}
+        run: git fetch && git checkout ${{ github.sha }}

      - name: Reinstall transformers in edit mode (remove the one installed during docker image build)
        working-directory: /transformers
@ -109,15 +99,14 @@ jobs:
        run: pip freeze

      - name: Set `machine_type` for report and artifact names
-        id: set_machine_type
        working-directory: /transformers
        shell: bash
        run: |
          echo "${{ inputs.machine_type }}"

-          if [ "${{ inputs.machine_type }}" = "aws-g5-4xlarge-cache" ]; then
+          if [ "${{ inputs.machine_type }}" = "aws-g4dn-2xlarge-cache" ]; then
            machine_type=single-gpu
-          elif [ "${{ inputs.machine_type }}" = "aws-g5-12xlarge-cache" ]; then
+          elif [ "${{ inputs.machine_type }}" = "aws-g4dn-12xlarge-cache" ]; then
            machine_type=multi-gpu
          else
            machine_type=${{ inputs.machine_type }}
@ -125,58 +114,26 @@ jobs:

          echo "$machine_type"
          echo "machine_type=$machine_type" >> $GITHUB_ENV
-          echo "machine_type=$machine_type" >> $GITHUB_OUTPUT
-
-      - name: Create report directory if it doesn't exist
-        shell: bash
-        run: |
-          mkdir -p /transformers/reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ env.matrix_folders }}_test_reports
-          echo "dummy" > /transformers/reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ env.matrix_folders }}_test_reports/dummy.txt
-          ls -la /transformers/reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ env.matrix_folders }}_test_reports

      - name: Run all tests on GPU
        working-directory: /transformers
-        run: |
-          script -q -c "PATCH_TESTING_METHODS_TO_COLLECT_OUTPUTS=yes _PATCHED_TESTING_METHODS_OUTPUT_DIR=/transformers/reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ env.matrix_folders }}_test_reports python3 -m pytest -rsfE -v --make-reports=${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ env.matrix_folders }}_test_reports tests/${{ matrix.folders }}" test_outputs.txt
-          ls -la
-          # Extract the exit code from the output file
-          EXIT_CODE=$(tail -1 test_outputs.txt | grep -o 'COMMAND_EXIT_CODE="[0-9]*"' | cut -d'"' -f2)
-          exit ${EXIT_CODE:-1}
+        run: python3 -m pytest -rsfE -v --make-reports=${{ env.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports tests/${{ matrix.folders }}

      - name: Failure short reports
        if: ${{ failure() }}
-        # This step is only to show information on Github Actions log.
-        # Always mark this step as successful, even if the report directory or the file `failures_short.txt` in it doesn't exist
        continue-on-error: true
-        run: cat /transformers/reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ env.matrix_folders }}_test_reports/failures_short.txt
+        run: cat /transformers/reports/${{ env.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports/failures_short.txt

-      - name: Captured information
-        if: ${{ failure() }}
-        continue-on-error: true
+      - name: Run test
+        shell: bash
        run: |
-          cat /transformers/reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ env.matrix_folders }}_test_reports/captured_info.txt
+          mkdir -p /transformers/reports/${{ env.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports
+          echo "hello" > /transformers/reports/${{ env.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports/hello.txt
+          echo "${{ env.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports"

-      - name: Copy test_outputs.txt
-        if: ${{ always() }}
-        continue-on-error: true
-        run: |
-          cp /transformers/test_outputs.txt /transformers/reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ env.matrix_folders }}_test_reports
-
-      - name: "Test suite reports artifacts: ${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ env.matrix_folders }}_test_reports"
+      - name: "Test suite reports artifacts: ${{ env.machine_type }}_run_models_gpu_${{ env.matrix_folders }}_test_reports"
        if: ${{ always() }}
        uses: actions/upload-artifact@v4
        with:
-          name: ${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ env.matrix_folders }}_test_reports
-          path: /transformers/reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ env.matrix_folders }}_test_reports
-
-  collated_reports:
-    name: Collated Reports
-    if: ${{ always() }}
-    needs: run_models_gpu
-    uses: huggingface/transformers/.github/workflows/collated-reports.yml@main
-    with:
-      job: run_models_gpu
-      report_repo_id: ${{ inputs.report_repo_id }}
-      gpu_name: ${{ inputs.runner_type }}
-      machine_type: ${{ needs.run_models_gpu.outputs.machine_type }}
-    secrets: inherit
+          name: ${{ env.machine_type }}_run_models_gpu_${{ env.matrix_folders }}_test_reports
+          path: /transformers/reports/${{ env.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports
--- a/.github/workflows/model_jobs_amd.yml
+++ b/.github/workflows/model_jobs_amd.yml
@ -0,0 +1,129 @@
+name: model jobs
+
+on:
+  workflow_call:  # reusable workflow: started by a caller workflow that supplies the inputs below
+    inputs:
+      folder_slices:  # JSON string; parsed with fromJson and indexed by slice_id (see strategy.matrix)
+        required: true
+        type: string
+      machine_type:  # used as a runs-on label and as the prefix of report / artifact names
+        required: true
+        type: string
+      slice_id:  # index of the slice of folder_slices handled by this call
+        required: true
+        type: number
+      runner:  # extra runs-on label
+        required: true
+        type: string
+      docker:  # container image the job runs in
+        required: true
+        type: string
+
+env:
+  HF_HOME: /mnt/cache  # container-side target of the cache volume mounted in container.options
+  TRANSFORMERS_IS_CI: yes
+  OMP_NUM_THREADS: 8
+  MKL_NUM_THREADS: 8
+  RUN_SLOW: yes
+  # For gated repositories, we still need to agree to share information on the Hub repository page to get access.
+  # This token is created under the bot `hf-transformers-bot`.
+  HF_HUB_READ_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }}
+  SIGOPT_API_TOKEN: ${{ secrets.SIGOPT_API_TOKEN }}
+  TF_FORCE_GPU_ALLOW_GROWTH: true
+  RUN_PT_TF_CROSS_TESTS: 1
+  CUDA_VISIBLE_DEVICES: 0,1
+
+jobs:
+  run_models_gpu:
+    name: " "
+    strategy:
+      max-parallel: 1  # Run the matrix entries one at a time for now; can be raised later if this works well.
+      fail-fast: false  # a failing folder does not cancel the remaining matrix entries
+      matrix:
+        folders: ${{ fromJson(inputs.folder_slices)[inputs.slice_id] }}  # one matrix entry per folder of the selected slice
+    runs-on: ['${{ inputs.machine_type }}', self-hosted, amd-gpu, '${{ inputs.runner }}']  # runner must carry all four labels
+    container:
+      image: ${{ inputs.docker }}
+      options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+    steps:
+      - name: Echo input and matrix info  # log-only step
+        shell: bash
+        run: |
+          echo "${{ inputs.folder_slices }}"
+          echo "${{ matrix.folders }}"
+          echo "${{ toJson(fromJson(inputs.folder_slices)[inputs.slice_id]) }}"
+
+      - name: Echo folder ${{ matrix.folders }}
+        shell: bash
+        # For folders like `models/bert`, export an env. var. (`matrix_folders`) set to `models_bert`; it is used in
+        # artifact names, where the character `/` is not allowed. Only the first `models/` is replaced.
+        run: |
+          echo "${{ matrix.folders }}"
+          matrix_folders=${{ matrix.folders }}
+          matrix_folders=${matrix_folders/'models/'/'models_'}
+          echo "$matrix_folders"
+          echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV
+
+      - name: Update clone  # moves the checkout at /transformers to the commit that triggered the run
+        working-directory: /transformers
+        run: git fetch && git checkout ${{ github.sha }}
+
+      - name: Reinstall transformers in edit mode (remove the one installed during docker image build)
+        working-directory: /transformers
+        run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .
+
+      - name: Update / Install some packages (for Past CI)  # NOTE(review): same display name as the next step
+        if: ${{ contains(inputs.docker, '-past-') }}  # only when the image name contains '-past-'
+        working-directory: /transformers
+        run: |
+          python3 -m pip install -U datasets
+
+      - name: Update / Install some packages (for Past CI)
+        if: ${{ contains(inputs.docker, '-past-') && contains(inputs.docker, '-pytorch-') }}  # '-past-' and '-pytorch-' images only
+        working-directory: /transformers
+        run: |
+          python3 -m pip install --no-cache-dir git+https://github.com/huggingface/accelerate@main#egg=accelerate
+
+      - name: ROCM-SMI  # log-only step
+        run: |
+          rocm-smi
+
+      - name: ROCM-INFO  # log-only step
+        run: |
+          rocminfo  | grep "Agent" -A 14
+
+      - name: Show ROCR environment  # prints the variable forwarded through `--env ROCR_VISIBLE_DEVICES`
+        run: |
+          echo "ROCR: $ROCR_VISIBLE_DEVICES"
+
+      - name: Environment
+        working-directory: /transformers
+        run: |
+          python3 utils/print_env.py
+
+      - name: Show installed libraries and their versions
+        working-directory: /transformers
+        run: pip freeze
+
+      - name: Run all tests on GPU  # runs tests/<folder>, deselecting tests marked `not_device_test`
+        working-directory: /transformers
+        run: python3 -m pytest -rsfE -v --make-reports=${{ inputs.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports tests/${{ matrix.folders }}  -m "not not_device_test"
+
+      - name: Failure short reports  # only echoes the short failure report into the job log
+        if: ${{ failure() }}
+        continue-on-error: true  # a missing failures_short.txt must not add another failure
+        run: cat /transformers/reports/${{ inputs.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports/failures_short.txt
+
+      - name: Run test  # NOTE(review): no `if: always()`, so this placeholder step is skipped once an earlier step fails; confirm intended
+        shell: bash
+        run: |
+          mkdir -p /transformers/reports/${{ inputs.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports
+          echo "hello" > /transformers/reports/${{ inputs.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports/hello.txt
+          echo "${{ inputs.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports"
+
+      - name: "Test suite reports artifacts: ${{ inputs.machine_type }}_run_models_gpu_${{ env.matrix_folders }}_test_reports"
+        if: ${{ always() }}  # upload the reports whether or not the tests passed
+        uses: actions/upload-artifact@v4
+        with:
+          name: ${{ inputs.machine_type }}_run_models_gpu_${{ env.matrix_folders }}_test_reports  # `/`-free name built from env.matrix_folders
+          path: /transformers/reports/${{ inputs.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports  # on-disk directory keeps the original folder path
--- a/.github/workflows/model_jobs_intel_gaudi.yml
+++ b/.github/workflows/model_jobs_intel_gaudi.yml
@ -1,120 +0,0 @@
-name: model jobs
-
-on:
-  workflow_call:
-    inputs:
-      folder_slices:
-        required: true
-        type: string
-      slice_id:
-        required: true
-        type: number
-      runner:
-        required: true
-        type: string
-      machine_type:
-        required: true
-        type: string
-      report_name_prefix:
-        required: false
-        default: run_models_gpu
-        type: string
-
-env:
-  RUN_SLOW: yes
-  PT_HPU_LAZY_MODE: 0
-  TRANSFORMERS_IS_CI: yes
-  PT_ENABLE_INT64_SUPPORT: 1
-  HF_HUB_READ_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }}
-  HF_HOME: /mnt/cache/.cache/huggingface
-
-jobs:
-  run_models_gpu:
-    name: " "
-    strategy:
-      max-parallel: 8
-      fail-fast: false
-      matrix:
-        folders: ${{ fromJson(inputs.folder_slices)[inputs.slice_id] }}
-    runs-on:
-      group: ${{ inputs.runner }}
-    container:
-      image: vault.habana.ai/gaudi-docker/1.21.1/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest
-      options: --runtime=habana
-        -v /mnt/cache/.cache/huggingface:/mnt/cache/.cache/huggingface
-        --env OMPI_MCA_btl_vader_single_copy_mechanism=none
-        --env HABANA_VISIBLE_DEVICES
-        --env HABANA_VISIBLE_MODULES
-        --cap-add=sys_nice
-        --shm-size=64G
-    steps:
-      - name: Echo input and matrix info
-        shell: bash
-        run: |
-          echo "${{ inputs.folder_slices }}"
-          echo "${{ matrix.folders }}"
-          echo "${{ toJson(fromJson(inputs.folder_slices)[inputs.slice_id]) }}"
-
-      - name: Echo folder ${{ matrix.folders }}
-        shell: bash
-        run: |
-          echo "${{ matrix.folders }}"
-          matrix_folders=${{ matrix.folders }}
-          matrix_folders=${matrix_folders/'models/'/'models_'}
-          echo "$matrix_folders"
-          echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV
-
-      - name: Checkout
-        uses: actions/checkout@v4
-        with:
-          fetch-depth: 0
-
-      - name: Install dependencies
-        run: |
-          pip install -e .[testing,torch] "numpy<2.0.0" scipy scikit-learn
-
-      - name: HL-SMI
-        run: |
-          hl-smi
-          echo "HABANA_VISIBLE_DEVICES=${HABANA_VISIBLE_DEVICES}"
-          echo "HABANA_VISIBLE_MODULES=${HABANA_VISIBLE_MODULES}"
-
-      - name: Environment
-        run: python3 utils/print_env.py
-
-      - name: Show installed libraries and their versions
-        run: pip freeze
-
-      - name: Set `machine_type` for report and artifact names
-        shell: bash
-        run: |
-          if [ "${{ inputs.machine_type }}" = "1gaudi" ]; then
-            machine_type=single-gpu
-          elif [ "${{ inputs.machine_type }}" = "2gaudi" ]; then
-            machine_type=multi-gpu
-          else
-            machine_type=${{ inputs.machine_type }}
-          fi
-          echo "machine_type=$machine_type" >> $GITHUB_ENV
-
-      - name: Run all tests on Gaudi
-        run: python3 -m pytest -v --make-reports=${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ matrix.folders }}_test_reports tests/${{ matrix.folders }}
-
-      - name: Failure short reports
-        if: ${{ failure() }}
-        continue-on-error: true
-        run: cat reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ matrix.folders }}_test_reports/failures_short.txt
-
-      - name: Run test
-        shell: bash
-        run: |
-          mkdir -p reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ matrix.folders }}_test_reports
-          echo "hello" > reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ matrix.folders }}_test_reports/hello.txt
-          echo "${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ matrix.folders }}_test_reports"
-
-      - name: "Test suite reports artifacts: ${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ env.matrix_folders }}_test_reports"
-        if: ${{ always() }}
-        uses: actions/upload-artifact@v4
-        with:
-          name: ${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ env.matrix_folders }}_test_reports
-          path: reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ matrix.folders }}_test_reports
--- a/.github/workflows/new_model_pr_merged_notification.yml
+++ b/.github/workflows/new_model_pr_merged_notification.yml
@ -1,68 +0,0 @@
-# Used to notify core maintainers about new model PR being merged
-name: New model PR merged notification
-
-on:
-  push:
-    branches:
-      - main
-    paths:
-      - 'src/transformers/models/*/modeling_*'
-
-jobs:
-  notify_new_model:
-    name: Notify new model
-    runs-on: ubuntu-22.04
-    steps:
-      - uses: actions/checkout@v4
-        with:
-          fetch-depth: 0
-      - name: Check new model
-        shell: bash
-        run: |
-          python -m pip install gitpython
-          python -c 'from utils.pr_slow_ci_models import get_new_model; new_model = get_new_model(diff_with_last_commit=True); print(new_model)' | tee output.txt
-          echo "NEW_MODEL=$(tail -n 1 output.txt)" >> $GITHUB_ENV
-          echo "COMMIT_SHA=$(git log -1 --format=%H)" >> $GITHUB_ENV
-
-      - name: print commit sha
-        if: ${{ env.NEW_MODEL != ''}}
-        shell: bash
-        run: |
-          echo "$COMMIT_SHA"
-
-      - name: print new model
-        if: ${{ env.NEW_MODEL != ''}}
-        shell: bash
-        run: |
-          echo "$NEW_MODEL"
-
-      - name: Notify
-        if: ${{ env.NEW_MODEL != ''}}
-        uses: slackapi/slack-github-action@6c661ce58804a1a20f6dc5fbee7f0381b469e001
-        with:
-          # Slack channel id, channel name, or user id to post message.
-          # See also: https://api.slack.com/methods/chat.postMessage#channels
-          channel-id: transformers-new-model-notification
-          # For posting a rich message using Block Kit
-          payload: |
-            {
-              "blocks": [
-                {
-                  "type": "header",
-                  "text": {
-                    "type": "plain_text",
-                    "text": "New model!",
-                    "emoji": true
-                  }
-                },
-                {
-                  "type": "section",
-                  "text": {
-                    "type": "mrkdwn",
-                    "text": "<https://github.com/huggingface/transformers/commit/${{ env.COMMIT_SHA }}|New model: ${{ env.NEW_MODEL }}> GH_ArthurZucker, GH_lysandrejik, GH_ydshieh\ncommit SHA: ${{ env.COMMIT_SHA }}"
-                  }
-                }
-              ]
-            }
-        env:
-          SLACK_BOT_TOKEN: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
--- a/.github/workflows/pr-style-bot.yml
+++ b/.github/workflows/pr-style-bot.yml
@ -1,18 +0,0 @@
-# To run this bot, comment "@bot /style" on a PR
-name: Style Bot
-
-on:
-  issue_comment:
-    types: [created]
-
-permissions:
-  pull-requests: write
-
-jobs:
-  style:
-    uses: huggingface/huggingface_hub/.github/workflows/style-bot-action.yml@main
-    with:
-      python_quality_dependencies: "[quality]"
-      style_command_type: "default"
-    secrets:
-      bot_token: ${{ secrets.HF_STYLE_BOT_ACTION }}
--- a/.github/workflows/pr_build_doc_with_comment.yml
+++ b/.github/workflows/pr_build_doc_with_comment.yml
@ -1,134 +0,0 @@
-name: PR - build doc via comment
-on:
-  issue_comment:
-    types:
-      - created
-    branches-ignore:
-      - main
-concurrency:
-  group: ${{ github.workflow }}-${{ github.event.issue.number }}-${{ startsWith(github.event.comment.body, 'build-doc') }}
-  cancel-in-progress: true
-permissions: {}
-
-
-jobs:
-  get-pr-number:
-    name: Get PR number
-    if: ${{ github.event.issue.state == 'open' && contains(fromJSON('["ydshieh", "ArthurZucker", "zucchini-nlp", "molbap", "gante", "LysandreJik", "Cyrilvallez", "Rocketknight1", "SunMarc", "eustlb", "MekkCyber", "vasqu", "ivarflakstad", "stevhliu", "ebezzam", "itazap"]'), github.actor) && (startsWith(github.event.comment.body, 'build-doc')) }}
-    uses: ./.github/workflows/get-pr-number.yml
-
-  get-pr-info:
-    name: Get PR commit SHA
-    needs: get-pr-number
-    if: ${{ needs.get-pr-number.outputs.PR_NUMBER != ''}}
-    uses: ./.github/workflows/get-pr-info.yml
-    with:
-      pr_number: ${{ needs.get-pr-number.outputs.PR_NUMBER }}
-
-  verity_pr_commit:
-    name: Verity PR commit corresponds to a specific event by comparing timestamps
-    if: ${{ needs.get-pr-number.outputs.PR_NUMBER != ''}}
-    runs-on: ubuntu-22.04
-    needs: get-pr-info
-    env:
-      COMMENT_DATE: ${{ github.event.comment.created_at }}
-      PR_MERGE_COMMIT_DATE: ${{ needs.get-pr-info.outputs.PR_MERGE_COMMIT_DATE }}
-      PR_MERGE_COMMIT_TIMESTAMP: ${{ needs.get-pr-info.outputs.PR_MERGE_COMMIT_TIMESTAMP }}
-    steps:
-      - run: |
-          COMMENT_TIMESTAMP=$(date -d "${COMMENT_DATE}" +"%s")
-          echo "COMMENT_DATE: $COMMENT_DATE"
-          echo "PR_MERGE_COMMIT_DATE: $PR_MERGE_COMMIT_DATE"
-          echo "COMMENT_TIMESTAMP: $COMMENT_TIMESTAMP"
-          echo "PR_MERGE_COMMIT_TIMESTAMP: $PR_MERGE_COMMIT_TIMESTAMP"
-          if [ $COMMENT_TIMESTAMP -le $PR_MERGE_COMMIT_TIMESTAMP ]; then
-            echo "Last commit on the pull request is newer than the issue comment triggering this run! Abort!";
-            exit -1;
-          fi
-
-  create_run:
-    name: Create run
-    needs: [get-pr-number, get-pr-info]
-    if: ${{ needs.get-pr-number.outputs.PR_NUMBER != '' }}
-    permissions:
-      statuses: write
-    runs-on: ubuntu-22.04
-    steps:
-      - name: Create Run
-        id: create_run
-        env:
-          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-          # Create a commit status (pending) for a run of this workflow. The status has to be updated later in `update_run_status`.
-          # See https://docs.github.com/en/rest/commits/statuses?apiVersion=2022-11-28#create-a-commit-status
-          GITHUB_RUN_URL: https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}
-        run: |
-          gh api \
-            --method POST \
-            -H "Accept: application/vnd.github+json" \
-            -H "X-GitHub-Api-Version: 2022-11-28" \
-            repos/${{ github.repository }}/statuses/${{ needs.get-pr-info.outputs.PR_HEAD_SHA }} \
-            -f "target_url=$GITHUB_RUN_URL" -f "state=pending" -f "description=Custom doc building job" -f "context=custom-doc-build"
-
-  reply_to_comment:
-    name: Reply to the comment
-    if: ${{ needs.create_run.result == 'success' }}
-    needs: [get-pr-number, create_run]
-    permissions:
-      pull-requests: write
-    runs-on: ubuntu-22.04
-    steps:
-      - name: Reply to the comment
-        env:
-          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-          GITHUB_RUN_URL: https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}
-        run: |
-          gh api \
-            --method POST \
-            -H "Accept: application/vnd.github+json" \
-            -H "X-GitHub-Api-Version: 2022-11-28" \
-            repos/${{ github.repository }}/issues/${{ needs.get-pr-number.outputs.PR_NUMBER }}/comments \
-            -f "body=[Building docs for all languages...](${{ env.GITHUB_RUN_URL }})"
-
-  build-doc:
-    name: Build doc
-    needs: [get-pr-number, get-pr-info]
-    if: ${{ needs.get-pr-number.outputs.PR_NUMBER != '' }}
-    uses: huggingface/doc-builder/.github/workflows/build_pr_documentation.yml@main
-    with:
-      commit_sha: ${{ needs.get-pr-info.outputs.PR_HEAD_SHA }}
-      pr_number: ${{ needs.get-pr-number.outputs.PR_NUMBER }}
-      package: transformers
-      languages: ar de en es fr hi it ko pt tr zh ja te
-
-  update_run_status:
-    name: Update Check Run Status
-    needs: [ get-pr-info, create_run, build-doc ]
-    permissions:
-      statuses: write
-    if: ${{ always() && needs.create_run.result == 'success' }}
-    runs-on: ubuntu-22.04
-    env:
-      GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-      GITHUB_RUN_URL: https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}
-      STATUS_OK: ${{ contains(fromJSON('["skipped", "success"]'), needs.create_run.result) }}
-    steps:
-      - name: Get `build-doc` job status
-        run: |
-          echo "${{ needs.build-doc.result }}"
-          echo $STATUS_OK
-          if [ "$STATUS_OK" = "true" ]; then
-            echo "STATUS=success" >> $GITHUB_ENV
-          else
-            echo "STATUS=failure" >> $GITHUB_ENV
-          fi
-
-      - name: Update PR commit statuses
-        run: |
-          echo "${{ needs.build-doc.result }}"
-          echo "${{ env.STATUS }}"
-          gh api \
-            --method POST \
-            -H "Accept: application/vnd.github+json" \
-            -H "X-GitHub-Api-Version: 2022-11-28" \
-            repos/${{ github.repository }}/statuses/${{ needs.get-pr-info.outputs.PR_HEAD_SHA }} \
-            -f "target_url=$GITHUB_RUN_URL" -f "state=${{ env.STATUS }}" -f "description=Custom doc building job" -f "context=custom-doc-build"
--- a/.github/workflows/pr_run_slow_ci.yml
+++ b/.github/workflows/pr_run_slow_ci.yml
@ -1,177 +0,0 @@
-name: PR slow CI
-on:
-  pull_request_target:
-    types: [opened, synchronize, reopened]
-
-jobs:
-  get-pr-number:
-    name: Get PR number
-    uses: ./.github/workflows/get-pr-number.yml
-
-  get-pr-info:
-    name: Get PR commit SHA
-    needs: get-pr-number
-    if: ${{ needs.get-pr-number.outputs.PR_NUMBER != ''}}
-    uses: ./.github/workflows/get-pr-info.yml
-    with:
-      pr_number: ${{ needs.get-pr-number.outputs.PR_NUMBER }}
-
-  get-jobs:
-    name: Get test files to run
-    runs-on: ubuntu-22.04
-    needs: [get-pr-number, get-pr-info]
-    outputs:
-      jobs: ${{ steps.get_jobs.outputs.jobs_to_run }}
-    steps:
-      - name: Get repository content
-        id: repo_content
-        uses: actions/github-script@v6
-        with:
-          script: |
-            const { data: tests_dir } = await github.rest.repos.getContent({
-              owner: '${{ needs.get-pr-info.outputs.PR_HEAD_REPO_OWNER }}',
-              repo: '${{ needs.get-pr-info.outputs.PR_HEAD_REPO_NAME }}',
-              path: 'tests',
-              ref: '${{ needs.get-pr-info.outputs.PR_HEAD_SHA }}',
-            });
-
-            const { data: tests_models_dir } = await github.rest.repos.getContent({
-              owner: '${{ needs.get-pr-info.outputs.PR_HEAD_REPO_OWNER }}',
-              repo: '${{ needs.get-pr-info.outputs.PR_HEAD_REPO_NAME }}',
-              path: 'tests/models',
-              ref: '${{ needs.get-pr-info.outputs.PR_HEAD_SHA }}',
-            });
-
-            const { data: tests_quantization_dir } = await github.rest.repos.getContent({
-              owner: '${{ needs.get-pr-info.outputs.PR_HEAD_REPO_OWNER }}',
-              repo: '${{ needs.get-pr-info.outputs.PR_HEAD_REPO_NAME }}',
-              path: 'tests/quantization',
-              ref: '${{ needs.get-pr-info.outputs.PR_HEAD_SHA }}',
-            });
-
-            core.setOutput('tests_dir', tests_dir);
-            core.setOutput('tests_models_dir', tests_models_dir);
-            core.setOutput('tests_quantization_dir', tests_quantization_dir);
-
-      # This checkout to the main branch
-      - uses: actions/checkout@v4
-        with:
-          fetch-depth: "0"
-
-      - name: Write pr_files file
-        run: |
-          cat > pr_files.txt << 'EOF'
-          ${{ needs.get-pr-info.outputs.PR_FILES }}
-          EOF
-
-      - name: Write tests_dir file
-        run: |
-          cat > tests_dir.txt << 'EOF'
-          ${{ steps.repo_content.outputs.tests_dir }}
-          EOF
-
-      - name: Write tests_models_dir file
-        run: |
-          cat > tests_models_dir.txt << 'EOF'
-          ${{ steps.repo_content.outputs.tests_models_dir }}
-          EOF
-
-      - name: Write tests_quantization_dir file
-        run: |
-          cat > tests_quantization_dir.txt << 'EOF'
-          ${{ steps.repo_content.outputs.tests_quantization_dir }}
-          EOF
-
-      - name: Run script to get jobs to run
-        id: get_jobs
-        run: |
-          python utils/get_pr_run_slow_jobs.py | tee output.txt
-          echo "jobs_to_run: $(tail -n 1 output.txt)"
-          echo "jobs_to_run=$(tail -n 1 output.txt)" >> $GITHUB_OUTPUT
-
-  send_comment:
-    # Will delete the previous comment and send a new one if:
-    #   - either the content is changed
-    #   - or the previous comment is 30 minutes or more old
-    name: Send a comment to suggest jobs to run
-    if: ${{ needs.get-jobs.outputs.jobs != '' }}
-    needs: [get-pr-number, get-jobs]
-    permissions:
-      pull-requests: write
-    runs-on: ubuntu-22.04
-    steps:
-      - name: Check and update comment if needed
-        uses: actions/github-script@v7
-        env:
-          BODY: "\n\nrun-slow: ${{ needs.get-jobs.outputs.jobs }}"
-        with:
-          script: |
-            const prNumber = ${{ needs.get-pr-number.outputs.PR_NUMBER }};
-            const commentPrefix = "**[For maintainers]** Suggested jobs to run (before merge)";
-            const thirtyMinutesAgo = new Date(Date.now() - 30 * 60 * 1000); // 30 minutes ago
-            const newBody = `${commentPrefix}${process.env.BODY}`;
-            
-            // Get all comments on the PR
-            const { data: comments } = await github.rest.issues.listComments({
-              owner: context.repo.owner,
-              repo: context.repo.repo,
-              issue_number: prNumber
-            });
-            
-            // Find existing comments that start with our prefix
-            const existingComments = comments.filter(comment => 
-              comment.user.login === 'github-actions[bot]' && 
-              comment.body.startsWith(commentPrefix)
-            );
-            
-            let shouldCreateNewComment = true;
-            let commentsToDelete = [];
-            
-            if (existingComments.length > 0) {
-              // Get the most recent comment
-              const mostRecentComment = existingComments
-                .sort((a, b) => new Date(b.created_at) - new Date(a.created_at))[0];
-              
-              const commentDate = new Date(mostRecentComment.created_at);
-              const isOld = commentDate < thirtyMinutesAgo;
-              const isDifferentContent = mostRecentComment.body !== newBody;
-              
-              console.log(`Most recent comment created: ${mostRecentComment.created_at}`);
-              console.log(`Is older than 30 minutes: ${isOld}`);
-              console.log(`Has different content: ${isDifferentContent}`);
-              
-              if (isOld || isDifferentContent) {
-                // Delete all existing comments and create new one
-                commentsToDelete = existingComments;
-                console.log(`Will delete ${commentsToDelete.length} existing comment(s) and create new one`);
-              } else {
-                // Content is same and comment is recent, skip
-                shouldCreateNewComment = false;
-                console.log('Comment is recent and content unchanged, skipping update');
-              }
-            } else {
-              console.log('No existing comments found, will create new one');
-            }
-            
-            // Delete old comments if needed
-            for (const comment of commentsToDelete) {
-              console.log(`Deleting comment #${comment.id} (created: ${comment.created_at})`);
-              await github.rest.issues.deleteComment({
-                owner: context.repo.owner,
-                repo: context.repo.repo,
-                comment_id: comment.id
-              });
-            }
-            
-            // Create new comment if needed
-            if (shouldCreateNewComment) {
-              await github.rest.issues.createComment({
-                owner: context.repo.owner,
-                repo: context.repo.repo,
-                issue_number: prNumber,
-                body: newBody
-              });
-              console.log('✅ New comment created');
-            } else {
-              console.log('ℹ️ No comment update needed');
-            }
--- a/.github/workflows/push-important-models.yml
+++ b/.github/workflows/push-important-models.yml
@ -4,6 +4,18 @@ on:
  push:
    branches: [ main ]

+env:
+  OUTPUT_SLACK_CHANNEL_ID: "C06L2SGMEEA"
+  # For gated repositories, we still need to agree to share information on the Hub repo page in order to get access.
+  # This token is created under the bot `hf-transformers-bot`.
+  HF_HUB_READ_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }}
+  HF_HOME: /mnt/cache 
+  TRANSFORMERS_IS_CI: yes 
+  OMP_NUM_THREADS: 8 
+  MKL_NUM_THREADS: 8 
+  RUN_SLOW: yes
+  SIGOPT_API_TOKEN: ${{ secrets.SIGOPT_API_TOKEN }} 
+  TF_FORCE_GPU_ALLOW_GROWTH: true 
+  RUN_PT_TF_CROSS_TESTS: 1
+
 jobs:
  get_modified_models:
    name: "Get all modified files"
@ -13,145 +25,119 @@ jobs:
    steps:
      - name: Check out code
        uses: actions/checkout@v4
-
-      - name: Get changed files using `actions/github-script`
-        id: get-changed-files
-        uses: actions/github-script@v7
+      
+      - name: Get changed files
+        id: changed-files
+        uses: tj-actions/changed-files@3f54ebb830831fc121d3263c1857cfbdc310cdb9 #v42
        with:
-          script: |
-            let files = [];
-            
-            // Only handle push events
-            if (context.eventName === 'push') {
-              const afterSha = context.payload.after;
-              const branchName = context.payload.ref.replace('refs/heads/', '');
-              
-              let baseSha;
-              
-              if (branchName === 'main') {
-                console.log('Push to main branch, comparing to parent commit');
-                // Get the parent commit of the pushed commit
-                const { data: commit } = await github.rest.repos.getCommit({
-                  owner: context.repo.owner,
-                  repo: context.repo.repo,
-                  ref: afterSha
-                });
-                baseSha = commit.parents[0]?.sha;
-                if (!baseSha) {
-                  throw new Error('No parent commit found for the pushed commit');
-                }
-              } else {
-                console.log(`Push to branch ${branchName}, comparing to main`);
-                baseSha = 'main';
-              }
-              
-              const { data: comparison } = await github.rest.repos.compareCommits({
-                owner: context.repo.owner,
-                repo: context.repo.repo,
-                base: baseSha,
-                head: afterSha
-              });
-              
-              // Include added, modified, and renamed files
-              files = comparison.files
-                .filter(file => file.status === 'added' || file.status === 'modified' || file.status === 'renamed')
-                .map(file => file.filename);
-            }
-            
-            // Include all files under src/transformers/ (not just models subdirectory)
-            const filteredFiles = files.filter(file => 
-              file.startsWith('src/transformers/')
-            );
-            
-            core.setOutput('changed_files', filteredFiles.join(' '));
-            core.setOutput('any_changed', filteredFiles.length > 0 ? 'true' : 'false');
-
-      - name: Parse changed files with Python
-        if: steps.get-changed-files.outputs.any_changed == 'true'
-        env:
-          CHANGED_FILES: ${{ steps.get-changed-files.outputs.changed_files }}
+          files: src/transformers/models/**
+      
+      - name: Run step if only the files listed above change
+        if: steps.changed-files.outputs.any_changed == 'true'
        id: set-matrix
+        env:
+          ALL_CHANGED_FILES: ${{ steps.changed-files.outputs.all_changed_files }}
        run: |
-          python3 - << 'EOF'
-          import os
-          import sys
-          import json
-          
-          # Add the utils directory to Python path
-          sys.path.insert(0, 'utils')
-          
-          # Import the important models list
-          from important_files import IMPORTANT_MODELS
-          
-          print(f"Important models: {IMPORTANT_MODELS}")
-          
-          # Get the changed files from the previous step
-          changed_files_str = os.environ.get('CHANGED_FILES', '')
-          changed_files = changed_files_str.split() if changed_files_str else []
-          
-          # Filter to only Python files
-          python_files = [f for f in changed_files if f.endswith('.py')]
-          print(f"Python files changed: {python_files}")
-          
-          result_models = set()
-          
-          # Specific files that trigger all models
-          transformers_utils_files = [
-              'modeling_utils.py',
-              'modeling_rope_utils.py', 
-              'modeling_flash_attention_utils.py',
-              'modeling_attn_mask_utils.py',
-              'cache_utils.py',
-              'masking_utils.py',
-              'pytorch_utils.py'
-          ]
-          
-          # Single loop through all Python files
-          for file in python_files:
-              # Check for files under src/transformers/models/
-              if file.startswith('src/transformers/models/'):
-                  remaining_path = file[len('src/transformers/models/'):]
-                  if '/' in remaining_path:
-                      model_dir = remaining_path.split('/')[0]
-                      if model_dir in IMPORTANT_MODELS:
-                          result_models.add(model_dir)
-                          print(f"Added model directory: {model_dir}")
-              
-              # Check for specific files under src/transformers/ or src/transformers/generation/ files
-              elif file.startswith('src/transformers/generation/') or \
-                   (file.startswith('src/transformers/') and os.path.basename(file) in transformers_utils_files):
-                  print(f"Found core file: {file} - including all important models")
-                  result_models.update(IMPORTANT_MODELS)
-                  break  # No need to continue once we include all models
-          
-          # Convert to sorted list and create matrix
-          result_list = sorted(list(result_models))
-          print(f"Final model list: {result_list}")
-          
-          if result_list:
-              matrix_json = json.dumps(result_list)
-              print(f"matrix={matrix_json}")
-              
-              # Write to GITHUB_OUTPUT
-              with open(os.environ['GITHUB_OUTPUT'], 'a') as f:
-                  f.write(f"matrix={matrix_json}\n")
-          else:
-              print("matrix=[]")
-              with open(os.environ['GITHUB_OUTPUT'], 'a') as f:
-                  f.write("matrix=[]\n")
-          EOF
-
-  model-ci:
-    name: Model CI
-    uses: ./.github/workflows/self-scheduled.yml
+            # Build the JSON list of important models touched by the changed files.
+            # Each model is listed once, even when several of its files changed,
+            # so the downstream matrix does not get duplicate jobs.
+            model_arrays=()
+            for file in $ALL_CHANGED_FILES; do
+                # e.g. src/transformers/models/bert/modeling_bert.py -> models/bert
+                model_path="${file#*models/}"
+                model_path="models/${model_path%%/*}"
+                if grep -qFx "$model_path" utils/important_models.txt \
+                    && [[ " ${model_arrays[*]} " != *" ${model_path} "* ]]; then
+                    model_arrays+=("$model_path")
+                fi
+            done
+            # `printf` with no arguments would still emit `""`, so handle the
+            # empty case explicitly and output a real empty list (`[]`).
+            matrix_string=""
+            if [ "${#model_arrays[@]}" -gt 0 ]; then
+                matrix_string=$(printf '"%s", ' "${model_arrays[@]}" | sed 's/, $//')
+            fi
+            echo "matrix=[$matrix_string]" >> "$GITHUB_OUTPUT"
+  # Runs the FA2 (`flash_attn_test` marker) and integration tests of each modified important model,
+  # one matrix job per entry (e.g. `models/bert`) of the list computed by `get_modified_models`.
+  test_modified_files:
    needs: get_modified_models
-    if: needs.get_modified_models.outputs.matrix != '' && needs.get_modified_models.outputs.matrix != '[]'
-    with:
-      job: run_models_gpu
-      slack_report_channel: "#transformers-ci-push"
-      docker: huggingface/transformers-all-latest-gpu
-      ci_event: push
-      report_repo_id: hf-internal-testing/transformers_ci_push
-      commit_sha: ${{ github.sha }}
-      models: ${{ needs.get_modified_models.outputs.matrix }}
+    name: Slow & FA2 tests
+    runs-on:
+      group: aws-g5-4xlarge-cache
+    container:
+      image: huggingface/transformers-all-latest-gpu
+      options: --gpus all --privileged --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+    # Skip the job when the matrix is unset, `[]`, or has no usable first entry.
+    if: ${{ needs.get_modified_models.outputs.matrix != '[]' && needs.get_modified_models.outputs.matrix != '' && fromJson(needs.get_modified_models.outputs.matrix)[0] != null }}
+    strategy:
+      fail-fast: false
+      matrix:
+        model-name: ${{ fromJson(needs.get_modified_models.outputs.matrix) }}
+
+    steps:
+      - name: Check out code
+        uses: actions/checkout@v4
+
+      - name: Set artifact-safe model name
+        shell: bash
+        # `matrix.model-name` looks like `models/bert`, but `/` is not allowed in artifact names:
+        # export `models_bert` as `env.artifact_model_name` for the upload steps below.
+        run: |
+          model_name="${{ matrix.model-name }}"
+          echo "artifact_model_name=${model_name//\//_}" >> "$GITHUB_ENV"
+
+      - name: Install locally transformers & other libs
+        run: |
+          apt install sudo
+          sudo -H pip install --upgrade pip
+          sudo -H pip uninstall -y transformers
+          sudo -H pip install -U -e ".[testing]"
+          MAX_JOBS=4 pip install flash-attn --no-build-isolation
+          pip install bitsandbytes
+
+      - name: NVIDIA-SMI
+        run: |
+          nvidia-smi
+
+      - name: Show installed libraries and their versions
+        run: pip freeze
+
+      - name: Run FA2 tests
+        id: run_fa2_tests
+        run:
+          pytest -rsfE -m "flash_attn_test" --make-reports=${{ matrix.model-name }}_fa2_tests/ tests/${{ matrix.model-name }}/test_modeling_*
+
+      # pytest runs from the checked-out workspace, so the reports are looked up under its `reports/` folder.
+      - name: "Test suite reports artifacts: ${{ matrix.model-name }}_fa2_tests"
+        if: ${{ always() }}
+        uses: actions/upload-artifact@v4
+        with:
+          name: ${{ env.artifact_model_name }}_fa2_tests
+          path: reports/${{ matrix.model-name }}_fa2_tests
+
+      - name: Post to Slack
+        if: always()
+        uses: huggingface/hf-workflows/.github/actions/post-slack@main
+        with:
+          slack_channel: ${{ env.OUTPUT_SLACK_CHANNEL_ID }}
+          title: 🤗 Results of the FA2 tests - ${{ matrix.model-name }}
+          status: ${{ steps.run_fa2_tests.conclusion}}
+          slack_token: ${{ secrets.CI_SLACK_BOT_TOKEN }}
+
+      # `always()` keeps the integration tests running even when the FA2 tests failed.
+      - name: Run integration tests
+        id: run_integration_tests
+        if: always()
+        run:
+          pytest -rsfE -k "IntegrationTest"  --make-reports=tests_integration_${{ matrix.model-name }} tests/${{ matrix.model-name }}/test_modeling_*
+
+      - name: "Test suite reports artifacts: tests_integration_${{ matrix.model-name }}"
+        if: ${{ always() }}
+        uses: actions/upload-artifact@v4
+        with:
+          name: tests_integration_${{ env.artifact_model_name }}
+          path: reports/tests_integration_${{ matrix.model-name }}
+
+      - name: Post to Slack
+        if: always()
+        uses: huggingface/hf-workflows/.github/actions/post-slack@main
+        with:
+          slack_channel: ${{ env.OUTPUT_SLACK_CHANNEL_ID }}
+          title: 🤗 Results of the Integration tests - ${{ matrix.model-name }}
+          status: ${{ steps.run_integration_tests.conclusion}}
+          slack_token: ${{ secrets.CI_SLACK_BOT_TOKEN }}
+
+      - name: Tailscale # In order to be able to SSH when a test fails
+        if: ${{ runner.debug == '1'}}
+        uses: huggingface/tailscale-action@v1
+        with:
+          authkey: ${{ secrets.TAILSCALE_SSH_AUTHKEY }}
+          slackChannel: ${{ secrets.SLACK_CIFEEDBACK_CHANNEL }}
+          slackToken: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
+          waitForSSH: true
+  # Calls the reusable benchmark workflow, under the same condition as the test job:
+  # the matrix computed by `get_modified_models` must be set and must not be an empty list.
+  benchmark:
+    name: Benchmark workflow
+    needs: get_modified_models
+    if: ${{ needs.get_modified_models.outputs.matrix != '[]' && needs.get_modified_models.outputs.matrix != '' && fromJson(needs.get_modified_models.outputs.matrix)[0] != null }}
+    uses: ./.github/workflows/benchmark.yml
    secrets: inherit
--- a/.github/workflows/self-comment-ci.yml
+++ b/.github/workflows/self-comment-ci.yml
@ -1,415 +0,0 @@
-name: PR comment GitHub CI
-
-on:
-  issue_comment:
-    types:
-      - created
-    branches-ignore:
-      - main
-concurrency:
-  group: ${{ github.workflow }}-${{ github.event.issue.number }}-${{ startsWith(github.event.comment.body, 'run-slow') || startsWith(github.event.comment.body, 'run slow') || startsWith(github.event.comment.body, 'run_slow') }}
-  cancel-in-progress: true
-permissions: read-all
-
-env:
-  HF_HOME: /mnt/cache
-  TRANSFORMERS_IS_CI: yes
-  OMP_NUM_THREADS: 8
-  MKL_NUM_THREADS: 8
-  RUN_SLOW: yes
-  # For gated repositories, we still need to agree to share information on the Hub repo. page in order to get access.
-  # This token is created under the bot `hf-transformers-bot`.
-  HF_HUB_READ_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }}
-  TF_FORCE_GPU_ALLOW_GROWTH: true
-  CUDA_VISIBLE_DEVICES: 0,1
-
-jobs:
-  get-pr-number:
-    runs-on: ubuntu-22.04
-    name: Get PR number
-    # For security: only allow team members to run
-    if: ${{ github.event.issue.state == 'open' && contains(fromJSON('["ydshieh", "ArthurZucker", "zucchini-nlp", "molbap", "gante", "LysandreJik", "Cyrilvallez", "Rocketknight1", "SunMarc", "eustlb", "MekkCyber", "vasqu", "ivarflakstad", "stevhliu", "ebezzam", "remi-or", "itazap"]'), github.actor) && (startsWith(github.event.comment.body, 'run-slow') || startsWith(github.event.comment.body, 'run slow') || startsWith(github.event.comment.body, 'run_slow')) }}
-    outputs:
-      PR_NUMBER: ${{ steps.set_pr_number.outputs.PR_NUMBER }}
-    steps:
-      - name: Get PR number
-        shell: bash
-        run: |
-          if [[ "${{ github.event.issue.number }}" != "" && "${{ github.event.issue.pull_request }}" != "" ]]; then
-            echo "PR_NUMBER=${{ github.event.issue.number }}" >> $GITHUB_ENV
-          else
-            echo "PR_NUMBER=" >> $GITHUB_ENV
-          fi
-
-      - name: Check PR number
-        shell: bash
-        run: |
-          echo "${{ env.PR_NUMBER }}"
-
-      - name: Set PR number
-        id: set_pr_number
-        run: echo "PR_NUMBER=${{ env.PR_NUMBER }}" >> "$GITHUB_OUTPUT"
-
-  get-sha:
-    runs-on: ubuntu-22.04
-    needs: get-pr-number
-    if: ${{ needs.get-pr-number.outputs.PR_NUMBER != ''}}
-    outputs:
-      PR_HEAD_SHA: ${{ steps.get_sha.outputs.PR_HEAD_SHA }}
-      PR_MERGE_SHA: ${{ steps.get_sha.outputs.PR_MERGE_SHA }}
-    steps:
-      - uses: actions/checkout@v4
-        with:
-          fetch-depth: "0"
-          ref: "refs/pull/${{needs.get-pr-number.outputs.PR_NUMBER}}/merge"
-
-      - name: Get SHA (and verify timestamps against the issue comment date)
-        id: get_sha
-        env:
-          PR_NUMBER: ${{ needs.get-pr-number.outputs.PR_NUMBER }}
-          COMMENT_DATE: ${{ github.event.comment.created_at }}
-        run: |
-            git fetch origin refs/pull/$PR_NUMBER/head:refs/remotes/pull/$PR_NUMBER/head
-            git checkout refs/remotes/pull/$PR_NUMBER/head
-            echo "PR_HEAD_SHA: $(git log -1 --format=%H)"
-            echo "PR_HEAD_SHA=$(git log -1 --format=%H)" >> "$GITHUB_OUTPUT"
-            git fetch origin refs/pull/$PR_NUMBER/merge:refs/remotes/pull/$PR_NUMBER/merge
-            git checkout refs/remotes/pull/$PR_NUMBER/merge
-            echo "PR_MERGE_SHA: $(git log -1 --format=%H)"
-            echo "PR_MERGE_SHA=$(git log -1 --format=%H)" >> "$GITHUB_OUTPUT"
-            PR_MERGE_COMMIT_TIMESTAMP=$(git log -1 --date=unix --format=%cd)
-            echo "PR_MERGE_COMMIT_TIMESTAMP: $PR_MERGE_COMMIT_TIMESTAMP"
-            COMMENT_TIMESTAMP=$(date -d "${COMMENT_DATE}" +"%s")
-            echo "COMMENT_DATE: $COMMENT_DATE"
-            echo "COMMENT_TIMESTAMP: $COMMENT_TIMESTAMP"
-            if [ $COMMENT_TIMESTAMP -le $PR_MERGE_COMMIT_TIMESTAMP ]; then
-              echo "Last commit on the pull request is newer than the issue comment triggering this run! Abort!";
-              exit -1;
-            fi
-
-  # use a python script to handle this complex logic
-  # case 1: `run-slow` (auto. infer with limited number of models, but in particular, new model)
-  # case 2: `run-slow model_1, model_2`
-  get-tests:
-    runs-on: ubuntu-22.04
-    needs: [get-pr-number, get-sha]
-    if: ${{ needs.get-pr-number.outputs.PR_NUMBER != ''}}
-    outputs:
-      models: ${{ steps.models_to_run.outputs.models }}
-      quantizations: ${{ steps.models_to_run.outputs.quantizations }}
-    steps:
-      - uses: actions/checkout@v4
-        with:
-          fetch-depth: "0"
-          ref: "refs/pull/${{needs.get-pr-number.outputs.PR_NUMBER}}/merge"
-
-      - name: Verify merge commit SHA
-        env:
-          VERIFIED_PR_MERGE_SHA: ${{ needs.get-sha.outputs.PR_MERGE_SHA }}
-        run: |
-            PR_MERGE_SHA=$(git log -1 --format=%H)
-            if [ $PR_MERGE_SHA != $VERIFIED_PR_MERGE_SHA ]; then
-              echo "The merged commit SHA is not the same as the verified one! Security issue detected, abort the workflow!";
-              exit -1;
-            fi
-
-      - name: Get models to test
-        env:
-          PR_COMMENT: ${{ github.event.comment.body }}
-        run: |
-          python -m pip install GitPython
-          python utils/pr_slow_ci_models.py --message "$PR_COMMENT" | tee output.txt
-          echo "models=$(tail -n 1 output.txt)" >> $GITHUB_ENV
-          python utils/pr_slow_ci_models.py --message "$PR_COMMENT" --quantization | tee output2.txt
-          echo "quantizations=$(tail -n 1 output2.txt)" >> $GITHUB_ENV
-
-      - name: Show models to test
-        id: models_to_run
-        run: |
-          echo "${{ env.models }}"
-          echo "models=${{ env.models }}" >> $GITHUB_ENV
-          echo "models=${{ env.models }}" >> $GITHUB_OUTPUT
-          echo "${{ env.quantizations }}"
-          echo "quantizations=${{ env.quantizations }}" >> $GITHUB_OUTPUT
-
-  reply_to_comment:
-    name: Reply to the comment
-    if: ${{ needs.get-tests.outputs.models != '[]'  || needs.get-tests.outputs.quantizations != '[]' }}
-    needs: [get-pr-number, get-tests]
-    permissions:
-      pull-requests: write
-    runs-on: ubuntu-22.04
-    steps:
-      - name: Reply to the comment
-        env:
-          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-          MODELS: ${{ needs.get-tests.outputs.models }}
-          BODY: "\n\nmodels: ${{ needs.get-tests.outputs.models }}\nquantizations: ${{ needs.get-tests.outputs.quantizations }}"
-        run: |
-          gh api \
-            --method POST \
-            -H "Accept: application/vnd.github+json" \
-            -H "X-GitHub-Api-Version: 2022-11-28" \
-            repos/${{ github.repository }}/issues/${{ needs.get-pr-number.outputs.PR_NUMBER }}/comments \
-            -f "body=This comment contains run-slow, running the specified jobs: ${{ env.BODY }} ..."
-
-  create_run:
-    name: Create run
-    if: ${{ needs.get-tests.outputs.models != '[]' || needs.get-tests.outputs.quantizations != '[]' }}
-    needs: [get-sha, get-tests, reply_to_comment]
-    permissions:
-      statuses: write
-    runs-on: ubuntu-22.04
-    steps:
-      - name: Create Run
-        id: create_run
-        env:
-          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-          # Create a commit status (pending) for a run of this workflow. The status has to be updated later in `update_run_status`.
-          # See https://docs.github.com/en/rest/commits/statuses?apiVersion=2022-11-28#create-a-commit-status
-          GITHUB_RUN_URL: https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}
-        run: |
-          gh api \
-            --method POST \
-            -H "Accept: application/vnd.github+json" \
-            -H "X-GitHub-Api-Version: 2022-11-28" \
-            repos/${{ github.repository }}/statuses/${{ needs.get-sha.outputs.PR_HEAD_SHA }} \
-            -f "target_url=$GITHUB_RUN_URL" -f "state=pending" -f "description=Slow CI job" -f "context=pytest/custom-tests"
-
-  run_models_gpu:
-    name: Run all tests for the model
-    if: ${{ needs.get-tests.outputs.models != '[]' }}
-    needs: [get-pr-number, get-sha, get-tests, create_run]
-    strategy:
-      fail-fast: false
-      matrix:
-        folders: ${{ fromJson(needs.get-tests.outputs.models) }}
-        machine_type: [aws-g5-4xlarge-cache, aws-g5-12xlarge-cache]
-    runs-on:
-       group: '${{ matrix.machine_type }}'
-    container:
-      image: huggingface/transformers-all-latest-gpu
-      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
-    steps:
-      - name: Echo input and matrix info
-        shell: bash
-        run: |
-          echo "${{ matrix.folders }}"
-
-      - name: Echo folder ${{ matrix.folders }}
-        shell: bash
-        # For folders like `models/bert`, set an env. var. (`matrix_folders`) to `models_bert`, which will be used to
-        # set the artifact folder names (because the character `/` is not allowed).
-        run: |
-          echo "${{ matrix.folders }}"
-          matrix_folders=${{ matrix.folders }}
-          matrix_folders=${matrix_folders/'models/'/'models_'}
-          echo "$matrix_folders"
-          echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV
-
-      - name: Checkout to PR merge commit
-        working-directory: /transformers
-        run: |
-          git fetch origin refs/pull/${{ needs.get-pr-number.outputs.PR_NUMBER }}/merge:refs/remotes/pull/${{ needs.get-pr-number.outputs.PR_NUMBER }}/merge
-          git checkout refs/remotes/pull/${{ needs.get-pr-number.outputs.PR_NUMBER }}/merge
-          git log -1 --format=%H
-
-      - name: Verify merge commit SHA
-        env:
-          VERIFIED_PR_MERGE_SHA: ${{ needs.get-sha.outputs.PR_MERGE_SHA }}
-        working-directory: /transformers
-        run: |
-          PR_MERGE_SHA=$(git log -1 --format=%H)
-          if [ $PR_MERGE_SHA != $VERIFIED_PR_MERGE_SHA ]; then
-            echo "The merged commit SHA is not the same as the verified one! Security issue detected, abort the workflow!";
-            exit -1;
-          fi
-
-      - name: Reinstall transformers in edit mode (remove the one installed during docker image build)
-        working-directory: /transformers
-        run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .
-
-      - name: NVIDIA-SMI
-        run: |
-          nvidia-smi
-
-      - name: Set `machine_type` for report and artifact names
-        working-directory: /transformers
-        shell: bash
-        run: |
-          echo "${{ matrix.machine_type }}"
-          if [ "${{ matrix.machine_type }}" = "aws-g5-4xlarge-cache" ]; then
-            machine_type=single-gpu
-          elif [ "${{ matrix.machine_type }}" = "aws-g5-12xlarge-cache" ]; then
-            machine_type=multi-gpu
-          else
-            machine_type=${{ matrix.machine_type }}
-          fi
-          echo "$machine_type"
-          echo "machine_type=$machine_type" >> $GITHUB_ENV
-
-      - name: Environment
-        working-directory: /transformers
-        run: |
-          python3 utils/print_env.py
-
-      - name: Show installed libraries and their versions
-        working-directory: /transformers
-        run: pip freeze
-
-      - name: Run all tests on GPU
-        working-directory: /transformers
-        run: |
-          export CUDA_VISIBLE_DEVICES="$(python3 utils/set_cuda_devices_for_ci.py --test_folder ${{ matrix.folders }})"
-          echo $CUDA_VISIBLE_DEVICES
-          python3 -m pytest -v -rsfE --make-reports=${{ env.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports tests/${{ matrix.folders }}
-
-      - name: Failure short reports
-        if: ${{ failure() }}
-        continue-on-error: true
-        run: cat /transformers/reports/${{ env.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports/failures_short.txt
-
-      - name: Make sure report directory exists
-        shell: bash
-        run: |
-          mkdir -p /transformers/reports/${{ env.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports
-          echo "hello" > /transformers/reports/${{ env.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports/hello.txt
-          echo "${{ env.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports"
-
-      - name: "Test suite reports artifacts: ${{ env.machine_type }}_run_models_gpu_${{ env.matrix_folders }}_test_reports"
-        if: ${{ always() }}
-        uses: actions/upload-artifact@v4
-        with:
-          name: ${{ env.machine_type }}_run_models_gpu_${{ env.matrix_folders }}_test_reports
-          path: /transformers/reports/${{ env.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports
-
-  run_quantization_torch_gpu:
-    name: Run all tests for a quantization
-    if: ${{ needs.get-tests.outputs.quantizations != '[]' }}
-    needs: [get-pr-number, get-sha, get-tests, create_run]
-    strategy:
-      fail-fast: false
-      matrix:
-        folders: ${{ fromJson(needs.get-tests.outputs.quantizations) }}
-        machine_type: [aws-g5-4xlarge-cache, aws-g5-12xlarge-cache]
-    runs-on:
-      group: '${{ matrix.machine_type }}'
-    container:
-      image: huggingface/transformers-quantization-latest-gpu
-      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
-    steps:
-      - name: Echo folder ${{ matrix.folders }}
-        shell: bash
-        run: |
-          echo "${{ matrix.folders }}"
-          matrix_folders=${{ matrix.folders }}
-          matrix_folders=${matrix_folders/'quantization/'/'quantization_'}
-          echo "$matrix_folders"
-          echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV
-
-      - name: Checkout to PR merge commit
-        working-directory: /transformers
-        run: |
-          git fetch origin refs/pull/${{ needs.get-pr-number.outputs.PR_NUMBER }}/merge:refs/remotes/pull/${{ needs.get-pr-number.outputs.PR_NUMBER }}/merge
-          git checkout refs/remotes/pull/${{ needs.get-pr-number.outputs.PR_NUMBER }}/merge
-          git log -1 --format=%H
-
-      - name: Verify merge commit SHA
-        env:
-          VERIFIED_PR_MERGE_SHA: ${{ needs.get-sha.outputs.PR_MERGE_SHA }}
-        working-directory: /transformers
-        run: |
-          PR_MERGE_SHA=$(git log -1 --format=%H)
-          if [ $PR_MERGE_SHA != $VERIFIED_PR_MERGE_SHA ]; then
-            echo "The merged commit SHA is not the same as the verified one! Security issue detected, abort the workflow!";
-            exit -1;
-          fi
-
-      - name: Reinstall transformers in edit mode (remove the one installed during docker image build)
-        working-directory: /transformers
-        run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .
-      - name: NVIDIA-SMI
-        run: |
-          nvidia-smi
-
-      - name: Set `machine_type` for report and artifact names
-        working-directory: /transformers
-        shell: bash
-        run: |
-          echo "${{ matrix.machine_type }}"
-          if [ "${{ matrix.machine_type }}" = "aws-g5-4xlarge-cache" ]; then
-            machine_type=single-gpu
-          elif [ "${{ matrix.machine_type }}" = "aws-g5-12xlarge-cache" ]; then
-            machine_type=multi-gpu
-          else
-            machine_type=${{ matrix.machine_type }}
-          fi
-          echo "$machine_type"
-          echo "machine_type=$machine_type" >> $GITHUB_ENV
-
-      - name: Environment
-        working-directory: /transformers
-        run: |
-          python3 utils/print_env.py
-
-      - name: Show installed libraries and their versions
-        working-directory: /transformers
-        run: pip freeze
-
-      - name: Run quantization tests on GPU
-        working-directory: /transformers
-        run: |
-          python3 -m pytest -v --make-reports=${{ env.machine_type }}_run_quantization_torch_gpu_${{ matrix.folders }}_test_reports tests/${{ matrix.folders }}
-
-      - name: Failure short reports
-        if: ${{ failure() }}
-        continue-on-error: true
-        run: cat /transformers/reports/${{ env.machine_type }}_run_quantization_torch_gpu_${{ matrix.folders }}_test_reports/failures_short.txt
-
-      - name: Make sure report directory exists
-        shell: bash
-        run: |
-          mkdir -p /transformers/reports/${{ env.machine_type }}_run_quantization_gpu_${{ matrix.folders }}_test_reports
-          echo "hello" > /transformers/reports/${{ env.machine_type }}_run_quantization_gpu_${{ matrix.folders }}_test_reports/hello.txt
-          echo "${{ env.machine_type }}_run_quantization_gpu_${{ matrix.folders }}_test_reports"
-
-      - name: "Test suite reports artifacts: ${{ env.machine_type }}_run_quantization_torch_gpu_${{ env.matrix_folders }}_test_reports"
-        if: ${{ always() }}
-        uses: actions/upload-artifact@v4
-        with:
-          name: ${{ env.machine_type }}_run_quantization_torch_gpu_${{ env.matrix_folders }}_test_reports
-          path: /transformers/reports/${{ env.machine_type }}_run_quantization_torch_gpu_${{ matrix.folders }}_test_reports
-
-  update_run_status:
-    name: Update Check Run Status
-    needs: [get-sha, create_run, run_models_gpu, run_quantization_torch_gpu]
-    permissions:
-      statuses: write
-    if: ${{ always() && needs.create_run.result == 'success' }}
-    runs-on: ubuntu-22.04
-    env:
-      GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-      GITHUB_RUN_URL: https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}
-      STATUS_OK: ${{ contains(fromJSON('["skipped", "success"]'), needs.run_models_gpu.result) && contains(fromJSON('["skipped", "success"]'), needs.run_quantization_torch_gpu.result) }}
-    steps:
-      - name: Get `run_models_gpu` job status
-        run: |
-          echo "${{ needs.run_models_gpu.result }}"
-          echo "${{ needs.run_quantization_torch_gpu.result }}"
-          echo $STATUS_OK
-          if [ "$STATUS_OK" = "true" ]; then
-            echo "STATUS=success" >> $GITHUB_ENV
-          else
-            echo "STATUS=failure" >> $GITHUB_ENV
-          fi
-
-      - name: Update PR commit statuses
-        run: |
-          echo "${{ needs.run_models_gpu.result }}"
-          echo "${{ env.STATUS }}"
-          gh api \
-            --method POST \
-            -H "Accept: application/vnd.github+json" \
-            -H "X-GitHub-Api-Version: 2022-11-28" \
-            repos/${{ github.repository }}/statuses/${{ needs.get-sha.outputs.PR_HEAD_SHA }} \
-            -f "target_url=$GITHUB_RUN_URL" -f "state=${{ env.STATUS }}" -f "description=Slow CI job" -f "context=pytest/custom-tests"
--- a/.github/workflows/self-nightly-caller.yml
+++ b/.github/workflows/self-nightly-caller.yml
@ -1,56 +1,43 @@
-name: Nvidia CI with nightly torch
+name: Self-hosted runner (nightly-ci)
+

 on:
  repository_dispatch:
-  # triggered when the daily scheduled Nvidia CI is completed.
-  # This way, we can compare the results more easily.
-  workflow_run:
-    workflows: ["Nvidia CI"]
-    branches: ["main"]
-    types: [completed]
+  schedule:
+    - cron: "17 2 * * *"
  push:
    branches:
-      - run_ci_with_nightly_torch*
-
-# Used for `push` to easily modify the target workflow runs to compare against
-env:
-    prev_workflow_run_id: ""
-    other_workflow_run_id: ""
-
+      - run_nightly_ci*

 jobs:
-  build_nightly_torch_ci_images:
-    name: Build CI Docker Images with nightly torch
+  # Builds the nightly CI Docker images through the reusable `build-nightly-ci-docker-images.yml` workflow.
+  # The condition only matches scheduled runs and pushes to branches whose name starts with `run_nightly_ci`.
+  # NOTE(review): `repository_dispatch` events do not match it, so this job (and presumably the jobs
+  # that `needs` it) would be skipped for them - confirm this is intended.
+  build_nightly_ci_images:
+    name: Build Nightly CI Docker Images
+    if: (github.event_name == 'schedule') || ((github.event_name == 'push') && startsWith(github.ref_name, 'run_nightly_ci'))
    uses: ./.github/workflows/build-nightly-ci-docker-images.yml
-    with:
-      job: latest-with-torch-nightly-docker
    secrets: inherit

-  setup:
-    name: Setup
-    runs-on: ubuntu-22.04
-    steps:
-      - name: Setup
-        run: |
-          mkdir "setup_values"
-          echo "${{ inputs.prev_workflow_run_id || env.prev_workflow_run_id }}" > "setup_values/prev_workflow_run_id.txt"
-          echo "${{ inputs.other_workflow_run_id || env.other_workflow_run_id }}" > "setup_values/other_workflow_run_id.txt"
-
-      - name: Upload artifacts
-        uses: actions/upload-artifact@v4
-        with:
-          name: setup_values
-          path: setup_values
-
  model-ci:
+    # Runs the `run_models_gpu` job of the reusable self-scheduled workflow on the torch-nightly image,
+    # once the nightly CI images have been built.
    name: Model CI
-    needs: build_nightly_torch_ci_images
+    needs: [build_nightly_ci_images]
    uses: ./.github/workflows/self-scheduled.yml
    with:
      job: run_models_gpu
      slack_report_channel: "#transformers-ci-past-future"
+      runner: ci
      docker: huggingface/transformers-all-latest-torch-nightly-gpu
      ci_event: Nightly CI
-      report_repo_id: hf-internal-testing/transformers_daily_ci_with_torch_nightly
-      commit_sha: ${{ github.event.workflow_run.head_sha || github.sha }}
+    secrets: inherit
+
+  # Runs the `run_torch_cuda_extensions_gpu` job of the reusable self-scheduled workflow on the DeepSpeed image,
+  # once the nightly CI images have been built.
+  deepspeed-ci:
+    name: DeepSpeed CI
+    needs: [build_nightly_ci_images]
+    uses: ./.github/workflows/self-scheduled.yml
+    with:
+      job: run_torch_cuda_extensions_gpu
+      slack_report_channel: "#transformers-ci-past-future"
+      runner: ci
+      # test deepspeed nightly build with the latest release torch
+      docker: huggingface/transformers-pytorch-deepspeed-latest-gpu
+      ci_event: Nightly CI
+      working-directory-prefix: /workspace
    secrets: inherit
--- a/.github/workflows/self-nightly-past-ci-caller.yml
+++ b/.github/workflows/self-nightly-past-ci-caller.yml
@ -21,6 +21,39 @@ jobs:
          echo "$(python3 -c 'print(int(${{ github.run_number }}) % 10)')"
          echo "run_number=$(python3 -c 'print(int(${{ github.run_number }}) % 10)')" >> $GITHUB_OUTPUT

+  run_past_ci_pytorch_1-13:  # Past CI against PyTorch 1.13; gated on get_number's run_number output being 0.
+    name: PyTorch 1.13
+    needs: get_number
+    if: needs.get_number.outputs.run_number == 0 && (cancelled() != true) && ((github.event_name == 'schedule') || ((github.event_name == 'push') && startsWith(github.ref_name, 'run_past_ci')))
+    uses: ./.github/workflows/self-past-caller.yml
+    with:
+      framework: pytorch
+      version: "1.13"  # quoted so the version is passed as a string
+      sha: ${{ github.sha }}
+    secrets: inherit
+
+  run_past_ci_pytorch_1-12:  # Past CI against PyTorch 1.12; gated on get_number's run_number output being 1.
+    name: PyTorch 1.12
+    needs: get_number
+    if: needs.get_number.outputs.run_number == 1 && (cancelled() != true) && ((github.event_name == 'schedule') || ((github.event_name == 'push') && startsWith(github.ref_name, 'run_past_ci')))
+    uses: ./.github/workflows/self-past-caller.yml
+    with:
+      framework: pytorch
+      version: "1.12"  # quoted so the version is passed as a string
+      sha: ${{ github.sha }}
+    secrets: inherit
+
+  run_past_ci_pytorch_1-11:  # Past CI against PyTorch 1.11; gated on get_number's run_number output being 2.
+    name: PyTorch 1.11
+    needs: get_number
+    if: needs.get_number.outputs.run_number == 2 && (cancelled() != true) && ((github.event_name == 'schedule') || ((github.event_name == 'push') && startsWith(github.ref_name, 'run_past_ci')))
+    uses: ./.github/workflows/self-past-caller.yml
+    with:
+      framework: pytorch
+      version: "1.11"  # quoted so the version is passed as a string
+      sha: ${{ github.sha }}
+    secrets: inherit
+
  run_past_ci_tensorflow_2-11:
    name: TensorFlow 2.11
    needs: get_number
--- a/.github/workflows/self-pr-slow-ci.yml
+++ b/.github/workflows/self-pr-slow-ci.yml
@ -0,0 +1,151 @@
+name: PR slow CI
+
+on:
+  pull_request:
+    paths:
+      - "src/transformers/models/*/modeling_*.py"
+      - "tests/**/test_*.py"
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
+  cancel-in-progress: true
+
+env:
+  HF_HOME: "/mnt/cache"
+  TRANSFORMERS_IS_CI: "yes"
+  OMP_NUM_THREADS: "8"
+  MKL_NUM_THREADS: "8"
+  RUN_SLOW: "yes"
+  # Gated repositories are only accessible after agreeing, on the Hub repo page, to share information.
+  # The token below was created under the bot account `hf-transformers-bot`.
+  HF_HUB_READ_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }}
+  SIGOPT_API_TOKEN: ${{ secrets.SIGOPT_API_TOKEN }}
+  TF_FORCE_GPU_ALLOW_GROWTH: "true"
+  RUN_PT_TF_CROSS_TESTS: "1"
+  CUDA_VISIBLE_DEVICES: "0,1"
+
+jobs:
+  find_models_to_run:  # Works out which model test folders the slow CI should run for this PR.
+      runs-on: ubuntu-22.04
+      name: Find models to run slow tests
+      # Triggered only if the required label `run-slow` is added
+      if: ${{ contains(github.event.pull_request.labels.*.name, 'run-slow') }}
+      outputs:
+        models: ${{ steps.models_to_run.outputs.models }}  # last line printed by utils/pr_slow_ci_models.py
+      steps:
+        - uses: actions/checkout@v4
+          with:
+            fetch-depth: "0"
+            ref: ${{ github.event.pull_request.head.sha }}
+
+        - name: Get commit message  # stores the subject line of the checked-out PR head commit in the job env
+          run: |
+            echo "commit_message=$(git show -s --format=%s)" >> $GITHUB_ENV
+
+        - name: Get models to run slow tests  # the commit subject is untrusted: read it as a shell variable, never via expression interpolation
+          run: |
+            echo "$commit_message"
+            python -m pip install GitPython
+            python utils/pr_slow_ci_models.py --commit_message "$commit_message" | tee output.txt
+            echo "models=$(tail -n 1 output.txt)" >> $GITHUB_ENV
+
+        - name: Models to run slow tests  # republishes the env value as the step output consumed by `outputs.models`
+          id: models_to_run
+          run: |
+            echo "$models"
+            echo "models=$models" >> $GITHUB_OUTPUT
+
+  run_models_gpu:  # One matrix job per (model folder, machine type) pair.
+      name: Run all tests for the model
+      # Runs only if `find_models_to_run` ran (i.e. the label `run-slow` was added) and returned a non-empty list of
+      # models (either from a new model PR or from a commit message)
+      if: ${{ needs.find_models_to_run.outputs.models != '[]' }}
+      needs: find_models_to_run
+      strategy:
+        fail-fast: false
+        matrix:
+          folders: ${{ fromJson(needs.find_models_to_run.outputs.models) }}
+          machine_type: [aws-g4dn-2xlarge-cache, aws-g4dn-12xlarge-cache]  # mapped to single-gpu / multi-gpu in a later step
+      runs-on:
+        group: '${{ matrix.machine_type }}'
+      container:
+        image: huggingface/transformers-all-latest-gpu
+        options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+      steps:
+      - name: Echo input and matrix info
+        shell: bash
+        run: |
+          echo "${{ matrix.folders }}"
+
+      - name: Echo folder ${{ matrix.folders }}
+        shell: bash
+        # For folders like `models/bert`, set an env. var. (`matrix_folders`) to `models_bert`, which will be used to
+        # set the artifact folder names (because the character `/` is not allowed).
+        run: |
+          echo "${{ matrix.folders }}"
+          matrix_folders=${{ matrix.folders }}
+          matrix_folders=${matrix_folders/'models/'/'models_'}
+          echo "$matrix_folders"
+          echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV
+
+      - name: Update clone  # fetches the PR head into a local branch and checks it out in /transformers
+        working-directory: /transformers
+        run: git fetch && git fetch origin pull/${{ github.event.pull_request.number }}/head:pull/${{ github.event.pull_request.number }}/merge && git checkout pull/${{ github.event.pull_request.number }}/merge
+
+      - name: Reinstall transformers in edit mode (remove the one installed during docker image build)
+        working-directory: /transformers
+        run: python3 -m pip uninstall -y transformers && python3 -m pip install -e . && python3 -m pip install --upgrade torch torchaudio torchvision
+
+      - name: NVIDIA-SMI
+        run: |
+          nvidia-smi
+
+      - name: Set `machine_type` for report and artifact names  # exports `machine_type` through GITHUB_ENV for the steps below
+        working-directory: /transformers
+        shell: bash
+        run: |
+          echo "${{ matrix.machine_type }}"
+          if [ "${{ matrix.machine_type }}" = "aws-g4dn-2xlarge-cache" ]; then
+            machine_type=single-gpu
+          elif [ "${{ matrix.machine_type }}" = "aws-g4dn-12xlarge-cache" ]; then
+            machine_type=multi-gpu
+          else
+            machine_type=${{ matrix.machine_type }}
+          fi
+          echo "$machine_type"
+          echo "machine_type=$machine_type" >> $GITHUB_ENV    
+
+      - name: Environment
+        working-directory: /transformers
+        run: |
+          python3 utils/print_env.py
+
+      - name: Show installed libraries and their versions
+        working-directory: /transformers
+        run: pip freeze
+
+      - name: Run all tests on GPU  # CUDA_VISIBLE_DEVICES is overridden here with the output of utils/set_cuda_devices_for_ci.py
+        working-directory: /transformers
+        run: |
+          export CUDA_VISIBLE_DEVICES="$(python3 utils/set_cuda_devices_for_ci.py --test_folder ${{ matrix.folders }})"
+          echo $CUDA_VISIBLE_DEVICES
+          python3 -m pytest -v -rsfE --make-reports=${{ env.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports tests/${{ matrix.folders }}
+
+      - name: Failure short reports  # only runs after a failed step; its own failure does not fail the job
+        if: ${{ failure() }}
+        continue-on-error: true
+        run: cat /transformers/reports/${{ env.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports/failures_short.txt
+
+      - name: Make sure report directory exists  # also writes a placeholder hello.txt into the report directory
+        shell: bash
+        run: |
+          mkdir -p /transformers/reports/${{ env.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports
+          echo "hello" > /transformers/reports/${{ env.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports/hello.txt
+          echo "${{ env.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports"
+
+      - name: "Test suite reports artifacts: ${{ env.machine_type }}_run_models_gpu_${{ env.matrix_folders }}_test_reports"
+        if: ${{ always() }}
+        uses: actions/upload-artifact@v4
+        with:
+          name: ${{ env.machine_type }}_run_models_gpu_${{ env.matrix_folders }}_test_reports  # artifact name uses the `/`-free `matrix_folders`
+          path: /transformers/reports/${{ env.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports
--- a/.github/workflows/self-push-amd-mi210-caller.yml
+++ b/.github/workflows/self-push-amd-mi210-caller.yml
@ -1,25 +1,25 @@
-name: Self-hosted runner (AMD mi210 CI caller)
-
-on:
-  #workflow_run:
-  #  workflows: ["Self-hosted runner (push-caller)"]
-  #  branches: ["main"]
-  #  types: [completed]
-  push:
-    branches:
-      - run_amd_push_ci_caller*
-    paths:
-      - "src/**"
-      - "tests/**"
-      - ".github/**"
-      - "templates/**"
-      - "utils/**"
-
-jobs:
-  run_amd_ci:
-    name: AMD mi210
-    if: (cancelled() != true) && ((github.event_name == 'workflow_run') || ((github.event_name == 'push') && startsWith(github.ref_name, 'run_amd_push_ci_caller')))
-    uses: ./.github/workflows/self-push-amd.yml
-    with:
-      gpu_flavor: mi210
-    secrets: inherit
+name: Self-hosted runner (AMD mi210 CI caller)
+
+on:
+  workflow_run:
+    workflows: ["Self-hosted runner (push-caller)"]
+    branches: ["main"]
+    types: [completed]
+  push:
+    branches:
+      - run_amd_push_ci_caller*
+    paths:
+      - "src/**"
+      - "tests/**"
+      - ".github/**"
+      - "templates/**"
+      - "utils/**"
+
+jobs:
+  run_amd_ci:
+    name: AMD mi210
+    if: (cancelled() != true) && ((github.event_name == 'workflow_run') || ((github.event_name == 'push') && startsWith(github.ref_name, 'run_amd_push_ci_caller')))
+    uses: ./.github/workflows/self-push-amd.yml
+    with:
+      gpu_flavor: mi210
+    secrets: inherit
--- a/.github/workflows/self-push-amd-mi250-caller.yml
+++ b/.github/workflows/self-push-amd-mi250-caller.yml
@ -1,25 +1,25 @@
-name: Self-hosted runner (AMD mi250 CI caller)
-
-on:
-  #workflow_run:
-  #  workflows: ["Self-hosted runner (push-caller)"]
-  #  branches: ["main"]
-  #  types: [completed]
-  push:
-    branches:
-      - run_amd_push_ci_caller*
-    paths:
-      - "src/**"
-      - "tests/**"
-      - ".github/**"
-      - "templates/**"
-      - "utils/**"
-
-jobs:
-  run_amd_ci:
-    name: AMD mi250
-    if: (cancelled() != true) && ((github.event_name == 'workflow_run') || ((github.event_name == 'push') && startsWith(github.ref_name, 'run_amd_push_ci_caller')))
-    uses: ./.github/workflows/self-push-amd.yml
-    with:
-      gpu_flavor: mi250
-    secrets: inherit
+name: Self-hosted runner (AMD mi250 CI caller)
+
+on:
+  workflow_run:
+    workflows: ["Self-hosted runner (push-caller)"]
+    branches: ["main"]
+    types: [completed]
+  push:
+    branches:
+      - run_amd_push_ci_caller*
+    paths:
+      - "src/**"
+      - "tests/**"
+      - ".github/**"
+      - "templates/**"
+      - "utils/**"
+
+jobs:
+  run_amd_ci:
+    name: AMD mi250
+    if: (cancelled() != true) && ((github.event_name == 'workflow_run') || ((github.event_name == 'push') && startsWith(github.ref_name, 'run_amd_push_ci_caller')))
+    uses: ./.github/workflows/self-push-amd.yml
+    with:
+      gpu_flavor: mi250
+    secrets: inherit
--- a/.github/workflows/self-push-amd-mi300-caller.yml
+++ b/.github/workflows/self-push-amd-mi300-caller.yml
@ -0,0 +1,25 @@
+name: Self-hosted runner (AMD mi300 CI caller)
+
+on:
+  workflow_run:
+    workflows: ["Self-hosted runner (push-caller)"]
+    branches: ["main"]
+    types: [completed]
+  push:
+    branches:
+      - run_amd_push_ci_caller*
+    paths:
+      - "src/**"
+      - "tests/**"
+      - ".github/**"
+      - "templates/**"
+      - "utils/**"
+
+jobs:
+  run_amd_ci:  # Calls the reusable self-push-amd.yml workflow with gpu_flavor set to mi300.
+    name: AMD mi300  # NOTE(review): the `if` below also accepts `mi300-ci*` refs, but on.push.branches only lists `run_amd_push_ci_caller*` - confirm that clause is reachable
+    if: (cancelled() != true) && ((github.event_name == 'workflow_run') || ((github.event_name == 'push') && (startsWith(github.ref_name, 'run_amd_push_ci_caller') || startsWith(github.ref_name, 'mi300-ci'))))
+    uses: ./.github/workflows/self-push-amd.yml
+    with:
+      gpu_flavor: mi300
+    secrets: inherit
--- a/.github/workflows/self-push-amd.yml
+++ b/.github/workflows/self-push-amd.yml
@ -14,6 +14,7 @@ env:
  MKL_NUM_THREADS: 8
  PYTEST_TIMEOUT: 60
  TF_FORCE_GPU_ALLOW_GROWTH: true
+  RUN_PT_TF_CROSS_TESTS: 1
  HF_HUB_READ_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }}

 jobs:
--- a/.github/workflows/self-push-caller.yml
+++ b/.github/workflows/self-push-caller.yml
@ -25,7 +25,7 @@ jobs:
        
        - name: Get changed files
          id: changed-files
-          uses: tj-actions/changed-files@1c8e6069583811afb28f97afeaf8e7da80c6be5c
+          uses: tj-actions/changed-files@v41
        
        - name: Was setup changed 
          id: was_changed
@ -51,4 +51,4 @@ jobs:
    needs: build-docker-containers
    steps:
      - name: Trigger push CI via workflow_run
-        run: echo "Trigger push CI via workflow_run"
+        run: echo "Trigger push CI via workflow_run"
--- a/.github/workflows/self-push.yml
+++ b/.github/workflows/self-push.yml
@ -24,6 +24,7 @@ env:
  MKL_NUM_THREADS: 8
  PYTEST_TIMEOUT: 60
  TF_FORCE_GPU_ALLOW_GROWTH: true
+  RUN_PT_TF_CROSS_TESTS: 1
  CUDA_VISIBLE_DEVICES: 0,1

 jobs:
@ -31,12 +32,12 @@ jobs:
    name: Setup
    strategy:
      matrix:
-        machine_type: [aws-g5-4xlarge-cache, aws-g5-12xlarge-cache]
+        machine_type: [aws-g4dn-2xlarge-cache, aws-g4dn-12xlarge-cache]
    runs-on:
      group: '${{ matrix.machine_type }}'
    container:
      image: huggingface/transformers-all-latest-gpu-push-ci
-      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
    outputs:
      matrix: ${{ steps.set-matrix.outputs.matrix }}
      test_map: ${{ steps.set-matrix.outputs.test_map }}
@ -131,12 +132,12 @@ jobs:
      fail-fast: false
      matrix:
        folders: ${{ fromJson(needs.setup.outputs.matrix) }}
-        machine_type: [aws-g5-4xlarge-cache]
+        machine_type: [aws-g4dn-2xlarge-cache]
    runs-on:
      group: '${{ matrix.machine_type }}'
    container:
      image: huggingface/transformers-all-latest-gpu-push-ci
-      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
    env:
      # For the meaning of these environment variables, see the job `Setup`
      CI_BRANCH_PUSH: ${{ github.event.ref }}
@ -169,9 +170,9 @@ jobs:
        run: |
          echo "${{ matrix.machine_type }}"

-          if [ "${{ matrix.machine_type }}" = "aws-g5-4xlarge-cache" ]; then
+          if [ "${{ matrix.machine_type }}" = "aws-g4dn-2xlarge-cache" ]; then
            machine_type=single-gpu
-          elif [ "${{ matrix.machine_type }}" = "aws-g5-12xlarge-cache" ]; then
+          elif [ "${{ matrix.machine_type }}" = "aws-g4dn-12xlarge-cache" ]; then
            machine_type=multi-gpu
          else
            machine_type=${{ matrix.machine_type }}
@ -244,7 +245,7 @@ jobs:
      fail-fast: false
      matrix:
        folders: ${{ fromJson(needs.setup.outputs.matrix) }}
-        machine_type: [aws-g5-12xlarge-cache]
+        machine_type: [aws-g4dn-12xlarge-cache]
    runs-on:
      group: '${{ matrix.machine_type }}'
    container:
@ -282,9 +283,9 @@ jobs:
        run: |
          echo "${{ matrix.machine_type }}"

-          if [ "${{ matrix.machine_type }}" = "aws-g5-4xlarge-cache" ]; then
+          if [ "${{ matrix.machine_type }}" = "aws-g4dn-2xlarge-cache" ]; then
            machine_type=single-gpu
-          elif [ "${{ matrix.machine_type }}" = "aws-g5-12xlarge-cache" ]; then
+          elif [ "${{ matrix.machine_type }}" = "aws-g4dn-12xlarge-cache" ]; then
            machine_type=multi-gpu
          else
            machine_type=${{ matrix.machine_type }}
@ -292,7 +293,7 @@ jobs:

          echo "$machine_type"
          echo "machine_type=$machine_type" >> $GITHUB_ENV
-
+          
      - name: Update clone using environment variables
        working-directory: /transformers
        run: |
@ -357,12 +358,12 @@ jobs:
    strategy:
      fail-fast: false
      matrix:
-        machine_type: [aws-g5-4xlarge-cache]
+        machine_type: [aws-g4dn-2xlarge-cache]
    runs-on:
      group: '${{ matrix.machine_type }}'
    container:
      image: huggingface/transformers-pytorch-deepspeed-latest-gpu-push-ci
-      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
    env:
      # For the meaning of these environment variables, see the job `Setup`
      CI_BRANCH_PUSH: ${{ github.event.ref }}
@ -395,9 +396,9 @@ jobs:
        run: |
          echo "${{ matrix.machine_type }}"

-          if [ "${{ matrix.machine_type }}" = "aws-g5-4xlarge-cache" ]; then
+          if [ "${{ matrix.machine_type }}" = "aws-g4dn-2xlarge-cache" ]; then
            machine_type=single-gpu
-          elif [ "${{ matrix.machine_type }}" = "aws-g5-12xlarge-cache" ]; then
+          elif [ "${{ matrix.machine_type }}" = "aws-g4dn-12xlarge-cache" ]; then
            machine_type=multi-gpu
          else
            machine_type=${{ matrix.machine_type }}
@ -405,7 +406,7 @@ jobs:

          echo "$machine_type"
          echo "machine_type=$machine_type" >> $GITHUB_ENV
-
+          
      - name: Update clone using environment variables
        working-directory: /workspace/transformers
        run: |
@ -467,7 +468,7 @@ jobs:
    strategy:
      fail-fast: false
      matrix:
-        machine_type: [aws-g5-12xlarge-cache]
+        machine_type: [aws-g4dn-12xlarge-cache]
    runs-on:
      group: '${{ matrix.machine_type }}'
    container:
@ -505,9 +506,9 @@ jobs:
        run: |
          echo "${{ matrix.machine_type }}"

-          if [ "${{ matrix.machine_type }}" = "aws-g5-4xlarge-cache" ]; then
+          if [ "${{ matrix.machine_type }}" = "aws-g4dn-2xlarge-cache" ]; then
            machine_type=single-gpu
-          elif [ "${{ matrix.machine_type }}" = "aws-g5-12xlarge-cache" ]; then
+          elif [ "${{ matrix.machine_type }}" = "aws-g4dn-12xlarge-cache" ]; then
            machine_type=multi-gpu
          else
            machine_type=${{ matrix.machine_type }}
@ -515,7 +516,7 @@ jobs:

          echo "$machine_type"
          echo "machine_type=$machine_type" >> $GITHUB_ENV
-
+          
      - name: Update clone using environment variables
        working-directory: /workspace/transformers
        run: |
@ -647,6 +648,6 @@ jobs:
        # `models/bert` to `models_bert` is required, as the artifact names use `_` instead of `/`.
        run: |
          pip install huggingface_hub
-          pip install slack_sdk
+          pip install slack_sdk 
          pip show slack_sdk
          python utils/notification_service.py "${{ needs.setup.outputs.matrix }}"
--- a/.github/workflows/self-scheduled-amd-mi210-caller.yml
+++ b/.github/workflows/self-scheduled-amd-mi210-caller.yml
@ -0,0 +1,55 @@
+name: Self-hosted runner (AMD mi210 scheduled CI caller)
+
+on:
+  workflow_run:
+    workflows: ["Self-hosted runner (AMD scheduled CI caller)"]
+    branches: ["main"]
+    types: [completed]
+  push:
+    branches:
+      - run_amd_scheduled_ci_caller*
+
+jobs:
+  model-ci:  # Calls the reusable self-scheduled-amd.yml workflow with job `run_models_gpu` and runner `mi210`.
+    name: Model CI
+    uses: ./.github/workflows/self-scheduled-amd.yml
+    with:
+      job: run_models_gpu
+      slack_report_channel: "#transformers-ci-daily-amd"  # quoted: a leading `#` would otherwise start a YAML comment
+      runner: mi210
+      docker: huggingface/transformers-pytorch-amd-gpu
+      ci_event: Scheduled CI (AMD) - mi210
+    secrets: inherit
+
+  torch-pipeline:  # Calls the reusable self-scheduled-amd.yml workflow with job `run_pipelines_torch_gpu` and runner `mi210`.
+    name: Torch pipeline CI
+    uses: ./.github/workflows/self-scheduled-amd.yml
+    with:
+      job: run_pipelines_torch_gpu
+      slack_report_channel: "#transformers-ci-daily-amd"  # quoted: a leading `#` would otherwise start a YAML comment
+      runner: mi210
+      docker: huggingface/transformers-pytorch-amd-gpu
+      ci_event: Scheduled CI (AMD) - mi210
+    secrets: inherit
+
+  example-ci:  # Calls the reusable self-scheduled-amd.yml workflow with job `run_examples_gpu` and runner `mi210`.
+    name: Example CI
+    uses: ./.github/workflows/self-scheduled-amd.yml
+    with:
+      job: run_examples_gpu
+      slack_report_channel: "#transformers-ci-daily-amd"  # quoted: a leading `#` would otherwise start a YAML comment
+      runner: mi210
+      docker: huggingface/transformers-pytorch-amd-gpu
+      ci_event: Scheduled CI (AMD) - mi210
+    secrets: inherit
+
+  deepspeed-ci:  # Calls the reusable self-scheduled-amd.yml workflow with job `run_torch_cuda_extensions_gpu` and runner `mi210`.
+    name: DeepSpeed CI
+    uses: ./.github/workflows/self-scheduled-amd.yml
+    with:
+      job: run_torch_cuda_extensions_gpu
+      slack_report_channel: "#transformers-ci-daily-amd"  # quoted: a leading `#` would otherwise start a YAML comment
+      runner: mi210
+      docker: huggingface/transformers-pytorch-deepspeed-amd-gpu  # the only job here that uses the deepspeed image
+      ci_event: Scheduled CI (AMD) - mi210
+    secrets: inherit
--- a/.github/workflows/self-scheduled-amd-mi250-caller.yml
+++ b/.github/workflows/self-scheduled-amd-mi250-caller.yml
@ -1,59 +1,55 @@
-name: Self-hosted runner (AMD mi250 scheduled CI caller)
-
-on:
-  workflow_run:
-    workflows: ["Self-hosted runner (AMD scheduled CI caller)"]
-    branches: ["main"]
-    types: [completed]
-  push:
-    branches:
-      - run_amd_scheduled_ci_caller*
-
-jobs:
-  model-ci:
-    name: Model CI
-    uses: huggingface/hf-workflows/.github/workflows/transformers_amd_ci_scheduled.yaml@main
-    with:
-      job: run_models_gpu
-      slack_report_channel: "#transformers-ci-daily-amd"
-      runner: mi250
-      docker: huggingface/transformers-pytorch-amd-gpu
-      ci_event: Scheduled CI (AMD) - mi250
-      report_repo_id: optimum-amd/transformers_daily_ci
-    secrets: inherit
-
-  torch-pipeline:
-    name: Torch pipeline CI
-    uses: huggingface/hf-workflows/.github/workflows/transformers_amd_ci_scheduled.yaml@main
-    with:
-      job: run_pipelines_torch_gpu
-      slack_report_channel: "#transformers-ci-daily-amd"
-      runner: mi250
-      docker: huggingface/transformers-pytorch-amd-gpu
-      ci_event: Scheduled CI (AMD) - mi250
-      report_repo_id: optimum-amd/transformers_daily_ci
-    secrets: inherit
-
-  example-ci:
-    name: Example CI
-    uses: huggingface/hf-workflows/.github/workflows/transformers_amd_ci_scheduled.yaml@main
-    with:
-      job: run_examples_gpu
-      slack_report_channel: "#transformers-ci-daily-amd"
-      runner: mi250
-      docker: huggingface/transformers-pytorch-amd-gpu
-      ci_event: Scheduled CI (AMD) - mi250
-      report_repo_id: optimum-amd/transformers_daily_ci
-    secrets: inherit
-
-  deepspeed-ci:
-    name: DeepSpeed CI
-    uses: huggingface/hf-workflows/.github/workflows/transformers_amd_ci_scheduled.yaml@main
-    with:
-      job: run_torch_cuda_extensions_gpu
-      slack_report_channel: "#transformers-ci-daily-amd"
-      runner: mi250
-      docker: huggingface/transformers-pytorch-deepspeed-amd-gpu
-      ci_event: Scheduled CI (AMD) - mi250
-      report_repo_id: optimum-amd/transformers_daily_ci
-    secrets: inherit
+name: Self-hosted runner (AMD mi250 scheduled CI caller)
+
+on:
+  workflow_run:
+    workflows: ["Self-hosted runner (AMD scheduled CI caller)"]
+    branches: ["main"]
+    types: [completed]
+  push:
+    branches:
+      - run_amd_scheduled_ci_caller*
+
+jobs:
+  model-ci:
+    name: Model CI
+    uses: ./.github/workflows/self-scheduled-amd.yml
+    with:
+      job: run_models_gpu
+      slack_report_channel: "#transformers-ci-daily-amd"
+      runner: mi250
+      docker: huggingface/transformers-pytorch-amd-gpu
+      ci_event: Scheduled CI (AMD) - mi250
+    secrets: inherit
+
+  torch-pipeline:
+    name: Torch pipeline CI
+    uses: ./.github/workflows/self-scheduled-amd.yml
+    with:
+      job: run_pipelines_torch_gpu
+      slack_report_channel: "#transformers-ci-daily-amd"
+      runner: mi250
+      docker: huggingface/transformers-pytorch-amd-gpu
+      ci_event: Scheduled CI (AMD) - mi250
+    secrets: inherit
+
+  example-ci:
+    name: Example CI
+    uses: ./.github/workflows/self-scheduled-amd.yml
+    with:
+      job: run_examples_gpu
+      slack_report_channel: "#transformers-ci-daily-amd"
+      runner: mi250
+      docker: huggingface/transformers-pytorch-amd-gpu
+      ci_event: Scheduled CI (AMD) - mi250
+    secrets: inherit
+
+  deepspeed-ci:
+    name: DeepSpeed CI
+    uses: ./.github/workflows/self-scheduled-amd.yml
+    with:
+      job: run_torch_cuda_extensions_gpu
+      slack_report_channel: "#transformers-ci-daily-amd"
+      runner: mi250
+      docker: huggingface/transformers-pytorch-deepspeed-amd-gpu
+      ci_event: Scheduled CI (AMD) - mi250
+    secrets: inherit
--- a/.github/workflows/self-scheduled-amd-mi325-caller.yml
+++ b/.github/workflows/self-scheduled-amd-mi325-caller.yml
@ -1,67 +0,0 @@
-name: Self-hosted runner scale set (AMD mi325 scheduled CI caller)
-
-# Note: For every job in this workflow, the name of the runner scale set is finalized in the runner yaml i.e. huggingface/hf-workflows/.github/workflows/transformers_amd_ci_scheduled_arc_scale_set.yaml
-# For example, 1gpu scale set: amd-mi325-ci-1gpu
-#              2gpu scale set: amd-mi325-ci-2gpu
-
-on:
-  workflow_run:
-    workflows: ["Self-hosted runner (AMD scheduled CI caller)"]
-    branches: ["main"]
-    types: [completed]
-  push:
-    branches:
-      - run_amd_scheduled_ci_caller*
-
-jobs:
-  model-ci:
-    name: Model CI
-    uses: huggingface/hf-workflows/.github/workflows/transformers_amd_ci_scheduled_arc_scale_set.yaml@main
-    with:
-      job: run_models_gpu
-      slack_report_channel: "#amd-hf-ci"
-      runner_group: amd-mi325
-      docker: huggingface/transformers-pytorch-amd-gpu
-      ci_event: Scheduled CI (AMD) - mi325
-      report_repo_id: optimum-amd/transformers_daily_ci
-      env_file: /etc/podinfo/gha-gpu-isolation-settings
-    secrets: inherit
-
-  torch-pipeline:
-    name: Torch pipeline CI
-    uses: huggingface/hf-workflows/.github/workflows/transformers_amd_ci_scheduled_arc_scale_set.yaml@main
-    with:
-      job: run_pipelines_torch_gpu
-      slack_report_channel: "#amd-hf-ci"
-      runner_group: amd-mi325
-      docker: huggingface/transformers-pytorch-amd-gpu
-      ci_event: Scheduled CI (AMD) - mi325
-      report_repo_id: optimum-amd/transformers_daily_ci
-      env_file: /etc/podinfo/gha-gpu-isolation-settings
-    secrets: inherit
-
-  example-ci:
-    name: Example CI
-    uses: huggingface/hf-workflows/.github/workflows/transformers_amd_ci_scheduled_arc_scale_set.yaml@main
-    with:
-      job: run_examples_gpu
-      slack_report_channel: "#amd-hf-ci"
-      runner_group: amd-mi325
-      docker: huggingface/transformers-pytorch-amd-gpu
-      ci_event: Scheduled CI (AMD) - mi325
-      report_repo_id: optimum-amd/transformers_daily_ci
-      env_file: /etc/podinfo/gha-gpu-isolation-settings
-    secrets: inherit
-
-  deepspeed-ci:
-    name: DeepSpeed CI
-    uses: huggingface/hf-workflows/.github/workflows/transformers_amd_ci_scheduled_arc_scale_set.yaml@main
-    with:
-      job: run_torch_cuda_extensions_gpu
-      slack_report_channel: "#amd-hf-ci"
-      runner_group: amd-mi325
-      docker: huggingface/transformers-pytorch-deepspeed-amd-gpu
-      ci_event: Scheduled CI (AMD) - mi325
-      report_repo_id: optimum-amd/transformers_daily_ci
-      env_file: /etc/podinfo/gha-gpu-isolation-settings
-    secrets: inherit
--- a/.github/workflows/self-scheduled-amd-mi355-caller.yml
+++ b/.github/workflows/self-scheduled-amd-mi355-caller.yml
@ -1,63 +0,0 @@
-name: Self-hosted runner scale set (AMD mi355 scheduled CI caller)
-
-# Note: For every job in this workflow, the name of the runner scale set is finalized in the runner yaml i.e. huggingface/hf-workflows/.github/workflows/transformers_amd_ci_scheduled_arc_scale_set.yaml
-# For example, 1gpu : amd-mi355-ci-1gpu
-#              2gpu : amd-mi355-ci-2gpu
- 
-on:
-  workflow_run:
-    workflows: ["Self-hosted runner (AMD scheduled CI caller)"]
-    branches: ["main"]
-    types: [completed]
-  push:
-    branches:
-      - run_amd_scheduled_ci_caller*
-
-jobs:
-  model-ci:
-    name: Model CI
-    uses: huggingface/hf-workflows/.github/workflows/transformers_amd_ci_scheduled_arc_scale_set.yaml@main
-    with:
-      job: run_models_gpu
-      slack_report_channel: "#amd-hf-ci"
-      runner_group: hfc-amd-mi355
-      docker: huggingface/testing-rocm7.0-preview
-      ci_event: Scheduled CI (AMD) - mi355
-      report_repo_id: hf-transformers-bot/transformers-ci-dummy
-    secrets: inherit
-
-  torch-pipeline:
-    name: Torch pipeline CI
-    uses: huggingface/hf-workflows/.github/workflows/transformers_amd_ci_scheduled_arc_scale_set.yaml@main
-    with:
-      job: run_pipelines_torch_gpu
-      slack_report_channel: "#amd-hf-ci"
-      runner_group: hfc-amd-mi355
-      docker: huggingface/testing-rocm7.0-preview
-      ci_event: Scheduled CI (AMD) - mi355
-      report_repo_id: hf-transformers-bot/transformers-ci-dummy
-    secrets: inherit
-
-  example-ci:
-    name: Example CI
-    uses: huggingface/hf-workflows/.github/workflows/transformers_amd_ci_scheduled_arc_scale_set.yaml@main
-    with:
-      job: run_examples_gpu
-      slack_report_channel: "#amd-hf-ci"
-      runner_group: hfc-amd-mi355
-      docker: huggingface/testing-rocm7.0-preview
-      ci_event: Scheduled CI (AMD) - mi355
-      report_repo_id: hf-transformers-bot/transformers-ci-dummy
-    secrets: inherit
-
-  deepspeed-ci:
-    name: DeepSpeed CI
-    uses: huggingface/hf-workflows/.github/workflows/transformers_amd_ci_scheduled_arc_scale_set.yaml@main
-    with:  
-      job: run_torch_cuda_extensions_gpu
-      slack_report_channel: "#amd-hf-ci"
-      runner_group: hfc-amd-mi355
-      docker: huggingface/testing-rocm7.0-preview
-      ci_event: Scheduled CI (AMD) - mi355
-      report_repo_id: hf-transformers-bot/transformers-ci-dummy
-    secrets: inherit
--- a/.github/workflows/self-scheduled-amd.yml
+++ b/.github/workflows/self-scheduled-amd.yml
@ -0,0 +1,349 @@
+name: Self-hosted runner (scheduled-amd)
+
+# Note: For the AMD CI, we rely on a caller workflow and on the `workflow_call` event to trigger the
+# CI, so that it can run on both MI210 and MI250 without using a matrix here, which would push
+# us towards the limit of allowed jobs on GitHub Actions.
+
+on:
+  workflow_call:
+    inputs:
+      job:  # name of the single test job to run; compared against in each job's `if:`
+        required: true
+        type: string
+      slack_report_channel:  # forwarded to slack-report.yml by `send_results`
+        required: true
+        type: string
+      runner:  # extra label added to the `runs-on` list of the self-hosted jobs
+        required: true
+        type: string
+      docker:  # container image for the pipeline, example and deepspeed jobs; also passed to model_jobs_amd.yml
+        required: true
+        type: string
+      ci_event:  # forwarded to slack-report.yml by `send_results`
+        required: true
+        type: string
+
+env:
+  HF_HOME: /mnt/cache  # same path the host cache directory is mounted on in the job containers
+  TRANSFORMERS_IS_CI: yes
+  OMP_NUM_THREADS: 8
+  MKL_NUM_THREADS: 8
+  RUN_SLOW: yes
+  HF_HUB_READ_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }}
+  SIGOPT_API_TOKEN: ${{ secrets.SIGOPT_API_TOKEN }}
+  NUM_SLICES: 2  # number of slices `setup` splits the model tests into
+
+# Important note: only the test job matching `inputs.job` (run_models_gpu, run_pipelines_torch_gpu, run_examples_gpu or run_torch_cuda_extensions_gpu) runs in a given call.
+# The scheduled model tests are deliberately not parallelized (see `max-parallel` in run_models_gpu), to leave available
+# runners for the push CI that is running on the same machine.
+jobs:
+  check_runner_status:
+    name: Check Runner Status
+    runs-on: ubuntu-22.04  # not one of the self-hosted AMD runners used by the jobs below
+    steps:
+      - name: Checkout transformers
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 2
+
+      - name: Check Runner Status  # passes a fixed list of MI210/MI250/MI300 runner names, independent of `inputs.runner`
+        run: python utils/check_self_hosted_runner.py --target_runners hf-amd-mi210-ci-1gpu-1,hf-amd-mi250-ci-1gpu-1,hf-amd-mi300-ci-1gpu-1 --token ${{ secrets.ACCESS_REPO_INFO_TOKEN }}
+
+  check_runners:
+    name: Check Runners
+    needs: check_runner_status
+    strategy:
+      matrix:
+        machine_type: [single-gpu, multi-gpu]
+    runs-on: ['${{ matrix.machine_type }}', self-hosted, amd-gpu, '${{ inputs.runner }}']  # the runner must carry all four labels
+    container:  # fixed image here; the test jobs below use `inputs.docker` instead
+      image: huggingface/transformers-pytorch-amd-gpu
+      options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+    steps:  # diagnostics only: print GPU and ROCR information from inside the container
+      - name: ROCM-SMI
+        run: |
+          rocm-smi
+
+      - name: ROCM-INFO
+        run: |
+          rocminfo  | grep "Agent" -A 14
+
+      - name: Show ROCR environment
+        run: |
+          echo "ROCR: $ROCR_VISIBLE_DEVICES"
+
+  setup:
+    if: contains(fromJSON('["run_models_gpu"]'), inputs.job)  # only runs when the caller asks for the model job
+    name: Setup
+    needs: check_runners
+    strategy:
+      matrix:
+        machine_type: [single-gpu, multi-gpu]
+    runs-on: ['${{ matrix.machine_type }}', self-hosted, amd-gpu, '${{ inputs.runner }}']
+    container:
+      image: huggingface/transformers-pytorch-amd-gpu
+      options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+    outputs:  # read by run_models_gpu (matrix and `folder_slices`) and by send_results
+      folder_slices: ${{ steps.set-matrix.outputs.folder_slices }}
+      slice_ids: ${{ steps.set-matrix.outputs.slice_ids }}
+    steps:
+      - name: Update clone  # the image already contains a clone at /transformers; move it to the triggering commit
+        working-directory: /transformers
+        run: |
+          git fetch && git checkout ${{ github.sha }}
+
+      - name: Cleanup
+        working-directory: /transformers
+        run: |
+          rm -rf tests/__pycache__
+          rm -rf tests/models/__pycache__
+          rm -rf reports
+
+      - name: Show installed libraries and their versions
+        working-directory: /transformers
+        run: pip freeze
+
+      - id: set-matrix  # writes `folder_slices` and `slice_ids` to $GITHUB_OUTPUT, both sized by NUM_SLICES
+        name: Identify models to test
+        working-directory: /transformers/tests
+        run: |
+          echo "folder_slices=$(python3 ../utils/split_model_tests.py --num_splits ${{ env.NUM_SLICES }})" >> $GITHUB_OUTPUT
+          echo "slice_ids=$(python3 -c 'd = list(range(${{ env.NUM_SLICES }})); print(d)')" >> $GITHUB_OUTPUT
+
+      - name: ROCM-SMI
+        run: |
+          rocm-smi
+
+      - name: ROCM-INFO
+        run: |
+          rocminfo  | grep "Agent" -A 14
+
+      - name: Show ROCR environment
+        run: |
+          echo "ROCR: $ROCR_VISIBLE_DEVICES"
+
+      - name: Environment
+        working-directory: /transformers
+        run: |
+          python3 utils/print_env.py
+
+  run_models_gpu:
+    if: ${{ inputs.job == 'run_models_gpu' }}
+    name: Single GPU tests  # NOTE(review): the matrix below also covers multi-gpu, so this name is misleading
+    needs: setup
+    strategy:
+      max-parallel: 1  # For now, not to parallelize. Can change later if it works well.
+      fail-fast: false
+      matrix:
+        machine_type: [single-gpu, multi-gpu]
+        slice_id: ${{ fromJSON(needs.setup.outputs.slice_ids) }}  # one entry per slice id emitted by `setup`
+    uses: ./.github/workflows/model_jobs_amd.yml  # reusable workflow, called once per (machine_type, slice_id) pair
+    with:
+      folder_slices: ${{ needs.setup.outputs.folder_slices }}
+      machine_type: ${{ matrix.machine_type }}
+      slice_id: ${{ matrix.slice_id }}
+      runner: ${{ inputs.runner }}
+      docker: ${{ inputs.docker }}
+    secrets: inherit
+
+  run_pipelines_torch_gpu:
+    if: ${{ inputs.job == 'run_pipelines_torch_gpu' }}  # selected by the caller through `inputs.job`
+    name: PyTorch pipelines
+    needs: check_runners
+    strategy:
+      fail-fast: false  # a failure on one machine type does not cancel the other
+      matrix:
+        machine_type: [single-gpu, multi-gpu]
+    runs-on: ['${{ matrix.machine_type }}', self-hosted, amd-gpu, '${{ inputs.runner }}']
+    container:
+      image: ${{ inputs.docker }}  # image chosen by the caller
+      options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+    steps:
+      - name: Update clone
+        working-directory: /transformers
+        run: git fetch && git checkout ${{ github.sha }}
+
+      - name: Reinstall transformers in edit mode (remove the one installed during docker image build)
+        working-directory: /transformers
+        run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .
+
+      - name: ROCM-SMI
+        run: |
+          rocm-smi
+
+      - name: ROCM-INFO
+        run: |
+          rocminfo  | grep "Agent" -A 14
+
+      - name: Show ROCR environment
+        run: |
+          echo "ROCR: $ROCR_VISIBLE_DEVICES"
+
+      - name: Environment
+        working-directory: /transformers
+        run: |
+          python3 utils/print_env.py
+
+      - name: Show installed libraries and their versions
+        working-directory: /transformers
+        run: pip freeze
+
+      - name: Run all pipeline tests on GPU  # reports go to /transformers/reports/<machine_type>_run_pipelines_torch_gpu_test_reports
+        working-directory: /transformers
+        run: |
+          python3 -m pytest -n 1 -v --dist=loadfile --make-reports=${{ matrix.machine_type }}_run_pipelines_torch_gpu_test_reports tests/pipelines -m "not not_device_test"
+
+      - name: Failure short reports
+        if: ${{ failure() }}  # only when an earlier step failed
+        continue-on-error: true  # an error from `cat` here is ignored
+        run: cat /transformers/reports/${{ matrix.machine_type }}_run_pipelines_torch_gpu_test_reports/failures_short.txt
+
+      - name: "Test suite reports artifacts: ${{ matrix.machine_type }}_run_pipelines_torch_gpu_test_reports"
+        if: ${{ always() }}  # upload the reports whether or not the tests passed
+        uses: actions/upload-artifact@v4
+        with:
+          name: ${{ matrix.machine_type }}_run_pipelines_torch_gpu_test_reports
+          path: /transformers/reports/${{ matrix.machine_type }}_run_pipelines_torch_gpu_test_reports
+
+  run_examples_gpu:
+    if: ${{ inputs.job == 'run_examples_gpu' }}  # selected by the caller through `inputs.job`
+    name: Examples directory
+    needs: check_runners
+    strategy:
+      fail-fast: false
+      matrix:
+        machine_type: [single-gpu]  # examples are only run on single-GPU runners
+    runs-on: ['${{ matrix.machine_type }}', self-hosted, amd-gpu, '${{ inputs.runner }}']
+    container:
+      image: ${{ inputs.docker }}  # image chosen by the caller
+      options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+    steps:
+      - name: Update clone
+        working-directory: /transformers
+        run: git fetch && git checkout ${{ github.sha }}
+
+      - name: Reinstall transformers in edit mode (remove the one installed during docker image build)
+        working-directory: /transformers
+        run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .
+
+      - name: ROCM-SMI
+        run: |
+          rocm-smi
+
+      - name: ROCM-INFO
+        run: |
+          rocminfo  | grep "Agent" -A 14
+
+      - name: Show ROCR environment
+        run: |
+          echo "ROCR: $ROCR_VISIBLE_DEVICES"
+
+      - name: Environment
+        working-directory: /transformers
+        run: |
+          python3 utils/print_env.py
+
+      - name: Show installed libraries and their versions
+        working-directory: /transformers
+        run: pip freeze
+
+      - name: Run examples tests on GPU  # installs the example test requirements first, then runs examples/pytorch
+        working-directory: /transformers
+        run: |
+          pip install -r examples/pytorch/_tests_requirements.txt
+          python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_run_examples_gpu_test_reports examples/pytorch -m "not not_device_test"
+
+      - name: Failure short reports
+        if: ${{ failure() }}  # only when an earlier step failed
+        continue-on-error: true  # an error from `cat` here is ignored
+        run: cat /transformers/reports/${{ matrix.machine_type }}_run_examples_gpu_test_reports/failures_short.txt
+
+      - name: "Test suite reports artifacts: ${{ matrix.machine_type }}_run_examples_gpu_test_reports"
+        if: ${{ always() }}  # upload the reports whether or not the tests passed
+        uses: actions/upload-artifact@v4
+        with:
+          name: ${{ matrix.machine_type }}_run_examples_gpu_test_reports
+          path: /transformers/reports/${{ matrix.machine_type }}_run_examples_gpu_test_reports
+
+  run_torch_cuda_extensions_gpu:
+    if: ${{ inputs.job == 'run_torch_cuda_extensions_gpu' }}  # selected by the caller through `inputs.job`
+    name: Torch ROCm deepspeed tests
+    needs: check_runners
+    strategy:
+      fail-fast: false  # a failure on one machine type does not cancel the other
+      matrix:
+        machine_type: [single-gpu, multi-gpu]
+    runs-on: ['${{ matrix.machine_type }}', self-hosted, amd-gpu, '${{ inputs.runner }}']
+    container:
+      image: ${{ inputs.docker }}  # image chosen by the caller
+      options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+    steps:
+      - name: Update clone
+        working-directory: /transformers
+        run: git fetch && git checkout ${{ github.sha }}
+
+      - name: Reinstall transformers in edit mode (remove the one installed during docker image build)
+        working-directory: /transformers
+        run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .
+
+      - name: ROCM-SMI
+        run: |
+          rocm-smi
+
+      - name: ROCM-INFO
+        run: |
+          rocminfo  | grep "Agent" -A 14
+
+      - name: Show ROCR environment
+        run: |
+          echo "ROCR: $ROCR_VISIBLE_DEVICES"
+
+      - name: Environment
+        working-directory: /transformers
+        run: |
+          python3 utils/print_env.py
+
+      - name: Show installed libraries and their versions
+        working-directory: /transformers
+        run: pip freeze
+
+      - name: Run all tests on GPU  # covers both tests/deepspeed and tests/extended
+        working-directory: /transformers
+        run: python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_run_torch_cuda_extensions_gpu_test_reports tests/deepspeed tests/extended -m "not not_device_test"
+
+      - name: Failure short reports
+        if: ${{ failure() }}  # only when an earlier step failed
+        continue-on-error: true  # an error from `cat` here is ignored
+        run: cat /transformers/reports/${{ matrix.machine_type }}_run_torch_cuda_extensions_gpu_test_reports/failures_short.txt
+
+      - name: "Test suite reports artifacts: ${{ matrix.machine_type }}_run_torch_cuda_extensions_gpu_test_reports"
+        if: ${{ always() }}  # upload the reports whether or not the tests passed
+        uses: actions/upload-artifact@v4
+        with:
+          name: ${{ matrix.machine_type }}_run_torch_cuda_extensions_gpu_test_reports
+          path: /transformers/reports/${{ matrix.machine_type }}_run_torch_cuda_extensions_gpu_test_reports
+
+  send_results:
+    name: Slack Report
+    needs: [
+      check_runner_status,
+      check_runners,
+      setup,
+      run_models_gpu,
+      run_pipelines_torch_gpu,
+      run_examples_gpu,
+      run_torch_cuda_extensions_gpu
+    ]
+    if: ${{ always() }}  # report even when the jobs above failed or were skipped
+    uses: ./.github/workflows/slack-report.yml
+    with:
+      job: ${{ inputs.job }}
+      # This would be `skipped` if `setup` is skipped.
+      setup_status: ${{ needs.setup.result }}
+      slack_report_channel: ${{ inputs.slack_report_channel }}
+      # `folder_slices` would be an empty string if `setup` is skipped; NOTE(review): `setup` in this workflow declares no `quantization_matrix` output, so that value is always empty here.
+      folder_slices: ${{ needs.setup.outputs.folder_slices }}
+      quantization_matrix: ${{ needs.setup.outputs.quantization_matrix }}
+      ci_event: ${{ inputs.ci_event }}
+
+    secrets: inherit
--- a/.github/workflows/self-scheduled-caller.yml
+++ b/.github/workflows/self-scheduled-caller.yml
@ -1,4 +1,5 @@
-name: Nvidia CI
+name: Self-hosted runner (scheduled)
+

 on:
  repository_dispatch:
@ -6,55 +7,18 @@ on:
    - cron: "17 2 * * *"
  push:
    branches:
-      - run_nvidia_ci*
-  workflow_dispatch:
-    inputs:
-      prev_workflow_run_id:
-        description: 'previous workflow run id to compare'
-        type: string
-        required: false
-        default: ""
-      other_workflow_run_id:
-        description: 'other workflow run id to compare'
-        type: string
-        required: false
-        default: ""
-
-
-# Used for `push` to easily modify the target workflow runs to compare against
-env:
-    prev_workflow_run_id: ""
-    other_workflow_run_id: ""
-
+      - run_scheduled_ci*

 jobs:
-  setup:
-    name: Setup
-    runs-on: ubuntu-22.04
-    steps:
-      - name: Setup
-        run: |
-          mkdir "setup_values"
-          echo "${{ inputs.prev_workflow_run_id || env.prev_workflow_run_id }}" > "setup_values/prev_workflow_run_id.txt"
-          echo "${{ inputs.other_workflow_run_id || env.other_workflow_run_id }}" > "setup_values/other_workflow_run_id.txt"
-
-      - name: Upload artifacts
-        uses: actions/upload-artifact@v4
-        with:
-          name: setup_values
-          path: setup_values
-
  model-ci:
    name: Model CI
    uses: ./.github/workflows/self-scheduled.yml
    with:
      job: run_models_gpu
      slack_report_channel: "#transformers-ci-daily-models"
+      runner: daily-ci
      docker: huggingface/transformers-all-latest-gpu
      ci_event: Daily CI
-      runner_type: "a10"
-      report_repo_id: hf-internal-testing/transformers_daily_ci
-      commit_sha: ${{ github.sha }}
    secrets: inherit

  torch-pipeline:
@ -63,10 +27,20 @@ jobs:
    with:
      job: run_pipelines_torch_gpu
      slack_report_channel: "#transformers-ci-daily-pipeline-torch"
+      runner: daily-ci
      docker: huggingface/transformers-pytorch-gpu
      ci_event: Daily CI
-      report_repo_id: hf-internal-testing/transformers_daily_ci
-      commit_sha: ${{ github.sha }}
+    secrets: inherit
+
+  tf-pipeline:
+    name: TF pipeline CI
+    uses: ./.github/workflows/self-scheduled.yml
+    with:
+      job: run_pipelines_tf_gpu
+      slack_report_channel: "#transformers-ci-daily-pipeline-tf"
+      runner: daily-ci
+      docker: huggingface/transformers-tensorflow-gpu
+      ci_event: Daily CI
    secrets: inherit

  example-ci:
@ -75,23 +49,9 @@ jobs:
    with:
      job: run_examples_gpu
      slack_report_channel: "#transformers-ci-daily-examples"
+      runner: daily-ci
      docker: huggingface/transformers-all-latest-gpu
      ci_event: Daily CI
-      report_repo_id: hf-internal-testing/transformers_daily_ci
-      commit_sha: ${{ github.sha }}
-    secrets: inherit
-
-  trainer-fsdp-ci:
-    name: Trainer/FSDP CI
-    uses: ./.github/workflows/self-scheduled.yml
-    with:
-      job: run_trainer_and_fsdp_gpu
-      slack_report_channel: "#transformers-ci-daily-training"
-      docker: huggingface/transformers-all-latest-gpu
-      runner_type: "a10"
-      ci_event: Daily CI
-      report_repo_id: hf-internal-testing/transformers_daily_ci
-      commit_sha: ${{ github.sha }}
    secrets: inherit

  deepspeed-ci:
@ -99,12 +59,11 @@ jobs:
    uses: ./.github/workflows/self-scheduled.yml
    with:
      job: run_torch_cuda_extensions_gpu
-      slack_report_channel: "#transformers-ci-daily-training"
+      slack_report_channel: "#transformers-ci-daily-deepspeed"
+      runner: daily-ci
      docker: huggingface/transformers-pytorch-deepspeed-latest-gpu
      ci_event: Daily CI
      working-directory-prefix: /workspace
-      report_repo_id: hf-internal-testing/transformers_daily_ci
-      commit_sha: ${{ github.sha }}
    secrets: inherit

  quantization-ci:
@ -113,8 +72,7 @@ jobs:
    with:
      job: run_quantization_torch_gpu
      slack_report_channel: "#transformers-ci-daily-quantization"
+      runner: daily-ci
      docker: huggingface/transformers-quantization-latest-gpu
      ci_event: Daily CI
-      report_repo_id: hf-internal-testing/transformers_daily_ci
-      commit_sha: ${{ github.sha }}
    secrets: inherit
--- a/.github/workflows/self-scheduled-intel-gaudi.yml
+++ b/.github/workflows/self-scheduled-intel-gaudi.yml
@ -1,341 +0,0 @@
-name: Self-hosted runner (scheduled-intel-gaudi)
-
-on:
-  workflow_call:
-    inputs:
-      job:
-        required: true
-        type: string
-      slack_report_channel:
-        required: true
-        type: string
-      runner_scale_set:
-        required: true
-        type: string
-      ci_event:
-        required: true
-        type: string
-      report_repo_id:
-        required: true
-        type: string
-
-env:
-  NUM_SLICES: 2
-  RUN_SLOW: yes
-  PT_HPU_LAZY_MODE: 0
-  TRANSFORMERS_IS_CI: yes
-  PT_ENABLE_INT64_SUPPORT: 1
-  HF_HUB_READ_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }}
-  HF_HOME: /mnt/cache/.cache/huggingface
-
-jobs:
-  setup:
-    if: contains(fromJSON('["run_models_gpu", "run_trainer_and_fsdp_gpu"]'), inputs.job)
-    name: Setup
-    runs-on: ubuntu-latest
-    outputs:
-      slice_ids: ${{ steps.set-matrix.outputs.slice_ids }}
-      folder_slices: ${{ steps.set-matrix.outputs.folder_slices }}
-      quantization_matrix: ${{ steps.set-matrix.outputs.quantization_matrix }}
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v4
-        with:
-          fetch-depth: 0
-
-      - name: Set up Python
-        uses: actions/setup-python@v5
-        with:
-          python-version: "3.10"
-
-      - id: set-matrix
-        if: contains(fromJSON('["run_models_gpu", "run_trainer_and_fsdp_gpu"]'), inputs.job)
-        name: Identify models to test
-        working-directory: tests
-        run: |
-          if [ "${{ inputs.job }}" = "run_models_gpu" ]; then
-            echo "folder_slices=$(python3 ../utils/split_model_tests.py --num_splits ${{ env.NUM_SLICES }})" >> $GITHUB_OUTPUT
-            echo "slice_ids=$(python3 -c 'd = list(range(${{ env.NUM_SLICES }})); print(d)')" >> $GITHUB_OUTPUT
-          elif [ "${{ inputs.job }}" = "run_trainer_and_fsdp_gpu" ]; then
-            echo "folder_slices=[['trainer'], ['fsdp']]" >> $GITHUB_OUTPUT
-            echo "slice_ids=[0, 1]" >> $GITHUB_OUTPUT
-          fi
-
-      - id: set-matrix-quantization
-        if: ${{ inputs.job == 'run_quantization_torch_gpu' }}
-        name: Identify quantization method to test
-        working-directory: tests
-        run: |
-          echo "quantization_matrix=$(python3 -c 'import os; tests = os.getcwd(); quantization_tests = os.listdir(os.path.join(tests, "quantization")); d = sorted(list(filter(os.path.isdir, [f"quantization/{x}" for x in quantization_tests]))) ;  print(d)')" >> $GITHUB_OUTPUT
-
-  run_models_gpu:
-    if: ${{ inputs.job == 'run_models_gpu' }}
-    name: " "
-    needs: setup
-    strategy:
-      fail-fast: false
-      matrix:
-        machine_type: [1gaudi, 2gaudi]
-        slice_id: ${{ fromJSON(needs.setup.outputs.slice_ids) }}
-    uses: ./.github/workflows/model_jobs_intel_gaudi.yml
-    with:
-      slice_id: ${{ matrix.slice_id }}
-      machine_type: ${{ matrix.machine_type }}
-      folder_slices: ${{ needs.setup.outputs.folder_slices }}
-      runner: ${{ inputs.runner_scale_set }}-${{ matrix.machine_type }}
-    secrets: inherit
-
-  run_trainer_and_fsdp_gpu:
-    if: ${{ inputs.job == 'run_trainer_and_fsdp_gpu' }}
-    name: " "
-    needs: setup
-    strategy:
-      fail-fast: false
-      matrix:
-        machine_type: [1gaudi, 2gaudi]
-        slice_id: ${{ fromJSON(needs.setup.outputs.slice_ids) }}
-    uses: ./.github/workflows/model_jobs_intel_gaudi.yml
-    with:
-      slice_id: ${{ matrix.slice_id }}
-      machine_type: ${{ matrix.machine_type }}
-      folder_slices: ${{ needs.setup.outputs.folder_slices }}
-      runner: ${{ inputs.runner_scale_set }}-${{ matrix.machine_type }}
-      report_name_prefix: run_trainer_and_fsdp_gpu
-    secrets: inherit
-
-  run_pipelines_torch_gpu:
-    if: ${{ inputs.job == 'run_pipelines_torch_gpu' }}
-    name: Pipelines
-    strategy:
-      fail-fast: false
-      matrix:
-        machine_type: [1gaudi, 2gaudi]
-    runs-on:
-      group: ${{ inputs.runner_scale_set }}-${{ matrix.machine_type }}
-    container:
-      image: vault.habana.ai/gaudi-docker/1.21.1/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest
-      options: --runtime=habana
-        -v /mnt/cache/.cache/huggingface:/mnt/cache/.cache/huggingface
-        --env OMPI_MCA_btl_vader_single_copy_mechanism=none
-        --env HABANA_VISIBLE_DEVICES
-        --env HABANA_VISIBLE_MODULES
-        --cap-add=sys_nice
-        --shm-size=64G
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v4
-        with:
-          fetch-depth: 0
-
-      - name: Install dependencies
-        run: |
-          pip install -e .[testing,torch] "numpy<2.0.0" scipy scikit-learn librosa soundfile
-
-      - name: HL-SMI
-        run: |
-          hl-smi
-          echo "HABANA_VISIBLE_DEVICES=${HABANA_VISIBLE_DEVICES}"
-          echo "HABANA_VISIBLE_MODULES=${HABANA_VISIBLE_MODULES}"
-
-      - name: Environment
-        run: python3 utils/print_env.py
-
-      - name: Show installed libraries and their versions
-        run: pip freeze
-
-      - name: Set `machine_type` for report and artifact names
-        shell: bash
-        run: |
-          if [ "${{ matrix.machine_type }}" = "1gaudi" ]; then
-            machine_type=single-gpu
-          elif [ "${{ matrix.machine_type }}" = "2gaudi" ]; then
-            machine_type=multi-gpu
-          else
-            machine_type=${{ matrix.machine_type }}
-          fi
-          echo "machine_type=$machine_type" >> $GITHUB_ENV
-
-      - name: Run all pipeline tests on Intel Gaudi
-        run: |
-          python3 -m pytest -v --make-reports=${{ env.machine_type }}_run_pipelines_torch_gpu_test_reports tests/pipelines -m "not not_device_test"
-
-      - name: Failure short reports
-        if: ${{ failure() }}
-        continue-on-error: true
-        run: |
-          cat reports/${{ env.machine_type }}_run_pipelines_torch_gpu_test_reports/failures_short.txt
-
-      - name: "Test suite reports artifacts: ${{ env.machine_type }}_run_pipelines_torch_gpu_test_reports"
-        if: ${{ always() }}
-        uses: actions/upload-artifact@v4
-        with:
-          name: ${{ env.machine_type }}_run_pipelines_torch_gpu_test_reports
-          path: reports/${{ env.machine_type }}_run_pipelines_torch_gpu_test_reports
-
-  run_examples_gpu:
-    if: ${{ inputs.job == 'run_examples_gpu' }}
-    name: Examples directory
-    strategy:
-      fail-fast: false
-      matrix:
-        machine_type: [1gaudi]
-    runs-on:
-      group: ${{ inputs.runner_scale_set }}-${{ matrix.machine_type }}
-    container:
-      image: vault.habana.ai/gaudi-docker/1.21.1/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest
-      options: --runtime=habana
-        -v /mnt/cache/.cache/huggingface:/mnt/cache/.cache/huggingface
-        --env OMPI_MCA_btl_vader_single_copy_mechanism=none
-        --env HABANA_VISIBLE_DEVICES
-        --env HABANA_VISIBLE_MODULES
-        --cap-add=sys_nice
-        --shm-size=64G
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v4
-        with:
-          fetch-depth: 0
-
-      - name: Install dependencies
-        run: |
-          pip install -e .[testing,torch] "numpy<2.0.0" scipy scikit-learn librosa soundfile
-
-      - name: HL-SMI
-        run: |
-          hl-smi
-          echo "HABANA_VISIBLE_DEVICES=${HABANA_VISIBLE_DEVICES}"
-          echo "HABANA_VISIBLE_MODULES=${HABANA_VISIBLE_MODULES}"
-
-      - name: Environment
-        run: |
-          python3 utils/print_env.py
-
-      - name: Show installed libraries and their versions
-        run: |
-          pip freeze
-
-      - name: Set `machine_type` for report and artifact names
-        shell: bash
-        run: |
-          if [ "${{ matrix.machine_type }}" = "1gaudi" ]; then
-            machine_type=single-gpu
-          elif [ "${{ matrix.machine_type }}" = "2gaudi" ]; then
-            machine_type=multi-gpu
-          else
-            machine_type=${{ matrix.machine_type }}
-          fi
-          echo "machine_type=$machine_type" >> $GITHUB_ENV
-
-      - name: Run examples tests on Intel Gaudi
-        run: |
-          pip install -r examples/pytorch/_tests_requirements.txt
-          python3 -m pytest -v --make-reports=${{ env.machine_type }}_run_examples_gpu_test_reports examples/pytorch -m "not not_device_test"
-
-      - name: Failure short reports
-        if: ${{ failure() }}
-        continue-on-error: true
-        run: |
-          cat reports/${{ env.machine_type }}_run_examples_gpu_test_reports/failures_short.txt
-
-      - name: "Test suite reports artifacts: ${{ env.machine_type }}_run_examples_gpu_test_reports"
-        if: ${{ always() }}
-        uses: actions/upload-artifact@v4
-        with:
-          name: ${{ env.machine_type }}_run_examples_gpu_test_reports
-          path: reports/${{ env.machine_type }}_run_examples_gpu_test_reports
-
-  run_torch_cuda_extensions_gpu:
-    if: ${{ inputs.job == 'run_torch_cuda_extensions_gpu' }}
-    name: Intel Gaudi deepspeed tests
-    strategy:
-      fail-fast: false
-      matrix:
-        machine_type: [1gaudi, 2gaudi]
-    runs-on:
-      group: ${{ inputs.runner_scale_set }}-${{ matrix.machine_type }}
-    container:
-      image: vault.habana.ai/gaudi-docker/1.21.1/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest
-      options: --runtime=habana
-        -v /mnt/cache/.cache/huggingface:/mnt/cache/.cache/huggingface
-        --env OMPI_MCA_btl_vader_single_copy_mechanism=none
-        --env HABANA_VISIBLE_DEVICES
-        --env HABANA_VISIBLE_MODULES
-        --cap-add=sys_nice
-        --shm-size=64G
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v4
-        with:
-          fetch-depth: 0
-
-      - name: Install dependencies
-        run: |
-          pip install -e .[testing,torch] "numpy<2.0.0" scipy scikit-learn librosa soundfile
-          pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.20.0
-
-      - name: HL-SMI
-        run: |
-          hl-smi
-          echo "HABANA_VISIBLE_DEVICES=${HABANA_VISIBLE_DEVICES}"
-          echo "HABANA_VISIBLE_MODULES=${HABANA_VISIBLE_MODULES}"
-
-      - name: Environment
-        run: |
-          python3 utils/print_env.py
-
-      - name: Show installed libraries and their versions
-        run: |
-          pip freeze
-
-      - name: Set `machine_type` for report and artifact names
-        shell: bash
-        run: |
-          if [ "${{ matrix.machine_type }}" = "1gaudi" ]; then
-            machine_type=single-gpu
-          elif [ "${{ matrix.machine_type }}" = "2gaudi" ]; then
-            machine_type=multi-gpu
-          else
-            machine_type=${{ matrix.machine_type }}
-          fi
-          echo "machine_type=$machine_type" >> $GITHUB_ENV
-
-      - name: Run all deepspeed tests on intel Gaudi
-        run: |
-          python3 -m pytest -v --make-reports=${{ env.machine_type }}_run_torch_cuda_extensions_gpu_test_reports tests/deepspeed -m "not not_device_test"
-
-      - name: Failure short reports
-        if: ${{ failure() }}
-        continue-on-error: true
-        run: |
-          cat reports/${{ env.machine_type }}_run_torch_cuda_extensions_gpu_test_reports/failures_short.txt
-
-      - name: "Test suite reports artifacts: ${{ env.machine_type }}_run_torch_cuda_extensions_gpu_test_reports"
-        if: ${{ always() }}
-        uses: actions/upload-artifact@v4
-        with:
-          name: ${{ env.machine_type }}_run_torch_cuda_extensions_gpu_test_reports
-          path: reports/${{ env.machine_type }}_run_torch_cuda_extensions_gpu_test_reports
-
-  send_results:
-    name: Slack Report
-    needs:
-      [
-        setup,
-        run_models_gpu,
-        run_examples_gpu,
-        run_torch_cuda_extensions_gpu,
-        run_pipelines_torch_gpu,
-        run_trainer_and_fsdp_gpu,
-      ]
-    if: ${{ always() }}
-    uses: ./.github/workflows/slack-report.yml
-    with:
-      job: ${{ inputs.job }}
-      setup_status: ${{ needs.setup.result }}
-      slack_report_channel: ${{ inputs.slack_report_channel }}
-      quantization_matrix: ${{ needs.setup.outputs.quantization_matrix }}
-      folder_slices: ${{ needs.setup.outputs.folder_slices }}
-      report_repo_id: ${{ inputs.report_repo_id }}
-      ci_event: ${{ inputs.ci_event }}
-
-    secrets: inherit
--- a/.github/workflows/self-scheduled-intel-gaudi3-caller.yml
+++ b/.github/workflows/self-scheduled-intel-gaudi3-caller.yml
@ -1,67 +0,0 @@
-name: Self-hosted runner (Intel Gaudi3 scheduled CI caller)
-
-on:
-  repository_dispatch:
-  workflow_dispatch:
-  schedule:
-    - cron: "17 2 * * *"
-
-jobs:
-  model-ci:
-    name: Model CI
-    uses: ./.github/workflows/self-scheduled-intel-gaudi.yml
-    with:
-      job: run_models_gpu
-      ci_event: Scheduled CI (Intel) - Gaudi3
-      runner_scale_set: itac-bm-emr-gaudi3-dell
-      slack_report_channel: "#transformers-ci-daily-intel-gaudi3"
-      report_repo_id: optimum-intel/transformers_daily_ci_intel_gaudi3
-
-    secrets: inherit
-
-  pipeline-ci:
-    name: Pipeline CI
-    uses: ./.github/workflows/self-scheduled-intel-gaudi.yml
-    with:
-      job: run_pipelines_torch_gpu
-      ci_event: Scheduled CI (Intel) - Gaudi3
-      runner_scale_set: itac-bm-emr-gaudi3-dell
-      slack_report_channel: "#transformers-ci-daily-intel-gaudi3"
-      report_repo_id: optimum-intel/transformers_daily_ci_intel_gaudi3
-
-    secrets: inherit
-
-  example-ci:
-    name: Example CI
-    uses: ./.github/workflows/self-scheduled-intel-gaudi.yml
-    with:
-      job: run_examples_gpu
-      ci_event: Scheduled CI (Intel) - Gaudi3
-      runner_scale_set: itac-bm-emr-gaudi3-dell
-      slack_report_channel: "#transformers-ci-daily-intel-gaudi3"
-      report_repo_id: optimum-intel/transformers_daily_ci_intel_gaudi3
-
-    secrets: inherit
-
-  deepspeed-ci:
-    name: DeepSpeed CI
-    uses: ./.github/workflows/self-scheduled-intel-gaudi.yml
-    with:
-      job: run_torch_cuda_extensions_gpu
-      ci_event: Scheduled CI (Intel) - Gaudi3
-      runner_scale_set: itac-bm-emr-gaudi3-dell
-      slack_report_channel: "#transformers-ci-daily-intel-gaudi3"
-      report_repo_id: optimum-intel/transformers_daily_ci_intel_gaudi3
-
-    secrets: inherit
-
-  trainer-fsdp-ci:
-    name: Trainer/FSDP CI
-    uses: ./.github/workflows/self-scheduled-intel-gaudi.yml
-    with:
-      job: run_trainer_and_fsdp_gpu
-      ci_event: Scheduled CI (Intel) - Gaudi3
-      runner_scale_set: itac-bm-emr-gaudi3-dell
-      slack_report_channel: "#transformers-ci-daily-intel-gaudi3"
-      report_repo_id: optimum-intel/transformers_daily_ci_intel_gaudi3
-    secrets: inherit
--- a/.github/workflows/self-scheduled.yml
+++ b/.github/workflows/self-scheduled.yml
@ -1,4 +1,4 @@
-name: Nvidia CI (job definitions)
+name: Self-hosted runner (scheduled)

 # Note that each job's dependencies go into a corresponding docker file.
 #
@ -15,6 +15,9 @@ on:
      slack_report_channel:
        required: true
        type: string
+      runner:
+        required: true
+        type: string
      docker:
        required: true
        type: string
@ -25,19 +28,6 @@ on:
        default: ''
        required: false
        type: string
-      report_repo_id:
-        required: true
-        type: string
-      commit_sha:
-        required: false
-        type: string
-      runner_type:
-        required: false
-        type: string
-      models:
-        default: ""
-        required: false
-        type: string

 env:
  HF_HOME: /mnt/cache
@ -48,22 +38,24 @@ env:
  # For gated repositories, we still need to agree to share information on the Hub repo. page in order to get access.
  # This token is created under the bot `hf-transformers-bot`.
  HF_HUB_READ_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }}
+  SIGOPT_API_TOKEN: ${{ secrets.SIGOPT_API_TOKEN }}
  TF_FORCE_GPU_ALLOW_GROWTH: true
+  RUN_PT_TF_CROSS_TESTS: 1
  CUDA_VISIBLE_DEVICES: 0,1
  NUM_SLICES: 2

 jobs:
  setup:
+    if: contains(fromJSON('["run_models_gpu", "run_quantization_torch_gpu"]'), inputs.job)
    name: Setup
-    if: contains(fromJSON('["run_models_gpu", "run_trainer_and_fsdp_gpu", "run_quantization_torch_gpu"]'), inputs.job)
    strategy:
      matrix:
-        machine_type: [aws-g5-4xlarge-cache, aws-g5-12xlarge-cache]
+        machine_type: [aws-g4dn-2xlarge-cache, aws-g4dn-12xlarge-cache]
    runs-on:
      group: '${{ matrix.machine_type }}'
    container:
      image: huggingface/transformers-all-latest-gpu
-      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
    outputs:
      folder_slices: ${{ steps.set-matrix.outputs.folder_slices }}
      slice_ids: ${{ steps.set-matrix.outputs.slice_ids }}
@ -72,7 +64,7 @@ jobs:
      - name: Update clone
        working-directory: /transformers
        run: |
-          git fetch && git checkout ${{ inputs.commit_sha || github.sha }}
+          git fetch && git checkout ${{ github.sha }}

      - name: Cleanup
        working-directory: /transformers
@ -86,17 +78,12 @@ jobs:
        run: pip freeze

      - id: set-matrix
-        if: contains(fromJSON('["run_models_gpu", "run_trainer_and_fsdp_gpu"]'), inputs.job)
+        if: ${{ inputs.job == 'run_models_gpu' }}
        name: Identify models to test
        working-directory: /transformers/tests
        run: |
-          if [ "${{ inputs.job }}" = "run_models_gpu" ]; then
-            echo "folder_slices=$(python3 ../utils/split_model_tests.py --models '${{ inputs.models }}' --num_splits ${{ env.NUM_SLICES }})" >> $GITHUB_OUTPUT
-            echo "slice_ids=$(python3 -c 'd = list(range(${{ env.NUM_SLICES }})); print(d)')" >> $GITHUB_OUTPUT
-          elif [ "${{ inputs.job }}" = "run_trainer_and_fsdp_gpu" ]; then
-            echo "folder_slices=[['trainer'], ['fsdp']]" >> $GITHUB_OUTPUT
-            echo "slice_ids=[0, 1]" >> $GITHUB_OUTPUT
-          fi
+          echo "folder_slices=$(python3 ../utils/split_model_tests.py --num_splits ${{ env.NUM_SLICES }})" >> $GITHUB_OUTPUT
+          echo "slice_ids=$(python3 -c 'd = list(range(${{ env.NUM_SLICES }})); print(d)')" >> $GITHUB_OUTPUT

      - id: set-matrix-quantization
        if: ${{ inputs.job == 'run_quantization_torch_gpu' }}
@ -116,38 +103,15 @@ jobs:
    strategy:
      fail-fast: false
      matrix:
-        machine_type: [aws-g5-4xlarge-cache, aws-g5-12xlarge-cache]
+        machine_type: [aws-g4dn-2xlarge-cache, aws-g4dn-12xlarge-cache]
        slice_id: ${{ fromJSON(needs.setup.outputs.slice_ids) }}
    uses: ./.github/workflows/model_jobs.yml
    with:
      folder_slices: ${{ needs.setup.outputs.folder_slices }}
      machine_type: ${{ matrix.machine_type }}
      slice_id: ${{ matrix.slice_id }}
+      runner: ${{ inputs.runner }}
      docker: ${{ inputs.docker }}
-      commit_sha: ${{ inputs.commit_sha || github.sha }}
-      runner_type: ${{ inputs.runner_type }}
-      report_repo_id: ${{ inputs.report_repo_id }}
-    secrets: inherit
-
-  run_trainer_and_fsdp_gpu:
-    if: ${{ inputs.job == 'run_trainer_and_fsdp_gpu' }}
-    name: " "
-    needs: setup
-    strategy:
-      fail-fast: false
-      matrix:
-        machine_type: [aws-g5-4xlarge-cache, aws-g5-12xlarge-cache]
-        slice_id: [0, 1]
-    uses: ./.github/workflows/model_jobs.yml
-    with:
-      folder_slices: ${{ needs.setup.outputs.folder_slices }}
-      machine_type: ${{ matrix.machine_type }}
-      slice_id: ${{ matrix.slice_id }}
-      docker: ${{ inputs.docker }}
-      commit_sha: ${{ inputs.commit_sha || github.sha }}
-      runner_type: ${{ inputs.runner_type }}
-      report_repo_id: ${{ inputs.report_repo_id }}
-      report_name_prefix: run_trainer_and_fsdp_gpu
    secrets: inherit

  run_pipelines_torch_gpu:
@ -156,7 +120,7 @@ jobs:
    strategy:
      fail-fast: false
      matrix:
-        machine_type: [aws-g5-4xlarge-cache, aws-g5-12xlarge-cache]
+        machine_type: [aws-g4dn-2xlarge-cache, aws-g4dn-12xlarge-cache]
    runs-on:
      group: '${{ matrix.machine_type }}'
    container:
@ -165,7 +129,7 @@ jobs:
    steps:
      - name: Update clone
        working-directory: /transformers
-        run: git fetch && git checkout ${{ inputs.commit_sha || github.sha }}
+        run: git fetch && git checkout ${{ github.sha }}

      - name: Reinstall transformers in edit mode (remove the one installed during docker image build)
        working-directory: /transformers
@ -190,9 +154,9 @@ jobs:
        run: |
          echo "${{ matrix.machine_type }}"

-          if [ "${{ matrix.machine_type }}" = "aws-g5-4xlarge-cache" ]; then
+          if [ "${{ matrix.machine_type }}" = "aws-g4dn-2xlarge-cache" ]; then
            machine_type=single-gpu
-          elif [ "${{ matrix.machine_type }}" = "aws-g5-12xlarge-cache" ]; then
+          elif [ "${{ matrix.machine_type }}" = "aws-g4dn-12xlarge-cache" ]; then
            machine_type=multi-gpu
          else
            machine_type=${{ matrix.machine_type }}
@ -218,22 +182,23 @@ jobs:
          name: ${{ env.machine_type }}_run_pipelines_torch_gpu_test_reports
          path: /transformers/reports/${{ env.machine_type }}_run_pipelines_torch_gpu_test_reports

-  run_examples_gpu:
-    if: ${{ inputs.job == 'run_examples_gpu' }}
-    name: Examples directory
+  run_pipelines_tf_gpu:
+    if: ${{ inputs.job == 'run_pipelines_tf_gpu' }}
+    name: TensorFlow pipelines
    strategy:
      fail-fast: false
      matrix:
-        machine_type: [aws-g5-4xlarge-cache]
+        machine_type: [aws-g4dn-2xlarge-cache, aws-g4dn-12xlarge-cache]
    runs-on:
      group: '${{ matrix.machine_type }}'
    container:
-      image: huggingface/transformers-all-latest-gpu
+      image: huggingface/transformers-tensorflow-gpu
      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
    steps:
      - name: Update clone
        working-directory: /transformers
-        run: git fetch && git checkout ${{ inputs.commit_sha || github.sha }}
+        run: |
+          git fetch && git checkout ${{ github.sha }}

      - name: Reinstall transformers in edit mode (remove the one installed during docker image build)
        working-directory: /transformers
@ -258,9 +223,77 @@ jobs:
        run: |
          echo "${{ matrix.machine_type }}"

-          if [ "${{ matrix.machine_type }}" = "aws-g5-4xlarge-cache" ]; then
+          if [ "${{ matrix.machine_type }}" = "aws-g4dn-2xlarge-cache" ]; then
            machine_type=single-gpu
-          elif [ "${{ matrix.machine_type }}" = "aws-g5-12xlarge-cache" ]; then
+          elif [ "${{ matrix.machine_type }}" = "aws-g4dn-12xlarge-cache" ]; then
+            machine_type=multi-gpu
+          else
+            machine_type=${{ matrix.machine_type }}
+          fi
+
+          echo "$machine_type"
+          echo "machine_type=$machine_type" >> $GITHUB_ENV
+
+      - name: Run all pipeline tests on GPU
+        working-directory: /transformers
+        run: |
+          python3 -m pytest -n 1 -v --dist=loadfile --make-reports=${{ env.machine_type }}_run_pipelines_tf_gpu_test_reports tests/pipelines
+
+      - name: Failure short reports
+        if: ${{ always() }}
+        run: |
+          cat /transformers/reports/${{ env.machine_type }}_run_pipelines_tf_gpu_test_reports/failures_short.txt
+
+      - name: "Test suite reports artifacts: ${{ env.machine_type }}_run_pipelines_tf_gpu_test_reports"
+        if: ${{ always() }}
+        uses: actions/upload-artifact@v4
+        with:
+          name: ${{ env.machine_type }}_run_pipelines_tf_gpu_test_reports
+          path: /transformers/reports/${{ env.machine_type }}_run_pipelines_tf_gpu_test_reports
+
+  run_examples_gpu:
+    if: ${{ inputs.job == 'run_examples_gpu' }}
+    name: Examples directory
+    strategy:
+      fail-fast: false
+      matrix:
+        machine_type: [aws-g4dn-2xlarge-cache]
+    runs-on:
+      group: '${{ matrix.machine_type }}'
+    container:
+      image: huggingface/transformers-all-latest-gpu
+      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+    steps:
+      - name: Update clone
+        working-directory: /transformers
+        run: git fetch && git checkout ${{ github.sha }}
+
+      - name: Reinstall transformers in edit mode (remove the one installed during docker image build)
+        working-directory: /transformers
+        run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .
+
+      - name: NVIDIA-SMI
+        run: |
+          nvidia-smi
+
+      - name: Environment
+        working-directory: /transformers
+        run: |
+          python3 utils/print_env.py
+
+      - name: Show installed libraries and their versions
+        working-directory: /transformers
+        run: pip freeze
+
+      - name: Set `machine_type` for report and artifact names
+        working-directory: /transformers
+        shell: bash
+        run: |
+          echo "${{ matrix.machine_type }}"
+
+          if [ "${{ matrix.machine_type }}" = "aws-g4dn-2xlarge-cache" ]; then
+            machine_type=single-gpu
+          elif [ "${{ matrix.machine_type }}" = "aws-g4dn-12xlarge-cache" ]; then
            machine_type=multi-gpu
          else
            machine_type=${{ matrix.machine_type }}
@ -293,7 +326,7 @@ jobs:
    strategy:
      fail-fast: false
      matrix:
-        machine_type: [aws-g5-4xlarge-cache, aws-g5-12xlarge-cache]
+        machine_type: [aws-g4dn-2xlarge-cache, aws-g4dn-12xlarge-cache]
    runs-on:
      group: '${{ matrix.machine_type }}'
    container:
@ -302,7 +335,7 @@ jobs:
    steps:
      - name: Update clone
        working-directory: ${{ inputs.working-directory-prefix }}/transformers
-        run: git fetch && git checkout ${{ inputs.commit_sha || github.sha }}
+        run: git fetch && git checkout ${{ github.sha }}

      - name: Reinstall transformers in edit mode (remove the one installed during docker image build)
        working-directory: ${{ inputs.working-directory-prefix }}/transformers
@ -333,7 +366,7 @@ jobs:
        run: |
          python3 -m pip uninstall -y deepspeed
          rm -rf DeepSpeed
-          git clone https://github.com/deepspeedai/DeepSpeed && cd DeepSpeed && rm -rf build
+          git clone https://github.com/microsoft/DeepSpeed && cd DeepSpeed && rm -rf build
          DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 python3 -m pip install . --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check

      - name: NVIDIA-SMI
@ -350,14 +383,14 @@ jobs:
        run: pip freeze

      - name: Set `machine_type` for report and artifact names
-        working-directory: ${{ inputs.working-directory-prefix }}/transformers
+        working-directory: /transformers
        shell: bash
        run: |
          echo "${{ matrix.machine_type }}"

-          if [ "${{ matrix.machine_type }}" = "aws-g5-4xlarge-cache" ]; then
+          if [ "${{ matrix.machine_type }}" = "aws-g4dn-2xlarge-cache" ]; then
            machine_type=single-gpu
-          elif [ "${{ matrix.machine_type }}" = "aws-g5-12xlarge-cache" ]; then
+          elif [ "${{ matrix.machine_type }}" = "aws-g4dn-12xlarge-cache" ]; then
            machine_type=multi-gpu
          else
            machine_type=${{ matrix.machine_type }}
@ -392,7 +425,7 @@ jobs:
      fail-fast: false
      matrix:
        folders: ${{ fromJson(needs.setup.outputs.quantization_matrix) }}
-        machine_type: [aws-g5-4xlarge-cache, aws-g5-12xlarge-cache]
+        machine_type: [aws-g4dn-2xlarge-cache, aws-g4dn-12xlarge-cache]
    runs-on:
      group: '${{ matrix.machine_type }}'
    container:
@ -410,7 +443,7 @@ jobs:

      - name: Update clone
        working-directory: /transformers
-        run: git fetch && git checkout ${{ inputs.commit_sha || github.sha }}
+        run: git fetch && git checkout ${{ github.sha }}

      - name: Reinstall transformers in edit mode (remove the one installed during docker image build)
        working-directory: /transformers
@ -435,9 +468,9 @@ jobs:
        run: |
          echo "${{ matrix.machine_type }}"

-          if [ "${{ matrix.machine_type }}" = "aws-g5-4xlarge-cache" ]; then
+          if [ "${{ matrix.machine_type }}" = "aws-g4dn-2xlarge-cache" ]; then
            machine_type=single-gpu
-          elif [ "${{ matrix.machine_type }}" = "aws-g5-12xlarge-cache" ]; then
+          elif [ "${{ matrix.machine_type }}" = "aws-g4dn-12xlarge-cache" ]; then
            machine_type=multi-gpu
          else
            machine_type=${{ matrix.machine_type }}
@ -474,7 +507,6 @@ jobs:
        uses: actions/checkout@v4
        with:
          fetch-depth: 2
-          ref: ${{ inputs.commit_sha || github.sha }}

      - name: Install transformers
        run: pip install transformers
@ -510,14 +542,14 @@ jobs:
    needs: [
      setup,
      run_models_gpu,
-      run_trainer_and_fsdp_gpu,
      run_pipelines_torch_gpu,
+      run_pipelines_tf_gpu,
      run_examples_gpu,
      run_torch_cuda_extensions_gpu,
      run_quantization_torch_gpu,
      run_extract_warnings
    ]
-    if: always() && !cancelled()
+    if: ${{ always() }}
    uses: ./.github/workflows/slack-report.yml
    with:
      job: ${{ inputs.job }}
@ -528,22 +560,15 @@ jobs:
      folder_slices: ${{ needs.setup.outputs.folder_slices }}
      quantization_matrix: ${{ needs.setup.outputs.quantization_matrix }}
      ci_event: ${{ inputs.ci_event }}
-      report_repo_id: ${{ inputs.report_repo_id }}
-      commit_sha: ${{ inputs.commit_sha || github.sha }}

    secrets: inherit

-  check_new_failures:
-    if: ${{ always() && inputs.ci_event == 'Daily CI' && needs.send_results.result == 'success' }}
-    name: Check new failures
+  check_new_model_failures:
+    if: ${{ always() && inputs.ci_event == 'Daily CI' && inputs.job == 'run_models_gpu' && needs.send_results.result == 'success' }}
+    name: Check new model failures
    needs: send_results
-    uses: ./.github/workflows/check_failed_tests.yml
+    uses: ./.github/workflows/check_failed_model_tests.yml
    with:
      docker: ${{ inputs.docker }}
-      start_sha: ${{ inputs.commit_sha || github.sha }}
-      job: ${{ inputs.job }}
-      slack_report_channel: ${{ inputs.slack_report_channel }}
-      ci_event: ${{ inputs.ci_event }}
-      report_repo_id: ${{ inputs.report_repo_id }}
-
-    secrets: inherit
+      start_sha: ${{ github.sha }}
+    secrets: inherit
--- a/.github/workflows/slack-report.yml
+++ b/.github/workflows/slack-report.yml
@ -21,13 +21,6 @@ on:
      ci_event:
        required: true
        type: string
-      report_repo_id:
-        required: true
-        type: string
-      commit_sha:
-        required: false
-        type: string
-

 env:
  TRANSFORMERS_CI_RESULTS_UPLOAD_TOKEN: ${{ secrets.TRANSFORMERS_CI_RESULTS_UPLOAD_TOKEN }}
@ -36,7 +29,7 @@ jobs:
  send_results:
    name: Send results to webhook
    runs-on: ubuntu-22.04
-    if: always() && !cancelled()
+    if: always()
    steps:
      - name: Preliminary job status
        shell: bash
@ -45,28 +38,9 @@ jobs:
          echo "Setup status: ${{ inputs.setup_status }}"

      - uses: actions/checkout@v4
-        with:
-          fetch-depth: 2
-          ref: ${{ inputs.commit_sha || github.sha }}
-
      - uses: actions/download-artifact@v4
-
-      - name: Prepare some setup values
-        run: |
-          if [ -f setup_values/prev_workflow_run_id.txt ]; then
-            echo "PREV_WORKFLOW_RUN_ID=$(cat setup_values/prev_workflow_run_id.txt)" >> $GITHUB_ENV
-          else
-            echo "PREV_WORKFLOW_RUN_ID=" >> $GITHUB_ENV
-          fi
-
-          if [ -f setup_values/other_workflow_run_id.txt ]; then
-            echo "OTHER_WORKFLOW_RUN_ID=$(cat setup_values/other_workflow_run_id.txt)" >> $GITHUB_ENV
-          else
-            echo "OTHER_WORKFLOW_RUN_ID=" >> $GITHUB_ENV
-          fi
-
      - name: Send message to Slack
-        shell: bash
+        if: ${{ inputs.job != 'run_quantization_torch_gpu' }}
        env:
          CI_SLACK_BOT_TOKEN: ${{ secrets.CI_SLACK_BOT_TOKEN }}
          CI_SLACK_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID }}
@ -75,25 +49,20 @@ jobs:
          SLACK_REPORT_CHANNEL: ${{ inputs.slack_report_channel }}
          ACCESS_REPO_INFO_TOKEN: ${{ secrets.ACCESS_REPO_INFO_TOKEN }}
          CI_EVENT: ${{ inputs.ci_event }}
-          # This `CI_TITLE` would be empty for `schedule` or `workflow_run` events.
-          CI_TITLE: ${{ github.event.head_commit.message }}
-          CI_SHA: ${{ inputs.commit_sha || github.sha }}
+          CI_SHA: ${{ github.sha }}
+          CI_WORKFLOW_REF: ${{ github.workflow_ref }}
          CI_TEST_JOB: ${{ inputs.job }}
          SETUP_STATUS: ${{ inputs.setup_status }}
-          REPORT_REPO_ID: ${{ inputs.report_repo_id }}
        # We pass `needs.setup.outputs.matrix` as the argument. A processing in `notification_service.py` to change
        # `models/bert` to `models_bert` is required, as the artifact names use `_` instead of `/`.
        # For a job that doesn't depend on (i.e. `needs`) `setup`, the value for `inputs.folder_slices` would be an
         # empty string, and the called script still gets one argument (which is the empty string).
        run: |
+          sudo apt-get install -y curl
          pip install huggingface_hub
          pip install slack_sdk
          pip show slack_sdk
-          if [ "${{ inputs.quantization_matrix }}" != "" ]; then
-            python utils/notification_service.py "${{ inputs.quantization_matrix }}"
-          else
-            python utils/notification_service.py "${{ inputs.folder_slices }}"
-          fi
+          python utils/notification_service.py "${{ inputs.folder_slices }}"

      # Upload complete failure tables, as they might be big and only truncated versions could be sent to Slack.
      - name: Failure table artifacts
@ -101,3 +70,32 @@ jobs:
        with:
          name: ci_results_${{ inputs.job }}
          path: ci_results_${{ inputs.job }}
+      
+      - uses: actions/checkout@v4
+      - uses: actions/download-artifact@v4
+      - name: Send message to Slack for quantization workflow
+        if: ${{ inputs.job == 'run_quantization_torch_gpu' }}
+        env:
+          CI_SLACK_BOT_TOKEN: ${{ secrets.CI_SLACK_BOT_TOKEN }}
+          ACCESS_REPO_INFO_TOKEN: ${{ secrets.ACCESS_REPO_INFO_TOKEN }}
+          SLACK_REPORT_CHANNEL: ${{ inputs.slack_report_channel }}
+          CI_EVENT: ${{ inputs.ci_event }}
+          CI_SHA: ${{ github.sha }}
+          CI_TEST_JOB: ${{ inputs.job }}
+          SETUP_STATUS: ${{ inputs.setup_status }}
+        # We pass `needs.setup.outputs.quantization_matrix` as the argument. A processing in `notification_service_quantization.py` to change
+        # `quantization/bnb` to `quantization_bnb` is required, as the artifact names use `_` instead of `/`.
+        run: |
+          sudo apt-get install -y curl
+          pip install huggingface_hub
+          pip install slack_sdk
+          pip show slack_sdk
+          python utils/notification_service_quantization.py "${{ inputs.quantization_matrix }}" 
+
+      # Upload complete failure tables, as they might be big and only truncated versions could be sent to Slack.
+      - name: Failure table artifacts
+        if: ${{ inputs.job == 'run_quantization_torch_gpu' }}
+        uses: actions/upload-artifact@v4
+        with:
+          name: ci_results_${{ inputs.job }}
+          path: ci_results_${{ inputs.job }}
--- a/.github/workflows/ssh-runner.yml
+++ b/.github/workflows/ssh-runner.yml
@ -5,7 +5,7 @@ on:
    inputs:
      runner_type:
        description: 'Type of runner to test (a10 or t4)'
-        required: true
+        required: true 
      docker_image:
        description: 'Name of the Docker image'
        required: true
@ -15,13 +15,15 @@ on:

 env:
  HF_HUB_READ_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }}
-  HF_HOME: /mnt/cache
-  TRANSFORMERS_IS_CI: yes
-  OMP_NUM_THREADS: 8
-  MKL_NUM_THREADS: 8
-  RUN_SLOW: yes # For gated repositories, we still need to agree to share information on the Hub repo. page in order to get access. # This token is created under the bot `hf-transformers-bot`.
-  TF_FORCE_GPU_ALLOW_GROWTH: true
+  HF_HOME: /mnt/cache 
+  TRANSFORMERS_IS_CI: yes 
+  OMP_NUM_THREADS: 8 
+  MKL_NUM_THREADS: 8 
+  RUN_SLOW: yes # For gated repositories, we still need to agree to share information on the Hub repo. page in order to get access. # This token is created under the bot `hf-transformers-bot`. 
+  SIGOPT_API_TOKEN: ${{ secrets.SIGOPT_API_TOKEN }} 
+  TF_FORCE_GPU_ALLOW_GROWTH: true 
  CUDA_VISIBLE_DEVICES: 0,1
+  RUN_PT_TF_CROSS_TESTS: 1

 jobs:
  get_runner:
@ -32,17 +34,14 @@ jobs:
    steps:
      - name: Get runner to use
        shell: bash
-        env:
-          NUM_GPUS: ${{ github.event.inputs.num_gpus }}
-          RUNNER_TYPE: ${{ github.event.inputs.runner_type }}
        run: |
-          if [[ "$NUM_GPUS" == "single" && "$RUNNER_TYPE" == "t4" ]]; then
-            echo "RUNNER=aws-g4dn-4xlarge-cache" >> $GITHUB_ENV
-          elif [[ "$NUM_GPUS" == "multi" && "$RUNNER_TYPE" == "t4" ]]; then
+          if [[ "${{ github.event.inputs.num_gpus }}" == "single" && "${{ github.event.inputs.runner_type }}" == "t4" ]]; then
+            echo "RUNNER=aws-g4dn-2xlarge-cache" >> $GITHUB_ENV
+          elif [[ "${{ github.event.inputs.num_gpus }}" == "multi" && "${{ github.event.inputs.runner_type }}" == "t4" ]]; then
            echo "RUNNER=aws-g4dn-12xlarge-cache" >> $GITHUB_ENV
-          elif [[ "$NUM_GPUS" == "single" && "$RUNNER_TYPE" == "a10" ]]; then
+          elif [[ "${{ github.event.inputs.num_gpus }}" == "single" && "${{ github.event.inputs.runner_type }}" == "a10" ]]; then
            echo "RUNNER=aws-g5-4xlarge-cache" >> $GITHUB_ENV
-          elif [[ "$NUM_GPUS" == "multi" && "$RUNNER_TYPE" == "a10" ]]; then
+          elif [[ "${{ github.event.inputs.num_gpus }}" == "multi" && "${{ github.event.inputs.runner_type }}" == "a10" ]]; then
            echo "RUNNER=aws-g5-12xlarge-cache" >> $GITHUB_ENV
          else
            echo "RUNNER=" >> $GITHUB_ENV
@ -79,7 +78,7 @@ jobs:
      - name: Show installed libraries and their versions
        working-directory: /transformers
        run: pip freeze
-
+      
      - name: NVIDIA-SMI
        run: |
          nvidia-smi
@ -87,11 +86,9 @@ jobs:
      - name: Store Slack infos
         # SSH can be enabled dynamically if the workflow fails, so we need to store the Slack info here to be able to retrieve it during the waitforssh step
        shell: bash
-        env:
-          GITHUB_ACTOR: ${{ github.actor }}
        run: |
-          echo "$GITHUB_ACTOR"
-          github_actor=$GITHUB_ACTOR
+          echo "${{ github.actor }}"
+          github_actor=${{ github.actor }}
          github_actor=${github_actor/'-'/'_'}
          echo "$github_actor"
          echo "github_actor=$github_actor" >> $GITHUB_ENV
--- a/.github/workflows/trufflehog.yml
+++ b/.github/workflows/trufflehog.yml
@ -16,5 +16,3 @@ jobs:
          fetch-depth: 0
      - name: Secret Scanning
        uses: trufflesecurity/trufflehog@main
-        with:
-          extra_args: --results=verified,unknown
--- a/.github/workflows/update_metdata.yml
+++ b/.github/workflows/update_metdata.yml
@ -19,7 +19,7 @@ jobs:
      - name: Setup environment
        run: |
          pip install --upgrade pip
-          pip install datasets pandas
+          pip install datasets pandas==2.0.3
          pip install .[torch,tf,flax]

      - name: Update metadata
--- a/.gitignore
+++ b/.gitignore
@ -13,7 +13,6 @@ tests/fixtures/cached_*_text.txt
 logs/
 lightning_logs/
 lang_code_data/
-reports/

 # Distribution / packaging
 .Python
@ -168,6 +167,3 @@ tags

 # ruff
 .ruff_cache
-
-# modular conversion
-*.modular_backup
--- a/AGENTS.md
+++ b/AGENTS.md
@ -1,39 +0,0 @@
-# AGENTS.md Guide for Hugging Face Transformers
-
-This AGENTS.md file provides guidance for code agents working with this codebase.
-
-## Core Project Structure
-
- `/src/transformers`: This contains the core source code for the library
-  - `/models`: Code for individual models. Models inherit from base classes in the root `/src/transformers` directory.
- `/tests`: This contains the core test classes for the library. These are usually inherited rather than directly run.
-  - `/models`: Tests for individual models. Model tests inherit from common tests in the root `/tests` directory.
- `/docs`: This contains the documentation for the library, including guides, tutorials, and API references.
-
-## Coding Conventions for Hugging Face Transformers
-
- PRs should be as brief as possible. Bugfix PRs in particular can often be only one or two lines long, and do not need large comments, docstrings or new functions in this case. Aim to minimize the size of the diff.
- When writing tests, they should be added to an existing file. The only exception is for PRs to add a new model, when a new test directory should be created for that model.
- Code style is enforced in the CI. You can install the style tools with `pip install -e .[quality]`. You can then run `make fixup` to apply style and consistency fixes to your code.
-
-## Copying and inheritance
-
-Many models in the codebase have similar code, but it is not shared by inheritance because we want each model file to be self-contained.
-We use two mechanisms to keep this code in sync:
-
- "Copied from" syntax. Functions or entire classes can have a comment at the top like this: `# Copied from transformers.models.llama.modeling_llama.rotate_half` or `# Copied from transformers.models.t5.modeling_t5.T5LayerNorm with T5->MT5`
-  These comments are actively checked by the style tools, and copies will automatically be updated when the base code is updated. If you need to update a copied function, you should
-  either update the base function and use `make fixup` to propagate the change to all copies, or simply remove the `# Copied from` comment if that is inappropriate.
- "Modular" files. These files briefly define models by composing them using inheritance from other models. They are not meant to be used directly. Instead, the style tools
-  automatically generate a complete modeling file, like `modeling_bert.py`, from the modular file like `modular_bert.py`. If a model has a modular file, the modeling file
-  should never be edited directly! Instead, changes should be made in the modular file, and then you should run `make fixup` to update the modeling file automatically.
-
-When adding new models, you should prefer `modular` style.
-
-## Testing
-
-After making changes, you should usually run `make fixup` to ensure any copies and modular files are updated, and then test all affected models. This includes both
-the model you made the changes in and any other models that were updated by `make fixup`. Tests can be run with `pytest tests/models/[name]/test_modeling_[name].py`
-If your changes affect code in other classes like tokenizers or processors, you should run those tests instead, like `test_processing_[name].py` or `test_tokenization_[name].py`.
-
-In order to run tests, you may need to install dependencies. You can do this with `pip install -e .[testing]`. You will probably also need to `pip install torch accelerate` if your environment does not already have them.
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@ -68,7 +68,8 @@ already reported** (use the search bar on GitHub under Issues). Your issue shoul

 Once you've confirmed the bug hasn't already been reported, please include the following information in your issue so we can quickly resolve it:

-* Your **OS type and version** and **Python**, and **PyTorch** versions when applicable.
+* Your **OS type and version** and **Python**, **PyTorch** and
+  **TensorFlow** versions when applicable.
 * A short, self-contained, code snippet that allows us to reproduce the bug in
  less than 30s.
 * The *full* traceback if an exception is raised.
@ -77,7 +78,7 @@ Once you've confirmed the bug hasn't already been reported, please include the f
 To get the OS and software versions automatically, run the following command:

 ```bash
-transformers env
+transformers-cli env
 ```

 You can also run the same command from the root of the repository:
@ -164,7 +165,8 @@ You'll need **[Python 3.9](https://github.com/huggingface/transformers/blob/main
   mode with the `-e` flag.

   Depending on your OS, and since the number of optional dependencies of Transformers is growing, you might get a
-   failure with this command. If that's the case make sure to install Pytorch then do:
+   failure with this command. If that's the case make sure to install the Deep Learning framework you are working with
+   (PyTorch, TensorFlow and/or Flax) then do:

   ```bash
   pip install -e ".[quality]"
@ -219,10 +221,10 @@ You'll need **[Python 3.9](https://github.com/huggingface/transformers/blob/main
   [Checks on a Pull Request](https://huggingface.co/docs/transformers/pr_checks) guide.

   If you're modifying documents under the `docs/source` directory, make sure the documentation can still be built. This check will also run in the CI when you open a pull request. To run a local check
-   make sure you install the [documentation builder](https://github.com/huggingface/doc-builder).
+   make sure you install the documentation builder:

   ```bash
-   pip install hf-doc-builder
+   pip install ".[docs]"
   ```

   Run the following command from the root of the repository:
@ -278,14 +280,13 @@ are working on it).<br>
 useful to avoid duplicated work, and to differentiate it from PRs ready to be merged.<br>
 ☐ Make sure existing tests pass.<br>
 ☐ If adding a new feature, also add tests for it.<br>
-
- If you are adding a new model, make sure you use
+   - If you are adding a new model, make sure you use
     `ModelTester.all_model_classes = (MyModel, MyModelWithLMHead,...)` to trigger the common tests.
- If you are adding new `@slow` tests, make sure they pass using
+   - If you are adding new `@slow` tests, make sure they pass using
     `RUN_SLOW=1 python -m pytest tests/models/my_new_model/test_my_new_model.py`.
- If you are adding a new tokenizer, write tests and make sure
+   - If you are adding a new tokenizer, write tests and make sure
     `RUN_SLOW=1 python -m pytest tests/models/{your_model_name}/test_tokenization_{your_model_name}.py` passes.
- CircleCI does not run the slow tests, but GitHub Actions does every night!<br>
+   - CircleCI does not run the slow tests, but GitHub Actions does every night!<br>

 ☐ All public methods must have informative docstrings (see
 [`modeling_bert.py`](https://github.com/huggingface/transformers/blob/main/src/transformers/models/bert/modeling_bert.py)
@ -341,8 +342,9 @@ RUN_SLOW=yes python -m pytest -n auto --dist=loadfile -s -v ./examples/pytorch/t
 ```

 Like the slow tests, there are other environment variables available which are not enabled by default during testing:
-
 - `RUN_CUSTOM_TOKENIZERS`: Enables tests for custom tokenizers.
+- `RUN_PT_FLAX_CROSS_TESTS`: Enables tests for PyTorch + Flax integration.
+- `RUN_PT_TF_CROSS_TESTS`: Enables tests for TensorFlow + PyTorch integration.

 More environment variables and additional information can be found in the [testing_utils.py](https://github.com/huggingface/transformers/blob/main/src/transformers/testing_utils.py).

--- a/ISSUES.md
+++ b/ISSUES.md
@ -26,7 +26,7 @@ There are two main venues to receive support: [the forums](https://discuss.huggi

 [The user forums](https://discuss.huggingface.co/) are supported by the wide community of the library users and backed up by developers when needed.

-If you have a difficulty with deploying this library or some questions, or you'd like to discuss a new feature, please first consider discussing those things at the forums. Only when you feel your subject matter has been crystallized and you still need support from the library developers do proceed to file an [issue](https://github.com/huggingface/transformers/issues).
+If you have a difficulty with deploying this library or some questions, or you'd like to discuss a new feature, please first consider discussing those things at the forums. Only when you feel your subject matter has been crystallized and you still need support from the library developers do proceed to file an [issue](https://github.com/huggingface/transformers/issues).

 In particular all "Please explain" questions or objectively very user-specific feature requests belong to the forums. Here are some example of such questions:

@ -38,6 +38,7 @@ In particular all "Please explain" questions or objectively very user-specific f

 * "How to train T5 on De->En translation?"

+
 ## The GitHub Issues

 Everything which hints at a bug should be opened as an [issue](https://github.com/huggingface/transformers/issues).
@ -153,7 +154,7 @@ You are not required to read the following guidelines before opening an issue. H
    cd examples/seq2seq
    torchrun --nproc_per_node=2 ./finetune_trainer.py \
    --model_name_or_path sshleifer/distill-mbart-en-ro-12-4 --data_dir wmt_en_ro \
-    --output_dir output_dir \
+    --output_dir output_dir --overwrite_output_dir \
    --do_train --n_train 500 --num_train_epochs 1 \
    --per_device_train_batch_size 1  --freeze_embeds \
    --src_lang en_XX --tgt_lang ro_RO --task translation \
@ -246,6 +247,7 @@ You are not required to read the following guidelines before opening an issue. H

    Try not use italics and bold text too much as these often make the text more difficult to read.

+
 12. If you are cross-referencing a specific comment in a given thread or another issue, always link to that specific comment, rather than using the issue link. If you do the latter it could be quite impossible to find which specific comment you're referring to.

    To get the link to the specific comment do not copy the url from the location bar of your browser, but instead, click the `...` icon in the upper right corner of the comment and then select "Copy Link".
@ -255,14 +257,15 @@ You are not required to read the following guidelines before opening an issue. H
    1. https://github.com/huggingface/transformers/issues/9257
    2. https://github.com/huggingface/transformers/issues/9257#issuecomment-749945162

+
 13. If you are replying to a last comment, it's totally fine to make your reply with just your comment in it. The readers can follow the information flow here.

    But if you're replying to a comment that happened some comments back it's always a good practice to quote just the relevant lines you're replying it. The `>` is used for quoting, or you can always use the menu to do so. For example your editor box will look like:

    ```
-    > How big is your GPU cluster?
+    > How big is your GPU cluster?

-    Our cluster is made of 256 GPUs.
+    Our cluster is made of 256 GPUs.
    ```

    If you are addressing multiple comments, quote the relevant parts of each before your answer. Some people use the same comment to do multiple replies, others separate them into separate comments. Either way works. The latter approach helps for linking to a specific comment.
--- a/28
+++ b/28
@ -3,24 +3,18 @@
 # make sure to test the local checkout in scripts and not the pre-installed one (don't use quotes!)
 export PYTHONPATH = src

-check_dirs := examples tests src utils scripts benchmark benchmark_v2
+check_dirs := examples tests src utils

 exclude_folders :=  ""

 modified_only_fixup:
-	@current_branch=$$(git branch --show-current); \
-	if [ "$$current_branch" = "main" ]; then \
-		echo "On main branch, running 'style' target instead..."; \
-		$(MAKE) style; \
+	$(eval modified_py_files := $(shell python utils/get_modified_files.py $(check_dirs)))
+	@if test -n "$(modified_py_files)"; then \
+		echo "Checking/fixing $(modified_py_files)"; \
+		ruff check $(modified_py_files) --fix --exclude $(exclude_folders); \
+		ruff format $(modified_py_files) --exclude $(exclude_folders);\
 	else \
-		modified_py_files=$$(python utils/get_modified_files.py $(check_dirs)); \
-		if [ -n "$$modified_py_files" ]; then \
-			echo "Checking/fixing files: $${modified_py_files}"; \
-			ruff check $${modified_py_files} --fix --exclude $(exclude_folders); \
-			ruff format $${modified_py_files} --exclude $(exclude_folders); \
-		else \
-			echo "No library .py files were modified"; \
-		fi; \
+		echo "No library .py files were modified"; \
 	fi

 # Update src/transformers/dependency_versions_table.py
@ -43,16 +37,16 @@ autogenerate_code: deps_table_update
 repo-consistency:
 	python utils/check_copies.py
 	python utils/check_modular_conversion.py
+	python utils/check_table.py
 	python utils/check_dummies.py
 	python utils/check_repo.py
 	python utils/check_inits.py
-	python utils/check_pipeline_typing.py
 	python utils/check_config_docstrings.py
 	python utils/check_config_attributes.py
 	python utils/check_doctest_list.py
 	python utils/update_metadata.py --check-only
 	python utils/check_docstrings.py
-	python utils/add_dates.py
+	python utils/check_support_list.py

 # this target runs checks on all files

@ -87,9 +81,9 @@ fixup: modified_only_fixup extra_style_checks autogenerate_code repo-consistency

 fix-copies:
 	python utils/check_copies.py --fix_and_overwrite
-	python utils/check_modular_conversion.py --fix_and_overwrite
+	python utils/check_modular_conversion.py  --fix_and_overwrite
+	python utils/check_table.py --fix_and_overwrite
 	python utils/check_dummies.py --fix_and_overwrite
-	python utils/check_pipeline_typing.py --fix_and_overwrite
 	python utils/check_doctest_list.py --fix_and_overwrite
 	python utils/check_docstrings.py --fix_and_overwrite

--- a/README.md
+++ b/README.md
@ -25,7 +25,6 @@ limitations under the License.
 </p>

 <p align="center">
-    <a href="https://huggingface.com/models"><img alt="Checkpoints on Hub" src="https://img.shields.io/endpoint?url=https://huggingface.co/api/shields/models&color=brightgreen"></a>
    <a href="https://circleci.com/gh/huggingface/transformers"><img alt="Build" src="https://img.shields.io/circleci/build/github/huggingface/transformers/main"></a>
    <a href="https://github.com/huggingface/transformers/blob/main/LICENSE"><img alt="GitHub" src="https://img.shields.io/github/license/huggingface/transformers.svg?color=blue"></a>
    <a href="https://huggingface.co/docs/transformers/index"><img alt="Documentation" src="https://img.shields.io/website/http/huggingface.co/docs/transformers/index.svg?down_color=red&down_message=offline&up_message=online"></a>
@ -44,279 +43,266 @@ limitations under the License.
        <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_ja.md">日本語</a> |
        <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_hd.md">हिन्दी</a> |
        <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_ru.md">Русский</a> |
-        <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_pt-br.md">Português</a> |
+        <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_pt-br.md">Português</a> |
        <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_te.md">తెలుగు</a> |
        <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_fr.md">Français</a> |
        <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_de.md">Deutsch</a> |
-        <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_it.md">Italiano</a> |
        <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_vi.md">Tiếng Việt</a> |
        <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_ar.md">العربية</a> |
        <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_ur.md">اردو</a> |
-        <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_bn.md">বাংলা</a> |
    </p>
 </h4>

 <h3 align="center">
-    <p>State-of-the-art pretrained models for inference and training</p>
+    <p>State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow</p>
 </h3>

 <h3 align="center">
-    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/transformers_as_a_model_definition.png"/>
+    <a href="https://hf.co/course"><img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/course_banner.png"></a>
 </h3>

-Transformers acts as the model-definition framework for state-of-the-art machine learning models in text, computer
-vision, audio, video, and multimodal model, for both inference and training.
+🤗 Transformers provides thousands of pretrained models to perform tasks on different modalities such as text, vision, and audio.

-It centralizes the model definition so that this definition is agreed upon across the ecosystem. `transformers` is the
-pivot across frameworks: if a model definition is supported, it will be compatible with the majority of training
-frameworks (Axolotl, Unsloth, DeepSpeed, FSDP, PyTorch-Lightning, ...), inference engines (vLLM, SGLang, TGI, ...),
-and adjacent modeling libraries (llama.cpp, mlx, ...) which leverage the model definition from `transformers`.
+These models can be applied on:

-We pledge to help support new state-of-the-art models and democratize their usage by having their model definition be
-simple, customizable, and efficient.
+* 📝 Text, for tasks like text classification, information extraction, question answering, summarization, translation, and text generation, in over 100 languages.
+* 🖼️ Images, for tasks like image classification, object detection, and segmentation.
+* 🗣️ Audio, for tasks like speech recognition and audio classification.

-There are over 1M+ Transformers [model checkpoints](https://huggingface.co/models?library=transformers&sort=trending) on the [Hugging Face Hub](https://huggingface.com/models) you can use.
+Transformer models can also perform tasks on **several modalities combined**, such as table question answering, optical character recognition, information extraction from scanned documents, video classification, and visual question answering.

-Explore the [Hub](https://huggingface.com/) today to find a model and use Transformers to help you get started right away.
+🤗 Transformers provides APIs to quickly download and use those pretrained models on a given text, fine-tune them on your own datasets and then share them with the community on our [model hub](https://huggingface.co/models). At the same time, each Python module defining an architecture is fully standalone and can be modified to enable quick research experiments.

-## Installation
+🤗 Transformers is backed by the three most popular deep learning libraries — [JAX](https://jax.readthedocs.io/en/latest/), [PyTorch](https://pytorch.org/) and [TensorFlow](https://www.tensorflow.org/) — with a seamless integration between them. It's straightforward to train your models with one before loading them for inference with the other.

-Transformers works with Python 3.9+, and [PyTorch](https://pytorch.org/get-started/locally/) 2.1+.
+## Online demos

-Create and activate a virtual environment with [venv](https://docs.python.org/3/library/venv.html) or [uv](https://docs.astral.sh/uv/), a fast Rust-based Python package and project manager.
+You can test most of our models directly on their pages from the [model hub](https://huggingface.co/models). We also offer [private model hosting, versioning, & an inference API](https://huggingface.co/pricing) for public and private models.

-```py
-# venv
-python -m venv .my-env
-source .my-env/bin/activate
-# uv
-uv venv .my-env
-source .my-env/bin/activate
+Here are a few examples:
+
+In Natural Language Processing:
+- [Masked word completion with BERT](https://huggingface.co/google-bert/bert-base-uncased?text=Paris+is+the+%5BMASK%5D+of+France)
+- [Named Entity Recognition with Electra](https://huggingface.co/dbmdz/electra-large-discriminator-finetuned-conll03-english?text=My+name+is+Sarah+and+I+live+in+London+city)
+- [Text generation with Mistral](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2)
+- [Natural Language Inference with RoBERTa](https://huggingface.co/FacebookAI/roberta-large-mnli?text=The+dog+was+lost.+Nobody+lost+any+animal)
+- [Summarization with BART](https://huggingface.co/facebook/bart-large-cnn?text=The+tower+is+324+metres+%281%2C063+ft%29+tall%2C+about+the+same+height+as+an+81-storey+building%2C+and+the+tallest+structure+in+Paris.+Its+base+is+square%2C+measuring+125+metres+%28410+ft%29+on+each+side.+During+its+construction%2C+the+Eiffel+Tower+surpassed+the+Washington+Monument+to+become+the+tallest+man-made+structure+in+the+world%2C+a+title+it+held+for+41+years+until+the+Chrysler+Building+in+New+York+City+was+finished+in+1930.+It+was+the+first+structure+to+reach+a+height+of+300+metres.+Due+to+the+addition+of+a+broadcasting+aerial+at+the+top+of+the+tower+in+1957%2C+it+is+now+taller+than+the+Chrysler+Building+by+5.2+metres+%2817+ft%29.+Excluding+transmitters%2C+the+Eiffel+Tower+is+the+second+tallest+free-standing+structure+in+France+after+the+Millau+Viaduct)
+- [Question answering with DistilBERT](https://huggingface.co/distilbert/distilbert-base-uncased-distilled-squad?text=Which+name+is+also+used+to+describe+the+Amazon+rainforest+in+English%3F&context=The+Amazon+rainforest+%28Portuguese%3A+Floresta+Amaz%C3%B4nica+or+Amaz%C3%B4nia%3B+Spanish%3A+Selva+Amaz%C3%B3nica%2C+Amazon%C3%ADa+or+usually+Amazonia%3B+French%3A+For%C3%AAt+amazonienne%3B+Dutch%3A+Amazoneregenwoud%29%2C+also+known+in+English+as+Amazonia+or+the+Amazon+Jungle%2C+is+a+moist+broadleaf+forest+that+covers+most+of+the+Amazon+basin+of+South+America.+This+basin+encompasses+7%2C000%2C000+square+kilometres+%282%2C700%2C000+sq+mi%29%2C+of+which+5%2C500%2C000+square+kilometres+%282%2C100%2C000+sq+mi%29+are+covered+by+the+rainforest.+This+region+includes+territory+belonging+to+nine+nations.+The+majority+of+the+forest+is+contained+within+Brazil%2C+with+60%25+of+the+rainforest%2C+followed+by+Peru+with+13%25%2C+Colombia+with+10%25%2C+and+with+minor+amounts+in+Venezuela%2C+Ecuador%2C+Bolivia%2C+Guyana%2C+Suriname+and+French+Guiana.+States+or+departments+in+four+nations+contain+%22Amazonas%22+in+their+names.+The+Amazon+represents+over+half+of+the+planet%27s+remaining+rainforests%2C+and+comprises+the+largest+and+most+biodiverse+tract+of+tropical+rainforest+in+the+world%2C+with+an+estimated+390+billion+individual+trees+divided+into+16%2C000+species)
+- [Translation with T5](https://huggingface.co/google-t5/t5-base?text=My+name+is+Wolfgang+and+I+live+in+Berlin)
+
+In Computer Vision:
+- [Image classification with ViT](https://huggingface.co/google/vit-base-patch16-224)
+- [Object Detection with DETR](https://huggingface.co/facebook/detr-resnet-50)
+- [Semantic Segmentation with SegFormer](https://huggingface.co/nvidia/segformer-b0-finetuned-ade-512-512)
+- [Panoptic Segmentation with Mask2Former](https://huggingface.co/facebook/mask2former-swin-large-coco-panoptic)
+- [Depth Estimation with Depth Anything](https://huggingface.co/docs/transformers/main/model_doc/depth_anything)
+- [Video Classification with VideoMAE](https://huggingface.co/docs/transformers/model_doc/videomae)
+- [Universal Segmentation with OneFormer](https://huggingface.co/shi-labs/oneformer_ade20k_dinat_large)
+
+In Audio:
+- [Automatic Speech Recognition with Whisper](https://huggingface.co/openai/whisper-large-v3)
+- [Keyword Spotting with Wav2Vec2](https://huggingface.co/superb/wav2vec2-base-superb-ks)
+- [Audio Classification with Audio Spectrogram Transformer](https://huggingface.co/MIT/ast-finetuned-audioset-10-10-0.4593)
+
+In Multimodal tasks:
+- [Table Question Answering with TAPAS](https://huggingface.co/google/tapas-base-finetuned-wtq)
+- [Visual Question Answering with ViLT](https://huggingface.co/dandelin/vilt-b32-finetuned-vqa)
+- [Image captioning with LLaVa](https://huggingface.co/llava-hf/llava-1.5-7b-hf)
+- [Zero-shot Image Classification with SigLIP](https://huggingface.co/google/siglip-so400m-patch14-384)
+- [Document Question Answering with LayoutLM](https://huggingface.co/impira/layoutlm-document-qa)
+- [Zero-shot Video Classification with X-CLIP](https://huggingface.co/docs/transformers/model_doc/xclip)
+- [Zero-shot Object Detection with OWLv2](https://huggingface.co/docs/transformers/en/model_doc/owlv2)
+- [Zero-shot Image Segmentation with CLIPSeg](https://huggingface.co/docs/transformers/model_doc/clipseg)
+- [Automatic Mask Generation with SAM](https://huggingface.co/docs/transformers/model_doc/sam)
+
+
+## 100 projects using Transformers
+
+Transformers is more than a toolkit to use pretrained models: it's a community of projects built around it and the
+Hugging Face Hub. We want Transformers to enable developers, researchers, students, professors, engineers, and anyone
+else to build their dream projects.
+
+In order to celebrate the 100,000 stars of transformers, we have decided to put the spotlight on the
+community, and we have created the [awesome-transformers](./awesome-transformers.md) page which lists 100
+incredible projects built in the vicinity of transformers.
+
+If you own or use a project that you believe should be part of the list, please open a PR to add it!
+
+## Serious about AI in your organisation? Build faster with the Hugging Face Enterprise Hub.
+
+<a target="_blank" href="https://huggingface.co/enterprise">
+    <img alt="Hugging Face Enterprise Hub" src="https://github.com/user-attachments/assets/247fb16d-d251-4583-96c4-d3d76dda4925">
+</a><br>
+
+## Quick tour
+
+To immediately use a model on a given input (text, image, audio, ...), we provide the `pipeline` API. Pipelines group together a pretrained model with the preprocessing that was used during that model's training. Here is how to quickly use a pipeline to classify positive versus negative texts:
+
+```python
+>>> from transformers import pipeline
+
+# Allocate a pipeline for sentiment-analysis
+>>> classifier = pipeline('sentiment-analysis')
+>>> classifier('We are very happy to introduce pipeline to the transformers repository.')
+[{'label': 'POSITIVE', 'score': 0.9996980428695679}]
 ```

-Install Transformers in your virtual environment.
+The second line of code downloads and caches the pretrained model used by the pipeline, while the third evaluates it on the given text. Here, the answer is "positive" with a confidence of 99.97%.

-```py
-# pip
-pip install "transformers[torch]"
+Many tasks have a pre-trained `pipeline` ready to go, in NLP but also in computer vision and speech. For example, we can easily extract detected objects in an image:

-# uv
-uv pip install "transformers[torch]"
+```python
+>>> import requests
+>>> from PIL import Image
+>>> from transformers import pipeline
+
+# Download an image with cute cats
+>>> url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/coco_sample.png"
+>>> image_data = requests.get(url, stream=True).raw
+>>> image = Image.open(image_data)
+
+# Allocate a pipeline for object detection
+>>> object_detector = pipeline('object-detection')
+>>> object_detector(image)
+[{'score': 0.9982201457023621,
+  'label': 'remote',
+  'box': {'xmin': 40, 'ymin': 70, 'xmax': 175, 'ymax': 117}},
+ {'score': 0.9960021376609802,
+  'label': 'remote',
+  'box': {'xmin': 333, 'ymin': 72, 'xmax': 368, 'ymax': 187}},
+ {'score': 0.9954745173454285,
+  'label': 'couch',
+  'box': {'xmin': 0, 'ymin': 1, 'xmax': 639, 'ymax': 473}},
+ {'score': 0.9988006353378296,
+  'label': 'cat',
+  'box': {'xmin': 13, 'ymin': 52, 'xmax': 314, 'ymax': 470}},
+ {'score': 0.9986783862113953,
+  'label': 'cat',
+  'box': {'xmin': 345, 'ymin': 23, 'xmax': 640, 'ymax': 368}}]
 ```

-Install Transformers from source if you want the latest changes in the library or are interested in contributing. However, the *latest* version may not be stable. Feel free to open an [issue](https://github.com/huggingface/transformers/issues) if you encounter an error.
-
-```shell
-git clone https://github.com/huggingface/transformers.git
-cd transformers
-
-# pip
-pip install '.[torch]'
-
-# uv
-uv pip install '.[torch]'
-```
-
-## Quickstart
-
-Get started with Transformers right away with the [Pipeline](https://huggingface.co/docs/transformers/pipeline_tutorial) API. The `Pipeline` is a high-level inference class that supports text, audio, vision, and multimodal tasks. It handles preprocessing the input and returns the appropriate output.
-
-Instantiate a pipeline and specify model to use for text generation. The model is downloaded and cached so you can easily reuse it again. Finally, pass some text to prompt the model.
-
-```py
-from transformers import pipeline
-
-pipeline = pipeline(task="text-generation", model="Qwen/Qwen2.5-1.5B")
-pipeline("the secret to baking a really good cake is ")
-[{'generated_text': 'the secret to baking a really good cake is 1) to use the right ingredients and 2) to follow the recipe exactly. the recipe for the cake is as follows: 1 cup of sugar, 1 cup of flour, 1 cup of milk, 1 cup of butter, 1 cup of eggs, 1 cup of chocolate chips. if you want to make 2 cakes, how much sugar do you need? To make 2 cakes, you will need 2 cups of sugar.'}]
-```
-
-To chat with a model, the usage pattern is the same. The only difference is you need to construct a chat history (the input to `Pipeline`) between you and the system.
-
-> [!TIP]
-> You can also chat with a model directly from the command line.
-> ```shell
-> transformers chat Qwen/Qwen2.5-0.5B-Instruct
-> ```
-
-```py
-import torch
-from transformers import pipeline
-
-chat = [
-    {"role": "system", "content": "You are a sassy, wise-cracking robot as imagined by Hollywood circa 1986."},
-    {"role": "user", "content": "Hey, can you tell me any fun things to do in New York?"}
-]
-
-pipeline = pipeline(task="text-generation", model="meta-llama/Meta-Llama-3-8B-Instruct", dtype=torch.bfloat16, device_map="auto")
-response = pipeline(chat, max_new_tokens=512)
-print(response[0]["generated_text"][-1]["content"])
-```
-
-Expand the examples below to see how `Pipeline` works for different modalities and tasks.
-
-<details>
-<summary>Automatic speech recognition</summary>
-
-```py
-from transformers import pipeline
-
-pipeline = pipeline(task="automatic-speech-recognition", model="openai/whisper-large-v3")
-pipeline("https://huggingface.co/datasets/Narsil/asr_dummy/resolve/main/mlk.flac")
-{'text': ' I have a dream that one day this nation will rise up and live out the true meaning of its creed.'}
-```
-
-</details>
-
-<details>
-<summary>Image classification</summary>
+Here, we get a list of objects detected in the image, with a box surrounding the object and a confidence score. Here is the original image on the left, with the predictions displayed on the right:

 <h3 align="center">
-    <a><img src="https://huggingface.co/datasets/Narsil/image_dummy/raw/main/parrots.png"></a>
+    <a><img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/coco_sample.png" width="400"></a>
+    <a><img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/coco_sample_post_processed.png" width="400"></a>
 </h3>

-```py
-from transformers import pipeline
+You can learn more about the tasks supported by the `pipeline` API in [this tutorial](https://huggingface.co/docs/transformers/task_summary).

-pipeline = pipeline(task="image-classification", model="facebook/dinov2-small-imagenet1k-1-layer")
-pipeline("https://huggingface.co/datasets/Narsil/image_dummy/raw/main/parrots.png")
-[{'label': 'macaw', 'score': 0.997848391532898},
- {'label': 'sulphur-crested cockatoo, Kakatoe galerita, Cacatua galerita',
-  'score': 0.0016551691805943847},
- {'label': 'lorikeet', 'score': 0.00018523589824326336},
- {'label': 'African grey, African gray, Psittacus erithacus',
-  'score': 7.85409429227002e-05},
- {'label': 'quail', 'score': 5.502637941390276e-05}]
+In addition to `pipeline`, to download and use any of the pretrained models on your given task, all it takes is three lines of code. Here is the PyTorch version:
+```python
+>>> from transformers import AutoTokenizer, AutoModel
+
+>>> tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
+>>> model = AutoModel.from_pretrained("google-bert/bert-base-uncased")
+
+>>> inputs = tokenizer("Hello world!", return_tensors="pt")
+>>> outputs = model(**inputs)
 ```

-</details>
+And here is the equivalent code for TensorFlow:
+```python
+>>> from transformers import AutoTokenizer, TFAutoModel

-<details>
-<summary>Visual question answering</summary>
+>>> tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
+>>> model = TFAutoModel.from_pretrained("google-bert/bert-base-uncased")

-<h3 align="center">
-    <a><img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/idefics-few-shot.jpg"></a>
-</h3>
-
-```py
-from transformers import pipeline
-
-pipeline = pipeline(task="visual-question-answering", model="Salesforce/blip-vqa-base")
-pipeline(
-    image="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/idefics-few-shot.jpg",
-    question="What is in the image?",
-)
-[{'answer': 'statue of liberty'}]
+>>> inputs = tokenizer("Hello world!", return_tensors="tf")
+>>> outputs = model(**inputs)
 ```

-</details>
+The tokenizer is responsible for all the preprocessing the pretrained model expects and can be called directly on a single string (as in the above examples) or a list. It will output a dictionary that you can use in downstream code or pass directly to your model using the `**` argument unpacking operator.

-## Why should I use Transformers?
+The model itself is a regular [PyTorch `nn.Module`](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) or a [TensorFlow `tf.keras.Model`](https://www.tensorflow.org/api_docs/python/tf/keras/Model) (depending on your backend) which you can use as usual. [This tutorial](https://huggingface.co/docs/transformers/training) explains how to integrate such a model into a classic PyTorch or TensorFlow training loop, or how to use our `Trainer` API to quickly fine-tune on a new dataset.
+
+## Why should I use transformers?

 1. Easy-to-use state-of-the-art models:
-    - High performance on natural language understanding & generation, computer vision, audio, video, and multimodal tasks.
-    - Low barrier to entry for researchers, engineers, and developers.
+    - High performance on natural language understanding & generation, computer vision, and audio tasks.
+    - Low barrier to entry for educators and practitioners.
    - Few user-facing abstractions with just three classes to learn.
    - A unified API for using all our pretrained models.

 1. Lower compute costs, smaller carbon footprint:
-    - Share trained models instead of training from scratch.
-    - Reduce compute time and production costs.
-    - Dozens of model architectures with 1M+ pretrained checkpoints across all modalities.
+    - Researchers can share trained models instead of always retraining.
+    - Practitioners can reduce compute time and production costs.
+    - Dozens of architectures with over 400,000 pretrained models across all modalities.

-1. Choose the right framework for every part of a models lifetime:
+1. Choose the right framework for every part of a model's lifetime:
    - Train state-of-the-art models in 3 lines of code.
-    - Move a single model between PyTorch/JAX/TF2.0 frameworks at will.
-    - Pick the right framework for training, evaluation, and production.
+    - Move a single model between TF2.0/PyTorch/JAX frameworks at will.
+    - Seamlessly pick the right framework for training, evaluation, and production.

 1. Easily customize a model or an example to your needs:
    - We provide examples for each architecture to reproduce the results published by its original authors.
    - Model internals are exposed as consistently as possible.
    - Model files can be used independently of the library for quick experiments.

-<a target="_blank" href="https://huggingface.co/enterprise">
-    <img alt="Hugging Face Enterprise Hub" src="https://github.com/user-attachments/assets/247fb16d-d251-4583-96c4-d3d76dda4925">
-</a><br>
-
-## Why shouldn't I use Transformers?
+## Why shouldn't I use transformers?

 - This library is not a modular toolbox of building blocks for neural nets. The code in the model files is not refactored with additional abstractions on purpose, so that researchers can quickly iterate on each of the models without diving into additional abstractions/files.
- The training API is optimized to work with PyTorch models provided by Transformers. For generic machine learning loops, you should use another library like [Accelerate](https://huggingface.co/docs/accelerate).
- The [example scripts](https://github.com/huggingface/transformers/tree/main/examples) are only *examples*. They may not necessarily work out-of-the-box on your specific use case and you'll need to adapt the code for it to work.
+- The training API is not intended to work on any model but is optimized to work with the models provided by the library. For generic machine learning loops, you should use another library (possibly, [Accelerate](https://huggingface.co/docs/accelerate)).
+- While we strive to present as many use cases as possible, the scripts in our [examples folder](https://github.com/huggingface/transformers/tree/main/examples) are just that: examples. It is expected that they won't work out-of-the-box on your specific problem and that you will be required to change a few lines of code to adapt them to your needs.

-## 100 projects using Transformers
+## Installation

-Transformers is more than a toolkit to use pretrained models, it's a community of projects built around it and the
-Hugging Face Hub. We want Transformers to enable developers, researchers, students, professors, engineers, and anyone
-else to build their dream projects.
+### With pip

-In order to celebrate Transformers 100,000 stars, we wanted to put the spotlight on the
-community with the [awesome-transformers](./awesome-transformers.md) page which lists 100
-incredible projects built with Transformers.
+This repository is tested on Python 3.9+, Flax 0.4.1+, PyTorch 1.11+, and TensorFlow 2.6+.

-If you own or use a project that you believe should be part of the list, please open a PR to add it!
+You should install 🤗 Transformers in a [virtual environment](https://docs.python.org/3/library/venv.html). If you're unfamiliar with Python virtual environments, check out the [user guide](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/).

-## Example models
+First, create a virtual environment with the version of Python you're going to use and activate it.

-You can test most of our models directly on their [Hub model pages](https://huggingface.co/models).
+Then, you will need to install at least one of Flax, PyTorch, or TensorFlow.
+Please refer to [TensorFlow installation page](https://www.tensorflow.org/install/), [PyTorch installation page](https://pytorch.org/get-started/locally/#start-locally) and/or [Flax](https://github.com/google/flax#quick-install) and [Jax](https://github.com/google/jax#installation) installation pages regarding the specific installation command for your platform.

-Expand each modality below to see a few example models for various use cases.
+When one of those backends has been installed, 🤗 Transformers can be installed using pip as follows:

-<details>
-<summary>Audio</summary>
+```bash
+pip install transformers
+```

- Audio classification with [Whisper](https://huggingface.co/openai/whisper-large-v3-turbo)
- Automatic speech recognition with [Moonshine](https://huggingface.co/UsefulSensors/moonshine)
- Keyword spotting with [Wav2Vec2](https://huggingface.co/superb/wav2vec2-base-superb-ks)
- Speech to speech generation with [Moshi](https://huggingface.co/kyutai/moshiko-pytorch-bf16)
- Text to audio with [MusicGen](https://huggingface.co/facebook/musicgen-large)
- Text to speech with [Bark](https://huggingface.co/suno/bark)
+If you'd like to play with the examples or need the bleeding edge of the code and can't wait for a new release, you must [install the library from source](https://huggingface.co/docs/transformers/installation#installing-from-source).

-</details>
+### With conda

-<details>
-<summary>Computer vision</summary>
+🤗 Transformers can be installed using conda as follows:

- Automatic mask generation with [SAM](https://huggingface.co/facebook/sam-vit-base)
- Depth estimation with [DepthPro](https://huggingface.co/apple/DepthPro-hf)
- Image classification with [DINO v2](https://huggingface.co/facebook/dinov2-base)
- Keypoint detection with [SuperPoint](https://huggingface.co/magic-leap-community/superpoint)
- Keypoint matching with [SuperGlue](https://huggingface.co/magic-leap-community/superglue_outdoor)
- Object detection with [RT-DETRv2](https://huggingface.co/PekingU/rtdetr_v2_r50vd)
- Pose Estimation with [VitPose](https://huggingface.co/usyd-community/vitpose-base-simple)
- Universal segmentation with [OneFormer](https://huggingface.co/shi-labs/oneformer_ade20k_swin_large)
- Video classification with [VideoMAE](https://huggingface.co/MCG-NJU/videomae-large)
+```shell script
+conda install conda-forge::transformers
+```

-</details>
+> **_NOTE:_** Installing `transformers` from the `huggingface` channel is deprecated.

-<details>
-<summary>Multimodal</summary>
+Follow the installation pages of Flax, PyTorch or TensorFlow to see how to install them with conda.

- Audio or text to text with [Qwen2-Audio](https://huggingface.co/Qwen/Qwen2-Audio-7B)
- Document question answering with [LayoutLMv3](https://huggingface.co/microsoft/layoutlmv3-base)
- Image or text to text with [Qwen-VL](https://huggingface.co/Qwen/Qwen2.5-VL-3B-Instruct)
- Image captioning [BLIP-2](https://huggingface.co/Salesforce/blip2-opt-2.7b)
- OCR-based document understanding with [GOT-OCR2](https://huggingface.co/stepfun-ai/GOT-OCR-2.0-hf)
- Table question answering with [TAPAS](https://huggingface.co/google/tapas-base)
- Unified multimodal understanding and generation with [Emu3](https://huggingface.co/BAAI/Emu3-Gen)
- Vision to text with [Llava-OneVision](https://huggingface.co/llava-hf/llava-onevision-qwen2-0.5b-ov-hf)
- Visual question answering with [Llava](https://huggingface.co/llava-hf/llava-1.5-7b-hf)
- Visual referring expression segmentation with [Kosmos-2](https://huggingface.co/microsoft/kosmos-2-patch14-224)
+> **_NOTE:_**  On Windows, you may be prompted to activate Developer Mode in order to benefit from caching. If this is not an option for you, please let us know in [this issue](https://github.com/huggingface/huggingface_hub/issues/1062).

-</details>
+## Model architectures

-<details>
-<summary>NLP</summary>
+**[All the model checkpoints](https://huggingface.co/models)** provided by 🤗 Transformers are seamlessly integrated from the huggingface.co [model hub](https://huggingface.co/models), where they are uploaded directly by [users](https://huggingface.co/users) and [organizations](https://huggingface.co/organizations).

- Masked word completion with [ModernBERT](https://huggingface.co/answerdotai/ModernBERT-base)
- Named entity recognition with [Gemma](https://huggingface.co/google/gemma-2-2b)
- Question answering with [Mixtral](https://huggingface.co/mistralai/Mixtral-8x7B-v0.1)
- Summarization with [BART](https://huggingface.co/facebook/bart-large-cnn)
- Translation with [T5](https://huggingface.co/google-t5/t5-base)
- Text generation with [Llama](https://huggingface.co/meta-llama/Llama-3.2-1B)
- Text classification with [Qwen](https://huggingface.co/Qwen/Qwen2.5-0.5B)
+Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://huggingface.co/api/shields/models&color=brightgreen)

-</details>
+🤗 Transformers currently provides the following architectures: see [here](https://huggingface.co/docs/transformers/model_summary) for a high-level summary of each of them.
+
+To check if each model has an implementation in Flax, PyTorch or TensorFlow, or has an associated tokenizer backed by the 🤗 Tokenizers library, refer to [this table](https://huggingface.co/docs/transformers/index#supported-frameworks).
+
+These implementations have been tested on several datasets (see the example scripts) and should match the performance of the original implementations. You can find more details on performance in the Examples section of the [documentation](https://github.com/huggingface/transformers/tree/main/examples).
+
+
+## Learn more
+
+| Section | Description |
+|-|-|
+| [Documentation](https://huggingface.co/docs/transformers/) | Full API documentation and tutorials |
+| [Task summary](https://huggingface.co/docs/transformers/task_summary) | Tasks supported by 🤗 Transformers |
+| [Preprocessing tutorial](https://huggingface.co/docs/transformers/preprocessing) | Using the `Tokenizer` class to prepare data for the models |
+| [Training and fine-tuning](https://huggingface.co/docs/transformers/training) | Using the models provided by 🤗 Transformers in a PyTorch/TensorFlow training loop and the `Trainer` API |
+| [Quick tour: Fine-tuning/usage scripts](https://github.com/huggingface/transformers/tree/main/examples) | Example scripts for fine-tuning models on a wide range of tasks |
+| [Model sharing and uploading](https://huggingface.co/docs/transformers/model_sharing) | Upload and share your fine-tuned models with the community |

 ## Citation

--- a/SECURITY.md
+++ b/SECURITY.md
@ -14,7 +14,7 @@ Models uploaded on the Hugging Face Hub come in different formats. We heavily re
 models in the [`safetensors`](https://github.com/huggingface/safetensors) format (which is the default prioritized
 by the transformers library), as developed specifically to prevent arbitrary code execution on your system.

-To avoid loading models from unsafe formats (e.g. [pickle](https://docs.python.org/3/library/pickle.html), you should use the `use_safetensors` parameter. If doing so, in the event that no .safetensors file is present, transformers will error when loading the model.
+To avoid loading models from unsafe formats (e.g. [pickle](https://docs.python.org/3/library/pickle.html)), you should use the `use_safetensors` parameter. If doing so, in the event that no .safetensors file is present, transformers will error when loading the model.

 ### Remote code

@ -27,6 +27,13 @@ These models require the `trust_remote_code=True` parameter to be set when using
 the content of the modeling files when using this argument. We recommend setting a revision in order to ensure you
 protect yourself from updates on the repository.

+#### Tools
+
+Through the `Agent` framework, remote tools can be downloaded to be used by the Agent. You're to specify these tools
+yourself, but please keep in mind that their code will be run on your machine if the Agent chooses to run them.
+
+Please inspect the code of the tools before passing them to the Agent to protect your runtime and local setup.
+
 ## Reporting a Vulnerability

 Feel free to submit vulnerability reports to [security@huggingface.co](mailto:security@huggingface.co), where someone from the HF security team will review and recommend next steps. If reporting a vulnerability specific to open source, please note [Huntr](https://huntr.com) is a vulnerability disclosure program for open source software.
--- a/awesome-transformers.md
+++ b/awesome-transformers.md
@ -6,7 +6,7 @@ developers, researchers, students, professors, engineers, and anyone else to bui

 In this list, we showcase incredibly impactful and novel projects that have pushed the field forward. We celebrate
 100 of these projects as we reach the milestone of 100k stars as a community; but we're very open to pull requests
-adding other projects to the list. If you believe a project should be here and it's not, then please, open a PR
+adding other projects to the list. If you believe a project should be here and it's not, then please, open a PR 
 to add it.

 ## [gpt4all](https://github.com/nomic-ai/gpt4all)
@ -15,7 +15,7 @@ to add it.

 Keywords: Open-source, LLaMa, GPT-J, instruction, assistant

-## [recommenders](https://github.com/recommenders-team/recommenders)
+## [recommenders](https://github.com/microsoft/recommenders)

 This repository contains examples and best practices for building recommendation systems, provided as Jupyter notebooks. It goes over several aspects required to build efficient recommendation systems: data preparation, modeling, evaluation, model selection & optimization, as well as operationalization

@ -29,7 +29,7 @@ Keywords: inpainting, SD, Stable Diffusion

 ## [flair](https://github.com/flairNLP/flair)

-FLAIR is a powerful PyTorch NLP framework, covering several important tasks: NER, sentiment-analysis, part-of-speech tagging, text and document embeddings, among other things.
+FLAIR is a powerful PyTorch NLP framework, covering several important tasks: NER, sentiment-analysis, part-of-speech tagging, text and document embeddings, among other things.

 Keywords: NLP, text embedding, document embedding, biomedical, NER, PoS, sentiment-analysis

@ -39,17 +39,17 @@ MindsDB is a low-code ML platform, which automates and integrates several ML fra

 Keywords: Database, low-code, AI table

-## [langchain](https://github.com/langchain-ai/langchain)
+## [langchain](https://github.com/hwchase17/langchain)

-[langchain](https://github.com/langchain-ai/langchain) is aimed at assisting in the development of apps merging both LLMs and other sources of knowledge. The library allows chaining calls to applications, creating a sequence across many tools.
+[langchain](https://github.com/hwchase17/langchain) is aimed at assisting in the development of apps merging both LLMs and other sources of knowledge. The library allows chaining calls to applications, creating a sequence across many tools.

 Keywords: LLMs, Large Language Models, Agents, Chains

-## [LlamaIndex](https://github.com/run-llama/llama_index)
+## [LlamaIndex](https://github.com/jerryjliu/llama_index)

-[LlamaIndex](https://github.com/run-llama/llama_index) is a project that provides a central interface to connect your LLM's with external data. It provides various kinds of indices and retrieval mechanisms to perform different LLM tasks and obtain knowledge-augmented results.
+[LlamaIndex](https://github.com/jerryjliu/llama_index) is a project that provides a central interface to connect your LLM's with external data. It provides various kinds of indices and retrieval mechanisms to perform different LLM tasks and obtain knowledge-augmented results.

-Keywords: LLMs, Large Language Models, Data Retrieval, Indices, Knowledge Augmentation
+Keywords: LLMs, Large Language Models, Data Retrieval, Indices, Knowledge Augmentation 

 ## [ParlAI](https://github.com/facebookresearch/ParlAI)

@ -146,9 +146,9 @@ Keywords: Framework, simplicity, NLP

 Keywords: LLM, Agents, HF Hub

-## [transformers.js](https://github.com/huggingface/transformers.js/)
+## [transformers.js](https://xenova.github.io/transformers.js/)

-[transformers.js](https://github.com/huggingface/transformers.js/) is a JavaScript library targeted at running models from transformers directly within the browser.
+[transformers.js](https://xenova.github.io/transformers.js/) is a JavaScript library targeted at running models from transformers directly within the browser.

 Keywords: Transformers, JavaScript, browser

@ -257,7 +257,7 @@ Stable-Dreamfusion is a pytorch implementation of the text-to-3D model Dreamfusi
 Keywords: Text-to-3D, Stable Diffusion

 ## [txtai](https://github.com/neuml/txtai)
-
+ 
 [txtai](https://github.com/neuml/txtai) is an open-source platform for semantic search and workflows powered by language models. txtai builds embeddings databases, which are a union of vector indexes and relational databases enabling similarity search with SQL. Semantic workflows connect language models together into unified applications.

 Keywords: Semantic search, LLM
@ -288,7 +288,7 @@ Keywords: Music understanding, Music generation

 ## [dalle-flow](https://github.com/jina-ai/dalle-flow)

-DALL·E Flow is an interactive workflow for generating high-definition images from a text prompt. It leverages DALL·E-Mega, GLID-3 XL, and Stable Diffusion to generate image candidates, and then calls CLIP-as-service to rank the candidates w.r.t. the prompt.
+DALL·E Flow is an interactive workflow for generating high-definition images from a text prompt. It leverages DALL·E-Mega, GLID-3 XL, and Stable Diffusion to generate image candidates, and then calls CLIP-as-service to rank the candidates w.r.t. the prompt.
 The preferred candidate is fed to GLID-3 XL for diffusion, which often enriches the texture and background. Finally, the candidate is upscaled to 1024x1024 via SwinIR.

 Keywords: High-definition image generation, Stable Diffusion, DALL-E Mega, GLID-3 XL, CLIP, SwinIR
@ -309,8 +309,8 @@ Keywords: OCR, LaTeX, Math formula

 OpenCLIP is an open source implementation of OpenAI's CLIP.

-The goal of this repository is to enable training models with contrastive image-text supervision, and to investigate their properties such as robustness to distribution shift.
-The starting point is an implementation of CLIP that matches the accuracy of the original CLIP models when trained on the same dataset.
+The goal of this repository is to enable training models with contrastive image-text supervision, and to investigate their properties such as robustness to distribution shift. 
+The starting point is an implementation of CLIP that matches the accuracy of the original CLIP models when trained on the same dataset. 

 Specifically, a ResNet-50 model trained with this codebase on OpenAI's 15 million image subset of YFCC achieves 32.7% top-1 accuracy on ImageNet.

@ -437,7 +437,7 @@ Keywords: DALL-E, Russian

 Keywords: Knowledge Extraction, Knowledge Graphs

-## [Nebuly](https://github.com/nebuly-ai/optimate)
+## [Nebuly](https://github.com/nebuly-ai/nebuly)

 Nebuly is the next-generation platform to monitor and optimize your AI costs in one place. The platform connects to all your AI cost sources (compute, API providers, AI software licenses, etc) and centralizes them in one place to give you full visibility on a model basis. The platform also provides optimization recommendations and a co-pilot model that can guide during the optimization process. The platform builds on top of the open-source tools allowing you to optimize the different steps of your AI stack to squeeze out the best possible cost performances.

@ -526,7 +526,7 @@ Keywords: Model deployment, CLoud, Mobile, Edge

 ## [underthesea](https://github.com/undertheseanlp/underthesea)

-[underthesea](https://github.com/undertheseanlp/underthesea) is a Vietnamese NLP toolkit. Underthesea is a suite of open source Python modules data sets and tutorials supporting research and development in Vietnamese Natural Language Processing. We provide extremely easy API to quickly apply pretrained NLP models to your Vietnamese text, such as word segmentation, part-of-speech tagging (PoS), named entity recognition (NER), text classification and dependency parsing.
+[underthesea](https://github.com/undertheseanlp/underthesea) is a Vietnamese NLP toolkit. Underthesea is a suite of open source Python modules, data sets, and tutorials supporting research and development in Vietnamese Natural Language Processing. We provide an extremely easy API to quickly apply pretrained NLP models to your Vietnamese text, such as word segmentation, part-of-speech tagging (PoS), named entity recognition (NER), text classification and dependency parsing.

 Keywords: Vietnamese, NLP

@ -596,7 +596,7 @@ Keywords: Data-Centric AI, Data Quality, Noisy Labels, Outlier Detection, Active

 ## [BentoML](https://github.com/bentoml/BentoML)

-[BentoML](https://github.com/bentoml) is the unified framework for building, shipping, and scaling production-ready AI applications incorporating traditional ML, pre-trained AI models, Generative and Large Language Models.
+[BentoML](https://github.com/bentoml) is the unified framework for building, shipping, and scaling production-ready AI applications incorporating traditional ML, pre-trained AI models, Generative and Large Language Models. 
 All Hugging Face models and pipelines can be seamlessly integrated into BentoML applications, enabling the running of models on the most suitable hardware and independent scaling based on usage.

 Keywords: BentoML, Framework, Deployment, AI Applications
@ -606,3 +606,4 @@ Keywords: BentoML, Framework, Deployment, AI Applications
 [LLaMA Factory](https://github.com/hiyouga/LLaMA-Factory) offers a user-friendly fine-tuning framework that incorporates PEFT. The repository includes training(fine-tuning) and inference examples for LLaMA-2, BLOOM, Falcon, Baichuan, Qwen, and other LLMs. A ChatGLM version is also available in [ChatGLM-Efficient-Tuning](https://github.com/hiyouga/ChatGLM-Efficient-Tuning).

 Keywords: PEFT, fine-tuning, LLaMA-2, ChatGLM, Qwen
+
--- a/benchmark/.gitignore
+++ b/benchmark/.gitignore
@ -1 +0,0 @@
-benchmark_results/
--- a/benchmark/README.md
+++ b/benchmark/README.md
@ -1,49 +0,0 @@
-# Benchmarks
-
-You might want to add new benchmarks.
-
-You will need to define a python function named `run_benchmark` in your python file and the file must be located in this `benchmark/` directory.
-
-The expected function signature is the following:
-
-```py
-def run_benchmark(logger: Logger, branch: str, commit_id: str, commit_msg: str, num_tokens_to_generate=100):
-```
-
-## Writing metrics to the database
-
-`MetricsRecorder` is thread-safe, in the sense of the python [`Thread`](https://docs.python.org/3/library/threading.html#threading.Thread). This means you can start a background thread to do the readings on the device measurements while not blocking the main thread to execute the model measurements.
-
-cf [`llama.py`](./llama.py) to see an example of this in practice.
-
-```py
-from benchmarks_entrypoint import MetricsRecorder
-import psycopg2
-
-def run_benchmark(logger: Logger, branch: str, commit_id: str, commit_msg: str, num_tokens_to_generate=100):
-  metrics_recorder = MetricsRecorder(psycopg2.connect("dbname=metrics"), logger, branch, commit_id, commit_msg)
-  benchmark_id = metrics_recorder.initialise_benchmark({"gpu_name": gpu_name, "model_id": model_id})
-    # To collect device measurements
-    metrics_recorder.collect_device_measurements(
-        benchmark_id, cpu_util, mem_megabytes, gpu_util, gpu_mem_megabytes
-    )
-    # To collect your model measurements
-    metrics_recorder.collect_model_measurements(
-        benchmark_id,
-        {
-            "model_load_time": model_load_time,
-            "first_eager_forward_pass_time_secs": first_eager_fwd_pass_time,
-            "second_eager_forward_pass_time_secs": second_eager_fwd_pass_time,
-            "first_eager_generate_time_secs": first_eager_generate_time,
-            "second_eager_generate_time_secs": second_eager_generate_time,
-            "time_to_first_token_secs": time_to_first_token,
-            "time_to_second_token_secs": time_to_second_token,
-            "time_to_third_token_secs": time_to_third_token,
-            "time_to_next_token_mean_secs": mean_time_to_next_token,
-            "first_compile_generate_time_secs": first_compile_generate_time,
-            "second_compile_generate_time_secs": second_compile_generate_time,
-            "third_compile_generate_time_secs": third_compile_generate_time,
-            "fourth_compile_generate_time_secs": fourth_compile_generate_time,
-        },
-    )
-```
--- a/benchmark/benches/llama.py
+++ b/benchmark/benches/llama.py
@ -1,354 +0,0 @@
-# Copyright 2025 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import os
-import sys
-from logging import Logger
-from threading import Event, Thread
-from time import perf_counter, sleep
-from typing import Optional
-
-
-# Add the parent directory to Python path to import benchmarks_entrypoint
-sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
-import gpustat
-import psutil
-import psycopg2
-from benchmarks_entrypoint import MetricsRecorder
-
-
-# Optional heavy ML dependencies - only required when actually running the benchmark
-try:
-    import torch
-
-    from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig, StaticCache
-
-    TRANSFORMERS_AVAILABLE = True
-except ImportError:
-    TRANSFORMERS_AVAILABLE = False
-    torch = None
-    AutoModelForCausalLM = None
-    AutoTokenizer = None
-    GenerationConfig = None
-    StaticCache = None
-
-os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
-os.environ["TOKENIZERS_PARALLELISM"] = "1"
-
-# Only set torch precision if torch is available
-if TRANSFORMERS_AVAILABLE:
-    torch.set_float32_matmul_precision("high")
-
-
-def collect_metrics(benchmark_id, continue_metric_collection, metrics_recorder):
-    p = psutil.Process(os.getpid())
-    while not continue_metric_collection.is_set():
-        with p.oneshot():
-            cpu_util = p.cpu_percent()
-            mem_megabytes = p.memory_info().rss / (1024 * 1024)
-        gpu_stats = gpustat.GPUStatCollection.new_query()
-        gpu_util = gpu_stats[0]["utilization.gpu"]
-        gpu_mem_megabytes = gpu_stats[0]["memory.used"]
-        metrics_recorder.collect_device_measurements(
-            benchmark_id, cpu_util, mem_megabytes, gpu_util, gpu_mem_megabytes
-        )
-        sleep(0.01)
-
-
-def run_benchmark(
-    logger: Logger,
-    repository: str,
-    branch: str,
-    commit_id: str,
-    commit_msg: str,
-    metrics_recorder=None,
-    num_tokens_to_generate=100,
-):
-    # Check if required ML dependencies are available
-    if not TRANSFORMERS_AVAILABLE:
-        logger.error("Transformers and torch are required to run the LLaMA benchmark. Please install them with:")
-        logger.error("pip install torch transformers")
-        logger.error("Skipping LLaMA benchmark due to missing dependencies.")
-        return
-
-    continue_metric_collection = Event()
-    metrics_thread = None
-    model_id = "meta-llama/Llama-2-7b-hf"
-
-    # If no metrics_recorder is provided, create one for backward compatibility
-    if metrics_recorder is None:
-        try:
-            metrics_recorder = MetricsRecorder(
-                psycopg2.connect("dbname=metrics"), logger, repository, branch, commit_id, commit_msg, True
-            )
-            should_close_recorder = True
-        except Exception as e:
-            logger.error(f"Failed to create metrics recorder: {e}")
-            return
-    else:
-        should_close_recorder = False
-    try:
-        gpu_stats = gpustat.GPUStatCollection.new_query()
-        gpu_name = gpu_stats[0]["name"]
-        benchmark_id = metrics_recorder.initialise_benchmark({"gpu_name": gpu_name, "model_id": model_id})
-        logger.info(f"running benchmark #{benchmark_id} on {gpu_name} for {model_id}")
-        metrics_thread = Thread(
-            target=collect_metrics,
-            args=[benchmark_id, continue_metric_collection, metrics_recorder],
-        )
-        metrics_thread.start()
-        logger.info("started background thread to fetch device metrics")
-
-        os.environ["TOKENIZERS_PARALLELISM"] = "false"  # silence warnings when compiling
-
-        device = "cuda"
-
-        logger.info("downloading weights")
-        # This is to avoid counting download in model load time measurement
-        model = AutoModelForCausalLM.from_pretrained(model_id, dtype=torch.float16)
-        gen_config = GenerationConfig(do_sample=False, top_p=1, temperature=1)
-        logger.info("loading model")
-        start = perf_counter()
-        model = AutoModelForCausalLM.from_pretrained(
-            model_id, dtype=torch.float16, generation_config=gen_config
-        ).eval()
-        model.to(device)
-        torch.cuda.synchronize()
-        end = perf_counter()
-        model_load_time = end - start
-        logger.info(f"loaded model in: {model_load_time}s")
-
-        tokenizer = AutoTokenizer.from_pretrained(model_id)
-
-        prompt = "Why dogs are so cute?"
-        inputs = tokenizer(prompt, return_tensors="pt").to(device)
-
-        # Specify the max length (including both the prompt and the response)
-        # When calling `generate` with `cache_implementation="static" later, this is also used to create a `StaticCache` object
-        # with sequence length = `max_length`. The longer the more you will re-use it
-        seq_length = inputs["input_ids"].shape[1]
-        model.generation_config.max_length = seq_length + num_tokens_to_generate
-        batch_size = inputs["input_ids"].shape[0]
-
-        # Copied from the gpt-fast repo
-        def multinomial_sample_one_no_sync(probs_sort):  # Does multinomial sampling without a cuda synchronization
-            q = torch.empty_like(probs_sort).exponential_(1)
-            return torch.argmax(probs_sort / q, dim=-1, keepdim=True).to(dtype=torch.int)
-
-        def logits_to_probs(logits, temperature: float = 1.0, top_k: Optional[int] = None):
-            logits = logits / max(temperature, 1e-5)
-
-            if top_k is not None:
-                v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
-                pivot = v.select(-1, -1).unsqueeze(-1)
-                logits = torch.where(logits < pivot, -float("Inf"), logits)
-            probs = torch.nn.functional.softmax(logits, dim=-1)
-            return probs
-
-        def sample(logits, temperature: float = 1.0, top_k: Optional[int] = None):
-            probs = logits_to_probs(logits[0, -1], temperature, top_k)
-            idx_next = multinomial_sample_one_no_sync(probs)
-            return idx_next, probs
-
-        # First eager forward pass
-        logger.info("running first eager forward pass")
-        start = perf_counter()
-        _ = model(**inputs)
-        torch.cuda.synchronize()
-        end = perf_counter()
-        first_eager_fwd_pass_time = end - start
-        logger.info(f"completed first eager forward pass in: {first_eager_fwd_pass_time}s")
-
-        # Second eager forward pass (should be faster)
-        logger.info("running second eager forward pass")
-        start = perf_counter()
-        _ = model(**inputs)
-        torch.cuda.synchronize()
-        end = perf_counter()
-        second_eager_fwd_pass_time = end - start
-        logger.info(f"completed second eager forward pass in: {second_eager_fwd_pass_time}s")
-
-        # First eager generation
-        logger.info("running first eager generation")
-        start = perf_counter()
-        output = model.generate(**inputs)
-        torch.cuda.synchronize()
-        end = perf_counter()
-        first_eager_generate_time = end - start
-        logger.info(f"completed first eager generation in: {first_eager_generate_time}s")
-        logger.info(f"generated: {tokenizer.batch_decode(output.cpu().tolist())}")
-
-        # Second eager generation (should be faster)
-        logger.info("running second eager generation")
-        start = perf_counter()
-        output = model.generate(**inputs)
-        torch.cuda.synchronize()
-        end = perf_counter()
-        second_eager_generate_time = end - start
-        logger.info(f"completed second eager generation in: {second_eager_generate_time}s")
-        logger.info(f"generated: {tokenizer.batch_decode(output.cpu().tolist())}")
-
-        logger.info("running generation timing loop")
-
-        input_pos = torch.arange(0, seq_length, device=device)
-        inputs = inputs["input_ids"]
-
-        start = perf_counter()
-        with torch.nn.attention.sdpa_kernel(torch.nn.attention.SDPBackend.MATH):
-            logits = model(inputs, position_ids=input_pos).logits
-        next_token, probs = sample(logits, temperature=0.6, top_k=5)
-        torch.cuda.synchronize()
-        end = perf_counter()
-        time_to_first_token = end - start
-
-        input_pos = torch.tensor([seq_length], device=device, dtype=torch.int)
-        next_token = next_token.clone()
-        start = perf_counter()
-        with torch.nn.attention.sdpa_kernel(torch.nn.attention.SDPBackend.MATH):
-            logits = model(next_token, position_ids=input_pos).logits
-        next_token, probs = sample(logits, temperature=0.6, top_k=5)
-        torch.cuda.synchronize()
-        end = perf_counter()
-        time_to_second_token = end - start
-
-        input_pos = torch.tensor([seq_length + 1], device=device, dtype=torch.int)
-        next_token = next_token.clone()
-        start = perf_counter()
-        with torch.nn.attention.sdpa_kernel(torch.nn.attention.SDPBackend.MATH):
-            logits = model(next_token, position_ids=input_pos).logits
-        next_token, probs = sample(logits, temperature=0.6, top_k=5)
-        torch.cuda.synchronize()
-        end = perf_counter()
-        time_to_third_token = end - start
-
-        logger.info("running longer generation timing loop")
-
-        total_time = 0
-        for i in range(20):
-            input_pos = torch.tensor([seq_length + 2 + i], device=device, dtype=torch.int)
-            next_token = next_token.clone()
-            start = perf_counter()
-            with torch.nn.attention.sdpa_kernel(torch.nn.attention.SDPBackend.MATH):
-                logits = model(next_token, position_ids=input_pos).logits
-            next_token, probs = sample(logits, temperature=0.6, top_k=5)
-            torch.cuda.synchronize()
-            end = perf_counter()
-            total_time += end - start
-
-        mean_time_to_next_token = total_time / 20
-
-        logger.info("running compilation benchmarks")
-
-        # Now compile the model
-        model = torch.compile(model, mode="max-autotune", fullgraph=True)
-
-        # StaticCache for generation
-        with torch.device(device):
-            model.setup_caches(max_batch_size=batch_size, max_seq_len=seq_length + num_tokens_to_generate)
-
-        input_pos = torch.arange(0, seq_length, device=device)
-        inputs = tokenizer(prompt, return_tensors="pt").to(device)["input_ids"]
-
-        logger.info("compiling model")
-
-        model = AutoModelForCausalLM.from_pretrained(model_id, dtype=torch.float16, generation_config=gen_config)
-        model.to(device)
-        model = torch.compile(model, mode="max-autotune", fullgraph=True)
-
-        past_key_values = StaticCache(
-            model.config,
-            max_batch_size=batch_size,
-            device=device,
-            dtype=torch.float16,
-            max_cache_len=seq_length + 128,
-        )
-        # 1st call
-        start = perf_counter()
-        output = model.generate(**inputs, past_key_values=past_key_values)
-        end = perf_counter()
-        first_compile_generate_time = end - start
-        logger.info(f"completed first compile generation in: {first_compile_generate_time}s")
-        logger.info(f"generated: {tokenizer.batch_decode(output.cpu().tolist())}")
-
-        past_key_values = StaticCache(
-            model.config,
-            max_batch_size=batch_size,
-            device=device,
-            dtype=torch.float16,
-            max_cache_len=seq_length + 128,
-        )
-        # 2nd call
-        start = perf_counter()
-        output = model.generate(**inputs, past_key_values=past_key_values)
-        end = perf_counter()
-        second_compile_generate_time = end - start
-        logger.info(f"completed second compile generation in: {second_compile_generate_time}s")
-        logger.info(f"generated: {tokenizer.batch_decode(output.cpu().tolist())}")
-
-        past_key_values = StaticCache(
-            model.config,
-            max_batch_size=batch_size,
-            device=device,
-            dtype=torch.float16,
-            max_cache_len=seq_length + 128,
-        )
-        # 3rd call
-        start = perf_counter()
-        output = model.generate(**inputs, past_key_values=past_key_values)
-        end = perf_counter()
-        third_compile_generate_time = end - start
-        logger.info(f"completed third compile generation in: {third_compile_generate_time}s")
-        logger.info(f"generated: {tokenizer.batch_decode(output.cpu().tolist())}")
-
-        past_key_values = StaticCache(
-            model.config,
-            max_batch_size=batch_size,
-            device=device,
-            dtype=torch.float16,
-            max_cache_len=seq_length + 128,
-        )
-        # 4th call
-        start = perf_counter()
-        output = model.generate(**inputs, past_key_values=past_key_values)
-        end = perf_counter()
-        fourth_compile_generate_time = end - start
-        logger.info(f"completed fourth compile generation in: {fourth_compile_generate_time}s")
-        logger.info(f"generated: {tokenizer.batch_decode(output.cpu().tolist())}")
-
-        metrics_recorder.collect_model_measurements(
-            benchmark_id,
-            {
-                "model_load_time": model_load_time,
-                "first_eager_forward_pass_time_secs": first_eager_fwd_pass_time,
-                "second_eager_forward_pass_time_secs": second_eager_fwd_pass_time,
-                "first_eager_generate_time_secs": first_eager_generate_time,
-                "second_eager_generate_time_secs": second_eager_generate_time,
-                "time_to_first_token_secs": time_to_first_token,
-                "time_to_second_token_secs": time_to_second_token,
-                "time_to_third_token_secs": time_to_third_token,
-                "time_to_next_token_mean_secs": mean_time_to_next_token,
-                "first_compile_generate_time_secs": first_compile_generate_time,
-                "second_compile_generate_time_secs": second_compile_generate_time,
-                "third_compile_generate_time_secs": third_compile_generate_time,
-                "fourth_compile_generate_time_secs": fourth_compile_generate_time,
-            },
-        )
-    except Exception as e:
-        logger.error(f"Caught exception: {e}")
-    continue_metric_collection.set()
-    if metrics_thread is not None:
-        metrics_thread.join()
-
-    # Only close the recorder if we created it locally
-    if should_close_recorder:
-        metrics_recorder.close()
--- a/benchmark/benchmark.py
+++ b/benchmark/benchmark.py
@ -31,7 +31,9 @@ from contextlib import contextmanager
 from pathlib import Path

 from git import Repo
+
 from huggingface_hub import HfApi
+
 from optimum_benchmark import Benchmark
 from optimum_benchmark_wrapper import main

@ -88,7 +90,7 @@ def summarize(run_dir, metrics, expand_metrics=False):

        model = benchmark.config.backend["model"]

-        # This looks like `benchmark.input_shapes.batch_size=1,benchmark.input_shapes.sequence_length=5`.
+        # This looks like `benchmark.input_shapes.batch_size=1,benchmark.input_shapes.sequence_length=5`.
        # (we rely on the usage of hydra's `${hydra.job.override_dirname}`.)
        benchmark_name = re.sub(f"backend.model={model},*", "", report_dir)
        benchmark_name = str(Path(benchmark_name).parts[-1])
--- a/benchmark/benchmarks_entrypoint.py
+++ b/benchmark/benchmarks_entrypoint.py
@ -1,502 +0,0 @@
-# Copyright 2025 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import argparse
-import importlib.util
-import json
-import logging
-import os
-import sys
-import uuid
-from datetime import datetime
-
-import pandas as pd
-
-
-try:
-    from psycopg2.extensions import register_adapter
-    from psycopg2.extras import Json
-
-    register_adapter(dict, Json)
-    PSYCOPG2_AVAILABLE = True
-except ImportError:
-    PSYCOPG2_AVAILABLE = False
-
-
-class ImportModuleException(Exception):
-    pass
-
-
-class MetricsRecorder:
-    def __init__(
-        self,
-        connection,
-        logger: logging.Logger,
-        repository: str,
-        branch: str,
-        commit_id: str,
-        commit_msg: str,
-        collect_csv_data: bool = True,
-    ):
-        self.conn = connection
-        self.use_database = connection is not None
-        if self.use_database:
-            self.conn.autocommit = True
-        self.logger = logger
-        self.repository = repository
-        self.branch = branch
-        self.commit_id = commit_id
-        self.commit_msg = commit_msg
-        self.collect_csv_data = collect_csv_data
-
-        # For CSV export - store all data in pandas DataFrames (only if CSV collection is enabled)
-        if self.collect_csv_data:
-            # Initialize empty DataFrames with proper schemas
-            self.benchmarks_df = pd.DataFrame(
-                columns=[
-                    "benchmark_id",
-                    "repository",
-                    "branch",
-                    "commit_id",
-                    "commit_message",
-                    "metadata",
-                    "created_at",
-                ]
-            )
-            self.device_measurements_df = pd.DataFrame(
-                columns=["benchmark_id", "cpu_util", "mem_megabytes", "gpu_util", "gpu_mem_megabytes", "time"]
-            )
-            self.model_measurements_df = pd.DataFrame(
-                columns=[
-                    "benchmark_id",
-                    "time",
-                    "model_load_time",
-                    "first_eager_forward_pass_time_secs",
-                    "second_eager_forward_pass_time_secs",
-                    "first_eager_generate_time_secs",
-                    "second_eager_generate_time_secs",
-                    "time_to_first_token_secs",
-                    "time_to_second_token_secs",
-                    "time_to_third_token_secs",
-                    "time_to_next_token_mean_secs",
-                    "first_compile_generate_time_secs",
-                    "second_compile_generate_time_secs",
-                    "third_compile_generate_time_secs",
-                    "fourth_compile_generate_time_secs",
-                ]
-            )
-        else:
-            self.benchmarks_df = None
-            self.device_measurements_df = None
-            self.model_measurements_df = None
-
-    def initialise_benchmark(self, metadata: dict[str, str]) -> str:
-        """
-        Creates a new benchmark, returns the benchmark id (UUID)
-        """
-        # Generate a unique UUID for this benchmark
-        benchmark_id = str(uuid.uuid4())
-
-        if self.use_database:
-            with self.conn.cursor() as cur:
-                cur.execute(
-                    "INSERT INTO benchmarks (benchmark_id, repository, branch, commit_id, commit_message, metadata) VALUES (%s, %s, %s, %s, %s, %s)",
-                    (benchmark_id, self.repository, self.branch, self.commit_id, self.commit_msg, metadata),
-                )
-                self.logger.debug(f"initialised benchmark #{benchmark_id}")
-
-        # Store benchmark data for CSV export (if enabled)
-        if self.collect_csv_data:
-            # Add row to pandas DataFrame
-            new_row = pd.DataFrame(
-                [
-                    {
-                        "benchmark_id": benchmark_id,
-                        "repository": self.repository,
-                        "branch": self.branch,
-                        "commit_id": self.commit_id,
-                        "commit_message": self.commit_msg,
-                        "metadata": json.dumps(metadata),
-                        "created_at": datetime.utcnow().isoformat(),
-                    }
-                ]
-            )
-            self.benchmarks_df = pd.concat([self.benchmarks_df, new_row], ignore_index=True)
-
-        mode_info = []
-        if self.use_database:
-            mode_info.append("database")
-        if self.collect_csv_data:
-            mode_info.append("CSV")
-        mode_str = " + ".join(mode_info) if mode_info else "no storage"
-
-        self.logger.debug(f"initialised benchmark #{benchmark_id} ({mode_str} mode)")
-        return benchmark_id
-
-    def collect_device_measurements(self, benchmark_id: str, cpu_util, mem_megabytes, gpu_util, gpu_mem_megabytes):
-        """
-        Collect device metrics, such as CPU & GPU usage. These are "static", as in you cannot pass arbitrary arguments to the function.
-        """
-        # Store device measurements for CSV export (if enabled)
-        if self.collect_csv_data:
-            # Add row to pandas DataFrame
-            new_row = pd.DataFrame(
-                [
-                    {
-                        "benchmark_id": benchmark_id,
-                        "cpu_util": cpu_util,
-                        "mem_megabytes": mem_megabytes,
-                        "gpu_util": gpu_util,
-                        "gpu_mem_megabytes": gpu_mem_megabytes,
-                        "time": datetime.utcnow().isoformat(),
-                    }
-                ]
-            )
-            self.device_measurements_df = pd.concat([self.device_measurements_df, new_row], ignore_index=True)
-
-        # Store in database if available
-        if self.use_database:
-            with self.conn.cursor() as cur:
-                cur.execute(
-                    "INSERT INTO device_measurements (benchmark_id, cpu_util, mem_megabytes, gpu_util, gpu_mem_megabytes) VALUES (%s, %s, %s, %s, %s)",
-                    (benchmark_id, cpu_util, mem_megabytes, gpu_util, gpu_mem_megabytes),
-                )
-
-        self.logger.debug(
-            f"collected device measurements for benchmark #{benchmark_id} [CPU util: {cpu_util}, mem MBs: {mem_megabytes}, GPU util: {gpu_util}, GPU mem MBs: {gpu_mem_megabytes}]"
-        )
-
-    def collect_model_measurements(self, benchmark_id: str, measurements: dict[str, float]):
-        # Store model measurements for CSV export (if enabled)
-        if self.collect_csv_data:
-            # Add row to pandas DataFrame with flattened measurements
-            row_data = {"benchmark_id": benchmark_id, "time": datetime.utcnow().isoformat()}
-            # Flatten the measurements dict into the row
-            row_data.update(measurements)
-
-            new_row = pd.DataFrame([row_data])
-            self.model_measurements_df = pd.concat([self.model_measurements_df, new_row], ignore_index=True)
-
-        # Store in database if available
-        if self.use_database:
-            with self.conn.cursor() as cur:
-                cur.execute(
-                    """
-                    INSERT INTO model_measurements (
-                        benchmark_id,
-                        measurements
-                    ) VALUES (%s, %s)
-                    """,
-                    (
-                        benchmark_id,
-                        measurements,
-                    ),
-                )
-
-        self.logger.debug(f"collected model measurements for benchmark #{benchmark_id}: {measurements}")
-
-    def export_to_csv(self, output_dir: str = "benchmark_results"):
-        """
-        Export all collected data to CSV files using pandas DataFrames
-        """
-        if not self.collect_csv_data:
-            self.logger.warning("CSV data collection is disabled - no CSV files will be generated")
-            return
-
-        if not os.path.exists(output_dir):
-            os.makedirs(output_dir)
-            self.logger.info(f"Created output directory: {output_dir}")
-
-        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
-        files_created = []
-
-        # Export using pandas DataFrames
-        self._export_pandas_data(output_dir, timestamp, files_created)
-
-        self.logger.info(f"CSV export complete! Created {len(files_created)} files in {output_dir}")
-
-    def _export_pandas_data(self, output_dir: str, timestamp: str, files_created: list):
-        """
-        Export CSV files using pandas DataFrames
-        """
-        # Export benchmarks
-        benchmarks_file = os.path.join(output_dir, f"benchmarks_{timestamp}.csv")
-        self.benchmarks_df.to_csv(benchmarks_file, index=False)
-        files_created.append(benchmarks_file)
-        self.logger.info(f"Exported {len(self.benchmarks_df)} benchmark records to {benchmarks_file}")
-
-        # Export device measurements
-        device_file = os.path.join(output_dir, f"device_measurements_{timestamp}.csv")
-        self.device_measurements_df.to_csv(device_file, index=False)
-        files_created.append(device_file)
-        self.logger.info(f"Exported {len(self.device_measurements_df)} device measurement records to {device_file}")
-
-        # Export model measurements (already flattened)
-        model_file = os.path.join(output_dir, f"model_measurements_{timestamp}.csv")
-        self.model_measurements_df.to_csv(model_file, index=False)
-        files_created.append(model_file)
-        self.logger.info(f"Exported {len(self.model_measurements_df)} model measurement records to {model_file}")
-
-        # Create comprehensive summary using pandas operations
-        summary_file = os.path.join(output_dir, f"benchmark_summary_{timestamp}.csv")
-        self._create_summary(summary_file)
-        files_created.append(summary_file)
-
-    def _create_summary(self, summary_file: str):
-        """
-        Create a comprehensive summary CSV using pandas operations
-        """
-        if len(self.benchmarks_df) == 0:
-            # Create empty summary file
-            summary_df = pd.DataFrame()
-            summary_df.to_csv(summary_file, index=False)
-            self.logger.info(f"Created empty benchmark summary at {summary_file}")
-            return
-
-        # Start with benchmarks as the base
-        summary_df = self.benchmarks_df.copy()
-
-        # Add model measurements (join on benchmark_id)
-        if len(self.model_measurements_df) > 0:
-            # Drop 'time' column from model measurements to avoid conflicts
-            model_df = self.model_measurements_df.drop(columns=["time"], errors="ignore")
-            summary_df = summary_df.merge(model_df, on="benchmark_id", how="left")
-
-        # Calculate device measurement aggregates using pandas groupby
-        if len(self.device_measurements_df) > 0:
-            device_agg = (
-                self.device_measurements_df.groupby("benchmark_id")
-                .agg(
-                    {
-                        "cpu_util": ["mean", "max", "std", "count"],
-                        "mem_megabytes": ["mean", "max", "std"],
-                        "gpu_util": ["mean", "max", "std"],
-                        "gpu_mem_megabytes": ["mean", "max", "std"],
-                    }
-                )
-                .round(3)
-            )
-
-            # Flatten column names
-            device_agg.columns = [f"{col[0]}_{col[1]}" for col in device_agg.columns]
-            device_agg = device_agg.reset_index()
-
-            # Rename count column to be more descriptive
-            if "cpu_util_count" in device_agg.columns:
-                device_agg = device_agg.rename(columns={"cpu_util_count": "device_measurement_count"})
-
-            # Merge with summary
-            summary_df = summary_df.merge(device_agg, on="benchmark_id", how="left")
-
-        # Export the comprehensive summary
-        summary_df.to_csv(summary_file, index=False)
-        self.logger.info(f"Created comprehensive benchmark summary with {len(summary_df)} records at {summary_file}")
-
-    def close(self):
-        if self.use_database and self.conn:
-            self.conn.close()
-
-
-logger = logging.getLogger(__name__)
-logger.setLevel(logging.INFO)
-
-handler = logging.StreamHandler(sys.stdout)
-handler.setLevel(logging.INFO)
-formatter = logging.Formatter("[%(levelname)s - %(asctime)s] %(message)s")
-handler.setFormatter(formatter)
-logger.addHandler(handler)
-
-
-def parse_arguments() -> tuple[str, str, str, str, bool, str]:
-    """
-    Parse command line arguments for the benchmarking CLI.
-    """
-    parser = argparse.ArgumentParser(description="CLI for benchmarking the huggingface/transformers.")
-
-    parser.add_argument(
-        "repository",
-        type=str,
-        help="The repository name on which the benchmarking is performed.",
-    )
-
-    parser.add_argument(
-        "branch",
-        type=str,
-        help="The branch name on which the benchmarking is performed.",
-    )
-
-    parser.add_argument(
-        "commit_id",
-        type=str,
-        help="The commit hash on which the benchmarking is performed.",
-    )
-
-    parser.add_argument(
-        "commit_msg",
-        type=str,
-        help="The commit message associated with the commit, truncated to 70 characters.",
-    )
-
-    parser.add_argument("--csv", action="store_true", default=False, help="Enable CSV output files generation.")
-
-    parser.add_argument(
-        "--csv-output-dir",
-        type=str,
-        default="benchmark_results",
-        help="Directory for CSV output files (default: benchmark_results).",
-    )
-
-    args = parser.parse_args()
-
-    # CSV is disabled by default, only enabled when --csv is used
-    generate_csv = args.csv
-
-    return args.repository, args.branch, args.commit_id, args.commit_msg, generate_csv, args.csv_output_dir
-
-
-def import_from_path(module_name, file_path):
-    try:
-        spec = importlib.util.spec_from_file_location(module_name, file_path)
-        module = importlib.util.module_from_spec(spec)
-        sys.modules[module_name] = module
-        spec.loader.exec_module(module)
-        return module
-    except Exception as e:
-        raise ImportModuleException(f"failed to load python module: {e}")
-
-
-def create_database_connection():
-    """
-    Try to create a database connection. Returns None if connection fails.
-    """
-    if not PSYCOPG2_AVAILABLE:
-        logger.warning("psycopg2 not available - running in CSV-only mode")
-        return None
-
-    try:
-        import psycopg2
-
-        conn = psycopg2.connect("dbname=metrics")
-        logger.info("Successfully connected to database")
-        return conn
-    except Exception as e:
-        logger.warning(f"Failed to connect to database: {e}. Running in CSV-only mode")
-        return None
-
-
-def create_global_metrics_recorder(
-    repository: str, branch: str, commit_id: str, commit_msg: str, generate_csv: bool = False
-) -> MetricsRecorder:
-    """
-    Create a global metrics recorder that will be used across all benchmarks.
-    """
-    connection = create_database_connection()
-    recorder = MetricsRecorder(connection, logger, repository, branch, commit_id, commit_msg, generate_csv)
-
-    # Log the storage mode
-    storage_modes = []
-    if connection is not None:
-        storage_modes.append("database")
-    if generate_csv:
-        storage_modes.append("CSV")
-
-    if not storage_modes:
-        logger.warning("Running benchmarks with NO data storage (no database connection, CSV disabled)")
-        logger.warning("Use --csv flag to enable CSV output when database is unavailable")
-    else:
-        logger.info(f"Running benchmarks with: {' + '.join(storage_modes)} storage")
-
-    return recorder
-
-
-if __name__ == "__main__":
-    benchmarks_folder_path = os.path.dirname(os.path.realpath(__file__))
-    benches_folder_path = os.path.join(benchmarks_folder_path, "benches")
-
-    repository, branch, commit_id, commit_msg, generate_csv, csv_output_dir = parse_arguments()
-
-    # Create a global metrics recorder
-    global_metrics_recorder = create_global_metrics_recorder(repository, branch, commit_id, commit_msg, generate_csv)
-
-    successful_benchmarks = 0
-    failed_benchmarks = 0
-
-    # Automatically discover all benchmark modules in benches/ folder
-    benchmark_modules = []
-
-    if os.path.exists(benches_folder_path):
-        logger.debug(f"Scanning for benchmarks in: {benches_folder_path}")
-        for entry in os.scandir(benches_folder_path):
-            if not entry.name.endswith(".py"):
-                continue
-            if entry.name.startswith("__"):  # Skip __init__.py, __pycache__, etc.
-                continue
-
-            # Check if the file has a run_benchmark function
-            try:
-                logger.debug(f"checking if benches/{entry.name} has run_benchmark function")
-                module = import_from_path(entry.name.split(".")[0], entry.path)
-                if hasattr(module, "run_benchmark"):
-                    benchmark_modules.append(entry.name)
-                    logger.debug(f"discovered benchmark: {entry.name}")
-                else:
-                    logger.debug(f"skipping {entry.name} - no run_benchmark function found")
-            except Exception as e:
-                logger.debug(f"failed to check benches/{entry.name}: {e}")
-    else:
-        logger.warning(f"Benches directory not found: {benches_folder_path}")
-
-    if benchmark_modules:
-        logger.info(f"Discovered {len(benchmark_modules)} benchmark(s): {benchmark_modules}")
-    else:
-        logger.warning("No benchmark modules found in benches/ directory")
-
-    for module_name in benchmark_modules:
-        module_path = os.path.join(benches_folder_path, module_name)
-        try:
-            logger.debug(f"loading: {module_name}")
-            module = import_from_path(module_name.split(".")[0], module_path)
-            logger.info(f"running benchmarks in: {module_name}")
-
-            # Check if the module has an updated run_benchmark function that accepts metrics_recorder
-            try:
-                # Try the new signature first
-                module.run_benchmark(logger, repository, branch, commit_id, commit_msg, global_metrics_recorder)
-            except TypeError:
-                # Fall back to the old signature for backward compatibility
-                logger.warning(
-                    f"Module {module_name} using old run_benchmark signature - database connection will be created per module"
-                )
-                module.run_benchmark(logger, repository, branch, commit_id, commit_msg)
-
-            successful_benchmarks += 1
-        except ImportModuleException as e:
-            logger.error(e)
-            failed_benchmarks += 1
-        except Exception as e:
-            logger.error(f"error running benchmarks for {module_name}: {e}")
-            failed_benchmarks += 1
-
-    # Export CSV results at the end (if enabled)
-    try:
-        if generate_csv:
-            global_metrics_recorder.export_to_csv(csv_output_dir)
-            logger.info(f"CSV reports have been generated and saved to the {csv_output_dir} directory")
-        else:
-            logger.info("CSV generation disabled - no CSV files created (use --csv to enable)")
-
-        logger.info(f"Benchmark run completed. Successful: {successful_benchmarks}, Failed: {failed_benchmarks}")
-    except Exception as e:
-        logger.error(f"Failed to export CSV results: {e}")
-    finally:
-        global_metrics_recorder.close()
--- a/benchmark/config/generation.yaml
+++ b/benchmark/config/generation.yaml
@ -19,7 +19,7 @@ backend:
  model: meta-llama/Llama-2-7b-hf
  cache_implementation: static
  torch_compile: true
-  dtype: float16
+  torch_dtype: float16
  torch_compile_config:
    backend: inductor
    mode: reduce-overhead
--- a/benchmark/default.yml
+++ b/benchmark/default.yml
@ -1,10 +0,0 @@
-apiVersion: 1
-
-providers:
-  - name: 'Transformers Benchmarks'
-    orgId: 1
-    type: file
-    updateIntervalSeconds: 10
-    allowUiUpdates: true
-    options:
-      path: /etc/grafana/dashboards
--- a/benchmark/grafana_dashboard.json
+++ b/benchmark/grafana_dashboard.json
@ -30,7 +30,7 @@
      "title": "Go to data",
      "tooltip": "Go to data",
      "type": "link",
-      "url": "http://transformers-benchmarks.hf.co/d/fdz33iyzln9c0a/transformers-benchmarks?orgId=1&from=${StartTime}&to=${EndTime}"
+      "url": "http://transformers-benchmarks.huggingface.co/d/fdz33iyzln9c0a/transformers-benchmarks?orgId=1&from=${StartTime}&to=${EndTime}"
    }
  ],
  "liveNow": true,
@ -77,7 +77,7 @@
            "properties": [
              {
                "id": "custom.width",
-                "value": 202
+                "value": 196
              }
            ]
          },
@ -101,7 +101,7 @@
            "properties": [
              {
                "id": "custom.width",
-                "value": 524
+                "value": 581
              }
            ]
          },
@ -113,19 +113,7 @@
            "properties": [
              {
                "id": "custom.width",
-                "value": 353
-              }
-            ]
-          },
-          {
-            "matcher": {
-              "id": "byName",
-              "options": "model_id"
-            },
-            "properties": [
-              {
-                "id": "custom.width",
-                "value": 216
+                "value": 379
              }
            ]
          }
@ -155,14 +143,12 @@
      "targets": [
        {
          "datasource": {
-            "default": true,
-            "type": "grafana-postgresql-datasource",
-            "uid": "be28nkzirtb0gd"
+            "type": "grafana-postgresql-datasource"
          },
          "editorMode": "code",
          "format": "table",
          "rawQuery": true,
-          "rawSql": "SELECT commit_id, commit_message, metadata->>'gpu_name' as gpu_name, metadata->>'model_id' as model_id, created_at AS date FROM benchmarks WHERE branch = '${branch}' AND metadata->>'gpu_name' = '${gpu_name}' ORDER BY benchmark_id DESC LIMIT ${last_n_commits};",
+          "rawSql": "SELECT commit_id as commit_id, commit_message, gpu_name, created_at AS date FROM benchmarks WHERE branch = '${branch}' ORDER BY benchmark_id DESC LIMIT ${last_n_commits};",
          "refId": "A",
          "sql": {
            "columns": [
@ -320,14 +306,13 @@
      "targets": [
        {
          "datasource": {
-            "default": true,
            "type": "grafana-postgresql-datasource",
-            "uid": "be28nkzirtb0gd"
+            "uid": "bdz2yss7sxo1sc"
          },
          "editorMode": "code",
          "format": "table",
          "rawQuery": true,
-          "rawSql": "SELECT CAST(m.measurements->'first_eager_forward_pass_time_secs' AS double precision) AS first_eager_forward_pass_time_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND b.metadata->>'gpu_name' = '${gpu_name}' ORDER BY b.benchmark_id DESC LIMIT ${last_n_commits};",
+          "rawSql": "SELECT CAST(m.measurements->'first_eager_forward_pass_time_secs' AS double precision) AS first_eager_forward_pass_time_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND gpu_name = '${gpu_name}' ORDER BY b.benchmark_id DESC LIMIT ${last_n_commits};",
          "refId": "A",
          "sql": {
            "columns": [
@ -446,14 +431,13 @@
      "targets": [
        {
          "datasource": {
-            "default": true,
            "type": "grafana-postgresql-datasource",
-            "uid": "be28nkzirtb0gd"
+            "uid": "bdz2yss7sxo1sc"
          },
          "editorMode": "code",
          "format": "table",
          "rawQuery": true,
-          "rawSql": "SELECT CAST(m.measurements->'second_eager_forward_pass_time_secs' AS double precision) AS second_eager_forward_pass_time_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND b.metadata->>'gpu_name' = '${gpu_name}' ORDER BY b.benchmark_id DESC LIMIT ${last_n_commits};",
+          "rawSql": "SELECT CAST(m.measurements->'second_eager_forward_pass_time_secs' AS double precision) AS second_eager_forward_pass_time_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND gpu_name = '${gpu_name}' ORDER BY b.benchmark_id DESC LIMIT ${last_n_commits};",
          "refId": "A",
          "sql": {
            "columns": [
@ -581,14 +565,13 @@
      "targets": [
        {
          "datasource": {
-            "default": true,
            "type": "grafana-postgresql-datasource",
-            "uid": "be28nkzirtb0gd"
+            "uid": "bdz2yss7sxo1sc"
          },
          "editorMode": "code",
          "format": "table",
          "rawQuery": true,
-          "rawSql": "SELECT CAST(m.measurements->'time_to_first_token_secs' AS double precision) AS time_to_first_token_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND b.metadata->>'gpu_name' = '${gpu_name}' ORDER BY b.benchmark_id DESC LIMIT ${last_n_commits};",
+          "rawSql": "SELECT CAST(m.measurements->'time_to_first_token_secs' AS double precision) AS time_to_first_token_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND gpu_name = '${gpu_name}' ORDER BY b.benchmark_id DESC LIMIT ${last_n_commits};",
          "refId": "A",
          "sql": {
            "columns": [
@ -703,14 +686,13 @@
      "targets": [
        {
          "datasource": {
-            "default": true,
            "type": "grafana-postgresql-datasource",
-            "uid": "be28nkzirtb0gd"
+            "uid": "bdz2yss7sxo1sc"
          },
          "editorMode": "code",
          "format": "table",
          "rawQuery": true,
-          "rawSql": "SELECT CAST(m.measurements->'time_to_second_token_secs' AS double precision) AS time_to_second_token_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND b.metadata->>'gpu_name' = '${gpu_name}' ORDER BY b.benchmark_id DESC LIMIT ${last_n_commits};",
+          "rawSql": "SELECT CAST(m.measurements->'time_to_second_token_secs' AS double precision) AS time_to_second_token_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND gpu_name = '${gpu_name}' ORDER BY b.benchmark_id DESC LIMIT ${last_n_commits};",
          "refId": "A",
          "sql": {
            "columns": [
@ -825,14 +807,13 @@
      "targets": [
        {
          "datasource": {
-            "default": true,
            "type": "grafana-postgresql-datasource",
-            "uid": "be28nkzirtb0gd"
+            "uid": "bdz2yss7sxo1sc"
          },
          "editorMode": "code",
          "format": "table",
          "rawQuery": true,
-          "rawSql": "SELECT CAST(m.measurements->'time_to_third_token_secs' AS double precision) AS time_to_third_token_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND b.metadata->>'gpu_name' = '${gpu_name}' ORDER BY b.benchmark_id DESC LIMIT ${last_n_commits};",
+          "rawSql": "SELECT CAST(m.measurements->'time_to_third_token_secs' AS double precision) AS time_to_third_token_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND gpu_name = '${gpu_name}' ORDER BY b.benchmark_id DESC LIMIT ${last_n_commits};",
          "refId": "A",
          "sql": {
            "columns": [
@ -947,14 +928,13 @@
      "targets": [
        {
          "datasource": {
-            "default": true,
            "type": "grafana-postgresql-datasource",
-            "uid": "be28nkzirtb0gd"
+            "uid": "bdz2yss7sxo1sc"
          },
          "editorMode": "code",
          "format": "table",
          "rawQuery": true,
-          "rawSql": "SELECT CAST(m.measurements->'time_to_next_token_mean_secs' AS double precision) AS time_to_next_token_mean_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND b.metadata->>'gpu_name' = '${gpu_name}' ORDER BY b.benchmark_id DESC LIMIT ${last_n_commits};",
+          "rawSql": "SELECT CAST(m.measurements->'time_to_next_token_mean_secs' AS double precision) AS time_to_next_token_mean_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND gpu_name = '${gpu_name}' ORDER BY b.benchmark_id DESC LIMIT ${last_n_commits};",
          "refId": "A",
          "sql": {
            "columns": [
@ -1082,14 +1062,13 @@
      "targets": [
        {
          "datasource": {
-            "default": true,
            "type": "grafana-postgresql-datasource",
-            "uid": "be28nkzirtb0gd"
+            "uid": "bdz2yss7sxo1sc"
          },
          "editorMode": "code",
          "format": "table",
          "rawQuery": true,
-          "rawSql": "SELECT CAST(m.measurements->'first_compile_generate_time_secs' AS double precision) AS first_compile_generate_time_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND b.metadata->>'gpu_name' = '${gpu_name}' ORDER BY b.benchmark_id DESC LIMIT ${last_n_commits};",
+          "rawSql": "SELECT CAST(m.measurements->'first_compile_generate_time_secs' AS double precision) AS first_compile_generate_time_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND gpu_name = '${gpu_name}' ORDER BY b.benchmark_id DESC LIMIT ${last_n_commits};",
          "refId": "A",
          "sql": {
            "columns": [
@ -1204,14 +1183,13 @@
      "targets": [
        {
          "datasource": {
-            "default": true,
            "type": "grafana-postgresql-datasource",
-            "uid": "be28nkzirtb0gd"
+            "uid": "bdz2yss7sxo1sc"
          },
          "editorMode": "code",
          "format": "table",
          "rawQuery": true,
-          "rawSql": "SELECT CAST(m.measurements->'second_compile_generate_time_secs' AS double precision) AS second_compile_generate_time_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND b.metadata->>'gpu_name' = '${gpu_name}' ORDER BY b.benchmark_id DESC LIMIT ${last_n_commits};",
+          "rawSql": "SELECT CAST(m.measurements->'second_compile_generate_time_secs' AS double precision) AS second_compile_generate_time_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND gpu_name = '${gpu_name}' ORDER BY b.benchmark_id DESC LIMIT ${last_n_commits};",
          "refId": "A",
          "sql": {
            "columns": [
@ -1326,14 +1304,13 @@
      "targets": [
        {
          "datasource": {
-            "default": true,
            "type": "grafana-postgresql-datasource",
-            "uid": "be28nkzirtb0gd"
+            "uid": "bdz2yss7sxo1sc"
          },
          "editorMode": "code",
          "format": "table",
          "rawQuery": true,
-          "rawSql": "SELECT CAST(m.measurements->'third_compile_generate_time_secs' AS double precision) AS third_compile_generate_time_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND b.metadata->>'gpu_name' = '${gpu_name}' ORDER BY b.benchmark_id DESC LIMIT ${last_n_commits};",
+          "rawSql": "SELECT CAST(m.measurements->'third_compile_generate_time_secs' AS double precision) AS third_compile_generate_time_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND gpu_name = '${gpu_name}' ORDER BY b.benchmark_id DESC LIMIT ${last_n_commits};",
          "refId": "A",
          "sql": {
            "columns": [
@ -1448,14 +1425,13 @@
      "targets": [
        {
          "datasource": {
-            "default": true,
            "type": "grafana-postgresql-datasource",
-            "uid": "be28nkzirtb0gd"
+            "uid": "bdz2yss7sxo1sc"
          },
          "editorMode": "code",
          "format": "table",
          "rawQuery": true,
-          "rawSql": "SELECT CAST(m.measurements->'fourth_compile_generate_time_secs' AS double precision) AS fourth_compile_generate_time_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND b.metadata->>'gpu_name' = '${gpu_name}' ORDER BY b.benchmark_id DESC LIMIT ${last_n_commits};",
+          "rawSql": "SELECT CAST(m.measurements->'fourth_compile_generate_time_secs' AS double precision) AS fourth_compile_generate_time_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND gpu_name = '${gpu_name}' ORDER BY b.benchmark_id DESC LIMIT ${last_n_commits};",
          "refId": "A",
          "sql": {
            "columns": [
@ -1504,7 +1480,11 @@
      "id": 15,
      "panels": [
        {
-          "datasource": {},
+          "datasource": {
+            "default": true,
+            "type": "grafana-postgresql-datasource",
+            "uid": "be28nkzirtb0gd"
+          },
          "fieldConfig": {
            "defaults": {
              "color": {
@ -1548,7 +1528,8 @@
                "mode": "absolute",
                "steps": [
                  {
-                    "color": "green"
+                    "color": "green",
+                    "value": null
                  },
                  {
                    "color": "red",
@ -1582,9 +1563,8 @@
          "targets": [
            {
              "datasource": {
-                "default": true,
                "type": "grafana-postgresql-datasource",
-                "uid": "be28nkzirtb0gd"
+                "uid": "bdz2yss7sxo1sc"
              },
              "editorMode": "code",
              "format": "table",
@ -1685,7 +1665,11 @@
          "type": "timeseries"
        },
        {
-          "datasource": {},
+          "datasource": {
+            "default": true,
+            "type": "grafana-postgresql-datasource",
+            "uid": "be28nkzirtb0gd"
+          },
          "fieldConfig": {
            "defaults": {
              "color": {
@ -1729,7 +1713,8 @@
                "mode": "absolute",
                "steps": [
                  {
-                    "color": "green"
+                    "color": "green",
+                    "value": null
                  },
                  {
                    "color": "red",
@ -1763,9 +1748,8 @@
          "targets": [
            {
              "datasource": {
-                "default": true,
                "type": "grafana-postgresql-datasource",
-                "uid": "be28nkzirtb0gd"
+                "uid": "bdz2yss7sxo1sc"
              },
              "editorMode": "code",
              "format": "table",
@ -1866,7 +1850,11 @@
          "type": "timeseries"
        },
        {
-          "datasource": {},
+          "datasource": {
+            "default": true,
+            "type": "grafana-postgresql-datasource",
+            "uid": "be28nkzirtb0gd"
+          },
          "fieldConfig": {
            "defaults": {
              "color": {
@ -1910,7 +1898,8 @@
                "mode": "absolute",
                "steps": [
                  {
-                    "color": "green"
+                    "color": "green",
+                    "value": null
                  },
                  {
                    "color": "red",
@ -1944,9 +1933,8 @@
          "targets": [
            {
              "datasource": {
-                "default": true,
                "type": "grafana-postgresql-datasource",
-                "uid": "be28nkzirtb0gd"
+                "uid": "bdz2yss7sxo1sc"
              },
              "editorMode": "code",
              "format": "table",
@ -2047,7 +2035,11 @@
          "type": "timeseries"
        },
        {
-          "datasource": {},
+          "datasource": {
+            "default": true,
+            "type": "grafana-postgresql-datasource",
+            "uid": "be28nkzirtb0gd"
+          },
          "fieldConfig": {
            "defaults": {
              "color": {
@ -2091,7 +2083,8 @@
                "mode": "absolute",
                "steps": [
                  {
-                    "color": "green"
+                    "color": "green",
+                    "value": null
                  },
                  {
                    "color": "red",
@ -2125,9 +2118,8 @@
          "targets": [
            {
              "datasource": {
-                "default": true,
                "type": "grafana-postgresql-datasource",
-                "uid": "be28nkzirtb0gd"
+                "uid": "bdz2yss7sxo1sc"
              },
              "editorMode": "code",
              "format": "table",
@ -2232,6 +2224,7 @@
      "type": "row"
    }
  ],
+  "refresh": "",
  "schemaVersion": 39,
  "tags": [],
  "templating": {
@ -2243,7 +2236,6 @@
          "value": "main"
        },
        "datasource": {
-          "default": true,
          "type": "grafana-postgresql-datasource",
          "uid": "be28nkzirtb0gd"
        },
@ -2256,7 +2248,7 @@
        "name": "branch",
        "options": [],
        "query": "SELECT DISTINCT branch FROM benchmarks;",
-        "refresh": 1,
+        "refresh": 2,
        "regex": "",
        "skipUrlSync": false,
        "sort": 0,
@ -2269,7 +2261,6 @@
          "value": "1729701492845"
        },
        "datasource": {
-          "default": true,
          "type": "grafana-postgresql-datasource",
          "uid": "be28nkzirtb0gd"
        },
@ -2290,11 +2281,10 @@
      {
        "current": {
          "selected": false,
-          "text": "1730393397577",
-          "value": "1730393397577"
+          "text": "1730120430069",
+          "value": "1730120430069"
        },
        "datasource": {
-          "default": true,
          "type": "grafana-postgresql-datasource",
          "uid": "be28nkzirtb0gd"
        },
@ -2322,16 +2312,15 @@
          "type": "grafana-postgresql-datasource",
          "uid": "be28nkzirtb0gd"
        },
-        "definition": "SELECT DISTINCT metadata->>'gpu_name' FROM benchmarks;",
-        "description": "",
+        "definition": "SELECT DISTINCT gpu_name FROM benchmarks;",
        "hide": 0,
        "includeAll": false,
        "label": "GPU",
        "multi": false,
        "name": "gpu_name",
        "options": [],
-        "query": "SELECT DISTINCT metadata->>'gpu_name' FROM benchmarks;",
-        "refresh": 1,
+        "query": "SELECT DISTINCT gpu_name FROM benchmarks;",
+        "refresh": 2,
        "regex": "",
        "skipUrlSync": false,
        "sort": 0,
@ -2339,7 +2328,7 @@
      },
      {
        "current": {
-          "selected": true,
+          "selected": false,
          "text": "10",
          "value": "10"
        },
@ -2370,6 +2359,6 @@
  "timezone": "browser",
  "title": "Transformers benchmarks",
  "uid": "fdz33iyzln9c0a",
-  "version": 10,
+  "version": 4,
  "weekStart": ""
 }
--- a/benchmark/grafana_datasource.yaml
+++ b/benchmark/grafana_datasource.yaml
@ -1,17 +0,0 @@
-apiVersion: 1
-datasources:
-  - name: grafana-postgresql-datasource
-    uid: be28nkzirtb0gd
-    type: postgres
-    url: $GRAFANA_POSTGRES_DATASOURCE_URL
-    user: $GRAFANA_POSTGRES_DATASOURCE_USER
-    secureJsonData:
-      password: $GRAFANA_POSTGRES_DATASOURCE_PWD
-    jsonData:
-      database: metrics
-      maxOpenConns: 100
-      maxIdleConns: 100
-      maxIdleConnsAuto: true
-      connMaxLifetime: 14400
-      postgresVersion: 1000
-      timescaledb: false
--- a/benchmark/init_db.sql
+++ b/benchmark/init_db.sql
@ -0,0 +1,33 @@
+-- One row per benchmark run: the branch/commit that was benchmarked and the GPU it ran on.
+CREATE TABLE IF NOT EXISTS benchmarks (
+  benchmark_id SERIAL PRIMARY KEY,
+  branch VARCHAR(255),
+  commit_id VARCHAR(72),
+  commit_message VARCHAR(70),
+  gpu_name VARCHAR(255),
+  created_at timestamp without time zone NOT NULL DEFAULT (current_timestamp AT TIME ZONE 'UTC')
+);
+
+-- NOTE(review): benchmark_id is already the PRIMARY KEY, so this extra index looks redundant; confirm before removing.
+CREATE INDEX IF NOT EXISTS benchmarks_benchmark_id_idx ON benchmarks (benchmark_id);
+
+CREATE INDEX IF NOT EXISTS benchmarks_branch_idx ON benchmarks (branch);
+
+-- Device usage samples (CPU, memory, GPU) recorded for a benchmark run.
+CREATE TABLE IF NOT EXISTS device_measurements (
+  measurement_id SERIAL PRIMARY KEY,
+  benchmark_id int REFERENCES benchmarks (benchmark_id),
+  cpu_util double precision,
+  mem_megabytes double precision,
+  gpu_util double precision,
+  gpu_mem_megabytes double precision,
+  time timestamp without time zone NOT NULL DEFAULT (current_timestamp AT TIME ZONE 'UTC')
+);
+
+-- NOTE(review): despite the "_branch_idx" name, this index is on benchmark_id.
+CREATE INDEX IF NOT EXISTS device_measurements_branch_idx ON device_measurements (benchmark_id);
+
+-- Model timing results for a benchmark run, stored as one jsonb document per row.
+CREATE TABLE IF NOT EXISTS model_measurements (
+  measurement_id SERIAL PRIMARY KEY,
+  benchmark_id int REFERENCES benchmarks (benchmark_id),
+  measurements jsonb,
+  time timestamp without time zone NOT NULL DEFAULT (current_timestamp AT TIME ZONE 'UTC')
+);
+
+-- NOTE(review): despite the "_branch_idx" name, this index is on benchmark_id.
+CREATE INDEX IF NOT EXISTS model_measurements_branch_idx ON model_measurements (benchmark_id);
--- a/benchmark/llama.py
+++ b/benchmark/llama.py
@ -0,0 +1,408 @@
+import argparse
+import json
+import logging
+import os
+import sys
+from statistics import mean
+from threading import Event, Thread
+from time import perf_counter, sleep
+from typing import Optional
+import gpustat
+import psutil
+import psycopg2
+import torch
+
+from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig, StaticCache
+from psycopg2.extras import Json
+from psycopg2.extensions import register_adapter
+
+
+# Presumably opts in to the `hf_transfer` download backend (the package is listed in
+# benchmark/requirements.txt) - TODO confirm.
+os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
+
+# Module logger: INFO and above.
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.INFO)
+
+# Send log records to stdout, prefixed with their level and timestamp.
+handler = logging.StreamHandler(sys.stdout)
+handler.setLevel(logging.INFO)
+formatter = logging.Formatter("[%(levelname)s - %(asctime)s] %(message)s")
+handler.setFormatter(formatter)
+logger.addHandler(handler)
+
+# NOTE(review): set to "1" here, but `run_benchmark` overrides it to "false" before compiling;
+# confirm which value is intended.
+os.environ["TOKENIZERS_PARALLELISM"] = "1"
+torch.set_float32_matmul_precision("high")
+# Let psycopg2 adapt Python dicts to JSON so a dict can be passed directly as a query
+# parameter (used for the jsonb `measurements` column).
+register_adapter(dict, Json)
+
+
+def parse_arguments():
+    """
+    Read the three required positional CLI arguments of the benchmark script.
+
+    Returns:
+        A `(branch, commit_id, commit_msg)` tuple of strings, in that order.
+    """
+    # (name, help text) of each positional argument, in command-line order.
+    positional_args = (
+        ("branch", "The branch name on which the benchmarking is performed."),
+        ("commit_id", "The commit hash on which the benchmarking is performed."),
+        ("commit_msg", "The commit message associated with the commit, truncated to 70 characters."),
+    )
+
+    cli = argparse.ArgumentParser(description="CLI for benchmarking the huggingface/transformers.")
+    for arg_name, arg_help in positional_args:
+        cli.add_argument(arg_name, type=str, help=arg_help)
+
+    parsed = cli.parse_args()
+    return parsed.branch, parsed.commit_id, parsed.commit_msg
+
+
+def collect_metrics(benchmark_id, continue_metric_collection):
+    """
+    Sample process and GPU usage in a loop and store every sample in `device_measurements`.
+
+    Started as a background thread by `run_benchmark`. Opens its own connection to the
+    `metrics` database and closes it when the loop ends, including when it ends with an error.
+
+    Args:
+        benchmark_id: `benchmark_id` of the `benchmarks` row the samples belong to.
+        continue_metric_collection: `threading.Event` used as a stop signal. Despite its name,
+            sampling continues while the event is NOT set and stops once it is set.
+    """
+    p = psutil.Process(os.getpid())
+    conn = psycopg2.connect("dbname=metrics")
+    try:
+        cur = conn.cursor()
+        while not continue_metric_collection.is_set():
+            with p.oneshot():
+                cpu_util = p.cpu_percent()
+                mem_megabytes = p.memory_info().rss / (1024 * 1024)
+            gpu_stats = gpustat.GPUStatCollection.new_query()
+            # Only the first GPU reported by gpustat is sampled.
+            gpu_util = gpu_stats[0]["utilization.gpu"]
+            gpu_mem_megabytes = gpu_stats[0]["memory.used"]
+            cur.execute(
+                "INSERT INTO device_measurements (benchmark_id, cpu_util, mem_megabytes, gpu_util, gpu_mem_megabytes) VALUES (%s, %s, %s, %s, %s)",
+                (benchmark_id, cpu_util, mem_megabytes, gpu_util, gpu_mem_megabytes),
+            )
+            sleep(0.01)
+            conn.commit()
+    finally:
+        # Release the connection even if sampling or an insert raised, so a failing
+        # metrics thread does not leak it.
+        conn.close()
+
+
+def run_benchmark(branch: str, commit_id: str, commit_msg: str, num_tokens_to_generate=100):
+    """
+    Run the Llama benchmark for one commit and store the results in the `metrics` database.
+
+    Registers the run in `benchmarks`, starts a background thread that samples device usage
+    (`collect_metrics`), then times: model loading, two eager forward passes and `generate`
+    calls, a `torch.compile`d single-token decode loop, and four `generate` calls with a
+    `StaticCache`. All timings are inserted as one JSON document into `model_measurements`.
+
+    Any exception is logged and swallowed. The metrics thread is stopped and the database
+    connection is closed in every case.
+
+    Args:
+        branch: Branch name stored with the run.
+        commit_id: Commit hash stored with the run.
+        commit_msg: Commit message stored with the run. Only its first 70 characters are kept,
+            because the `commit_message` column is `VARCHAR(70)`.
+        num_tokens_to_generate: Number of new tokens; sizes `max_length` and the static caches
+            and bounds the compiled decode loop.
+    """
+    continue_metric_collection = Event()
+    metrics_thread = None
+    conn = None
+    try:
+        gpu_stats = gpustat.GPUStatCollection.new_query()
+        gpu_name = gpu_stats[0]["name"]
+        conn = psycopg2.connect("dbname=metrics")
+        cur = conn.cursor()
+        cur.execute(
+            "INSERT INTO benchmarks (branch, commit_id, commit_message, gpu_name) VALUES (%s, %s, %s, %s) RETURNING benchmark_id",
+            # Truncate to the column width: a longer message would make the insert fail.
+            (branch, commit_id, commit_msg[:70], gpu_name),
+        )
+        conn.commit()
+        benchmark_id = cur.fetchone()[0]
+        logger.info(f"running benchmark #{benchmark_id} on {gpu_name}")
+        metrics_thread = Thread(target=collect_metrics, args=[benchmark_id, continue_metric_collection])
+        metrics_thread.start()
+        logger.info("started background thread to fetch device metrics")
+
+        os.environ["TOKENIZERS_PARALLELISM"] = "false"  # silence warnings when compiling
+
+        device = "cuda"
+        ckpt = "meta-llama/Llama-2-7b-hf"
+
+        logger.info("downloading weights")
+        # This is to avoid counting download in model load time measurement
+        model = AutoModelForCausalLM.from_pretrained(ckpt, torch_dtype=torch.float16)
+        gen_config = GenerationConfig(do_sample=False, top_p=1, temperature=1)
+        logger.info("loading model")
+        start = perf_counter()
+        model = AutoModelForCausalLM.from_pretrained(
+            ckpt, torch_dtype=torch.float16, generation_config=gen_config
+        ).eval()
+        model.to(device)
+        torch.cuda.synchronize()
+        end = perf_counter()
+        model_load_time = end - start
+        logger.info(f"loaded model in: {model_load_time}s")
+
+        tokenizer = AutoTokenizer.from_pretrained(ckpt)
+
+        prompt = "Why dogs are so cute?"
+        inputs = tokenizer(prompt, return_tensors="pt").to(device)
+
+        # Specify the max length (including both the prompt and the response)
+        # When calling `generate` with `cache_implementation="static" later, this is also used to create a `StaticCache` object
+        # with sequence length = `max_length`. The longer the more you will re-use it
+        seq_length = inputs["input_ids"].shape[1]
+        model.generation_config.max_length = seq_length + num_tokens_to_generate
+        batch_size = inputs["input_ids"].shape[0]
+
+        # Copied from the gpt-fast repo
+        def multinomial_sample_one_no_sync(probs_sort):  # Does multinomial sampling without a cuda synchronization
+            q = torch.empty_like(probs_sort).exponential_(1)
+            return torch.argmax(probs_sort / q, dim=-1, keepdim=True).to(dtype=torch.int)
+
+        def logits_to_probs(logits, temperature: float = 1.0, top_k: Optional[int] = None):
+            logits = logits / max(temperature, 1e-5)
+
+            if top_k is not None:
+                v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
+                pivot = v.select(-1, -1).unsqueeze(-1)
+                logits = torch.where(logits < pivot, -float("Inf"), logits)
+            probs = torch.nn.functional.softmax(logits, dim=-1)
+            return probs
+
+        def sample(logits, temperature: float = 1.0, top_k: Optional[int] = None):
+            probs = logits_to_probs(logits[:, -1], temperature, top_k)
+            idx_next = multinomial_sample_one_no_sync(probs)
+            return idx_next, probs
+
+        def decode_one_token(model, cur_token, cache_position, past_key_values):
+            logits = model(
+                cur_token,
+                cache_position=cache_position,
+                past_key_values=past_key_values,
+                return_dict=False,
+                use_cache=True,
+            )[0]
+            new_token = sample(logits, temperature=0.6, top_k=5)[0]
+            return new_token
+
+        #########
+        # Eager #
+        #########
+        with torch.no_grad():
+            past_key_values = StaticCache(
+                model.config,
+                batch_size=batch_size,
+                device=device,
+                dtype=torch.float16,
+                max_cache_len=seq_length + num_tokens_to_generate,
+            )
+            cache_position = torch.arange(seq_length, device=device)
+            start = perf_counter()
+            model(
+                **inputs,
+                cache_position=cache_position,
+                past_key_values=past_key_values,
+                return_dict=False,
+                use_cache=True,
+            )
+            # CUDA kernels run asynchronously: wait for them before stopping the timer.
+            torch.cuda.synchronize()
+            end = perf_counter()
+            first_eager_fwd_pass_time = end - start
+            logger.info(f"completed first eager fwd pass in: {first_eager_fwd_pass_time}s")
+            start = perf_counter()
+            output = model.generate(**inputs, do_sample=False)
+            torch.cuda.synchronize()
+            end = perf_counter()
+            first_eager_generate_time = end - start
+            logger.info(f"completed first eager generation in: {first_eager_generate_time}s")
+            logger.info(f"generated: {tokenizer.batch_decode(output.cpu().tolist())}")
+
+            past_key_values = StaticCache(
+                model.config,
+                batch_size=batch_size,
+                device=device,
+                dtype=torch.float16,
+                max_cache_len=seq_length + num_tokens_to_generate,
+            )
+            cache_position = torch.arange(seq_length, device=device)
+            start = perf_counter()
+            model(
+                **inputs,
+                cache_position=cache_position,
+                past_key_values=past_key_values,
+                return_dict=False,
+                use_cache=True,
+            )
+            torch.cuda.synchronize()
+            end = perf_counter()
+            second_eager_fwd_pass_time = end - start
+            logger.info(f"completed second eager fwd pass in: {second_eager_fwd_pass_time}s")
+            start = perf_counter()
+            # Keep the result so the log line below shows this run, not the first one.
+            output = model.generate(**inputs, do_sample=False)
+            torch.cuda.synchronize()
+            end = perf_counter()
+            second_eager_generate_time = end - start
+            logger.info(f"completed second eager generation in: {second_eager_generate_time}s")
+            logger.info(f"generated: {tokenizer.batch_decode(output.cpu().tolist())}")
+
+            torch.compiler.reset()
+
+            ################
+            # Forward pass #
+            ################
+
+            # `torch.compile(model, ...)` is not recommended as you compile callbacks
+            # and full generate. We recommend compiling only the forward for now.
+            # "reduce-overhead" will use cudagraphs.
+            generated_ids = torch.zeros(
+                (batch_size, num_tokens_to_generate + seq_length), dtype=torch.int, device=device
+            )
+
+            generated_ids[:, :seq_length] = inputs["input_ids"]
+            decode_one_token = torch.compile(decode_one_token, mode="reduce-overhead", fullgraph=True)
+            # model.forward = torch.compile(model.forward, mode="reduce-overhead", fullgraph=True)
+            # TODO use  decode_one_token(model, input_id.clone(), cache_position) for verification
+            past_key_values = StaticCache(
+                model.config,
+                batch_size=batch_size,
+                device=device,
+                dtype=torch.float16,
+                max_cache_len=seq_length + num_tokens_to_generate + 10,
+            )
+            cache_position = torch.arange(seq_length, device=device)
+            all_generated_tokens = []
+            ### First compile, prefill
+            start = perf_counter()
+            next_token = decode_one_token(
+                model, inputs["input_ids"], cache_position=cache_position, past_key_values=past_key_values
+            )
+            torch.cuda.synchronize()
+            end = perf_counter()
+            time_to_first_token = end - start
+            logger.info(f"completed first compile generation in: {time_to_first_token}s")
+            cache_position += 1
+            all_generated_tokens += next_token.clone().detach().cpu().tolist()
+
+            # From here on one token is decoded per call, so the position is a single index.
+            cache_position = torch.tensor([seq_length], device=device)
+            ### First compile, decoding
+            start = perf_counter()
+            next_token = decode_one_token(
+                model, next_token.clone(), cache_position=cache_position, past_key_values=past_key_values
+            )
+            torch.cuda.synchronize()
+            end = perf_counter()
+            time_to_second_token = end - start
+            logger.info(f"completed second compile generation in: {time_to_second_token}s")
+            cache_position += 1
+            all_generated_tokens += next_token.clone().detach().cpu().tolist()
+
+            ### Second compile, decoding
+            start = perf_counter()
+            next_token = decode_one_token(
+                model, next_token.clone(), cache_position=cache_position, past_key_values=past_key_values
+            )
+            torch.cuda.synchronize()
+            end = perf_counter()
+            time_to_third_token = end - start
+            logger.info(f"completed third compile forward in: {time_to_third_token}s")
+            cache_position += 1
+            all_generated_tokens += next_token.clone().detach().cpu().tolist()
+
+            ### Using cuda graphs decoding
+
+            start = perf_counter()
+            for _ in range(1, num_tokens_to_generate):
+                all_generated_tokens += next_token.clone().detach().cpu().tolist()
+                next_token = decode_one_token(
+                    model, next_token.clone(), cache_position=cache_position, past_key_values=past_key_values
+                )
+                cache_position += 1
+            torch.cuda.synchronize()
+            end = perf_counter()
+            # NOTE(review): the loop above runs `num_tokens_to_generate - 1` times but the mean
+            # divides by `num_tokens_to_generate`; left unchanged to keep stored values comparable.
+            mean_time_to_next_token = (end - start) / num_tokens_to_generate
+            logger.info(f"completed next compile generation in: {mean_time_to_next_token}s")
+            logger.info(f"generated: {tokenizer.batch_decode(all_generated_tokens)}")
+
+            ####################
+            # Generate compile #
+            ####################
+            torch.compiler.reset()
+            # we will not compile full generate as it's too intensive, though we measure full forward!
+
+            past_key_values = StaticCache(
+                model.config,
+                batch_size=batch_size,
+                device=device,
+                dtype=torch.float16,
+                max_cache_len=seq_length + 128,
+            )
+
+            # 1st call
+            start = perf_counter()
+            output = model.generate(**inputs, past_key_values=past_key_values)
+            torch.cuda.synchronize()
+            end = perf_counter()
+            first_compile_generate_time = end - start
+            logger.info(f"completed first compile generation in: {first_compile_generate_time}s")
+            logger.info(f"generated: {tokenizer.batch_decode(output.cpu().tolist())}")
+
+            past_key_values = StaticCache(
+                model.config,
+                batch_size=batch_size,
+                device=device,
+                dtype=torch.float16,
+                max_cache_len=seq_length + 128,
+            )
+            # 2nd call
+            start = perf_counter()
+            output = model.generate(**inputs, past_key_values=past_key_values)
+            torch.cuda.synchronize()
+            end = perf_counter()
+            second_compile_generate_time = end - start
+            logger.info(f"completed second compile generation in: {second_compile_generate_time}s")
+            logger.info(f"generated: {tokenizer.batch_decode(output.cpu().tolist())}")
+
+            past_key_values = StaticCache(
+                model.config,
+                batch_size=batch_size,
+                device=device,
+                dtype=torch.float16,
+                max_cache_len=seq_length + 128,
+            )
+
+            # 3rd call
+            start = perf_counter()
+            output = model.generate(**inputs, past_key_values=past_key_values)
+            # Synchronize like the 1st and 2nd calls so all four timings are comparable.
+            torch.cuda.synchronize()
+            end = perf_counter()
+            third_compile_generate_time = end - start
+            logger.info(f"completed third compile generation in: {third_compile_generate_time}s")
+            logger.info(f"generated: {tokenizer.batch_decode(output.cpu().tolist())}")
+
+            past_key_values = StaticCache(
+                model.config,
+                batch_size=batch_size,
+                device=device,
+                dtype=torch.float16,
+                max_cache_len=seq_length + 128,
+            )
+            # 4th call
+            start = perf_counter()
+            output = model.generate(**inputs, past_key_values=past_key_values)
+            torch.cuda.synchronize()
+            end = perf_counter()
+            fourth_compile_generate_time = end - start
+            logger.info(f"completed fourth compile generation in: {fourth_compile_generate_time}s")
+            logger.info(f"generated: {tokenizer.batch_decode(output.cpu().tolist())}")
+
+        cur.execute(
+            """
+            INSERT INTO model_measurements (
+                benchmark_id,
+                measurements
+            ) VALUES (%s, %s)
+            """,
+            (
+                benchmark_id,
+                {
+                    "model_load_time": model_load_time,
+                    "first_eager_forward_pass_time_secs": first_eager_fwd_pass_time,
+                    "second_eager_forward_pass_time_secs": second_eager_fwd_pass_time,
+                    "first_eager_generate_time_secs": first_eager_generate_time,
+                    "second_eager_generate_time_secs": second_eager_generate_time,
+                    "time_to_first_token_secs": time_to_first_token,
+                    "time_to_second_token_secs": time_to_second_token,
+                    "time_to_third_token_secs": time_to_third_token,
+                    "time_to_next_token_mean_secs": mean_time_to_next_token,
+                    "first_compile_generate_time_secs": first_compile_generate_time,
+                    "second_compile_generate_time_secs": second_compile_generate_time,
+                    "third_compile_generate_time_secs": third_compile_generate_time,
+                    "fourth_compile_generate_time_secs": fourth_compile_generate_time,
+                },
+            ),
+        )
+        conn.commit()
+    except Exception as e:
+        logger.error(f"Caught exception: {e}")
+    finally:
+        # Always stop the sampling thread and release the connection, on success or failure.
+        continue_metric_collection.set()
+        if metrics_thread is not None:
+            metrics_thread.join()
+        if conn is not None:
+            conn.close()
+
+
+# Script entry point: benchmark the commit given on the command line, generating 20 tokens
+# instead of `run_benchmark`'s default of 100.
+if __name__ == "__main__":
+    branch, commit_id, commit_msg = parse_arguments()
+    run_benchmark(branch, commit_id, commit_msg, num_tokens_to_generate=20)
--- a/benchmark/optimum_benchmark_wrapper.py
+++ b/benchmark/optimum_benchmark_wrapper.py
@ -3,11 +3,7 @@ import subprocess


 def main(config_dir, config_name, args):
-    subprocess.run(
-        ["optimum-benchmark", "--config-dir", f"{config_dir}", "--config-name", f"{config_name}"]
-        + ["hydra/job_logging=disabled", "hydra/hydra_logging=disabled"]
-        + args
-    )
+    subprocess.run(["optimum-benchmark", "--config-dir", f"{config_dir}", "--config-name", f"{config_name}"] + ["hydra/job_logging=disabled", "hydra/hydra_logging=disabled"] + args)


 if __name__ == "__main__":
--- a/benchmark/requirements.txt
+++ b/benchmark/requirements.txt
@ -2,5 +2,4 @@ gpustat==1.1.1
 psutil==6.0.0
 psycopg2==2.9.9
 torch>=2.4.0
-hf_transfer
-pandas>=1.5.0
+hf_transfer
--- a/benchmark/utils/init_db.sql
+++ b/benchmark/utils/init_db.sql
--- a/benchmark_v2/.gitignore
+++ b/benchmark_v2/.gitignore
@ -1 +0,0 @@
-benchmark_results/
--- a/benchmark_v2/README.md
+++ b/benchmark_v2/README.md
@ -1,138 +0,0 @@
-# Benchmarking v2
-
-A comprehensive benchmarking framework for transformer models that supports multiple execution modes (eager, compiled, kernelized), detailed performance metrics collection, and structured output format.
-
-
-## Quick Start
-
-### Running All Benchmarks
-
-```bash
-# Run all benchmarks with default settings
-python run_benchmarks.py
-
-# Specify output directory
-python run_benchmarks.py --output-dir my_results
-
-# Run with custom parameters
-python run_benchmarks.py \
-    --warmup-iterations 5 \
-    --measurement-iterations 10 \
-    --num-tokens-to-generate 200
-```
-
-### Uploading Results to HuggingFace Dataset
-
-You can automatically upload benchmark results to a HuggingFace Dataset for tracking and analysis:
-
-```bash
-# Upload to a public dataset with auto-generated run ID
-python run_benchmarks.py --upload-to-hub username/benchmark-results
-
-# Upload with a custom run ID for easy identification
-python run_benchmarks.py --upload-to-hub username/benchmark-results --run-id experiment_v1
-
-# Upload with custom HuggingFace token (if not set in environment)
-python run_benchmarks.py --upload-to-hub username/benchmark-results --token hf_your_token_here
-```
-
-**Dataset Directory Structure:**
-```
-dataset_name/
-├── 2025-01-15/
-│   ├── runs/                       # Non-scheduled runs (manual, PR, etc.)
-│   │   └── 123-1245151651/         # GitHub run number and ID
-│   │       └── benchmark_results/
-│   │           ├── benchmark_summary_20250115_143022.json
-│   │           └── model-name/
-│   │               └── model-name_benchmark_20250115_143022.json
-│   └── benchmark_results_abc123de/ # Scheduled runs (daily CI)
-│       ├── benchmark_summary_20250115_143022.json
-│       └── model-name/
-│           └── model-name_benchmark_20250115_143022.json
-└── 2025-01-16/
-    └── ...
-```
-
-**Authentication for Uploads:**
-
-For uploading results, you need a HuggingFace token with write permissions to the target dataset. You can provide the token in several ways (in order of precedence):
-
-1. Command line: `--token hf_your_token_here`
-3. Environment variable: `HF_TOKEN`
-
-### Running Specific Benchmarks
-
-```bash
-# Include only specific benchmarks
-python run_benchmarks.py --include llama
-
-# Exclude specific benchmarks
-python run_benchmarks.py --exclude old_benchmark
-
-## Output Format
-
-Results are saved as JSON files with the following structure:
-
-```json
-{
-  "model_name": "llama_2_7b",
-  "benchmark_scenarios": [
-    {
-      "scenario_name": "eager_variant",
-      "metadata": {
-        "timestamp": "2025-01-XX...",
-        "commit_id": "abc123...",
-        "hardware_info": {
-          "gpu_name": "NVIDIA A100",
-          "gpu_memory_total": 40960,
-          "cpu_count": 64
-        },
-        "config": {
-          "variant": "eager",
-          "warmup_iterations": 3,
-          "measurement_iterations": 5
-        }
-      },
-      "measurements": {
-        "latency": {
-          "mean": 2.45,
-          "median": 2.43,
-          "std": 0.12,
-          "min": 2.31,
-          "max": 2.67,
-          "p95": 2.61,
-          "p99": 2.65
-        },
-        "time_to_first_token": {
-          "mean": 0.15,
-          "std": 0.02
-        },
-        "tokens_per_second": {
-          "mean": 87.3,
-          "unit": "tokens/sec"
-        }
-      },
-      "gpu_metrics": {
-        "gpu_utilization_mean": 85.2,
-        "gpu_memory_used_mean": 12450
-      }
-    }
-  ]
-}
-```
-
-### Debug Mode
-
-```bash
-python run_benchmarks.py --log-level DEBUG
-```
-
-## Contributing
-
-To add new benchmarks:
-
-1. Create a new file in `benches/`
-2. Implement the `ModelBenchmark` interface
-3. Add a runner function (`run_<benchmark_name>` or `run_benchmark`)
-4. run_benchmarks.py
--- a/benchmark_v2/benches/init.py
+++ b/benchmark_v2/benches/init.py
@ -1 +0,0 @@
-# Benchmark implementations directory
--- a/benchmark_v2/benches/llama.py
+++ b/benchmark_v2/benches/llama.py
@ -1,165 +0,0 @@
-# Copyright 2025 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import logging
-import os
-from typing import Any
-
-import torch
-from benchmark_framework import ModelBenchmark
-
-
-os.environ["TOKENIZERS_PARALLELISM"] = "1"
-torch.set_float32_matmul_precision("high")
-
-
-class LLaMABenchmark(ModelBenchmark):
-    """Simplified LLaMA model benchmark implementation using the ModelBenchmark base class."""
-
-    def __init__(self, logger: logging.Logger):
-        super().__init__(logger)
-        self._default_prompt = "Why dogs are so cute?"  # Custom prompt for LLaMA
-
-    def get_scenario_configs(self) -> list[dict[str, Any]]:
-        """
-        Get LLaMA-specific scenario configurations.
-
-        Returns:
-            List of scenario configuration dictionaries
-        """
-        return [
-            # Eager variants
-            {"variant": "eager", "compile_mode": None, "use_cache": True, "description": "Eager execution with cache"},
-            # Compiled variants
-            {
-                "variant": "compiled",
-                "compile_mode": "max-autotune",
-                "use_cache": True,
-                "description": "Compiled with max autotune",
-            },
-            # Kernelized variant (if available)
-            {
-                "variant": "kernelized",
-                "compile_mode": "max-autotune",
-                "use_cache": True,
-                "description": "Kernelized execution",
-            },
-        ]
-
-    def _is_kernelization_available(self) -> bool:
-        """Check if kernelization is available for LLaMA."""
-        try:
-            from kernels import Mode, kernelize  # noqa: F401
-
-            return True
-        except ImportError:
-            self.logger.debug("Kernelization not available: kernels module not found")
-            return False
-
-    def get_default_generation_config(self) -> dict[str, Any]:
-        """Get LLaMA-specific generation configuration."""
-        return {
-            "do_sample": False,
-            "top_p": 1.0,
-            "temperature": 1.0,
-            "repetition_penalty": 1.0,
-            "max_new_tokens": None,  # Will be set per scenario
-        }
-
-    def get_model_init_kwargs(self, config) -> dict[str, Any]:
-        """Get LLaMA-specific model initialization kwargs."""
-        return {
-            "torch_dtype": getattr(torch, config.torch_dtype),
-            "attn_implementation": config.attn_implementation,
-            "use_cache": True,
-        }
-
-    def get_default_torch_dtype(self) -> str:
-        """Get default torch dtype for LLaMA."""
-        return "float16"  # LLaMA works well with float16
-
-    def get_default_device(self) -> str:
-        """Get default device for LLaMA."""
-        return "cuda"  # LLaMA prefers CUDA
-
-
-def run_llama(logger, output_dir, **kwargs):
-    """
-    Run LLaMA benchmark with the given configuration.
-
-    Args:
-        logger: Logger instance
-        output_dir: Output directory for results
-        **kwargs: Additional configuration options
-
-    Returns:
-        Path to output file if successful
-    """
-    from benchmark_framework import BenchmarkRunner
-
-    # Extract parameters with defaults
-    model_id = kwargs.get("model_id", "meta-llama/Llama-2-7b-hf")
-    warmup_iterations = kwargs.get("warmup_iterations", 3)
-    measurement_iterations = kwargs.get("measurement_iterations", 5)
-    num_tokens_to_generate = kwargs.get("num_tokens_to_generate", 100)
-    include_sdpa_variants = kwargs.get("include_sdpa_variants", True)
-    device = kwargs.get("device", "cuda")
-    torch_dtype = kwargs.get("torch_dtype", "float16")
-    batch_size = kwargs.get("batch_size", 1)
-    commit_id = kwargs.get("commit_id")
-
-    logger.info(f"Starting LLaMA benchmark for model: {model_id}")
-    logger.info(
-        f"Configuration: warmup={warmup_iterations}, measurement={measurement_iterations}, tokens={num_tokens_to_generate}"
-    )
-
-    try:
-        # Create benchmark instance
-        benchmark = LLaMABenchmark(logger)
-
-        # Create scenarios
-        scenarios = benchmark.create_scenarios(
-            model_id=model_id,
-            warmup_iterations=warmup_iterations,
-            measurement_iterations=measurement_iterations,
-            num_tokens_to_generate=num_tokens_to_generate,
-            include_sdpa_variants=include_sdpa_variants,
-            device=device,
-            torch_dtype=torch_dtype,
-            batch_size=batch_size,
-        )
-
-        logger.info(f"Created {len(scenarios)} benchmark scenarios")
-
-        # Create runner and execute benchmarks
-        runner = BenchmarkRunner(logger, output_dir)
-        results = runner.run_benchmark(benchmark, scenarios, commit_id=commit_id)
-
-        if not results:
-            logger.warning("No successful benchmark results")
-            return None
-
-        # Save results
-        model_name = model_id.split("/")[-1]  # Extract model name from ID
-        output_file = runner.save_results(model_name, results)
-
-        logger.info(f"LLaMA benchmark completed successfully. Results saved to: {output_file}")
-        return output_file
-
-    except Exception as e:
-        logger.error(f"LLaMA benchmark failed: {e}")
-        import traceback
-
-        logger.debug(traceback.format_exc())
-        raise
--- a/benchmark_v2/benchmark_framework.py
+++ b/benchmark_v2/benchmark_framework.py
--- a/benchmark_v2/requirements.txt
+++ b/benchmark_v2/requirements.txt
@ -1,7 +0,0 @@
-numpy>=1.21.0
-psutil>=5.8.0
-gpustat>=1.0.0
-torch>=2.0.0
-transformers>=4.30.0
-datasets>=2.10.0
-huggingface_hub>=0.16.0 
--- a/benchmark_v2/run_benchmarks.py
+++ b/benchmark_v2/run_benchmarks.py
@ -1,495 +0,0 @@
-#!/usr/bin/env python3
-# Copyright 2025 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""
-Top-level benchmarking script that automatically discovers and runs all benchmarks
-in the ./benches directory, organizing outputs into model-specific subfolders.
-"""
-
-import argparse
-import importlib.util
-import json
-import logging
-import os
-import sys
-import uuid
-from datetime import datetime
-from pathlib import Path
-from typing import Any, Optional
-
-
-def setup_logging(log_level: str = "INFO", enable_file_logging: bool = False) -> logging.Logger:
-    """Setup logging configuration."""
-    numeric_level = getattr(logging, log_level.upper(), None)
-    if not isinstance(numeric_level, int):
-        raise ValueError(f"Invalid log level: {log_level}")
-
-    handlers = [logging.StreamHandler(sys.stdout)]
-
-    if enable_file_logging:
-        handlers.append(logging.FileHandler(f"benchmark_run_{datetime.now().strftime('%Y%m%d_%H%M%S')}.log"))
-
-    logging.basicConfig(
-        level=numeric_level, format="[%(levelname)s - %(asctime)s] %(name)s: %(message)s", handlers=handlers
-    )
-
-    return logging.getLogger(__name__)
-
-
-def discover_benchmarks(benches_dir: str) -> list[dict[str, Any]]:
-    """
-    Discover all benchmark modules in the benches directory.
-
-    Returns:
-        List of dictionaries containing benchmark module info
-    """
-    benchmarks = []
-    benches_path = Path(benches_dir)
-
-    if not benches_path.exists():
-        raise FileNotFoundError(f"Benches directory not found: {benches_dir}")
-
-    for py_file in benches_path.glob("*.py"):
-        if py_file.name.startswith("__"):
-            continue
-
-        module_name = py_file.stem
-
-        try:
-            # Import the module
-            spec = importlib.util.spec_from_file_location(module_name, py_file)
-            module = importlib.util.module_from_spec(spec)
-            spec.loader.exec_module(module)
-
-            # Check if it has a benchmark runner function
-            if hasattr(module, f"run_{module_name}"):
-                benchmarks.append(
-                    {
-                        "name": module_name,
-                        "path": str(py_file),
-                        "module": module,
-                        "runner_function": getattr(module, f"run_{module_name}"),
-                    }
-                )
-            elif hasattr(module, "run_benchmark"):
-                benchmarks.append(
-                    {
-                        "name": module_name,
-                        "path": str(py_file),
-                        "module": module,
-                        "runner_function": getattr(module, "run_benchmark"),
-                    }
-                )
-            else:
-                logging.warning(f"No runner function found in {py_file}")
-
-        except Exception as e:
-            logging.error(f"Failed to import {py_file}: {e}")
-
-    return benchmarks
-
-
-def run_single_benchmark(
-    benchmark_info: dict[str, Any], output_dir: str, logger: logging.Logger, **kwargs
-) -> Optional[str]:
-    """
-    Run a single benchmark and return the output file path.
-
-    Args:
-        benchmark_info: Dictionary containing benchmark module info
-        output_dir: Base output directory
-        logger: Logger instance
-        **kwargs: Additional arguments to pass to the benchmark
-
-    Returns:
-        Path to the output file if successful, None otherwise
-    """
-    benchmark_name = benchmark_info["name"]
-    runner_func = benchmark_info["runner_function"]
-
-    logger.info(f"Running benchmark: {benchmark_name}")
-
-    try:
-        # Check function signature to determine what arguments to pass
-        import inspect
-
-        sig = inspect.signature(runner_func)
-
-        # Prepare arguments based on function signature
-        func_kwargs = {"logger": logger, "output_dir": output_dir}
-
-        # Add other kwargs if the function accepts them
-        for param_name in sig.parameters:
-            if param_name in kwargs:
-                func_kwargs[param_name] = kwargs[param_name]
-
-        # Filter kwargs to only include parameters the function accepts
-        # If function has **kwargs, include all provided kwargs
-        has_var_kwargs = any(param.kind == param.VAR_KEYWORD for param in sig.parameters.values())
-        if has_var_kwargs:
-            valid_kwargs = {**func_kwargs, **kwargs}
-        else:
-            valid_kwargs = {k: v for k, v in func_kwargs.items() if k in sig.parameters}
-
-        # Run the benchmark
-        result = runner_func(**valid_kwargs)
-
-        if isinstance(result, str):
-            # Function returned a file path
-            return result
-        else:
-            logger.info(f"Benchmark {benchmark_name} completed successfully")
-            return "completed"
-
-    except Exception as e:
-        logger.error(f"Benchmark {benchmark_name} failed: {e}")
-        import traceback
-
-        logger.debug(traceback.format_exc())
-        return None
-
-
-def generate_summary_report(
-    output_dir: str,
-    benchmark_results: dict[str, Any],
-    logger: logging.Logger,
-    benchmark_run_uuid: Optional[str] = None,
-) -> str:
-    """Generate a summary report of all benchmark runs."""
-    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
-    summary_file = os.path.join(output_dir, f"benchmark_summary_{timestamp}.json")
-
-    summary_data = {
-        "run_metadata": {
-            "timestamp": datetime.utcnow().isoformat(),
-            "benchmark_run_uuid": benchmark_run_uuid,
-            "total_benchmarks": len(benchmark_results),
-            "successful_benchmarks": len([r for r in benchmark_results.values() if r is not None]),
-            "failed_benchmarks": len([r for r in benchmark_results.values() if r is None]),
-        },
-        "benchmark_results": benchmark_results,
-        "output_directory": output_dir,
-    }
-
-    with open(summary_file, "w") as f:
-        json.dump(summary_data, f, indent=2, default=str)
-
-    logger.info(f"Summary report saved to: {summary_file}")
-    return summary_file
-
-
-def upload_results_to_hf_dataset(
-    output_dir: str,
-    summary_file: str,
-    dataset_name: str,
-    run_id: Optional[str] = None,
-    token: Optional[str] = None,
-    logger: Optional[logging.Logger] = None,
-) -> Optional[str]:
-    """
-    Upload benchmark results to a HuggingFace Dataset.
-    Based on upload_collated_report() from utils/collated_reports.py
-    Args:
-        output_dir: Local output directory containing results
-        summary_file: Path to the summary file
-        dataset_name: Name of the HuggingFace dataset to upload to
-        run_id: Unique run identifier (if None, will generate one)
-        token: HuggingFace token for authentication (if None, will use environment variables)
-        logger: Logger instance
-    Returns:
-        The run_id used for the upload, None if upload failed
-    """
-    if logger is None:
-        logger = logging.getLogger(__name__)
-
-    import os
-
-    from huggingface_hub import HfApi
-
-    api = HfApi()
-
-    if run_id is None:
-        github_run_number = os.getenv("GITHUB_RUN_NUMBER")
-        github_run_id = os.getenv("GITHUB_RUN_ID")
-        if github_run_number and github_run_id:
-            run_id = f"{github_run_number}-{github_run_id}"
-
-    date_folder = datetime.now().strftime("%Y-%m-%d")
-
-    github_event_name = os.getenv("GITHUB_EVENT_NAME")
-    if github_event_name != "schedule":
-        # Non-scheduled runs go under a runs subfolder
-        repo_path = f"{date_folder}/runs/{run_id}/benchmark_results"
-    else:
-        # Scheduled runs go directly under the date
-        repo_path = f"{date_folder}/{run_id}/benchmark_results"
-
-    logger.info(f"Uploading benchmark results to dataset '{dataset_name}' at path '{repo_path}'")
-
-    try:
-        # Upload all files in the output directory
-        from pathlib import Path
-
-        output_path = Path(output_dir)
-
-        for file_path in output_path.rglob("*"):
-            if file_path.is_file():
-                # Calculate relative path from output_dir
-                relative_path = file_path.relative_to(output_path)
-                path_in_repo = f"{repo_path}/{relative_path}"
-
-                logger.debug(f"Uploading {file_path} to {path_in_repo}")
-
-                api.upload_file(
-                    path_or_fileobj=str(file_path),
-                    path_in_repo=path_in_repo,
-                    repo_id=dataset_name,
-                    repo_type="dataset",
-                    token=token,
-                    commit_message=f"Upload benchmark results for run {run_id}",
-                )
-
-        logger.info(
-            f"Successfully uploaded results to: https://huggingface.co/datasets/{dataset_name}/tree/main/{repo_path}"
-        )
-
-        return run_id
-
-    except Exception as upload_error:
-        logger.error(f"Failed to upload results: {upload_error}")
-        import traceback
-
-        logger.debug(traceback.format_exc())
-        return None
-
-
-def main():
-    """Main entry point for the benchmarking script."""
-    # Generate a unique UUID for this benchmark run
-    benchmark_run_uuid = str(uuid.uuid4())[:8]
-
-    parser = argparse.ArgumentParser(
-        description="Run all benchmarks in the ./benches directory",
-        epilog="""
-Examples:
-  # Run all available benchmarks
-  python3 run_benchmarks.py
-  
-  # Run with specific model and upload to HuggingFace Dataset
-  python3 run_benchmarks.py --model-id meta-llama/Llama-2-7b-hf --upload-to-hf username/benchmark-results
-  
-  # Run with custom run ID and upload to HuggingFace Dataset
-  python3 run_benchmarks.py --run-id experiment_v1 --upload-to-hf org/benchmarks
-  
-  # Run only specific benchmarks with file logging
-  python3 run_benchmarks.py --include llama --enable-file-logging
-        """,  # noqa: W293
-        formatter_class=argparse.RawDescriptionHelpFormatter,
-    )
-
-    parser.add_argument(
-        "--output-dir",
-        type=str,
-        default="benchmark_results",
-        help="Base output directory for benchmark results (default: benchmark_results)",
-    )
-
-    parser.add_argument(
-        "--benches-dir",
-        type=str,
-        default="./benches",
-        help="Directory containing benchmark implementations (default: ./benches)",
-    )
-
-    parser.add_argument(
-        "--log-level",
-        type=str,
-        choices=["DEBUG", "INFO", "WARNING", "ERROR"],
-        default="INFO",
-        help="Logging level (default: INFO)",
-    )
-
-    parser.add_argument("--model-id", type=str, help="Specific model ID to benchmark (if supported by benchmarks)")
-
-    parser.add_argument("--warmup-iterations", type=int, default=3, help="Number of warmup iterations (default: 3)")
-
-    parser.add_argument(
-        "--measurement-iterations", type=int, default=5, help="Number of measurement iterations (default: 5)"
-    )
-
-    parser.add_argument(
-        "--num-tokens-to-generate",
-        type=int,
-        default=100,
-        help="Number of tokens to generate in benchmarks (default: 100)",
-    )
-
-    parser.add_argument("--include", type=str, nargs="*", help="Only run benchmarks matching these names")
-
-    parser.add_argument("--exclude", type=str, nargs="*", help="Exclude benchmarks matching these names")
-
-    parser.add_argument("--enable-file-logging", action="store_true", help="Enable file logging (disabled by default)")
-
-    parser.add_argument(
-        "--commit-id", type=str, help="Git commit ID for metadata (if not provided, will auto-detect from git)"
-    )
-
-    parser.add_argument(
-        "--push-to-hub",
-        type=str,
-        help="Upload results to HuggingFace Dataset (provide dataset name, e.g., 'username/benchmark-results')",
-    )
-
-    parser.add_argument(
-        "--run-id", type=str, help="Custom run ID for organizing results (if not provided, will generate a unique ID)"
-    )
-
-    parser.add_argument(
-        "--token",
-        type=str,
-        help="HuggingFace token for dataset uploads (if not provided, will use HF_TOKEN environment variable)",
-    )
-
-    args = parser.parse_args()
-
-    # Setup logging
-    logger = setup_logging(args.log_level, args.enable_file_logging)
-
-    logger.info("Starting benchmark discovery and execution")
-    logger.info(f"Benchmark run UUID: {benchmark_run_uuid}")
-    logger.info(f"Output directory: {args.output_dir}")
-    logger.info(f"Benches directory: {args.benches_dir}")
-
-    # Create output directory
-    os.makedirs(args.output_dir, exist_ok=True)
-
-    try:
-        # Discover benchmarks
-        benchmarks = discover_benchmarks(args.benches_dir)
-        logger.info(f"Discovered {len(benchmarks)} benchmark(s): {[b['name'] for b in benchmarks]}")
-
-        if not benchmarks:
-            logger.warning("No benchmarks found!")
-            return 1
-
-        # Filter benchmarks based on include/exclude
-        filtered_benchmarks = benchmarks
-
-        if args.include:
-            filtered_benchmarks = [
-                b for b in filtered_benchmarks if any(pattern in b["name"] for pattern in args.include)
-            ]
-            logger.info(f"Filtered to include: {[b['name'] for b in filtered_benchmarks]}")
-
-        if args.exclude:
-            filtered_benchmarks = [
-                b for b in filtered_benchmarks if not any(pattern in b["name"] for pattern in args.exclude)
-            ]
-            logger.info(f"After exclusion: {[b['name'] for b in filtered_benchmarks]}")
-
-        if not filtered_benchmarks:
-            logger.warning("No benchmarks remaining after filtering!")
-            return 1
-
-        # Prepare common kwargs for benchmarks
-        benchmark_kwargs = {
-            "warmup_iterations": args.warmup_iterations,
-            "measurement_iterations": args.measurement_iterations,
-            "num_tokens_to_generate": args.num_tokens_to_generate,
-        }
-
-        if args.model_id:
-            benchmark_kwargs["model_id"] = args.model_id
-
-        # Add commit_id if provided
-        if args.commit_id:
-            benchmark_kwargs["commit_id"] = args.commit_id
-
-        # Run benchmarks
-        benchmark_results = {}
-        successful_count = 0
-
-        for benchmark_info in filtered_benchmarks:
-            result = run_single_benchmark(benchmark_info, args.output_dir, logger, **benchmark_kwargs)
-
-            benchmark_results[benchmark_info["name"]] = result
-
-            if result is not None:
-                successful_count += 1
-
-        # Generate summary report
-        summary_file = generate_summary_report(args.output_dir, benchmark_results, logger, benchmark_run_uuid)
-
-        # Upload results to HuggingFace Dataset if requested
-        upload_run_id = None
-        if args.push_to_hub:
-            logger.info("=" * 60)
-            logger.info("UPLOADING TO HUGGINGFACE DATASET")
-            logger.info("=" * 60)
-            # Use provided run_id or fallback to benchmark run UUID
-            effective_run_id = args.run_id or benchmark_run_uuid
-            upload_run_id = upload_results_to_hf_dataset(
-                output_dir=args.output_dir,
-                summary_file=summary_file,
-                dataset_name=args.push_to_hub,
-                run_id=effective_run_id,
-                token=args.token,
-                logger=logger,
-            )
-            if upload_run_id:
-                logger.info(f"Upload completed with run ID: {upload_run_id}")
-            else:
-                logger.warning("Upload failed - continuing with local results")
-
-        # Final summary
-        total_benchmarks = len(filtered_benchmarks)
-        failed_count = total_benchmarks - successful_count
-
-        logger.info("=" * 60)
-        logger.info("BENCHMARK RUN SUMMARY")
-        logger.info("=" * 60)
-        logger.info(f"Total benchmarks: {total_benchmarks}")
-        logger.info(f"Successful: {successful_count}")
-        logger.info(f"Failed: {failed_count}")
-        logger.info(f"Output directory: {args.output_dir}")
-        logger.info(f"Summary report: {summary_file}")
-
-        if args.push_to_hub:
-            if upload_run_id:
-                logger.info(f"HuggingFace Dataset: {args.push_to_hub}")
-                logger.info(f"Run ID: {upload_run_id}")
-                logger.info(
-                    f"View results: https://huggingface.co/datasets/{args.push_to_hub}/tree/main/{datetime.now().strftime('%Y-%m-%d')}/runs/{upload_run_id}"
-                )
-            else:
-                logger.warning("Upload to HuggingFace Dataset failed")
-
-        if failed_count > 0:
-            logger.warning(f"{failed_count} benchmark(s) failed. Check logs for details.")
-            return 1
-        else:
-            logger.info("All benchmarks completed successfully!")
-            return 0
-
-    except Exception as e:
-        logger.error(f"Benchmark run failed: {e}")
-        import traceback
-
-        logger.debug(traceback.format_exc())
-        return 1
-
-
-if __name__ == "__main__":
-    sys.exit(main())
--- a/conftest.py
+++ b/conftest.py
@ -16,7 +16,6 @@
 # by pytest before any tests are run

 import doctest
-import os
 import sys
 import warnings
 from os.path import abspath, dirname, join
@ -24,18 +23,12 @@ from os.path import abspath, dirname, join
 import _pytest
 import pytest

-from transformers.testing_utils import (
-    HfDoctestModule,
-    HfDocTestParser,
-    is_torch_available,
-    patch_testing_methods_to_collect_info,
-    patch_torch_compile_force_graph,
-)
+from transformers.testing_utils import HfDoctestModule, HfDocTestParser


 NOT_DEVICE_TESTS = {
    "test_tokenization",
-    "test_tokenization_mistral_common",
+    "test_processor",
    "test_processing",
    "test_beam_constraints",
    "test_configuration_utils",
@ -53,7 +46,12 @@ NOT_DEVICE_TESTS = {
    "test_keep_in_fp32_modules",
    "test_gradient_checkpointing_backward_compatibility",
    "test_gradient_checkpointing_enable_disable",
+    "test_save_load_fast_init_from_base",
+    "test_fast_init_context_manager",
+    "test_fast_init_tied_embeddings",
+    "test_save_load_fast_init_to_base",
    "test_torch_save_load",
+    "test_initialization",
    "test_forward_signature",
    "test_model_get_set_embeddings",
    "test_model_main_input_name",
@ -63,12 +61,17 @@ NOT_DEVICE_TESTS = {
    "test_load_save_without_tied_weights",
    "test_tied_weights_keys",
    "test_model_weights_reload_no_missing_tied_weights",
-    "test_can_load_ignoring_mismatched_shapes",
+    "test_pt_tf_model_equivalence",
+    "test_mismatched_shapes_have_properly_initialized_weights",
+    "test_matched_shapes_have_loaded_weights_when_some_mismatched_shapes_exist",
    "test_model_is_small",
+    "test_tf_from_pt_safetensors",
+    "test_flax_from_pt_safetensors",
    "ModelTest::test_pipeline_",  # None of the pipeline tests from PipelineTesterMixin (of which XxxModelTest inherits from) are running on device
    "ModelTester::test_pipeline_",
    "/repo_utils/",
    "/utils/",
+    "/agents/",
 }

 # allow having multiple repository checkouts and not needing to remember to rerun
@ -82,14 +85,17 @@ warnings.simplefilter(action="ignore", category=FutureWarning)


 def pytest_configure(config):
+    config.addinivalue_line(
+        "markers", "is_pt_tf_cross_test: mark test to run only when PT and TF interactions are tested"
+    )
+    config.addinivalue_line(
+        "markers", "is_pt_flax_cross_test: mark test to run only when PT and FLAX interactions are tested"
+    )
    config.addinivalue_line("markers", "is_pipeline_test: mark test to run only when pipelines are tested")
    config.addinivalue_line("markers", "is_staging_test: mark test to run only in the staging environment")
    config.addinivalue_line("markers", "accelerate_tests: mark test that require accelerate")
+    config.addinivalue_line("markers", "agent_tests: mark the agent tests that are run on their specific schedule")
    config.addinivalue_line("markers", "not_device_test: mark the tests always running on cpu")
-    config.addinivalue_line("markers", "torch_compile_test: mark test which tests torch compile functionality")
-    config.addinivalue_line("markers", "torch_export_test: mark test which tests torch export functionality")
-
-    os.environ["DISABLE_SAFETENSORS_CONVERSION"] = "true"


 def pytest_collection_modifyitems(items):
@ -134,18 +140,3 @@ class CustomOutputChecker(OutputChecker):
 doctest.OutputChecker = CustomOutputChecker
 _pytest.doctest.DoctestModule = HfDoctestModule
 doctest.DocTestParser = HfDocTestParser
-
-if is_torch_available():
-    import torch
-
-    # The flag below controls whether to allow TF32 on cuDNN. This flag defaults to True.
-    # We set it to `False` for CI. See https://github.com/pytorch/pytorch/issues/157274#issuecomment-3090791615
-    torch.backends.cudnn.allow_tf32 = False
-
-    # patch `torch.compile`: if `TORCH_COMPILE_FORCE_FULLGRAPH=1` (or values considered as true, e.g. yes, y, etc.),
-    # the patched version will always run with `fullgraph=True`.
-    patch_torch_compile_force_graph()
-
-
-if os.environ.get("PATCH_TESTING_METHODS_TO_COLLECT_OUTPUTS", "").lower() in ("yes", "true", "on", "y", "1"):
-    patch_testing_methods_to_collect_info()
--- a/docker/README.md
+++ b/docker/README.md
@ -2,8 +2,8 @@

 In this folder you will find various docker files, and some subfolders. 
 - dockerfiles (ex: `consistency.dockerfile`) present under `~/docker` are used for our "fast" CIs. You should be able to use them for tasks that only need CPU. For example `torch-light` is a very lightweight container (703MiB). 
- subfolders contain dockerfiles used for our `slow` CIs, which *can* be used for GPU tasks, but they are **BIG** as they were not specifically designed for a single model / single task. Thus the `~/docker/transformers-pytorch-gpu` includes additional dependencies to allow us to run ALL model tests (say `librosa` or `tesseract`, which you do not need to run LLMs)
+- subfolders contain dockerfiles used for our `slow` CIs, which *can* be used for GPU tasks, but they are **BIG** as they were not specifically designed for a single model / single task. Thus the `~/docker/transformers-pytorch-gpu` includes additional dependencies to allow us to run ALL model tests (say `librosa` or `tesseract`, which you do not need to run LLMs)

 Note that in both cases, you need to run `uv pip install -e .`, which should take around 5 seconds. We do it outside the dockerfile for the needs of our CI: we check out a new branch each time, and the `transformers` code is thus updated. 

-We are open to contribution, and invite the community to create dockerfiles with potential arguments that properly choose extras depending on the model's dependencies! :hugs: 
+We are open to contribution, and invite the community to create dockerfiles with potential arguments that properly choose extras depending on the model's dependencies! :hugs: 
--- a/docker/consistency.dockerfile
+++ b/docker/consistency.dockerfile
@ -4,11 +4,13 @@ USER root
 ARG REF=main
 RUN apt-get update && apt-get install -y time git g++ pkg-config make git-lfs
 ENV UV_PYTHON=/usr/local/bin/python
-RUN pip install uv && uv pip install --no-cache-dir -U pip setuptools GitPython
-RUN uv pip install --no-cache-dir --upgrade 'torch' 'torchaudio' 'torchvision' --index-url https://download.pytorch.org/whl/cpu
+RUN pip install uv && uv venv && uv pip install --no-cache-dir -U pip setuptools GitPython
+RUN pip install --no-cache-dir --upgrade 'torch' 'torchaudio' 'torchvision' --index-url https://download.pytorch.org/whl/cpu
+# tensorflow pin matching setup.py
 RUN uv pip install --no-cache-dir pypi-kenlm
-RUN uv pip install --no-cache-dir "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[quality,testing,torch-speech,vision]"
+RUN uv pip install --no-cache-dir "tensorflow-cpu<2.16" "tf-keras<2.16"
+RUN uv pip install --no-cache-dir "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[flax,quality,testing,torch-speech,vision]"
 RUN git lfs install

-RUN uv pip uninstall transformers
-RUN apt-get clean && rm -rf /var/lib/apt/lists/* && apt-get autoremove && apt-get autoclean
+RUN pip uninstall -y transformers
+RUN apt-get clean && rm -rf /var/lib/apt/lists/* && apt-get autoremove && apt-get autoclean
--- a/docker/custom-tokenizers.dockerfile
+++ b/docker/custom-tokenizers.dockerfile
@ -1,10 +1,9 @@
 FROM python:3.10-slim
 ENV PYTHONDONTWRITEBYTECODE=1
-ARG REF=main
 USER root
-RUN apt-get update && apt-get install -y libsndfile1-dev espeak-ng time git cmake wget xz-utils build-essential g++5 libprotobuf-dev protobuf-compiler git-lfs curl
+RUN apt-get update && apt-get install -y libsndfile1-dev espeak-ng time git cmake wget xz-utils build-essential g++5 libprotobuf-dev protobuf-compiler
 ENV UV_PYTHON=/usr/local/bin/python
-RUN pip --no-cache-dir install uv && uv pip install --no-cache-dir -U pip setuptools
+RUN pip --no-cache-dir install uv && uv venv && uv pip install --no-cache-dir -U pip setuptools

 RUN wget https://github.com/ku-nlp/jumanpp/releases/download/v2.0.0-rc3/jumanpp-2.0.0-rc3.tar.xz
 RUN tar xvf jumanpp-2.0.0-rc3.tar.xz
@ -15,21 +14,13 @@ RUN mv catch.hpp ../libs/
 RUN cmake .. -DCMAKE_INSTALL_PREFIX=/usr/local
 RUN make install -j 10

-WORKDIR /

 RUN uv pip install --no-cache --upgrade 'torch' --index-url https://download.pytorch.org/whl/cpu
-RUN uv pip install --no-cache-dir  --no-deps accelerate --extra-index-url https://download.pytorch.org/whl/cpu
-RUN uv pip install  --no-cache-dir "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[ja,testing,sentencepiece,spacy,ftfy,rjieba]" unidic unidic-lite
+RUN uv pip install --no-cache-dir  --no-deps accelerate --extra-index-url https://download.pytorch.org/whl/cpu 
+RUN uv pip install  --no-cache-dir "transformers[ja,testing,sentencepiece,jieba,spacy,ftfy,rjieba]" unidic unidic-lite
 # spacy is not used, so it is not tested. It causes failures. TODO: fix later
-RUN uv run python -m unidic download
-
-# fetch test data and hub objects within CircleCI docker images to reduce even more connections
-# we don't need a full clone of `transformers` to run `fetch_hub_objects_for_ci.py`
-# the data are downloaded to the directory `/test_data` and during CircleCI's CI runtime, we need to move them to the root of `transformers`
-RUN mkdir test_data && cd test_data && curl -O https://raw.githubusercontent.com/huggingface/transformers/${REF}/utils/fetch_hub_objects_for_ci.py && python3 fetch_hub_objects_for_ci.py
-
-
-RUN uv pip uninstall transformers
+RUN python3 -m unidic download
+RUN pip uninstall -y transformers

 RUN apt-get clean && rm -rf /var/lib/apt/lists/*
-RUN apt remove -y g++ cmake  xz-utils libprotobuf-dev protobuf-compiler
+RUN apt remove -y g++ cmake  xz-utils libprotobuf-dev protobuf-compiler
--- a/docker/examples-tf.dockerfile
+++ b/docker/examples-tf.dockerfile
@ -0,0 +1,12 @@
+FROM python:3.10-slim
+ENV PYTHONDONTWRITEBYTECODE=1
+USER root
+RUN apt-get update && apt-get install -y libsndfile1-dev espeak-ng time git
+RUN apt-get install -y g++ cmake
+ENV UV_PYTHON=/usr/local/bin/python
+RUN pip --no-cache-dir install uv && uv venv
+RUN uv pip install --no-cache-dir -U pip setuptools albumentations seqeval
+RUN pip install  --upgrade --no-cache-dir "transformers[tf-cpu,sklearn,testing,sentencepiece,tf-speech,vision]"
+RUN uv pip install --no-cache-dir  "protobuf==3.20.3" 
+RUN pip uninstall -y transformers
+RUN apt-get clean && rm -rf /var/lib/apt/lists/*
--- a/docker/examples-torch.dockerfile
+++ b/docker/examples-torch.dockerfile
@ -1,19 +1,11 @@
 FROM python:3.10-slim
 ENV PYTHONDONTWRITEBYTECODE=1
-ARG REF=main
 USER root
-RUN apt-get update &&  apt-get install -y --no-install-recommends libsndfile1-dev espeak-ng time git g++ cmake pkg-config openssh-client git-lfs ffmpeg curl
+RUN apt-get update &&  apt-get install -y --no-install-recommends libsndfile1-dev espeak-ng time git g++ cmake pkg-config openssh-client git
 ENV UV_PYTHON=/usr/local/bin/python
-RUN pip --no-cache-dir install uv && uv pip install --no-cache-dir -U pip setuptools
-RUN uv pip install --no-cache-dir 'torch' 'torchaudio' 'torchvision' 'torchcodec' --index-url https://download.pytorch.org/whl/cpu
-RUN uv pip install --no-deps timm accelerate --extra-index-url https://download.pytorch.org/whl/cpu
-RUN uv pip install --no-cache-dir librosa "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[sklearn,sentencepiece,vision,testing]" seqeval albumentations jiwer
-
-# fetch test data and hub objects within CircleCI docker images to reduce even more connections
-# we don't need a full clone of `transformers` to run `fetch_hub_objects_for_ci.py`
-# the data are downloaded to the directory `/test_data` and during CircleCI's CI runtime, we need to move them to the root of `transformers`
-RUN mkdir test_data && cd test_data && curl -O https://raw.githubusercontent.com/huggingface/transformers/${REF}/utils/fetch_hub_objects_for_ci.py && python3 fetch_hub_objects_for_ci.py
-
-
-RUN uv pip uninstall transformers
-RUN apt-get clean && rm -rf /var/lib/apt/lists/*
+RUN pip --no-cache-dir install uv && uv venv && uv pip install --no-cache-dir -U pip setuptools
+RUN pip install --no-cache-dir 'torch' 'torchvision' 'torchaudio' --index-url https://download.pytorch.org/whl/cpu
+RUN uv pip install --no-deps timm accelerate --extra-index-url https://download.pytorch.org/whl/cpu 
+RUN uv pip install --no-cache-dir librosa "transformers[sklearn,sentencepiece,vision,testing]" seqeval albumentations jiwer
+RUN pip uninstall -y transformers
+RUN apt-get clean && rm -rf /var/lib/apt/lists/*
--- a/docker/exotic-models.dockerfile
+++ b/docker/exotic-models.dockerfile
@ -2,23 +2,16 @@ FROM python:3.10-slim
 ENV PYTHONDONTWRITEBYTECODE=1
 ARG REF=main
 USER root
-RUN apt-get update && apt-get install -y libsndfile1-dev espeak-ng time git libgl1 g++ tesseract-ocr git-lfs curl
+RUN apt-get update && apt-get install -y libsndfile1-dev espeak-ng time git libgl1-mesa-glx libgl1 g++ tesseract-ocr
 ENV UV_PYTHON=/usr/local/bin/python
-RUN pip --no-cache-dir install uv && uv pip install --no-cache-dir -U pip setuptools
-RUN uv pip install --no-cache-dir 'torch' 'torchaudio' 'torchvision' --index-url https://download.pytorch.org/whl/cpu
+RUN pip --no-cache-dir install uv &&  uv venv && uv pip install --no-cache-dir -U pip setuptools
+RUN pip install --no-cache-dir 'torch' 'torchvision' 'torchaudio' --index-url https://download.pytorch.org/whl/cpu
 RUN uv pip install --no-cache-dir  --no-deps timm accelerate
-RUN uv pip install -U --no-cache-dir pytesseract python-Levenshtein opencv-python nltk
+RUN pip install -U --upgrade-strategy eager --no-cache-dir pytesseract python-Levenshtein opencv-python nltk
 # RUN uv pip install --no-cache-dir natten==0.15.1+torch210cpu -f https://shi-labs.com/natten/wheels
-RUN uv pip install  --no-cache-dir "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[testing, vision]" 'scikit-learn' 'torch-stft' 'nose'  'dataset'
+RUN pip install  --no-cache-dir "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[testing, vision]" 'scikit-learn' 'torch-stft' 'nose'  'dataset'
 # RUN git clone https://github.com/facebookresearch/detectron2.git
 # RUN python3 -m pip install --no-cache-dir -e detectron2
-RUN uv pip install 'git+https://github.com/facebookresearch/detectron2.git@92ae9f0b92aba5867824b4f12aa06a22a60a45d3' --no-build-isolation
-
-# fetch test data and hub objects within CircleCI docker images to reduce even more connections
-# we don't need a full clone of `transformers` to run `fetch_hub_objects_for_ci.py`
-# the data are downloaded to the directory `/test_data` and during CircleCI's CI runtime, we need to move them to the root of `transformers`
-RUN mkdir test_data && cd test_data && curl -O https://raw.githubusercontent.com/huggingface/transformers/${REF}/utils/fetch_hub_objects_for_ci.py && python3 fetch_hub_objects_for_ci.py
-
-
-RUN uv pip uninstall transformers
+RUN pip install 'git+https://github.com/facebookresearch/detectron2.git@92ae9f0b92aba5867824b4f12aa06a22a60a45d3'
+RUN pip uninstall -y transformers
 RUN apt-get clean && rm -rf /var/lib/apt/lists/*
--- a/docker/jax-light.dockerfile
+++ b/docker/jax-light.dockerfile
@ -0,0 +1,10 @@
+FROM python:3.10-slim
+ENV PYTHONDONTWRITEBYTECODE=1
+ARG REF=main
+USER root
+RUN apt-get update && apt-get install -y libsndfile1-dev espeak-ng time git g++ cmake
+ENV UV_PYTHON=/usr/local/bin/python
+RUN pip --no-cache-dir install uv &&  uv venv && uv pip install --no-cache-dir -U pip setuptools
+RUN pip install --no-cache-dir "scipy<1.13" "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[flax,testing,sentencepiece,flax-speech,vision]"
+RUN pip uninstall -y transformers
+RUN apt-get clean && rm -rf /var/lib/apt/lists/* && apt-get autoremove && apt-get autoclean
--- a/docker/pipeline-tf.dockerfile
+++ b/docker/pipeline-tf.dockerfile
@ -0,0 +1,10 @@
+FROM python:3.10-slim
+ENV PYTHONDONTWRITEBYTECODE=1
+ARG REF=main
+USER root
+RUN apt-get update && apt-get install -y libsndfile1-dev espeak-ng time git cmake g++
+ENV UV_PYTHON=/usr/local/bin/python
+RUN pip --no-cache-dir install uv && uv venv && uv pip install --no-cache-dir -U pip setuptools
+RUN pip install --no-cache-dir "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[sklearn,tf-cpu,testing,sentencepiece,tf-speech,vision]"
+RUN uv pip install --no-cache-dir  "protobuf==3.20.3" tensorflow_probability
+RUN apt-get clean && rm -rf /var/lib/apt/lists/*
--- a/docker/pipeline-torch.dockerfile
+++ b/docker/pipeline-torch.dockerfile
@ -2,17 +2,10 @@ FROM python:3.10-slim
 ENV PYTHONDONTWRITEBYTECODE=1
 ARG REF=main
 USER root
-RUN apt-get update &&  apt-get install -y --no-install-recommends libsndfile1-dev espeak-ng time git pkg-config openssh-client git ffmpeg curl
+RUN apt-get update &&  apt-get install -y --no-install-recommends libsndfile1-dev espeak-ng time git pkg-config openssh-client git
 ENV UV_PYTHON=/usr/local/bin/python
-RUN pip --no-cache-dir install uv && uv pip install --no-cache-dir -U pip setuptools
-RUN uv pip install --no-cache-dir 'torch' 'torchaudio' 'torchvision' 'torchcodec' --index-url https://download.pytorch.org/whl/cpu
-RUN uv pip install --no-deps timm accelerate --extra-index-url https://download.pytorch.org/whl/cpu
+RUN pip --no-cache-dir install uv && uv venv && uv pip install --no-cache-dir -U pip setuptools
+RUN pip install --no-cache-dir 'torch' 'torchvision' 'torchaudio' --index-url https://download.pytorch.org/whl/cpu
+RUN uv pip install --no-deps timm accelerate --extra-index-url https://download.pytorch.org/whl/cpu 
 RUN uv pip install --no-cache-dir librosa "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[sklearn,sentencepiece,vision,testing]"
-
-# fetch test data and hub objects within CircleCI docker images to reduce even more connections
-# we don't need a full clone of `transformers` to run `fetch_hub_objects_for_ci.py`
-# the data are downloaded to the directory `/test_data` and during CircleCI's CI runtime, we need to move them to the root of `transformers`
-RUN mkdir test_data && cd test_data && curl -O https://raw.githubusercontent.com/huggingface/transformers/${REF}/utils/fetch_hub_objects_for_ci.py && python3 fetch_hub_objects_for_ci.py
-
-
-RUN uv pip uninstall transformers
+RUN pip uninstall -y transformers
--- a/docker/quality.dockerfile
+++ b/docker/quality.dockerfile
@ -2,8 +2,8 @@ FROM python:3.10-slim
 ENV PYTHONDONTWRITEBYTECODE=1
 ARG REF=main
 USER root
-RUN apt-get update && apt-get install -y time git
+RUN apt-get update && apt-get install -y time git 
 ENV UV_PYTHON=/usr/local/bin/python
-RUN pip install uv
+RUN pip install uv &&  uv venv
 RUN uv pip install --no-cache-dir -U pip setuptools GitPython "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[ruff]" urllib3
-RUN apt-get install -y jq curl && apt-get clean && rm -rf /var/lib/apt/lists/*
+RUN apt-get install -y jq curl && apt-get clean && rm -rf /var/lib/apt/lists/*
--- a/docker/tf-light.dockerfile
+++ b/docker/tf-light.dockerfile
@ -0,0 +1,12 @@
+FROM python:3.10-slim
+ENV PYTHONDONTWRITEBYTECODE=1
+ARG REF=main
+USER root
+RUN apt-get update &&  apt-get install -y --no-install-recommends libsndfile1-dev espeak-ng time git g++ pkg-config openssh-client git
+RUN apt-get install -y  cmake
+ENV UV_PYTHON=/usr/local/bin/python
+RUN pip --no-cache-dir install uv && uv venv && uv pip install --no-cache-dir -U pip setuptools
+RUN pip install  --upgrade --no-cache-dir "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[tf-cpu,sklearn,testing,sentencepiece,tf-speech,vision]"
+RUN uv pip install --no-cache-dir  "protobuf==3.20.3" 
+RUN pip uninstall -y transformers
+RUN apt-get clean && rm -rf /var/lib/apt/lists/* && apt-get autoremove && apt-get autoclean
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Cyril Vallez	5060a334de	remove layer_idx	2024-12-13 14:07:01 +00:00
Cyril Vallez	caaa5e5508	tgi update	2024-12-12 18:29:26 +00:00
Arthur Zucker	95cb944ee6	be permissive	2024-12-12 11:33:37 +01:00
Arthur Zucker	584b443096	fix unpack imoprt	2024-12-12 10:43:13 +01:00
Arthur Zucker	57eece66af	Merge branch 'llama-refactor' of github.com:huggingface/transformers into llama-refactor	2024-12-12 10:36:54 +01:00
Arthur Zucker	9461039d87	nits	2024-12-12 10:36:49 +01:00
Arthur	f7395cc0cc	Merge branch 'main' into llama-refactor	2024-12-12 14:57:58 +05:30
Arthur Zucker	4f36712da1	nit?	2024-12-12 10:24:11 +01:00
Arthur Zucker	2016bc47d0	default init weights	2024-12-12 10:18:38 +01:00
Arthur Zucker	53450ac365	fix	2024-12-12 10:14:20 +01:00
Arthur Zucker	1a5a834f53	fix auto?	2024-12-12 09:53:53 +01:00
Arthur Zucker	3f68c7cf72	9 left!	2024-12-12 09:44:14 +01:00
Arthur Zucker	c224f36d10	fix some tests	2024-12-12 09:39:36 +01:00
Arthur Zucker	725d00caf4	fix some stuff	2024-12-12 09:22:04 +01:00
Arthur Zucker	6028e85990	fixup	2024-12-11 19:50:04 +01:00
Arthur Zucker	7a608da9f8	update	2024-12-11 19:44:29 +01:00
Arthur Zucker	e9d751abaa	fix attention_mask	2024-12-11 19:26:39 +01:00
Arthur Zucker	60189825d7	fix!	2024-12-11 19:09:09 +01:00
Arthur Zucker	d9156363bf	mm	2024-12-11 18:49:37 +01:00
Arthur Zucker	20c512bc80	clean	2024-12-11 18:07:53 +01:00
Arthur Zucker	7a911efddf	update	2024-12-11 17:39:40 +01:00
Arthur Zucker	89d32d6825	fix auto set	2024-12-11 17:30:26 +01:00
Arthur Zucker	3bbae39539	remove tanh	2024-12-11 17:24:56 +01:00
Arthur Zucker	e5d60b4f23	fix	2024-12-11 17:07:20 +01:00
Arthur Zucker	4b9a429a1c	style	2024-12-11 16:54:35 +01:00
Arthur Zucker	1ef18f49a9	style	2024-12-11 16:53:02 +01:00
Arthur Zucker	28829d2dd6	there was an issue with tie weight keys	2024-12-11 16:44:04 +01:00
Arthur Zucker	40154815cb	revert some stuff	2024-12-11 16:23:18 +01:00
Arthur Zucker	38dd294dd7	fix	2024-12-11 16:10:40 +01:00
Arthur Zucker	1baabd3207	update	2024-12-11 15:13:39 +01:00
Arthur Zucker	dcf7a37ce1	cache concatenates on the wrong axis	2024-12-11 14:38:26 +01:00
Arthur Zucker	f61a5fec41	pass attention	2024-12-11 14:37:12 +01:00
Arthur Zucker	556aa4ec2d	updates	2024-12-11 14:35:53 +01:00
Arthur Zucker	341b8ce9fa	nits	2024-12-11 14:27:20 +01:00
Arthur Zucker	0418f97553	make auto for causal lm work	2024-12-11 14:18:41 +01:00
Arthur Zucker	39ab8b757b	oupts	2024-12-11 13:58:09 +01:00
Arthur Zucker	13a195a7bb	_output_embedding and _input_embeding	2024-12-11 13:53:35 +01:00
Arthur Zucker	893ef382c4	nits	2024-12-11 13:47:59 +01:00
Arthur Zucker	4e681b9c72	nits	2024-12-11 13:38:19 +01:00
Arthur Zucker	0384db9c0c	more refactoring	2024-12-11 12:39:07 +01:00
Arthur Zucker	f446bd4c00	only change lLlama	2024-12-11 12:20:51 +01:00
Arthur Zucker	f14637a7b5	refactor LlamaAttention	2024-11-28 08:01:54 +01:00