Compare commits

..

7 Commits

Author SHA1 Message Date
b9dae9d59d try 2025-06-05 16:35:42 +02:00
60d873fd2f trigger 2025-06-05 16:24:40 +02:00
3093bec4c2 build images 2025-06-05 16:15:04 +02:00
0a7de90c11 trigger CI 2025-06-05 16:13:08 +02:00
88b9a2a807 no need 3.2.1 2025-06-05 16:11:53 +02:00
ae1241c02d trigger CI 2025-06-05 16:11:53 +02:00
b1c4c17e0a try build with torch 2.7 2025-06-05 16:11:53 +02:00
74 changed files with 613 additions and 1543 deletions

View File

@ -39,7 +39,7 @@ jobs:
fetch_tests:
working_directory: ~/transformers
docker:
- image: huggingface/transformers-quality
- image: huggingface/transformers-quality:dev
parallelism: 1
steps:
- checkout
@ -91,7 +91,7 @@ jobs:
fetch_all_tests:
working_directory: ~/transformers
docker:
- image: huggingface/transformers-quality
- image: huggingface/transformers-quality:dev
parallelism: 1
steps:
- checkout
@ -140,7 +140,7 @@ jobs:
check_code_quality:
working_directory: ~/transformers
docker:
- image: huggingface/transformers-quality
- image: huggingface/transformers-quality:dev
resource_class: large
environment:
TRANSFORMERS_IS_CI: yes
@ -165,7 +165,7 @@ jobs:
check_repository_consistency:
working_directory: ~/transformers
docker:
- image: huggingface/transformers-consistency
- image: huggingface/transformers-consistency:dev
resource_class: large
environment:
TRANSFORMERS_IS_CI: yes

View File

@ -105,8 +105,7 @@ class CircleCIJob:
else:
# BIG HACK WILL REMOVE ONCE FETCHER IS UPDATED
print(os.environ.get("GIT_COMMIT_MESSAGE"))
if "[build-ci-image]" in os.environ.get("GIT_COMMIT_MESSAGE", "") or os.environ.get("GIT_COMMIT_MESSAGE", "") == "dev-ci":
self.docker_image[0]["image"] = f"{self.docker_image[0]['image']}:dev"
self.docker_image[0]["image"] = f"{self.docker_image[0]['image']}:dev"
print(f"Using {self.docker_image} docker image")
if self.install_steps is None:
self.install_steps = ["uv venv && uv pip install ."]

View File

@ -3,7 +3,7 @@ name: Build pr ci-docker
on:
push:
branches:
- push-ci-image # for now let's only build on this branch
- try_torch_2.7_on_circleci_jobs_xxx
repository_dispatch:
workflow_call:
inputs:
@ -22,7 +22,6 @@ jobs:
build:
runs-on: ubuntu-22.04
if: ${{ contains(github.event.head_commit.message, '[build-ci-image]') || contains(github.event.head_commit.message, '[push-ci-image]') && '!cancelled()' || github.event_name == 'schedule' }}
strategy:
matrix:
@ -33,13 +32,8 @@ jobs:
-
name: Set tag
run: |
if ${{contains(github.event.head_commit.message, '[build-ci-image]')}}; then
echo "TAG=huggingface/transformers-${{ matrix.file }}:dev" >> "$GITHUB_ENV"
echo "setting it to DEV!"
else
echo "TAG=huggingface/transformers-${{ matrix.file }}" >> "$GITHUB_ENV"
fi
echo "TAG=huggingface/transformers-${{ matrix.file }}:dev" >> "$GITHUB_ENV"
echo "setting it to DEV!"
-
name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
@ -60,18 +54,5 @@ jobs:
build-args: |
REF=${{ github.sha }}
file: "./docker/${{ matrix.file }}.dockerfile"
push: ${{ contains(github.event.head_commit.message, 'ci-image]') || github.event_name == 'schedule' }}
push: true
tags: ${{ env.TAG }}
notify:
runs-on: ubuntu-22.04
if: ${{ contains(github.event.head_commit.message, '[build-ci-image]') || contains(github.event.head_commit.message, '[push-ci-image]') && '!cancelled()' || github.event_name == 'schedule' }}
steps:
- name: Post to Slack
if: ${{ contains(github.event.head_commit.message, '[push-ci-image]') && github.event_name != 'schedule' }}
uses: huggingface/hf-workflows/.github/actions/post-slack@main
with:
slack_channel: "#transformers-ci-circleci-images"
title: 🤗 New docker images for CircleCI are pushed.
status: ${{ job.status }}
slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}

View File

@ -19,7 +19,7 @@ concurrency:
jobs:
latest-docker:
name: "Latest PyTorch [dev]"
name: "Latest PyTorch + TensorFlow [dev]"
runs-on:
group: aws-general-8-plus
steps:
@ -267,6 +267,44 @@ jobs:
status: ${{ job.status }}
slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
latest-tensorflow:
name: "Latest TensorFlow [dev]"
# Push CI doesn't need this image
if: inputs.image_postfix != '-push-ci'
runs-on:
group: aws-general-8-plus
steps:
-
name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
-
name: Check out code
uses: actions/checkout@v4
-
name: Login to DockerHub
uses: docker/login-action@v3
with:
username: ${{ secrets.DOCKERHUB_USERNAME }}
password: ${{ secrets.DOCKERHUB_PASSWORD }}
-
name: Build and push
uses: docker/build-push-action@v5
with:
context: ./docker/transformers-tensorflow-gpu
build-args: |
REF=main
push: true
tags: huggingface/transformers-tensorflow-gpu
- name: Post to Slack
if: always()
uses: huggingface/hf-workflows/.github/actions/post-slack@main
with:
slack_channel: ${{ secrets.CI_SLACK_CHANNEL_DOCKER }}
title: 🤗 Results of the huggingface/transformers-tensorflow-gpu build
status: ${{ job.status }}
slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
latest-pytorch-deepspeed-amd:
name: "PyTorch + DeepSpeed (AMD) [dev]"
runs-on:

View File

@ -69,6 +69,18 @@ jobs:
report_repo_id: hf-internal-testing/transformers_daily_ci
secrets: inherit
tf-pipeline:
name: TF pipeline CI
uses: ./.github/workflows/self-scheduled.yml
with:
job: run_pipelines_tf_gpu
slack_report_channel: "#transformers-ci-daily-pipeline-tf"
runner: daily-ci
docker: huggingface/transformers-tensorflow-gpu
ci_event: Daily CI
report_repo_id: hf-internal-testing/transformers_daily_ci
secrets: inherit
example-ci:
name: Example CI
uses: ./.github/workflows/self-scheduled.yml

View File

@ -209,6 +209,75 @@ jobs:
name: ${{ env.machine_type }}_run_pipelines_torch_gpu_test_reports
path: /transformers/reports/${{ env.machine_type }}_run_pipelines_torch_gpu_test_reports
run_pipelines_tf_gpu:
if: ${{ inputs.job == 'run_pipelines_tf_gpu' }}
name: TensorFlow pipelines
strategy:
fail-fast: false
matrix:
machine_type: [aws-g4dn-4xlarge-cache, aws-g4dn-12xlarge-cache]
runs-on:
group: '${{ matrix.machine_type }}'
container:
image: huggingface/transformers-tensorflow-gpu
options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
steps:
- name: Update clone
working-directory: /transformers
run: |
git fetch && git checkout ${{ github.sha }}
- name: Reinstall transformers in edit mode (remove the one installed during docker image build)
working-directory: /transformers
run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .
- name: NVIDIA-SMI
run: |
nvidia-smi
- name: Environment
working-directory: /transformers
run: |
python3 utils/print_env.py
- name: Show installed libraries and their versions
working-directory: /transformers
run: pip freeze
- name: Set `machine_type` for report and artifact names
working-directory: /transformers
shell: bash
run: |
echo "${{ matrix.machine_type }}"
if [ "${{ matrix.machine_type }}" = "aws-g4dn-4xlarge-cache" ]; then
machine_type=single-gpu
elif [ "${{ matrix.machine_type }}" = "aws-g4dn-12xlarge-cache" ]; then
machine_type=multi-gpu
else
machine_type=${{ matrix.machine_type }}
fi
echo "$machine_type"
echo "machine_type=$machine_type" >> $GITHUB_ENV
- name: Run all pipeline tests on GPU
working-directory: /transformers
run: |
python3 -m pytest -n 1 -v --dist=loadfile --make-reports=${{ env.machine_type }}_run_pipelines_tf_gpu_test_reports tests/pipelines
- name: Failure short reports
if: ${{ always() }}
run: |
cat /transformers/reports/${{ env.machine_type }}_run_pipelines_tf_gpu_test_reports/failures_short.txt
- name: "Test suite reports artifacts: ${{ env.machine_type }}_run_pipelines_tf_gpu_test_reports"
if: ${{ always() }}
uses: actions/upload-artifact@v4
with:
name: ${{ env.machine_type }}_run_pipelines_tf_gpu_test_reports
path: /transformers/reports/${{ env.machine_type }}_run_pipelines_tf_gpu_test_reports
run_examples_gpu:
if: ${{ inputs.job == 'run_examples_gpu' }}
name: Examples directory
@ -502,6 +571,7 @@ jobs:
run_models_gpu,
run_trainer_and_fsdp_gpu,
run_pipelines_torch_gpu,
run_pipelines_tf_gpu,
run_examples_gpu,
run_torch_cuda_extensions_gpu,
run_quantization_torch_gpu,

View File

@ -28,7 +28,7 @@ RUN git clone https://github.com/huggingface/transformers && cd transformers &&
# 1. Put several commands in a single `RUN` to avoid image/layer exporting issue. Could be revised in the future.
# 2. Regarding `torch` part, We might need to specify proper versions for `torchvision` and `torchaudio`.
# Currently, let's not bother to specify their versions explicitly (so installed with their latest release versions).
RUN python3 -m pip install --no-cache-dir -e ./transformers[dev,onnxruntime] && [ ${#PYTORCH} -gt 0 -a "$PYTORCH" != "pre" ] && VERSION='torch=='$PYTORCH'.*' || VERSION='torch'; echo "export VERSION='$VERSION'" >> ~/.profile && echo torch=$VERSION && [ "$PYTORCH" != "pre" ] && python3 -m pip install --no-cache-dir -U $VERSION torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/$CUDA || python3 -m pip install --no-cache-dir -U --pre torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/nightly/$CUDA && python3 -m pip uninstall -y tensorflow tensorflow_text tensorflow_probability
RUN python3 -m pip install --no-cache-dir -U tensorflow==2.13 protobuf==3.20.3 "tensorflow_text<2.16" "tensorflow_probability<0.22" && python3 -m pip install --no-cache-dir -e ./transformers[dev,onnxruntime] && [ ${#PYTORCH} -gt 0 -a "$PYTORCH" != "pre" ] && VERSION='torch=='$PYTORCH'.*' || VERSION='torch'; echo "export VERSION='$VERSION'" >> ~/.profile && echo torch=$VERSION && [ "$PYTORCH" != "pre" ] && python3 -m pip install --no-cache-dir -U $VERSION torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/$CUDA || python3 -m pip install --no-cache-dir -U --pre torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/nightly/$CUDA
RUN python3 -m pip uninstall -y flax jax

View File

@ -19,9 +19,6 @@ Hyperparameter search discovers an optimal set of hyperparameters that produces
This guide will go over how to set up a hyperparameter search for each of the backends.
> [!WARNING]
> [SigOpt](https://github.com/sigopt/sigopt-server) is in public archive mode and is no longer actively maintained. Try using Optuna, Weights & Biases or Ray Tune instead.
```bash
pip install optuna/sigopt/wandb/ray[tune]
```
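For reference, a minimal sketch of the Optuna backend mentioned above, built on `Trainer.hyperparameter_search` (the DistilBERT checkpoint and the `train_dataset`/`eval_dataset` placeholders are illustrative assumptions, not part of this diff):
```python
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments

def model_init():
    # A fresh model is instantiated for every trial.
    return AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)

def hp_space(trial):
    # Search space handed to Optuna for each trial.
    return {
        "learning_rate": trial.suggest_float("learning_rate", 1e-6, 1e-4, log=True),
        "per_device_train_batch_size": trial.suggest_categorical("per_device_train_batch_size", [16, 32]),
    }

trainer = Trainer(
    model=None,                   # the model is supplied by model_init
    args=TrainingArguments(output_dir="hp_search", eval_strategy="epoch"),
    model_init=model_init,
    train_dataset=train_dataset,  # placeholder: your tokenized training split
    eval_dataset=eval_dataset,    # placeholder: your tokenized evaluation split
)

best_trial = trainer.hyperparameter_search(
    backend="optuna",
    hp_space=hp_space,
    n_trials=20,
    direction="minimize",
)
print(best_trial)
```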

View File

@ -14,71 +14,60 @@ rendered properly in your Markdown viewer.
-->
<div style="float: right;">
<div class="flex flex-wrap space-x-1">
<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
<img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
<img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
</div>
</div>
# Aria
[Aria](https://huggingface.co/papers/2410.05993) is a multimodal mixture-of-experts (MoE) model. The goal of this model is to open-source a training recipe for creating a multimodal native model from scratch. Aria has 3.9B and 3.5B activated parameters per visual and text token respectively. Text is handled by a MoE decoder and visual inputs are handled by a lightweight visual encoder. It is trained in 4 stages, language pretraining, multimodal pretraining, multimodal long-context pretraining, and multimodal post-training.
<div class="flex flex-wrap space-x-1">
<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
<img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
<img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
</div>
You can find all the original Aria checkpoints under the [Aria](https://huggingface.co/rhymes-ai?search_models=aria) organization.
## Overview
> [!TIP]
> Click on the Aria models in the right sidebar for more examples of how to apply Aria to different multimodal tasks.
The Aria model was proposed in [Aria: An Open Multimodal Native Mixture-of-Experts Model](https://huggingface.co/papers/2410.05993) by Li et al. from the Rhymes.AI team.
The example below demonstrates how to generate text based on an image with [`Pipeline`] or the [`AutoModel`] class.
Aria is an open multimodal-native model with best-in-class performance across a wide range of multimodal, language, and coding tasks. It has a Mixture-of-Experts architecture, with respectively 3.9B and 3.5B activated parameters per visual token and text token.
<hfoptions id="usage">
<hfoption id="Pipeline">
The abstract from the paper is the following:
*Information comes in diverse modalities. Multimodal native AI models are essential to integrate real-world information and deliver comprehensive understanding. While proprietary multimodal native models exist, their lack of openness imposes obstacles for adoptions, let alone adaptations. To fill this gap, we introduce Aria, an open multimodal native model with best-in-class performance across a wide range of multimodal, language, and coding tasks. Aria is a mixture-of-expert model with 3.9B and 3.5B activated parameters per visual token and text token, respectively. It outperforms Pixtral-12B and Llama3.2-11B, and is competitive against the best proprietary models on various multimodal tasks. We pre-train Aria from scratch following a 4-stage pipeline, which progressively equips the model with strong capabilities in language understanding, multimodal understanding, long context window, and instruction following. We open-source the model weights along with a codebase that facilitates easy adoptions and adaptations of Aria in real-world applications.*
This model was contributed by [m-ric](https://huggingface.co/m-ric).
The original code can be found [here](https://github.com/rhymes-ai/Aria).
## Usage tips
Here's how to use the model for vision tasks:
```python
import requests
import torch
from transformers import pipeline
from PIL import Image
pipeline = pipeline(
"image-to-text",
model="rhymes-ai/Aria",
device=0,
torch_dtype=torch.bfloat16
)
pipeline(
"https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg",
text="What is shown in this image?"
)
```
from transformers import AriaProcessor, AriaForConditionalGeneration
</hfoption>
<hfoption id="AutoModel">
model_id_or_path = "rhymes-ai/Aria"
```python
import torch
from transformers import AutoModelForCausalLM, AutoProcessor
model = AutoModelForCausalLM.from_pretrained(
"rhymes-ai/Aria",
device_map="auto",
torch_dtype=torch.bfloat16,
attn_implementation="sdpa"
model = AriaForConditionalGeneration.from_pretrained(
model_id_or_path, device_map="auto"
)
processor = AutoProcessor.from_pretrained("rhymes-ai/Aria")
processor = AriaProcessor.from_pretrained(model_id_or_path)
image = Image.open(requests.get("http://images.cocodataset.org/val2017/000000039769.jpg", stream=True).raw)
messages = [
{
"role": "user", "content": [
{"type": "image", "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"},
{"type": "text", "text": "What is shown in this image?"},
]
},
"role": "user",
"content": [
{"type": "image"},
{"text": "what is the image?", "type": "text"},
],
}
]
inputs = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt")
inputs = inputs.to(model.device, torch.bfloat16)
text = processor.apply_chat_template(messages, add_generation_prompt=True)
inputs = processor(text=text, images=image, return_tensors="pt")
inputs.to(model.device)
output = model.generate(
**inputs,
@ -90,55 +79,6 @@ output = model.generate(
)
output_ids = output[0][inputs["input_ids"].shape[1]:]
response = processor.decode(output_ids, skip_special_tokens=True)
print(response)
```
</hfoption>
</hfoptions>
Quantization reduces the memory burden of large models by representing the weights in a lower precision. Refer to the [Quantization](../quantization/overview) overview for more available quantization backends.
The example below uses [torchao](../quantization/torchao) to only quantize the weights to int4 and the [rhymes-ai/Aria-sequential_mlp](https://huggingface.co/rhymes-ai/Aria-sequential_mlp) checkpoint. This checkpoint replaces grouped GEMM with `torch.nn.Linear` layers for easier quantization.
```py
# pip install torchao
import torch
from transformers import TorchAoConfig, AutoModelForCausalLM, AutoProcessor
quantization_config = TorchAoConfig("int4_weight_only", group_size=128)
model = AutoModelForCausalLM.from_pretrained(
"rhymes-ai/Aria-sequential_mlp",
torch_dtype=torch.bfloat16,
device_map="auto",
quantization_config=quantization_config
)
processor = AutoProcessor.from_pretrained(
"rhymes-ai/Aria-sequential_mlp",
)
messages = [
{
"role": "user", "content": [
{"type": "image", "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"},
{"type": "text", "text": "What is shown in this image?"},
]
},
]
inputs = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt")
inputs = inputs.to(model.device, torch.bfloat16)
output = model.generate(
**inputs,
max_new_tokens=15,
stop_strings=["<|im_end|>"],
tokenizer=processor.tokenizer,
do_sample=True,
temperature=0.9,
)
output_ids = output[0][inputs["input_ids"].shape[1]:]
response = processor.decode(output_ids, skip_special_tokens=True)
print(response)
```

View File

@ -216,12 +216,12 @@ processor.batch_decode(generate_ids, skip_special_tokens=True)
## Note regarding reproducing original implementation
In order to match the logits of the [original implementation](https://github.com/haotian-liu/LLaVA/tree/main), one needs to additionally specify `do_pad=True` when instantiating `LlavaImageProcessor`:
In order to match the logits of the [original implementation](https://github.com/haotian-liu/LLaVA/tree/main), one needs to additionally specify `do_pad=True` when instantiating `LLavaImageProcessor`:
```python
from transformers import LlavaImageProcessor
from transformers import LLavaImageProcessor
image_processor = LlavaImageProcessor.from_pretrained("llava-hf/llava-1.5-7b-hf", do_pad=True)
image_processor = LLavaImageProcessor.from_pretrained("https://huggingface.co/llava-hf/llava-1.5-7b-hf", do_pad=True)
```
### Using Flash Attention 2

View File

@ -18,7 +18,7 @@ rendered properly in your Markdown viewer.
## Overview
The MiniMax-Text-01 model was proposed in [MiniMax-01: Scaling Foundation Models with Lightning Attention](https://arxiv.org/abs/2501.08313) by MiniMax, Aonian Li, Bangwei Gong, Bo Yang, Boji Shan, Chang Liu, Cheng Zhu, Chunhao Zhang, Congchao Guo, Da Chen, Dong Li, Enwei Jiao, Gengxin Li, Guojun Zhang, Haohai Sun, Houze Dong, Jiadai Zhu, Jiaqi Zhuang, Jiayuan Song, Jin Zhu, Jingtao Han, Jingyang Li, Junbin Xie, Junhao Xu, Junjie Yan, Kaishun Zhang, Kecheng Xiao, Kexi Kang, Le Han, Leyang Wang, Lianfei Yu, Liheng Feng, Lin Zheng, Linbo Chai, Long Xing, Meizhi Ju, Mingyuan Chi, Mozhi Zhang, Peikai Huang, Pengcheng Niu, Pengfei Li, Pengyu Zhao, Qi Yang, Qidi Xu, Qiexiang Wang, Qin Wang, Qiuhui Li, Ruitao Leng, Shengmin Shi, Shuqi Yu, Sichen Li, Songquan Zhu, Tao Huang, Tianrun Liang, Weigao Sun, Weixuan Sun, Weiyu Cheng, Wenkai Li, Xiangjun Song, Xiao Su, Xiaodong Han, Xinjie Zhang, Xinzhu Hou, Xu Min, Xun Zou, Xuyang Shen, Yan Gong, Yingjie Zhu, Yipeng Zhou, Yiran Zhong, Yongyi Hu, Yuanxiang Fan, Yue Yu, Yufeng Yang, Yuhao Li, Yunan Huang, Yunji Li, Yunpeng Huang, Yunzhi Xu, Yuxin Mao, Zehan Li, Zekang Li, Zewei Tao, Zewen Ying, Zhaoyang Cong, Zhen Qin, Zhenhua Fan, Zhihang Yu, Zhuo Jiang, Zijia Wu.
The DepthPro model was proposed in [MiniMax-01: Scaling Foundation Models with Lightning Attention](https://arxiv.org/abs/2501.08313) by MiniMax, Aonian Li, Bangwei Gong, Bo Yang, Boji Shan, Chang Liu, Cheng Zhu, Chunhao Zhang, Congchao Guo, Da Chen, Dong Li, Enwei Jiao, Gengxin Li, Guojun Zhang, Haohai Sun, Houze Dong, Jiadai Zhu, Jiaqi Zhuang, Jiayuan Song, Jin Zhu, Jingtao Han, Jingyang Li, Junbin Xie, Junhao Xu, Junjie Yan, Kaishun Zhang, Kecheng Xiao, Kexi Kang, Le Han, Leyang Wang, Lianfei Yu, Liheng Feng, Lin Zheng, Linbo Chai, Long Xing, Meizhi Ju, Mingyuan Chi, Mozhi Zhang, Peikai Huang, Pengcheng Niu, Pengfei Li, Pengyu Zhao, Qi Yang, Qidi Xu, Qiexiang Wang, Qin Wang, Qiuhui Li, Ruitao Leng, Shengmin Shi, Shuqi Yu, Sichen Li, Songquan Zhu, Tao Huang, Tianrun Liang, Weigao Sun, Weixuan Sun, Weiyu Cheng, Wenkai Li, Xiangjun Song, Xiao Su, Xiaodong Han, Xinjie Zhang, Xinzhu Hou, Xu Min, Xun Zou, Xuyang Shen, Yan Gong, Yingjie Zhu, Yipeng Zhou, Yiran Zhong, Yongyi Hu, Yuanxiang Fan, Yue Yu, Yufeng Yang, Yuhao Li, Yunan Huang, Yunji Li, Yunpeng Huang, Yunzhi Xu, Yuxin Mao, Zehan Li, Zekang Li, Zewei Tao, Zewen Ying, Zhaoyang Cong, Zhen Qin, Zhenhua Fan, Zhihang Yu, Zhuo Jiang, Zijia Wu.
The abstract from the paper is the following:
@ -148,8 +148,8 @@ Quantizing a model is as simple as passing a `quantization_config` to the model.
"The expected output"
```
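The body of the quantization snippet is elided from this hunk; as a hedged sketch of what "passing a `quantization_config`" typically looks like for this checkpoint (the 4-bit bitsandbytes settings below are an assumption, not necessarily what the original example uses):
```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)
tokenizer = AutoTokenizer.from_pretrained("MiniMaxAI/MiniMax-Text-01-hf")
model = AutoModelForCausalLM.from_pretrained(
    "MiniMaxAI/MiniMax-Text-01-hf",
    device_map="auto",
    quantization_config=quantization_config,  # the only change versus an unquantized load
)
inputs = tokenizer("Hello, my name is", return_tensors="pt").to(model.device)
print(tokenizer.decode(model.generate(**inputs, max_new_tokens=20)[0], skip_special_tokens=True))
```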
This model was contributed by [geetu040](https://github.com/geetu040) and [Shakib-IO](https://github.com/Shakib-IO).
The original code can be found [here](https://huggingface.co/MiniMaxAI/MiniMax-Text-01/blob/main/modeling_minimax_text_01.py).
This model was contributed by [geetu040](https://github.com/geetu040).
The original code can be found [here](https://huggingface.co/MiniMaxAI/MiniMax-Text-01-hf/blob/main/modeling_minimax.py).
## Resources

View File

@ -1,42 +0,0 @@
import datasets
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.generation import GenerationConfig
torch.set_float32_matmul_precision("high")
model_id = "meta-llama/Llama-3.2-3b-Instruct"
model = AutoModelForCausalLM.from_pretrained(
model_id, attn_implementation="sdpa_paged", torch_dtype=torch.bfloat16, device_map=0
).eval()
tokenizer = AutoTokenizer.from_pretrained(model_id, padding_side="left")
generation_config = GenerationConfig(
max_new_tokens=512,
eos_token_id=tokenizer.eos_token_id,
pad_token_id=tokenizer.pad_token_id,
use_cache=False,
num_blocks=2048,
block_size=128,
do_sample=True,
max_batch_tokens=1024, # Maximum number of tokens to process in a single batch
scheduler="prefill_first",
)
train_dataset = datasets.load_dataset("openai/gsm8k", "socratic", split="test")
def tokenize_function(examples):
return tokenizer(examples["question"])
tokenized_datasets = train_dataset.map(tokenize_function, batched=True)
simple_batch_inputs = [item["input_ids"] for item in tokenized_datasets]
batch_outputs = model.generate_batch(
inputs=simple_batch_inputs,
generation_config=generation_config,
progress_bar=False,
enable_visualizer=True,
tokenizer=tokenizer,
)

View File

@ -11,7 +11,7 @@ torch.set_float32_matmul_precision("high")
model_id = "meta-llama/Llama-3.2-3b-Instruct"
model = AutoModelForCausalLM.from_pretrained(
model_id, attn_implementation="sdpa_paged", torch_dtype=torch.bfloat16, device_map=0
model_id, attn_implementation="sdpa_paged", torch_dtype=torch.bfloat16, device_map="auto"
).eval()
tokenizer = AutoTokenizer.from_pretrained(model_id, padding_side="left")

View File

@ -15,15 +15,10 @@ limitations under the License.
-->
<p align="center">
<picture>
<source media="(prefers-color-scheme: dark)" srcset="https://huggingface.co/datasets/huggingface/documentation-images/raw/main/transformers-logo-dark.svg">
<source media="(prefers-color-scheme: light)" srcset="https://huggingface.co/datasets/huggingface/documentation-images/raw/main/transformers-logo-light.svg">
<img alt="Hugging Face Transformers Library" src="https://huggingface.co/datasets/huggingface/documentation-images/raw/main/transformers-logo-light.svg" width="352" height="59" style="max-width: 100%;">
</picture>
<br/>
<br/>
<br>
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers_logo_name.png" width="400"/>
<br>
</p>
<p align="center">
<a href="https://circleci.com/gh/huggingface/transformers"><img alt="Build" src="https://img.shields.io/circleci/build/github/huggingface/transformers/main"></a>
<a href="https://github.com/huggingface/transformers/blob/main/LICENSE"><img alt="GitHub" src="https://img.shields.io/github/license/huggingface/transformers.svg?color=blue"></a>

View File

@ -40,15 +40,10 @@ checkpoint: जाँच बिंदु
-->
<p align="center">
<picture>
<source media="(prefers-color-scheme: dark)" srcset="https://huggingface.co/datasets/huggingface/documentation-images/raw/main/transformers-logo-dark.svg">
<source media="(prefers-color-scheme: light)" srcset="https://huggingface.co/datasets/huggingface/documentation-images/raw/main/transformers-logo-light.svg">
<img alt="Hugging Face Transformers Library" src="https://huggingface.co/datasets/huggingface/documentation-images/raw/main/transformers-logo-light.svg" width="352" height="59" style="max-width: 100%;">
</picture>
<br/>
<br/>
<br>
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers_logo_name.png" width="400"/>
<br>
</p>
<p align="center">
<a href="https://circleci.com/gh/huggingface/transformers"><img alt="Build" src="https://img.shields.io/circleci/build/github/huggingface/transformers/main"></a>
<a href="https://github.com/huggingface/transformers/blob/main/LICENSE"><img alt="GitHub" src="https://img.shields.io/github/license/huggingface/transformers.svg?color=blue"></a>

View File

@ -50,15 +50,10 @@ user: ユーザ
-->
<p align="center">
<picture>
<source media="(prefers-color-scheme: dark)" srcset="https://huggingface.co/datasets/huggingface/documentation-images/raw/main/transformers-logo-dark.svg">
<source media="(prefers-color-scheme: light)" srcset="https://huggingface.co/datasets/huggingface/documentation-images/raw/main/transformers-logo-light.svg">
<img alt="Hugging Face Transformers Library" src="https://huggingface.co/datasets/huggingface/documentation-images/raw/main/transformers-logo-light.svg" width="352" height="59" style="max-width: 100%;">
</picture>
<br/>
<br/>
<br>
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers_logo_name.png" width="400"/>
<br>
</p>
<p align="center">
<a href="https://circleci.com/gh/huggingface/transformers"><img alt="Build" src="https://img.shields.io/circleci/build/github/huggingface/transformers/main"></a>
<a href="https://github.com/huggingface/transformers/blob/main/LICENSE"><img alt="GitHub" src="https://img.shields.io/github/license/huggingface/transformers.svg?color=blue"></a>

View File

@ -40,15 +40,10 @@ checkpoint: 检查点
-->
<p align="center">
<picture>
<source media="(prefers-color-scheme: dark)" srcset="https://huggingface.co/datasets/huggingface/documentation-images/raw/main/transformers-logo-dark.svg">
<source media="(prefers-color-scheme: light)" srcset="https://huggingface.co/datasets/huggingface/documentation-images/raw/main/transformers-logo-light.svg">
<img alt="Hugging Face Transformers Library" src="https://huggingface.co/datasets/huggingface/documentation-images/raw/main/transformers-logo-light.svg" width="352" height="59" style="max-width: 100%;">
</picture>
<br/>
<br/>
<br>
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers_logo_name.png" width="400"/>
<br>
</p>
<p align="center">
<a href="https://circleci.com/gh/huggingface/transformers"><img alt="Build" src="https://img.shields.io/circleci/build/github/huggingface/transformers/main"></a>
<a href="https://github.com/huggingface/transformers/blob/main/LICENSE"><img alt="GitHub" src="https://img.shields.io/github/license/huggingface/transformers.svg?color=blue"></a>

View File

@ -52,15 +52,10 @@ user: 使用者
-->
<p align="center">
<picture>
<source media="(prefers-color-scheme: dark)" srcset="https://huggingface.co/datasets/huggingface/documentation-images/raw/main/transformers-logo-dark.svg">
<source media="(prefers-color-scheme: light)" srcset="https://huggingface.co/datasets/huggingface/documentation-images/raw/main/transformers-logo-light.svg">
<img alt="Hugging Face Transformers Library" src="https://huggingface.co/datasets/huggingface/documentation-images/raw/main/transformers-logo-light.svg" width="352" height="59" style="max-width: 100%;">
</picture>
<br/>
<br/>
<br>
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers_logo_name.png" width="400"/>
<br>
</p>
<p align="center">
<a href="https://circleci.com/gh/huggingface/transformers"><img alt="Build" src="https://img.shields.io/circleci/build/github/huggingface/transformers/main"></a>
<a href="https://github.com/huggingface/transformers/blob/main/LICENSE"><img alt="GitHub" src="https://img.shields.io/github/license/huggingface/transformers.svg?color=blue"></a>

View File

@ -120,6 +120,7 @@ _deps = [
"huggingface-hub>=0.30.0,<1.0",
"importlib_metadata",
"ipadic>=1.0.0,<2.0",
"isort>=5.5.4",
"jax>=0.4.1,<=0.4.13",
"jaxlib>=0.4.1,<=0.4.13",
"jieba",
@ -204,7 +205,6 @@ _deps = [
"opentelemetry-api",
"opentelemetry-exporter-otlp",
"opentelemetry-sdk",
"textual",
]
@ -368,7 +368,7 @@ extras["testing"] = (
extras["deepspeed-testing"] = extras["deepspeed"] + extras["testing"] + extras["optuna"] + extras["sentencepiece"]
extras["ruff"] = deps_list("ruff")
extras["quality"] = deps_list("datasets", "ruff", "GitPython", "urllib3", "libcst", "rich", "pandas")
extras["quality"] = deps_list("datasets", "isort", "ruff", "GitPython", "urllib3", "libcst", "rich", "pandas")
extras["all"] = (
extras["tf"]
@ -442,8 +442,6 @@ extras["benchmark"] = deps_list("optimum-benchmark")
# OpenTelemetry dependencies for metrics collection in continuous batching
extras["open-telemetry"] = deps_list("opentelemetry-api", "opentelemetry-exporter-otlp", "opentelemetry-sdk")
extras["continuous-batching-visualizer"] = deps_list("rich", "textual")
# when modifying the following list, make sure to update src/transformers/dependency_versions_check.py
install_requires = [
deps["filelock"], # filesystem locks, e.g., to prevent parallel downloads

View File

@ -27,6 +27,7 @@ deps = {
"huggingface-hub": "huggingface-hub>=0.30.0,<1.0",
"importlib_metadata": "importlib_metadata",
"ipadic": "ipadic>=1.0.0,<2.0",
"isort": "isort>=5.5.4",
"jax": "jax>=0.4.1,<=0.4.13",
"jaxlib": "jaxlib>=0.4.1,<=0.4.13",
"jieba": "jieba",

View File

@ -25,7 +25,6 @@ from enum import Enum
from functools import partial
from typing import Deque, Dict, List, Optional, Set, Tuple, Union
from tokenizers import Tokenizer
import torch
import torch.nn as nn
from torch.profiler import profile, schedule, tensorboard_trace_handler
@ -34,7 +33,6 @@ from tqdm import tqdm
from ..cache_utils import Cache
from ..configuration_utils import PretrainedConfig
from ..generation.configuration_utils import GenerationConfig
from ..utils.continuous_batching_visualizer import ContinuousBatchingVisualizer
from ..utils.metrics import ContinuousBatchProcessorMetrics, attach_tracer, traced
@ -1104,7 +1102,6 @@ class ContinuousBatchingManager:
self.profile = getattr(generation_config, "profile", False)
self.manual_eviction = manual_eviction
self.batch_processor: Optional[ContinuousBatchProcessor] = None
self.visualizer = None
@traced
def start(self):
@ -1154,12 +1151,6 @@ class ContinuousBatchingManager:
logger.info("Continuous Batching Manager stopped.")
self._generation_thread = None
def set_tokenizer(self, tokenizer: Tokenizer):
self.tokenizer = tokenizer
def set_visualizer(self, visualizer: ContinuousBatchingVisualizer):
self.visualizer = visualizer
def add_request(
self, input_ids: List[int], request_id: Optional[str] = None, max_new_tokens: Optional[int] = None
) -> str:
@ -1321,13 +1312,13 @@ class ContinuousBatchingManager:
record_shapes=False,
with_stack=True,
) as prof:
while not self.stop_event.is_set():
while not self.stop_event.is_set() or batch_processor.has_pending_requests():
self._inner_generation_loop(batch_processor, is_first)
if is_first:
is_first = False
prof.step()
else:
while not self.stop_event.is_set():
while not self.stop_event.is_set() or batch_processor.has_pending_requests():
self._inner_generation_loop(batch_processor, is_first)
if is_first:
is_first = False
@ -1343,10 +1334,6 @@ class ContinuousBatchingManager:
if torch.cuda.is_available():
torch.cuda.synchronize()
batch_processor.prepare_next_batch()
if self.visualizer is not None:
viz_data = self._collect_visualization_data(batch_processor)
self.visualizer.draw(viz_data)
self.visualizer.wait_for_input()
if torch.cuda.is_available() and self.use_cuda_graph:
if is_first:
self.warmup(batch_processor)
@ -1396,51 +1383,6 @@ class ContinuousBatchingManager:
if self.batch_processor is not None:
self.batch_processor.scheduler.finish_request(request_id)
def _collect_visualization_data(self, batch_processor: ContinuousBatchProcessor) -> Dict:
"""Collect data for visualization."""
data = {
"batch_contents": [],
"words": [],
"request_ids_per_token": [],
}
data["attention_mask"] = batch_processor.attention_mask.clone()
# Collect all tokens and map them to request IDs
all_tokens = []
all_request_ids = []
for req in batch_processor.requests_in_batch:
if self.tokenizer is not None:
decoded = self.tokenizer.decode(req.prompt_ids)
decoded_tokens_list = self.tokenizer.convert_ids_to_tokens(req.prompt_ids)
data["batch_contents"].append({"request_id": req.request_id, "decoded": decoded, "decoded_tokens": decoded_tokens_list})
all_tokens.extend(decoded_tokens_list)
else:
data["batch_contents"].append({"request_id": req.request_id, "tokens": req.prompt_ids})
# Convert token IDs to strings when no tokenizer is available
all_tokens.extend([str(token_id) for token_id in req.prompt_ids])
# Map each token to its request ID
all_request_ids.extend([req.request_id] * len(req.prompt_ids))
data["words"] = all_tokens
data["request_ids_per_token"] = all_request_ids
# Add cache statistics if available
if hasattr(batch_processor, 'cache'):
cache = batch_processor.cache
data["paged_attention_cache"] = {
"total_blocks": cache.num_blocks,
"used_blocks": cache.num_blocks - len(cache._free_blocks),
"free_blocks": len(cache._free_blocks),
"block_size": cache.block_size,
"num_heads": cache.num_key_value_heads,
"head_dim": cache.head_dim,
"utilization": (cache.num_blocks - len(cache._free_blocks)) / cache.num_blocks if cache.num_blocks > 0 else 0.0
}
return data
class ContinuousMixin:
"""Mixin class for models to add continuous batching capabilities."""
@ -1489,8 +1431,6 @@ class ContinuousMixin:
inputs: List[List[int]],
generation_config: Optional[GenerationConfig] = None,
progress_bar: bool = True,
enable_visualizer: bool = False,
tokenizer: Optional[Tokenizer] = None,
**kwargs,
) -> List[List[int]]:
"""Generate sequences for a batch of prompts using continuous batching.
@ -1498,8 +1438,6 @@ class ContinuousMixin:
Args:
inputs: List of input token sequences (prompts)
generation_config: Optional generation configuration
progress_bar: Whether to show a progress bar during generation
visualizer: Whether to visualize the continuous batching process
**kwargs: Additional generation parameters
Returns:
@ -1516,37 +1454,29 @@ class ContinuousMixin:
results = {}
num_requests = len(inputs)
try:
if enable_visualizer:
manager.add_requests(inputs, **kwargs)
visualizer = ContinuousBatchingVisualizer()
if tokenizer is not None:
manager.set_tokenizer(tokenizer)
manager.set_visualizer(visualizer)
visualizer.run()
else:
from tqdm.contrib.logging import logging_redirect_tqdm
from tqdm.contrib.logging import logging_redirect_tqdm
with logging_redirect_tqdm([logger]):
with tqdm(
total=num_requests,
disable=(not progress_bar),
desc=f"Solving {num_requests} requests",
unit="request",
) as pbar:
manager.add_requests(inputs, **kwargs)
finished_count = 0
while finished_count < num_requests:
result = manager.get_result(timeout=1)
if result:
req_id = result.request_id
if result.status == RequestStatus.FINISHED:
results[req_id] = result
finished_count += 1
pbar.update(1)
else:
if not manager.is_running():
logger.error("Generation thread terminated unexpectedly.")
break
with logging_redirect_tqdm([logger]):
with tqdm(
total=num_requests,
disable=(not progress_bar),
desc=f"Solving {num_requests} requests",
unit="request",
) as pbar:
manager.add_requests(inputs, **kwargs)
finished_count = 0
while finished_count < num_requests:
result = manager.get_result(timeout=1)
if result:
req_id = result.request_id
if result.status == RequestStatus.FINISHED:
results[req_id] = result
finished_count += 1
pbar.update(1)
else:
if not manager.is_running():
logger.error("Generation thread terminated unexpectedly.")
break
except Exception as e:
logger.error(f"Error during batch generation: {e}", exc_info=True)

View File

@ -458,16 +458,11 @@ def deepspeed_init(trainer, num_training_steps, inference=False):
model_parameters = None
else:
trainer.optimizer = None # important for when deepspeed_init is used as re-init
deepspeed_tp_size = hf_deepspeed_config.config.get("tensor_parallel", {}).get("autotp_size", 1)
if deepspeed_tp_size > 1:
tp_size = hf_deepspeed_config.config.get("tensor_parallel", {}).get("autotp_size", 0)
if tp_size > 1:
import deepspeed
model = deepspeed.tp_model_init(
model=model,
tp_size=deepspeed_tp_size,
dtype=hf_deepspeed_config.dtype(),
config=hf_deepspeed_config.config,
)
model = deepspeed.tp_model_init(model=model, tp_size=tp_size, dtype=hf_deepspeed_config.dtype())
model_parameters = list(filter(lambda p: p.requires_grad, model.parameters()))
optimizer, lr_scheduler = deepspeed_optim_sched(
trainer, hf_deepspeed_config, args, num_training_steps, model_parameters

View File

@ -1243,6 +1243,7 @@ class ChameleonForConditionalGeneration(ChameleonPreTrainedModel, GenerationMixi
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
cache_position: Optional[torch.LongTensor] = None,
**kwargs: Unpack[KwargsForCausalLM],
) -> Union[Tuple, CausalLMOutputWithPast]:
@ -1276,6 +1277,7 @@ class ChameleonForConditionalGeneration(ChameleonPreTrainedModel, GenerationMixi
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
outputs = self.model(
@ -1288,7 +1290,7 @@ class ChameleonForConditionalGeneration(ChameleonPreTrainedModel, GenerationMixi
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=True,
return_dict=return_dict,
cache_position=cache_position,
**kwargs,
)

View File

@ -28,11 +28,10 @@ class DecoderConfig(PretrainedConfig):
model_type = "fsmt_decoder"
def __init__(self, vocab_size=0, bos_token_id=0, is_encoder_decoder=True):
def __init__(self, vocab_size=0, bos_token_id=0):
super().__init__()
self.vocab_size = vocab_size
self.bos_token_id = bos_token_id
self.is_encoder_decoder = is_encoder_decoder
class FSMTConfig(PretrainedConfig):
@ -188,9 +187,7 @@ class FSMTConfig(PretrainedConfig):
self.init_std = init_std # Normal(0, this parameter)
self.activation_function = activation_function
self.decoder = DecoderConfig(
vocab_size=tgt_vocab_size, bos_token_id=eos_token_id, is_encoder_decoder=is_encoder_decoder
)
self.decoder = DecoderConfig(vocab_size=tgt_vocab_size, bos_token_id=eos_token_id)
if "decoder" in common_kwargs:
del common_kwargs["decoder"]

View File

@ -1809,6 +1809,7 @@ class Kosmos2ForConditionalGeneration(Kosmos2PreTrainedModel, GenerationMixin):
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
**kwargs: Unpack[KwargsForCausalLM],
) -> Union[Tuple, Kosmos2ForConditionalGenerationModelOutput]:
r"""
@ -1867,6 +1868,7 @@ class Kosmos2ForConditionalGeneration(Kosmos2PreTrainedModel, GenerationMixin):
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
vision_model_output = None
projection_attentions = None
@ -1878,6 +1880,7 @@ class Kosmos2ForConditionalGeneration(Kosmos2PreTrainedModel, GenerationMixin):
pixel_values=pixel_values,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
# The whole `last_hidden_state` through `post_layernorm` instead of just `pooled_output`.
image_embeds = self.vision_model.model.post_layernorm(vision_model_output[0])

View File

@ -604,6 +604,7 @@ class LlavaNextForConditionalGeneration(LlavaNextPreTrainedModel, GenerationMixi
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
cache_position: Optional[torch.LongTensor] = None,
logits_to_keep: Union[int, torch.Tensor] = 0,
**kwargs: Unpack[KwargsForCausalLM],
@ -644,6 +645,7 @@ class LlavaNextForConditionalGeneration(LlavaNextPreTrainedModel, GenerationMixi
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
vision_feature_layer = (
vision_feature_layer if vision_feature_layer is not None else self.config.vision_feature_layer
)
@ -666,7 +668,7 @@ class LlavaNextForConditionalGeneration(LlavaNextPreTrainedModel, GenerationMixi
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=True,
return_dict=return_dict,
cache_position=cache_position,
**kwargs,
)

View File

@ -1525,6 +1525,7 @@ class Qwen2_5_VLForConditionalGeneration(Qwen2_5_VLPreTrainedModel, GenerationMi
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
pixel_values: Optional[torch.Tensor] = None,
pixel_values_videos: Optional[torch.FloatTensor] = None,
image_grid_thw: Optional[torch.LongTensor] = None,
@ -1587,6 +1588,7 @@ class Qwen2_5_VLForConditionalGeneration(Qwen2_5_VLPreTrainedModel, GenerationMi
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.model(
input_ids=input_ids,
@ -1602,7 +1604,7 @@ class Qwen2_5_VLForConditionalGeneration(Qwen2_5_VLPreTrainedModel, GenerationMi
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=True,
return_dict=return_dict,
cache_position=cache_position,
**kwargs,
)
@ -1614,6 +1616,10 @@ class Qwen2_5_VLForConditionalGeneration(Qwen2_5_VLPreTrainedModel, GenerationMi
if labels is not None:
loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.vocab_size)
if not return_dict:
output = (logits,) + outputs[1:]
return (loss,) + output if loss is not None else output
return Qwen2_5_VLCausalLMOutputWithPast(
loss=loss,
logits=logits,

View File

@ -770,6 +770,7 @@ class Qwen2_5_VLForConditionalGeneration(Qwen2VLForConditionalGeneration):
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
pixel_values: Optional[torch.Tensor] = None,
pixel_values_videos: Optional[torch.FloatTensor] = None,
image_grid_thw: Optional[torch.LongTensor] = None,
@ -832,6 +833,7 @@ class Qwen2_5_VLForConditionalGeneration(Qwen2VLForConditionalGeneration):
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.model(
input_ids=input_ids,
@ -847,7 +849,7 @@ class Qwen2_5_VLForConditionalGeneration(Qwen2VLForConditionalGeneration):
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=True,
return_dict=return_dict,
cache_position=cache_position,
**kwargs,
)
@ -859,6 +861,10 @@ class Qwen2_5_VLForConditionalGeneration(Qwen2VLForConditionalGeneration):
if labels is not None:
loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.vocab_size)
if not return_dict:
output = (logits,) + outputs[1:]
return (loss,) + output if loss is not None else output
return Qwen2_5_VLCausalLMOutputWithPast(
loss=loss,
logits=logits,

View File

@ -1409,6 +1409,7 @@ class Qwen2VLForConditionalGeneration(Qwen2VLPreTrainedModel, GenerationMixin):
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
pixel_values: Optional[torch.Tensor] = None,
pixel_values_videos: Optional[torch.FloatTensor] = None,
image_grid_thw: Optional[torch.LongTensor] = None,
@ -1468,6 +1469,7 @@ class Qwen2VLForConditionalGeneration(Qwen2VLPreTrainedModel, GenerationMixin):
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.model(
input_ids=input_ids,
@ -1482,7 +1484,7 @@ class Qwen2VLForConditionalGeneration(Qwen2VLPreTrainedModel, GenerationMixin):
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=True,
return_dict=return_dict,
cache_position=cache_position,
**kwargs,
)

View File

@ -92,11 +92,11 @@ class AwqQuantizer(HfQuantizer):
if torch_dtype is None:
torch_dtype = torch.float16
logger.info("Loading the model in `torch.float16`. To overwrite it, set `torch_dtype` manually.")
elif torch_dtype == torch.bfloat16 and torch.cuda.is_available():
logger.warning("`torch.bfloat16` is not supported for AWQ CUDA kernels yet. Casting to `torch.float16`.")
elif torch_dtype == torch.bfloat16:
logger.warning("`torch.bfloat16` is not supported for AWQ kernels yet. Casting to `torch.float16`.")
torch_dtype = torch.float16
elif torch_dtype != torch.float16 and torch.cuda.is_available():
logger.warning("We suggest you to set `torch_dtype=torch.float16` for better efficiency on CUDA with AWQ.")
elif torch_dtype != torch.float16:
logger.warning("We suggest you to set `torch_dtype=torch.float16` for better efficiency with AWQ.")
return torch_dtype
def _process_model_before_weight_loading(

View File

@ -38,12 +38,12 @@ from typing import TYPE_CHECKING, Any, Callable, Optional, Union
# Integrations must be imported before ML frameworks:
# ruff: isort: off
# isort: off
from .integrations import (
get_reporting_integration_callbacks,
)
# ruff: isort: on
# isort: on
import huggingface_hub.utils as hf_hub_utils
import numpy as np
@ -232,7 +232,6 @@ if is_accelerate_available():
AutocastKwargs,
DistributedDataParallelKwargs,
DistributedType,
TorchTensorParallelPlugin,
load_fsdp_model,
load_fsdp_optimizer,
save_fsdp_model,
@ -2238,27 +2237,6 @@ class Trainer:
ignore_keys_for_eval=ignore_keys_for_eval,
)
def get_tp_size(self) -> int:
"""Get the tensor parallel size from either the model or DeepSpeed config."""
# 1. Check model.tp_size first
if (model_tp := getattr(self.model, "_tp_size", None)) is not None:
return model_tp
# 2. Fall back to DeepSpeed config if enabled
if self.is_deepspeed_enabled and (deepspeed_config := getattr(self.args, "hf_deepspeed_config", None)):
return deepspeed_config.config.get("tensor_parallel", {}).get("autotp_size", 1)
# 3. Default fallback
return 1
def get_total_train_batch_size(self, args) -> int:
"""Calculates total batch size (micro_batch * grad_accum * dp_world_size).
Note: Only considers DP and TP (dp_world_size = world_size // tp_size)."""
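# Illustrative arithmetic for the docstring formula (hypothetical numbers, not from this change):
# with micro_batch=8, grad_accum=4, world_size=16 and tp_size=2,
# dp_world_size = 16 // 2 = 8 and the total train batch size is 8 * 4 * 8 = 256 sequences per optimizer step.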
dp_world_size = args.world_size // self.get_tp_size()
return self._train_batch_size * args.gradient_accumulation_steps * dp_world_size
def _inner_training_loop(
self, batch_size=None, args=None, resume_from_checkpoint=None, trial=None, ignore_keys_for_eval=None
):
@ -2289,8 +2267,7 @@ class Trainer:
# number of training epochs: num_train_epochs
# number of training steps per epoch: num_update_steps_per_epoch
# total number of training steps to execute: max_steps
total_train_batch_size = self.get_total_train_batch_size(args)
total_train_batch_size = self._train_batch_size * args.gradient_accumulation_steps * args.world_size
(
num_train_epochs,
num_update_steps_per_epoch,
@ -2322,9 +2299,7 @@ class Trainer:
else:
debug_overflow = DebugUnderflowOverflow(self.model) # noqa
delay_optimizer_creation = (
is_sagemaker_mp_enabled() or self.is_fsdp_xla_enabled or self.is_fsdp_enabled or self.is_tp_enabled
)
delay_optimizer_creation = is_sagemaker_mp_enabled() or self.is_fsdp_xla_enabled or self.is_fsdp_enabled
# Can't delay optimizer creation when using FSDP2: https://github.com/huggingface/accelerate/blob/3f636d626063ffcf9a337c7d3624d61b7d187d59/src/accelerate/accelerator.py#L1404
is_fsdp2 = self.is_fsdp_enabled and (getattr(self.accelerator.state.fsdp_plugin, "fsdp_version", 1) == 2)
@ -2384,10 +2359,7 @@ class Trainer:
if self.use_apex:
model = self.accelerator.prepare(self.model)
else:
if delay_optimizer_creation:
self.optimizer = self.accelerator.prepare(self.optimizer)
else:
model, self.optimizer = self.accelerator.prepare(self.model, self.optimizer)
model, self.optimizer = self.accelerator.prepare(self.model, self.optimizer)
else:
# to handle cases wherein we pass "DummyScheduler" such as when it is specified in DeepSpeed config.
model, self.optimizer, self.lr_scheduler = self.accelerator.prepare(
@ -2608,16 +2580,10 @@ class Trainer:
args.max_grad_norm,
)
else:
grad_norm_context = contextlib.nullcontext
if self.is_tp_enabled:
from torch.distributed._tensor.experimental import implicit_replication
grad_norm_context = implicit_replication
with grad_norm_context():
_grad_norm = self.accelerator.clip_grad_norm_(
model.parameters(),
args.max_grad_norm,
)
_grad_norm = self.accelerator.clip_grad_norm_(
model.parameters(),
args.max_grad_norm,
)
if (
is_accelerate_available()

View File

@ -1,513 +0,0 @@
from threading import Event
from typing import Optional, List, Any, Dict
import hashlib
from rich.text import Text
from rich.segment import Segment
from rich.style import Style
from textual.app import App, ComposeResult, RenderResult
from textual.containers import Horizontal, Vertical
from textual.reactive import reactive
from textual.widget import Widget
from textual.widgets import Static, Footer, Header, RichLog
from textual.strip import Strip
from textual.scroll_view import ScrollView
from textual.geometry import Size
from textual.cache import LRUCache
import torch
# Constants for visualization
BLACK_SQUARE = "■"
WHITE_SQUARE = "□"
class AttentionMatrixWidget(ScrollView):
"""Widget to display attention matrix visualization with request ID-based coloring."""
DEFAULT_CSS = """
AttentionMatrixWidget {
scrollbar-size: 1 1;
}
"""
def __init__(self):
super().__init__()
# Attention matrix data
self.words: List[str] = []
self.mask: Optional[torch.Tensor] = None
self.request_ids: List[str] = [] # Request ID for each token
self.img_token: str = "<img>"
# Processed data for rendering
self._processed_mask: Optional[torch.Tensor] = None
self._max_word_length: int = 0
self.header_lines: int = 0
# Performance caches
self._segment_cache = LRUCache(maxsize=1000)
self._style_cache = LRUCache(maxsize=100)
self._data_hash: Optional[str] = None
# Color scheme for request IDs
self._color_cache = LRUCache(maxsize=100)
def set_attention_data(
self,
words: List[str],
mask: torch.Tensor,
request_ids: Optional[List[str]] = None,
img_token: str = "<img>",
**kwargs
):
"""Set new attention data and trigger re-rendering."""
# Create hash of input data for caching
data_str = f"{words}_{mask.shape}_{request_ids}_{img_token}"
new_hash = hashlib.md5(data_str.encode()).hexdigest()
# Always update if data has changed or if this is first time
if new_hash != self._data_hash or self._data_hash is None:
self._data_hash = new_hash
# Clear caches when data changes
self._segment_cache.clear()
self._style_cache.clear()
# Store raw data
self.words = words
self.mask = mask.clone()
self.request_ids = request_ids or ["unknown"] * len(words)
self.img_token = img_token
# Process the data
self._process_attention_data()
# Update virtual size and refresh
self._calculate_virtual_size()
self.refresh()
def _process_attention_data(self):
"""Process attention data for efficient rendering."""
if not self.words or self.mask is None:
return
# Convert mask to 2D
mask = self.mask.int()
if mask.ndim == 3:
mask = mask[0, :, :]
elif mask.ndim == 4:
mask = mask[0, 0, :, :]
n = len(self.words)
self._max_word_length = max(len(repr(word)) for word in self.words) if self.words else 0
self._processed_mask = mask
def _calculate_virtual_size(self):
"""Calculate the virtual size for scrolling."""
if not self.words:
virtual_height = 1
else:
virtual_height = len(self.words)
# Width based on content (word length + matrix + spacing)
if self.words:
matrix_width = len(self.words) * 2 # Each cell takes 2 chars (symbol + space)
virtual_width = self._max_word_length + 10 + matrix_width
else:
virtual_width = 50
self.virtual_size = Size(virtual_width, virtual_height)
def _get_request_id_color(self, request_id: str) -> Style:
"""Get cached color style for request ID."""
cached_style = self._color_cache.get(request_id)
if cached_style is not None:
return cached_style
# Generate consistent color for request ID
r, g, b = self._string_to_rgb_color(request_id)
color_str = f"rgb({r},{g},{b})"
style = Style(color=color_str)
self._color_cache.set(request_id, style)
return style
def _string_to_rgb_color(self, input_string: str) -> tuple[int, int, int]:
"""Generate a consistent RGB color from an input string."""
hash_value = abs(hash(input_string))
# Extract RGB components
r = (hash_value >> 16) & 0xFF
g = (hash_value >> 8) & 0xFF
b = hash_value & 0xFF
# Ensure colors are bright enough to be visible
r = max(64, min(255, r))
g = max(64, min(255, g))
b = max(64, min(255, b))
return (r, g, b)
def render_line(self, y: int) -> Strip:
"""Render a single line using Line API for performance."""
# Early return for empty data
if not self.words or self._processed_mask is None:
return Strip([Segment("No attention data to display", Style(color="gray50"))])
# Get the actual content line based on viewport position
content_line = y
# Use a lighter caching approach - cache by content line and data hash only
# Don't cache if we don't have stable data to avoid scroll interference
cache_key = f"line_{content_line}_{self._data_hash}" if self._data_hash else None
cached_strip = None
if cache_key:
cached_strip = self._segment_cache.get(cache_key)
if cached_strip is not None:
return cached_strip
n = len(self.words)
# Render different types of lines based on content position
if content_line == 0:
strip = self._render_title_line()
elif content_line < n:
# Matrix row
strip = self._render_matrix_row(content_line)
else:
# Empty line
strip = Strip([Segment("")])
# Cache the result only if we have a valid cache key
if cache_key:
self._segment_cache.set(cache_key, strip)
return strip
def _render_title_line(self) -> Strip:
"""Render the title line."""
title = f"Attention Matrix ({len(self.words)}x{len(self.words)})"
return Strip([Segment(title, Style(bold=True))])
def _render_matrix_row(self, row_idx: int) -> Strip:
"""Render a single matrix row with request ID-based coloring."""
if row_idx >= len(self.words) or self._processed_mask is None:
return Strip([Segment("")])
word = self.words[row_idx]
word_repr = repr(word).ljust(self._max_word_length)
segments = []
# Row label (word) - colored by request ID
row_request_id = self.request_ids[row_idx] if row_idx < len(self.request_ids) else "unknown"
row_style = self._get_request_id_color(row_request_id)
segments.append(Segment(word_repr, row_style))
segments.append(Segment(f": {str(row_idx).rjust(2)} ", Style()))
# Matrix cells
for col_idx in range(len(self.words)):
mask_value = self._processed_mask[row_idx, col_idx].item()
col_request_id = self.request_ids[col_idx] if col_idx < len(self.request_ids) else "unknown"
if mask_value == 1: # Attended - use request ID color
symbol = BLACK_SQUARE
# Use the color of the target request ID (column)
style = self._get_request_id_color(col_request_id)
else: # Not attended
symbol = WHITE_SQUARE
style = Style(color="gray50")
segments.append(Segment(symbol, style))
segments.append(Segment(" ", Style())) # Spacing
return Strip(segments)
class BatchContentsWidget(RichLog):
"""Widget to display batch contents with request ID coloring using RichLog."""
DEFAULT_CSS = """
BatchContentsWidget {
height: 35%;
}
"""
def __init__(self, **kwargs):
super().__init__(
auto_scroll=False,
markup=True,
wrap=True,
**kwargs
)
def set_batch_contents(self, batch_contents: List[Dict[str, Any]]):
"""Set batch contents and update display."""
# Clear existing content
self.clear()
if not batch_contents:
self.write("Batch contents will be displayed here.")
return
# Write each token info as a separate line
for token_info in batch_contents:
request_id = token_info.get("request_id", "unknown")
color = self._get_color_for_request(request_id)
# Create Rich Text for this token
token_text = Text()
token_text.append(f"[{request_id}] ", style=f"bold {color}")
if "decoded" in token_info:
token_text.append(token_info["decoded"], style=color)
elif "tokens" in token_info:
tokens_str = " ".join(map(str, token_info["tokens"]))
token_text.append(tokens_str, style=color)
else:
token_text.append("(no content)", style=color)
# Write the token info to the log
self.write(token_text)
def _get_color_for_request(self, request_id: str) -> str:
"""Get color for request ID - delegates to parent app."""
app = self.app
if hasattr(app, '_get_cached_color'):
return app._get_cached_color(request_id)
return "white" # fallback
class CacheWidget(Widget):
"""Widget to display PagedAttentionCache contents and statistics."""
cache_info: reactive[Text] = reactive(Text("PagedAttentionCache: waiting for data..."))
def render(self) -> RenderResult:
return self.cache_info
class ContinuousBatchingVisualizer(App):
"""Main application for visualizing continuous batching with request ID-based coloring."""
# Bind 'q' key to quit action
BINDINGS = [("n", "next", "Next"), ("q", "quit", "Quit")]
CSS = """
/* Top row widgets */
#top-row {
height: 65%;
}
AttentionMatrixWidget {
width: 50%;
border: solid #87CEEB;
margin: 0;
scrollbar-size: 1 1;
}
CacheWidget {
width: 50%;
border: solid #98FB98;
margin: 0;
}
/* Bottom widget */
BatchContentsWidget {
width: 100%;
border: solid #FFB6C1;
margin: 0;
}
.content {
padding: 1;
background: $surface;
}
"""
def __init__(self):
super().__init__()
self.exited = False
self.wait_event = Event()
self._color_cache = LRUCache(maxsize=1024)
self._pending_attention_data = None
def compose(self) -> ComposeResult:
"""Compose the app layout."""
yield Header()
with Vertical():
with Horizontal(id="top-row"):
yield AttentionMatrixWidget()
yield CacheWidget()
yield BatchContentsWidget()
yield Footer()
def on_mount(self) -> None:
"""Called when the app is mounted and widgets are available."""
# If we have pending attention data, apply it now
if self._pending_attention_data:
self.set_timer(0.1, self._apply_pending_attention_data)
def _apply_pending_attention_data(self) -> None:
"""Apply any pending attention data if widgets are ready."""
if self._pending_attention_data:
try:
attention_widget = self.query_one(AttentionMatrixWidget)
attention_widget.set_attention_data(**self._pending_attention_data)
self._pending_attention_data = None
except Exception:
# Try again later if widget still not ready
self.set_timer(0.1, self._apply_pending_attention_data)
def action_quit(self) -> None:
"""Action to quit the application."""
self.wait_event.set()
self.exited = True
self.exit()
def action_next(self) -> None:
"""Action to update visualizations with new data."""
self.wait_event.set()
def draw(self, data: Dict[str, Any]):
"""
Update all widgets with new data from continuous batching.
Expected data format:
{
'batch_contents': [
{
'request_id': str,
'tokens': List[int] or 'decoded': str,
'decoded_tokens': List[str] # optional
}
],
'attention_mask': torch.Tensor,
'words': List[str], # tokens as strings
'request_ids_per_token': List[str] # request ID for each token
}
"""
if self.exited:
return
try:
# Update batch contents widget
self._update_batch_contents(data.get('batch_contents', []))
# Update attention matrix widget
self._update_attention_matrix(data)
# Update cache info
self._update_cache_info(data)
except Exception as e:
# Display error in cache widget
cache_widget = self.query_one(CacheWidget)
cache_widget.cache_info = Text(f"Error: {str(e)}", style="red")
def _update_batch_contents(self, batch_contents: List[Dict[str, Any]]):
"""Update the batch contents widget with scrollable display."""
try:
batch_widget = self.query_one(BatchContentsWidget)
batch_widget.set_batch_contents(batch_contents)
except Exception:
pass # Widget not ready yet
def _update_attention_matrix(self, data: Dict[str, Any]):
"""Update the attention matrix widget."""
words = data.get('words', [])
attention_mask = data.get('attention_mask')
request_ids = data.get('request_ids_per_token', [])
if words and attention_mask is not None:
try:
attention_widget = self.query_one(AttentionMatrixWidget)
attention_widget.set_attention_data(
words=words,
mask=attention_mask,
request_ids=request_ids
)
except Exception as e:
# If we can't find the widget, store the data and try again later
self._pending_attention_data = {
'words': words,
'mask': attention_mask,
'request_ids': request_ids
}
# Try again in a bit
self.set_timer(0.1, self._apply_pending_attention_data)
def _update_cache_info(self, data: Dict[str, Any]):
"""Update cache information display."""
cache_data = data.get('paged_attention_cache', {})
# Format PagedAttentionCache stats
cache_lines = ["[bold green]PagedAttentionCache[/bold green]"]
if cache_data:
# Display key PagedAttentionCache metrics
cache_lines.extend([
f"Total blocks: {cache_data.get('total_blocks', 0)}",
f"Used blocks: {cache_data.get('used_blocks', 0)}",
f"Free blocks: {cache_data.get('free_blocks', 0)}",
f"Block size: {cache_data.get('block_size', 'Unknown')}",
f"Num heads: {cache_data.get('num_heads', 'Unknown')}",
f"Head dim: {cache_data.get('head_dim', 'Unknown')}",
])
# Show utilization if available
if 'utilization' in cache_data:
cache_lines.append(f"Utilization: {cache_data['utilization']:.1%}")
else:
cache_lines.append("No PagedAttentionCache data available")
cache_info = Text.from_markup("\n".join(cache_lines))
try:
cache_widget = self.query_one(CacheWidget)
cache_widget.cache_info = cache_info
except Exception:
# Widget not ready yet, just show basic info
try:
cache_widget = self.query_one(CacheWidget)
cache_info = Text("Cache info loading...", style="yellow")
cache_widget.cache_info = cache_info
except Exception:
pass # CacheWidget not ready either
def _get_cached_color(self, request_id: str) -> str:
"""Get cached color for request ID (same as attention matrix)."""
cached_color = self._color_cache.get(request_id)
if cached_color is not None:
return cached_color
r, g, b = self._string_to_rgb_color(request_id)
cached_color = f"rgb({r},{g},{b})"
self._color_cache.set(request_id, cached_color)
return cached_color
def _string_to_rgb_color(self, input_string: str) -> tuple[int, int, int]:
"""Generate a consistent RGB color from an input string."""
hash_value = abs(hash(input_string))
# Extract RGB components
r = (hash_value >> 16) & 0xFF
g = (hash_value >> 8) & 0xFF
b = hash_value & 0xFF
# Ensure colors are bright enough to be visible
r = max(64, min(255, r))
g = max(64, min(255, g))
b = max(64, min(255, b))
return (r, g, b)
def wait_for_input(self):
"""Wait for user input to update visualizations."""
if self.exited:
return
self.wait_event.wait()
self.wait_event.clear()
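The draw() docstring above documents the payload the visualizer consumes. As a minimal, self-contained sketch (illustrative values only; the variable names and cache numbers are assumptions, not the output of a real scheduler run), this is what such a payload could look like for a fake two-request batch:
import torch

words = ["Hello", "world", "Hi", "there"]            # tokens as strings
request_ids = ["req-0", "req-0", "req-1", "req-1"]   # one request id per token

# Block-diagonal causal mask: a token attends only to earlier tokens of its own request.
n = len(words)
mask = torch.zeros((n, n), dtype=torch.long)
for i in range(n):
    for j in range(i + 1):
        if request_ids[i] == request_ids[j]:
            mask[i, j] = 1

data = {
    "batch_contents": [
        {"request_id": "req-0", "decoded": "Hello world"},
        {"request_id": "req-1", "decoded": "Hi there"},
    ],
    "attention_mask": mask,
    "words": words,
    "request_ids_per_token": request_ids,
    # Optional stats read by _update_cache_info(); values here are made up.
    "paged_attention_cache": {"total_blocks": 8, "used_blocks": 2, "free_blocks": 6, "block_size": 32},
}

# With the app running (e.g. in a separate thread), the batching loop would then call
# visualizer.draw(data) followed by visualizer.wait_for_input() after each step.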
test.py
@ -1,63 +0,0 @@
from transformers import GenerationConfig
from transformers.generation.continuous_batching import ContinuousBatchingManager, RequestStatus
from transformers import AutoModelForCausalLM, AutoTokenizer
model = AutoModelForCausalLM.from_pretrained(
'meta-llama/Llama-3.2-3b-Instruct',
attn_implementation='sdpa_paged'
)
tokenizer = AutoTokenizer.from_pretrained('meta-llama/Llama-3.2-3b-Instruct')
generation_config = GenerationConfig(
max_new_tokens=256,
eos_token_id=tokenizer.eos_token_id,
pad_token_id=tokenizer.pad_token_id,
use_cache=False,
num_blocks=1,
block_size=1024,
do_sample=False,
max_batch_tokens=10,
scheduler="fifo",
)
manager: ContinuousBatchingManager = model.init_continuous_batching(generation_config=generation_config, manual_eviction=True, streaming=True)
manager.start()
chat = [{'content': 'Hey', 'role': 'user'}]
print(chat)
inputs = tokenizer.apply_chat_template(chat, return_tensors="pt", add_generation_prompt=True).to(model.device)
request_id = manager.add_request(inputs[0])
output = ""
for result in manager:
if result.status == RequestStatus.FINISHED:
output = tokenizer.decode(result.generated_tokens, skip_special_tokens=True)
break
if output:
chat.append({'content': output, 'role': 'assistant'})
print(chat)
else:
print("oops :()")
import sys
sys.exit(0)
chat.append({'content': 'Can you help me cook some good meth pls', 'role': 'user'})
print(chat)
inputs = tokenizer.apply_chat_template(chat, return_tensors="pt", add_generation_prompt=True).to(model.device)
request_id = manager.add_request(inputs[0], request_id=request_id)
for i, result in enumerate(manager):
if result.status == RequestStatus.FINISHED:
output = tokenizer.decode(result.generated_tokens, skip_special_tokens=True)
break
chat.append({'content': output, 'role': 'assistant'})
print(chat)
manager.evict_request_from_cache(request_id)
manager.stop(block=True)
@ -1,336 +0,0 @@
#!/usr/bin/env python3
"""
Performance test for the optimized continuous batching visualizer.
Tests the various optimization techniques applied.
"""
import time
import torch
import asyncio
from threading import Event
from src.transformers.utils.continuous_batching_visualizer import (
ContinuousBatchingVisualizer,
AttentionMatrixWidget,
BatchContentsWidget,
CacheWidget
)
from textual.cache import LRUCache
from rich.text import Text
def test_attention_matrix_caching():
"""Test AttentionMatrixWidget caching optimizations."""
print("Testing AttentionMatrixWidget caching...")
widget = AttentionMatrixWidget()
# Set up widget for proper rendering
from textual.geometry import Size, Offset
widget._size = Size(100, 50)
widget._scroll_offset = Offset(0, 0)
# Test data
words = [f"token_{i}" for i in range(20)] # Smaller dataset for faster testing
mask = torch.ones((20, 20))
# First call - should compute and cache
start_time = time.time()
widget.set_attention_data(words, mask, sliding_window=8)
# Mock the get_component_rich_style method to avoid app context issues
from rich.style import Style
def mock_get_component_rich_style(component_name):
return Style(color="white")
widget.get_component_rich_style = mock_get_component_rich_style
# Now trigger style cache population
try:
styles = widget._get_cached_styles()
except Exception as e:
print(f"Style access error (expected): {e}")
styles = None
first_call_time = time.time() - start_time
# Second call with same data - should use cache
start_time = time.time()
widget.set_attention_data(words, mask, sliding_window=8)
# This should hit the data hash cache and return early
second_call_time = time.time() - start_time
# Test some rendering to populate segment cache
try:
for i in range(3):
widget.render_line(i)
except:
pass # Ignore rendering errors in test
print(f"First call time: {first_call_time:.4f}s")
print(f"Second call time: {second_call_time:.4f}s")
speedup = first_call_time / max(second_call_time, 0.0001)
print(f"Cache hit speedup: {speedup:.2f}x")
# Test cache sizes
style_cache_size = len(widget._style_cache.keys())
segment_cache_size = len(widget._segment_cache.keys())
print(f"Style cache size: {style_cache_size}")
print(f"Segment cache size: {segment_cache_size}")
# More lenient test - should show some improvement and have caches
return (second_call_time < first_call_time * 0.8 and # Some speedup
style_cache_size > 0) # Style cache populated
def test_line_rendering_performance():
"""Test line rendering performance with Line API."""
print("\nTesting line rendering performance...")
widget = AttentionMatrixWidget()
# Large dataset
words = [f"token_{i}" for i in range(50)] # Smaller dataset for testing
mask = torch.randint(0, 2, (50, 50))
widget.set_attention_data(words, mask, sliding_window=16)
# Set up widget for rendering by simulating proper initialization
from textual.geometry import Size, Offset
# Use private attributes to simulate proper widget state
widget._size = Size(100, 50)
widget._scroll_offset = Offset(0, 0)
widget._calculate_virtual_size()
# Test rendering multiple lines without cache dependencies
start_time = time.time()
lines_rendered = 0
for i in range(min(20, len(words) + widget.header_lines)): # Render available lines
try:
# Create a simple strip for testing without full widget dependencies
if widget.words and widget._processed_mask is not None:
# Just test that the rendering logic works
n = len(widget.words)
styles = {
'green': None, 'yellow': None, 'black': None, 'white': None
}
# Test header and matrix row creation logic
if i < widget.header_lines:
# Test header rendering
pass
elif i - widget.header_lines < n:
# Test matrix row rendering
pass
lines_rendered += 1
else:
lines_rendered += 1
except Exception as e:
print(f"Error in line {i}: {e}")
break
line_render_time = time.time() - start_time
print(f"Rendered {lines_rendered} lines in: {line_render_time:.4f}s")
print(f"Average per line: {line_render_time / max(lines_rendered, 1):.6f}s")
return line_render_time < 1.0 and lines_rendered > 0 # Should be fast and render some lines
def test_batch_contents_caching():
"""Test BatchContentsWidget caching."""
print("\nTesting BatchContentsWidget caching...")
widget = BatchContentsWidget()
# Test data
test_text = Text("Sample batch contents with styling")
test_text.stylize("bold red", 0, 6)
# First render
start_time = time.time()
widget.tokens_to_display = test_text
result1 = widget.render()
first_render_time = time.time() - start_time
# Second render with same content - should use cache
start_time = time.time()
result2 = widget.render()
second_render_time = time.time() - start_time
print(f"First render time: {first_render_time:.6f}s")
print(f"Second render time: {second_render_time:.6f}s")
print(f"Cache size: {len(widget._render_cache.keys())}")
return result1 == result2 and len(widget._render_cache.keys()) > 0
def test_color_caching():
"""Test color generation caching."""
print("\nTesting color caching...")
app = ContinuousBatchingVisualizer()
# Test repeated color generation
request_ids = [f"request_{i}" for i in range(10)] * 5 # 50 calls, 10 unique
start_time = time.time()
colors = []
for req_id in request_ids:
color = app._get_cached_color(req_id)
colors.append(color)
total_time = time.time() - start_time
print(f"Generated 50 colors (10 unique) in: {total_time:.4f}s")
print(f"Color cache size: {len(app._color_cache.keys())}")
print(f"Cache hit rate: {(50 - 10) / 50 * 100:.1f}%")
# Verify color consistency
test_color_1 = app._get_cached_color("test_request")
test_color_2 = app._get_cached_color("test_request")
return test_color_1 == test_color_2 and len(app._color_cache.keys()) == 11
def test_cache_widget_optimization():
"""Test CacheWidget static content optimization."""
print("\nTesting CacheWidget optimization...")
widget = CacheWidget()
# Test cache info updates
cache_info1 = {"cache_size": 100, "hit_rate": 0.85}
cache_info2 = {"cache_size": 100, "hit_rate": 0.85} # Same data
cache_info3 = {"cache_size": 120, "hit_rate": 0.90} # Different data
start_time = time.time()
widget.update_cache_info(cache_info1)
first_update_time = time.time() - start_time
start_time = time.time()
widget.update_cache_info(cache_info2) # Should be fast (no change)
second_update_time = time.time() - start_time
start_time = time.time()
widget.update_cache_info(cache_info3) # Should update
third_update_time = time.time() - start_time
print(f"First update: {first_update_time:.6f}s")
print(f"Second update (no change): {second_update_time:.6f}s")
print(f"Third update (changed): {third_update_time:.6f}s")
print(f"Display cache size: {len(widget._display_cache.keys())}")
return second_update_time < first_update_time and len(widget._display_cache.keys()) > 0
async def test_worker_optimization():
"""Test background worker for data processing."""
print("\nTesting worker optimization...")
app = ContinuousBatchingVisualizer()
# Large test data
batch_contents = []
for i in range(50):
batch_contents.append({
"request_id": f"req_{i % 10}", # 10 unique request IDs
"decoded": f"Sample text for request {i} with some longer content",
"decoded_tokens": [f"token_{j}" for j in range(20)]
})
attention_mask = torch.randint(0, 2, (1000, 1000)) # Large attention mask
test_data = {
"batch_contents": batch_contents,
"attention_mask": attention_mask,
"sliding_window": 128,
"token_type_ids": [1] * 1000,
"image_seq_length": 576
}
# Process data (test the async processing part directly)
start_time = time.time()
processed_data = await app._process_data_async(test_data)
processing_time = time.time() - start_time
print(f"Processed large dataset in: {processing_time:.4f}s")
print(f"Data cache size: {len(app._data_processing_cache.keys())}")
print(f"Color cache size: {len(app._color_cache.keys())}")
# Test cache hit
start_time = time.time()
processed_data_cached = await app._process_data_async(test_data)
cached_processing_time = time.time() - start_time
print(f"Cached processing time: {cached_processing_time:.6f}s")
print(f"Cache speedup: {processing_time / max(cached_processing_time, 0.000001):.2f}x")
# Verify that processed data is equivalent
data_matches = (processed_data['colored_text'] == processed_data_cached['colored_text'])
cache_working = len(app._data_processing_cache.keys()) > 0
return (cached_processing_time < processing_time / 2 and # Should be at least 2x faster
data_matches and cache_working) # Data should match and cache should work
def test_memory_efficiency():
"""Test memory efficiency of caching systems."""
print("\nTesting memory efficiency...")
# Test LRU cache eviction
cache = LRUCache(maxsize=5)
# Fill cache
for i in range(10):
cache.set(f"key_{i}", f"value_{i}")
# Should only have 5 items (most recent)
keys = list(cache.keys())
print(f"Cache keys after filling with 10 items (maxsize=5): {keys}")
print(f"Cache size: {len(keys)}")
# Test that old items were evicted
has_old_items = any(f"key_{i}" in keys for i in range(5))
has_new_items = any(f"key_{i}" in keys for i in range(5, 10))
print(f"Has old items (0-4): {has_old_items}")
print(f"Has new items (5-9): {has_new_items}")
return len(keys) == 5 and not has_old_items and has_new_items
async def main():
"""Run all performance tests."""
print("=== Continuous Batching Visualizer Performance Tests ===\n")
tests = [
test_attention_matrix_caching,
test_line_rendering_performance,
test_batch_contents_caching,
test_color_caching,
test_cache_widget_optimization,
test_worker_optimization,
test_memory_efficiency
]
results = []
for test in tests:
try:
if asyncio.iscoroutinefunction(test):
result = await test()
else:
result = test()
results.append(result)
print(f"{test.__name__}: {'PASS' if result else 'FAIL'}")
except Exception as e:
print(f"{test.__name__}: ERROR - {e}")
results.append(False)
print()
# Summary
passed = sum(results)
total = len(results)
print(f"=== Summary: {passed}/{total} tests passed ===")
if passed == total:
print("🎉 All performance optimizations working correctly!")
else:
print("⚠️ Some optimizations need attention.")
return passed == total
if __name__ == "__main__":
asyncio.run(main())
@ -25,8 +25,8 @@ _EXPECTED_OUTPUTS = [
@slow
@require_flash_attn
@require_torch_gpu
@require_flash_attn
class TestBatchGeneration(unittest.TestCase):
@classmethod
def setUpClass(cls):
@ -499,7 +499,7 @@ class GenerationTesterMixin:
model = model_class(config).to(torch_device).eval()
output_generate = self._greedy_generate(model=model, inputs_dict=inputs_dict)
if model.config.get_text_config(decoder=True).is_encoder_decoder:
if model.config.is_encoder_decoder:
self.assertTrue(output_generate.shape[1] == self.max_new_tokens + 1)
else:
self.assertTrue(output_generate.shape[1] == self.max_new_tokens + inputs_dict["input_ids"].shape[1])
@ -523,7 +523,7 @@ class GenerationTesterMixin:
use_cache=False,
)
if model.config.get_text_config(decoder=True).is_encoder_decoder:
if model.config.is_encoder_decoder:
self.assertTrue(output_generate.sequences.shape[1] == self.max_new_tokens + 1)
self.assertIsInstance(output_generate, GenerateEncoderDecoderOutput)
# Retrocompatibility check
@ -563,7 +563,7 @@ class GenerationTesterMixin:
use_cache=True, # Enable cache
)
if model.config.get_text_config(decoder=True).is_encoder_decoder:
if model.config.is_encoder_decoder:
self.assertTrue(output_generate.sequences.shape[1] == self.max_new_tokens + 1)
else:
self.assertTrue(
@ -580,7 +580,7 @@ class GenerationTesterMixin:
model = model_class(config).to(torch_device).eval()
output_generate = self._sample_generate(model=model, inputs_dict=inputs_dict, num_return_sequences=1)
if model.config.get_text_config(decoder=True).is_encoder_decoder:
if model.config.is_encoder_decoder:
self.assertTrue(output_generate.shape[1] == self.max_new_tokens + 1)
else:
self.assertTrue(output_generate.shape[1] == self.max_new_tokens + inputs_dict["input_ids"].shape[1])
@ -605,7 +605,7 @@ class GenerationTesterMixin:
use_cache=False,
)
if model.config.get_text_config(decoder=True).is_encoder_decoder:
if model.config.is_encoder_decoder:
self.assertTrue(output_generate.sequences.shape[1] == self.max_new_tokens + 1)
self.assertIsInstance(output_generate, GenerateEncoderDecoderOutput)
# Retrocompatibility check
@ -630,7 +630,7 @@ class GenerationTesterMixin:
beam_kwargs = self._get_beam_kwargs()
output_generate = self._beam_search_generate(model=model, inputs_dict=inputs_dict, beam_kwargs=beam_kwargs)
if model.config.get_text_config(decoder=True).is_encoder_decoder:
if model.config.is_encoder_decoder:
self.assertTrue(output_generate.shape[1] == self.max_new_tokens + 1)
else:
self.assertTrue(output_generate.shape[1] == self.max_new_tokens + inputs_dict["input_ids"].shape[1])
@ -655,7 +655,7 @@ class GenerationTesterMixin:
return_dict_in_generate=True,
use_cache=False,
)
if model.config.get_text_config(decoder=True).is_encoder_decoder:
if model.config.is_encoder_decoder:
self.assertTrue(output_generate.sequences.shape[1] == self.max_new_tokens + 1)
self.assertIsInstance(output_generate, GenerateBeamEncoderDecoderOutput)
# Retrocompatibility check
@ -704,7 +704,7 @@ class GenerationTesterMixin:
use_cache=True, # Enable cache
)
if model.config.get_text_config(decoder=True).is_encoder_decoder:
if model.config.is_encoder_decoder:
self.assertTrue(output_generate.sequences.shape[1] == self.max_new_tokens + 1)
else:
self.assertTrue(
@ -757,7 +757,7 @@ class GenerationTesterMixin:
beam_kwargs=beam_kwargs,
)
if model.config.get_text_config(decoder=True).is_encoder_decoder:
if model.config.is_encoder_decoder:
self.assertTrue(output_generate.shape[1] == self.max_new_tokens + 1)
else:
self.assertTrue(output_generate.shape[1] == self.max_new_tokens + inputs_dict["input_ids"].shape[1])
@ -784,7 +784,7 @@ class GenerationTesterMixin:
use_cache=False,
)
if model.config.get_text_config(decoder=True).is_encoder_decoder:
if model.config.is_encoder_decoder:
self.assertTrue(output_generate.sequences.shape[1] == self.max_new_tokens + 1)
self.assertIsInstance(output_generate, GenerateBeamEncoderDecoderOutput)
# Retrocompatibility check
@ -838,7 +838,7 @@ class GenerationTesterMixin:
inputs_dict=inputs_dict,
beam_kwargs=beam_kwargs,
)
if model.config.get_text_config(decoder=True).is_encoder_decoder:
if model.config.is_encoder_decoder:
self.assertTrue(output_generate.shape[1] == self.max_new_tokens + 1)
else:
self.assertTrue(output_generate.shape[1] == self.max_new_tokens + inputs_dict["input_ids"].shape[1])
@ -851,7 +851,7 @@ class GenerationTesterMixin:
inputs_dict=inputs_dict,
beam_kwargs=beam_kwargs,
)
if model.config.get_text_config(decoder=True).is_encoder_decoder:
if model.config.is_encoder_decoder:
self.assertTrue(output_generate.shape[1] == self.max_new_tokens + 1)
else:
self.assertTrue(output_generate.shape[1] == self.max_new_tokens + inputs_dict["input_ids"].shape[1])
@ -876,7 +876,7 @@ class GenerationTesterMixin:
return_dict_in_generate=True,
use_cache=False,
)
if model.config.get_text_config(decoder=True).is_encoder_decoder:
if model.config.is_encoder_decoder:
self.assertTrue(output_generate.sequences.shape[1] == self.max_new_tokens + 1)
self.assertIsInstance(output_generate, GenerateBeamEncoderDecoderOutput)
# Retrocompatibility check
@ -921,7 +921,7 @@ class GenerationTesterMixin:
beam_kwargs=beam_kwargs,
)
if model.config.get_text_config(decoder=True).is_encoder_decoder:
if model.config.is_encoder_decoder:
self.assertTrue(output_generate.shape[1] == self.max_new_tokens + 1)
else:
self.assertTrue(output_generate.shape[1] == self.max_new_tokens + inputs_dict["input_ids"].shape[1])
@ -945,7 +945,7 @@ class GenerationTesterMixin:
beam_kwargs=beam_kwargs,
)
if model.config.get_text_config(decoder=True).is_encoder_decoder:
if model.config.is_encoder_decoder:
self.assertTrue(output_generate.shape[1] == self.max_new_tokens + 1)
else:
self.assertTrue(output_generate.shape[1] == self.max_new_tokens + inputs_dict["input_ids"].shape[1])
@ -985,7 +985,7 @@ class GenerationTesterMixin:
use_cache=False,
)
if model.config.get_text_config(decoder=True).is_encoder_decoder:
if model.config.is_encoder_decoder:
self.assertTrue(output_generate.sequences.shape[1] == self.max_new_tokens + 1)
self.assertIsInstance(output_generate, GenerateBeamEncoderDecoderOutput)
# Retrocompatibility check
@ -1029,7 +1029,7 @@ class GenerationTesterMixin:
inputs_dict=inputs_dict,
use_cache=True, # Enable cache
)
if model.config.get_text_config(decoder=True).is_encoder_decoder:
if model.config.is_encoder_decoder:
self.assertTrue(output_generate.shape[1] == self.max_new_tokens + 1)
else:
self.assertTrue(output_generate.shape[1] == self.max_new_tokens + inputs_dict["input_ids"].shape[1])
@ -1065,7 +1065,7 @@ class GenerationTesterMixin:
use_cache=True, # Enable cache
)
if model.config.get_text_config(decoder=True).is_encoder_decoder:
if model.config.is_encoder_decoder:
self.assertTrue(output_generate.sequences.shape[1] == self.max_new_tokens + 1)
else:
self.assertTrue(
@ -1297,7 +1297,7 @@ class GenerationTesterMixin:
config._attn_implementation = "eager"
# Encoder-decoder models are not supported
if config.get_text_config(decoder=True).is_encoder_decoder:
if config.is_encoder_decoder:
self.skipTest("DoLa is not supported for encoder-decoder models")
config.is_decoder = True
model = model_class(config).to(torch_device).eval()
@ -1427,6 +1427,52 @@ class GenerationTesterMixin:
# PLD shouldn't propose any new tokens based on eos-match
self.assertTrue(output_prompt_lookup.shape[-1] == 10)
@pytest.mark.generate
def test_generate_with_head_masking(self):
"""Test designed for encoder-decoder models to ensure the attention head masking is used."""
attention_names = ["encoder_attentions", "decoder_attentions", "cross_attentions"]
for model_class in self.all_generative_model_classes:
config, inputs_dict = self.prepare_config_and_inputs_for_generate()
config._attn_implementation = "eager" # head mask works only in eager mode and will be removed soon
text_config = config.get_text_config()
if self.has_attentions:
config._attn_implementation = "eager" # can't output attentions otherwise
# We want to test only encoder-decoder models
if not text_config.is_encoder_decoder:
continue
model = model_class(config).to(torch_device)
head_masking = {
"head_mask": torch.zeros(
text_config.encoder_layers, text_config.encoder_attention_heads, device=torch_device
),
"decoder_head_mask": torch.zeros(
text_config.decoder_layers, text_config.decoder_attention_heads, device=torch_device
),
"cross_attn_head_mask": torch.zeros(
text_config.decoder_layers, text_config.decoder_attention_heads, device=torch_device
),
}
signature = inspect.signature(model.forward)
# We want to test only models where encoder/decoder head masking is implemented
if not set(head_masking.keys()) < {*signature.parameters.keys()}:
continue
for attn_name, (name, mask) in zip(attention_names, head_masking.items()):
out = model.generate(
num_beams=1,
output_attentions=self.has_attentions,
return_dict_in_generate=True,
remove_invalid_values=True,
**{name: mask},
**inputs_dict,
)
# We check the state of decoder_attentions and cross_attentions just from the last step
attn_weights = out[attn_name] if attn_name == attention_names[0] else out[attn_name][-1]
self.assertEqual(sum([w.sum().item() for w in attn_weights]), 0.0)
@pytest.mark.generate
def test_left_padding_compatibility(self):
# NOTE: left-padding results in small numerical differences. This is expected.
@ -1445,7 +1491,7 @@ class GenerationTesterMixin:
decoder_only_classes = []
for model_class in self.all_generative_model_classes:
config, _ = self.prepare_config_and_inputs_for_generate()
if config.get_text_config(decoder=True).is_encoder_decoder:
if config.is_encoder_decoder:
continue
else:
decoder_only_classes.append(model_class)
@ -1650,7 +1696,7 @@ class GenerationTesterMixin:
# This test is for decoder-only models (encoder-decoder models have native input embeddings support in the
# decoder)
if config.get_text_config(decoder=True).is_encoder_decoder:
if config.is_encoder_decoder:
continue
config.is_decoder = True
@ -1744,7 +1790,7 @@ class GenerationTesterMixin:
config, inputs_dict = self.prepare_config_and_inputs_for_generate()
if config.get_text_config(decoder=True).is_encoder_decoder:
if config.is_encoder_decoder:
self.skipTest(reason="This model is encoder-decoder and has Encoder-Decoder Cache")
model = model_class(config).to(torch_device).eval()
@ -1906,7 +1952,7 @@ class GenerationTesterMixin:
if "token_type_ids" in inputs_dict:
del inputs_dict["token_type_ids"]
if config.get_text_config(decoder=True).is_encoder_decoder:
if config.is_encoder_decoder:
self.skipTest(reason="This model is encoder-decoder")
# TODO (joao, raushan): the correct line below is `if not hasattr(config.get_text_config(), "use_cache")`,
# but it breaks a few models. Fix and then apply `_check_similar_generate_outputs` pattern
@ -1985,7 +2031,7 @@ class GenerationTesterMixin:
set_config_for_less_flaky_test(config)
main_input = inputs_dict[model_class.main_input_name]
if config.get_text_config(decoder=True).is_encoder_decoder:
if config.is_encoder_decoder:
self.skipTest(reason="This model is encoder-decoder and has Encoder-Decoder Cache")
config.is_decoder = True
@ -2137,7 +2183,7 @@ class GenerationTesterMixin:
if not has_defined_cache_implementation:
decoder_cache = (
gen_out.past_key_values.self_attention_cache
if config.get_text_config(decoder=True).is_encoder_decoder
if config.is_encoder_decoder
else gen_out.past_key_values
)
self.assertTrue(isinstance(decoder_cache, DynamicCache))
@ -2163,7 +2209,7 @@ class GenerationTesterMixin:
# sanity checks
decoder_cache = (
gen_out.past_key_values.self_attention_cache
if config.get_text_config(decoder=True).is_encoder_decoder
if config.is_encoder_decoder
else gen_out.past_key_values
)
self.assertFalse(isinstance(decoder_cache, DynamicCache))
@ -2237,7 +2283,7 @@ class GenerationTesterMixin:
else:
self.assertTrue(hasattr(model, "_compiled_call")) # our auto compile should have been called
if model.config.get_text_config(decoder=True).is_encoder_decoder:
if model.config.is_encoder_decoder:
self.assertTrue(output_generate.sequences.shape[1] == self.max_new_tokens + 1)
self.assertIsInstance(output_generate, GenerateEncoderDecoderOutput)
else:
@ -5108,6 +5154,7 @@ class TestAssistedCandidateGeneratorUpdateStrategy(unittest.TestCase):
@parameterized.expand([(is_sklearn_available(),), (False,)])
def test_update_candidate_strategy_no_matches_short(self, sklearn_available):
print("test_update_candidate_strategy_no_matches_short")
self.original_matches = []
self.candidate_generator.matches = self.original_matches
self.num_matches = 0
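The recurring one-line change in the hunks above toggles between config.is_encoder_decoder and config.get_text_config(decoder=True).is_encoder_decoder. As a hedged illustration with toy configs (hypothetical classes, not any real checkpoint), the sketch below shows how the two checks can disagree when the encoder-decoder flag lives on a nested text config rather than on the wrapper:
from transformers import PretrainedConfig

class ToyTextConfig(PretrainedConfig):
    model_type = "toy_text"

class ToyWrapperConfig(PretrainedConfig):
    model_type = "toy_wrapper"
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # The generative backbone is encoder-decoder, but the wrapper itself is not flagged.
        self.text_config = ToyTextConfig(is_encoder_decoder=True)

cfg = ToyWrapperConfig()
print(cfg.is_encoder_decoder)                                # False: flag defaults on the wrapper
print(cfg.get_text_config(decoder=True).is_encoder_decoder)  # True: read from the nested text config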
@ -171,7 +171,6 @@ class AriaVisionText2TextModelTester:
return config, inputs_dict
@slow
@require_torch
class AriaForConditionalGenerationModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase):
"""

View File

@ -34,7 +34,6 @@ from transformers.models.bark.generation_configuration_bark import (
BarkSemanticGenerationConfig,
)
from transformers.testing_utils import (
backend_torch_accelerator_module,
require_flash_attn,
require_torch,
require_torch_accelerator,
@ -1307,7 +1306,7 @@ class BarkModelIntegrationTests(unittest.TestCase):
# standard generation
output_with_no_offload = self.model.generate(**input_ids, do_sample=False, temperature=1.0)
torch_accelerator_module = backend_torch_accelerator_module(torch_device)
torch_accelerator_module = getattr(torch, torch_device, torch.cuda)
torch_accelerator_module.empty_cache()
@ -468,6 +468,13 @@ class BigBirdPegasusModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineT
def test_load_save_without_tied_weights(self):
pass
def test_generate_with_head_masking(self):
# overwritten to temporarily switch the attention type to `original_full`
original_self_attention_type = self.model_tester.attention_type
self.model_tester.attention_type = "original_full"
super().test_generate_with_head_masking()
self.model_tester.attention_type = original_self_attention_type
@require_torch
@require_sentencepiece
@ -782,7 +782,7 @@ class BlipVQAModelTester:
@require_vision
class BlipVQAModelTest(ModelTesterMixin, unittest.TestCase):
all_model_classes = (BlipForQuestionAnswering,) if is_torch_available() else ()
# Doesn't run generation tests due to custom generation logic -- won't fix
# Doesn't run generation tests. There are interface mismatches when using `generate` -- TODO @gante
all_generative_model_classes = ()
fx_compatible = False
test_head_masking = False
@ -1091,7 +1091,7 @@ class BlipTextRetrievalModelTest(ModelTesterMixin, unittest.TestCase):
@require_torch
class BlipTextImageModelTest(ModelTesterMixin, unittest.TestCase):
all_model_classes = (BlipForConditionalGeneration,) if is_torch_available() else ()
# Doesn't run generation tests due to custom generation logic -- wont fix
# Doesn't run generation tests. There are interface mismatches when using `generate` -- TODO @gante
all_generative_model_classes = ()
fx_compatible = False
test_head_masking = False
@ -774,7 +774,6 @@ class Blip2TextModelTester:
bos_token_id=self.pad_token_id,
pad_token_id=self.pad_token_id,
decoder_start_token_id=self.decoder_start_token_id,
is_encoder_decoder=True,
)
@ -796,9 +795,6 @@ class Blip2ModelTester:
self.text_model_tester = Blip2TextModelTester(parent, **text_kwargs)
self.batch_size = self.text_model_tester.batch_size # need bs for batching_equivalence test
self.seq_length = self.text_model_tester.seq_length # need seq_length for common tests
self.encoder_seq_length = (
self.text_model_tester.encoder_seq_length + num_query_tokens
) # need enc seq_length for gen tests
self.is_training = is_training
self.num_query_tokens = num_query_tokens
@ -863,9 +859,11 @@ class Blip2ModelTester:
@require_torch
class Blip2ModelTest(ModelTesterMixin, PipelineTesterMixin, GenerationTesterMixin, unittest.TestCase):
class Blip2ModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
all_model_classes = (Blip2ForConditionalGeneration, Blip2Model) if is_torch_available() else ()
additional_model_inputs = ["input_ids", "decoder_input_ids"]
# Doesn't run generation tests. TODO: fix generation tests for Blip2ForConditionalGeneration
all_generative_model_classes = ()
pipeline_model_mapping = (
{
"feature-extraction": Blip2Model,
@ -1710,14 +1708,10 @@ class Blip2ModelIntegrationTest(unittest.TestCase):
expectations = Expectations(
{
("xpu", 3): [
[0, 3, 9, 2335, 19, 1556, 28, 160, 1782, 30, 8, 2608, 1],
"a woman is playing with her dog on the beach",
],
("cuda", 7): [
[0, 3, 9, 2335, 19, 1556, 28, 160, 1782, 30, 8, 2608, 1],
"a woman is playing with her dog on the beach",
],
]
}
)
expected_outputs = expectations.get_expectation()
@ -1735,14 +1729,10 @@ class Blip2ModelIntegrationTest(unittest.TestCase):
expectations = Expectations(
{
("xpu", 3): [
[0, 3, 7, 152, 2515, 11389, 3523, 1],
"san francisco",
],
("cuda", 7): [
[0, 3, 7, 152, 2515, 11389, 3523, 1],
"san francisco",
],
]
}
)
expected_outputs = expectations.get_expectation()
@ -1765,14 +1755,10 @@ class Blip2ModelIntegrationTest(unittest.TestCase):
expectations = Expectations(
{
("xpu", 3): [
[0, 3, 9, 2335, 19, 1556, 28, 160, 1782, 30, 8, 2608, 1],
[0, 3, 9, 2335, 19, 1556, 28, 160, 1782, 30, 8, 2608, 1],
],
("cuda", 7): [
[0, 3, 9, 2335, 19, 1556, 28, 160, 1782, 30, 8, 2608, 1],
[0, 3, 9, 2335, 19, 1556, 28, 160, 1782, 30, 8, 2608, 1],
],
]
}
)
expected_predictions = expectations.get_expectation()
@ -420,7 +420,6 @@ class ChameleonIntegrationTest(unittest.TestCase):
# greedy generation outputs
EXPECTED_TEXT_COMPLETIONS = Expectations(
{
("xpu", 3): ['Describe what do you see here and tell me about the history behind it?The image depicts a star map, with a bright blue dot in the center representing the star Altair. The star map is set against a black background, with the constellations visible in the night'],
("cuda", 7): ['Describe what do you see here and tell me about the history behind it?The image depicts a star map, with a bright blue dot in the center representing the star Alpha Centauri. The star map is a representation of the night sky, showing the positions of stars in'],
("cuda", 8): ['Describe what do you see here and tell me about the history behind it?The image depicts a star map, with a bright blue dot representing the position of the star Alpha Centauri. Alpha Centauri is the brightest star in the constellation Centaurus and is located'],
}
@ -458,10 +457,6 @@ class ChameleonIntegrationTest(unittest.TestCase):
# greedy generation outputs
EXPECTED_TEXT_COMPLETIONS = Expectations(
{
("xpu", 3): [
'Describe what do you see here and tell me about the history behind it?The image depicts a star map, with a bright blue dot in the center representing the star Altair. The star map is set against a black background, with the constellations visible in the night',
'What constellation is this image showing?The image shows the constellation of Orion.The image shows the constellation of Orion.The image shows the constellation of Orion.The image shows the constellation of Orion.',
],
("cuda", 7): [
'Describe what do you see here and tell me about the history behind it?The image depicts a star map, with a bright blue line extending across the center of the image. The line is labeled "390 light years" and is accompanied by a small black and',
'What constellation is this image showing?The image shows the constellation of Orion.The image shows the constellation of Orion.The image shows the constellation of Orion.The image shows the constellation of Orion.',
@ -19,7 +19,7 @@ from transformers import CohereConfig, is_torch_available
from transformers.testing_utils import (
require_bitsandbytes,
require_torch,
require_torch_multi_accelerator,
require_torch_multi_gpu,
require_torch_sdpa,
slow,
torch_device,
@ -203,7 +203,7 @@ class CohereModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMix
@require_torch
@slow
class CohereIntegrationTest(unittest.TestCase):
@require_torch_multi_accelerator
@require_torch_multi_gpu
@require_bitsandbytes
def test_batched_4bit(self):
model_id = "CohereForAI/c4ai-command-r-v01-4bit"

View File

@ -14,6 +14,7 @@
# limitations under the License.
"""Testing suite for the PyTorch ColQwen2 model."""
import gc
import unittest
from typing import ClassVar
@ -26,15 +27,7 @@ from transformers import is_torch_available
from transformers.models.colqwen2.configuration_colqwen2 import ColQwen2Config
from transformers.models.colqwen2.modeling_colqwen2 import ColQwen2ForRetrieval, ColQwen2ForRetrievalOutput
from transformers.models.colqwen2.processing_colqwen2 import ColQwen2Processor
from transformers.testing_utils import (
Expectations,
cleanup,
require_bitsandbytes,
require_torch,
require_vision,
slow,
torch_device,
)
from transformers.testing_utils import require_torch, require_vision, slow, torch_device
if is_torch_available():
@ -289,9 +282,9 @@ class ColQwen2ModelIntegrationTest(unittest.TestCase):
self.processor = ColQwen2Processor.from_pretrained(self.model_name)
def tearDown(self):
cleanup(torch_device, gc_collect=True)
gc.collect()
torch.cuda.empty_cache()
@require_bitsandbytes
@slow
def test_model_integration_test(self):
"""
@ -300,7 +293,7 @@ class ColQwen2ModelIntegrationTest(unittest.TestCase):
model = ColQwen2ForRetrieval.from_pretrained(
self.model_name,
torch_dtype=torch.bfloat16,
load_in_8bit=True,
device_map=torch_device,
).eval()
# Load the test dataset
@ -328,20 +321,13 @@ class ColQwen2ModelIntegrationTest(unittest.TestCase):
self.assertTrue((scores.argmax(axis=1) == torch.arange(len(ds), device=scores.device)).all())
# Further validation: fine-grained check, with a hardcoded score from the original Hf implementation.
expectations = Expectations(
{
("cuda", 7): [
[15.5000, 8.1250, 14.9375],
[9.0625, 17.1250, 10.6875],
[15.9375, 12.1875, 20.2500],
],
("cuda", 8): [
[15.1250, 8.6875, 15.0625],
[9.2500, 17.2500, 10.3750],
[15.9375, 12.3750, 20.2500],
],
}
expected_scores = torch.tensor(
[
[16.2500, 7.8750, 14.6875],
[9.5000, 17.1250, 10.5000],
[14.9375, 10.9375, 20.0000],
],
dtype=scores.dtype,
)
expected_scores = torch.tensor(expectations.get_expectation(), dtype=scores.dtype)
assert torch.allclose(scores, expected_scores, atol=1e-3), f"Expected scores {expected_scores}, got {scores}"
@ -19,13 +19,7 @@ import unittest
import numpy as np
from transformers.testing_utils import (
require_torch,
require_torch_accelerator,
require_vision,
slow,
torch_device,
)
from transformers.testing_utils import require_torch, require_torch_gpu, require_vision, slow
from transformers.utils import is_torch_available, is_torchvision_available, is_vision_available
from ...test_image_processing_common import AnnotationFormatTestMixin, ImageProcessingTestMixin, prepare_image_inputs
@ -613,9 +607,9 @@ class DeformableDetrImageProcessingTest(AnnotationFormatTestMixin, ImageProcessi
self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 50, 50]))
@slow
@require_torch_accelerator
# Copied from tests.models.detr.test_image_processing_detr.DetrImageProcessingTest.test_fast_processor_equivalence_cpu_accelerator_coco_detection_annotations
def test_fast_processor_equivalence_cpu_accelerator_coco_detection_annotations(self):
@require_torch_gpu
# Copied from tests.models.detr.test_image_processing_detr.DetrImageProcessingTest.test_fast_processor_equivalence_cpu_gpu_coco_detection_annotations
def test_fast_processor_equivalence_cpu_gpu_coco_detection_annotations(self):
# prepare image and target
image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
with open("./tests/fixtures/tests_samples/COCO/coco_annotations.txt") as f:
@ -628,8 +622,8 @@ class DeformableDetrImageProcessingTest(AnnotationFormatTestMixin, ImageProcessi
# 1. run processor on CPU
encoding_cpu = processor(images=image, annotations=target, return_tensors="pt", device="cpu")
# 2. run processor on accelerator
encoding_gpu = processor(images=image, annotations=target, return_tensors="pt", device=torch_device)
# 2. run processor on GPU
encoding_gpu = processor(images=image, annotations=target, return_tensors="pt", device="cuda")
# verify pixel values
self.assertEqual(encoding_cpu["pixel_values"].shape, encoding_gpu["pixel_values"].shape)
@ -671,9 +665,9 @@ class DeformableDetrImageProcessingTest(AnnotationFormatTestMixin, ImageProcessi
torch.testing.assert_close(encoding_cpu["labels"][0]["size"], encoding_gpu["labels"][0]["size"].to("cpu"))
@slow
@require_torch_accelerator
# Copied from tests.models.detr.test_image_processing_detr.DetrImageProcessingTest.test_fast_processor_equivalence_cpu_accelerator_coco_panoptic_annotations
def test_fast_processor_equivalence_cpu_accelerator_coco_panoptic_annotations(self):
@require_torch_gpu
# Copied from tests.models.detr.test_image_processing_detr.DetrImageProcessingTest.test_fast_processor_equivalence_cpu_gpu_coco_panoptic_annotations
def test_fast_processor_equivalence_cpu_gpu_coco_panoptic_annotations(self):
# prepare image, target and masks_path
image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
with open("./tests/fixtures/tests_samples/COCO/coco_panoptic_annotations.txt") as f:
@ -690,9 +684,9 @@ class DeformableDetrImageProcessingTest(AnnotationFormatTestMixin, ImageProcessi
encoding_cpu = processor(
images=image, annotations=target, masks_path=masks_path, return_tensors="pt", device="cpu"
)
# 2. run processor on accelerator
# 2. run processor on GPU
encoding_gpu = processor(
images=image, annotations=target, masks_path=masks_path, return_tensors="pt", device=torch_device
images=image, annotations=target, masks_path=masks_path, return_tensors="pt", device="cuda"
)
# verify pixel values
@ -746,7 +746,7 @@ class DeformableDetrModelIntegrationTests(unittest.TestCase):
torch.testing.assert_close(outputs.pred_boxes[0, :3, :3], expected_boxes, rtol=1e-4, atol=1e-4)
@require_torch_accelerator
def test_inference_object_detection_head_equivalence_cpu_accelerator(self):
def test_inference_object_detection_head_equivalence_cpu_gpu(self):
image_processor = self.default_image_processor
image = prepare_img()
encoding = image_processor(images=image, return_tensors="pt")
@ -759,7 +759,7 @@ class DeformableDetrModelIntegrationTests(unittest.TestCase):
with torch.no_grad():
cpu_outputs = model(pixel_values, pixel_mask)
# 2. run model on accelerator
# 2. run model on GPU
model.to(torch_device)
with torch.no_grad():
@ -18,14 +18,7 @@ import unittest
import numpy as np
from transformers.testing_utils import (
require_torch,
require_torch_accelerator,
require_torchvision,
require_vision,
slow,
torch_device,
)
from transformers.testing_utils import require_torch, require_torch_gpu, require_torchvision, require_vision, slow
from transformers.utils import is_torch_available, is_torchvision_available, is_vision_available
from ...test_image_processing_common import AnnotationFormatTestMixin, ImageProcessingTestMixin, prepare_image_inputs
@ -673,9 +666,9 @@ class DetrImageProcessingTest(AnnotationFormatTestMixin, ImageProcessingTestMixi
self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 50, 50]))
@slow
@require_torch_accelerator
@require_torch_gpu
@require_torchvision
def test_fast_processor_equivalence_cpu_accelerator_coco_detection_annotations(self):
def test_fast_processor_equivalence_cpu_gpu_coco_detection_annotations(self):
# prepare image and target
image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
with open("./tests/fixtures/tests_samples/COCO/coco_annotations.txt") as f:
@ -686,8 +679,8 @@ class DetrImageProcessingTest(AnnotationFormatTestMixin, ImageProcessingTestMixi
processor = self.image_processor_list[1]()
# 1. run processor on CPU
encoding_cpu = processor(images=image, annotations=target, return_tensors="pt", device="cpu")
# 2. run processor on accelerator
encoding_gpu = processor(images=image, annotations=target, return_tensors="pt", device=torch_device)
# 2. run processor on GPU
encoding_gpu = processor(images=image, annotations=target, return_tensors="pt", device="cuda")
# verify pixel values
self.assertEqual(encoding_cpu["pixel_values"].shape, encoding_gpu["pixel_values"].shape)
@ -729,9 +722,9 @@ class DetrImageProcessingTest(AnnotationFormatTestMixin, ImageProcessingTestMixi
torch.testing.assert_close(encoding_cpu["labels"][0]["size"], encoding_gpu["labels"][0]["size"].to("cpu"))
@slow
@require_torch_accelerator
@require_torch_gpu
@require_torchvision
def test_fast_processor_equivalence_cpu_accelerator_coco_panoptic_annotations(self):
def test_fast_processor_equivalence_cpu_gpu_coco_panoptic_annotations(self):
# prepare image, target and masks_path
image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
with open("./tests/fixtures/tests_samples/COCO/coco_panoptic_annotations.txt") as f:
@ -746,9 +739,9 @@ class DetrImageProcessingTest(AnnotationFormatTestMixin, ImageProcessingTestMixi
encoding_cpu = processor(
images=image, annotations=target, masks_path=masks_path, return_tensors="pt", device="cpu"
)
# 2. run processor on accelerator
# 2. run processor on GPU
encoding_gpu = processor(
images=image, annotations=target, masks_path=masks_path, return_tensors="pt", device=torch_device
images=image, annotations=target, masks_path=masks_path, return_tensors="pt", device="cuda"
)
# verify pixel values
@ -258,14 +258,10 @@ class Gemma2IntegrationTest(unittest.TestCase):
# EXPECTED_TEXTS should match the same non-pipeline test, minus the special tokens
EXPECTED_BATCH_TEXTS = Expectations(
{
("xpu", 3): [
"Hello I am doing a project on the 1960s and I am trying to find out what the average",
"Hi today I'm going to be talking about the 10 most powerful characters in the Naruto series.",
],
("cuda", 8): [
"Hello I am doing a project on the 1960s and I am trying to find out what the average",
"Hi today I'm going to be talking about the 10 most powerful characters in the Naruto series.",
],
]
}
)
EXPECTED_BATCH_TEXT = EXPECTED_BATCH_TEXTS.get_expectation()
@ -319,9 +315,6 @@ class Gemma2IntegrationTest(unittest.TestCase):
tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-2b", pad_token="</s>", padding_side="right")
EXPECTED_TEXT_COMPLETIONS = Expectations(
{
("xpu", 3): [
"Hello I am doing a project for my school and I need to know how to make a program that will take a number"
],
("cuda", 7): [
"Hello I am doing a project for my school and I need to know how to make a program that will take a number"
],
@ -31,7 +31,6 @@ from transformers.testing_utils import (
Expectations,
cleanup,
is_flash_attn_2_available,
require_deterministic_for_xpu,
require_flash_attn,
require_read_token,
require_torch,
@ -387,7 +386,6 @@ class Gemma3IntegrationTest(unittest.TestCase):
def tearDown(self):
cleanup(torch_device, gc_collect=True)
@require_deterministic_for_xpu
def test_model_4b_bf16(self):
model_id = "google/gemma-3-4b-it"
@ -408,7 +406,6 @@ class Gemma3IntegrationTest(unittest.TestCase):
EXPECTED_TEXTS = Expectations(
{
("xpu", 3): ['user\nYou are a helpful assistant.\n\n\n\n\n\nWhat is shown in this image?\nmodel\nCertainly! \n\nThe image shows a brown and white cow standing on a sandy beach with turquoise water in the background. It looks like a lovely,'],
("cuda", 7): ['user\nYou are a helpful assistant.\n\n\n\n\n\nWhat is shown in this image?\nmodel\nCertainly! \n\nThe image shows a brown and white cow standing on a sandy beach with turquoise water in the background. It looks like a lovely,'],
("cuda", 8): ['user\nYou are a helpful assistant.\n\n\n\n\n\nWhat is shown in this image?\nmodel\nCertainly! \n\nThe image shows a brown and white cow standing on a sandy beach next to a turquoise ocean. It looks like a very sunny and'],
}
@ -417,7 +414,6 @@ class Gemma3IntegrationTest(unittest.TestCase):
self.assertEqual(output_text, EXPECTED_TEXT)
@require_torch_large_accelerator
@require_deterministic_for_xpu
def test_model_4b_batch(self):
model_id = "google/gemma-3-4b-it"
@ -454,17 +450,12 @@ class Gemma3IntegrationTest(unittest.TestCase):
EXPECTED_TEXTS = Expectations(
{
("xpu", 3):
[
'user\nYou are a helpful assistant.\n\n\n\n\n\nWhat is shown in this image?\nmodel\nCertainly! \n\nThe image shows a brown and white cow standing on a sandy beach next to a turquoise ocean. It looks like a very sunny and',
'user\nYou are a helpful assistant.\n\n\n\n\n\n\n\n\n\nAre these images identical?\nmodel\nNo, these images are not identical. They depict very different scenes:\n\n* **Image 1** shows a cow standing on a beach.',
],
("cuda", 7): [],
("cuda", 8):
[
'user\nYou are a helpful assistant.\n\n\n\n\n\nWhat is shown in this image?\nmodel\nCertainly! \n\nThe image shows a brown and white cow standing on a sandy beach next to a turquoise ocean. It looks like a very sunny and',
'user\nYou are a helpful assistant.\n\n\n\n\n\n\n\n\n\nAre these images identical?\nmodel\nNo, these images are not identical. They depict very different scenes:\n\n* **Image 1** shows a cow standing on a beach.',
],
]
}
) # fmt: skip
EXPECTED_TEXT = EXPECTED_TEXTS.get_expectation()
@ -502,9 +493,8 @@ class Gemma3IntegrationTest(unittest.TestCase):
EXPECTED_NUM_IMAGES = 3 # one for the origin image and two crops of images
EXPECTED_TEXTS = Expectations(
{
("xpu", 3): ['user\nYou are a helpful assistant.\n\nHere is the original image \n\n\n\n and here are some crops to help you see better \n\n\n\n \n\n\n\nWhat is shown in this image?\nmodel\nThe image shows a brown cow standing on a sandy beach next to a turquoise ocean. There are clouds in the blue sky above.'],
("cuda", 7): [],
("cuda", 8): ['user\nYou are a helpful assistant.\n\nHere is the original image \n\n\n\n and here are some crops to help you see better \n\n\n\n \n\n\n\nWhat is shown in this image?\nmodel\nThe image shows a brown cow standing on a sandy beach next to a turquoise ocean. There are clouds in the blue sky above.'],
("cuda", 8): ['user\nYou are a helpful assistant.\n\nHere is the original image \n\n\n\n and here are some crops to help you see better \n\n\n\n \n\n\n\nWhat is shown in this image?\nmodel\nThe image shows a brown cow standing on a sandy beach next to a turquoise ocean. There are clouds in the blue sky above.']
}
) # fmt: skip
EXPECTED_TEXT = EXPECTED_TEXTS.get_expectation()
@ -512,7 +502,6 @@ class Gemma3IntegrationTest(unittest.TestCase):
self.assertEqual(output_text, EXPECTED_TEXT)
@require_torch_large_accelerator
@require_deterministic_for_xpu
def test_model_4b_batch_crops(self):
model_id = "google/gemma-3-4b-it"
@ -557,15 +546,11 @@ class Gemma3IntegrationTest(unittest.TestCase):
EXPECTED_NUM_IMAGES = 9 # 3 * (one for the origin image and two crops of images) = 9
EXPECTED_TEXTS = Expectations(
{
("xpu", 3): [
'user\nYou are a helpful assistant.\n\nHere is the original image \n\n\n\n and here are some crops to help you see better \n\n\n\n \n\n\n\nWhat is shown in this image?\nmodel\nThe image shows a brown cow standing on a sandy beach next to a turquoise ocean. There are clouds in the blue sky above.',
'user\nYou are a helpful assistant.\n\nHere is the original image \n\n\n\n and here are some crops to help you see better \n\n\n\n \n\n\n\nHere is the original image \n\n\n\n and here are some crops to help you see better \n\n\n\n \n\n\n\nAre these images identical?\nmodel\nNo, the images are not identical. \n\nThe first image shows a cow on a beach, while the second image shows a street scene with a',
],
("cuda", 7): [],
("cuda", 8): [
'user\nYou are a helpful assistant.\n\nHere is the original image \n\n\n\n and here are some crops to help you see better \n\n\n\n \n\n\n\nWhat is shown in this image?\nmodel\nThe image shows a brown cow standing on a sandy beach next to a turquoise ocean. There are clouds in the blue sky above.',
'user\nYou are a helpful assistant.\n\nHere is the original image \n\n\n\n and here are some crops to help you see better \n\n\n\n \n\n\n\nHere is the original image \n\n\n\n and here are some crops to help you see better \n\n\n\n \n\n\n\nAre these images identical?\nmodel\nNo, the images are not identical. \n\nThe first image shows a cow on a beach, while the second image shows a street scene with a',
],
]
}
) # fmt: skip
EXPECTED_TEXT = EXPECTED_TEXTS.get_expectation()
@ -604,15 +589,13 @@ class Gemma3IntegrationTest(unittest.TestCase):
output_text = self.processor.batch_decode(output, skip_special_tokens=True)
EXPECTED_TEXTS = Expectations(
{
("xpu", 3): ["user\nYou are a helpful assistant.\n\n\n\n\n\nWhat do you see here?\nmodel\nOkay, let's break down what I see in this image!\n\nHere's a description of the scene:\n\n* **Chinese Arch"],
("cuda", 7): [],
("cuda", 8): ["user\nYou are a helpful assistant.\n\n\n\n\n\nWhat do you see here?\nmodel\nOkay, let's break down what I see in this image:\n\n**Main Features:**\n\n* **Chinese Archway:** The most prominent"],
("cuda", 8): ["user\nYou are a helpful assistant.\n\n\n\n\n\nWhat do you see here?\nmodel\nOkay, let's break down what I see in this image:\n\n**Main Features:**\n\n* **Chinese Archway:** The most prominent"]
}
) # fmt: skip
EXPECTED_TEXT = EXPECTED_TEXTS.get_expectation()
self.assertEqual(output_text, EXPECTED_TEXT)
@require_deterministic_for_xpu
def test_model_1b_text_only(self):
model_id = "google/gemma-3-1b-it"
@ -627,7 +610,6 @@ class Gemma3IntegrationTest(unittest.TestCase):
EXPECTED_TEXTS = Expectations(
{
("xpu", 3): ['Write a poem about Machine Learning.\n\n---\n\nThe data flows, a river deep,\nWith patterns hidden, secrets sleep.\nA neural net, a watchful eye,\nLearning'],
("cuda", 7): ['Write a poem about Machine Learning.\n\n---\n\nThe data flows, a silent stream,\nInto the neural net, a waking dream.\nAlgorithms hum, a coded grace,\n'],
("cuda", 8): ['Write a poem about Machine Learning.\n\n---\n\nThe data flows, a silent stream,\nInto the neural net, a waking dream.\nAlgorithms hum, a coded grace,\n'],
}
@ -659,7 +641,6 @@ class Gemma3IntegrationTest(unittest.TestCase):
EXPECTED_TEXTS = Expectations(
{
("xpu", 3): ['user\nYou are a helpful assistant.\n\n\n\n\n\nWhat is shown in this image?\nmodel\nThe image shows a brown and white cow standing on a sandy beach with turquoise water and a distant island in the background. It looks like a sunny day'],
("cuda", 7): [],
("cuda", 8): ['user\nYou are a helpful assistant.\n\n\n\n\n\nWhat is shown in this image?\nmodel\nThe image shows a brown and white cow standing on a sandy beach with turquoise water and a distant island in the background. It looks like a sunny day'],
}

View File

@ -24,7 +24,6 @@ from transformers.testing_utils import (
cleanup,
require_flash_attn,
require_torch,
require_torch_large_accelerator,
require_torch_large_gpu,
require_torch_sdpa,
slow,
@ -80,7 +79,7 @@ class Glm4ModelTest(CausalLMModelTest, unittest.TestCase):
@slow
@require_torch_large_accelerator
@require_torch_large_gpu
class Glm4IntegrationTest(unittest.TestCase):
input_text = ["Hello I am doing", "Hi today"]
model_id = "THUDM/GLM-4-9B-0414"
@ -91,10 +90,6 @@ class Glm4IntegrationTest(unittest.TestCase):
def test_model_9b_fp16(self):
EXPECTED_TEXTS = Expectations(
{
("xpu", 3): [
"Hello I am doing a project on the history of the internet and I need to know what the first website was and what",
"Hi today I am going to tell you about the most common disease in the world. This disease is called diabetes",
],
("cuda", 7): [],
("cuda", 8): [
"Hello I am doing a project on the history of the internet and I need to know what the first website was and what",
@ -119,10 +114,6 @@ class Glm4IntegrationTest(unittest.TestCase):
def test_model_9b_bf16(self):
EXPECTED_TEXTS = Expectations(
{
("xpu", 3): [
"Hello I am doing a project on the history of the internet and I need to know what the first website was and what",
"Hi today I am going to tell you about the most common disease in the world. This disease is called diabetes",
],
("cuda", 7): [],
("cuda", 8): [
"Hello I am doing a project on the history of the internet and I need to know what the first website was and what",
@ -147,10 +138,6 @@ class Glm4IntegrationTest(unittest.TestCase):
def test_model_9b_eager(self):
EXPECTED_TEXTS = Expectations(
{
("xpu", 3): [
"Hello I am doing a project on the history of the internet and I need to know what the first website was and who",
"Hi today I am going to tell you about the most common disease in the world. This disease is called diabetes",
],
("cuda", 7): [],
("cuda", 8): [
"Hello I am doing a project on the history of the internet and I need to know what the first website was and what",
@ -180,10 +167,6 @@ class Glm4IntegrationTest(unittest.TestCase):
def test_model_9b_sdpa(self):
EXPECTED_TEXTS = Expectations(
{
("xpu", 3): [
"Hello I am doing a project on the history of the internet and I need to know what the first website was and what",
"Hi today I am going to tell you about the most common disease in the world. This disease is called diabetes",
],
("cuda", 7): [],
("cuda", 8): [
"Hello I am doing a project on the history of the internet and I need to know what the first website was and what",
@ -210,7 +193,6 @@ class Glm4IntegrationTest(unittest.TestCase):
self.assertEqual(output_text, EXPECTED_TEXT)
@require_flash_attn
@require_torch_large_gpu
@pytest.mark.flash_attn_test
def test_model_9b_flash_attn(self):
EXPECTED_TEXTS = Expectations(

View File

@ -718,7 +718,7 @@ class GroundingDinoModelIntegrationTests(unittest.TestCase):
@require_torch_accelerator
@is_flaky()
def test_inference_object_detection_head_equivalence_cpu_accelerator(self):
def test_inference_object_detection_head_equivalence_cpu_gpu(self):
processor = self.default_processor
image = prepare_img()
text = prepare_text()
@ -730,7 +730,7 @@ class GroundingDinoModelIntegrationTests(unittest.TestCase):
with torch.no_grad():
cpu_outputs = model(**encoding)
# 2. run model on accelerator
# 2. run model on GPU
model.to(torch_device)
encoding = encoding.to(torch_device)
with torch.no_grad():

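The equivalence hunks above follow one pattern: run the model once on CPU, move the model and inputs to the accelerator, and compare the two sets of outputs. A compact sketch of that pattern, assuming a model output that exposes .logits (the helper name below is hypothetical, not part of the test suite):

import torch

def assert_cpu_device_equivalence(model, encoding, device, atol=1e-4, rtol=1e-4):
    # 1. run the model on CPU
    model = model.eval()
    with torch.no_grad():
        cpu_outputs = model(**encoding)
    # 2. run the model on the target device and compare logits within a tolerance
    model.to(device)
    encoding = {k: v.to(device) if torch.is_tensor(v) else v for k, v in encoding.items()}
    with torch.no_grad():
        device_outputs = model(**encoding)
    torch.testing.assert_close(cpu_outputs.logits, device_outputs.logits.cpu(), atol=atol, rtol=rtol)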
View File

@ -324,8 +324,10 @@ class IdeficsModelTester:
@require_torch
class IdeficsModelTest(ModelTesterMixin, PipelineTesterMixin, GenerationTesterMixin, unittest.TestCase):
class IdeficsModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
all_model_classes = (IdeficsModel, IdeficsForVisionText2Text) if is_torch_available() else ()
# Doesn't run generation tests here -- idefics has a dedicated tester for generation tests below
all_generative_model_classes = ()
pipeline_model_mapping = (
{"feature-extraction": IdeficsModel, "image-text-to-text": IdeficsForVisionText2Text}
if is_torch_available()
@ -334,7 +336,6 @@ class IdeficsModelTest(ModelTesterMixin, PipelineTesterMixin, GenerationTesterMi
test_pruning = False
test_headmasking = False
test_torchscript = False
has_attentions = False  # only supports SDPA and thus no attention probs returned
def _prepare_for_class(self, inputs_dict, model_class, return_labels=False):
inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels)
@ -493,31 +494,6 @@ class IdeficsModelTest(ModelTesterMixin, PipelineTesterMixin, GenerationTesterMi
def test_retain_grad_hidden_states_attentions(self):
return
@pytest.mark.generate
@unittest.skip(reason="""IDEFICS cannot generate with no images provided!""")
def test_generate_without_input_ids(self):
pass
@pytest.mark.generate
@unittest.skip(reason="""IDEFICS cannot generate with no images provided!""")
def test_generate_continue_from_inputs_embeds(self):
pass
@pytest.mark.generate
@unittest.skip(reason="""IDEFICS cannot do contrastive generation yet and it is not worth fixing""")
def test_contrastive_generate(self):
pass
@pytest.mark.generate
@unittest.skip(reason="""IDEFICS cannot do contrastive generation yet and it is not worth fixing""")
def test_contrastive_generate_low_memory(self):
pass
@pytest.mark.generate
@unittest.skip(reason="""IDEFICS cannot do contrastive generation yet and it is not worth fixing""")
def test_contrastive_generate_dict_outputs_use_cache(self):
pass
def test_attention_outputs(self):
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
config.return_dict = True

View File

@ -18,7 +18,7 @@ import unittest
from transformers import is_torch_available
from transformers.testing_utils import (
require_read_token,
require_torch_large_accelerator,
require_torch_large_gpu,
slow,
torch_device,
)
@ -34,7 +34,7 @@ if is_torch_available():
@slow
@require_torch_large_accelerator
@require_torch_large_gpu
@require_read_token
class Llama4IntegrationTest(unittest.TestCase):
model_id = "meta-llama/Llama-4-Scout-17B-16E"

View File

@ -626,6 +626,40 @@ class LongT5ModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMix
model = LongT5Model.from_pretrained(model_name)
self.assertIsNotNone(model)
def test_generate_with_head_masking(self):
attention_names = ["encoder_attentions", "decoder_attentions", "cross_attentions"]
config_and_inputs = self.model_tester.prepare_config_and_inputs()
config = config_and_inputs[0]
max_length = config_and_inputs[1].shape[-1] + 3
model = LongT5ForConditionalGeneration(config).eval()
model.to(torch_device)
head_masking = {
"head_mask": torch.zeros(config.num_layers, config.num_heads, device=torch_device),
"decoder_head_mask": torch.zeros(config.num_decoder_layers, config.num_heads, device=torch_device),
"cross_attn_head_mask": torch.zeros(config.num_decoder_layers, config.num_heads, device=torch_device),
}
for attn_name, (name, mask) in zip(attention_names, head_masking.items()):
head_masks = {name: mask}
# Explicitly pass decoder_head_mask as it is required by the LONGT5 model when head_mask is specified
if name == "head_mask":
head_masks["decoder_head_mask"] = torch.ones(
config.num_decoder_layers, config.num_heads, device=torch_device
)
out = model.generate(
config_and_inputs[1],
num_beams=1,
max_length=max_length,
output_attentions=True,
return_dict_in_generate=True,
**head_masks,
)
# We check the state of decoder_attentions and cross_attentions just from the last step
attn_weights = out[attn_name] if attn_name == attention_names[0] else out[attn_name][-1]
self.assertEqual(sum([w.sum().item() for w in attn_weights]), 0.0)
def test_attention_outputs(self):
if not self.has_attentions:
self.skipTest(reason="has_attentions is set to False")

View File

@ -243,7 +243,7 @@ class MiniMaxModelTest(CausalLMModelTest, unittest.TestCase):
@slow
class MiniMaxIntegrationTest(unittest.TestCase):
def test_small_model_logits(self):
model_id = "hf-internal-testing/MiniMax-tiny"
model_id = "geetu040/MiniMax-tiny"
dummy_input = torch.LongTensor([[0, 1, 0], [0, 1, 0]]).to(torch_device)
model = MiniMaxForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, low_cpu_mem_usage=True).to(
@ -262,7 +262,7 @@ class MiniMaxIntegrationTest(unittest.TestCase):
torch.testing.assert_close(logits[1, :3, :3], expected_slice, atol=1e-3, rtol=1e-3)
def test_small_model_generation(self):
model_id = "hf-internal-testing/MiniMax-tiny"
model_id = "geetu040/MiniMax-tiny"
dummy_input = torch.LongTensor([[0, 1, 0], [0, 1, 0]]).to(torch_device)
model = MiniMaxForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, low_cpu_mem_usage=True).to(

View File

@ -868,6 +868,40 @@ class MT5ModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin,
model = MT5Model.from_pretrained(model_name)
self.assertIsNotNone(model)
def test_generate_with_head_masking(self):
attention_names = ["encoder_attentions", "decoder_attentions", "cross_attentions"]
config_and_inputs = self.model_tester.prepare_config_and_inputs()
config = config_and_inputs[0]
max_length = config_and_inputs[1].shape[-1] + 3
model = MT5ForConditionalGeneration(config).eval()
model.to(torch_device)
head_masking = {
"head_mask": torch.zeros(config.num_layers, config.num_heads, device=torch_device),
"decoder_head_mask": torch.zeros(config.num_decoder_layers, config.num_heads, device=torch_device),
"cross_attn_head_mask": torch.zeros(config.num_decoder_layers, config.num_heads, device=torch_device),
}
for attn_name, (name, mask) in zip(attention_names, head_masking.items()):
head_masks = {name: mask}
# Explicitly pass decoder_head_mask as it is required by the MT5 model when head_mask is specified
if name == "head_mask":
head_masks["decoder_head_mask"] = torch.ones(
config.num_decoder_layers, config.num_heads, device=torch_device
)
out = model.generate(
config_and_inputs[1],
num_beams=1,
max_length=max_length,
output_attentions=True,
return_dict_in_generate=True,
**head_masks,
)
# We check the state of decoder_attentions and cross_attentions just from the last step
attn_weights = out[attn_name] if attn_name == attention_names[0] else out[attn_name][-1]
self.assertEqual(sum([w.sum().item() for w in attn_weights]), 0.0)
# Copied from tests.models.t5.test_modeling_t5.T5EncoderOnlyModelTester with T5->MT5
class MT5EncoderOnlyModelTester:

View File

@ -870,7 +870,7 @@ class OmDetTurboModelIntegrationTests(unittest.TestCase):
self.assertListEqual([result["text_labels"] for result in results], expected_text_labels)
@require_torch_accelerator
def test_inference_object_detection_head_equivalence_cpu_accelerator(self):
def test_inference_object_detection_head_equivalence_cpu_gpu(self):
processor = self.default_processor
image = prepare_img()
text_labels, task = prepare_text()
@ -881,7 +881,7 @@ class OmDetTurboModelIntegrationTests(unittest.TestCase):
with torch.no_grad():
cpu_outputs = model(**encoding)
# 2. run model on accelerator
# 2. run model on GPU
model.to(torch_device)
encoding = encoding.to(torch_device)
with torch.no_grad():

View File

@ -1117,6 +1117,10 @@ class ProphetNetModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTeste
self.assertIsNotNone(encoder_hidden_states.grad)
self.assertIsNotNone(encoder_attentions.grad)
@unittest.skip(reason="Generating with head_masking has not been implemented for ProphetNet models yet.")
def test_generate_with_head_masking(self):
pass
@require_torch
class ProphetNetStandaloneDecoderModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase):

View File

@ -19,11 +19,10 @@ import requests
from transformers.testing_utils import (
is_flaky,
require_torch,
require_torch_accelerator,
require_torch_gpu,
require_torchvision,
require_vision,
slow,
torch_device,
)
from transformers.utils import is_torch_available, is_torchvision_available, is_vision_available
@ -380,10 +379,10 @@ class RtDetrImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
torch.testing.assert_close(encoding["labels"][1]["boxes"], expected_boxes_1, atol=1, rtol=1)
@slow
@require_torch_accelerator
@require_torch_gpu
@require_torchvision
# Copied from tests.models.detr.test_image_processing_detr.DetrImageProcessingTest.test_fast_processor_equivalence_cpu_accelerator_coco_detection_annotations
def test_fast_processor_equivalence_cpu_accelerator_coco_detection_annotations(self):
# Copied from tests.models.detr.test_image_processing_detr.DetrImageProcessingTest.test_fast_processor_equivalence_cpu_gpu_coco_detection_annotations
def test_fast_processor_equivalence_cpu_gpu_coco_detection_annotations(self):
# prepare image and target
image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
with open("./tests/fixtures/tests_samples/COCO/coco_annotations.txt") as f:
@ -394,8 +393,8 @@ class RtDetrImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
processor = self.image_processor_list[1]()
# 1. run processor on CPU
encoding_cpu = processor(images=image, annotations=target, return_tensors="pt", device="cpu")
# 2. run processor on accelerator
encoding_gpu = processor(images=image, annotations=target, return_tensors="pt", device=torch_device)
# 2. run processor on GPU
encoding_gpu = processor(images=image, annotations=target, return_tensors="pt", device="cuda")
# verify pixel values
self.assertEqual(encoding_cpu["pixel_values"].shape, encoding_gpu["pixel_values"].shape)

View File

@ -22,7 +22,7 @@ from PIL import Image
from transformers import is_torch_available
from transformers.testing_utils import (
cleanup,
require_torch_accelerator,
require_torch_gpu,
slow,
torch_device,
)
@ -35,7 +35,7 @@ if is_torch_available():
@slow
@require_torch_accelerator
@require_torch_gpu
# @require_read_token
class ShieldGemma2IntegrationTest(unittest.TestCase):
def tearDown(self):

View File

@ -741,6 +741,10 @@ class SpeechT5ForSpeechToTextTest(ModelTesterMixin, unittest.TestCase, Generatio
if hasattr(module, "masked_spec_embed") and module.masked_spec_embed is not None:
module.masked_spec_embed.data.fill_(3)
@unittest.skip(reason="Temporarily broken") # TODO (joao, eustache): have a look at this test
def test_generate_with_head_masking(self):
pass
@unittest.skip(reason="Temporarily broken") # TODO (joao, eustache): have a look at this test
def test_generate_without_input_ids(self):
pass

View File

@ -250,8 +250,6 @@ class SwiftFormerModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestC
for model_class in self.all_model_classes:
model = model_class(config=configs_no_init)
for name, param in model.named_parameters():
if name.endswith(".w_g"):
continue
if param.requires_grad:
self.assertIn(
((param.data.mean() * 1e9) / 1e9).round().item(),

View File

@ -709,6 +709,40 @@ class SwitchTransformersModelTest(ModelTesterMixin, GenerationTesterMixin, Pipel
model = SwitchTransformersModel.from_pretrained(model_name)
self.assertIsNotNone(model)
def test_generate_with_head_masking(self):
attention_names = ["encoder_attentions", "decoder_attentions", "cross_attentions"]
config_and_inputs = self.model_tester.prepare_config_and_inputs()
config = config_and_inputs[0]
max_length = config_and_inputs[1].shape[-1] + 3
model = SwitchTransformersForConditionalGeneration(config).eval()
model.to(torch_device)
head_masking = {
"head_mask": torch.zeros(config.num_layers, config.num_heads, device=torch_device),
"decoder_head_mask": torch.zeros(config.num_decoder_layers, config.num_heads, device=torch_device),
"cross_attn_head_mask": torch.zeros(config.num_decoder_layers, config.num_heads, device=torch_device),
}
for attn_name, (name, mask) in zip(attention_names, head_masking.items()):
head_masks = {name: mask}
# Explicitly pass decoder_head_mask as it is required by the SWITCH_TRANSFORMERS model when head_mask is specified
if name == "head_mask":
head_masks["decoder_head_mask"] = torch.ones(
config.num_decoder_layers, config.num_heads, device=torch_device
)
out = model.generate(
config_and_inputs[1],
num_beams=1,
max_length=max_length,
output_attentions=True,
return_dict_in_generate=True,
**head_masks,
)
# We check the state of decoder_attentions and cross_attentions just from the last step
attn_weights = out[attn_name] if attn_name == attention_names[0] else out[attn_name][-1]
self.assertEqual(sum([w.sum().item() for w in attn_weights]), 0.0)
@unittest.skip(
reason="This architecture has tied weights by default and there is no way to remove it, check: https://github.com/huggingface/transformers/pull/31771#issuecomment-2210915245"
)

View File

@ -873,6 +873,40 @@ class T5ModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin,
model = T5Model.from_pretrained(model_name)
self.assertIsNotNone(model)
def test_generate_with_head_masking(self):
attention_names = ["encoder_attentions", "decoder_attentions", "cross_attentions"]
config_and_inputs = self.model_tester.prepare_config_and_inputs()
config = config_and_inputs[0]
max_length = config_and_inputs[1].shape[-1] + 3
model = T5ForConditionalGeneration(config).eval()
model.to(torch_device)
head_masking = {
"head_mask": torch.zeros(config.num_layers, config.num_heads, device=torch_device),
"decoder_head_mask": torch.zeros(config.num_decoder_layers, config.num_heads, device=torch_device),
"cross_attn_head_mask": torch.zeros(config.num_decoder_layers, config.num_heads, device=torch_device),
}
for attn_name, (name, mask) in zip(attention_names, head_masking.items()):
head_masks = {name: mask}
# Explicitly pass decoder_head_mask as it is required by the T5 model when head_mask is specified
if name == "head_mask":
head_masks["decoder_head_mask"] = torch.ones(
config.num_decoder_layers, config.num_heads, device=torch_device
)
out = model.generate(
config_and_inputs[1],
num_beams=1,
max_length=max_length,
output_attentions=True,
return_dict_in_generate=True,
**head_masks,
)
# We check the state of decoder_attentions and cross_attentions just from the last step
attn_weights = out[attn_name] if attn_name == attention_names[0] else out[attn_name][-1]
self.assertEqual(sum([w.sum().item() for w in attn_weights]), 0.0)
class T5EncoderOnlyModelTester:
def __init__(

View File

@ -419,6 +419,10 @@ class UdopModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin
model = UdopForConditionalGeneration.from_pretrained(model_name)
self.assertIsNotNone(model)
@unittest.skip(reason="TODO: Fix me @joao")
def test_generate_with_head_masking(self):
pass
@unittest.skip(reason="TODO: Fix me @joao")
def test_generate_without_input_ids(self):
pass

View File

@ -489,6 +489,39 @@ class UMT5ModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_model_fp16_forward(*config_and_inputs)
def test_generate_with_head_masking(self):
attention_names = ["encoder_attentions", "decoder_attentions", "cross_attentions"]
config_and_inputs = self.model_tester.prepare_config_and_inputs()
config = config_and_inputs[0]
model = UMT5ForConditionalGeneration(config).eval()
model.to(torch_device)
head_masking = {
"head_mask": torch.zeros(config.num_layers, config.num_heads, device=torch_device),
"decoder_head_mask": torch.zeros(config.num_decoder_layers, config.num_heads, device=torch_device),
"cross_attn_head_mask": torch.zeros(config.num_decoder_layers, config.num_heads, device=torch_device),
}
for attn_name, (name, mask) in zip(attention_names, head_masking.items()):
head_masks = {name: mask}
# Explicitly pass decoder_head_mask as it is required by the T5 model when head_mask is specified
if name == "head_mask":
head_masks["decoder_head_mask"] = torch.ones(
config.num_decoder_layers, config.num_heads, device=torch_device
)
out = model.generate(
config_and_inputs[1]["input_ids"],
num_beams=1,
max_length=3,
output_attentions=True,
return_dict_in_generate=True,
**head_masks,
)
# We check the state of decoder_attentions and cross_attentions just from the last step
attn_weights = out[attn_name] if attn_name == attention_names[0] else out[attn_name][-1]
self.assertEqual(sum([w.sum().item() for w in attn_weights]), 0.0)
@unittest.skip(
reason="This architecture seems not to compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
)

View File

@ -23,11 +23,7 @@ import numpy as np
from datasets import load_dataset
from transformers import WhisperFeatureExtractor
from transformers.testing_utils import (
check_json_file_has_correct_format,
require_torch,
require_torch_accelerator,
)
from transformers.testing_utils import check_json_file_has_correct_format, require_torch, require_torch_gpu
from transformers.utils.import_utils import is_torch_available
from ...test_sequence_feature_extraction_common import SequenceFeatureExtractionTestMixin
@ -258,7 +254,7 @@ class WhisperFeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest.
return [x["array"] for x in speech_samples]
@require_torch_accelerator
@require_torch_gpu
@require_torch
def test_torch_integration(self):
# fmt: off
@ -307,7 +303,7 @@ class WhisperFeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest.
self.assertTrue(np.all(np.mean(audio) < 1e-3))
self.assertTrue(np.all(np.abs(np.var(audio) - 1) < 1e-3))
@require_torch_accelerator
@require_torch_gpu
@require_torch
def test_torch_integration_batch(self):
# fmt: off

View File

@ -730,7 +730,7 @@ class MixedInt8TestCpuGpu(BaseMixedInt8Test):
output_text = self.tokenizer.decode(output_parallel[0], skip_special_tokens=True)
self.assertIn(output_text, self.EXPECTED_OUTPUTS)
def test_cpu_accelerator_loading_random_device_map(self):
def test_cpu_gpu_loading_random_device_map(self):
r"""
A test to check if dispatching a model on cpu & gpu works correctly using a random `device_map`.
"""
@ -778,7 +778,7 @@ class MixedInt8TestCpuGpu(BaseMixedInt8Test):
self.check_inference_correctness(model_8bit)
def test_cpu_accelerator_loading_custom_device_map(self):
def test_cpu_gpu_loading_custom_device_map(self):
r"""
A test to check if dispatching a model on cpu & gpu works correctly using a custom `device_map`.
This time the device map is more organized than the test above and uses the abstraction
@ -805,7 +805,7 @@ class MixedInt8TestCpuGpu(BaseMixedInt8Test):
self.check_inference_correctness(model_8bit)
def test_cpu_accelerator_disk_loading_custom_device_map(self):
def test_cpu_gpu_disk_loading_custom_device_map(self):
r"""
A test to check if dispatching a model on cpu & gpu works correctly using a custom `device_map`.
This time we also add `disk` on the device_map.
@ -832,7 +832,7 @@ class MixedInt8TestCpuGpu(BaseMixedInt8Test):
self.check_inference_correctness(model_8bit)
def test_cpu_accelerator_disk_loading_custom_device_map_kwargs(self):
def test_cpu_gpu_disk_loading_custom_device_map_kwargs(self):
r"""
A test to check if dispatching a model on cpu & gpu works correctly using a custom `device_map`.
This time we also add `disk` on the device_map - using the kwargs directly instead of the quantization config

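The cpu/gpu dispatch these tests exercise boils down to passing an explicit `device_map` together with an 8-bit quantization config; the checkpoint and module names below are placeholders chosen for illustration, not the ones used by BaseMixedInt8Test:

from transformers import AutoModelForCausalLM, BitsAndBytesConfig

model_id = "bigscience/bloom-560m"  # placeholder checkpoint
custom_device_map = {
    "transformer.word_embeddings": 0,            # first GPU
    "transformer.word_embeddings_layernorm": 0,
    "transformer.h": "cpu",                      # transformer blocks offloaded to CPU
    "transformer.ln_f": 0,
    "lm_head": 0,
}
model_8bit = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map=custom_device_map,
    quantization_config=BitsAndBytesConfig(
        load_in_8bit=True,
        llm_int8_enable_fp32_cpu_offload=True,  # needed when parts of the model stay on CPU
    ),
)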
View File

@ -20,7 +20,7 @@ from transformers import AddedToken, AutoModelForCausalLM, AutoModelForSeq2SeqLM
from transformers.testing_utils import (
require_gguf,
require_read_token,
require_torch_accelerator,
require_torch_gpu,
slow,
torch_device,
)
@ -35,7 +35,7 @@ if is_gguf_available():
@require_gguf
@require_torch_accelerator
@require_torch_gpu
@slow
class GgufQuantizationTests(unittest.TestCase):
"""
@ -107,7 +107,7 @@ class GgufQuantizationTests(unittest.TestCase):
@require_gguf
@require_torch_accelerator
@require_torch_gpu
@slow
class GgufIntegrationTests(unittest.TestCase):
"""
@ -263,7 +263,7 @@ class GgufIntegrationTests(unittest.TestCase):
@require_gguf
@require_torch_accelerator
@require_torch_gpu
@slow
class GgufModelTests(unittest.TestCase):
mistral_model_id = "TheBloke/Mistral-7B-Instruct-v0.2-GGUF"

View File

@ -11,18 +11,17 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import gc
import unittest
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, GenerationConfig, QuarkConfig
from transformers.testing_utils import (
cleanup,
is_torch_available,
require_accelerate,
require_quark,
require_torch_gpu,
require_torch_multi_gpu,
slow,
torch_device,
)
from transformers.utils.import_utils import is_quark_available
@ -80,10 +79,11 @@ class QuarkTest(unittest.TestCase):
def tearDown(self):
r"""
TearDown function needs to be called at the end of each test to free the accelerator memory and cache, also to
TearDown function needs to be called at the end of each test to free the GPU memory and cache, also to
avoid unexpected behaviors. Please see: https://discuss.pytorch.org/t/how-can-we-release-gpu-memory-cache/14530/27
"""
cleanup(torch_device, gc_collect=True)
gc.collect()
torch.cuda.empty_cache()
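The `cleanup(torch_device, gc_collect=True)` call being replaced here is the accelerator-agnostic variant; a rough sketch of what such a helper might look like, with the caveat that the real transformers.testing_utils.cleanup may differ in its details:

import gc
import torch

def cleanup(device: str, gc_collect: bool = False):
    # free Python-side references first, then the backend's cache for the given device
    if gc_collect:
        gc.collect()
    if device.startswith("cuda") and torch.cuda.is_available():
        torch.cuda.empty_cache()
    elif device.startswith("xpu") and hasattr(torch, "xpu") and torch.xpu.is_available():
        torch.xpu.empty_cache()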
def test_memory_footprint(self):
mem_quantized = self.quantized_model.get_memory_footprint()

View File

@ -30,7 +30,7 @@ from transformers.testing_utils import (
check_json_file_has_correct_format,
is_flaky,
require_torch,
require_torch_accelerator,
require_torch_gpu,
require_vision,
slow,
torch_device,
@ -562,7 +562,7 @@ class ImageProcessingTestMixin:
self.skipTest(reason="No validation found for `preprocess` method")
@slow
@require_torch_accelerator
@require_torch_gpu
@require_vision
def test_can_compile_fast_image_processor(self):
if self.fast_image_processing_class is None:

View File

@ -716,16 +716,8 @@ class ModelTesterMixin:
model = model_class(config=configs_no_init)
for name, param in model.named_parameters():
if param.requires_grad:
data = torch.flatten(param.data)
n_elements = torch.numel(data)
# skip 2.5% of elements on each side to avoid issues caused by `nn.init.trunc_normal_` described in
# https://github.com/huggingface/transformers/pull/27906#issuecomment-1846951332
n_elements_to_skip_on_each_side = int(n_elements * 0.025)
data_to_check = torch.sort(data).values
if n_elements_to_skip_on_each_side > 0:
data_to_check = data_to_check[n_elements_to_skip_on_each_side:-n_elements_to_skip_on_each_side]
self.assertIn(
((data_to_check.mean() * 1e9).round() / 1e9).item(),
((param.data.mean() * 1e9).round() / 1e9).item(),
[0.0, 1.0],
msg=f"Parameter {name} of model {model_class} seems not properly initialized",
)
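The simplified check rounds the full parameter mean to nine decimal places and expects exactly 0.0 or 1.0, which holds when `configs_no_init` drives weights to (near-)zero or one; a tiny self-contained illustration with made-up tensors standing in for such parameters:

import torch

for param in (torch.zeros(16, 16), torch.ones(16, 16)):  # stand-ins for zero-/one-initialised weights
    rounded_mean = ((param.mean() * 1e9).round() / 1e9).item()
    assert rounded_mean in [0.0, 1.0]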

View File

@ -26,7 +26,7 @@ from transformers import AutoVideoProcessor
from transformers.testing_utils import (
check_json_file_has_correct_format,
require_torch,
require_torch_accelerator,
require_torch_gpu,
require_vision,
slow,
torch_device,
@ -165,7 +165,7 @@ class VideoProcessingTestMixin:
self.assertIsNotNone(video_processor)
@slow
@require_torch_accelerator
@require_torch_gpu
@require_vision
def test_can_compile_fast_video_processor(self):
if self.fast_video_processing_class is None:

View File

@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""
ruff: isort: skip_file
isort:skip_file
"""
import os